diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 5.235269989993057, + "learning_rate": 2.6737967914438506e-08, + "logits/chosen": -0.07364606857299805, + "logits/rejected": 0.1362065076828003, + "logps/chosen": -1.715688705444336, + "logps/rejected": -1.8891627788543701, + "loss": 0.8421, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.715688705444336, + "rewards/margins": 0.17347395420074463, + "rewards/rejected": -1.8891627788543701, + "sft_loss": 1.4681576490402222, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 9.85463554243418, + "learning_rate": 5.347593582887701e-08, + "logits/chosen": -0.007629724685102701, + "logits/rejected": 0.11350151151418686, + "logps/chosen": -1.80299973487854, + "logps/rejected": -1.8473634719848633, + "loss": 0.9279, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.80299973487854, + "rewards/margins": 0.04436371102929115, + "rewards/rejected": -1.8473634719848633, + "sft_loss": 1.508274793624878, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 11.515063279361502, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": -0.050660718232393265, + "logits/rejected": 0.04635730758309364, + "logps/chosen": -1.6353628635406494, + "logps/rejected": -1.7657358646392822, + "loss": 0.9022, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6353628635406494, + "rewards/margins": 0.13037298619747162, + "rewards/rejected": -1.7657358646392822, + "sft_loss": 1.5009872913360596, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 5.258926310151922, + "learning_rate": 1.0695187165775402e-07, + "logits/chosen": -0.06183786317706108, + "logits/rejected": 0.023231148719787598, + "logps/chosen": -1.7252362966537476, + "logps/rejected": -1.8057111501693726, + "loss": 0.9261, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.7252362966537476, + "rewards/margins": 0.08047479391098022, + "rewards/rejected": -1.8057111501693726, + "sft_loss": 1.5002777576446533, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 16.53074244289573, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.06942118704319, + "logits/rejected": 0.015559596940875053, + "logps/chosen": -1.8668476343154907, + "logps/rejected": -1.7766830921173096, + "loss": 1.0363, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -1.8668476343154907, + "rewards/margins": -0.09016446769237518, + "rewards/rejected": -1.7766830921173096, + "sft_loss": 1.5445544719696045, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 9.275972420130646, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": -0.09946934133768082, + "logits/rejected": -0.002832284662872553, + "logps/chosen": -1.9050594568252563, + "logps/rejected": -1.8290824890136719, + "loss": 0.9982, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9050594568252563, + "rewards/margins": -0.07597692310810089, + "rewards/rejected": -1.8290824890136719, + "sft_loss": 1.644878625869751, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 10.27616549684039, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": -0.05647529289126396, + "logits/rejected": 0.10369857400655746, + "logps/chosen": -1.8399736881256104, + "logps/rejected": -1.989367127418518, + "loss": 0.9535, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.8399736881256104, + "rewards/margins": 0.1493932157754898, + "rewards/rejected": -1.989367127418518, + "sft_loss": 1.559540867805481, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 9.47278456909826, + "learning_rate": 2.1390374331550805e-07, + "logits/chosen": 0.03216688707470894, + "logits/rejected": 0.20794229209423065, + "logps/chosen": -1.8674113750457764, + "logps/rejected": -1.731610655784607, + "loss": 1.0019, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -1.8674113750457764, + "rewards/margins": -0.13580094277858734, + "rewards/rejected": -1.731610655784607, + "sft_loss": 1.5152695178985596, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 15.430823640809635, + "learning_rate": 2.4064171122994655e-07, + "logits/chosen": 0.007911334745585918, + "logits/rejected": 0.2036576271057129, + "logps/chosen": -1.8118816614151, + "logps/rejected": -1.8485628366470337, + "loss": 0.9646, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.8118816614151, + "rewards/margins": 0.036681193858385086, + "rewards/rejected": -1.8485628366470337, + "sft_loss": 1.5256637334823608, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 11.882954872936795, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.049845270812511444, + "logits/rejected": 0.10112782567739487, + "logps/chosen": -1.8521621227264404, + "logps/rejected": -1.7432807683944702, + "loss": 1.0101, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.8521621227264404, + "rewards/margins": -0.10888160765171051, + "rewards/rejected": -1.7432807683944702, + "sft_loss": 1.5658760070800781, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 7.9737766908357575, + "learning_rate": 2.9411764705882356e-07, + "logits/chosen": -0.10056047141551971, + "logits/rejected": 0.12481649219989777, + "logps/chosen": -1.7834703922271729, + "logps/rejected": -1.822126030921936, + "loss": 0.9768, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.7834703922271729, + "rewards/margins": 0.03865557536482811, + "rewards/rejected": -1.822126030921936, + "sft_loss": 1.5616546869277954, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 7.225835353625987, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": -0.08353086560964584, + "logits/rejected": 0.10854407399892807, + "logps/chosen": -1.7223520278930664, + "logps/rejected": -1.8239339590072632, + "loss": 0.9, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7223520278930664, + "rewards/margins": 0.10158195346593857, + "rewards/rejected": -1.8239339590072632, + "sft_loss": 1.5241292715072632, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 5.431638210110103, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": -0.029154837131500244, + "logits/rejected": 0.11996859312057495, + "logps/chosen": -1.5424885749816895, + "logps/rejected": -1.6564009189605713, + "loss": 0.8524, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5424885749816895, + "rewards/margins": 0.1139124184846878, + "rewards/rejected": -1.6564009189605713, + "sft_loss": 1.4341975450515747, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 11.820639617057315, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.07852429151535034, + "logits/rejected": 0.07054585218429565, + "logps/chosen": -1.6553970575332642, + "logps/rejected": -1.6938140392303467, + "loss": 0.9501, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -1.6553970575332642, + "rewards/margins": 0.03841722011566162, + "rewards/rejected": -1.6938140392303467, + "sft_loss": 1.567211389541626, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 12.009311638296472, + "learning_rate": 4.0106951871657757e-07, + "logits/chosen": -0.07746043056249619, + "logits/rejected": 0.10344438254833221, + "logps/chosen": -1.619264006614685, + "logps/rejected": -1.859872579574585, + "loss": 0.8358, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.619264006614685, + "rewards/margins": 0.24060864746570587, + "rewards/rejected": -1.859872579574585, + "sft_loss": 1.5058908462524414, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 7.635140975991649, + "learning_rate": 4.278074866310161e-07, + "logits/chosen": -0.004299764521420002, + "logits/rejected": 0.10244850069284439, + "logps/chosen": -1.5427652597427368, + "logps/rejected": -1.5827980041503906, + "loss": 0.887, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5427652597427368, + "rewards/margins": 0.04003264755010605, + "rewards/rejected": -1.5827980041503906, + "sft_loss": 1.4448814392089844, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 6.061565243102024, + "learning_rate": 4.5454545454545457e-07, + "logits/chosen": -0.18546968698501587, + "logits/rejected": 0.0526907853782177, + "logps/chosen": -1.587787389755249, + "logps/rejected": -1.7247536182403564, + "loss": 0.8549, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.587787389755249, + "rewards/margins": 0.13696610927581787, + "rewards/rejected": -1.7247536182403564, + "sft_loss": 1.424576759338379, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 13.366616211715524, + "learning_rate": 4.812834224598931e-07, + "logits/chosen": 0.05517455190420151, + "logits/rejected": 0.015907561406493187, + "logps/chosen": -1.5135414600372314, + "logps/rejected": -1.592010498046875, + "loss": 0.8748, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.5135414600372314, + "rewards/margins": 0.07846912741661072, + "rewards/rejected": -1.592010498046875, + "sft_loss": 1.3922526836395264, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 4.65872642986279, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.10109523683786392, + "logits/rejected": 0.04699419066309929, + "logps/chosen": -1.4085562229156494, + "logps/rejected": -1.604548454284668, + "loss": 0.8126, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4085562229156494, + "rewards/margins": 0.19599218666553497, + "rewards/rejected": -1.604548454284668, + "sft_loss": 1.385668396949768, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 5.225165770190776, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.08418619632720947, + "logits/rejected": -0.020208898931741714, + "logps/chosen": -1.3913484811782837, + "logps/rejected": -1.4861876964569092, + "loss": 0.8452, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3913484811782837, + "rewards/margins": 0.09483925253152847, + "rewards/rejected": -1.4861876964569092, + "sft_loss": 1.3743575811386108, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 4.179439042444936, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.023166431114077568, + "logits/rejected": 0.0029010644648224115, + "logps/chosen": -1.3319205045700073, + "logps/rejected": -1.506927251815796, + "loss": 0.8006, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3319205045700073, + "rewards/margins": 0.17500664293766022, + "rewards/rejected": -1.506927251815796, + "sft_loss": 1.3089672327041626, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 7.120176078055153, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -0.07467556744813919, + "logits/rejected": 0.018729006871581078, + "logps/chosen": -1.2951467037200928, + "logps/rejected": -1.3635486364364624, + "loss": 0.8459, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2951467037200928, + "rewards/margins": 0.06840179860591888, + "rewards/rejected": -1.3635486364364624, + "sft_loss": 1.297489881515503, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 6.7501369743432615, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": -0.05421806126832962, + "logits/rejected": 0.13666459918022156, + "logps/chosen": -1.3326265811920166, + "logps/rejected": -1.5337562561035156, + "loss": 0.8006, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3326265811920166, + "rewards/margins": 0.2011294811964035, + "rewards/rejected": -1.5337562561035156, + "sft_loss": 1.3840888738632202, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 5.809627442148684, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.16003762185573578, + "logits/rejected": -0.007819685153663158, + "logps/chosen": -1.3354389667510986, + "logps/rejected": -1.4167028665542603, + "loss": 0.8465, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3354389667510986, + "rewards/margins": 0.08126381039619446, + "rewards/rejected": -1.4167028665542603, + "sft_loss": 1.3464367389678955, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 4.5054154445634556, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.16295680403709412, + "logits/rejected": -0.038801293820142746, + "logps/chosen": -1.3474639654159546, + "logps/rejected": -1.3577622175216675, + "loss": 0.8773, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3474639654159546, + "rewards/margins": 0.010298268869519234, + "rewards/rejected": -1.3577622175216675, + "sft_loss": 1.4015753269195557, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 5.519825401039347, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": -0.03140866383910179, + "logits/rejected": 0.09492513537406921, + "logps/chosen": -1.382698655128479, + "logps/rejected": -1.499145746231079, + "loss": 0.8341, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.382698655128479, + "rewards/margins": 0.11644729226827621, + "rewards/rejected": -1.499145746231079, + "sft_loss": 1.4506919384002686, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 7.089184282498258, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.11524300277233124, + "logits/rejected": -0.008447563275694847, + "logps/chosen": -1.4253406524658203, + "logps/rejected": -1.5112718343734741, + "loss": 0.8615, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4253406524658203, + "rewards/margins": 0.08593113720417023, + "rewards/rejected": -1.5112718343734741, + "sft_loss": 1.373755693435669, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 7.494836982244487, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.09514065831899643, + "logits/rejected": 0.05069103091955185, + "logps/chosen": -1.4329020977020264, + "logps/rejected": -1.4857370853424072, + "loss": 0.8719, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.4329020977020264, + "rewards/margins": 0.05283506587147713, + "rewards/rejected": -1.4857370853424072, + "sft_loss": 1.4442007541656494, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 8.249196871188497, + "learning_rate": 7.754010695187166e-07, + "logits/chosen": -0.08438152074813843, + "logits/rejected": 0.05761206895112991, + "logps/chosen": -1.3430602550506592, + "logps/rejected": -1.4420830011367798, + "loss": 0.8477, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3430602550506592, + "rewards/margins": 0.09902272373437881, + "rewards/rejected": -1.4420830011367798, + "sft_loss": 1.4123352766036987, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 8.009989628674303, + "learning_rate": 8.021390374331551e-07, + "logits/chosen": -0.12797826528549194, + "logits/rejected": 0.01398629229515791, + "logps/chosen": -1.283252239227295, + "logps/rejected": -1.3031189441680908, + "loss": 0.8647, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.283252239227295, + "rewards/margins": 0.01986684277653694, + "rewards/rejected": -1.3031189441680908, + "sft_loss": 1.2768657207489014, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 5.021232147599984, + "learning_rate": 8.288770053475937e-07, + "logits/chosen": -0.12843099236488342, + "logits/rejected": -0.0827844962477684, + "logps/chosen": -1.298471212387085, + "logps/rejected": -1.4227102994918823, + "loss": 0.825, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.298471212387085, + "rewards/margins": 0.12423906475305557, + "rewards/rejected": -1.4227102994918823, + "sft_loss": 1.3338967561721802, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 5.036413154312052, + "learning_rate": 8.556149732620322e-07, + "logits/chosen": -0.19850830733776093, + "logits/rejected": -0.06920308619737625, + "logps/chosen": -1.3978339433670044, + "logps/rejected": -1.3811728954315186, + "loss": 0.9003, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.3978339433670044, + "rewards/margins": -0.016660813242197037, + "rewards/rejected": -1.3811728954315186, + "sft_loss": 1.3823540210723877, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 8.480938331155045, + "learning_rate": 8.823529411764706e-07, + "logits/chosen": -0.07313909381628036, + "logits/rejected": 0.09209281206130981, + "logps/chosen": -1.3041568994522095, + "logps/rejected": -1.3962047100067139, + "loss": 0.8479, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3041568994522095, + "rewards/margins": 0.09204794466495514, + "rewards/rejected": -1.3962047100067139, + "sft_loss": 1.2982127666473389, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 9.899406966050948, + "learning_rate": 9.090909090909091e-07, + "logits/chosen": -0.12866242229938507, + "logits/rejected": -0.07856561988592148, + "logps/chosen": -1.4354071617126465, + "logps/rejected": -1.5156786441802979, + "loss": 0.8656, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.4354071617126465, + "rewards/margins": 0.08027160167694092, + "rewards/rejected": -1.5156786441802979, + "sft_loss": 1.4268556833267212, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 7.339696230235005, + "learning_rate": 9.358288770053477e-07, + "logits/chosen": 0.010467484593391418, + "logits/rejected": 0.01038367860019207, + "logps/chosen": -1.3264710903167725, + "logps/rejected": -1.4309437274932861, + "loss": 0.8403, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3264710903167725, + "rewards/margins": 0.10447286069393158, + "rewards/rejected": -1.4309437274932861, + "sft_loss": 1.357232928276062, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 6.350510580581484, + "learning_rate": 9.625668449197862e-07, + "logits/chosen": -0.0297098346054554, + "logits/rejected": -0.030283737927675247, + "logps/chosen": -1.3521928787231445, + "logps/rejected": -1.5571117401123047, + "loss": 0.8384, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3521928787231445, + "rewards/margins": 0.2049189805984497, + "rewards/rejected": -1.5571117401123047, + "sft_loss": 1.3775081634521484, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 6.445053054684298, + "learning_rate": 9.893048128342246e-07, + "logits/chosen": -0.14473792910575867, + "logits/rejected": -0.05903983116149902, + "logps/chosen": -1.3446182012557983, + "logps/rejected": -1.4061791896820068, + "loss": 0.8627, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3446182012557983, + "rewards/margins": 0.06156102567911148, + "rewards/rejected": -1.4061791896820068, + "sft_loss": 1.3568413257598877, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 7.0135353443302595, + "learning_rate": 1.016042780748663e-06, + "logits/chosen": -0.07751432806253433, + "logits/rejected": 0.035579387098550797, + "logps/chosen": -1.2608205080032349, + "logps/rejected": -1.405822515487671, + "loss": 0.809, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2608205080032349, + "rewards/margins": 0.14500199258327484, + "rewards/rejected": -1.405822515487671, + "sft_loss": 1.288597583770752, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 4.414701651670081, + "learning_rate": 1.0427807486631017e-06, + "logits/chosen": 0.0497257336974144, + "logits/rejected": 0.20516404509544373, + "logps/chosen": -1.2470285892486572, + "logps/rejected": -1.4114071130752563, + "loss": 0.7964, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2470285892486572, + "rewards/margins": 0.16437852382659912, + "rewards/rejected": -1.4114071130752563, + "sft_loss": 1.2925219535827637, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 14.857571461686588, + "learning_rate": 1.0695187165775401e-06, + "logits/chosen": -0.0697740688920021, + "logits/rejected": 0.06182453781366348, + "logps/chosen": -1.3620140552520752, + "logps/rejected": -1.414411187171936, + "loss": 0.8614, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3620140552520752, + "rewards/margins": 0.05239716172218323, + "rewards/rejected": -1.414411187171936, + "sft_loss": 1.389324426651001, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 8.014046884380164, + "learning_rate": 1.0962566844919785e-06, + "logits/chosen": -0.04965759068727493, + "logits/rejected": 0.08998225629329681, + "logps/chosen": -1.2621620893478394, + "logps/rejected": -1.3641141653060913, + "loss": 0.8279, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2621620893478394, + "rewards/margins": 0.10195207595825195, + "rewards/rejected": -1.3641141653060913, + "sft_loss": 1.2772701978683472, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 7.114198373387648, + "learning_rate": 1.1229946524064172e-06, + "logits/chosen": -0.13316112756729126, + "logits/rejected": 0.047932691872119904, + "logps/chosen": -1.3507158756256104, + "logps/rejected": -1.4856475591659546, + "loss": 0.8328, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3507158756256104, + "rewards/margins": 0.13493159413337708, + "rewards/rejected": -1.4856475591659546, + "sft_loss": 1.339905023574829, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 6.438116798051675, + "learning_rate": 1.1497326203208556e-06, + "logits/chosen": -0.1692911684513092, + "logits/rejected": 0.06463338434696198, + "logps/chosen": -1.388016939163208, + "logps/rejected": -1.4628031253814697, + "loss": 0.8388, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.388016939163208, + "rewards/margins": 0.07478625327348709, + "rewards/rejected": -1.4628031253814697, + "sft_loss": 1.367095947265625, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 13.122564944459192, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": 0.05476145073771477, + "logits/rejected": 0.14929446578025818, + "logps/chosen": -1.3221927881240845, + "logps/rejected": -1.4936447143554688, + "loss": 0.8145, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3221927881240845, + "rewards/margins": 0.17145201563835144, + "rewards/rejected": -1.4936447143554688, + "sft_loss": 1.3450855016708374, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 4.810046075776873, + "learning_rate": 1.2032085561497326e-06, + "logits/chosen": -0.08290469646453857, + "logits/rejected": 0.08052632957696915, + "logps/chosen": -1.34523606300354, + "logps/rejected": -1.4930176734924316, + "loss": 0.8049, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.34523606300354, + "rewards/margins": 0.1477815806865692, + "rewards/rejected": -1.4930176734924316, + "sft_loss": 1.3378939628601074, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 5.88697736522405, + "learning_rate": 1.229946524064171e-06, + "logits/chosen": 0.010171364061534405, + "logits/rejected": 0.08339252322912216, + "logps/chosen": -1.3613780736923218, + "logps/rejected": -1.5585384368896484, + "loss": 0.7995, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3613780736923218, + "rewards/margins": 0.19716022908687592, + "rewards/rejected": -1.5585384368896484, + "sft_loss": 1.309051752090454, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 9.153755459677267, + "learning_rate": 1.2566844919786097e-06, + "logits/chosen": 0.040754929184913635, + "logits/rejected": 0.17237837612628937, + "logps/chosen": -1.3430492877960205, + "logps/rejected": -1.5196940898895264, + "loss": 0.7983, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3430492877960205, + "rewards/margins": 0.17664454877376556, + "rewards/rejected": -1.5196940898895264, + "sft_loss": 1.3170509338378906, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 4.17884202482278, + "learning_rate": 1.2834224598930481e-06, + "logits/chosen": 0.014685697853565216, + "logits/rejected": 0.13835473358631134, + "logps/chosen": -1.3332364559173584, + "logps/rejected": -1.5462366342544556, + "loss": 0.8008, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3332364559173584, + "rewards/margins": 0.21300017833709717, + "rewards/rejected": -1.5462366342544556, + "sft_loss": 1.3689817190170288, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 7.242285914484373, + "learning_rate": 1.3101604278074866e-06, + "logits/chosen": 0.03203669935464859, + "logits/rejected": 0.1445348560810089, + "logps/chosen": -1.4501844644546509, + "logps/rejected": -1.5245463848114014, + "loss": 0.8616, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4501844644546509, + "rewards/margins": 0.0743618980050087, + "rewards/rejected": -1.5245463848114014, + "sft_loss": 1.4526886940002441, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 8.208482870825872, + "learning_rate": 1.3368983957219252e-06, + "logits/chosen": -0.05866866558790207, + "logits/rejected": 0.09590274095535278, + "logps/chosen": -1.342435359954834, + "logps/rejected": -1.4371484518051147, + "loss": 0.854, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.342435359954834, + "rewards/margins": 0.0947132557630539, + "rewards/rejected": -1.4371484518051147, + "sft_loss": 1.3427950143814087, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 7.922536852476926, + "learning_rate": 1.3636363636363636e-06, + "logits/chosen": -0.007726128213107586, + "logits/rejected": 0.1304907500743866, + "logps/chosen": -1.3153059482574463, + "logps/rejected": -1.4615005254745483, + "loss": 0.8035, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3153059482574463, + "rewards/margins": 0.14619456231594086, + "rewards/rejected": -1.4615005254745483, + "sft_loss": 1.2732080221176147, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 4.556168947130132, + "learning_rate": 1.390374331550802e-06, + "logits/chosen": -0.21302637457847595, + "logits/rejected": -0.10784848779439926, + "logps/chosen": -1.401681661605835, + "logps/rejected": -1.5898513793945312, + "loss": 0.7987, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.401681661605835, + "rewards/margins": 0.18816980719566345, + "rewards/rejected": -1.5898513793945312, + "sft_loss": 1.4099761247634888, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 6.962383074856081, + "learning_rate": 1.4171122994652407e-06, + "logits/chosen": -0.06974764168262482, + "logits/rejected": 0.011307650245726109, + "logps/chosen": -1.3614174127578735, + "logps/rejected": -1.5814176797866821, + "loss": 0.8024, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3614174127578735, + "rewards/margins": 0.2200002670288086, + "rewards/rejected": -1.5814176797866821, + "sft_loss": 1.4184319972991943, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 4.12622878270456, + "learning_rate": 1.443850267379679e-06, + "logits/chosen": -0.057353176176548004, + "logits/rejected": 0.06626446545124054, + "logps/chosen": -1.3444592952728271, + "logps/rejected": -1.4615471363067627, + "loss": 0.821, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3444592952728271, + "rewards/margins": 0.11708767712116241, + "rewards/rejected": -1.4615471363067627, + "sft_loss": 1.3590993881225586, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 4.861276699079042, + "learning_rate": 1.4705882352941175e-06, + "logits/chosen": -0.013329749926924706, + "logits/rejected": 0.08376727253198624, + "logps/chosen": -1.285291075706482, + "logps/rejected": -1.5019479990005493, + "loss": 0.7831, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.285291075706482, + "rewards/margins": 0.21665680408477783, + "rewards/rejected": -1.5019479990005493, + "sft_loss": 1.275146245956421, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 8.273378567734328, + "learning_rate": 1.4973262032085562e-06, + "logits/chosen": -0.086963951587677, + "logits/rejected": 0.061237942427396774, + "logps/chosen": -1.3376895189285278, + "logps/rejected": -1.4818319082260132, + "loss": 0.8121, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3376895189285278, + "rewards/margins": 0.14414247870445251, + "rewards/rejected": -1.4818319082260132, + "sft_loss": 1.3308953046798706, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 6.401594954449087, + "learning_rate": 1.5240641711229948e-06, + "logits/chosen": -0.03835974261164665, + "logits/rejected": 0.10481540858745575, + "logps/chosen": -1.3693543672561646, + "logps/rejected": -1.4908616542816162, + "loss": 0.8414, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3693543672561646, + "rewards/margins": 0.12150740623474121, + "rewards/rejected": -1.4908616542816162, + "sft_loss": 1.4108846187591553, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 6.691312802522371, + "learning_rate": 1.5508021390374332e-06, + "logits/chosen": -0.10891927778720856, + "logits/rejected": 0.17821714282035828, + "logps/chosen": -1.3916670083999634, + "logps/rejected": -1.5492541790008545, + "loss": 0.8147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3916670083999634, + "rewards/margins": 0.15758727490901947, + "rewards/rejected": -1.5492541790008545, + "sft_loss": 1.3846977949142456, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 6.145719512127213, + "learning_rate": 1.5775401069518718e-06, + "logits/chosen": -0.04946039989590645, + "logits/rejected": 0.005695834755897522, + "logps/chosen": -1.2853538990020752, + "logps/rejected": -1.4636025428771973, + "loss": 0.7909, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2853538990020752, + "rewards/margins": 0.17824865877628326, + "rewards/rejected": -1.4636025428771973, + "sft_loss": 1.2956326007843018, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 6.108065113429084, + "learning_rate": 1.6042780748663103e-06, + "logits/chosen": -0.06966052949428558, + "logits/rejected": 0.09140712022781372, + "logps/chosen": -1.3252753019332886, + "logps/rejected": -1.4613568782806396, + "loss": 0.8185, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3252753019332886, + "rewards/margins": 0.13608156144618988, + "rewards/rejected": -1.4613568782806396, + "sft_loss": 1.38861882686615, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 4.628431738835329, + "learning_rate": 1.6310160427807487e-06, + "logits/chosen": -0.016732195392251015, + "logits/rejected": 0.05728600174188614, + "logps/chosen": -1.4442397356033325, + "logps/rejected": -1.4708423614501953, + "loss": 0.8895, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4442397356033325, + "rewards/margins": 0.026602596044540405, + "rewards/rejected": -1.4708423614501953, + "sft_loss": 1.4393965005874634, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 6.572080956668939, + "learning_rate": 1.6577540106951873e-06, + "logits/chosen": -0.20191077888011932, + "logits/rejected": -0.11043348163366318, + "logps/chosen": -1.400591492652893, + "logps/rejected": -1.5339974164962769, + "loss": 0.8448, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.400591492652893, + "rewards/margins": 0.13340599834918976, + "rewards/rejected": -1.5339974164962769, + "sft_loss": 1.3988441228866577, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 7.112907967871754, + "learning_rate": 1.6844919786096258e-06, + "logits/chosen": -0.012886536307632923, + "logits/rejected": 0.13783465325832367, + "logps/chosen": -1.4094035625457764, + "logps/rejected": -1.6090682744979858, + "loss": 0.829, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4094035625457764, + "rewards/margins": 0.1996646225452423, + "rewards/rejected": -1.6090682744979858, + "sft_loss": 1.4167879819869995, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 5.460820198591911, + "learning_rate": 1.7112299465240644e-06, + "logits/chosen": -0.07375101000070572, + "logits/rejected": 0.057667456567287445, + "logps/chosen": -1.3837597370147705, + "logps/rejected": -1.4656795263290405, + "loss": 0.8424, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3837597370147705, + "rewards/margins": 0.08191985636949539, + "rewards/rejected": -1.4656795263290405, + "sft_loss": 1.3913055658340454, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 6.045630781542912, + "learning_rate": 1.7379679144385028e-06, + "logits/chosen": -0.142831951379776, + "logits/rejected": -0.027666250243782997, + "logps/chosen": -1.392391324043274, + "logps/rejected": -1.7545061111450195, + "loss": 0.7685, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.392391324043274, + "rewards/margins": 0.36211466789245605, + "rewards/rejected": -1.7545061111450195, + "sft_loss": 1.4733895063400269, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 6.701291122816339, + "learning_rate": 1.7647058823529412e-06, + "logits/chosen": -0.015426212921738625, + "logits/rejected": 0.12235834449529648, + "logps/chosen": -1.4288122653961182, + "logps/rejected": -1.6852651834487915, + "loss": 0.7874, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4288122653961182, + "rewards/margins": 0.2564530670642853, + "rewards/rejected": -1.6852651834487915, + "sft_loss": 1.4210307598114014, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 9.467335957114459, + "learning_rate": 1.7914438502673799e-06, + "logits/chosen": 0.03193669766187668, + "logits/rejected": 0.13159868121147156, + "logps/chosen": -1.4515048265457153, + "logps/rejected": -1.5521752834320068, + "loss": 0.8385, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.4515048265457153, + "rewards/margins": 0.10067038238048553, + "rewards/rejected": -1.5521752834320068, + "sft_loss": 1.4200032949447632, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 18.183367879876617, + "learning_rate": 1.8181818181818183e-06, + "logits/chosen": -0.04067561402916908, + "logits/rejected": 0.09822919219732285, + "logps/chosen": -1.5396358966827393, + "logps/rejected": -1.7132318019866943, + "loss": 0.8609, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5396358966827393, + "rewards/margins": 0.17359602451324463, + "rewards/rejected": -1.7132318019866943, + "sft_loss": 1.483871579170227, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 4.864162372810918, + "learning_rate": 1.8449197860962567e-06, + "logits/chosen": 0.03292379528284073, + "logits/rejected": 0.0665588527917862, + "logps/chosen": -1.3988286256790161, + "logps/rejected": -1.6309274435043335, + "loss": 0.7955, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3988286256790161, + "rewards/margins": 0.23209872841835022, + "rewards/rejected": -1.6309274435043335, + "sft_loss": 1.4277395009994507, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 5.878593972427354, + "learning_rate": 1.8716577540106954e-06, + "logits/chosen": -0.009993275627493858, + "logits/rejected": 0.07647766172885895, + "logps/chosen": -1.339165210723877, + "logps/rejected": -1.51967191696167, + "loss": 0.818, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.339165210723877, + "rewards/margins": 0.18050655722618103, + "rewards/rejected": -1.51967191696167, + "sft_loss": 1.3909133672714233, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 6.8717869858531975, + "learning_rate": 1.8983957219251338e-06, + "logits/chosen": -0.0906287282705307, + "logits/rejected": 0.11716796457767487, + "logps/chosen": -1.4287830591201782, + "logps/rejected": -1.5410370826721191, + "loss": 0.8524, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4287830591201782, + "rewards/margins": 0.11225400120019913, + "rewards/rejected": -1.5410370826721191, + "sft_loss": 1.4500958919525146, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 5.933343645442397, + "learning_rate": 1.9251336898395724e-06, + "logits/chosen": -0.050111234188079834, + "logits/rejected": 0.031887516379356384, + "logps/chosen": -1.3715035915374756, + "logps/rejected": -1.5597730875015259, + "loss": 0.8122, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3715035915374756, + "rewards/margins": 0.1882694959640503, + "rewards/rejected": -1.5597730875015259, + "sft_loss": 1.3476603031158447, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 7.354723918458283, + "learning_rate": 1.951871657754011e-06, + "logits/chosen": 0.03143525868654251, + "logits/rejected": 0.10601303726434708, + "logps/chosen": -1.358642339706421, + "logps/rejected": -1.5123530626296997, + "loss": 0.8255, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.358642339706421, + "rewards/margins": 0.15371054410934448, + "rewards/rejected": -1.5123530626296997, + "sft_loss": 1.3351366519927979, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 5.571755992115051, + "learning_rate": 1.9786096256684493e-06, + "logits/chosen": 0.012722733430564404, + "logits/rejected": 0.10783351957798004, + "logps/chosen": -1.3447506427764893, + "logps/rejected": -1.456059217453003, + "loss": 0.8237, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3447506427764893, + "rewards/margins": 0.11130844056606293, + "rewards/rejected": -1.456059217453003, + "sft_loss": 1.3272932767868042, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 6.290620308688981, + "learning_rate": 2.0053475935828877e-06, + "logits/chosen": -0.052033692598342896, + "logits/rejected": 0.10267876088619232, + "logps/chosen": -1.3179277181625366, + "logps/rejected": -1.5607457160949707, + "loss": 0.7856, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3179277181625366, + "rewards/margins": 0.24281811714172363, + "rewards/rejected": -1.5607457160949707, + "sft_loss": 1.3819175958633423, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 7.164507205084144, + "learning_rate": 2.032085561497326e-06, + "logits/chosen": -0.017917213961482048, + "logits/rejected": 0.06469281017780304, + "logps/chosen": -1.3857579231262207, + "logps/rejected": -1.5951461791992188, + "loss": 0.7983, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3857579231262207, + "rewards/margins": 0.20938809216022491, + "rewards/rejected": -1.5951461791992188, + "sft_loss": 1.389345407485962, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 4.805975521600569, + "learning_rate": 2.058823529411765e-06, + "logits/chosen": 0.04141540080308914, + "logits/rejected": 0.1269879788160324, + "logps/chosen": -1.487316608428955, + "logps/rejected": -1.549757719039917, + "loss": 0.8833, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.487316608428955, + "rewards/margins": 0.062441110610961914, + "rewards/rejected": -1.549757719039917, + "sft_loss": 1.4863076210021973, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 7.329243789675164, + "learning_rate": 2.0855614973262034e-06, + "logits/chosen": 0.08908367156982422, + "logits/rejected": 0.2541596293449402, + "logps/chosen": -1.49787437915802, + "logps/rejected": -1.6316359043121338, + "loss": 0.8545, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.49787437915802, + "rewards/margins": 0.1337617188692093, + "rewards/rejected": -1.6316359043121338, + "sft_loss": 1.4867982864379883, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 6.757557326800351, + "learning_rate": 2.112299465240642e-06, + "logits/chosen": -0.006264881696552038, + "logits/rejected": 0.16049674153327942, + "logps/chosen": -1.4501184225082397, + "logps/rejected": -1.5559766292572021, + "loss": 0.8322, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4501184225082397, + "rewards/margins": 0.10585812479257584, + "rewards/rejected": -1.5559766292572021, + "sft_loss": 1.4444706439971924, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 7.791777236601081, + "learning_rate": 2.1390374331550802e-06, + "logits/chosen": 0.08647340536117554, + "logits/rejected": 0.17804650962352753, + "logps/chosen": -1.4405090808868408, + "logps/rejected": -1.6294504404067993, + "loss": 0.7978, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4405090808868408, + "rewards/margins": 0.18894155323505402, + "rewards/rejected": -1.6294504404067993, + "sft_loss": 1.4118216037750244, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.2666130065917969, + "eval_logits/rejected": 0.3544902503490448, + "eval_logps/chosen": -1.4694241285324097, + "eval_logps/rejected": -1.7231982946395874, + "eval_loss": 0.7942562699317932, + "eval_rewards/accuracies": 0.6045994162559509, + "eval_rewards/chosen": -1.4694241285324097, + "eval_rewards/margins": 0.25377434492111206, + "eval_rewards/rejected": -1.7231982946395874, + "eval_runtime": 45.6824, + "eval_samples_per_second": 29.442, + "eval_sft_loss": 1.4613449573516846, + "eval_steps_per_second": 7.377, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 7.623090780501526, + "learning_rate": 2.1657754010695186e-06, + "logits/chosen": -0.006093514151871204, + "logits/rejected": 0.08722630143165588, + "logps/chosen": -1.4930610656738281, + "logps/rejected": -1.7053945064544678, + "loss": 0.8255, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4930610656738281, + "rewards/margins": 0.2123335301876068, + "rewards/rejected": -1.7053945064544678, + "sft_loss": 1.4521852731704712, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 6.970243240124782, + "learning_rate": 2.192513368983957e-06, + "logits/chosen": 0.0605601966381073, + "logits/rejected": 0.1913018524646759, + "logps/chosen": -1.4390863180160522, + "logps/rejected": -1.6598100662231445, + "loss": 0.8031, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4390863180160522, + "rewards/margins": 0.2207239419221878, + "rewards/rejected": -1.6598100662231445, + "sft_loss": 1.4593582153320312, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 5.0092673817044595, + "learning_rate": 2.219251336898396e-06, + "logits/chosen": 0.055453162640333176, + "logits/rejected": 0.09989786893129349, + "logps/chosen": -1.469822883605957, + "logps/rejected": -1.7281382083892822, + "loss": 0.7949, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.469822883605957, + "rewards/margins": 0.2583152651786804, + "rewards/rejected": -1.7281382083892822, + "sft_loss": 1.4433677196502686, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 6.690456611021986, + "learning_rate": 2.2459893048128343e-06, + "logits/chosen": 0.027899065986275673, + "logits/rejected": 0.23041871190071106, + "logps/chosen": -1.4033445119857788, + "logps/rejected": -1.6399847269058228, + "loss": 0.8002, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4033445119857788, + "rewards/margins": 0.23664002120494843, + "rewards/rejected": -1.6399847269058228, + "sft_loss": 1.4191946983337402, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 6.857911690897056, + "learning_rate": 2.2727272727272728e-06, + "logits/chosen": 0.0018939822912216187, + "logits/rejected": 0.20424532890319824, + "logps/chosen": -1.4903066158294678, + "logps/rejected": -1.8016868829727173, + "loss": 0.785, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4903066158294678, + "rewards/margins": 0.31138020753860474, + "rewards/rejected": -1.8016868829727173, + "sft_loss": 1.5297497510910034, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 7.467088529413112, + "learning_rate": 2.299465240641711e-06, + "logits/chosen": -0.03759707883000374, + "logits/rejected": 0.16821780800819397, + "logps/chosen": -1.475658655166626, + "logps/rejected": -1.8194904327392578, + "loss": 0.782, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.475658655166626, + "rewards/margins": 0.34383195638656616, + "rewards/rejected": -1.8194904327392578, + "sft_loss": 1.495807409286499, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 8.391376519531686, + "learning_rate": 2.3262032085561496e-06, + "logits/chosen": 0.03897671774029732, + "logits/rejected": 0.1309877336025238, + "logps/chosen": -1.4078749418258667, + "logps/rejected": -1.6813606023788452, + "loss": 0.7738, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4078749418258667, + "rewards/margins": 0.27348554134368896, + "rewards/rejected": -1.6813606023788452, + "sft_loss": 1.4570095539093018, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 6.652392161153766, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": 0.03420887142419815, + "logits/rejected": 0.13482454419136047, + "logps/chosen": -1.5327328443527222, + "logps/rejected": -1.7989847660064697, + "loss": 0.8044, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5327328443527222, + "rewards/margins": 0.26625189185142517, + "rewards/rejected": -1.7989847660064697, + "sft_loss": 1.502314805984497, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 13.064384890919003, + "learning_rate": 2.379679144385027e-06, + "logits/chosen": -0.004420773591846228, + "logits/rejected": 0.11586644500494003, + "logps/chosen": -1.6088615655899048, + "logps/rejected": -2.0403130054473877, + "loss": 0.7994, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6088615655899048, + "rewards/margins": 0.43145138025283813, + "rewards/rejected": -2.0403130054473877, + "sft_loss": 1.5593528747558594, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 10.216812585799813, + "learning_rate": 2.4064171122994653e-06, + "logits/chosen": 0.03817467391490936, + "logits/rejected": 0.16011175513267517, + "logps/chosen": -1.5901763439178467, + "logps/rejected": -1.9021117687225342, + "loss": 0.7844, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5901763439178467, + "rewards/margins": 0.31193557381629944, + "rewards/rejected": -1.9021117687225342, + "sft_loss": 1.5255615711212158, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 7.038546807134959, + "learning_rate": 2.4331550802139037e-06, + "logits/chosen": -0.0005314975860528648, + "logits/rejected": 0.08741128444671631, + "logps/chosen": -1.4183223247528076, + "logps/rejected": -1.8327878713607788, + "loss": 0.7654, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4183223247528076, + "rewards/margins": 0.4144655764102936, + "rewards/rejected": -1.8327878713607788, + "sft_loss": 1.4563426971435547, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 5.275609189536791, + "learning_rate": 2.459893048128342e-06, + "logits/chosen": -0.1312110722064972, + "logits/rejected": -0.017421646043658257, + "logps/chosen": -1.5381498336791992, + "logps/rejected": -1.7059634923934937, + "loss": 0.8502, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5381498336791992, + "rewards/margins": 0.16781362891197205, + "rewards/rejected": -1.7059634923934937, + "sft_loss": 1.5450729131698608, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 6.041385703241205, + "learning_rate": 2.4866310160427806e-06, + "logits/chosen": 0.10603974759578705, + "logits/rejected": 0.12657591700553894, + "logps/chosen": -1.4376887083053589, + "logps/rejected": -1.6697343587875366, + "loss": 0.8092, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4376887083053589, + "rewards/margins": 0.2320457249879837, + "rewards/rejected": -1.6697343587875366, + "sft_loss": 1.4443788528442383, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 6.11710821245787, + "learning_rate": 2.5133689839572194e-06, + "logits/chosen": 0.14308522641658783, + "logits/rejected": 0.0967307984828949, + "logps/chosen": -1.394923448562622, + "logps/rejected": -1.6780554056167603, + "loss": 0.7755, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.394923448562622, + "rewards/margins": 0.2831319272518158, + "rewards/rejected": -1.6780554056167603, + "sft_loss": 1.418574571609497, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 6.198722703692081, + "learning_rate": 2.540106951871658e-06, + "logits/chosen": -0.09760783612728119, + "logits/rejected": 0.01914280094206333, + "logps/chosen": -1.4242517948150635, + "logps/rejected": -1.8115177154541016, + "loss": 0.7662, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4242517948150635, + "rewards/margins": 0.3872661590576172, + "rewards/rejected": -1.8115177154541016, + "sft_loss": 1.4599872827529907, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 6.672155844526494, + "learning_rate": 2.5668449197860963e-06, + "logits/chosen": -0.088392473757267, + "logits/rejected": 0.1002928838133812, + "logps/chosen": -1.3997070789337158, + "logps/rejected": -1.6192430257797241, + "loss": 0.799, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3997070789337158, + "rewards/margins": 0.21953590214252472, + "rewards/rejected": -1.6192430257797241, + "sft_loss": 1.4144244194030762, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 7.2195539216686075, + "learning_rate": 2.5935828877005347e-06, + "logits/chosen": -0.06334365159273148, + "logits/rejected": -0.02100563421845436, + "logps/chosen": -1.516610860824585, + "logps/rejected": -1.7579715251922607, + "loss": 0.8013, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.516610860824585, + "rewards/margins": 0.24136073887348175, + "rewards/rejected": -1.7579715251922607, + "sft_loss": 1.4901747703552246, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 12.734882055380117, + "learning_rate": 2.620320855614973e-06, + "logits/chosen": -0.03537796065211296, + "logits/rejected": 0.03429726883769035, + "logps/chosen": -1.5109494924545288, + "logps/rejected": -1.7095212936401367, + "loss": 0.8352, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5109494924545288, + "rewards/margins": 0.19857171177864075, + "rewards/rejected": -1.7095212936401367, + "sft_loss": 1.469315528869629, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 7.778351147647691, + "learning_rate": 2.647058823529412e-06, + "logits/chosen": -0.10972901433706284, + "logits/rejected": -0.08436641842126846, + "logps/chosen": -1.5129988193511963, + "logps/rejected": -1.6879619359970093, + "loss": 0.8408, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5129988193511963, + "rewards/margins": 0.17496302723884583, + "rewards/rejected": -1.6879619359970093, + "sft_loss": 1.5606034994125366, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 5.42051394350465, + "learning_rate": 2.6737967914438504e-06, + "logits/chosen": -0.1078430786728859, + "logits/rejected": -0.01456520240753889, + "logps/chosen": -1.383042812347412, + "logps/rejected": -1.6418497562408447, + "loss": 0.7912, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.383042812347412, + "rewards/margins": 0.25880688428878784, + "rewards/rejected": -1.6418497562408447, + "sft_loss": 1.4060033559799194, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 7.468246318755707, + "learning_rate": 2.700534759358289e-06, + "logits/chosen": -0.10797281563282013, + "logits/rejected": 0.03077404573559761, + "logps/chosen": -1.4886077642440796, + "logps/rejected": -1.6428436040878296, + "loss": 0.8322, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4886077642440796, + "rewards/margins": 0.15423579514026642, + "rewards/rejected": -1.6428436040878296, + "sft_loss": 1.4929144382476807, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 5.792355588006178, + "learning_rate": 2.7272727272727272e-06, + "logits/chosen": 0.043688975274562836, + "logits/rejected": 0.10481522977352142, + "logps/chosen": -1.439705491065979, + "logps/rejected": -1.7611348628997803, + "loss": 0.7658, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.439705491065979, + "rewards/margins": 0.32142946124076843, + "rewards/rejected": -1.7611348628997803, + "sft_loss": 1.4002330303192139, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 5.017281691975952, + "learning_rate": 2.7540106951871656e-06, + "logits/chosen": 0.0008904725546017289, + "logits/rejected": 0.09226072579622269, + "logps/chosen": -1.3720366954803467, + "logps/rejected": -1.6341902017593384, + "loss": 0.7902, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3720366954803467, + "rewards/margins": 0.262153297662735, + "rewards/rejected": -1.6341902017593384, + "sft_loss": 1.401508092880249, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 5.805813893255752, + "learning_rate": 2.780748663101604e-06, + "logits/chosen": -0.13670828938484192, + "logits/rejected": -0.0036086924374103546, + "logps/chosen": -1.4570739269256592, + "logps/rejected": -1.7094089984893799, + "loss": 0.8105, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4570739269256592, + "rewards/margins": 0.2523351311683655, + "rewards/rejected": -1.7094089984893799, + "sft_loss": 1.5445858240127563, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 13.47963552520056, + "learning_rate": 2.807486631016043e-06, + "logits/chosen": 0.04953201487660408, + "logits/rejected": 0.11939598619937897, + "logps/chosen": -1.4688103199005127, + "logps/rejected": -1.7682476043701172, + "loss": 0.8014, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4688103199005127, + "rewards/margins": 0.29943740367889404, + "rewards/rejected": -1.7682476043701172, + "sft_loss": 1.5212879180908203, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 6.1329588367259005, + "learning_rate": 2.8342245989304813e-06, + "logits/chosen": -0.01726909913122654, + "logits/rejected": 0.06347990781068802, + "logps/chosen": -1.4241316318511963, + "logps/rejected": -1.7015587091445923, + "loss": 0.7945, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4241316318511963, + "rewards/margins": 0.2774270474910736, + "rewards/rejected": -1.7015587091445923, + "sft_loss": 1.3766034841537476, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 5.483486136121046, + "learning_rate": 2.8609625668449198e-06, + "logits/chosen": -0.18864202499389648, + "logits/rejected": 0.040623150765895844, + "logps/chosen": -1.4134786128997803, + "logps/rejected": -1.638918161392212, + "loss": 0.7908, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4134786128997803, + "rewards/margins": 0.22543945908546448, + "rewards/rejected": -1.638918161392212, + "sft_loss": 1.3824502229690552, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 8.233006873244952, + "learning_rate": 2.887700534759358e-06, + "logits/chosen": -0.08203691244125366, + "logits/rejected": -0.01693258062005043, + "logps/chosen": -1.5680696964263916, + "logps/rejected": -1.7938575744628906, + "loss": 0.8159, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5680696964263916, + "rewards/margins": 0.22578780353069305, + "rewards/rejected": -1.7938575744628906, + "sft_loss": 1.5657321214675903, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 5.350671086008654, + "learning_rate": 2.9144385026737966e-06, + "logits/chosen": -0.19635380804538727, + "logits/rejected": -0.014688762836158276, + "logps/chosen": -1.4838629961013794, + "logps/rejected": -1.8379631042480469, + "loss": 0.7691, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4838629961013794, + "rewards/margins": 0.35410040616989136, + "rewards/rejected": -1.8379631042480469, + "sft_loss": 1.482094407081604, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 5.3327033465197236, + "learning_rate": 2.941176470588235e-06, + "logits/chosen": -0.07292584329843521, + "logits/rejected": -0.01656881347298622, + "logps/chosen": -1.5360199213027954, + "logps/rejected": -1.8137409687042236, + "loss": 0.7828, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5360199213027954, + "rewards/margins": 0.2777208983898163, + "rewards/rejected": -1.8137409687042236, + "sft_loss": 1.479871153831482, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 8.691037453690216, + "learning_rate": 2.967914438502674e-06, + "logits/chosen": -0.18928229808807373, + "logits/rejected": -0.08067715167999268, + "logps/chosen": -1.572627067565918, + "logps/rejected": -1.7626476287841797, + "loss": 0.8448, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.572627067565918, + "rewards/margins": 0.19002054631710052, + "rewards/rejected": -1.7626476287841797, + "sft_loss": 1.5602760314941406, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 4.949138893882736, + "learning_rate": 2.9946524064171123e-06, + "logits/chosen": -0.03637278825044632, + "logits/rejected": -0.019715752452611923, + "logps/chosen": -1.4272973537445068, + "logps/rejected": -1.723838210105896, + "loss": 0.7948, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4272973537445068, + "rewards/margins": 0.29654109477996826, + "rewards/rejected": -1.723838210105896, + "sft_loss": 1.5645406246185303, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 4.689383987788954, + "learning_rate": 2.999995343036539e-06, + "logits/chosen": -0.038937196135520935, + "logits/rejected": 0.014615943655371666, + "logps/chosen": -1.4650460481643677, + "logps/rejected": -1.75836181640625, + "loss": 0.7855, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4650460481643677, + "rewards/margins": 0.29331594705581665, + "rewards/rejected": -1.75836181640625, + "sft_loss": 1.5022021532058716, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 8.336253133816138, + "learning_rate": 2.9999764241720397e-06, + "logits/chosen": -0.14228703081607819, + "logits/rejected": 0.06473597139120102, + "logps/chosen": -1.4488310813903809, + "logps/rejected": -1.7192370891571045, + "loss": 0.7972, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4488310813903809, + "rewards/margins": 0.27040624618530273, + "rewards/rejected": -1.7192370891571045, + "sft_loss": 1.5263842344284058, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 5.718887405179139, + "learning_rate": 2.9999429525296936e-06, + "logits/chosen": -0.12334390729665756, + "logits/rejected": -0.06649667024612427, + "logps/chosen": -1.4215309619903564, + "logps/rejected": -1.7182035446166992, + "loss": 0.7879, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4215309619903564, + "rewards/margins": 0.2966725826263428, + "rewards/rejected": -1.7182035446166992, + "sft_loss": 1.4421032667160034, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 5.482762190871826, + "learning_rate": 2.9998949284342434e-06, + "logits/chosen": -0.12731342017650604, + "logits/rejected": 0.018348468467593193, + "logps/chosen": -1.4773890972137451, + "logps/rejected": -1.9029382467269897, + "loss": 0.7485, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4773890972137451, + "rewards/margins": 0.4255490303039551, + "rewards/rejected": -1.9029382467269897, + "sft_loss": 1.5035072565078735, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 7.504671278923474, + "learning_rate": 2.99983235235162e-06, + "logits/chosen": -0.19935433566570282, + "logits/rejected": -0.0978468805551529, + "logps/chosen": -1.7026008367538452, + "logps/rejected": -1.903778314590454, + "loss": 0.8722, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.7026008367538452, + "rewards/margins": 0.20117728412151337, + "rewards/rejected": -1.903778314590454, + "sft_loss": 1.6554231643676758, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 8.419389774679347, + "learning_rate": 2.999755224888935e-06, + "logits/chosen": -0.15665586292743683, + "logits/rejected": -0.043528031557798386, + "logps/chosen": -1.57762610912323, + "logps/rejected": -1.7399566173553467, + "loss": 0.8459, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.57762610912323, + "rewards/margins": 0.16233067214488983, + "rewards/rejected": -1.7399566173553467, + "sft_loss": 1.5726512670516968, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 6.543602385073035, + "learning_rate": 2.9996635467944813e-06, + "logits/chosen": -0.06690023839473724, + "logits/rejected": 0.0520065613090992, + "logps/chosen": -1.449690818786621, + "logps/rejected": -1.726157784461975, + "loss": 0.7931, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.449690818786621, + "rewards/margins": 0.2764667570590973, + "rewards/rejected": -1.726157784461975, + "sft_loss": 1.4725532531738281, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 5.818392123470136, + "learning_rate": 2.999557318957719e-06, + "logits/chosen": -0.13861538469791412, + "logits/rejected": 0.00308010121807456, + "logps/chosen": -1.4782884120941162, + "logps/rejected": -1.6739435195922852, + "loss": 0.8127, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4782884120941162, + "rewards/margins": 0.19565510749816895, + "rewards/rejected": -1.6739435195922852, + "sft_loss": 1.4908336400985718, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 8.237093368806667, + "learning_rate": 2.9994365424092717e-06, + "logits/chosen": -0.1953982412815094, + "logits/rejected": -0.11716214567422867, + "logps/chosen": -1.5879408121109009, + "logps/rejected": -1.8976964950561523, + "loss": 0.8077, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5879408121109009, + "rewards/margins": 0.3097555637359619, + "rewards/rejected": -1.8976964950561523, + "sft_loss": 1.5898962020874023, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 12.66322925163889, + "learning_rate": 2.9993012183209135e-06, + "logits/chosen": -0.07549289613962173, + "logits/rejected": 0.07459872961044312, + "logps/chosen": -1.5181553363800049, + "logps/rejected": -1.8104722499847412, + "loss": 0.8062, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5181553363800049, + "rewards/margins": 0.2923170328140259, + "rewards/rejected": -1.8104722499847412, + "sft_loss": 1.50444757938385, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 6.40661361783137, + "learning_rate": 2.9991513480055592e-06, + "logits/chosen": -0.16410446166992188, + "logits/rejected": -0.05960635468363762, + "logps/chosen": -1.53480863571167, + "logps/rejected": -1.9423625469207764, + "loss": 0.7633, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.53480863571167, + "rewards/margins": 0.4075539708137512, + "rewards/rejected": -1.9423625469207764, + "sft_loss": 1.5325727462768555, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 5.863570720455386, + "learning_rate": 2.998986932917252e-06, + "logits/chosen": -0.04310298711061478, + "logits/rejected": 0.019075483083724976, + "logps/chosen": -1.607175588607788, + "logps/rejected": -1.897870659828186, + "loss": 0.808, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.607175588607788, + "rewards/margins": 0.2906948924064636, + "rewards/rejected": -1.897870659828186, + "sft_loss": 1.605529546737671, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 7.59931372209281, + "learning_rate": 2.998807974651147e-06, + "logits/chosen": -0.0006595879676751792, + "logits/rejected": 0.11075425148010254, + "logps/chosen": -1.487177848815918, + "logps/rejected": -1.888089895248413, + "loss": 0.7655, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.487177848815918, + "rewards/margins": 0.4009120464324951, + "rewards/rejected": -1.888089895248413, + "sft_loss": 1.5196160078048706, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 10.94150119403967, + "learning_rate": 2.9986144749434987e-06, + "logits/chosen": -0.06883852183818817, + "logits/rejected": 0.04102737456560135, + "logps/chosen": -1.5510679483413696, + "logps/rejected": -1.9344244003295898, + "loss": 0.7503, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5510679483413696, + "rewards/margins": 0.38335639238357544, + "rewards/rejected": -1.9344244003295898, + "sft_loss": 1.5082122087478638, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 5.977494774665963, + "learning_rate": 2.9984064356716413e-06, + "logits/chosen": -0.05886172130703926, + "logits/rejected": 0.18332821130752563, + "logps/chosen": -1.6331151723861694, + "logps/rejected": -1.9419275522232056, + "loss": 0.8171, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6331151723861694, + "rewards/margins": 0.30881232023239136, + "rewards/rejected": -1.9419275522232056, + "sft_loss": 1.6244218349456787, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 15.23682265863437, + "learning_rate": 2.998183858853974e-06, + "logits/chosen": -0.16301007568836212, + "logits/rejected": 0.0349007211625576, + "logps/chosen": -1.6005462408065796, + "logps/rejected": -1.9187949895858765, + "loss": 0.8161, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.6005462408065796, + "rewards/margins": 0.31824877858161926, + "rewards/rejected": -1.9187949895858765, + "sft_loss": 1.648459792137146, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 6.423514529899254, + "learning_rate": 2.997946746649937e-06, + "logits/chosen": -0.12605305016040802, + "logits/rejected": -0.04022118076682091, + "logps/chosen": -1.4993903636932373, + "logps/rejected": -1.9236873388290405, + "loss": 0.749, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4993903636932373, + "rewards/margins": 0.4242970943450928, + "rewards/rejected": -1.9236873388290405, + "sft_loss": 1.4816380739212036, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 8.148623275430047, + "learning_rate": 2.997695101359994e-06, + "logits/chosen": -0.10559117794036865, + "logits/rejected": 0.036014266312122345, + "logps/chosen": -1.6362364292144775, + "logps/rejected": -1.9686295986175537, + "loss": 0.7973, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6362364292144775, + "rewards/margins": 0.3323933184146881, + "rewards/rejected": -1.9686295986175537, + "sft_loss": 1.6498661041259766, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 12.37187154833635, + "learning_rate": 2.997428925425609e-06, + "logits/chosen": -0.03357526287436485, + "logits/rejected": -0.016637753695249557, + "logps/chosen": -1.5325089693069458, + "logps/rejected": -1.891472578048706, + "loss": 0.8019, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5325089693069458, + "rewards/margins": 0.3589635193347931, + "rewards/rejected": -1.891472578048706, + "sft_loss": 1.5624765157699585, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 7.119224348231651, + "learning_rate": 2.997148221429223e-06, + "logits/chosen": -0.044113095849752426, + "logits/rejected": 0.07033444941043854, + "logps/chosen": -1.4734444618225098, + "logps/rejected": -1.6820869445800781, + "loss": 0.8191, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4734444618225098, + "rewards/margins": 0.20864248275756836, + "rewards/rejected": -1.6820869445800781, + "sft_loss": 1.510671854019165, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 7.052877336685435, + "learning_rate": 2.996852992094225e-06, + "logits/chosen": -0.07160480320453644, + "logits/rejected": 0.055411409586668015, + "logps/chosen": -1.4131146669387817, + "logps/rejected": -1.682700514793396, + "loss": 0.7936, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4131146669387817, + "rewards/margins": 0.2695859670639038, + "rewards/rejected": -1.682700514793396, + "sft_loss": 1.4746235609054565, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 4.749428938217011, + "learning_rate": 2.9965432402849336e-06, + "logits/chosen": -0.09310106933116913, + "logits/rejected": 0.09594430029392242, + "logps/chosen": -1.4437072277069092, + "logps/rejected": -1.6759834289550781, + "loss": 0.7985, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4437072277069092, + "rewards/margins": 0.23227599263191223, + "rewards/rejected": -1.6759834289550781, + "sft_loss": 1.5473780632019043, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 5.946458004286902, + "learning_rate": 2.9962189690065614e-06, + "logits/chosen": -0.0892430767416954, + "logits/rejected": -0.020457569509744644, + "logps/chosen": -1.4541094303131104, + "logps/rejected": -1.8192046880722046, + "loss": 0.7613, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4541094303131104, + "rewards/margins": 0.3650952875614166, + "rewards/rejected": -1.8192046880722046, + "sft_loss": 1.5087134838104248, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 5.708891942201608, + "learning_rate": 2.99588018140519e-06, + "logits/chosen": -0.013011714443564415, + "logits/rejected": 0.1486460268497467, + "logps/chosen": -1.5294251441955566, + "logps/rejected": -1.7828242778778076, + "loss": 0.855, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.5294251441955566, + "rewards/margins": 0.2533993124961853, + "rewards/rejected": -1.7828242778778076, + "sft_loss": 1.5098159313201904, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 10.640059246416705, + "learning_rate": 2.995526880767737e-06, + "logits/chosen": -0.04731638729572296, + "logits/rejected": 0.10229668766260147, + "logps/chosen": -1.5006544589996338, + "logps/rejected": -1.7863868474960327, + "loss": 0.8041, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5006544589996338, + "rewards/margins": 0.28573232889175415, + "rewards/rejected": -1.7863868474960327, + "sft_loss": 1.506460428237915, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 7.265291049700372, + "learning_rate": 2.9951590705219287e-06, + "logits/chosen": -0.07539691030979156, + "logits/rejected": -0.03621528297662735, + "logps/chosen": -1.5116404294967651, + "logps/rejected": -1.7689307928085327, + "loss": 0.8229, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.5116404294967651, + "rewards/margins": 0.257290244102478, + "rewards/rejected": -1.7689307928085327, + "sft_loss": 1.5742204189300537, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 7.374285279419172, + "learning_rate": 2.99477675423626e-06, + "logits/chosen": -0.10970363765954971, + "logits/rejected": -0.021756969392299652, + "logps/chosen": -1.4470633268356323, + "logps/rejected": -1.776341199874878, + "loss": 0.7694, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4470633268356323, + "rewards/margins": 0.3292779326438904, + "rewards/rejected": -1.776341199874878, + "sft_loss": 1.4733977317810059, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 8.942277187244555, + "learning_rate": 2.994379935619966e-06, + "logits/chosen": -0.21716026961803436, + "logits/rejected": -0.07959534227848053, + "logps/chosen": -1.6371448040008545, + "logps/rejected": -1.8145755529403687, + "loss": 0.8325, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6371448040008545, + "rewards/margins": 0.17743055522441864, + "rewards/rejected": -1.8145755529403687, + "sft_loss": 1.5957849025726318, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 5.15267596889546, + "learning_rate": 2.9939686185229826e-06, + "logits/chosen": -0.19370415806770325, + "logits/rejected": -0.0161186084151268, + "logps/chosen": -1.5457772016525269, + "logps/rejected": -1.9858157634735107, + "loss": 0.7568, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5457772016525269, + "rewards/margins": 0.4400387704372406, + "rewards/rejected": -1.9858157634735107, + "sft_loss": 1.551759958267212, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 8.31494704459637, + "learning_rate": 2.9935428069359103e-06, + "logits/chosen": -0.06963808834552765, + "logits/rejected": 0.02257571741938591, + "logps/chosen": -1.538611888885498, + "logps/rejected": -1.8600261211395264, + "loss": 0.7655, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.538611888885498, + "rewards/margins": 0.32141461968421936, + "rewards/rejected": -1.8600261211395264, + "sft_loss": 1.5300943851470947, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 12.46625655578267, + "learning_rate": 2.9931025049899744e-06, + "logits/chosen": -0.16380861401557922, + "logits/rejected": 0.0008494898793287575, + "logps/chosen": -1.6448299884796143, + "logps/rejected": -1.9350553750991821, + "loss": 0.791, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6448299884796143, + "rewards/margins": 0.2902253568172455, + "rewards/rejected": -1.9350553750991821, + "sft_loss": 1.5562912225723267, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 7.382525036999301, + "learning_rate": 2.9926477169569865e-06, + "logits/chosen": -0.07919908314943314, + "logits/rejected": 0.1075511947274208, + "logps/chosen": -1.721343994140625, + "logps/rejected": -2.0451881885528564, + "loss": 0.8448, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.721343994140625, + "rewards/margins": 0.32384395599365234, + "rewards/rejected": -2.0451881885528564, + "sft_loss": 1.6712074279785156, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 6.674161020309135, + "learning_rate": 2.9921784472493023e-06, + "logits/chosen": -0.16972795128822327, + "logits/rejected": -0.03377728909254074, + "logps/chosen": -1.4200472831726074, + "logps/rejected": -1.7975457906723022, + "loss": 0.7429, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4200472831726074, + "rewards/margins": 0.3774986267089844, + "rewards/rejected": -1.7975457906723022, + "sft_loss": 1.4895946979522705, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 6.0055759116105785, + "learning_rate": 2.9916947004197784e-06, + "logits/chosen": -0.25825509428977966, + "logits/rejected": -0.10662020742893219, + "logps/chosen": -1.5190376043319702, + "logps/rejected": -1.762810468673706, + "loss": 0.8038, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5190376043319702, + "rewards/margins": 0.2437729835510254, + "rewards/rejected": -1.762810468673706, + "sft_loss": 1.5322901010513306, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 5.420287019764337, + "learning_rate": 2.9911964811617288e-06, + "logits/chosen": -0.22347505390644073, + "logits/rejected": -0.127300426363945, + "logps/chosen": -1.530833125114441, + "logps/rejected": -1.767128586769104, + "loss": 0.8071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.530833125114441, + "rewards/margins": 0.23629550635814667, + "rewards/rejected": -1.767128586769104, + "sft_loss": 1.5706127882003784, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 9.893102035807134, + "learning_rate": 2.990683794308879e-06, + "logits/chosen": -0.20974227786064148, + "logits/rejected": -0.048324812203645706, + "logps/chosen": -1.6057713031768799, + "logps/rejected": -1.8644367456436157, + "loss": 0.813, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6057713031768799, + "rewards/margins": 0.25866541266441345, + "rewards/rejected": -1.8644367456436157, + "sft_loss": 1.5996134281158447, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 5.124644195602197, + "learning_rate": 2.990156644835318e-06, + "logits/chosen": -0.10994444787502289, + "logits/rejected": -0.04099906235933304, + "logps/chosen": -1.5882176160812378, + "logps/rejected": -1.9525806903839111, + "loss": 0.7969, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5882176160812378, + "rewards/margins": 0.36436301469802856, + "rewards/rejected": -1.9525806903839111, + "sft_loss": 1.5708587169647217, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 4.277626498532557, + "learning_rate": 2.989615037855454e-06, + "logits/chosen": -0.20711994171142578, + "logits/rejected": -0.0582943931221962, + "logps/chosen": -1.5332410335540771, + "logps/rejected": -1.9215202331542969, + "loss": 0.7673, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5332410335540771, + "rewards/margins": 0.3882790207862854, + "rewards/rejected": -1.9215202331542969, + "sft_loss": 1.548135757446289, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 6.281198041488248, + "learning_rate": 2.98905897862396e-06, + "logits/chosen": -0.13693243265151978, + "logits/rejected": -0.014755794778466225, + "logps/chosen": -1.5604966878890991, + "logps/rejected": -1.7795257568359375, + "loss": 0.8295, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5604966878890991, + "rewards/margins": 0.21902918815612793, + "rewards/rejected": -1.7795257568359375, + "sft_loss": 1.5735958814620972, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 6.312433351409002, + "learning_rate": 2.9884884725357237e-06, + "logits/chosen": -0.2737407088279724, + "logits/rejected": -0.20347031950950623, + "logps/chosen": -1.5391143560409546, + "logps/rejected": -1.8454147577285767, + "loss": 0.7853, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5391143560409546, + "rewards/margins": 0.3063003718852997, + "rewards/rejected": -1.8454147577285767, + "sft_loss": 1.5820717811584473, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 5.678304869448927, + "learning_rate": 2.9879035251257994e-06, + "logits/chosen": -0.2150966376066208, + "logits/rejected": -0.129004567861557, + "logps/chosen": -1.5198668241500854, + "logps/rejected": -1.7554266452789307, + "loss": 0.7937, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5198668241500854, + "rewards/margins": 0.23555977642536163, + "rewards/rejected": -1.7554266452789307, + "sft_loss": 1.5115150213241577, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 6.440836345300536, + "learning_rate": 2.9873041420693485e-06, + "logits/chosen": -0.1084686741232872, + "logits/rejected": 0.018399396911263466, + "logps/chosen": -1.4943346977233887, + "logps/rejected": -1.9200446605682373, + "loss": 0.7576, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4943346977233887, + "rewards/margins": 0.42570990324020386, + "rewards/rejected": -1.9200446605682373, + "sft_loss": 1.489121913909912, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 5.5172545468887035, + "learning_rate": 2.9866903291815874e-06, + "logits/chosen": -0.2743126451969147, + "logits/rejected": -0.1022576093673706, + "logps/chosen": -1.542785406112671, + "logps/rejected": -1.8750007152557373, + "loss": 0.792, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.542785406112671, + "rewards/margins": 0.3322153389453888, + "rewards/rejected": -1.8750007152557373, + "sft_loss": 1.4891480207443237, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 4.461063714871936, + "learning_rate": 2.986062092417733e-06, + "logits/chosen": -0.34281599521636963, + "logits/rejected": -0.2016495019197464, + "logps/chosen": -1.4909117221832275, + "logps/rejected": -1.7951453924179077, + "loss": 0.7799, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4909117221832275, + "rewards/margins": 0.30423372983932495, + "rewards/rejected": -1.7951453924179077, + "sft_loss": 1.5366796255111694, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 6.228539620529092, + "learning_rate": 2.9854194378729402e-06, + "logits/chosen": -0.2008173018693924, + "logits/rejected": -0.06987674534320831, + "logps/chosen": -1.5338191986083984, + "logps/rejected": -1.9270120859146118, + "loss": 0.7556, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5338191986083984, + "rewards/margins": 0.39319270849227905, + "rewards/rejected": -1.9270120859146118, + "sft_loss": 1.5258783102035522, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 5.831894821473878, + "learning_rate": 2.984762371782246e-06, + "logits/chosen": -0.2618991732597351, + "logits/rejected": -0.1274309605360031, + "logps/chosen": -1.5179414749145508, + "logps/rejected": -1.9443788528442383, + "loss": 0.7495, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5179414749145508, + "rewards/margins": 0.4264375567436218, + "rewards/rejected": -1.9443788528442383, + "sft_loss": 1.5014102458953857, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 10.386180837746535, + "learning_rate": 2.9840909005205093e-06, + "logits/chosen": -0.28757327795028687, + "logits/rejected": -0.09849077463150024, + "logps/chosen": -1.5623376369476318, + "logps/rejected": -2.1427266597747803, + "loss": 0.7518, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5623376369476318, + "rewards/margins": 0.5803892016410828, + "rewards/rejected": -2.1427266597747803, + "sft_loss": 1.5638837814331055, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 6.533690446407395, + "learning_rate": 2.9834050306023467e-06, + "logits/chosen": -0.22992396354675293, + "logits/rejected": -0.14586150646209717, + "logps/chosen": -1.5596520900726318, + "logps/rejected": -1.93422532081604, + "loss": 0.7634, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5596520900726318, + "rewards/margins": 0.374573290348053, + "rewards/rejected": -1.93422532081604, + "sft_loss": 1.5329387187957764, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.12956379354000092, + "eval_logits/rejected": 0.2222956120967865, + "eval_logps/chosen": -1.643437385559082, + "eval_logps/rejected": -2.1177093982696533, + "eval_loss": 0.7630000114440918, + "eval_rewards/accuracies": 0.6461424231529236, + "eval_rewards/chosen": -1.643437385559082, + "eval_rewards/margins": 0.47427213191986084, + "eval_rewards/rejected": -2.1177093982696533, + "eval_runtime": 44.8355, + "eval_samples_per_second": 29.999, + "eval_sft_loss": 1.60084867477417, + "eval_steps_per_second": 7.516, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 9.609958453098086, + "learning_rate": 2.9827047686820714e-06, + "logits/chosen": -0.254824697971344, + "logits/rejected": -0.08545961230993271, + "logps/chosen": -1.6291358470916748, + "logps/rejected": -2.1718828678131104, + "loss": 0.7416, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6291358470916748, + "rewards/margins": 0.5427471995353699, + "rewards/rejected": -2.1718828678131104, + "sft_loss": 1.6229009628295898, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 9.759688116503279, + "learning_rate": 2.981990121553627e-06, + "logits/chosen": -0.14894555509090424, + "logits/rejected": -0.07027033716440201, + "logps/chosen": -1.6724026203155518, + "logps/rejected": -2.1126809120178223, + "loss": 0.773, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6724026203155518, + "rewards/margins": 0.44027847051620483, + "rewards/rejected": -2.1126809120178223, + "sft_loss": 1.6621917486190796, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 9.224170221489683, + "learning_rate": 2.9812610961505237e-06, + "logits/chosen": -0.1752864271402359, + "logits/rejected": -0.04700814187526703, + "logps/chosen": -1.613806128501892, + "logps/rejected": -2.180758237838745, + "loss": 0.7705, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.613806128501892, + "rewards/margins": 0.5669519305229187, + "rewards/rejected": -2.180758237838745, + "sft_loss": 1.6244399547576904, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 7.027893732458123, + "learning_rate": 2.980517699545769e-06, + "logits/chosen": -0.1193653792142868, + "logits/rejected": -0.07351159304380417, + "logps/chosen": -1.5781536102294922, + "logps/rejected": -1.9520412683486938, + "loss": 0.79, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5781536102294922, + "rewards/margins": 0.37388795614242554, + "rewards/rejected": -1.9520412683486938, + "sft_loss": 1.575122356414795, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 6.4472721493338225, + "learning_rate": 2.9797599389518003e-06, + "logits/chosen": -0.15606620907783508, + "logits/rejected": -0.03302518650889397, + "logps/chosen": -1.4220941066741943, + "logps/rejected": -1.8449456691741943, + "loss": 0.7544, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4220941066741943, + "rewards/margins": 0.42285171151161194, + "rewards/rejected": -1.8449456691741943, + "sft_loss": 1.5320099592208862, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 5.574111179052546, + "learning_rate": 2.9789878217204138e-06, + "logits/chosen": -0.07594827562570572, + "logits/rejected": 0.08974309265613556, + "logps/chosen": -1.534173607826233, + "logps/rejected": -1.8344669342041016, + "loss": 0.7843, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.534173607826233, + "rewards/margins": 0.30029329657554626, + "rewards/rejected": -1.8344669342041016, + "sft_loss": 1.5193984508514404, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 6.863528532745195, + "learning_rate": 2.9782013553426944e-06, + "logits/chosen": -0.12380240112543106, + "logits/rejected": 0.006033450365066528, + "logps/chosen": -1.5036627054214478, + "logps/rejected": -1.811418890953064, + "loss": 0.798, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5036627054214478, + "rewards/margins": 0.3077562153339386, + "rewards/rejected": -1.811418890953064, + "sft_loss": 1.5856822729110718, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 6.072297600231805, + "learning_rate": 2.977400547448942e-06, + "logits/chosen": -0.13113507628440857, + "logits/rejected": 0.032567743211984634, + "logps/chosen": -1.581300973892212, + "logps/rejected": -1.9286623001098633, + "loss": 0.8065, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.581300973892212, + "rewards/margins": 0.347361296415329, + "rewards/rejected": -1.9286623001098633, + "sft_loss": 1.6395957469940186, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 4.453440470763169, + "learning_rate": 2.976585405808599e-06, + "logits/chosen": -0.08194790780544281, + "logits/rejected": -0.013257568702101707, + "logps/chosen": -1.526556134223938, + "logps/rejected": -1.7669597864151, + "loss": 0.833, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.526556134223938, + "rewards/margins": 0.24040360748767853, + "rewards/rejected": -1.7669597864151, + "sft_loss": 1.6016641855239868, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 7.4831447992490405, + "learning_rate": 2.9757559383301726e-06, + "logits/chosen": -0.1410936713218689, + "logits/rejected": -0.061996787786483765, + "logps/chosen": -1.5362951755523682, + "logps/rejected": -1.80612313747406, + "loss": 0.775, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5362951755523682, + "rewards/margins": 0.2698282301425934, + "rewards/rejected": -1.80612313747406, + "sft_loss": 1.5365614891052246, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 12.230990745949088, + "learning_rate": 2.9749121530611605e-06, + "logits/chosen": -0.16171444952487946, + "logits/rejected": 0.006842072121798992, + "logps/chosen": -1.5803025960922241, + "logps/rejected": -2.0116724967956543, + "loss": 0.7973, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5803025960922241, + "rewards/margins": 0.43136996030807495, + "rewards/rejected": -2.0116724967956543, + "sft_loss": 1.563303828239441, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 6.27963698166502, + "learning_rate": 2.97405405818797e-06, + "logits/chosen": -0.2611856758594513, + "logits/rejected": -0.09050299972295761, + "logps/chosen": -1.6014102697372437, + "logps/rejected": -2.0758275985717773, + "loss": 0.7568, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6014102697372437, + "rewards/margins": 0.4744173586368561, + "rewards/rejected": -2.0758275985717773, + "sft_loss": 1.601758360862732, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 14.12800084055978, + "learning_rate": 2.9731816620358426e-06, + "logits/chosen": -0.15563152730464935, + "logits/rejected": -0.04933555796742439, + "logps/chosen": -1.5415546894073486, + "logps/rejected": -2.029212474822998, + "loss": 0.7759, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5415546894073486, + "rewards/margins": 0.4876578450202942, + "rewards/rejected": -2.029212474822998, + "sft_loss": 1.5229319334030151, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 5.107147918566259, + "learning_rate": 2.9722949730687687e-06, + "logits/chosen": -0.25618550181388855, + "logits/rejected": 0.02996593713760376, + "logps/chosen": -1.5668599605560303, + "logps/rejected": -2.001129627227783, + "loss": 0.7716, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5668599605560303, + "rewards/margins": 0.43426984548568726, + "rewards/rejected": -2.001129627227783, + "sft_loss": 1.6111844778060913, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 7.689311016766697, + "learning_rate": 2.9713939998894087e-06, + "logits/chosen": -0.16601407527923584, + "logits/rejected": -0.0825173631310463, + "logps/chosen": -1.637319803237915, + "logps/rejected": -1.9128596782684326, + "loss": 0.8601, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.637319803237915, + "rewards/margins": 0.2755400538444519, + "rewards/rejected": -1.9128596782684326, + "sft_loss": 1.6005403995513916, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 5.822451405492217, + "learning_rate": 2.970478751239009e-06, + "logits/chosen": -0.16296057403087616, + "logits/rejected": 0.015431973151862621, + "logps/chosen": -1.6332006454467773, + "logps/rejected": -2.022491455078125, + "loss": 0.7748, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6332006454467773, + "rewards/margins": 0.38929063081741333, + "rewards/rejected": -2.022491455078125, + "sft_loss": 1.5428736209869385, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 7.1342182098162565, + "learning_rate": 2.9695492359973153e-06, + "logits/chosen": -0.23076090216636658, + "logits/rejected": -0.1467093527317047, + "logps/chosen": -1.5775783061981201, + "logps/rejected": -1.9766054153442383, + "loss": 0.7444, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5775783061981201, + "rewards/margins": 0.39902693033218384, + "rewards/rejected": -1.9766054153442383, + "sft_loss": 1.5582143068313599, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 4.5065990404811584, + "learning_rate": 2.9686054631824884e-06, + "logits/chosen": -0.3198297619819641, + "logits/rejected": -0.1856500804424286, + "logps/chosen": -1.5775845050811768, + "logps/rejected": -1.9523508548736572, + "loss": 0.776, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5775845050811768, + "rewards/margins": 0.3747663199901581, + "rewards/rejected": -1.9523508548736572, + "sft_loss": 1.6158215999603271, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 5.474880741824772, + "learning_rate": 2.9676474419510175e-06, + "logits/chosen": -0.08876131474971771, + "logits/rejected": 0.02842838689684868, + "logps/chosen": -1.472720980644226, + "logps/rejected": -1.763304352760315, + "loss": 0.78, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.472720980644226, + "rewards/margins": 0.29058313369750977, + "rewards/rejected": -1.763304352760315, + "sft_loss": 1.5007354021072388, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 5.201428854120871, + "learning_rate": 2.966675181597627e-06, + "logits/chosen": -0.21010169386863708, + "logits/rejected": -0.13299232721328735, + "logps/chosen": -1.471938967704773, + "logps/rejected": -1.901342749595642, + "loss": 0.7557, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.471938967704773, + "rewards/margins": 0.4294038712978363, + "rewards/rejected": -1.901342749595642, + "sft_loss": 1.4958593845367432, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 6.6367592511228155, + "learning_rate": 2.965688691555193e-06, + "logits/chosen": -0.12889441847801208, + "logits/rejected": 0.07438662648200989, + "logps/chosen": -1.6089328527450562, + "logps/rejected": -2.049424886703491, + "loss": 0.7874, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6089328527450562, + "rewards/margins": 0.44049209356307983, + "rewards/rejected": -2.049424886703491, + "sft_loss": 1.6768449544906616, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 4.574768350267725, + "learning_rate": 2.964687981394644e-06, + "logits/chosen": -0.16920894384384155, + "logits/rejected": -0.05619668960571289, + "logps/chosen": -1.6071586608886719, + "logps/rejected": -1.8837287425994873, + "loss": 0.8244, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6071586608886719, + "rewards/margins": 0.2765699028968811, + "rewards/rejected": -1.8837287425994873, + "sft_loss": 1.5901567935943604, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 6.431528305861505, + "learning_rate": 2.963673060824877e-06, + "logits/chosen": -0.19680675864219666, + "logits/rejected": 0.0035483776591718197, + "logps/chosen": -1.5793983936309814, + "logps/rejected": -1.9441983699798584, + "loss": 0.7786, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5793983936309814, + "rewards/margins": 0.364799827337265, + "rewards/rejected": -1.9441983699798584, + "sft_loss": 1.553386926651001, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 6.012695412310331, + "learning_rate": 2.9626439396926536e-06, + "logits/chosen": -0.04484427347779274, + "logits/rejected": 0.11925461143255234, + "logps/chosen": -1.4713695049285889, + "logps/rejected": -1.9322984218597412, + "loss": 0.767, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4713695049285889, + "rewards/margins": 0.46092891693115234, + "rewards/rejected": -1.9322984218597412, + "sft_loss": 1.53690505027771, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 10.552874460492138, + "learning_rate": 2.9616006279825125e-06, + "logits/chosen": -0.18911555409431458, + "logits/rejected": -0.0048009916208684444, + "logps/chosen": -1.656640648841858, + "logps/rejected": -2.062116861343384, + "loss": 0.775, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.656640648841858, + "rewards/margins": 0.4054761826992035, + "rewards/rejected": -2.062116861343384, + "sft_loss": 1.6129440069198608, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 9.290842105366092, + "learning_rate": 2.9605431358166687e-06, + "logits/chosen": -0.19927628338336945, + "logits/rejected": -0.07802639156579971, + "logps/chosen": -1.581866979598999, + "logps/rejected": -2.1346230506896973, + "loss": 0.7585, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.581866979598999, + "rewards/margins": 0.5527556538581848, + "rewards/rejected": -2.1346230506896973, + "sft_loss": 1.5932109355926514, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 6.556638001860026, + "learning_rate": 2.959471473454915e-06, + "logits/chosen": -0.1524762213230133, + "logits/rejected": -0.10123306512832642, + "logps/chosen": -1.6552999019622803, + "logps/rejected": -2.068732500076294, + "loss": 0.7793, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6552999019622803, + "rewards/margins": 0.41343265771865845, + "rewards/rejected": -2.068732500076294, + "sft_loss": 1.6616418361663818, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 7.961322356971732, + "learning_rate": 2.9583856512945257e-06, + "logits/chosen": -0.1716580092906952, + "logits/rejected": -0.04362250119447708, + "logps/chosen": -1.6354115009307861, + "logps/rejected": -2.064568519592285, + "loss": 0.7676, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6354115009307861, + "rewards/margins": 0.4291567802429199, + "rewards/rejected": -2.064568519592285, + "sft_loss": 1.6227165460586548, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 9.101576014649456, + "learning_rate": 2.957285679870151e-06, + "logits/chosen": -0.20420575141906738, + "logits/rejected": -0.04625245928764343, + "logps/chosen": -1.6196963787078857, + "logps/rejected": -2.151869297027588, + "loss": 0.7236, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6196963787078857, + "rewards/margins": 0.5321725010871887, + "rewards/rejected": -2.151869297027588, + "sft_loss": 1.5830497741699219, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 6.42046355799275, + "learning_rate": 2.9561715698537184e-06, + "logits/chosen": -0.18746407330036163, + "logits/rejected": 0.010445961728692055, + "logps/chosen": -1.6953752040863037, + "logps/rejected": -2.0557973384857178, + "loss": 0.8408, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.6953752040863037, + "rewards/margins": 0.3604220747947693, + "rewards/rejected": -2.0557973384857178, + "sft_loss": 1.6522247791290283, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 6.449408194941958, + "learning_rate": 2.955043332054329e-06, + "logits/chosen": -0.09924498200416565, + "logits/rejected": 0.15976925194263458, + "logps/chosen": -1.6739656925201416, + "logps/rejected": -2.0317275524139404, + "loss": 0.8074, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.6739656925201416, + "rewards/margins": 0.3577619194984436, + "rewards/rejected": -2.0317275524139404, + "sft_loss": 1.6839134693145752, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 5.321255191283596, + "learning_rate": 2.95390097741815e-06, + "logits/chosen": -0.12195520102977753, + "logits/rejected": 0.0469212606549263, + "logps/chosen": -1.6282774209976196, + "logps/rejected": -1.9903730154037476, + "loss": 0.7827, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6282774209976196, + "rewards/margins": 0.36209559440612793, + "rewards/rejected": -1.9903730154037476, + "sft_loss": 1.6381866931915283, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 6.7507291304657455, + "learning_rate": 2.952744517028312e-06, + "logits/chosen": -0.015586107969284058, + "logits/rejected": -0.0066694943234324455, + "logps/chosen": -1.693063497543335, + "logps/rejected": -2.098609209060669, + "loss": 0.7952, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.693063497543335, + "rewards/margins": 0.4055456519126892, + "rewards/rejected": -2.098609209060669, + "sft_loss": 1.7138296365737915, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 5.7441300960801795, + "learning_rate": 2.951573962104798e-06, + "logits/chosen": -0.013513225130736828, + "logits/rejected": -0.0030169696547091007, + "logps/chosen": -1.5411711931228638, + "logps/rejected": -1.8659536838531494, + "loss": 0.788, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5411711931228638, + "rewards/margins": 0.32478243112564087, + "rewards/rejected": -1.8659536838531494, + "sft_loss": 1.5452463626861572, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 6.264181623160948, + "learning_rate": 2.950389324004337e-06, + "logits/chosen": -0.13159170746803284, + "logits/rejected": 0.10122290998697281, + "logps/chosen": -1.6037967205047607, + "logps/rejected": -1.958993673324585, + "loss": 0.7625, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6037967205047607, + "rewards/margins": 0.3551969528198242, + "rewards/rejected": -1.958993673324585, + "sft_loss": 1.6397666931152344, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 8.960837585492582, + "learning_rate": 2.949190614220294e-06, + "logits/chosen": -0.1174493283033371, + "logits/rejected": 0.11396725475788116, + "logps/chosen": -1.6792113780975342, + "logps/rejected": -2.021131992340088, + "loss": 0.8056, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6792113780975342, + "rewards/margins": 0.3419206738471985, + "rewards/rejected": -2.021131992340088, + "sft_loss": 1.6655311584472656, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 7.038292814007257, + "learning_rate": 2.9479778443825553e-06, + "logits/chosen": 0.003808742854744196, + "logits/rejected": 0.24790827929973602, + "logps/chosen": -1.6043720245361328, + "logps/rejected": -1.9450041055679321, + "loss": 0.7922, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6043720245361328, + "rewards/margins": 0.3406318724155426, + "rewards/rejected": -1.9450041055679321, + "sft_loss": 1.6884558200836182, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 7.6017791758830215, + "learning_rate": 2.9467510262574204e-06, + "logits/chosen": 0.10805141925811768, + "logits/rejected": 0.13546641170978546, + "logps/chosen": -1.4662773609161377, + "logps/rejected": -1.9129825830459595, + "loss": 0.7364, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4662773609161377, + "rewards/margins": 0.44670534133911133, + "rewards/rejected": -1.9129825830459595, + "sft_loss": 1.5612038373947144, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 6.2631760907636576, + "learning_rate": 2.9455101717474834e-06, + "logits/chosen": 0.095148466527462, + "logits/rejected": 0.19982023537158966, + "logps/chosen": -1.5556526184082031, + "logps/rejected": -1.8492791652679443, + "loss": 0.8283, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5556526184082031, + "rewards/margins": 0.2936265468597412, + "rewards/rejected": -1.8492791652679443, + "sft_loss": 1.6193443536758423, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 6.357423162704157, + "learning_rate": 2.9442552928915203e-06, + "logits/chosen": 0.0740412250161171, + "logits/rejected": 0.25112438201904297, + "logps/chosen": -1.5824158191680908, + "logps/rejected": -2.0103745460510254, + "loss": 0.7808, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.5824158191680908, + "rewards/margins": 0.42795872688293457, + "rewards/rejected": -2.0103745460510254, + "sft_loss": 1.619419813156128, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 8.362648905453156, + "learning_rate": 2.942986401864371e-06, + "logits/chosen": 0.06260992586612701, + "logits/rejected": 0.28578799962997437, + "logps/chosen": -1.686570167541504, + "logps/rejected": -2.054899215698242, + "loss": 0.8101, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.686570167541504, + "rewards/margins": 0.3683289885520935, + "rewards/rejected": -2.054899215698242, + "sft_loss": 1.7220935821533203, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 6.220464946890453, + "learning_rate": 2.9417035109768225e-06, + "logits/chosen": 0.0265726987272501, + "logits/rejected": 0.27460747957229614, + "logps/chosen": -1.4630142450332642, + "logps/rejected": -1.9338045120239258, + "loss": 0.7449, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4630142450332642, + "rewards/margins": 0.470790296792984, + "rewards/rejected": -1.9338045120239258, + "sft_loss": 1.490442156791687, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 6.538918997856583, + "learning_rate": 2.9404066326754874e-06, + "logits/chosen": 0.04017296060919762, + "logits/rejected": 0.27316930890083313, + "logps/chosen": -1.5119431018829346, + "logps/rejected": -1.8872439861297607, + "loss": 0.7696, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5119431018829346, + "rewards/margins": 0.3753008246421814, + "rewards/rejected": -1.8872439861297607, + "sft_loss": 1.5683563947677612, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 6.4969536058578345, + "learning_rate": 2.9390957795426847e-06, + "logits/chosen": 0.04273226857185364, + "logits/rejected": 0.24941368401050568, + "logps/chosen": -1.5459063053131104, + "logps/rejected": -1.9651594161987305, + "loss": 0.7461, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5459063053131104, + "rewards/margins": 0.4192531108856201, + "rewards/rejected": -1.9651594161987305, + "sft_loss": 1.6078563928604126, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 7.022785391979763, + "learning_rate": 2.9377709642963177e-06, + "logits/chosen": 0.0010808638762682676, + "logits/rejected": 0.1716040074825287, + "logps/chosen": -1.5438032150268555, + "logps/rejected": -2.0570969581604004, + "loss": 0.738, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5438032150268555, + "rewards/margins": 0.5132937431335449, + "rewards/rejected": -2.0570969581604004, + "sft_loss": 1.6060693264007568, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 9.420728979604682, + "learning_rate": 2.9364321997897485e-06, + "logits/chosen": 0.03571401163935661, + "logits/rejected": 0.14158421754837036, + "logps/chosen": -1.6367580890655518, + "logps/rejected": -1.9894298315048218, + "loss": 0.8179, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6367580890655518, + "rewards/margins": 0.35267171263694763, + "rewards/rejected": -1.9894298315048218, + "sft_loss": 1.6678476333618164, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 6.60932904173704, + "learning_rate": 2.935079499011677e-06, + "logits/chosen": -0.013311957940459251, + "logits/rejected": 0.1487562358379364, + "logps/chosen": -1.6255792379379272, + "logps/rejected": -1.8736343383789062, + "loss": 0.816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6255792379379272, + "rewards/margins": 0.2480551302433014, + "rewards/rejected": -1.8736343383789062, + "sft_loss": 1.628200888633728, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 8.164177080214404, + "learning_rate": 2.9337128750860126e-06, + "logits/chosen": 0.03370757773518562, + "logits/rejected": 0.25221627950668335, + "logps/chosen": -1.501709222793579, + "logps/rejected": -1.8876798152923584, + "loss": 0.7654, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.501709222793579, + "rewards/margins": 0.38597044348716736, + "rewards/rejected": -1.8876798152923584, + "sft_loss": 1.5484182834625244, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 4.63992866038779, + "learning_rate": 2.932332341271746e-06, + "logits/chosen": -0.0409303642809391, + "logits/rejected": 0.15158711373806, + "logps/chosen": -1.5028371810913086, + "logps/rejected": -1.949318289756775, + "loss": 0.756, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5028371810913086, + "rewards/margins": 0.4464810788631439, + "rewards/rejected": -1.949318289756775, + "sft_loss": 1.5938483476638794, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 6.014972415086379, + "learning_rate": 2.930937910962822e-06, + "logits/chosen": -0.07076011598110199, + "logits/rejected": 0.06818946450948715, + "logps/chosen": -1.6399303674697876, + "logps/rejected": -2.096925973892212, + "loss": 0.7769, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6399303674697876, + "rewards/margins": 0.45699548721313477, + "rewards/rejected": -2.096925973892212, + "sft_loss": 1.6774276494979858, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 8.395683170770335, + "learning_rate": 2.9295295976880107e-06, + "logits/chosen": 0.025820106267929077, + "logits/rejected": 0.11879418045282364, + "logps/chosen": -1.6487575769424438, + "logps/rejected": -2.074948787689209, + "loss": 0.7623, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6487575769424438, + "rewards/margins": 0.4261912405490875, + "rewards/rejected": -2.074948787689209, + "sft_loss": 1.6632426977157593, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 7.7402768083105125, + "learning_rate": 2.9281074151107727e-06, + "logits/chosen": 0.029402291402220726, + "logits/rejected": 0.2545176148414612, + "logps/chosen": -1.71028733253479, + "logps/rejected": -2.077282428741455, + "loss": 0.7812, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.71028733253479, + "rewards/margins": 0.36699482798576355, + "rewards/rejected": -2.077282428741455, + "sft_loss": 1.6955442428588867, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 8.751835002398574, + "learning_rate": 2.926671377029129e-06, + "logits/chosen": 0.03397374227643013, + "logits/rejected": 0.2006794512271881, + "logps/chosen": -1.6462209224700928, + "logps/rejected": -2.1480746269226074, + "loss": 0.751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6462209224700928, + "rewards/margins": 0.5018535256385803, + "rewards/rejected": -2.1480746269226074, + "sft_loss": 1.752246618270874, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 5.540643425073099, + "learning_rate": 2.9252214973755294e-06, + "logits/chosen": -0.1207038015127182, + "logits/rejected": 0.19626577198505402, + "logps/chosen": -1.6974194049835205, + "logps/rejected": -2.197737216949463, + "loss": 0.7394, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6974194049835205, + "rewards/margins": 0.5003179311752319, + "rewards/rejected": -2.197737216949463, + "sft_loss": 1.7291381359100342, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 5.055148253174504, + "learning_rate": 2.923757790216711e-06, + "logits/chosen": -0.024127524346113205, + "logits/rejected": 0.15421162545681, + "logps/chosen": -1.6369282007217407, + "logps/rejected": -2.1902050971984863, + "loss": 0.7588, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6369282007217407, + "rewards/margins": 0.5532768368721008, + "rewards/rejected": -2.1902050971984863, + "sft_loss": 1.687150001525879, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 8.581801582300574, + "learning_rate": 2.922280269753568e-06, + "logits/chosen": -0.10929293930530548, + "logits/rejected": 0.02730230614542961, + "logps/chosen": -1.718034029006958, + "logps/rejected": -2.0976719856262207, + "loss": 0.8, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.718034029006958, + "rewards/margins": 0.37963828444480896, + "rewards/rejected": -2.0976719856262207, + "sft_loss": 1.7732467651367188, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 8.258353844488562, + "learning_rate": 2.9207889503210094e-06, + "logits/chosen": 0.008437035605311394, + "logits/rejected": 0.232884019613266, + "logps/chosen": -1.6477196216583252, + "logps/rejected": -1.899294137954712, + "loss": 0.8345, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.6477196216583252, + "rewards/margins": 0.2515743374824524, + "rewards/rejected": -1.899294137954712, + "sft_loss": 1.666416883468628, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 5.123591274328121, + "learning_rate": 2.9192838463878236e-06, + "logits/chosen": -0.015348012559115887, + "logits/rejected": 0.1270010769367218, + "logps/chosen": -1.6096594333648682, + "logps/rejected": -1.8980051279067993, + "loss": 0.8205, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6096594333648682, + "rewards/margins": 0.2883456349372864, + "rewards/rejected": -1.8980051279067993, + "sft_loss": 1.6213926076889038, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 6.556141747620839, + "learning_rate": 2.917764972556535e-06, + "logits/chosen": -0.17863938212394714, + "logits/rejected": 0.011336810886859894, + "logps/chosen": -1.5950143337249756, + "logps/rejected": -2.0163919925689697, + "loss": 0.7568, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5950143337249756, + "rewards/margins": 0.421377569437027, + "rewards/rejected": -2.0163919925689697, + "sft_loss": 1.6503280401229858, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 6.749858086812364, + "learning_rate": 2.9162323435632657e-06, + "logits/chosen": -0.027099858969449997, + "logits/rejected": 0.11431686580181122, + "logps/chosen": -1.5036207437515259, + "logps/rejected": -2.1065268516540527, + "loss": 0.7322, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5036207437515259, + "rewards/margins": 0.6029061079025269, + "rewards/rejected": -2.1065268516540527, + "sft_loss": 1.5539976358413696, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 7.768714321436251, + "learning_rate": 2.914685974277587e-06, + "logits/chosen": -0.11938049644231796, + "logits/rejected": -0.018264098092913628, + "logps/chosen": -1.6016819477081299, + "logps/rejected": -1.9748432636260986, + "loss": 0.7893, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6016819477081299, + "rewards/margins": 0.3731613755226135, + "rewards/rejected": -1.9748432636260986, + "sft_loss": 1.5768858194351196, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 7.446064522717903, + "learning_rate": 2.9131258797023814e-06, + "logits/chosen": -0.03684517741203308, + "logits/rejected": 0.1460929960012436, + "logps/chosen": -1.5739840269088745, + "logps/rejected": -1.9133260250091553, + "loss": 0.7733, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5739840269088745, + "rewards/margins": 0.3393420875072479, + "rewards/rejected": -1.9133260250091553, + "sft_loss": 1.6007308959960938, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 6.611446978075237, + "learning_rate": 2.9115520749736934e-06, + "logits/chosen": 0.03872992843389511, + "logits/rejected": 0.23826126754283905, + "logps/chosen": -1.5514007806777954, + "logps/rejected": -2.116300106048584, + "loss": 0.7284, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5514007806777954, + "rewards/margins": 0.5648993253707886, + "rewards/rejected": -2.116300106048584, + "sft_loss": 1.5213690996170044, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 5.789445436806434, + "learning_rate": 2.909964575360583e-06, + "logits/chosen": -0.1557280272245407, + "logits/rejected": -0.0005622118478640914, + "logps/chosen": -1.5989017486572266, + "logps/rejected": -2.1404976844787598, + "loss": 0.7563, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5989017486572266, + "rewards/margins": 0.541595995426178, + "rewards/rejected": -2.1404976844787598, + "sft_loss": 1.6561466455459595, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 8.153135086048819, + "learning_rate": 2.9083633962649783e-06, + "logits/chosen": -0.1161702498793602, + "logits/rejected": 0.13406352698802948, + "logps/chosen": -1.603650450706482, + "logps/rejected": -2.1677873134613037, + "loss": 0.7233, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.603650450706482, + "rewards/margins": 0.5641368627548218, + "rewards/rejected": -2.1677873134613037, + "sft_loss": 1.594813585281372, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 8.061547650135914, + "learning_rate": 2.906748553221527e-06, + "logits/chosen": 0.09124667942523956, + "logits/rejected": 0.18374478816986084, + "logps/chosen": -1.6448787450790405, + "logps/rejected": -2.11365008354187, + "loss": 0.7742, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6448787450790405, + "rewards/margins": 0.468771368265152, + "rewards/rejected": -2.11365008354187, + "sft_loss": 1.5670969486236572, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 6.680659188192423, + "learning_rate": 2.9051200618974418e-06, + "logits/chosen": -0.06118954345583916, + "logits/rejected": 0.16990908980369568, + "logps/chosen": -1.720273733139038, + "logps/rejected": -2.3065876960754395, + "loss": 0.7217, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.720273733139038, + "rewards/margins": 0.5863139033317566, + "rewards/rejected": -2.3065876960754395, + "sft_loss": 1.604292869567871, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 7.164458316966575, + "learning_rate": 2.903477938092354e-06, + "logits/chosen": -0.04646755009889603, + "logits/rejected": 0.010129129514098167, + "logps/chosen": -1.670170545578003, + "logps/rejected": -1.9611806869506836, + "loss": 0.8287, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.670170545578003, + "rewards/margins": 0.2910100519657135, + "rewards/rejected": -1.9611806869506836, + "sft_loss": 1.7245380878448486, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 6.094378907879845, + "learning_rate": 2.901822197738155e-06, + "logits/chosen": -0.21642693877220154, + "logits/rejected": -0.050178252160549164, + "logps/chosen": -1.6112639904022217, + "logps/rejected": -2.1255059242248535, + "loss": 0.7721, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6112639904022217, + "rewards/margins": 0.5142418146133423, + "rewards/rejected": -2.1255059242248535, + "sft_loss": 1.666547179222107, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 7.323723960289789, + "learning_rate": 2.9001528568988454e-06, + "logits/chosen": -0.18746520578861237, + "logits/rejected": -0.022803576663136482, + "logps/chosen": -1.5549800395965576, + "logps/rejected": -2.027247905731201, + "loss": 0.7463, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5549800395965576, + "rewards/margins": 0.4722679555416107, + "rewards/rejected": -2.027247905731201, + "sft_loss": 1.4974477291107178, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 6.302595786091669, + "learning_rate": 2.898469931770378e-06, + "logits/chosen": -0.06833770126104355, + "logits/rejected": 0.04650373011827469, + "logps/chosen": -1.5727870464324951, + "logps/rejected": -1.8890917301177979, + "loss": 0.7904, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5727870464324951, + "rewards/margins": 0.3163047134876251, + "rewards/rejected": -1.8890917301177979, + "sft_loss": 1.5973747968673706, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 5.606944351727298, + "learning_rate": 2.896773438680498e-06, + "logits/chosen": -0.015208597294986248, + "logits/rejected": 0.1008622795343399, + "logps/chosen": -1.539618968963623, + "logps/rejected": -2.0273754596710205, + "loss": 0.7368, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.539618968963623, + "rewards/margins": 0.4877566397190094, + "rewards/rejected": -2.0273754596710205, + "sft_loss": 1.613294243812561, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 7.726677796070765, + "learning_rate": 2.8950633940885908e-06, + "logits/chosen": -0.08709158003330231, + "logits/rejected": 0.01412753202021122, + "logps/chosen": -1.5183520317077637, + "logps/rejected": -1.8736242055892944, + "loss": 0.7847, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5183520317077637, + "rewards/margins": 0.3552722632884979, + "rewards/rejected": -1.8736242055892944, + "sft_loss": 1.5644044876098633, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 5.49002962622439, + "learning_rate": 2.893339814585516e-06, + "logits/chosen": -0.123673215508461, + "logits/rejected": 0.0587911494076252, + "logps/chosen": -1.8219196796417236, + "logps/rejected": -2.1566295623779297, + "loss": 0.835, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.8219196796417236, + "rewards/margins": 0.3347100615501404, + "rewards/rejected": -2.1566295623779297, + "sft_loss": 1.7723875045776367, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 5.3585714206015815, + "learning_rate": 2.8916027168934483e-06, + "logits/chosen": -0.13725906610488892, + "logits/rejected": 0.07069502025842667, + "logps/chosen": -1.5604981184005737, + "logps/rejected": -1.9629894495010376, + "loss": 0.7946, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.5604981184005737, + "rewards/margins": 0.40249133110046387, + "rewards/rejected": -1.9629894495010376, + "sft_loss": 1.6075115203857422, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 5.74098336279485, + "learning_rate": 2.889852117865718e-06, + "logits/chosen": -0.18351632356643677, + "logits/rejected": -0.016420545056462288, + "logps/chosen": -1.703129529953003, + "logps/rejected": -2.1396589279174805, + "loss": 0.7581, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.703129529953003, + "rewards/margins": 0.4365292489528656, + "rewards/rejected": -2.1396589279174805, + "sft_loss": 1.6762657165527344, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 7.0558580015091845, + "learning_rate": 2.888088034486645e-06, + "logits/chosen": -0.09006930887699127, + "logits/rejected": 0.07247094810009003, + "logps/chosen": -1.6955833435058594, + "logps/rejected": -2.146712303161621, + "loss": 0.78, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6955833435058594, + "rewards/margins": 0.4511287212371826, + "rewards/rejected": -2.146712303161621, + "sft_loss": 1.6365978717803955, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 8.2556377644557, + "learning_rate": 2.886310483871373e-06, + "logits/chosen": -0.18919400870800018, + "logits/rejected": -0.01640934683382511, + "logps/chosen": -1.6706031560897827, + "logps/rejected": -2.201122522354126, + "loss": 0.7321, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6706031560897827, + "rewards/margins": 0.5305193662643433, + "rewards/rejected": -2.201122522354126, + "sft_loss": 1.6959350109100342, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 6.3300107901147245, + "learning_rate": 2.8845194832657067e-06, + "logits/chosen": -0.11349854618310928, + "logits/rejected": 0.02640039101243019, + "logps/chosen": -1.5338860750198364, + "logps/rejected": -2.1164541244506836, + "loss": 0.7103, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5338860750198364, + "rewards/margins": 0.5825680494308472, + "rewards/rejected": -2.1164541244506836, + "sft_loss": 1.634878158569336, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 9.000964400043555, + "learning_rate": 2.882715050045941e-06, + "logits/chosen": -0.14477837085723877, + "logits/rejected": -0.06517032533884048, + "logps/chosen": -1.6217753887176514, + "logps/rejected": -2.018050193786621, + "loss": 0.7941, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6217753887176514, + "rewards/margins": 0.3962748050689697, + "rewards/rejected": -2.018050193786621, + "sft_loss": 1.6124019622802734, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.30408504605293274, + "eval_logits/rejected": 0.417248010635376, + "eval_logps/chosen": -1.6028857231140137, + "eval_logps/rejected": -2.1043779850006104, + "eval_loss": 0.744583249092102, + "eval_rewards/accuracies": 0.6750741600990295, + "eval_rewards/chosen": -1.6028857231140137, + "eval_rewards/margins": 0.5014922618865967, + "eval_rewards/rejected": -2.1043779850006104, + "eval_runtime": 45.0345, + "eval_samples_per_second": 29.866, + "eval_sft_loss": 1.6243925094604492, + "eval_steps_per_second": 7.483, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 14.93244550145814, + "learning_rate": 2.8808972017186957e-06, + "logits/chosen": -0.23679903149604797, + "logits/rejected": -0.015266534872353077, + "logps/chosen": -1.5985362529754639, + "logps/rejected": -2.0280582904815674, + "loss": 0.7523, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5985362529754639, + "rewards/margins": 0.4295217990875244, + "rewards/rejected": -2.0280582904815674, + "sft_loss": 1.6371889114379883, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 7.8160679964001405, + "learning_rate": 2.8790659559207434e-06, + "logits/chosen": -0.16882798075675964, + "logits/rejected": 0.08145350217819214, + "logps/chosen": -1.5531269311904907, + "logps/rejected": -1.9778798818588257, + "loss": 0.7507, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5531269311904907, + "rewards/margins": 0.42475301027297974, + "rewards/rejected": -1.9778798818588257, + "sft_loss": 1.5700221061706543, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 8.18777345683683, + "learning_rate": 2.877221330418838e-06, + "logits/chosen": -0.2044234722852707, + "logits/rejected": -0.053410958498716354, + "logps/chosen": -1.627963662147522, + "logps/rejected": -1.924864411354065, + "loss": 0.811, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.627963662147522, + "rewards/margins": 0.29690080881118774, + "rewards/rejected": -1.924864411354065, + "sft_loss": 1.6557378768920898, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 7.985816202450262, + "learning_rate": 2.875363343109545e-06, + "logits/chosen": 0.01616957038640976, + "logits/rejected": 0.1678108274936676, + "logps/chosen": -1.5112775564193726, + "logps/rejected": -1.9317729473114014, + "loss": 0.7606, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5112775564193726, + "rewards/margins": 0.42049551010131836, + "rewards/rejected": -1.9317729473114014, + "sft_loss": 1.5209019184112549, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 7.040315791608339, + "learning_rate": 2.8734920120190645e-06, + "logits/chosen": -0.2711160480976105, + "logits/rejected": -0.0027374387718737125, + "logps/chosen": -1.6637003421783447, + "logps/rejected": -2.088355302810669, + "loss": 0.774, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6637003421783447, + "rewards/margins": 0.42465487122535706, + "rewards/rejected": -2.088355302810669, + "sft_loss": 1.6742616891860962, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 7.348545963113847, + "learning_rate": 2.8716073553030593e-06, + "logits/chosen": -0.12881150841712952, + "logits/rejected": 0.00364801287651062, + "logps/chosen": -1.654126763343811, + "logps/rejected": -2.066636562347412, + "loss": 0.7908, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.654126763343811, + "rewards/margins": 0.4125099182128906, + "rewards/rejected": -2.066636562347412, + "sft_loss": 1.6164060831069946, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 6.3291520509791335, + "learning_rate": 2.8697093912464782e-06, + "logits/chosen": -0.13783837854862213, + "logits/rejected": 0.0442417673766613, + "logps/chosen": -1.6532104015350342, + "logps/rejected": -2.0197830200195312, + "loss": 0.7792, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6532104015350342, + "rewards/margins": 0.3665724992752075, + "rewards/rejected": -2.0197830200195312, + "sft_loss": 1.7437490224838257, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 7.43163637379486, + "learning_rate": 2.8677981382633753e-06, + "logits/chosen": -0.2456088811159134, + "logits/rejected": -0.07947979867458344, + "logps/chosen": -1.6115837097167969, + "logps/rejected": -2.0740396976470947, + "loss": 0.7513, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6115837097167969, + "rewards/margins": 0.4624561369419098, + "rewards/rejected": -2.0740396976470947, + "sft_loss": 1.6971622705459595, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 7.948237008152092, + "learning_rate": 2.8658736148967366e-06, + "logits/chosen": -0.1307731419801712, + "logits/rejected": 0.09868714213371277, + "logps/chosen": -1.650904655456543, + "logps/rejected": -2.1201839447021484, + "loss": 0.7719, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.650904655456543, + "rewards/margins": 0.46927928924560547, + "rewards/rejected": -2.1201839447021484, + "sft_loss": 1.6839383840560913, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 9.092640065524614, + "learning_rate": 2.8639358398182947e-06, + "logits/chosen": -0.12447915971279144, + "logits/rejected": 0.12984387576580048, + "logps/chosen": -1.7692009210586548, + "logps/rejected": -2.0938119888305664, + "loss": 0.8577, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.7692009210586548, + "rewards/margins": 0.324611097574234, + "rewards/rejected": -2.0938119888305664, + "sft_loss": 1.7437636852264404, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 7.49659909202242, + "learning_rate": 2.8619848318283538e-06, + "logits/chosen": -0.16396114230155945, + "logits/rejected": -0.017033612355589867, + "logps/chosen": -1.546359658241272, + "logps/rejected": -1.9179470539093018, + "loss": 0.7799, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.546359658241272, + "rewards/margins": 0.37158721685409546, + "rewards/rejected": -1.9179470539093018, + "sft_loss": 1.6477253437042236, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 6.720611032104559, + "learning_rate": 2.860020609855601e-06, + "logits/chosen": -0.20362278819084167, + "logits/rejected": -0.04654115065932274, + "logps/chosen": -1.5199604034423828, + "logps/rejected": -2.017423391342163, + "loss": 0.7452, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5199604034423828, + "rewards/margins": 0.4974629878997803, + "rewards/rejected": -2.017423391342163, + "sft_loss": 1.5844711065292358, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 5.9415895380295085, + "learning_rate": 2.858043192956926e-06, + "logits/chosen": -0.12424206733703613, + "logits/rejected": 0.06748121976852417, + "logps/chosen": -1.5377591848373413, + "logps/rejected": -1.9634453058242798, + "loss": 0.7536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5377591848373413, + "rewards/margins": 0.4256861209869385, + "rewards/rejected": -1.9634453058242798, + "sft_loss": 1.579796552658081, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 5.826261694860266, + "learning_rate": 2.856052600317237e-06, + "logits/chosen": -0.2191907912492752, + "logits/rejected": -0.10827469825744629, + "logps/chosen": -1.5648930072784424, + "logps/rejected": -1.954262137413025, + "loss": 0.7743, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5648930072784424, + "rewards/margins": 0.38936907052993774, + "rewards/rejected": -1.954262137413025, + "sft_loss": 1.5946085453033447, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 6.307597170102964, + "learning_rate": 2.8540488512492725e-06, + "logits/chosen": -0.2155335247516632, + "logits/rejected": -0.09447634220123291, + "logps/chosen": -1.6340783834457397, + "logps/rejected": -2.0027661323547363, + "loss": 0.7815, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6340783834457397, + "rewards/margins": 0.36868780851364136, + "rewards/rejected": -2.0027661323547363, + "sft_loss": 1.5978288650512695, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 8.27533099546194, + "learning_rate": 2.8520319651934147e-06, + "logits/chosen": -0.25640377402305603, + "logits/rejected": -0.0609319731593132, + "logps/chosen": -1.7309064865112305, + "logps/rejected": -2.0606789588928223, + "loss": 0.8249, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7309064865112305, + "rewards/margins": 0.3297719657421112, + "rewards/rejected": -2.0606789588928223, + "sft_loss": 1.7498449087142944, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 10.994412069006197, + "learning_rate": 2.8500019617175005e-06, + "logits/chosen": -0.22293055057525635, + "logits/rejected": -0.045440297573804855, + "logps/chosen": -1.5989841222763062, + "logps/rejected": -1.9689960479736328, + "loss": 0.7614, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5989841222763062, + "rewards/margins": 0.37001150846481323, + "rewards/rejected": -1.9689960479736328, + "sft_loss": 1.5846779346466064, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 7.880113373200392, + "learning_rate": 2.847958860516633e-06, + "logits/chosen": -0.32245585322380066, + "logits/rejected": -0.14119119942188263, + "logps/chosen": -1.7176250219345093, + "logps/rejected": -2.05019474029541, + "loss": 0.8125, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.7176250219345093, + "rewards/margins": 0.3325696587562561, + "rewards/rejected": -2.05019474029541, + "sft_loss": 1.7187855243682861, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 5.070148969896155, + "learning_rate": 2.8459026814129887e-06, + "logits/chosen": -0.22318537533283234, + "logits/rejected": -0.22121672332286835, + "logps/chosen": -1.699710488319397, + "logps/rejected": -2.2298336029052734, + "loss": 0.7527, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.699710488319397, + "rewards/margins": 0.5301231145858765, + "rewards/rejected": -2.2298336029052734, + "sft_loss": 1.6993358135223389, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 6.5213917284801415, + "learning_rate": 2.8438334443556268e-06, + "logits/chosen": -0.22244945168495178, + "logits/rejected": 0.08072423934936523, + "logps/chosen": -1.5851320028305054, + "logps/rejected": -2.119144916534424, + "loss": 0.7376, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5851320028305054, + "rewards/margins": 0.5340126156806946, + "rewards/rejected": -2.119144916534424, + "sft_loss": 1.6274535655975342, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 5.763934363832557, + "learning_rate": 2.8417511694202938e-06, + "logits/chosen": -0.02451368421316147, + "logits/rejected": 0.03239697217941284, + "logps/chosen": -1.5781588554382324, + "logps/rejected": -2.0349514484405518, + "loss": 0.7633, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5781588554382324, + "rewards/margins": 0.45679235458374023, + "rewards/rejected": -2.0349514484405518, + "sft_loss": 1.5774492025375366, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 9.017956492956781, + "learning_rate": 2.83965587680923e-06, + "logits/chosen": -0.04484400898218155, + "logits/rejected": 0.052457988262176514, + "logps/chosen": -1.602957010269165, + "logps/rejected": -2.0960071086883545, + "loss": 0.7555, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.602957010269165, + "rewards/margins": 0.4930500090122223, + "rewards/rejected": -2.0960071086883545, + "sft_loss": 1.6403892040252686, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 6.8452646381985, + "learning_rate": 2.837547586850974e-06, + "logits/chosen": -0.16252757608890533, + "logits/rejected": 0.05827758461236954, + "logps/chosen": -1.597303867340088, + "logps/rejected": -2.0770814418792725, + "loss": 0.7452, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.597303867340088, + "rewards/margins": 0.4797777235507965, + "rewards/rejected": -2.0770814418792725, + "sft_loss": 1.5819872617721558, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 6.745747568412131, + "learning_rate": 2.8354263200001645e-06, + "logits/chosen": -0.27575942873954773, + "logits/rejected": -0.023251216858625412, + "logps/chosen": -1.5655136108398438, + "logps/rejected": -1.9516299962997437, + "loss": 0.768, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.5655136108398438, + "rewards/margins": 0.3861163854598999, + "rewards/rejected": -1.9516299962997437, + "sft_loss": 1.6041730642318726, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 8.458459704309899, + "learning_rate": 2.8332920968373414e-06, + "logits/chosen": -0.05385139584541321, + "logits/rejected": 0.08625955879688263, + "logps/chosen": -1.6114717721939087, + "logps/rejected": -1.9976829290390015, + "loss": 0.818, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6114717721939087, + "rewards/margins": 0.3862113058567047, + "rewards/rejected": -1.9976829290390015, + "sft_loss": 1.580669641494751, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 7.909363378162102, + "learning_rate": 2.831144938068747e-06, + "logits/chosen": -0.08213544636964798, + "logits/rejected": 0.07000809907913208, + "logps/chosen": -1.5530449151992798, + "logps/rejected": -1.940569519996643, + "loss": 0.772, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5530449151992798, + "rewards/margins": 0.38752469420433044, + "rewards/rejected": -1.940569519996643, + "sft_loss": 1.5636192560195923, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 7.785721622620933, + "learning_rate": 2.8289848645261253e-06, + "logits/chosen": -0.07729779183864594, + "logits/rejected": 0.04112401232123375, + "logps/chosen": -1.6301319599151611, + "logps/rejected": -2.0394530296325684, + "loss": 0.7538, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6301319599151611, + "rewards/margins": 0.40932124853134155, + "rewards/rejected": -2.0394530296325684, + "sft_loss": 1.6521657705307007, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 8.457543080999693, + "learning_rate": 2.826811897166519e-06, + "logits/chosen": -0.06580647081136703, + "logits/rejected": -0.020910892635583878, + "logps/chosen": -1.5988067388534546, + "logps/rejected": -1.9672935009002686, + "loss": 0.7746, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5988067388534546, + "rewards/margins": 0.3684867322444916, + "rewards/rejected": -1.9672935009002686, + "sft_loss": 1.5970875024795532, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 6.277693217201755, + "learning_rate": 2.8246260570720673e-06, + "logits/chosen": -0.023764025419950485, + "logits/rejected": 0.2361760437488556, + "logps/chosen": -1.5750267505645752, + "logps/rejected": -2.120636463165283, + "loss": 0.7102, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5750267505645752, + "rewards/margins": 0.5456094145774841, + "rewards/rejected": -2.120636463165283, + "sft_loss": 1.624132752418518, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 9.984181907521135, + "learning_rate": 2.8224273654498007e-06, + "logits/chosen": -0.03629612922668457, + "logits/rejected": 0.047414567321538925, + "logps/chosen": -1.7118265628814697, + "logps/rejected": -1.9818274974822998, + "loss": 0.8381, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7118265628814697, + "rewards/margins": 0.2700011730194092, + "rewards/rejected": -1.9818274974822998, + "sft_loss": 1.7460143566131592, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 7.1133242656780125, + "learning_rate": 2.8202158436314348e-06, + "logits/chosen": -0.10063023865222931, + "logits/rejected": 0.27981701493263245, + "logps/chosen": -1.7411918640136719, + "logps/rejected": -2.2518210411071777, + "loss": 0.7536, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7411918640136719, + "rewards/margins": 0.5106293559074402, + "rewards/rejected": -2.2518210411071777, + "sft_loss": 1.7656304836273193, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 6.833573511308134, + "learning_rate": 2.817991513073163e-06, + "logits/chosen": -0.18759292364120483, + "logits/rejected": -0.015145483426749706, + "logps/chosen": -1.769029974937439, + "logps/rejected": -2.3505871295928955, + "loss": 0.7549, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.769029974937439, + "rewards/margins": 0.5815570950508118, + "rewards/rejected": -2.3505871295928955, + "sft_loss": 1.8363697528839111, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 4.852027119528442, + "learning_rate": 2.8157543953554515e-06, + "logits/chosen": -0.10282772779464722, + "logits/rejected": 0.0718497559428215, + "logps/chosen": -1.6821658611297607, + "logps/rejected": -2.2472102642059326, + "loss": 0.7334, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6821658611297607, + "rewards/margins": 0.5650441646575928, + "rewards/rejected": -2.2472102642059326, + "sft_loss": 1.7046318054199219, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 18.696305660313005, + "learning_rate": 2.813504512182825e-06, + "logits/chosen": -0.07099226117134094, + "logits/rejected": 0.09033173322677612, + "logps/chosen": -1.7145551443099976, + "logps/rejected": -2.405233383178711, + "loss": 0.7138, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7145551443099976, + "rewards/margins": 0.6906784772872925, + "rewards/rejected": -2.405233383178711, + "sft_loss": 1.7227901220321655, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 5.02265596060493, + "learning_rate": 2.811241885383661e-06, + "logits/chosen": -0.12921445071697235, + "logits/rejected": 0.052244264632463455, + "logps/chosen": -1.6103248596191406, + "logps/rejected": -2.2286019325256348, + "loss": 0.7146, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6103248596191406, + "rewards/margins": 0.6182770133018494, + "rewards/rejected": -2.2286019325256348, + "sft_loss": 1.6762888431549072, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 5.4476182172147585, + "learning_rate": 2.8089665369099737e-06, + "logits/chosen": -0.14373573660850525, + "logits/rejected": 0.03006540611386299, + "logps/chosen": -1.6545798778533936, + "logps/rejected": -2.0341227054595947, + "loss": 0.8217, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6545798778533936, + "rewards/margins": 0.37954264879226685, + "rewards/rejected": -2.0341227054595947, + "sft_loss": 1.6523634195327759, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 11.388814753067097, + "learning_rate": 2.806678488837205e-06, + "logits/chosen": -0.1133163794875145, + "logits/rejected": 0.06256228685379028, + "logps/chosen": -1.5948195457458496, + "logps/rejected": -2.063190221786499, + "loss": 0.7666, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5948195457458496, + "rewards/margins": 0.4683706760406494, + "rewards/rejected": -2.063190221786499, + "sft_loss": 1.6374050378799438, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 6.862074073482828, + "learning_rate": 2.804377763364006e-06, + "logits/chosen": -0.016701694577932358, + "logits/rejected": 0.14885933697223663, + "logps/chosen": -1.7333139181137085, + "logps/rejected": -2.1773841381073, + "loss": 0.7851, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7333139181137085, + "rewards/margins": 0.44407039880752563, + "rewards/rejected": -2.1773841381073, + "sft_loss": 1.7752745151519775, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 6.992114465819229, + "learning_rate": 2.8020643828120263e-06, + "logits/chosen": 0.005483886227011681, + "logits/rejected": 0.1073252409696579, + "logps/chosen": -1.7122163772583008, + "logps/rejected": -2.009945869445801, + "loss": 0.8039, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.7122163772583008, + "rewards/margins": 0.2977294325828552, + "rewards/rejected": -2.009945869445801, + "sft_loss": 1.707984209060669, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 6.161097410804302, + "learning_rate": 2.799738369625694e-06, + "logits/chosen": -0.2201133668422699, + "logits/rejected": -0.025361087173223495, + "logps/chosen": -1.7411420345306396, + "logps/rejected": -2.049943447113037, + "loss": 0.7949, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.7411420345306396, + "rewards/margins": 0.30880117416381836, + "rewards/rejected": -2.049943447113037, + "sft_loss": 1.7520923614501953, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 5.8230421438631845, + "learning_rate": 2.7973997463719993e-06, + "logits/chosen": -0.07737503945827484, + "logits/rejected": 0.17626357078552246, + "logps/chosen": -1.4855806827545166, + "logps/rejected": -1.9931854009628296, + "loss": 0.7551, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4855806827545166, + "rewards/margins": 0.507604718208313, + "rewards/rejected": -1.9931854009628296, + "sft_loss": 1.5371906757354736, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 7.449046474721441, + "learning_rate": 2.7950485357402754e-06, + "logits/chosen": -0.0965823158621788, + "logits/rejected": 0.13526314496994019, + "logps/chosen": -1.5863357782363892, + "logps/rejected": -1.9766420125961304, + "loss": 0.7637, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5863357782363892, + "rewards/margins": 0.390306293964386, + "rewards/rejected": -1.9766420125961304, + "sft_loss": 1.6442428827285767, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 9.240029352233087, + "learning_rate": 2.7926847605419776e-06, + "logits/chosen": 0.014005353674292564, + "logits/rejected": 0.19778305292129517, + "logps/chosen": -1.5677052736282349, + "logps/rejected": -1.7215681076049805, + "loss": 0.8335, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.5677052736282349, + "rewards/margins": 0.15386290848255157, + "rewards/rejected": -1.7215681076049805, + "sft_loss": 1.5489507913589478, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 7.163399271299538, + "learning_rate": 2.7903084437104633e-06, + "logits/chosen": -0.013630482368171215, + "logits/rejected": 0.17022158205509186, + "logps/chosen": -1.5031455755233765, + "logps/rejected": -2.0510964393615723, + "loss": 0.7328, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5031455755233765, + "rewards/margins": 0.547950804233551, + "rewards/rejected": -2.0510964393615723, + "sft_loss": 1.5734002590179443, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 6.1120135616231925, + "learning_rate": 2.787919608300769e-06, + "logits/chosen": -0.031419310718774796, + "logits/rejected": 0.11188580840826035, + "logps/chosen": -1.590998649597168, + "logps/rejected": -2.08305287361145, + "loss": 0.7482, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.590998649597168, + "rewards/margins": 0.4920540750026703, + "rewards/rejected": -2.08305287361145, + "sft_loss": 1.6233997344970703, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 9.09816129229086, + "learning_rate": 2.785518277489387e-06, + "logits/chosen": -0.1941632479429245, + "logits/rejected": -0.026421820744872093, + "logps/chosen": -1.6917638778686523, + "logps/rejected": -2.080397129058838, + "loss": 0.7829, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6917638778686523, + "rewards/margins": 0.3886331617832184, + "rewards/rejected": -2.080397129058838, + "sft_loss": 1.698979139328003, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 6.987506345238123, + "learning_rate": 2.783104474574038e-06, + "logits/chosen": -0.02514765039086342, + "logits/rejected": 0.04142391309142113, + "logps/chosen": -1.6556907892227173, + "logps/rejected": -2.2444541454315186, + "loss": 0.7402, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6556907892227173, + "rewards/margins": 0.588763415813446, + "rewards/rejected": -2.2444541454315186, + "sft_loss": 1.7001619338989258, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 5.335480638813095, + "learning_rate": 2.7806782229734495e-06, + "logits/chosen": -0.1490539163351059, + "logits/rejected": -0.008489152416586876, + "logps/chosen": -1.6659870147705078, + "logps/rejected": -2.0349597930908203, + "loss": 0.7977, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6659870147705078, + "rewards/margins": 0.3689727187156677, + "rewards/rejected": -2.0349597930908203, + "sft_loss": 1.715635061264038, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 10.068039319515478, + "learning_rate": 2.7782395462271247e-06, + "logits/chosen": -0.21546879410743713, + "logits/rejected": 0.06337271630764008, + "logps/chosen": -1.651076316833496, + "logps/rejected": -2.0089659690856934, + "loss": 0.7988, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.651076316833496, + "rewards/margins": 0.3578898012638092, + "rewards/rejected": -2.0089659690856934, + "sft_loss": 1.7310740947723389, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 7.27339121991531, + "learning_rate": 2.7757884679951167e-06, + "logits/chosen": -0.03317772597074509, + "logits/rejected": 0.07568792998790741, + "logps/chosen": -1.6018059253692627, + "logps/rejected": -1.968252182006836, + "loss": 0.8057, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6018059253692627, + "rewards/margins": 0.36644667387008667, + "rewards/rejected": -1.968252182006836, + "sft_loss": 1.6354246139526367, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 6.348609737780041, + "learning_rate": 2.7733250120577967e-06, + "logits/chosen": -0.13050243258476257, + "logits/rejected": 0.08431114256381989, + "logps/chosen": -1.5470638275146484, + "logps/rejected": -2.0073821544647217, + "loss": 0.7476, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5470638275146484, + "rewards/margins": 0.46031856536865234, + "rewards/rejected": -2.0073821544647217, + "sft_loss": 1.5971641540527344, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 5.868396696875582, + "learning_rate": 2.770849202315625e-06, + "logits/chosen": -0.11732141673564911, + "logits/rejected": 0.11058878898620605, + "logps/chosen": -1.561550498008728, + "logps/rejected": -2.060333728790283, + "loss": 0.7369, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.561550498008728, + "rewards/margins": 0.4987828731536865, + "rewards/rejected": -2.060333728790283, + "sft_loss": 1.5517923831939697, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 6.244212935607668, + "learning_rate": 2.768361062788919e-06, + "logits/chosen": -0.06370851397514343, + "logits/rejected": 0.10079216957092285, + "logps/chosen": -1.6988623142242432, + "logps/rejected": -2.111222743988037, + "loss": 0.7808, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6988623142242432, + "rewards/margins": 0.4123605191707611, + "rewards/rejected": -2.111222743988037, + "sft_loss": 1.7255029678344727, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 6.201364762208477, + "learning_rate": 2.7658606176176186e-06, + "logits/chosen": -0.12157303094863892, + "logits/rejected": -0.08812706172466278, + "logps/chosen": -1.6759192943572998, + "logps/rejected": -2.098238706588745, + "loss": 0.7934, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.6759192943572998, + "rewards/margins": 0.4223194122314453, + "rewards/rejected": -2.098238706588745, + "sft_loss": 1.7222354412078857, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 5.880804483864086, + "learning_rate": 2.763347891061054e-06, + "logits/chosen": -0.16590115427970886, + "logits/rejected": 0.06696538627147675, + "logps/chosen": -1.616180658340454, + "logps/rejected": -2.0431764125823975, + "loss": 0.7687, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.616180658340454, + "rewards/margins": 0.4269959032535553, + "rewards/rejected": -2.0431764125823975, + "sft_loss": 1.6643760204315186, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 5.8402211520482545, + "learning_rate": 2.7608229074977103e-06, + "logits/chosen": 0.031211012974381447, + "logits/rejected": 0.17653919756412506, + "logps/chosen": -1.5481231212615967, + "logps/rejected": -2.1435084342956543, + "loss": 0.7344, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.5481231212615967, + "rewards/margins": 0.5953856110572815, + "rewards/rejected": -2.1435084342956543, + "sft_loss": 1.5866037607192993, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 7.676572876305737, + "learning_rate": 2.758285691424988e-06, + "logits/chosen": -0.013167837634682655, + "logits/rejected": 0.19646652042865753, + "logps/chosen": -1.597267508506775, + "logps/rejected": -2.110203266143799, + "loss": 0.7468, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.597267508506775, + "rewards/margins": 0.5129359364509583, + "rewards/rejected": -2.110203266143799, + "sft_loss": 1.5865503549575806, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 6.985150266196622, + "learning_rate": 2.7557362674589687e-06, + "logits/chosen": -0.09934534132480621, + "logits/rejected": 0.03581539914011955, + "logps/chosen": -1.5587165355682373, + "logps/rejected": -2.0029618740081787, + "loss": 0.7548, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5587165355682373, + "rewards/margins": 0.44424518942832947, + "rewards/rejected": -2.0029618740081787, + "sft_loss": 1.5543994903564453, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 8.482719831775292, + "learning_rate": 2.753174660334175e-06, + "logits/chosen": -0.07365381717681885, + "logits/rejected": 0.04320163652300835, + "logps/chosen": -1.8012605905532837, + "logps/rejected": -2.065605640411377, + "loss": 0.8401, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.8012605905532837, + "rewards/margins": 0.26434528827667236, + "rewards/rejected": -2.065605640411377, + "sft_loss": 1.815129280090332, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 9.343651805064166, + "learning_rate": 2.750600894903331e-06, + "logits/chosen": -0.1390191912651062, + "logits/rejected": -0.007775820791721344, + "logps/chosen": -1.685133695602417, + "logps/rejected": -2.1161274909973145, + "loss": 0.811, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.685133695602417, + "rewards/margins": 0.4309937357902527, + "rewards/rejected": -2.1161274909973145, + "sft_loss": 1.7652587890625, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 6.14800348144563, + "learning_rate": 2.7480149961371194e-06, + "logits/chosen": -0.028079237788915634, + "logits/rejected": 0.07078806310892105, + "logps/chosen": -1.6217378377914429, + "logps/rejected": -2.157701253890991, + "loss": 0.7215, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6217378377914429, + "rewards/margins": 0.5359634160995483, + "rewards/rejected": -2.157701253890991, + "sft_loss": 1.6147205829620361, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 4.329378205807125, + "learning_rate": 2.745416989123942e-06, + "logits/chosen": -0.07498262822628021, + "logits/rejected": 0.3165915608406067, + "logps/chosen": -1.6001355648040771, + "logps/rejected": -2.1155784130096436, + "loss": 0.7218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6001355648040771, + "rewards/margins": 0.5154424905776978, + "rewards/rejected": -2.1155784130096436, + "sft_loss": 1.60519278049469, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 6.007531073170756, + "learning_rate": 2.7428068990696735e-06, + "logits/chosen": 0.04040871933102608, + "logits/rejected": 0.12215709686279297, + "logps/chosen": -1.5383670330047607, + "logps/rejected": -1.989495038986206, + "loss": 0.7401, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.5383670330047607, + "rewards/margins": 0.45112818479537964, + "rewards/rejected": -1.989495038986206, + "sft_loss": 1.5585277080535889, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 6.64268477918211, + "learning_rate": 2.7401847512974194e-06, + "logits/chosen": 0.011279207654297352, + "logits/rejected": 0.12318231165409088, + "logps/chosen": -1.601244568824768, + "logps/rejected": -2.068206310272217, + "loss": 0.7514, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.601244568824768, + "rewards/margins": 0.46696168184280396, + "rewards/rejected": -2.068206310272217, + "sft_loss": 1.7155154943466187, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 5.734341288848589, + "learning_rate": 2.7375505712472695e-06, + "logits/chosen": -0.02312772534787655, + "logits/rejected": 0.23983144760131836, + "logps/chosen": -1.6255518198013306, + "logps/rejected": -2.0686745643615723, + "loss": 0.7814, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.6255518198013306, + "rewards/margins": 0.4431230425834656, + "rewards/rejected": -2.0686745643615723, + "sft_loss": 1.6115245819091797, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 10.20412258403517, + "learning_rate": 2.734904384476049e-06, + "logits/chosen": -0.01249808631837368, + "logits/rejected": 0.1528748869895935, + "logps/chosen": -1.6915109157562256, + "logps/rejected": -2.112687587738037, + "loss": 0.7771, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6915109157562256, + "rewards/margins": 0.4211767613887787, + "rewards/rejected": -2.112687587738037, + "sft_loss": 1.6461522579193115, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 7.623823205688982, + "learning_rate": 2.732246216657075e-06, + "logits/chosen": -0.01207329798489809, + "logits/rejected": 0.24696707725524902, + "logps/chosen": -1.6187397241592407, + "logps/rejected": -2.0573651790618896, + "loss": 0.7527, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6187397241592407, + "rewards/margins": 0.43862539529800415, + "rewards/rejected": -2.0573651790618896, + "sft_loss": 1.648451805114746, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 8.944893339121828, + "learning_rate": 2.729576093579902e-06, + "logits/chosen": 0.03348899632692337, + "logits/rejected": 0.25046640634536743, + "logps/chosen": -1.6148052215576172, + "logps/rejected": -2.2734577655792236, + "loss": 0.7025, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6148052215576172, + "rewards/margins": 0.6586524248123169, + "rewards/rejected": -2.2734577655792236, + "sft_loss": 1.6397300958633423, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 6.307415239596535, + "learning_rate": 2.726894041150077e-06, + "logits/chosen": 0.03806239366531372, + "logits/rejected": 0.25313812494277954, + "logps/chosen": -1.619058609008789, + "logps/rejected": -2.052539348602295, + "loss": 0.7735, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.619058609008789, + "rewards/margins": 0.4334811270236969, + "rewards/rejected": -2.052539348602295, + "sft_loss": 1.6786972284317017, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 7.4342369032944555, + "learning_rate": 2.7242000853888833e-06, + "logits/chosen": -0.17027874290943146, + "logits/rejected": 0.17580825090408325, + "logps/chosen": -1.7059392929077148, + "logps/rejected": -2.3058700561523438, + "loss": 0.7296, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7059392929077148, + "rewards/margins": 0.5999307036399841, + "rewards/rejected": -2.3058700561523438, + "sft_loss": 1.739965796470642, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 7.867776592990231, + "learning_rate": 2.7214942524330918e-06, + "logits/chosen": -0.1580534279346466, + "logits/rejected": 0.19921769201755524, + "logps/chosen": -1.7531449794769287, + "logps/rejected": -2.4730262756347656, + "loss": 0.7485, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7531449794769287, + "rewards/margins": 0.7198811769485474, + "rewards/rejected": -2.4730262756347656, + "sft_loss": 1.7198894023895264, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 8.211176392099416, + "learning_rate": 2.7187765685347063e-06, + "logits/chosen": -0.014283919706940651, + "logits/rejected": 0.07657746970653534, + "logps/chosen": -1.8897743225097656, + "logps/rejected": -2.3584890365600586, + "loss": 0.8054, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.8897743225097656, + "rewards/margins": 0.46871480345726013, + "rewards/rejected": -2.3584890365600586, + "sft_loss": 1.9435539245605469, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 6.126682030455825, + "learning_rate": 2.7160470600607076e-06, + "logits/chosen": -0.01688101515173912, + "logits/rejected": 0.08603396266698837, + "logps/chosen": -1.7661564350128174, + "logps/rejected": -2.2313408851623535, + "loss": 0.7853, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7661564350128174, + "rewards/margins": 0.46518421173095703, + "rewards/rejected": -2.2313408851623535, + "sft_loss": 1.8172565698623657, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 8.24892072870615, + "learning_rate": 2.7133057534927986e-06, + "logits/chosen": 0.03328476846218109, + "logits/rejected": 0.078876793384552, + "logps/chosen": -1.6054766178131104, + "logps/rejected": -1.9629688262939453, + "loss": 0.7886, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6054766178131104, + "rewards/margins": 0.3574923872947693, + "rewards/rejected": -1.9629688262939453, + "sft_loss": 1.6771936416625977, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 5.792199888523707, + "learning_rate": 2.710552675427148e-06, + "logits/chosen": 0.02124195173382759, + "logits/rejected": 0.15871360898017883, + "logps/chosen": -1.5020825862884521, + "logps/rejected": -1.7976001501083374, + "loss": 0.7749, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5020825862884521, + "rewards/margins": 0.2955175042152405, + "rewards/rejected": -1.7976001501083374, + "sft_loss": 1.5216575860977173, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 7.840732618913243, + "learning_rate": 2.707787852574131e-06, + "logits/chosen": 0.045780718326568604, + "logits/rejected": 0.37935394048690796, + "logps/chosen": -1.541059970855713, + "logps/rejected": -1.887292504310608, + "loss": 0.7606, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.541059970855713, + "rewards/margins": 0.3462323546409607, + "rewards/rejected": -1.887292504310608, + "sft_loss": 1.5794686079025269, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 5.1328056249816685, + "learning_rate": 2.7050113117580716e-06, + "logits/chosen": 0.0036704824306070805, + "logits/rejected": 0.23880800604820251, + "logps/chosen": -1.5183892250061035, + "logps/rejected": -1.9667367935180664, + "loss": 0.7293, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5183892250061035, + "rewards/margins": 0.44834762811660767, + "rewards/rejected": -1.9667367935180664, + "sft_loss": 1.5417373180389404, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 8.270899666456597, + "learning_rate": 2.70222307991698e-06, + "logits/chosen": -0.07949081808328629, + "logits/rejected": 0.018253307789564133, + "logps/chosen": -1.5519993305206299, + "logps/rejected": -1.8914324045181274, + "loss": 0.7772, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5519993305206299, + "rewards/margins": 0.3394331634044647, + "rewards/rejected": -1.8914324045181274, + "sft_loss": 1.6139189004898071, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 8.178223146247955, + "learning_rate": 2.6994231841022947e-06, + "logits/chosen": 0.028050830587744713, + "logits/rejected": 0.12468210607767105, + "logps/chosen": -1.7248175144195557, + "logps/rejected": -1.9740760326385498, + "loss": 0.8437, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7248175144195557, + "rewards/margins": 0.24925847351551056, + "rewards/rejected": -1.9740760326385498, + "sft_loss": 1.7152729034423828, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 5.537793636346068, + "learning_rate": 2.6966116514786166e-06, + "logits/chosen": -0.11753810942173004, + "logits/rejected": 0.15751810371875763, + "logps/chosen": -1.6346890926361084, + "logps/rejected": -2.2291836738586426, + "loss": 0.7152, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6346890926361084, + "rewards/margins": 0.5944945216178894, + "rewards/rejected": -2.2291836738586426, + "sft_loss": 1.6563717126846313, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.46628129482269287, + "eval_logits/rejected": 0.6004635691642761, + "eval_logps/chosen": -1.6570273637771606, + "eval_logps/rejected": -2.1579315662384033, + "eval_loss": 0.7451155781745911, + "eval_rewards/accuracies": 0.6795251965522766, + "eval_rewards/chosen": -1.6570273637771606, + "eval_rewards/margins": 0.5009041428565979, + "eval_rewards/rejected": -2.1579315662384033, + "eval_runtime": 44.5394, + "eval_samples_per_second": 30.198, + "eval_sft_loss": 1.6680430173873901, + "eval_steps_per_second": 7.566, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 6.601498318061748, + "learning_rate": 2.6937885093234477e-06, + "logits/chosen": -0.11071095615625381, + "logits/rejected": 0.22233088314533234, + "logps/chosen": -1.6684538125991821, + "logps/rejected": -2.2091896533966064, + "loss": 0.7292, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6684538125991821, + "rewards/margins": 0.5407360792160034, + "rewards/rejected": -2.2091896533966064, + "sft_loss": 1.6963183879852295, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 6.60084492891464, + "learning_rate": 2.6909537850269256e-06, + "logits/chosen": -0.10238151252269745, + "logits/rejected": 0.16353382170200348, + "logps/chosen": -1.6431251764297485, + "logps/rejected": -2.2469396591186523, + "loss": 0.7289, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6431251764297485, + "rewards/margins": 0.6038146018981934, + "rewards/rejected": -2.2469396591186523, + "sft_loss": 1.7117202281951904, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 10.47007979184553, + "learning_rate": 2.688107506091558e-06, + "logits/chosen": 0.021958637982606888, + "logits/rejected": 0.21035261452198029, + "logps/chosen": -1.7854686975479126, + "logps/rejected": -2.2965052127838135, + "loss": 0.8082, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7854686975479126, + "rewards/margins": 0.5110365152359009, + "rewards/rejected": -2.2965052127838135, + "sft_loss": 1.8058786392211914, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 6.514639401513543, + "learning_rate": 2.6852497001319555e-06, + "logits/chosen": 0.053987015038728714, + "logits/rejected": 0.2950461804866791, + "logps/chosen": -1.567812204360962, + "logps/rejected": -2.0840001106262207, + "loss": 0.7485, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.567812204360962, + "rewards/margins": 0.5161879658699036, + "rewards/rejected": -2.0840001106262207, + "sft_loss": 1.611684799194336, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 8.313247489947376, + "learning_rate": 2.682380394874564e-06, + "logits/chosen": 0.1495714783668518, + "logits/rejected": 0.21326008439064026, + "logps/chosen": -1.6688334941864014, + "logps/rejected": -1.993452787399292, + "loss": 0.8008, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6688334941864014, + "rewards/margins": 0.32461923360824585, + "rewards/rejected": -1.993452787399292, + "sft_loss": 1.6152689456939697, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 6.079190187498352, + "learning_rate": 2.6794996181573953e-06, + "logits/chosen": 0.08869186043739319, + "logits/rejected": 0.32504457235336304, + "logps/chosen": -1.5763978958129883, + "logps/rejected": -1.9891843795776367, + "loss": 0.7714, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5763978958129883, + "rewards/margins": 0.41278642416000366, + "rewards/rejected": -1.9891843795776367, + "sft_loss": 1.5958096981048584, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 7.231629336566373, + "learning_rate": 2.6766073979297584e-06, + "logits/chosen": -0.017764057964086533, + "logits/rejected": 0.1814512312412262, + "logps/chosen": -1.5102531909942627, + "logps/rejected": -2.041884422302246, + "loss": 0.7378, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5102531909942627, + "rewards/margins": 0.5316312909126282, + "rewards/rejected": -2.041884422302246, + "sft_loss": 1.5394649505615234, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 6.9019460717918895, + "learning_rate": 2.6737037622519866e-06, + "logits/chosen": 0.023942243307828903, + "logits/rejected": 0.23042920231819153, + "logps/chosen": -1.5446743965148926, + "logps/rejected": -2.143510580062866, + "loss": 0.7289, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.5446743965148926, + "rewards/margins": 0.5988361239433289, + "rewards/rejected": -2.143510580062866, + "sft_loss": 1.5816099643707275, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 7.135847127478541, + "learning_rate": 2.670788739295166e-06, + "logits/chosen": 0.11366844177246094, + "logits/rejected": 0.2004258632659912, + "logps/chosen": -1.556762456893921, + "logps/rejected": -1.9616931676864624, + "loss": 0.7568, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.556762456893921, + "rewards/margins": 0.40493065118789673, + "rewards/rejected": -1.9616931676864624, + "sft_loss": 1.5798929929733276, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 8.48477994398459, + "learning_rate": 2.6678623573408613e-06, + "logits/chosen": 0.19071733951568604, + "logits/rejected": 0.287414014339447, + "logps/chosen": -1.592995524406433, + "logps/rejected": -2.1026222705841064, + "loss": 0.7272, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.592995524406433, + "rewards/margins": 0.5096268057823181, + "rewards/rejected": -2.1026222705841064, + "sft_loss": 1.5550458431243896, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 7.666285597766726, + "learning_rate": 2.664924644780844e-06, + "logits/chosen": 0.09412811696529388, + "logits/rejected": 0.25759047269821167, + "logps/chosen": -1.665856957435608, + "logps/rejected": -2.1644914150238037, + "loss": 0.7627, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.665856957435608, + "rewards/margins": 0.498634397983551, + "rewards/rejected": -2.1644914150238037, + "sft_loss": 1.6397167444229126, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 8.385920510432712, + "learning_rate": 2.661975630116813e-06, + "logits/chosen": 0.15583749115467072, + "logits/rejected": 0.21677199006080627, + "logps/chosen": -1.567179560661316, + "logps/rejected": -2.111881971359253, + "loss": 0.7222, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.567179560661316, + "rewards/margins": 0.544702410697937, + "rewards/rejected": -2.111881971359253, + "sft_loss": 1.5149648189544678, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 7.318357315401352, + "learning_rate": 2.6590153419601236e-06, + "logits/chosen": 0.17675212025642395, + "logits/rejected": 0.273038387298584, + "logps/chosen": -1.7771793603897095, + "logps/rejected": -2.1778042316436768, + "loss": 0.8118, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.7771793603897095, + "rewards/margins": 0.40062466263771057, + "rewards/rejected": -2.1778042316436768, + "sft_loss": 1.7589333057403564, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 6.249906925635093, + "learning_rate": 2.656043809031503e-06, + "logits/chosen": 0.15232793986797333, + "logits/rejected": 0.41597262024879456, + "logps/chosen": -1.7666094303131104, + "logps/rejected": -2.173813819885254, + "loss": 0.8284, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7666094303131104, + "rewards/margins": 0.4072045683860779, + "rewards/rejected": -2.173813819885254, + "sft_loss": 1.6762611865997314, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 8.986403126194638, + "learning_rate": 2.6530610601607764e-06, + "logits/chosen": 0.1481233537197113, + "logits/rejected": 0.4590158462524414, + "logps/chosen": -1.7123301029205322, + "logps/rejected": -2.2884089946746826, + "loss": 0.7548, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7123301029205322, + "rewards/margins": 0.5760786533355713, + "rewards/rejected": -2.2884089946746826, + "sft_loss": 1.7525371313095093, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 7.50493689044621, + "learning_rate": 2.6500671242865877e-06, + "logits/chosen": 0.04240292310714722, + "logits/rejected": 0.22901475429534912, + "logps/chosen": -1.7242082357406616, + "logps/rejected": -2.1248300075531006, + "loss": 0.7762, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.7242082357406616, + "rewards/margins": 0.40062183141708374, + "rewards/rejected": -2.1248300075531006, + "sft_loss": 1.7412922382354736, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 7.410719554486841, + "learning_rate": 2.6470620304561147e-06, + "logits/chosen": 0.1224837675690651, + "logits/rejected": 0.4860759675502777, + "logps/chosen": -1.6327342987060547, + "logps/rejected": -2.1257483959198, + "loss": 0.7689, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6327342987060547, + "rewards/margins": 0.4930140972137451, + "rewards/rejected": -2.1257483959198, + "sft_loss": 1.6574312448501587, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 7.020214850908995, + "learning_rate": 2.6440458078247914e-06, + "logits/chosen": 0.14286457002162933, + "logits/rejected": 0.45216941833496094, + "logps/chosen": -1.5427910089492798, + "logps/rejected": -2.09596586227417, + "loss": 0.7161, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5427910089492798, + "rewards/margins": 0.553174614906311, + "rewards/rejected": -2.09596586227417, + "sft_loss": 1.6083276271820068, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 11.83409335374771, + "learning_rate": 2.641018485656023e-06, + "logits/chosen": 0.004470625426620245, + "logits/rejected": 0.2434651106595993, + "logps/chosen": -1.6732524633407593, + "logps/rejected": -2.092607021331787, + "loss": 0.7896, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6732524633407593, + "rewards/margins": 0.41935428977012634, + "rewards/rejected": -2.092607021331787, + "sft_loss": 1.71701180934906, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 6.48895966929563, + "learning_rate": 2.6379800933209028e-06, + "logits/chosen": 0.2479771375656128, + "logits/rejected": 0.1528288722038269, + "logps/chosen": -1.6353261470794678, + "logps/rejected": -1.9076725244522095, + "loss": 0.8118, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6353261470794678, + "rewards/margins": 0.2723463475704193, + "rewards/rejected": -1.9076725244522095, + "sft_loss": 1.6776100397109985, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 8.304131930398496, + "learning_rate": 2.634930660297926e-06, + "logits/chosen": 0.2298423945903778, + "logits/rejected": 0.45017942786216736, + "logps/chosen": -1.585171103477478, + "logps/rejected": -2.002087116241455, + "loss": 0.7519, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.585171103477478, + "rewards/margins": 0.4169161915779114, + "rewards/rejected": -2.002087116241455, + "sft_loss": 1.6092761754989624, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 7.157798481884509, + "learning_rate": 2.631870216172705e-06, + "logits/chosen": 0.20391225814819336, + "logits/rejected": 0.35752061009407043, + "logps/chosen": -1.6702439785003662, + "logps/rejected": -2.085132122039795, + "loss": 0.7718, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6702439785003662, + "rewards/margins": 0.4148883819580078, + "rewards/rejected": -2.085132122039795, + "sft_loss": 1.7260020971298218, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 13.800397931675189, + "learning_rate": 2.6287987906376834e-06, + "logits/chosen": 0.18047472834587097, + "logits/rejected": 0.4845646023750305, + "logps/chosen": -1.7837867736816406, + "logps/rejected": -2.2137465476989746, + "loss": 0.8152, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7837867736816406, + "rewards/margins": 0.42995983362197876, + "rewards/rejected": -2.2137465476989746, + "sft_loss": 1.7706577777862549, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 7.729179220447571, + "learning_rate": 2.6257164134918435e-06, + "logits/chosen": 0.22672733664512634, + "logits/rejected": 0.3549592196941376, + "logps/chosen": -1.5702648162841797, + "logps/rejected": -2.2024621963500977, + "loss": 0.7135, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.5702648162841797, + "rewards/margins": 0.6321974992752075, + "rewards/rejected": -2.2024621963500977, + "sft_loss": 1.6143814325332642, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 4.679694118516009, + "learning_rate": 2.622623114640423e-06, + "logits/chosen": 0.28641635179519653, + "logits/rejected": 0.4558026194572449, + "logps/chosen": -1.6455614566802979, + "logps/rejected": -2.2941269874572754, + "loss": 0.7082, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6455614566802979, + "rewards/margins": 0.6485655903816223, + "rewards/rejected": -2.2941269874572754, + "sft_loss": 1.7281744480133057, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 6.28941507290267, + "learning_rate": 2.6195189240946205e-06, + "logits/chosen": 0.32530251145362854, + "logits/rejected": 0.41650062799453735, + "logps/chosen": -1.6033023595809937, + "logps/rejected": -1.9561445713043213, + "loss": 0.7921, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6033023595809937, + "rewards/margins": 0.35284224152565, + "rewards/rejected": -1.9561445713043213, + "sft_loss": 1.6567922830581665, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 6.87768035184655, + "learning_rate": 2.6164038719713065e-06, + "logits/chosen": 0.011040126904845238, + "logits/rejected": 0.3350133001804352, + "logps/chosen": -1.5925350189208984, + "logps/rejected": -2.326432228088379, + "loss": 0.7131, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5925350189208984, + "rewards/margins": 0.73389732837677, + "rewards/rejected": -2.326432228088379, + "sft_loss": 1.5929383039474487, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 9.636323633968454, + "learning_rate": 2.6132779884927303e-06, + "logits/chosen": 0.07610784471035004, + "logits/rejected": 0.3584909737110138, + "logps/chosen": -1.6358665227890015, + "logps/rejected": -2.1494953632354736, + "loss": 0.7415, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6358665227890015, + "rewards/margins": 0.5136286616325378, + "rewards/rejected": -2.1494953632354736, + "sft_loss": 1.5950847864151, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 7.027797013049016, + "learning_rate": 2.6101413039862274e-06, + "logits/chosen": 0.23933513462543488, + "logits/rejected": 0.28316718339920044, + "logps/chosen": -1.6864540576934814, + "logps/rejected": -2.1790852546691895, + "loss": 0.7679, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6864540576934814, + "rewards/margins": 0.49263089895248413, + "rewards/rejected": -2.1790852546691895, + "sft_loss": 1.7424131631851196, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 12.721083258187122, + "learning_rate": 2.606993848883924e-06, + "logits/chosen": 0.09545941650867462, + "logits/rejected": 0.19137129187583923, + "logps/chosen": -1.8484070301055908, + "logps/rejected": -2.3486435413360596, + "loss": 0.8024, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.8484070301055908, + "rewards/margins": 0.5002365708351135, + "rewards/rejected": -2.3486435413360596, + "sft_loss": 1.8460218906402588, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 8.609108934546553, + "learning_rate": 2.6038356537224433e-06, + "logits/chosen": 0.06818292289972305, + "logits/rejected": 0.2434753179550171, + "logps/chosen": -1.664947509765625, + "logps/rejected": -2.1902339458465576, + "loss": 0.7376, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.664947509765625, + "rewards/margins": 0.5252864956855774, + "rewards/rejected": -2.1902339458465576, + "sft_loss": 1.6970077753067017, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 7.1753681171926935, + "learning_rate": 2.6006667491426098e-06, + "logits/chosen": 0.09856332838535309, + "logits/rejected": 0.31549859046936035, + "logps/chosen": -1.640303373336792, + "logps/rejected": -2.1187996864318848, + "loss": 0.7552, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.640303373336792, + "rewards/margins": 0.47849640250205994, + "rewards/rejected": -2.1187996864318848, + "sft_loss": 1.704172134399414, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 9.577984645970636, + "learning_rate": 2.5974871658891483e-06, + "logits/chosen": 0.15941819548606873, + "logits/rejected": 0.21040339767932892, + "logps/chosen": -1.6682817935943604, + "logps/rejected": -2.1734302043914795, + "loss": 0.7621, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6682817935943604, + "rewards/margins": 0.5051483511924744, + "rewards/rejected": -2.1734302043914795, + "sft_loss": 1.6816104650497437, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 7.484776619651517, + "learning_rate": 2.59429693481039e-06, + "logits/chosen": 0.1602596491575241, + "logits/rejected": 0.38191017508506775, + "logps/chosen": -1.6603100299835205, + "logps/rejected": -2.0156192779541016, + "loss": 0.787, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6603100299835205, + "rewards/margins": 0.355309396982193, + "rewards/rejected": -2.0156192779541016, + "sft_loss": 1.7396914958953857, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 5.852626179114656, + "learning_rate": 2.5910960868579707e-06, + "logits/chosen": 0.08808886259794235, + "logits/rejected": 0.24856379628181458, + "logps/chosen": -1.573021411895752, + "logps/rejected": -2.024200916290283, + "loss": 0.748, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.573021411895752, + "rewards/margins": 0.451179563999176, + "rewards/rejected": -2.024200916290283, + "sft_loss": 1.6066830158233643, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 5.772519203781361, + "learning_rate": 2.5878846530865316e-06, + "logits/chosen": 0.04618818685412407, + "logits/rejected": 0.22156529128551483, + "logps/chosen": -1.5426143407821655, + "logps/rejected": -1.9907680749893188, + "loss": 0.7462, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5426143407821655, + "rewards/margins": 0.4481539726257324, + "rewards/rejected": -1.9907680749893188, + "sft_loss": 1.5462530851364136, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 6.2206325351188125, + "learning_rate": 2.584662664653417e-06, + "logits/chosen": 0.1688314825296402, + "logits/rejected": 0.2749001383781433, + "logps/chosen": -1.5096161365509033, + "logps/rejected": -1.8245433568954468, + "loss": 0.7684, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5096161365509033, + "rewards/margins": 0.31492722034454346, + "rewards/rejected": -1.8245433568954468, + "sft_loss": 1.5106897354125977, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 6.730515249486428, + "learning_rate": 2.5814301528183724e-06, + "logits/chosen": 0.2601849436759949, + "logits/rejected": 0.36312350630760193, + "logps/chosen": -1.6283996105194092, + "logps/rejected": -1.9735686779022217, + "loss": 0.7775, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6283996105194092, + "rewards/margins": 0.3451687693595886, + "rewards/rejected": -1.9735686779022217, + "sft_loss": 1.647060751914978, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 7.436456784173762, + "learning_rate": 2.5781871489432425e-06, + "logits/chosen": 0.04569276422262192, + "logits/rejected": 0.25311240553855896, + "logps/chosen": -1.5339930057525635, + "logps/rejected": -2.056906223297119, + "loss": 0.7323, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5339930057525635, + "rewards/margins": 0.5229132175445557, + "rewards/rejected": -2.056906223297119, + "sft_loss": 1.5986318588256836, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 6.760066420168773, + "learning_rate": 2.5749336844916644e-06, + "logits/chosen": 0.04725943133234978, + "logits/rejected": 0.14323343336582184, + "logps/chosen": -1.6129106283187866, + "logps/rejected": -2.0528836250305176, + "loss": 0.7555, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6129106283187866, + "rewards/margins": 0.43997272849082947, + "rewards/rejected": -2.0528836250305176, + "sft_loss": 1.6782392263412476, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 9.43713410579148, + "learning_rate": 2.5716697910287653e-06, + "logits/chosen": -0.09104229509830475, + "logits/rejected": 0.10284809023141861, + "logps/chosen": -1.605081558227539, + "logps/rejected": -2.299532890319824, + "loss": 0.695, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.605081558227539, + "rewards/margins": 0.6944512128829956, + "rewards/rejected": -2.299532890319824, + "sft_loss": 1.6521618366241455, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 5.794534536309435, + "learning_rate": 2.5683955002208533e-06, + "logits/chosen": 0.1189044937491417, + "logits/rejected": 0.3134990334510803, + "logps/chosen": -1.6299169063568115, + "logps/rejected": -2.1406655311584473, + "loss": 0.7279, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6299169063568115, + "rewards/margins": 0.5107485055923462, + "rewards/rejected": -2.1406655311584473, + "sft_loss": 1.6376746892929077, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 8.353254849903834, + "learning_rate": 2.5651108438351125e-06, + "logits/chosen": 0.058274924755096436, + "logits/rejected": 0.3000137209892273, + "logps/chosen": -1.657879114151001, + "logps/rejected": -2.141011953353882, + "loss": 0.7511, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.657879114151001, + "rewards/margins": 0.48313283920288086, + "rewards/rejected": -2.141011953353882, + "sft_loss": 1.7078145742416382, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 9.533470110124561, + "learning_rate": 2.5618158537392933e-06, + "logits/chosen": 0.09321188926696777, + "logits/rejected": 0.18731586635112762, + "logps/chosen": -1.7326152324676514, + "logps/rejected": -2.2076475620269775, + "loss": 0.7547, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7326152324676514, + "rewards/margins": 0.475032240152359, + "rewards/rejected": -2.2076475620269775, + "sft_loss": 1.6305125951766968, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 7.706042556473839, + "learning_rate": 2.5585105619014042e-06, + "logits/chosen": -0.011135217733681202, + "logits/rejected": 0.25493383407592773, + "logps/chosen": -1.6696735620498657, + "logps/rejected": -2.350343942642212, + "loss": 0.726, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6696735620498657, + "rewards/margins": 0.6806705594062805, + "rewards/rejected": -2.350343942642212, + "sft_loss": 1.7117812633514404, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 5.795157279285816, + "learning_rate": 2.555195000389401e-06, + "logits/chosen": 0.1898316591978073, + "logits/rejected": 0.2543187737464905, + "logps/chosen": -1.7861602306365967, + "logps/rejected": -2.1704885959625244, + "loss": 0.7909, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7861602306365967, + "rewards/margins": 0.38432830572128296, + "rewards/rejected": -2.1704885959625244, + "sft_loss": 1.7805356979370117, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 8.683569554952136, + "learning_rate": 2.5518692013708764e-06, + "logits/chosen": 0.04368092864751816, + "logits/rejected": 0.16964897513389587, + "logps/chosen": -1.7581827640533447, + "logps/rejected": -2.094151020050049, + "loss": 0.7886, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7581827640533447, + "rewards/margins": 0.33596810698509216, + "rewards/rejected": -2.094151020050049, + "sft_loss": 1.794694185256958, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 7.225102717341863, + "learning_rate": 2.5485331971127467e-06, + "logits/chosen": 0.07228956371545792, + "logits/rejected": 0.2951813042163849, + "logps/chosen": -1.7531318664550781, + "logps/rejected": -2.31132435798645, + "loss": 0.7368, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7531318664550781, + "rewards/margins": 0.5581925511360168, + "rewards/rejected": -2.31132435798645, + "sft_loss": 1.783738374710083, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 12.923394962710496, + "learning_rate": 2.5451870199809398e-06, + "logits/chosen": 0.10603566467761993, + "logits/rejected": 0.23745813965797424, + "logps/chosen": -1.676622748374939, + "logps/rejected": -2.164050817489624, + "loss": 0.7748, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.676622748374939, + "rewards/margins": 0.4874279499053955, + "rewards/rejected": -2.164050817489624, + "sft_loss": 1.7047998905181885, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 10.201943390793542, + "learning_rate": 2.5418307024400808e-06, + "logits/chosen": -0.11501812934875488, + "logits/rejected": 0.07067526131868362, + "logps/chosen": -1.8136136531829834, + "logps/rejected": -2.133671522140503, + "loss": 0.8336, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.8136136531829834, + "rewards/margins": 0.32005780935287476, + "rewards/rejected": -2.133671522140503, + "sft_loss": 1.7607166767120361, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 9.19390158695412, + "learning_rate": 2.538464277053178e-06, + "logits/chosen": 0.030479732900857925, + "logits/rejected": 0.22223253548145294, + "logps/chosen": -1.6502141952514648, + "logps/rejected": -2.242980480194092, + "loss": 0.753, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.6502141952514648, + "rewards/margins": 0.5927663445472717, + "rewards/rejected": -2.242980480194092, + "sft_loss": 1.6727278232574463, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 9.843198268142263, + "learning_rate": 2.5350877764813042e-06, + "logits/chosen": 0.046414703130722046, + "logits/rejected": 0.1939028948545456, + "logps/chosen": -1.8641245365142822, + "logps/rejected": -2.352881908416748, + "loss": 0.7795, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8641245365142822, + "rewards/margins": 0.48875731229782104, + "rewards/rejected": -2.352881908416748, + "sft_loss": 1.8196094036102295, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 7.2829821078345045, + "learning_rate": 2.531701233483284e-06, + "logits/chosen": 0.020082779228687286, + "logits/rejected": 0.1269410252571106, + "logps/chosen": -1.678328514099121, + "logps/rejected": -2.256639003753662, + "loss": 0.7472, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.678328514099121, + "rewards/margins": 0.5783103704452515, + "rewards/rejected": -2.256639003753662, + "sft_loss": 1.707733154296875, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 7.79692306989129, + "learning_rate": 2.5283046809153708e-06, + "logits/chosen": -0.010091030970215797, + "logits/rejected": 0.2002343237400055, + "logps/chosen": -1.7502202987670898, + "logps/rejected": -2.349459409713745, + "loss": 0.7276, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7502202987670898, + "rewards/margins": 0.5992392301559448, + "rewards/rejected": -2.349459409713745, + "sft_loss": 1.7806326150894165, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 8.722278298424193, + "learning_rate": 2.524898151730934e-06, + "logits/chosen": -0.06084078550338745, + "logits/rejected": 0.14076320827007294, + "logps/chosen": -1.6607754230499268, + "logps/rejected": -2.188734292984009, + "loss": 0.727, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6607754230499268, + "rewards/margins": 0.5279589891433716, + "rewards/rejected": -2.188734292984009, + "sft_loss": 1.6254417896270752, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 9.050633472129125, + "learning_rate": 2.5214816789801337e-06, + "logits/chosen": 0.08807148039340973, + "logits/rejected": 0.3059306740760803, + "logps/chosen": -1.6033060550689697, + "logps/rejected": -2.382601737976074, + "loss": 0.6675, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6033060550689697, + "rewards/margins": 0.7792957425117493, + "rewards/rejected": -2.382601737976074, + "sft_loss": 1.6539281606674194, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 6.391447041119939, + "learning_rate": 2.518055295809604e-06, + "logits/chosen": 0.034144409000873566, + "logits/rejected": 0.13075736165046692, + "logps/chosen": -1.551403284072876, + "logps/rejected": -2.2758407592773438, + "loss": 0.6782, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.551403284072876, + "rewards/margins": 0.7244374752044678, + "rewards/rejected": -2.2758407592773438, + "sft_loss": 1.5701818466186523, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 7.35968853019176, + "learning_rate": 2.5146190354621295e-06, + "logits/chosen": -0.05034959316253662, + "logits/rejected": 0.23992769420146942, + "logps/chosen": -1.6645405292510986, + "logps/rejected": -2.3523929119110107, + "loss": 0.6936, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6645405292510986, + "rewards/margins": 0.6878524422645569, + "rewards/rejected": -2.3523929119110107, + "sft_loss": 1.8230438232421875, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 9.065484153335452, + "learning_rate": 2.511172931276323e-06, + "logits/chosen": 0.0478929728269577, + "logits/rejected": 0.11166934669017792, + "logps/chosen": -1.6970382928848267, + "logps/rejected": -2.2658262252807617, + "loss": 0.7139, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6970382928848267, + "rewards/margins": 0.5687879323959351, + "rewards/rejected": -2.2658262252807617, + "sft_loss": 1.7181049585342407, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 6.836928685094494, + "learning_rate": 2.5077170166863026e-06, + "logits/chosen": -0.17804375290870667, + "logits/rejected": 0.23447522521018982, + "logps/chosen": -1.707627534866333, + "logps/rejected": -2.4188055992126465, + "loss": 0.7031, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.707627534866333, + "rewards/margins": 0.711178183555603, + "rewards/rejected": -2.4188055992126465, + "sft_loss": 1.7988073825836182, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 5.74765813055918, + "learning_rate": 2.504251325221366e-06, + "logits/chosen": -0.00038936137570999563, + "logits/rejected": 0.23388922214508057, + "logps/chosen": -1.734164834022522, + "logps/rejected": -2.2840001583099365, + "loss": 0.7582, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.734164834022522, + "rewards/margins": 0.5498352646827698, + "rewards/rejected": -2.2840001583099365, + "sft_loss": 1.7423995733261108, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 6.630794066105069, + "learning_rate": 2.500775890505668e-06, + "logits/chosen": -0.14226017892360687, + "logits/rejected": 0.04709906131029129, + "logps/chosen": -1.6069732904434204, + "logps/rejected": -2.14501690864563, + "loss": 0.7097, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6069732904434204, + "rewards/margins": 0.5380436182022095, + "rewards/rejected": -2.14501690864563, + "sft_loss": 1.659594178199768, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 5.908499740183493, + "learning_rate": 2.497290746257891e-06, + "logits/chosen": -0.05062161758542061, + "logits/rejected": 0.07945103198289871, + "logps/chosen": -1.5502407550811768, + "logps/rejected": -2.061922073364258, + "loss": 0.7352, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5502407550811768, + "rewards/margins": 0.5116813778877258, + "rewards/rejected": -2.061922073364258, + "sft_loss": 1.616845726966858, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 5.940899205934954, + "learning_rate": 2.49379592629092e-06, + "logits/chosen": -0.09226035326719284, + "logits/rejected": 0.03236633166670799, + "logps/chosen": -1.425779938697815, + "logps/rejected": -2.10670804977417, + "loss": 0.647, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.425779938697815, + "rewards/margins": 0.6809282302856445, + "rewards/rejected": -2.10670804977417, + "sft_loss": 1.4918690919876099, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 9.575546027123126, + "learning_rate": 2.4902914645115135e-06, + "logits/chosen": -0.24684646725654602, + "logits/rejected": 0.06735511124134064, + "logps/chosen": -1.6334794759750366, + "logps/rejected": -2.2642669677734375, + "loss": 0.6957, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6334794759750366, + "rewards/margins": 0.6307875514030457, + "rewards/rejected": -2.2642669677734375, + "sft_loss": 1.6950395107269287, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 9.858340363293193, + "learning_rate": 2.4867773949199748e-06, + "logits/chosen": -0.17375081777572632, + "logits/rejected": 0.04624699801206589, + "logps/chosen": -1.605721116065979, + "logps/rejected": -2.3855223655700684, + "loss": 0.6545, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.605721116065979, + "rewards/margins": 0.7798011302947998, + "rewards/rejected": -2.3855223655700684, + "sft_loss": 1.6923332214355469, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 11.33617656132988, + "learning_rate": 2.483253751609823e-06, + "logits/chosen": -0.18652328848838806, + "logits/rejected": 0.10901524871587753, + "logps/chosen": -1.7739999294281006, + "logps/rejected": -2.652946949005127, + "loss": 0.6709, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7739999294281006, + "rewards/margins": 0.8789469599723816, + "rewards/rejected": -2.652946949005127, + "sft_loss": 1.8353183269500732, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 9.74986680394346, + "learning_rate": 2.4797205687674608e-06, + "logits/chosen": -0.1306121051311493, + "logits/rejected": 0.02134639583528042, + "logps/chosen": -1.7014802694320679, + "logps/rejected": -2.6165051460266113, + "loss": 0.6604, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7014802694320679, + "rewards/margins": 0.9150252342224121, + "rewards/rejected": -2.6165051460266113, + "sft_loss": 1.692243218421936, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 5.831908928059212, + "learning_rate": 2.476177880671843e-06, + "logits/chosen": -0.2056030035018921, + "logits/rejected": 0.04198329523205757, + "logps/chosen": -1.7714817523956299, + "logps/rejected": -2.8920578956604004, + "loss": 0.6578, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7714817523956299, + "rewards/margins": 1.12057626247406, + "rewards/rejected": -2.8920578956604004, + "sft_loss": 1.8091742992401123, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 7.788069753673046, + "learning_rate": 2.4726257216941463e-06, + "logits/chosen": -0.04350770264863968, + "logits/rejected": 0.2956755757331848, + "logps/chosen": -1.7248331308364868, + "logps/rejected": -2.539656162261963, + "loss": 0.6874, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7248331308364868, + "rewards/margins": 0.8148230314254761, + "rewards/rejected": -2.539656162261963, + "sft_loss": 1.768866777420044, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 8.523253347431723, + "learning_rate": 2.4690641262974317e-06, + "logits/chosen": -0.04888535290956497, + "logits/rejected": 0.06253420561552048, + "logps/chosen": -1.5181758403778076, + "logps/rejected": -2.1794214248657227, + "loss": 0.6798, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5181758403778076, + "rewards/margins": 0.6612456440925598, + "rewards/rejected": -2.1794214248657227, + "sft_loss": 1.5654313564300537, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 7.259280654329731, + "learning_rate": 2.4654931290363135e-06, + "logits/chosen": 0.0067944610491395, + "logits/rejected": 0.055947817862033844, + "logps/chosen": -1.548032522201538, + "logps/rejected": -2.189936876296997, + "loss": 0.6885, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.548032522201538, + "rewards/margins": 0.6419044733047485, + "rewards/rejected": -2.189936876296997, + "sft_loss": 1.6117805242538452, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 6.715053666512424, + "learning_rate": 2.461912764556623e-06, + "logits/chosen": 0.05709873512387276, + "logits/rejected": 0.1856270730495453, + "logps/chosen": -1.5187469720840454, + "logps/rejected": -2.3932878971099854, + "loss": 0.6485, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5187469720840454, + "rewards/margins": 0.8745408058166504, + "rewards/rejected": -2.3932878971099854, + "sft_loss": 1.5913379192352295, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 6.017921568556773, + "learning_rate": 2.4583230675950717e-06, + "logits/chosen": -0.05736943334341049, + "logits/rejected": 0.16763488948345184, + "logps/chosen": -1.5151335000991821, + "logps/rejected": -2.175865888595581, + "loss": 0.6878, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5151335000991821, + "rewards/margins": 0.6607326865196228, + "rewards/rejected": -2.175865888595581, + "sft_loss": 1.5635812282562256, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 6.6961536875898275, + "learning_rate": 2.4547240729789156e-06, + "logits/chosen": 0.06216694042086601, + "logits/rejected": 0.18044105172157288, + "logps/chosen": -1.4949309825897217, + "logps/rejected": -2.222695827484131, + "loss": 0.6665, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4949309825897217, + "rewards/margins": 0.7277650833129883, + "rewards/rejected": -2.222695827484131, + "sft_loss": 1.5416090488433838, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 8.87357434364449, + "learning_rate": 2.451115815625617e-06, + "logits/chosen": 0.10345292091369629, + "logits/rejected": 0.2729285657405853, + "logps/chosen": -1.700848937034607, + "logps/rejected": -2.4436535835266113, + "loss": 0.7246, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.700848937034607, + "rewards/margins": 0.7428046464920044, + "rewards/rejected": -2.4436535835266113, + "sft_loss": 1.7029168605804443, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 8.704491288742465, + "learning_rate": 2.4474983305425025e-06, + "logits/chosen": -0.024038607254624367, + "logits/rejected": 0.22523216903209686, + "logps/chosen": -1.666006326675415, + "logps/rejected": -2.2513585090637207, + "loss": 0.7235, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.666006326675415, + "rewards/margins": 0.58535236120224, + "rewards/rejected": -2.2513585090637207, + "sft_loss": 1.6220099925994873, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 8.784010034526936, + "learning_rate": 2.4438716528264307e-06, + "logits/chosen": -0.11897747218608856, + "logits/rejected": 0.03994118422269821, + "logps/chosen": -1.6286365985870361, + "logps/rejected": -2.2337546348571777, + "loss": 0.6894, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6286365985870361, + "rewards/margins": 0.605117917060852, + "rewards/rejected": -2.2337546348571777, + "sft_loss": 1.6159461736679077, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 10.821060177471066, + "learning_rate": 2.440235817663443e-06, + "logits/chosen": 0.013584012165665627, + "logits/rejected": 0.22464172542095184, + "logps/chosen": -1.5461862087249756, + "logps/rejected": -2.386045455932617, + "loss": 0.6557, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5461862087249756, + "rewards/margins": 0.8398593068122864, + "rewards/rejected": -2.386045455932617, + "sft_loss": 1.5868008136749268, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 13.482773873106167, + "learning_rate": 2.4365908603284285e-06, + "logits/chosen": -0.14218154549598694, + "logits/rejected": 0.0756453424692154, + "logps/chosen": -1.697597861289978, + "logps/rejected": -2.438603162765503, + "loss": 0.7358, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.697597861289978, + "rewards/margins": 0.7410050630569458, + "rewards/rejected": -2.438603162765503, + "sft_loss": 1.698129415512085, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.37108588218688965, + "eval_logits/rejected": 0.5037340521812439, + "eval_logps/chosen": -1.6992080211639404, + "eval_logps/rejected": -2.333348035812378, + "eval_loss": 0.7325444221496582, + "eval_rewards/accuracies": 0.6824925541877747, + "eval_rewards/chosen": -1.6992080211639404, + "eval_rewards/margins": 0.6341398358345032, + "eval_rewards/rejected": -2.333348035812378, + "eval_runtime": 44.5266, + "eval_samples_per_second": 30.207, + "eval_sft_loss": 1.6954635381698608, + "eval_steps_per_second": 7.569, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 7.605727648934152, + "learning_rate": 2.4329368161847796e-06, + "logits/chosen": -0.07299210876226425, + "logits/rejected": 0.017824098467826843, + "logps/chosen": -1.6549274921417236, + "logps/rejected": -2.272775888442993, + "loss": 0.7452, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6549274921417236, + "rewards/margins": 0.6178484559059143, + "rewards/rejected": -2.272775888442993, + "sft_loss": 1.7160135507583618, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 9.74872118532032, + "learning_rate": 2.4292737206840483e-06, + "logits/chosen": -0.0690990686416626, + "logits/rejected": 0.09985318779945374, + "logps/chosen": -1.5765421390533447, + "logps/rejected": -2.249239444732666, + "loss": 0.6883, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5765421390533447, + "rewards/margins": 0.6726974248886108, + "rewards/rejected": -2.249239444732666, + "sft_loss": 1.6347439289093018, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 8.904622520925692, + "learning_rate": 2.4256016093656035e-06, + "logits/chosen": -0.05548501014709473, + "logits/rejected": 0.13582788407802582, + "logps/chosen": -1.6699409484863281, + "logps/rejected": -2.430842876434326, + "loss": 0.6711, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6699409484863281, + "rewards/margins": 0.7609016299247742, + "rewards/rejected": -2.430842876434326, + "sft_loss": 1.666124701499939, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 9.258511250573624, + "learning_rate": 2.421920517856285e-06, + "logits/chosen": -0.24321499466896057, + "logits/rejected": 0.07653270661830902, + "logps/chosen": -1.772458791732788, + "logps/rejected": -2.632817506790161, + "loss": 0.6476, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.772458791732788, + "rewards/margins": 0.8603585958480835, + "rewards/rejected": -2.632817506790161, + "sft_loss": 1.785118818283081, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 8.845285976389446, + "learning_rate": 2.418230481870058e-06, + "logits/chosen": -0.09658778458833694, + "logits/rejected": 0.12872019410133362, + "logps/chosen": -1.7949399948120117, + "logps/rejected": -2.6743321418762207, + "loss": 0.6967, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7949399948120117, + "rewards/margins": 0.8793922662734985, + "rewards/rejected": -2.6743321418762207, + "sft_loss": 1.900602102279663, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 6.76917907946159, + "learning_rate": 2.41453153720767e-06, + "logits/chosen": -0.10698221623897552, + "logits/rejected": -0.07826292514801025, + "logps/chosen": -1.686453104019165, + "logps/rejected": -2.3061447143554688, + "loss": 0.7118, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.686453104019165, + "rewards/margins": 0.6196914911270142, + "rewards/rejected": -2.3061447143554688, + "sft_loss": 1.7430803775787354, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 14.528060052297567, + "learning_rate": 2.4108237197562963e-06, + "logits/chosen": -0.15176725387573242, + "logits/rejected": 0.0883040651679039, + "logps/chosen": -1.708646535873413, + "logps/rejected": -2.4836783409118652, + "loss": 0.6979, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.708646535873413, + "rewards/margins": 0.7750316858291626, + "rewards/rejected": -2.4836783409118652, + "sft_loss": 1.7798616886138916, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 67.9308800768942, + "learning_rate": 2.407107065489199e-06, + "logits/chosen": -0.177690327167511, + "logits/rejected": -0.0974874347448349, + "logps/chosen": -1.7028429508209229, + "logps/rejected": -2.4875786304473877, + "loss": 0.7027, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7028429508209229, + "rewards/margins": 0.7847355604171753, + "rewards/rejected": -2.4875786304473877, + "sft_loss": 1.746551513671875, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 9.402312106224876, + "learning_rate": 2.403381610465374e-06, + "logits/chosen": -0.12390387058258057, + "logits/rejected": -0.05445731431245804, + "logps/chosen": -1.6808083057403564, + "logps/rejected": -2.3618881702423096, + "loss": 0.6685, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6808083057403564, + "rewards/margins": 0.6810798645019531, + "rewards/rejected": -2.3618881702423096, + "sft_loss": 1.643741250038147, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 8.707169869649329, + "learning_rate": 2.3996473908292017e-06, + "logits/chosen": -0.3125987946987152, + "logits/rejected": -0.17020562291145325, + "logps/chosen": -1.5637733936309814, + "logps/rejected": -2.0839359760284424, + "loss": 0.7291, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5637733936309814, + "rewards/margins": 0.5201625823974609, + "rewards/rejected": -2.0839359760284424, + "sft_loss": 1.6524698734283447, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 8.344695734884622, + "learning_rate": 2.3959044428100985e-06, + "logits/chosen": -0.30680757761001587, + "logits/rejected": -0.17020884156227112, + "logps/chosen": -1.5836832523345947, + "logps/rejected": -2.2736763954162598, + "loss": 0.6871, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5836832523345947, + "rewards/margins": 0.6899932622909546, + "rewards/rejected": -2.2736763954162598, + "sft_loss": 1.6809183359146118, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 8.356682337941303, + "learning_rate": 2.392152802722162e-06, + "logits/chosen": -0.3003597855567932, + "logits/rejected": -0.26015612483024597, + "logps/chosen": -1.6600055694580078, + "logps/rejected": -2.41102933883667, + "loss": 0.6863, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6600055694580078, + "rewards/margins": 0.7510236501693726, + "rewards/rejected": -2.41102933883667, + "sft_loss": 1.7412002086639404, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 7.936263142727307, + "learning_rate": 2.38839250696382e-06, + "logits/chosen": -0.39390525221824646, + "logits/rejected": -0.2608678936958313, + "logps/chosen": -1.6280196905136108, + "logps/rejected": -2.410038709640503, + "loss": 0.6648, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6280196905136108, + "rewards/margins": 0.7820190787315369, + "rewards/rejected": -2.410038709640503, + "sft_loss": 1.6248575448989868, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 7.693840717242871, + "learning_rate": 2.3846235920174794e-06, + "logits/chosen": -0.38650065660476685, + "logits/rejected": -0.23895108699798584, + "logps/chosen": -1.5643054246902466, + "logps/rejected": -2.4510586261749268, + "loss": 0.6087, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5643054246902466, + "rewards/margins": 0.886753261089325, + "rewards/rejected": -2.4510586261749268, + "sft_loss": 1.6124528646469116, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 11.677231302491558, + "learning_rate": 2.380846094449169e-06, + "logits/chosen": -0.43027549982070923, + "logits/rejected": -0.3176945447921753, + "logps/chosen": -1.7545477151870728, + "logps/rejected": -2.565573215484619, + "loss": 0.684, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7545477151870728, + "rewards/margins": 0.8110259175300598, + "rewards/rejected": -2.565573215484619, + "sft_loss": 1.8772166967391968, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 8.634788691396096, + "learning_rate": 2.3770600509081872e-06, + "logits/chosen": -0.48604917526245117, + "logits/rejected": -0.2944543957710266, + "logps/chosen": -1.668334722518921, + "logps/rejected": -2.3512473106384277, + "loss": 0.6855, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.668334722518921, + "rewards/margins": 0.6829127073287964, + "rewards/rejected": -2.3512473106384277, + "sft_loss": 1.7718915939331055, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 13.019395453877044, + "learning_rate": 2.373265498126745e-06, + "logits/chosen": -0.44035762548446655, + "logits/rejected": -0.33537784218788147, + "logps/chosen": -1.7430732250213623, + "logps/rejected": -2.501746654510498, + "loss": 0.6909, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7430732250213623, + "rewards/margins": 0.7586732506752014, + "rewards/rejected": -2.501746654510498, + "sft_loss": 1.801311731338501, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 7.660138579765188, + "learning_rate": 2.36946247291961e-06, + "logits/chosen": -0.46764713525772095, + "logits/rejected": -0.480417400598526, + "logps/chosen": -1.7001616954803467, + "logps/rejected": -2.311063051223755, + "loss": 0.7318, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7001616954803467, + "rewards/margins": 0.6109012365341187, + "rewards/rejected": -2.311063051223755, + "sft_loss": 1.8168401718139648, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 9.935148956573007, + "learning_rate": 2.3656510121837492e-06, + "logits/chosen": -0.474683940410614, + "logits/rejected": -0.3218488097190857, + "logps/chosen": -1.7965023517608643, + "logps/rejected": -2.385735034942627, + "loss": 0.7291, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7965023517608643, + "rewards/margins": 0.5892330408096313, + "rewards/rejected": -2.385735034942627, + "sft_loss": 1.8091661930084229, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 11.440049691581173, + "learning_rate": 2.3618311528979717e-06, + "logits/chosen": -0.33886945247650146, + "logits/rejected": -0.2977084219455719, + "logps/chosen": -1.750841498374939, + "logps/rejected": -2.3571510314941406, + "loss": 0.7186, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.750841498374939, + "rewards/margins": 0.6063095331192017, + "rewards/rejected": -2.3571510314941406, + "sft_loss": 1.7378523349761963, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 14.193161191328313, + "learning_rate": 2.3580029321225692e-06, + "logits/chosen": -0.32112354040145874, + "logits/rejected": -0.17997024953365326, + "logps/chosen": -1.6737353801727295, + "logps/rejected": -2.603375196456909, + "loss": 0.6354, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6737353801727295, + "rewards/margins": 0.9296396374702454, + "rewards/rejected": -2.603375196456909, + "sft_loss": 1.660453200340271, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 6.997715864560877, + "learning_rate": 2.354166386998956e-06, + "logits/chosen": -0.4019525945186615, + "logits/rejected": -0.23506435751914978, + "logps/chosen": -1.623871088027954, + "logps/rejected": -2.5584957599639893, + "loss": 0.6729, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.623871088027954, + "rewards/margins": 0.9346246719360352, + "rewards/rejected": -2.5584957599639893, + "sft_loss": 1.688677430152893, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 9.19616365742752, + "learning_rate": 2.3503215547493097e-06, + "logits/chosen": -0.2555497884750366, + "logits/rejected": -0.21385636925697327, + "logps/chosen": -1.6596953868865967, + "logps/rejected": -2.4315128326416016, + "loss": 0.7161, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.6596953868865967, + "rewards/margins": 0.7718173861503601, + "rewards/rejected": -2.4315128326416016, + "sft_loss": 1.7026008367538452, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 10.297571955288168, + "learning_rate": 2.3464684726762104e-06, + "logits/chosen": -0.3922201991081238, + "logits/rejected": -0.36036157608032227, + "logps/chosen": -1.6120132207870483, + "logps/rejected": -2.2474522590637207, + "loss": 0.7029, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6120132207870483, + "rewards/margins": 0.6354392170906067, + "rewards/rejected": -2.2474522590637207, + "sft_loss": 1.6690866947174072, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 6.7377464975333625, + "learning_rate": 2.342607178162276e-06, + "logits/chosen": -0.37472158670425415, + "logits/rejected": -0.29021531343460083, + "logps/chosen": -1.583504319190979, + "logps/rejected": -2.4073872566223145, + "loss": 0.6434, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.583504319190979, + "rewards/margins": 0.823883056640625, + "rewards/rejected": -2.4073872566223145, + "sft_loss": 1.5988887548446655, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 11.760038410167141, + "learning_rate": 2.338737708669804e-06, + "logits/chosen": -0.323408305644989, + "logits/rejected": -0.0538100004196167, + "logps/chosen": -1.6298744678497314, + "logps/rejected": -2.435746669769287, + "loss": 0.6673, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6298744678497314, + "rewards/margins": 0.8058720827102661, + "rewards/rejected": -2.435746669769287, + "sft_loss": 1.6865326166152954, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 8.472135920026245, + "learning_rate": 2.334860101740404e-06, + "logits/chosen": -0.3944898545742035, + "logits/rejected": -0.21247251331806183, + "logps/chosen": -1.6303770542144775, + "logps/rejected": -2.5265893936157227, + "loss": 0.6389, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6303770542144775, + "rewards/margins": 0.8962124586105347, + "rewards/rejected": -2.5265893936157227, + "sft_loss": 1.6558125019073486, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 12.639854069616208, + "learning_rate": 2.330974394994635e-06, + "logits/chosen": -0.4155654311180115, + "logits/rejected": -0.2664666771888733, + "logps/chosen": -1.7755460739135742, + "logps/rejected": -2.537376880645752, + "loss": 0.7204, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7755460739135742, + "rewards/margins": 0.7618308067321777, + "rewards/rejected": -2.537376880645752, + "sft_loss": 1.8307090997695923, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 8.78714107660372, + "learning_rate": 2.327080626131641e-06, + "logits/chosen": -0.4042625427246094, + "logits/rejected": -0.3261919915676117, + "logps/chosen": -1.6089550256729126, + "logps/rejected": -2.6261086463928223, + "loss": 0.6509, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6089550256729126, + "rewards/margins": 1.0171538591384888, + "rewards/rejected": -2.6261086463928223, + "sft_loss": 1.7027835845947266, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 7.584058614904452, + "learning_rate": 2.3231788329287855e-06, + "logits/chosen": -0.45290595293045044, + "logits/rejected": -0.39946696162223816, + "logps/chosen": -1.7527068853378296, + "logps/rejected": -2.5772976875305176, + "loss": 0.6657, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7527068853378296, + "rewards/margins": 0.8245910406112671, + "rewards/rejected": -2.5772976875305176, + "sft_loss": 1.7863349914550781, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 10.128455616245532, + "learning_rate": 2.3192690532412827e-06, + "logits/chosen": -0.3807825744152069, + "logits/rejected": -0.29959550499916077, + "logps/chosen": -1.7997996807098389, + "logps/rejected": -2.4800992012023926, + "loss": 0.7069, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7997996807098389, + "rewards/margins": 0.680299699306488, + "rewards/rejected": -2.4800992012023926, + "sft_loss": 1.8867028951644897, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 12.687593297925929, + "learning_rate": 2.315351325001832e-06, + "logits/chosen": -0.4369320273399353, + "logits/rejected": -0.34756121039390564, + "logps/chosen": -1.7206451892852783, + "logps/rejected": -2.6387972831726074, + "loss": 0.6661, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7206451892852783, + "rewards/margins": 0.9181524515151978, + "rewards/rejected": -2.6387972831726074, + "sft_loss": 1.7650476694107056, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 9.336298496804801, + "learning_rate": 2.3114256862202495e-06, + "logits/chosen": -0.3852900564670563, + "logits/rejected": -0.19119954109191895, + "logps/chosen": -1.6125036478042603, + "logps/rejected": -2.5013768672943115, + "loss": 0.6474, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6125036478042603, + "rewards/margins": 0.888873279094696, + "rewards/rejected": -2.5013768672943115, + "sft_loss": 1.68994140625, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 6.085677306048667, + "learning_rate": 2.3074921749831013e-06, + "logits/chosen": -0.33006614446640015, + "logits/rejected": -0.14145353436470032, + "logps/chosen": -1.5735447406768799, + "logps/rejected": -2.319394826889038, + "loss": 0.6759, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5735447406768799, + "rewards/margins": 0.7458503246307373, + "rewards/rejected": -2.319394826889038, + "sft_loss": 1.598747968673706, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 7.322613120246413, + "learning_rate": 2.30355082945333e-06, + "logits/chosen": -0.32234734296798706, + "logits/rejected": -0.10814448446035385, + "logps/chosen": -1.6050329208374023, + "logps/rejected": -2.175632953643799, + "loss": 0.6955, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6050329208374023, + "rewards/margins": 0.5705999135971069, + "rewards/rejected": -2.175632953643799, + "sft_loss": 1.65104079246521, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 6.753212414597791, + "learning_rate": 2.2996016878698866e-06, + "logits/chosen": -0.37758100032806396, + "logits/rejected": -0.2867276072502136, + "logps/chosen": -1.5658433437347412, + "logps/rejected": -2.327582597732544, + "loss": 0.6572, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5658433437347412, + "rewards/margins": 0.7617393732070923, + "rewards/rejected": -2.327582597732544, + "sft_loss": 1.6486505270004272, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 9.635485572560357, + "learning_rate": 2.2956447885473607e-06, + "logits/chosen": -0.3054151237010956, + "logits/rejected": -0.12504413723945618, + "logps/chosen": -1.6871017217636108, + "logps/rejected": -2.3588860034942627, + "loss": 0.6751, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6871017217636108, + "rewards/margins": 0.6717841029167175, + "rewards/rejected": -2.3588860034942627, + "sft_loss": 1.712669014930725, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 8.445965479118785, + "learning_rate": 2.2916801698756063e-06, + "logits/chosen": -0.20240584015846252, + "logits/rejected": -0.16871638596057892, + "logps/chosen": -1.7465060949325562, + "logps/rejected": -2.494576930999756, + "loss": 0.7111, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7465060949325562, + "rewards/margins": 0.7480708360671997, + "rewards/rejected": -2.494576930999756, + "sft_loss": 1.8895080089569092, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 10.299835270794993, + "learning_rate": 2.287707870319372e-06, + "logits/chosen": -0.30202627182006836, + "logits/rejected": -0.18142291903495789, + "logps/chosen": -1.794844627380371, + "logps/rejected": -2.659106969833374, + "loss": 0.7158, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.794844627380371, + "rewards/margins": 0.8642624616622925, + "rewards/rejected": -2.659106969833374, + "sft_loss": 1.8789043426513672, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 7.726045950177308, + "learning_rate": 2.283727928417925e-06, + "logits/chosen": -0.3212895095348358, + "logits/rejected": -0.3393111824989319, + "logps/chosen": -1.6944576501846313, + "logps/rejected": -2.492414951324463, + "loss": 0.6955, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6944576501846313, + "rewards/margins": 0.7979571223258972, + "rewards/rejected": -2.492414951324463, + "sft_loss": 1.7866626977920532, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 7.864280883375943, + "learning_rate": 2.27974038278468e-06, + "logits/chosen": -0.3887438178062439, + "logits/rejected": -0.14263615012168884, + "logps/chosen": -1.5852651596069336, + "logps/rejected": -2.2923355102539062, + "loss": 0.6687, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.5852651596069336, + "rewards/margins": 0.707070529460907, + "rewards/rejected": -2.2923355102539062, + "sft_loss": 1.6131317615509033, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 8.543298981272972, + "learning_rate": 2.2757452721068206e-06, + "logits/chosen": -0.34999722242355347, + "logits/rejected": -0.22460785508155823, + "logps/chosen": -1.4379886388778687, + "logps/rejected": -2.2692742347717285, + "loss": 0.6365, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4379886388778687, + "rewards/margins": 0.8312854766845703, + "rewards/rejected": -2.2692742347717285, + "sft_loss": 1.5093804597854614, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 9.561382786924911, + "learning_rate": 2.2717426351449294e-06, + "logits/chosen": -0.32528331875801086, + "logits/rejected": -0.219674751162529, + "logps/chosen": -1.7008552551269531, + "logps/rejected": -2.5366382598876953, + "loss": 0.6654, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7008552551269531, + "rewards/margins": 0.8357831239700317, + "rewards/rejected": -2.5366382598876953, + "sft_loss": 1.6650569438934326, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 9.799822625464461, + "learning_rate": 2.2677325107326067e-06, + "logits/chosen": -0.4135669767856598, + "logits/rejected": -0.2664993405342102, + "logps/chosen": -1.5439913272857666, + "logps/rejected": -2.251284122467041, + "loss": 0.7167, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5439913272857666, + "rewards/margins": 0.7072926759719849, + "rewards/rejected": -2.251284122467041, + "sft_loss": 1.6164175271987915, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 10.801150636874329, + "learning_rate": 2.2637149377760985e-06, + "logits/chosen": -0.4365982413291931, + "logits/rejected": -0.16107413172721863, + "logps/chosen": -1.523956537246704, + "logps/rejected": -2.4188663959503174, + "loss": 0.6255, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.523956537246704, + "rewards/margins": 0.894909679889679, + "rewards/rejected": -2.4188663959503174, + "sft_loss": 1.5979201793670654, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 12.468546658158331, + "learning_rate": 2.2596899552539136e-06, + "logits/chosen": -0.43537306785583496, + "logits/rejected": -0.26023873686790466, + "logps/chosen": -1.6386340856552124, + "logps/rejected": -2.6180009841918945, + "loss": 0.662, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6386340856552124, + "rewards/margins": 0.9793673753738403, + "rewards/rejected": -2.6180009841918945, + "sft_loss": 1.6425418853759766, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 8.992344025907165, + "learning_rate": 2.2556576022164516e-06, + "logits/chosen": -0.4005703926086426, + "logits/rejected": -0.16512002050876617, + "logps/chosen": -1.6079756021499634, + "logps/rejected": -2.5010671615600586, + "loss": 0.649, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6079756021499634, + "rewards/margins": 0.8930916786193848, + "rewards/rejected": -2.5010671615600586, + "sft_loss": 1.6396543979644775, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 6.610130419520371, + "learning_rate": 2.2516179177856182e-06, + "logits/chosen": -0.3925136625766754, + "logits/rejected": -0.18449433147907257, + "logps/chosen": -1.6953229904174805, + "logps/rejected": -2.4890971183776855, + "loss": 0.6671, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6953229904174805, + "rewards/margins": 0.7937743067741394, + "rewards/rejected": -2.4890971183776855, + "sft_loss": 1.7729851007461548, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 8.240460161632697, + "learning_rate": 2.2475709411544503e-06, + "logits/chosen": -0.30860430002212524, + "logits/rejected": -0.2732142508029938, + "logps/chosen": -1.6084105968475342, + "logps/rejected": -2.398030996322632, + "loss": 0.6585, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6084105968475342, + "rewards/margins": 0.7896206378936768, + "rewards/rejected": -2.398030996322632, + "sft_loss": 1.6830050945281982, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 10.591418712449052, + "learning_rate": 2.2435167115867325e-06, + "logits/chosen": -0.22040650248527527, + "logits/rejected": -0.18510958552360535, + "logps/chosen": -1.6661109924316406, + "logps/rejected": -2.5022969245910645, + "loss": 0.6552, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6661109924316406, + "rewards/margins": 0.8361861109733582, + "rewards/rejected": -2.5022969245910645, + "sft_loss": 1.700455665588379, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 10.77881916958613, + "learning_rate": 2.239455268416618e-06, + "logits/chosen": -0.3600291609764099, + "logits/rejected": -0.2691943049430847, + "logps/chosen": -1.7021760940551758, + "logps/rejected": -2.4420957565307617, + "loss": 0.7208, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7021760940551758, + "rewards/margins": 0.7399194240570068, + "rewards/rejected": -2.4420957565307617, + "sft_loss": 1.7061984539031982, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 9.660888461433778, + "learning_rate": 2.2353866510482463e-06, + "logits/chosen": -0.2715977430343628, + "logits/rejected": -0.2946680188179016, + "logps/chosen": -1.6520048379898071, + "logps/rejected": -2.2550692558288574, + "loss": 0.7035, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6520048379898071, + "rewards/margins": 0.603064239025116, + "rewards/rejected": -2.2550692558288574, + "sft_loss": 1.6675376892089844, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 7.99149684732698, + "learning_rate": 2.231310898955361e-06, + "logits/chosen": -0.3659631609916687, + "logits/rejected": -0.27866271138191223, + "logps/chosen": -1.6420866250991821, + "logps/rejected": -2.4726719856262207, + "loss": 0.6717, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6420866250991821, + "rewards/margins": 0.8305851221084595, + "rewards/rejected": -2.4726719856262207, + "sft_loss": 1.739808440208435, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 11.614010206633175, + "learning_rate": 2.2272280516809262e-06, + "logits/chosen": -0.46790918707847595, + "logits/rejected": -0.265636146068573, + "logps/chosen": -1.6137558221817017, + "logps/rejected": -2.491194248199463, + "loss": 0.6536, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6137558221817017, + "rewards/margins": 0.8774384260177612, + "rewards/rejected": -2.491194248199463, + "sft_loss": 1.6100658178329468, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 11.325637763794075, + "learning_rate": 2.2231381488367447e-06, + "logits/chosen": -0.3395164906978607, + "logits/rejected": -0.18752549588680267, + "logps/chosen": -1.6160014867782593, + "logps/rejected": -2.5542397499084473, + "loss": 0.6362, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6160014867782593, + "rewards/margins": 0.9382384419441223, + "rewards/rejected": -2.5542397499084473, + "sft_loss": 1.6488006114959717, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 8.49763007403377, + "learning_rate": 2.2190412301030717e-06, + "logits/chosen": -0.4595802426338196, + "logits/rejected": -0.24137744307518005, + "logps/chosen": -1.5575565099716187, + "logps/rejected": -2.3483619689941406, + "loss": 0.6619, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5575565099716187, + "rewards/margins": 0.7908056378364563, + "rewards/rejected": -2.3483619689941406, + "sft_loss": 1.6235309839248657, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 6.357494880268698, + "learning_rate": 2.2149373352282307e-06, + "logits/chosen": -0.4294605255126953, + "logits/rejected": -0.20918670296669006, + "logps/chosen": -1.781286597251892, + "logps/rejected": -2.775089740753174, + "loss": 0.641, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.781286597251892, + "rewards/margins": 0.9938033819198608, + "rewards/rejected": -2.775089740753174, + "sft_loss": 1.7867755889892578, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 8.736802708009721, + "learning_rate": 2.2108265040282275e-06, + "logits/chosen": -0.5402215719223022, + "logits/rejected": -0.37220120429992676, + "logps/chosen": -1.6326793432235718, + "logps/rejected": -2.4899330139160156, + "loss": 0.6923, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6326793432235718, + "rewards/margins": 0.8572534322738647, + "rewards/rejected": -2.4899330139160156, + "sft_loss": 1.692399263381958, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 8.117762434913253, + "learning_rate": 2.2067087763863644e-06, + "logits/chosen": -0.4696386456489563, + "logits/rejected": -0.37050861120224, + "logps/chosen": -1.6940395832061768, + "logps/rejected": -2.565840721130371, + "loss": 0.6976, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6940395832061768, + "rewards/margins": 0.8718010783195496, + "rewards/rejected": -2.565840721130371, + "sft_loss": 1.8267343044281006, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 12.180098640706655, + "learning_rate": 2.202584192252854e-06, + "logits/chosen": -0.42436861991882324, + "logits/rejected": -0.29395556449890137, + "logps/chosen": -1.6638511419296265, + "logps/rejected": -2.3967151641845703, + "loss": 0.7281, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6638511419296265, + "rewards/margins": 0.7328639626502991, + "rewards/rejected": -2.3967151641845703, + "sft_loss": 1.7181631326675415, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 8.704967823405475, + "learning_rate": 2.1984527916444283e-06, + "logits/chosen": -0.4253556728363037, + "logits/rejected": -0.28073593974113464, + "logps/chosen": -1.7703256607055664, + "logps/rejected": -2.572084426879883, + "loss": 0.6992, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7703256607055664, + "rewards/margins": 0.8017589449882507, + "rewards/rejected": -2.572084426879883, + "sft_loss": 1.7371975183486938, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 10.179021449194039, + "learning_rate": 2.1943146146439557e-06, + "logits/chosen": -0.35127347707748413, + "logits/rejected": -0.03087422251701355, + "logps/chosen": -1.6630665063858032, + "logps/rejected": -2.5277600288391113, + "loss": 0.6639, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6630665063858032, + "rewards/margins": 0.8646937608718872, + "rewards/rejected": -2.5277600288391113, + "sft_loss": 1.6645368337631226, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 9.438073157043084, + "learning_rate": 2.190169701400046e-06, + "logits/chosen": -0.40794649720191956, + "logits/rejected": -0.1953791379928589, + "logps/chosen": -1.686069130897522, + "logps/rejected": -2.6183743476867676, + "loss": 0.649, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.686069130897522, + "rewards/margins": 0.9323051571846008, + "rewards/rejected": -2.6183743476867676, + "sft_loss": 1.7290821075439453, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 7.470935986197104, + "learning_rate": 2.186018092126666e-06, + "logits/chosen": -0.2606565058231354, + "logits/rejected": -0.22562718391418457, + "logps/chosen": -1.6814906597137451, + "logps/rejected": -2.4694440364837646, + "loss": 0.6754, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6814906597137451, + "rewards/margins": 0.7879533767700195, + "rewards/rejected": -2.4694440364837646, + "sft_loss": 1.7424710988998413, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 6.658630183259569, + "learning_rate": 2.181859827102748e-06, + "logits/chosen": -0.2056102305650711, + "logits/rejected": -0.11757795512676239, + "logps/chosen": -1.7347863912582397, + "logps/rejected": -2.6536970138549805, + "loss": 0.6324, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7347863912582397, + "rewards/margins": 0.9189106225967407, + "rewards/rejected": -2.6536970138549805, + "sft_loss": 1.7204700708389282, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 9.669308852371636, + "learning_rate": 2.1776949466717967e-06, + "logits/chosen": -0.4139643609523773, + "logits/rejected": -0.27617889642715454, + "logps/chosen": -1.7208669185638428, + "logps/rejected": -2.599398612976074, + "loss": 0.6994, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7208669185638428, + "rewards/margins": 0.8785317540168762, + "rewards/rejected": -2.599398612976074, + "sft_loss": 1.7972770929336548, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 9.718597987579729, + "learning_rate": 2.1735234912415007e-06, + "logits/chosen": -0.25888964533805847, + "logits/rejected": -0.17799128592014313, + "logps/chosen": -1.749355673789978, + "logps/rejected": -2.605766534805298, + "loss": 0.6546, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.749355673789978, + "rewards/margins": 0.8564105033874512, + "rewards/rejected": -2.605766534805298, + "sft_loss": 1.746603012084961, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 9.033444917435077, + "learning_rate": 2.1693455012833388e-06, + "logits/chosen": -0.4220157563686371, + "logits/rejected": -0.1659930944442749, + "logps/chosen": -1.6804784536361694, + "logps/rejected": -2.5973219871520996, + "loss": 0.6568, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6804784536361694, + "rewards/margins": 0.9168437123298645, + "rewards/rejected": -2.5973219871520996, + "sft_loss": 1.7155706882476807, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 10.432645490776679, + "learning_rate": 2.1651610173321877e-06, + "logits/chosen": -0.31514835357666016, + "logits/rejected": -0.1127225011587143, + "logps/chosen": -1.734143853187561, + "logps/rejected": -2.6728904247283936, + "loss": 0.6587, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.734143853187561, + "rewards/margins": 0.9387462735176086, + "rewards/rejected": -2.6728904247283936, + "sft_loss": 1.746917963027954, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 8.798045041122512, + "learning_rate": 2.1609700799859287e-06, + "logits/chosen": -0.2637856602668762, + "logits/rejected": -0.07604242861270905, + "logps/chosen": -1.7247288227081299, + "logps/rejected": -2.5924761295318604, + "loss": 0.6642, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7247288227081299, + "rewards/margins": 0.8677471280097961, + "rewards/rejected": -2.5924761295318604, + "sft_loss": 1.7420657873153687, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 12.482508611552822, + "learning_rate": 2.1567727299050555e-06, + "logits/chosen": -0.18308880925178528, + "logits/rejected": -0.01936594396829605, + "logps/chosen": -1.6722066402435303, + "logps/rejected": -2.7605667114257812, + "loss": 0.6528, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6722066402435303, + "rewards/margins": 1.0883598327636719, + "rewards/rejected": -2.7605667114257812, + "sft_loss": 1.7239611148834229, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 10.514418161646946, + "learning_rate": 2.152569007812276e-06, + "logits/chosen": -0.25235840678215027, + "logits/rejected": -0.04004128649830818, + "logps/chosen": -1.748984932899475, + "logps/rejected": -2.873537063598633, + "loss": 0.6379, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.748984932899475, + "rewards/margins": 1.1245522499084473, + "rewards/rejected": -2.873537063598633, + "sft_loss": 1.8425092697143555, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 9.25848867545407, + "learning_rate": 2.1483589544921202e-06, + "logits/chosen": -0.1993493139743805, + "logits/rejected": 0.022964054718613625, + "logps/chosen": -1.7990487813949585, + "logps/rejected": -2.8367419242858887, + "loss": 0.6619, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7990487813949585, + "rewards/margins": 1.0376932621002197, + "rewards/rejected": -2.8367419242858887, + "sft_loss": 1.8404858112335205, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 10.107413557567574, + "learning_rate": 2.144142610790545e-06, + "logits/chosen": -0.17361178994178772, + "logits/rejected": 0.04688585549592972, + "logps/chosen": -1.7829437255859375, + "logps/rejected": -2.7042758464813232, + "loss": 0.652, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7829437255859375, + "rewards/margins": 0.9213320016860962, + "rewards/rejected": -2.7042758464813232, + "sft_loss": 1.8419139385223389, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 9.000121174768116, + "learning_rate": 2.1399200176145344e-06, + "logits/chosen": -0.2919735908508301, + "logits/rejected": -0.03339837118983269, + "logps/chosen": -1.6188087463378906, + "logps/rejected": -2.5424160957336426, + "loss": 0.6578, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6188087463378906, + "rewards/margins": 0.9236071705818176, + "rewards/rejected": -2.5424160957336426, + "sft_loss": 1.657747507095337, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 9.970070470489274, + "learning_rate": 2.1356912159317067e-06, + "logits/chosen": -0.20595316588878632, + "logits/rejected": 0.05183644965291023, + "logps/chosen": -1.8907188177108765, + "logps/rejected": -2.9290881156921387, + "loss": 0.6846, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8907188177108765, + "rewards/margins": 1.0383695363998413, + "rewards/rejected": -2.9290881156921387, + "sft_loss": 1.9077237844467163, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 9.730449938501128, + "learning_rate": 2.1314562467699133e-06, + "logits/chosen": -0.10945296287536621, + "logits/rejected": 0.033916451036930084, + "logps/chosen": -1.73921799659729, + "logps/rejected": -2.6286468505859375, + "loss": 0.6592, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.73921799659729, + "rewards/margins": 0.8894292116165161, + "rewards/rejected": -2.6286468505859375, + "sft_loss": 1.737277626991272, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 8.618815475427098, + "learning_rate": 2.1272151512168453e-06, + "logits/chosen": -0.09236228466033936, + "logits/rejected": -0.020017240196466446, + "logps/chosen": -1.6293678283691406, + "logps/rejected": -2.670914888381958, + "loss": 0.6069, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6293678283691406, + "rewards/margins": 1.041547179222107, + "rewards/rejected": -2.670914888381958, + "sft_loss": 1.6871839761734009, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 8.556501943617222, + "learning_rate": 2.122967970419629e-06, + "logits/chosen": -0.3508361876010895, + "logits/rejected": -0.13678878545761108, + "logps/chosen": -1.6169979572296143, + "logps/rejected": -2.42988920211792, + "loss": 0.6512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6169979572296143, + "rewards/margins": 0.8128914833068848, + "rewards/rejected": -2.42988920211792, + "sft_loss": 1.6969658136367798, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 13.939047832399783, + "learning_rate": 2.118714745584431e-06, + "logits/chosen": -0.17858004570007324, + "logits/rejected": 0.028571343049407005, + "logps/chosen": -1.6750202178955078, + "logps/rejected": -2.4766621589660645, + "loss": 0.6698, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6750202178955078, + "rewards/margins": 0.801642119884491, + "rewards/rejected": -2.4766621589660645, + "sft_loss": 1.7185461521148682, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.3708411455154419, + "eval_logits/rejected": 0.5028015375137329, + "eval_logps/chosen": -1.8658210039138794, + "eval_logps/rejected": -2.657525062561035, + "eval_loss": 0.7332260012626648, + "eval_rewards/accuracies": 0.7017804384231567, + "eval_rewards/chosen": -1.8658210039138794, + "eval_rewards/margins": 0.7917039394378662, + "eval_rewards/rejected": -2.657525062561035, + "eval_runtime": 44.466, + "eval_samples_per_second": 30.248, + "eval_sft_loss": 1.8307745456695557, + "eval_steps_per_second": 7.579, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 7.013331295730143, + "learning_rate": 2.1144555179760582e-06, + "logits/chosen": -0.09877729415893555, + "logits/rejected": 0.0729081779718399, + "logps/chosen": -1.7698421478271484, + "logps/rejected": -2.887197971343994, + "loss": 0.647, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7698421478271484, + "rewards/margins": 1.1173558235168457, + "rewards/rejected": -2.887197971343994, + "sft_loss": 1.81331467628479, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 11.39958801066816, + "learning_rate": 2.110190328917555e-06, + "logits/chosen": -0.23239700496196747, + "logits/rejected": 0.026508072391152382, + "logps/chosen": -1.7946094274520874, + "logps/rejected": -2.501302480697632, + "loss": 0.7146, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7946094274520874, + "rewards/margins": 0.706693172454834, + "rewards/rejected": -2.501302480697632, + "sft_loss": 1.8153241872787476, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 5.811504402152971, + "learning_rate": 2.1059192197898044e-06, + "logits/chosen": -0.10666403919458389, + "logits/rejected": 0.0190547164529562, + "logps/chosen": -1.6683290004730225, + "logps/rejected": -2.7554128170013428, + "loss": 0.6516, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6683290004730225, + "rewards/margins": 1.0870840549468994, + "rewards/rejected": -2.7554128170013428, + "sft_loss": 1.7128015756607056, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 9.912255314604257, + "learning_rate": 2.1016422320311257e-06, + "logits/chosen": -0.2023269683122635, + "logits/rejected": -0.04736703261733055, + "logps/chosen": -1.7052671909332275, + "logps/rejected": -2.5506842136383057, + "loss": 0.6552, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7052671909332275, + "rewards/margins": 0.8454171419143677, + "rewards/rejected": -2.5506842136383057, + "sft_loss": 1.7814832925796509, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 7.379974362789548, + "learning_rate": 2.097359407136873e-06, + "logits/chosen": -0.18297527730464935, + "logits/rejected": -0.0790904313325882, + "logps/chosen": -1.5330262184143066, + "logps/rejected": -2.1949267387390137, + "loss": 0.6696, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5330262184143066, + "rewards/margins": 0.6619004011154175, + "rewards/rejected": -2.1949267387390137, + "sft_loss": 1.6252708435058594, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 8.932389676446226, + "learning_rate": 2.093070786659033e-06, + "logits/chosen": -0.17055395245552063, + "logits/rejected": -0.08287496864795685, + "logps/chosen": -1.6713578701019287, + "logps/rejected": -2.418421983718872, + "loss": 0.689, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6713578701019287, + "rewards/margins": 0.7470639944076538, + "rewards/rejected": -2.418421983718872, + "sft_loss": 1.7103633880615234, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 6.242947917759311, + "learning_rate": 2.0887764122058195e-06, + "logits/chosen": -0.199593186378479, + "logits/rejected": -0.01157109159976244, + "logps/chosen": -1.6459394693374634, + "logps/rejected": -2.19657039642334, + "loss": 0.7567, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6459394693374634, + "rewards/margins": 0.5506308078765869, + "rewards/rejected": -2.19657039642334, + "sft_loss": 1.6807940006256104, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 7.125673957478325, + "learning_rate": 2.084476325441272e-06, + "logits/chosen": -0.25697240233421326, + "logits/rejected": -0.1042226105928421, + "logps/chosen": -1.5503084659576416, + "logps/rejected": -2.366608142852783, + "loss": 0.6555, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5503084659576416, + "rewards/margins": 0.8162997364997864, + "rewards/rejected": -2.366608142852783, + "sft_loss": 1.5341964960098267, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 10.415414680809302, + "learning_rate": 2.0801705680848523e-06, + "logits/chosen": -0.2940608859062195, + "logits/rejected": -0.10844705253839493, + "logps/chosen": -1.6265678405761719, + "logps/rejected": -2.294830799102783, + "loss": 0.6989, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6265678405761719, + "rewards/margins": 0.668262779712677, + "rewards/rejected": -2.294830799102783, + "sft_loss": 1.6152546405792236, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 11.225044180641651, + "learning_rate": 2.0758591819110364e-06, + "logits/chosen": -0.27031898498535156, + "logits/rejected": -0.07324796915054321, + "logps/chosen": -1.5835988521575928, + "logps/rejected": -2.4217069149017334, + "loss": 0.65, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5835988521575928, + "rewards/margins": 0.8381081819534302, + "rewards/rejected": -2.4217069149017334, + "sft_loss": 1.6001770496368408, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 8.44842759276969, + "learning_rate": 2.071542208748912e-06, + "logits/chosen": -0.3392409682273865, + "logits/rejected": -0.014614325948059559, + "logps/chosen": -1.6830085515975952, + "logps/rejected": -2.501263380050659, + "loss": 0.67, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6830085515975952, + "rewards/margins": 0.8182545900344849, + "rewards/rejected": -2.501263380050659, + "sft_loss": 1.7062549591064453, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 15.463460340085286, + "learning_rate": 2.0672196904817715e-06, + "logits/chosen": -0.24148209393024445, + "logits/rejected": -0.09284202754497528, + "logps/chosen": -1.7609403133392334, + "logps/rejected": -2.5735113620758057, + "loss": 0.7034, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7609403133392334, + "rewards/margins": 0.8125707507133484, + "rewards/rejected": -2.5735113620758057, + "sft_loss": 1.7544368505477905, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 7.50603733440921, + "learning_rate": 2.0628916690467066e-06, + "logits/chosen": -0.2833019196987152, + "logits/rejected": -0.17756600677967072, + "logps/chosen": -1.7068138122558594, + "logps/rejected": -2.726548671722412, + "loss": 0.6412, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7068138122558594, + "rewards/margins": 1.019735336303711, + "rewards/rejected": -2.726548671722412, + "sft_loss": 1.6803268194198608, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 10.466797247061209, + "learning_rate": 2.0585581864341995e-06, + "logits/chosen": -0.4477129578590393, + "logits/rejected": -0.2852556109428406, + "logps/chosen": -1.763668417930603, + "logps/rejected": -2.5296313762664795, + "loss": 0.7053, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.763668417930603, + "rewards/margins": 0.7659630179405212, + "rewards/rejected": -2.5296313762664795, + "sft_loss": 1.833539605140686, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 13.883944751015369, + "learning_rate": 2.0542192846877177e-06, + "logits/chosen": -0.2172568291425705, + "logits/rejected": -0.15765947103500366, + "logps/chosen": -1.688741683959961, + "logps/rejected": -2.4696295261383057, + "loss": 0.6615, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.688741683959961, + "rewards/margins": 0.7808881402015686, + "rewards/rejected": -2.4696295261383057, + "sft_loss": 1.7033189535140991, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 7.087008884341495, + "learning_rate": 2.049875005903305e-06, + "logits/chosen": -0.36735719442367554, + "logits/rejected": -0.13228847086429596, + "logps/chosen": -1.671006441116333, + "logps/rejected": -2.6411962509155273, + "loss": 0.6407, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.671006441116333, + "rewards/margins": 0.9701893925666809, + "rewards/rejected": -2.6411962509155273, + "sft_loss": 1.818447470664978, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 10.870812188582518, + "learning_rate": 2.045525392229174e-06, + "logits/chosen": -0.10764901340007782, + "logits/rejected": 0.10250584781169891, + "logps/chosen": -1.6554419994354248, + "logps/rejected": -2.5783419609069824, + "loss": 0.6931, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6554419994354248, + "rewards/margins": 0.9229000806808472, + "rewards/rejected": -2.5783419609069824, + "sft_loss": 1.7975515127182007, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 12.510974976874456, + "learning_rate": 2.0411704858652946e-06, + "logits/chosen": -0.24468784034252167, + "logits/rejected": -0.1647200733423233, + "logps/chosen": -1.6226732730865479, + "logps/rejected": -2.521732807159424, + "loss": 0.6373, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6226732730865479, + "rewards/margins": 0.8990596532821655, + "rewards/rejected": -2.521732807159424, + "sft_loss": 1.6786104440689087, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 10.37553722653345, + "learning_rate": 2.0368103290629877e-06, + "logits/chosen": -0.14447571337223053, + "logits/rejected": -0.11947256326675415, + "logps/chosen": -1.6237910985946655, + "logps/rejected": -2.3520302772521973, + "loss": 0.6834, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6237910985946655, + "rewards/margins": 0.7282392382621765, + "rewards/rejected": -2.3520302772521973, + "sft_loss": 1.6659843921661377, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 12.08940901319854, + "learning_rate": 2.0324449641245145e-06, + "logits/chosen": -0.13080765306949615, + "logits/rejected": 0.09991051256656647, + "logps/chosen": -1.6093822717666626, + "logps/rejected": -2.3164443969726562, + "loss": 0.6842, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6093822717666626, + "rewards/margins": 0.7070624232292175, + "rewards/rejected": -2.3164443969726562, + "sft_loss": 1.6584736108779907, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 6.952355678407748, + "learning_rate": 2.028074433402664e-06, + "logits/chosen": -0.14452217519283295, + "logits/rejected": 0.09375777095556259, + "logps/chosen": -1.6005226373672485, + "logps/rejected": -2.555150270462036, + "loss": 0.6405, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6005226373672485, + "rewards/margins": 0.9546276926994324, + "rewards/rejected": -2.555150270462036, + "sft_loss": 1.6105680465698242, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 17.396140250961892, + "learning_rate": 2.023698779300344e-06, + "logits/chosen": -0.21792694926261902, + "logits/rejected": -0.025183891877532005, + "logps/chosen": -1.6324084997177124, + "logps/rejected": -2.5301878452301025, + "loss": 0.6377, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6324084997177124, + "rewards/margins": 0.8977789878845215, + "rewards/rejected": -2.5301878452301025, + "sft_loss": 1.7066516876220703, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 7.09240283401546, + "learning_rate": 2.019318044270171e-06, + "logits/chosen": -0.10281024128198624, + "logits/rejected": 0.037469811737537384, + "logps/chosen": -1.694016695022583, + "logps/rejected": -2.766887664794922, + "loss": 0.6299, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.694016695022583, + "rewards/margins": 1.0728710889816284, + "rewards/rejected": -2.766887664794922, + "sft_loss": 1.7665046453475952, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 7.659323922479531, + "learning_rate": 2.0149322708140545e-06, + "logits/chosen": -0.19877466559410095, + "logits/rejected": -0.08290411531925201, + "logps/chosen": -1.8053525686264038, + "logps/rejected": -2.6758005619049072, + "loss": 0.6598, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8053525686264038, + "rewards/margins": 0.8704478144645691, + "rewards/rejected": -2.6758005619049072, + "sft_loss": 1.7328903675079346, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 16.288994541979754, + "learning_rate": 2.0105415014827886e-06, + "logits/chosen": -0.22056837379932404, + "logits/rejected": -0.09690554440021515, + "logps/chosen": -1.9371941089630127, + "logps/rejected": -3.0349159240722656, + "loss": 0.693, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9371941089630127, + "rewards/margins": 1.0977216958999634, + "rewards/rejected": -3.0349159240722656, + "sft_loss": 2.0370259284973145, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 10.201814213064157, + "learning_rate": 2.006145778875636e-06, + "logits/chosen": -0.1919160783290863, + "logits/rejected": -0.1108192577958107, + "logps/chosen": -1.7822622060775757, + "logps/rejected": -2.6225311756134033, + "loss": 0.7171, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.7822622060775757, + "rewards/margins": 0.8402689695358276, + "rewards/rejected": -2.6225311756134033, + "sft_loss": 1.843174934387207, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 9.233419850296873, + "learning_rate": 2.0017451456399165e-06, + "logits/chosen": -0.28379935026168823, + "logits/rejected": -0.08224531263113022, + "logps/chosen": -1.7444589138031006, + "logps/rejected": -2.6347079277038574, + "loss": 0.6493, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7444589138031006, + "rewards/margins": 0.8902491331100464, + "rewards/rejected": -2.6347079277038574, + "sft_loss": 1.7515875101089478, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 12.342674686825728, + "learning_rate": 1.9973396444705934e-06, + "logits/chosen": -0.16968822479248047, + "logits/rejected": 0.08616310358047485, + "logps/chosen": -1.7560157775878906, + "logps/rejected": -2.532700300216675, + "loss": 0.696, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7560157775878906, + "rewards/margins": 0.7766846418380737, + "rewards/rejected": -2.532700300216675, + "sft_loss": 1.8065052032470703, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 11.152899998213138, + "learning_rate": 1.9929293181098588e-06, + "logits/chosen": -0.19315733015537262, + "logits/rejected": 0.08167880773544312, + "logps/chosen": -1.709506630897522, + "logps/rejected": -2.686140298843384, + "loss": 0.6498, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.709506630897522, + "rewards/margins": 0.9766336679458618, + "rewards/rejected": -2.686140298843384, + "sft_loss": 1.7340914011001587, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 10.255858787571785, + "learning_rate": 1.988514209346718e-06, + "logits/chosen": -0.19811835885047913, + "logits/rejected": 0.03873931244015694, + "logps/chosen": -1.832733392715454, + "logps/rejected": -2.5946521759033203, + "loss": 0.7137, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.832733392715454, + "rewards/margins": 0.7619189023971558, + "rewards/rejected": -2.5946521759033203, + "sft_loss": 1.8450162410736084, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 15.489292910590374, + "learning_rate": 1.984094361016575e-06, + "logits/chosen": -0.10327938944101334, + "logits/rejected": 0.0570363774895668, + "logps/chosen": -1.7769912481307983, + "logps/rejected": -2.718773603439331, + "loss": 0.7235, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7769912481307983, + "rewards/margins": 0.9417825937271118, + "rewards/rejected": -2.718773603439331, + "sft_loss": 1.8660234212875366, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 11.174376711520368, + "learning_rate": 1.9796698160008187e-06, + "logits/chosen": -0.14350536465644836, + "logits/rejected": 0.033099908381700516, + "logps/chosen": -1.7975308895111084, + "logps/rejected": -2.6718554496765137, + "loss": 0.6903, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7975308895111084, + "rewards/margins": 0.8743244409561157, + "rewards/rejected": -2.6718554496765137, + "sft_loss": 1.814619779586792, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 11.23257514840121, + "learning_rate": 1.975240617226404e-06, + "logits/chosen": -0.18500396609306335, + "logits/rejected": 0.023821836337447166, + "logps/chosen": -1.7192474603652954, + "logps/rejected": -2.6152572631835938, + "loss": 0.6758, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7192474603652954, + "rewards/margins": 0.8960098028182983, + "rewards/rejected": -2.6152572631835938, + "sft_loss": 1.811990737915039, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 6.369737713360051, + "learning_rate": 1.9708068076654364e-06, + "logits/chosen": -0.05014703422784805, + "logits/rejected": 0.04246297478675842, + "logps/chosen": -1.6130955219268799, + "logps/rejected": -2.40543794631958, + "loss": 0.6577, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6130955219268799, + "rewards/margins": 0.792342483997345, + "rewards/rejected": -2.40543794631958, + "sft_loss": 1.639413595199585, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 7.0527777236925795, + "learning_rate": 1.966368430334756e-06, + "logits/chosen": -0.18567633628845215, + "logits/rejected": 0.03269173949956894, + "logps/chosen": -1.575241208076477, + "logps/rejected": -2.3695342540740967, + "loss": 0.6552, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.575241208076477, + "rewards/margins": 0.7942931056022644, + "rewards/rejected": -2.3695342540740967, + "sft_loss": 1.6236913204193115, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 8.006785224860513, + "learning_rate": 1.961925528295519e-06, + "logits/chosen": -0.0933329164981842, + "logits/rejected": 0.025968652218580246, + "logps/chosen": -1.6438376903533936, + "logps/rejected": -2.2044715881347656, + "loss": 0.7073, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6438376903533936, + "rewards/margins": 0.5606337785720825, + "rewards/rejected": -2.2044715881347656, + "sft_loss": 1.7238785028457642, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 8.934485275229912, + "learning_rate": 1.9574781446527806e-06, + "logits/chosen": 0.0503157377243042, + "logits/rejected": 0.274901419878006, + "logps/chosen": -1.5226280689239502, + "logps/rejected": -2.387205123901367, + "loss": 0.6122, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5226280689239502, + "rewards/margins": 0.8645769953727722, + "rewards/rejected": -2.387205123901367, + "sft_loss": 1.559669852256775, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 9.143236160600896, + "learning_rate": 1.9530263225550765e-06, + "logits/chosen": -0.08170856535434723, + "logits/rejected": 0.09695029258728027, + "logps/chosen": -1.5636448860168457, + "logps/rejected": -2.303366184234619, + "loss": 0.6796, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5636448860168457, + "rewards/margins": 0.7397211790084839, + "rewards/rejected": -2.303366184234619, + "sft_loss": 1.6848373413085938, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 7.935835067471001, + "learning_rate": 1.9485701051940037e-06, + "logits/chosen": -0.05064685270190239, + "logits/rejected": 0.035549163818359375, + "logps/chosen": -1.5907132625579834, + "logps/rejected": -2.247149705886841, + "loss": 0.6933, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.5907132625579834, + "rewards/margins": 0.6564362645149231, + "rewards/rejected": -2.247149705886841, + "sft_loss": 1.6183847188949585, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 13.587108491088857, + "learning_rate": 1.9441095358038035e-06, + "logits/chosen": 0.04309100657701492, + "logits/rejected": 0.22402247786521912, + "logps/chosen": -1.639139175415039, + "logps/rejected": -2.354151964187622, + "loss": 0.6767, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.639139175415039, + "rewards/margins": 0.7150126695632935, + "rewards/rejected": -2.354151964187622, + "sft_loss": 1.6775423288345337, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 8.460133838125511, + "learning_rate": 1.9396446576609387e-06, + "logits/chosen": 0.08298873901367188, + "logits/rejected": 0.17587396502494812, + "logps/chosen": -1.6868808269500732, + "logps/rejected": -2.4492154121398926, + "loss": 0.6663, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6868808269500732, + "rewards/margins": 0.7623344659805298, + "rewards/rejected": -2.4492154121398926, + "sft_loss": 1.7489010095596313, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 15.705647021024319, + "learning_rate": 1.935175514083677e-06, + "logits/chosen": 0.12293801456689835, + "logits/rejected": 0.19598381221294403, + "logps/chosen": -1.7710424661636353, + "logps/rejected": -2.66403865814209, + "loss": 0.7171, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7710424661636353, + "rewards/margins": 0.8929961919784546, + "rewards/rejected": -2.66403865814209, + "sft_loss": 1.8463138341903687, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 15.344801126200535, + "learning_rate": 1.9307021484316693e-06, + "logits/chosen": -0.021174585446715355, + "logits/rejected": 0.21488766372203827, + "logps/chosen": -1.6551218032836914, + "logps/rejected": -2.5742876529693604, + "loss": 0.6726, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6551218032836914, + "rewards/margins": 0.9191659688949585, + "rewards/rejected": -2.5742876529693604, + "sft_loss": 1.7234073877334595, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 10.968618406303506, + "learning_rate": 1.926224604105529e-06, + "logits/chosen": 0.048258017748594284, + "logits/rejected": 0.034669678658246994, + "logps/chosen": -1.6769109964370728, + "logps/rejected": -2.3088769912719727, + "loss": 0.7337, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6769109964370728, + "rewards/margins": 0.6319661736488342, + "rewards/rejected": -2.3088769912719727, + "sft_loss": 1.7413244247436523, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 13.51222024151536, + "learning_rate": 1.92174292454641e-06, + "logits/chosen": 0.0023163154255598783, + "logits/rejected": 0.2284691035747528, + "logps/chosen": -1.6658554077148438, + "logps/rejected": -2.547213554382324, + "loss": 0.6688, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6658554077148438, + "rewards/margins": 0.8813580274581909, + "rewards/rejected": -2.547213554382324, + "sft_loss": 1.6388963460922241, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 7.5136948275784965, + "learning_rate": 1.917257153235587e-06, + "logits/chosen": -0.14208294451236725, + "logits/rejected": 0.1281939297914505, + "logps/chosen": -1.6982141733169556, + "logps/rejected": -2.5021536350250244, + "loss": 0.6809, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6982141733169556, + "rewards/margins": 0.8039396405220032, + "rewards/rejected": -2.5021536350250244, + "sft_loss": 1.706189751625061, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 12.495512298700909, + "learning_rate": 1.9127673336940335e-06, + "logits/chosen": -0.06291162967681885, + "logits/rejected": 0.0830179750919342, + "logps/chosen": -1.616612434387207, + "logps/rejected": -2.4798736572265625, + "loss": 0.6679, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.616612434387207, + "rewards/margins": 0.8632608652114868, + "rewards/rejected": -2.4798736572265625, + "sft_loss": 1.6792612075805664, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 8.40465437215762, + "learning_rate": 1.908273509481998e-06, + "logits/chosen": 0.00954020582139492, + "logits/rejected": 0.1751098483800888, + "logps/chosen": -1.7633380889892578, + "logps/rejected": -2.5841474533081055, + "loss": 0.6787, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7633380889892578, + "rewards/margins": 0.8208094835281372, + "rewards/rejected": -2.5841474533081055, + "sft_loss": 1.7911789417266846, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 11.217731923889714, + "learning_rate": 1.9037757241985832e-06, + "logits/chosen": -0.03795923292636871, + "logits/rejected": 0.07577961683273315, + "logps/chosen": -1.6473220586776733, + "logps/rejected": -2.5562381744384766, + "loss": 0.6378, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6473220586776733, + "rewards/margins": 0.9089161157608032, + "rewards/rejected": -2.5562381744384766, + "sft_loss": 1.6741434335708618, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 9.172700095556124, + "learning_rate": 1.899274021481321e-06, + "logits/chosen": -0.0980779379606247, + "logits/rejected": 0.1461961269378662, + "logps/chosen": -1.7238715887069702, + "logps/rejected": -2.869298219680786, + "loss": 0.6641, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7238715887069702, + "rewards/margins": 1.1454265117645264, + "rewards/rejected": -2.869298219680786, + "sft_loss": 1.7315521240234375, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 13.165880615063447, + "learning_rate": 1.8947684450057516e-06, + "logits/chosen": 0.014770316891372204, + "logits/rejected": 0.26207587122917175, + "logps/chosen": -1.6157658100128174, + "logps/rejected": -2.5762152671813965, + "loss": 0.6117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6157658100128174, + "rewards/margins": 0.9604493379592896, + "rewards/rejected": -2.5762152671813965, + "sft_loss": 1.691689133644104, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 18.071064934386342, + "learning_rate": 1.890259038484997e-06, + "logits/chosen": 0.08018441498279572, + "logits/rejected": 0.19620773196220398, + "logps/chosen": -1.7247365713119507, + "logps/rejected": -2.584254741668701, + "loss": 0.7137, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7247365713119507, + "rewards/margins": 0.8595183491706848, + "rewards/rejected": -2.584254741668701, + "sft_loss": 1.7407766580581665, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 11.9692190525311, + "learning_rate": 1.8857458456693398e-06, + "logits/chosen": -0.14878655970096588, + "logits/rejected": 0.09677322208881378, + "logps/chosen": -1.712206244468689, + "logps/rejected": -2.5729756355285645, + "loss": 0.6599, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.712206244468689, + "rewards/margins": 0.8607694506645203, + "rewards/rejected": -2.5729756355285645, + "sft_loss": 1.8201481103897095, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 10.683630966053455, + "learning_rate": 1.881228910345796e-06, + "logits/chosen": 0.02798542007803917, + "logits/rejected": 0.19053414463996887, + "logps/chosen": -1.7972015142440796, + "logps/rejected": -2.5500593185424805, + "loss": 0.686, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7972015142440796, + "rewards/margins": 0.7528579235076904, + "rewards/rejected": -2.5500593185424805, + "sft_loss": 1.8112598657608032, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 14.183303772598652, + "learning_rate": 1.8767082763376916e-06, + "logits/chosen": -0.0879894495010376, + "logits/rejected": 0.17549023032188416, + "logps/chosen": -1.79047429561615, + "logps/rejected": -2.5765414237976074, + "loss": 0.7087, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.79047429561615, + "rewards/margins": 0.7860671281814575, + "rewards/rejected": -2.5765414237976074, + "sft_loss": 1.6728007793426514, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 19.600034784025812, + "learning_rate": 1.8721839875042386e-06, + "logits/chosen": -0.04705687612295151, + "logits/rejected": 0.2245139628648758, + "logps/chosen": -1.707383155822754, + "logps/rejected": -2.55816650390625, + "loss": 0.6819, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.707383155822754, + "rewards/margins": 0.8507832288742065, + "rewards/rejected": -2.55816650390625, + "sft_loss": 1.7719299793243408, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 9.769090087528488, + "learning_rate": 1.8676560877401062e-06, + "logits/chosen": -0.0794854611158371, + "logits/rejected": 0.29655495285987854, + "logps/chosen": -1.6657311916351318, + "logps/rejected": -2.5681443214416504, + "loss": 0.6365, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6657311916351318, + "rewards/margins": 0.9024130702018738, + "rewards/rejected": -2.5681443214416504, + "sft_loss": 1.7102835178375244, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 17.409084631580033, + "learning_rate": 1.8631246209749982e-06, + "logits/chosen": -0.1895194798707962, + "logits/rejected": 0.25256744027137756, + "logps/chosen": -1.6916043758392334, + "logps/rejected": -2.622830390930176, + "loss": 0.6574, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6916043758392334, + "rewards/margins": 0.9312260746955872, + "rewards/rejected": -2.622830390930176, + "sft_loss": 1.6841598749160767, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 10.378353695060042, + "learning_rate": 1.8585896311732247e-06, + "logits/chosen": 0.15478946268558502, + "logits/rejected": 0.17653344571590424, + "logps/chosen": -1.6762475967407227, + "logps/rejected": -2.5309839248657227, + "loss": 0.6824, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6762475967407227, + "rewards/margins": 0.8547362089157104, + "rewards/rejected": -2.5309839248657227, + "sft_loss": 1.6760101318359375, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 12.61956310328868, + "learning_rate": 1.854051162333277e-06, + "logits/chosen": 0.02709706500172615, + "logits/rejected": 0.3328257203102112, + "logps/chosen": -1.6858981847763062, + "logps/rejected": -2.506181478500366, + "loss": 0.6767, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6858981847763062, + "rewards/margins": 0.8202834129333496, + "rewards/rejected": -2.506181478500366, + "sft_loss": 1.7573429346084595, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 7.289511410035405, + "learning_rate": 1.8495092584873992e-06, + "logits/chosen": -0.05549658462405205, + "logits/rejected": 0.3597896695137024, + "logps/chosen": -1.5350788831710815, + "logps/rejected": -2.65586519241333, + "loss": 0.5958, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5350788831710815, + "rewards/margins": 1.1207860708236694, + "rewards/rejected": -2.65586519241333, + "sft_loss": 1.5353453159332275, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 9.031671007744759, + "learning_rate": 1.844963963701163e-06, + "logits/chosen": 0.15731653571128845, + "logits/rejected": 0.17976097762584686, + "logps/chosen": -1.6493396759033203, + "logps/rejected": -2.5455853939056396, + "loss": 0.6425, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6493396759033203, + "rewards/margins": 0.8962458372116089, + "rewards/rejected": -2.5455853939056396, + "sft_loss": 1.652875542640686, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 7.0803131127486, + "learning_rate": 1.8404153220730383e-06, + "logits/chosen": -0.05579300969839096, + "logits/rejected": 0.12802644073963165, + "logps/chosen": -1.57602059841156, + "logps/rejected": -2.395585775375366, + "loss": 0.6724, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.57602059841156, + "rewards/margins": 0.8195652961730957, + "rewards/rejected": -2.395585775375366, + "sft_loss": 1.6549571752548218, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 9.39056548534232, + "learning_rate": 1.8358633777339654e-06, + "logits/chosen": 0.07542654126882553, + "logits/rejected": 0.2509918808937073, + "logps/chosen": -1.641409158706665, + "logps/rejected": -2.3671250343322754, + "loss": 0.6663, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.641409158706665, + "rewards/margins": 0.725716233253479, + "rewards/rejected": -2.3671250343322754, + "sft_loss": 1.6149475574493408, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 8.775922496769965, + "learning_rate": 1.831308174846929e-06, + "logits/chosen": 0.039566848427057266, + "logits/rejected": 0.22721309959888458, + "logps/chosen": -1.639269232749939, + "logps/rejected": -2.531994104385376, + "loss": 0.6459, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.639269232749939, + "rewards/margins": 0.892724871635437, + "rewards/rejected": -2.531994104385376, + "sft_loss": 1.6838048696517944, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 11.858878928388116, + "learning_rate": 1.826749757606527e-06, + "logits/chosen": 0.053937554359436035, + "logits/rejected": 0.36660197377204895, + "logps/chosen": -1.6890392303466797, + "logps/rejected": -2.7090961933135986, + "loss": 0.6462, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6890392303466797, + "rewards/margins": 1.0200568437576294, + "rewards/rejected": -2.7090961933135986, + "sft_loss": 1.7080329656600952, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 7.710103826960804, + "learning_rate": 1.8221881702385435e-06, + "logits/chosen": 0.023390358313918114, + "logits/rejected": 0.3567085266113281, + "logps/chosen": -1.600010633468628, + "logps/rejected": -2.635223150253296, + "loss": 0.6197, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.600010633468628, + "rewards/margins": 1.0352122783660889, + "rewards/rejected": -2.635223150253296, + "sft_loss": 1.7037324905395508, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 14.175525032890683, + "learning_rate": 1.8176234569995196e-06, + "logits/chosen": 0.0604880228638649, + "logits/rejected": 0.25084275007247925, + "logps/chosen": -1.7276502847671509, + "logps/rejected": -2.919510841369629, + "loss": 0.6415, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7276502847671509, + "rewards/margins": 1.1918604373931885, + "rewards/rejected": -2.919510841369629, + "sft_loss": 1.7756588459014893, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 9.28594869278176, + "learning_rate": 1.8130556621763223e-06, + "logits/chosen": -0.02487350068986416, + "logits/rejected": 0.2390095293521881, + "logps/chosen": -1.6136411428451538, + "logps/rejected": -2.550739288330078, + "loss": 0.6491, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6136411428451538, + "rewards/margins": 0.9370980262756348, + "rewards/rejected": -2.550739288330078, + "sft_loss": 1.6545941829681396, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 8.845444980867176, + "learning_rate": 1.808484830085718e-06, + "logits/chosen": 0.12213625013828278, + "logits/rejected": 0.3251255750656128, + "logps/chosen": -1.7759946584701538, + "logps/rejected": -2.85046124458313, + "loss": 0.6402, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7759946584701538, + "rewards/margins": 1.0744664669036865, + "rewards/rejected": -2.85046124458313, + "sft_loss": 1.8285300731658936, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 16.5671755062521, + "learning_rate": 1.8039110050739394e-06, + "logits/chosen": 0.11437705904245377, + "logits/rejected": 0.31481966376304626, + "logps/chosen": -1.6738694906234741, + "logps/rejected": -2.713113784790039, + "loss": 0.6268, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6738694906234741, + "rewards/margins": 1.0392444133758545, + "rewards/rejected": -2.713113784790039, + "sft_loss": 1.7562519311904907, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 9.197601666045005, + "learning_rate": 1.7993342315162563e-06, + "logits/chosen": -0.05201379582285881, + "logits/rejected": 0.29744741320610046, + "logps/chosen": -1.7652403116226196, + "logps/rejected": -2.9192309379577637, + "loss": 0.6118, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7652403116226196, + "rewards/margins": 1.1539907455444336, + "rewards/rejected": -2.9192309379577637, + "sft_loss": 1.8131014108657837, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 8.21473865646731, + "learning_rate": 1.794754553816546e-06, + "logits/chosen": 0.09198231995105743, + "logits/rejected": 0.3146513104438782, + "logps/chosen": -1.705095887184143, + "logps/rejected": -2.66255521774292, + "loss": 0.629, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.705095887184143, + "rewards/margins": 0.9574591517448425, + "rewards/rejected": -2.66255521774292, + "sft_loss": 1.8125232458114624, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 12.512621153535388, + "learning_rate": 1.7901720164068623e-06, + "logits/chosen": -0.0585850365459919, + "logits/rejected": 0.0636616125702858, + "logps/chosen": -1.644086480140686, + "logps/rejected": -2.435697078704834, + "loss": 0.6884, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.644086480140686, + "rewards/margins": 0.7916107177734375, + "rewards/rejected": -2.435697078704834, + "sft_loss": 1.699789047241211, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 12.624327577956587, + "learning_rate": 1.7855866637470027e-06, + "logits/chosen": 0.05995064973831177, + "logits/rejected": 0.19660253822803497, + "logps/chosen": -1.6978027820587158, + "logps/rejected": -2.7610092163085938, + "loss": 0.6434, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6978027820587158, + "rewards/margins": 1.0632063150405884, + "rewards/rejected": -2.7610092163085938, + "sft_loss": 1.748500108718872, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 7.568036513490646, + "learning_rate": 1.780998540324079e-06, + "logits/chosen": 0.12871405482292175, + "logits/rejected": 0.32352548837661743, + "logps/chosen": -1.7610708475112915, + "logps/rejected": -2.594722032546997, + "loss": 0.7147, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7610708475112915, + "rewards/margins": 0.833651065826416, + "rewards/rejected": -2.594722032546997, + "sft_loss": 1.7433416843414307, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 7.421231582880555, + "learning_rate": 1.776407690652084e-06, + "logits/chosen": 0.012217795476317406, + "logits/rejected": 0.24228866398334503, + "logps/chosen": -1.7647712230682373, + "logps/rejected": -2.819415330886841, + "loss": 0.6542, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7647712230682373, + "rewards/margins": 1.054643988609314, + "rewards/rejected": -2.819415330886841, + "sft_loss": 1.765459418296814, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 11.134341558803175, + "learning_rate": 1.7718141592714628e-06, + "logits/chosen": 0.1779087483882904, + "logits/rejected": 0.13158239424228668, + "logps/chosen": -1.6792852878570557, + "logps/rejected": -2.597214698791504, + "loss": 0.7041, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6792852878570557, + "rewards/margins": 0.9179295301437378, + "rewards/rejected": -2.597214698791504, + "sft_loss": 1.7933791875839233, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 8.557433582386375, + "learning_rate": 1.7672179907486757e-06, + "logits/chosen": 0.2755883038043976, + "logits/rejected": 0.27128463983535767, + "logps/chosen": -1.5790361166000366, + "logps/rejected": -2.3961989879608154, + "loss": 0.685, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5790361166000366, + "rewards/margins": 0.817162811756134, + "rewards/rejected": -2.3961989879608154, + "sft_loss": 1.6117353439331055, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 11.627721912885242, + "learning_rate": 1.7626192296757708e-06, + "logits/chosen": 0.1090032309293747, + "logits/rejected": 0.20505475997924805, + "logps/chosen": -1.6978752613067627, + "logps/rejected": -2.478768825531006, + "loss": 0.6975, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6978752613067627, + "rewards/margins": 0.7808934450149536, + "rewards/rejected": -2.478768825531006, + "sft_loss": 1.7721240520477295, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.7225221395492554, + "eval_logits/rejected": 0.880984365940094, + "eval_logps/chosen": -1.7409367561340332, + "eval_logps/rejected": -2.4820258617401123, + "eval_loss": 0.7278200387954712, + "eval_rewards/accuracies": 0.68916916847229, + "eval_rewards/chosen": -1.7409367561340332, + "eval_rewards/margins": 0.7410891056060791, + "eval_rewards/rejected": -2.4820258617401123, + "eval_runtime": 44.6007, + "eval_samples_per_second": 30.157, + "eval_sft_loss": 1.7287054061889648, + "eval_steps_per_second": 7.556, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 6.69535209611385, + "learning_rate": 1.7580179206699475e-06, + "logits/chosen": -0.0975351482629776, + "logits/rejected": 0.19071190059185028, + "logps/chosen": -1.4848353862762451, + "logps/rejected": -2.3775861263275146, + "loss": 0.6259, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.4848353862762451, + "rewards/margins": 0.8927507400512695, + "rewards/rejected": -2.3775861263275146, + "sft_loss": 1.5436747074127197, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 13.330593601208003, + "learning_rate": 1.7534141083731262e-06, + "logits/chosen": 0.13205106556415558, + "logits/rejected": 0.27563416957855225, + "logps/chosen": -1.6868665218353271, + "logps/rejected": -2.556553363800049, + "loss": 0.682, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6868665218353271, + "rewards/margins": 0.8696869611740112, + "rewards/rejected": -2.556553363800049, + "sft_loss": 1.7678245306015015, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 8.232942043945219, + "learning_rate": 1.7488078374515143e-06, + "logits/chosen": 0.19379249215126038, + "logits/rejected": 0.3722130060195923, + "logps/chosen": -1.672188401222229, + "logps/rejected": -2.7497897148132324, + "loss": 0.6054, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.672188401222229, + "rewards/margins": 1.0776013135910034, + "rewards/rejected": -2.7497897148132324, + "sft_loss": 1.6930681467056274, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 9.209473289799588, + "learning_rate": 1.7441991525951722e-06, + "logits/chosen": 0.056553877890110016, + "logits/rejected": 0.4158708453178406, + "logps/chosen": -1.71318781375885, + "logps/rejected": -2.620293617248535, + "loss": 0.6823, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.71318781375885, + "rewards/margins": 0.9071057438850403, + "rewards/rejected": -2.620293617248535, + "sft_loss": 1.7801010608673096, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 20.748635995851046, + "learning_rate": 1.7395880985175808e-06, + "logits/chosen": -0.02775495871901512, + "logits/rejected": 0.28328195214271545, + "logps/chosen": -1.798081398010254, + "logps/rejected": -2.9397964477539062, + "loss": 0.6369, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.798081398010254, + "rewards/margins": 1.141715168952942, + "rewards/rejected": -2.9397964477539062, + "sft_loss": 1.7914934158325195, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 9.5241745735613, + "learning_rate": 1.7349747199552063e-06, + "logits/chosen": 0.16219733655452728, + "logits/rejected": 0.358007550239563, + "logps/chosen": -1.6569722890853882, + "logps/rejected": -2.6283373832702637, + "loss": 0.634, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6569722890853882, + "rewards/margins": 0.9713649749755859, + "rewards/rejected": -2.6283373832702637, + "sft_loss": 1.7422775030136108, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 11.82818399467519, + "learning_rate": 1.7303590616670683e-06, + "logits/chosen": 0.022676551714539528, + "logits/rejected": 0.27947741746902466, + "logps/chosen": -1.7626409530639648, + "logps/rejected": -2.8061025142669678, + "loss": 0.6341, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7626409530639648, + "rewards/margins": 1.043461799621582, + "rewards/rejected": -2.8061025142669678, + "sft_loss": 1.8145999908447266, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 12.272501099504066, + "learning_rate": 1.7257411684343042e-06, + "logits/chosen": 0.01731787994503975, + "logits/rejected": 0.1518246829509735, + "logps/chosen": -1.7838151454925537, + "logps/rejected": -2.5523946285247803, + "loss": 0.7211, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7838151454925537, + "rewards/margins": 0.768579363822937, + "rewards/rejected": -2.5523946285247803, + "sft_loss": 1.8348591327667236, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 8.989420292425729, + "learning_rate": 1.7211210850597333e-06, + "logits/chosen": 0.01916094496846199, + "logits/rejected": 0.18893815577030182, + "logps/chosen": -1.8312240839004517, + "logps/rejected": -2.7193756103515625, + "loss": 0.7274, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8312240839004517, + "rewards/margins": 0.8881517648696899, + "rewards/rejected": -2.7193756103515625, + "sft_loss": 1.772420883178711, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 9.066222177106056, + "learning_rate": 1.7164988563674256e-06, + "logits/chosen": -0.017062615603208542, + "logits/rejected": 0.14022260904312134, + "logps/chosen": -1.7420536279678345, + "logps/rejected": -2.8658266067504883, + "loss": 0.6622, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7420536279678345, + "rewards/margins": 1.1237730979919434, + "rewards/rejected": -2.8658266067504883, + "sft_loss": 1.751794457435608, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 11.820725471020838, + "learning_rate": 1.7118745272022635e-06, + "logits/chosen": -0.041482336819171906, + "logits/rejected": 0.24770434200763702, + "logps/chosen": -1.812748908996582, + "logps/rejected": -2.7241785526275635, + "loss": 0.6673, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.812748908996582, + "rewards/margins": 0.9114295840263367, + "rewards/rejected": -2.7241785526275635, + "sft_loss": 1.8595969676971436, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 9.634310646961953, + "learning_rate": 1.7072481424295097e-06, + "logits/chosen": -0.1236807107925415, + "logits/rejected": 0.18752439320087433, + "logps/chosen": -1.664668083190918, + "logps/rejected": -2.4004406929016113, + "loss": 0.6743, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.664668083190918, + "rewards/margins": 0.7357724905014038, + "rewards/rejected": -2.4004406929016113, + "sft_loss": 1.6978073120117188, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 9.221992342972545, + "learning_rate": 1.702619746934369e-06, + "logits/chosen": -0.14590224623680115, + "logits/rejected": 0.09175385534763336, + "logps/chosen": -1.7224514484405518, + "logps/rejected": -2.482299327850342, + "loss": 0.7081, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7224514484405518, + "rewards/margins": 0.7598481178283691, + "rewards/rejected": -2.482299327850342, + "sft_loss": 1.7601163387298584, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 11.372601623636282, + "learning_rate": 1.6979893856215547e-06, + "logits/chosen": -0.02332393266260624, + "logits/rejected": 0.1746397167444229, + "logps/chosen": -1.6565109491348267, + "logps/rejected": -2.316208839416504, + "loss": 0.6962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6565109491348267, + "rewards/margins": 0.6596980094909668, + "rewards/rejected": -2.316208839416504, + "sft_loss": 1.6300147771835327, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 10.141313724386336, + "learning_rate": 1.6933571034148531e-06, + "logits/chosen": -0.003986936993896961, + "logits/rejected": 0.17912557721138, + "logps/chosen": -1.6668100357055664, + "logps/rejected": -2.422069787979126, + "loss": 0.652, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6668100357055664, + "rewards/margins": 0.75525963306427, + "rewards/rejected": -2.422069787979126, + "sft_loss": 1.6519018411636353, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 11.36954785563402, + "learning_rate": 1.6887229452566859e-06, + "logits/chosen": 0.14806845784187317, + "logits/rejected": 0.31873518228530884, + "logps/chosen": -1.5642637014389038, + "logps/rejected": -2.630958080291748, + "loss": 0.6148, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5642637014389038, + "rewards/margins": 1.0666944980621338, + "rewards/rejected": -2.630958080291748, + "sft_loss": 1.6302076578140259, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 16.28888372734567, + "learning_rate": 1.6840869561076761e-06, + "logits/chosen": -0.03490322828292847, + "logits/rejected": 0.1576215922832489, + "logps/chosen": -1.6812372207641602, + "logps/rejected": -2.6249706745147705, + "loss": 0.6642, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6812372207641602, + "rewards/margins": 0.9437335133552551, + "rewards/rejected": -2.6249706745147705, + "sft_loss": 1.7672827243804932, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 9.777297758031644, + "learning_rate": 1.6794491809462108e-06, + "logits/chosen": -0.05005021020770073, + "logits/rejected": 0.26095980405807495, + "logps/chosen": -1.705004096031189, + "logps/rejected": -2.7471156120300293, + "loss": 0.6291, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.705004096031189, + "rewards/margins": 1.0421111583709717, + "rewards/rejected": -2.7471156120300293, + "sft_loss": 1.715197205543518, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 8.10937142440469, + "learning_rate": 1.674809664768005e-06, + "logits/chosen": -0.06505431979894638, + "logits/rejected": 0.19787968695163727, + "logps/chosen": -1.6374775171279907, + "logps/rejected": -2.6413464546203613, + "loss": 0.6291, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6374775171279907, + "rewards/margins": 1.0038686990737915, + "rewards/rejected": -2.6413464546203613, + "sft_loss": 1.6606299877166748, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 11.073727655098075, + "learning_rate": 1.6701684525856647e-06, + "logits/chosen": 0.05843139812350273, + "logits/rejected": 0.21685531735420227, + "logps/chosen": -1.689962387084961, + "logps/rejected": -2.689544439315796, + "loss": 0.6346, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.689962387084961, + "rewards/margins": 0.9995821118354797, + "rewards/rejected": -2.689544439315796, + "sft_loss": 1.7143598794937134, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 21.911674709660502, + "learning_rate": 1.6655255894282515e-06, + "logits/chosen": 0.13442903757095337, + "logits/rejected": 0.1398596167564392, + "logps/chosen": -1.7459805011749268, + "logps/rejected": -2.8175759315490723, + "loss": 0.6528, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7459805011749268, + "rewards/margins": 1.0715951919555664, + "rewards/rejected": -2.8175759315490723, + "sft_loss": 1.777316689491272, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 16.44153112785128, + "learning_rate": 1.6608811203408437e-06, + "logits/chosen": -0.0002602711319923401, + "logits/rejected": 0.17666465044021606, + "logps/chosen": -1.6980969905853271, + "logps/rejected": -2.5463755130767822, + "loss": 0.6795, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6980969905853271, + "rewards/margins": 0.8482787013053894, + "rewards/rejected": -2.5463755130767822, + "sft_loss": 1.7703033685684204, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 17.147443377069976, + "learning_rate": 1.6562350903841002e-06, + "logits/chosen": 0.09378467500209808, + "logits/rejected": 0.40250760316848755, + "logps/chosen": -1.775538682937622, + "logps/rejected": -2.9476616382598877, + "loss": 0.6534, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.775538682937622, + "rewards/margins": 1.172122836112976, + "rewards/rejected": -2.9476616382598877, + "sft_loss": 1.8604596853256226, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 20.611801792196648, + "learning_rate": 1.651587544633825e-06, + "logits/chosen": 0.048998866230249405, + "logits/rejected": 0.2327841818332672, + "logps/chosen": -1.7560018301010132, + "logps/rejected": -2.8011298179626465, + "loss": 0.6707, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7560018301010132, + "rewards/margins": 1.0451281070709229, + "rewards/rejected": -2.8011298179626465, + "sft_loss": 1.8246772289276123, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 12.615014844831247, + "learning_rate": 1.6469385281805267e-06, + "logits/chosen": 0.06845826655626297, + "logits/rejected": 0.2191547155380249, + "logps/chosen": -1.6676260232925415, + "logps/rejected": -2.61234188079834, + "loss": 0.6907, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6676260232925415, + "rewards/margins": 0.9447160959243774, + "rewards/rejected": -2.61234188079834, + "sft_loss": 1.7090753316879272, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 13.437188454899843, + "learning_rate": 1.642288086128984e-06, + "logits/chosen": -0.11175362765789032, + "logits/rejected": 0.15200158953666687, + "logps/chosen": -1.580471396446228, + "logps/rejected": -2.7887420654296875, + "loss": 0.64, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.580471396446228, + "rewards/margins": 1.2082706689834595, + "rewards/rejected": -2.7887420654296875, + "sft_loss": 1.6918761730194092, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 13.49614515718196, + "learning_rate": 1.6376362635978055e-06, + "logits/chosen": -0.06129909306764603, + "logits/rejected": 0.14012238383293152, + "logps/chosen": -1.6711757183074951, + "logps/rejected": -2.4994843006134033, + "loss": 0.6692, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6711757183074951, + "rewards/margins": 0.8283087015151978, + "rewards/rejected": -2.4994843006134033, + "sft_loss": 1.7221336364746094, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 9.791108225155257, + "learning_rate": 1.6329831057189936e-06, + "logits/chosen": -0.08597782999277115, + "logits/rejected": 0.14627881348133087, + "logps/chosen": -1.6221815347671509, + "logps/rejected": -2.6455671787261963, + "loss": 0.6517, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6221815347671509, + "rewards/margins": 1.023385763168335, + "rewards/rejected": -2.6455671787261963, + "sft_loss": 1.6977074146270752, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 14.082836241989783, + "learning_rate": 1.6283286576375069e-06, + "logits/chosen": -0.06954207271337509, + "logits/rejected": 0.11793907731771469, + "logps/chosen": -1.6429466009140015, + "logps/rejected": -2.4059674739837646, + "loss": 0.678, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6429466009140015, + "rewards/margins": 0.7630206942558289, + "rewards/rejected": -2.4059674739837646, + "sft_loss": 1.658942461013794, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 16.784536910984507, + "learning_rate": 1.623672964510821e-06, + "logits/chosen": -0.05539491027593613, + "logits/rejected": 0.33161354064941406, + "logps/chosen": -1.6232349872589111, + "logps/rejected": -2.6007208824157715, + "loss": 0.6308, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6232349872589111, + "rewards/margins": 0.9774861335754395, + "rewards/rejected": -2.6007208824157715, + "sft_loss": 1.6661285161972046, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 13.008739717837097, + "learning_rate": 1.6190160715084909e-06, + "logits/chosen": -0.0022641464602202177, + "logits/rejected": 0.18418416380882263, + "logps/chosen": -1.7134850025177002, + "logps/rejected": -2.587226390838623, + "loss": 0.6859, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7134850025177002, + "rewards/margins": 0.873741626739502, + "rewards/rejected": -2.587226390838623, + "sft_loss": 1.8100296258926392, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 6.973299002605783, + "learning_rate": 1.6143580238117132e-06, + "logits/chosen": -0.13283856213092804, + "logits/rejected": 0.06323808431625366, + "logps/chosen": -1.6416757106781006, + "logps/rejected": -2.5051159858703613, + "loss": 0.6403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6416757106781006, + "rewards/margins": 0.8634401559829712, + "rewards/rejected": -2.5051159858703613, + "sft_loss": 1.6961336135864258, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 9.975646896565582, + "learning_rate": 1.6096988666128867e-06, + "logits/chosen": -0.08490502089262009, + "logits/rejected": 0.07857394218444824, + "logps/chosen": -1.6207164525985718, + "logps/rejected": -2.5260729789733887, + "loss": 0.6522, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6207164525985718, + "rewards/margins": 0.9053562879562378, + "rewards/rejected": -2.5260729789733887, + "sft_loss": 1.631158471107483, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 11.374496403588678, + "learning_rate": 1.6050386451151753e-06, + "logits/chosen": -0.11497664451599121, + "logits/rejected": 0.17683936655521393, + "logps/chosen": -1.729366660118103, + "logps/rejected": -2.531245708465576, + "loss": 0.6987, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.729366660118103, + "rewards/margins": 0.8018789291381836, + "rewards/rejected": -2.531245708465576, + "sft_loss": 1.7939300537109375, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 9.7540309383818, + "learning_rate": 1.6003774045320686e-06, + "logits/chosen": -0.052069902420043945, + "logits/rejected": 0.1832505762577057, + "logps/chosen": -1.6778125762939453, + "logps/rejected": -2.5040578842163086, + "loss": 0.6592, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6778125762939453, + "rewards/margins": 0.8262453079223633, + "rewards/rejected": -2.5040578842163086, + "sft_loss": 1.7541424036026, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 9.26386326156349, + "learning_rate": 1.5957151900869425e-06, + "logits/chosen": -0.1480114758014679, + "logits/rejected": 0.12639665603637695, + "logps/chosen": -1.7525148391723633, + "logps/rejected": -2.531358242034912, + "loss": 0.6629, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7525148391723633, + "rewards/margins": 0.7788435816764832, + "rewards/rejected": -2.531358242034912, + "sft_loss": 1.7702747583389282, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 11.504689539648137, + "learning_rate": 1.5910520470126228e-06, + "logits/chosen": -0.07063053548336029, + "logits/rejected": 0.24231629073619843, + "logps/chosen": -1.7522459030151367, + "logps/rejected": -2.680307626724243, + "loss": 0.6822, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7522459030151367, + "rewards/margins": 0.9280616641044617, + "rewards/rejected": -2.680307626724243, + "sft_loss": 1.7290763854980469, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 13.933657904347603, + "learning_rate": 1.5863880205509432e-06, + "logits/chosen": -0.162372887134552, + "logits/rejected": 0.15010753273963928, + "logps/chosen": -1.6161205768585205, + "logps/rejected": -2.57654070854187, + "loss": 0.6338, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6161205768585205, + "rewards/margins": 0.9604201316833496, + "rewards/rejected": -2.57654070854187, + "sft_loss": 1.6565601825714111, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 12.510482806484635, + "learning_rate": 1.5817231559523097e-06, + "logits/chosen": -0.03692112863063812, + "logits/rejected": 0.1306377351284027, + "logps/chosen": -1.7245012521743774, + "logps/rejected": -2.869750499725342, + "loss": 0.6361, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7245012521743774, + "rewards/margins": 1.145249605178833, + "rewards/rejected": -2.869750499725342, + "sft_loss": 1.8045265674591064, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 6.8442139916725955, + "learning_rate": 1.5770574984752582e-06, + "logits/chosen": -0.10571374744176865, + "logits/rejected": 0.183961883187294, + "logps/chosen": -1.7865946292877197, + "logps/rejected": -2.5776772499084473, + "loss": 0.7157, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7865946292877197, + "rewards/margins": 0.7910826802253723, + "rewards/rejected": -2.5776772499084473, + "sft_loss": 1.7589298486709595, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 20.445526107029675, + "learning_rate": 1.5723910933860191e-06, + "logits/chosen": -0.24091443419456482, + "logits/rejected": 0.02261008694767952, + "logps/chosen": -1.7237069606781006, + "logps/rejected": -2.4907679557800293, + "loss": 0.6989, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7237069606781006, + "rewards/margins": 0.7670608758926392, + "rewards/rejected": -2.4907679557800293, + "sft_loss": 1.7233402729034424, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 12.619053572340551, + "learning_rate": 1.5677239859580742e-06, + "logits/chosen": -0.21961793303489685, + "logits/rejected": -0.012505131773650646, + "logps/chosen": -1.6292585134506226, + "logps/rejected": -2.373121500015259, + "loss": 0.7005, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6292585134506226, + "rewards/margins": 0.7438629865646362, + "rewards/rejected": -2.373121500015259, + "sft_loss": 1.6691501140594482, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 11.904233827176936, + "learning_rate": 1.5630562214717205e-06, + "logits/chosen": 0.12450895458459854, + "logits/rejected": 0.23923444747924805, + "logps/chosen": -1.7282705307006836, + "logps/rejected": -2.4345037937164307, + "loss": 0.6853, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7282705307006836, + "rewards/margins": 0.7062332630157471, + "rewards/rejected": -2.4345037937164307, + "sft_loss": 1.7057040929794312, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 9.554738287588028, + "learning_rate": 1.5583878452136296e-06, + "logits/chosen": -0.14793828129768372, + "logits/rejected": 0.06307663023471832, + "logps/chosen": -1.6740642786026, + "logps/rejected": -2.3620121479034424, + "loss": 0.6848, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6740642786026, + "rewards/margins": 0.6879477500915527, + "rewards/rejected": -2.3620121479034424, + "sft_loss": 1.7369064092636108, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 9.06908807591873, + "learning_rate": 1.5537189024764086e-06, + "logits/chosen": -0.09235244989395142, + "logits/rejected": 0.16227707266807556, + "logps/chosen": -1.6051900386810303, + "logps/rejected": -2.3181235790252686, + "loss": 0.6849, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6051900386810303, + "rewards/margins": 0.712933361530304, + "rewards/rejected": -2.3181235790252686, + "sft_loss": 1.6950048208236694, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 8.173684180423933, + "learning_rate": 1.5490494385581599e-06, + "logits/chosen": -0.041692376136779785, + "logits/rejected": 0.19014063477516174, + "logps/chosen": -1.721787452697754, + "logps/rejected": -2.479731559753418, + "loss": 0.69, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.721787452697754, + "rewards/margins": 0.7579439878463745, + "rewards/rejected": -2.479731559753418, + "sft_loss": 1.7510948181152344, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 11.774384400783354, + "learning_rate": 1.5443794987620433e-06, + "logits/chosen": 0.039117418229579926, + "logits/rejected": 0.2529299855232239, + "logps/chosen": -1.6293704509735107, + "logps/rejected": -2.246772289276123, + "loss": 0.6823, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6293704509735107, + "rewards/margins": 0.6174014806747437, + "rewards/rejected": -2.246772289276123, + "sft_loss": 1.6525089740753174, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 8.21281256966865, + "learning_rate": 1.539709128395835e-06, + "logits/chosen": -0.022275418043136597, + "logits/rejected": 0.078637033700943, + "logps/chosen": -1.5396145582199097, + "logps/rejected": -2.624133586883545, + "loss": 0.6204, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5396145582199097, + "rewards/margins": 1.0845190286636353, + "rewards/rejected": -2.624133586883545, + "sft_loss": 1.617453932762146, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 19.586850922708145, + "learning_rate": 1.5350383727714888e-06, + "logits/chosen": -0.01572037860751152, + "logits/rejected": 0.11370836198329926, + "logps/chosen": -1.6482702493667603, + "logps/rejected": -2.335761547088623, + "loss": 0.7206, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6482702493667603, + "rewards/margins": 0.6874914169311523, + "rewards/rejected": -2.335761547088623, + "sft_loss": 1.6958271265029907, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 10.801187419105617, + "learning_rate": 1.5303672772046963e-06, + "logits/chosen": -0.11493013054132462, + "logits/rejected": 0.08882128447294235, + "logps/chosen": -1.7629035711288452, + "logps/rejected": -2.957852840423584, + "loss": 0.5951, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7629035711288452, + "rewards/margins": 1.1949495077133179, + "rewards/rejected": -2.957852840423584, + "sft_loss": 1.8507661819458008, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 10.842179095103713, + "learning_rate": 1.525695887014447e-06, + "logits/chosen": -0.14992229640483856, + "logits/rejected": 0.1002429947257042, + "logps/chosen": -1.8385114669799805, + "logps/rejected": -2.759986639022827, + "loss": 0.6714, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8385114669799805, + "rewards/margins": 0.9214752912521362, + "rewards/rejected": -2.759986639022827, + "sft_loss": 1.8441314697265625, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 8.295916061287276, + "learning_rate": 1.5210242475225896e-06, + "logits/chosen": -0.07416633516550064, + "logits/rejected": 0.20603366196155548, + "logps/chosen": -1.7949146032333374, + "logps/rejected": -2.755343198776245, + "loss": 0.6977, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7949146032333374, + "rewards/margins": 0.9604288935661316, + "rewards/rejected": -2.755343198776245, + "sft_loss": 1.8887898921966553, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 16.52306270227895, + "learning_rate": 1.5163524040533903e-06, + "logits/chosen": 0.057475216686725616, + "logits/rejected": 0.17760422825813293, + "logps/chosen": -1.803520917892456, + "logps/rejected": -2.734199285507202, + "loss": 0.6897, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.803520917892456, + "rewards/margins": 0.9306782484054565, + "rewards/rejected": -2.734199285507202, + "sft_loss": 1.8802194595336914, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 10.33295629864258, + "learning_rate": 1.5116804019330951e-06, + "logits/chosen": -0.07347237318754196, + "logits/rejected": 0.10067732632160187, + "logps/chosen": -1.736215353012085, + "logps/rejected": -2.6383612155914307, + "loss": 0.6816, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.736215353012085, + "rewards/margins": 0.9021456837654114, + "rewards/rejected": -2.6383612155914307, + "sft_loss": 1.8136123418807983, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 8.405140276522918, + "learning_rate": 1.5070082864894892e-06, + "logits/chosen": -0.12049313634634018, + "logits/rejected": -0.0011329979170113802, + "logps/chosen": -1.590718150138855, + "logps/rejected": -2.4013075828552246, + "loss": 0.637, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.590718150138855, + "rewards/margins": 0.8105891942977905, + "rewards/rejected": -2.4013075828552246, + "sft_loss": 1.5965020656585693, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 8.034046778505555, + "learning_rate": 1.5023361030514572e-06, + "logits/chosen": -0.15744206309318542, + "logits/rejected": 0.1334286630153656, + "logps/chosen": -1.4672434329986572, + "logps/rejected": -2.358227491378784, + "loss": 0.6205, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4672434329986572, + "rewards/margins": 0.890984058380127, + "rewards/rejected": -2.358227491378784, + "sft_loss": 1.5532448291778564, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 9.347068597730125, + "learning_rate": 1.4976638969485433e-06, + "logits/chosen": 0.07527122646570206, + "logits/rejected": 0.10903845727443695, + "logps/chosen": -1.618255615234375, + "logps/rejected": -2.4998650550842285, + "loss": 0.6589, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.618255615234375, + "rewards/margins": 0.8816096186637878, + "rewards/rejected": -2.4998650550842285, + "sft_loss": 1.666682243347168, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 13.857367394301217, + "learning_rate": 1.492991713510511e-06, + "logits/chosen": 0.0880415141582489, + "logits/rejected": 0.20547275245189667, + "logps/chosen": -1.6247081756591797, + "logps/rejected": -2.3860676288604736, + "loss": 0.7297, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6247081756591797, + "rewards/margins": 0.761359453201294, + "rewards/rejected": -2.3860676288604736, + "sft_loss": 1.6987422704696655, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 10.965157699832295, + "learning_rate": 1.4883195980669052e-06, + "logits/chosen": 0.04823315888643265, + "logits/rejected": 0.30748096108436584, + "logps/chosen": -1.6990811824798584, + "logps/rejected": -2.5591132640838623, + "loss": 0.6654, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6990811824798584, + "rewards/margins": 0.8600322008132935, + "rewards/rejected": -2.5591132640838623, + "sft_loss": 1.6902282238006592, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 11.246003054270304, + "learning_rate": 1.48364759594661e-06, + "logits/chosen": -0.09684957563877106, + "logits/rejected": 0.09521036595106125, + "logps/chosen": -1.6307718753814697, + "logps/rejected": -2.4601705074310303, + "loss": 0.6574, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6307718753814697, + "rewards/margins": 0.8293987512588501, + "rewards/rejected": -2.4601705074310303, + "sft_loss": 1.7225332260131836, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 16.764918687679643, + "learning_rate": 1.4789757524774105e-06, + "logits/chosen": -0.12152546644210815, + "logits/rejected": 0.18736448884010315, + "logps/chosen": -1.7070300579071045, + "logps/rejected": -2.435864210128784, + "loss": 0.7022, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7070300579071045, + "rewards/margins": 0.7288340330123901, + "rewards/rejected": -2.435864210128784, + "sft_loss": 1.7718076705932617, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 9.033133052675327, + "learning_rate": 1.474304112985553e-06, + "logits/chosen": -0.023256715387105942, + "logits/rejected": 0.186649352312088, + "logps/chosen": -1.6628717184066772, + "logps/rejected": -2.6158218383789062, + "loss": 0.6279, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.6628717184066772, + "rewards/margins": 0.9529500007629395, + "rewards/rejected": -2.6158218383789062, + "sft_loss": 1.6220327615737915, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 12.038268695656548, + "learning_rate": 1.469632722795304e-06, + "logits/chosen": 0.042193703353405, + "logits/rejected": 0.22168488800525665, + "logps/chosen": -1.6942789554595947, + "logps/rejected": -2.6612579822540283, + "loss": 0.6318, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6942789554595947, + "rewards/margins": 0.9669791460037231, + "rewards/rejected": -2.6612579822540283, + "sft_loss": 1.7724997997283936, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 8.128194965079711, + "learning_rate": 1.4649616272285115e-06, + "logits/chosen": -0.13571251928806305, + "logits/rejected": 0.12857648730278015, + "logps/chosen": -1.7519333362579346, + "logps/rejected": -2.70003080368042, + "loss": 0.6764, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7519333362579346, + "rewards/margins": 0.9480972290039062, + "rewards/rejected": -2.70003080368042, + "sft_loss": 1.7731482982635498, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 9.886863369863011, + "learning_rate": 1.4602908716041651e-06, + "logits/chosen": -0.0866248831152916, + "logits/rejected": 0.12165629863739014, + "logps/chosen": -1.9829012155532837, + "logps/rejected": -2.9655168056488037, + "loss": 0.6768, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9829012155532837, + "rewards/margins": 0.9826154708862305, + "rewards/rejected": -2.9655168056488037, + "sft_loss": 1.8548479080200195, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 10.09112548492932, + "learning_rate": 1.4556205012379568e-06, + "logits/chosen": -0.02071903459727764, + "logits/rejected": 0.2770634889602661, + "logps/chosen": -1.8720626831054688, + "logps/rejected": -2.761143922805786, + "loss": 0.6717, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8720626831054688, + "rewards/margins": 0.8890812993049622, + "rewards/rejected": -2.761143922805786, + "sft_loss": 1.9153035879135132, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 13.63668991969161, + "learning_rate": 1.4509505614418402e-06, + "logits/chosen": 0.001990280346944928, + "logits/rejected": 0.155452698469162, + "logps/chosen": -1.950551986694336, + "logps/rejected": -2.8099985122680664, + "loss": 0.6996, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.950551986694336, + "rewards/margins": 0.85944664478302, + "rewards/rejected": -2.8099985122680664, + "sft_loss": 1.9364144802093506, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 9.391206321349692, + "learning_rate": 1.4462810975235915e-06, + "logits/chosen": -0.29772359132766724, + "logits/rejected": -0.08676508814096451, + "logps/chosen": -1.6872972249984741, + "logps/rejected": -2.605922222137451, + "loss": 0.6559, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6872972249984741, + "rewards/margins": 0.9186250567436218, + "rewards/rejected": -2.605922222137451, + "sft_loss": 1.749237060546875, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 8.542656017395316, + "learning_rate": 1.4416121547863703e-06, + "logits/chosen": 0.013849747367203236, + "logits/rejected": 0.23345601558685303, + "logps/chosen": -1.7267534732818604, + "logps/rejected": -2.6541545391082764, + "loss": 0.6792, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.7267534732818604, + "rewards/margins": 0.927401065826416, + "rewards/rejected": -2.6541545391082764, + "sft_loss": 1.7446855306625366, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 10.312933253800834, + "learning_rate": 1.4369437785282794e-06, + "logits/chosen": -0.17418628931045532, + "logits/rejected": 0.011349151842296124, + "logps/chosen": -1.736092209815979, + "logps/rejected": -2.6571297645568848, + "loss": 0.6453, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.736092209815979, + "rewards/margins": 0.9210373163223267, + "rewards/rejected": -2.6571297645568848, + "sft_loss": 1.7266775369644165, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 9.714136663542105, + "learning_rate": 1.4322760140419259e-06, + "logits/chosen": -0.17709819972515106, + "logits/rejected": -0.012431099079549313, + "logps/chosen": -1.5618194341659546, + "logps/rejected": -2.543933153152466, + "loss": 0.6422, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5618194341659546, + "rewards/margins": 0.9821133613586426, + "rewards/rejected": -2.543933153152466, + "sft_loss": 1.5918623208999634, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 15.467849504617234, + "learning_rate": 1.427608906613981e-06, + "logits/chosen": -0.010245876386761665, + "logits/rejected": -0.06519723683595657, + "logps/chosen": -1.7043612003326416, + "logps/rejected": -2.712526559829712, + "loss": 0.6457, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7043612003326416, + "rewards/margins": 1.0081654787063599, + "rewards/rejected": -2.712526559829712, + "sft_loss": 1.843871831893921, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 11.24888997921364, + "learning_rate": 1.4229425015247414e-06, + "logits/chosen": -0.26887112855911255, + "logits/rejected": -0.08024895191192627, + "logps/chosen": -1.7478374242782593, + "logps/rejected": -2.45278263092041, + "loss": 0.7308, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.7478374242782593, + "rewards/margins": 0.7049452662467957, + "rewards/rejected": -2.45278263092041, + "sft_loss": 1.819427490234375, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 10.239901554975248, + "learning_rate": 1.4182768440476904e-06, + "logits/chosen": -0.13010664284229279, + "logits/rejected": 0.025120923295617104, + "logps/chosen": -1.6842975616455078, + "logps/rejected": -2.596648693084717, + "loss": 0.6478, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6842975616455078, + "rewards/margins": 0.912351131439209, + "rewards/rejected": -2.596648693084717, + "sft_loss": 1.7377408742904663, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 12.921080947407273, + "learning_rate": 1.4136119794490567e-06, + "logits/chosen": -0.15024778246879578, + "logits/rejected": 0.0847291350364685, + "logps/chosen": -1.7774083614349365, + "logps/rejected": -2.4142112731933594, + "loss": 0.7723, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7774083614349365, + "rewards/margins": 0.6368028521537781, + "rewards/rejected": -2.4142112731933594, + "sft_loss": 1.8198562860488892, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 10.081702354572228, + "learning_rate": 1.4089479529873773e-06, + "logits/chosen": 0.01988459937274456, + "logits/rejected": 0.10248573869466782, + "logps/chosen": -1.7284653186798096, + "logps/rejected": -2.532827615737915, + "loss": 0.6867, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7284653186798096, + "rewards/margins": 0.804362416267395, + "rewards/rejected": -2.532827615737915, + "sft_loss": 1.7420072555541992, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 12.41462553686139, + "learning_rate": 1.4042848099130574e-06, + "logits/chosen": 0.02473408542573452, + "logits/rejected": 0.09276667982339859, + "logps/chosen": -1.6429874897003174, + "logps/rejected": -2.2225372791290283, + "loss": 0.7315, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6429874897003174, + "rewards/margins": 0.5795496702194214, + "rewards/rejected": -2.2225372791290283, + "sft_loss": 1.7064769268035889, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 7.7584792999715795, + "learning_rate": 1.3996225954679317e-06, + "logits/chosen": -0.1788342446088791, + "logits/rejected": 0.09930647909641266, + "logps/chosen": -1.5633602142333984, + "logps/rejected": -2.3918919563293457, + "loss": 0.6304, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5633602142333984, + "rewards/margins": 0.8285319209098816, + "rewards/rejected": -2.3918919563293457, + "sft_loss": 1.564711332321167, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 11.94172789929859, + "learning_rate": 1.3949613548848248e-06, + "logits/chosen": -0.1356816589832306, + "logits/rejected": 0.039621032774448395, + "logps/chosen": -1.5667574405670166, + "logps/rejected": -2.434279441833496, + "loss": 0.6524, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5667574405670166, + "rewards/margins": 0.8675218820571899, + "rewards/rejected": -2.434279441833496, + "sft_loss": 1.5231401920318604, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 11.96012738340361, + "learning_rate": 1.3903011333871134e-06, + "logits/chosen": -0.0068131862208247185, + "logits/rejected": 0.2507849633693695, + "logps/chosen": -1.7056745290756226, + "logps/rejected": -2.4680447578430176, + "loss": 0.7081, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7056745290756226, + "rewards/margins": 0.7623699307441711, + "rewards/rejected": -2.4680447578430176, + "sft_loss": 1.7112070322036743, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.44144490361213684, + "eval_logits/rejected": 0.573809802532196, + "eval_logps/chosen": -1.691447138786316, + "eval_logps/rejected": -2.342031240463257, + "eval_loss": 0.727591872215271, + "eval_rewards/accuracies": 0.6772996783256531, + "eval_rewards/chosen": -1.691447138786316, + "eval_rewards/margins": 0.6505837440490723, + "eval_rewards/rejected": -2.342031240463257, + "eval_runtime": 44.3817, + "eval_samples_per_second": 30.305, + "eval_sft_loss": 1.6777325868606567, + "eval_steps_per_second": 7.593, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 8.07626648371608, + "learning_rate": 1.3856419761882875e-06, + "logits/chosen": -0.14924605190753937, + "logits/rejected": 0.030980080366134644, + "logps/chosen": -1.6322886943817139, + "logps/rejected": -2.489731550216675, + "loss": 0.6552, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6322886943817139, + "rewards/margins": 0.8574431538581848, + "rewards/rejected": -2.489731550216675, + "sft_loss": 1.647454023361206, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 10.278073443422485, + "learning_rate": 1.3809839284915096e-06, + "logits/chosen": -0.12130733579397202, + "logits/rejected": 0.05340119078755379, + "logps/chosen": -1.6221942901611328, + "logps/rejected": -2.3026764392852783, + "loss": 0.7143, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.6221942901611328, + "rewards/margins": 0.680482029914856, + "rewards/rejected": -2.3026764392852783, + "sft_loss": 1.6290735006332397, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 10.784190354520463, + "learning_rate": 1.3763270354891795e-06, + "logits/chosen": -0.05685758590698242, + "logits/rejected": 0.1100848913192749, + "logps/chosen": -1.6883134841918945, + "logps/rejected": -2.5045347213745117, + "loss": 0.6823, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6883134841918945, + "rewards/margins": 0.8162212371826172, + "rewards/rejected": -2.5045347213745117, + "sft_loss": 1.6928250789642334, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 10.191700276852643, + "learning_rate": 1.3716713423624936e-06, + "logits/chosen": -0.14868003129959106, + "logits/rejected": 0.2683381140232086, + "logps/chosen": -1.8373911380767822, + "logps/rejected": -2.7939047813415527, + "loss": 0.7222, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8373911380767822, + "rewards/margins": 0.9565135836601257, + "rewards/rejected": -2.7939047813415527, + "sft_loss": 1.7662341594696045, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 9.249713190171791, + "learning_rate": 1.367016894281007e-06, + "logits/chosen": -0.09739672392606735, + "logits/rejected": 0.10374144464731216, + "logps/chosen": -1.5907642841339111, + "logps/rejected": -2.488351345062256, + "loss": 0.639, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5907642841339111, + "rewards/margins": 0.8975872993469238, + "rewards/rejected": -2.488351345062256, + "sft_loss": 1.6401208639144897, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 19.03202453774214, + "learning_rate": 1.3623637364021952e-06, + "logits/chosen": -0.17357507348060608, + "logits/rejected": 0.0912250429391861, + "logps/chosen": -1.7484121322631836, + "logps/rejected": -2.9147703647613525, + "loss": 0.6135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7484121322631836, + "rewards/margins": 1.1663583517074585, + "rewards/rejected": -2.9147703647613525, + "sft_loss": 1.764880895614624, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 10.605975138159263, + "learning_rate": 1.3577119138710165e-06, + "logits/chosen": -0.20004332065582275, + "logits/rejected": -0.06560181826353073, + "logps/chosen": -1.7282603979110718, + "logps/rejected": -2.6194205284118652, + "loss": 0.6598, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7282603979110718, + "rewards/margins": 0.8911601305007935, + "rewards/rejected": -2.6194205284118652, + "sft_loss": 1.7524601221084595, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 9.412493447124648, + "learning_rate": 1.3530614718194734e-06, + "logits/chosen": -0.08897742629051208, + "logits/rejected": 0.1184593215584755, + "logps/chosen": -1.6912317276000977, + "logps/rejected": -2.774104595184326, + "loss": 0.6417, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6912317276000977, + "rewards/margins": 1.082872748374939, + "rewards/rejected": -2.774104595184326, + "sft_loss": 1.6165220737457275, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 12.166200755671625, + "learning_rate": 1.3484124553661754e-06, + "logits/chosen": -0.2774050831794739, + "logits/rejected": -0.04774611443281174, + "logps/chosen": -1.6932928562164307, + "logps/rejected": -2.6121184825897217, + "loss": 0.6716, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6932928562164307, + "rewards/margins": 0.9188259840011597, + "rewards/rejected": -2.6121184825897217, + "sft_loss": 1.652806282043457, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 8.449276004917175, + "learning_rate": 1.3437649096159e-06, + "logits/chosen": -0.057559557259082794, + "logits/rejected": 0.21200330555438995, + "logps/chosen": -1.6470773220062256, + "logps/rejected": -2.6044399738311768, + "loss": 0.6359, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6470773220062256, + "rewards/margins": 0.9573624730110168, + "rewards/rejected": -2.6044399738311768, + "sft_loss": 1.6401879787445068, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 7.952629055673774, + "learning_rate": 1.3391188796591568e-06, + "logits/chosen": -0.09195758402347565, + "logits/rejected": 0.045628756284713745, + "logps/chosen": -1.7653684616088867, + "logps/rejected": -2.609598159790039, + "loss": 0.6907, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7653684616088867, + "rewards/margins": 0.8442295789718628, + "rewards/rejected": -2.609598159790039, + "sft_loss": 1.7955518960952759, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 10.776842648366552, + "learning_rate": 1.3344744105717487e-06, + "logits/chosen": -0.1589999496936798, + "logits/rejected": 0.024349741637706757, + "logps/chosen": -1.701062560081482, + "logps/rejected": -2.576101779937744, + "loss": 0.6631, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.701062560081482, + "rewards/margins": 0.8750389218330383, + "rewards/rejected": -2.576101779937744, + "sft_loss": 1.7430168390274048, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 12.426345973657531, + "learning_rate": 1.3298315474143354e-06, + "logits/chosen": -0.018960092216730118, + "logits/rejected": 0.15279479324817657, + "logps/chosen": -1.675971269607544, + "logps/rejected": -2.6613776683807373, + "loss": 0.6498, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.675971269607544, + "rewards/margins": 0.985406756401062, + "rewards/rejected": -2.6613776683807373, + "sft_loss": 1.7282222509384155, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 9.337865950173851, + "learning_rate": 1.3251903352319951e-06, + "logits/chosen": -0.14004510641098022, + "logits/rejected": 0.0643129050731659, + "logps/chosen": -1.6384871006011963, + "logps/rejected": -2.6979575157165527, + "loss": 0.6506, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6384871006011963, + "rewards/margins": 1.059470534324646, + "rewards/rejected": -2.6979575157165527, + "sft_loss": 1.6707124710083008, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 11.94103230577254, + "learning_rate": 1.3205508190537895e-06, + "logits/chosen": -0.208818718791008, + "logits/rejected": 0.2646576762199402, + "logps/chosen": -1.664166808128357, + "logps/rejected": -2.5710747241973877, + "loss": 0.633, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.664166808128357, + "rewards/margins": 0.9069075584411621, + "rewards/rejected": -2.5710747241973877, + "sft_loss": 1.705847978591919, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 10.251759506687932, + "learning_rate": 1.3159130438923242e-06, + "logits/chosen": -0.11879494041204453, + "logits/rejected": -0.02837967872619629, + "logps/chosen": -1.5379129648208618, + "logps/rejected": -2.41559100151062, + "loss": 0.6193, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.5379129648208618, + "rewards/margins": 0.8776780962944031, + "rewards/rejected": -2.41559100151062, + "sft_loss": 1.629476547241211, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 9.306495320308388, + "learning_rate": 1.3112770547433144e-06, + "logits/chosen": -0.20200033485889435, + "logits/rejected": 0.11474726349115372, + "logps/chosen": -1.641689658164978, + "logps/rejected": -2.5257320404052734, + "loss": 0.6589, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.641689658164978, + "rewards/margins": 0.8840423822402954, + "rewards/rejected": -2.5257320404052734, + "sft_loss": 1.6782573461532593, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 10.81878563335178, + "learning_rate": 1.3066428965851472e-06, + "logits/chosen": -0.09830178320407867, + "logits/rejected": 0.04885398969054222, + "logps/chosen": -1.6704227924346924, + "logps/rejected": -2.510758876800537, + "loss": 0.6876, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6704227924346924, + "rewards/margins": 0.8403360247612, + "rewards/rejected": -2.510758876800537, + "sft_loss": 1.7284488677978516, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 9.278282957835337, + "learning_rate": 1.3020106143784454e-06, + "logits/chosen": -0.15651771426200867, + "logits/rejected": -0.01825041137635708, + "logps/chosen": -1.8048747777938843, + "logps/rejected": -2.6454477310180664, + "loss": 0.7163, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.8048747777938843, + "rewards/margins": 0.8405729532241821, + "rewards/rejected": -2.6454477310180664, + "sft_loss": 1.8150914907455444, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 8.75353222908878, + "learning_rate": 1.2973802530656314e-06, + "logits/chosen": -0.3050258755683899, + "logits/rejected": -0.1058342456817627, + "logps/chosen": -1.7767432928085327, + "logps/rejected": -2.729957103729248, + "loss": 0.6791, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7767432928085327, + "rewards/margins": 0.9532135725021362, + "rewards/rejected": -2.729957103729248, + "sft_loss": 1.848341703414917, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 13.700892580022298, + "learning_rate": 1.2927518575704906e-06, + "logits/chosen": -0.28139573335647583, + "logits/rejected": -0.033496059477329254, + "logps/chosen": -1.7814384698867798, + "logps/rejected": -2.7443580627441406, + "loss": 0.6742, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7814384698867798, + "rewards/margins": 0.9629195928573608, + "rewards/rejected": -2.7443580627441406, + "sft_loss": 1.786010980606079, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 9.66418456858833, + "learning_rate": 1.2881254727977365e-06, + "logits/chosen": -0.05890367552638054, + "logits/rejected": -0.013821298256516457, + "logps/chosen": -1.7495791912078857, + "logps/rejected": -2.6106810569763184, + "loss": 0.6464, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7495791912078857, + "rewards/margins": 0.8611016273498535, + "rewards/rejected": -2.6106810569763184, + "sft_loss": 1.7882887125015259, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 15.941166826322611, + "learning_rate": 1.2835011436325749e-06, + "logits/chosen": -0.2913353741168976, + "logits/rejected": -0.02926046773791313, + "logps/chosen": -1.732381820678711, + "logps/rejected": -2.5884523391723633, + "loss": 0.6843, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.732381820678711, + "rewards/margins": 0.856070339679718, + "rewards/rejected": -2.5884523391723633, + "sft_loss": 1.7478708028793335, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 7.3216676831575525, + "learning_rate": 1.278878914940267e-06, + "logits/chosen": -0.22623327374458313, + "logits/rejected": 0.11279468238353729, + "logps/chosen": -1.703449010848999, + "logps/rejected": -2.82920503616333, + "loss": 0.6434, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.703449010848999, + "rewards/margins": 1.125756025314331, + "rewards/rejected": -2.82920503616333, + "sft_loss": 1.7550933361053467, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 9.222772282539346, + "learning_rate": 1.2742588315656963e-06, + "logits/chosen": -0.30736929178237915, + "logits/rejected": -0.03906359151005745, + "logps/chosen": -1.6967036724090576, + "logps/rejected": -2.6465067863464355, + "loss": 0.6487, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6967036724090576, + "rewards/margins": 0.9498027563095093, + "rewards/rejected": -2.6465067863464355, + "sft_loss": 1.807225227355957, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 9.902930947660389, + "learning_rate": 1.269640938332932e-06, + "logits/chosen": -0.18472820520401, + "logits/rejected": -0.04140182584524155, + "logps/chosen": -1.5697466135025024, + "logps/rejected": -2.574474811553955, + "loss": 0.6254, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5697466135025024, + "rewards/margins": 1.0047283172607422, + "rewards/rejected": -2.574474811553955, + "sft_loss": 1.6612666845321655, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 23.399323422517572, + "learning_rate": 1.265025280044794e-06, + "logits/chosen": -0.1952822506427765, + "logits/rejected": 0.030080635100603104, + "logps/chosen": -1.7290503978729248, + "logps/rejected": -2.5522682666778564, + "loss": 0.6611, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7290503978729248, + "rewards/margins": 0.8232178688049316, + "rewards/rejected": -2.5522682666778564, + "sft_loss": 1.7029396295547485, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 11.34202465880965, + "learning_rate": 1.2604119014824197e-06, + "logits/chosen": -0.14711865782737732, + "logits/rejected": 0.09781067818403244, + "logps/chosen": -1.6153545379638672, + "logps/rejected": -2.501518726348877, + "loss": 0.6616, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6153545379638672, + "rewards/margins": 0.8861643075942993, + "rewards/rejected": -2.501518726348877, + "sft_loss": 1.6534423828125, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 10.981100868803473, + "learning_rate": 1.2558008474048279e-06, + "logits/chosen": -0.2000230848789215, + "logits/rejected": 0.08384416997432709, + "logps/chosen": -1.5488859415054321, + "logps/rejected": -2.4278249740600586, + "loss": 0.6372, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.5488859415054321, + "rewards/margins": 0.8789387941360474, + "rewards/rejected": -2.4278249740600586, + "sft_loss": 1.6269474029541016, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 11.809243251425459, + "learning_rate": 1.2511921625484857e-06, + "logits/chosen": -0.3878183960914612, + "logits/rejected": -0.222340390086174, + "logps/chosen": -1.7283685207366943, + "logps/rejected": -2.52485990524292, + "loss": 0.6616, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7283685207366943, + "rewards/margins": 0.796491265296936, + "rewards/rejected": -2.52485990524292, + "sft_loss": 1.729326605796814, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 13.993998126297274, + "learning_rate": 1.2465858916268734e-06, + "logits/chosen": -0.10277875512838364, + "logits/rejected": -0.04876113682985306, + "logps/chosen": -1.8125221729278564, + "logps/rejected": -2.5582568645477295, + "loss": 0.7396, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.8125221729278564, + "rewards/margins": 0.7457348704338074, + "rewards/rejected": -2.5582568645477295, + "sft_loss": 1.7993252277374268, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 11.587421995678856, + "learning_rate": 1.2419820793300526e-06, + "logits/chosen": -0.30335888266563416, + "logits/rejected": -0.002621948719024658, + "logps/chosen": -1.6230109930038452, + "logps/rejected": -2.533942699432373, + "loss": 0.6614, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6230109930038452, + "rewards/margins": 0.9109317064285278, + "rewards/rejected": -2.533942699432373, + "sft_loss": 1.6495774984359741, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 11.811907418936265, + "learning_rate": 1.2373807703242293e-06, + "logits/chosen": -0.3585537374019623, + "logits/rejected": -0.08721666783094406, + "logps/chosen": -1.7728416919708252, + "logps/rejected": -2.6852431297302246, + "loss": 0.6668, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7728416919708252, + "rewards/margins": 0.9124017953872681, + "rewards/rejected": -2.6852431297302246, + "sft_loss": 1.833508849143982, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 11.735212113421865, + "learning_rate": 1.232782009251324e-06, + "logits/chosen": -0.3023667633533478, + "logits/rejected": -0.05005481094121933, + "logps/chosen": -1.763489007949829, + "logps/rejected": -2.484192371368408, + "loss": 0.7278, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.763489007949829, + "rewards/margins": 0.720703661441803, + "rewards/rejected": -2.484192371368408, + "sft_loss": 1.8116620779037476, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 13.400719562159768, + "learning_rate": 1.228185840728537e-06, + "logits/chosen": -0.07832861691713333, + "logits/rejected": -0.011995521374046803, + "logps/chosen": -1.8001636266708374, + "logps/rejected": -2.66902232170105, + "loss": 0.728, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8001636266708374, + "rewards/margins": 0.8688589334487915, + "rewards/rejected": -2.66902232170105, + "sft_loss": 1.8136160373687744, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 8.349033077110981, + "learning_rate": 1.2235923093479156e-06, + "logits/chosen": -0.3206074833869934, + "logits/rejected": -0.07655216008424759, + "logps/chosen": -1.666358232498169, + "logps/rejected": -2.565678358078003, + "loss": 0.6547, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.666358232498169, + "rewards/margins": 0.8993202447891235, + "rewards/rejected": -2.565678358078003, + "sft_loss": 1.6472370624542236, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 9.222918908639995, + "learning_rate": 1.219001459675921e-06, + "logits/chosen": -0.2530183792114258, + "logits/rejected": -0.23013608157634735, + "logps/chosen": -1.67684805393219, + "logps/rejected": -2.330507516860962, + "loss": 0.7125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.67684805393219, + "rewards/margins": 0.6536593437194824, + "rewards/rejected": -2.330507516860962, + "sft_loss": 1.6849253177642822, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 11.003158412672942, + "learning_rate": 1.2144133362529974e-06, + "logits/chosen": -0.2570267617702484, + "logits/rejected": -0.04421461373567581, + "logps/chosen": -1.7353019714355469, + "logps/rejected": -2.479492664337158, + "loss": 0.7085, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.7353019714355469, + "rewards/margins": 0.7441903948783875, + "rewards/rejected": -2.479492664337158, + "sft_loss": 1.7561286687850952, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 11.206430050702487, + "learning_rate": 1.2098279835931382e-06, + "logits/chosen": -0.2855226397514343, + "logits/rejected": -0.10674687474966049, + "logps/chosen": -1.5438029766082764, + "logps/rejected": -2.4865102767944336, + "loss": 0.6322, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.5438029766082764, + "rewards/margins": 0.9427075386047363, + "rewards/rejected": -2.4865102767944336, + "sft_loss": 1.5557372570037842, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 8.682823478026071, + "learning_rate": 1.2052454461834544e-06, + "logits/chosen": -0.17236191034317017, + "logits/rejected": 0.019247237592935562, + "logps/chosen": -1.6885963678359985, + "logps/rejected": -2.52380108833313, + "loss": 0.6775, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6885963678359985, + "rewards/margins": 0.8352048993110657, + "rewards/rejected": -2.52380108833313, + "sft_loss": 1.705102562904358, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 12.38921176122187, + "learning_rate": 1.2006657684837445e-06, + "logits/chosen": -0.25058725476264954, + "logits/rejected": -0.04623941332101822, + "logps/chosen": -1.6525062322616577, + "logps/rejected": -2.4225525856018066, + "loss": 0.672, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6525062322616577, + "rewards/margins": 0.7700467109680176, + "rewards/rejected": -2.4225525856018066, + "sft_loss": 1.7160422801971436, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 9.703765936226722, + "learning_rate": 1.1960889949260613e-06, + "logits/chosen": -0.2633149027824402, + "logits/rejected": 0.08391741663217545, + "logps/chosen": -1.7922632694244385, + "logps/rejected": -2.605243682861328, + "loss": 0.6699, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7922632694244385, + "rewards/margins": 0.8129804730415344, + "rewards/rejected": -2.605243682861328, + "sft_loss": 1.7857837677001953, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 17.54487018281387, + "learning_rate": 1.1915151699142825e-06, + "logits/chosen": -0.29941219091415405, + "logits/rejected": -0.12223385274410248, + "logps/chosen": -1.7729759216308594, + "logps/rejected": -2.7748351097106934, + "loss": 0.6732, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7729759216308594, + "rewards/margins": 1.0018593072891235, + "rewards/rejected": -2.7748351097106934, + "sft_loss": 1.8728386163711548, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 19.782965861794818, + "learning_rate": 1.1869443378236782e-06, + "logits/chosen": -0.13840122520923615, + "logits/rejected": 0.024842610582709312, + "logps/chosen": -1.9172168970108032, + "logps/rejected": -2.925680637359619, + "loss": 0.7141, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9172168970108032, + "rewards/margins": 1.008463740348816, + "rewards/rejected": -2.925680637359619, + "sft_loss": 1.9686994552612305, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 9.29544600728352, + "learning_rate": 1.1823765430004812e-06, + "logits/chosen": -0.2544843554496765, + "logits/rejected": -0.1964297890663147, + "logps/chosen": -1.7466745376586914, + "logps/rejected": -2.728344678878784, + "loss": 0.6701, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7466745376586914, + "rewards/margins": 0.9816702604293823, + "rewards/rejected": -2.728344678878784, + "sft_loss": 1.7416541576385498, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 9.722002501123056, + "learning_rate": 1.177811829761457e-06, + "logits/chosen": -0.21858203411102295, + "logits/rejected": -0.038083307445049286, + "logps/chosen": -1.7050504684448242, + "logps/rejected": -2.841707944869995, + "loss": 0.6266, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7050504684448242, + "rewards/margins": 1.136657476425171, + "rewards/rejected": -2.841707944869995, + "sft_loss": 1.7132253646850586, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 12.306560826422983, + "learning_rate": 1.1732502423934737e-06, + "logits/chosen": -0.21269071102142334, + "logits/rejected": -0.06579498946666718, + "logps/chosen": -1.6928743124008179, + "logps/rejected": -2.6516270637512207, + "loss": 0.6169, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6928743124008179, + "rewards/margins": 0.9587525129318237, + "rewards/rejected": -2.6516270637512207, + "sft_loss": 1.7344623804092407, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 14.207434505489841, + "learning_rate": 1.1686918251530716e-06, + "logits/chosen": -0.2616156041622162, + "logits/rejected": -0.11248783767223358, + "logps/chosen": -1.6836540699005127, + "logps/rejected": -2.8523545265197754, + "loss": 0.6497, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6836540699005127, + "rewards/margins": 1.1687004566192627, + "rewards/rejected": -2.8523545265197754, + "sft_loss": 1.7119197845458984, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 10.777338659368121, + "learning_rate": 1.164136622266035e-06, + "logits/chosen": -0.2616646885871887, + "logits/rejected": 0.056531958281993866, + "logps/chosen": -1.7460088729858398, + "logps/rejected": -2.6776375770568848, + "loss": 0.6629, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.7460088729858398, + "rewards/margins": 0.9316285848617554, + "rewards/rejected": -2.6776375770568848, + "sft_loss": 1.8117141723632812, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 13.964630061751256, + "learning_rate": 1.1595846779269622e-06, + "logits/chosen": -0.3164612650871277, + "logits/rejected": -0.06304170936346054, + "logps/chosen": -1.723750114440918, + "logps/rejected": -2.711017608642578, + "loss": 0.6582, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.723750114440918, + "rewards/margins": 0.9872674942016602, + "rewards/rejected": -2.711017608642578, + "sft_loss": 1.7842611074447632, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 9.837501884568988, + "learning_rate": 1.155036036298837e-06, + "logits/chosen": -0.20438392460346222, + "logits/rejected": 0.09091716259717941, + "logps/chosen": -1.8601843118667603, + "logps/rejected": -2.813197374343872, + "loss": 0.7041, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8601843118667603, + "rewards/margins": 0.9530132412910461, + "rewards/rejected": -2.813197374343872, + "sft_loss": 1.8708412647247314, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 10.723020602530795, + "learning_rate": 1.1504907415126008e-06, + "logits/chosen": -0.045906491577625275, + "logits/rejected": 0.10496882349252701, + "logps/chosen": -1.7108036279678345, + "logps/rejected": -2.6833956241607666, + "loss": 0.6472, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7108036279678345, + "rewards/margins": 0.9725920557975769, + "rewards/rejected": -2.6833956241607666, + "sft_loss": 1.714491605758667, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 7.670794619054316, + "learning_rate": 1.1459488376667235e-06, + "logits/chosen": -0.20854513347148895, + "logits/rejected": -0.055694401264190674, + "logps/chosen": -1.5863817930221558, + "logps/rejected": -2.354175329208374, + "loss": 0.6701, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.5863817930221558, + "rewards/margins": 0.7677937150001526, + "rewards/rejected": -2.354175329208374, + "sft_loss": 1.597285509109497, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 8.962136533440228, + "learning_rate": 1.1414103688267756e-06, + "logits/chosen": -0.20327985286712646, + "logits/rejected": -0.07620219141244888, + "logps/chosen": -1.7575534582138062, + "logps/rejected": -2.6742453575134277, + "loss": 0.6806, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.7575534582138062, + "rewards/margins": 0.9166919589042664, + "rewards/rejected": -2.6742453575134277, + "sft_loss": 1.771837592124939, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 14.298445634777934, + "learning_rate": 1.136875379025002e-06, + "logits/chosen": -0.16482272744178772, + "logits/rejected": -0.09752872586250305, + "logps/chosen": -1.657428503036499, + "logps/rejected": -2.488154172897339, + "loss": 0.6659, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.657428503036499, + "rewards/margins": 0.8307255506515503, + "rewards/rejected": -2.488154172897339, + "sft_loss": 1.6614271402359009, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 9.376689059470104, + "learning_rate": 1.132343912259894e-06, + "logits/chosen": -0.02655973471701145, + "logits/rejected": 0.017931750044226646, + "logps/chosen": -1.7234824895858765, + "logps/rejected": -2.5292980670928955, + "loss": 0.6903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7234824895858765, + "rewards/margins": 0.8058153986930847, + "rewards/rejected": -2.5292980670928955, + "sft_loss": 1.782034158706665, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 10.422796001151918, + "learning_rate": 1.1278160124957617e-06, + "logits/chosen": -0.08756308257579803, + "logits/rejected": 0.10158822685480118, + "logps/chosen": -1.6388031244277954, + "logps/rejected": -2.4249258041381836, + "loss": 0.6825, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6388031244277954, + "rewards/margins": 0.7861226797103882, + "rewards/rejected": -2.4249258041381836, + "sft_loss": 1.7268263101577759, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 9.90934852486259, + "learning_rate": 1.1232917236623085e-06, + "logits/chosen": -0.09540453553199768, + "logits/rejected": 0.0738372951745987, + "logps/chosen": -1.6657747030258179, + "logps/rejected": -2.4016175270080566, + "loss": 0.6755, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6657747030258179, + "rewards/margins": 0.7358425855636597, + "rewards/rejected": -2.4016175270080566, + "sft_loss": 1.7676925659179688, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 12.004707107522934, + "learning_rate": 1.1187710896542045e-06, + "logits/chosen": -0.25235700607299805, + "logits/rejected": 0.0003812193754129112, + "logps/chosen": -1.7461745738983154, + "logps/rejected": -2.481781005859375, + "loss": 0.6677, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7461745738983154, + "rewards/margins": 0.7356064915657043, + "rewards/rejected": -2.481781005859375, + "sft_loss": 1.7998241186141968, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 12.96635107429613, + "learning_rate": 1.1142541543306603e-06, + "logits/chosen": -0.07630597800016403, + "logits/rejected": 0.15812484920024872, + "logps/chosen": -1.6824067831039429, + "logps/rejected": -2.72544264793396, + "loss": 0.647, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.6824067831039429, + "rewards/margins": 1.0430357456207275, + "rewards/rejected": -2.72544264793396, + "sft_loss": 1.7791961431503296, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 12.035501698692357, + "learning_rate": 1.109740961515003e-06, + "logits/chosen": -0.18747368454933167, + "logits/rejected": 0.032625712454319, + "logps/chosen": -1.709657907485962, + "logps/rejected": -2.7182281017303467, + "loss": 0.6193, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.709657907485962, + "rewards/margins": 1.0085701942443848, + "rewards/rejected": -2.7182281017303467, + "sft_loss": 1.7642666101455688, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 15.913458782743897, + "learning_rate": 1.1052315549942487e-06, + "logits/chosen": -0.17805495858192444, + "logits/rejected": -0.04204495996236801, + "logps/chosen": -1.6713043451309204, + "logps/rejected": -2.604113817214966, + "loss": 0.6414, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6713043451309204, + "rewards/margins": 0.9328095316886902, + "rewards/rejected": -2.604113817214966, + "sft_loss": 1.707510232925415, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 13.819186328459246, + "learning_rate": 1.100725978518679e-06, + "logits/chosen": -0.19229435920715332, + "logits/rejected": 0.12436362355947495, + "logps/chosen": -1.8118988275527954, + "logps/rejected": -2.694197654724121, + "loss": 0.6863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8118988275527954, + "rewards/margins": 0.8822988271713257, + "rewards/rejected": -2.694197654724121, + "sft_loss": 1.8271783590316772, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 12.142640316077669, + "learning_rate": 1.0962242758014169e-06, + "logits/chosen": -0.24944384396076202, + "logits/rejected": 0.014292346313595772, + "logps/chosen": -1.7131149768829346, + "logps/rejected": -2.722731590270996, + "loss": 0.6575, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.7131149768829346, + "rewards/margins": 1.009616494178772, + "rewards/rejected": -2.722731590270996, + "sft_loss": 1.7885093688964844, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 8.832463656004036, + "learning_rate": 1.091726490518002e-06, + "logits/chosen": -0.14067216217517853, + "logits/rejected": 0.15703320503234863, + "logps/chosen": -1.722751259803772, + "logps/rejected": -2.6994786262512207, + "loss": 0.6509, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.722751259803772, + "rewards/margins": 0.9767271280288696, + "rewards/rejected": -2.6994786262512207, + "sft_loss": 1.7842895984649658, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 12.074613464208511, + "learning_rate": 1.0872326663059668e-06, + "logits/chosen": -0.13440512120723724, + "logits/rejected": -0.029331039637327194, + "logps/chosen": -1.7340189218521118, + "logps/rejected": -2.644932270050049, + "loss": 0.673, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7340189218521118, + "rewards/margins": 0.910913348197937, + "rewards/rejected": -2.644932270050049, + "sft_loss": 1.845868706703186, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 9.783673762859099, + "learning_rate": 1.0827428467644132e-06, + "logits/chosen": -0.23015666007995605, + "logits/rejected": -0.028845742344856262, + "logps/chosen": -1.6154110431671143, + "logps/rejected": -2.612565279006958, + "loss": 0.6492, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6154110431671143, + "rewards/margins": 0.9971543550491333, + "rewards/rejected": -2.612565279006958, + "sft_loss": 1.6820234060287476, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 12.68272767354526, + "learning_rate": 1.0782570754535903e-06, + "logits/chosen": -0.2367834597826004, + "logits/rejected": 0.10590411722660065, + "logps/chosen": -1.6864131689071655, + "logps/rejected": -2.4310550689697266, + "loss": 0.6853, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6864131689071655, + "rewards/margins": 0.744641900062561, + "rewards/rejected": -2.4310550689697266, + "sft_loss": 1.7294981479644775, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 10.089494425507207, + "learning_rate": 1.0737753958944712e-06, + "logits/chosen": -0.38933509588241577, + "logits/rejected": 0.024811876937747, + "logps/chosen": -1.6396119594573975, + "logps/rejected": -2.5669679641723633, + "loss": 0.6176, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6396119594573975, + "rewards/margins": 0.927355945110321, + "rewards/rejected": -2.5669679641723633, + "sft_loss": 1.6741783618927002, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 10.666231075184285, + "learning_rate": 1.0692978515683305e-06, + "logits/chosen": -0.09203028678894043, + "logits/rejected": 0.036007750779390335, + "logps/chosen": -1.7163540124893188, + "logps/rejected": -2.5543947219848633, + "loss": 0.6857, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7163540124893188, + "rewards/margins": 0.8380409479141235, + "rewards/rejected": -2.5543947219848633, + "sft_loss": 1.667665719985962, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 9.227970918355686, + "learning_rate": 1.0648244859163227e-06, + "logits/chosen": -0.278619647026062, + "logits/rejected": -0.07375986874103546, + "logps/chosen": -1.6772725582122803, + "logps/rejected": -2.598586082458496, + "loss": 0.6961, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6772725582122803, + "rewards/margins": 0.9213134050369263, + "rewards/rejected": -2.598586082458496, + "sft_loss": 1.693250060081482, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 9.376703127692286, + "learning_rate": 1.0603553423390612e-06, + "logits/chosen": -0.21297414600849152, + "logits/rejected": -0.047949619591236115, + "logps/chosen": -1.6670162677764893, + "logps/rejected": -2.5751991271972656, + "loss": 0.658, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6670162677764893, + "rewards/margins": 0.9081829190254211, + "rewards/rejected": -2.5751991271972656, + "sft_loss": 1.7023769617080688, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 11.975502715594226, + "learning_rate": 1.0558904641961966e-06, + "logits/chosen": -0.14659383893013, + "logits/rejected": -0.027252143248915672, + "logps/chosen": -1.6418933868408203, + "logps/rejected": -2.7534546852111816, + "loss": 0.6329, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6418933868408203, + "rewards/margins": 1.1115614175796509, + "rewards/rejected": -2.7534546852111816, + "sft_loss": 1.7112839221954346, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 7.8401392786102715, + "learning_rate": 1.0514298948059961e-06, + "logits/chosen": -0.2800833582878113, + "logits/rejected": -0.029077952727675438, + "logps/chosen": -1.6596229076385498, + "logps/rejected": -2.5685582160949707, + "loss": 0.6284, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6596229076385498, + "rewards/margins": 0.9089350700378418, + "rewards/rejected": -2.5685582160949707, + "sft_loss": 1.6742641925811768, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 8.695115595309554, + "learning_rate": 1.0469736774449235e-06, + "logits/chosen": -0.10720052570104599, + "logits/rejected": 0.06181992217898369, + "logps/chosen": -1.6598479747772217, + "logps/rejected": -2.563051700592041, + "loss": 0.6968, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6598479747772217, + "rewards/margins": 0.9032036662101746, + "rewards/rejected": -2.563051700592041, + "sft_loss": 1.6553401947021484, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 10.500319573242674, + "learning_rate": 1.0425218553472193e-06, + "logits/chosen": -0.18654967844486237, + "logits/rejected": -0.10578273236751556, + "logps/chosen": -1.5896636247634888, + "logps/rejected": -2.603238821029663, + "loss": 0.6075, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.5896636247634888, + "rewards/margins": 1.0135753154754639, + "rewards/rejected": -2.603238821029663, + "sft_loss": 1.6410157680511475, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 8.631834709897678, + "learning_rate": 1.038074471704481e-06, + "logits/chosen": -0.054431747645139694, + "logits/rejected": 0.03705093264579773, + "logps/chosen": -1.7226107120513916, + "logps/rejected": -2.614551544189453, + "loss": 0.6581, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7226107120513916, + "rewards/margins": 0.8919405937194824, + "rewards/rejected": -2.614551544189453, + "sft_loss": 1.818695306777954, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 10.026498518478865, + "learning_rate": 1.033631569665244e-06, + "logits/chosen": -0.1128377690911293, + "logits/rejected": -0.022760801017284393, + "logps/chosen": -1.6746219396591187, + "logps/rejected": -2.4234249591827393, + "loss": 0.6989, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.6746219396591187, + "rewards/margins": 0.7488031387329102, + "rewards/rejected": -2.4234249591827393, + "sft_loss": 1.7145401239395142, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 6.990504351673148, + "learning_rate": 1.0291931923345635e-06, + "logits/chosen": -0.2978639006614685, + "logits/rejected": 0.0011643856996670365, + "logps/chosen": -1.6716158390045166, + "logps/rejected": -2.5651087760925293, + "loss": 0.6452, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6716158390045166, + "rewards/margins": 0.8934928774833679, + "rewards/rejected": -2.5651087760925293, + "sft_loss": 1.6098390817642212, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 9.659517645727767, + "learning_rate": 1.0247593827735966e-06, + "logits/chosen": -0.12814059853553772, + "logits/rejected": 0.1287405639886856, + "logps/chosen": -1.687361717224121, + "logps/rejected": -2.841590404510498, + "loss": 0.6451, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.687361717224121, + "rewards/margins": 1.154228687286377, + "rewards/rejected": -2.841590404510498, + "sft_loss": 1.7420556545257568, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.30112695693969727, + "eval_logits/rejected": 0.42516347765922546, + "eval_logps/chosen": -1.7517114877700806, + "eval_logps/rejected": -2.491905450820923, + "eval_loss": 0.721508800983429, + "eval_rewards/accuracies": 0.6913946866989136, + "eval_rewards/chosen": -1.7517114877700806, + "eval_rewards/margins": 0.7401941418647766, + "eval_rewards/rejected": -2.491905450820923, + "eval_runtime": 44.6434, + "eval_samples_per_second": 30.128, + "eval_sft_loss": 1.7451032400131226, + "eval_steps_per_second": 7.549, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 11.010850184147426, + "learning_rate": 1.0203301839991816e-06, + "logits/chosen": -0.24366919696331024, + "logits/rejected": -0.20541080832481384, + "logps/chosen": -1.624942421913147, + "logps/rejected": -2.368537187576294, + "loss": 0.6968, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.624942421913147, + "rewards/margins": 0.743594765663147, + "rewards/rejected": -2.368537187576294, + "sft_loss": 1.6673892736434937, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 8.505387043264376, + "learning_rate": 1.0159056389834254e-06, + "logits/chosen": -0.25455576181411743, + "logits/rejected": -0.04168150946497917, + "logps/chosen": -1.6813017129898071, + "logps/rejected": -2.592050790786743, + "loss": 0.6255, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6813017129898071, + "rewards/margins": 0.9107491374015808, + "rewards/rejected": -2.592050790786743, + "sft_loss": 1.7684290409088135, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 8.924174869577604, + "learning_rate": 1.0114857906532827e-06, + "logits/chosen": -0.12101688235998154, + "logits/rejected": 0.046328797936439514, + "logps/chosen": -1.7440468072891235, + "logps/rejected": -2.6186728477478027, + "loss": 0.6596, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7440468072891235, + "rewards/margins": 0.8746261596679688, + "rewards/rejected": -2.6186728477478027, + "sft_loss": 1.7590080499649048, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 18.61178170083434, + "learning_rate": 1.0070706818901417e-06, + "logits/chosen": -0.1992807686328888, + "logits/rejected": -0.07354742288589478, + "logps/chosen": -1.83464777469635, + "logps/rejected": -2.608799695968628, + "loss": 0.7223, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.83464777469635, + "rewards/margins": 0.7741519808769226, + "rewards/rejected": -2.608799695968628, + "sft_loss": 1.8808612823486328, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 9.162346431887451, + "learning_rate": 1.0026603555294073e-06, + "logits/chosen": -0.08581370115280151, + "logits/rejected": -0.12211551517248154, + "logps/chosen": -1.6881921291351318, + "logps/rejected": -2.5504727363586426, + "loss": 0.6617, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.6881921291351318, + "rewards/margins": 0.862280547618866, + "rewards/rejected": -2.5504727363586426, + "sft_loss": 1.743970513343811, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 12.001438140317974, + "learning_rate": 9.982548543600843e-07, + "logits/chosen": -0.17960326373577118, + "logits/rejected": -0.14128082990646362, + "logps/chosen": -1.7421640157699585, + "logps/rejected": -2.7152018547058105, + "loss": 0.6898, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.7421640157699585, + "rewards/margins": 0.973037600517273, + "rewards/rejected": -2.7152018547058105, + "sft_loss": 1.8456947803497314, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 10.323550418368065, + "learning_rate": 9.93854221124365e-07, + "logits/chosen": -0.2688259184360504, + "logits/rejected": -0.14841492474079132, + "logps/chosen": -1.670474648475647, + "logps/rejected": -2.6174325942993164, + "loss": 0.6382, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.670474648475647, + "rewards/margins": 0.9469578862190247, + "rewards/rejected": -2.6174325942993164, + "sft_loss": 1.7346309423446655, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 11.719082491329008, + "learning_rate": 9.894584985172121e-07, + "logits/chosen": -0.1979060173034668, + "logits/rejected": -0.10562784969806671, + "logps/chosen": -1.806494116783142, + "logps/rejected": -2.617361068725586, + "loss": 0.6811, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.806494116783142, + "rewards/margins": 0.8108669519424438, + "rewards/rejected": -2.617361068725586, + "sft_loss": 1.851362943649292, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 13.081773180078448, + "learning_rate": 9.850677291859458e-07, + "logits/chosen": -0.22949905693531036, + "logits/rejected": -0.04544057324528694, + "logps/chosen": -1.9166128635406494, + "logps/rejected": -2.5818140506744385, + "loss": 0.74, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9166128635406494, + "rewards/margins": 0.6652010083198547, + "rewards/rejected": -2.5818140506744385, + "sft_loss": 1.9522733688354492, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 10.408701831951982, + "learning_rate": 9.806819557298295e-07, + "logits/chosen": -0.27124810218811035, + "logits/rejected": -0.12528440356254578, + "logps/chosen": -1.7751926183700562, + "logps/rejected": -2.6433446407318115, + "loss": 0.6761, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.7751926183700562, + "rewards/margins": 0.8681520223617554, + "rewards/rejected": -2.6433446407318115, + "sft_loss": 1.8149782419204712, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 9.351815707751856, + "learning_rate": 9.76301220699656e-07, + "logits/chosen": -0.2165641337633133, + "logits/rejected": -0.04734504967927933, + "logps/chosen": -1.7876564264297485, + "logps/rejected": -2.685206651687622, + "loss": 0.6636, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.7876564264297485, + "rewards/margins": 0.8975504040718079, + "rewards/rejected": -2.685206651687622, + "sft_loss": 1.7731033563613892, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 9.844648724419237, + "learning_rate": 9.719255665973365e-07, + "logits/chosen": -0.26721853017807007, + "logits/rejected": -0.05507396534085274, + "logps/chosen": -1.7108303308486938, + "logps/rejected": -2.5796587467193604, + "loss": 0.6972, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7108303308486938, + "rewards/margins": 0.8688281774520874, + "rewards/rejected": -2.5796587467193604, + "sft_loss": 1.7844226360321045, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 10.436179068836465, + "learning_rate": 9.675550358754857e-07, + "logits/chosen": -0.21287234127521515, + "logits/rejected": -0.07435743510723114, + "logps/chosen": -1.6167386770248413, + "logps/rejected": -2.557102680206299, + "loss": 0.6452, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6167386770248413, + "rewards/margins": 0.940363883972168, + "rewards/rejected": -2.557102680206299, + "sft_loss": 1.6063480377197266, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 8.699087438372885, + "learning_rate": 9.631896709370124e-07, + "logits/chosen": -0.2651059627532959, + "logits/rejected": -0.07905270159244537, + "logps/chosen": -1.6744390726089478, + "logps/rejected": -2.8000216484069824, + "loss": 0.5949, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6744390726089478, + "rewards/margins": 1.1255824565887451, + "rewards/rejected": -2.8000216484069824, + "sft_loss": 1.7917518615722656, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 9.416539046803988, + "learning_rate": 9.588295141347055e-07, + "logits/chosen": -0.23763029277324677, + "logits/rejected": -0.04285631328821182, + "logps/chosen": -1.8838889598846436, + "logps/rejected": -2.9585506916046143, + "loss": 0.6696, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8838889598846436, + "rewards/margins": 1.0746616125106812, + "rewards/rejected": -2.9585506916046143, + "sft_loss": 1.905796766281128, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 9.816601872866762, + "learning_rate": 9.544746077708263e-07, + "logits/chosen": -0.23911122977733612, + "logits/rejected": -0.06813491880893707, + "logps/chosen": -1.5476216077804565, + "logps/rejected": -2.4001009464263916, + "loss": 0.6338, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5476216077804565, + "rewards/margins": 0.8524792790412903, + "rewards/rejected": -2.4001009464263916, + "sft_loss": 1.5902055501937866, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 13.803323828800195, + "learning_rate": 9.50124994096695e-07, + "logits/chosen": -0.24263215065002441, + "logits/rejected": -0.1326196938753128, + "logps/chosen": -1.6562252044677734, + "logps/rejected": -2.5973217487335205, + "loss": 0.6646, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6562252044677734, + "rewards/margins": 0.9410964250564575, + "rewards/rejected": -2.5973217487335205, + "sft_loss": 1.7516815662384033, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 11.91395105204308, + "learning_rate": 9.457807153122826e-07, + "logits/chosen": -0.23275959491729736, + "logits/rejected": 0.011137251742184162, + "logps/chosen": -1.6774663925170898, + "logps/rejected": -2.5975823402404785, + "loss": 0.6629, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.6774663925170898, + "rewards/margins": 0.9201158285140991, + "rewards/rejected": -2.5975823402404785, + "sft_loss": 1.6803147792816162, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 12.148506432063686, + "learning_rate": 9.41441813565801e-07, + "logits/chosen": -0.1868276298046112, + "logits/rejected": -0.0998372808098793, + "logps/chosen": -1.7477929592132568, + "logps/rejected": -2.61071515083313, + "loss": 0.6853, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7477929592132568, + "rewards/margins": 0.8629220724105835, + "rewards/rejected": -2.61071515083313, + "sft_loss": 1.8512369394302368, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 10.420336382904948, + "learning_rate": 9.371083309532938e-07, + "logits/chosen": -0.1475117802619934, + "logits/rejected": 0.02056516334414482, + "logps/chosen": -1.6154321432113647, + "logps/rejected": -2.4313552379608154, + "loss": 0.6518, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6154321432113647, + "rewards/margins": 0.8159233331680298, + "rewards/rejected": -2.4313552379608154, + "sft_loss": 1.6552836894989014, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 9.122500210468697, + "learning_rate": 9.327803095182284e-07, + "logits/chosen": -0.23152145743370056, + "logits/rejected": -0.10704489797353745, + "logps/chosen": -1.7144361734390259, + "logps/rejected": -2.6252634525299072, + "loss": 0.6542, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7144361734390259, + "rewards/margins": 0.9108272790908813, + "rewards/rejected": -2.6252634525299072, + "sft_loss": 1.714613676071167, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 12.112549897952343, + "learning_rate": 9.28457791251088e-07, + "logits/chosen": -0.024995137006044388, + "logits/rejected": 0.034917961806058884, + "logps/chosen": -1.7349681854248047, + "logps/rejected": -2.5234830379486084, + "loss": 0.7018, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7349681854248047, + "rewards/margins": 0.7885150909423828, + "rewards/rejected": -2.5234830379486084, + "sft_loss": 1.8175227642059326, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 12.249636751085148, + "learning_rate": 9.241408180889638e-07, + "logits/chosen": -0.16720399260520935, + "logits/rejected": -0.0917845070362091, + "logps/chosen": -1.706947922706604, + "logps/rejected": -2.6415350437164307, + "loss": 0.6606, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.706947922706604, + "rewards/margins": 0.9345870018005371, + "rewards/rejected": -2.6415350437164307, + "sft_loss": 1.7958767414093018, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 7.905937307037574, + "learning_rate": 9.198294319151478e-07, + "logits/chosen": -0.210923433303833, + "logits/rejected": -0.0747196301817894, + "logps/chosen": -1.6869720220565796, + "logps/rejected": -2.444392681121826, + "loss": 0.6758, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6869720220565796, + "rewards/margins": 0.7574207186698914, + "rewards/rejected": -2.444392681121826, + "sft_loss": 1.7059062719345093, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 10.154743185918921, + "learning_rate": 9.155236745587279e-07, + "logits/chosen": -0.2889309823513031, + "logits/rejected": -0.17309871315956116, + "logps/chosen": -1.7106233835220337, + "logps/rejected": -2.653170347213745, + "loss": 0.6423, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7106233835220337, + "rewards/margins": 0.9425467252731323, + "rewards/rejected": -2.653170347213745, + "sft_loss": 1.8067913055419922, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 8.991117835212687, + "learning_rate": 9.112235877941808e-07, + "logits/chosen": -0.20675882697105408, + "logits/rejected": -0.013511359691619873, + "logps/chosen": -1.6930301189422607, + "logps/rejected": -2.5471932888031006, + "loss": 0.645, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.6930301189422607, + "rewards/margins": 0.8541630506515503, + "rewards/rejected": -2.5471932888031006, + "sft_loss": 1.7349340915679932, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 12.274052101700097, + "learning_rate": 9.069292133409672e-07, + "logits/chosen": -0.12488467991352081, + "logits/rejected": -0.014732986688613892, + "logps/chosen": -1.7833898067474365, + "logps/rejected": -2.6352057456970215, + "loss": 0.6967, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.7833898067474365, + "rewards/margins": 0.8518158197402954, + "rewards/rejected": -2.6352057456970215, + "sft_loss": 1.7981497049331665, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 9.607873466517391, + "learning_rate": 9.026405928631269e-07, + "logits/chosen": -0.1588951051235199, + "logits/rejected": -0.09040139615535736, + "logps/chosen": -1.7691665887832642, + "logps/rejected": -2.666560649871826, + "loss": 0.64, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.7691665887832642, + "rewards/margins": 0.8973941802978516, + "rewards/rejected": -2.666560649871826, + "sft_loss": 1.7730439901351929, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 9.407713788560867, + "learning_rate": 8.983577679688745e-07, + "logits/chosen": -0.16986750066280365, + "logits/rejected": -0.04225274175405502, + "logps/chosen": -1.6801502704620361, + "logps/rejected": -2.897521495819092, + "loss": 0.5563, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6801502704620361, + "rewards/margins": 1.2173711061477661, + "rewards/rejected": -2.897521495819092, + "sft_loss": 1.722196340560913, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 7.194690814469289, + "learning_rate": 8.940807802101961e-07, + "logits/chosen": -0.26174548268318176, + "logits/rejected": -0.11660508811473846, + "logps/chosen": -1.5555036067962646, + "logps/rejected": -2.8029425144195557, + "loss": 0.5484, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.5555036067962646, + "rewards/margins": 1.2474387884140015, + "rewards/rejected": -2.8029425144195557, + "sft_loss": 1.6362333297729492, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 10.458112251228574, + "learning_rate": 8.898096710824455e-07, + "logits/chosen": -0.23806282877922058, + "logits/rejected": -0.06314180046319962, + "logps/chosen": -1.655256986618042, + "logps/rejected": -2.9726920127868652, + "loss": 0.5568, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.655256986618042, + "rewards/margins": 1.3174351453781128, + "rewards/rejected": -2.9726920127868652, + "sft_loss": 1.7911889553070068, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 8.243695700196932, + "learning_rate": 8.855444820239421e-07, + "logits/chosen": -0.316383421421051, + "logits/rejected": -0.25597572326660156, + "logps/chosen": -1.7021299600601196, + "logps/rejected": -3.0542609691619873, + "loss": 0.5715, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7021299600601196, + "rewards/margins": 1.3521312475204468, + "rewards/rejected": -3.0542609691619873, + "sft_loss": 1.8004076480865479, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 11.021054218487246, + "learning_rate": 8.812852544155691e-07, + "logits/chosen": -0.2360994815826416, + "logits/rejected": 0.059898506850004196, + "logps/chosen": -1.7925751209259033, + "logps/rejected": -3.2021727561950684, + "loss": 0.5503, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7925751209259033, + "rewards/margins": 1.409597635269165, + "rewards/rejected": -3.2021727561950684, + "sft_loss": 1.8459718227386475, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 8.89962898665759, + "learning_rate": 8.770320295803714e-07, + "logits/chosen": -0.297813355922699, + "logits/rejected": -0.03503280505537987, + "logps/chosen": -1.6437908411026, + "logps/rejected": -3.228015899658203, + "loss": 0.5184, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6437908411026, + "rewards/margins": 1.5842252969741821, + "rewards/rejected": -3.228015899658203, + "sft_loss": 1.7362667322158813, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 9.527797834634185, + "learning_rate": 8.727848487831545e-07, + "logits/chosen": -0.20444516837596893, + "logits/rejected": -0.15804891288280487, + "logps/chosen": -1.7272441387176514, + "logps/rejected": -3.051990270614624, + "loss": 0.5538, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7272441387176514, + "rewards/margins": 1.3247458934783936, + "rewards/rejected": -3.051990270614624, + "sft_loss": 1.751915693283081, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 7.461805123404374, + "learning_rate": 8.685437532300863e-07, + "logits/chosen": -0.16066284477710724, + "logits/rejected": -0.12323828786611557, + "logps/chosen": -1.714807152748108, + "logps/rejected": -3.0106585025787354, + "loss": 0.5737, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.714807152748108, + "rewards/margins": 1.2958513498306274, + "rewards/rejected": -3.0106585025787354, + "sft_loss": 1.7871978282928467, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 13.861265215219532, + "learning_rate": 8.64308784068293e-07, + "logits/chosen": -0.22177617251873016, + "logits/rejected": 0.01492443960160017, + "logps/chosen": -1.7709490060806274, + "logps/rejected": -3.111344575881958, + "loss": 0.5605, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7709490060806274, + "rewards/margins": 1.340395212173462, + "rewards/rejected": -3.111344575881958, + "sft_loss": 1.8111011981964111, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 8.161077241804115, + "learning_rate": 8.600799823854655e-07, + "logits/chosen": -0.27392780780792236, + "logits/rejected": -0.025870636105537415, + "logps/chosen": -1.693743348121643, + "logps/rejected": -3.0894863605499268, + "loss": 0.546, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.693743348121643, + "rewards/margins": 1.3957430124282837, + "rewards/rejected": -3.0894863605499268, + "sft_loss": 1.7753608226776123, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 13.150982071143991, + "learning_rate": 8.558573892094547e-07, + "logits/chosen": -0.11018764972686768, + "logits/rejected": -0.09104446321725845, + "logps/chosen": -1.6371879577636719, + "logps/rejected": -2.8337721824645996, + "loss": 0.5863, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6371879577636719, + "rewards/margins": 1.1965839862823486, + "rewards/rejected": -2.8337721824645996, + "sft_loss": 1.7764679193496704, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 11.446826811574992, + "learning_rate": 8.516410455078793e-07, + "logits/chosen": -0.18594589829444885, + "logits/rejected": 0.011191355995833874, + "logps/chosen": -1.7263634204864502, + "logps/rejected": -3.07102632522583, + "loss": 0.5725, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7263634204864502, + "rewards/margins": 1.3446629047393799, + "rewards/rejected": -3.07102632522583, + "sft_loss": 1.8603509664535522, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 10.537343367924715, + "learning_rate": 8.474309921877238e-07, + "logits/chosen": -0.16531623899936676, + "logits/rejected": 0.026289869099855423, + "logps/chosen": -1.611098051071167, + "logps/rejected": -2.9250524044036865, + "loss": 0.5398, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.611098051071167, + "rewards/margins": 1.3139547109603882, + "rewards/rejected": -2.9250524044036865, + "sft_loss": 1.710699439048767, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 10.748584753986586, + "learning_rate": 8.432272700949452e-07, + "logits/chosen": -0.009632068686187267, + "logits/rejected": 0.08833660930395126, + "logps/chosen": -1.725791573524475, + "logps/rejected": -3.4090161323547363, + "loss": 0.47, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.725791573524475, + "rewards/margins": 1.6832244396209717, + "rewards/rejected": -3.4090161323547363, + "sft_loss": 1.6864168643951416, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 10.02228452837305, + "learning_rate": 8.390299200140712e-07, + "logits/chosen": -0.29934918880462646, + "logits/rejected": -0.11233566701412201, + "logps/chosen": -1.8083289861679077, + "logps/rejected": -3.242682695388794, + "loss": 0.544, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8083289861679077, + "rewards/margins": 1.4343535900115967, + "rewards/rejected": -3.242682695388794, + "sft_loss": 1.780664086341858, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 12.221367003206197, + "learning_rate": 8.348389826678129e-07, + "logits/chosen": -0.32494932413101196, + "logits/rejected": 0.01025162823498249, + "logps/chosen": -1.8250830173492432, + "logps/rejected": -3.383464813232422, + "loss": 0.5409, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8250830173492432, + "rewards/margins": 1.5583815574645996, + "rewards/rejected": -3.383464813232422, + "sft_loss": 1.8907978534698486, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 8.50218965044455, + "learning_rate": 8.306544987166615e-07, + "logits/chosen": -0.23265402019023895, + "logits/rejected": -0.1387948989868164, + "logps/chosen": -1.766627311706543, + "logps/rejected": -3.334089756011963, + "loss": 0.5369, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.766627311706543, + "rewards/margins": 1.56746244430542, + "rewards/rejected": -3.334089756011963, + "sft_loss": 1.8673334121704102, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 16.15377170158126, + "learning_rate": 8.264765087584998e-07, + "logits/chosen": -0.34037113189697266, + "logits/rejected": -0.09007980674505234, + "logps/chosen": -1.9173988103866577, + "logps/rejected": -3.484591007232666, + "loss": 0.5551, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.9173988103866577, + "rewards/margins": 1.5671921968460083, + "rewards/rejected": -3.484591007232666, + "sft_loss": 1.9438207149505615, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 9.822403017507577, + "learning_rate": 8.223050533282033e-07, + "logits/chosen": -0.18690574169158936, + "logits/rejected": 0.07956047356128693, + "logps/chosen": -1.7812955379486084, + "logps/rejected": -3.284822940826416, + "loss": 0.5383, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7812955379486084, + "rewards/margins": 1.5035276412963867, + "rewards/rejected": -3.284822940826416, + "sft_loss": 1.862097144126892, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 9.613539376759954, + "learning_rate": 8.181401728972522e-07, + "logits/chosen": -0.1325015127658844, + "logits/rejected": 0.08524034917354584, + "logps/chosen": -1.7051302194595337, + "logps/rejected": -3.2468390464782715, + "loss": 0.5278, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7051302194595337, + "rewards/margins": 1.5417091846466064, + "rewards/rejected": -3.2468390464782715, + "sft_loss": 1.7802455425262451, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 8.044388765783284, + "learning_rate": 8.139819078733338e-07, + "logits/chosen": -0.37993818521499634, + "logits/rejected": -0.01248829998075962, + "logps/chosen": -1.835631012916565, + "logps/rejected": -3.2788097858428955, + "loss": 0.5376, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.835631012916565, + "rewards/margins": 1.4431788921356201, + "rewards/rejected": -3.2788097858428955, + "sft_loss": 1.8542535305023193, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 10.448158474118653, + "learning_rate": 8.098302985999547e-07, + "logits/chosen": -0.2758844792842865, + "logits/rejected": 0.0281488336622715, + "logps/chosen": -1.7699769735336304, + "logps/rejected": -3.043705463409424, + "loss": 0.5792, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7699769735336304, + "rewards/margins": 1.273728609085083, + "rewards/rejected": -3.043705463409424, + "sft_loss": 1.835963487625122, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 8.289442766496714, + "learning_rate": 8.056853853560447e-07, + "logits/chosen": -0.18996943533420563, + "logits/rejected": 0.1399824470281601, + "logps/chosen": -1.731579065322876, + "logps/rejected": -3.3539230823516846, + "loss": 0.5245, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.731579065322876, + "rewards/margins": 1.6223437786102295, + "rewards/rejected": -3.3539230823516846, + "sft_loss": 1.7560529708862305, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 9.080199903118245, + "learning_rate": 8.015472083555717e-07, + "logits/chosen": -0.12962546944618225, + "logits/rejected": 0.12912937998771667, + "logps/chosen": -1.6572837829589844, + "logps/rejected": -3.1103360652923584, + "loss": 0.5277, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6572837829589844, + "rewards/margins": 1.453052282333374, + "rewards/rejected": -3.1103360652923584, + "sft_loss": 1.67610764503479, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 13.440542364143138, + "learning_rate": 7.974158077471461e-07, + "logits/chosen": -0.31929659843444824, + "logits/rejected": -0.05511760711669922, + "logps/chosen": -1.7283598184585571, + "logps/rejected": -3.248244524002075, + "loss": 0.5193, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7283598184585571, + "rewards/margins": 1.5198849439620972, + "rewards/rejected": -3.248244524002075, + "sft_loss": 1.7844501733779907, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 10.580595747927207, + "learning_rate": 7.932912236136356e-07, + "logits/chosen": -0.2645584046840668, + "logits/rejected": -0.1220647320151329, + "logps/chosen": -1.6255719661712646, + "logps/rejected": -3.0554234981536865, + "loss": 0.5271, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6255719661712646, + "rewards/margins": 1.429851770401001, + "rewards/rejected": -3.0554234981536865, + "sft_loss": 1.7420114278793335, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 9.715742112103827, + "learning_rate": 7.891734959717726e-07, + "logits/chosen": -0.1833140105009079, + "logits/rejected": -0.018009066581726074, + "logps/chosen": -1.8586372137069702, + "logps/rejected": -3.337434768676758, + "loss": 0.5519, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8586372137069702, + "rewards/margins": 1.478797197341919, + "rewards/rejected": -3.337434768676758, + "sft_loss": 1.9200775623321533, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 10.608522009342899, + "learning_rate": 7.850626647717698e-07, + "logits/chosen": -0.24772553145885468, + "logits/rejected": 0.03280012682080269, + "logps/chosen": -1.644805908203125, + "logps/rejected": -3.2546234130859375, + "loss": 0.5095, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.644805908203125, + "rewards/margins": 1.6098175048828125, + "rewards/rejected": -3.2546234130859375, + "sft_loss": 1.6860408782958984, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 7.856253604131477, + "learning_rate": 7.809587698969282e-07, + "logits/chosen": -0.22330811619758606, + "logits/rejected": -0.01864556595683098, + "logps/chosen": -1.6774189472198486, + "logps/rejected": -3.3011314868927, + "loss": 0.5318, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6774189472198486, + "rewards/margins": 1.6237128973007202, + "rewards/rejected": -3.3011314868927, + "sft_loss": 1.7449238300323486, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 11.906725492881082, + "learning_rate": 7.768618511632555e-07, + "logits/chosen": -0.09854897111654282, + "logits/rejected": 0.05596822500228882, + "logps/chosen": -1.7890571355819702, + "logps/rejected": -3.2758407592773438, + "loss": 0.601, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7890571355819702, + "rewards/margins": 1.4867838621139526, + "rewards/rejected": -3.2758407592773438, + "sft_loss": 1.907470703125, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 11.247823200509208, + "learning_rate": 7.727719483190737e-07, + "logits/chosen": -0.2568055987358093, + "logits/rejected": 0.07877197861671448, + "logps/chosen": -1.799020528793335, + "logps/rejected": -3.212989091873169, + "loss": 0.5964, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.799020528793335, + "rewards/margins": 1.4139684438705444, + "rewards/rejected": -3.212989091873169, + "sft_loss": 1.7767369747161865, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 11.87217275292588, + "learning_rate": 7.686891010446394e-07, + "logits/chosen": -0.11951699107885361, + "logits/rejected": -0.07706291973590851, + "logps/chosen": -1.7854347229003906, + "logps/rejected": -3.206778049468994, + "loss": 0.5454, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7854347229003906, + "rewards/margins": 1.421343445777893, + "rewards/rejected": -3.206778049468994, + "sft_loss": 1.8132820129394531, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 10.066601523454645, + "learning_rate": 7.646133489517535e-07, + "logits/chosen": -0.15235088765621185, + "logits/rejected": -0.025425296276807785, + "logps/chosen": -1.7517459392547607, + "logps/rejected": -3.148427963256836, + "loss": 0.5729, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7517459392547607, + "rewards/margins": 1.3966816663742065, + "rewards/rejected": -3.148427963256836, + "sft_loss": 1.758193016052246, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 8.791899762296065, + "learning_rate": 7.605447315833821e-07, + "logits/chosen": -0.12592525780200958, + "logits/rejected": 0.06427010893821716, + "logps/chosen": -1.6173303127288818, + "logps/rejected": -2.993237257003784, + "loss": 0.5463, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6173303127288818, + "rewards/margins": 1.3759069442749023, + "rewards/rejected": -2.993237257003784, + "sft_loss": 1.674353003501892, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 12.42157001012568, + "learning_rate": 7.564832884132672e-07, + "logits/chosen": -0.22605355083942413, + "logits/rejected": 0.006971999071538448, + "logps/chosen": -1.7891266345977783, + "logps/rejected": -3.117318868637085, + "loss": 0.5928, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7891266345977783, + "rewards/margins": 1.328192114830017, + "rewards/rejected": -3.117318868637085, + "sft_loss": 1.857444167137146, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 12.580124209491172, + "learning_rate": 7.524290588455499e-07, + "logits/chosen": -0.16494357585906982, + "logits/rejected": 0.06749799102544785, + "logps/chosen": -1.7576053142547607, + "logps/rejected": -3.490067720413208, + "loss": 0.539, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7576053142547607, + "rewards/margins": 1.7324621677398682, + "rewards/rejected": -3.490067720413208, + "sft_loss": 1.7450393438339233, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 9.97708561203968, + "learning_rate": 7.483820822143816e-07, + "logits/chosen": -0.18638131022453308, + "logits/rejected": -0.03988290950655937, + "logps/chosen": -1.6317148208618164, + "logps/rejected": -3.1146957874298096, + "loss": 0.5352, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6317148208618164, + "rewards/margins": 1.4829809665679932, + "rewards/rejected": -3.1146957874298096, + "sft_loss": 1.704087257385254, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 9.443485900309117, + "learning_rate": 7.443423977835487e-07, + "logits/chosen": -0.3203321099281311, + "logits/rejected": -0.032503314316272736, + "logps/chosen": -1.73452627658844, + "logps/rejected": -3.3747222423553467, + "loss": 0.5313, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.73452627658844, + "rewards/margins": 1.6401962041854858, + "rewards/rejected": -3.3747222423553467, + "sft_loss": 1.7296106815338135, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 9.112141025009626, + "learning_rate": 7.403100447460861e-07, + "logits/chosen": -0.20505361258983612, + "logits/rejected": -0.07048022001981735, + "logps/chosen": -1.780599594116211, + "logps/rejected": -3.380220413208008, + "loss": 0.55, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.780599594116211, + "rewards/margins": 1.5996206998825073, + "rewards/rejected": -3.380220413208008, + "sft_loss": 1.783935785293579, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 14.024169574717062, + "learning_rate": 7.36285062223902e-07, + "logits/chosen": -0.1989278346300125, + "logits/rejected": -0.06960990279912949, + "logps/chosen": -1.6781647205352783, + "logps/rejected": -3.392726182937622, + "loss": 0.5032, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6781647205352783, + "rewards/margins": 1.7145617008209229, + "rewards/rejected": -3.392726182937622, + "sft_loss": 1.6347157955169678, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 10.594240097208568, + "learning_rate": 7.322674892673931e-07, + "logits/chosen": -0.23259444534778595, + "logits/rejected": 0.062645822763443, + "logps/chosen": -1.7942231893539429, + "logps/rejected": -3.0695948600769043, + "loss": 0.6059, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7942231893539429, + "rewards/margins": 1.275371789932251, + "rewards/rejected": -3.0695948600769043, + "sft_loss": 1.895446538925171, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 11.512803044164448, + "learning_rate": 7.282573648550709e-07, + "logits/chosen": -0.09428633749485016, + "logits/rejected": 0.13378095626831055, + "logps/chosen": -1.7537353038787842, + "logps/rejected": -3.412146806716919, + "loss": 0.5365, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7537353038787842, + "rewards/margins": 1.6584113836288452, + "rewards/rejected": -3.412146806716919, + "sft_loss": 1.7974199056625366, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 10.016323025375353, + "learning_rate": 7.242547278931792e-07, + "logits/chosen": -0.2842410206794739, + "logits/rejected": -0.19029872119426727, + "logps/chosen": -1.8298816680908203, + "logps/rejected": -3.660538911819458, + "loss": 0.5288, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8298816680908203, + "rewards/margins": 1.830657720565796, + "rewards/rejected": -3.660538911819458, + "sft_loss": 1.886203408241272, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 9.785118481954505, + "learning_rate": 7.202596172153203e-07, + "logits/chosen": -0.16824455559253693, + "logits/rejected": -0.028798962011933327, + "logps/chosen": -1.769097089767456, + "logps/rejected": -3.587766647338867, + "loss": 0.5097, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.769097089767456, + "rewards/margins": 1.818669319152832, + "rewards/rejected": -3.587766647338867, + "sft_loss": 1.8450851440429688, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 9.842023108246895, + "learning_rate": 7.162720715820742e-07, + "logits/chosen": -0.19046644866466522, + "logits/rejected": -0.0008801884832791984, + "logps/chosen": -1.7467609643936157, + "logps/rejected": -3.541994571685791, + "loss": 0.5272, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7467609643936157, + "rewards/margins": 1.7952334880828857, + "rewards/rejected": -3.541994571685791, + "sft_loss": 1.8788368701934814, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 10.73183612635801, + "learning_rate": 7.122921296806278e-07, + "logits/chosen": -0.1647910624742508, + "logits/rejected": -0.01739250309765339, + "logps/chosen": -1.7866592407226562, + "logps/rejected": -3.479426145553589, + "loss": 0.5402, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7866592407226562, + "rewards/margins": 1.6927671432495117, + "rewards/rejected": -3.479426145553589, + "sft_loss": 1.9050674438476562, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 11.075635255908587, + "learning_rate": 7.083198301243937e-07, + "logits/chosen": -0.13357248902320862, + "logits/rejected": 0.06032276153564453, + "logps/chosen": -1.6821733713150024, + "logps/rejected": -3.0180554389953613, + "loss": 0.5297, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6821733713150024, + "rewards/margins": 1.3358821868896484, + "rewards/rejected": -3.0180554389953613, + "sft_loss": 1.7494080066680908, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 9.903602734776914, + "learning_rate": 7.043552114526395e-07, + "logits/chosen": -0.20195958018302917, + "logits/rejected": -0.04661129415035248, + "logps/chosen": -1.648164987564087, + "logps/rejected": -3.37589693069458, + "loss": 0.5143, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.648164987564087, + "rewards/margins": 1.7277319431304932, + "rewards/rejected": -3.37589693069458, + "sft_loss": 1.8039286136627197, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 14.3969385350356, + "learning_rate": 7.003983121301139e-07, + "logits/chosen": -0.30059370398521423, + "logits/rejected": -0.1018049344420433, + "logps/chosen": -1.7925945520401, + "logps/rejected": -3.5357577800750732, + "loss": 0.5136, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7925945520401, + "rewards/margins": 1.7431633472442627, + "rewards/rejected": -3.5357577800750732, + "sft_loss": 1.8840261697769165, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 12.362747827933322, + "learning_rate": 6.964491705466704e-07, + "logits/chosen": -0.30404019355773926, + "logits/rejected": -0.06907346099615097, + "logps/chosen": -1.7132093906402588, + "logps/rejected": -3.352640151977539, + "loss": 0.5358, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7132093906402588, + "rewards/margins": 1.6394307613372803, + "rewards/rejected": -3.352640151977539, + "sft_loss": 1.7619565725326538, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 11.567400767912353, + "learning_rate": 6.92507825016899e-07, + "logits/chosen": -0.3080520033836365, + "logits/rejected": 0.12285199016332626, + "logps/chosen": -1.803656816482544, + "logps/rejected": -3.43890643119812, + "loss": 0.549, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.803656816482544, + "rewards/margins": 1.6352497339248657, + "rewards/rejected": -3.43890643119812, + "sft_loss": 1.87128484249115, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 8.353058020832075, + "learning_rate": 6.885743137797502e-07, + "logits/chosen": -0.125919371843338, + "logits/rejected": 0.018805870786309242, + "logps/chosen": -1.6860202550888062, + "logps/rejected": -3.4220077991485596, + "loss": 0.5342, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6860202550888062, + "rewards/margins": 1.735987901687622, + "rewards/rejected": -3.4220077991485596, + "sft_loss": 1.7403955459594727, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.4547114670276642, + "eval_logits/rejected": 0.6009575128555298, + "eval_logps/chosen": -1.995704174041748, + "eval_logps/rejected": -2.9729912281036377, + "eval_loss": 0.7365527153015137, + "eval_rewards/accuracies": 0.6965875625610352, + "eval_rewards/chosen": -1.995704174041748, + "eval_rewards/margins": 0.9772871136665344, + "eval_rewards/rejected": -2.9729912281036377, + "eval_runtime": 50.3486, + "eval_samples_per_second": 26.714, + "eval_sft_loss": 1.9275230169296265, + "eval_steps_per_second": 6.693, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 9.355836024773891, + "learning_rate": 6.846486749981684e-07, + "logits/chosen": -0.13395535945892334, + "logits/rejected": 0.21117381751537323, + "logps/chosen": -1.8070415258407593, + "logps/rejected": -3.1795265674591064, + "loss": 0.5625, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8070415258407593, + "rewards/margins": 1.372484803199768, + "rewards/rejected": -3.1795265674591064, + "sft_loss": 1.7828906774520874, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 8.776826626684906, + "learning_rate": 6.807309467587173e-07, + "logits/chosen": -0.1883133351802826, + "logits/rejected": -0.03017752803862095, + "logps/chosen": -1.6705434322357178, + "logps/rejected": -3.0100858211517334, + "loss": 0.5629, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.6705434322357178, + "rewards/margins": 1.3395423889160156, + "rewards/rejected": -3.0100858211517334, + "sft_loss": 1.7361271381378174, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 13.296893227824357, + "learning_rate": 6.768211670712146e-07, + "logits/chosen": -0.17326074838638306, + "logits/rejected": 0.22264091670513153, + "logps/chosen": -1.7377557754516602, + "logps/rejected": -3.068744659423828, + "loss": 0.5836, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7377557754516602, + "rewards/margins": 1.3309885263442993, + "rewards/rejected": -3.068744659423828, + "sft_loss": 1.7873904705047607, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 10.847948103374218, + "learning_rate": 6.729193738683589e-07, + "logits/chosen": -0.2886696457862854, + "logits/rejected": -0.07660894840955734, + "logps/chosen": -1.813215970993042, + "logps/rejected": -3.3827805519104004, + "loss": 0.5688, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.813215970993042, + "rewards/margins": 1.5695644617080688, + "rewards/rejected": -3.3827805519104004, + "sft_loss": 1.8843132257461548, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 10.189727666685554, + "learning_rate": 6.690256050053652e-07, + "logits/chosen": -0.16914470493793488, + "logits/rejected": 0.016266096383333206, + "logps/chosen": -1.7264938354492188, + "logps/rejected": -3.296316146850586, + "loss": 0.5369, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7264938354492188, + "rewards/margins": 1.5698221921920776, + "rewards/rejected": -3.296316146850586, + "sft_loss": 1.772139549255371, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 11.467675187382033, + "learning_rate": 6.651398982595967e-07, + "logits/chosen": -0.220795676112175, + "logits/rejected": -0.10813482105731964, + "logps/chosen": -1.7274131774902344, + "logps/rejected": -3.434593677520752, + "loss": 0.5267, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7274131774902344, + "rewards/margins": 1.7071806192398071, + "rewards/rejected": -3.434593677520752, + "sft_loss": 1.786871314048767, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 10.839498118605151, + "learning_rate": 6.612622913301961e-07, + "logits/chosen": -0.12224410474300385, + "logits/rejected": -0.11186661571264267, + "logps/chosen": -1.654087781906128, + "logps/rejected": -2.9117302894592285, + "loss": 0.5797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.654087781906128, + "rewards/margins": 1.257642149925232, + "rewards/rejected": -2.9117302894592285, + "sft_loss": 1.757502794265747, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 20.997272379592218, + "learning_rate": 6.573928218377243e-07, + "logits/chosen": -0.1622733771800995, + "logits/rejected": -0.13422775268554688, + "logps/chosen": -1.6278194189071655, + "logps/rejected": -3.17354154586792, + "loss": 0.5248, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6278194189071655, + "rewards/margins": 1.5457221269607544, + "rewards/rejected": -3.17354154586792, + "sft_loss": 1.6407041549682617, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 11.928275021931617, + "learning_rate": 6.5353152732379e-07, + "logits/chosen": -0.12512122094631195, + "logits/rejected": 0.0744471549987793, + "logps/chosen": -1.7665351629257202, + "logps/rejected": -3.165983200073242, + "loss": 0.5881, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.7665351629257202, + "rewards/margins": 1.3994477987289429, + "rewards/rejected": -3.165983200073242, + "sft_loss": 1.835588812828064, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 15.500424986031952, + "learning_rate": 6.496784452506907e-07, + "logits/chosen": -0.2709681987762451, + "logits/rejected": -0.051780473440885544, + "logps/chosen": -1.868085503578186, + "logps/rejected": -3.219057083129883, + "loss": 0.6104, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.868085503578186, + "rewards/margins": 1.3509716987609863, + "rewards/rejected": -3.219057083129883, + "sft_loss": 1.995994210243225, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 12.45975798521115, + "learning_rate": 6.458336130010442e-07, + "logits/chosen": -0.08536555618047714, + "logits/rejected": 0.01686207577586174, + "logps/chosen": -1.7968616485595703, + "logps/rejected": -2.991499662399292, + "loss": 0.5666, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7968616485595703, + "rewards/margins": 1.1946380138397217, + "rewards/rejected": -2.991499662399292, + "sft_loss": 1.8622022867202759, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 10.50043219283165, + "learning_rate": 6.419970678774312e-07, + "logits/chosen": -0.09494493901729584, + "logits/rejected": 0.17411960661411285, + "logps/chosen": -1.7018855810165405, + "logps/rejected": -3.1860663890838623, + "loss": 0.5551, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7018855810165405, + "rewards/margins": 1.4841810464859009, + "rewards/rejected": -3.1860663890838623, + "sft_loss": 1.834429383277893, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 13.040927123294718, + "learning_rate": 6.381688471020282e-07, + "logits/chosen": -0.1964532434940338, + "logits/rejected": -0.075645811855793, + "logps/chosen": -1.725067138671875, + "logps/rejected": -3.4409821033477783, + "loss": 0.5163, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.725067138671875, + "rewards/margins": 1.7159149646759033, + "rewards/rejected": -3.4409821033477783, + "sft_loss": 1.7923214435577393, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 10.862096898834187, + "learning_rate": 6.34348987816251e-07, + "logits/chosen": -0.09906373918056488, + "logits/rejected": 0.2573404610157013, + "logps/chosen": -1.6807048320770264, + "logps/rejected": -3.358582019805908, + "loss": 0.5648, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.6807048320770264, + "rewards/margins": 1.6778767108917236, + "rewards/rejected": -3.358582019805908, + "sft_loss": 1.8171613216400146, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 9.631800757457958, + "learning_rate": 6.3053752708039e-07, + "logits/chosen": -0.15538927912712097, + "logits/rejected": 0.13674817979335785, + "logps/chosen": -1.7523047924041748, + "logps/rejected": -3.158905506134033, + "loss": 0.5498, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7523047924041748, + "rewards/margins": 1.4066009521484375, + "rewards/rejected": -3.158905506134033, + "sft_loss": 1.7659852504730225, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 10.829533507448382, + "learning_rate": 6.267345018732552e-07, + "logits/chosen": -0.17978882789611816, + "logits/rejected": 0.055534325540065765, + "logps/chosen": -1.9052941799163818, + "logps/rejected": -3.4997200965881348, + "loss": 0.5773, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9052941799163818, + "rewards/margins": 1.5944254398345947, + "rewards/rejected": -3.4997200965881348, + "sft_loss": 1.9153798818588257, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 10.35769049326784, + "learning_rate": 6.229399490918126e-07, + "logits/chosen": -0.05187790468335152, + "logits/rejected": 0.01336099486798048, + "logps/chosen": -1.7664661407470703, + "logps/rejected": -3.2258377075195312, + "loss": 0.538, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7664661407470703, + "rewards/margins": 1.459371566772461, + "rewards/rejected": -3.2258377075195312, + "sft_loss": 1.7747611999511719, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 13.643349299342804, + "learning_rate": 6.19153905550831e-07, + "logits/chosen": -0.3395062983036041, + "logits/rejected": -0.01758493110537529, + "logps/chosen": -1.7421958446502686, + "logps/rejected": -3.3181235790252686, + "loss": 0.5333, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7421958446502686, + "rewards/margins": 1.575927972793579, + "rewards/rejected": -3.3181235790252686, + "sft_loss": 1.83111572265625, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 12.224809135626256, + "learning_rate": 6.153764079825211e-07, + "logits/chosen": -0.2562378942966461, + "logits/rejected": -0.11864246428012848, + "logps/chosen": -1.9074604511260986, + "logps/rejected": -3.3651721477508545, + "loss": 0.5714, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9074604511260986, + "rewards/margins": 1.4577118158340454, + "rewards/rejected": -3.3651721477508545, + "sft_loss": 1.9094829559326172, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 13.238715949649285, + "learning_rate": 6.116074930361803e-07, + "logits/chosen": -0.1316947191953659, + "logits/rejected": 0.1430431455373764, + "logps/chosen": -1.772538185119629, + "logps/rejected": -3.622473955154419, + "loss": 0.4966, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.772538185119629, + "rewards/margins": 1.849935531616211, + "rewards/rejected": -3.622473955154419, + "sft_loss": 1.9160066843032837, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 13.230339307295868, + "learning_rate": 6.078471972778388e-07, + "logits/chosen": -0.1815367043018341, + "logits/rejected": 0.12857958674430847, + "logps/chosen": -2.000633716583252, + "logps/rejected": -3.649420976638794, + "loss": 0.5479, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.000633716583252, + "rewards/margins": 1.648787260055542, + "rewards/rejected": -3.649420976638794, + "sft_loss": 1.9365339279174805, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 23.65904937913438, + "learning_rate": 6.040955571899018e-07, + "logits/chosen": -0.1651362031698227, + "logits/rejected": 0.1419135481119156, + "logps/chosen": -1.855017066001892, + "logps/rejected": -3.6009979248046875, + "loss": 0.5504, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.855017066001892, + "rewards/margins": 1.7459806203842163, + "rewards/rejected": -3.6009979248046875, + "sft_loss": 1.9201616048812866, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 12.137879991792099, + "learning_rate": 6.003526091707986e-07, + "logits/chosen": -0.10553675889968872, + "logits/rejected": 0.04338879883289337, + "logps/chosen": -1.8180103302001953, + "logps/rejected": -3.356022596359253, + "loss": 0.5322, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.8180103302001953, + "rewards/margins": 1.538012146949768, + "rewards/rejected": -3.356022596359253, + "sft_loss": 1.8594329357147217, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 10.662598877065893, + "learning_rate": 5.966183895346264e-07, + "logits/chosen": -0.17935439944267273, + "logits/rejected": 0.005321676842868328, + "logps/chosen": -1.747471809387207, + "logps/rejected": -3.3066420555114746, + "loss": 0.5446, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.747471809387207, + "rewards/margins": 1.5591704845428467, + "rewards/rejected": -3.3066420555114746, + "sft_loss": 1.8495490550994873, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 13.506934757683247, + "learning_rate": 5.928929345108015e-07, + "logits/chosen": -0.24749357998371124, + "logits/rejected": 0.08834774792194366, + "logps/chosen": -1.7615807056427002, + "logps/rejected": -3.527405261993408, + "loss": 0.5179, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7615807056427002, + "rewards/margins": 1.765824556350708, + "rewards/rejected": -3.527405261993408, + "sft_loss": 1.8211628198623657, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 11.390020408641043, + "learning_rate": 5.891762802437039e-07, + "logits/chosen": -0.07019664347171783, + "logits/rejected": 0.07711862027645111, + "logps/chosen": -1.8021306991577148, + "logps/rejected": -3.505963087081909, + "loss": 0.5491, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.8021306991577148, + "rewards/margins": 1.7038323879241943, + "rewards/rejected": -3.505963087081909, + "sft_loss": 1.889589548110962, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 9.145040462123765, + "learning_rate": 5.854684627923306e-07, + "logits/chosen": -0.009453767910599709, + "logits/rejected": -0.058390479534864426, + "logps/chosen": -1.8435354232788086, + "logps/rejected": -3.6739730834960938, + "loss": 0.587, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8435354232788086, + "rewards/margins": 1.830437421798706, + "rewards/rejected": -3.6739730834960938, + "sft_loss": 1.9032466411590576, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 11.307495342011872, + "learning_rate": 5.817695181299418e-07, + "logits/chosen": -0.3465738594532013, + "logits/rejected": -0.2500944435596466, + "logps/chosen": -1.727786660194397, + "logps/rejected": -3.165914535522461, + "loss": 0.5609, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.727786660194397, + "rewards/margins": 1.4381277561187744, + "rewards/rejected": -3.165914535522461, + "sft_loss": 1.7594287395477295, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 14.285651119113195, + "learning_rate": 5.780794821437158e-07, + "logits/chosen": 0.0007110525039024651, + "logits/rejected": 0.23650093376636505, + "logps/chosen": -1.7984707355499268, + "logps/rejected": -3.303368330001831, + "loss": 0.549, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7984707355499268, + "rewards/margins": 1.5048975944519043, + "rewards/rejected": -3.303368330001831, + "sft_loss": 1.8620703220367432, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 9.87744380658177, + "learning_rate": 5.743983906343969e-07, + "logits/chosen": -0.16267967224121094, + "logits/rejected": 0.022295860573649406, + "logps/chosen": -1.6229957342147827, + "logps/rejected": -3.110755205154419, + "loss": 0.5272, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6229957342147827, + "rewards/margins": 1.4877592325210571, + "rewards/rejected": -3.110755205154419, + "sft_loss": 1.7183735370635986, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 11.17666651918986, + "learning_rate": 5.707262793159521e-07, + "logits/chosen": -0.07476671040058136, + "logits/rejected": -0.0714937299489975, + "logps/chosen": -1.7283849716186523, + "logps/rejected": -3.067058563232422, + "loss": 0.5599, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7283849716186523, + "rewards/margins": 1.3386733531951904, + "rewards/rejected": -3.067058563232422, + "sft_loss": 1.735907793045044, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 12.837943811048008, + "learning_rate": 5.670631838152204e-07, + "logits/chosen": -0.15067186951637268, + "logits/rejected": 0.055976539850234985, + "logps/chosen": -1.7886425256729126, + "logps/rejected": -3.221102237701416, + "loss": 0.5309, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7886425256729126, + "rewards/margins": 1.432459831237793, + "rewards/rejected": -3.221102237701416, + "sft_loss": 1.831111192703247, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 11.290213521591763, + "learning_rate": 5.634091396715716e-07, + "logits/chosen": -0.14092954993247986, + "logits/rejected": 0.06754940748214722, + "logps/chosen": -1.7412573099136353, + "logps/rejected": -3.4222846031188965, + "loss": 0.5348, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7412573099136353, + "rewards/margins": 1.6810274124145508, + "rewards/rejected": -3.4222846031188965, + "sft_loss": 1.834660530090332, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 14.537318801359874, + "learning_rate": 5.59764182336557e-07, + "logits/chosen": 0.06082998961210251, + "logits/rejected": 0.15074776113033295, + "logps/chosen": -1.8112719058990479, + "logps/rejected": -3.479727268218994, + "loss": 0.5349, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8112719058990479, + "rewards/margins": 1.668454885482788, + "rewards/rejected": -3.479727268218994, + "sft_loss": 1.9149051904678345, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 13.515429797572333, + "learning_rate": 5.561283471735695e-07, + "logits/chosen": -0.1406007707118988, + "logits/rejected": -0.0066071366891264915, + "logps/chosen": -1.7152063846588135, + "logps/rejected": -3.051826000213623, + "loss": 0.555, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7152063846588135, + "rewards/margins": 1.3366196155548096, + "rewards/rejected": -3.051826000213623, + "sft_loss": 1.7660102844238281, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 8.484953749577219, + "learning_rate": 5.52501669457497e-07, + "logits/chosen": -0.23328259587287903, + "logits/rejected": 0.12310652434825897, + "logps/chosen": -1.715920090675354, + "logps/rejected": -3.359726667404175, + "loss": 0.5203, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.715920090675354, + "rewards/margins": 1.6438068151474, + "rewards/rejected": -3.359726667404175, + "sft_loss": 1.79128897190094, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 12.362665149402407, + "learning_rate": 5.488841843743833e-07, + "logits/chosen": -0.16568274796009064, + "logits/rejected": -0.1361156404018402, + "logps/chosen": -1.6876897811889648, + "logps/rejected": -3.312028408050537, + "loss": 0.5408, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6876897811889648, + "rewards/margins": 1.6243385076522827, + "rewards/rejected": -3.312028408050537, + "sft_loss": 1.7390552759170532, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 10.878629371047102, + "learning_rate": 5.452759270210839e-07, + "logits/chosen": 0.02488498017191887, + "logits/rejected": 0.15020744502544403, + "logps/chosen": -1.7068469524383545, + "logps/rejected": -3.4873459339141846, + "loss": 0.5362, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7068469524383545, + "rewards/margins": 1.7804988622665405, + "rewards/rejected": -3.4873459339141846, + "sft_loss": 1.7590528726577759, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 12.157426108313896, + "learning_rate": 5.416769324049282e-07, + "logits/chosen": -0.2870050072669983, + "logits/rejected": -0.06136869266629219, + "logps/chosen": -1.7316116094589233, + "logps/rejected": -3.0164847373962402, + "loss": 0.5823, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7316116094589233, + "rewards/margins": 1.2848728895187378, + "rewards/rejected": -3.0164847373962402, + "sft_loss": 1.808619737625122, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 8.79018380961107, + "learning_rate": 5.38087235443377e-07, + "logits/chosen": 0.06400427967309952, + "logits/rejected": 0.05108444765210152, + "logps/chosen": -1.8447471857070923, + "logps/rejected": -3.415586471557617, + "loss": 0.5767, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8447471857070923, + "rewards/margins": 1.570839285850525, + "rewards/rejected": -3.415586471557617, + "sft_loss": 1.8895288705825806, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 14.225038986307883, + "learning_rate": 5.345068709636866e-07, + "logits/chosen": -0.19258277118206024, + "logits/rejected": -0.09260249137878418, + "logps/chosen": -1.691738486289978, + "logps/rejected": -3.1312484741210938, + "loss": 0.5474, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.691738486289978, + "rewards/margins": 1.4395099878311157, + "rewards/rejected": -3.1312484741210938, + "sft_loss": 1.6973931789398193, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 13.30953173328577, + "learning_rate": 5.309358737025682e-07, + "logits/chosen": -0.1457730382680893, + "logits/rejected": 0.04253808781504631, + "logps/chosen": -1.786547064781189, + "logps/rejected": -3.726893186569214, + "loss": 0.5525, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.786547064781189, + "rewards/margins": 1.9403464794158936, + "rewards/rejected": -3.726893186569214, + "sft_loss": 1.837989091873169, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 12.139194174292625, + "learning_rate": 5.273742783058537e-07, + "logits/chosen": -0.09968717396259308, + "logits/rejected": 0.1227853074669838, + "logps/chosen": -1.7654327154159546, + "logps/rejected": -3.4704456329345703, + "loss": 0.5415, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7654327154159546, + "rewards/margins": 1.7050129175186157, + "rewards/rejected": -3.4704456329345703, + "sft_loss": 1.7903417348861694, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 9.577959973939867, + "learning_rate": 5.23822119328157e-07, + "logits/chosen": -0.2234119474887848, + "logits/rejected": 0.1251845508813858, + "logps/chosen": -1.6994167566299438, + "logps/rejected": -3.445323944091797, + "loss": 0.5213, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6994167566299438, + "rewards/margins": 1.7459074258804321, + "rewards/rejected": -3.445323944091797, + "sft_loss": 1.7618118524551392, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 8.35456484773565, + "learning_rate": 5.202794312325399e-07, + "logits/chosen": -0.18785110116004944, + "logits/rejected": 0.2039102017879486, + "logps/chosen": -1.8394941091537476, + "logps/rejected": -3.561155319213867, + "loss": 0.5432, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8394941091537476, + "rewards/margins": 1.7216612100601196, + "rewards/rejected": -3.561155319213867, + "sft_loss": 1.8366172313690186, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 12.0073634491718, + "learning_rate": 5.167462483901773e-07, + "logits/chosen": -0.16625425219535828, + "logits/rejected": -0.0023239790461957455, + "logps/chosen": -1.8147470951080322, + "logps/rejected": -3.441155195236206, + "loss": 0.5651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8147470951080322, + "rewards/margins": 1.626408576965332, + "rewards/rejected": -3.441155195236206, + "sft_loss": 1.7933012247085571, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 9.280989355670226, + "learning_rate": 5.132226050800256e-07, + "logits/chosen": -0.10036615282297134, + "logits/rejected": 0.023008223623037338, + "logps/chosen": -1.7863056659698486, + "logps/rejected": -3.2340240478515625, + "loss": 0.5709, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7863056659698486, + "rewards/margins": 1.447718620300293, + "rewards/rejected": -3.2340240478515625, + "sft_loss": 1.8817899227142334, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 9.99760092205392, + "learning_rate": 5.097085354884869e-07, + "logits/chosen": -0.10582532733678818, + "logits/rejected": 0.061595212668180466, + "logps/chosen": -1.7237050533294678, + "logps/rejected": -3.2690975666046143, + "loss": 0.5349, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7237050533294678, + "rewards/margins": 1.5453920364379883, + "rewards/rejected": -3.2690975666046143, + "sft_loss": 1.8347012996673584, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 16.739494504756493, + "learning_rate": 5.062040737090806e-07, + "logits/chosen": -0.2041359394788742, + "logits/rejected": 0.05440496653318405, + "logps/chosen": -1.8450266122817993, + "logps/rejected": -3.412508487701416, + "loss": 0.5655, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8450266122817993, + "rewards/margins": 1.5674818754196167, + "rewards/rejected": -3.412508487701416, + "sft_loss": 1.886196494102478, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 11.652650195235836, + "learning_rate": 5.027092537421091e-07, + "logits/chosen": -0.13598737120628357, + "logits/rejected": 0.16030022501945496, + "logps/chosen": -1.831965684890747, + "logps/rejected": -3.417522430419922, + "loss": 0.5562, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.831965684890747, + "rewards/margins": 1.5855567455291748, + "rewards/rejected": -3.417522430419922, + "sft_loss": 1.8175779581069946, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 17.4581711422509, + "learning_rate": 4.992241094943326e-07, + "logits/chosen": -0.1738833487033844, + "logits/rejected": 0.28060057759284973, + "logps/chosen": -1.8021186590194702, + "logps/rejected": -3.683699131011963, + "loss": 0.5179, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8021186590194702, + "rewards/margins": 1.8815805912017822, + "rewards/rejected": -3.683699131011963, + "sft_loss": 1.8290512561798096, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 13.679028455286472, + "learning_rate": 4.957486747786342e-07, + "logits/chosen": -0.05840907618403435, + "logits/rejected": 0.07495447248220444, + "logps/chosen": -1.7225723266601562, + "logps/rejected": -3.1784443855285645, + "loss": 0.5278, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7225723266601562, + "rewards/margins": 1.4558724164962769, + "rewards/rejected": -3.1784443855285645, + "sft_loss": 1.7190653085708618, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 10.263336372814239, + "learning_rate": 4.922829833136984e-07, + "logits/chosen": -0.2786995768547058, + "logits/rejected": 0.02108999527990818, + "logps/chosen": -1.7660636901855469, + "logps/rejected": -3.4693591594696045, + "loss": 0.5368, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7660636901855469, + "rewards/margins": 1.7032954692840576, + "rewards/rejected": -3.4693591594696045, + "sft_loss": 1.8371864557266235, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 14.78777525659592, + "learning_rate": 4.888270687236773e-07, + "logits/chosen": -0.0698157548904419, + "logits/rejected": 0.33996957540512085, + "logps/chosen": -1.844203233718872, + "logps/rejected": -3.4948344230651855, + "loss": 0.5632, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.844203233718872, + "rewards/margins": 1.650631308555603, + "rewards/rejected": -3.4948344230651855, + "sft_loss": 1.8448089361190796, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 11.605848690657226, + "learning_rate": 4.853809645378709e-07, + "logits/chosen": -0.12247265875339508, + "logits/rejected": 0.07382510602474213, + "logps/chosen": -1.9160645008087158, + "logps/rejected": -3.572585344314575, + "loss": 0.5848, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.9160645008087158, + "rewards/margins": 1.656521201133728, + "rewards/rejected": -3.572585344314575, + "sft_loss": 2.0068259239196777, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 11.465961025206745, + "learning_rate": 4.81944704190396e-07, + "logits/chosen": -0.19016000628471375, + "logits/rejected": -0.025381360203027725, + "logps/chosen": -1.7586004734039307, + "logps/rejected": -3.3733391761779785, + "loss": 0.5428, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7586004734039307, + "rewards/margins": 1.6147382259368896, + "rewards/rejected": -3.3733391761779785, + "sft_loss": 1.859588623046875, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 13.012619295863841, + "learning_rate": 4.785183210198667e-07, + "logits/chosen": -0.0383647158741951, + "logits/rejected": -0.06396958976984024, + "logps/chosen": -1.7566505670547485, + "logps/rejected": -3.5018608570098877, + "loss": 0.5177, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7566505670547485, + "rewards/margins": 1.7452104091644287, + "rewards/rejected": -3.5018608570098877, + "sft_loss": 1.855015754699707, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 13.414407789673984, + "learning_rate": 4.7510184826906626e-07, + "logits/chosen": -0.22793325781822205, + "logits/rejected": 0.05166008323431015, + "logps/chosen": -1.9015623331069946, + "logps/rejected": -3.5119667053222656, + "loss": 0.5761, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.9015623331069946, + "rewards/margins": 1.6104040145874023, + "rewards/rejected": -3.5119667053222656, + "sft_loss": 1.9202884435653687, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 12.77803177041485, + "learning_rate": 4.7169531908462953e-07, + "logits/chosen": -0.1987219750881195, + "logits/rejected": -0.06591467559337616, + "logps/chosen": -1.8806768655776978, + "logps/rejected": -3.3210761547088623, + "loss": 0.569, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8806768655776978, + "rewards/margins": 1.4403992891311646, + "rewards/rejected": -3.3210761547088623, + "sft_loss": 1.8865550756454468, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 8.365346335648512, + "learning_rate": 4.6829876651671636e-07, + "logits/chosen": -0.047934405505657196, + "logits/rejected": 0.14092543721199036, + "logps/chosen": -1.7616697549819946, + "logps/rejected": -3.305771589279175, + "loss": 0.5396, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7616697549819946, + "rewards/margins": 1.5441023111343384, + "rewards/rejected": -3.305771589279175, + "sft_loss": 1.7759618759155273, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 16.408527609658467, + "learning_rate": 4.64912223518696e-07, + "logits/chosen": -0.17129859328269958, + "logits/rejected": 0.026264000684022903, + "logps/chosen": -1.8189198970794678, + "logps/rejected": -3.6464335918426514, + "loss": 0.5141, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8189198970794678, + "rewards/margins": 1.8275134563446045, + "rewards/rejected": -3.6464335918426514, + "sft_loss": 1.9214423894882202, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 8.565563478898516, + "learning_rate": 4.615357229468221e-07, + "logits/chosen": -0.17534136772155762, + "logits/rejected": 0.1626088172197342, + "logps/chosen": -1.7512375116348267, + "logps/rejected": -3.546313762664795, + "loss": 0.5015, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7512375116348267, + "rewards/margins": 1.7950763702392578, + "rewards/rejected": -3.546313762664795, + "sft_loss": 1.7669951915740967, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 12.406748421040726, + "learning_rate": 4.581692975599192e-07, + "logits/chosen": -0.14899741113185883, + "logits/rejected": 0.1268022507429123, + "logps/chosen": -1.8081623315811157, + "logps/rejected": -3.2024803161621094, + "loss": 0.5774, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.8081623315811157, + "rewards/margins": 1.3943179845809937, + "rewards/rejected": -3.2024803161621094, + "sft_loss": 1.8954441547393799, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 7.905745882689756, + "learning_rate": 4.548129800190603e-07, + "logits/chosen": -0.19442422688007355, + "logits/rejected": 0.04682071506977081, + "logps/chosen": -1.7260282039642334, + "logps/rejected": -3.445497989654541, + "loss": 0.5062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7260282039642334, + "rewards/margins": 1.719469428062439, + "rewards/rejected": -3.445497989654541, + "sft_loss": 1.78412663936615, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 14.716758462611983, + "learning_rate": 4.5146680288725367e-07, + "logits/chosen": -0.17263540625572205, + "logits/rejected": 0.0990188717842102, + "logps/chosen": -1.7226759195327759, + "logps/rejected": -3.3513336181640625, + "loss": 0.5579, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7226759195327759, + "rewards/margins": 1.6286576986312866, + "rewards/rejected": -3.3513336181640625, + "sft_loss": 1.8069121837615967, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 9.244420649045196, + "learning_rate": 4.481307986291237e-07, + "logits/chosen": -0.1909504383802414, + "logits/rejected": 0.000193992251297459, + "logps/chosen": -1.854528784751892, + "logps/rejected": -3.418144941329956, + "loss": 0.5682, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.854528784751892, + "rewards/margins": 1.5636165142059326, + "rewards/rejected": -3.418144941329956, + "sft_loss": 1.8442004919052124, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 12.451343095718101, + "learning_rate": 4.4480499961059915e-07, + "logits/chosen": -0.13839785754680634, + "logits/rejected": -0.007095733191817999, + "logps/chosen": -1.7823936939239502, + "logps/rejected": -3.1268227100372314, + "loss": 0.5734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7823936939239502, + "rewards/margins": 1.3444294929504395, + "rewards/rejected": -3.1268227100372314, + "sft_loss": 1.7421305179595947, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 9.502520310548556, + "learning_rate": 4.414894380985959e-07, + "logits/chosen": -0.2586430013179779, + "logits/rejected": 0.06767354905605316, + "logps/chosen": -1.6843284368515015, + "logps/rejected": -3.4498672485351562, + "loss": 0.5072, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.6843284368515015, + "rewards/margins": 1.7655388116836548, + "rewards/rejected": -3.4498672485351562, + "sft_loss": 1.7884721755981445, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 13.702985197452257, + "learning_rate": 4.3818414626070703e-07, + "logits/chosen": -0.1704205572605133, + "logits/rejected": -0.08272019028663635, + "logps/chosen": -1.8510723114013672, + "logps/rejected": -3.326447010040283, + "loss": 0.5778, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8510723114013672, + "rewards/margins": 1.475374460220337, + "rewards/rejected": -3.326447010040283, + "sft_loss": 1.8802316188812256, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 15.894625661985877, + "learning_rate": 4.3488915616488757e-07, + "logits/chosen": -0.094416543841362, + "logits/rejected": -0.005819836165755987, + "logps/chosen": -1.8883235454559326, + "logps/rejected": -3.532397747039795, + "loss": 0.5487, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8883235454559326, + "rewards/margins": 1.6440744400024414, + "rewards/rejected": -3.532397747039795, + "sft_loss": 1.9055078029632568, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 10.532115045678573, + "learning_rate": 4.316044997791469e-07, + "logits/chosen": -0.25973159074783325, + "logits/rejected": -0.03989795595407486, + "logps/chosen": -1.8471174240112305, + "logps/rejected": -3.4026947021484375, + "loss": 0.5316, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.8471174240112305, + "rewards/margins": 1.5555775165557861, + "rewards/rejected": -3.4026947021484375, + "sft_loss": 1.9095427989959717, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 9.625215379597115, + "learning_rate": 4.283302089712348e-07, + "logits/chosen": -0.23575392365455627, + "logits/rejected": 0.14210402965545654, + "logps/chosen": -1.812212586402893, + "logps/rejected": -3.431943416595459, + "loss": 0.5148, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.812212586402893, + "rewards/margins": 1.6197311878204346, + "rewards/rejected": -3.431943416595459, + "sft_loss": 1.8462364673614502, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 11.218224313231298, + "learning_rate": 4.250663155083357e-07, + "logits/chosen": -0.07904136180877686, + "logits/rejected": -0.06815527379512787, + "logps/chosen": -1.7699768543243408, + "logps/rejected": -3.287299394607544, + "loss": 0.5626, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7699768543243408, + "rewards/margins": 1.5173224210739136, + "rewards/rejected": -3.287299394607544, + "sft_loss": 1.7975574731826782, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 14.30035500062535, + "learning_rate": 4.218128510567578e-07, + "logits/chosen": -0.1746416985988617, + "logits/rejected": 0.035987790673971176, + "logps/chosen": -1.700383186340332, + "logps/rejected": -3.543325424194336, + "loss": 0.4811, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.700383186340332, + "rewards/margins": 1.842942476272583, + "rewards/rejected": -3.543325424194336, + "sft_loss": 1.7460041046142578, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 14.162568631836546, + "learning_rate": 4.185698471816279e-07, + "logits/chosen": -0.2638034522533417, + "logits/rejected": 0.07215817272663116, + "logps/chosen": -1.8092349767684937, + "logps/rejected": -3.4781856536865234, + "loss": 0.5587, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8092349767684937, + "rewards/margins": 1.6689503192901611, + "rewards/rejected": -3.4781856536865234, + "sft_loss": 1.8944950103759766, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 8.184554287863792, + "learning_rate": 4.1533733534658326e-07, + "logits/chosen": -0.20982761681079865, + "logits/rejected": 0.12584097683429718, + "logps/chosen": -1.7665157318115234, + "logps/rejected": -3.4161884784698486, + "loss": 0.5444, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7665157318115234, + "rewards/margins": 1.6496728658676147, + "rewards/rejected": -3.4161884784698486, + "sft_loss": 1.8240835666656494, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 10.047872500559155, + "learning_rate": 4.121153469134686e-07, + "logits/chosen": -0.1830652505159378, + "logits/rejected": 0.015270600095391273, + "logps/chosen": -1.7029507160186768, + "logps/rejected": -3.1996734142303467, + "loss": 0.5528, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7029507160186768, + "rewards/margins": 1.4967228174209595, + "rewards/rejected": -3.1996734142303467, + "sft_loss": 1.7361400127410889, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 8.511297364088247, + "learning_rate": 4.089039131420292e-07, + "logits/chosen": -0.19872130453586578, + "logits/rejected": -0.007671922445297241, + "logps/chosen": -1.7023446559906006, + "logps/rejected": -3.085238456726074, + "loss": 0.5746, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7023446559906006, + "rewards/margins": 1.3828939199447632, + "rewards/rejected": -3.085238456726074, + "sft_loss": 1.770695447921753, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 11.969921138929104, + "learning_rate": 4.0570306518961027e-07, + "logits/chosen": -0.1552981734275818, + "logits/rejected": 0.1038542240858078, + "logps/chosen": -1.7612078189849854, + "logps/rejected": -3.5550270080566406, + "loss": 0.5463, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7612078189849854, + "rewards/margins": 1.7938191890716553, + "rewards/rejected": -3.5550270080566406, + "sft_loss": 1.792492151260376, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 9.20119844833594, + "learning_rate": 4.025128341108517e-07, + "logits/chosen": -0.2136857956647873, + "logits/rejected": 0.02675015665590763, + "logps/chosen": -1.8333107233047485, + "logps/rejected": -3.263059616088867, + "loss": 0.5733, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8333107233047485, + "rewards/margins": 1.429748773574829, + "rewards/rejected": -3.263059616088867, + "sft_loss": 1.905210256576538, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.44999179244041443, + "eval_logits/rejected": 0.5968734622001648, + "eval_logps/chosen": -2.064203977584839, + "eval_logps/rejected": -3.1107640266418457, + "eval_loss": 0.7454193830490112, + "eval_rewards/accuracies": 0.7054896354675293, + "eval_rewards/chosen": -2.064203977584839, + "eval_rewards/margins": 1.0465601682662964, + "eval_rewards/rejected": -3.1107640266418457, + "eval_runtime": 46.4553, + "eval_samples_per_second": 28.953, + "eval_sft_loss": 1.9592351913452148, + "eval_steps_per_second": 7.254, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 17.643542932785405, + "learning_rate": 3.9933325085739047e-07, + "logits/chosen": -0.2071973830461502, + "logits/rejected": -0.1868351399898529, + "logps/chosen": -1.6352430582046509, + "logps/rejected": -3.1291136741638184, + "loss": 0.508, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6352430582046509, + "rewards/margins": 1.4938703775405884, + "rewards/rejected": -3.1291136741638184, + "sft_loss": 1.6966040134429932, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 10.286590852030196, + "learning_rate": 3.9616434627755624e-07, + "logits/chosen": -0.1404072493314743, + "logits/rejected": -0.02690928615629673, + "logps/chosen": -1.9162696599960327, + "logps/rejected": -3.735090970993042, + "loss": 0.5324, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9162696599960327, + "rewards/margins": 1.818821668624878, + "rewards/rejected": -3.735090970993042, + "sft_loss": 1.9391191005706787, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 9.090805142604072, + "learning_rate": 3.930061511160762e-07, + "logits/chosen": -0.13879060745239258, + "logits/rejected": 0.14609341323375702, + "logps/chosen": -1.764460563659668, + "logps/rejected": -3.3435351848602295, + "loss": 0.5527, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.764460563659668, + "rewards/margins": 1.579074501991272, + "rewards/rejected": -3.3435351848602295, + "sft_loss": 1.7850478887557983, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 14.746188885325457, + "learning_rate": 3.898586960137726e-07, + "logits/chosen": -0.1557859629392624, + "logits/rejected": -0.026083847507834435, + "logps/chosen": -1.7631231546401978, + "logps/rejected": -3.1784684658050537, + "loss": 0.5437, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7631231546401978, + "rewards/margins": 1.4153454303741455, + "rewards/rejected": -3.1784684658050537, + "sft_loss": 1.7551311254501343, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 7.0098680012744685, + "learning_rate": 3.867220115072696e-07, + "logits/chosen": -0.16939975321292877, + "logits/rejected": -0.06898584216833115, + "logps/chosen": -1.5736382007598877, + "logps/rejected": -3.04780912399292, + "loss": 0.5003, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.5736382007598877, + "rewards/margins": 1.4741708040237427, + "rewards/rejected": -3.04780912399292, + "sft_loss": 1.6943461894989014, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 9.608896827672329, + "learning_rate": 3.8359612802869367e-07, + "logits/chosen": -0.2276763916015625, + "logits/rejected": 0.05887297913432121, + "logps/chosen": -1.7581098079681396, + "logps/rejected": -3.37579607963562, + "loss": 0.5385, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7581098079681396, + "rewards/margins": 1.6176865100860596, + "rewards/rejected": -3.37579607963562, + "sft_loss": 1.784156084060669, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 11.627245261504164, + "learning_rate": 3.8048107590537987e-07, + "logits/chosen": -0.22332949936389923, + "logits/rejected": 0.1258862465620041, + "logps/chosen": -1.797519326210022, + "logps/rejected": -3.420186996459961, + "loss": 0.5251, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.797519326210022, + "rewards/margins": 1.622667670249939, + "rewards/rejected": -3.420186996459961, + "sft_loss": 1.8621435165405273, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 10.671438733993377, + "learning_rate": 3.773768853595774e-07, + "logits/chosen": -0.30236560106277466, + "logits/rejected": 0.11213777214288712, + "logps/chosen": -1.7498910427093506, + "logps/rejected": -3.347071409225464, + "loss": 0.5433, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7498910427093506, + "rewards/margins": 1.5971804857254028, + "rewards/rejected": -3.347071409225464, + "sft_loss": 1.8056799173355103, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 11.9378726825724, + "learning_rate": 3.7428358650815706e-07, + "logits/chosen": -0.20918190479278564, + "logits/rejected": 0.14711831510066986, + "logps/chosen": -1.8133995532989502, + "logps/rejected": -3.165956735610962, + "loss": 0.6067, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8133995532989502, + "rewards/margins": 1.3525573015213013, + "rewards/rejected": -3.165956735610962, + "sft_loss": 1.847616195678711, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 18.760705928158977, + "learning_rate": 3.712012093623172e-07, + "logits/chosen": -0.13888953626155853, + "logits/rejected": 0.09737597405910492, + "logps/chosen": -1.838853120803833, + "logps/rejected": -3.607891082763672, + "loss": 0.546, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.838853120803833, + "rewards/margins": 1.7690378427505493, + "rewards/rejected": -3.607891082763672, + "sft_loss": 1.9086978435516357, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 16.21585305836798, + "learning_rate": 3.6812978382729524e-07, + "logits/chosen": -0.27212634682655334, + "logits/rejected": -0.08878588676452637, + "logps/chosen": -1.7782297134399414, + "logps/rejected": -3.4442741870880127, + "loss": 0.5404, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7782297134399414, + "rewards/margins": 1.6660445928573608, + "rewards/rejected": -3.4442741870880127, + "sft_loss": 1.8388874530792236, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 10.750367317481379, + "learning_rate": 3.650693397020744e-07, + "logits/chosen": -0.26451048254966736, + "logits/rejected": 0.0667090192437172, + "logps/chosen": -1.7671161890029907, + "logps/rejected": -3.529853343963623, + "loss": 0.5453, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7671161890029907, + "rewards/margins": 1.7627372741699219, + "rewards/rejected": -3.529853343963623, + "sft_loss": 1.8724162578582764, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 12.298892852396053, + "learning_rate": 3.6201990667909774e-07, + "logits/chosen": -0.26518934965133667, + "logits/rejected": 0.03896629437804222, + "logps/chosen": -1.8639923334121704, + "logps/rejected": -3.3952205181121826, + "loss": 0.5865, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8639923334121704, + "rewards/margins": 1.5312283039093018, + "rewards/rejected": -3.3952205181121826, + "sft_loss": 1.8920514583587646, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 10.507472476225365, + "learning_rate": 3.589815143439772e-07, + "logits/chosen": -0.09764394164085388, + "logits/rejected": 0.029933521524071693, + "logps/chosen": -1.705524206161499, + "logps/rejected": -3.198183536529541, + "loss": 0.5626, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.705524206161499, + "rewards/margins": 1.492659330368042, + "rewards/rejected": -3.198183536529541, + "sft_loss": 1.768334984779358, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 14.25602671123383, + "learning_rate": 3.559541921752091e-07, + "logits/chosen": -0.214193195104599, + "logits/rejected": 0.11325450241565704, + "logps/chosen": -1.8728907108306885, + "logps/rejected": -3.4027392864227295, + "loss": 0.5658, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8728907108306885, + "rewards/margins": 1.5298488140106201, + "rewards/rejected": -3.4027392864227295, + "sft_loss": 1.9027717113494873, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 13.716539716632326, + "learning_rate": 3.5293796954388565e-07, + "logits/chosen": -0.29858314990997314, + "logits/rejected": -0.0883362740278244, + "logps/chosen": -1.6380106210708618, + "logps/rejected": -2.9990859031677246, + "loss": 0.562, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6380106210708618, + "rewards/margins": 1.3610751628875732, + "rewards/rejected": -2.9990859031677246, + "sft_loss": 1.7269871234893799, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 12.231728070967494, + "learning_rate": 3.499328757134129e-07, + "logits/chosen": -0.08925610780715942, + "logits/rejected": 0.019029032438993454, + "logps/chosen": -1.823622465133667, + "logps/rejected": -3.4843475818634033, + "loss": 0.5306, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.823622465133667, + "rewards/margins": 1.6607252359390259, + "rewards/rejected": -3.4843475818634033, + "sft_loss": 1.8130333423614502, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 12.250659890107046, + "learning_rate": 3.469389398392237e-07, + "logits/chosen": -0.24888677895069122, + "logits/rejected": 0.078799307346344, + "logps/chosen": -1.7709062099456787, + "logps/rejected": -3.557257890701294, + "loss": 0.5098, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7709062099456787, + "rewards/margins": 1.7863517999649048, + "rewards/rejected": -3.557257890701294, + "sft_loss": 1.8200101852416992, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 12.730678276563896, + "learning_rate": 3.4395619096849764e-07, + "logits/chosen": -0.305867075920105, + "logits/rejected": 0.02817920409142971, + "logps/chosen": -1.8299095630645752, + "logps/rejected": -3.3529536724090576, + "loss": 0.5612, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8299095630645752, + "rewards/margins": 1.5230443477630615, + "rewards/rejected": -3.3529536724090576, + "sft_loss": 1.9097721576690674, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 12.339190127873952, + "learning_rate": 3.409846580398766e-07, + "logits/chosen": -0.12763968110084534, + "logits/rejected": -0.09702740609645844, + "logps/chosen": -1.7271099090576172, + "logps/rejected": -3.2270724773406982, + "loss": 0.5438, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7271099090576172, + "rewards/margins": 1.4999626874923706, + "rewards/rejected": -3.2270724773406982, + "sft_loss": 1.781528115272522, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 17.315883991753594, + "learning_rate": 3.380243698831869e-07, + "logits/chosen": -0.25092118978500366, + "logits/rejected": 0.0684286579489708, + "logps/chosen": -1.7663710117340088, + "logps/rejected": -3.2613213062286377, + "loss": 0.5525, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7663710117340088, + "rewards/margins": 1.494950532913208, + "rewards/rejected": -3.2613213062286377, + "sft_loss": 1.8157711029052734, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 14.133904776725748, + "learning_rate": 3.350753552191563e-07, + "logits/chosen": -0.24138808250427246, + "logits/rejected": 0.011843997053802013, + "logps/chosen": -1.7978967428207397, + "logps/rejected": -3.3991634845733643, + "loss": 0.5408, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7978967428207397, + "rewards/margins": 1.601266860961914, + "rewards/rejected": -3.3991634845733643, + "sft_loss": 1.801038384437561, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 11.188254604136892, + "learning_rate": 3.3213764265913915e-07, + "logits/chosen": -0.15768598020076752, + "logits/rejected": -0.040329623967409134, + "logps/chosen": -1.6997654438018799, + "logps/rejected": -3.045098066329956, + "loss": 0.563, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6997654438018799, + "rewards/margins": 1.3453330993652344, + "rewards/rejected": -3.045098066329956, + "sft_loss": 1.768566370010376, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 10.000101912949614, + "learning_rate": 3.292112607048343e-07, + "logits/chosen": -0.20942172408103943, + "logits/rejected": -0.018673386424779892, + "logps/chosen": -1.7404597997665405, + "logps/rejected": -3.327239513397217, + "loss": 0.5168, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.7404597997665405, + "rewards/margins": 1.5867795944213867, + "rewards/rejected": -3.327239513397217, + "sft_loss": 1.7671947479248047, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 14.781120515094285, + "learning_rate": 3.262962377480136e-07, + "logits/chosen": -0.27843308448791504, + "logits/rejected": 0.012016067281365395, + "logps/chosen": -1.7933326959609985, + "logps/rejected": -3.5190646648406982, + "loss": 0.5038, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7933326959609985, + "rewards/margins": 1.7257320880889893, + "rewards/rejected": -3.5190646648406982, + "sft_loss": 1.8551466464996338, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 17.379751864511018, + "learning_rate": 3.233926020702414e-07, + "logits/chosen": -0.2567359507083893, + "logits/rejected": -0.10769607126712799, + "logps/chosen": -1.7908289432525635, + "logps/rejected": -2.9884819984436035, + "loss": 0.6069, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7908289432525635, + "rewards/margins": 1.19765305519104, + "rewards/rejected": -2.9884819984436035, + "sft_loss": 1.7830078601837158, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 9.92014656389634, + "learning_rate": 3.205003818426047e-07, + "logits/chosen": -0.10162917524576187, + "logits/rejected": 0.063094362616539, + "logps/chosen": -1.7472941875457764, + "logps/rejected": -3.338911533355713, + "loss": 0.5585, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7472941875457764, + "rewards/margins": 1.5916169881820679, + "rewards/rejected": -3.338911533355713, + "sft_loss": 1.8631603717803955, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 15.432061735209999, + "learning_rate": 3.1761960512543627e-07, + "logits/chosen": -0.1377786248922348, + "logits/rejected": 0.0018278755014762282, + "logps/chosen": -1.71317458152771, + "logps/rejected": -3.239314556121826, + "loss": 0.5645, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.71317458152771, + "rewards/margins": 1.5261398553848267, + "rewards/rejected": -3.239314556121826, + "sft_loss": 1.7747024297714233, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 10.205720482051749, + "learning_rate": 3.147502998680447e-07, + "logits/chosen": -0.13008762896060944, + "logits/rejected": 0.029557768255472183, + "logps/chosen": -1.7398678064346313, + "logps/rejected": -3.333784818649292, + "loss": 0.5602, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7398678064346313, + "rewards/margins": 1.593916893005371, + "rewards/rejected": -3.333784818649292, + "sft_loss": 1.7981176376342773, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 11.087517393724978, + "learning_rate": 3.11892493908442e-07, + "logits/chosen": -0.22432152926921844, + "logits/rejected": -0.08385895192623138, + "logps/chosen": -1.6868541240692139, + "logps/rejected": -3.2441565990448, + "loss": 0.5468, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6868541240692139, + "rewards/margins": 1.5573023557662964, + "rewards/rejected": -3.2441565990448, + "sft_loss": 1.6810327768325806, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 9.065330947296532, + "learning_rate": 3.0904621497307437e-07, + "logits/chosen": -0.18360400199890137, + "logits/rejected": -0.06750941276550293, + "logps/chosen": -1.8119789361953735, + "logps/rejected": -3.222032070159912, + "loss": 0.5997, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8119789361953735, + "rewards/margins": 1.4100534915924072, + "rewards/rejected": -3.222032070159912, + "sft_loss": 1.913190245628357, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 12.271788183456376, + "learning_rate": 3.062114906765522e-07, + "logits/chosen": -0.3010988235473633, + "logits/rejected": 0.06047710031270981, + "logps/chosen": -1.761810302734375, + "logps/rejected": -3.507551670074463, + "loss": 0.5416, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.761810302734375, + "rewards/margins": 1.745741605758667, + "rewards/rejected": -3.507551670074463, + "sft_loss": 1.7494663000106812, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 14.433516942336327, + "learning_rate": 3.0338834852138346e-07, + "logits/chosen": -0.15207983553409576, + "logits/rejected": 0.00596159091219306, + "logps/chosen": -1.8344218730926514, + "logps/rejected": -3.4356167316436768, + "loss": 0.5405, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8344218730926514, + "rewards/margins": 1.6011947393417358, + "rewards/rejected": -3.4356167316436768, + "sft_loss": 1.7789312601089478, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 13.482372037361836, + "learning_rate": 3.0057681589770526e-07, + "logits/chosen": -0.16331283748149872, + "logits/rejected": 0.10025990009307861, + "logps/chosen": -1.831169843673706, + "logps/rejected": -3.5069286823272705, + "loss": 0.5554, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.831169843673706, + "rewards/margins": 1.675758719444275, + "rewards/rejected": -3.5069286823272705, + "sft_loss": 1.8912452459335327, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 12.934023132184983, + "learning_rate": 2.9777692008301993e-07, + "logits/chosen": -0.06551705300807953, + "logits/rejected": 0.02953300252556801, + "logps/chosen": -1.7379716634750366, + "logps/rejected": -3.3578503131866455, + "loss": 0.5195, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.7379716634750366, + "rewards/margins": 1.6198784112930298, + "rewards/rejected": -3.3578503131866455, + "sft_loss": 1.787366271018982, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 15.069769362589748, + "learning_rate": 2.949886882419284e-07, + "logits/chosen": -0.1420452892780304, + "logits/rejected": -0.06317319720983505, + "logps/chosen": -1.7161645889282227, + "logps/rejected": -3.232700824737549, + "loss": 0.5422, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7161645889282227, + "rewards/margins": 1.5165363550186157, + "rewards/rejected": -3.232700824737549, + "sft_loss": 1.7947757244110107, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 11.340771528217207, + "learning_rate": 2.92212147425869e-07, + "logits/chosen": -0.14552152156829834, + "logits/rejected": 0.0813545510172844, + "logps/chosen": -1.8014802932739258, + "logps/rejected": -3.518508195877075, + "loss": 0.5528, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8014802932739258, + "rewards/margins": 1.717028260231018, + "rewards/rejected": -3.518508195877075, + "sft_loss": 1.9085992574691772, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 12.494784258442676, + "learning_rate": 2.894473245728518e-07, + "logits/chosen": -0.2370075285434723, + "logits/rejected": -0.006445932202041149, + "logps/chosen": -1.718737244606018, + "logps/rejected": -3.3767809867858887, + "loss": 0.5538, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.718737244606018, + "rewards/margins": 1.6580438613891602, + "rewards/rejected": -3.3767809867858887, + "sft_loss": 1.814737319946289, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 21.774382492352622, + "learning_rate": 2.866942465072014e-07, + "logits/chosen": -0.2321983128786087, + "logits/rejected": -0.03561241179704666, + "logps/chosen": -1.802708387374878, + "logps/rejected": -3.5632636547088623, + "loss": 0.5672, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.802708387374878, + "rewards/margins": 1.7605549097061157, + "rewards/rejected": -3.5632636547088623, + "sft_loss": 1.8140722513198853, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 10.109533599048547, + "learning_rate": 2.839529399392924e-07, + "logits/chosen": -0.22016914188861847, + "logits/rejected": 0.14298439025878906, + "logps/chosen": -1.8716926574707031, + "logps/rejected": -3.670722484588623, + "loss": 0.5386, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8716926574707031, + "rewards/margins": 1.799030065536499, + "rewards/rejected": -3.670722484588623, + "sft_loss": 1.953478217124939, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 9.919476961497255, + "learning_rate": 2.812234314652937e-07, + "logits/chosen": -0.18086175620555878, + "logits/rejected": 0.08491306006908417, + "logps/chosen": -1.823604941368103, + "logps/rejected": -3.513345241546631, + "loss": 0.5581, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.823604941368103, + "rewards/margins": 1.689740538597107, + "rewards/rejected": -3.513345241546631, + "sft_loss": 1.879625678062439, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 13.542513430482426, + "learning_rate": 2.785057475669084e-07, + "logits/chosen": -0.2250836342573166, + "logits/rejected": 0.0073054940439760685, + "logps/chosen": -1.771472692489624, + "logps/rejected": -3.573692798614502, + "loss": 0.5247, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.771472692489624, + "rewards/margins": 1.802220106124878, + "rewards/rejected": -3.573692798614502, + "sft_loss": 1.8163973093032837, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 13.091639134361028, + "learning_rate": 2.75799914611117e-07, + "logits/chosen": -0.16010549664497375, + "logits/rejected": 0.07752474397420883, + "logps/chosen": -1.8313506841659546, + "logps/rejected": -3.6223537921905518, + "loss": 0.5446, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8313506841659546, + "rewards/margins": 1.7910032272338867, + "rewards/rejected": -3.6223537921905518, + "sft_loss": 1.8761584758758545, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 9.93265718865091, + "learning_rate": 2.7310595884992354e-07, + "logits/chosen": -0.12645861506462097, + "logits/rejected": 0.18549086153507233, + "logps/chosen": -1.6416807174682617, + "logps/rejected": -3.2978649139404297, + "loss": 0.5107, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6416807174682617, + "rewards/margins": 1.656184196472168, + "rewards/rejected": -3.2978649139404297, + "sft_loss": 1.7991886138916016, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 11.044243573023872, + "learning_rate": 2.7042390642009805e-07, + "logits/chosen": -0.22041960060596466, + "logits/rejected": -0.17886695265769958, + "logps/chosen": -1.7223641872406006, + "logps/rejected": -3.300487995147705, + "loss": 0.5636, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7223641872406006, + "rewards/margins": 1.578123927116394, + "rewards/rejected": -3.300487995147705, + "sft_loss": 1.7671864032745361, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 12.385348094564502, + "learning_rate": 2.6775378334292543e-07, + "logits/chosen": -0.054296769201755524, + "logits/rejected": 0.0722646489739418, + "logps/chosen": -1.7313636541366577, + "logps/rejected": -3.252239942550659, + "loss": 0.5358, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.7313636541366577, + "rewards/margins": 1.520876169204712, + "rewards/rejected": -3.252239942550659, + "sft_loss": 1.797353982925415, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 14.735766602782922, + "learning_rate": 2.650956155239512e-07, + "logits/chosen": -0.10592559725046158, + "logits/rejected": 0.16242524981498718, + "logps/chosen": -1.76133131980896, + "logps/rejected": -3.5244193077087402, + "loss": 0.5086, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.76133131980896, + "rewards/margins": 1.7630879878997803, + "rewards/rejected": -3.5244193077087402, + "sft_loss": 1.7872645854949951, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 15.34427977343139, + "learning_rate": 2.6244942875273093e-07, + "logits/chosen": -0.06889469921588898, + "logits/rejected": 0.09271882474422455, + "logps/chosen": -1.869799017906189, + "logps/rejected": -3.4388561248779297, + "loss": 0.5349, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.869799017906189, + "rewards/margins": 1.5690571069717407, + "rewards/rejected": -3.4388561248779297, + "sft_loss": 1.8109409809112549, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 10.461582072034766, + "learning_rate": 2.59815248702581e-07, + "logits/chosen": -0.1550874561071396, + "logits/rejected": 0.06648962199687958, + "logps/chosen": -1.7301225662231445, + "logps/rejected": -3.288156509399414, + "loss": 0.5342, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7301225662231445, + "rewards/margins": 1.5580341815948486, + "rewards/rejected": -3.288156509399414, + "sft_loss": 1.8114830255508423, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 9.686256783274853, + "learning_rate": 2.5719310093032695e-07, + "logits/chosen": -0.25174325704574585, + "logits/rejected": 0.12775930762290955, + "logps/chosen": -1.801027536392212, + "logps/rejected": -3.4213192462921143, + "loss": 0.5418, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.801027536392212, + "rewards/margins": 1.6202919483184814, + "rewards/rejected": -3.4213192462921143, + "sft_loss": 1.7672901153564453, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 10.986970716143642, + "learning_rate": 2.5458301087605876e-07, + "logits/chosen": -0.19087447226047516, + "logits/rejected": 0.0413995161652565, + "logps/chosen": -1.8017337322235107, + "logps/rejected": -3.272310256958008, + "loss": 0.5838, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.8017337322235107, + "rewards/margins": 1.470576286315918, + "rewards/rejected": -3.272310256958008, + "sft_loss": 1.8942054510116577, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 10.46111832932533, + "learning_rate": 2.5198500386288083e-07, + "logits/chosen": -0.12169595062732697, + "logits/rejected": 0.045823872089385986, + "logps/chosen": -1.7901885509490967, + "logps/rejected": -3.4956653118133545, + "loss": 0.5057, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7901885509490967, + "rewards/margins": 1.705476999282837, + "rewards/rejected": -3.4956653118133545, + "sft_loss": 1.8139406442642212, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 15.651885381161886, + "learning_rate": 2.493991050966694e-07, + "logits/chosen": -0.18022406101226807, + "logits/rejected": -0.04743504524230957, + "logps/chosen": -1.8339494466781616, + "logps/rejected": -3.3705623149871826, + "loss": 0.5623, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8339494466781616, + "rewards/margins": 1.5366131067276, + "rewards/rejected": -3.3705623149871826, + "sft_loss": 1.8706283569335938, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 14.293280222901767, + "learning_rate": 2.4682533966582494e-07, + "logits/chosen": -0.18216542899608612, + "logits/rejected": 0.03202268108725548, + "logps/chosen": -1.7299884557724, + "logps/rejected": -3.0353915691375732, + "loss": 0.5769, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7299884557724, + "rewards/margins": 1.3054029941558838, + "rewards/rejected": -3.0353915691375732, + "sft_loss": 1.789258599281311, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 12.750199691611652, + "learning_rate": 2.442637325410316e-07, + "logits/chosen": -0.057832587510347366, + "logits/rejected": 0.2335827797651291, + "logps/chosen": -1.7248417139053345, + "logps/rejected": -3.3812613487243652, + "loss": 0.5565, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7248417139053345, + "rewards/margins": 1.6564195156097412, + "rewards/rejected": -3.3812613487243652, + "sft_loss": 1.746360421180725, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 18.29241678857937, + "learning_rate": 2.417143085750122e-07, + "logits/chosen": -0.0375925675034523, + "logits/rejected": 0.11547158658504486, + "logps/chosen": -1.7524213790893555, + "logps/rejected": -3.4146087169647217, + "loss": 0.5355, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7524213790893555, + "rewards/margins": 1.6621872186660767, + "rewards/rejected": -3.4146087169647217, + "sft_loss": 1.798270583152771, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 10.692978286152742, + "learning_rate": 2.3917709250228994e-07, + "logits/chosen": -0.1646510511636734, + "logits/rejected": 0.18285346031188965, + "logps/chosen": -1.7572791576385498, + "logps/rejected": -3.271498203277588, + "loss": 0.534, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7572791576385498, + "rewards/margins": 1.5142189264297485, + "rewards/rejected": -3.271498203277588, + "sft_loss": 1.7719749212265015, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 14.719809421351123, + "learning_rate": 2.3665210893894557e-07, + "logits/chosen": -0.039440952241420746, + "logits/rejected": 0.04918104037642479, + "logps/chosen": -1.725327491760254, + "logps/rejected": -3.278855085372925, + "loss": 0.5536, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.725327491760254, + "rewards/margins": 1.5535273551940918, + "rewards/rejected": -3.278855085372925, + "sft_loss": 1.7282207012176514, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 10.885379905130568, + "learning_rate": 2.3413938238238157e-07, + "logits/chosen": -0.09351601451635361, + "logits/rejected": 0.19373974204063416, + "logps/chosen": -1.8117258548736572, + "logps/rejected": -3.4790234565734863, + "loss": 0.556, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8117258548736572, + "rewards/margins": 1.6672979593276978, + "rewards/rejected": -3.4790234565734863, + "sft_loss": 1.8795970678329468, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 10.498041294231749, + "learning_rate": 2.316389372110812e-07, + "logits/chosen": -0.2118692845106125, + "logits/rejected": 0.007659897208213806, + "logps/chosen": -1.747856855392456, + "logps/rejected": -3.2259974479675293, + "loss": 0.5612, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.747856855392456, + "rewards/margins": 1.4781407117843628, + "rewards/rejected": -3.2259974479675293, + "sft_loss": 1.8180797100067139, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 14.091805082180102, + "learning_rate": 2.2915079768437514e-07, + "logits/chosen": -0.06196912005543709, + "logits/rejected": 0.025976702570915222, + "logps/chosen": -1.8319848775863647, + "logps/rejected": -3.4811851978302, + "loss": 0.54, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8319848775863647, + "rewards/margins": 1.649200201034546, + "rewards/rejected": -3.4811851978302, + "sft_loss": 1.812852144241333, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 14.430961778590698, + "learning_rate": 2.2667498794220326e-07, + "logits/chosen": -0.17927943170070648, + "logits/rejected": 0.1337457150220871, + "logps/chosen": -1.8328558206558228, + "logps/rejected": -3.450277328491211, + "loss": 0.5479, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8328558206558228, + "rewards/margins": 1.6174217462539673, + "rewards/rejected": -3.450277328491211, + "sft_loss": 1.8390781879425049, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 11.75329047632989, + "learning_rate": 2.2421153200488332e-07, + "logits/chosen": -0.080213263630867, + "logits/rejected": -0.05093265697360039, + "logps/chosen": -1.8332271575927734, + "logps/rejected": -3.5902411937713623, + "loss": 0.5071, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8332271575927734, + "rewards/margins": 1.7570136785507202, + "rewards/rejected": -3.5902411937713623, + "sft_loss": 1.8736753463745117, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 13.470083247872166, + "learning_rate": 2.217604537728749e-07, + "logits/chosen": -0.1351408064365387, + "logits/rejected": 0.045610688626766205, + "logps/chosen": -1.6412384510040283, + "logps/rejected": -3.2661595344543457, + "loss": 0.4879, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.6412384510040283, + "rewards/margins": 1.6249208450317383, + "rewards/rejected": -3.2661595344543457, + "sft_loss": 1.7293357849121094, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 9.222599257137244, + "learning_rate": 2.1932177702655053e-07, + "logits/chosen": -0.2029998004436493, + "logits/rejected": -0.11901791393756866, + "logps/chosen": -1.8168365955352783, + "logps/rejected": -3.4140625, + "loss": 0.5476, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8168365955352783, + "rewards/margins": 1.5972263813018799, + "rewards/rejected": -3.4140625, + "sft_loss": 1.860943078994751, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 11.158699914225073, + "learning_rate": 2.1689552542596232e-07, + "logits/chosen": -0.08220269531011581, + "logits/rejected": 0.16050629317760468, + "logps/chosen": -1.742254614830017, + "logps/rejected": -3.5307483673095703, + "loss": 0.4996, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.742254614830017, + "rewards/margins": 1.7884941101074219, + "rewards/rejected": -3.5307483673095703, + "sft_loss": 1.8253253698349, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 10.093803537845512, + "learning_rate": 2.1448172251061338e-07, + "logits/chosen": 0.013688882812857628, + "logits/rejected": -0.09023362398147583, + "logps/chosen": -1.7495476007461548, + "logps/rejected": -3.1490283012390137, + "loss": 0.5315, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7495476007461548, + "rewards/margins": 1.3994805812835693, + "rewards/rejected": -3.1490283012390137, + "sft_loss": 1.7908941507339478, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 11.184649941495632, + "learning_rate": 2.1208039169923122e-07, + "logits/chosen": -0.17796917259693146, + "logits/rejected": 0.08611693233251572, + "logps/chosen": -1.880004644393921, + "logps/rejected": -3.462754487991333, + "loss": 0.5436, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.880004644393921, + "rewards/margins": 1.582749605178833, + "rewards/rejected": -3.462754487991333, + "sft_loss": 1.9490007162094116, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 13.925243624028903, + "learning_rate": 2.096915562895369e-07, + "logits/chosen": -0.12639853358268738, + "logits/rejected": -0.05879003927111626, + "logps/chosen": -1.8781654834747314, + "logps/rejected": -3.5193824768066406, + "loss": 0.5751, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.8781654834747314, + "rewards/margins": 1.6412169933319092, + "rewards/rejected": -3.5193824768066406, + "sft_loss": 2.001239061355591, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 11.151390897624402, + "learning_rate": 2.07315239458023e-07, + "logits/chosen": -0.11989846080541611, + "logits/rejected": 0.3027260899543762, + "logps/chosen": -1.857408881187439, + "logps/rejected": -3.7410550117492676, + "loss": 0.4824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.857408881187439, + "rewards/margins": 1.88364577293396, + "rewards/rejected": -3.7410550117492676, + "sft_loss": 1.8817813396453857, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 12.167077440955572, + "learning_rate": 2.0495146425972487e-07, + "logits/chosen": -0.22354164719581604, + "logits/rejected": 0.07373027503490448, + "logps/chosen": -1.765608549118042, + "logps/rejected": -3.5321717262268066, + "loss": 0.5464, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.765608549118042, + "rewards/margins": 1.766563057899475, + "rewards/rejected": -3.5321717262268066, + "sft_loss": 1.8596305847167969, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 13.297410085708984, + "learning_rate": 2.0260025362800078e-07, + "logits/chosen": -0.2584924101829529, + "logits/rejected": -0.12692752480506897, + "logps/chosen": -1.812424898147583, + "logps/rejected": -3.632875442504883, + "loss": 0.4927, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.812424898147583, + "rewards/margins": 1.820450782775879, + "rewards/rejected": -3.632875442504883, + "sft_loss": 1.8927667140960693, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 14.149462733267194, + "learning_rate": 2.002616303743059e-07, + "logits/chosen": -0.23086294531822205, + "logits/rejected": 0.05471482127904892, + "logps/chosen": -1.988516092300415, + "logps/rejected": -3.7733561992645264, + "loss": 0.5631, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.988516092300415, + "rewards/margins": 1.7848399877548218, + "rewards/rejected": -3.7733561992645264, + "sft_loss": 2.062723398208618, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 20.085394294617718, + "learning_rate": 1.979356171879738e-07, + "logits/chosen": -0.13205042481422424, + "logits/rejected": 0.0894799530506134, + "logps/chosen": -1.877374291419983, + "logps/rejected": -3.731405735015869, + "loss": 0.5138, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.877374291419983, + "rewards/margins": 1.8540315628051758, + "rewards/rejected": -3.731405735015869, + "sft_loss": 1.92086923122406, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 16.78769911346286, + "learning_rate": 1.9562223663599399e-07, + "logits/chosen": -0.07414297759532928, + "logits/rejected": 0.09883741289377213, + "logps/chosen": -1.8357845544815063, + "logps/rejected": -3.586291790008545, + "loss": 0.5352, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8357845544815063, + "rewards/margins": 1.7505077123641968, + "rewards/rejected": -3.586291790008545, + "sft_loss": 1.860817313194275, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 12.369865830612776, + "learning_rate": 1.9332151116279557e-07, + "logits/chosen": -0.18610504269599915, + "logits/rejected": -0.0502205491065979, + "logps/chosen": -1.7904930114746094, + "logps/rejected": -3.3864052295684814, + "loss": 0.5387, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7904930114746094, + "rewards/margins": 1.5959125757217407, + "rewards/rejected": -3.3864052295684814, + "sft_loss": 1.866620659828186, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 14.31104044382298, + "learning_rate": 1.9103346309002623e-07, + "logits/chosen": -0.15936878323554993, + "logits/rejected": -0.06936420500278473, + "logps/chosen": -1.8094561100006104, + "logps/rejected": -3.239793062210083, + "loss": 0.5843, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8094561100006104, + "rewards/margins": 1.4303371906280518, + "rewards/rejected": -3.239793062210083, + "sft_loss": 1.8103708028793335, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 11.573837494960328, + "learning_rate": 1.887581146163394e-07, + "logits/chosen": -0.21960671246051788, + "logits/rejected": -0.011276873759925365, + "logps/chosen": -1.8357484340667725, + "logps/rejected": -3.542323350906372, + "loss": 0.5809, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8357484340667725, + "rewards/margins": 1.7065750360488892, + "rewards/rejected": -3.542323350906372, + "sft_loss": 1.8496434688568115, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 13.238059687502597, + "learning_rate": 1.8649548781717506e-07, + "logits/chosen": -0.09515713155269623, + "logits/rejected": 0.08226939290761948, + "logps/chosen": -1.8203575611114502, + "logps/rejected": -3.3756203651428223, + "loss": 0.5381, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8203575611114502, + "rewards/margins": 1.555262804031372, + "rewards/rejected": -3.3756203651428223, + "sft_loss": 1.8084625005722046, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 9.594684997285926, + "learning_rate": 1.8424560464454891e-07, + "logits/chosen": -0.2160584032535553, + "logits/rejected": -0.02173597738146782, + "logps/chosen": -1.7369788885116577, + "logps/rejected": -3.13492488861084, + "loss": 0.5581, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7369788885116577, + "rewards/margins": 1.3979461193084717, + "rewards/rejected": -3.13492488861084, + "sft_loss": 1.8426496982574463, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.5046150088310242, + "eval_logits/rejected": 0.6573245525360107, + "eval_logps/chosen": -2.0441954135894775, + "eval_logps/rejected": -3.071851968765259, + "eval_loss": 0.7417393922805786, + "eval_rewards/accuracies": 0.7017804384231567, + "eval_rewards/chosen": -2.0441954135894775, + "eval_rewards/margins": 1.0276561975479126, + "eval_rewards/rejected": -3.071851968765259, + "eval_runtime": 47.8029, + "eval_samples_per_second": 28.136, + "eval_sft_loss": 1.9637134075164795, + "eval_steps_per_second": 7.05, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 12.447563754874988, + "learning_rate": 1.820084869268369e-07, + "logits/chosen": -0.21850493550300598, + "logits/rejected": -0.04778672754764557, + "logps/chosen": -1.7985130548477173, + "logps/rejected": -3.3076224327087402, + "loss": 0.562, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7985130548477173, + "rewards/margins": 1.5091097354888916, + "rewards/rejected": -3.3076224327087402, + "sft_loss": 1.8247474431991577, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 13.0918948619155, + "learning_rate": 1.7978415636856571e-07, + "logits/chosen": -0.12160871177911758, + "logits/rejected": 0.03960564360022545, + "logps/chosen": -1.7965052127838135, + "logps/rejected": -3.361992359161377, + "loss": 0.5683, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7965052127838135, + "rewards/margins": 1.5654871463775635, + "rewards/rejected": -3.361992359161377, + "sft_loss": 1.8337568044662476, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 13.7797932165533, + "learning_rate": 1.7757263455019906e-07, + "logits/chosen": -0.17800372838974, + "logits/rejected": 0.06227899715304375, + "logps/chosen": -1.6353908777236938, + "logps/rejected": -3.2979979515075684, + "loss": 0.5476, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6353908777236938, + "rewards/margins": 1.6626074314117432, + "rewards/rejected": -3.2979979515075684, + "sft_loss": 1.723488211631775, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 18.837965669479228, + "learning_rate": 1.7537394292793245e-07, + "logits/chosen": -0.09702011197805405, + "logits/rejected": 0.04160100966691971, + "logps/chosen": -1.8251625299453735, + "logps/rejected": -3.1852543354034424, + "loss": 0.5801, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8251625299453735, + "rewards/margins": 1.3600914478302002, + "rewards/rejected": -3.1852543354034424, + "sft_loss": 1.8348585367202759, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 10.661240642290903, + "learning_rate": 1.731881028334808e-07, + "logits/chosen": -0.14085690677165985, + "logits/rejected": 0.08010586351156235, + "logps/chosen": -1.7059447765350342, + "logps/rejected": -3.2035980224609375, + "loss": 0.5339, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7059447765350342, + "rewards/margins": 1.4976532459259033, + "rewards/rejected": -3.2035980224609375, + "sft_loss": 1.7304388284683228, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 9.89564955553788, + "learning_rate": 1.7101513547387487e-07, + "logits/chosen": -0.19714686274528503, + "logits/rejected": 0.04681064933538437, + "logps/chosen": -1.7345222234725952, + "logps/rejected": -3.2551662921905518, + "loss": 0.5284, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.7345222234725952, + "rewards/margins": 1.5206438302993774, + "rewards/rejected": -3.2551662921905518, + "sft_loss": 1.7617794275283813, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 20.774074596573787, + "learning_rate": 1.6885506193125306e-07, + "logits/chosen": -0.309567391872406, + "logits/rejected": -0.00635856669396162, + "logps/chosen": -1.7985738515853882, + "logps/rejected": -3.5609755516052246, + "loss": 0.5236, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7985738515853882, + "rewards/margins": 1.7624021768569946, + "rewards/rejected": -3.5609755516052246, + "sft_loss": 1.8604736328125, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 13.297055407552948, + "learning_rate": 1.667079031626591e-07, + "logits/chosen": -0.22957918047904968, + "logits/rejected": 0.125446155667305, + "logps/chosen": -1.7423770427703857, + "logps/rejected": -3.4894371032714844, + "loss": 0.51, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7423770427703857, + "rewards/margins": 1.7470604181289673, + "rewards/rejected": -3.4894371032714844, + "sft_loss": 1.779083013534546, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 11.427636309390813, + "learning_rate": 1.6457367999983568e-07, + "logits/chosen": -0.15671579539775848, + "logits/rejected": -0.06081641837954521, + "logps/chosen": -1.7776477336883545, + "logps/rejected": -3.3175289630889893, + "loss": 0.5544, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7776477336883545, + "rewards/margins": 1.5398811101913452, + "rewards/rejected": -3.3175289630889893, + "sft_loss": 1.8607282638549805, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 12.595993856207159, + "learning_rate": 1.6245241314902604e-07, + "logits/chosen": -0.32584601640701294, + "logits/rejected": -0.05513492971658707, + "logps/chosen": -1.7751858234405518, + "logps/rejected": -3.486859083175659, + "loss": 0.5335, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7751858234405518, + "rewards/margins": 1.7116730213165283, + "rewards/rejected": -3.486859083175659, + "sft_loss": 1.793172836303711, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 12.650737116268983, + "learning_rate": 1.6034412319077008e-07, + "logits/chosen": -0.08883605897426605, + "logits/rejected": 0.16650724411010742, + "logps/chosen": -1.7191474437713623, + "logps/rejected": -3.4780402183532715, + "loss": 0.5493, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7191474437713623, + "rewards/margins": 1.7588927745819092, + "rewards/rejected": -3.4780402183532715, + "sft_loss": 1.8166354894638062, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 10.588617379104743, + "learning_rate": 1.582488305797068e-07, + "logits/chosen": -0.13550075888633728, + "logits/rejected": 0.012206335552036762, + "logps/chosen": -1.6531245708465576, + "logps/rejected": -3.258253574371338, + "loss": 0.5075, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.6531245708465576, + "rewards/margins": 1.6051290035247803, + "rewards/rejected": -3.258253574371338, + "sft_loss": 1.7562519311904907, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 9.513595547076909, + "learning_rate": 1.5616655564437354e-07, + "logits/chosen": -0.2985331416130066, + "logits/rejected": -0.12871314585208893, + "logps/chosen": -1.783439040184021, + "logps/rejected": -3.5234837532043457, + "loss": 0.5195, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.783439040184021, + "rewards/margins": 1.7400442361831665, + "rewards/rejected": -3.5234837532043457, + "sft_loss": 1.803736925125122, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 10.722593488383868, + "learning_rate": 1.5409731858701154e-07, + "logits/chosen": -0.08731357753276825, + "logits/rejected": 0.08228771388530731, + "logps/chosen": -1.6720569133758545, + "logps/rejected": -3.4969024658203125, + "loss": 0.4825, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.6720569133758545, + "rewards/margins": 1.8248456716537476, + "rewards/rejected": -3.4969024658203125, + "sft_loss": 1.6960939168930054, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 14.57535968621479, + "learning_rate": 1.5204113948336717e-07, + "logits/chosen": 0.012193548493087292, + "logits/rejected": 0.1711004674434662, + "logps/chosen": -1.70675528049469, + "logps/rejected": -3.585901975631714, + "loss": 0.51, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.70675528049469, + "rewards/margins": 1.879146933555603, + "rewards/rejected": -3.585901975631714, + "sft_loss": 1.8017257452011108, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 9.169941429507046, + "learning_rate": 1.499980382824997e-07, + "logits/chosen": -0.07882848381996155, + "logits/rejected": 0.1298879235982895, + "logps/chosen": -1.7168487310409546, + "logps/rejected": -3.5460994243621826, + "loss": 0.5374, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7168487310409546, + "rewards/margins": 1.8292505741119385, + "rewards/rejected": -3.5460994243621826, + "sft_loss": 1.80367910861969, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 13.128022621245504, + "learning_rate": 1.479680348065855e-07, + "logits/chosen": -0.06857812404632568, + "logits/rejected": 0.012163696810603142, + "logps/chosen": -1.852543592453003, + "logps/rejected": -3.7211203575134277, + "loss": 0.554, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.852543592453003, + "rewards/margins": 1.868577241897583, + "rewards/rejected": -3.7211203575134277, + "sft_loss": 1.9871208667755127, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 11.92261447993522, + "learning_rate": 1.4595114875072762e-07, + "logits/chosen": -0.3111642003059387, + "logits/rejected": -0.004455783870071173, + "logps/chosen": -1.769470453262329, + "logps/rejected": -3.4464504718780518, + "loss": 0.554, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.769470453262329, + "rewards/margins": 1.6769797801971436, + "rewards/rejected": -3.4464504718780518, + "sft_loss": 1.8530012369155884, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 12.377104674604613, + "learning_rate": 1.4394739968276293e-07, + "logits/chosen": -0.17811310291290283, + "logits/rejected": -0.06515610218048096, + "logps/chosen": -1.8002650737762451, + "logps/rejected": -3.0800435543060303, + "loss": 0.6032, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.8002650737762451, + "rewards/margins": 1.2797784805297852, + "rewards/rejected": -3.0800435543060303, + "sft_loss": 1.8826320171356201, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 9.965069281860176, + "learning_rate": 1.4195680704307405e-07, + "logits/chosen": -0.06343531608581543, + "logits/rejected": 0.16841797530651093, + "logps/chosen": -1.6909034252166748, + "logps/rejected": -3.36090087890625, + "loss": 0.5128, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6909034252166748, + "rewards/margins": 1.6699978113174438, + "rewards/rejected": -3.36090087890625, + "sft_loss": 1.759202241897583, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 11.377090498412477, + "learning_rate": 1.3997939014439926e-07, + "logits/chosen": -0.0794590562582016, + "logits/rejected": 0.16901133954524994, + "logps/chosen": -1.8307081460952759, + "logps/rejected": -3.487328052520752, + "loss": 0.5235, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8307081460952759, + "rewards/margins": 1.6566200256347656, + "rewards/rejected": -3.487328052520752, + "sft_loss": 1.9024460315704346, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 11.126550537311987, + "learning_rate": 1.380151681716465e-07, + "logits/chosen": -0.10807999223470688, + "logits/rejected": -0.16313369572162628, + "logps/chosen": -1.82060968875885, + "logps/rejected": -3.688711166381836, + "loss": 0.5498, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.82060968875885, + "rewards/margins": 1.868101716041565, + "rewards/rejected": -3.688711166381836, + "sft_loss": 1.8633701801300049, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 13.851309715159937, + "learning_rate": 1.3606416018170502e-07, + "logits/chosen": -0.1272238790988922, + "logits/rejected": 0.0939817801117897, + "logps/chosen": -1.6587791442871094, + "logps/rejected": -3.3085055351257324, + "loss": 0.5353, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.6587791442871094, + "rewards/margins": 1.6497268676757812, + "rewards/rejected": -3.3085055351257324, + "sft_loss": 1.749477744102478, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 11.026766281995744, + "learning_rate": 1.3412638510326397e-07, + "logits/chosen": -0.12887075543403625, + "logits/rejected": 0.02745388075709343, + "logps/chosen": -1.7454307079315186, + "logps/rejected": -3.394977569580078, + "loss": 0.5568, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7454307079315186, + "rewards/margins": 1.6495468616485596, + "rewards/rejected": -3.394977569580078, + "sft_loss": 1.8268448114395142, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 14.997638937730077, + "learning_rate": 1.3220186173662462e-07, + "logits/chosen": -0.3330609202384949, + "logits/rejected": 0.06650768965482712, + "logps/chosen": -1.735339879989624, + "logps/rejected": -3.4855971336364746, + "loss": 0.537, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.735339879989624, + "rewards/margins": 1.7502572536468506, + "rewards/rejected": -3.4855971336364746, + "sft_loss": 1.8361396789550781, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 13.814316314857964, + "learning_rate": 1.30290608753522e-07, + "logits/chosen": -0.1492258906364441, + "logits/rejected": 0.14451850950717926, + "logps/chosen": -1.8341033458709717, + "logps/rejected": -3.687492847442627, + "loss": 0.5149, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8341033458709717, + "rewards/margins": 1.8533893823623657, + "rewards/rejected": -3.687492847442627, + "sft_loss": 1.835182547569275, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 17.351310846159475, + "learning_rate": 1.2839264469694039e-07, + "logits/chosen": -0.22598882019519806, + "logits/rejected": 0.0706649050116539, + "logps/chosen": -1.8030054569244385, + "logps/rejected": -3.439702272415161, + "loss": 0.5703, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8030054569244385, + "rewards/margins": 1.6366968154907227, + "rewards/rejected": -3.439702272415161, + "sft_loss": 1.8270823955535889, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 13.03043262410308, + "learning_rate": 1.2650798798093577e-07, + "logits/chosen": -0.17963027954101562, + "logits/rejected": -0.021970821544528008, + "logps/chosen": -1.7476119995117188, + "logps/rejected": -3.096475601196289, + "loss": 0.5663, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7476119995117188, + "rewards/margins": 1.3488636016845703, + "rewards/rejected": -3.096475601196289, + "sft_loss": 1.794965147972107, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 10.868616941883515, + "learning_rate": 1.2463665689045533e-07, + "logits/chosen": -0.15986979007720947, + "logits/rejected": 0.1507270336151123, + "logps/chosen": -1.740025281906128, + "logps/rejected": -3.535895586013794, + "loss": 0.5162, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.740025281906128, + "rewards/margins": 1.7958704233169556, + "rewards/rejected": -3.535895586013794, + "sft_loss": 1.7941840887069702, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 12.772663086577158, + "learning_rate": 1.2277866958116207e-07, + "logits/chosen": -0.1350887268781662, + "logits/rejected": 0.13923409581184387, + "logps/chosen": -1.8032842874526978, + "logps/rejected": -3.202432632446289, + "loss": 0.5713, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.8032842874526978, + "rewards/margins": 1.3991484642028809, + "rewards/rejected": -3.202432632446289, + "sft_loss": 1.805890679359436, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 10.774743614579105, + "learning_rate": 1.2093404407925668e-07, + "logits/chosen": -0.1534949690103531, + "logits/rejected": -0.10646752268075943, + "logps/chosen": -1.8330596685409546, + "logps/rejected": -3.3806934356689453, + "loss": 0.5521, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8330596685409546, + "rewards/margins": 1.5476337671279907, + "rewards/rejected": -3.3806934356689453, + "sft_loss": 1.9304492473602295, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 12.074809040302313, + "learning_rate": 1.1910279828130405e-07, + "logits/chosen": -0.09574166685342789, + "logits/rejected": 0.05776409059762955, + "logps/chosen": -1.7001692056655884, + "logps/rejected": -3.1842257976531982, + "loss": 0.5503, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7001692056655884, + "rewards/margins": 1.484057068824768, + "rewards/rejected": -3.1842257976531982, + "sft_loss": 1.759682059288025, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 16.580193681475954, + "learning_rate": 1.1728494995405876e-07, + "logits/chosen": -0.22148558497428894, + "logits/rejected": 0.0023852705489844084, + "logps/chosen": -1.6515419483184814, + "logps/rejected": -3.407543659210205, + "loss": 0.5244, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6515419483184814, + "rewards/margins": 1.7560014724731445, + "rewards/rejected": -3.407543659210205, + "sft_loss": 1.7370100021362305, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 18.67151226763764, + "learning_rate": 1.1548051673429366e-07, + "logits/chosen": -0.07205932587385178, + "logits/rejected": 0.022912006825208664, + "logps/chosen": -1.6140819787979126, + "logps/rejected": -3.376591444015503, + "loss": 0.5139, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6140819787979126, + "rewards/margins": 1.7625093460083008, + "rewards/rejected": -3.376591444015503, + "sft_loss": 1.646588683128357, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 11.77285577417867, + "learning_rate": 1.136895161286271e-07, + "logits/chosen": -0.09294477105140686, + "logits/rejected": -0.0116586210206151, + "logps/chosen": -1.8401886224746704, + "logps/rejected": -3.378535509109497, + "loss": 0.5324, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8401886224746704, + "rewards/margins": 1.538346767425537, + "rewards/rejected": -3.378535509109497, + "sft_loss": 1.8359954357147217, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 12.273981942089009, + "learning_rate": 1.1191196551335547e-07, + "logits/chosen": 0.015403158962726593, + "logits/rejected": 0.08249323815107346, + "logps/chosen": -1.9144912958145142, + "logps/rejected": -3.440058946609497, + "loss": 0.5829, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9144912958145142, + "rewards/margins": 1.525567650794983, + "rewards/rejected": -3.440058946609497, + "sft_loss": 1.866431474685669, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 13.011751079955705, + "learning_rate": 1.1014788213428206e-07, + "logits/chosen": -0.10495848953723907, + "logits/rejected": 0.18491685390472412, + "logps/chosen": -1.7197036743164062, + "logps/rejected": -3.4352753162384033, + "loss": 0.5328, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7197036743164062, + "rewards/margins": 1.715571641921997, + "rewards/rejected": -3.4352753162384033, + "sft_loss": 1.7532308101654053, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 13.754293706238059, + "learning_rate": 1.08397283106552e-07, + "logits/chosen": -0.29204261302948, + "logits/rejected": -0.018084803596138954, + "logps/chosen": -1.6970350742340088, + "logps/rejected": -3.4211604595184326, + "loss": 0.5056, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6970350742340088, + "rewards/margins": 1.7241252660751343, + "rewards/rejected": -3.4211604595184326, + "sft_loss": 1.7624849081039429, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 10.523455836760576, + "learning_rate": 1.0666018541448442e-07, + "logits/chosen": -0.19858674705028534, + "logits/rejected": -0.20855844020843506, + "logps/chosen": -1.7655675411224365, + "logps/rejected": -3.197678804397583, + "loss": 0.561, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7655675411224365, + "rewards/margins": 1.4321115016937256, + "rewards/rejected": -3.197678804397583, + "sft_loss": 1.834460973739624, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 11.925629650215713, + "learning_rate": 1.0493660591140919e-07, + "logits/chosen": -0.15942314267158508, + "logits/rejected": -0.09754550457000732, + "logps/chosen": -1.831282615661621, + "logps/rejected": -3.3784000873565674, + "loss": 0.5714, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.831282615661621, + "rewards/margins": 1.5471172332763672, + "rewards/rejected": -3.3784000873565674, + "sft_loss": 1.872881293296814, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 11.29527891765967, + "learning_rate": 1.0322656131950165e-07, + "logits/chosen": -0.05539718270301819, + "logits/rejected": 0.03757456690073013, + "logps/chosen": -1.790489912033081, + "logps/rejected": -3.222160816192627, + "loss": 0.5455, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.790489912033081, + "rewards/margins": 1.431671142578125, + "rewards/rejected": -3.222160816192627, + "sft_loss": 1.7970340251922607, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 14.081814143998702, + "learning_rate": 1.0153006822962246e-07, + "logits/chosen": -0.029426341876387596, + "logits/rejected": 0.08542537689208984, + "logps/chosen": -1.8488479852676392, + "logps/rejected": -3.433168888092041, + "loss": 0.5694, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8488479852676392, + "rewards/margins": 1.5843212604522705, + "rewards/rejected": -3.433168888092041, + "sft_loss": 1.8777506351470947, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 15.286550642554765, + "learning_rate": 9.984714310115434e-08, + "logits/chosen": -0.17160436511039734, + "logits/rejected": -0.05323296785354614, + "logps/chosen": -1.9492241144180298, + "logps/rejected": -3.5698063373565674, + "loss": 0.5472, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9492241144180298, + "rewards/margins": 1.6205823421478271, + "rewards/rejected": -3.5698063373565674, + "sft_loss": 1.795431137084961, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 19.699631445509663, + "learning_rate": 9.817780226184509e-08, + "logits/chosen": -0.21670648455619812, + "logits/rejected": 0.16964387893676758, + "logps/chosen": -1.7380412817001343, + "logps/rejected": -3.4298624992370605, + "loss": 0.5178, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7380412817001343, + "rewards/margins": 1.6918213367462158, + "rewards/rejected": -3.4298624992370605, + "sft_loss": 1.7912161350250244, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 9.77781571435363, + "learning_rate": 9.652206190764611e-08, + "logits/chosen": -0.2179642617702484, + "logits/rejected": 0.004683566279709339, + "logps/chosen": -1.7119777202606201, + "logps/rejected": -3.185180187225342, + "loss": 0.5484, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7119777202606201, + "rewards/margins": 1.4732027053833008, + "rewards/rejected": -3.185180187225342, + "sft_loss": 1.7350578308105469, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 13.703589343704314, + "learning_rate": 9.487993810255823e-08, + "logits/chosen": -0.1822083592414856, + "logits/rejected": -0.05674133449792862, + "logps/chosen": -1.7465932369232178, + "logps/rejected": -3.5221877098083496, + "loss": 0.5369, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7465932369232178, + "rewards/margins": 1.775594711303711, + "rewards/rejected": -3.5221877098083496, + "sft_loss": 1.764044165611267, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 16.015438120699894, + "learning_rate": 9.325144677847325e-08, + "logits/chosen": -0.1478969156742096, + "logits/rejected": 0.017214369028806686, + "logps/chosen": -1.8455041646957397, + "logps/rejected": -3.473567247390747, + "loss": 0.5391, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.8455041646957397, + "rewards/margins": 1.6280628442764282, + "rewards/rejected": -3.473567247390747, + "sft_loss": 1.917488694190979, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 13.2177412605901, + "learning_rate": 9.163660373502158e-08, + "logits/chosen": 0.07100075483322144, + "logits/rejected": -0.0008317396277561784, + "logps/chosen": -1.8566402196884155, + "logps/rejected": -3.364086866378784, + "loss": 0.5781, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8566402196884155, + "rewards/margins": 1.5074464082717896, + "rewards/rejected": -3.364086866378784, + "sft_loss": 1.8414785861968994, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 26.07751623268177, + "learning_rate": 9.003542463941711e-08, + "logits/chosen": -0.010251370258629322, + "logits/rejected": -0.03044726513326168, + "logps/chosen": -1.6916160583496094, + "logps/rejected": -3.302567720413208, + "loss": 0.5536, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6916160583496094, + "rewards/margins": 1.6109517812728882, + "rewards/rejected": -3.302567720413208, + "sft_loss": 1.7145153284072876, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 8.841371119449688, + "learning_rate": 8.844792502630705e-08, + "logits/chosen": -0.12310369312763214, + "logits/rejected": -0.0008263051277026534, + "logps/chosen": -1.614935278892517, + "logps/rejected": -3.2148826122283936, + "loss": 0.5031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.614935278892517, + "rewards/margins": 1.599947214126587, + "rewards/rejected": -3.2148826122283936, + "sft_loss": 1.6776469945907593, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 9.37571104363395, + "learning_rate": 8.687412029761866e-08, + "logits/chosen": -0.2811339497566223, + "logits/rejected": -0.1394878774881363, + "logps/chosen": -1.6294755935668945, + "logps/rejected": -3.3096911907196045, + "loss": 0.5056, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6294755935668945, + "rewards/margins": 1.680215835571289, + "rewards/rejected": -3.3096911907196045, + "sft_loss": 1.667616605758667, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 12.865506560573404, + "learning_rate": 8.531402572241325e-08, + "logits/chosen": -0.09491724520921707, + "logits/rejected": 0.00872437097132206, + "logps/chosen": -1.6955277919769287, + "logps/rejected": -3.08903169631958, + "loss": 0.5858, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.6955277919769287, + "rewards/margins": 1.3935037851333618, + "rewards/rejected": -3.08903169631958, + "sft_loss": 1.7838642597198486, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 16.708463944884052, + "learning_rate": 8.376765643673462e-08, + "logits/chosen": -0.15276072919368744, + "logits/rejected": 0.24526353180408478, + "logps/chosen": -1.7361557483673096, + "logps/rejected": -3.1856894493103027, + "loss": 0.5386, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7361557483673096, + "rewards/margins": 1.4495340585708618, + "rewards/rejected": -3.1856894493103027, + "sft_loss": 1.7658565044403076, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 9.915045609735312, + "learning_rate": 8.223502744346484e-08, + "logits/chosen": -0.04342980682849884, + "logits/rejected": 0.1702241152524948, + "logps/chosen": -1.6896997690200806, + "logps/rejected": -3.071929454803467, + "loss": 0.5622, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.6896997690200806, + "rewards/margins": 1.3822300434112549, + "rewards/rejected": -3.071929454803467, + "sft_loss": 1.7686045169830322, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 17.72415181893815, + "learning_rate": 8.071615361217648e-08, + "logits/chosen": -0.13096554577350616, + "logits/rejected": -0.029258519411087036, + "logps/chosen": -1.664790391921997, + "logps/rejected": -2.8216605186462402, + "loss": 0.6285, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.664790391921997, + "rewards/margins": 1.1568701267242432, + "rewards/rejected": -2.8216605186462402, + "sft_loss": 1.7525379657745361, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 11.68405514912924, + "learning_rate": 7.92110496789909e-08, + "logits/chosen": -0.2031552791595459, + "logits/rejected": 0.032610904425382614, + "logps/chosen": -1.7019561529159546, + "logps/rejected": -3.2062244415283203, + "loss": 0.5326, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7019561529159546, + "rewards/margins": 1.5042685270309448, + "rewards/rejected": -3.2062244415283203, + "sft_loss": 1.7459650039672852, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 15.34951904415885, + "learning_rate": 7.771973024643241e-08, + "logits/chosen": -0.2016465663909912, + "logits/rejected": -0.010524836368858814, + "logps/chosen": -1.6697314977645874, + "logps/rejected": -3.583134174346924, + "loss": 0.4503, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -1.6697314977645874, + "rewards/margins": 1.913402795791626, + "rewards/rejected": -3.583134174346924, + "sft_loss": 1.6881519556045532, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 14.828298746181426, + "learning_rate": 7.624220978328905e-08, + "logits/chosen": -0.3352576792240143, + "logits/rejected": -0.045596785843372345, + "logps/chosen": -1.755860686302185, + "logps/rejected": -3.416865825653076, + "loss": 0.5331, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.755860686302185, + "rewards/margins": 1.6610050201416016, + "rewards/rejected": -3.416865825653076, + "sft_loss": 1.8021215200424194, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 11.251154843255891, + "learning_rate": 7.477850262447056e-08, + "logits/chosen": -0.30058687925338745, + "logits/rejected": 0.04834729805588722, + "logps/chosen": -1.6716951131820679, + "logps/rejected": -3.47686505317688, + "loss": 0.5118, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.6716951131820679, + "rewards/margins": 1.8051700592041016, + "rewards/rejected": -3.47686505317688, + "sft_loss": 1.7458328008651733, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 11.258024242328093, + "learning_rate": 7.332862297087073e-08, + "logits/chosen": -0.08056751638650894, + "logits/rejected": 0.16363118588924408, + "logps/chosen": -1.72183358669281, + "logps/rejected": -3.6885628700256348, + "loss": 0.5159, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.72183358669281, + "rewards/margins": 1.9667285680770874, + "rewards/rejected": -3.6885628700256348, + "sft_loss": 1.7538646459579468, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 15.774492797225584, + "learning_rate": 7.189258488922768e-08, + "logits/chosen": -0.0924944058060646, + "logits/rejected": 0.16677220165729523, + "logps/chosen": -1.7519547939300537, + "logps/rejected": -3.342869997024536, + "loss": 0.5233, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7519547939300537, + "rewards/margins": 1.590915322303772, + "rewards/rejected": -3.342869997024536, + "sft_loss": 1.78385329246521, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 14.513797741987641, + "learning_rate": 7.047040231198959e-08, + "logits/chosen": -0.16937026381492615, + "logits/rejected": 0.014137727208435535, + "logps/chosen": -1.7294788360595703, + "logps/rejected": -3.3136649131774902, + "loss": 0.5617, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7294788360595703, + "rewards/margins": 1.5841859579086304, + "rewards/rejected": -3.3136649131774902, + "sft_loss": 1.752410650253296, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 13.259953642172649, + "learning_rate": 6.906208903717787e-08, + "logits/chosen": -0.27336111664772034, + "logits/rejected": 0.07937004417181015, + "logps/chosen": -1.7311985492706299, + "logps/rejected": -3.4800212383270264, + "loss": 0.5154, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7311985492706299, + "rewards/margins": 1.7488229274749756, + "rewards/rejected": -3.4800212383270264, + "sft_loss": 1.7407493591308594, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 17.246375245141014, + "learning_rate": 6.76676587282542e-08, + "logits/chosen": -0.18381229043006897, + "logits/rejected": -0.07914048433303833, + "logps/chosen": -1.8589649200439453, + "logps/rejected": -3.4577198028564453, + "loss": 0.5432, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8589649200439453, + "rewards/margins": 1.598755121231079, + "rewards/rejected": -3.4577198028564453, + "sft_loss": 1.891333818435669, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 16.4748277084828, + "learning_rate": 6.628712491398736e-08, + "logits/chosen": -0.33703577518463135, + "logits/rejected": 0.010515051893889904, + "logps/chosen": -1.694253921508789, + "logps/rejected": -3.272106170654297, + "loss": 0.5357, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.694253921508789, + "rewards/margins": 1.5778522491455078, + "rewards/rejected": -3.272106170654297, + "sft_loss": 1.8197418451309204, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 13.145713348716752, + "learning_rate": 6.492050098832281e-08, + "logits/chosen": -0.33598920702934265, + "logits/rejected": 0.005089169833809137, + "logps/chosen": -1.788678765296936, + "logps/rejected": -3.5194950103759766, + "loss": 0.5391, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.788678765296936, + "rewards/margins": 1.7308164834976196, + "rewards/rejected": -3.5194950103759766, + "sft_loss": 1.8638890981674194, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 15.523976451818084, + "learning_rate": 6.356780021025161e-08, + "logits/chosen": -0.018414665013551712, + "logits/rejected": 0.07146365940570831, + "logps/chosen": -1.743440866470337, + "logps/rejected": -3.2033050060272217, + "loss": 0.5697, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.743440866470337, + "rewards/margins": 1.4598641395568848, + "rewards/rejected": -3.2033050060272217, + "sft_loss": 1.8230106830596924, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 8.28765342796906, + "learning_rate": 6.222903570368288e-08, + "logits/chosen": -0.09992040693759918, + "logits/rejected": 0.11682212352752686, + "logps/chosen": -1.8085787296295166, + "logps/rejected": -3.3187828063964844, + "loss": 0.5627, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8085787296295166, + "rewards/margins": 1.5102041959762573, + "rewards/rejected": -3.3187828063964844, + "sft_loss": 1.8448479175567627, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 12.553216743920622, + "learning_rate": 6.090422045731525e-08, + "logits/chosen": -0.11003341525793076, + "logits/rejected": 0.10952264070510864, + "logps/chosen": -1.7515653371810913, + "logps/rejected": -3.2995238304138184, + "loss": 0.5639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7515653371810913, + "rewards/margins": 1.5479586124420166, + "rewards/rejected": -3.2995238304138184, + "sft_loss": 1.832606315612793, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 21.07056459916169, + "learning_rate": 5.9593367324512593e-08, + "logits/chosen": -0.203813835978508, + "logits/rejected": -0.0018882930744439363, + "logps/chosen": -1.6920894384384155, + "logps/rejected": -3.2762725353240967, + "loss": 0.5429, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6920894384384155, + "rewards/margins": 1.5841830968856812, + "rewards/rejected": -3.2762725353240967, + "sft_loss": 1.742953896522522, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 11.540131753363577, + "learning_rate": 5.8296489023177305e-08, + "logits/chosen": -0.22834794223308563, + "logits/rejected": -0.0977824255824089, + "logps/chosen": -1.820873498916626, + "logps/rejected": -3.3625118732452393, + "loss": 0.5374, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.820873498916626, + "rewards/margins": 1.5416386127471924, + "rewards/rejected": -3.3625118732452393, + "sft_loss": 1.9106184244155884, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 6.092889140852963, + "learning_rate": 5.7013598135628895e-08, + "logits/chosen": -0.09903047978878021, + "logits/rejected": -0.05335770174860954, + "logps/chosen": -1.6918855905532837, + "logps/rejected": -3.4326744079589844, + "loss": 0.5241, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6918855905532837, + "rewards/margins": 1.740788459777832, + "rewards/rejected": -3.4326744079589844, + "sft_loss": 1.7823829650878906, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 11.178889716782178, + "learning_rate": 5.5744707108479784e-08, + "logits/chosen": -0.15962809324264526, + "logits/rejected": 0.14945785701274872, + "logps/chosen": -1.6994130611419678, + "logps/rejected": -3.3238296508789062, + "loss": 0.5323, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6994130611419678, + "rewards/margins": 1.6244163513183594, + "rewards/rejected": -3.3238296508789062, + "sft_loss": 1.7236995697021484, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 12.037210771580455, + "learning_rate": 5.448982825251686e-08, + "logits/chosen": -0.16476576030254364, + "logits/rejected": 0.023226696997880936, + "logps/chosen": -1.7827539443969727, + "logps/rejected": -3.496924638748169, + "loss": 0.5224, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7827539443969727, + "rewards/margins": 1.7141706943511963, + "rewards/rejected": -3.496924638748169, + "sft_loss": 1.888169527053833, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 13.048210566572077, + "learning_rate": 5.324897374257959e-08, + "logits/chosen": -0.12402637302875519, + "logits/rejected": -0.0067833187058568, + "logps/chosen": -1.8013607263565063, + "logps/rejected": -3.5576987266540527, + "loss": 0.5134, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8013607263565063, + "rewards/margins": 1.756338119506836, + "rewards/rejected": -3.5576987266540527, + "sft_loss": 1.7914527654647827, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 13.314492118569552, + "learning_rate": 5.202215561744461e-08, + "logits/chosen": -0.056083958595991135, + "logits/rejected": 0.0518193356692791, + "logps/chosen": -1.8286384344100952, + "logps/rejected": -3.307530164718628, + "loss": 0.5716, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8286384344100952, + "rewards/margins": 1.478891372680664, + "rewards/rejected": -3.307530164718628, + "sft_loss": 1.9314041137695312, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 9.782592586506926, + "learning_rate": 5.080938577970617e-08, + "logits/chosen": -0.1079416424036026, + "logits/rejected": 0.12869928777217865, + "logps/chosen": -1.6902978420257568, + "logps/rejected": -3.489121913909912, + "loss": 0.5593, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6902978420257568, + "rewards/margins": 1.7988240718841553, + "rewards/rejected": -3.489121913909912, + "sft_loss": 1.781044602394104, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 16.772587561429965, + "learning_rate": 4.961067599566305e-08, + "logits/chosen": -0.26666679978370667, + "logits/rejected": 0.0012994721764698625, + "logps/chosen": -1.7255357503890991, + "logps/rejected": -3.4891388416290283, + "loss": 0.5316, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7255357503890991, + "rewards/margins": 1.7636029720306396, + "rewards/rejected": -3.4891388416290283, + "sft_loss": 1.8183187246322632, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 24.35972300356556, + "learning_rate": 4.8426037895202277e-08, + "logits/chosen": -0.10224726051092148, + "logits/rejected": 0.12815634906291962, + "logps/chosen": -1.7485454082489014, + "logps/rejected": -3.513535976409912, + "loss": 0.502, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7485454082489014, + "rewards/margins": 1.7649905681610107, + "rewards/rejected": -3.513535976409912, + "sft_loss": 1.8304626941680908, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 16.647665933842195, + "learning_rate": 4.725548297168847e-08, + "logits/chosen": -0.2706056237220764, + "logits/rejected": -0.015289786271750927, + "logps/chosen": -1.7003743648529053, + "logps/rejected": -3.4611706733703613, + "loss": 0.5281, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7003743648529053, + "rewards/margins": 1.7607961893081665, + "rewards/rejected": -3.4611706733703613, + "sft_loss": 1.7916524410247803, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.4523681402206421, + "eval_logits/rejected": 0.6008379459381104, + "eval_logps/chosen": -2.066589593887329, + "eval_logps/rejected": -3.1211626529693604, + "eval_loss": 0.7446607351303101, + "eval_rewards/accuracies": 0.7032641172409058, + "eval_rewards/chosen": -2.066589593887329, + "eval_rewards/margins": 1.0545730590820312, + "eval_rewards/rejected": -3.1211626529693604, + "eval_runtime": 48.5224, + "eval_samples_per_second": 27.719, + "eval_sft_loss": 1.9814122915267944, + "eval_steps_per_second": 6.945, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 9.222960461789413, + "learning_rate": 4.609902258185017e-08, + "logits/chosen": -0.09446149319410324, + "logits/rejected": -0.028069287538528442, + "logps/chosen": -1.7660013437271118, + "logps/rejected": -3.2970519065856934, + "loss": 0.5409, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7660013437271118, + "rewards/margins": 1.531050682067871, + "rewards/rejected": -3.2970519065856934, + "sft_loss": 1.7716779708862305, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 9.94103612508856, + "learning_rate": 4.4956667945671496e-08, + "logits/chosen": -0.17274287343025208, + "logits/rejected": 0.008575853891670704, + "logps/chosen": -1.743222951889038, + "logps/rejected": -3.646514415740967, + "loss": 0.4995, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.743222951889038, + "rewards/margins": 1.9032917022705078, + "rewards/rejected": -3.646514415740967, + "sft_loss": 1.7744395732879639, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 12.056605720430364, + "learning_rate": 4.382843014628168e-08, + "logits/chosen": -0.15395765006542206, + "logits/rejected": -0.012604189105331898, + "logps/chosen": -1.720470666885376, + "logps/rejected": -3.3299014568328857, + "loss": 0.5348, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.720470666885376, + "rewards/margins": 1.6094309091567993, + "rewards/rejected": -3.3299014568328857, + "sft_loss": 1.7798278331756592, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 10.60887867025444, + "learning_rate": 4.271432012984938e-08, + "logits/chosen": -0.1706741452217102, + "logits/rejected": -0.04382907226681709, + "logps/chosen": -1.7484376430511475, + "logps/rejected": -3.6875336170196533, + "loss": 0.4884, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7484376430511475, + "rewards/margins": 1.9390960931777954, + "rewards/rejected": -3.6875336170196533, + "sft_loss": 1.8463385105133057, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 12.901398096306242, + "learning_rate": 4.1614348705474534e-08, + "logits/chosen": -0.08455139398574829, + "logits/rejected": 0.15452969074249268, + "logps/chosen": -1.8069339990615845, + "logps/rejected": -3.600315570831299, + "loss": 0.5342, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.8069339990615845, + "rewards/margins": 1.793381690979004, + "rewards/rejected": -3.600315570831299, + "sft_loss": 1.8492975234985352, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 9.586087613823814, + "learning_rate": 4.052852654508482e-08, + "logits/chosen": -0.30744099617004395, + "logits/rejected": -0.060971058905124664, + "logps/chosen": -1.7557405233383179, + "logps/rejected": -3.377889633178711, + "loss": 0.5243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7557405233383179, + "rewards/margins": 1.6221487522125244, + "rewards/rejected": -3.377889633178711, + "sft_loss": 1.7627824544906616, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 13.932972673755787, + "learning_rate": 3.9456864183331557e-08, + "logits/chosen": -0.23899254202842712, + "logits/rejected": -0.01942940428853035, + "logps/chosen": -1.7661815881729126, + "logps/rejected": -3.4561429023742676, + "loss": 0.5015, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7661815881729126, + "rewards/margins": 1.6899614334106445, + "rewards/rejected": -3.4561429023742676, + "sft_loss": 1.7765686511993408, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 18.164615491069522, + "learning_rate": 3.839937201748744e-08, + "logits/chosen": -0.2424912005662918, + "logits/rejected": 0.09279437363147736, + "logps/chosen": -1.8523527383804321, + "logps/rejected": -3.6534671783447266, + "loss": 0.5522, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.8523527383804321, + "rewards/margins": 1.8011143207550049, + "rewards/rejected": -3.6534671783447266, + "sft_loss": 1.8589980602264404, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 11.459943642977123, + "learning_rate": 3.735606030734651e-08, + "logits/chosen": -0.16552621126174927, + "logits/rejected": -0.05069739744067192, + "logps/chosen": -1.7162492275238037, + "logps/rejected": -3.163618564605713, + "loss": 0.5858, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.7162492275238037, + "rewards/margins": 1.4473693370819092, + "rewards/rejected": -3.163618564605713, + "sft_loss": 1.7754487991333008, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 20.628405186287445, + "learning_rate": 3.632693917512331e-08, + "logits/chosen": -0.25471025705337524, + "logits/rejected": -0.04566922411322594, + "logps/chosen": -1.8308833837509155, + "logps/rejected": -3.5067691802978516, + "loss": 0.5753, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8308833837509155, + "rewards/margins": 1.6758854389190674, + "rewards/rejected": -3.5067691802978516, + "sft_loss": 1.892290711402893, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 11.163614574089873, + "learning_rate": 3.531201860535588e-08, + "logits/chosen": -0.23860302567481995, + "logits/rejected": 0.1082988828420639, + "logps/chosen": -1.8234989643096924, + "logps/rejected": -3.4302475452423096, + "loss": 0.5458, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8234989643096924, + "rewards/margins": 1.6067485809326172, + "rewards/rejected": -3.4302475452423096, + "sft_loss": 1.834429144859314, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 15.540488575948466, + "learning_rate": 3.431130844480762e-08, + "logits/chosen": -0.11743469536304474, + "logits/rejected": -0.007621115539222956, + "logps/chosen": -1.7462657690048218, + "logps/rejected": -3.396251678466797, + "loss": 0.5565, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7462657690048218, + "rewards/margins": 1.6499862670898438, + "rewards/rejected": -3.396251678466797, + "sft_loss": 1.870222806930542, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 10.605155490489334, + "learning_rate": 3.332481840237306e-08, + "logits/chosen": -0.3517284691333771, + "logits/rejected": 0.02356250211596489, + "logps/chosen": -1.9717661142349243, + "logps/rejected": -3.6357693672180176, + "loss": 0.579, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9717661142349243, + "rewards/margins": 1.6640033721923828, + "rewards/rejected": -3.6357693672180176, + "sft_loss": 2.0109448432922363, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 13.903346355540794, + "learning_rate": 3.235255804898307e-08, + "logits/chosen": -0.12415225803852081, + "logits/rejected": 0.09575439989566803, + "logps/chosen": -1.6956020593643188, + "logps/rejected": -3.378117799758911, + "loss": 0.4932, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.6956020593643188, + "rewards/margins": 1.6825157403945923, + "rewards/rejected": -3.378117799758911, + "sft_loss": 1.7468980550765991, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 13.282523255632926, + "learning_rate": 3.1394536817511475e-08, + "logits/chosen": -0.13596105575561523, + "logits/rejected": 0.08177070319652557, + "logps/chosen": -1.8514257669448853, + "logps/rejected": -3.500671863555908, + "loss": 0.5372, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.8514257669448853, + "rewards/margins": 1.6492455005645752, + "rewards/rejected": -3.500671863555908, + "sft_loss": 1.8731330633163452, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 11.71948389715706, + "learning_rate": 3.0450764002684926e-08, + "logits/chosen": -0.16972532868385315, + "logits/rejected": 0.15765061974525452, + "logps/chosen": -1.8777910470962524, + "logps/rejected": -3.807753086090088, + "loss": 0.496, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8777910470962524, + "rewards/margins": 1.929961919784546, + "rewards/rejected": -3.807753086090088, + "sft_loss": 1.8987195491790771, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 13.747807261958386, + "learning_rate": 2.9521248760991158e-08, + "logits/chosen": -0.21182194352149963, + "logits/rejected": -0.03415367752313614, + "logps/chosen": -1.7448492050170898, + "logps/rejected": -3.524975538253784, + "loss": 0.523, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7448492050170898, + "rewards/margins": 1.7801263332366943, + "rewards/rejected": -3.524975538253784, + "sft_loss": 1.7703666687011719, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 8.872612705144913, + "learning_rate": 2.8606000110591224e-08, + "logits/chosen": -0.1471521556377411, + "logits/rejected": 0.08794516324996948, + "logps/chosen": -1.7258751392364502, + "logps/rejected": -3.201476573944092, + "loss": 0.5474, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7258751392364502, + "rewards/margins": 1.4756014347076416, + "rewards/rejected": -3.201476573944092, + "sft_loss": 1.788822889328003, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 8.382828109806677, + "learning_rate": 2.770502693123139e-08, + "logits/chosen": -0.27462801337242126, + "logits/rejected": 0.04605941101908684, + "logps/chosen": -1.8597522974014282, + "logps/rejected": -3.6685824394226074, + "loss": 0.5173, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8597522974014282, + "rewards/margins": 1.8088302612304688, + "rewards/rejected": -3.6685824394226074, + "sft_loss": 1.9247268438339233, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 14.595090230431929, + "learning_rate": 2.6818337964157726e-08, + "logits/chosen": -0.13967612385749817, + "logits/rejected": -0.043948955833911896, + "logps/chosen": -1.7874114513397217, + "logps/rejected": -3.5716145038604736, + "loss": 0.4983, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7874114513397217, + "rewards/margins": 1.7842029333114624, + "rewards/rejected": -3.5716145038604736, + "sft_loss": 1.790167212486267, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 12.277514913585133, + "learning_rate": 2.5945941812029973e-08, + "logits/chosen": -0.1475212275981903, + "logits/rejected": 0.059523582458496094, + "logps/chosen": -1.814934492111206, + "logps/rejected": -3.316126585006714, + "loss": 0.574, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.814934492111206, + "rewards/margins": 1.5011920928955078, + "rewards/rejected": -3.316126585006714, + "sft_loss": 1.8950258493423462, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 10.411890019919085, + "learning_rate": 2.5087846938839976e-08, + "logits/chosen": -0.38515740633010864, + "logits/rejected": 0.014054941944777966, + "logps/chosen": -1.7699428796768188, + "logps/rejected": -3.6067757606506348, + "loss": 0.538, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7699428796768188, + "rewards/margins": 1.8368332386016846, + "rewards/rejected": -3.6067757606506348, + "sft_loss": 1.8175370693206787, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 15.027736125517917, + "learning_rate": 2.42440616698274e-08, + "logits/chosen": -0.10710807144641876, + "logits/rejected": 0.15287144482135773, + "logps/chosen": -1.7904659509658813, + "logps/rejected": -3.299647569656372, + "loss": 0.5349, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7904659509658813, + "rewards/margins": 1.5091816186904907, + "rewards/rejected": -3.299647569656372, + "sft_loss": 1.8567641973495483, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 12.107039402674326, + "learning_rate": 2.3414594191401128e-08, + "logits/chosen": -0.10171730816364288, + "logits/rejected": -0.007555614225566387, + "logps/chosen": -1.7247684001922607, + "logps/rejected": -3.2738890647888184, + "loss": 0.5393, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.7247684001922607, + "rewards/margins": 1.5491206645965576, + "rewards/rejected": -3.2738890647888184, + "sft_loss": 1.7477623224258423, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 8.953171514879182, + "learning_rate": 2.2599452551057998e-08, + "logits/chosen": -0.12288618087768555, + "logits/rejected": 0.1192425936460495, + "logps/chosen": -1.8108208179473877, + "logps/rejected": -3.600740432739258, + "loss": 0.4982, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8108208179473877, + "rewards/margins": 1.7899196147918701, + "rewards/rejected": -3.600740432739258, + "sft_loss": 1.8616046905517578, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 17.200615482379387, + "learning_rate": 2.1798644657305857e-08, + "logits/chosen": -0.02967996522784233, + "logits/rejected": 0.12160022556781769, + "logps/chosen": -1.6938245296478271, + "logps/rejected": -3.458918333053589, + "loss": 0.5273, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6938245296478271, + "rewards/margins": 1.7650935649871826, + "rewards/rejected": -3.458918333053589, + "sft_loss": 1.7733080387115479, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 10.512460460524174, + "learning_rate": 2.1012178279586293e-08, + "logits/chosen": 0.0147629976272583, + "logits/rejected": -0.05085957795381546, + "logps/chosen": -1.692972183227539, + "logps/rejected": -3.099750518798828, + "loss": 0.5882, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.692972183227539, + "rewards/margins": 1.406778335571289, + "rewards/rejected": -3.099750518798828, + "sft_loss": 1.752136468887329, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 18.087881816117683, + "learning_rate": 2.02400610481997e-08, + "logits/chosen": -0.0952359288930893, + "logits/rejected": -0.03134078532457352, + "logps/chosen": -1.7446550130844116, + "logps/rejected": -3.2378089427948, + "loss": 0.5408, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7446550130844116, + "rewards/margins": 1.4931542873382568, + "rewards/rejected": -3.2378089427948, + "sft_loss": 1.7278072834014893, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 13.153300138418738, + "learning_rate": 1.948230045423083e-08, + "logits/chosen": -0.27102628350257874, + "logits/rejected": 0.03965846449136734, + "logps/chosen": -1.6783840656280518, + "logps/rejected": -3.341630220413208, + "loss": 0.5003, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6783840656280518, + "rewards/margins": 1.6632461547851562, + "rewards/rejected": -3.341630220413208, + "sft_loss": 1.7147563695907593, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 13.974310829234309, + "learning_rate": 1.8738903849476186e-08, + "logits/chosen": -0.037626512348651886, + "logits/rejected": -0.05883268639445305, + "logps/chosen": -1.848806381225586, + "logps/rejected": -3.4039673805236816, + "loss": 0.571, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.848806381225586, + "rewards/margins": 1.5551612377166748, + "rewards/rejected": -3.4039673805236816, + "sft_loss": 1.8302112817764282, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 11.59311245393821, + "learning_rate": 1.8009878446373083e-08, + "logits/chosen": -0.17010925710201263, + "logits/rejected": -0.07235264778137207, + "logps/chosen": -1.8091201782226562, + "logps/rejected": -3.298564910888672, + "loss": 0.5749, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8091201782226562, + "rewards/margins": 1.4894448518753052, + "rewards/rejected": -3.298564910888672, + "sft_loss": 1.8607683181762695, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 13.035037630640272, + "learning_rate": 1.729523131792887e-08, + "logits/chosen": -0.21425171196460724, + "logits/rejected": 0.10696268081665039, + "logps/chosen": -1.7733795642852783, + "logps/rejected": -3.222611904144287, + "loss": 0.5998, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.7733795642852783, + "rewards/margins": 1.4492326974868774, + "rewards/rejected": -3.222611904144287, + "sft_loss": 1.8633842468261719, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 9.402984291729082, + "learning_rate": 1.6594969397653316e-08, + "logits/chosen": -0.2268737107515335, + "logits/rejected": 0.0147031145170331, + "logps/chosen": -1.7411689758300781, + "logps/rejected": -3.4823131561279297, + "loss": 0.5154, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7411689758300781, + "rewards/margins": 1.7411444187164307, + "rewards/rejected": -3.4823131561279297, + "sft_loss": 1.8098264932632446, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 15.404025802231084, + "learning_rate": 1.5909099479490653e-08, + "logits/chosen": -0.043330464512109756, + "logits/rejected": -0.0020337612368166447, + "logps/chosen": -1.748051643371582, + "logps/rejected": -3.0739541053771973, + "loss": 0.5648, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.748051643371582, + "rewards/margins": 1.3259022235870361, + "rewards/rejected": -3.0739541053771973, + "sft_loss": 1.7951023578643799, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 16.918432357629232, + "learning_rate": 1.5237628217753818e-08, + "logits/chosen": -0.12749573588371277, + "logits/rejected": -0.018352797254920006, + "logps/chosen": -1.6635348796844482, + "logps/rejected": -3.588423252105713, + "loss": 0.519, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6635348796844482, + "rewards/margins": 1.924888253211975, + "rewards/rejected": -3.588423252105713, + "sft_loss": 1.7674754858016968, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 11.488033708406595, + "learning_rate": 1.4580562127059994e-08, + "logits/chosen": -0.24728181958198547, + "logits/rejected": 0.15568208694458008, + "logps/chosen": -1.9034550189971924, + "logps/rejected": -3.6897387504577637, + "loss": 0.5484, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.9034550189971924, + "rewards/margins": 1.7862837314605713, + "rewards/rejected": -3.6897387504577637, + "sft_loss": 1.9504950046539307, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 8.791458916777316, + "learning_rate": 1.3937907582267151e-08, + "logits/chosen": -0.05900830030441284, + "logits/rejected": 0.0781247466802597, + "logps/chosen": -1.6860978603363037, + "logps/rejected": -3.2836456298828125, + "loss": 0.5213, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.6860978603363037, + "rewards/margins": 1.5975478887557983, + "rewards/rejected": -3.2836456298828125, + "sft_loss": 1.7643423080444336, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 13.08532454428947, + "learning_rate": 1.3309670818412446e-08, + "logits/chosen": -0.1450091302394867, + "logits/rejected": 0.0737738236784935, + "logps/chosen": -1.845995545387268, + "logps/rejected": -3.287126064300537, + "loss": 0.5868, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.845995545387268, + "rewards/margins": 1.4411306381225586, + "rewards/rejected": -3.287126064300537, + "sft_loss": 1.9166057109832764, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 11.68997742530587, + "learning_rate": 1.2695857930651921e-08, + "logits/chosen": -0.3569498658180237, + "logits/rejected": 0.0017510965699329972, + "logps/chosen": -1.696563959121704, + "logps/rejected": -3.3799500465393066, + "loss": 0.4984, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.696563959121704, + "rewards/margins": 1.683386206626892, + "rewards/rejected": -3.3799500465393066, + "sft_loss": 1.765641450881958, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 11.348280682159059, + "learning_rate": 1.2096474874200735e-08, + "logits/chosen": -0.2573717534542084, + "logits/rejected": 0.13519421219825745, + "logps/chosen": -1.754091501235962, + "logps/rejected": -3.702437162399292, + "loss": 0.4887, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.754091501235962, + "rewards/margins": 1.9483455419540405, + "rewards/rejected": -3.702437162399292, + "sft_loss": 1.761106252670288, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 14.581538154666081, + "learning_rate": 1.1511527464276194e-08, + "logits/chosen": -0.04870340973138809, + "logits/rejected": 0.03849010914564133, + "logps/chosen": -1.8830690383911133, + "logps/rejected": -3.5184459686279297, + "loss": 0.5275, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8830690383911133, + "rewards/margins": 1.6353766918182373, + "rewards/rejected": -3.5184459686279297, + "sft_loss": 1.9118648767471313, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 8.768613582240272, + "learning_rate": 1.0941021376040305e-08, + "logits/chosen": -0.12008903920650482, + "logits/rejected": 0.060319773852825165, + "logps/chosen": -1.724844217300415, + "logps/rejected": -3.5657737255096436, + "loss": 0.5389, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.724844217300415, + "rewards/margins": 1.8409297466278076, + "rewards/rejected": -3.5657737255096436, + "sft_loss": 1.7985265254974365, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 11.409485948184829, + "learning_rate": 1.0384962144545818e-08, + "logits/chosen": -0.20349383354187012, + "logits/rejected": 0.09238765388727188, + "logps/chosen": -1.806945562362671, + "logps/rejected": -3.303162097930908, + "loss": 0.5579, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.806945562362671, + "rewards/margins": 1.4962167739868164, + "rewards/rejected": -3.303162097930908, + "sft_loss": 1.8973405361175537, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 10.82064851844795, + "learning_rate": 9.843355164681767e-09, + "logits/chosen": -0.12326931953430176, + "logits/rejected": -0.015838781371712685, + "logps/chosen": -1.7402493953704834, + "logps/rejected": -3.36765718460083, + "loss": 0.5728, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7402493953704834, + "rewards/margins": 1.627407431602478, + "rewards/rejected": -3.36765718460083, + "sft_loss": 1.8054853677749634, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 12.522878903507149, + "learning_rate": 9.316205691121515e-09, + "logits/chosen": -0.125594824552536, + "logits/rejected": 0.11583630740642548, + "logps/chosen": -1.8031196594238281, + "logps/rejected": -3.636901378631592, + "loss": 0.4952, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.8031196594238281, + "rewards/margins": 1.8337819576263428, + "rewards/rejected": -3.636901378631592, + "sft_loss": 1.8276653289794922, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 11.717915538770692, + "learning_rate": 8.803518838271463e-09, + "logits/chosen": -0.19515851140022278, + "logits/rejected": 0.07911201566457748, + "logps/chosen": -1.7605504989624023, + "logps/rejected": -3.460129976272583, + "loss": 0.4958, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -1.7605504989624023, + "rewards/margins": 1.6995792388916016, + "rewards/rejected": -3.460129976272583, + "sft_loss": 1.8053300380706787, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 10.57249049296401, + "learning_rate": 8.305299580221748e-09, + "logits/chosen": -0.23829343914985657, + "logits/rejected": -0.08533582836389542, + "logps/chosen": -1.7042344808578491, + "logps/rejected": -3.46440052986145, + "loss": 0.518, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.7042344808578491, + "rewards/margins": 1.7601664066314697, + "rewards/rejected": -3.46440052986145, + "sft_loss": 1.8231385946273804, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 8.628673797910038, + "learning_rate": 7.821552750697958e-09, + "logits/chosen": -0.2571510374546051, + "logits/rejected": -0.014047443866729736, + "logps/chosen": -1.7087123394012451, + "logps/rejected": -3.186110496520996, + "loss": 0.5563, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7087123394012451, + "rewards/margins": 1.4773979187011719, + "rewards/rejected": -3.186110496520996, + "sft_loss": 1.7801802158355713, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 12.093261360034742, + "learning_rate": 7.3522830430136635e-09, + "logits/chosen": 0.09196645021438599, + "logits/rejected": 0.17079466581344604, + "logps/chosen": -1.8431581258773804, + "logps/rejected": -3.837364673614502, + "loss": 0.5346, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.8431581258773804, + "rewards/margins": 1.994206190109253, + "rewards/rejected": -3.837364673614502, + "sft_loss": 1.8825957775115967, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 20.621534055002574, + "learning_rate": 6.897495010025956e-09, + "logits/chosen": 0.05238935351371765, + "logits/rejected": 0.17671489715576172, + "logps/chosen": -1.8023525476455688, + "logps/rejected": -3.4923388957977295, + "loss": 0.5297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8023525476455688, + "rewards/margins": 1.689986228942871, + "rewards/rejected": -3.4923388957977295, + "sft_loss": 1.8439724445343018, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 12.591348257745087, + "learning_rate": 6.4571930640899835e-09, + "logits/chosen": -0.2012520730495453, + "logits/rejected": 0.06406156718730927, + "logps/chosen": -1.826565146446228, + "logps/rejected": -3.2655797004699707, + "loss": 0.5732, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.826565146446228, + "rewards/margins": 1.4390145540237427, + "rewards/rejected": -3.2655797004699707, + "sft_loss": 1.8569958209991455, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 11.981205556368744, + "learning_rate": 6.0313814770174836e-09, + "logits/chosen": -0.14899098873138428, + "logits/rejected": 0.07124531269073486, + "logps/chosen": -1.7477327585220337, + "logps/rejected": -3.3878090381622314, + "loss": 0.5441, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7477327585220337, + "rewards/margins": 1.6400762796401978, + "rewards/rejected": -3.3878090381622314, + "sft_loss": 1.8257734775543213, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 9.461939009323746, + "learning_rate": 5.620064380033985e-09, + "logits/chosen": -0.2407839298248291, + "logits/rejected": 0.11912363767623901, + "logps/chosen": -1.8589773178100586, + "logps/rejected": -3.3650786876678467, + "loss": 0.5388, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8589773178100586, + "rewards/margins": 1.5061014890670776, + "rewards/rejected": -3.3650786876678467, + "sft_loss": 1.8304802179336548, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 15.330362075232408, + "learning_rate": 5.22324576374017e-09, + "logits/chosen": -0.14247074723243713, + "logits/rejected": -0.023439304903149605, + "logps/chosen": -1.7721531391143799, + "logps/rejected": -3.2598586082458496, + "loss": 0.5588, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7721531391143799, + "rewards/margins": 1.4877058267593384, + "rewards/rejected": -3.2598586082458496, + "sft_loss": 1.8422420024871826, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 15.33512833982946, + "learning_rate": 4.840929478071576e-09, + "logits/chosen": -0.08151860535144806, + "logits/rejected": -0.15125760436058044, + "logps/chosen": -1.6786878108978271, + "logps/rejected": -3.1958765983581543, + "loss": 0.5362, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.6786878108978271, + "rewards/margins": 1.5171887874603271, + "rewards/rejected": -3.1958765983581543, + "sft_loss": 1.7440593242645264, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 17.120913980937942, + "learning_rate": 4.47311923226279e-09, + "logits/chosen": -0.14595165848731995, + "logits/rejected": 0.05748923867940903, + "logps/chosen": -1.7532291412353516, + "logps/rejected": -3.318749189376831, + "loss": 0.568, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7532291412353516, + "rewards/margins": 1.5655204057693481, + "rewards/rejected": -3.318749189376831, + "sft_loss": 1.8259124755859375, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 10.292380804604246, + "learning_rate": 4.119818594810476e-09, + "logits/chosen": -0.04360206052660942, + "logits/rejected": 0.24839358031749725, + "logps/chosen": -1.686971664428711, + "logps/rejected": -3.217381238937378, + "loss": 0.5255, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.686971664428711, + "rewards/margins": 1.530409574508667, + "rewards/rejected": -3.217381238937378, + "sft_loss": 1.7612766027450562, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 13.509652282634574, + "learning_rate": 3.781030993438573e-09, + "logits/chosen": -0.1148734912276268, + "logits/rejected": -0.03746723383665085, + "logps/chosen": -1.7142328023910522, + "logps/rejected": -3.359126329421997, + "loss": 0.5359, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.7142328023910522, + "rewards/margins": 1.6448932886123657, + "rewards/rejected": -3.359126329421997, + "sft_loss": 1.8104727268218994, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 12.38245292114047, + "learning_rate": 3.4567597150663155e-09, + "logits/chosen": -0.2548621594905853, + "logits/rejected": 0.08878061920404434, + "logps/chosen": -1.7587592601776123, + "logps/rejected": -3.5690054893493652, + "loss": 0.5159, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7587592601776123, + "rewards/margins": 1.810246467590332, + "rewards/rejected": -3.5690054893493652, + "sft_loss": 1.8457529544830322, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 9.47868750971961, + "learning_rate": 3.147007905774768e-09, + "logits/chosen": -0.04488161578774452, + "logits/rejected": 0.1181039810180664, + "logps/chosen": -1.8180328607559204, + "logps/rejected": -3.4446120262145996, + "loss": 0.5414, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.8180328607559204, + "rewards/margins": 1.6265792846679688, + "rewards/rejected": -3.4446120262145996, + "sft_loss": 1.8337246179580688, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 14.651113259446056, + "learning_rate": 2.851778570777508e-09, + "logits/chosen": -0.05829663202166557, + "logits/rejected": -0.03663646802306175, + "logps/chosen": -1.7952572107315063, + "logps/rejected": -3.320112705230713, + "loss": 0.5477, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.7952572107315063, + "rewards/margins": 1.524855375289917, + "rewards/rejected": -3.320112705230713, + "sft_loss": 1.843465805053711, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 9.673833307967234, + "learning_rate": 2.5710745743908192e-09, + "logits/chosen": -0.15737508237361908, + "logits/rejected": 0.07274798303842545, + "logps/chosen": -1.8004143238067627, + "logps/rejected": -3.758871078491211, + "loss": 0.521, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8004143238067627, + "rewards/margins": 1.9584567546844482, + "rewards/rejected": -3.758871078491211, + "sft_loss": 1.8147557973861694, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 15.459388243890075, + "learning_rate": 2.304898640006048e-09, + "logits/chosen": -0.2569485306739807, + "logits/rejected": -0.02802509441971779, + "logps/chosen": -1.707179307937622, + "logps/rejected": -3.340653896331787, + "loss": 0.5397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.707179307937622, + "rewards/margins": 1.633474588394165, + "rewards/rejected": -3.340653896331787, + "sft_loss": 1.824716329574585, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 16.619593710988614, + "learning_rate": 2.0532533500631225e-09, + "logits/chosen": -0.1367146372795105, + "logits/rejected": -0.025258731096982956, + "logps/chosen": -1.7326462268829346, + "logps/rejected": -3.215005874633789, + "loss": 0.5669, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7326462268829346, + "rewards/margins": 1.482359528541565, + "rewards/rejected": -3.215005874633789, + "sft_loss": 1.795987844467163, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 18.319373164749003, + "learning_rate": 1.8161411460262401e-09, + "logits/chosen": -0.12124613672494888, + "logits/rejected": 0.10767862945795059, + "logps/chosen": -1.8736505508422852, + "logps/rejected": -3.7418556213378906, + "loss": 0.5365, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.8736505508422852, + "rewards/margins": 1.8682053089141846, + "rewards/rejected": -3.7418556213378906, + "sft_loss": 1.906241774559021, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 12.831447954821886, + "learning_rate": 1.5935643283585545e-09, + "logits/chosen": -0.2294524610042572, + "logits/rejected": 0.14085766673088074, + "logps/chosen": -1.854288101196289, + "logps/rejected": -3.3342108726501465, + "loss": 0.546, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.854288101196289, + "rewards/margins": 1.4799230098724365, + "rewards/rejected": -3.3342108726501465, + "sft_loss": 1.9192848205566406, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 16.18105063589966, + "learning_rate": 1.3855250565015244e-09, + "logits/chosen": -0.09225011616945267, + "logits/rejected": -0.02986457571387291, + "logps/chosen": -1.7413854598999023, + "logps/rejected": -3.3444411754608154, + "loss": 0.5748, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7413854598999023, + "rewards/margins": 1.6030553579330444, + "rewards/rejected": -3.3444411754608154, + "sft_loss": 1.8075097799301147, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 12.385490252476076, + "learning_rate": 1.1920253488530986e-09, + "logits/chosen": -0.29104679822921753, + "logits/rejected": -0.04165268689393997, + "logps/chosen": -1.7731249332427979, + "logps/rejected": -3.362682819366455, + "loss": 0.5416, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7731249332427979, + "rewards/margins": 1.5895576477050781, + "rewards/rejected": -3.362682819366455, + "sft_loss": 1.7579162120819092, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 11.035865192919763, + "learning_rate": 1.0130670827482314e-09, + "logits/chosen": -0.1392376571893692, + "logits/rejected": 0.03444616496562958, + "logps/chosen": -1.7362483739852905, + "logps/rejected": -3.165205478668213, + "loss": 0.5506, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.7362483739852905, + "rewards/margins": 1.4289571046829224, + "rewards/rejected": -3.165205478668213, + "sft_loss": 1.7826095819473267, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 13.187865717975045, + "learning_rate": 8.4865199444073e-10, + "logits/chosen": -0.028293948620557785, + "logits/rejected": 0.07021278142929077, + "logps/chosen": -1.8036301136016846, + "logps/rejected": -3.4158120155334473, + "loss": 0.5414, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8036301136016846, + "rewards/margins": 1.6121822595596313, + "rewards/rejected": -3.4158120155334473, + "sft_loss": 1.844257116317749, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 16.444463729797935, + "learning_rate": 6.987816790866019e-10, + "logits/chosen": -0.12553277611732483, + "logits/rejected": 0.1887044906616211, + "logps/chosen": -1.8647096157073975, + "logps/rejected": -3.710162401199341, + "loss": 0.5544, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.8647096157073975, + "rewards/margins": 1.845452904701233, + "rewards/rejected": -3.710162401199341, + "sft_loss": 1.8833541870117188, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 10.646863416876881, + "learning_rate": 5.634575907284001e-10, + "logits/chosen": -0.027017977088689804, + "logits/rejected": -0.0031074334401637316, + "logps/chosen": -1.7922757863998413, + "logps/rejected": -3.2982144355773926, + "loss": 0.5833, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.7922757863998413, + "rewards/margins": 1.5059386491775513, + "rewards/rejected": -3.2982144355773926, + "sft_loss": 1.8631671667099, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 10.595165009454474, + "learning_rate": 4.426810422809013e-10, + "logits/chosen": -0.19915366172790527, + "logits/rejected": -0.05446736887097359, + "logps/chosen": -1.6954681873321533, + "logps/rejected": -3.2648377418518066, + "loss": 0.5377, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.6954681873321533, + "rewards/margins": 1.5693693161010742, + "rewards/rejected": -3.2648377418518066, + "sft_loss": 1.7231022119522095, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 10.660523203834975, + "learning_rate": 3.36453205518783e-10, + "logits/chosen": -0.14437521994113922, + "logits/rejected": 0.060533732175827026, + "logps/chosen": -1.7375590801239014, + "logps/rejected": -3.7755045890808105, + "loss": 0.4866, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.7375590801239014, + "rewards/margins": 2.0379457473754883, + "rewards/rejected": -3.7755045890808105, + "sft_loss": 1.7783511877059937, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 14.052826810699122, + "learning_rate": 2.447751110647989e-10, + "logits/chosen": -0.13428905606269836, + "logits/rejected": 0.12192927300930023, + "logps/chosen": -1.6841440200805664, + "logps/rejected": -3.4735703468322754, + "loss": 0.5293, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.6841440200805664, + "rewards/margins": 1.7894260883331299, + "rewards/rejected": -3.4735703468322754, + "sft_loss": 1.753156304359436, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 9.401172317085365, + "learning_rate": 1.6764764838045342e-10, + "logits/chosen": -0.278870165348053, + "logits/rejected": 0.15991750359535217, + "logps/chosen": -1.7664949893951416, + "logps/rejected": -3.2208919525146484, + "loss": 0.5423, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.7664949893951416, + "rewards/margins": 1.4543970823287964, + "rewards/rejected": -3.2208919525146484, + "sft_loss": 1.7974853515625, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 10.847683056633393, + "learning_rate": 1.0507156575650934e-10, + "logits/chosen": -0.20414504408836365, + "logits/rejected": 0.07850085198879242, + "logps/chosen": -1.7775157690048218, + "logps/rejected": -3.567974090576172, + "loss": 0.5171, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.7775157690048218, + "rewards/margins": 1.790458083152771, + "rewards/rejected": -3.567974090576172, + "sft_loss": 1.896284818649292, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 9.824306185085138, + "learning_rate": 5.7047470306659246e-11, + "logits/chosen": -0.09952554851770401, + "logits/rejected": -0.00851814728230238, + "logps/chosen": -1.907403588294983, + "logps/rejected": -3.7734904289245605, + "loss": 0.554, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.907403588294983, + "rewards/margins": 1.8660869598388672, + "rewards/rejected": -3.7734904289245605, + "sft_loss": 1.86178457736969, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 12.186981043972446, + "learning_rate": 2.3575827960697906e-11, + "logits/chosen": -0.13862931728363037, + "logits/rejected": 0.1427539885044098, + "logps/chosen": -1.7564480304718018, + "logps/rejected": -3.5415992736816406, + "loss": 0.5121, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.7564480304718018, + "rewards/margins": 1.7851512432098389, + "rewards/rejected": -3.5415992736816406, + "sft_loss": 1.837898850440979, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 11.285735169160725, + "learning_rate": 4.656963460691888e-12, + "logits/chosen": -0.1521228700876236, + "logits/rejected": 0.05595237761735916, + "logps/chosen": -1.8109420537948608, + "logps/rejected": -3.670588731765747, + "loss": 0.5395, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8109420537948608, + "rewards/margins": 1.8596467971801758, + "rewards/rejected": -3.670588731765747, + "sft_loss": 1.8834893703460693, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.4339616000652313, + "eval_logits/rejected": 0.5812661647796631, + "eval_logps/chosen": -2.0735538005828857, + "eval_logps/rejected": -3.1333560943603516, + "eval_loss": 0.7447991967201233, + "eval_rewards/accuracies": 0.7069733142852783, + "eval_rewards/chosen": -2.0735538005828857, + "eval_rewards/margins": 1.0598026514053345, + "eval_rewards/rejected": -3.1333560943603516, + "eval_runtime": 47.8747, + "eval_samples_per_second": 28.094, + "eval_sft_loss": 1.9881486892700195, + "eval_steps_per_second": 7.039, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.6688217556451066, + "train_runtime": 34631.8364, + "train_samples_per_second": 5.179, + "train_steps_per_second": 0.162 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}