{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 16.01126099526898, "learning_rate": 1.0638297872340425e-08, "logits/chosen": 1.7974858283996582, "logits/rejected": 1.927241563796997, "logps/chosen": -269.252685546875, "logps/rejected": -268.1457824707031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004274646005877639, "grad_norm": 16.887384715665494, "learning_rate": 2.127659574468085e-08, "logits/chosen": 1.7954446077346802, "logits/rejected": 1.7747466564178467, "logps/chosen": -377.0067138671875, "logps/rejected": -384.9050598144531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.006411969008816457, "grad_norm": 18.97306536839058, "learning_rate": 3.191489361702127e-08, "logits/chosen": 2.0545246601104736, "logits/rejected": 2.030306577682495, "logps/chosen": -355.0686950683594, "logps/rejected": -359.64227294921875, "loss": 0.6956, "rewards/accuracies": 0.625, "rewards/chosen": 0.01744360849261284, "rewards/margins": 0.021479416638612747, "rewards/rejected": -0.004035805352032185, "step": 3 }, { "epoch": 0.008549292011755277, "grad_norm": 17.345901334680676, "learning_rate": 4.25531914893617e-08, "logits/chosen": 2.01006817817688, "logits/rejected": 1.9202210903167725, "logps/chosen": -344.22540283203125, "logps/rejected": -336.4920654296875, "loss": 0.695, "rewards/accuracies": 0.4375, "rewards/chosen": 0.004112933296710253, "rewards/margins": 0.008065867237746716, "rewards/rejected": -0.003952931612730026, "step": 4 }, { "epoch": 0.010686615014694095, "grad_norm": 16.581897782195426, "learning_rate": 5.3191489361702123e-08, "logits/chosen": 1.6817424297332764, "logits/rejected": 1.723240613937378, "logps/chosen": -459.6365966796875, "logps/rejected": -435.55584716796875, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": -0.009510684758424759, "rewards/margins": 0.012329434975981712, "rewards/rejected": -0.021840117871761322, "step": 5 }, { "epoch": 0.012823938017632914, "grad_norm": 16.750335274402644, "learning_rate": 6.382978723404254e-08, "logits/chosen": 1.8463941812515259, "logits/rejected": 2.0205507278442383, "logps/chosen": -318.8436584472656, "logps/rejected": -333.49664306640625, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": -0.008932210505008698, "rewards/margins": 0.020014524459838867, "rewards/rejected": -0.028946734964847565, "step": 6 }, { "epoch": 0.014961261020571734, "grad_norm": 16.740726706909392, "learning_rate": 7.446808510638298e-08, "logits/chosen": 1.9784932136535645, "logits/rejected": 2.030113458633423, "logps/chosen": -490.574951171875, "logps/rejected": -506.27325439453125, "loss": 0.6907, "rewards/accuracies": 0.5625, "rewards/chosen": 0.023572897538542747, "rewards/margins": 0.0131209846585989, "rewards/rejected": 0.010451912879943848, "step": 7 }, { "epoch": 0.017098584023510555, "grad_norm": 19.90480338228391, "learning_rate": 8.51063829787234e-08, "logits/chosen": 2.1563167572021484, "logits/rejected": 2.082562208175659, "logps/chosen": -391.081298828125, "logps/rejected": -380.1916809082031, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": -0.0003595345187932253, "rewards/margins": -0.007014084607362747, "rewards/rejected": 0.006654549390077591, "step": 8 }, { "epoch": 0.01923590702644937, "grad_norm": 17.58226872418466, "learning_rate": 9.574468085106382e-08, "logits/chosen": 2.088273525238037, "logits/rejected": 2.1584815979003906, "logps/chosen": -398.4393615722656, "logps/rejected": -407.91717529296875, "loss": 0.6991, "rewards/accuracies": 0.6875, "rewards/chosen": -0.010095404461026192, "rewards/margins": 0.026253772899508476, "rewards/rejected": -0.03634917736053467, "step": 9 }, { "epoch": 0.02137323002938819, "grad_norm": 15.285861983785653, "learning_rate": 1.0638297872340425e-07, "logits/chosen": 1.5849823951721191, "logits/rejected": 1.6066241264343262, "logps/chosen": -275.02215576171875, "logps/rejected": -285.42071533203125, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008988383924588561, "rewards/margins": -0.001907038502395153, "rewards/rejected": 0.002805877011269331, "step": 10 }, { "epoch": 0.02351055303232701, "grad_norm": 17.931865230874028, "learning_rate": 1.1702127659574468e-07, "logits/chosen": 1.9127209186553955, "logits/rejected": 1.9879655838012695, "logps/chosen": -378.0723876953125, "logps/rejected": -410.64752197265625, "loss": 0.6937, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05186593905091286, "rewards/margins": 0.0336148738861084, "rewards/rejected": 0.01825106143951416, "step": 11 }, { "epoch": 0.02564787603526583, "grad_norm": 15.757238933424347, "learning_rate": 1.2765957446808508e-07, "logits/chosen": 2.188413619995117, "logits/rejected": 2.0202558040618896, "logps/chosen": -393.43817138671875, "logps/rejected": -369.0041198730469, "loss": 0.6912, "rewards/accuracies": 0.375, "rewards/chosen": -0.009139979258179665, "rewards/margins": -0.023510374128818512, "rewards/rejected": 0.014370394870638847, "step": 12 }, { "epoch": 0.027785199038204648, "grad_norm": 20.954337431493947, "learning_rate": 1.3829787234042553e-07, "logits/chosen": 2.3227410316467285, "logits/rejected": 2.255707025527954, "logps/chosen": -425.1292724609375, "logps/rejected": -398.930908203125, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": 0.016432786360383034, "rewards/margins": 0.021908044815063477, "rewards/rejected": -0.0054752579890191555, "step": 13 }, { "epoch": 0.029922522041143467, "grad_norm": 17.575256149749116, "learning_rate": 1.4893617021276595e-07, "logits/chosen": 0.9818891286849976, "logits/rejected": 1.0016374588012695, "logps/chosen": -221.91647338867188, "logps/rejected": -221.5160369873047, "loss": 0.6911, "rewards/accuracies": 0.3125, "rewards/chosen": 0.01453709602355957, "rewards/margins": -0.007119536399841309, "rewards/rejected": 0.02165663242340088, "step": 14 }, { "epoch": 0.03205984504408229, "grad_norm": 15.987335944794728, "learning_rate": 1.5957446808510638e-07, "logits/chosen": 1.8959016799926758, "logits/rejected": 1.9433327913284302, "logps/chosen": -296.5621032714844, "logps/rejected": -310.80889892578125, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": 0.011275790631771088, "rewards/margins": 0.00550649082288146, "rewards/rejected": 0.00576929934322834, "step": 15 }, { "epoch": 0.03419716804702111, "grad_norm": 18.43866679208328, "learning_rate": 1.702127659574468e-07, "logits/chosen": 1.4447689056396484, "logits/rejected": 1.5044986009597778, "logps/chosen": -256.1441345214844, "logps/rejected": -300.1876525878906, "loss": 0.697, "rewards/accuracies": 0.4375, "rewards/chosen": 0.009331870824098587, "rewards/margins": -0.006017827428877354, "rewards/rejected": 0.015349699184298515, "step": 16 }, { "epoch": 0.03633449104995993, "grad_norm": 15.462024767466382, "learning_rate": 1.8085106382978725e-07, "logits/chosen": 2.2961068153381348, "logits/rejected": 2.213308572769165, "logps/chosen": -369.3326110839844, "logps/rejected": -354.31683349609375, "loss": 0.6885, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015958024188876152, "rewards/margins": 0.03008664958178997, "rewards/rejected": -0.014128627255558968, "step": 17 }, { "epoch": 0.03847181405289874, "grad_norm": 17.489044223273293, "learning_rate": 1.9148936170212765e-07, "logits/chosen": 1.8175828456878662, "logits/rejected": 1.859854817390442, "logps/chosen": -414.024169921875, "logps/rejected": -413.8213195800781, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": 0.024924801662564278, "rewards/margins": -0.003364275209605694, "rewards/rejected": 0.028289081528782845, "step": 18 }, { "epoch": 0.04060913705583756, "grad_norm": 17.114195544232533, "learning_rate": 2.0212765957446807e-07, "logits/chosen": 2.317970037460327, "logits/rejected": 2.2373180389404297, "logps/chosen": -442.814453125, "logps/rejected": -414.55535888671875, "loss": 0.6946, "rewards/accuracies": 0.375, "rewards/chosen": -0.005524086765944958, "rewards/margins": 0.01554189808666706, "rewards/rejected": -0.021065985783934593, "step": 19 }, { "epoch": 0.04274646005877638, "grad_norm": 16.890434285248393, "learning_rate": 2.127659574468085e-07, "logits/chosen": 1.76038658618927, "logits/rejected": 1.7405922412872314, "logps/chosen": -383.553955078125, "logps/rejected": -391.88543701171875, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": 0.00688550528138876, "rewards/margins": 0.004018557257950306, "rewards/rejected": 0.0028669475577771664, "step": 20 }, { "epoch": 0.0448837830617152, "grad_norm": 17.504320922484634, "learning_rate": 2.2340425531914892e-07, "logits/chosen": 1.5609700679779053, "logits/rejected": 1.7058910131454468, "logps/chosen": -323.5635681152344, "logps/rejected": -348.6455078125, "loss": 0.687, "rewards/accuracies": 0.3125, "rewards/chosen": -0.004861879162490368, "rewards/margins": -0.000568365678191185, "rewards/rejected": -0.004293511621654034, "step": 21 }, { "epoch": 0.04702110606465402, "grad_norm": 18.494669357857983, "learning_rate": 2.3404255319148937e-07, "logits/chosen": 1.8989413976669312, "logits/rejected": 1.9474773406982422, "logps/chosen": -387.58453369140625, "logps/rejected": -415.0438232421875, "loss": 0.6875, "rewards/accuracies": 0.4375, "rewards/chosen": -0.021361876279115677, "rewards/margins": -0.0013632788322865963, "rewards/rejected": -0.019998596981167793, "step": 22 }, { "epoch": 0.04915842906759284, "grad_norm": 16.13049724308586, "learning_rate": 2.4468085106382976e-07, "logits/chosen": 1.3913708925247192, "logits/rejected": 1.5202478170394897, "logps/chosen": -348.6595458984375, "logps/rejected": -392.2963562011719, "loss": 0.6932, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010332870297133923, "rewards/margins": 0.025678444653749466, "rewards/rejected": -0.015345573425292969, "step": 23 }, { "epoch": 0.05129575207053166, "grad_norm": 17.143905423978246, "learning_rate": 2.5531914893617016e-07, "logits/chosen": 1.722089409828186, "logits/rejected": 1.7591125965118408, "logps/chosen": -420.39404296875, "logps/rejected": -425.396484375, "loss": 0.6909, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010764744132757187, "rewards/margins": 0.019897134974598885, "rewards/rejected": -0.009132390841841698, "step": 24 }, { "epoch": 0.053433075073470476, "grad_norm": 16.877922335832068, "learning_rate": 2.659574468085106e-07, "logits/chosen": 2.259049415588379, "logits/rejected": 2.423696756362915, "logps/chosen": -292.7757568359375, "logps/rejected": -319.800048828125, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": -0.01868593692779541, "rewards/margins": 0.0022046808153390884, "rewards/rejected": -0.0208906177431345, "step": 25 }, { "epoch": 0.055570398076409296, "grad_norm": 17.29347939663918, "learning_rate": 2.7659574468085106e-07, "logits/chosen": 1.9354510307312012, "logits/rejected": 1.9527926445007324, "logps/chosen": -345.2893981933594, "logps/rejected": -388.90911865234375, "loss": 0.6942, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03965914249420166, "rewards/margins": -0.005191686097532511, "rewards/rejected": -0.034467458724975586, "step": 26 }, { "epoch": 0.057707721079348115, "grad_norm": 17.646670785000858, "learning_rate": 2.872340425531915e-07, "logits/chosen": 1.8476933240890503, "logits/rejected": 1.9482815265655518, "logps/chosen": -333.46820068359375, "logps/rejected": -358.3786315917969, "loss": 0.6909, "rewards/accuracies": 0.4375, "rewards/chosen": -0.027103139087557793, "rewards/margins": -0.0378154031932354, "rewards/rejected": 0.010712266899645329, "step": 27 }, { "epoch": 0.059845044082286934, "grad_norm": 16.761100735068744, "learning_rate": 2.978723404255319e-07, "logits/chosen": 1.7289808988571167, "logits/rejected": 1.6542481184005737, "logps/chosen": -434.14825439453125, "logps/rejected": -440.4168395996094, "loss": 0.6909, "rewards/accuracies": 0.375, "rewards/chosen": -0.008729076012969017, "rewards/margins": -0.01662147231400013, "rewards/rejected": 0.007892394438385963, "step": 28 }, { "epoch": 0.061982367085225754, "grad_norm": 17.745794857594785, "learning_rate": 3.085106382978723e-07, "logits/chosen": 2.139738082885742, "logits/rejected": 1.9200177192687988, "logps/chosen": -337.0323181152344, "logps/rejected": -322.8047790527344, "loss": 0.6864, "rewards/accuracies": 0.375, "rewards/chosen": -0.047266557812690735, "rewards/margins": -0.006123709492385387, "rewards/rejected": -0.04114284738898277, "step": 29 }, { "epoch": 0.06411969008816458, "grad_norm": 17.58518177821876, "learning_rate": 3.1914893617021275e-07, "logits/chosen": 1.9211307764053345, "logits/rejected": 2.0256621837615967, "logps/chosen": -431.0246887207031, "logps/rejected": -466.2309875488281, "loss": 0.685, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03614044189453125, "rewards/margins": -0.02504887804389, "rewards/rejected": -0.0110915657132864, "step": 30 }, { "epoch": 0.06625701309110339, "grad_norm": 16.58761166063222, "learning_rate": 3.2978723404255315e-07, "logits/chosen": 2.0386786460876465, "logits/rejected": 2.175553798675537, "logps/chosen": -430.54254150390625, "logps/rejected": -434.08551025390625, "loss": 0.6851, "rewards/accuracies": 0.4375, "rewards/chosen": -0.023386145010590553, "rewards/margins": 0.005015873815864325, "rewards/rejected": -0.028402019292116165, "step": 31 }, { "epoch": 0.06839433609404222, "grad_norm": 19.112046530771966, "learning_rate": 3.404255319148936e-07, "logits/chosen": 1.7420753240585327, "logits/rejected": 1.7955291271209717, "logps/chosen": -357.76708984375, "logps/rejected": -405.6940002441406, "loss": 0.6898, "rewards/accuracies": 0.5625, "rewards/chosen": -0.013730335980653763, "rewards/margins": -0.009128142148256302, "rewards/rejected": -0.004602192901074886, "step": 32 }, { "epoch": 0.07053165909698103, "grad_norm": 15.87955801875356, "learning_rate": 3.5106382978723405e-07, "logits/chosen": 2.003401756286621, "logits/rejected": 1.9700802564620972, "logps/chosen": -298.1646423339844, "logps/rejected": -315.47320556640625, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": -0.049097828567028046, "rewards/margins": -0.01259935088455677, "rewards/rejected": -0.036498475819826126, "step": 33 }, { "epoch": 0.07266898209991986, "grad_norm": 17.271526767661655, "learning_rate": 3.617021276595745e-07, "logits/chosen": 1.674816608428955, "logits/rejected": 1.674377679824829, "logps/chosen": -262.3334655761719, "logps/rejected": -293.4676818847656, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": -0.05956123024225235, "rewards/margins": -0.0003459937870502472, "rewards/rejected": -0.0592152364552021, "step": 34 }, { "epoch": 0.07480630510285867, "grad_norm": 18.522791044208137, "learning_rate": 3.7234042553191484e-07, "logits/chosen": 2.0599467754364014, "logits/rejected": 1.985105037689209, "logps/chosen": -361.3081970214844, "logps/rejected": -335.7303466796875, "loss": 0.687, "rewards/accuracies": 0.4375, "rewards/chosen": -0.04977235943078995, "rewards/margins": 0.0031501527410000563, "rewards/rejected": -0.052922509610652924, "step": 35 }, { "epoch": 0.07694362810579748, "grad_norm": 16.058029701898118, "learning_rate": 3.829787234042553e-07, "logits/chosen": 2.2823598384857178, "logits/rejected": 2.245945930480957, "logps/chosen": -397.5772705078125, "logps/rejected": -387.1483459472656, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": -0.03420672193169594, "rewards/margins": 0.01202941033989191, "rewards/rejected": -0.04623613506555557, "step": 36 }, { "epoch": 0.07908095110873631, "grad_norm": 16.29611225935786, "learning_rate": 3.9361702127659574e-07, "logits/chosen": 1.3664941787719727, "logits/rejected": 1.407928466796875, "logps/chosen": -344.04736328125, "logps/rejected": -343.9777526855469, "loss": 0.687, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006882575340569019, "rewards/margins": 0.06044764071702957, "rewards/rejected": -0.05356506258249283, "step": 37 }, { "epoch": 0.08121827411167512, "grad_norm": 16.675563423812346, "learning_rate": 4.0425531914893614e-07, "logits/chosen": 1.9885263442993164, "logits/rejected": 2.049900531768799, "logps/chosen": -408.2422790527344, "logps/rejected": -459.45330810546875, "loss": 0.6822, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0520525798201561, "rewards/margins": 0.03479123115539551, "rewards/rejected": -0.0868438109755516, "step": 38 }, { "epoch": 0.08335559711461395, "grad_norm": 15.90555800464618, "learning_rate": 4.148936170212766e-07, "logits/chosen": 2.1601202487945557, "logits/rejected": 2.1942191123962402, "logps/chosen": -406.88995361328125, "logps/rejected": -424.9388122558594, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": -0.11415410041809082, "rewards/margins": -0.005976894870400429, "rewards/rejected": -0.10817721486091614, "step": 39 }, { "epoch": 0.08549292011755276, "grad_norm": 19.234842538679075, "learning_rate": 4.25531914893617e-07, "logits/chosen": 1.4985907077789307, "logits/rejected": 1.512323021888733, "logps/chosen": -295.0309143066406, "logps/rejected": -282.7944641113281, "loss": 0.6873, "rewards/accuracies": 0.4375, "rewards/chosen": -0.13608673214912415, "rewards/margins": -0.009758353233337402, "rewards/rejected": -0.12632837891578674, "step": 40 }, { "epoch": 0.08763024312049159, "grad_norm": 18.1644372771909, "learning_rate": 4.3617021276595744e-07, "logits/chosen": 2.3623297214508057, "logits/rejected": 2.420731544494629, "logps/chosen": -341.2931823730469, "logps/rejected": -369.0897216796875, "loss": 0.6791, "rewards/accuracies": 0.4375, "rewards/chosen": -0.09444458782672882, "rewards/margins": 0.05246647074818611, "rewards/rejected": -0.14691105484962463, "step": 41 }, { "epoch": 0.0897675661234304, "grad_norm": 17.564831976861814, "learning_rate": 4.4680851063829783e-07, "logits/chosen": 2.216895580291748, "logits/rejected": 2.060129165649414, "logps/chosen": -464.18829345703125, "logps/rejected": -397.678955078125, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11240722239017487, "rewards/margins": 0.01255890354514122, "rewards/rejected": -0.12496612221002579, "step": 42 }, { "epoch": 0.09190488912636922, "grad_norm": 17.498276873674637, "learning_rate": 4.574468085106383e-07, "logits/chosen": 1.8796459436416626, "logits/rejected": 1.8276338577270508, "logps/chosen": -378.45849609375, "logps/rejected": -342.8559265136719, "loss": 0.6774, "rewards/accuracies": 0.75, "rewards/chosen": -0.118812195956707, "rewards/margins": 0.056394241750240326, "rewards/rejected": -0.17520645260810852, "step": 43 }, { "epoch": 0.09404221212930804, "grad_norm": 16.939255095109502, "learning_rate": 4.6808510638297873e-07, "logits/chosen": 1.4337763786315918, "logits/rejected": 1.4281963109970093, "logps/chosen": -454.3580627441406, "logps/rejected": -430.2334899902344, "loss": 0.685, "rewards/accuracies": 0.4375, "rewards/chosen": -0.14937520027160645, "rewards/margins": -0.016789698973298073, "rewards/rejected": -0.13258551061153412, "step": 44 }, { "epoch": 0.09617953513224686, "grad_norm": 18.086769935035708, "learning_rate": 4.787234042553192e-07, "logits/chosen": 2.0096044540405273, "logits/rejected": 2.032930612564087, "logps/chosen": -356.0893249511719, "logps/rejected": -363.38531494140625, "loss": 0.6826, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11098766326904297, "rewards/margins": 0.031171085312962532, "rewards/rejected": -0.14215874671936035, "step": 45 }, { "epoch": 0.09831685813518568, "grad_norm": 16.60546022249817, "learning_rate": 4.893617021276595e-07, "logits/chosen": 2.2274961471557617, "logits/rejected": 2.2103538513183594, "logps/chosen": -450.325439453125, "logps/rejected": -453.9230041503906, "loss": 0.6907, "rewards/accuracies": 0.3125, "rewards/chosen": -0.20025189220905304, "rewards/margins": -0.030434010550379753, "rewards/rejected": -0.16981787979602814, "step": 46 }, { "epoch": 0.1004541811381245, "grad_norm": 17.64513453243039, "learning_rate": 5e-07, "logits/chosen": 1.394955039024353, "logits/rejected": 1.4171689748764038, "logps/chosen": -271.34051513671875, "logps/rejected": -280.0254211425781, "loss": 0.6873, "rewards/accuracies": 0.625, "rewards/chosen": -0.13984565436840057, "rewards/margins": 0.09467978030443192, "rewards/rejected": -0.23452544212341309, "step": 47 }, { "epoch": 0.10259150414106331, "grad_norm": 19.298872476175823, "learning_rate": 4.999930062653174e-07, "logits/chosen": 1.808046579360962, "logits/rejected": 1.7487595081329346, "logps/chosen": -357.9656982421875, "logps/rejected": -400.80303955078125, "loss": 0.687, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23653987050056458, "rewards/margins": -0.007849575020372868, "rewards/rejected": -0.22869029641151428, "step": 48 }, { "epoch": 0.10472882714400214, "grad_norm": 17.333339696928125, "learning_rate": 4.999720254525684e-07, "logits/chosen": 1.746777892112732, "logits/rejected": 1.7006160020828247, "logps/chosen": -421.19268798828125, "logps/rejected": -411.2952880859375, "loss": 0.6816, "rewards/accuracies": 0.5, "rewards/chosen": -0.20930640399456024, "rewards/margins": 0.018274232745170593, "rewards/rejected": -0.22758066654205322, "step": 49 }, { "epoch": 0.10686615014694095, "grad_norm": 17.729943763732685, "learning_rate": 4.999370587356267e-07, "logits/chosen": 1.9540718793869019, "logits/rejected": 1.9008055925369263, "logps/chosen": -294.99676513671875, "logps/rejected": -272.04925537109375, "loss": 0.6631, "rewards/accuracies": 0.8125, "rewards/chosen": -0.23157571256160736, "rewards/margins": 0.12571102380752563, "rewards/rejected": -0.3572867512702942, "step": 50 }, { "epoch": 0.10900347314987978, "grad_norm": 17.468777477407095, "learning_rate": 4.998881080708758e-07, "logits/chosen": 1.481137990951538, "logits/rejected": 1.3699580430984497, "logps/chosen": -289.7410888671875, "logps/rejected": -315.044921875, "loss": 0.683, "rewards/accuracies": 0.4375, "rewards/chosen": -0.20251740515232086, "rewards/margins": -0.0064203981310129166, "rewards/rejected": -0.1960970163345337, "step": 51 }, { "epoch": 0.11114079615281859, "grad_norm": 17.54256112367197, "learning_rate": 4.998251761970996e-07, "logits/chosen": 2.0398733615875244, "logits/rejected": 2.158165216445923, "logps/chosen": -387.5774841308594, "logps/rejected": -413.7792053222656, "loss": 0.6729, "rewards/accuracies": 0.75, "rewards/chosen": -0.21002252399921417, "rewards/margins": 0.059471115469932556, "rewards/rejected": -0.26949363946914673, "step": 52 }, { "epoch": 0.11327811915575742, "grad_norm": 18.17193642743448, "learning_rate": 4.997482666353286e-07, "logits/chosen": 1.551992654800415, "logits/rejected": 1.6585577726364136, "logps/chosen": -362.41253662109375, "logps/rejected": -375.1070556640625, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": -0.28190845251083374, "rewards/margins": 0.027381589636206627, "rewards/rejected": -0.3092900216579437, "step": 53 }, { "epoch": 0.11541544215869623, "grad_norm": 19.118405188522143, "learning_rate": 4.996573836886434e-07, "logits/chosen": 2.13954496383667, "logits/rejected": 2.0068345069885254, "logps/chosen": -494.3727722167969, "logps/rejected": -441.12664794921875, "loss": 0.6871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28698673844337463, "rewards/margins": 0.03327002376317978, "rewards/rejected": -0.3202567994594574, "step": 54 }, { "epoch": 0.11755276516163506, "grad_norm": 18.80060244793496, "learning_rate": 4.995525324419337e-07, "logits/chosen": 2.386868715286255, "logits/rejected": 2.2814548015594482, "logps/chosen": -470.53875732421875, "logps/rejected": -430.71636962890625, "loss": 0.6747, "rewards/accuracies": 0.875, "rewards/chosen": -0.23906640708446503, "rewards/margins": 0.13759444653987885, "rewards/rejected": -0.3766608238220215, "step": 55 }, { "epoch": 0.11969008816457387, "grad_norm": 17.20090684503435, "learning_rate": 4.99433718761614e-07, "logits/chosen": 1.7399883270263672, "logits/rejected": 1.6878349781036377, "logps/chosen": -427.767333984375, "logps/rejected": -383.4020690917969, "loss": 0.6745, "rewards/accuracies": 0.5, "rewards/chosen": -0.28416523337364197, "rewards/margins": 0.028113719075918198, "rewards/rejected": -0.31227895617485046, "step": 56 }, { "epoch": 0.1218274111675127, "grad_norm": 17.189006839270423, "learning_rate": 4.993009492952949e-07, "logits/chosen": 2.0051536560058594, "logits/rejected": 1.9158482551574707, "logps/chosen": -328.1854553222656, "logps/rejected": -342.3035583496094, "loss": 0.6655, "rewards/accuracies": 0.625, "rewards/chosen": -0.3610785901546478, "rewards/margins": 0.006724976003170013, "rewards/rejected": -0.36780354380607605, "step": 57 }, { "epoch": 0.12396473417045151, "grad_norm": 19.604523214040483, "learning_rate": 4.991542314714122e-07, "logits/chosen": 1.3874634504318237, "logits/rejected": 1.4164166450500488, "logps/chosen": -304.51922607421875, "logps/rejected": -336.23944091796875, "loss": 0.671, "rewards/accuracies": 0.6875, "rewards/chosen": -0.30155548453330994, "rewards/margins": 0.09271588921546936, "rewards/rejected": -0.3942714035511017, "step": 58 }, { "epoch": 0.12610205717339032, "grad_norm": 16.687200107986385, "learning_rate": 4.989935734988097e-07, "logits/chosen": 1.4796478748321533, "logits/rejected": 1.3376965522766113, "logps/chosen": -413.5907897949219, "logps/rejected": -392.4472961425781, "loss": 0.6652, "rewards/accuracies": 0.625, "rewards/chosen": -0.24848215281963348, "rewards/margins": 0.027541160583496094, "rewards/rejected": -0.27602332830429077, "step": 59 }, { "epoch": 0.12823938017632916, "grad_norm": 18.754855497046993, "learning_rate": 4.988189843662815e-07, "logits/chosen": 1.5387346744537354, "logits/rejected": 1.6132254600524902, "logps/chosen": -306.87213134765625, "logps/rejected": -335.58721923828125, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": -0.26606473326683044, "rewards/margins": 0.02971341460943222, "rewards/rejected": -0.29577815532684326, "step": 60 }, { "epoch": 0.13037670317926797, "grad_norm": 18.463206919216464, "learning_rate": 4.986304738420683e-07, "logits/chosen": 1.944252848625183, "logits/rejected": 1.896322250366211, "logps/chosen": -341.7672424316406, "logps/rejected": -363.7576904296875, "loss": 0.6524, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2655528485774994, "rewards/margins": 0.11380692571401596, "rewards/rejected": -0.37935978174209595, "step": 61 }, { "epoch": 0.13251402618220678, "grad_norm": 22.5265817847443, "learning_rate": 4.984280524733107e-07, "logits/chosen": 1.4530837535858154, "logits/rejected": 1.3399604558944702, "logps/chosen": -355.2033386230469, "logps/rejected": -351.52618408203125, "loss": 0.6853, "rewards/accuracies": 0.625, "rewards/chosen": -0.4064197242259979, "rewards/margins": 0.10837845504283905, "rewards/rejected": -0.5147981643676758, "step": 62 }, { "epoch": 0.1346513491851456, "grad_norm": 18.564495656237806, "learning_rate": 4.982117315854593e-07, "logits/chosen": 2.016234874725342, "logits/rejected": 1.9874932765960693, "logps/chosen": -416.2107849121094, "logps/rejected": -402.76470947265625, "loss": 0.6711, "rewards/accuracies": 0.5, "rewards/chosen": -0.4489130973815918, "rewards/margins": 0.04024820774793625, "rewards/rejected": -0.48916131258010864, "step": 63 }, { "epoch": 0.13678867218808444, "grad_norm": 18.562107659954208, "learning_rate": 4.979815232816416e-07, "logits/chosen": 2.208319664001465, "logits/rejected": 2.190627336502075, "logps/chosen": -303.19500732421875, "logps/rejected": -326.4004821777344, "loss": 0.6603, "rewards/accuracies": 0.625, "rewards/chosen": -0.2610405683517456, "rewards/margins": 0.05577556788921356, "rewards/rejected": -0.31681615114212036, "step": 64 }, { "epoch": 0.13892599519102325, "grad_norm": 20.69361614885178, "learning_rate": 4.977374404419837e-07, "logits/chosen": 1.9246108531951904, "logits/rejected": 1.8763636350631714, "logps/chosen": -358.9330139160156, "logps/rejected": -346.0728759765625, "loss": 0.6662, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22426418960094452, "rewards/margins": 0.1284552812576294, "rewards/rejected": -0.3527194857597351, "step": 65 }, { "epoch": 0.14106331819396206, "grad_norm": 17.2698240144223, "learning_rate": 4.974794967228907e-07, "logits/chosen": 1.483390212059021, "logits/rejected": 1.375150203704834, "logps/chosen": -315.0908508300781, "logps/rejected": -309.7676696777344, "loss": 0.6689, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3849695920944214, "rewards/margins": 0.13179758191108704, "rewards/rejected": -0.516767144203186, "step": 66 }, { "epoch": 0.14320064119690087, "grad_norm": 18.912694680875394, "learning_rate": 4.972077065562821e-07, "logits/chosen": 1.6654436588287354, "logits/rejected": 1.6484836339950562, "logps/chosen": -415.6891174316406, "logps/rejected": -483.3581848144531, "loss": 0.6528, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3479224145412445, "rewards/margins": 0.30767643451690674, "rewards/rejected": -0.6555988192558289, "step": 67 }, { "epoch": 0.14533796419983971, "grad_norm": 18.224972403748673, "learning_rate": 4.969220851487844e-07, "logits/chosen": 1.7228754758834839, "logits/rejected": 1.616809606552124, "logps/chosen": -387.8039245605469, "logps/rejected": -381.3202209472656, "loss": 0.6676, "rewards/accuracies": 0.5625, "rewards/chosen": -0.41254401206970215, "rewards/margins": 0.05308759585022926, "rewards/rejected": -0.4656316041946411, "step": 68 }, { "epoch": 0.14747528720277853, "grad_norm": 16.86306787165477, "learning_rate": 4.966226484808803e-07, "logits/chosen": 1.2384394407272339, "logits/rejected": 1.2866795063018799, "logps/chosen": -314.2529602050781, "logps/rejected": -329.0174865722656, "loss": 0.6639, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4110863208770752, "rewards/margins": 0.2178199589252472, "rewards/rejected": -0.6289063096046448, "step": 69 }, { "epoch": 0.14961261020571734, "grad_norm": 17.195577161087968, "learning_rate": 4.963094133060148e-07, "logits/chosen": 2.0873541831970215, "logits/rejected": 1.998971700668335, "logps/chosen": -389.9579162597656, "logps/rejected": -384.6530456542969, "loss": 0.6708, "rewards/accuracies": 0.4375, "rewards/chosen": -0.621479332447052, "rewards/margins": 0.08416196703910828, "rewards/rejected": -0.7056412696838379, "step": 70 }, { "epoch": 0.15174993320865615, "grad_norm": 18.106861789518128, "learning_rate": 4.959823971496574e-07, "logits/chosen": 1.5237023830413818, "logits/rejected": 1.6149706840515137, "logps/chosen": -400.80560302734375, "logps/rejected": -411.4190979003906, "loss": 0.6632, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38402536511421204, "rewards/margins": 0.19899356365203857, "rewards/rejected": -0.583018958568573, "step": 71 }, { "epoch": 0.15388725621159496, "grad_norm": 18.8987848353103, "learning_rate": 4.956416183083221e-07, "logits/chosen": 2.0366392135620117, "logits/rejected": 2.1463069915771484, "logps/chosen": -336.7109375, "logps/rejected": -391.3525695800781, "loss": 0.6537, "rewards/accuracies": 0.5, "rewards/chosen": -0.3251726031303406, "rewards/margins": 0.0683506578207016, "rewards/rejected": -0.39352327585220337, "step": 72 }, { "epoch": 0.1560245792145338, "grad_norm": 18.02342571848944, "learning_rate": 4.952870958485431e-07, "logits/chosen": 2.0702877044677734, "logits/rejected": 1.999230146408081, "logps/chosen": -474.9482727050781, "logps/rejected": -470.8036804199219, "loss": 0.6621, "rewards/accuracies": 0.625, "rewards/chosen": -0.5637938976287842, "rewards/margins": 0.12167691439390182, "rewards/rejected": -0.6854707598686218, "step": 73 }, { "epoch": 0.15816190221747262, "grad_norm": 17.122856393777678, "learning_rate": 4.949188496058089e-07, "logits/chosen": 2.1900906562805176, "logits/rejected": 2.1530239582061768, "logps/chosen": -365.1473388671875, "logps/rejected": -355.15826416015625, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": -0.4727271795272827, "rewards/margins": 0.044449158012866974, "rewards/rejected": -0.5171763300895691, "step": 74 }, { "epoch": 0.16029922522041143, "grad_norm": 18.705770867692838, "learning_rate": 4.945369001834514e-07, "logits/chosen": 1.970231533050537, "logits/rejected": 1.8822038173675537, "logps/chosen": -357.3649597167969, "logps/rejected": -340.84735107421875, "loss": 0.6279, "rewards/accuracies": 0.75, "rewards/chosen": -0.39545726776123047, "rewards/margins": 0.2576879858970642, "rewards/rejected": -0.6531451940536499, "step": 75 }, { "epoch": 0.16243654822335024, "grad_norm": 18.463959365090876, "learning_rate": 4.941412689514941e-07, "logits/chosen": 1.9429740905761719, "logits/rejected": 2.05521559715271, "logps/chosen": -399.330078125, "logps/rejected": -448.8360290527344, "loss": 0.6457, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6837693452835083, "rewards/margins": 0.19340023398399353, "rewards/rejected": -0.8771694898605347, "step": 76 }, { "epoch": 0.16457387122628908, "grad_norm": 17.748894790019765, "learning_rate": 4.937319780454559e-07, "logits/chosen": 1.7817144393920898, "logits/rejected": 1.8134974241256714, "logps/chosen": -360.9486999511719, "logps/rejected": -348.0911560058594, "loss": 0.6438, "rewards/accuracies": 0.6875, "rewards/chosen": -0.536231517791748, "rewards/margins": 0.07527366280555725, "rewards/rejected": -0.6115051507949829, "step": 77 }, { "epoch": 0.1667111942292279, "grad_norm": 17.16271899190261, "learning_rate": 4.933090503651128e-07, "logits/chosen": 1.7767925262451172, "logits/rejected": 1.779471755027771, "logps/chosen": -400.331787109375, "logps/rejected": -395.0623779296875, "loss": 0.624, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6241444945335388, "rewards/margins": 0.17532269656658173, "rewards/rejected": -0.7994672060012817, "step": 78 }, { "epoch": 0.1688485172321667, "grad_norm": 18.630940755974848, "learning_rate": 4.928725095732168e-07, "logits/chosen": 1.643775224685669, "logits/rejected": 1.786955714225769, "logps/chosen": -355.02508544921875, "logps/rejected": -399.382568359375, "loss": 0.6417, "rewards/accuracies": 0.5, "rewards/chosen": -0.6717734336853027, "rewards/margins": 0.11283117532730103, "rewards/rejected": -0.7846046090126038, "step": 79 }, { "epoch": 0.17098584023510552, "grad_norm": 18.925906729991304, "learning_rate": 4.924223800941717e-07, "logits/chosen": 2.1771671772003174, "logits/rejected": 2.2715742588043213, "logps/chosen": -449.5519714355469, "logps/rejected": -464.3706970214844, "loss": 0.654, "rewards/accuracies": 0.75, "rewards/chosen": -0.555615246295929, "rewards/margins": 0.14044499397277832, "rewards/rejected": -0.6960601806640625, "step": 80 }, { "epoch": 0.17312316323804436, "grad_norm": 18.40745960551381, "learning_rate": 4.919586871126667e-07, "logits/chosen": 1.9518824815750122, "logits/rejected": 2.082866668701172, "logps/chosen": -403.2046203613281, "logps/rejected": -436.531494140625, "loss": 0.6453, "rewards/accuracies": 0.75, "rewards/chosen": -0.6706172227859497, "rewards/margins": 0.15584391355514526, "rewards/rejected": -0.8264610767364502, "step": 81 }, { "epoch": 0.17526048624098317, "grad_norm": 17.801429215955096, "learning_rate": 4.91481456572267e-07, "logits/chosen": 1.9086239337921143, "logits/rejected": 1.8316439390182495, "logps/chosen": -399.4256591796875, "logps/rejected": -387.94171142578125, "loss": 0.6434, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8419517874717712, "rewards/margins": 0.10202351212501526, "rewards/rejected": -0.9439752101898193, "step": 82 }, { "epoch": 0.17739780924392198, "grad_norm": 18.421125186391436, "learning_rate": 4.909907151739633e-07, "logits/chosen": 2.0825533866882324, "logits/rejected": 2.0777454376220703, "logps/chosen": -358.8985290527344, "logps/rejected": -335.0311279296875, "loss": 0.6416, "rewards/accuracies": 0.6875, "rewards/chosen": -0.886951208114624, "rewards/margins": 0.06833362579345703, "rewards/rejected": -0.9552848935127258, "step": 83 }, { "epoch": 0.1795351322468608, "grad_norm": 17.625368142356823, "learning_rate": 4.904864903746765e-07, "logits/chosen": 2.0355796813964844, "logits/rejected": 2.1017327308654785, "logps/chosen": -406.8382568359375, "logps/rejected": -409.212646484375, "loss": 0.6074, "rewards/accuracies": 0.75, "rewards/chosen": -0.6774367094039917, "rewards/margins": 0.4327796995639801, "rewards/rejected": -1.1102163791656494, "step": 84 }, { "epoch": 0.18167245524979964, "grad_norm": 17.846014275986537, "learning_rate": 4.899688103857222e-07, "logits/chosen": 1.704572081565857, "logits/rejected": 1.714634895324707, "logps/chosen": -386.1976318359375, "logps/rejected": -388.4258728027344, "loss": 0.6381, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9329708814620972, "rewards/margins": 0.04441451653838158, "rewards/rejected": -0.977385401725769, "step": 85 }, { "epoch": 0.18380977825273845, "grad_norm": 17.526539174440522, "learning_rate": 4.894377041712326e-07, "logits/chosen": 2.334737777709961, "logits/rejected": 2.3338565826416016, "logps/chosen": -421.5735778808594, "logps/rejected": -438.7161865234375, "loss": 0.6296, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7888815402984619, "rewards/margins": 0.31321045756340027, "rewards/rejected": -1.1020920276641846, "step": 86 }, { "epoch": 0.18594710125567726, "grad_norm": 19.21020565741226, "learning_rate": 4.888932014465352e-07, "logits/chosen": 1.5965489149093628, "logits/rejected": 1.6148681640625, "logps/chosen": -372.6695556640625, "logps/rejected": -392.4805908203125, "loss": 0.6242, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7540562152862549, "rewards/margins": 0.266312837600708, "rewards/rejected": -1.020369052886963, "step": 87 }, { "epoch": 0.18808442425861607, "grad_norm": 18.29361445908713, "learning_rate": 4.883353326764906e-07, "logits/chosen": 1.5290958881378174, "logits/rejected": 1.6073287725448608, "logps/chosen": -397.7436828613281, "logps/rejected": -399.38385009765625, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": -0.8137935996055603, "rewards/margins": 0.2867809236049652, "rewards/rejected": -1.1005744934082031, "step": 88 }, { "epoch": 0.1902217472615549, "grad_norm": 18.614979910689225, "learning_rate": 4.877641290737883e-07, "logits/chosen": 1.936212420463562, "logits/rejected": 1.8444154262542725, "logps/chosen": -424.4599609375, "logps/rejected": -479.34808349609375, "loss": 0.6174, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9368820190429688, "rewards/margins": 0.2523537874221802, "rewards/rejected": -1.1892356872558594, "step": 89 }, { "epoch": 0.19235907026449373, "grad_norm": 18.218281107090114, "learning_rate": 4.871796225971999e-07, "logits/chosen": 1.7540048360824585, "logits/rejected": 1.8083505630493164, "logps/chosen": -352.07464599609375, "logps/rejected": -390.17193603515625, "loss": 0.6386, "rewards/accuracies": 0.5, "rewards/chosen": -0.8687042593955994, "rewards/margins": 0.04543251544237137, "rewards/rejected": -0.9141367077827454, "step": 90 }, { "epoch": 0.19449639326743254, "grad_norm": 17.42744405233579, "learning_rate": 4.86581845949791e-07, "logits/chosen": 1.7399728298187256, "logits/rejected": 1.881831169128418, "logps/chosen": -386.78070068359375, "logps/rejected": -394.3675231933594, "loss": 0.6278, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8296229839324951, "rewards/margins": 0.33209383487701416, "rewards/rejected": -1.1617166996002197, "step": 91 }, { "epoch": 0.19663371627037135, "grad_norm": 18.157993962219635, "learning_rate": 4.859708325770919e-07, "logits/chosen": 1.8655225038528442, "logits/rejected": 1.7539880275726318, "logps/chosen": -425.1163635253906, "logps/rejected": -450.327392578125, "loss": 0.6352, "rewards/accuracies": 0.75, "rewards/chosen": -0.9147007465362549, "rewards/margins": 0.19170109927654266, "rewards/rejected": -1.1064016819000244, "step": 92 }, { "epoch": 0.1987710392733102, "grad_norm": 18.59822363701581, "learning_rate": 4.853466166652258e-07, "logits/chosen": 1.9111980199813843, "logits/rejected": 2.00334095954895, "logps/chosen": -383.8531494140625, "logps/rejected": -441.35626220703125, "loss": 0.6473, "rewards/accuracies": 0.5, "rewards/chosen": -0.9924455881118774, "rewards/margins": 0.08671611547470093, "rewards/rejected": -1.0791617631912231, "step": 93 }, { "epoch": 0.200908362276249, "grad_norm": 18.689539426987327, "learning_rate": 4.847092331389964e-07, "logits/chosen": 1.7213155031204224, "logits/rejected": 1.7230504751205444, "logps/chosen": -328.0395202636719, "logps/rejected": -307.9144592285156, "loss": 0.6289, "rewards/accuracies": 0.75, "rewards/chosen": -0.7495980262756348, "rewards/margins": 0.31506437063217163, "rewards/rejected": -1.0646624565124512, "step": 94 }, { "epoch": 0.20304568527918782, "grad_norm": 18.329788390189, "learning_rate": 4.840587176599343e-07, "logits/chosen": 1.8517365455627441, "logits/rejected": 1.867042064666748, "logps/chosen": -284.2342834472656, "logps/rejected": -316.3347473144531, "loss": 0.6473, "rewards/accuracies": 0.5, "rewards/chosen": -1.1318445205688477, "rewards/margins": 0.22313019633293152, "rewards/rejected": -1.3549748659133911, "step": 95 }, { "epoch": 0.20518300828212663, "grad_norm": 20.003558783675533, "learning_rate": 4.833951066243004e-07, "logits/chosen": 1.9658561944961548, "logits/rejected": 2.0219621658325195, "logps/chosen": -385.1724853515625, "logps/rejected": -374.846923828125, "loss": 0.6522, "rewards/accuracies": 0.625, "rewards/chosen": -1.280335783958435, "rewards/margins": 0.1471562385559082, "rewards/rejected": -1.4274920225143433, "step": 96 }, { "epoch": 0.20732033128506547, "grad_norm": 19.439956275385367, "learning_rate": 4.82718437161051e-07, "logits/chosen": 1.449579119682312, "logits/rejected": 1.475942850112915, "logps/chosen": -424.9071044921875, "logps/rejected": -432.41900634765625, "loss": 0.6182, "rewards/accuracies": 0.625, "rewards/chosen": -1.032875657081604, "rewards/margins": 0.14200714230537415, "rewards/rejected": -1.1748827695846558, "step": 97 }, { "epoch": 0.20945765428800428, "grad_norm": 20.720426275007746, "learning_rate": 4.820287471297597e-07, "logits/chosen": 1.2382292747497559, "logits/rejected": 1.3223166465759277, "logps/chosen": -341.36773681640625, "logps/rejected": -373.3889465332031, "loss": 0.6139, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9406616687774658, "rewards/margins": 0.30484455823898315, "rewards/rejected": -1.2455062866210938, "step": 98 }, { "epoch": 0.2115949772909431, "grad_norm": 19.167777501448665, "learning_rate": 4.813260751184992e-07, "logits/chosen": 1.4242631196975708, "logits/rejected": 1.3070321083068848, "logps/chosen": -382.6838073730469, "logps/rejected": -369.42333984375, "loss": 0.638, "rewards/accuracies": 0.5, "rewards/chosen": -1.2732874155044556, "rewards/margins": 0.12794676423072815, "rewards/rejected": -1.4012342691421509, "step": 99 }, { "epoch": 0.2137323002938819, "grad_norm": 19.082592371106056, "learning_rate": 4.806104604416823e-07, "logits/chosen": 1.8438955545425415, "logits/rejected": 1.8277944326400757, "logps/chosen": -375.05303955078125, "logps/rejected": -428.58056640625, "loss": 0.6082, "rewards/accuracies": 0.5625, "rewards/chosen": -1.259270429611206, "rewards/margins": 0.28902101516723633, "rewards/rejected": -1.548291563987732, "step": 100 }, { "epoch": 0.21586962329682075, "grad_norm": 19.40383100777513, "learning_rate": 4.798819431378626e-07, "logits/chosen": 1.5675817728042603, "logits/rejected": 1.487393856048584, "logps/chosen": -274.16741943359375, "logps/rejected": -294.5373840332031, "loss": 0.5981, "rewards/accuracies": 0.5, "rewards/chosen": -1.184496521949768, "rewards/margins": 0.1605721116065979, "rewards/rejected": -1.3450688123703003, "step": 101 }, { "epoch": 0.21800694629975956, "grad_norm": 19.035365387486454, "learning_rate": 4.79140563967494e-07, "logits/chosen": 1.378381371498108, "logits/rejected": 1.5444639921188354, "logps/chosen": -342.8183288574219, "logps/rejected": -344.89239501953125, "loss": 0.6363, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0111204385757446, "rewards/margins": 0.26567333936691284, "rewards/rejected": -1.2767938375473022, "step": 102 }, { "epoch": 0.22014426930269837, "grad_norm": 22.460424893993004, "learning_rate": 4.783863644106502e-07, "logits/chosen": 1.3568328619003296, "logits/rejected": 1.3307393789291382, "logps/chosen": -235.84120178222656, "logps/rejected": -239.3479766845703, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": -0.9905429482460022, "rewards/margins": -0.051335543394088745, "rewards/rejected": -0.9392074346542358, "step": 103 }, { "epoch": 0.22228159230563718, "grad_norm": 20.610208302459373, "learning_rate": 4.776193866647039e-07, "logits/chosen": 1.1602747440338135, "logits/rejected": 1.4320569038391113, "logps/chosen": -399.7038269042969, "logps/rejected": -418.18560791015625, "loss": 0.6695, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4170256853103638, "rewards/margins": 0.1951914280653, "rewards/rejected": -1.6122169494628906, "step": 104 }, { "epoch": 0.224418915308576, "grad_norm": 21.316293798129664, "learning_rate": 4.768396736419662e-07, "logits/chosen": 1.2325228452682495, "logits/rejected": 1.2283470630645752, "logps/chosen": -340.61248779296875, "logps/rejected": -354.34228515625, "loss": 0.6416, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9992186427116394, "rewards/margins": 0.22192566096782684, "rewards/rejected": -1.2211443185806274, "step": 105 }, { "epoch": 0.22655623831151483, "grad_norm": 18.691286982248872, "learning_rate": 4.7604726896728496e-07, "logits/chosen": 2.0024757385253906, "logits/rejected": 2.002178430557251, "logps/chosen": -435.4734802246094, "logps/rejected": -469.6261291503906, "loss": 0.5673, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7990100979804993, "rewards/margins": 0.48767000436782837, "rewards/rejected": -1.2866802215576172, "step": 106 }, { "epoch": 0.22869356131445365, "grad_norm": 17.650688796730755, "learning_rate": 4.752422169756047e-07, "logits/chosen": 2.159564256668091, "logits/rejected": 2.232543468475342, "logps/chosen": -328.35565185546875, "logps/rejected": -342.68707275390625, "loss": 0.6139, "rewards/accuracies": 0.75, "rewards/chosen": -0.8560774326324463, "rewards/margins": 0.4257628917694092, "rewards/rejected": -1.281840443611145, "step": 107 }, { "epoch": 0.23083088431739246, "grad_norm": 19.554037566299336, "learning_rate": 4.744245627094858e-07, "logits/chosen": 1.77793288230896, "logits/rejected": 1.717706322669983, "logps/chosen": -468.2958679199219, "logps/rejected": -462.658935546875, "loss": 0.6413, "rewards/accuracies": 0.75, "rewards/chosen": -0.9724741578102112, "rewards/margins": 0.3415077030658722, "rewards/rejected": -1.3139818906784058, "step": 108 }, { "epoch": 0.23296820732033127, "grad_norm": 20.50700794956668, "learning_rate": 4.735943519165842e-07, "logits/chosen": 1.9187538623809814, "logits/rejected": 2.0614094734191895, "logps/chosen": -454.2608642578125, "logps/rejected": -481.36431884765625, "loss": 0.6428, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1436271667480469, "rewards/margins": 0.13930538296699524, "rewards/rejected": -1.2829326391220093, "step": 109 }, { "epoch": 0.2351055303232701, "grad_norm": 18.272975924577455, "learning_rate": 4.7275163104709194e-07, "logits/chosen": 1.9010305404663086, "logits/rejected": 1.7875394821166992, "logps/chosen": -367.1365966796875, "logps/rejected": -367.42431640625, "loss": 0.6146, "rewards/accuracies": 0.5, "rewards/chosen": -0.9120699763298035, "rewards/margins": 0.2043510228395462, "rewards/rejected": -1.116420865058899, "step": 110 }, { "epoch": 0.23724285332620892, "grad_norm": 20.295493464767308, "learning_rate": 4.718964472511385e-07, "logits/chosen": 1.4215465784072876, "logits/rejected": 1.4408009052276611, "logps/chosen": -361.8034973144531, "logps/rejected": -382.62408447265625, "loss": 0.6292, "rewards/accuracies": 0.75, "rewards/chosen": -0.853609561920166, "rewards/margins": 0.46648335456848145, "rewards/rejected": -1.3200929164886475, "step": 111 }, { "epoch": 0.23938017632914774, "grad_norm": 18.121395683486867, "learning_rate": 4.710288483761524e-07, "logits/chosen": 1.0649147033691406, "logits/rejected": 1.071373462677002, "logps/chosen": -352.0085144042969, "logps/rejected": -351.5292053222656, "loss": 0.614, "rewards/accuracies": 0.5625, "rewards/chosen": -1.031156301498413, "rewards/margins": 0.21290811896324158, "rewards/rejected": -1.244064450263977, "step": 112 }, { "epoch": 0.24151749933208655, "grad_norm": 20.254445315081554, "learning_rate": 4.7014888296418447e-07, "logits/chosen": 1.5161610841751099, "logits/rejected": 1.550595760345459, "logps/chosen": -460.7918701171875, "logps/rejected": -468.8692321777344, "loss": 0.6563, "rewards/accuracies": 0.5, "rewards/chosen": -1.1960896253585815, "rewards/margins": 0.059063613414764404, "rewards/rejected": -1.2551532983779907, "step": 113 }, { "epoch": 0.2436548223350254, "grad_norm": 18.896077047246234, "learning_rate": 4.692566002491916e-07, "logits/chosen": 1.7081586122512817, "logits/rejected": 1.6641770601272583, "logps/chosen": -387.3111877441406, "logps/rejected": -368.2967224121094, "loss": 0.621, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8478341102600098, "rewards/margins": -0.09017517417669296, "rewards/rejected": -0.7576589584350586, "step": 114 }, { "epoch": 0.2457921453379642, "grad_norm": 21.30499019299629, "learning_rate": 4.683520501542824e-07, "logits/chosen": 1.545853614807129, "logits/rejected": 1.6058801412582397, "logps/chosen": -452.3874206542969, "logps/rejected": -474.2342834472656, "loss": 0.6145, "rewards/accuracies": 0.5, "rewards/chosen": -1.0228480100631714, "rewards/margins": -0.04428707808256149, "rewards/rejected": -0.9785609245300293, "step": 115 }, { "epoch": 0.24792946834090301, "grad_norm": 21.86187899194981, "learning_rate": 4.6743528328892384e-07, "logits/chosen": 1.2116038799285889, "logits/rejected": 1.1613038778305054, "logps/chosen": -325.0521240234375, "logps/rejected": -308.12518310546875, "loss": 0.6305, "rewards/accuracies": 0.4375, "rewards/chosen": -1.3946088552474976, "rewards/margins": -0.029741812497377396, "rewards/rejected": -1.3648672103881836, "step": 116 }, { "epoch": 0.25006679134384185, "grad_norm": 18.91701016098703, "learning_rate": 4.6650635094610966e-07, "logits/chosen": 1.4677801132202148, "logits/rejected": 1.579450249671936, "logps/chosen": -344.372314453125, "logps/rejected": -327.50225830078125, "loss": 0.5949, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8053675293922424, "rewards/margins": 0.2620340585708618, "rewards/rejected": -1.067401647567749, "step": 117 }, { "epoch": 0.25220411434678064, "grad_norm": 20.012466373331762, "learning_rate": 4.655653050994906e-07, "logits/chosen": 1.6915593147277832, "logits/rejected": 1.726331114768982, "logps/chosen": -283.65875244140625, "logps/rejected": -331.01483154296875, "loss": 0.651, "rewards/accuracies": 0.5, "rewards/chosen": -0.5659047365188599, "rewards/margins": 0.2718469202518463, "rewards/rejected": -0.8377516269683838, "step": 118 }, { "epoch": 0.2543414373497195, "grad_norm": 17.44337077106633, "learning_rate": 4.646121984004665e-07, "logits/chosen": 1.8943997621536255, "logits/rejected": 1.9792275428771973, "logps/chosen": -344.978271484375, "logps/rejected": -379.34527587890625, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": -0.8211143612861633, "rewards/margins": 0.16521628201007843, "rewards/rejected": -0.9863307476043701, "step": 119 }, { "epoch": 0.2564787603526583, "grad_norm": 21.735001575170486, "learning_rate": 4.636470841752404e-07, "logits/chosen": 1.5497221946716309, "logits/rejected": 1.5381598472595215, "logps/chosen": -470.0634460449219, "logps/rejected": -428.8343505859375, "loss": 0.6117, "rewards/accuracies": 0.625, "rewards/chosen": -0.9829262495040894, "rewards/margins": 0.3946997821331024, "rewards/rejected": -1.3776261806488037, "step": 120 }, { "epoch": 0.2586160833555971, "grad_norm": 20.694912637280392, "learning_rate": 4.626700164218349e-07, "logits/chosen": 1.6372838020324707, "logits/rejected": 1.5779719352722168, "logps/chosen": -418.0284423828125, "logps/rejected": -402.91796875, "loss": 0.6965, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0081559419631958, "rewards/margins": -0.04968453198671341, "rewards/rejected": -0.9584714770317078, "step": 121 }, { "epoch": 0.26075340635853594, "grad_norm": 17.553458790755446, "learning_rate": 4.6168104980707103e-07, "logits/chosen": 2.0725860595703125, "logits/rejected": 2.064181089401245, "logps/chosen": -377.4428405761719, "logps/rejected": -404.63714599609375, "loss": 0.6212, "rewards/accuracies": 0.5, "rewards/chosen": -0.6986384987831116, "rewards/margins": 0.06995135545730591, "rewards/rejected": -0.7685898542404175, "step": 122 }, { "epoch": 0.26289072936147473, "grad_norm": 20.247623027581653, "learning_rate": 4.606802396635098e-07, "logits/chosen": 1.9462324380874634, "logits/rejected": 1.8845677375793457, "logps/chosen": -366.8088684082031, "logps/rejected": -356.9654846191406, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": -0.7235208749771118, "rewards/margins": 0.027630850672721863, "rewards/rejected": -0.7511517405509949, "step": 123 }, { "epoch": 0.26502805236441357, "grad_norm": 19.849959723892198, "learning_rate": 4.59667641986356e-07, "logits/chosen": 1.8744176626205444, "logits/rejected": 2.0228824615478516, "logps/chosen": -398.7731018066406, "logps/rejected": -424.6177978515625, "loss": 0.6709, "rewards/accuracies": 0.8125, "rewards/chosen": -0.693671464920044, "rewards/margins": 0.2790328860282898, "rewards/rejected": -0.972704291343689, "step": 124 }, { "epoch": 0.2671653753673524, "grad_norm": 17.32173219755259, "learning_rate": 4.5864331343032565e-07, "logits/chosen": 1.621677279472351, "logits/rejected": 1.6809951066970825, "logps/chosen": -452.27984619140625, "logps/rejected": -462.4055480957031, "loss": 0.6348, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9525185823440552, "rewards/margins": 0.3178427219390869, "rewards/rejected": -1.270361304283142, "step": 125 }, { "epoch": 0.2693026983702912, "grad_norm": 16.88106478871165, "learning_rate": 4.576073113064759e-07, "logits/chosen": 2.357682943344116, "logits/rejected": 2.2975120544433594, "logps/chosen": -279.5217590332031, "logps/rejected": -295.60540771484375, "loss": 0.6292, "rewards/accuracies": 0.6875, "rewards/chosen": -0.47640547156333923, "rewards/margins": 0.4183332920074463, "rewards/rejected": -0.8947387933731079, "step": 126 }, { "epoch": 0.27144002137323003, "grad_norm": 18.58434290114408, "learning_rate": 4.565596935789987e-07, "logits/chosen": 1.576242446899414, "logits/rejected": 1.6992592811584473, "logps/chosen": -439.7972106933594, "logps/rejected": -498.26849365234375, "loss": 0.6055, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9947447776794434, "rewards/margins": 0.28498226404190063, "rewards/rejected": -1.2797271013259888, "step": 127 }, { "epoch": 0.2735773443761689, "grad_norm": 21.02105243570409, "learning_rate": 4.555005188619775e-07, "logits/chosen": 2.214823007583618, "logits/rejected": 2.240490674972534, "logps/chosen": -381.3008728027344, "logps/rejected": -405.7254638671875, "loss": 0.6539, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9661093354225159, "rewards/margins": 0.04444655776023865, "rewards/rejected": -1.0105559825897217, "step": 128 }, { "epoch": 0.27571466737910766, "grad_norm": 17.756752976550278, "learning_rate": 4.5442984641610784e-07, "logits/chosen": 1.8476028442382812, "logits/rejected": 1.8882935047149658, "logps/chosen": -436.7222900390625, "logps/rejected": -449.9324035644531, "loss": 0.5981, "rewards/accuracies": 0.625, "rewards/chosen": -0.788104772567749, "rewards/margins": 0.29956191778182983, "rewards/rejected": -1.0876667499542236, "step": 129 }, { "epoch": 0.2778519903820465, "grad_norm": 18.161397454748034, "learning_rate": 4.533477361453819e-07, "logits/chosen": 1.1066253185272217, "logits/rejected": 1.1686846017837524, "logps/chosen": -305.81842041015625, "logps/rejected": -307.0638122558594, "loss": 0.5808, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8689908385276794, "rewards/margins": 0.27876365184783936, "rewards/rejected": -1.1477545499801636, "step": 130 }, { "epoch": 0.2799893133849853, "grad_norm": 17.429213086565447, "learning_rate": 4.5225424859373684e-07, "logits/chosen": 1.4423407316207886, "logits/rejected": 1.51125168800354, "logps/chosen": -355.8777160644531, "logps/rejected": -386.8418884277344, "loss": 0.6476, "rewards/accuracies": 0.75, "rewards/chosen": -0.9852381348609924, "rewards/margins": 0.3623862862586975, "rewards/rejected": -1.3476243019104004, "step": 131 }, { "epoch": 0.2821266363879241, "grad_norm": 16.871614858081898, "learning_rate": 4.511494449416671e-07, "logits/chosen": 1.5764886140823364, "logits/rejected": 1.6248146295547485, "logps/chosen": -409.7974548339844, "logps/rejected": -408.63824462890625, "loss": 0.6094, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9070343971252441, "rewards/margins": 0.17678040266036987, "rewards/rejected": -1.0838148593902588, "step": 132 }, { "epoch": 0.28426395939086296, "grad_norm": 17.095310780630726, "learning_rate": 4.500333870028016e-07, "logits/chosen": 1.6591562032699585, "logits/rejected": 1.5384818315505981, "logps/chosen": -343.987060546875, "logps/rejected": -385.374755859375, "loss": 0.6259, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7391742467880249, "rewards/margins": 0.5074938535690308, "rewards/rejected": -1.2466681003570557, "step": 133 }, { "epoch": 0.28640128239380175, "grad_norm": 19.656778173814363, "learning_rate": 4.489061372204452e-07, "logits/chosen": 1.0009726285934448, "logits/rejected": 1.051519513130188, "logps/chosen": -384.1603088378906, "logps/rejected": -367.7815856933594, "loss": 0.6539, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0506068468093872, "rewards/margins": 0.08326918631792068, "rewards/rejected": -1.1338759660720825, "step": 134 }, { "epoch": 0.2885386053967406, "grad_norm": 19.01652787278696, "learning_rate": 4.4776775866408533e-07, "logits/chosen": 2.3810667991638184, "logits/rejected": 2.3225905895233154, "logps/chosen": -384.6607360839844, "logps/rejected": -368.5036926269531, "loss": 0.5726, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6546584367752075, "rewards/margins": 0.3204587697982788, "rewards/rejected": -0.9751172065734863, "step": 135 }, { "epoch": 0.29067592839967943, "grad_norm": 21.69866647932205, "learning_rate": 4.4661831502586244e-07, "logits/chosen": 1.681275486946106, "logits/rejected": 1.7548019886016846, "logps/chosen": -457.0005187988281, "logps/rejected": -467.3171691894531, "loss": 0.615, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7441829442977905, "rewards/margins": 0.49308401346206665, "rewards/rejected": -1.2372668981552124, "step": 136 }, { "epoch": 0.2928132514026182, "grad_norm": 18.490983761159814, "learning_rate": 4.4545787061700746e-07, "logits/chosen": 1.8112283945083618, "logits/rejected": 1.8490984439849854, "logps/chosen": -429.985595703125, "logps/rejected": -492.1372375488281, "loss": 0.6052, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9648774862289429, "rewards/margins": 0.31009066104888916, "rewards/rejected": -1.274968147277832, "step": 137 }, { "epoch": 0.29495057440555705, "grad_norm": 20.87958822150684, "learning_rate": 4.442864903642427e-07, "logits/chosen": 1.2387418746948242, "logits/rejected": 1.3154277801513672, "logps/chosen": -298.0567626953125, "logps/rejected": -298.1575927734375, "loss": 0.6371, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8304409384727478, "rewards/margins": 0.3910561203956604, "rewards/rejected": -1.2214970588684082, "step": 138 }, { "epoch": 0.29708789740849584, "grad_norm": 16.835799139060907, "learning_rate": 4.4310423980614986e-07, "logits/chosen": 1.3767895698547363, "logits/rejected": 1.4596718549728394, "logps/chosen": -327.63934326171875, "logps/rejected": -384.70782470703125, "loss": 0.6162, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9396180510520935, "rewards/margins": -0.1563718318939209, "rewards/rejected": -0.7832461595535278, "step": 139 }, { "epoch": 0.2992252204114347, "grad_norm": 19.34921651543505, "learning_rate": 4.4191118508950277e-07, "logits/chosen": 1.306443691253662, "logits/rejected": 1.4678008556365967, "logps/chosen": -356.93157958984375, "logps/rejected": -380.83612060546875, "loss": 0.6661, "rewards/accuracies": 0.375, "rewards/chosen": -0.7840622663497925, "rewards/margins": -0.01293850876390934, "rewards/rejected": -0.7711237668991089, "step": 140 }, { "epoch": 0.3013625434143735, "grad_norm": 19.997863233531348, "learning_rate": 4.407073929655666e-07, "logits/chosen": 1.8293468952178955, "logits/rejected": 1.9084713459014893, "logps/chosen": -418.7998046875, "logps/rejected": -456.46282958984375, "loss": 0.626, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0191199779510498, "rewards/margins": 0.17436285316944122, "rewards/rejected": -1.1934828758239746, "step": 141 }, { "epoch": 0.3034998664173123, "grad_norm": 17.11885511546179, "learning_rate": 4.394929307863632e-07, "logits/chosen": 1.5053921937942505, "logits/rejected": 1.5208780765533447, "logps/chosen": -437.318603515625, "logps/rejected": -465.1886291503906, "loss": 0.5784, "rewards/accuracies": 0.625, "rewards/chosen": -1.2199373245239258, "rewards/margins": 0.392232745885849, "rewards/rejected": -1.6121701002120972, "step": 142 }, { "epoch": 0.30563718942025114, "grad_norm": 18.052198878210906, "learning_rate": 4.3826786650090273e-07, "logits/chosen": 1.749753475189209, "logits/rejected": 1.900386929512024, "logps/chosen": -450.9555969238281, "logps/rejected": -499.1803894042969, "loss": 0.6127, "rewards/accuracies": 0.625, "rewards/chosen": -1.1993368864059448, "rewards/margins": 0.33942604064941406, "rewards/rejected": -1.5387628078460693, "step": 143 }, { "epoch": 0.3077745124231899, "grad_norm": 19.061405430342706, "learning_rate": 4.370322686513817e-07, "logits/chosen": 1.7149816751480103, "logits/rejected": 1.6238107681274414, "logps/chosen": -469.9088134765625, "logps/rejected": -504.73382568359375, "loss": 0.6283, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1103051900863647, "rewards/margins": 0.37591469287872314, "rewards/rejected": -1.486219882965088, "step": 144 }, { "epoch": 0.30991183542612877, "grad_norm": 19.971830399651115, "learning_rate": 4.357862063693485e-07, "logits/chosen": 1.6376560926437378, "logits/rejected": 1.64955735206604, "logps/chosen": -450.31072998046875, "logps/rejected": -495.2893371582031, "loss": 0.6152, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3110533952713013, "rewards/margins": 0.012708161026239395, "rewards/rejected": -1.3237614631652832, "step": 145 }, { "epoch": 0.3120491584290676, "grad_norm": 20.61129851426581, "learning_rate": 4.345297493718352e-07, "logits/chosen": 1.9281656742095947, "logits/rejected": 1.8954627513885498, "logps/chosen": -473.4234619140625, "logps/rejected": -445.9279479980469, "loss": 0.6213, "rewards/accuracies": 0.75, "rewards/chosen": -1.2771466970443726, "rewards/margins": 0.2621864378452301, "rewards/rejected": -1.5393332242965698, "step": 146 }, { "epoch": 0.3141864814320064, "grad_norm": 18.097782796746507, "learning_rate": 4.332629679574565e-07, "logits/chosen": 1.952436923980713, "logits/rejected": 2.018927574157715, "logps/chosen": -394.45086669921875, "logps/rejected": -399.55499267578125, "loss": 0.5665, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1240993738174438, "rewards/margins": 0.2864275574684143, "rewards/rejected": -1.410526990890503, "step": 147 }, { "epoch": 0.31632380443494523, "grad_norm": 20.183130534419625, "learning_rate": 4.319859330024777e-07, "logits/chosen": 2.2235820293426514, "logits/rejected": 2.276655673980713, "logps/chosen": -307.6512145996094, "logps/rejected": -317.3448486328125, "loss": 0.6333, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3463375568389893, "rewards/margins": -0.0017591193318367004, "rewards/rejected": -1.3445783853530884, "step": 148 }, { "epoch": 0.3184611274378841, "grad_norm": 25.15607121291107, "learning_rate": 4.3069871595684787e-07, "logits/chosen": 1.5035438537597656, "logits/rejected": 1.4331194162368774, "logps/chosen": -327.4940490722656, "logps/rejected": -331.21673583984375, "loss": 0.6058, "rewards/accuracies": 0.5, "rewards/chosen": -1.3689939975738525, "rewards/margins": -0.0226670503616333, "rewards/rejected": -1.3463268280029297, "step": 149 }, { "epoch": 0.32059845044082286, "grad_norm": 21.360613822616227, "learning_rate": 4.294013888402029e-07, "logits/chosen": 1.6609265804290771, "logits/rejected": 1.6308729648590088, "logps/chosen": -381.4766845703125, "logps/rejected": -370.9427185058594, "loss": 0.6779, "rewards/accuracies": 0.625, "rewards/chosen": -1.2506117820739746, "rewards/margins": 0.20036408305168152, "rewards/rejected": -1.4509758949279785, "step": 150 }, { "epoch": 0.3227357734437617, "grad_norm": 20.441359698002614, "learning_rate": 4.280940242378362e-07, "logits/chosen": 1.9611876010894775, "logits/rejected": 2.0835318565368652, "logps/chosen": -370.7065124511719, "logps/rejected": -373.98919677734375, "loss": 0.6251, "rewards/accuracies": 0.625, "rewards/chosen": -1.1139070987701416, "rewards/margins": 0.22739200294017792, "rewards/rejected": -1.341299057006836, "step": 151 }, { "epoch": 0.3248730964467005, "grad_norm": 18.79562489140208, "learning_rate": 4.2677669529663686e-07, "logits/chosen": 1.4962538480758667, "logits/rejected": 1.5024197101593018, "logps/chosen": -337.7670593261719, "logps/rejected": -365.6881103515625, "loss": 0.6088, "rewards/accuracies": 0.875, "rewards/chosen": -1.1060924530029297, "rewards/margins": 0.7126469612121582, "rewards/rejected": -1.818739652633667, "step": 152 }, { "epoch": 0.3270104194496393, "grad_norm": 18.073724217070875, "learning_rate": 4.254494757209979e-07, "logits/chosen": 1.3349535465240479, "logits/rejected": 1.4294294118881226, "logps/chosen": -372.82452392578125, "logps/rejected": -384.3750915527344, "loss": 0.6056, "rewards/accuracies": 0.625, "rewards/chosen": -1.492186427116394, "rewards/margins": 0.22766265273094177, "rewards/rejected": -1.7198491096496582, "step": 153 }, { "epoch": 0.32914774245257816, "grad_norm": 18.58205194816464, "learning_rate": 4.2411243976869173e-07, "logits/chosen": 1.724280834197998, "logits/rejected": 1.7333550453186035, "logps/chosen": -360.4881591796875, "logps/rejected": -359.3287658691406, "loss": 0.6381, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1819064617156982, "rewards/margins": 0.5323120355606079, "rewards/rejected": -1.7142184972763062, "step": 154 }, { "epoch": 0.33128506545551695, "grad_norm": 19.960829202955406, "learning_rate": 4.227656622467162e-07, "logits/chosen": 1.953383207321167, "logits/rejected": 1.9463614225387573, "logps/chosen": -409.99627685546875, "logps/rejected": -437.938232421875, "loss": 0.6533, "rewards/accuracies": 0.5, "rewards/chosen": -1.0230939388275146, "rewards/margins": 0.16208747029304504, "rewards/rejected": -1.1851812601089478, "step": 155 }, { "epoch": 0.3334223884584558, "grad_norm": 18.62902600269558, "learning_rate": 4.2140921850710855e-07, "logits/chosen": 1.7730010747909546, "logits/rejected": 1.8674681186676025, "logps/chosen": -416.8703308105469, "logps/rejected": -413.43560791015625, "loss": 0.6169, "rewards/accuracies": 0.625, "rewards/chosen": -1.1183083057403564, "rewards/margins": 0.2687716484069824, "rewards/rejected": -1.3870799541473389, "step": 156 }, { "epoch": 0.3355597114613946, "grad_norm": 18.56104964810593, "learning_rate": 4.200431844427298e-07, "logits/chosen": 1.5295546054840088, "logits/rejected": 1.7054471969604492, "logps/chosen": -362.6158752441406, "logps/rejected": -384.364013671875, "loss": 0.6365, "rewards/accuracies": 0.5, "rewards/chosen": -1.8897120952606201, "rewards/margins": -0.04062645137310028, "rewards/rejected": -1.849085807800293, "step": 157 }, { "epoch": 0.3376970344643334, "grad_norm": 21.508842817831596, "learning_rate": 4.186676364830186e-07, "logits/chosen": 2.0236904621124268, "logits/rejected": 1.945557713508606, "logps/chosen": -344.7059020996094, "logps/rejected": -316.12835693359375, "loss": 0.6745, "rewards/accuracies": 0.5, "rewards/chosen": -1.44972562789917, "rewards/margins": 0.07076007127761841, "rewards/rejected": -1.520485758781433, "step": 158 }, { "epoch": 0.33983435746727225, "grad_norm": 19.368493910930727, "learning_rate": 4.172826515897145e-07, "logits/chosen": 2.0061938762664795, "logits/rejected": 1.9779328107833862, "logps/chosen": -355.9288330078125, "logps/rejected": -391.3779602050781, "loss": 0.6288, "rewards/accuracies": 0.6875, "rewards/chosen": -1.173938512802124, "rewards/margins": 0.46023327112197876, "rewards/rejected": -1.6341716051101685, "step": 159 }, { "epoch": 0.34197168047021104, "grad_norm": 20.382801776648744, "learning_rate": 4.158883072525528e-07, "logits/chosen": 2.31357479095459, "logits/rejected": 2.2882211208343506, "logps/chosen": -408.6782531738281, "logps/rejected": -441.49530029296875, "loss": 0.6609, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3026232719421387, "rewards/margins": 0.30984416604042053, "rewards/rejected": -1.6124674081802368, "step": 160 }, { "epoch": 0.3441090034731499, "grad_norm": 19.084492084822422, "learning_rate": 4.1448468148492814e-07, "logits/chosen": 1.8440988063812256, "logits/rejected": 1.9733848571777344, "logps/chosen": -435.4466552734375, "logps/rejected": -475.33648681640625, "loss": 0.6153, "rewards/accuracies": 0.625, "rewards/chosen": -1.3617444038391113, "rewards/margins": 0.13640473783016205, "rewards/rejected": -1.4981491565704346, "step": 161 }, { "epoch": 0.3462463264760887, "grad_norm": 27.08324422105949, "learning_rate": 4.130718528195303e-07, "logits/chosen": 1.7824862003326416, "logits/rejected": 1.6462738513946533, "logps/chosen": -437.1048889160156, "logps/rejected": -440.6667785644531, "loss": 0.6982, "rewards/accuracies": 0.4375, "rewards/chosen": -1.561838150024414, "rewards/margins": -0.11082294583320618, "rewards/rejected": -1.4510152339935303, "step": 162 }, { "epoch": 0.3483836494790275, "grad_norm": 19.814697570819504, "learning_rate": 4.1164990030394985e-07, "logits/chosen": 1.0093296766281128, "logits/rejected": 1.0821220874786377, "logps/chosen": -423.516357421875, "logps/rejected": -420.7889404296875, "loss": 0.6014, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1094772815704346, "rewards/margins": 0.3603382408618927, "rewards/rejected": -1.4698156118392944, "step": 163 }, { "epoch": 0.35052097248196634, "grad_norm": 19.599198524357487, "learning_rate": 4.10218903496256e-07, "logits/chosen": 1.682007074356079, "logits/rejected": 1.787623643875122, "logps/chosen": -282.51171875, "logps/rejected": -345.05877685546875, "loss": 0.6625, "rewards/accuracies": 0.5, "rewards/chosen": -1.205474615097046, "rewards/margins": 0.271236389875412, "rewards/rejected": -1.4767110347747803, "step": 164 }, { "epoch": 0.3526582954849052, "grad_norm": 17.120301761319624, "learning_rate": 4.087789424605447e-07, "logits/chosen": 1.741114854812622, "logits/rejected": 1.6152499914169312, "logps/chosen": -417.688720703125, "logps/rejected": -433.7536315917969, "loss": 0.6162, "rewards/accuracies": 0.75, "rewards/chosen": -1.1665997505187988, "rewards/margins": 0.2989981770515442, "rewards/rejected": -1.4655979871749878, "step": 165 }, { "epoch": 0.35479561848784397, "grad_norm": 18.747521162934547, "learning_rate": 4.0733009776245937e-07, "logits/chosen": 1.4322879314422607, "logits/rejected": 1.3907232284545898, "logps/chosen": -347.32489013671875, "logps/rejected": -360.52301025390625, "loss": 0.6227, "rewards/accuracies": 0.6875, "rewards/chosen": -1.081840991973877, "rewards/margins": 0.16741850972175598, "rewards/rejected": -1.249259352684021, "step": 166 }, { "epoch": 0.3569329414907828, "grad_norm": 18.0004093305282, "learning_rate": 4.058724504646834e-07, "logits/chosen": 2.36202335357666, "logits/rejected": 2.4911999702453613, "logps/chosen": -383.17584228515625, "logps/rejected": -401.8243408203125, "loss": 0.6195, "rewards/accuracies": 0.75, "rewards/chosen": -1.2278780937194824, "rewards/margins": 0.20441044867038727, "rewards/rejected": -1.4322885274887085, "step": 167 }, { "epoch": 0.3590702644937216, "grad_norm": 17.80844319511304, "learning_rate": 4.0440608212240445e-07, "logits/chosen": 1.4843562841415405, "logits/rejected": 1.5054625272750854, "logps/chosen": -411.7197265625, "logps/rejected": -443.1764221191406, "loss": 0.6053, "rewards/accuracies": 0.875, "rewards/chosen": -0.9291986227035522, "rewards/margins": 0.41230058670043945, "rewards/rejected": -1.3414990901947021, "step": 168 }, { "epoch": 0.36120758749666043, "grad_norm": 18.374934776369027, "learning_rate": 4.0293107477875156e-07, "logits/chosen": 1.700923204421997, "logits/rejected": 1.6426975727081299, "logps/chosen": -396.6122741699219, "logps/rejected": -397.70355224609375, "loss": 0.6373, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2499213218688965, "rewards/margins": 0.22793683409690857, "rewards/rejected": -1.4778581857681274, "step": 169 }, { "epoch": 0.36334491049959927, "grad_norm": 19.145060598159265, "learning_rate": 4.0144751096020497e-07, "logits/chosen": 1.7463726997375488, "logits/rejected": 1.8051655292510986, "logps/chosen": -353.2317810058594, "logps/rejected": -378.6690979003906, "loss": 0.6298, "rewards/accuracies": 0.75, "rewards/chosen": -0.9795432686805725, "rewards/margins": 0.40754634141921997, "rewards/rejected": -1.387089729309082, "step": 170 }, { "epoch": 0.36548223350253806, "grad_norm": 16.861158005133213, "learning_rate": 3.9995547367197843e-07, "logits/chosen": 1.6164271831512451, "logits/rejected": 1.6135661602020264, "logps/chosen": -327.3568115234375, "logps/rejected": -337.6192321777344, "loss": 0.5895, "rewards/accuracies": 0.75, "rewards/chosen": -0.874947190284729, "rewards/margins": 0.4394197463989258, "rewards/rejected": -1.3143669366836548, "step": 171 }, { "epoch": 0.3676195565054769, "grad_norm": 17.777424855400742, "learning_rate": 3.9845504639337535e-07, "logits/chosen": 1.070286750793457, "logits/rejected": 1.2197216749191284, "logps/chosen": -231.44830322265625, "logps/rejected": -272.8808898925781, "loss": 0.57, "rewards/accuracies": 0.875, "rewards/chosen": -0.6078026294708252, "rewards/margins": 0.5821976661682129, "rewards/rejected": -1.190000295639038, "step": 172 }, { "epoch": 0.36975687950841574, "grad_norm": 17.126819669486423, "learning_rate": 3.9694631307311825e-07, "logits/chosen": 2.135113477706909, "logits/rejected": 2.2197318077087402, "logps/chosen": -419.06134033203125, "logps/rejected": -513.2752075195312, "loss": 0.5982, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9003175497055054, "rewards/margins": 0.7730761170387268, "rewards/rejected": -1.673393726348877, "step": 173 }, { "epoch": 0.3718942025113545, "grad_norm": 18.225988784634456, "learning_rate": 3.954293581246514e-07, "logits/chosen": 1.7381099462509155, "logits/rejected": 1.8567441701889038, "logps/chosen": -414.3870544433594, "logps/rejected": -441.8538818359375, "loss": 0.6366, "rewards/accuracies": 0.5, "rewards/chosen": -0.8376742601394653, "rewards/margins": 0.05604010820388794, "rewards/rejected": -0.8937143087387085, "step": 174 }, { "epoch": 0.37403152551429336, "grad_norm": 19.941158048862572, "learning_rate": 3.939042664214184e-07, "logits/chosen": 1.4559072256088257, "logits/rejected": 1.3386807441711426, "logps/chosen": -338.5787658691406, "logps/rejected": -348.70880126953125, "loss": 0.612, "rewards/accuracies": 0.625, "rewards/chosen": -1.0519522428512573, "rewards/margins": 0.3615396022796631, "rewards/rejected": -1.41349196434021, "step": 175 }, { "epoch": 0.37616884851723215, "grad_norm": 17.811647654827375, "learning_rate": 3.92371123292113e-07, "logits/chosen": 0.6934733986854553, "logits/rejected": 0.7196266055107117, "logps/chosen": -258.3397216796875, "logps/rejected": -262.33905029296875, "loss": 0.6108, "rewards/accuracies": 0.75, "rewards/chosen": -0.8770631551742554, "rewards/margins": 0.588469386100769, "rewards/rejected": -1.465532660484314, "step": 176 }, { "epoch": 0.378306171520171, "grad_norm": 19.009550272152616, "learning_rate": 3.908300145159055e-07, "logits/chosen": 2.3191661834716797, "logits/rejected": 2.3956990242004395, "logps/chosen": -498.1864318847656, "logps/rejected": -484.289794921875, "loss": 0.6321, "rewards/accuracies": 0.625, "rewards/chosen": -1.0074068307876587, "rewards/margins": 0.2774062752723694, "rewards/rejected": -1.2848131656646729, "step": 177 }, { "epoch": 0.3804434945231098, "grad_norm": 20.426912799899327, "learning_rate": 3.8928102631764304e-07, "logits/chosen": 1.5165929794311523, "logits/rejected": 1.4213718175888062, "logps/chosen": -411.7895812988281, "logps/rejected": -394.05938720703125, "loss": 0.6124, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4399906396865845, "rewards/margins": 0.20191159844398499, "rewards/rejected": -1.6419024467468262, "step": 178 }, { "epoch": 0.3825808175260486, "grad_norm": 18.22302533315433, "learning_rate": 3.877242453630256e-07, "logits/chosen": 1.3885517120361328, "logits/rejected": 1.6490609645843506, "logps/chosen": -260.5142822265625, "logps/rejected": -311.5865173339844, "loss": 0.5865, "rewards/accuracies": 0.875, "rewards/chosen": -0.7588627338409424, "rewards/margins": 0.8435803651809692, "rewards/rejected": -1.6024430990219116, "step": 179 }, { "epoch": 0.38471814052898745, "grad_norm": 18.319584297433117, "learning_rate": 3.8615975875375676e-07, "logits/chosen": 1.64811110496521, "logits/rejected": 1.57973051071167, "logps/chosen": -375.68115234375, "logps/rejected": -375.6374206542969, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": -1.0208864212036133, "rewards/margins": 0.451630562543869, "rewards/rejected": -1.4725168943405151, "step": 180 }, { "epoch": 0.38685546353192624, "grad_norm": 17.171371040882548, "learning_rate": 3.8458765402267056e-07, "logits/chosen": 2.0625722408294678, "logits/rejected": 2.011898994445801, "logps/chosen": -447.5124206542969, "logps/rejected": -442.49383544921875, "loss": 0.5831, "rewards/accuracies": 0.75, "rewards/chosen": -1.1310521364212036, "rewards/margins": 0.3197172284126282, "rewards/rejected": -1.4507691860198975, "step": 181 }, { "epoch": 0.3889927865348651, "grad_norm": 18.254942031621926, "learning_rate": 3.8300801912883414e-07, "logits/chosen": 2.303025007247925, "logits/rejected": 2.362499713897705, "logps/chosen": -428.386962890625, "logps/rejected": -428.35321044921875, "loss": 0.6029, "rewards/accuracies": 0.625, "rewards/chosen": -1.0741493701934814, "rewards/margins": 0.13146010041236877, "rewards/rejected": -1.2056095600128174, "step": 182 }, { "epoch": 0.3911301095378039, "grad_norm": 18.02286496058953, "learning_rate": 3.8142094245262615e-07, "logits/chosen": 1.539035439491272, "logits/rejected": 1.5929516553878784, "logps/chosen": -343.71429443359375, "logps/rejected": -337.14923095703125, "loss": 0.6112, "rewards/accuracies": 0.625, "rewards/chosen": -1.3707005977630615, "rewards/margins": 0.03174225986003876, "rewards/rejected": -1.4024429321289062, "step": 183 }, { "epoch": 0.3932674325407427, "grad_norm": 19.325777233326907, "learning_rate": 3.7982651279079227e-07, "logits/chosen": 1.77326238155365, "logits/rejected": 1.7490006685256958, "logps/chosen": -453.261962890625, "logps/rejected": -457.6907958984375, "loss": 0.5617, "rewards/accuracies": 0.875, "rewards/chosen": -0.9381343722343445, "rewards/margins": 0.5582923889160156, "rewards/rejected": -1.4964268207550049, "step": 184 }, { "epoch": 0.39540475554368154, "grad_norm": 17.697347986359333, "learning_rate": 3.7822481935147655e-07, "logits/chosen": 1.5330384969711304, "logits/rejected": 1.6347877979278564, "logps/chosen": -455.7304992675781, "logps/rejected": -506.4276123046875, "loss": 0.5943, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9335508346557617, "rewards/margins": 0.45107167959213257, "rewards/rejected": -1.384622573852539, "step": 185 }, { "epoch": 0.3975420785466204, "grad_norm": 17.07660005715345, "learning_rate": 3.766159517492307e-07, "logits/chosen": 1.6240900754928589, "logits/rejected": 1.6119805574417114, "logps/chosen": -381.19952392578125, "logps/rejected": -398.58319091796875, "loss": 0.5969, "rewards/accuracies": 0.625, "rewards/chosen": -1.0835504531860352, "rewards/margins": 0.2535113990306854, "rewards/rejected": -1.337061882019043, "step": 186 }, { "epoch": 0.39967940154955917, "grad_norm": 21.101749897060266, "learning_rate": 3.75e-07, "logits/chosen": 0.8521130084991455, "logits/rejected": 0.9815881252288818, "logps/chosen": -459.6861877441406, "logps/rejected": -452.48126220703125, "loss": 0.678, "rewards/accuracies": 0.625, "rewards/chosen": -1.3385112285614014, "rewards/margins": 0.12616467475891113, "rewards/rejected": -1.464676022529602, "step": 187 }, { "epoch": 0.401816724552498, "grad_norm": 16.950089126169885, "learning_rate": 3.7337705451608667e-07, "logits/chosen": 2.048741340637207, "logits/rejected": 1.9400901794433594, "logps/chosen": -363.7098388671875, "logps/rejected": -378.2185363769531, "loss": 0.6111, "rewards/accuracies": 0.875, "rewards/chosen": -0.7874991297721863, "rewards/margins": 0.5683628916740417, "rewards/rejected": -1.355862021446228, "step": 188 }, { "epoch": 0.4039540475554368, "grad_norm": 16.71431664815698, "learning_rate": 3.717472061010918e-07, "logits/chosen": 1.9219565391540527, "logits/rejected": 1.8952841758728027, "logps/chosen": -433.7643737792969, "logps/rejected": -417.7620849609375, "loss": 0.6077, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0933847427368164, "rewards/margins": 0.05321376025676727, "rewards/rejected": -1.1465984582901, "step": 189 }, { "epoch": 0.40609137055837563, "grad_norm": 16.58728088417605, "learning_rate": 3.7011054594483443e-07, "logits/chosen": 1.4869471788406372, "logits/rejected": 1.5433428287506104, "logps/chosen": -349.62884521484375, "logps/rejected": -359.94091796875, "loss": 0.611, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8237789273262024, "rewards/margins": 0.12867197394371033, "rewards/rejected": -0.9524509906768799, "step": 190 }, { "epoch": 0.40822869356131447, "grad_norm": 16.71557444491728, "learning_rate": 3.6846716561824967e-07, "logits/chosen": 1.945807933807373, "logits/rejected": 1.9054489135742188, "logps/chosen": -399.7051696777344, "logps/rejected": -397.4898681640625, "loss": 0.5702, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2767914533615112, "rewards/margins": 0.09971967339515686, "rewards/rejected": -1.3765110969543457, "step": 191 }, { "epoch": 0.41036601656425326, "grad_norm": 17.940217471393204, "learning_rate": 3.668171570682655e-07, "logits/chosen": 1.712918758392334, "logits/rejected": 1.7127172946929932, "logps/chosen": -469.094970703125, "logps/rejected": -432.9211120605469, "loss": 0.6023, "rewards/accuracies": 0.625, "rewards/chosen": -1.2319235801696777, "rewards/margins": 0.32265806198120117, "rewards/rejected": -1.554581642150879, "step": 192 }, { "epoch": 0.4125033395671921, "grad_norm": 19.798434486116633, "learning_rate": 3.6516061261265805e-07, "logits/chosen": 1.795310139656067, "logits/rejected": 1.7313811779022217, "logps/chosen": -429.7494812011719, "logps/rejected": -461.983642578125, "loss": 0.6181, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9297257661819458, "rewards/margins": 0.4894411563873291, "rewards/rejected": -1.419166922569275, "step": 193 }, { "epoch": 0.41464066257013094, "grad_norm": 19.397752291933788, "learning_rate": 3.634976249348867e-07, "logits/chosen": 1.9277091026306152, "logits/rejected": 1.7616665363311768, "logps/chosen": -368.06817626953125, "logps/rejected": -363.0306701660156, "loss": 0.5866, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8755682706832886, "rewards/margins": 0.64577317237854, "rewards/rejected": -1.5213414430618286, "step": 194 }, { "epoch": 0.4167779855730697, "grad_norm": 17.93101753591044, "learning_rate": 3.618282870789081e-07, "logits/chosen": 1.7866439819335938, "logits/rejected": 1.8261215686798096, "logps/chosen": -434.7184753417969, "logps/rejected": -425.9129333496094, "loss": 0.5672, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1498773097991943, "rewards/margins": 0.3212715983390808, "rewards/rejected": -1.4711488485336304, "step": 195 }, { "epoch": 0.41891530857600856, "grad_norm": 20.67237268539849, "learning_rate": 3.601526924439709e-07, "logits/chosen": 1.2071539163589478, "logits/rejected": 1.148329257965088, "logps/chosen": -399.2107238769531, "logps/rejected": -418.5735778808594, "loss": 0.6518, "rewards/accuracies": 0.5, "rewards/chosen": -1.2001055479049683, "rewards/margins": -0.004660561680793762, "rewards/rejected": -1.1954450607299805, "step": 196 }, { "epoch": 0.42105263157894735, "grad_norm": 19.547002379951252, "learning_rate": 3.584709347793895e-07, "logits/chosen": 1.9946776628494263, "logits/rejected": 2.094057559967041, "logps/chosen": -338.82177734375, "logps/rejected": -348.4697265625, "loss": 0.6322, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9167635440826416, "rewards/margins": 0.0018602609634399414, "rewards/rejected": -0.9186238646507263, "step": 197 }, { "epoch": 0.4231899545818862, "grad_norm": 17.58918480785892, "learning_rate": 3.567831081792992e-07, "logits/chosen": 1.9100654125213623, "logits/rejected": 1.8705369234085083, "logps/chosen": -388.90478515625, "logps/rejected": -419.180908203125, "loss": 0.6028, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0298148393630981, "rewards/margins": 0.3086302876472473, "rewards/rejected": -1.3384451866149902, "step": 198 }, { "epoch": 0.425327277584825, "grad_norm": 19.436995082353043, "learning_rate": 3.550893070773914e-07, "logits/chosen": 1.6310675144195557, "logits/rejected": 1.5973904132843018, "logps/chosen": -369.18695068359375, "logps/rejected": -372.208740234375, "loss": 0.6379, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0363404750823975, "rewards/margins": 0.19525709748268127, "rewards/rejected": -1.2315975427627563, "step": 199 }, { "epoch": 0.4274646005877638, "grad_norm": 18.684092669252145, "learning_rate": 3.5338962624163016e-07, "logits/chosen": 1.7700620889663696, "logits/rejected": 1.70797598361969, "logps/chosen": -411.388427734375, "logps/rejected": -411.8304748535156, "loss": 0.6157, "rewards/accuracies": 0.6875, "rewards/chosen": -1.03152596950531, "rewards/margins": 0.24860131740570068, "rewards/rejected": -1.2801272869110107, "step": 200 }, { "epoch": 0.42960192359070265, "grad_norm": 20.121707150971464, "learning_rate": 3.516841607689501e-07, "logits/chosen": 1.967681884765625, "logits/rejected": 2.080134153366089, "logps/chosen": -450.67706298828125, "logps/rejected": -449.7607727050781, "loss": 0.5954, "rewards/accuracies": 0.6875, "rewards/chosen": -1.259590983390808, "rewards/margins": 0.48261815309524536, "rewards/rejected": -1.7422093152999878, "step": 201 }, { "epoch": 0.4317392465936415, "grad_norm": 18.887931551669933, "learning_rate": 3.499730060799352e-07, "logits/chosen": 2.4707322120666504, "logits/rejected": 2.550736427307129, "logps/chosen": -446.833251953125, "logps/rejected": -454.7132568359375, "loss": 0.6176, "rewards/accuracies": 0.5, "rewards/chosen": -1.1572250127792358, "rewards/margins": 0.007829396985471249, "rewards/rejected": -1.1650543212890625, "step": 202 }, { "epoch": 0.4338765695965803, "grad_norm": 18.15280641353194, "learning_rate": 3.482562579134809e-07, "logits/chosen": 1.8412984609603882, "logits/rejected": 1.7161986827850342, "logps/chosen": -391.0166015625, "logps/rejected": -431.3764343261719, "loss": 0.5922, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2463746070861816, "rewards/margins": 0.5344080924987793, "rewards/rejected": -1.7807824611663818, "step": 203 }, { "epoch": 0.4360138925995191, "grad_norm": 19.28039369733405, "learning_rate": 3.465340123214365e-07, "logits/chosen": 1.8423885107040405, "logits/rejected": 1.8443387746810913, "logps/chosen": -418.2002868652344, "logps/rejected": -433.6718444824219, "loss": 0.6318, "rewards/accuracies": 0.75, "rewards/chosen": -1.4090765714645386, "rewards/margins": 0.5609220862388611, "rewards/rejected": -1.969998836517334, "step": 204 }, { "epoch": 0.4381512156024579, "grad_norm": 18.816447396480665, "learning_rate": 3.448063656632321e-07, "logits/chosen": 1.948996663093567, "logits/rejected": 1.820084810256958, "logps/chosen": -379.0099182128906, "logps/rejected": -352.95184326171875, "loss": 0.6106, "rewards/accuracies": 0.625, "rewards/chosen": -1.2226296663284302, "rewards/margins": 0.21551814675331116, "rewards/rejected": -1.438147783279419, "step": 205 }, { "epoch": 0.44028853860539674, "grad_norm": 20.201506617359616, "learning_rate": 3.430734146004863e-07, "logits/chosen": 1.65437912940979, "logits/rejected": 1.5461066961288452, "logps/chosen": -402.8113708496094, "logps/rejected": -382.8777160644531, "loss": 0.6535, "rewards/accuracies": 0.4375, "rewards/chosen": -1.4631258249282837, "rewards/margins": 0.11626932770013809, "rewards/rejected": -1.579395055770874, "step": 206 }, { "epoch": 0.4424258616083356, "grad_norm": 19.67653738402853, "learning_rate": 3.413352560915988e-07, "logits/chosen": 1.7494556903839111, "logits/rejected": 1.737342357635498, "logps/chosen": -361.5591125488281, "logps/rejected": -357.0711364746094, "loss": 0.643, "rewards/accuracies": 0.625, "rewards/chosen": -1.302952766418457, "rewards/margins": 0.4283585548400879, "rewards/rejected": -1.7313114404678345, "step": 207 }, { "epoch": 0.44456318461127436, "grad_norm": 20.51918848235735, "learning_rate": 3.39591987386325e-07, "logits/chosen": 2.0057222843170166, "logits/rejected": 2.0365169048309326, "logps/chosen": -418.554931640625, "logps/rejected": -424.8497009277344, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": -1.3311376571655273, "rewards/margins": 0.1549416184425354, "rewards/rejected": -1.4860793352127075, "step": 208 }, { "epoch": 0.4467005076142132, "grad_norm": 18.895094732056656, "learning_rate": 3.378437060203357e-07, "logits/chosen": 1.2798339128494263, "logits/rejected": 1.163004755973816, "logps/chosen": -344.80889892578125, "logps/rejected": -377.0693664550781, "loss": 0.6167, "rewards/accuracies": 0.5, "rewards/chosen": -0.96147221326828, "rewards/margins": 0.33394408226013184, "rewards/rejected": -1.2954163551330566, "step": 209 }, { "epoch": 0.448837830617152, "grad_norm": 18.922009197013033, "learning_rate": 3.360905098097587e-07, "logits/chosen": 1.6536269187927246, "logits/rejected": 1.6847363710403442, "logps/chosen": -446.45416259765625, "logps/rejected": -452.1558837890625, "loss": 0.602, "rewards/accuracies": 0.625, "rewards/chosen": -1.2614518404006958, "rewards/margins": 0.22943194210529327, "rewards/rejected": -1.4908838272094727, "step": 210 }, { "epoch": 0.45097515362009083, "grad_norm": 21.4124521627508, "learning_rate": 3.343324968457075e-07, "logits/chosen": 1.5657013654708862, "logits/rejected": 1.6874078512191772, "logps/chosen": -344.92913818359375, "logps/rejected": -384.62396240234375, "loss": 0.6227, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2245018482208252, "rewards/margins": 0.45444363355636597, "rewards/rejected": -1.678945541381836, "step": 211 }, { "epoch": 0.45311247662302967, "grad_norm": 26.23976086161117, "learning_rate": 3.325697654887918e-07, "logits/chosen": 1.4959322214126587, "logits/rejected": 1.4617286920547485, "logps/chosen": -397.129150390625, "logps/rejected": -358.9854431152344, "loss": 0.6527, "rewards/accuracies": 0.75, "rewards/chosen": -1.2049939632415771, "rewards/margins": 0.24925091862678528, "rewards/rejected": -1.45424485206604, "step": 212 }, { "epoch": 0.45524979962596845, "grad_norm": 19.599613357584392, "learning_rate": 3.30802414363615e-07, "logits/chosen": 1.8930243253707886, "logits/rejected": 1.8906832933425903, "logps/chosen": -369.51617431640625, "logps/rejected": -402.1026611328125, "loss": 0.6202, "rewards/accuracies": 0.875, "rewards/chosen": -1.204052209854126, "rewards/margins": 0.41892537474632263, "rewards/rejected": -1.6229774951934814, "step": 213 }, { "epoch": 0.4573871226289073, "grad_norm": 19.22419030566061, "learning_rate": 3.2903054235325613e-07, "logits/chosen": 1.3914402723312378, "logits/rejected": 1.4307550191879272, "logps/chosen": -392.59539794921875, "logps/rejected": -429.68450927734375, "loss": 0.6409, "rewards/accuracies": 0.6875, "rewards/chosen": -1.376542329788208, "rewards/margins": 0.47458475828170776, "rewards/rejected": -1.851127028465271, "step": 214 }, { "epoch": 0.45952444563184613, "grad_norm": 19.753436323643434, "learning_rate": 3.272542485937368e-07, "logits/chosen": 1.3418879508972168, "logits/rejected": 1.3356355428695679, "logps/chosen": -376.60491943359375, "logps/rejected": -379.2442321777344, "loss": 0.611, "rewards/accuracies": 0.5, "rewards/chosen": -1.1197147369384766, "rewards/margins": 0.5473043322563171, "rewards/rejected": -1.6670188903808594, "step": 215 }, { "epoch": 0.4616617686347849, "grad_norm": 16.73095945133393, "learning_rate": 3.2547363246847546e-07, "logits/chosen": 2.101710319519043, "logits/rejected": 2.2036097049713135, "logps/chosen": -342.0418395996094, "logps/rejected": -385.2028503417969, "loss": 0.5769, "rewards/accuracies": 0.875, "rewards/chosen": -0.7874879240989685, "rewards/margins": 0.38399648666381836, "rewards/rejected": -1.171484351158142, "step": 216 }, { "epoch": 0.46379909163772376, "grad_norm": 21.40612134241797, "learning_rate": 3.2368879360272606e-07, "logits/chosen": 1.7089003324508667, "logits/rejected": 1.89894437789917, "logps/chosen": -470.57440185546875, "logps/rejected": -497.7328186035156, "loss": 0.6595, "rewards/accuracies": 0.5, "rewards/chosen": -1.5549209117889404, "rewards/margins": 0.23799550533294678, "rewards/rejected": -1.7929165363311768, "step": 217 }, { "epoch": 0.46593641464066254, "grad_norm": 18.401537502823185, "learning_rate": 3.218998318580043e-07, "logits/chosen": 1.7439593076705933, "logits/rejected": 1.7078120708465576, "logps/chosen": -417.99560546875, "logps/rejected": -467.07464599609375, "loss": 0.612, "rewards/accuracies": 0.875, "rewards/chosen": -1.0848441123962402, "rewards/margins": 0.633072018623352, "rewards/rejected": -1.7179162502288818, "step": 218 }, { "epoch": 0.4680737376436014, "grad_norm": 19.854545849811785, "learning_rate": 3.201068473265007e-07, "logits/chosen": 1.8572185039520264, "logits/rejected": 1.8451629877090454, "logps/chosen": -476.4209899902344, "logps/rejected": -464.2806701660156, "loss": 0.6046, "rewards/accuracies": 0.75, "rewards/chosen": -1.108113408088684, "rewards/margins": 0.4227336347103119, "rewards/rejected": -1.5308470726013184, "step": 219 }, { "epoch": 0.4702110606465402, "grad_norm": 18.469882304782676, "learning_rate": 3.1830994032548e-07, "logits/chosen": 1.1646744012832642, "logits/rejected": 1.2345802783966064, "logps/chosen": -373.4507141113281, "logps/rejected": -387.0456848144531, "loss": 0.5847, "rewards/accuracies": 0.625, "rewards/chosen": -1.9688059091567993, "rewards/margins": 0.19057507812976837, "rewards/rejected": -2.1593809127807617, "step": 220 }, { "epoch": 0.472348383649479, "grad_norm": 18.9867259815962, "learning_rate": 3.1650921139166874e-07, "logits/chosen": 2.185783624649048, "logits/rejected": 2.1693177223205566, "logps/chosen": -326.4842529296875, "logps/rejected": -349.39556884765625, "loss": 0.5825, "rewards/accuracies": 0.625, "rewards/chosen": -1.598148226737976, "rewards/margins": 0.4604637920856476, "rewards/rejected": -2.058612108230591, "step": 221 }, { "epoch": 0.47448570665241785, "grad_norm": 17.69369969243383, "learning_rate": 3.147047612756302e-07, "logits/chosen": 1.6809875965118408, "logits/rejected": 1.5675013065338135, "logps/chosen": -352.68463134765625, "logps/rejected": -350.1728515625, "loss": 0.5836, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0454927682876587, "rewards/margins": 0.3611335754394531, "rewards/rejected": -1.4066263437271118, "step": 222 }, { "epoch": 0.4766230296553567, "grad_norm": 18.720455518598285, "learning_rate": 3.128966909361271e-07, "logits/chosen": 1.5484097003936768, "logits/rejected": 1.5819151401519775, "logps/chosen": -358.4718017578125, "logps/rejected": -339.5688781738281, "loss": 0.6065, "rewards/accuracies": 0.4375, "rewards/chosen": -1.3430771827697754, "rewards/margins": -0.17120108008384705, "rewards/rejected": -1.1718761920928955, "step": 223 }, { "epoch": 0.4787603526582955, "grad_norm": 16.817886671501697, "learning_rate": 3.110851015344735e-07, "logits/chosen": 1.784898281097412, "logits/rejected": 2.0283141136169434, "logps/chosen": -401.959716796875, "logps/rejected": -411.4044494628906, "loss": 0.5935, "rewards/accuracies": 0.75, "rewards/chosen": -1.5906190872192383, "rewards/margins": 0.40126991271972656, "rewards/rejected": -1.991889238357544, "step": 224 }, { "epoch": 0.4808976756612343, "grad_norm": 17.147517583264804, "learning_rate": 3.0927009442887437e-07, "logits/chosen": 1.4650673866271973, "logits/rejected": 1.5623681545257568, "logps/chosen": -391.3199768066406, "logps/rejected": -397.706787109375, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": -1.2298505306243896, "rewards/margins": 0.3492811918258667, "rewards/rejected": -1.5791317224502563, "step": 225 }, { "epoch": 0.4830349986641731, "grad_norm": 18.38943701452515, "learning_rate": 3.074517711687549e-07, "logits/chosen": 2.138887882232666, "logits/rejected": 2.0834007263183594, "logps/chosen": -324.4191589355469, "logps/rejected": -337.5340270996094, "loss": 0.6122, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0309844017028809, "rewards/margins": 0.10200852155685425, "rewards/rejected": -1.1329929828643799, "step": 226 }, { "epoch": 0.48517232166711194, "grad_norm": 18.020181570236574, "learning_rate": 3.056302334890786e-07, "logits/chosen": 1.6313700675964355, "logits/rejected": 1.5899854898452759, "logps/chosen": -368.62835693359375, "logps/rejected": -400.20013427734375, "loss": 0.571, "rewards/accuracies": 0.625, "rewards/chosen": -1.055250644683838, "rewards/margins": 0.3379458785057068, "rewards/rejected": -1.3931964635849, "step": 227 }, { "epoch": 0.4873096446700508, "grad_norm": 21.231287782547422, "learning_rate": 3.038055833046555e-07, "logits/chosen": 1.821986198425293, "logits/rejected": 1.775019645690918, "logps/chosen": -438.9476318359375, "logps/rejected": -462.4380187988281, "loss": 0.7017, "rewards/accuracies": 0.625, "rewards/chosen": -1.1709932088851929, "rewards/margins": 0.017013883218169212, "rewards/rejected": -1.1880072355270386, "step": 228 }, { "epoch": 0.48944696767298956, "grad_norm": 18.242763714139798, "learning_rate": 3.0197792270443976e-07, "logits/chosen": 1.685359239578247, "logits/rejected": 1.7110083103179932, "logps/chosen": -382.34051513671875, "logps/rejected": -361.8902893066406, "loss": 0.5608, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1712117195129395, "rewards/margins": 0.15365606546401978, "rewards/rejected": -1.3248677253723145, "step": 229 }, { "epoch": 0.4915842906759284, "grad_norm": 19.342657906988098, "learning_rate": 3.001473539458182e-07, "logits/chosen": 1.652226209640503, "logits/rejected": 1.7933372259140015, "logps/chosen": -351.8333435058594, "logps/rejected": -345.5216369628906, "loss": 0.5985, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9596707820892334, "rewards/margins": 0.48877692222595215, "rewards/rejected": -1.4484477043151855, "step": 230 }, { "epoch": 0.49372161367886724, "grad_norm": 18.306301233512528, "learning_rate": 2.983139794488883e-07, "logits/chosen": 2.194626569747925, "logits/rejected": 2.039341926574707, "logps/chosen": -422.2347412109375, "logps/rejected": -381.56793212890625, "loss": 0.6316, "rewards/accuracies": 0.5, "rewards/chosen": -1.5122215747833252, "rewards/margins": -0.1322540044784546, "rewards/rejected": -1.379967451095581, "step": 231 }, { "epoch": 0.49585893668180603, "grad_norm": 18.94320134699598, "learning_rate": 2.964779017907287e-07, "logits/chosen": 1.6469025611877441, "logits/rejected": 1.6451257467269897, "logps/chosen": -387.7391052246094, "logps/rejected": -381.7521667480469, "loss": 0.5867, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2008886337280273, "rewards/margins": 0.707405686378479, "rewards/rejected": -1.9082942008972168, "step": 232 }, { "epoch": 0.49799625968474487, "grad_norm": 17.646525560974858, "learning_rate": 2.9463922369965915e-07, "logits/chosen": 0.9604759812355042, "logits/rejected": 1.081686019897461, "logps/chosen": -320.6999816894531, "logps/rejected": -338.20977783203125, "loss": 0.5859, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3322499990463257, "rewards/margins": 0.4187338948249817, "rewards/rejected": -1.7509838342666626, "step": 233 }, { "epoch": 0.5001335826876837, "grad_norm": 18.940116172317182, "learning_rate": 2.927980480494938e-07, "logits/chosen": 1.430747389793396, "logits/rejected": 1.602396011352539, "logps/chosen": -286.286376953125, "logps/rejected": -305.3640441894531, "loss": 0.568, "rewards/accuracies": 0.625, "rewards/chosen": -1.2952995300292969, "rewards/margins": 0.4015873074531555, "rewards/rejected": -1.6968867778778076, "step": 234 }, { "epoch": 0.5022709056906225, "grad_norm": 23.939013344426872, "learning_rate": 2.909544778537844e-07, "logits/chosen": 1.7608510255813599, "logits/rejected": 1.8317222595214844, "logps/chosen": -354.6390686035156, "logps/rejected": -346.6441650390625, "loss": 0.5761, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8426222205162048, "rewards/margins": 0.4636692404747009, "rewards/rejected": -1.3062914609909058, "step": 235 }, { "epoch": 0.5044082286935613, "grad_norm": 17.859554625792914, "learning_rate": 2.8910861626005773e-07, "logits/chosen": 1.4780582189559937, "logits/rejected": 1.4497883319854736, "logps/chosen": -377.6356506347656, "logps/rejected": -397.4453430175781, "loss": 0.6219, "rewards/accuracies": 0.75, "rewards/chosen": -1.2822011709213257, "rewards/margins": 0.2759856581687927, "rewards/rejected": -1.5581867694854736, "step": 236 }, { "epoch": 0.5065455516965002, "grad_norm": 15.921170465179717, "learning_rate": 2.872605665440436e-07, "logits/chosen": 2.2028841972351074, "logits/rejected": 2.1681137084960938, "logps/chosen": -500.1951599121094, "logps/rejected": -484.7807312011719, "loss": 0.5792, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0691642761230469, "rewards/margins": 0.5099738836288452, "rewards/rejected": -1.579138159751892, "step": 237 }, { "epoch": 0.508682874699439, "grad_norm": 20.104152888553777, "learning_rate": 2.8541043210389726e-07, "logits/chosen": 1.4386096000671387, "logits/rejected": 1.5074604749679565, "logps/chosen": -327.89239501953125, "logps/rejected": -340.6610412597656, "loss": 0.6041, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4297116994857788, "rewards/margins": -0.030831288546323776, "rewards/rejected": -1.3988802433013916, "step": 238 }, { "epoch": 0.5108201977023777, "grad_norm": 18.984989332345673, "learning_rate": 2.8355831645441387e-07, "logits/chosen": 1.4687011241912842, "logits/rejected": 1.5416244268417358, "logps/chosen": -298.0223693847656, "logps/rejected": -331.571533203125, "loss": 0.6282, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4456336498260498, "rewards/margins": 0.42436861991882324, "rewards/rejected": -1.8700025081634521, "step": 239 }, { "epoch": 0.5129575207053166, "grad_norm": 19.616076404098113, "learning_rate": 2.817043232212371e-07, "logits/chosen": 1.4483119249343872, "logits/rejected": 1.4856162071228027, "logps/chosen": -362.3177795410156, "logps/rejected": -404.8100280761719, "loss": 0.6419, "rewards/accuracies": 0.5625, "rewards/chosen": -1.233857274055481, "rewards/margins": 0.10059986263513565, "rewards/rejected": -1.3344570398330688, "step": 240 }, { "epoch": 0.5150948437082554, "grad_norm": 19.564465947045136, "learning_rate": 2.7984855613506106e-07, "logits/chosen": 1.8776293992996216, "logits/rejected": 1.9832379817962646, "logps/chosen": -344.974365234375, "logps/rejected": -392.325927734375, "loss": 0.6338, "rewards/accuracies": 0.75, "rewards/chosen": -1.1595617532730103, "rewards/margins": 0.3295942544937134, "rewards/rejected": -1.4891560077667236, "step": 241 }, { "epoch": 0.5172321667111942, "grad_norm": 16.85077595027773, "learning_rate": 2.7799111902582693e-07, "logits/chosen": 1.8834673166275024, "logits/rejected": 1.9119462966918945, "logps/chosen": -342.9952392578125, "logps/rejected": -360.51220703125, "loss": 0.6157, "rewards/accuracies": 0.5, "rewards/chosen": -1.2162364721298218, "rewards/margins": 0.1725614368915558, "rewards/rejected": -1.3887979984283447, "step": 242 }, { "epoch": 0.5193694897141331, "grad_norm": 18.949703224831786, "learning_rate": 2.761321158169134e-07, "logits/chosen": 2.179548978805542, "logits/rejected": 2.157480478286743, "logps/chosen": -390.4351501464844, "logps/rejected": -449.8084411621094, "loss": 0.6343, "rewards/accuracies": 0.6875, "rewards/chosen": -1.394233226776123, "rewards/margins": 0.2236776202917099, "rewards/rejected": -1.6179107427597046, "step": 243 }, { "epoch": 0.5215068127170719, "grad_norm": 16.607869629392933, "learning_rate": 2.74271650519322e-07, "logits/chosen": 1.8933420181274414, "logits/rejected": 1.9424501657485962, "logps/chosen": -599.1654052734375, "logps/rejected": -581.9706420898438, "loss": 0.5837, "rewards/accuracies": 0.5, "rewards/chosen": -1.4606138467788696, "rewards/margins": 0.1728818118572235, "rewards/rejected": -1.6334956884384155, "step": 244 }, { "epoch": 0.5236441357200107, "grad_norm": 16.905690523597688, "learning_rate": 2.7240982722585837e-07, "logits/chosen": 1.7946285009384155, "logits/rejected": 1.8385579586029053, "logps/chosen": -281.8657531738281, "logps/rejected": -314.0006408691406, "loss": 0.5875, "rewards/accuracies": 0.625, "rewards/chosen": -1.2459651231765747, "rewards/margins": 0.014865804463624954, "rewards/rejected": -1.2608309984207153, "step": 245 }, { "epoch": 0.5257814587229495, "grad_norm": 19.248918362668643, "learning_rate": 2.705467501053076e-07, "logits/chosen": 2.24014949798584, "logits/rejected": 2.2482752799987793, "logps/chosen": -412.81854248046875, "logps/rejected": -382.4270324707031, "loss": 0.6086, "rewards/accuracies": 0.5, "rewards/chosen": -1.1754889488220215, "rewards/margins": 0.09349602460861206, "rewards/rejected": -1.2689850330352783, "step": 246 }, { "epoch": 0.5279187817258884, "grad_norm": 16.8906231683352, "learning_rate": 2.6868252339660607e-07, "logits/chosen": 1.3152052164077759, "logits/rejected": 1.4119510650634766, "logps/chosen": -329.6816101074219, "logps/rejected": -337.0438537597656, "loss": 0.579, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5544246435165405, "rewards/margins": 0.47526106238365173, "rewards/rejected": -2.0296854972839355, "step": 247 }, { "epoch": 0.5300561047288271, "grad_norm": 18.021321805955274, "learning_rate": 2.6681725140300995e-07, "logits/chosen": 1.5704407691955566, "logits/rejected": 1.6143972873687744, "logps/chosen": -369.656494140625, "logps/rejected": -393.353515625, "loss": 0.6217, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3475369215011597, "rewards/margins": 0.3202340602874756, "rewards/rejected": -1.6677708625793457, "step": 248 }, { "epoch": 0.5321934277317659, "grad_norm": 19.509340419547396, "learning_rate": 2.6495103848625854e-07, "logits/chosen": 1.7228862047195435, "logits/rejected": 1.735210657119751, "logps/chosen": -366.4405822753906, "logps/rejected": -398.99603271484375, "loss": 0.6436, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3989753723144531, "rewards/margins": 0.46870413422584534, "rewards/rejected": -1.8676795959472656, "step": 249 }, { "epoch": 0.5343307507347048, "grad_norm": 21.318546314009538, "learning_rate": 2.63083989060736e-07, "logits/chosen": 1.8195374011993408, "logits/rejected": 1.8398746252059937, "logps/chosen": -427.18402099609375, "logps/rejected": -452.298828125, "loss": 0.6685, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5230650901794434, "rewards/margins": 0.3252180218696594, "rewards/rejected": -1.848283052444458, "step": 250 }, { "epoch": 0.5364680737376436, "grad_norm": 19.878831886338375, "learning_rate": 2.6121620758762875e-07, "logits/chosen": 1.4604578018188477, "logits/rejected": 1.3828535079956055, "logps/chosen": -400.2124938964844, "logps/rejected": -367.87689208984375, "loss": 0.5929, "rewards/accuracies": 0.75, "rewards/chosen": -1.4065700769424438, "rewards/margins": 0.5055478811264038, "rewards/rejected": -1.9121177196502686, "step": 251 }, { "epoch": 0.5386053967405824, "grad_norm": 19.27873005266974, "learning_rate": 2.593477985690815e-07, "logits/chosen": 2.034696578979492, "logits/rejected": 1.9050860404968262, "logps/chosen": -502.56475830078125, "logps/rejected": -473.90606689453125, "loss": 0.5616, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5159586668014526, "rewards/margins": 0.5788987278938293, "rewards/rejected": -2.0948572158813477, "step": 252 }, { "epoch": 0.5407427197435213, "grad_norm": 17.037711101859895, "learning_rate": 2.574788665423496e-07, "logits/chosen": 1.8234672546386719, "logits/rejected": 1.8364436626434326, "logps/chosen": -403.5150146484375, "logps/rejected": -435.0159606933594, "loss": 0.5847, "rewards/accuracies": 0.75, "rewards/chosen": -0.9063006043434143, "rewards/margins": 0.604102611541748, "rewards/rejected": -1.5104031562805176, "step": 253 }, { "epoch": 0.5428800427464601, "grad_norm": 20.75619046985192, "learning_rate": 2.5560951607395126e-07, "logits/chosen": 1.916282296180725, "logits/rejected": 1.836082935333252, "logps/chosen": -396.24560546875, "logps/rejected": -400.76007080078125, "loss": 0.6306, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3977752923965454, "rewards/margins": 0.23083746433258057, "rewards/rejected": -1.628612756729126, "step": 254 }, { "epoch": 0.5450173657493989, "grad_norm": 19.622882332492626, "learning_rate": 2.537398517538159e-07, "logits/chosen": 2.3518788814544678, "logits/rejected": 2.373096227645874, "logps/chosen": -384.93212890625, "logps/rejected": -370.65478515625, "loss": 0.6153, "rewards/accuracies": 0.625, "rewards/chosen": -1.1938955783843994, "rewards/margins": 0.3775806725025177, "rewards/rejected": -1.5714763402938843, "step": 255 }, { "epoch": 0.5471546887523377, "grad_norm": 19.58386952453966, "learning_rate": 2.518699781894332e-07, "logits/chosen": 1.9337654113769531, "logits/rejected": 1.907602071762085, "logps/chosen": -416.7008056640625, "logps/rejected": -424.2861633300781, "loss": 0.5973, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4020006656646729, "rewards/margins": 0.38494953513145447, "rewards/rejected": -1.7869502305984497, "step": 256 }, { "epoch": 0.5492920117552765, "grad_norm": 16.361122872177194, "learning_rate": 2.5e-07, "logits/chosen": 1.7576777935028076, "logits/rejected": 1.6899727582931519, "logps/chosen": -391.47491455078125, "logps/rejected": -425.26531982421875, "loss": 0.6003, "rewards/accuracies": 1.0, "rewards/chosen": -0.7915077805519104, "rewards/margins": 0.6424891948699951, "rewards/rejected": -1.4339970350265503, "step": 257 }, { "epoch": 0.5514293347582153, "grad_norm": 20.14867898121995, "learning_rate": 2.4813002181056676e-07, "logits/chosen": 1.8691372871398926, "logits/rejected": 1.8315614461898804, "logps/chosen": -336.1017761230469, "logps/rejected": -355.53497314453125, "loss": 0.6183, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5710742473602295, "rewards/margins": 0.2784579396247864, "rewards/rejected": -1.8495322465896606, "step": 258 }, { "epoch": 0.5535666577611541, "grad_norm": 18.423158676910308, "learning_rate": 2.4626014824618413e-07, "logits/chosen": 1.8882901668548584, "logits/rejected": 1.8200494050979614, "logps/chosen": -396.83642578125, "logps/rejected": -417.7898254394531, "loss": 0.6253, "rewards/accuracies": 0.625, "rewards/chosen": -1.1084598302841187, "rewards/margins": 0.2924001216888428, "rewards/rejected": -1.4008599519729614, "step": 259 }, { "epoch": 0.555703980764093, "grad_norm": 17.985218179528584, "learning_rate": 2.4439048392604877e-07, "logits/chosen": 1.7273188829421997, "logits/rejected": 1.7321103811264038, "logps/chosen": -319.5902099609375, "logps/rejected": -343.101318359375, "loss": 0.6234, "rewards/accuracies": 0.625, "rewards/chosen": -1.216766595840454, "rewards/margins": 0.5042122602462769, "rewards/rejected": -1.720978856086731, "step": 260 }, { "epoch": 0.5578413037670318, "grad_norm": 18.28183834907717, "learning_rate": 2.4252113345765043e-07, "logits/chosen": 2.003500461578369, "logits/rejected": 2.1276445388793945, "logps/chosen": -425.6303405761719, "logps/rejected": -465.6767578125, "loss": 0.5733, "rewards/accuracies": 0.625, "rewards/chosen": -1.439363718032837, "rewards/margins": 0.22122399508953094, "rewards/rejected": -1.6605877876281738, "step": 261 }, { "epoch": 0.5599786267699706, "grad_norm": 18.461608241814286, "learning_rate": 2.406522014309186e-07, "logits/chosen": 1.563164234161377, "logits/rejected": 1.5734854936599731, "logps/chosen": -325.31903076171875, "logps/rejected": -325.20819091796875, "loss": 0.5877, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7867953777313232, "rewards/margins": 0.3784661591053009, "rewards/rejected": -1.1652615070343018, "step": 262 }, { "epoch": 0.5621159497729095, "grad_norm": 17.283252492595803, "learning_rate": 2.3878379241237134e-07, "logits/chosen": 2.279524087905884, "logits/rejected": 2.296083688735962, "logps/chosen": -328.20220947265625, "logps/rejected": -361.738037109375, "loss": 0.5871, "rewards/accuracies": 0.8125, "rewards/chosen": -1.240782618522644, "rewards/margins": 0.6395030617713928, "rewards/rejected": -1.8802857398986816, "step": 263 }, { "epoch": 0.5642532727758482, "grad_norm": 16.930318409654138, "learning_rate": 2.3691601093926402e-07, "logits/chosen": 1.8105309009552002, "logits/rejected": 1.8964433670043945, "logps/chosen": -310.3644714355469, "logps/rejected": -350.0364685058594, "loss": 0.5622, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1182781457901, "rewards/margins": 0.6728768348693848, "rewards/rejected": -1.7911549806594849, "step": 264 }, { "epoch": 0.566390595778787, "grad_norm": 18.154082784411116, "learning_rate": 2.3504896151374144e-07, "logits/chosen": 1.9459747076034546, "logits/rejected": 1.9903091192245483, "logps/chosen": -496.9175109863281, "logps/rejected": -534.62255859375, "loss": 0.5978, "rewards/accuracies": 0.8125, "rewards/chosen": -1.114598274230957, "rewards/margins": 0.49393293261528015, "rewards/rejected": -1.6085312366485596, "step": 265 }, { "epoch": 0.5685279187817259, "grad_norm": 17.886561891555203, "learning_rate": 2.3318274859699008e-07, "logits/chosen": 1.9951430559158325, "logits/rejected": 2.013209819793701, "logps/chosen": -431.7491455078125, "logps/rejected": -472.21661376953125, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": -1.0381141901016235, "rewards/margins": 0.957762598991394, "rewards/rejected": -1.995876669883728, "step": 266 }, { "epoch": 0.5706652417846647, "grad_norm": 19.118226835611033, "learning_rate": 2.3131747660339394e-07, "logits/chosen": 1.1025474071502686, "logits/rejected": 0.9203678965568542, "logps/chosen": -452.95941162109375, "logps/rejected": -400.9754943847656, "loss": 0.6536, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3553560972213745, "rewards/margins": 0.09373009204864502, "rewards/rejected": -1.4490864276885986, "step": 267 }, { "epoch": 0.5728025647876035, "grad_norm": 17.64914152601931, "learning_rate": 2.2945324989469243e-07, "logits/chosen": 1.9431540966033936, "logits/rejected": 1.746927261352539, "logps/chosen": -369.2441101074219, "logps/rejected": -329.55145263671875, "loss": 0.5729, "rewards/accuracies": 0.625, "rewards/chosen": -1.0598613023757935, "rewards/margins": 0.18452677130699158, "rewards/rejected": -1.2443879842758179, "step": 268 }, { "epoch": 0.5749398877905424, "grad_norm": 16.484270905189813, "learning_rate": 2.2759017277414164e-07, "logits/chosen": 2.0279760360717773, "logits/rejected": 1.9914069175720215, "logps/chosen": -434.94366455078125, "logps/rejected": -388.86077880859375, "loss": 0.5838, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2059601545333862, "rewards/margins": 0.1697658896446228, "rewards/rejected": -1.3757259845733643, "step": 269 }, { "epoch": 0.5770772107934812, "grad_norm": 22.56606378913711, "learning_rate": 2.2572834948067795e-07, "logits/chosen": 2.1591734886169434, "logits/rejected": 2.223292112350464, "logps/chosen": -468.49639892578125, "logps/rejected": -476.2985534667969, "loss": 0.6631, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7323036193847656, "rewards/margins": 0.29446297883987427, "rewards/rejected": -2.026766538619995, "step": 270 }, { "epoch": 0.57921453379642, "grad_norm": 19.87333935608928, "learning_rate": 2.2386788418308665e-07, "logits/chosen": 1.343954086303711, "logits/rejected": 1.1634771823883057, "logps/chosen": -380.564453125, "logps/rejected": -400.9522705078125, "loss": 0.6052, "rewards/accuracies": 0.8125, "rewards/chosen": -1.206040859222412, "rewards/margins": 0.5869691967964172, "rewards/rejected": -1.7930101156234741, "step": 271 }, { "epoch": 0.5813518567993589, "grad_norm": 18.1006643173551, "learning_rate": 2.2200888097417302e-07, "logits/chosen": 1.7346237897872925, "logits/rejected": 1.5861271619796753, "logps/chosen": -320.0159606933594, "logps/rejected": -314.8853759765625, "loss": 0.6227, "rewards/accuracies": 0.6875, "rewards/chosen": -1.077418565750122, "rewards/margins": 0.36749520897865295, "rewards/rejected": -1.4449137449264526, "step": 272 }, { "epoch": 0.5834891798022976, "grad_norm": 19.024746475121724, "learning_rate": 2.2015144386493895e-07, "logits/chosen": 1.7071813344955444, "logits/rejected": 1.628884196281433, "logps/chosen": -341.3028564453125, "logps/rejected": -353.91278076171875, "loss": 0.5773, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3241965770721436, "rewards/margins": 0.444732129573822, "rewards/rejected": -1.7689287662506104, "step": 273 }, { "epoch": 0.5856265028052364, "grad_norm": 18.34263207305998, "learning_rate": 2.1829567677876297e-07, "logits/chosen": 1.7893891334533691, "logits/rejected": 1.8639053106307983, "logps/chosen": -319.2881164550781, "logps/rejected": -349.91851806640625, "loss": 0.6001, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0994592905044556, "rewards/margins": 0.519347071647644, "rewards/rejected": -1.6188066005706787, "step": 274 }, { "epoch": 0.5877638258081752, "grad_norm": 16.918446999240707, "learning_rate": 2.164416835455862e-07, "logits/chosen": 1.8019373416900635, "logits/rejected": 1.6737080812454224, "logps/chosen": -393.3109436035156, "logps/rejected": -404.9291076660156, "loss": 0.5719, "rewards/accuracies": 0.625, "rewards/chosen": -1.1502411365509033, "rewards/margins": 0.16453669965267181, "rewards/rejected": -1.3147778511047363, "step": 275 }, { "epoch": 0.5899011488111141, "grad_norm": 18.434871669590013, "learning_rate": 2.1458956789610277e-07, "logits/chosen": 1.8519237041473389, "logits/rejected": 1.8247859477996826, "logps/chosen": -298.4532165527344, "logps/rejected": -277.18121337890625, "loss": 0.5889, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2886124849319458, "rewards/margins": 0.4535689353942871, "rewards/rejected": -1.7421815395355225, "step": 276 }, { "epoch": 0.5920384718140529, "grad_norm": 18.52956407873372, "learning_rate": 2.1273943345595635e-07, "logits/chosen": 1.8608410358428955, "logits/rejected": 1.8228371143341064, "logps/chosen": -433.77166748046875, "logps/rejected": -428.9039001464844, "loss": 0.6168, "rewards/accuracies": 0.625, "rewards/chosen": -1.420507550239563, "rewards/margins": 0.2784229815006256, "rewards/rejected": -1.6989305019378662, "step": 277 }, { "epoch": 0.5941757948169917, "grad_norm": 17.630697492069693, "learning_rate": 2.1089138373994222e-07, "logits/chosen": 1.5853919982910156, "logits/rejected": 1.4946439266204834, "logps/chosen": -352.94097900390625, "logps/rejected": -350.8989562988281, "loss": 0.6307, "rewards/accuracies": 0.625, "rewards/chosen": -1.009688377380371, "rewards/margins": 0.2674168348312378, "rewards/rejected": -1.2771050930023193, "step": 278 }, { "epoch": 0.5963131178199306, "grad_norm": 18.10952812595942, "learning_rate": 2.0904552214621556e-07, "logits/chosen": 1.433165192604065, "logits/rejected": 1.5178760290145874, "logps/chosen": -301.49359130859375, "logps/rejected": -378.566162109375, "loss": 0.5841, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9828816056251526, "rewards/margins": 0.8563581109046936, "rewards/rejected": -1.8392397165298462, "step": 279 }, { "epoch": 0.5984504408228694, "grad_norm": 16.077356770517124, "learning_rate": 2.072019519505062e-07, "logits/chosen": 2.150374412536621, "logits/rejected": 2.144287109375, "logps/chosen": -378.2465515136719, "logps/rejected": -414.12860107421875, "loss": 0.5743, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7395087480545044, "rewards/margins": 0.6563863158226013, "rewards/rejected": -1.3958951234817505, "step": 280 }, { "epoch": 0.6005877638258081, "grad_norm": 24.441481846041146, "learning_rate": 2.0536077630034085e-07, "logits/chosen": 1.7997475862503052, "logits/rejected": 1.8200863599777222, "logps/chosen": -391.2835693359375, "logps/rejected": -403.364501953125, "loss": 0.6913, "rewards/accuracies": 0.5, "rewards/chosen": -1.2694519758224487, "rewards/margins": 0.08388321101665497, "rewards/rejected": -1.3533351421356201, "step": 281 }, { "epoch": 0.602725086828747, "grad_norm": 17.8344144696276, "learning_rate": 2.0352209820927135e-07, "logits/chosen": 1.073244571685791, "logits/rejected": 0.8806475400924683, "logps/chosen": -315.171630859375, "logps/rejected": -333.1543884277344, "loss": 0.6089, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9738328456878662, "rewards/margins": 0.7536094188690186, "rewards/rejected": -1.7274422645568848, "step": 282 }, { "epoch": 0.6048624098316858, "grad_norm": 18.10807913356027, "learning_rate": 2.0168602055111173e-07, "logits/chosen": 1.743872046470642, "logits/rejected": 1.6714892387390137, "logps/chosen": -403.4494323730469, "logps/rejected": -429.84051513671875, "loss": 0.5822, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9764223694801331, "rewards/margins": 0.6022226214408875, "rewards/rejected": -1.57864511013031, "step": 283 }, { "epoch": 0.6069997328346246, "grad_norm": 16.26326383123038, "learning_rate": 1.998526460541818e-07, "logits/chosen": 1.78279709815979, "logits/rejected": 1.7751100063323975, "logps/chosen": -391.4514465332031, "logps/rejected": -400.68658447265625, "loss": 0.547, "rewards/accuracies": 0.75, "rewards/chosen": -1.096341609954834, "rewards/margins": 0.5708625912666321, "rewards/rejected": -1.6672042608261108, "step": 284 }, { "epoch": 0.6091370558375635, "grad_norm": 21.544359613974976, "learning_rate": 1.980220772955602e-07, "logits/chosen": 2.0660159587860107, "logits/rejected": 2.0274107456207275, "logps/chosen": -456.6308898925781, "logps/rejected": -493.3993225097656, "loss": 0.6296, "rewards/accuracies": 0.5625, "rewards/chosen": -1.374593734741211, "rewards/margins": 0.18421491980552673, "rewards/rejected": -1.55880868434906, "step": 285 }, { "epoch": 0.6112743788405023, "grad_norm": 19.038810259481426, "learning_rate": 1.961944166953445e-07, "logits/chosen": 1.1080034971237183, "logits/rejected": 1.2669103145599365, "logps/chosen": -365.549072265625, "logps/rejected": -351.73028564453125, "loss": 0.62, "rewards/accuracies": 0.625, "rewards/chosen": -1.421024203300476, "rewards/margins": 0.5578509569168091, "rewards/rejected": -1.9788750410079956, "step": 286 }, { "epoch": 0.6134117018434411, "grad_norm": 20.136440706184228, "learning_rate": 1.9436976651092142e-07, "logits/chosen": 1.9399811029434204, "logits/rejected": 1.8495838642120361, "logps/chosen": -355.4549255371094, "logps/rejected": -369.4902648925781, "loss": 0.6159, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2287437915802002, "rewards/margins": 0.48982152342796326, "rewards/rejected": -1.7185652256011963, "step": 287 }, { "epoch": 0.6155490248463799, "grad_norm": 17.789228307045246, "learning_rate": 1.9254822883124517e-07, "logits/chosen": 1.929220199584961, "logits/rejected": 1.928113579750061, "logps/chosen": -463.58563232421875, "logps/rejected": -486.4768371582031, "loss": 0.5905, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3614306449890137, "rewards/margins": 0.24906916916370392, "rewards/rejected": -1.6104998588562012, "step": 288 }, { "epoch": 0.6176863478493188, "grad_norm": 17.924045280638552, "learning_rate": 1.9072990557112564e-07, "logits/chosen": 1.9938664436340332, "logits/rejected": 2.048049211502075, "logps/chosen": -314.8684997558594, "logps/rejected": -366.38739013671875, "loss": 0.5973, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2091230154037476, "rewards/margins": 0.43797287344932556, "rewards/rejected": -1.6470959186553955, "step": 289 }, { "epoch": 0.6198236708522575, "grad_norm": 19.138114842281414, "learning_rate": 1.8891489846552644e-07, "logits/chosen": 1.8760986328125, "logits/rejected": 1.8651152849197388, "logps/chosen": -352.26373291015625, "logps/rejected": -363.06231689453125, "loss": 0.5813, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2897804975509644, "rewards/margins": 0.037354469299316406, "rewards/rejected": -1.3271349668502808, "step": 290 }, { "epoch": 0.6219609938551963, "grad_norm": 18.141812527530227, "learning_rate": 1.8710330906387286e-07, "logits/chosen": 1.7009693384170532, "logits/rejected": 1.7097214460372925, "logps/chosen": -401.5587463378906, "logps/rejected": -406.8404541015625, "loss": 0.6046, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2753279209136963, "rewards/margins": 0.20724107325077057, "rewards/rejected": -1.4825689792633057, "step": 291 }, { "epoch": 0.6240983168581352, "grad_norm": 22.82747902823447, "learning_rate": 1.8529523872436977e-07, "logits/chosen": 1.7425600290298462, "logits/rejected": 1.490181803703308, "logps/chosen": -419.9559631347656, "logps/rejected": -406.5120849609375, "loss": 0.6505, "rewards/accuracies": 0.625, "rewards/chosen": -1.4110760688781738, "rewards/margins": 0.21470992267131805, "rewards/rejected": -1.6257858276367188, "step": 292 }, { "epoch": 0.626235639861074, "grad_norm": 17.909633658779036, "learning_rate": 1.8349078860833124e-07, "logits/chosen": 1.7675672769546509, "logits/rejected": 1.8061796426773071, "logps/chosen": -391.7986145019531, "logps/rejected": -431.70172119140625, "loss": 0.6072, "rewards/accuracies": 0.625, "rewards/chosen": -0.6551833152770996, "rewards/margins": 0.3061232566833496, "rewards/rejected": -0.9613065719604492, "step": 293 }, { "epoch": 0.6283729628640128, "grad_norm": 17.64445331496705, "learning_rate": 1.8169005967452e-07, "logits/chosen": 0.9978383779525757, "logits/rejected": 1.074844479560852, "logps/chosen": -396.01104736328125, "logps/rejected": -366.7948303222656, "loss": 0.5871, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1115831136703491, "rewards/margins": 0.7094423174858093, "rewards/rejected": -1.8210253715515137, "step": 294 }, { "epoch": 0.6305102858669517, "grad_norm": 19.11480868885265, "learning_rate": 1.7989315267349933e-07, "logits/chosen": 1.2297173738479614, "logits/rejected": 1.197448968887329, "logps/chosen": -276.0805969238281, "logps/rejected": -330.3874206542969, "loss": 0.6201, "rewards/accuracies": 0.875, "rewards/chosen": -0.9019981026649475, "rewards/margins": 0.9751827716827393, "rewards/rejected": -1.8771809339523315, "step": 295 }, { "epoch": 0.6326476088698905, "grad_norm": 18.788836039541163, "learning_rate": 1.781001681419957e-07, "logits/chosen": 1.2067197561264038, "logits/rejected": 1.1462682485580444, "logps/chosen": -435.5975646972656, "logps/rejected": -410.3743896484375, "loss": 0.6502, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8658223748207092, "rewards/margins": 0.13561075925827026, "rewards/rejected": -1.0014331340789795, "step": 296 }, { "epoch": 0.6347849318728293, "grad_norm": 22.345954181375724, "learning_rate": 1.763112063972739e-07, "logits/chosen": 1.4078682661056519, "logits/rejected": 1.5019978284835815, "logps/chosen": -265.8661804199219, "logps/rejected": -328.0052185058594, "loss": 0.6665, "rewards/accuracies": 0.625, "rewards/chosen": -0.9942247867584229, "rewards/margins": 0.46525779366493225, "rewards/rejected": -1.4594827890396118, "step": 297 }, { "epoch": 0.6369222548757681, "grad_norm": 17.95979388047236, "learning_rate": 1.745263675315245e-07, "logits/chosen": 1.9481912851333618, "logits/rejected": 2.0087857246398926, "logps/chosen": -399.0690002441406, "logps/rejected": -425.6741943359375, "loss": 0.6197, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0927371978759766, "rewards/margins": 0.22326043248176575, "rewards/rejected": -1.3159977197647095, "step": 298 }, { "epoch": 0.6390595778787069, "grad_norm": 20.924697959345067, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 1.2710243463516235, "logits/rejected": 1.1798756122589111, "logps/chosen": -346.3345031738281, "logps/rejected": -352.3256530761719, "loss": 0.6307, "rewards/accuracies": 0.6875, "rewards/chosen": -1.233717441558838, "rewards/margins": 0.1365053653717041, "rewards/rejected": -1.370222806930542, "step": 299 }, { "epoch": 0.6411969008816457, "grad_norm": 17.692093613276146, "learning_rate": 1.7096945764674398e-07, "logits/chosen": 1.7905324697494507, "logits/rejected": 1.7943121194839478, "logps/chosen": -347.2984924316406, "logps/rejected": -340.6080322265625, "loss": 0.5765, "rewards/accuracies": 0.625, "rewards/chosen": -1.07566499710083, "rewards/margins": 0.10227301716804504, "rewards/rejected": -1.1779381036758423, "step": 300 }, { "epoch": 0.6433342238845846, "grad_norm": 19.852613137333368, "learning_rate": 1.6919758563638502e-07, "logits/chosen": 2.267047882080078, "logits/rejected": 2.2171971797943115, "logps/chosen": -509.4197998046875, "logps/rejected": -506.5215759277344, "loss": 0.6171, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9691479802131653, "rewards/margins": 0.4192157983779907, "rewards/rejected": -1.3883638381958008, "step": 301 }, { "epoch": 0.6454715468875234, "grad_norm": 20.951925501886503, "learning_rate": 1.674302345112083e-07, "logits/chosen": 1.9694963693618774, "logits/rejected": 1.918088674545288, "logps/chosen": -429.9407653808594, "logps/rejected": -431.4341735839844, "loss": 0.6454, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0554986000061035, "rewards/margins": 0.7666572332382202, "rewards/rejected": -1.8221558332443237, "step": 302 }, { "epoch": 0.6476088698904622, "grad_norm": 19.559839163696893, "learning_rate": 1.656675031542925e-07, "logits/chosen": 1.6839439868927002, "logits/rejected": 1.764726161956787, "logps/chosen": -374.66827392578125, "logps/rejected": -428.7526550292969, "loss": 0.5975, "rewards/accuracies": 0.75, "rewards/chosen": -1.1381664276123047, "rewards/margins": 0.4832189977169037, "rewards/rejected": -1.6213853359222412, "step": 303 }, { "epoch": 0.649746192893401, "grad_norm": 21.095559884110685, "learning_rate": 1.6390949019024118e-07, "logits/chosen": 1.3224655389785767, "logits/rejected": 1.3726483583450317, "logps/chosen": -343.5739440917969, "logps/rejected": -349.3829040527344, "loss": 0.6829, "rewards/accuracies": 0.5625, "rewards/chosen": -1.487676978111267, "rewards/margins": 0.12136916816234589, "rewards/rejected": -1.6090461015701294, "step": 304 }, { "epoch": 0.6518835158963399, "grad_norm": 19.396572088599502, "learning_rate": 1.621562939796643e-07, "logits/chosen": 1.6602495908737183, "logits/rejected": 1.7938483953475952, "logps/chosen": -333.3067626953125, "logps/rejected": -398.20355224609375, "loss": 0.5991, "rewards/accuracies": 0.6875, "rewards/chosen": -0.591664731502533, "rewards/margins": 0.5985820889472961, "rewards/rejected": -1.190246820449829, "step": 305 }, { "epoch": 0.6540208388992786, "grad_norm": 18.311690178787455, "learning_rate": 1.6040801261367493e-07, "logits/chosen": 1.9007511138916016, "logits/rejected": 1.9012222290039062, "logps/chosen": -379.4654846191406, "logps/rejected": -393.036865234375, "loss": 0.5811, "rewards/accuracies": 0.625, "rewards/chosen": -1.008962631225586, "rewards/margins": 0.23067611455917358, "rewards/rejected": -1.2396388053894043, "step": 306 }, { "epoch": 0.6561581619022174, "grad_norm": 21.266085139828498, "learning_rate": 1.5866474390840124e-07, "logits/chosen": 1.795674443244934, "logits/rejected": 1.858997106552124, "logps/chosen": -380.346435546875, "logps/rejected": -384.34075927734375, "loss": 0.6022, "rewards/accuracies": 0.625, "rewards/chosen": -0.886805534362793, "rewards/margins": 0.37954381108283997, "rewards/rejected": -1.2663494348526, "step": 307 }, { "epoch": 0.6582954849051563, "grad_norm": 17.461304449850505, "learning_rate": 1.569265853995137e-07, "logits/chosen": 1.9612005949020386, "logits/rejected": 1.921550989151001, "logps/chosen": -421.7749328613281, "logps/rejected": -458.66717529296875, "loss": 0.6178, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0546059608459473, "rewards/margins": 0.7568830251693726, "rewards/rejected": -1.8114891052246094, "step": 308 }, { "epoch": 0.6604328079080951, "grad_norm": 19.082726960592346, "learning_rate": 1.5519363433676791e-07, "logits/chosen": 1.519544005393982, "logits/rejected": 1.5249278545379639, "logps/chosen": -421.0302429199219, "logps/rejected": -439.40557861328125, "loss": 0.5887, "rewards/accuracies": 0.625, "rewards/chosen": -1.3705350160598755, "rewards/margins": 0.23214447498321533, "rewards/rejected": -1.6026794910430908, "step": 309 }, { "epoch": 0.6625701309110339, "grad_norm": 18.11403641529886, "learning_rate": 1.5346598767856345e-07, "logits/chosen": 1.587712049484253, "logits/rejected": 1.6687382459640503, "logps/chosen": -351.07464599609375, "logps/rejected": -359.98126220703125, "loss": 0.6091, "rewards/accuracies": 0.75, "rewards/chosen": -0.9926980137825012, "rewards/margins": 0.7598885297775269, "rewards/rejected": -1.7525867223739624, "step": 310 }, { "epoch": 0.6647074539139728, "grad_norm": 16.941104687169506, "learning_rate": 1.517437420865191e-07, "logits/chosen": 1.8143850564956665, "logits/rejected": 1.6587419509887695, "logps/chosen": -404.6020202636719, "logps/rejected": -389.1875305175781, "loss": 0.5913, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0236051082611084, "rewards/margins": 0.31825628876686096, "rewards/rejected": -1.3418614864349365, "step": 311 }, { "epoch": 0.6668447769169116, "grad_norm": 18.994018231106203, "learning_rate": 1.500269939200648e-07, "logits/chosen": 1.5793185234069824, "logits/rejected": 1.6349645853042603, "logps/chosen": -373.96240234375, "logps/rejected": -388.9745788574219, "loss": 0.5628, "rewards/accuracies": 0.875, "rewards/chosen": -0.9590240120887756, "rewards/margins": 0.5476614832878113, "rewards/rejected": -1.5066853761672974, "step": 312 }, { "epoch": 0.6689820999198504, "grad_norm": 19.795530112372177, "learning_rate": 1.4831583923104998e-07, "logits/chosen": 1.6909123659133911, "logits/rejected": 1.7797627449035645, "logps/chosen": -416.3101806640625, "logps/rejected": -438.67462158203125, "loss": 0.6688, "rewards/accuracies": 0.375, "rewards/chosen": -0.9395960569381714, "rewards/margins": 0.044527970254421234, "rewards/rejected": -0.9841240644454956, "step": 313 }, { "epoch": 0.6711194229227893, "grad_norm": 23.065919224015374, "learning_rate": 1.4661037375836987e-07, "logits/chosen": 1.376868486404419, "logits/rejected": 1.4309742450714111, "logps/chosen": -407.21051025390625, "logps/rejected": -379.1553039550781, "loss": 0.6451, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0327116250991821, "rewards/margins": 0.013223947957158089, "rewards/rejected": -1.0459357500076294, "step": 314 }, { "epoch": 0.673256745925728, "grad_norm": 18.097806750182333, "learning_rate": 1.4491069292260866e-07, "logits/chosen": 1.9266318082809448, "logits/rejected": 1.8753666877746582, "logps/chosen": -430.4090576171875, "logps/rejected": -414.37841796875, "loss": 0.6214, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0124588012695312, "rewards/margins": 0.7154525518417358, "rewards/rejected": -1.727911353111267, "step": 315 }, { "epoch": 0.6753940689286668, "grad_norm": 16.883704416038682, "learning_rate": 1.432168918207009e-07, "logits/chosen": 2.082824468612671, "logits/rejected": 2.1169075965881348, "logps/chosen": -444.53173828125, "logps/rejected": -443.7054443359375, "loss": 0.5946, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0104761123657227, "rewards/margins": 0.373771071434021, "rewards/rejected": -1.3842473030090332, "step": 316 }, { "epoch": 0.6775313919316056, "grad_norm": 18.54452166971697, "learning_rate": 1.4152906522061047e-07, "logits/chosen": 1.2201721668243408, "logits/rejected": 1.2176278829574585, "logps/chosen": -415.645263671875, "logps/rejected": -436.7527160644531, "loss": 0.6087, "rewards/accuracies": 0.75, "rewards/chosen": -0.836451530456543, "rewards/margins": 0.502465009689331, "rewards/rejected": -1.338916540145874, "step": 317 }, { "epoch": 0.6796687149345445, "grad_norm": 17.299138202298057, "learning_rate": 1.3984730755602903e-07, "logits/chosen": 1.1743218898773193, "logits/rejected": 1.1369413137435913, "logps/chosen": -279.7085266113281, "logps/rejected": -293.5144958496094, "loss": 0.5782, "rewards/accuracies": 0.625, "rewards/chosen": -0.750356137752533, "rewards/margins": 0.5203050374984741, "rewards/rejected": -1.2706611156463623, "step": 318 }, { "epoch": 0.6818060379374833, "grad_norm": 19.398975454291527, "learning_rate": 1.381717129210918e-07, "logits/chosen": 2.5072126388549805, "logits/rejected": 2.4705142974853516, "logps/chosen": -370.4862060546875, "logps/rejected": -377.8222351074219, "loss": 0.5967, "rewards/accuracies": 0.5, "rewards/chosen": -1.3816720247268677, "rewards/margins": 0.17874151468276978, "rewards/rejected": -1.5604134798049927, "step": 319 }, { "epoch": 0.6839433609404221, "grad_norm": 17.240016620199377, "learning_rate": 1.365023750651133e-07, "logits/chosen": 2.6305899620056152, "logits/rejected": 2.5452349185943604, "logps/chosen": -395.29217529296875, "logps/rejected": -391.28607177734375, "loss": 0.5865, "rewards/accuracies": 0.5625, "rewards/chosen": -1.043142318725586, "rewards/margins": 0.10385166853666306, "rewards/rejected": -1.1469941139221191, "step": 320 }, { "epoch": 0.686080683943361, "grad_norm": 21.048682994153253, "learning_rate": 1.3483938738734195e-07, "logits/chosen": 1.2768375873565674, "logits/rejected": 1.4707895517349243, "logps/chosen": -333.0333251953125, "logps/rejected": -352.16094970703125, "loss": 0.6481, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7810248732566833, "rewards/margins": 0.33725664019584656, "rewards/rejected": -1.118281602859497, "step": 321 }, { "epoch": 0.6882180069462998, "grad_norm": 17.394392159707856, "learning_rate": 1.3318284293173449e-07, "logits/chosen": 1.9551142454147339, "logits/rejected": 1.876376986503601, "logps/chosen": -409.2510986328125, "logps/rejected": -405.74188232421875, "loss": 0.5974, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8536494970321655, "rewards/margins": 0.24086396396160126, "rewards/rejected": -1.0945135354995728, "step": 322 }, { "epoch": 0.6903553299492385, "grad_norm": 18.94853226611994, "learning_rate": 1.3153283438175034e-07, "logits/chosen": 1.839475154876709, "logits/rejected": 1.858643889427185, "logps/chosen": -378.91265869140625, "logps/rejected": -390.3384704589844, "loss": 0.6003, "rewards/accuracies": 0.8125, "rewards/chosen": -0.978223979473114, "rewards/margins": 0.5790024399757385, "rewards/rejected": -1.557226300239563, "step": 323 }, { "epoch": 0.6924926529521774, "grad_norm": 17.111787232424657, "learning_rate": 1.2988945405516565e-07, "logits/chosen": 1.826279640197754, "logits/rejected": 1.8660060167312622, "logps/chosen": -330.77508544921875, "logps/rejected": -358.8695068359375, "loss": 0.5461, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9692739248275757, "rewards/margins": 0.4413262605667114, "rewards/rejected": -1.410600185394287, "step": 324 }, { "epoch": 0.6946299759551162, "grad_norm": 25.561218495275238, "learning_rate": 1.2825279389890818e-07, "logits/chosen": 1.8373939990997314, "logits/rejected": 1.7409933805465698, "logps/chosen": -426.71917724609375, "logps/rejected": -416.2432861328125, "loss": 0.6059, "rewards/accuracies": 0.625, "rewards/chosen": -1.2536462545394897, "rewards/margins": 0.25466710329055786, "rewards/rejected": -1.5083134174346924, "step": 325 }, { "epoch": 0.696767298958055, "grad_norm": 17.8636661186785, "learning_rate": 1.2662294548391328e-07, "logits/chosen": 1.5317473411560059, "logits/rejected": 1.3886245489120483, "logps/chosen": -384.1850280761719, "logps/rejected": -339.3521728515625, "loss": 0.6232, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3272415399551392, "rewards/margins": 0.08453375101089478, "rewards/rejected": -1.4117752313613892, "step": 326 }, { "epoch": 0.6989046219609939, "grad_norm": 17.144689153521533, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 1.782150387763977, "logits/rejected": 1.9099899530410767, "logps/chosen": -380.603271484375, "logps/rejected": -433.623779296875, "loss": 0.5832, "rewards/accuracies": 0.625, "rewards/chosen": -0.9439319372177124, "rewards/margins": 0.28429514169692993, "rewards/rejected": -1.2282270193099976, "step": 327 }, { "epoch": 0.7010419449639327, "grad_norm": 19.552962720871037, "learning_rate": 1.2338404825076935e-07, "logits/chosen": 1.2995223999023438, "logits/rejected": 1.3280295133590698, "logps/chosen": -324.47314453125, "logps/rejected": -334.8476257324219, "loss": 0.6088, "rewards/accuracies": 0.625, "rewards/chosen": -0.9188759326934814, "rewards/margins": 0.35758963227272034, "rewards/rejected": -1.2764655351638794, "step": 328 }, { "epoch": 0.7031792679668715, "grad_norm": 16.96691219753261, "learning_rate": 1.2177518064852348e-07, "logits/chosen": 2.564326524734497, "logits/rejected": 2.5473833084106445, "logps/chosen": -552.3907470703125, "logps/rejected": -558.2307739257812, "loss": 0.565, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9720156192779541, "rewards/margins": 0.3254457116127014, "rewards/rejected": -1.2974613904953003, "step": 329 }, { "epoch": 0.7053165909698104, "grad_norm": 17.13700860515758, "learning_rate": 1.201734872092077e-07, "logits/chosen": 1.8777260780334473, "logits/rejected": 1.888671875, "logps/chosen": -434.1102600097656, "logps/rejected": -476.6391296386719, "loss": 0.6071, "rewards/accuracies": 0.875, "rewards/chosen": -1.1475944519042969, "rewards/margins": 0.6439616084098816, "rewards/rejected": -1.7915560007095337, "step": 330 }, { "epoch": 0.7074539139727491, "grad_norm": 16.56035938775486, "learning_rate": 1.185790575473738e-07, "logits/chosen": 2.2853150367736816, "logits/rejected": 2.4159228801727295, "logps/chosen": -415.8793029785156, "logps/rejected": -461.54559326171875, "loss": 0.5657, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1002264022827148, "rewards/margins": 0.29029062390327454, "rewards/rejected": -1.390516996383667, "step": 331 }, { "epoch": 0.7095912369756879, "grad_norm": 21.13067642035339, "learning_rate": 1.1699198087116588e-07, "logits/chosen": 1.7377692461013794, "logits/rejected": 1.709326982498169, "logps/chosen": -392.8721008300781, "logps/rejected": -427.005859375, "loss": 0.626, "rewards/accuracies": 0.625, "rewards/chosen": -1.1958163976669312, "rewards/margins": 0.2574688196182251, "rewards/rejected": -1.4532852172851562, "step": 332 }, { "epoch": 0.7117285599786267, "grad_norm": 16.821147592051055, "learning_rate": 1.1541234597732947e-07, "logits/chosen": 1.5543380975723267, "logits/rejected": 1.5971977710723877, "logps/chosen": -374.1974792480469, "logps/rejected": -394.3440246582031, "loss": 0.5213, "rewards/accuracies": 0.625, "rewards/chosen": -0.9774885773658752, "rewards/margins": 0.5341455936431885, "rewards/rejected": -1.5116342306137085, "step": 333 }, { "epoch": 0.7138658829815656, "grad_norm": 17.020478258816116, "learning_rate": 1.1384024124624322e-07, "logits/chosen": 1.4832370281219482, "logits/rejected": 1.3855926990509033, "logps/chosen": -402.4433288574219, "logps/rejected": -406.5257568359375, "loss": 0.5865, "rewards/accuracies": 0.625, "rewards/chosen": -1.5169297456741333, "rewards/margins": 0.22409147024154663, "rewards/rejected": -1.7410211563110352, "step": 334 }, { "epoch": 0.7160032059845044, "grad_norm": 17.550708004075997, "learning_rate": 1.1227575463697439e-07, "logits/chosen": 1.2210332155227661, "logits/rejected": 1.3429027795791626, "logps/chosen": -275.6416015625, "logps/rejected": -383.0740966796875, "loss": 0.6042, "rewards/accuracies": 0.75, "rewards/chosen": -1.2633944749832153, "rewards/margins": 0.37785589694976807, "rewards/rejected": -1.6412503719329834, "step": 335 }, { "epoch": 0.7181405289874432, "grad_norm": 18.381082023596203, "learning_rate": 1.1071897368235694e-07, "logits/chosen": 1.761547327041626, "logits/rejected": 1.9874017238616943, "logps/chosen": -475.7955017089844, "logps/rejected": -545.6195678710938, "loss": 0.6306, "rewards/accuracies": 0.6875, "rewards/chosen": -1.518486738204956, "rewards/margins": 0.3139525055885315, "rewards/rejected": -1.8324391841888428, "step": 336 }, { "epoch": 0.7202778519903821, "grad_norm": 18.478233920231197, "learning_rate": 1.0916998548409447e-07, "logits/chosen": 1.4830739498138428, "logits/rejected": 1.531538486480713, "logps/chosen": -257.1949157714844, "logps/rejected": -257.0049133300781, "loss": 0.5829, "rewards/accuracies": 0.75, "rewards/chosen": -0.9545426368713379, "rewards/margins": 0.5574601888656616, "rewards/rejected": -1.512002944946289, "step": 337 }, { "epoch": 0.7224151749933209, "grad_norm": 16.386183638471508, "learning_rate": 1.0762887670788701e-07, "logits/chosen": 1.5718142986297607, "logits/rejected": 1.7238279581069946, "logps/chosen": -427.6659240722656, "logps/rejected": -471.98724365234375, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": -0.8210560083389282, "rewards/margins": 0.35570234060287476, "rewards/rejected": -1.1767584085464478, "step": 338 }, { "epoch": 0.7245524979962596, "grad_norm": 17.362629994095062, "learning_rate": 1.0609573357858165e-07, "logits/chosen": 2.049461841583252, "logits/rejected": 2.0207159519195557, "logps/chosen": -350.08209228515625, "logps/rejected": -366.1018981933594, "loss": 0.5832, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9150630831718445, "rewards/margins": 0.3026912808418274, "rewards/rejected": -1.2177543640136719, "step": 339 }, { "epoch": 0.7266898209991985, "grad_norm": 18.074963794700235, "learning_rate": 1.0457064187534861e-07, "logits/chosen": 1.8654290437698364, "logits/rejected": 1.9139888286590576, "logps/chosen": -436.3177490234375, "logps/rejected": -468.21026611328125, "loss": 0.6101, "rewards/accuracies": 0.75, "rewards/chosen": -1.125885009765625, "rewards/margins": 0.6720912456512451, "rewards/rejected": -1.7979762554168701, "step": 340 }, { "epoch": 0.7288271440021373, "grad_norm": 15.616075952348003, "learning_rate": 1.0305368692688174e-07, "logits/chosen": 1.8928383588790894, "logits/rejected": 1.7172499895095825, "logps/chosen": -320.7323303222656, "logps/rejected": -340.637939453125, "loss": 0.5296, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9158480167388916, "rewards/margins": 0.6651303768157959, "rewards/rejected": -1.5809783935546875, "step": 341 }, { "epoch": 0.7309644670050761, "grad_norm": 16.141766521096457, "learning_rate": 1.0154495360662463e-07, "logits/chosen": 1.8351894617080688, "logits/rejected": 1.88357412815094, "logps/chosen": -417.806884765625, "logps/rejected": -424.0835266113281, "loss": 0.5906, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1564135551452637, "rewards/margins": 0.2745019495487213, "rewards/rejected": -1.4309154748916626, "step": 342 }, { "epoch": 0.733101790008015, "grad_norm": 16.58837646437526, "learning_rate": 1.0004452632802158e-07, "logits/chosen": 2.133150100708008, "logits/rejected": 2.064979076385498, "logps/chosen": -376.74078369140625, "logps/rejected": -378.5306091308594, "loss": 0.5551, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7134183049201965, "rewards/margins": 0.5067132711410522, "rewards/rejected": -1.220131754875183, "step": 343 }, { "epoch": 0.7352391130109538, "grad_norm": 16.39380243963404, "learning_rate": 9.855248903979505e-08, "logits/chosen": 2.0602328777313232, "logits/rejected": 2.0420618057250977, "logps/chosen": -302.2620849609375, "logps/rejected": -306.7290344238281, "loss": 0.5853, "rewards/accuracies": 0.625, "rewards/chosen": -0.4054746925830841, "rewards/margins": 0.44839826226234436, "rewards/rejected": -0.8538729548454285, "step": 344 }, { "epoch": 0.7373764360138926, "grad_norm": 19.731670469441074, "learning_rate": 9.706892522124838e-08, "logits/chosen": 1.8298213481903076, "logits/rejected": 1.9004275798797607, "logps/chosen": -327.8082275390625, "logps/rejected": -365.7606506347656, "loss": 0.6029, "rewards/accuracies": 0.75, "rewards/chosen": -1.0073175430297852, "rewards/margins": 0.22869540750980377, "rewards/rejected": -1.2360128164291382, "step": 345 }, { "epoch": 0.7395137590168315, "grad_norm": 19.011257992667222, "learning_rate": 9.559391787759554e-08, "logits/chosen": 1.3878638744354248, "logits/rejected": 1.568437099456787, "logps/chosen": -372.02978515625, "logps/rejected": -407.9342041015625, "loss": 0.6405, "rewards/accuracies": 0.5, "rewards/chosen": -1.1335147619247437, "rewards/margins": 0.025209851562976837, "rewards/rejected": -1.1587246656417847, "step": 346 }, { "epoch": 0.7416510820197703, "grad_norm": 18.46695592828962, "learning_rate": 9.412754953531663e-08, "logits/chosen": 1.710093379020691, "logits/rejected": 1.6472371816635132, "logps/chosen": -297.39776611328125, "logps/rejected": -267.59954833984375, "loss": 0.5957, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2740660905838013, "rewards/margins": 0.06755752861499786, "rewards/rejected": -1.3416236639022827, "step": 347 }, { "epoch": 0.743788405022709, "grad_norm": 16.87621305823869, "learning_rate": 9.266990223754067e-08, "logits/chosen": 1.8978588581085205, "logits/rejected": 1.8024924993515015, "logps/chosen": -327.6407775878906, "logps/rejected": -362.8518981933594, "loss": 0.625, "rewards/accuracies": 0.625, "rewards/chosen": -1.0301729440689087, "rewards/margins": 0.3730047643184662, "rewards/rejected": -1.4031778573989868, "step": 348 }, { "epoch": 0.7459257280256478, "grad_norm": 19.589160930248937, "learning_rate": 9.12210575394553e-08, "logits/chosen": 1.2508031129837036, "logits/rejected": 1.1444499492645264, "logps/chosen": -450.5746154785156, "logps/rejected": -492.925537109375, "loss": 0.5814, "rewards/accuracies": 0.6875, "rewards/chosen": -1.165111780166626, "rewards/margins": 0.5934293270111084, "rewards/rejected": -1.758541226387024, "step": 349 }, { "epoch": 0.7480630510285867, "grad_norm": 16.899125344870786, "learning_rate": 8.978109650374396e-08, "logits/chosen": 2.270671844482422, "logits/rejected": 2.231257200241089, "logps/chosen": -384.2763977050781, "logps/rejected": -375.3595275878906, "loss": 0.6009, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0201106071472168, "rewards/margins": 0.1503101885318756, "rewards/rejected": -1.1704206466674805, "step": 350 }, { "epoch": 0.7502003740315255, "grad_norm": 16.195586617871964, "learning_rate": 8.835009969605011e-08, "logits/chosen": 1.483668565750122, "logits/rejected": 1.482094645500183, "logps/chosen": -343.3363037109375, "logps/rejected": -386.342041015625, "loss": 0.6039, "rewards/accuracies": 0.5625, "rewards/chosen": -1.092482566833496, "rewards/margins": 0.4346364736557007, "rewards/rejected": -1.5271189212799072, "step": 351 }, { "epoch": 0.7523376970344643, "grad_norm": 21.26793469108042, "learning_rate": 8.692814718046978e-08, "logits/chosen": 1.866996169090271, "logits/rejected": 1.7753551006317139, "logps/chosen": -456.31988525390625, "logps/rejected": -433.0694885253906, "loss": 0.6028, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2143518924713135, "rewards/margins": 0.2675904929637909, "rewards/rejected": -1.4819422960281372, "step": 352 }, { "epoch": 0.7544750200374032, "grad_norm": 18.427744526820025, "learning_rate": 8.551531851507185e-08, "logits/chosen": 1.7221219539642334, "logits/rejected": 1.7266207933425903, "logps/chosen": -395.39874267578125, "logps/rejected": -417.5553894042969, "loss": 0.6136, "rewards/accuracies": 0.8125, "rewards/chosen": -1.024322748184204, "rewards/margins": 0.6090888381004333, "rewards/rejected": -1.6334116458892822, "step": 353 }, { "epoch": 0.756612343040342, "grad_norm": 20.305718374217165, "learning_rate": 8.411169274744723e-08, "logits/chosen": 1.9147306680679321, "logits/rejected": 1.8940410614013672, "logps/chosen": -376.7578430175781, "logps/rejected": -394.53680419921875, "loss": 0.6336, "rewards/accuracies": 0.75, "rewards/chosen": -1.3257228136062622, "rewards/margins": 0.3503354787826538, "rewards/rejected": -1.676058292388916, "step": 354 }, { "epoch": 0.7587496660432808, "grad_norm": 20.54054389713915, "learning_rate": 8.271734841028552e-08, "logits/chosen": 1.9527983665466309, "logits/rejected": 2.0401647090911865, "logps/chosen": -419.7406005859375, "logps/rejected": -460.14605712890625, "loss": 0.6028, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8988450169563293, "rewards/margins": 0.2838803231716156, "rewards/rejected": -1.182725429534912, "step": 355 }, { "epoch": 0.7608869890462197, "grad_norm": 18.611126738741184, "learning_rate": 8.133236351698142e-08, "logits/chosen": 1.9735045433044434, "logits/rejected": 2.096036911010742, "logps/chosen": -400.4068603515625, "logps/rejected": -418.3611755371094, "loss": 0.5567, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8807581663131714, "rewards/margins": 0.4934861361980438, "rewards/rejected": -1.3742443323135376, "step": 356 }, { "epoch": 0.7630243120491584, "grad_norm": 19.445978428810093, "learning_rate": 7.99568155572701e-08, "logits/chosen": 1.6883461475372314, "logits/rejected": 1.5084795951843262, "logps/chosen": -461.36859130859375, "logps/rejected": -436.689453125, "loss": 0.6174, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3846616744995117, "rewards/margins": 0.6382476091384888, "rewards/rejected": -2.022909164428711, "step": 357 }, { "epoch": 0.7651616350520972, "grad_norm": 18.04686433261846, "learning_rate": 7.859078149289144e-08, "logits/chosen": 1.8313043117523193, "logits/rejected": 1.745527744293213, "logps/chosen": -395.2892761230469, "logps/rejected": -412.5917663574219, "loss": 0.5956, "rewards/accuracies": 0.875, "rewards/chosen": -1.17001473903656, "rewards/margins": 0.6119087338447571, "rewards/rejected": -1.7819232940673828, "step": 358 }, { "epoch": 0.7672989580550361, "grad_norm": 21.260482136795545, "learning_rate": 7.723433775328384e-08, "logits/chosen": 1.4848061800003052, "logits/rejected": 1.4556670188903809, "logps/chosen": -301.49517822265625, "logps/rejected": -311.4028015136719, "loss": 0.6213, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8877332806587219, "rewards/margins": 0.2783505916595459, "rewards/rejected": -1.166083812713623, "step": 359 }, { "epoch": 0.7694362810579749, "grad_norm": 20.078171800579664, "learning_rate": 7.588756023130833e-08, "logits/chosen": 1.4349831342697144, "logits/rejected": 1.5936346054077148, "logps/chosen": -403.015380859375, "logps/rejected": -404.7160339355469, "loss": 0.5879, "rewards/accuracies": 0.625, "rewards/chosen": -0.9739541411399841, "rewards/margins": 0.3197196125984192, "rewards/rejected": -1.2936737537384033, "step": 360 }, { "epoch": 0.7715736040609137, "grad_norm": 20.485624067682732, "learning_rate": 7.455052427900213e-08, "logits/chosen": 1.5565801858901978, "logits/rejected": 1.6235904693603516, "logps/chosen": -479.0734558105469, "logps/rejected": -470.5107421875, "loss": 0.6689, "rewards/accuracies": 0.5625, "rewards/chosen": -1.511535406112671, "rewards/margins": 0.019254431128501892, "rewards/rejected": -1.5307896137237549, "step": 361 }, { "epoch": 0.7737109270638525, "grad_norm": 17.04191318961155, "learning_rate": 7.322330470336313e-08, "logits/chosen": 1.7196496725082397, "logits/rejected": 1.6668784618377686, "logps/chosen": -298.79754638671875, "logps/rejected": -327.07879638671875, "loss": 0.6116, "rewards/accuracies": 0.625, "rewards/chosen": -1.298088788986206, "rewards/margins": 0.46125340461730957, "rewards/rejected": -1.7593421936035156, "step": 362 }, { "epoch": 0.7758482500667914, "grad_norm": 22.269060100698063, "learning_rate": 7.190597576216384e-08, "logits/chosen": 1.768049955368042, "logits/rejected": 1.6878827810287476, "logps/chosen": -308.0188903808594, "logps/rejected": -323.5241394042969, "loss": 0.6123, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6722269058227539, "rewards/margins": 0.5267820954322815, "rewards/rejected": -1.1990089416503906, "step": 363 }, { "epoch": 0.7779855730697302, "grad_norm": 19.044197893042973, "learning_rate": 7.059861115979701e-08, "logits/chosen": 1.6526095867156982, "logits/rejected": 1.7360644340515137, "logps/chosen": -311.6914367675781, "logps/rejected": -372.5617980957031, "loss": 0.6656, "rewards/accuracies": 0.75, "rewards/chosen": -0.48367545008659363, "rewards/margins": 0.8194734454154968, "rewards/rejected": -1.303148865699768, "step": 364 }, { "epoch": 0.7801228960726689, "grad_norm": 18.813524672602487, "learning_rate": 6.930128404315214e-08, "logits/chosen": 1.5229746103286743, "logits/rejected": 1.3987520933151245, "logps/chosen": -377.3014221191406, "logps/rejected": -419.2353515625, "loss": 0.6065, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7370649576187134, "rewards/margins": 0.3761173188686371, "rewards/rejected": -2.1131820678710938, "step": 365 }, { "epoch": 0.7822602190756078, "grad_norm": 17.713282448963408, "learning_rate": 6.801406699752229e-08, "logits/chosen": 1.6024835109710693, "logits/rejected": 1.670932412147522, "logps/chosen": -372.3557434082031, "logps/rejected": -386.1446228027344, "loss": 0.5762, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2764437198638916, "rewards/margins": 0.2568923830986023, "rewards/rejected": -1.5333361625671387, "step": 366 }, { "epoch": 0.7843975420785466, "grad_norm": 16.625075332510626, "learning_rate": 6.673703204254347e-08, "logits/chosen": 1.6393647193908691, "logits/rejected": 1.6989482641220093, "logps/chosen": -397.8586120605469, "logps/rejected": -411.6268310546875, "loss": 0.545, "rewards/accuracies": 0.75, "rewards/chosen": -1.292647361755371, "rewards/margins": 0.40703633427619934, "rewards/rejected": -1.6996837854385376, "step": 367 }, { "epoch": 0.7865348650814854, "grad_norm": 19.878935002956247, "learning_rate": 6.547025062816486e-08, "logits/chosen": 1.4603272676467896, "logits/rejected": 1.3453279733657837, "logps/chosen": -299.8364562988281, "logps/rejected": -330.9093017578125, "loss": 0.62, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6287568211555481, "rewards/margins": 0.374176561832428, "rewards/rejected": -1.002933382987976, "step": 368 }, { "epoch": 0.7886721880844243, "grad_norm": 16.758828572680688, "learning_rate": 6.42137936306514e-08, "logits/chosen": 1.7640485763549805, "logits/rejected": 1.8099982738494873, "logps/chosen": -362.0057373046875, "logps/rejected": -391.24224853515625, "loss": 0.5984, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1697113513946533, "rewards/margins": 0.047207579016685486, "rewards/rejected": -1.2169188261032104, "step": 369 }, { "epoch": 0.7908095110873631, "grad_norm": 21.845423639848487, "learning_rate": 6.296773134861824e-08, "logits/chosen": 1.6217718124389648, "logits/rejected": 1.734086513519287, "logps/chosen": -256.1837158203125, "logps/rejected": -272.9673767089844, "loss": 0.5846, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0282855033874512, "rewards/margins": 0.41900819540023804, "rewards/rejected": -1.447293758392334, "step": 370 }, { "epoch": 0.7929468340903019, "grad_norm": 17.92986801930431, "learning_rate": 6.173213349909728e-08, "logits/chosen": 1.8200995922088623, "logits/rejected": 1.7330509424209595, "logps/chosen": -356.558837890625, "logps/rejected": -348.9017028808594, "loss": 0.6052, "rewards/accuracies": 0.625, "rewards/chosen": -0.9982262849807739, "rewards/margins": 0.14351393282413483, "rewards/rejected": -1.1417402029037476, "step": 371 }, { "epoch": 0.7950841570932408, "grad_norm": 17.081174619575314, "learning_rate": 6.050706921363672e-08, "logits/chosen": 1.8869644403457642, "logits/rejected": 1.8416467905044556, "logps/chosen": -354.5775146484375, "logps/rejected": -388.51177978515625, "loss": 0.6067, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0341633558273315, "rewards/margins": 0.3402591943740845, "rewards/rejected": -1.3744226694107056, "step": 372 }, { "epoch": 0.7972214800961795, "grad_norm": 17.662807478755507, "learning_rate": 5.929260703443337e-08, "logits/chosen": 1.3998608589172363, "logits/rejected": 1.4698572158813477, "logps/chosen": -449.04351806640625, "logps/rejected": -449.3763122558594, "loss": 0.585, "rewards/accuracies": 0.625, "rewards/chosen": -1.1849581003189087, "rewards/margins": 0.4725034534931183, "rewards/rejected": -1.6574615240097046, "step": 373 }, { "epoch": 0.7993588030991183, "grad_norm": 17.725555922665713, "learning_rate": 5.808881491049722e-08, "logits/chosen": 1.6900920867919922, "logits/rejected": 1.6598479747772217, "logps/chosen": -412.2428894042969, "logps/rejected": -422.5267333984375, "loss": 0.5971, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0798150300979614, "rewards/margins": 0.28746387362480164, "rewards/rejected": -1.3672789335250854, "step": 374 }, { "epoch": 0.8014961261020572, "grad_norm": 17.16991579530448, "learning_rate": 5.6895760193850145e-08, "logits/chosen": 1.4595896005630493, "logits/rejected": 1.4326881170272827, "logps/chosen": -408.4560852050781, "logps/rejected": -405.8981018066406, "loss": 0.6283, "rewards/accuracies": 0.625, "rewards/chosen": -1.646263599395752, "rewards/margins": 0.3666486144065857, "rewards/rejected": -2.0129122734069824, "step": 375 }, { "epoch": 0.803633449104996, "grad_norm": 17.93410549937266, "learning_rate": 5.571350963575727e-08, "logits/chosen": 1.743166208267212, "logits/rejected": 1.6486046314239502, "logps/chosen": -445.26312255859375, "logps/rejected": -459.120849609375, "loss": 0.6159, "rewards/accuracies": 0.5625, "rewards/chosen": -1.186528205871582, "rewards/margins": 0.33439502120018005, "rewards/rejected": -1.520923376083374, "step": 376 }, { "epoch": 0.8057707721079348, "grad_norm": 16.9845646295775, "learning_rate": 5.454212938299255e-08, "logits/chosen": 1.9025753736495972, "logits/rejected": 1.917330265045166, "logps/chosen": -345.7820129394531, "logps/rejected": -331.1052551269531, "loss": 0.6072, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7959749102592468, "rewards/margins": 0.298651784658432, "rewards/rejected": -1.0946266651153564, "step": 377 }, { "epoch": 0.8079080951108736, "grad_norm": 16.873032439456804, "learning_rate": 5.338168497413756e-08, "logits/chosen": 2.3049681186676025, "logits/rejected": 2.2500970363616943, "logps/chosen": -402.2143249511719, "logps/rejected": -402.95166015625, "loss": 0.537, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9636644124984741, "rewards/margins": 0.5833888053894043, "rewards/rejected": -1.5470532178878784, "step": 378 }, { "epoch": 0.8100454181138125, "grad_norm": 16.325007461853215, "learning_rate": 5.223224133591475e-08, "logits/chosen": 1.597383737564087, "logits/rejected": 1.609183669090271, "logps/chosen": -288.2392578125, "logps/rejected": -337.51092529296875, "loss": 0.5317, "rewards/accuracies": 0.875, "rewards/chosen": -0.7511001825332642, "rewards/margins": 0.693979799747467, "rewards/rejected": -1.445080041885376, "step": 379 }, { "epoch": 0.8121827411167513, "grad_norm": 19.572806409831877, "learning_rate": 5.109386277955477e-08, "logits/chosen": 1.6565470695495605, "logits/rejected": 1.6742223501205444, "logps/chosen": -377.81939697265625, "logps/rejected": -387.5241394042969, "loss": 0.5908, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6612589359283447, "rewards/margins": 0.5410428047180176, "rewards/rejected": -1.2023016214370728, "step": 380 }, { "epoch": 0.81432006411969, "grad_norm": 19.520950615429946, "learning_rate": 4.996661299719845e-08, "logits/chosen": 1.8140640258789062, "logits/rejected": 1.7656865119934082, "logps/chosen": -510.03521728515625, "logps/rejected": -507.64166259765625, "loss": 0.5909, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2178007364273071, "rewards/margins": 0.18270380795001984, "rewards/rejected": -1.4005045890808105, "step": 381 }, { "epoch": 0.8164573871226289, "grad_norm": 16.127377849885388, "learning_rate": 4.885055505833291e-08, "logits/chosen": 1.644123911857605, "logits/rejected": 1.6220022439956665, "logps/chosen": -386.73663330078125, "logps/rejected": -393.9167175292969, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": -1.1794971227645874, "rewards/margins": 0.3469049632549286, "rewards/rejected": -1.5264021158218384, "step": 382 }, { "epoch": 0.8185947101255677, "grad_norm": 17.73846719380124, "learning_rate": 4.774575140626316e-08, "logits/chosen": 1.3664933443069458, "logits/rejected": 1.4751458168029785, "logps/chosen": -344.3662109375, "logps/rejected": -390.3739013671875, "loss": 0.5697, "rewards/accuracies": 0.625, "rewards/chosen": -0.9151713848114014, "rewards/margins": 0.487021267414093, "rewards/rejected": -1.40219247341156, "step": 383 }, { "epoch": 0.8207320331285065, "grad_norm": 17.072732271001154, "learning_rate": 4.6652263854618016e-08, "logits/chosen": 1.934602975845337, "logits/rejected": 2.0105841159820557, "logps/chosen": -426.7675476074219, "logps/rejected": -451.0812683105469, "loss": 0.553, "rewards/accuracies": 0.75, "rewards/chosen": -1.2221730947494507, "rewards/margins": 0.48440563678741455, "rewards/rejected": -1.7065787315368652, "step": 384 }, { "epoch": 0.8228693561314454, "grad_norm": 21.293620391850812, "learning_rate": 4.557015358389216e-08, "logits/chosen": 2.1146156787872314, "logits/rejected": 2.1517491340637207, "logps/chosen": -464.082763671875, "logps/rejected": -470.5016784667969, "loss": 0.6409, "rewards/accuracies": 0.625, "rewards/chosen": -1.3467998504638672, "rewards/margins": 0.40472251176834106, "rewards/rejected": -1.751522421836853, "step": 385 }, { "epoch": 0.8250066791343842, "grad_norm": 17.833652532065038, "learning_rate": 4.449948113802254e-08, "logits/chosen": 1.807660698890686, "logits/rejected": 1.7980822324752808, "logps/chosen": -438.4178771972656, "logps/rejected": -453.2962951660156, "loss": 0.6096, "rewards/accuracies": 0.6875, "rewards/chosen": -1.104383111000061, "rewards/margins": 0.7033094167709351, "rewards/rejected": -1.807692527770996, "step": 386 }, { "epoch": 0.827144002137323, "grad_norm": 19.571597961934124, "learning_rate": 4.3440306421001324e-08, "logits/chosen": 1.7510319948196411, "logits/rejected": 1.6834566593170166, "logps/chosen": -484.4024658203125, "logps/rejected": -485.3985900878906, "loss": 0.6339, "rewards/accuracies": 0.625, "rewards/chosen": -1.0107614994049072, "rewards/margins": 0.25170525908470154, "rewards/rejected": -1.2624667882919312, "step": 387 }, { "epoch": 0.8292813251402619, "grad_norm": 17.82974603873972, "learning_rate": 4.2392688693524055e-08, "logits/chosen": 1.404268741607666, "logits/rejected": 1.3694736957550049, "logps/chosen": -408.179443359375, "logps/rejected": -406.07086181640625, "loss": 0.5722, "rewards/accuracies": 0.625, "rewards/chosen": -1.0687264204025269, "rewards/margins": 0.3735535144805908, "rewards/rejected": -1.4422800540924072, "step": 388 }, { "epoch": 0.8314186481432007, "grad_norm": 17.438473933793833, "learning_rate": 4.1356686569674335e-08, "logits/chosen": 1.3533443212509155, "logits/rejected": 1.4366700649261475, "logps/chosen": -380.77960205078125, "logps/rejected": -405.69097900390625, "loss": 0.6209, "rewards/accuracies": 0.625, "rewards/chosen": -0.9108083844184875, "rewards/margins": 0.4199364483356476, "rewards/rejected": -1.330744981765747, "step": 389 }, { "epoch": 0.8335559711461394, "grad_norm": 20.355804196982337, "learning_rate": 4.0332358013644015e-08, "logits/chosen": 1.4978055953979492, "logits/rejected": 1.5026313066482544, "logps/chosen": -390.0057373046875, "logps/rejected": -386.77313232421875, "loss": 0.5277, "rewards/accuracies": 0.875, "rewards/chosen": -0.7286229133605957, "rewards/margins": 0.4630648195743561, "rewards/rejected": -1.191687822341919, "step": 390 }, { "epoch": 0.8356932941490782, "grad_norm": 18.45237542669808, "learning_rate": 3.9319760336490205e-08, "logits/chosen": 1.5355026721954346, "logits/rejected": 1.543558120727539, "logps/chosen": -339.8648986816406, "logps/rejected": -330.8528747558594, "loss": 0.5491, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5863010883331299, "rewards/margins": 0.04436583071947098, "rewards/rejected": -1.630666971206665, "step": 391 }, { "epoch": 0.8378306171520171, "grad_norm": 16.374379157073367, "learning_rate": 3.831895019292897e-08, "logits/chosen": 1.8390171527862549, "logits/rejected": 1.98207426071167, "logps/chosen": -447.08380126953125, "logps/rejected": -459.7073974609375, "loss": 0.5677, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8593235015869141, "rewards/margins": 0.2716907858848572, "rewards/rejected": -1.131014347076416, "step": 392 }, { "epoch": 0.8399679401549559, "grad_norm": 19.305861548140808, "learning_rate": 3.732998357816514e-08, "logits/chosen": 1.5250623226165771, "logits/rejected": 1.6143085956573486, "logps/chosen": -306.3028564453125, "logps/rejected": -309.7184143066406, "loss": 0.6167, "rewards/accuracies": 0.5, "rewards/chosen": -1.2037074565887451, "rewards/margins": 0.11004292964935303, "rewards/rejected": -1.3137503862380981, "step": 393 }, { "epoch": 0.8421052631578947, "grad_norm": 20.03718277534217, "learning_rate": 3.635291582475963e-08, "logits/chosen": 1.9921854734420776, "logits/rejected": 2.0196597576141357, "logps/chosen": -310.9258728027344, "logps/rejected": -345.7138366699219, "loss": 0.6297, "rewards/accuracies": 0.6875, "rewards/chosen": -1.169144868850708, "rewards/margins": 0.5138453245162964, "rewards/rejected": -1.682990312576294, "step": 394 }, { "epoch": 0.8442425861608336, "grad_norm": 28.142649519687087, "learning_rate": 3.538780159953347e-08, "logits/chosen": 0.4998638331890106, "logits/rejected": 0.8166034817695618, "logps/chosen": -302.5617980957031, "logps/rejected": -285.3108825683594, "loss": 0.6362, "rewards/accuracies": 0.6875, "rewards/chosen": -1.474445104598999, "rewards/margins": 0.5089064240455627, "rewards/rejected": -1.9833515882492065, "step": 395 }, { "epoch": 0.8463799091637724, "grad_norm": 16.81382076431638, "learning_rate": 3.4434694900509345e-08, "logits/chosen": 2.299107789993286, "logits/rejected": 2.3635454177856445, "logps/chosen": -444.2193603515625, "logps/rejected": -458.1888732910156, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": -0.7250373363494873, "rewards/margins": 0.566501259803772, "rewards/rejected": -1.2915387153625488, "step": 396 }, { "epoch": 0.8485172321667112, "grad_norm": 19.392060798939607, "learning_rate": 3.349364905389032e-08, "logits/chosen": 1.8515828847885132, "logits/rejected": 1.804177165031433, "logps/chosen": -409.74365234375, "logps/rejected": -418.00323486328125, "loss": 0.63, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2581006288528442, "rewards/margins": 0.11751668900251389, "rewards/rejected": -1.375617265701294, "step": 397 }, { "epoch": 0.85065455516965, "grad_norm": 17.103729406158553, "learning_rate": 3.256471671107616e-08, "logits/chosen": 1.6766064167022705, "logits/rejected": 1.737638235092163, "logps/chosen": -428.44775390625, "logps/rejected": -407.27215576171875, "loss": 0.5907, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2123147249221802, "rewards/margins": 0.46986332535743713, "rewards/rejected": -1.6821781396865845, "step": 398 }, { "epoch": 0.8527918781725888, "grad_norm": 19.577154726036273, "learning_rate": 3.1647949845717585e-08, "logits/chosen": 1.630979299545288, "logits/rejected": 1.6724216938018799, "logps/chosen": -348.2309265136719, "logps/rejected": -371.38079833984375, "loss": 0.5965, "rewards/accuracies": 0.6875, "rewards/chosen": -1.337809681892395, "rewards/margins": 0.36785420775413513, "rewards/rejected": -1.7056639194488525, "step": 399 }, { "epoch": 0.8549292011755276, "grad_norm": 18.826846500494238, "learning_rate": 3.074339975080836e-08, "logits/chosen": 1.4629188776016235, "logits/rejected": 1.4965126514434814, "logps/chosen": -372.1953125, "logps/rejected": -401.7340087890625, "loss": 0.62, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5468955039978027, "rewards/margins": 0.15613311529159546, "rewards/rejected": -1.703028678894043, "step": 400 }, { "epoch": 0.8549292011755276, "eval_logits/chosen": 1.6708999872207642, "eval_logits/rejected": 1.6820895671844482, "eval_logps/chosen": -403.0909729003906, "eval_logps/rejected": -433.6640930175781, "eval_loss": 0.61039799451828, "eval_rewards/accuracies": 0.6975806355476379, "eval_rewards/chosen": -1.0658750534057617, "eval_rewards/margins": 0.3874683082103729, "eval_rewards/rejected": -1.4533432722091675, "eval_runtime": 88.4828, "eval_samples_per_second": 22.162, "eval_steps_per_second": 0.701, "step": 400 }, { "epoch": 0.8570665241784665, "grad_norm": 19.7974029204823, "learning_rate": 2.98511170358155e-08, "logits/chosen": 1.7001169919967651, "logits/rejected": 1.532285213470459, "logps/chosen": -407.736083984375, "logps/rejected": -362.34912109375, "loss": 0.6207, "rewards/accuracies": 0.625, "rewards/chosen": -1.1236542463302612, "rewards/margins": 0.419066846370697, "rewards/rejected": -1.542720913887024, "step": 401 }, { "epoch": 0.8592038471814053, "grad_norm": 21.14677537275842, "learning_rate": 2.8971151623847584e-08, "logits/chosen": 1.7446285486221313, "logits/rejected": 1.785717487335205, "logps/chosen": -397.1467590332031, "logps/rejected": -435.0672302246094, "loss": 0.6311, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2843698263168335, "rewards/margins": 0.144765704870224, "rewards/rejected": -1.4291354417800903, "step": 402 }, { "epoch": 0.8613411701843441, "grad_norm": 18.56274549083658, "learning_rate": 2.8103552748861475e-08, "logits/chosen": 1.6394376754760742, "logits/rejected": 1.714260220527649, "logps/chosen": -428.9457092285156, "logps/rejected": -443.63946533203125, "loss": 0.6151, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0276083946228027, "rewards/margins": 0.4798241853713989, "rewards/rejected": -1.5074325799942017, "step": 403 }, { "epoch": 0.863478493187283, "grad_norm": 18.223738666005126, "learning_rate": 2.724836895290805e-08, "logits/chosen": 2.072843074798584, "logits/rejected": 2.2029428482055664, "logps/chosen": -372.4205017089844, "logps/rejected": -394.25372314453125, "loss": 0.5865, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1573227643966675, "rewards/margins": 0.34818750619888306, "rewards/rejected": -1.5055103302001953, "step": 404 }, { "epoch": 0.8656158161902218, "grad_norm": 16.462464022149383, "learning_rate": 2.6405648083415833e-08, "logits/chosen": 1.7049862146377563, "logits/rejected": 1.6428170204162598, "logps/chosen": -384.18572998046875, "logps/rejected": -400.8232421875, "loss": 0.5885, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0246027708053589, "rewards/margins": 0.5153809189796448, "rewards/rejected": -1.5399837493896484, "step": 405 }, { "epoch": 0.8677531391931605, "grad_norm": 17.711142089994855, "learning_rate": 2.55754372905142e-08, "logits/chosen": 1.3477802276611328, "logits/rejected": 1.3108845949172974, "logps/chosen": -384.1131591796875, "logps/rejected": -423.6861267089844, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": -1.3002797365188599, "rewards/margins": 0.2088843584060669, "rewards/rejected": -1.5091642141342163, "step": 406 }, { "epoch": 0.8698904621960993, "grad_norm": 18.762054039394492, "learning_rate": 2.475778302439524e-08, "logits/chosen": 1.4159724712371826, "logits/rejected": 1.4871361255645752, "logps/chosen": -470.30670166015625, "logps/rejected": -511.0997314453125, "loss": 0.6385, "rewards/accuracies": 0.5, "rewards/chosen": -1.4217987060546875, "rewards/margins": 0.12027350813150406, "rewards/rejected": -1.5420721769332886, "step": 407 }, { "epoch": 0.8720277851990382, "grad_norm": 18.123731183123528, "learning_rate": 2.3952731032714973e-08, "logits/chosen": 2.404209613800049, "logits/rejected": 2.4266862869262695, "logps/chosen": -427.8636169433594, "logps/rejected": -422.4261474609375, "loss": 0.6259, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2890108823776245, "rewards/margins": 0.378464937210083, "rewards/rejected": -1.667475700378418, "step": 408 }, { "epoch": 0.874165108201977, "grad_norm": 16.47161779285551, "learning_rate": 2.3160326358033778e-08, "logits/chosen": 1.366338849067688, "logits/rejected": 1.3981362581253052, "logps/chosen": -380.618408203125, "logps/rejected": -390.28057861328125, "loss": 0.5722, "rewards/accuracies": 0.75, "rewards/chosen": -0.7875087261199951, "rewards/margins": 0.5520244836807251, "rewards/rejected": -1.3395333290100098, "step": 409 }, { "epoch": 0.8763024312049158, "grad_norm": 17.122362330907116, "learning_rate": 2.2380613335296033e-08, "logits/chosen": 1.8973968029022217, "logits/rejected": 1.8657612800598145, "logps/chosen": -510.1373291015625, "logps/rejected": -504.5380859375, "loss": 0.5772, "rewards/accuracies": 0.75, "rewards/chosen": -1.1060776710510254, "rewards/margins": 0.7315502762794495, "rewards/rejected": -1.8376280069351196, "step": 410 }, { "epoch": 0.8784397542078547, "grad_norm": 18.921033730633166, "learning_rate": 2.1613635589349756e-08, "logits/chosen": 1.4683369398117065, "logits/rejected": 1.5493369102478027, "logps/chosen": -384.0714111328125, "logps/rejected": -391.6690673828125, "loss": 0.6087, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4163315296173096, "rewards/margins": 0.286923348903656, "rewards/rejected": -1.7032545804977417, "step": 411 }, { "epoch": 0.8805770772107935, "grad_norm": 18.56812402329405, "learning_rate": 2.085943603250595e-08, "logits/chosen": 2.2680721282958984, "logits/rejected": 2.329545259475708, "logps/chosen": -436.45550537109375, "logps/rejected": -465.65167236328125, "loss": 0.6226, "rewards/accuracies": 0.6875, "rewards/chosen": -1.101943016052246, "rewards/margins": 0.39092782139778137, "rewards/rejected": -1.4928709268569946, "step": 412 }, { "epoch": 0.8827144002137323, "grad_norm": 17.23421202374209, "learning_rate": 2.0118056862137354e-08, "logits/chosen": 1.721946120262146, "logits/rejected": 1.654489517211914, "logps/chosen": -283.10394287109375, "logps/rejected": -316.85931396484375, "loss": 0.5663, "rewards/accuracies": 0.875, "rewards/chosen": -1.0829007625579834, "rewards/margins": 0.6879382729530334, "rewards/rejected": -1.7708390951156616, "step": 413 }, { "epoch": 0.8848517232166712, "grad_norm": 17.24040512239067, "learning_rate": 1.938953955831771e-08, "logits/chosen": 1.8397215604782104, "logits/rejected": 1.8681014776229858, "logps/chosen": -348.84930419921875, "logps/rejected": -354.12872314453125, "loss": 0.552, "rewards/accuracies": 0.875, "rewards/chosen": -1.1776130199432373, "rewards/margins": 0.4248766303062439, "rewards/rejected": -1.602489948272705, "step": 414 }, { "epoch": 0.88698904621961, "grad_norm": 18.359788171946988, "learning_rate": 1.8673924881500823e-08, "logits/chosen": 1.946815848350525, "logits/rejected": 1.872100591659546, "logps/chosen": -431.5869140625, "logps/rejected": -441.61566162109375, "loss": 0.6415, "rewards/accuracies": 0.4375, "rewards/chosen": -1.366295576095581, "rewards/margins": 0.09268444776535034, "rewards/rejected": -1.4589800834655762, "step": 415 }, { "epoch": 0.8891263692225487, "grad_norm": 19.161300804849038, "learning_rate": 1.797125287024029e-08, "logits/chosen": 1.4459537267684937, "logits/rejected": 1.4782161712646484, "logps/chosen": -321.7972412109375, "logps/rejected": -357.9532775878906, "loss": 0.5477, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3282475471496582, "rewards/margins": 0.6495993733406067, "rewards/rejected": -1.9778469800949097, "step": 416 }, { "epoch": 0.8912636922254876, "grad_norm": 18.975245958523743, "learning_rate": 1.7281562838948966e-08, "logits/chosen": 1.9562761783599854, "logits/rejected": 1.9078611135482788, "logps/chosen": -411.2685241699219, "logps/rejected": -417.2503967285156, "loss": 0.6413, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3274304866790771, "rewards/margins": 0.19440825283527374, "rewards/rejected": -1.5218387842178345, "step": 417 }, { "epoch": 0.8934010152284264, "grad_norm": 17.631994700003954, "learning_rate": 1.6604893375699592e-08, "logits/chosen": 1.4374586343765259, "logits/rejected": 1.4749794006347656, "logps/chosen": -467.3548583984375, "logps/rejected": -471.19305419921875, "loss": 0.5571, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3381353616714478, "rewards/margins": 0.5423804521560669, "rewards/rejected": -1.8805158138275146, "step": 418 }, { "epoch": 0.8955383382313652, "grad_norm": 18.607931434550764, "learning_rate": 1.5941282340065697e-08, "logits/chosen": 1.6795357465744019, "logits/rejected": 1.4486351013183594, "logps/chosen": -454.1962890625, "logps/rejected": -454.06036376953125, "loss": 0.5847, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1021569967269897, "rewards/margins": 0.6963544487953186, "rewards/rejected": -1.7985113859176636, "step": 419 }, { "epoch": 0.897675661234304, "grad_norm": 18.496258966475576, "learning_rate": 1.5290766861003475e-08, "logits/chosen": 1.4845163822174072, "logits/rejected": 1.4758884906768799, "logps/chosen": -339.92315673828125, "logps/rejected": -377.06219482421875, "loss": 0.6292, "rewards/accuracies": 0.75, "rewards/chosen": -1.4483891725540161, "rewards/margins": 0.5280267000198364, "rewards/rejected": -1.9764158725738525, "step": 420 }, { "epoch": 0.8998129842372429, "grad_norm": 18.43461695598332, "learning_rate": 1.4653383334774228e-08, "logits/chosen": 1.790908694267273, "logits/rejected": 1.859453797340393, "logps/chosen": -349.16796875, "logps/rejected": -384.42059326171875, "loss": 0.5825, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3110591173171997, "rewards/margins": 0.5185103416442871, "rewards/rejected": -1.8295693397521973, "step": 421 }, { "epoch": 0.9019503072401817, "grad_norm": 17.81347366573304, "learning_rate": 1.4029167422908105e-08, "logits/chosen": 2.2679755687713623, "logits/rejected": 2.147033214569092, "logps/chosen": -546.00732421875, "logps/rejected": -525.1266479492188, "loss": 0.5897, "rewards/accuracies": 0.5, "rewards/chosen": -1.2079718112945557, "rewards/margins": 0.28895166516304016, "rewards/rejected": -1.4969233274459839, "step": 422 }, { "epoch": 0.9040876302431204, "grad_norm": 18.982282659913977, "learning_rate": 1.3418154050208936e-08, "logits/chosen": 1.8013602495193481, "logits/rejected": 1.6778424978256226, "logps/chosen": -425.6045227050781, "logps/rejected": -390.84942626953125, "loss": 0.5949, "rewards/accuracies": 0.75, "rewards/chosen": -0.9693441390991211, "rewards/margins": 0.3830145001411438, "rewards/rejected": -1.3523586988449097, "step": 423 }, { "epoch": 0.9062249532460593, "grad_norm": 17.345552565300007, "learning_rate": 1.2820377402800064e-08, "logits/chosen": 1.6084281206130981, "logits/rejected": 1.5029420852661133, "logps/chosen": -253.730712890625, "logps/rejected": -260.1260681152344, "loss": 0.6188, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9357234835624695, "rewards/margins": 0.2174171507358551, "rewards/rejected": -1.1531407833099365, "step": 424 }, { "epoch": 0.9083622762489981, "grad_norm": 16.934150955880476, "learning_rate": 1.2235870926211616e-08, "logits/chosen": 2.1916890144348145, "logits/rejected": 2.2400388717651367, "logps/chosen": -285.0917663574219, "logps/rejected": -322.8558349609375, "loss": 0.5861, "rewards/accuracies": 0.6875, "rewards/chosen": -1.114344835281372, "rewards/margins": 0.6048067808151245, "rewards/rejected": -1.7191518545150757, "step": 425 }, { "epoch": 0.9104995992519369, "grad_norm": 16.78887791389279, "learning_rate": 1.1664667323509347e-08, "logits/chosen": 1.4608023166656494, "logits/rejected": 1.4370110034942627, "logps/chosen": -377.3074645996094, "logps/rejected": -393.9685363769531, "loss": 0.5724, "rewards/accuracies": 0.8125, "rewards/chosen": -1.325226902961731, "rewards/margins": 0.3096981942653656, "rewards/rejected": -1.634925127029419, "step": 426 }, { "epoch": 0.9126369222548758, "grad_norm": 16.418296977579498, "learning_rate": 1.1106798553464802e-08, "logits/chosen": 2.0858511924743652, "logits/rejected": 2.105901002883911, "logps/chosen": -367.344970703125, "logps/rejected": -403.23944091796875, "loss": 0.5597, "rewards/accuracies": 0.75, "rewards/chosen": -0.9577431678771973, "rewards/margins": 0.36515700817108154, "rewards/rejected": -1.3229001760482788, "step": 427 }, { "epoch": 0.9147742452578146, "grad_norm": 19.245837329047824, "learning_rate": 1.0562295828767387e-08, "logits/chosen": 1.3938257694244385, "logits/rejected": 1.4853570461273193, "logps/chosen": -355.104248046875, "logps/rejected": -412.22222900390625, "loss": 0.5855, "rewards/accuracies": 0.625, "rewards/chosen": -1.0600641965866089, "rewards/margins": 0.5212962627410889, "rewards/rejected": -1.5813604593276978, "step": 428 }, { "epoch": 0.9169115682607534, "grad_norm": 17.51716706952601, "learning_rate": 1.0031189614277763e-08, "logits/chosen": 1.5417289733886719, "logits/rejected": 1.5729870796203613, "logps/chosen": -321.3535461425781, "logps/rejected": -309.21234130859375, "loss": 0.5932, "rewards/accuracies": 0.5625, "rewards/chosen": -1.24485182762146, "rewards/margins": 0.23819807171821594, "rewards/rejected": -1.483049750328064, "step": 429 }, { "epoch": 0.9190488912636923, "grad_norm": 15.052027680122107, "learning_rate": 9.513509625323518e-09, "logits/chosen": 1.9989356994628906, "logits/rejected": 2.145040273666382, "logps/chosen": -402.7864685058594, "logps/rejected": -425.85296630859375, "loss": 0.5434, "rewards/accuracies": 0.75, "rewards/chosen": -1.2284355163574219, "rewards/margins": 0.4855433702468872, "rewards/rejected": -1.713978886604309, "step": 430 }, { "epoch": 0.921186214266631, "grad_norm": 19.85542406922844, "learning_rate": 9.009284826036689e-09, "logits/chosen": 1.4120731353759766, "logits/rejected": 1.4285330772399902, "logps/chosen": -397.44110107421875, "logps/rejected": -441.92449951171875, "loss": 0.5901, "rewards/accuracies": 0.8125, "rewards/chosen": -1.050657868385315, "rewards/margins": 0.8108992576599121, "rewards/rejected": -1.8615570068359375, "step": 431 }, { "epoch": 0.9233235372695698, "grad_norm": 19.008344723565617, "learning_rate": 8.518543427732949e-09, "logits/chosen": 1.5640474557876587, "logits/rejected": 1.5279781818389893, "logps/chosen": -368.58758544921875, "logps/rejected": -412.3382568359375, "loss": 0.6518, "rewards/accuracies": 0.875, "rewards/chosen": -1.3109221458435059, "rewards/margins": 0.4531381130218506, "rewards/rejected": -1.7640602588653564, "step": 432 }, { "epoch": 0.9254608602725087, "grad_norm": 17.725519985685008, "learning_rate": 8.041312887333396e-09, "logits/chosen": 1.6748428344726562, "logits/rejected": 1.7114133834838867, "logps/chosen": -323.8331298828125, "logps/rejected": -365.55615234375, "loss": 0.6122, "rewards/accuracies": 0.75, "rewards/chosen": -1.0393937826156616, "rewards/margins": 0.4052044153213501, "rewards/rejected": -1.4445981979370117, "step": 433 }, { "epoch": 0.9275981832754475, "grad_norm": 17.42277444922853, "learning_rate": 7.577619905828281e-09, "logits/chosen": 1.5716729164123535, "logits/rejected": 1.6077600717544556, "logps/chosen": -355.86785888671875, "logps/rejected": -387.0648498535156, "loss": 0.5723, "rewards/accuracies": 0.875, "rewards/chosen": -1.3303015232086182, "rewards/margins": 0.4627699553966522, "rewards/rejected": -1.7930715084075928, "step": 434 }, { "epoch": 0.9297355062783863, "grad_norm": 17.161022534670806, "learning_rate": 7.127490426783123e-09, "logits/chosen": 1.7443987131118774, "logits/rejected": 1.7588456869125366, "logps/chosen": -477.2964172363281, "logps/rejected": -463.51165771484375, "loss": 0.6176, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5801703929901123, "rewards/margins": 0.22587111592292786, "rewards/rejected": -1.8060415983200073, "step": 435 }, { "epoch": 0.9318728292813251, "grad_norm": 23.430770890245924, "learning_rate": 6.6909496348871445e-09, "logits/chosen": 1.8086642026901245, "logits/rejected": 1.8920540809631348, "logps/chosen": -347.9205322265625, "logps/rejected": -364.6381530761719, "loss": 0.6435, "rewards/accuracies": 0.875, "rewards/chosen": -0.6429501175880432, "rewards/margins": 0.5737862586975098, "rewards/rejected": -1.2167364358901978, "step": 436 }, { "epoch": 0.934010152284264, "grad_norm": 20.41363056479245, "learning_rate": 6.268021954544095e-09, "logits/chosen": 1.019057273864746, "logits/rejected": 0.9966921806335449, "logps/chosen": -283.1648864746094, "logps/rejected": -311.0303039550781, "loss": 0.651, "rewards/accuracies": 0.75, "rewards/chosen": -0.9427959322929382, "rewards/margins": 0.709195077419281, "rewards/rejected": -1.6519910097122192, "step": 437 }, { "epoch": 0.9361474752872028, "grad_norm": 16.18819460648659, "learning_rate": 5.858731048505927e-09, "logits/chosen": 1.7406284809112549, "logits/rejected": 1.8502442836761475, "logps/chosen": -348.90777587890625, "logps/rejected": -371.7511291503906, "loss": 0.5833, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2307209968566895, "rewards/margins": 0.5154100656509399, "rewards/rejected": -1.7461309432983398, "step": 438 }, { "epoch": 0.9382847982901416, "grad_norm": 19.014687420050503, "learning_rate": 5.463099816548577e-09, "logits/chosen": 1.6393160820007324, "logits/rejected": 1.7532588243484497, "logps/chosen": -459.3277282714844, "logps/rejected": -491.35870361328125, "loss": 0.555, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3520638942718506, "rewards/margins": 0.6467914581298828, "rewards/rejected": -1.9988553524017334, "step": 439 }, { "epoch": 0.9404221212930804, "grad_norm": 21.2096647419158, "learning_rate": 5.08115039419113e-09, "logits/chosen": 1.6956130266189575, "logits/rejected": 1.79883873462677, "logps/chosen": -414.0579528808594, "logps/rejected": -428.4068298339844, "loss": 0.5805, "rewards/accuracies": 0.875, "rewards/chosen": -1.1613951921463013, "rewards/margins": 0.7041161060333252, "rewards/rejected": -1.865511178970337, "step": 440 }, { "epoch": 0.9425594442960192, "grad_norm": 17.17027730259822, "learning_rate": 4.712904151456864e-09, "logits/chosen": 1.5854616165161133, "logits/rejected": 1.6955193281173706, "logps/chosen": -409.63006591796875, "logps/rejected": -437.7228088378906, "loss": 0.5695, "rewards/accuracies": 0.75, "rewards/chosen": -0.9472392797470093, "rewards/margins": 0.22789564728736877, "rewards/rejected": -1.1751350164413452, "step": 441 }, { "epoch": 0.944696767298958, "grad_norm": 20.349000792886113, "learning_rate": 4.358381691677931e-09, "logits/chosen": 1.95444917678833, "logits/rejected": 2.0044314861297607, "logps/chosen": -442.7948303222656, "logps/rejected": -469.05487060546875, "loss": 0.6312, "rewards/accuracies": 0.5, "rewards/chosen": -1.7205852270126343, "rewards/margins": 0.20369315147399902, "rewards/rejected": -1.9242782592773438, "step": 442 }, { "epoch": 0.9468340903018969, "grad_norm": 19.194373432794922, "learning_rate": 4.0176028503425826e-09, "logits/chosen": 0.7825450897216797, "logits/rejected": 0.7275460362434387, "logps/chosen": -357.76983642578125, "logps/rejected": -377.4739074707031, "loss": 0.6453, "rewards/accuracies": 0.5, "rewards/chosen": -1.2489838600158691, "rewards/margins": -0.00024299323558807373, "rewards/rejected": -1.2487409114837646, "step": 443 }, { "epoch": 0.9489714133048357, "grad_norm": 18.315036023173974, "learning_rate": 3.6905866939851983e-09, "logits/chosen": 1.3465955257415771, "logits/rejected": 1.50806725025177, "logps/chosen": -312.5934753417969, "logps/rejected": -344.8988952636719, "loss": 0.5825, "rewards/accuracies": 0.625, "rewards/chosen": -0.9831118583679199, "rewards/margins": 0.4171569347381592, "rewards/rejected": -1.4002689123153687, "step": 444 }, { "epoch": 0.9511087363077745, "grad_norm": 20.014785567703672, "learning_rate": 3.3773515191196646e-09, "logits/chosen": 1.5976628065109253, "logits/rejected": 1.6187068223953247, "logps/chosen": -359.6556091308594, "logps/rejected": -336.0928649902344, "loss": 0.6265, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2695033550262451, "rewards/margins": 0.45331743359565735, "rewards/rejected": -1.72282075881958, "step": 445 }, { "epoch": 0.9532460593107134, "grad_norm": 17.72727377899664, "learning_rate": 3.077914851215585e-09, "logits/chosen": 1.5538979768753052, "logits/rejected": 1.7278094291687012, "logps/chosen": -372.9303283691406, "logps/rejected": -364.25457763671875, "loss": 0.5963, "rewards/accuracies": 0.75, "rewards/chosen": -1.1903377771377563, "rewards/margins": 0.4384117126464844, "rewards/rejected": -1.6287494897842407, "step": 446 }, { "epoch": 0.9553833823136522, "grad_norm": 16.23642380668402, "learning_rate": 2.7922934437178692e-09, "logits/chosen": 1.7799421548843384, "logits/rejected": 1.7960634231567383, "logps/chosen": -444.5595397949219, "logps/rejected": -479.6990966796875, "loss": 0.543, "rewards/accuracies": 0.875, "rewards/chosen": -1.126379132270813, "rewards/margins": 0.5162345170974731, "rewards/rejected": -1.6426136493682861, "step": 447 }, { "epoch": 0.957520705316591, "grad_norm": 19.101807236171627, "learning_rate": 2.5205032771092592e-09, "logits/chosen": 1.3535457849502563, "logits/rejected": 1.2006926536560059, "logps/chosen": -330.3343505859375, "logps/rejected": -374.99310302734375, "loss": 0.5951, "rewards/accuracies": 0.625, "rewards/chosen": -1.1633282899856567, "rewards/margins": 0.4544922709465027, "rewards/rejected": -1.6178205013275146, "step": 448 }, { "epoch": 0.9596580283195298, "grad_norm": 16.56693609193232, "learning_rate": 2.2625595580163247e-09, "logits/chosen": 2.0447781085968018, "logits/rejected": 1.8950867652893066, "logps/chosen": -429.4393005371094, "logps/rejected": -414.31817626953125, "loss": 0.5752, "rewards/accuracies": 0.875, "rewards/chosen": -1.2402998208999634, "rewards/margins": 0.6825169324874878, "rewards/rejected": -1.9228168725967407, "step": 449 }, { "epoch": 0.9617953513224686, "grad_norm": 16.980928002541575, "learning_rate": 2.0184767183584474e-09, "logits/chosen": 1.4854786396026611, "logits/rejected": 1.4846787452697754, "logps/chosen": -308.0284118652344, "logps/rejected": -305.34515380859375, "loss": 0.6133, "rewards/accuracies": 0.625, "rewards/chosen": -1.0690131187438965, "rewards/margins": 0.16039828956127167, "rewards/rejected": -1.2294113636016846, "step": 450 }, { "epoch": 0.9639326743254074, "grad_norm": 18.056225979920594, "learning_rate": 1.7882684145406612e-09, "logits/chosen": 2.019209146499634, "logits/rejected": 2.1171696186065674, "logps/chosen": -350.12774658203125, "logps/rejected": -378.6883850097656, "loss": 0.5985, "rewards/accuracies": 0.625, "rewards/chosen": -1.2527142763137817, "rewards/margins": 0.3824135661125183, "rewards/rejected": -1.6351279020309448, "step": 451 }, { "epoch": 0.9660699973283462, "grad_norm": 17.543262127192886, "learning_rate": 1.5719475266893489e-09, "logits/chosen": 1.4318145513534546, "logits/rejected": 1.5697556734085083, "logps/chosen": -303.3445129394531, "logps/rejected": -348.2174377441406, "loss": 0.5964, "rewards/accuracies": 0.625, "rewards/chosen": -1.186130404472351, "rewards/margins": 0.7902547121047974, "rewards/rejected": -1.9763849973678589, "step": 452 }, { "epoch": 0.9682073203312851, "grad_norm": 18.220000876315183, "learning_rate": 1.3695261579316775e-09, "logits/chosen": 1.6201151609420776, "logits/rejected": 1.7068089246749878, "logps/chosen": -370.4643859863281, "logps/rejected": -408.29852294921875, "loss": 0.6176, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4112707376480103, "rewards/margins": 0.37669438123703003, "rewards/rejected": -1.787965178489685, "step": 453 }, { "epoch": 0.9703446433342239, "grad_norm": 16.929529214225397, "learning_rate": 1.1810156337183908e-09, "logits/chosen": 1.9085679054260254, "logits/rejected": 2.029019832611084, "logps/chosen": -444.6353454589844, "logps/rejected": -482.870849609375, "loss": 0.6378, "rewards/accuracies": 0.625, "rewards/chosen": -1.495185375213623, "rewards/margins": -0.083378866314888, "rewards/rejected": -1.411806344985962, "step": 454 }, { "epoch": 0.9724819663371627, "grad_norm": 19.141724017680925, "learning_rate": 1.0064265011902328e-09, "logits/chosen": 0.8631810545921326, "logits/rejected": 0.833740234375, "logps/chosen": -340.0003356933594, "logps/rejected": -393.02728271484375, "loss": 0.6086, "rewards/accuracies": 0.5625, "rewards/chosen": -1.579225778579712, "rewards/margins": 0.29944878816604614, "rewards/rejected": -1.8786746263504028, "step": 455 }, { "epoch": 0.9746192893401016, "grad_norm": 16.719925817090374, "learning_rate": 8.457685285878091e-10, "logits/chosen": 2.0139966011047363, "logits/rejected": 2.11411714553833, "logps/chosen": -473.91241455078125, "logps/rejected": -476.2071533203125, "loss": 0.5955, "rewards/accuracies": 0.625, "rewards/chosen": -1.1062893867492676, "rewards/margins": 0.22559532523155212, "rewards/rejected": -1.331884741783142, "step": 456 }, { "epoch": 0.9767566123430403, "grad_norm": 16.220077060781502, "learning_rate": 6.990507047049676e-10, "logits/chosen": 1.58895742893219, "logits/rejected": 1.6351604461669922, "logps/chosen": -302.9062805175781, "logps/rejected": -351.12957763671875, "loss": 0.5676, "rewards/accuracies": 0.625, "rewards/chosen": -0.6347337961196899, "rewards/margins": 0.34598052501678467, "rewards/rejected": -0.9807142019271851, "step": 457 }, { "epoch": 0.9788939353459791, "grad_norm": 16.320270998571377, "learning_rate": 5.662812383859794e-10, "logits/chosen": 1.4099833965301514, "logits/rejected": 1.4714851379394531, "logps/chosen": -324.4279479980469, "logps/rejected": -353.4306335449219, "loss": 0.5799, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4264363050460815, "rewards/margins": 0.448211669921875, "rewards/rejected": -1.874647855758667, "step": 458 }, { "epoch": 0.981031258348918, "grad_norm": 19.089311245869983, "learning_rate": 4.4746755806621126e-10, "logits/chosen": 1.4548715353012085, "logits/rejected": 1.485275387763977, "logps/chosen": -355.63311767578125, "logps/rejected": -358.629150390625, "loss": 0.6253, "rewards/accuracies": 0.5, "rewards/chosen": -1.3190171718597412, "rewards/margins": 0.17262953519821167, "rewards/rejected": -1.4916467666625977, "step": 459 }, { "epoch": 0.9831685813518568, "grad_norm": 17.724546722072443, "learning_rate": 3.4261631135654167e-10, "logits/chosen": 1.7169846296310425, "logits/rejected": 1.4584649801254272, "logps/chosen": -417.5559997558594, "logps/rejected": -402.72406005859375, "loss": 0.6109, "rewards/accuracies": 0.625, "rewards/chosen": -1.3284595012664795, "rewards/margins": 0.16694369912147522, "rewards/rejected": -1.4954030513763428, "step": 460 }, { "epoch": 0.9853059043547956, "grad_norm": 18.58153544828685, "learning_rate": 2.5173336467135263e-10, "logits/chosen": 1.4444609880447388, "logits/rejected": 1.4017637968063354, "logps/chosen": -395.44097900390625, "logps/rejected": -413.9075622558594, "loss": 0.5902, "rewards/accuracies": 0.625, "rewards/chosen": -1.2212958335876465, "rewards/margins": 0.5377592444419861, "rewards/rejected": -1.7590551376342773, "step": 461 }, { "epoch": 0.9874432273577345, "grad_norm": 19.162318454777324, "learning_rate": 1.7482380290034792e-10, "logits/chosen": 0.9124627113342285, "logits/rejected": 0.8698334097862244, "logps/chosen": -333.6249694824219, "logps/rejected": -346.1784973144531, "loss": 0.6127, "rewards/accuracies": 0.625, "rewards/chosen": -1.4773962497711182, "rewards/margins": 0.1805485039949417, "rewards/rejected": -1.657944679260254, "step": 462 }, { "epoch": 0.9895805503606733, "grad_norm": 19.724273693538393, "learning_rate": 1.1189192912416933e-10, "logits/chosen": 1.6409908533096313, "logits/rejected": 1.5480575561523438, "logps/chosen": -283.3055114746094, "logps/rejected": -310.44561767578125, "loss": 0.658, "rewards/accuracies": 0.75, "rewards/chosen": -1.3127912282943726, "rewards/margins": 0.5362306237220764, "rewards/rejected": -1.8490217924118042, "step": 463 }, { "epoch": 0.9917178733636121, "grad_norm": 17.93307015830412, "learning_rate": 6.294126437336733e-11, "logits/chosen": 2.145725727081299, "logits/rejected": 2.0747246742248535, "logps/chosen": -436.8760986328125, "logps/rejected": -406.91290283203125, "loss": 0.6152, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6242163181304932, "rewards/margins": 0.16452085971832275, "rewards/rejected": -1.7887372970581055, "step": 464 }, { "epoch": 0.9938551963665508, "grad_norm": 21.39798162878369, "learning_rate": 2.797454743164174e-11, "logits/chosen": 1.4774665832519531, "logits/rejected": 1.4269077777862549, "logps/chosen": -484.0561828613281, "logps/rejected": -466.2383117675781, "loss": 0.6466, "rewards/accuracies": 0.625, "rewards/chosen": -1.3485268354415894, "rewards/margins": 0.09689469635486603, "rewards/rejected": -1.4454214572906494, "step": 465 }, { "epoch": 0.9959925193694897, "grad_norm": 18.82475158368635, "learning_rate": 6.993734682547714e-12, "logits/chosen": 1.9347305297851562, "logits/rejected": 1.9315987825393677, "logps/chosen": -536.5445556640625, "logps/rejected": -522.9882202148438, "loss": 0.6231, "rewards/accuracies": 0.5, "rewards/chosen": -1.4731837511062622, "rewards/margins": -0.18822771310806274, "rewards/rejected": -1.2849558591842651, "step": 466 }, { "epoch": 0.9981298423724285, "grad_norm": 18.113547265416, "learning_rate": 0.0, "logits/chosen": 2.014084577560425, "logits/rejected": 1.9462015628814697, "logps/chosen": -489.33050537109375, "logps/rejected": -409.4538879394531, "loss": 0.633, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1971293687820435, "rewards/margins": 0.26006725430488586, "rewards/rejected": -1.457196593284607, "step": 467 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 0.6216196353991996, "train_runtime": 6954.5059, "train_samples_per_second": 8.61, "train_steps_per_second": 0.067 } ], "logging_steps": 1, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }