{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998027613412229, "eval_steps": 50000, "global_step": 1824, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00821827744904668, "grad_norm": 48.55606780690502, "learning_rate": 1.358695652173913e-08, "logits/chosen": 26.16689682006836, "logits/rejected": 25.511425018310547, "logps/chosen": -189.36741638183594, "logps/rejected": -78.73792266845703, "loss": 1.79, "rewards/accuracies": 0.2800000011920929, "rewards/chosen": -0.006983796134591103, "rewards/margins": 3.662884410005063e-05, "rewards/rejected": -0.007020425051450729, "sft_loss": 0.661233127117157, "step": 5 }, { "epoch": 0.01643655489809336, "grad_norm": 50.84809399854242, "learning_rate": 2.717391304347826e-08, "logits/chosen": 25.634292602539062, "logits/rejected": 25.165508270263672, "logps/chosen": -175.30511474609375, "logps/rejected": -79.45011901855469, "loss": 1.7672, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": -0.01493214163929224, "rewards/margins": 0.032123688608407974, "rewards/rejected": -0.04705582931637764, "sft_loss": 0.6432023644447327, "step": 10 }, { "epoch": 0.02465483234714004, "grad_norm": 41.52221293409133, "learning_rate": 4.076086956521739e-08, "logits/chosen": 25.897306442260742, "logits/rejected": 25.234777450561523, "logps/chosen": -204.5565643310547, "logps/rejected": -85.37405395507812, "loss": 1.6603, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": -0.028912657871842384, "rewards/margins": 0.18977542221546173, "rewards/rejected": -0.21868810057640076, "sft_loss": 0.7592554688453674, "step": 15 }, { "epoch": 0.03287310979618672, "grad_norm": 30.024466917447533, "learning_rate": 5.434782608695652e-08, "logits/chosen": 26.472496032714844, "logits/rejected": 26.013669967651367, "logps/chosen": -178.9062042236328, "logps/rejected": -87.18224334716797, "loss": 1.5519, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -0.11060313880443573, "rewards/margins": 0.3851660490036011, "rewards/rejected": -0.495769202709198, "sft_loss": 0.6785654425621033, "step": 20 }, { "epoch": 0.041091387245233396, "grad_norm": 23.574332052101575, "learning_rate": 6.793478260869565e-08, "logits/chosen": 26.571308135986328, "logits/rejected": 26.069765090942383, "logps/chosen": -204.71995544433594, "logps/rejected": -95.25181579589844, "loss": 1.4535, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -0.2731512486934662, "rewards/margins": 0.7024775743484497, "rewards/rejected": -0.9756287336349487, "sft_loss": 0.6605415344238281, "step": 25 }, { "epoch": 0.04930966469428008, "grad_norm": 18.127113157576492, "learning_rate": 8.152173913043478e-08, "logits/chosen": 26.70085906982422, "logits/rejected": 26.199695587158203, "logps/chosen": -189.0041961669922, "logps/rejected": -95.67135620117188, "loss": 1.3598, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -0.4376958906650543, "rewards/margins": 0.9910183548927307, "rewards/rejected": -1.4287142753601074, "sft_loss": 0.6798427700996399, "step": 30 }, { "epoch": 0.05752794214332676, "grad_norm": 16.856249874916603, "learning_rate": 9.510869565217392e-08, "logits/chosen": 27.086894989013672, "logits/rejected": 26.779054641723633, "logps/chosen": -202.5185546875, "logps/rejected": -98.5663070678711, "loss": 1.2944, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -0.5852899551391602, "rewards/margins": 1.2753018140792847, "rewards/rejected": -1.8605915307998657, "sft_loss": 0.6831802129745483, "step": 35 }, { "epoch": 0.06574621959237344, "grad_norm": 15.222314216803584, "learning_rate": 1.0869565217391303e-07, "logits/chosen": 26.470937728881836, "logits/rejected": 26.266651153564453, "logps/chosen": -185.2868194580078, "logps/rejected": -96.5091781616211, "loss": 1.2027, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -0.6554566025733948, "rewards/margins": 1.4152508974075317, "rewards/rejected": -2.0707075595855713, "sft_loss": 0.6970738768577576, "step": 40 }, { "epoch": 0.07396449704142012, "grad_norm": 14.365159397400335, "learning_rate": 1.2228260869565216e-07, "logits/chosen": 25.881906509399414, "logits/rejected": 25.525175094604492, "logps/chosen": -202.46238708496094, "logps/rejected": -108.43726348876953, "loss": 1.1328, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.787525475025177, "rewards/margins": 1.8143333196640015, "rewards/rejected": -2.6018588542938232, "sft_loss": 0.6782786846160889, "step": 45 }, { "epoch": 0.08218277449046679, "grad_norm": 13.924602084521048, "learning_rate": 1.358695652173913e-07, "logits/chosen": 24.610755920410156, "logits/rejected": 24.408979415893555, "logps/chosen": -206.8500213623047, "logps/rejected": -113.42557525634766, "loss": 1.0599, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -0.9198395609855652, "rewards/margins": 1.9545520544052124, "rewards/rejected": -2.874391555786133, "sft_loss": 0.7132790088653564, "step": 50 }, { "epoch": 0.09040105193951348, "grad_norm": 11.972485852637668, "learning_rate": 1.4945652173913042e-07, "logits/chosen": 23.996862411499023, "logits/rejected": 24.392988204956055, "logps/chosen": -176.3905487060547, "logps/rejected": -110.62020874023438, "loss": 1.0223, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -0.8744373321533203, "rewards/margins": 2.172375440597534, "rewards/rejected": -3.0468130111694336, "sft_loss": 0.7045189738273621, "step": 55 }, { "epoch": 0.09861932938856016, "grad_norm": 13.242028156676367, "learning_rate": 1.6304347826086955e-07, "logits/chosen": 23.04694366455078, "logits/rejected": 23.079355239868164, "logps/chosen": -186.1154327392578, "logps/rejected": -107.23130798339844, "loss": 1.0046, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -1.0562888383865356, "rewards/margins": 2.0806047916412354, "rewards/rejected": -3.1368932723999023, "sft_loss": 0.6290792226791382, "step": 60 }, { "epoch": 0.10683760683760683, "grad_norm": 11.030176141313747, "learning_rate": 1.766304347826087e-07, "logits/chosen": 21.996606826782227, "logits/rejected": 22.384113311767578, "logps/chosen": -199.38589477539062, "logps/rejected": -116.91275024414062, "loss": 0.9338, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1412394046783447, "rewards/margins": 2.474609613418579, "rewards/rejected": -3.615849018096924, "sft_loss": 0.697711706161499, "step": 65 }, { "epoch": 0.11505588428665352, "grad_norm": 11.76117705302215, "learning_rate": 1.9021739130434784e-07, "logits/chosen": 22.534835815429688, "logits/rejected": 23.107168197631836, "logps/chosen": -216.9481964111328, "logps/rejected": -129.04183959960938, "loss": 0.8671, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -1.3997070789337158, "rewards/margins": 2.9236786365509033, "rewards/rejected": -4.323385715484619, "sft_loss": 0.728801965713501, "step": 70 }, { "epoch": 0.1232741617357002, "grad_norm": 32.386219318167385, "learning_rate": 2.0380434782608694e-07, "logits/chosen": 20.90481948852539, "logits/rejected": 21.215843200683594, "logps/chosen": -247.61224365234375, "logps/rejected": -138.62893676757812, "loss": 0.8076, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -1.5252928733825684, "rewards/margins": 3.3426883220672607, "rewards/rejected": -4.86798095703125, "sft_loss": 0.7596563696861267, "step": 75 }, { "epoch": 0.13149243918474687, "grad_norm": 11.700521911598706, "learning_rate": 2.1739130434782607e-07, "logits/chosen": 20.761672973632812, "logits/rejected": 20.871828079223633, "logps/chosen": -236.5396728515625, "logps/rejected": -138.31297302246094, "loss": 0.842, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -1.9364999532699585, "rewards/margins": 3.281285047531128, "rewards/rejected": -5.217784881591797, "sft_loss": 0.7300873398780823, "step": 80 }, { "epoch": 0.13971071663379356, "grad_norm": 11.895414317868761, "learning_rate": 2.309782608695652e-07, "logits/chosen": 21.150850296020508, "logits/rejected": 21.817951202392578, "logps/chosen": -223.0463104248047, "logps/rejected": -139.8596954345703, "loss": 0.7489, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -2.007277250289917, "rewards/margins": 3.5759541988372803, "rewards/rejected": -5.5832319259643555, "sft_loss": 0.7483465075492859, "step": 85 }, { "epoch": 0.14792899408284024, "grad_norm": 11.018586570679572, "learning_rate": 2.445652173913043e-07, "logits/chosen": 22.40447998046875, "logits/rejected": 22.448156356811523, "logps/chosen": -201.39810180664062, "logps/rejected": -126.50525665283203, "loss": 0.8269, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -2.1027634143829346, "rewards/margins": 3.118117332458496, "rewards/rejected": -5.220880508422852, "sft_loss": 0.7317149639129639, "step": 90 }, { "epoch": 0.15614727153188693, "grad_norm": 9.026135528071627, "learning_rate": 2.499981493451693e-07, "logits/chosen": 20.40322494506836, "logits/rejected": 20.44278907775879, "logps/chosen": -203.20326232910156, "logps/rejected": -124.00860595703125, "loss": 0.8771, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -1.6680656671524048, "rewards/margins": 3.2214581966400146, "rewards/rejected": -4.889523983001709, "sft_loss": 0.7273903489112854, "step": 95 }, { "epoch": 0.16436554898093358, "grad_norm": 10.366938012622036, "learning_rate": 2.499868399863186e-07, "logits/chosen": 20.907590866088867, "logits/rejected": 21.92055892944336, "logps/chosen": -226.97225952148438, "logps/rejected": -144.5021514892578, "loss": 0.7676, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -2.1906163692474365, "rewards/margins": 3.6612253189086914, "rewards/rejected": -5.851841449737549, "sft_loss": 0.7680675983428955, "step": 100 }, { "epoch": 0.17258382642998027, "grad_norm": 9.779078878164054, "learning_rate": 2.4996525033926786e-07, "logits/chosen": 19.350120544433594, "logits/rejected": 19.718740463256836, "logps/chosen": -209.20166015625, "logps/rejected": -136.57321166992188, "loss": 0.7133, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -2.251823663711548, "rewards/margins": 3.696510076522827, "rewards/rejected": -5.948334217071533, "sft_loss": 0.7179654836654663, "step": 105 }, { "epoch": 0.18080210387902695, "grad_norm": 8.45489237540799, "learning_rate": 2.499333821797864e-07, "logits/chosen": 20.7148380279541, "logits/rejected": 20.950342178344727, "logps/chosen": -197.59976196289062, "logps/rejected": -124.13175964355469, "loss": 0.7642, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -2.359647750854492, "rewards/margins": 3.3463170528411865, "rewards/rejected": -5.705965042114258, "sft_loss": 0.7615786790847778, "step": 110 }, { "epoch": 0.18902038132807364, "grad_norm": 10.762078567025862, "learning_rate": 2.4989123812906105e-07, "logits/chosen": 19.379554748535156, "logits/rejected": 20.651145935058594, "logps/chosen": -219.8887176513672, "logps/rejected": -148.8833770751953, "loss": 0.7483, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.958165168762207, "rewards/margins": 3.9372713565826416, "rewards/rejected": -6.895437240600586, "sft_loss": 0.7731737494468689, "step": 115 }, { "epoch": 0.19723865877712032, "grad_norm": 10.354433872987686, "learning_rate": 2.498388216534807e-07, "logits/chosen": 19.773361206054688, "logits/rejected": 21.142953872680664, "logps/chosen": -238.31101989746094, "logps/rejected": -152.0144500732422, "loss": 0.7063, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": -2.7792108058929443, "rewards/margins": 4.163509845733643, "rewards/rejected": -6.942720413208008, "sft_loss": 0.7693167328834534, "step": 120 }, { "epoch": 0.205456936226167, "grad_norm": 11.490346482929228, "learning_rate": 2.49776137064351e-07, "logits/chosen": 19.508024215698242, "logits/rejected": 19.62151527404785, "logps/chosen": -232.81178283691406, "logps/rejected": -151.69398498535156, "loss": 0.7148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.6999313831329346, "rewards/margins": 3.9598686695098877, "rewards/rejected": -6.659799575805664, "sft_loss": 0.8186704516410828, "step": 125 }, { "epoch": 0.21367521367521367, "grad_norm": 13.390026452837366, "learning_rate": 2.4970318951754e-07, "logits/chosen": 19.62987518310547, "logits/rejected": 20.120250701904297, "logps/chosen": -247.29205322265625, "logps/rejected": -159.60348510742188, "loss": 0.6619, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -2.8834011554718018, "rewards/margins": 4.369426727294922, "rewards/rejected": -7.252828598022461, "sft_loss": 0.7933542728424072, "step": 130 }, { "epoch": 0.22189349112426035, "grad_norm": 20.479502968540558, "learning_rate": 2.496199850130537e-07, "logits/chosen": 18.90142059326172, "logits/rejected": 19.151918411254883, "logps/chosen": -231.70069885253906, "logps/rejected": -141.99693298339844, "loss": 0.7109, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -2.819154977798462, "rewards/margins": 3.806306838989258, "rewards/rejected": -6.625460624694824, "sft_loss": 0.7920200228691101, "step": 135 }, { "epoch": 0.23011176857330704, "grad_norm": 16.190350556337812, "learning_rate": 2.4952653039454297e-07, "logits/chosen": 18.546707153320312, "logits/rejected": 18.616119384765625, "logps/chosen": -251.7685089111328, "logps/rejected": -160.7568817138672, "loss": 0.703, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -3.2368268966674805, "rewards/margins": 4.385184288024902, "rewards/rejected": -7.622011184692383, "sft_loss": 0.8116011023521423, "step": 140 }, { "epoch": 0.23833004602235372, "grad_norm": 14.348906773180857, "learning_rate": 2.494228333487403e-07, "logits/chosen": 18.956235885620117, "logits/rejected": 19.919641494750977, "logps/chosen": -210.7549591064453, "logps/rejected": -144.51132202148438, "loss": 0.6182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.001668691635132, "rewards/margins": 3.866687536239624, "rewards/rejected": -6.868356227874756, "sft_loss": 0.7950787544250488, "step": 145 }, { "epoch": 0.2465483234714004, "grad_norm": 11.009157695890236, "learning_rate": 2.4930890240482784e-07, "logits/chosen": 18.876365661621094, "logits/rejected": 19.30438804626465, "logps/chosen": -229.18504333496094, "logps/rejected": -150.90707397460938, "loss": 0.671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3249759674072266, "rewards/margins": 4.095080375671387, "rewards/rejected": -7.4200568199157715, "sft_loss": 0.7879451513290405, "step": 150 }, { "epoch": 0.25476660092044706, "grad_norm": 11.264576367918604, "learning_rate": 2.491847469337356e-07, "logits/chosen": 18.14313316345215, "logits/rejected": 18.77975082397461, "logps/chosen": -219.8468780517578, "logps/rejected": -150.99098205566406, "loss": 0.6461, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -3.250223159790039, "rewards/margins": 4.443104267120361, "rewards/rejected": -7.6933274269104, "sft_loss": 0.8351505994796753, "step": 155 }, { "epoch": 0.26298487836949375, "grad_norm": 17.15390685304222, "learning_rate": 2.4905037714737094e-07, "logits/chosen": 19.779348373413086, "logits/rejected": 19.593463897705078, "logps/chosen": -259.2501220703125, "logps/rejected": -162.26368713378906, "loss": 0.7398, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.7065176963806152, "rewards/margins": 4.470663070678711, "rewards/rejected": -8.177180290222168, "sft_loss": 0.8221470713615417, "step": 160 }, { "epoch": 0.27120315581854043, "grad_norm": 10.266952014618042, "learning_rate": 2.489058040977784e-07, "logits/chosen": 19.731273651123047, "logits/rejected": 19.947425842285156, "logps/chosen": -222.83753967285156, "logps/rejected": -142.3966522216797, "loss": 0.6633, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9937241077423096, "rewards/margins": 4.092346668243408, "rewards/rejected": -7.086071968078613, "sft_loss": 0.8631803393363953, "step": 165 }, { "epoch": 0.2794214332675871, "grad_norm": 13.183734224346434, "learning_rate": 2.487510396762309e-07, "logits/chosen": 18.506755828857422, "logits/rejected": 19.725309371948242, "logps/chosen": -246.2398223876953, "logps/rejected": -171.14974975585938, "loss": 0.6512, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.316751480102539, "rewards/margins": 4.549408912658691, "rewards/rejected": -7.8661603927612305, "sft_loss": 0.9392525553703308, "step": 170 }, { "epoch": 0.2876397107166338, "grad_norm": 12.820383998338311, "learning_rate": 2.485860966122514e-07, "logits/chosen": 18.673315048217773, "logits/rejected": 19.47124671936035, "logps/chosen": -239.1477508544922, "logps/rejected": -168.49923706054688, "loss": 0.6218, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.3230719566345215, "rewards/margins": 4.699094295501709, "rewards/rejected": -8.022165298461914, "sft_loss": 0.8536433577537537, "step": 175 }, { "epoch": 0.2958579881656805, "grad_norm": 10.336252791103886, "learning_rate": 2.484109884725661e-07, "logits/chosen": 17.68476104736328, "logits/rejected": 18.92132568359375, "logps/chosen": -248.71087646484375, "logps/rejected": -164.80517578125, "loss": 0.6908, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7376978397369385, "rewards/margins": 4.469425678253174, "rewards/rejected": -8.207123756408691, "sft_loss": 0.7900984883308411, "step": 180 }, { "epoch": 0.30407626561472717, "grad_norm": 9.07674205143479, "learning_rate": 2.4822572965998844e-07, "logits/chosen": 17.927953720092773, "logits/rejected": 18.744905471801758, "logps/chosen": -256.3652038574219, "logps/rejected": -169.36451721191406, "loss": 0.6008, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -3.603369951248169, "rewards/margins": 4.865907192230225, "rewards/rejected": -8.469277381896973, "sft_loss": 0.8645619750022888, "step": 185 }, { "epoch": 0.31229454306377386, "grad_norm": 11.293965527732967, "learning_rate": 2.4803033541223455e-07, "logits/chosen": 19.39400863647461, "logits/rejected": 19.796106338500977, "logps/chosen": -245.06739807128906, "logps/rejected": -164.10296630859375, "loss": 0.6281, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -3.5845892429351807, "rewards/margins": 4.6414408683776855, "rewards/rejected": -8.226030349731445, "sft_loss": 0.8358697295188904, "step": 190 }, { "epoch": 0.32051282051282054, "grad_norm": 11.390930360072153, "learning_rate": 2.478248218006699e-07, "logits/chosen": 17.902259826660156, "logits/rejected": 18.019027709960938, "logps/chosen": -265.0622253417969, "logps/rejected": -175.5810546875, "loss": 0.6158, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.9043285846710205, "rewards/margins": 5.115177154541016, "rewards/rejected": -9.019506454467773, "sft_loss": 0.8782904148101807, "step": 195 }, { "epoch": 0.32873109796186717, "grad_norm": 52.895489458940915, "learning_rate": 2.476092057289873e-07, "logits/chosen": 17.241554260253906, "logits/rejected": 18.226573944091797, "logps/chosen": -249.59454345703125, "logps/rejected": -181.9971923828125, "loss": 0.6044, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.305534362792969, "rewards/margins": 5.0905351638793945, "rewards/rejected": -9.396068572998047, "sft_loss": 0.9349213242530823, "step": 200 }, { "epoch": 0.33694937541091385, "grad_norm": 13.12464260474008, "learning_rate": 2.473835049318167e-07, "logits/chosen": 18.299766540527344, "logits/rejected": 19.57137107849121, "logps/chosen": -248.37832641601562, "logps/rejected": -171.3523406982422, "loss": 0.6532, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -4.206078052520752, "rewards/margins": 4.699835300445557, "rewards/rejected": -8.905913352966309, "sft_loss": 0.9326413869857788, "step": 205 }, { "epoch": 0.34516765285996054, "grad_norm": 8.71116895518069, "learning_rate": 2.4714773797326657e-07, "logits/chosen": 18.58841896057129, "logits/rejected": 19.255895614624023, "logps/chosen": -247.08616638183594, "logps/rejected": -165.45547485351562, "loss": 0.6183, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -3.8454854488372803, "rewards/margins": 4.812742710113525, "rewards/rejected": -8.658228874206543, "sft_loss": 0.853776752948761, "step": 210 }, { "epoch": 0.3533859303090072, "grad_norm": 17.852596870413777, "learning_rate": 2.4690192424539663e-07, "logits/chosen": 18.283300399780273, "logits/rejected": 19.169416427612305, "logps/chosen": -241.07122802734375, "logps/rejected": -173.18699645996094, "loss": 0.6071, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -4.187161445617676, "rewards/margins": 5.0552144050598145, "rewards/rejected": -9.242376327514648, "sft_loss": 0.8952550292015076, "step": 215 }, { "epoch": 0.3616042077580539, "grad_norm": 12.136136465528743, "learning_rate": 2.466460839666233e-07, "logits/chosen": 17.772991180419922, "logits/rejected": 18.684547424316406, "logps/chosen": -255.16156005859375, "logps/rejected": -183.1548614501953, "loss": 0.562, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.293615341186523, "rewards/margins": 5.553874969482422, "rewards/rejected": -9.847491264343262, "sft_loss": 0.8942830562591553, "step": 220 }, { "epoch": 0.3698224852071006, "grad_norm": 13.249996024918259, "learning_rate": 2.463802381800563e-07, "logits/chosen": 17.9425106048584, "logits/rejected": 18.508359909057617, "logps/chosen": -260.12322998046875, "logps/rejected": -176.5136260986328, "loss": 0.6343, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -3.9985711574554443, "rewards/margins": 5.279909133911133, "rewards/rejected": -9.278480529785156, "sft_loss": 0.890729546546936, "step": 225 }, { "epoch": 0.3780407626561473, "grad_norm": 13.483286780837357, "learning_rate": 2.461044087517682e-07, "logits/chosen": 19.322052001953125, "logits/rejected": 19.914690017700195, "logps/chosen": -267.1094970703125, "logps/rejected": -181.53118896484375, "loss": 0.59, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.28004264831543, "rewards/margins": 5.2816243171691895, "rewards/rejected": -9.561667442321777, "sft_loss": 0.8358654975891113, "step": 230 }, { "epoch": 0.38625904010519396, "grad_norm": 10.134479758320998, "learning_rate": 2.458186183689957e-07, "logits/chosen": 18.751750946044922, "logits/rejected": 18.550024032592773, "logps/chosen": -237.7452392578125, "logps/rejected": -155.38726806640625, "loss": 0.6427, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -3.9234371185302734, "rewards/margins": 4.515294075012207, "rewards/rejected": -8.438732147216797, "sft_loss": 0.9805070757865906, "step": 235 }, { "epoch": 0.39447731755424065, "grad_norm": 13.771161444519256, "learning_rate": 2.4552289053827344e-07, "logits/chosen": 18.025060653686523, "logits/rejected": 18.463733673095703, "logps/chosen": -252.61175537109375, "logps/rejected": -171.77259826660156, "loss": 0.5599, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -4.3357720375061035, "rewards/margins": 5.04067325592041, "rewards/rejected": -9.376445770263672, "sft_loss": 0.7902787923812866, "step": 240 }, { "epoch": 0.40269559500328733, "grad_norm": 9.886456705994728, "learning_rate": 2.4521724958350093e-07, "logits/chosen": 18.645158767700195, "logits/rejected": 19.603240966796875, "logps/chosen": -239.74526977539062, "logps/rejected": -162.94131469726562, "loss": 0.6344, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.351040840148926, "rewards/margins": 4.734447002410889, "rewards/rejected": -9.085487365722656, "sft_loss": 0.8848291635513306, "step": 245 }, { "epoch": 0.410913872452334, "grad_norm": 16.574947299413026, "learning_rate": 2.449017206439417e-07, "logits/chosen": 18.770355224609375, "logits/rejected": 19.167869567871094, "logps/chosen": -257.2867431640625, "logps/rejected": -180.79721069335938, "loss": 0.5475, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -4.755511283874512, "rewards/margins": 5.377356052398682, "rewards/rejected": -10.132868766784668, "sft_loss": 0.9855692982673645, "step": 250 }, { "epoch": 0.41913214990138065, "grad_norm": 15.729142249690554, "learning_rate": 2.445763296721554e-07, "logits/chosen": 18.016155242919922, "logits/rejected": 18.655664443969727, "logps/chosen": -243.2661590576172, "logps/rejected": -178.59429931640625, "loss": 0.6424, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.371219635009766, "rewards/margins": 5.091875076293945, "rewards/rejected": -10.463094711303711, "sft_loss": 1.0052944421768188, "step": 255 }, { "epoch": 0.42735042735042733, "grad_norm": 14.846371154809418, "learning_rate": 2.4424110343186345e-07, "logits/chosen": 18.64227867126465, "logits/rejected": 19.062152862548828, "logps/chosen": -241.11070251464844, "logps/rejected": -167.0811767578125, "loss": 0.6183, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -3.9312877655029297, "rewards/margins": 4.8627119064331055, "rewards/rejected": -8.793999671936035, "sft_loss": 0.8778759837150574, "step": 260 }, { "epoch": 0.435568704799474, "grad_norm": 16.788820590336183, "learning_rate": 2.4389606949574767e-07, "logits/chosen": 18.801990509033203, "logits/rejected": 20.348352432250977, "logps/chosen": -266.7105407714844, "logps/rejected": -190.86622619628906, "loss": 0.5961, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -4.232571601867676, "rewards/margins": 5.312459945678711, "rewards/rejected": -9.545029640197754, "sft_loss": 0.8269821405410767, "step": 265 }, { "epoch": 0.4437869822485207, "grad_norm": 9.660029588751273, "learning_rate": 2.435412562431823e-07, "logits/chosen": 18.019432067871094, "logits/rejected": 18.232667922973633, "logps/chosen": -254.80136108398438, "logps/rejected": -172.0924835205078, "loss": 0.547, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.478307723999023, "rewards/margins": 5.105349540710449, "rewards/rejected": -9.583656311035156, "sft_loss": 0.8911004662513733, "step": 270 }, { "epoch": 0.4520052596975674, "grad_norm": 8.447767610497143, "learning_rate": 2.4317669285789964e-07, "logits/chosen": 18.408342361450195, "logits/rejected": 18.87084197998047, "logps/chosen": -296.8369445800781, "logps/rejected": -195.3644561767578, "loss": 0.5759, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.8854217529296875, "rewards/margins": 5.9024529457092285, "rewards/rejected": -10.787875175476074, "sft_loss": 0.8718220591545105, "step": 275 }, { "epoch": 0.46022353714661407, "grad_norm": 14.077509009393875, "learning_rate": 2.428024093255901e-07, "logits/chosen": 17.676301956176758, "logits/rejected": 19.232654571533203, "logps/chosen": -261.8072509765625, "logps/rejected": -193.81626892089844, "loss": 0.6028, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -4.590798854827881, "rewards/margins": 5.75556755065918, "rewards/rejected": -10.346365928649902, "sft_loss": 0.8692941069602966, "step": 280 }, { "epoch": 0.46844181459566075, "grad_norm": 12.255103077032402, "learning_rate": 2.424184364314352e-07, "logits/chosen": 19.874698638916016, "logits/rejected": 19.855077743530273, "logps/chosen": -263.8525085449219, "logps/rejected": -174.5958251953125, "loss": 0.5687, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.0808610916137695, "rewards/margins": 5.203913688659668, "rewards/rejected": -9.284773826599121, "sft_loss": 0.8956073522567749, "step": 285 }, { "epoch": 0.47666009204470744, "grad_norm": 15.082062203409798, "learning_rate": 2.420248057575761e-07, "logits/chosen": 17.83322525024414, "logits/rejected": 17.633359909057617, "logps/chosen": -278.74298095703125, "logps/rejected": -181.1900634765625, "loss": 0.5783, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.548935890197754, "rewards/margins": 5.899779796600342, "rewards/rejected": -10.448714256286621, "sft_loss": 0.8952395915985107, "step": 290 }, { "epoch": 0.4848783694937541, "grad_norm": 11.834958728287821, "learning_rate": 2.416215496805156e-07, "logits/chosen": 18.121597290039062, "logits/rejected": 19.50238037109375, "logps/chosen": -252.4333038330078, "logps/rejected": -197.94659423828125, "loss": 0.5665, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.617161273956299, "rewards/margins": 5.908203125, "rewards/rejected": -11.52536392211914, "sft_loss": 0.9183645844459534, "step": 295 }, { "epoch": 0.4930966469428008, "grad_norm": 13.662146621659161, "learning_rate": 2.412087013684552e-07, "logits/chosen": 16.815900802612305, "logits/rejected": 17.304187774658203, "logps/chosen": -276.7563781738281, "logps/rejected": -191.68553161621094, "loss": 0.6409, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.5067291259765625, "rewards/margins": 5.485719680786133, "rewards/rejected": -10.992449760437012, "sft_loss": 0.9233679175376892, "step": 300 }, { "epoch": 0.5013149243918474, "grad_norm": 12.176993675847571, "learning_rate": 2.407862947785669e-07, "logits/chosen": 18.833539962768555, "logits/rejected": 18.9912109375, "logps/chosen": -301.635498046875, "logps/rejected": -204.53671264648438, "loss": 0.487, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.389955997467041, "rewards/margins": 6.232929706573486, "rewards/rejected": -11.622885704040527, "sft_loss": 0.92539381980896, "step": 305 }, { "epoch": 0.5095332018408941, "grad_norm": 8.075422505238562, "learning_rate": 2.403543646542003e-07, "logits/chosen": 18.5779972076416, "logits/rejected": 19.133594512939453, "logps/chosen": -267.43695068359375, "logps/rejected": -186.43345642089844, "loss": 0.6388, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.979398727416992, "rewards/margins": 5.5010504722595215, "rewards/rejected": -10.480450630187988, "sft_loss": 0.9564525485038757, "step": 310 }, { "epoch": 0.5177514792899408, "grad_norm": 8.97962168945258, "learning_rate": 2.39912946522025e-07, "logits/chosen": 19.53040313720703, "logits/rejected": 20.46470069885254, "logps/chosen": -244.89207458496094, "logps/rejected": -172.9203643798828, "loss": 0.5741, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -4.482312202453613, "rewards/margins": 5.123040676116943, "rewards/rejected": -9.605354309082031, "sft_loss": 0.9498026371002197, "step": 315 }, { "epoch": 0.5259697567389875, "grad_norm": 12.054702965132526, "learning_rate": 2.3946207668910833e-07, "logits/chosen": 18.005373001098633, "logits/rejected": 18.470924377441406, "logps/chosen": -231.72732543945312, "logps/rejected": -168.2989044189453, "loss": 0.5869, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -4.519069671630859, "rewards/margins": 4.713679313659668, "rewards/rejected": -9.232749938964844, "sft_loss": 0.8408420085906982, "step": 320 }, { "epoch": 0.5341880341880342, "grad_norm": 25.950655473924865, "learning_rate": 2.390017922399292e-07, "logits/chosen": 18.79814910888672, "logits/rejected": 19.250444412231445, "logps/chosen": -247.69647216796875, "logps/rejected": -174.35218811035156, "loss": 0.6145, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.683900833129883, "rewards/margins": 5.248979568481445, "rewards/rejected": -9.932881355285645, "sft_loss": 0.9410896301269531, "step": 325 }, { "epoch": 0.5424063116370809, "grad_norm": 10.907505413471052, "learning_rate": 2.385321310333276e-07, "logits/chosen": 17.780803680419922, "logits/rejected": 18.34245491027832, "logps/chosen": -248.3139190673828, "logps/rejected": -172.43350219726562, "loss": 0.6284, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -5.367508411407471, "rewards/margins": 4.742012977600098, "rewards/rejected": -10.109521865844727, "sft_loss": 0.9266583323478699, "step": 330 }, { "epoch": 0.5506245890861275, "grad_norm": 29.199966853282145, "learning_rate": 2.38053131699391e-07, "logits/chosen": 18.024690628051758, "logits/rejected": 18.614425659179688, "logps/chosen": -290.8337707519531, "logps/rejected": -203.31809997558594, "loss": 0.5688, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.330504894256592, "rewards/margins": 6.163724422454834, "rewards/rejected": -11.49422836303711, "sft_loss": 0.9595879316329956, "step": 335 }, { "epoch": 0.5588428665351742, "grad_norm": 11.416242977585302, "learning_rate": 2.3756483363627694e-07, "logits/chosen": 17.60715103149414, "logits/rejected": 18.161012649536133, "logps/chosen": -250.91665649414062, "logps/rejected": -184.4646453857422, "loss": 0.5981, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.246757984161377, "rewards/margins": 5.438488006591797, "rewards/rejected": -10.685246467590332, "sft_loss": 0.9181762933731079, "step": 340 }, { "epoch": 0.5670611439842209, "grad_norm": 9.90321260332983, "learning_rate": 2.3706727700697226e-07, "logits/chosen": 17.566362380981445, "logits/rejected": 18.253488540649414, "logps/chosen": -284.3514404296875, "logps/rejected": -193.24594116210938, "loss": 0.5567, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -4.994836807250977, "rewards/margins": 5.735879421234131, "rewards/rejected": -10.73071575164795, "sft_loss": 1.0169059038162231, "step": 345 }, { "epoch": 0.5752794214332676, "grad_norm": 15.546918377467371, "learning_rate": 2.3656050273598986e-07, "logits/chosen": 17.2511043548584, "logits/rejected": 18.237810134887695, "logps/chosen": -258.5328369140625, "logps/rejected": -191.0077362060547, "loss": 0.5363, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -5.097340106964111, "rewards/margins": 5.559810638427734, "rewards/rejected": -10.657149314880371, "sft_loss": 0.8693541884422302, "step": 350 }, { "epoch": 0.5834976988823143, "grad_norm": 10.563639895115125, "learning_rate": 2.3604455250600256e-07, "logits/chosen": 18.051647186279297, "logits/rejected": 18.685161590576172, "logps/chosen": -273.46368408203125, "logps/rejected": -202.36537170410156, "loss": 0.516, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.360798358917236, "rewards/margins": 5.977966785430908, "rewards/rejected": -11.338766098022461, "sft_loss": 0.9063312411308289, "step": 355 }, { "epoch": 0.591715976331361, "grad_norm": 11.326441657016302, "learning_rate": 2.3551946875441467e-07, "logits/chosen": 19.21741485595703, "logits/rejected": 19.171350479125977, "logps/chosen": -265.16619873046875, "logps/rejected": -185.63027954101562, "loss": 0.586, "rewards/accuracies": 1.0, "rewards/chosen": -4.824009895324707, "rewards/margins": 5.92770528793335, "rewards/rejected": -10.751714706420898, "sft_loss": 0.967497706413269, "step": 360 }, { "epoch": 0.5999342537804077, "grad_norm": 16.154882276044376, "learning_rate": 2.3498529466987147e-07, "logits/chosen": 18.083656311035156, "logits/rejected": 19.166841506958008, "logps/chosen": -275.3788146972656, "logps/rejected": -196.90736389160156, "loss": 0.6121, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.075117111206055, "rewards/margins": 6.357577323913574, "rewards/rejected": -11.432694435119629, "sft_loss": 0.9689314961433411, "step": 365 }, { "epoch": 0.6081525312294543, "grad_norm": 12.069410065037287, "learning_rate": 2.3444207418870688e-07, "logits/chosen": 17.682310104370117, "logits/rejected": 18.865554809570312, "logps/chosen": -277.48114013671875, "logps/rejected": -195.44508361816406, "loss": 0.5471, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -4.911283493041992, "rewards/margins": 5.8411865234375, "rewards/rejected": -10.75246810913086, "sft_loss": 0.8908612728118896, "step": 370 }, { "epoch": 0.616370808678501, "grad_norm": 17.941774722560346, "learning_rate": 2.3388985199132962e-07, "logits/chosen": 17.635793685913086, "logits/rejected": 18.530078887939453, "logps/chosen": -265.6659240722656, "logps/rejected": -185.41099548339844, "loss": 0.5578, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.487802982330322, "rewards/margins": 5.8236083984375, "rewards/rejected": -10.311410903930664, "sft_loss": 0.8852910399436951, "step": 375 }, { "epoch": 0.6245890861275477, "grad_norm": 16.222798143855407, "learning_rate": 2.3332867349854844e-07, "logits/chosen": 18.22924041748047, "logits/rejected": 19.445384979248047, "logps/chosen": -267.8589172363281, "logps/rejected": -200.61328125, "loss": 0.6283, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.089979648590088, "rewards/margins": 6.0606160163879395, "rewards/rejected": -11.150596618652344, "sft_loss": 0.85948646068573, "step": 380 }, { "epoch": 0.6328073635765944, "grad_norm": 58.78518201844404, "learning_rate": 2.3275858486783578e-07, "logits/chosen": 17.743967056274414, "logits/rejected": 19.073143005371094, "logps/chosen": -229.31361389160156, "logps/rejected": -178.3441162109375, "loss": 0.5824, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -4.969345569610596, "rewards/margins": 5.37393045425415, "rewards/rejected": -10.343276023864746, "sft_loss": 0.9465056657791138, "step": 385 }, { "epoch": 0.6410256410256411, "grad_norm": 15.400545086822072, "learning_rate": 2.321796329895317e-07, "logits/chosen": 16.995241165161133, "logits/rejected": 18.397994995117188, "logps/chosen": -266.69647216796875, "logps/rejected": -193.65902709960938, "loss": 0.5813, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.233003616333008, "rewards/margins": 6.00741720199585, "rewards/rejected": -11.240421295166016, "sft_loss": 0.9756826758384705, "step": 390 }, { "epoch": 0.6492439184746877, "grad_norm": 11.604457345989609, "learning_rate": 2.3159186548298688e-07, "logits/chosen": 16.9737606048584, "logits/rejected": 18.478750228881836, "logps/chosen": -257.61419677734375, "logps/rejected": -194.60252380371094, "loss": 0.5278, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.19744873046875, "rewards/margins": 6.024503707885742, "rewards/rejected": -11.221953392028809, "sft_loss": 0.972574770450592, "step": 395 }, { "epoch": 0.6574621959237343, "grad_norm": 14.695134059357779, "learning_rate": 2.3099533069264594e-07, "logits/chosen": 17.685321807861328, "logits/rejected": 18.130495071411133, "logps/chosen": -257.6887512207031, "logps/rejected": -180.2339324951172, "loss": 0.5419, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.080874919891357, "rewards/margins": 5.387575626373291, "rewards/rejected": -10.468450546264648, "sft_loss": 1.00028657913208, "step": 400 }, { "epoch": 0.665680473372781, "grad_norm": 14.2588021174925, "learning_rate": 2.3039007768407098e-07, "logits/chosen": 17.992835998535156, "logits/rejected": 18.434703826904297, "logps/chosen": -278.3475341796875, "logps/rejected": -196.46011352539062, "loss": 0.581, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.298067092895508, "rewards/margins": 6.079626560211182, "rewards/rejected": -11.377694129943848, "sft_loss": 0.9695589542388916, "step": 405 }, { "epoch": 0.6738987508218277, "grad_norm": 14.653004208659825, "learning_rate": 2.2977615623990603e-07, "logits/chosen": 18.65854263305664, "logits/rejected": 19.244489669799805, "logps/chosen": -263.1656188964844, "logps/rejected": -193.50169372558594, "loss": 0.555, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.245527267456055, "rewards/margins": 5.687096118927002, "rewards/rejected": -10.932621955871582, "sft_loss": 0.9538100957870483, "step": 410 }, { "epoch": 0.6821170282708744, "grad_norm": 16.632773914957095, "learning_rate": 2.2915361685578235e-07, "logits/chosen": 18.390525817871094, "logits/rejected": 19.31244468688965, "logps/chosen": -259.29205322265625, "logps/rejected": -189.3291015625, "loss": 0.5501, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.290169715881348, "rewards/margins": 5.542262077331543, "rewards/rejected": -10.83243179321289, "sft_loss": 0.9607923030853271, "step": 415 }, { "epoch": 0.6903353057199211, "grad_norm": 14.010413486772263, "learning_rate": 2.2852251073616503e-07, "logits/chosen": 17.323869705200195, "logits/rejected": 18.94650650024414, "logps/chosen": -282.4395751953125, "logps/rejected": -215.9941864013672, "loss": 0.4948, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.772212505340576, "rewards/margins": 6.878769397735596, "rewards/rejected": -12.650981903076172, "sft_loss": 0.993140697479248, "step": 420 }, { "epoch": 0.6985535831689678, "grad_norm": 14.508340310090572, "learning_rate": 2.2788288979014132e-07, "logits/chosen": 18.25994300842285, "logits/rejected": 19.41350555419922, "logps/chosen": -279.428955078125, "logps/rejected": -197.93687438964844, "loss": 0.5473, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.4432454109191895, "rewards/margins": 5.909384250640869, "rewards/rejected": -11.352629661560059, "sft_loss": 0.9294517040252686, "step": 425 }, { "epoch": 0.7067718606180144, "grad_norm": 15.828121421000128, "learning_rate": 2.2723480662715134e-07, "logits/chosen": 17.447628021240234, "logits/rejected": 18.819887161254883, "logps/chosen": -253.06153869628906, "logps/rejected": -190.72598266601562, "loss": 0.5712, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -5.495950698852539, "rewards/margins": 5.677833080291748, "rewards/rejected": -11.173783302307129, "sft_loss": 1.0165560245513916, "step": 430 }, { "epoch": 0.7149901380670611, "grad_norm": 21.070659832772854, "learning_rate": 2.2657831455266063e-07, "logits/chosen": 19.03611946105957, "logits/rejected": 19.757238388061523, "logps/chosen": -281.93084716796875, "logps/rejected": -194.18865966796875, "loss": 0.6137, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.303485870361328, "rewards/margins": 5.8611884117126465, "rewards/rejected": -11.164673805236816, "sft_loss": 1.0157676935195923, "step": 435 }, { "epoch": 0.7232084155161078, "grad_norm": 10.044668338093802, "learning_rate": 2.2591346756377588e-07, "logits/chosen": 18.8349666595459, "logits/rejected": 19.587926864624023, "logps/chosen": -245.26052856445312, "logps/rejected": -174.76011657714844, "loss": 0.5325, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -4.77711820602417, "rewards/margins": 5.245749473571777, "rewards/rejected": -10.022867202758789, "sft_loss": 0.9105268120765686, "step": 440 }, { "epoch": 0.7314266929651545, "grad_norm": 13.114453854538773, "learning_rate": 2.252403203448034e-07, "logits/chosen": 19.10161781311035, "logits/rejected": 20.04970932006836, "logps/chosen": -325.4466552734375, "logps/rejected": -227.55043029785156, "loss": 0.5582, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.897343635559082, "rewards/margins": 6.912624359130859, "rewards/rejected": -12.809967994689941, "sft_loss": 0.9535994529724121, "step": 445 }, { "epoch": 0.7396449704142012, "grad_norm": 12.5969825666755, "learning_rate": 2.2455892826275155e-07, "logits/chosen": 18.5415096282959, "logits/rejected": 19.55573844909668, "logps/chosen": -302.2394714355469, "logps/rejected": -217.98895263671875, "loss": 0.5556, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -6.171204090118408, "rewards/margins": 6.812131881713867, "rewards/rejected": -12.9833345413208, "sft_loss": 0.9671850800514221, "step": 450 }, { "epoch": 0.7478632478632479, "grad_norm": 11.483896112432117, "learning_rate": 2.2386934736277666e-07, "logits/chosen": 18.071735382080078, "logits/rejected": 19.025733947753906, "logps/chosen": -237.59962463378906, "logps/rejected": -185.32635498046875, "loss": 0.577, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -5.7157416343688965, "rewards/margins": 5.618371963500977, "rewards/rejected": -11.334112167358398, "sft_loss": 0.9591123461723328, "step": 455 }, { "epoch": 0.7560815253122946, "grad_norm": 13.120210730356671, "learning_rate": 2.2317163436357317e-07, "logits/chosen": 16.842187881469727, "logits/rejected": 18.437271118164062, "logps/chosen": -282.98541259765625, "logps/rejected": -213.07257080078125, "loss": 0.5363, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.7529802322387695, "rewards/margins": 6.836727142333984, "rewards/rejected": -12.589707374572754, "sft_loss": 0.9440767168998718, "step": 460 }, { "epoch": 0.7642998027613412, "grad_norm": 12.516354265498741, "learning_rate": 2.2246584665270855e-07, "logits/chosen": 18.161880493164062, "logits/rejected": 19.371177673339844, "logps/chosen": -298.9051513671875, "logps/rejected": -213.79953002929688, "loss": 0.4837, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.515788555145264, "rewards/margins": 6.570387363433838, "rewards/rejected": -12.086176872253418, "sft_loss": 0.9586593508720398, "step": 465 }, { "epoch": 0.7725180802103879, "grad_norm": 11.048153129151439, "learning_rate": 2.2175204228190308e-07, "logits/chosen": 18.859655380249023, "logits/rejected": 20.116731643676758, "logps/chosen": -261.10186767578125, "logps/rejected": -194.5068817138672, "loss": 0.6008, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.460696220397949, "rewards/margins": 5.922670841217041, "rewards/rejected": -11.383367538452148, "sft_loss": 0.9851782321929932, "step": 470 }, { "epoch": 0.7807363576594346, "grad_norm": 34.036831132798504, "learning_rate": 2.2103027996225512e-07, "logits/chosen": 17.431440353393555, "logits/rejected": 18.033245086669922, "logps/chosen": -278.5311584472656, "logps/rejected": -198.3171844482422, "loss": 0.5997, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -5.360807418823242, "rewards/margins": 6.381589412689209, "rewards/rejected": -11.74239730834961, "sft_loss": 1.0034022331237793, "step": 475 }, { "epoch": 0.7889546351084813, "grad_norm": 14.859702493359293, "learning_rate": 2.2030061905941193e-07, "logits/chosen": 18.73612403869629, "logits/rejected": 18.83433723449707, "logps/chosen": -264.3339538574219, "logps/rejected": -190.15017700195312, "loss": 0.5072, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.439321517944336, "rewards/margins": 5.989686489105225, "rewards/rejected": -11.429006576538086, "sft_loss": 0.9705156087875366, "step": 480 }, { "epoch": 0.797172912557528, "grad_norm": 10.75919165569494, "learning_rate": 2.1956311958868684e-07, "logits/chosen": 19.243186950683594, "logits/rejected": 19.267446517944336, "logps/chosen": -267.3321228027344, "logps/rejected": -196.00926208496094, "loss": 0.4832, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.953473091125488, "rewards/margins": 5.860842227935791, "rewards/rejected": -11.814314842224121, "sft_loss": 0.9466427564620972, "step": 485 }, { "epoch": 0.8053911900065747, "grad_norm": 17.23206010012729, "learning_rate": 2.1881784221012307e-07, "logits/chosen": 17.544191360473633, "logits/rejected": 18.491127014160156, "logps/chosen": -250.6893768310547, "logps/rejected": -189.68630981445312, "loss": 0.5522, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -6.418759822845459, "rewards/margins": 5.677851676940918, "rewards/rejected": -12.096611022949219, "sft_loss": 1.0340924263000488, "step": 490 }, { "epoch": 0.8136094674556213, "grad_norm": 14.38672703795697, "learning_rate": 2.1806484822350417e-07, "logits/chosen": 17.07558250427246, "logits/rejected": 17.701539993286133, "logps/chosen": -301.8546142578125, "logps/rejected": -211.86402893066406, "loss": 0.511, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.485326766967773, "rewards/margins": 6.297828197479248, "rewards/rejected": -12.78315544128418, "sft_loss": 1.0085182189941406, "step": 495 }, { "epoch": 0.821827744904668, "grad_norm": 11.220505543423183, "learning_rate": 2.1730419956331215e-07, "logits/chosen": 17.45648956298828, "logits/rejected": 18.378616333007812, "logps/chosen": -281.8039245605469, "logps/rejected": -211.0707550048828, "loss": 0.4967, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.824225425720215, "rewards/margins": 6.649372577667236, "rewards/rejected": -12.47359848022461, "sft_loss": 0.9624088406562805, "step": 500 }, { "epoch": 0.8300460223537146, "grad_norm": 19.974838378014, "learning_rate": 2.1653595879363335e-07, "logits/chosen": 18.410470962524414, "logits/rejected": 18.558494567871094, "logps/chosen": -267.88653564453125, "logps/rejected": -197.4770050048828, "loss": 0.5762, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.227014541625977, "rewards/margins": 6.174468517303467, "rewards/rejected": -12.401481628417969, "sft_loss": 0.9929137229919434, "step": 505 }, { "epoch": 0.8382642998027613, "grad_norm": 20.289642932843638, "learning_rate": 2.1576018910301238e-07, "logits/chosen": 18.445819854736328, "logits/rejected": 18.456052780151367, "logps/chosen": -268.7127990722656, "logps/rejected": -191.65673828125, "loss": 0.5308, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.160595893859863, "rewards/margins": 5.674745559692383, "rewards/rejected": -11.835343360900879, "sft_loss": 0.9606292843818665, "step": 510 }, { "epoch": 0.846482577251808, "grad_norm": 12.060714182430129, "learning_rate": 2.1497695429925497e-07, "logits/chosen": 17.933076858520508, "logits/rejected": 18.939220428466797, "logps/chosen": -267.7327575683594, "logps/rejected": -197.41754150390625, "loss": 0.5127, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -5.445801258087158, "rewards/margins": 6.1840291023254395, "rewards/rejected": -11.629830360412598, "sft_loss": 0.8621335029602051, "step": 515 }, { "epoch": 0.8547008547008547, "grad_norm": 10.501846825508975, "learning_rate": 2.1418631880417954e-07, "logits/chosen": 17.952999114990234, "logits/rejected": 19.42998504638672, "logps/chosen": -270.5357360839844, "logps/rejected": -212.4191436767578, "loss": 0.5705, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -6.491232872009277, "rewards/margins": 6.157339096069336, "rewards/rejected": -12.648571968078613, "sft_loss": 1.0165194272994995, "step": 520 }, { "epoch": 0.8629191321499013, "grad_norm": 38.938347224135214, "learning_rate": 2.1338834764831843e-07, "logits/chosen": 18.03480339050293, "logits/rejected": 18.895524978637695, "logps/chosen": -288.3295593261719, "logps/rejected": -212.9174041748047, "loss": 0.5076, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.212762355804443, "rewards/margins": 6.556905746459961, "rewards/rejected": -12.769665718078613, "sft_loss": 1.0657466650009155, "step": 525 }, { "epoch": 0.871137409598948, "grad_norm": 23.662606552485556, "learning_rate": 2.125831064655693e-07, "logits/chosen": 18.570951461791992, "logits/rejected": 19.01372528076172, "logps/chosen": -299.0896911621094, "logps/rejected": -218.2689666748047, "loss": 0.4869, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.196591377258301, "rewards/margins": 6.7210693359375, "rewards/rejected": -12.9176607131958, "sft_loss": 1.0185062885284424, "step": 530 }, { "epoch": 0.8793556870479947, "grad_norm": 19.788570154737137, "learning_rate": 2.1177066148779655e-07, "logits/chosen": 18.860197067260742, "logits/rejected": 19.767044067382812, "logps/chosen": -318.2361755371094, "logps/rejected": -226.54783630371094, "loss": 0.5328, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.095911979675293, "rewards/margins": 7.498478412628174, "rewards/rejected": -13.594389915466309, "sft_loss": 0.9245139360427856, "step": 535 }, { "epoch": 0.8875739644970414, "grad_norm": 9.861201904757298, "learning_rate": 2.1095107953938348e-07, "logits/chosen": 18.201683044433594, "logits/rejected": 18.54186248779297, "logps/chosen": -252.76708984375, "logps/rejected": -189.79519653320312, "loss": 0.491, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.304187774658203, "rewards/margins": 5.595078945159912, "rewards/rejected": -11.899266242980957, "sft_loss": 1.0021482706069946, "step": 540 }, { "epoch": 0.8957922419460881, "grad_norm": 12.854026542061266, "learning_rate": 2.1012442803173634e-07, "logits/chosen": 16.392040252685547, "logits/rejected": 18.43426513671875, "logps/chosen": -268.9873962402344, "logps/rejected": -213.36622619628906, "loss": 0.452, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -6.529672622680664, "rewards/margins": 6.670236110687256, "rewards/rejected": -13.199908256530762, "sft_loss": 1.0502568483352661, "step": 545 }, { "epoch": 0.9040105193951348, "grad_norm": 14.317934082382363, "learning_rate": 2.0929077495773927e-07, "logits/chosen": 17.196094512939453, "logits/rejected": 18.512819290161133, "logps/chosen": -301.5859375, "logps/rejected": -215.9300994873047, "loss": 0.5177, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.289539813995361, "rewards/margins": 7.147468090057373, "rewards/rejected": -13.43700885772705, "sft_loss": 1.052231788635254, "step": 550 }, { "epoch": 0.9122287968441815, "grad_norm": 13.793660373919764, "learning_rate": 2.0845018888616212e-07, "logits/chosen": 17.761926651000977, "logits/rejected": 18.349868774414062, "logps/chosen": -275.8336486816406, "logps/rejected": -202.1535186767578, "loss": 0.4794, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -5.62368106842041, "rewards/margins": 6.281108856201172, "rewards/rejected": -11.904790878295898, "sft_loss": 0.9447892904281616, "step": 555 }, { "epoch": 0.9204470742932281, "grad_norm": 13.501353742225147, "learning_rate": 2.0760273895602037e-07, "logits/chosen": 17.632814407348633, "logits/rejected": 17.65854263305664, "logps/chosen": -254.25704956054688, "logps/rejected": -177.63784790039062, "loss": 0.5335, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.114619255065918, "rewards/margins": 5.592235565185547, "rewards/rejected": -10.706855773925781, "sft_loss": 0.9995157718658447, "step": 560 }, { "epoch": 0.9286653517422748, "grad_norm": 19.535542998103256, "learning_rate": 2.0674849487088864e-07, "logits/chosen": 18.379846572875977, "logits/rejected": 19.475313186645508, "logps/chosen": -249.86785888671875, "logps/rejected": -187.93824768066406, "loss": 0.5958, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.827848434448242, "rewards/margins": 5.467617034912109, "rewards/rejected": -11.295466423034668, "sft_loss": 0.9322109222412109, "step": 565 }, { "epoch": 0.9368836291913215, "grad_norm": 25.195757238729385, "learning_rate": 2.0588752689316723e-07, "logits/chosen": 18.46122169494629, "logits/rejected": 18.586881637573242, "logps/chosen": -286.5140075683594, "logps/rejected": -202.23248291015625, "loss": 0.5319, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.130897045135498, "rewards/margins": 6.1991753578186035, "rewards/rejected": -12.330072402954102, "sft_loss": 0.924500048160553, "step": 570 }, { "epoch": 0.9451019066403682, "grad_norm": 14.694663908634908, "learning_rate": 2.0501990583830315e-07, "logits/chosen": 17.5371036529541, "logits/rejected": 18.469070434570312, "logps/chosen": -274.0564270019531, "logps/rejected": -211.01268005371094, "loss": 0.4981, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.664008617401123, "rewards/margins": 6.217647552490234, "rewards/rejected": -12.8816556930542, "sft_loss": 1.0239460468292236, "step": 575 }, { "epoch": 0.9533201840894149, "grad_norm": 8.507356630817076, "learning_rate": 2.0414570306896536e-07, "logits/chosen": 17.411376953125, "logits/rejected": 18.47208023071289, "logps/chosen": -295.3019714355469, "logps/rejected": -213.13792419433594, "loss": 0.5512, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -6.6735124588012695, "rewards/margins": 6.6261305809021, "rewards/rejected": -13.299642562866211, "sft_loss": 1.529820442199707, "step": 580 }, { "epoch": 0.9615384615384616, "grad_norm": 25.681414018757476, "learning_rate": 2.0326499048917527e-07, "logits/chosen": 17.31963348388672, "logits/rejected": 18.280134201049805, "logps/chosen": -282.2524108886719, "logps/rejected": -218.47996520996094, "loss": 0.5755, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.157464981079102, "rewards/margins": 6.622015953063965, "rewards/rejected": -13.779480934143066, "sft_loss": 0.9510271549224854, "step": 585 }, { "epoch": 0.9697567389875082, "grad_norm": 13.878204470039535, "learning_rate": 2.023778405383925e-07, "logits/chosen": 18.141050338745117, "logits/rejected": 18.204177856445312, "logps/chosen": -273.6821594238281, "logps/rejected": -200.89984130859375, "loss": 0.4418, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -5.963834762573242, "rewards/margins": 6.579600811004639, "rewards/rejected": -12.543435096740723, "sft_loss": 0.9940951466560364, "step": 590 }, { "epoch": 0.9779750164365549, "grad_norm": 11.452199407752436, "learning_rate": 2.0148432618555651e-07, "logits/chosen": 18.627866744995117, "logits/rejected": 18.42972755432129, "logps/chosen": -258.9418029785156, "logps/rejected": -185.6231231689453, "loss": 0.5262, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -5.471505165100098, "rewards/margins": 5.772936820983887, "rewards/rejected": -11.244441032409668, "sft_loss": 0.9383735060691833, "step": 595 }, { "epoch": 0.9861932938856016, "grad_norm": 11.942794396918284, "learning_rate": 2.005845209230851e-07, "logits/chosen": 18.03531265258789, "logits/rejected": 18.720346450805664, "logps/chosen": -292.6284484863281, "logps/rejected": -217.44017028808594, "loss": 0.5167, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.853020191192627, "rewards/margins": 6.340816497802734, "rewards/rejected": -13.193839073181152, "sft_loss": 1.0825438499450684, "step": 600 }, { "epoch": 0.9944115713346483, "grad_norm": 11.995957867465538, "learning_rate": 1.9967849876082937e-07, "logits/chosen": 16.612958908081055, "logits/rejected": 17.676807403564453, "logps/chosen": -290.99993896484375, "logps/rejected": -217.08941650390625, "loss": 0.5367, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.961750030517578, "rewards/margins": 6.5437798500061035, "rewards/rejected": -13.505529403686523, "sft_loss": 1.0639195442199707, "step": 605 }, { "epoch": 1.0026298487836949, "grad_norm": 10.297644271924568, "learning_rate": 1.9876633421998652e-07, "logits/chosen": 17.37873649597168, "logits/rejected": 18.0369815826416, "logps/chosen": -277.8174133300781, "logps/rejected": -203.3291473388672, "loss": 0.4734, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.353253364562988, "rewards/margins": 6.258001804351807, "rewards/rejected": -12.611254692077637, "sft_loss": 0.9542250037193298, "step": 610 }, { "epoch": 1.0108481262327416, "grad_norm": 11.471429971847657, "learning_rate": 1.9784810232697024e-07, "logits/chosen": 17.6014461517334, "logits/rejected": 18.502716064453125, "logps/chosen": -295.8468017578125, "logps/rejected": -225.82949829101562, "loss": 0.4473, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -6.305618762969971, "rewards/margins": 7.557163238525391, "rewards/rejected": -13.862781524658203, "sft_loss": 0.9756129384040833, "step": 615 }, { "epoch": 1.0190664036817882, "grad_norm": 14.22423049629626, "learning_rate": 1.969238786072398e-07, "logits/chosen": 17.072832107543945, "logits/rejected": 17.857742309570312, "logps/chosen": -318.9200134277344, "logps/rejected": -236.0108184814453, "loss": 0.423, "rewards/accuracies": 1.0, "rewards/chosen": -7.207548141479492, "rewards/margins": 7.39101505279541, "rewards/rejected": -14.598563194274902, "sft_loss": 0.9570875763893127, "step": 620 }, { "epoch": 1.027284681130835, "grad_norm": 14.863752308544749, "learning_rate": 1.9599373907908803e-07, "logits/chosen": 18.62479591369629, "logits/rejected": 19.332067489624023, "logps/chosen": -311.5079650878906, "logps/rejected": -230.38861083984375, "loss": 0.4746, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.520875453948975, "rewards/margins": 7.087317943572998, "rewards/rejected": -14.608192443847656, "sft_loss": 1.0305228233337402, "step": 625 }, { "epoch": 1.0355029585798816, "grad_norm": 11.389098298703924, "learning_rate": 1.9505776024738873e-07, "logits/chosen": 17.646556854248047, "logits/rejected": 18.52758026123047, "logps/chosen": -267.45611572265625, "logps/rejected": -202.84034729003906, "loss": 0.494, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.995048999786377, "rewards/margins": 5.844033241271973, "rewards/rejected": -12.839081764221191, "sft_loss": 1.0837846994400024, "step": 630 }, { "epoch": 1.0437212360289283, "grad_norm": 17.383619355827555, "learning_rate": 1.9411601909730397e-07, "logits/chosen": 16.90384292602539, "logits/rejected": 17.69657325744629, "logps/chosen": -276.2812805175781, "logps/rejected": -210.5614471435547, "loss": 0.5568, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.192663669586182, "rewards/margins": 6.900697231292725, "rewards/rejected": -13.093358993530273, "sft_loss": 1.2382417917251587, "step": 635 }, { "epoch": 1.051939513477975, "grad_norm": 15.094044445712935, "learning_rate": 1.9316859308795215e-07, "logits/chosen": 16.81202507019043, "logits/rejected": 18.695880889892578, "logps/chosen": -257.9354553222656, "logps/rejected": -203.78866577148438, "loss": 0.5268, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.167855262756348, "rewards/margins": 6.644321441650391, "rewards/rejected": -12.812177658081055, "sft_loss": 1.173020839691162, "step": 640 }, { "epoch": 1.0601577909270217, "grad_norm": 15.863163074258626, "learning_rate": 1.9221556014603674e-07, "logits/chosen": 16.538555145263672, "logits/rejected": 18.44594955444336, "logps/chosen": -299.3294982910156, "logps/rejected": -236.79315185546875, "loss": 0.4933, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.601771354675293, "rewards/margins": 7.276884078979492, "rewards/rejected": -14.878654479980469, "sft_loss": 1.1147685050964355, "step": 645 }, { "epoch": 1.0683760683760684, "grad_norm": 12.95009158653796, "learning_rate": 1.9125699865943696e-07, "logits/chosen": 17.819013595581055, "logits/rejected": 18.056425094604492, "logps/chosen": -280.44134521484375, "logps/rejected": -211.0347900390625, "loss": 0.4992, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -6.4677534103393555, "rewards/margins": 6.797198295593262, "rewards/rejected": -13.26495361328125, "sft_loss": 1.0369815826416016, "step": 650 }, { "epoch": 1.076594345825115, "grad_norm": 9.53030890727526, "learning_rate": 1.9029298747076e-07, "logits/chosen": 18.56303596496582, "logits/rejected": 19.128713607788086, "logps/chosen": -301.52069091796875, "logps/rejected": -222.11752319335938, "loss": 0.4653, "rewards/accuracies": 1.0, "rewards/chosen": -6.511043071746826, "rewards/margins": 7.3326520919799805, "rewards/rejected": -13.843696594238281, "sft_loss": 1.039981722831726, "step": 655 }, { "epoch": 1.0848126232741617, "grad_norm": 12.49460951335956, "learning_rate": 1.893236058708565e-07, "logits/chosen": 17.331298828125, "logits/rejected": 18.1816463470459, "logps/chosen": -290.297607421875, "logps/rejected": -212.6442413330078, "loss": 0.4897, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.593270301818848, "rewards/margins": 6.5445356369018555, "rewards/rejected": -13.137805938720703, "sft_loss": 1.0305876731872559, "step": 660 }, { "epoch": 1.0930309007232084, "grad_norm": 10.084660494140396, "learning_rate": 1.8834893359229839e-07, "logits/chosen": 17.249683380126953, "logits/rejected": 18.377492904663086, "logps/chosen": -317.7668151855469, "logps/rejected": -234.8712158203125, "loss": 0.4925, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.917696952819824, "rewards/margins": 7.316926956176758, "rewards/rejected": -14.234623908996582, "sft_loss": 1.0477817058563232, "step": 665 }, { "epoch": 1.101249178172255, "grad_norm": 11.370135962731284, "learning_rate": 1.8736905080282117e-07, "logits/chosen": 17.393232345581055, "logits/rejected": 18.21647071838379, "logps/chosen": -291.6396789550781, "logps/rejected": -215.71307373046875, "loss": 0.5118, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.400353908538818, "rewards/margins": 6.503895282745361, "rewards/rejected": -12.904250144958496, "sft_loss": 1.0789752006530762, "step": 670 }, { "epoch": 1.1094674556213018, "grad_norm": 14.128398069389478, "learning_rate": 1.8638403809872988e-07, "logits/chosen": 18.000486373901367, "logits/rejected": 19.02123260498047, "logps/chosen": -238.9346923828125, "logps/rejected": -187.83901977539062, "loss": 0.4881, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -5.991827011108398, "rewards/margins": 6.166553974151611, "rewards/rejected": -12.158380508422852, "sft_loss": 1.0633037090301514, "step": 675 }, { "epoch": 1.1176857330703485, "grad_norm": 10.039232848979895, "learning_rate": 1.8539397649826993e-07, "logits/chosen": 17.416231155395508, "logits/rejected": 18.53554344177246, "logps/chosen": -271.6786193847656, "logps/rejected": -208.55459594726562, "loss": 0.4408, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.946457386016846, "rewards/margins": 6.493756294250488, "rewards/rejected": -13.440213203430176, "sft_loss": 1.0465832948684692, "step": 680 }, { "epoch": 1.1259040105193951, "grad_norm": 17.7290983481912, "learning_rate": 1.8439894743496336e-07, "logits/chosen": 17.006452560424805, "logits/rejected": 17.804595947265625, "logps/chosen": -289.0384826660156, "logps/rejected": -228.98916625976562, "loss": 0.464, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.2547478675842285, "rewards/margins": 7.524634838104248, "rewards/rejected": -14.779382705688477, "sft_loss": 1.0623209476470947, "step": 685 }, { "epoch": 1.1341222879684418, "grad_norm": 15.995020113178853, "learning_rate": 1.8339903275091085e-07, "logits/chosen": 17.363964080810547, "logits/rejected": 18.096250534057617, "logps/chosen": -313.4389343261719, "logps/rejected": -239.9541015625, "loss": 0.4292, "rewards/accuracies": 1.0, "rewards/chosen": -7.249270439147949, "rewards/margins": 7.737963676452637, "rewards/rejected": -14.987234115600586, "sft_loss": 1.1172467470169067, "step": 690 }, { "epoch": 1.1423405654174885, "grad_norm": 10.290356468777885, "learning_rate": 1.8239431469006e-07, "logits/chosen": 16.6265811920166, "logits/rejected": 18.333799362182617, "logps/chosen": -268.6365966796875, "logps/rejected": -221.0557098388672, "loss": 0.4627, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -6.95206356048584, "rewards/margins": 7.242475986480713, "rewards/rejected": -14.194538116455078, "sft_loss": 1.2080581188201904, "step": 695 }, { "epoch": 1.1505588428665352, "grad_norm": 12.079608347733119, "learning_rate": 1.8138487589144093e-07, "logits/chosen": 16.631559371948242, "logits/rejected": 16.87362289428711, "logps/chosen": -273.40997314453125, "logps/rejected": -210.4160614013672, "loss": 0.5063, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.272107124328613, "rewards/margins": 6.501527786254883, "rewards/rejected": -13.77363395690918, "sft_loss": 1.0478310585021973, "step": 700 }, { "epoch": 1.1587771203155819, "grad_norm": 17.778097749378432, "learning_rate": 1.8037079938236894e-07, "logits/chosen": 17.234224319458008, "logits/rejected": 18.432863235473633, "logps/chosen": -281.38458251953125, "logps/rejected": -223.9882049560547, "loss": 0.4823, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.636561870574951, "rewards/margins": 7.072784423828125, "rewards/rejected": -14.709345817565918, "sft_loss": 0.9729472398757935, "step": 705 }, { "epoch": 1.1669953977646286, "grad_norm": 13.760102505142987, "learning_rate": 1.793521685716154e-07, "logits/chosen": 17.158409118652344, "logits/rejected": 18.147829055786133, "logps/chosen": -339.1050720214844, "logps/rejected": -257.5541687011719, "loss": 0.4268, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.170562744140625, "rewards/margins": 8.136800765991211, "rewards/rejected": -16.307363510131836, "sft_loss": 1.087196946144104, "step": 710 }, { "epoch": 1.1752136752136753, "grad_norm": 12.543576537196508, "learning_rate": 1.7832906724254747e-07, "logits/chosen": 16.710582733154297, "logits/rejected": 17.746997833251953, "logps/chosen": -279.0878601074219, "logps/rejected": -217.86927795410156, "loss": 0.4347, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.324019908905029, "rewards/margins": 6.887091636657715, "rewards/rejected": -14.211112022399902, "sft_loss": 1.0954669713974, "step": 715 }, { "epoch": 1.183431952662722, "grad_norm": 14.156394204679035, "learning_rate": 1.7730157954623685e-07, "logits/chosen": 17.9290828704834, "logits/rejected": 17.706289291381836, "logps/chosen": -284.99176025390625, "logps/rejected": -210.2812957763672, "loss": 0.5001, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.146309852600098, "rewards/margins": 6.681734085083008, "rewards/rejected": -13.828044891357422, "sft_loss": 1.0680426359176636, "step": 720 }, { "epoch": 1.1916502301117686, "grad_norm": 12.575179703681824, "learning_rate": 1.7626978999453794e-07, "logits/chosen": 17.4116268157959, "logits/rejected": 17.362062454223633, "logps/chosen": -319.6551818847656, "logps/rejected": -242.6376495361328, "loss": 0.3929, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.572165012359619, "rewards/margins": 7.830206871032715, "rewards/rejected": -15.402371406555176, "sft_loss": 1.0497316122055054, "step": 725 }, { "epoch": 1.1998685075608153, "grad_norm": 9.969097695004054, "learning_rate": 1.7523378345313714e-07, "logits/chosen": 17.700010299682617, "logits/rejected": 18.3839168548584, "logps/chosen": -291.83917236328125, "logps/rejected": -215.37081909179688, "loss": 0.5242, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.1273322105407715, "rewards/margins": 6.290266036987305, "rewards/rejected": -13.417597770690918, "sft_loss": 1.382573127746582, "step": 730 }, { "epoch": 1.208086785009862, "grad_norm": 17.17576749860381, "learning_rate": 1.741936451345722e-07, "logits/chosen": 18.578615188598633, "logits/rejected": 19.108678817749023, "logps/chosen": -271.18505859375, "logps/rejected": -205.25746154785156, "loss": 0.4562, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -6.272554397583008, "rewards/margins": 6.781675815582275, "rewards/rejected": -13.054230690002441, "sft_loss": 1.151402473449707, "step": 735 }, { "epoch": 1.2163050624589087, "grad_norm": 17.314304500732653, "learning_rate": 1.731494605912235e-07, "logits/chosen": 17.34149932861328, "logits/rejected": 18.757190704345703, "logps/chosen": -262.0509948730469, "logps/rejected": -208.38226318359375, "loss": 0.4598, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -6.2556867599487305, "rewards/margins": 6.655214309692383, "rewards/rejected": -12.910900115966797, "sft_loss": 1.0516655445098877, "step": 740 }, { "epoch": 1.2245233399079554, "grad_norm": 15.379389005940164, "learning_rate": 1.721013157082774e-07, "logits/chosen": 16.926176071166992, "logits/rejected": 18.068889617919922, "logps/chosen": -276.72833251953125, "logps/rejected": -224.33856201171875, "loss": 0.4921, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.5205397605896, "rewards/margins": 6.801075458526611, "rewards/rejected": -14.321615219116211, "sft_loss": 1.0424396991729736, "step": 745 }, { "epoch": 1.232741617357002, "grad_norm": 16.009052812361457, "learning_rate": 1.7104929669666194e-07, "logits/chosen": 16.49311065673828, "logits/rejected": 17.206867218017578, "logps/chosen": -299.70855712890625, "logps/rejected": -234.7362060546875, "loss": 0.5132, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.55043888092041, "rewards/margins": 7.260469436645508, "rewards/rejected": -14.810908317565918, "sft_loss": 1.148091197013855, "step": 750 }, { "epoch": 1.2409598948060487, "grad_norm": 12.479892072042215, "learning_rate": 1.69993490085956e-07, "logits/chosen": 16.645790100097656, "logits/rejected": 18.348690032958984, "logps/chosen": -289.54217529296875, "logps/rejected": -232.9552001953125, "loss": 0.4746, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.2228593826293945, "rewards/margins": 7.266669273376465, "rewards/rejected": -14.48952865600586, "sft_loss": 1.0830727815628052, "step": 755 }, { "epoch": 1.2491781722550954, "grad_norm": 13.701336630947893, "learning_rate": 1.6893398271727222e-07, "logits/chosen": 17.36661148071289, "logits/rejected": 18.305465698242188, "logps/chosen": -300.6762390136719, "logps/rejected": -228.61175537109375, "loss": 0.4574, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.493809223175049, "rewards/margins": 7.260177135467529, "rewards/rejected": -14.753986358642578, "sft_loss": 1.016793966293335, "step": 760 }, { "epoch": 1.2573964497041419, "grad_norm": 10.12301776047569, "learning_rate": 1.6787086173611407e-07, "logits/chosen": 17.593551635742188, "logits/rejected": 18.34381675720215, "logps/chosen": -280.0817565917969, "logps/rejected": -211.71542358398438, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": -7.457971096038818, "rewards/margins": 6.6875996589660645, "rewards/rejected": -14.1455717086792, "sft_loss": 1.0228469371795654, "step": 765 }, { "epoch": 1.2656147271531886, "grad_norm": 7.684067785358655, "learning_rate": 1.6680421458520813e-07, "logits/chosen": 18.189321517944336, "logits/rejected": 18.308818817138672, "logps/chosen": -280.6365966796875, "logps/rejected": -212.9956817626953, "loss": 0.4905, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.9928879737854, "rewards/margins": 6.62729024887085, "rewards/rejected": -13.62017822265625, "sft_loss": 1.4820358753204346, "step": 770 }, { "epoch": 1.2738330046022353, "grad_norm": 12.91245370337745, "learning_rate": 1.6573412899731187e-07, "logits/chosen": 17.40738868713379, "logits/rejected": 18.874313354492188, "logps/chosen": -299.2168884277344, "logps/rejected": -221.5058135986328, "loss": 0.4091, "rewards/accuracies": 1.0, "rewards/chosen": -6.799927234649658, "rewards/margins": 6.812719821929932, "rewards/rejected": -13.612646102905273, "sft_loss": 1.1041682958602905, "step": 775 }, { "epoch": 1.282051282051282, "grad_norm": 10.002770129869452, "learning_rate": 1.646606929879975e-07, "logits/chosen": 18.40058135986328, "logits/rejected": 19.07294273376465, "logps/chosen": -323.3199157714844, "logps/rejected": -239.97935485839844, "loss": 0.4266, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.476480484008789, "rewards/margins": 8.036779403686523, "rewards/rejected": -15.513258934020996, "sft_loss": 1.0359128713607788, "step": 780 }, { "epoch": 1.2902695595003286, "grad_norm": 13.874094233494837, "learning_rate": 1.6358399484841268e-07, "logits/chosen": 16.465330123901367, "logits/rejected": 17.001684188842773, "logps/chosen": -302.719482421875, "logps/rejected": -224.98745727539062, "loss": 0.5129, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.293752670288086, "rewards/margins": 7.167456150054932, "rewards/rejected": -14.46120834350586, "sft_loss": 1.1338067054748535, "step": 785 }, { "epoch": 1.2984878369493753, "grad_norm": 16.794137790287348, "learning_rate": 1.625041231380184e-07, "logits/chosen": 16.809955596923828, "logits/rejected": 18.395627975463867, "logps/chosen": -310.674560546875, "logps/rejected": -239.32200622558594, "loss": 0.4581, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.000899791717529, "rewards/margins": 7.625972747802734, "rewards/rejected": -14.626873016357422, "sft_loss": 0.9849548935890198, "step": 790 }, { "epoch": 1.306706114398422, "grad_norm": 12.439364730991043, "learning_rate": 1.6142116667730482e-07, "logits/chosen": 19.75507164001465, "logits/rejected": 20.32160758972168, "logps/chosen": -293.4500732421875, "logps/rejected": -214.4062042236328, "loss": 0.4713, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.081357955932617, "rewards/margins": 7.148606777191162, "rewards/rejected": -13.229966163635254, "sft_loss": 0.9287933111190796, "step": 795 }, { "epoch": 1.3149243918474687, "grad_norm": 11.945683940407063, "learning_rate": 1.6033521454048597e-07, "logits/chosen": 18.249954223632812, "logits/rejected": 19.019634246826172, "logps/chosen": -271.8877258300781, "logps/rejected": -217.09132385253906, "loss": 0.4673, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -6.703191757202148, "rewards/margins": 7.068259239196777, "rewards/rejected": -13.77145004272461, "sft_loss": 1.0365476608276367, "step": 800 }, { "epoch": 1.3231426692965154, "grad_norm": 10.191092591520466, "learning_rate": 1.5924635604817306e-07, "logits/chosen": 17.222694396972656, "logits/rejected": 18.468660354614258, "logps/chosen": -288.8092041015625, "logps/rejected": -236.29319763183594, "loss": 0.4065, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.580431938171387, "rewards/margins": 7.9504780769348145, "rewards/rejected": -15.530909538269043, "sft_loss": 1.162276268005371, "step": 805 }, { "epoch": 1.331360946745562, "grad_norm": 9.751260919138856, "learning_rate": 1.5815468076002771e-07, "logits/chosen": 16.873342514038086, "logits/rejected": 18.183860778808594, "logps/chosen": -312.6845397949219, "logps/rejected": -240.49859619140625, "loss": 0.429, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.772741794586182, "rewards/margins": 8.080373764038086, "rewards/rejected": -15.853116035461426, "sft_loss": 0.9787502288818359, "step": 810 }, { "epoch": 1.3395792241946087, "grad_norm": 13.966159704549986, "learning_rate": 1.5706027846739588e-07, "logits/chosen": 17.78404426574707, "logits/rejected": 18.716482162475586, "logps/chosen": -265.793701171875, "logps/rejected": -212.49057006835938, "loss": 0.4521, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -6.772706985473633, "rewards/margins": 6.92323112487793, "rewards/rejected": -13.695940017700195, "sft_loss": 1.0237793922424316, "step": 815 }, { "epoch": 1.3477975016436554, "grad_norm": 32.697820524211366, "learning_rate": 1.5596323918592227e-07, "logits/chosen": 18.034412384033203, "logits/rejected": 18.671672821044922, "logps/chosen": -253.35609436035156, "logps/rejected": -206.98895263671875, "loss": 0.4833, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -7.246993541717529, "rewards/margins": 6.500965595245361, "rewards/rejected": -13.74795913696289, "sft_loss": 1.0642235279083252, "step": 820 }, { "epoch": 1.356015779092702, "grad_norm": 12.398186085004639, "learning_rate": 1.5486365314814637e-07, "logits/chosen": 17.62421226501465, "logits/rejected": 18.33708953857422, "logps/chosen": -292.3586120605469, "logps/rejected": -230.61155700683594, "loss": 0.4084, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.588433742523193, "rewards/margins": 7.831187725067139, "rewards/rejected": -15.4196195602417, "sft_loss": 1.0407756567001343, "step": 825 }, { "epoch": 1.3642340565417488, "grad_norm": 12.166605913363364, "learning_rate": 1.5376161079608088e-07, "logits/chosen": 17.150541305541992, "logits/rejected": 18.62920379638672, "logps/chosen": -296.70465087890625, "logps/rejected": -242.9381866455078, "loss": 0.46, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.590549945831299, "rewards/margins": 8.183311462402344, "rewards/rejected": -15.773859977722168, "sft_loss": 1.191388487815857, "step": 830 }, { "epoch": 1.3724523339907955, "grad_norm": 10.880603493347238, "learning_rate": 1.5265720277377273e-07, "logits/chosen": 17.14630889892578, "logits/rejected": 19.08263397216797, "logps/chosen": -288.0076904296875, "logps/rejected": -237.15341186523438, "loss": 0.4435, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.4387054443359375, "rewards/margins": 7.707547664642334, "rewards/rejected": -15.146254539489746, "sft_loss": 1.0695911645889282, "step": 835 }, { "epoch": 1.3806706114398422, "grad_norm": 50.18720477246092, "learning_rate": 1.5155051991984745e-07, "logits/chosen": 18.334110260009766, "logits/rejected": 18.69322967529297, "logps/chosen": -315.9974365234375, "logps/rejected": -228.48602294921875, "loss": 0.4849, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.224093437194824, "rewards/margins": 7.033995151519775, "rewards/rejected": -14.258088111877441, "sft_loss": 0.9990159869194031, "step": 840 }, { "epoch": 1.3888888888888888, "grad_norm": 8.437783211213006, "learning_rate": 1.504416532600378e-07, "logits/chosen": 17.403743743896484, "logits/rejected": 18.235454559326172, "logps/chosen": -242.6099853515625, "logps/rejected": -199.91429138183594, "loss": 0.4367, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.768044948577881, "rewards/margins": 6.265518665313721, "rewards/rejected": -13.033564567565918, "sft_loss": 1.0013427734375, "step": 845 }, { "epoch": 1.3971071663379355, "grad_norm": 14.969642809049821, "learning_rate": 1.4933069399969653e-07, "logits/chosen": 17.80324935913086, "logits/rejected": 18.639148712158203, "logps/chosen": -272.4168395996094, "logps/rejected": -217.99310302734375, "loss": 0.4617, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.3702874183654785, "rewards/margins": 6.988955020904541, "rewards/rejected": -14.359243392944336, "sft_loss": 1.1217681169509888, "step": 850 }, { "epoch": 1.4053254437869822, "grad_norm": 14.289009158482923, "learning_rate": 1.4821773351629487e-07, "logits/chosen": 18.467451095581055, "logits/rejected": 19.347543716430664, "logps/chosen": -302.4975280761719, "logps/rejected": -243.9453125, "loss": 0.4132, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.117691040039062, "rewards/margins": 8.244772911071777, "rewards/rejected": -16.362462997436523, "sft_loss": 1.1255364418029785, "step": 855 }, { "epoch": 1.413543721236029, "grad_norm": 10.706706272611981, "learning_rate": 1.4710286335190664e-07, "logits/chosen": 18.262802124023438, "logits/rejected": 18.210296630859375, "logps/chosen": -306.64691162109375, "logps/rejected": -234.53460693359375, "loss": 0.4363, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.4043498039245605, "rewards/margins": 7.886282920837402, "rewards/rejected": -15.290633201599121, "sft_loss": 1.080936074256897, "step": 860 }, { "epoch": 1.4217619986850756, "grad_norm": 13.539503399960063, "learning_rate": 1.4598617520567863e-07, "logits/chosen": 18.688413619995117, "logits/rejected": 19.166378021240234, "logps/chosen": -295.90008544921875, "logps/rejected": -231.57505798339844, "loss": 0.4445, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.257371425628662, "rewards/margins": 7.788801193237305, "rewards/rejected": -15.046174049377441, "sft_loss": 1.04954195022583, "step": 865 }, { "epoch": 1.4299802761341223, "grad_norm": 20.41588952283392, "learning_rate": 1.448677609262885e-07, "logits/chosen": 17.124914169311523, "logits/rejected": 18.068174362182617, "logps/chosen": -291.83245849609375, "logps/rejected": -229.2489776611328, "loss": 0.4916, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -7.751894474029541, "rewards/margins": 7.248422145843506, "rewards/rejected": -15.000316619873047, "sft_loss": 1.1058861017227173, "step": 870 }, { "epoch": 1.438198553583169, "grad_norm": 10.416378982514427, "learning_rate": 1.4374771250438997e-07, "logits/chosen": 17.683748245239258, "logits/rejected": 18.105945587158203, "logps/chosen": -338.9434814453125, "logps/rejected": -252.90367126464844, "loss": 0.353, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.734278678894043, "rewards/margins": 8.11069107055664, "rewards/rejected": -16.844970703125, "sft_loss": 1.1128793954849243, "step": 875 }, { "epoch": 1.4464168310322156, "grad_norm": 15.631489594193368, "learning_rate": 1.4262612206504653e-07, "logits/chosen": 19.22788429260254, "logits/rejected": 18.560340881347656, "logps/chosen": -288.2774658203125, "logps/rejected": -221.1851806640625, "loss": 0.4398, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.204787254333496, "rewards/margins": 6.889291763305664, "rewards/rejected": -15.094079971313477, "sft_loss": 1.0347801446914673, "step": 880 }, { "epoch": 1.4546351084812623, "grad_norm": 22.470025016143673, "learning_rate": 1.4150308186015428e-07, "logits/chosen": 18.78541374206543, "logits/rejected": 19.072355270385742, "logps/chosen": -266.7073669433594, "logps/rejected": -214.3734130859375, "loss": 0.4864, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.12351131439209, "rewards/margins": 7.012777328491211, "rewards/rejected": -14.1362886428833, "sft_loss": 1.0819884538650513, "step": 885 }, { "epoch": 1.462853385930309, "grad_norm": 11.047306179137715, "learning_rate": 1.4037868426085368e-07, "logits/chosen": 17.600828170776367, "logits/rejected": 17.870738983154297, "logps/chosen": -321.2472229003906, "logps/rejected": -237.96395874023438, "loss": 0.4823, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.274439811706543, "rewards/margins": 8.21683120727539, "rewards/rejected": -15.49127197265625, "sft_loss": 1.1358665227890015, "step": 890 }, { "epoch": 1.4710716633793557, "grad_norm": 9.894309836137355, "learning_rate": 1.3925302174993233e-07, "logits/chosen": 16.768348693847656, "logits/rejected": 18.076475143432617, "logps/chosen": -295.2914123535156, "logps/rejected": -222.6123504638672, "loss": 0.4288, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -6.861530780792236, "rewards/margins": 7.223613262176514, "rewards/rejected": -14.085144996643066, "sft_loss": 0.9808722734451294, "step": 895 }, { "epoch": 1.4792899408284024, "grad_norm": 15.122256978486702, "learning_rate": 1.3812618691421803e-07, "logits/chosen": 17.618257522583008, "logits/rejected": 18.547971725463867, "logps/chosen": -307.7926025390625, "logps/rejected": -228.6370849609375, "loss": 0.4755, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.855221748352051, "rewards/margins": 7.493732929229736, "rewards/rejected": -14.348955154418945, "sft_loss": 0.975628137588501, "step": 900 }, { "epoch": 1.487508218277449, "grad_norm": 14.990640701163656, "learning_rate": 1.3699827243696336e-07, "logits/chosen": 17.19367027282715, "logits/rejected": 18.374305725097656, "logps/chosen": -286.5935363769531, "logps/rejected": -236.76593017578125, "loss": 0.4732, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -7.718534469604492, "rewards/margins": 7.860580921173096, "rewards/rejected": -15.57911491394043, "sft_loss": 1.1146594285964966, "step": 905 }, { "epoch": 1.4957264957264957, "grad_norm": 10.50314444472379, "learning_rate": 1.3586937109022251e-07, "logits/chosen": 16.421382904052734, "logits/rejected": 17.77210235595703, "logps/chosen": -324.25927734375, "logps/rejected": -260.9275207519531, "loss": 0.4663, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.703363418579102, "rewards/margins": 8.462730407714844, "rewards/rejected": -17.166095733642578, "sft_loss": 1.0979522466659546, "step": 910 }, { "epoch": 1.5039447731755424, "grad_norm": 16.690789592498312, "learning_rate": 1.347395757272207e-07, "logits/chosen": 19.563251495361328, "logits/rejected": 19.970426559448242, "logps/chosen": -271.6186218261719, "logps/rejected": -212.50277709960938, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": -6.6580634117126465, "rewards/margins": 7.265621185302734, "rewards/rejected": -13.923684120178223, "sft_loss": 1.0007566213607788, "step": 915 }, { "epoch": 1.5121630506245891, "grad_norm": 21.799881591539336, "learning_rate": 1.3360897927471668e-07, "logits/chosen": 18.252246856689453, "logits/rejected": 18.873050689697266, "logps/chosen": -278.3526611328125, "logps/rejected": -221.5440216064453, "loss": 0.4632, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -7.180948257446289, "rewards/margins": 7.29295539855957, "rewards/rejected": -14.473901748657227, "sft_loss": 1.0442688465118408, "step": 920 }, { "epoch": 1.5203813280736358, "grad_norm": 10.712033452260947, "learning_rate": 1.3247767472535972e-07, "logits/chosen": 18.07443618774414, "logits/rejected": 19.142240524291992, "logps/chosen": -294.86700439453125, "logps/rejected": -238.5161895751953, "loss": 0.4686, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.611084461212158, "rewards/margins": 8.040576934814453, "rewards/rejected": -15.651662826538086, "sft_loss": 1.0576171875, "step": 925 }, { "epoch": 1.5285996055226825, "grad_norm": 7.019511894014553, "learning_rate": 1.3134575513004073e-07, "logits/chosen": 18.114564895629883, "logits/rejected": 18.515487670898438, "logps/chosen": -303.06329345703125, "logps/rejected": -237.0087432861328, "loss": 0.3908, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.551575183868408, "rewards/margins": 7.9892473220825195, "rewards/rejected": -15.540822982788086, "sft_loss": 1.048262119293213, "step": 930 }, { "epoch": 1.5368178829717292, "grad_norm": 14.009760349607332, "learning_rate": 1.3021331359023874e-07, "logits/chosen": 17.101354598999023, "logits/rejected": 18.246139526367188, "logps/chosen": -310.4385070800781, "logps/rejected": -244.6991424560547, "loss": 0.4262, "rewards/accuracies": 1.0, "rewards/chosen": -7.569284439086914, "rewards/margins": 8.347086906433105, "rewards/rejected": -15.916370391845703, "sft_loss": 1.0606290102005005, "step": 935 }, { "epoch": 1.5450361604207759, "grad_norm": 15.650861724973655, "learning_rate": 1.2908044325036312e-07, "logits/chosen": 17.97089195251465, "logits/rejected": 18.223573684692383, "logps/chosen": -296.1282958984375, "logps/rejected": -233.69146728515625, "loss": 0.4616, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.757159233093262, "rewards/margins": 7.639113903045654, "rewards/rejected": -15.396271705627441, "sft_loss": 1.138619065284729, "step": 940 }, { "epoch": 1.5532544378698225, "grad_norm": 17.515447400155715, "learning_rate": 1.2794723729009255e-07, "logits/chosen": 16.958641052246094, "logits/rejected": 18.472318649291992, "logps/chosen": -298.9012756347656, "logps/rejected": -239.90469360351562, "loss": 0.4502, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.437976837158203, "rewards/margins": 8.138365745544434, "rewards/rejected": -15.576342582702637, "sft_loss": 1.0626742839813232, "step": 945 }, { "epoch": 1.5614727153188692, "grad_norm": 45.641039135520685, "learning_rate": 1.2681378891671082e-07, "logits/chosen": 17.490928649902344, "logits/rejected": 17.976585388183594, "logps/chosen": -306.0874328613281, "logps/rejected": -237.03607177734375, "loss": 0.4737, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.880832672119141, "rewards/margins": 7.584968090057373, "rewards/rejected": -15.465802192687988, "sft_loss": 1.0900439023971558, "step": 950 }, { "epoch": 1.569690992767916, "grad_norm": 19.898061737121086, "learning_rate": 1.2568019135744044e-07, "logits/chosen": 16.957841873168945, "logits/rejected": 17.985727310180664, "logps/chosen": -291.70135498046875, "logps/rejected": -229.38314819335938, "loss": 0.4349, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.644362926483154, "rewards/margins": 7.429901123046875, "rewards/rejected": -15.074263572692871, "sft_loss": 1.0944395065307617, "step": 955 }, { "epoch": 1.5779092702169626, "grad_norm": 12.39680434949017, "learning_rate": 1.2454653785177445e-07, "logits/chosen": 17.493330001831055, "logits/rejected": 18.42995834350586, "logps/chosen": -278.9170837402344, "logps/rejected": -230.72608947753906, "loss": 0.4231, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.324814319610596, "rewards/margins": 7.593767166137695, "rewards/rejected": -14.918582916259766, "sft_loss": 1.0732117891311646, "step": 960 }, { "epoch": 1.5861275476660093, "grad_norm": 21.306042868258853, "learning_rate": 1.2341292164380783e-07, "logits/chosen": 18.833568572998047, "logits/rejected": 18.869935989379883, "logps/chosen": -286.1907653808594, "logps/rejected": -224.49281311035156, "loss": 0.4817, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -7.71510124206543, "rewards/margins": 7.221285820007324, "rewards/rejected": -14.936385154724121, "sft_loss": 1.3040668964385986, "step": 965 }, { "epoch": 1.594345825115056, "grad_norm": 14.69263028616145, "learning_rate": 1.222794359745675e-07, "logits/chosen": 16.27896499633789, "logits/rejected": 18.376323699951172, "logps/chosen": -300.5797424316406, "logps/rejected": -242.6514129638672, "loss": 0.4114, "rewards/accuracies": 1.0, "rewards/chosen": -7.52255392074585, "rewards/margins": 8.169685363769531, "rewards/rejected": -15.692238807678223, "sft_loss": 1.0308858156204224, "step": 970 }, { "epoch": 1.6025641025641026, "grad_norm": 13.802476438483277, "learning_rate": 1.2114617407434354e-07, "logits/chosen": 18.055139541625977, "logits/rejected": 19.250368118286133, "logps/chosen": -309.2381286621094, "logps/rejected": -245.81809997558594, "loss": 0.4326, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.345053672790527, "rewards/margins": 8.126486778259277, "rewards/rejected": -15.471541404724121, "sft_loss": 1.123140811920166, "step": 975 }, { "epoch": 1.6107823800131493, "grad_norm": 10.423391619330996, "learning_rate": 1.2001322915502091e-07, "logits/chosen": 16.897199630737305, "logits/rejected": 18.748310089111328, "logps/chosen": -292.1817932128906, "logps/rejected": -235.8812255859375, "loss": 0.3942, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.173976898193359, "rewards/margins": 7.872208118438721, "rewards/rejected": -15.046185493469238, "sft_loss": 1.1811002492904663, "step": 980 }, { "epoch": 1.619000657462196, "grad_norm": 26.973905524007105, "learning_rate": 1.1888069440241243e-07, "logits/chosen": 18.107698440551758, "logits/rejected": 19.736108779907227, "logps/chosen": -317.0016174316406, "logps/rejected": -252.54832458496094, "loss": 0.4222, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.532571792602539, "rewards/margins": 9.049071311950684, "rewards/rejected": -16.581642150878906, "sft_loss": 1.075319766998291, "step": 985 }, { "epoch": 1.6272189349112427, "grad_norm": 16.255178289646476, "learning_rate": 1.1774866296859448e-07, "logits/chosen": 17.9573917388916, "logits/rejected": 19.03142738342285, "logps/chosen": -301.56561279296875, "logps/rejected": -243.9299774169922, "loss": 0.4749, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.284952163696289, "rewards/margins": 8.675047874450684, "rewards/rejected": -15.960000038146973, "sft_loss": 1.1328290700912476, "step": 990 }, { "epoch": 1.6354372123602894, "grad_norm": 10.065426351498546, "learning_rate": 1.1661722796424478e-07, "logits/chosen": 17.292905807495117, "logits/rejected": 18.3796443939209, "logps/chosen": -309.9263000488281, "logps/rejected": -241.42181396484375, "loss": 0.4268, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.671374320983887, "rewards/margins": 8.211640357971191, "rewards/rejected": -15.883017539978027, "sft_loss": 1.0408843755722046, "step": 995 }, { "epoch": 1.643655489809336, "grad_norm": 12.50718545323396, "learning_rate": 1.1548648245098432e-07, "logits/chosen": 17.582983016967773, "logits/rejected": 18.472742080688477, "logps/chosen": -319.5430908203125, "logps/rejected": -253.3585968017578, "loss": 0.4368, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.135196685791016, "rewards/margins": 8.56678295135498, "rewards/rejected": -16.701980590820312, "sft_loss": 1.121424674987793, "step": 1000 }, { "epoch": 1.6518737672583828, "grad_norm": 9.456497156444888, "learning_rate": 1.1435651943372278e-07, "logits/chosen": 16.574844360351562, "logits/rejected": 17.709199905395508, "logps/chosen": -286.1977844238281, "logps/rejected": -229.33741760253906, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": -8.042440414428711, "rewards/margins": 7.619970798492432, "rewards/rejected": -15.662409782409668, "sft_loss": 1.1242254972457886, "step": 1005 }, { "epoch": 1.6600920447074294, "grad_norm": 12.581807587635986, "learning_rate": 1.1322743185300865e-07, "logits/chosen": 17.700603485107422, "logits/rejected": 19.024187088012695, "logps/chosen": -296.780029296875, "logps/rejected": -233.88160705566406, "loss": 0.4889, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -7.796105861663818, "rewards/margins": 7.478055953979492, "rewards/rejected": -15.274161338806152, "sft_loss": 1.075081467628479, "step": 1010 }, { "epoch": 1.6683103221564761, "grad_norm": 14.09597654178517, "learning_rate": 1.1209931257738503e-07, "logits/chosen": 17.260271072387695, "logits/rejected": 18.022357940673828, "logps/chosen": -306.3436584472656, "logps/rejected": -227.7841339111328, "loss": 0.4487, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -6.75, "rewards/margins": 7.679973602294922, "rewards/rejected": -14.429974555969238, "sft_loss": 1.1023831367492676, "step": 1015 }, { "epoch": 1.6765285996055228, "grad_norm": 10.14530298124155, "learning_rate": 1.1097225439575096e-07, "logits/chosen": 16.790157318115234, "logits/rejected": 17.936586380004883, "logps/chosen": -274.2288818359375, "logps/rejected": -220.5703125, "loss": 0.4648, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.9578022956848145, "rewards/margins": 7.266170501708984, "rewards/rejected": -14.22397232055664, "sft_loss": 1.0298852920532227, "step": 1020 }, { "epoch": 1.6847468770545695, "grad_norm": 14.64734935061402, "learning_rate": 1.0984635000972946e-07, "logits/chosen": 16.42229461669922, "logits/rejected": 17.54804229736328, "logps/chosen": -277.86077880859375, "logps/rejected": -223.43917846679688, "loss": 0.5101, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.510883808135986, "rewards/margins": 7.296814441680908, "rewards/rejected": -14.807699203491211, "sft_loss": 1.089572548866272, "step": 1025 }, { "epoch": 1.6929651545036162, "grad_norm": 14.998745686830942, "learning_rate": 1.0872169202604284e-07, "logits/chosen": 17.45005226135254, "logits/rejected": 18.329872131347656, "logps/chosen": -335.4214782714844, "logps/rejected": -264.5696105957031, "loss": 0.4259, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.625652313232422, "rewards/margins": 8.4821138381958, "rewards/rejected": -17.107765197753906, "sft_loss": 1.1337147951126099, "step": 1030 }, { "epoch": 1.7011834319526629, "grad_norm": 15.126502195785678, "learning_rate": 1.0759837294889546e-07, "logits/chosen": 15.89870834350586, "logits/rejected": 17.66954803466797, "logps/chosen": -324.4315185546875, "logps/rejected": -251.8769073486328, "loss": 0.4365, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.891256332397461, "rewards/margins": 8.40850830078125, "rewards/rejected": -16.299766540527344, "sft_loss": 1.0551294088363647, "step": 1035 }, { "epoch": 1.7094017094017095, "grad_norm": 11.887438634341896, "learning_rate": 1.0647648517236547e-07, "logits/chosen": 17.808908462524414, "logits/rejected": 17.868276596069336, "logps/chosen": -318.5857849121094, "logps/rejected": -237.06268310546875, "loss": 0.4077, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.441680431365967, "rewards/margins": 7.951440811157227, "rewards/rejected": -15.393121719360352, "sft_loss": 1.0577045679092407, "step": 1040 }, { "epoch": 1.7176199868507562, "grad_norm": 13.592964221155555, "learning_rate": 1.0535612097280505e-07, "logits/chosen": 17.357389450073242, "logits/rejected": 18.236921310424805, "logps/chosen": -309.05316162109375, "logps/rejected": -234.39718627929688, "loss": 0.4578, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.402204513549805, "rewards/margins": 7.640995025634766, "rewards/rejected": -15.043200492858887, "sft_loss": 1.1290278434753418, "step": 1045 }, { "epoch": 1.725838264299803, "grad_norm": 17.516227986033588, "learning_rate": 1.042373725012508e-07, "logits/chosen": 15.968868255615234, "logits/rejected": 17.182361602783203, "logps/chosen": -277.1082763671875, "logps/rejected": -217.5791778564453, "loss": 0.4706, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.2921223640441895, "rewards/margins": 7.116176605224609, "rewards/rejected": -14.40829849243164, "sft_loss": 1.1019597053527832, "step": 1050 }, { "epoch": 1.7340565417488496, "grad_norm": 14.545988790543376, "learning_rate": 1.0312033177584409e-07, "logits/chosen": 18.982242584228516, "logits/rejected": 18.7514705657959, "logps/chosen": -293.9178466796875, "logps/rejected": -226.5133819580078, "loss": 0.3922, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.807718276977539, "rewards/margins": 7.406096935272217, "rewards/rejected": -15.213815689086914, "sft_loss": 1.0929393768310547, "step": 1055 }, { "epoch": 1.7422748191978963, "grad_norm": 10.680737229216966, "learning_rate": 1.0200509067426243e-07, "logits/chosen": 16.079814910888672, "logits/rejected": 17.51044273376465, "logps/chosen": -302.1490173339844, "logps/rejected": -233.8198699951172, "loss": 0.444, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.101183891296387, "rewards/margins": 7.351180553436279, "rewards/rejected": -15.452364921569824, "sft_loss": 1.2096168994903564, "step": 1060 }, { "epoch": 1.7504930966469427, "grad_norm": 9.891781648367795, "learning_rate": 1.0089174092616271e-07, "logits/chosen": 17.791248321533203, "logits/rejected": 18.2585506439209, "logps/chosen": -280.9420166015625, "logps/rejected": -224.9687957763672, "loss": 0.4607, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -7.537823677062988, "rewards/margins": 7.212753772735596, "rewards/rejected": -14.750576972961426, "sft_loss": 1.0387908220291138, "step": 1065 }, { "epoch": 1.7587113740959894, "grad_norm": 18.289134457763506, "learning_rate": 9.97803741056361e-08, "logits/chosen": 16.976699829101562, "logits/rejected": 17.30523109436035, "logps/chosen": -275.5840148925781, "logps/rejected": -215.13279724121094, "loss": 0.3879, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.284540176391602, "rewards/margins": 6.921156406402588, "rewards/rejected": -14.205697059631348, "sft_loss": 1.0973351001739502, "step": 1070 }, { "epoch": 1.7669296515450361, "grad_norm": 17.72039206697929, "learning_rate": 9.867108162367594e-08, "logits/chosen": 16.939437866210938, "logits/rejected": 18.218585968017578, "logps/chosen": -294.5352478027344, "logps/rejected": -230.98623657226562, "loss": 0.3974, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.325733661651611, "rewards/margins": 7.582549571990967, "rewards/rejected": -14.908282279968262, "sft_loss": 1.034481406211853, "step": 1075 }, { "epoch": 1.7751479289940828, "grad_norm": 13.466593004835952, "learning_rate": 9.756395472065947e-08, "logits/chosen": 17.363365173339844, "logits/rejected": 18.14643669128418, "logps/chosen": -275.0605163574219, "logps/rejected": -223.0447998046875, "loss": 0.4368, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.5301194190979, "rewards/margins": 7.630979537963867, "rewards/rejected": -15.161099433898926, "sft_loss": 1.191418170928955, "step": 1080 }, { "epoch": 1.7833662064431295, "grad_norm": 17.525060893448625, "learning_rate": 9.645908445884271e-08, "logits/chosen": 17.93121910095215, "logits/rejected": 19.609464645385742, "logps/chosen": -313.574951171875, "logps/rejected": -255.39015197753906, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": -8.089523315429688, "rewards/margins": 8.564504623413086, "rewards/rejected": -16.654027938842773, "sft_loss": 1.0859136581420898, "step": 1085 }, { "epoch": 1.7915844838921762, "grad_norm": 14.818652238656334, "learning_rate": 9.535656171487096e-08, "logits/chosen": 17.432899475097656, "logits/rejected": 18.06930160522461, "logps/chosen": -306.2559814453125, "logps/rejected": -247.05564880371094, "loss": 0.4113, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.375761985778809, "rewards/margins": 8.475983619689941, "rewards/rejected": -16.85174560546875, "sft_loss": 1.2146451473236084, "step": 1090 }, { "epoch": 1.7998027613412229, "grad_norm": 64.12698029544616, "learning_rate": 9.425647717230382e-08, "logits/chosen": 17.3497257232666, "logits/rejected": 18.322324752807617, "logps/chosen": -314.32830810546875, "logps/rejected": -253.83473205566406, "loss": 0.4062, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.878050804138184, "rewards/margins": 8.278247833251953, "rewards/rejected": -17.15629768371582, "sft_loss": 1.077860713005066, "step": 1095 }, { "epoch": 1.8080210387902695, "grad_norm": 13.052337358867197, "learning_rate": 9.315892131415642e-08, "logits/chosen": 16.90951919555664, "logits/rejected": 18.101472854614258, "logps/chosen": -344.9137878417969, "logps/rejected": -264.2882080078125, "loss": 0.3948, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.652148246765137, "rewards/margins": 9.170465469360352, "rewards/rejected": -17.822612762451172, "sft_loss": 1.2117801904678345, "step": 1100 }, { "epoch": 1.8162393162393162, "grad_norm": 11.613352799050077, "learning_rate": 9.206398441545729e-08, "logits/chosen": 17.647083282470703, "logits/rejected": 18.84397315979004, "logps/chosen": -312.7010498046875, "logps/rejected": -254.3484344482422, "loss": 0.3759, "rewards/accuracies": 1.0, "rewards/chosen": -8.489236831665039, "rewards/margins": 8.119637489318848, "rewards/rejected": -16.608875274658203, "sft_loss": 1.01621675491333, "step": 1105 }, { "epoch": 1.824457593688363, "grad_norm": 11.15254994077485, "learning_rate": 9.097175653582299e-08, "logits/chosen": 17.26348114013672, "logits/rejected": 18.160728454589844, "logps/chosen": -284.86114501953125, "logps/rejected": -232.5272979736328, "loss": 0.41, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.840343475341797, "rewards/margins": 7.370659351348877, "rewards/rejected": -15.211003303527832, "sft_loss": 1.1511608362197876, "step": 1110 }, { "epoch": 1.8326758711374096, "grad_norm": 14.803907963552794, "learning_rate": 8.988232751205051e-08, "logits/chosen": 17.386255264282227, "logits/rejected": 17.55118751525879, "logps/chosen": -271.7340087890625, "logps/rejected": -208.06320190429688, "loss": 0.4401, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.613986968994141, "rewards/margins": 6.274531841278076, "rewards/rejected": -13.888518333435059, "sft_loss": 1.144532322883606, "step": 1115 }, { "epoch": 1.8408941485864563, "grad_norm": 14.423568520659874, "learning_rate": 8.879578695072846e-08, "logits/chosen": 17.274259567260742, "logits/rejected": 18.399911880493164, "logps/chosen": -289.1215515136719, "logps/rejected": -230.22369384765625, "loss": 0.4135, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.828088760375977, "rewards/margins": 7.673010349273682, "rewards/rejected": -15.5010986328125, "sft_loss": 1.1277306079864502, "step": 1120 }, { "epoch": 1.849112426035503, "grad_norm": 11.37404702454821, "learning_rate": 8.771222422086639e-08, "logits/chosen": 16.860265731811523, "logits/rejected": 17.736581802368164, "logps/chosen": -297.23956298828125, "logps/rejected": -233.06109619140625, "loss": 0.3998, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.860833168029785, "rewards/margins": 8.063416481018066, "rewards/rejected": -15.924250602722168, "sft_loss": 1.2870830297470093, "step": 1125 }, { "epoch": 1.8573307034845496, "grad_norm": 12.257681191538563, "learning_rate": 8.663172844654452e-08, "logits/chosen": 17.366941452026367, "logits/rejected": 17.93768882751465, "logps/chosen": -300.5145263671875, "logps/rejected": -230.68685913085938, "loss": 0.4455, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.420682430267334, "rewards/margins": 7.759568691253662, "rewards/rejected": -15.180251121520996, "sft_loss": 1.0831838846206665, "step": 1130 }, { "epoch": 1.8655489809335963, "grad_norm": 11.344131200773928, "learning_rate": 8.555438849958296e-08, "logits/chosen": 17.97229766845703, "logits/rejected": 18.921049118041992, "logps/chosen": -319.6356201171875, "logps/rejected": -246.49024963378906, "loss": 0.3864, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.522003650665283, "rewards/margins": 8.551565170288086, "rewards/rejected": -16.07356834411621, "sft_loss": 1.150990605354309, "step": 1135 }, { "epoch": 1.873767258382643, "grad_norm": 20.985079338983198, "learning_rate": 8.448029299223194e-08, "logits/chosen": 17.783571243286133, "logits/rejected": 18.174728393554688, "logps/chosen": -312.2618713378906, "logps/rejected": -233.99496459960938, "loss": 0.4933, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.624851226806641, "rewards/margins": 7.475332260131836, "rewards/rejected": -15.100183486938477, "sft_loss": 1.1498528718948364, "step": 1140 }, { "epoch": 1.8819855358316897, "grad_norm": 14.844798746234286, "learning_rate": 8.340953026988351e-08, "logits/chosen": 17.779254913330078, "logits/rejected": 19.071887969970703, "logps/chosen": -311.01190185546875, "logps/rejected": -248.10272216796875, "loss": 0.4615, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.804770469665527, "rewards/margins": 8.161953926086426, "rewards/rejected": -15.966724395751953, "sft_loss": 1.1634888648986816, "step": 1145 }, { "epoch": 1.8902038132807364, "grad_norm": 11.515222849514643, "learning_rate": 8.234218840380475e-08, "logits/chosen": 16.18383026123047, "logits/rejected": 17.827003479003906, "logps/chosen": -301.19659423828125, "logps/rejected": -245.50054931640625, "loss": 0.4341, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.15174388885498, "rewards/margins": 7.812210559844971, "rewards/rejected": -15.963953971862793, "sft_loss": 1.0311837196350098, "step": 1150 }, { "epoch": 1.898422090729783, "grad_norm": 14.564597779855657, "learning_rate": 8.127835518389417e-08, "logits/chosen": 16.831256866455078, "logits/rejected": 18.508529663085938, "logps/chosen": -311.1943054199219, "logps/rejected": -245.4080047607422, "loss": 0.4095, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.765483856201172, "rewards/margins": 8.15777587890625, "rewards/rejected": -15.923259735107422, "sft_loss": 1.114915132522583, "step": 1155 }, { "epoch": 1.9066403681788298, "grad_norm": 26.10926811927184, "learning_rate": 8.021811811146075e-08, "logits/chosen": 16.842208862304688, "logits/rejected": 17.959400177001953, "logps/chosen": -291.0676574707031, "logps/rejected": -237.74246215820312, "loss": 0.4551, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.678957939147949, "rewards/margins": 8.211709022521973, "rewards/rejected": -15.890668869018555, "sft_loss": 1.1757006645202637, "step": 1160 }, { "epoch": 1.9148586456278764, "grad_norm": 12.813401775007092, "learning_rate": 7.916156439202672e-08, "logits/chosen": 17.37171173095703, "logits/rejected": 18.593181610107422, "logps/chosen": -289.34759521484375, "logps/rejected": -234.8267059326172, "loss": 0.4289, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.662449836730957, "rewards/margins": 7.566576957702637, "rewards/rejected": -15.229025840759277, "sft_loss": 1.1354382038116455, "step": 1165 }, { "epoch": 1.9230769230769231, "grad_norm": 6.596137423450017, "learning_rate": 7.810878092815512e-08, "logits/chosen": 17.296720504760742, "logits/rejected": 17.11487579345703, "logps/chosen": -307.8653869628906, "logps/rejected": -237.65505981445312, "loss": 0.3663, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.926757335662842, "rewards/margins": 7.959318161010742, "rewards/rejected": -15.886076927185059, "sft_loss": 1.1921048164367676, "step": 1170 }, { "epoch": 1.9312952005259696, "grad_norm": 14.579022955412034, "learning_rate": 7.705985431230183e-08, "logits/chosen": 15.675207138061523, "logits/rejected": 16.91021156311035, "logps/chosen": -322.23992919921875, "logps/rejected": -266.904296875, "loss": 0.391, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.967777252197266, "rewards/margins": 8.5900297164917, "rewards/rejected": -17.557802200317383, "sft_loss": 1.228776454925537, "step": 1175 }, { "epoch": 1.9395134779750163, "grad_norm": 12.828599154800472, "learning_rate": 7.601487081969307e-08, "logits/chosen": 18.340225219726562, "logits/rejected": 19.142946243286133, "logps/chosen": -350.186279296875, "logps/rejected": -269.3705749511719, "loss": 0.3851, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -8.475415229797363, "rewards/margins": 9.2521390914917, "rewards/rejected": -17.727554321289062, "sft_loss": 1.1213669776916504, "step": 1180 }, { "epoch": 1.947731755424063, "grad_norm": 13.15248193805534, "learning_rate": 7.497391640122967e-08, "logits/chosen": 18.557586669921875, "logits/rejected": 19.259462356567383, "logps/chosen": -311.15838623046875, "logps/rejected": -252.96751403808594, "loss": 0.4041, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.537500381469727, "rewards/margins": 8.597896575927734, "rewards/rejected": -17.13539695739746, "sft_loss": 1.1180825233459473, "step": 1185 }, { "epoch": 1.9559500328731096, "grad_norm": 20.887376048027924, "learning_rate": 7.393707667641691e-08, "logits/chosen": 16.45261573791504, "logits/rejected": 17.498512268066406, "logps/chosen": -310.4942626953125, "logps/rejected": -250.18203735351562, "loss": 0.4276, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.239749908447266, "rewards/margins": 8.2033109664917, "rewards/rejected": -16.44305992126465, "sft_loss": 1.188431739807129, "step": 1190 }, { "epoch": 1.9641683103221563, "grad_norm": 32.140189305396625, "learning_rate": 7.290443692632281e-08, "logits/chosen": 19.094688415527344, "logits/rejected": 19.616283416748047, "logps/chosen": -291.1233825683594, "logps/rejected": -234.5458526611328, "loss": 0.4942, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.3053131103515625, "rewards/margins": 7.835725784301758, "rewards/rejected": -15.141037940979004, "sft_loss": 1.075373888015747, "step": 1195 }, { "epoch": 1.972386587771203, "grad_norm": 13.526795062615003, "learning_rate": 7.187608208656328e-08, "logits/chosen": 16.982704162597656, "logits/rejected": 17.547874450683594, "logps/chosen": -293.3042297363281, "logps/rejected": -233.2967987060547, "loss": 0.3964, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.24399185180664, "rewards/margins": 7.097829818725586, "rewards/rejected": -15.341819763183594, "sft_loss": 1.063591718673706, "step": 1200 }, { "epoch": 1.9806048652202497, "grad_norm": 12.330320612053741, "learning_rate": 7.085209674031618e-08, "logits/chosen": 18.508739471435547, "logits/rejected": 19.527912139892578, "logps/chosen": -318.8953857421875, "logps/rejected": -255.2642822265625, "loss": 0.3766, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.808796405792236, "rewards/margins": 8.834001541137695, "rewards/rejected": -16.642797470092773, "sft_loss": 1.0131335258483887, "step": 1205 }, { "epoch": 1.9888231426692964, "grad_norm": 19.628735128907458, "learning_rate": 6.983256511136442e-08, "logits/chosen": 17.349624633789062, "logits/rejected": 18.25617218017578, "logps/chosen": -315.596923828125, "logps/rejected": -252.95460510253906, "loss": 0.3878, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.501006126403809, "rewards/margins": 8.494573593139648, "rewards/rejected": -16.995580673217773, "sft_loss": 1.0632458925247192, "step": 1210 }, { "epoch": 1.997041420118343, "grad_norm": 14.674802699510677, "learning_rate": 6.881757105716831e-08, "logits/chosen": 17.45104217529297, "logits/rejected": 18.316680908203125, "logps/chosen": -330.3178405761719, "logps/rejected": -251.63551330566406, "loss": 0.4009, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.423145294189453, "rewards/margins": 8.154979705810547, "rewards/rejected": -16.578125, "sft_loss": 1.0945472717285156, "step": 1215 }, { "epoch": 2.0052596975673898, "grad_norm": 12.624994593347873, "learning_rate": 6.780719806196828e-08, "logits/chosen": 17.815471649169922, "logits/rejected": 19.435829162597656, "logps/chosen": -326.4144287109375, "logps/rejected": -260.8008117675781, "loss": 0.4449, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.356893539428711, "rewards/margins": 8.929654121398926, "rewards/rejected": -17.286548614501953, "sft_loss": 1.1082605123519897, "step": 1220 }, { "epoch": 2.0134779750164364, "grad_norm": 12.725913199026877, "learning_rate": 6.680152922991822e-08, "logits/chosen": 16.1939754486084, "logits/rejected": 17.380538940429688, "logps/chosen": -282.47589111328125, "logps/rejected": -237.52879333496094, "loss": 0.3868, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.493667602539062, "rewards/margins": 7.595485687255859, "rewards/rejected": -16.089153289794922, "sft_loss": 1.1127554178237915, "step": 1225 }, { "epoch": 2.021696252465483, "grad_norm": 16.704211079520014, "learning_rate": 6.580064727824994e-08, "logits/chosen": 17.634016036987305, "logits/rejected": 18.210420608520508, "logps/chosen": -294.94793701171875, "logps/rejected": -239.2569122314453, "loss": 0.4093, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -8.08222770690918, "rewards/margins": 7.951410293579102, "rewards/rejected": -16.03363609313965, "sft_loss": 1.0821824073791504, "step": 1230 }, { "epoch": 2.02991452991453, "grad_norm": 11.063546418547208, "learning_rate": 6.480463453046985e-08, "logits/chosen": 18.466581344604492, "logits/rejected": 18.895183563232422, "logps/chosen": -304.6612243652344, "logps/rejected": -241.4573974609375, "loss": 0.4202, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.562366485595703, "rewards/margins": 7.818039417266846, "rewards/rejected": -16.38040542602539, "sft_loss": 1.1190707683563232, "step": 1235 }, { "epoch": 2.0381328073635765, "grad_norm": 14.946770498466886, "learning_rate": 6.381357290958767e-08, "logits/chosen": 16.804920196533203, "logits/rejected": 17.867015838623047, "logps/chosen": -296.5435485839844, "logps/rejected": -246.7471466064453, "loss": 0.3722, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.007586479187012, "rewards/margins": 8.32363224029541, "rewards/rejected": -16.33121681213379, "sft_loss": 1.104773759841919, "step": 1240 }, { "epoch": 2.046351084812623, "grad_norm": 11.140569334845633, "learning_rate": 6.282754393137796e-08, "logits/chosen": 17.95855140686035, "logits/rejected": 18.640541076660156, "logps/chosen": -310.16778564453125, "logps/rejected": -239.66641235351562, "loss": 0.4065, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.086520195007324, "rewards/margins": 8.207837104797363, "rewards/rejected": -16.294357299804688, "sft_loss": 1.023207187652588, "step": 1245 }, { "epoch": 2.05456936226167, "grad_norm": 15.822685116826385, "learning_rate": 6.184662869767577e-08, "logits/chosen": 17.26742172241211, "logits/rejected": 17.335512161254883, "logps/chosen": -328.2395324707031, "logps/rejected": -263.0542297363281, "loss": 0.4175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.798872947692871, "rewards/margins": 8.849559783935547, "rewards/rejected": -17.648433685302734, "sft_loss": 1.1304852962493896, "step": 1250 }, { "epoch": 2.0627876397107165, "grad_norm": 17.368549612926913, "learning_rate": 6.08709078897056e-08, "logits/chosen": 17.57396125793457, "logits/rejected": 17.95652198791504, "logps/chosen": -302.7294006347656, "logps/rejected": -251.41261291503906, "loss": 0.4021, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.686173439025879, "rewards/margins": 8.318988800048828, "rewards/rejected": -17.005163192749023, "sft_loss": 1.119976282119751, "step": 1255 }, { "epoch": 2.0710059171597632, "grad_norm": 13.875960320644882, "learning_rate": 5.990046176144551e-08, "logits/chosen": 16.934846878051758, "logits/rejected": 17.557884216308594, "logps/chosen": -274.9892578125, "logps/rejected": -239.31570434570312, "loss": 0.4283, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": -8.879440307617188, "rewards/margins": 7.452706336975098, "rewards/rejected": -16.3321475982666, "sft_loss": 1.162746787071228, "step": 1260 }, { "epoch": 2.07922419460881, "grad_norm": 12.354544231223421, "learning_rate": 5.893537013302602e-08, "logits/chosen": 17.52082061767578, "logits/rejected": 18.2637939453125, "logps/chosen": -304.33441162109375, "logps/rejected": -243.52101135253906, "loss": 0.4253, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.936227321624756, "rewards/margins": 8.085640907287598, "rewards/rejected": -16.021867752075195, "sft_loss": 1.0547149181365967, "step": 1265 }, { "epoch": 2.0874424720578566, "grad_norm": 14.221340160175023, "learning_rate": 5.7975712384164795e-08, "logits/chosen": 17.841602325439453, "logits/rejected": 17.95541000366211, "logps/chosen": -295.451416015625, "logps/rejected": -230.86936950683594, "loss": 0.4009, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -7.948279857635498, "rewards/margins": 7.590776443481445, "rewards/rejected": -15.539057731628418, "sft_loss": 1.1430902481079102, "step": 1270 }, { "epoch": 2.0956607495069033, "grad_norm": 15.060512661462361, "learning_rate": 5.702156744763784e-08, "logits/chosen": 17.457277297973633, "logits/rejected": 18.601512908935547, "logps/chosen": -286.6520080566406, "logps/rejected": -236.4774627685547, "loss": 0.4211, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.859719276428223, "rewards/margins": 7.776011943817139, "rewards/rejected": -15.635732650756836, "sft_loss": 1.039507269859314, "step": 1275 }, { "epoch": 2.10387902695595, "grad_norm": 14.43891440512856, "learning_rate": 5.607301380278683e-08, "logits/chosen": 17.887542724609375, "logits/rejected": 18.098596572875977, "logps/chosen": -287.3581848144531, "logps/rejected": -228.4025421142578, "loss": 0.4356, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -7.526928901672363, "rewards/margins": 8.00684642791748, "rewards/rejected": -15.533775329589844, "sft_loss": 1.1267131567001343, "step": 1280 }, { "epoch": 2.1120973044049967, "grad_norm": 12.750181563192855, "learning_rate": 5.513012946906445e-08, "logits/chosen": 17.97955322265625, "logits/rejected": 18.05929183959961, "logps/chosen": -319.4637145996094, "logps/rejected": -245.5413818359375, "loss": 0.3884, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.154685020446777, "rewards/margins": 8.448837280273438, "rewards/rejected": -16.6035213470459, "sft_loss": 1.193272590637207, "step": 1285 }, { "epoch": 2.1203155818540433, "grad_norm": 9.936573876560704, "learning_rate": 5.419299199961708e-08, "logits/chosen": 17.2838077545166, "logits/rejected": 17.822799682617188, "logps/chosen": -337.51031494140625, "logps/rejected": -259.70428466796875, "loss": 0.3565, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.9683146476745605, "rewards/margins": 8.882369041442871, "rewards/rejected": -16.850685119628906, "sft_loss": 1.041199803352356, "step": 1290 }, { "epoch": 2.12853385930309, "grad_norm": 13.38745866462026, "learning_rate": 5.3261678474905785e-08, "logits/chosen": 18.08312225341797, "logits/rejected": 18.110692977905273, "logps/chosen": -324.0693359375, "logps/rejected": -256.90234375, "loss": 0.391, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.347922325134277, "rewards/margins": 8.93021011352539, "rewards/rejected": -17.27813148498535, "sft_loss": 1.1214524507522583, "step": 1295 }, { "epoch": 2.1367521367521367, "grad_norm": 12.52755859911023, "learning_rate": 5.2336265496366774e-08, "logits/chosen": 16.553739547729492, "logits/rejected": 18.280567169189453, "logps/chosen": -298.98480224609375, "logps/rejected": -247.38160705566406, "loss": 0.3604, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.771576881408691, "rewards/margins": 7.8479180335998535, "rewards/rejected": -16.619495391845703, "sft_loss": 1.1661113500595093, "step": 1300 }, { "epoch": 2.1449704142011834, "grad_norm": 15.615159328078256, "learning_rate": 5.141682918011055e-08, "logits/chosen": 17.72024917602539, "logits/rejected": 18.12508773803711, "logps/chosen": -311.2801818847656, "logps/rejected": -245.24436950683594, "loss": 0.4611, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.237349510192871, "rewards/margins": 8.371785163879395, "rewards/rejected": -16.609132766723633, "sft_loss": 1.1050708293914795, "step": 1305 }, { "epoch": 2.15318869165023, "grad_norm": 9.121410431305465, "learning_rate": 5.0503445150661306e-08, "logits/chosen": 17.203432083129883, "logits/rejected": 18.309484481811523, "logps/chosen": -282.98101806640625, "logps/rejected": -231.17942810058594, "loss": 0.3828, "rewards/accuracies": 1.0, "rewards/chosen": -8.479619979858398, "rewards/margins": 7.617303371429443, "rewards/rejected": -16.096921920776367, "sft_loss": 1.110097050666809, "step": 1310 }, { "epoch": 2.1614069690992768, "grad_norm": 16.777024727194785, "learning_rate": 4.959618853473696e-08, "logits/chosen": 16.61244010925293, "logits/rejected": 17.989538192749023, "logps/chosen": -302.3112487792969, "logps/rejected": -246.63719177246094, "loss": 0.4158, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.352829933166504, "rewards/margins": 8.38165283203125, "rewards/rejected": -16.734481811523438, "sft_loss": 1.1263587474822998, "step": 1315 }, { "epoch": 2.1696252465483234, "grad_norm": 12.912024759458012, "learning_rate": 4.8695133955069564e-08, "logits/chosen": 15.624103546142578, "logits/rejected": 16.827468872070312, "logps/chosen": -306.7551574707031, "logps/rejected": -244.3481903076172, "loss": 0.429, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.34555435180664, "rewards/margins": 8.023102760314941, "rewards/rejected": -16.3686580657959, "sft_loss": 1.2823337316513062, "step": 1320 }, { "epoch": 2.17784352399737, "grad_norm": 13.478297039710323, "learning_rate": 4.780035552426787e-08, "logits/chosen": 16.33539581298828, "logits/rejected": 18.024782180786133, "logps/chosen": -325.2061767578125, "logps/rejected": -265.6727600097656, "loss": 0.4175, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.308411598205566, "rewards/margins": 9.635498046875, "rewards/rejected": -17.943910598754883, "sft_loss": 1.1859756708145142, "step": 1325 }, { "epoch": 2.186061801446417, "grad_norm": 13.817488574864614, "learning_rate": 4.691192683872129e-08, "logits/chosen": 16.309165954589844, "logits/rejected": 17.056123733520508, "logps/chosen": -319.3105163574219, "logps/rejected": -256.0588073730469, "loss": 0.382, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.325318336486816, "rewards/margins": 8.578317642211914, "rewards/rejected": -16.903636932373047, "sft_loss": 1.0492181777954102, "step": 1330 }, { "epoch": 2.1942800788954635, "grad_norm": 10.242449837573213, "learning_rate": 4.602992097254646e-08, "logits/chosen": 17.743621826171875, "logits/rejected": 19.387224197387695, "logps/chosen": -307.1810607910156, "logps/rejected": -254.61309814453125, "loss": 0.3948, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.201935768127441, "rewards/margins": 8.72970962524414, "rewards/rejected": -16.9316463470459, "sft_loss": 1.159468173980713, "step": 1335 }, { "epoch": 2.20249835634451, "grad_norm": 12.840091970424348, "learning_rate": 4.515441047157707e-08, "logits/chosen": 17.517444610595703, "logits/rejected": 18.110706329345703, "logps/chosen": -303.7611083984375, "logps/rejected": -246.00747680664062, "loss": 0.4279, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.031211853027344, "rewards/margins": 8.574084281921387, "rewards/rejected": -16.605297088623047, "sft_loss": 1.1109663248062134, "step": 1340 }, { "epoch": 2.210716633793557, "grad_norm": 11.358689713775057, "learning_rate": 4.428546734739666e-08, "logits/chosen": 17.79754066467285, "logits/rejected": 18.65445327758789, "logps/chosen": -310.1402587890625, "logps/rejected": -257.2119445800781, "loss": 0.3393, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.179025650024414, "rewards/margins": 9.184054374694824, "rewards/rejected": -17.363079071044922, "sft_loss": 1.040381669998169, "step": 1345 }, { "epoch": 2.2189349112426036, "grad_norm": 10.0246369651475, "learning_rate": 4.342316307141568e-08, "logits/chosen": 15.378368377685547, "logits/rejected": 17.601299285888672, "logps/chosen": -293.3377380371094, "logps/rejected": -252.94558715820312, "loss": 0.4169, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.73454761505127, "rewards/margins": 8.430728912353516, "rewards/rejected": -17.16527557373047, "sft_loss": 1.088813304901123, "step": 1350 }, { "epoch": 2.2271531886916502, "grad_norm": 8.66405912578809, "learning_rate": 4.256756856899299e-08, "logits/chosen": 16.15410041809082, "logits/rejected": 17.089345932006836, "logps/chosen": -293.54864501953125, "logps/rejected": -243.08554077148438, "loss": 0.3688, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.237372398376465, "rewards/margins": 8.213920593261719, "rewards/rejected": -16.4512939453125, "sft_loss": 1.1098147630691528, "step": 1355 }, { "epoch": 2.235371466140697, "grad_norm": 15.953137176215671, "learning_rate": 4.171875421360202e-08, "logits/chosen": 16.227901458740234, "logits/rejected": 16.872665405273438, "logps/chosen": -329.6645202636719, "logps/rejected": -257.57489013671875, "loss": 0.4039, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.39266300201416, "rewards/margins": 8.796185493469238, "rewards/rejected": -17.1888484954834, "sft_loss": 1.1166497468948364, "step": 1360 }, { "epoch": 2.2435897435897436, "grad_norm": 9.812512910956865, "learning_rate": 4.0876789821042606e-08, "logits/chosen": 16.98467445373535, "logits/rejected": 17.594194412231445, "logps/chosen": -308.341064453125, "logps/rejected": -250.00465393066406, "loss": 0.3941, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.45275592803955, "rewards/margins": 8.58054256439209, "rewards/rejected": -17.03329849243164, "sft_loss": 1.1679203510284424, "step": 1365 }, { "epoch": 2.2518080210387903, "grad_norm": 11.249560857734895, "learning_rate": 4.0041744643698585e-08, "logits/chosen": 17.271631240844727, "logits/rejected": 18.480789184570312, "logps/chosen": -323.33148193359375, "logps/rejected": -265.4918212890625, "loss": 0.4133, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.759995460510254, "rewards/margins": 9.08838176727295, "rewards/rejected": -17.848377227783203, "sft_loss": 1.1703903675079346, "step": 1370 }, { "epoch": 2.260026298487837, "grad_norm": 13.168234484012684, "learning_rate": 3.9213687364841514e-08, "logits/chosen": 17.725706100463867, "logits/rejected": 18.4434871673584, "logps/chosen": -265.5625305175781, "logps/rejected": -229.72801208496094, "loss": 0.3827, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -7.455626964569092, "rewards/margins": 8.284765243530273, "rewards/rejected": -15.740392684936523, "sft_loss": 1.061354160308838, "step": 1375 }, { "epoch": 2.2682445759368837, "grad_norm": 10.595007690116647, "learning_rate": 3.8392686092981716e-08, "logits/chosen": 16.218524932861328, "logits/rejected": 17.454858779907227, "logps/chosen": -330.2020263671875, "logps/rejected": -259.57513427734375, "loss": 0.3713, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.325506210327148, "rewards/margins": 8.986472129821777, "rewards/rejected": -17.311979293823242, "sft_loss": 1.1411256790161133, "step": 1380 }, { "epoch": 2.2764628533859304, "grad_norm": 16.544764732871236, "learning_rate": 3.757880835626601e-08, "logits/chosen": 19.006175994873047, "logits/rejected": 20.302326202392578, "logps/chosen": -322.05242919921875, "logps/rejected": -260.6827087402344, "loss": 0.3984, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.566563606262207, "rewards/margins": 8.706660270690918, "rewards/rejected": -17.273221969604492, "sft_loss": 1.01236891746521, "step": 1385 }, { "epoch": 2.284681130834977, "grad_norm": 9.92900744936661, "learning_rate": 3.677212109692364e-08, "logits/chosen": 16.336091995239258, "logits/rejected": 18.393173217773438, "logps/chosen": -296.00811767578125, "logps/rejected": -255.37149047851562, "loss": 0.4114, "rewards/accuracies": 1.0, "rewards/chosen": -8.0043363571167, "rewards/margins": 9.217806816101074, "rewards/rejected": -17.222143173217773, "sft_loss": 1.1503466367721558, "step": 1390 }, { "epoch": 2.2928994082840237, "grad_norm": 9.386107838289549, "learning_rate": 3.597269066576017e-08, "logits/chosen": 17.042190551757812, "logits/rejected": 18.17107582092285, "logps/chosen": -300.5311584472656, "logps/rejected": -244.8414306640625, "loss": 0.3695, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.179112434387207, "rewards/margins": 8.550080299377441, "rewards/rejected": -16.72919273376465, "sft_loss": 1.1738831996917725, "step": 1395 }, { "epoch": 2.3011176857330704, "grad_norm": 13.817759213393538, "learning_rate": 3.518058281669996e-08, "logits/chosen": 17.452651977539062, "logits/rejected": 19.167875289916992, "logps/chosen": -325.5849914550781, "logps/rejected": -261.9805908203125, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": -8.316492080688477, "rewards/margins": 9.281232833862305, "rewards/rejected": -17.59772491455078, "sft_loss": 1.0759243965148926, "step": 1400 }, { "epoch": 2.309335963182117, "grad_norm": 14.27386340226445, "learning_rate": 3.439586270137797e-08, "logits/chosen": 16.01079750061035, "logits/rejected": 17.990955352783203, "logps/chosen": -317.35968017578125, "logps/rejected": -265.36737060546875, "loss": 0.382, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.888944625854492, "rewards/margins": 9.316179275512695, "rewards/rejected": -18.205123901367188, "sft_loss": 1.064568281173706, "step": 1405 }, { "epoch": 2.3175542406311638, "grad_norm": 17.736685407866446, "learning_rate": 3.3618594863780993e-08, "logits/chosen": 18.37812042236328, "logits/rejected": 19.024595260620117, "logps/chosen": -319.2788391113281, "logps/rejected": -255.89810180664062, "loss": 0.3468, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.851397514343262, "rewards/margins": 8.883750915527344, "rewards/rejected": -17.73514747619629, "sft_loss": 1.235966682434082, "step": 1410 }, { "epoch": 2.3257725180802105, "grad_norm": 22.487095580329445, "learning_rate": 3.2848843234938694e-08, "logits/chosen": 17.141220092773438, "logits/rejected": 17.714786529541016, "logps/chosen": -302.5834045410156, "logps/rejected": -254.14559936523438, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": -8.972297668457031, "rewards/margins": 8.797745704650879, "rewards/rejected": -17.770044326782227, "sft_loss": 1.1860109567642212, "step": 1415 }, { "epoch": 2.333990795529257, "grad_norm": 11.596948370393193, "learning_rate": 3.208667112766529e-08, "logits/chosen": 17.32436752319336, "logits/rejected": 18.515031814575195, "logps/chosen": -312.43267822265625, "logps/rejected": -266.10052490234375, "loss": 0.3933, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -9.664741516113281, "rewards/margins": 9.025125503540039, "rewards/rejected": -18.689865112304688, "sft_loss": 1.17525315284729, "step": 1420 }, { "epoch": 2.342209072978304, "grad_norm": 16.78732289470905, "learning_rate": 3.1332141231352194e-08, "logits/chosen": 17.367273330688477, "logits/rejected": 17.978761672973633, "logps/chosen": -325.341552734375, "logps/rejected": -261.8766784667969, "loss": 0.3954, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -9.092870712280273, "rewards/margins": 9.265833854675293, "rewards/rejected": -18.358705520629883, "sft_loss": 1.1345161199569702, "step": 1425 }, { "epoch": 2.3504273504273505, "grad_norm": 13.214123565552589, "learning_rate": 3.058531560681141e-08, "logits/chosen": 18.152240753173828, "logits/rejected": 19.055191040039062, "logps/chosen": -327.43487548828125, "logps/rejected": -266.76446533203125, "loss": 0.3363, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.500913619995117, "rewards/margins": 9.283426284790039, "rewards/rejected": -17.784339904785156, "sft_loss": 1.231545329093933, "step": 1430 }, { "epoch": 2.358645627876397, "grad_norm": 22.300498596470074, "learning_rate": 2.984625568117129e-08, "logits/chosen": 18.67966079711914, "logits/rejected": 19.73933982849121, "logps/chosen": -334.677734375, "logps/rejected": -265.227783203125, "loss": 0.4029, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.572039604187012, "rewards/margins": 8.896354675292969, "rewards/rejected": -17.468393325805664, "sft_loss": 1.1262859106063843, "step": 1435 }, { "epoch": 2.366863905325444, "grad_norm": 37.074159109819185, "learning_rate": 2.9115022242823862e-08, "logits/chosen": 17.512964248657227, "logits/rejected": 18.453014373779297, "logps/chosen": -326.0170593261719, "logps/rejected": -263.2306213378906, "loss": 0.3968, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.473074913024902, "rewards/margins": 9.212455749511719, "rewards/rejected": -17.685529708862305, "sft_loss": 1.092557430267334, "step": 1440 }, { "epoch": 2.3750821827744906, "grad_norm": 11.887196623999731, "learning_rate": 2.839167543642511e-08, "logits/chosen": 17.14059066772461, "logits/rejected": 18.407007217407227, "logps/chosen": -291.7596435546875, "logps/rejected": -250.99574279785156, "loss": 0.4211, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.36557388305664, "rewards/margins": 8.695883750915527, "rewards/rejected": -17.06145668029785, "sft_loss": 1.202438235282898, "step": 1445 }, { "epoch": 2.3833004602235373, "grad_norm": 17.524490621614987, "learning_rate": 2.7676274757947816e-08, "logits/chosen": 18.85689926147461, "logits/rejected": 19.545021057128906, "logps/chosen": -287.7202453613281, "logps/rejected": -244.56924438476562, "loss": 0.3838, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.284355163574219, "rewards/margins": 8.70119857788086, "rewards/rejected": -16.985553741455078, "sft_loss": 1.0111671686172485, "step": 1450 }, { "epoch": 2.391518737672584, "grad_norm": 9.1542241365719, "learning_rate": 2.696887904978819e-08, "logits/chosen": 18.2181453704834, "logits/rejected": 18.709545135498047, "logps/chosen": -280.198974609375, "logps/rejected": -226.15415954589844, "loss": 0.4051, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.791984558105469, "rewards/margins": 7.134130477905273, "rewards/rejected": -15.926115989685059, "sft_loss": 1.1695269346237183, "step": 1455 }, { "epoch": 2.3997370151216306, "grad_norm": 13.076192251177769, "learning_rate": 2.6269546495925886e-08, "logits/chosen": 16.172388076782227, "logits/rejected": 17.052417755126953, "logps/chosen": -287.6596984863281, "logps/rejected": -242.12660217285156, "loss": 0.4246, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.466004371643066, "rewards/margins": 8.168050765991211, "rewards/rejected": -16.634056091308594, "sft_loss": 1.1705952882766724, "step": 1460 }, { "epoch": 2.4079552925706773, "grad_norm": 14.024390303705356, "learning_rate": 2.5578334617138236e-08, "logits/chosen": 17.606464385986328, "logits/rejected": 18.12337303161621, "logps/chosen": -301.743408203125, "logps/rejected": -248.48464965820312, "loss": 0.3833, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.655086517333984, "rewards/margins": 8.575737953186035, "rewards/rejected": -17.230825424194336, "sft_loss": 1.08839750289917, "step": 1465 }, { "epoch": 2.416173570019724, "grad_norm": 17.812699456228195, "learning_rate": 2.489530026626932e-08, "logits/chosen": 17.72669219970703, "logits/rejected": 18.6758975982666, "logps/chosen": -306.7005310058594, "logps/rejected": -242.5311279296875, "loss": 0.379, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.553206443786621, "rewards/margins": 8.148569107055664, "rewards/rejected": -16.70177459716797, "sft_loss": 1.1815282106399536, "step": 1470 }, { "epoch": 2.4243918474687707, "grad_norm": 8.680672775515832, "learning_rate": 2.422049962355366e-08, "logits/chosen": 18.41983413696289, "logits/rejected": 19.47545623779297, "logps/chosen": -282.099609375, "logps/rejected": -238.36300659179688, "loss": 0.3486, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.316095352172852, "rewards/margins": 8.191699981689453, "rewards/rejected": -16.507797241210938, "sft_loss": 1.205697774887085, "step": 1475 }, { "epoch": 2.4326101249178174, "grad_norm": 16.775370793665815, "learning_rate": 2.3553988191995208e-08, "logits/chosen": 16.783174514770508, "logits/rejected": 18.405048370361328, "logps/chosen": -304.1385498046875, "logps/rejected": -256.7261047363281, "loss": 0.3744, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.7383451461792, "rewards/margins": 8.930658340454102, "rewards/rejected": -17.669002532958984, "sft_loss": 1.1059280633926392, "step": 1480 }, { "epoch": 2.440828402366864, "grad_norm": 10.437808650182905, "learning_rate": 2.2895820792802474e-08, "logits/chosen": 16.727697372436523, "logits/rejected": 17.59294891357422, "logps/chosen": -328.55389404296875, "logps/rejected": -269.7945251464844, "loss": 0.3695, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -9.062349319458008, "rewards/margins": 9.41024112701416, "rewards/rejected": -18.472591400146484, "sft_loss": 1.173682451248169, "step": 1485 }, { "epoch": 2.4490466798159107, "grad_norm": 20.73492010593765, "learning_rate": 2.2246051560879095e-08, "logits/chosen": 16.899852752685547, "logits/rejected": 17.82339096069336, "logps/chosen": -338.22186279296875, "logps/rejected": -279.0784912109375, "loss": 0.4179, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -9.162134170532227, "rewards/margins": 9.35285758972168, "rewards/rejected": -18.514989852905273, "sft_loss": 1.17171311378479, "step": 1490 }, { "epoch": 2.4572649572649574, "grad_norm": 12.636170820327301, "learning_rate": 2.160473394037149e-08, "logits/chosen": 17.118467330932617, "logits/rejected": 17.36690330505371, "logps/chosen": -335.8661804199219, "logps/rejected": -262.7174072265625, "loss": 0.4504, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.756584167480469, "rewards/margins": 8.854002952575684, "rewards/rejected": -17.610586166381836, "sft_loss": 1.1354836225509644, "step": 1495 }, { "epoch": 2.465483234714004, "grad_norm": 11.838207356236568, "learning_rate": 2.097192068027276e-08, "logits/chosen": 16.54058837890625, "logits/rejected": 17.930091857910156, "logps/chosen": -329.2217712402344, "logps/rejected": -270.410888671875, "loss": 0.3262, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.5565185546875, "rewards/margins": 9.7839937210083, "rewards/rejected": -18.340513229370117, "sft_loss": 1.0987026691436768, "step": 1500 }, { "epoch": 2.473701512163051, "grad_norm": 17.261726485061967, "learning_rate": 2.0347663830084182e-08, "logits/chosen": 16.857637405395508, "logits/rejected": 17.605924606323242, "logps/chosen": -278.7782287597656, "logps/rejected": -237.3050537109375, "loss": 0.3978, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.520981788635254, "rewards/margins": 7.912033557891846, "rewards/rejected": -16.433013916015625, "sft_loss": 1.1526176929473877, "step": 1505 }, { "epoch": 2.4819197896120975, "grad_norm": 19.01971806956554, "learning_rate": 1.9732014735534168e-08, "logits/chosen": 17.1612606048584, "logits/rejected": 17.63095474243164, "logps/chosen": -307.8269958496094, "logps/rejected": -234.46160888671875, "loss": 0.4156, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.234945297241211, "rewards/margins": 7.63665771484375, "rewards/rejected": -15.871603012084961, "sft_loss": 1.1170748472213745, "step": 1510 }, { "epoch": 2.490138067061144, "grad_norm": 10.87683842585221, "learning_rate": 1.9125024034354758e-08, "logits/chosen": 17.20734214782715, "logits/rejected": 17.946365356445312, "logps/chosen": -312.4763488769531, "logps/rejected": -246.2183837890625, "loss": 0.351, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.035243034362793, "rewards/margins": 8.52718734741211, "rewards/rejected": -16.56243133544922, "sft_loss": 1.071519374847412, "step": 1515 }, { "epoch": 2.498356344510191, "grad_norm": 10.244386828979161, "learning_rate": 1.85267416521169e-08, "logits/chosen": 17.724872589111328, "logits/rejected": 18.053852081298828, "logps/chosen": -320.68597412109375, "logps/rejected": -246.91893005371094, "loss": 0.3733, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.582959175109863, "rewards/margins": 8.986913681030273, "rewards/rejected": -16.56987190246582, "sft_loss": 1.0908424854278564, "step": 1520 }, { "epoch": 2.5065746219592375, "grad_norm": 12.813362766851835, "learning_rate": 1.793721679812389e-08, "logits/chosen": 18.601253509521484, "logits/rejected": 19.362607955932617, "logps/chosen": -288.14776611328125, "logps/rejected": -237.0640106201172, "loss": 0.3857, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.49190616607666, "rewards/margins": 7.810946464538574, "rewards/rejected": -16.302852630615234, "sft_loss": 1.091495394706726, "step": 1525 }, { "epoch": 2.5147928994082838, "grad_norm": 11.364641270765484, "learning_rate": 1.735649796136382e-08, "logits/chosen": 15.785613059997559, "logits/rejected": 17.070707321166992, "logps/chosen": -324.2843017578125, "logps/rejected": -258.7143859863281, "loss": 0.3883, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -8.855399131774902, "rewards/margins": 8.804574012756348, "rewards/rejected": -17.65997314453125, "sft_loss": 1.1961203813552856, "step": 1530 }, { "epoch": 2.523011176857331, "grad_norm": 15.007717453848354, "learning_rate": 1.678463290652142e-08, "logits/chosen": 17.604642868041992, "logits/rejected": 17.90863609313965, "logps/chosen": -312.69024658203125, "logps/rejected": -249.09962463378906, "loss": 0.3626, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.328804016113281, "rewards/margins": 8.493717193603516, "rewards/rejected": -16.822521209716797, "sft_loss": 1.157140851020813, "step": 1535 }, { "epoch": 2.531229454306377, "grad_norm": 8.792843348493232, "learning_rate": 1.6221668670049315e-08, "logits/chosen": 16.296873092651367, "logits/rejected": 17.479211807250977, "logps/chosen": -327.1073303222656, "logps/rejected": -271.3090515136719, "loss": 0.3481, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.877535820007324, "rewards/margins": 9.413492202758789, "rewards/rejected": -18.291027069091797, "sft_loss": 1.1908369064331055, "step": 1540 }, { "epoch": 2.5394477317554243, "grad_norm": 14.557660052303598, "learning_rate": 1.5667651556299178e-08, "logits/chosen": 16.44731903076172, "logits/rejected": 17.4537296295166, "logps/chosen": -306.1639709472656, "logps/rejected": -253.69247436523438, "loss": 0.3531, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.432329177856445, "rewards/margins": 8.949870109558105, "rewards/rejected": -17.382200241088867, "sft_loss": 1.114105463027954, "step": 1545 }, { "epoch": 2.5476660092044705, "grad_norm": 15.415629759090677, "learning_rate": 1.5122627133713262e-08, "logits/chosen": 15.742711067199707, "logits/rejected": 17.65005874633789, "logps/chosen": -317.73675537109375, "logps/rejected": -260.23907470703125, "loss": 0.3849, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.840112209320068, "rewards/margins": 9.513845443725586, "rewards/rejected": -17.35395622253418, "sft_loss": 1.292752981185913, "step": 1550 }, { "epoch": 2.5558842866535176, "grad_norm": 11.677314306376815, "learning_rate": 1.4586640231076226e-08, "logits/chosen": 17.83001708984375, "logits/rejected": 18.008840560913086, "logps/chosen": -290.8938293457031, "logps/rejected": -234.77801513671875, "loss": 0.3699, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -8.604043006896973, "rewards/margins": 7.615962028503418, "rewards/rejected": -16.22000503540039, "sft_loss": 1.1707122325897217, "step": 1555 }, { "epoch": 2.564102564102564, "grad_norm": 17.403632913196056, "learning_rate": 1.405973493382806e-08, "logits/chosen": 16.150592803955078, "logits/rejected": 17.557065963745117, "logps/chosen": -321.72802734375, "logps/rejected": -270.1099548339844, "loss": 0.3552, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -9.425326347351074, "rewards/margins": 9.209266662597656, "rewards/rejected": -18.634592056274414, "sft_loss": 1.0887880325317383, "step": 1560 }, { "epoch": 2.572320841551611, "grad_norm": 16.12695707285676, "learning_rate": 1.3541954580437941e-08, "logits/chosen": 18.370115280151367, "logits/rejected": 18.63874626159668, "logps/chosen": -321.462646484375, "logps/rejected": -259.6288757324219, "loss": 0.3254, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.813506126403809, "rewards/margins": 9.208869934082031, "rewards/rejected": -18.022377014160156, "sft_loss": 1.0541073083877563, "step": 1565 }, { "epoch": 2.5805391190006572, "grad_norm": 18.71581348868284, "learning_rate": 1.3033341758839592e-08, "logits/chosen": 16.9278621673584, "logits/rejected": 17.87784767150879, "logps/chosen": -333.1341552734375, "logps/rejected": -271.1338195800781, "loss": 0.4055, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -9.0064058303833, "rewards/margins": 9.4508638381958, "rewards/rejected": -18.4572696685791, "sft_loss": 1.1667834520339966, "step": 1570 }, { "epoch": 2.5887573964497044, "grad_norm": 11.817463136679503, "learning_rate": 1.2533938302928329e-08, "logits/chosen": 17.372867584228516, "logits/rejected": 18.298500061035156, "logps/chosen": -346.6560974121094, "logps/rejected": -274.773681640625, "loss": 0.3683, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.409444808959961, "rewards/margins": 9.886656761169434, "rewards/rejected": -18.296100616455078, "sft_loss": 1.183761477470398, "step": 1575 }, { "epoch": 2.5969756738987506, "grad_norm": 11.086694788731137, "learning_rate": 1.2043785289120409e-08, "logits/chosen": 16.920242309570312, "logits/rejected": 18.36749839782715, "logps/chosen": -333.09539794921875, "logps/rejected": -272.066162109375, "loss": 0.378, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.567721366882324, "rewards/margins": 9.71126937866211, "rewards/rejected": -18.278989791870117, "sft_loss": 1.226511001586914, "step": 1580 }, { "epoch": 2.6051939513477977, "grad_norm": 19.57395022687368, "learning_rate": 1.1562923032974125e-08, "logits/chosen": 17.482685089111328, "logits/rejected": 18.186784744262695, "logps/chosen": -336.7694396972656, "logps/rejected": -273.9622497558594, "loss": 0.3656, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.473450660705566, "rewards/margins": 9.806720733642578, "rewards/rejected": -18.280170440673828, "sft_loss": 1.0997947454452515, "step": 1585 }, { "epoch": 2.613412228796844, "grad_norm": 13.846460377871546, "learning_rate": 1.1091391085874161e-08, "logits/chosen": 17.66254425048828, "logits/rejected": 17.869403839111328, "logps/chosen": -355.09124755859375, "logps/rejected": -262.7408142089844, "loss": 0.3909, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.950118064880371, "rewards/margins": 8.9635009765625, "rewards/rejected": -17.913618087768555, "sft_loss": 1.2338536977767944, "step": 1590 }, { "epoch": 2.621630506245891, "grad_norm": 18.388530342654583, "learning_rate": 1.06292282317781e-08, "logits/chosen": 18.353347778320312, "logits/rejected": 19.111572265625, "logps/chosen": -293.8038024902344, "logps/rejected": -240.29061889648438, "loss": 0.3818, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.344436645507812, "rewards/margins": 8.241558074951172, "rewards/rejected": -16.58599281311035, "sft_loss": 1.1101101636886597, "step": 1595 }, { "epoch": 2.6298487836949374, "grad_norm": 48.6421585527008, "learning_rate": 1.017647248402674e-08, "logits/chosen": 17.27472686767578, "logits/rejected": 17.775699615478516, "logps/chosen": -338.6330871582031, "logps/rejected": -265.278564453125, "loss": 0.4384, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.897347450256348, "rewards/margins": 8.987475395202637, "rewards/rejected": -17.884824752807617, "sft_loss": 1.1422169208526611, "step": 1600 }, { "epoch": 2.6380670611439845, "grad_norm": 13.863026192467665, "learning_rate": 9.733161082217223e-09, "logits/chosen": 16.872806549072266, "logits/rejected": 17.572965621948242, "logps/chosen": -321.6798095703125, "logps/rejected": -258.7831115722656, "loss": 0.4032, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.822911262512207, "rewards/margins": 8.982833862304688, "rewards/rejected": -17.805744171142578, "sft_loss": 1.125891923904419, "step": 1605 }, { "epoch": 2.6462853385930307, "grad_norm": 15.402379291218823, "learning_rate": 9.299330489140125e-09, "logits/chosen": 17.64206314086914, "logits/rejected": 18.37377166748047, "logps/chosen": -285.738037109375, "logps/rejected": -240.1550750732422, "loss": 0.4197, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.605627059936523, "rewards/margins": 8.010725021362305, "rewards/rejected": -16.616352081298828, "sft_loss": 1.0786948204040527, "step": 1610 }, { "epoch": 2.654503616042078, "grad_norm": 11.137896212671778, "learning_rate": 8.87501638778039e-09, "logits/chosen": 16.587888717651367, "logits/rejected": 17.759031295776367, "logps/chosen": -309.4990539550781, "logps/rejected": -254.31495666503906, "loss": 0.4112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.20348834991455, "rewards/margins": 8.457581520080566, "rewards/rejected": -17.66107177734375, "sft_loss": 1.10163414478302, "step": 1615 }, { "epoch": 2.662721893491124, "grad_norm": 10.357256991488983, "learning_rate": 8.460253678382296e-09, "logits/chosen": 17.529693603515625, "logits/rejected": 18.570171356201172, "logps/chosen": -337.939453125, "logps/rejected": -269.9917297363281, "loss": 0.3553, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.473341941833496, "rewards/margins": 9.756902694702148, "rewards/rejected": -18.230243682861328, "sft_loss": 1.0737409591674805, "step": 1620 }, { "epoch": 2.6709401709401708, "grad_norm": 12.585703695972256, "learning_rate": 8.055076475578918e-09, "logits/chosen": 17.500032424926758, "logits/rejected": 18.32237434387207, "logps/chosen": -326.6228942871094, "logps/rejected": -261.5873107910156, "loss": 0.3922, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.563287734985352, "rewards/margins": 9.018136024475098, "rewards/rejected": -17.581424713134766, "sft_loss": 1.1417536735534668, "step": 1625 }, { "epoch": 2.6791584483892175, "grad_norm": 12.800841299642682, "learning_rate": 7.659518105586238e-09, "logits/chosen": 16.294475555419922, "logits/rejected": 18.111600875854492, "logps/chosen": -335.0698547363281, "logps/rejected": -275.7948913574219, "loss": 0.3539, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.872127532958984, "rewards/margins": 9.738655090332031, "rewards/rejected": -18.610782623291016, "sft_loss": 1.171600341796875, "step": 1630 }, { "epoch": 2.687376725838264, "grad_norm": 11.437442046862925, "learning_rate": 7.273611103461836e-09, "logits/chosen": 17.347509384155273, "logits/rejected": 18.348569869995117, "logps/chosen": -303.4100341796875, "logps/rejected": -250.9491729736328, "loss": 0.3316, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.186358451843262, "rewards/margins": 8.977690696716309, "rewards/rejected": -17.164051055908203, "sft_loss": 1.1324518918991089, "step": 1635 }, { "epoch": 2.695595003287311, "grad_norm": 12.932581100678355, "learning_rate": 6.897387210429067e-09, "logits/chosen": 17.321182250976562, "logits/rejected": 18.35422134399414, "logps/chosen": -298.5028381347656, "logps/rejected": -246.92356872558594, "loss": 0.4056, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.715899467468262, "rewards/margins": 8.367709159851074, "rewards/rejected": -17.08361053466797, "sft_loss": 1.143718957901001, "step": 1640 }, { "epoch": 2.7038132807363575, "grad_norm": 14.723722025410018, "learning_rate": 6.530877371266175e-09, "logits/chosen": 16.489261627197266, "logits/rejected": 17.733213424682617, "logps/chosen": -305.61749267578125, "logps/rejected": -256.1786804199219, "loss": 0.3542, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.568675994873047, "rewards/margins": 9.070348739624023, "rewards/rejected": -17.639026641845703, "sft_loss": 1.176300048828125, "step": 1645 }, { "epoch": 2.712031558185404, "grad_norm": 22.923491412294727, "learning_rate": 6.1741117317611196e-09, "logits/chosen": 17.291810989379883, "logits/rejected": 18.644412994384766, "logps/chosen": -321.612060546875, "logps/rejected": -269.1338195800781, "loss": 0.4291, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -9.136231422424316, "rewards/margins": 9.433501243591309, "rewards/rejected": -18.569734573364258, "sft_loss": 1.2353969812393188, "step": 1650 }, { "epoch": 2.720249835634451, "grad_norm": 10.617946186080342, "learning_rate": 5.827119636232017e-09, "logits/chosen": 17.4252872467041, "logits/rejected": 18.208906173706055, "logps/chosen": -308.66943359375, "logps/rejected": -251.34764099121094, "loss": 0.4103, "rewards/accuracies": 0.9300000071525574, "rewards/chosen": -8.655915260314941, "rewards/margins": 8.82339096069336, "rewards/rejected": -17.479307174682617, "sft_loss": 1.2225102186203003, "step": 1655 }, { "epoch": 2.7284681130834976, "grad_norm": 9.888027224233095, "learning_rate": 5.489929625113549e-09, "logits/chosen": 16.691282272338867, "logits/rejected": 17.671295166015625, "logps/chosen": -328.8042297363281, "logps/rejected": -267.9706726074219, "loss": 0.4266, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.37188720703125, "rewards/margins": 9.643902778625488, "rewards/rejected": -18.015790939331055, "sft_loss": 1.2559726238250732, "step": 1660 }, { "epoch": 2.7366863905325443, "grad_norm": 20.805457290074077, "learning_rate": 5.1625694326095506e-09, "logits/chosen": 16.405752182006836, "logits/rejected": 17.14948081970215, "logps/chosen": -341.1684875488281, "logps/rejected": -271.09710693359375, "loss": 0.3332, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.78724193572998, "rewards/margins": 9.57591724395752, "rewards/rejected": -18.363157272338867, "sft_loss": 1.0471839904785156, "step": 1665 }, { "epoch": 2.744904667981591, "grad_norm": 13.812771677348046, "learning_rate": 4.845065984411742e-09, "logits/chosen": 16.383556365966797, "logits/rejected": 17.95462989807129, "logps/chosen": -331.20526123046875, "logps/rejected": -279.220458984375, "loss": 0.357, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -9.316568374633789, "rewards/margins": 9.552423477172852, "rewards/rejected": -18.86899185180664, "sft_loss": 1.163619875907898, "step": 1670 }, { "epoch": 2.7531229454306376, "grad_norm": 14.688896292238876, "learning_rate": 4.5374453954851035e-09, "logits/chosen": 18.362672805786133, "logits/rejected": 19.01654815673828, "logps/chosen": -307.0843200683594, "logps/rejected": -246.02671813964844, "loss": 0.3677, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.525540351867676, "rewards/margins": 8.622610092163086, "rewards/rejected": -17.148151397705078, "sft_loss": 1.1697852611541748, "step": 1675 }, { "epoch": 2.7613412228796843, "grad_norm": 15.151111907515142, "learning_rate": 4.239732967919976e-09, "logits/chosen": 18.35997772216797, "logits/rejected": 18.519113540649414, "logps/chosen": -283.6457214355469, "logps/rejected": -239.9178466796875, "loss": 0.3946, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.615645408630371, "rewards/margins": 8.21639633178711, "rewards/rejected": -16.832042694091797, "sft_loss": 1.185640573501587, "step": 1680 }, { "epoch": 2.769559500328731, "grad_norm": 8.993747704826987, "learning_rate": 3.951953188850762e-09, "logits/chosen": 15.838356018066406, "logits/rejected": 17.58329963684082, "logps/chosen": -300.3641662597656, "logps/rejected": -254.8916473388672, "loss": 0.3834, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.536845207214355, "rewards/margins": 8.757308006286621, "rewards/rejected": -17.294153213500977, "sft_loss": 1.1382744312286377, "step": 1685 }, { "epoch": 2.7777777777777777, "grad_norm": 13.655366152597214, "learning_rate": 3.674129728442013e-09, "logits/chosen": 17.68130874633789, "logits/rejected": 19.080127716064453, "logps/chosen": -268.5239562988281, "logps/rejected": -229.50523376464844, "loss": 0.3877, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.217788696289062, "rewards/margins": 7.664586067199707, "rewards/rejected": -15.882373809814453, "sft_loss": 1.0555132627487183, "step": 1690 }, { "epoch": 2.7859960552268244, "grad_norm": 11.562075341982874, "learning_rate": 3.4062854379414694e-09, "logits/chosen": 17.3222599029541, "logits/rejected": 18.08160972595215, "logps/chosen": -306.2829284667969, "logps/rejected": -251.935546875, "loss": 0.3607, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.6878023147583, "rewards/margins": 8.51749038696289, "rewards/rejected": -17.205289840698242, "sft_loss": 1.1840558052062988, "step": 1695 }, { "epoch": 2.794214332675871, "grad_norm": 25.80729521542422, "learning_rate": 3.1484423478004563e-09, "logits/chosen": 17.99493408203125, "logits/rejected": 18.518619537353516, "logps/chosen": -289.563232421875, "logps/rejected": -243.09219360351562, "loss": 0.4297, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.428235054016113, "rewards/margins": 8.079200744628906, "rewards/rejected": -16.50743865966797, "sft_loss": 1.108068585395813, "step": 1700 }, { "epoch": 2.8024326101249177, "grad_norm": 13.892592196473423, "learning_rate": 2.9006216658619687e-09, "logits/chosen": 16.929012298583984, "logits/rejected": 17.672870635986328, "logps/chosen": -313.2660217285156, "logps/rejected": -257.04034423828125, "loss": 0.399, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.524652481079102, "rewards/margins": 9.001575469970703, "rewards/rejected": -17.526227951049805, "sft_loss": 1.1432716846466064, "step": 1705 }, { "epoch": 2.8106508875739644, "grad_norm": 9.790469642612795, "learning_rate": 2.6628437756162635e-09, "logits/chosen": 17.310102462768555, "logits/rejected": 18.04708480834961, "logps/chosen": -293.9396057128906, "logps/rejected": -240.7176971435547, "loss": 0.3473, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.278569221496582, "rewards/margins": 8.390737533569336, "rewards/rejected": -16.669307708740234, "sft_loss": 1.1700962781906128, "step": 1710 }, { "epoch": 2.818869165023011, "grad_norm": 9.444898259948333, "learning_rate": 2.435128234524228e-09, "logits/chosen": 17.586627960205078, "logits/rejected": 18.176280975341797, "logps/chosen": -299.7925109863281, "logps/rejected": -243.57485961914062, "loss": 0.4067, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -8.356633186340332, "rewards/margins": 8.317458152770996, "rewards/rejected": -16.674091339111328, "sft_loss": 1.1623938083648682, "step": 1715 }, { "epoch": 2.827087442472058, "grad_norm": 11.302990178309454, "learning_rate": 2.2174937724088877e-09, "logits/chosen": 17.02381134033203, "logits/rejected": 18.46286392211914, "logps/chosen": -314.8418273925781, "logps/rejected": -259.57745361328125, "loss": 0.4069, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": -8.880843162536621, "rewards/margins": 8.848891258239746, "rewards/rejected": -17.729736328125, "sft_loss": 1.1079494953155518, "step": 1720 }, { "epoch": 2.8353057199211045, "grad_norm": 9.711126487613186, "learning_rate": 2.009958289914765e-09, "logits/chosen": 17.012800216674805, "logits/rejected": 18.349876403808594, "logps/chosen": -321.7917175292969, "logps/rejected": -270.1522216796875, "loss": 0.3451, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.694296836853027, "rewards/margins": 9.856916427612305, "rewards/rejected": -18.551212310791016, "sft_loss": 1.0486385822296143, "step": 1725 }, { "epoch": 2.843523997370151, "grad_norm": 22.398743525886992, "learning_rate": 1.8125388570355422e-09, "logits/chosen": 16.76806640625, "logits/rejected": 17.946535110473633, "logps/chosen": -312.1168212890625, "logps/rejected": -266.208984375, "loss": 0.3337, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.700928688049316, "rewards/margins": 9.464086532592773, "rewards/rejected": -18.165014266967773, "sft_loss": 1.162864327430725, "step": 1730 }, { "epoch": 2.851742274819198, "grad_norm": 14.549076580676688, "learning_rate": 1.6252517117101017e-09, "logits/chosen": 16.1746768951416, "logits/rejected": 17.028032302856445, "logps/chosen": -316.6230773925781, "logps/rejected": -258.9454345703125, "loss": 0.4137, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.804771423339844, "rewards/margins": 8.894736289978027, "rewards/rejected": -17.699508666992188, "sft_loss": 1.2625643014907837, "step": 1735 }, { "epoch": 2.8599605522682445, "grad_norm": 22.305121337267558, "learning_rate": 1.4481122584868582e-09, "logits/chosen": 16.654598236083984, "logits/rejected": 17.727828979492188, "logps/chosen": -327.3823547363281, "logps/rejected": -264.8335876464844, "loss": 0.4201, "rewards/accuracies": 1.0, "rewards/chosen": -9.304695129394531, "rewards/margins": 8.695550918579102, "rewards/rejected": -18.000246047973633, "sft_loss": 1.1684330701828003, "step": 1740 }, { "epoch": 2.868178829717291, "grad_norm": 12.445565014042396, "learning_rate": 1.2811350672568138e-09, "logits/chosen": 16.678804397583008, "logits/rejected": 18.215984344482422, "logps/chosen": -340.2626953125, "logps/rejected": -277.87872314453125, "loss": 0.4267, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.802419662475586, "rewards/margins": 9.925808906555176, "rewards/rejected": -18.728229522705078, "sft_loss": 1.1311696767807007, "step": 1745 }, { "epoch": 2.876397107166338, "grad_norm": 16.390316522695066, "learning_rate": 1.1243338720550445e-09, "logits/chosen": 16.955345153808594, "logits/rejected": 18.02084732055664, "logps/chosen": -291.6322937011719, "logps/rejected": -249.6865234375, "loss": 0.4018, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.541548728942871, "rewards/margins": 8.891424179077148, "rewards/rejected": -17.432973861694336, "sft_loss": 1.122809648513794, "step": 1750 }, { "epoch": 2.8846153846153846, "grad_norm": 12.452008236969373, "learning_rate": 9.777215699311725e-10, "logits/chosen": 17.285600662231445, "logits/rejected": 18.065244674682617, "logps/chosen": -304.11834716796875, "logps/rejected": -252.4257049560547, "loss": 0.3855, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.604863166809082, "rewards/margins": 8.664654731750488, "rewards/rejected": -17.269519805908203, "sft_loss": 1.2225173711776733, "step": 1755 }, { "epoch": 2.8928336620644313, "grad_norm": 26.02343316648693, "learning_rate": 8.413102198885358e-10, "logits/chosen": 15.385034561157227, "logits/rejected": 16.86432456970215, "logps/chosen": -327.46160888671875, "logps/rejected": -264.8345031738281, "loss": 0.4478, "rewards/accuracies": 1.0, "rewards/chosen": -9.194596290588379, "rewards/margins": 8.853148460388184, "rewards/rejected": -18.047740936279297, "sft_loss": 1.0643724203109741, "step": 1760 }, { "epoch": 2.901051939513478, "grad_norm": 9.970098814112205, "learning_rate": 7.151110418923134e-10, "logits/chosen": 18.434673309326172, "logits/rejected": 18.675090789794922, "logps/chosen": -302.91534423828125, "logps/rejected": -248.91583251953125, "loss": 0.3988, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -8.768433570861816, "rewards/margins": 8.766546249389648, "rewards/rejected": -17.53498077392578, "sft_loss": 1.1868294477462769, "step": 1765 }, { "epoch": 2.9092702169625246, "grad_norm": 18.45143826968204, "learning_rate": 5.991344159466672e-10, "logits/chosen": 16.24605941772461, "logits/rejected": 17.377365112304688, "logps/chosen": -318.8271789550781, "logps/rejected": -257.1405334472656, "loss": 0.352, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.925313949584961, "rewards/margins": 8.410311698913574, "rewards/rejected": -17.335628509521484, "sft_loss": 1.1228437423706055, "step": 1770 }, { "epoch": 2.9174884944115713, "grad_norm": 12.923252042791281, "learning_rate": 4.933898812409937e-10, "logits/chosen": 16.73847198486328, "logits/rejected": 17.230134963989258, "logps/chosen": -338.15118408203125, "logps/rejected": -271.0611267089844, "loss": 0.3936, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -9.159259796142578, "rewards/margins": 9.365001678466797, "rewards/rejected": -18.524259567260742, "sft_loss": 1.1974759101867676, "step": 1775 }, { "epoch": 2.925706771860618, "grad_norm": 18.59092085629164, "learning_rate": 3.978861353653301e-10, "logits/chosen": 17.0466251373291, "logits/rejected": 17.81385612487793, "logps/chosen": -301.74603271484375, "logps/rejected": -247.91571044921875, "loss": 0.4187, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.685689926147461, "rewards/margins": 8.343822479248047, "rewards/rejected": -17.02951431274414, "sft_loss": 1.086068034172058, "step": 1780 }, { "epoch": 2.9339250493096647, "grad_norm": 16.48047435187235, "learning_rate": 3.1263103359494005e-10, "logits/chosen": 17.160581588745117, "logits/rejected": 18.65143585205078, "logps/chosen": -300.696533203125, "logps/rejected": -245.1064453125, "loss": 0.3632, "rewards/accuracies": 0.9399999976158142, "rewards/chosen": -7.98746395111084, "rewards/margins": 8.9141206741333, "rewards/rejected": -16.90158462524414, "sft_loss": 1.4039214849472046, "step": 1785 }, { "epoch": 2.9421433267587114, "grad_norm": 14.541496183664128, "learning_rate": 2.3763158824419147e-10, "logits/chosen": 16.89483642578125, "logits/rejected": 17.82222557067871, "logps/chosen": -328.6429748535156, "logps/rejected": -268.05938720703125, "loss": 0.3455, "rewards/accuracies": 1.0, "rewards/chosen": -8.467479705810547, "rewards/margins": 9.740607261657715, "rewards/rejected": -18.208087921142578, "sft_loss": 1.1161048412322998, "step": 1790 }, { "epoch": 2.950361604207758, "grad_norm": 14.008459290888235, "learning_rate": 1.728939680898517e-10, "logits/chosen": 16.10931396484375, "logits/rejected": 17.673229217529297, "logps/chosen": -308.1393737792969, "logps/rejected": -262.28009033203125, "loss": 0.367, "rewards/accuracies": 0.9900000095367432, "rewards/chosen": -9.119178771972656, "rewards/margins": 8.93433666229248, "rewards/rejected": -18.05351448059082, "sft_loss": 1.1963419914245605, "step": 1795 }, { "epoch": 2.9585798816568047, "grad_norm": 16.666553214725845, "learning_rate": 1.184234978636456e-10, "logits/chosen": 16.49167823791504, "logits/rejected": 17.332914352416992, "logps/chosen": -282.5769348144531, "logps/rejected": -248.25242614746094, "loss": 0.3921, "rewards/accuracies": 0.9599999785423279, "rewards/chosen": -8.752376556396484, "rewards/margins": 8.20280933380127, "rewards/rejected": -16.955184936523438, "sft_loss": 1.2729109525680542, "step": 1800 }, { "epoch": 2.9667981591058514, "grad_norm": 15.626709598718353, "learning_rate": 7.422465781431464e-11, "logits/chosen": 16.95427894592285, "logits/rejected": 17.818552017211914, "logps/chosen": -329.6918640136719, "logps/rejected": -266.5020446777344, "loss": 0.3843, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.854848861694336, "rewards/margins": 9.140162467956543, "rewards/rejected": -17.995010375976562, "sft_loss": 1.1496516466140747, "step": 1805 }, { "epoch": 2.975016436554898, "grad_norm": 12.600231440275685, "learning_rate": 4.030108333910598e-11, "logits/chosen": 17.70891571044922, "logits/rejected": 18.366714477539062, "logps/chosen": -295.2488708496094, "logps/rejected": -242.97634887695312, "loss": 0.3819, "rewards/accuracies": 0.9700000286102295, "rewards/chosen": -8.695423126220703, "rewards/margins": 8.07010269165039, "rewards/rejected": -16.765525817871094, "sft_loss": 1.1890416145324707, "step": 1810 }, { "epoch": 2.983234714003945, "grad_norm": 14.846673339349834, "learning_rate": 1.6655564684747713e-11, "logits/chosen": 17.073108673095703, "logits/rejected": 17.751785278320312, "logps/chosen": -334.0798034667969, "logps/rejected": -261.91644287109375, "loss": 0.372, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.156329154968262, "rewards/margins": 8.551715850830078, "rewards/rejected": -17.708045959472656, "sft_loss": 1.1268292665481567, "step": 1815 }, { "epoch": 2.9914529914529915, "grad_norm": 9.077728581968316, "learning_rate": 3.290046717979722e-12, "logits/chosen": 16.00580406188965, "logits/rejected": 16.491676330566406, "logps/chosen": -320.336181640625, "logps/rejected": -255.6234588623047, "loss": 0.3871, "rewards/accuracies": 0.9800000190734863, "rewards/chosen": -7.969948768615723, "rewards/margins": 9.396775245666504, "rewards/rejected": -17.366724014282227, "sft_loss": 1.16538667678833, "step": 1820 }, { "epoch": 2.998027613412229, "step": 1824, "total_flos": 287426369617920.0, "train_loss": 0.5032803327368017, "train_runtime": 76434.0426, "train_samples_per_second": 1.433, "train_steps_per_second": 0.024 } ], "logging_steps": 5, "max_steps": 1824, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 287426369617920.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }