{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999018549416037, "eval_steps": 100, "global_step": 7641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013086007786174632, "grad_norm": 3.453125, "learning_rate": 6.535947712418301e-09, "logits/chosen": -3.133624315261841, "logits/rejected": -2.8881046772003174, "logps/chosen": -299.845458984375, "logps/rejected": -459.1829528808594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0013086007786174633, "grad_norm": 3.46875, "learning_rate": 6.535947712418302e-08, "logits/chosen": -2.74552845954895, "logits/rejected": -2.4910266399383545, "logps/chosen": -268.30487060546875, "logps/rejected": -254.20309448242188, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.0011577722616493702, "rewards/margins": 0.0008231456158682704, "rewards/rejected": 0.0003346265875734389, "step": 10 }, { "epoch": 0.0026172015572349266, "grad_norm": 3.34375, "learning_rate": 1.3071895424836603e-07, "logits/chosen": -2.8142099380493164, "logits/rejected": -2.703378200531006, "logps/chosen": -291.9176940917969, "logps/rejected": -261.33306884765625, "loss": 0.6931, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.645151683595031e-05, "rewards/margins": -0.001433581579476595, "rewards/rejected": 0.0013471299316734076, "step": 20 }, { "epoch": 0.00392580233585239, "grad_norm": 3.125, "learning_rate": 1.9607843137254904e-07, "logits/chosen": -2.830498456954956, "logits/rejected": -2.6882920265197754, "logps/chosen": -320.10040283203125, "logps/rejected": -255.1343994140625, "loss": 0.6922, "rewards/accuracies": 0.75, "rewards/chosen": 0.004387141205370426, "rewards/margins": 0.0030704389791935682, "rewards/rejected": 0.0013167023425921798, "step": 30 }, { "epoch": 0.005234403114469853, "grad_norm": 2.703125, "learning_rate": 2.6143790849673207e-07, "logits/chosen": -2.7558236122131348, "logits/rejected": -2.586261034011841, "logps/chosen": -299.48822021484375, "logps/rejected": -238.7899932861328, "loss": 0.6917, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.010473296977579594, "rewards/margins": 0.0032569519244134426, "rewards/rejected": 0.007216344587504864, "step": 40 }, { "epoch": 0.006543003893087316, "grad_norm": 4.3125, "learning_rate": 3.267973856209151e-07, "logits/chosen": -2.718708038330078, "logits/rejected": -2.715043306350708, "logps/chosen": -217.4893798828125, "logps/rejected": -234.1689453125, "loss": 0.6924, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.012741605751216412, "rewards/margins": 0.0002024418645305559, "rewards/rejected": 0.01253916323184967, "step": 50 }, { "epoch": 0.00785160467170478, "grad_norm": 3.546875, "learning_rate": 3.921568627450981e-07, "logits/chosen": -2.57351016998291, "logits/rejected": -2.622565746307373, "logps/chosen": -215.79931640625, "logps/rejected": -237.44970703125, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": 0.023167148232460022, "rewards/margins": 0.002548126969486475, "rewards/rejected": 0.020619021728634834, "step": 60 }, { "epoch": 0.009160205450322242, "grad_norm": 3.3125, "learning_rate": 4.5751633986928105e-07, "logits/chosen": -2.663116455078125, "logits/rejected": -2.5983188152313232, "logps/chosen": -237.55844116210938, "logps/rejected": -236.5060272216797, "loss": 0.6923, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.02354578860104084, "rewards/margins": 0.00010853088315343484, "rewards/rejected": 0.0234372578561306, "step": 70 }, { "epoch": 0.010468806228939706, "grad_norm": 2.390625, "learning_rate": 5.228758169934641e-07, "logits/chosen": -2.7669997215270996, "logits/rejected": -2.6788549423217773, "logps/chosen": -164.87515258789062, "logps/rejected": -197.90463256835938, "loss": 0.6904, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02290114387869835, "rewards/margins": 0.0025646351277828217, "rewards/rejected": 0.020336508750915527, "step": 80 }, { "epoch": 0.01177740700755717, "grad_norm": 2.609375, "learning_rate": 5.882352941176471e-07, "logits/chosen": -2.851588487625122, "logits/rejected": -2.749753475189209, "logps/chosen": -206.6322479248047, "logps/rejected": -206.440673828125, "loss": 0.6905, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0305376797914505, "rewards/margins": 0.005634760018438101, "rewards/rejected": 0.024902915582060814, "step": 90 }, { "epoch": 0.013086007786174633, "grad_norm": 2.53125, "learning_rate": 6.535947712418302e-07, "logits/chosen": -2.694608449935913, "logits/rejected": -2.6583847999572754, "logps/chosen": -256.8502197265625, "logps/rejected": -230.1700897216797, "loss": 0.689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03671552985906601, "rewards/margins": 0.004254254046827555, "rewards/rejected": 0.03246127441525459, "step": 100 }, { "epoch": 0.013086007786174633, "eval_logits/chosen": -2.162418842315674, "eval_logits/rejected": -2.064990520477295, "eval_logps/chosen": -261.29095458984375, "eval_logps/rejected": -238.280517578125, "eval_loss": 0.6888117790222168, "eval_rewards/accuracies": 0.6460000276565552, "eval_rewards/chosen": 0.03953809291124344, "eval_rewards/margins": 0.010370594449341297, "eval_rewards/rejected": 0.029167499393224716, "eval_runtime": 846.0459, "eval_samples_per_second": 2.364, "eval_steps_per_second": 0.148, "step": 100 }, { "epoch": 0.014394608564792096, "grad_norm": 3.0625, "learning_rate": 7.189542483660131e-07, "logits/chosen": -2.7804317474365234, "logits/rejected": -2.6632676124572754, "logps/chosen": -244.56723022460938, "logps/rejected": -228.0399932861328, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03750133141875267, "rewards/margins": 0.013126986101269722, "rewards/rejected": 0.024374349042773247, "step": 110 }, { "epoch": 0.01570320934340956, "grad_norm": 2.84375, "learning_rate": 7.843137254901962e-07, "logits/chosen": -2.8821868896484375, "logits/rejected": -2.673898458480835, "logps/chosen": -281.7503356933594, "logps/rejected": -244.0198211669922, "loss": 0.6891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03712741285562515, "rewards/margins": 0.006825722754001617, "rewards/rejected": 0.030301684513688087, "step": 120 }, { "epoch": 0.017011810122027023, "grad_norm": 3.828125, "learning_rate": 8.496732026143792e-07, "logits/chosen": -2.750903606414795, "logits/rejected": -2.7690634727478027, "logps/chosen": -243.74404907226562, "logps/rejected": -245.4093780517578, "loss": 0.6845, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.044052399694919586, "rewards/margins": 0.007450439967215061, "rewards/rejected": 0.0366019569337368, "step": 130 }, { "epoch": 0.018320410900644485, "grad_norm": 2.71875, "learning_rate": 9.150326797385621e-07, "logits/chosen": -2.9141838550567627, "logits/rejected": -2.641928195953369, "logps/chosen": -291.4339904785156, "logps/rejected": -274.22845458984375, "loss": 0.6878, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04081139713525772, "rewards/margins": 0.014349179342389107, "rewards/rejected": 0.026462215930223465, "step": 140 }, { "epoch": 0.01962901167926195, "grad_norm": 3.671875, "learning_rate": 9.80392156862745e-07, "logits/chosen": -2.7080326080322266, "logits/rejected": -2.4110770225524902, "logps/chosen": -259.6366882324219, "logps/rejected": -225.42214965820312, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.040627725422382355, "rewards/margins": 0.017721187323331833, "rewards/rejected": 0.02290654182434082, "step": 150 }, { "epoch": 0.020937612457879413, "grad_norm": 2.890625, "learning_rate": 1.0457516339869283e-06, "logits/chosen": -2.4773736000061035, "logits/rejected": -2.5386953353881836, "logps/chosen": -185.31134033203125, "logps/rejected": -224.26791381835938, "loss": 0.684, "rewards/accuracies": 0.625, "rewards/chosen": 0.03422543406486511, "rewards/margins": 0.012216294184327126, "rewards/rejected": 0.022009138017892838, "step": 160 }, { "epoch": 0.022246213236496875, "grad_norm": 2.859375, "learning_rate": 1.111111111111111e-06, "logits/chosen": -2.6746952533721924, "logits/rejected": -2.5606703758239746, "logps/chosen": -193.47288513183594, "logps/rejected": -210.39797973632812, "loss": 0.6827, "rewards/accuracies": 0.75, "rewards/chosen": 0.036096397787332535, "rewards/margins": 0.023590046912431717, "rewards/rejected": 0.012506348080933094, "step": 170 }, { "epoch": 0.02355481401511434, "grad_norm": 3.0, "learning_rate": 1.1764705882352942e-06, "logits/chosen": -2.7542338371276855, "logits/rejected": -2.7979209423065186, "logps/chosen": -282.67144775390625, "logps/rejected": -264.19122314453125, "loss": 0.6794, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.05426037311553955, "rewards/margins": 0.024076396599411964, "rewards/rejected": 0.030183976516127586, "step": 180 }, { "epoch": 0.024863414793731803, "grad_norm": 3.328125, "learning_rate": 1.2418300653594772e-06, "logits/chosen": -2.851957321166992, "logits/rejected": -2.6551170349121094, "logps/chosen": -246.5672149658203, "logps/rejected": -223.18069458007812, "loss": 0.674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05785245820879936, "rewards/margins": 0.04134316369891167, "rewards/rejected": 0.016509290784597397, "step": 190 }, { "epoch": 0.026172015572349265, "grad_norm": 3.015625, "learning_rate": 1.3071895424836604e-06, "logits/chosen": -2.586324691772461, "logits/rejected": -2.7165029048919678, "logps/chosen": -236.83724975585938, "logps/rejected": -226.9313507080078, "loss": 0.6814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.044018808752298355, "rewards/margins": 0.03245474025607109, "rewards/rejected": 0.011564062908291817, "step": 200 }, { "epoch": 0.026172015572349265, "eval_logits/chosen": -2.1610488891601562, "eval_logits/rejected": -2.0644867420196533, "eval_logps/chosen": -259.57257080078125, "eval_logps/rejected": -239.42713928222656, "eval_loss": 0.6774077415466309, "eval_rewards/accuracies": 0.6729999780654907, "eval_rewards/chosen": 0.05672186240553856, "eval_rewards/margins": 0.03902040794491768, "eval_rewards/rejected": 0.01770145259797573, "eval_runtime": 694.2809, "eval_samples_per_second": 2.881, "eval_steps_per_second": 0.18, "step": 200 }, { "epoch": 0.027480616350966727, "grad_norm": 3.78125, "learning_rate": 1.3725490196078434e-06, "logits/chosen": -2.7429161071777344, "logits/rejected": -2.4695589542388916, "logps/chosen": -291.7789001464844, "logps/rejected": -242.9710235595703, "loss": 0.6717, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05582891032099724, "rewards/margins": 0.046665292233228683, "rewards/rejected": 0.009163620881736279, "step": 210 }, { "epoch": 0.028789217129584193, "grad_norm": 3.0, "learning_rate": 1.4379084967320261e-06, "logits/chosen": -2.8057966232299805, "logits/rejected": -2.7114574909210205, "logps/chosen": -259.743896484375, "logps/rejected": -226.40823364257812, "loss": 0.6737, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03897574543952942, "rewards/margins": 0.05734593793749809, "rewards/rejected": -0.018370196223258972, "step": 220 }, { "epoch": 0.030097817908201655, "grad_norm": 3.453125, "learning_rate": 1.5032679738562091e-06, "logits/chosen": -2.7702572345733643, "logits/rejected": -2.7278804779052734, "logps/chosen": -210.2172393798828, "logps/rejected": -222.6385498046875, "loss": 0.668, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03193821758031845, "rewards/margins": 0.04395350068807602, "rewards/rejected": -0.012015283107757568, "step": 230 }, { "epoch": 0.03140641868681912, "grad_norm": 3.0625, "learning_rate": 1.5686274509803923e-06, "logits/chosen": -2.794287919998169, "logits/rejected": -2.6962103843688965, "logps/chosen": -308.37677001953125, "logps/rejected": -265.5712585449219, "loss": 0.6582, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.04428742453455925, "rewards/margins": 0.04681792110204697, "rewards/rejected": -0.002530490979552269, "step": 240 }, { "epoch": 0.03271501946543658, "grad_norm": 3.421875, "learning_rate": 1.6339869281045753e-06, "logits/chosen": -2.7941267490386963, "logits/rejected": -2.683361530303955, "logps/chosen": -236.01657104492188, "logps/rejected": -279.40032958984375, "loss": 0.659, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.051866620779037476, "rewards/margins": 0.06532473862171173, "rewards/rejected": -0.013458112254738808, "step": 250 }, { "epoch": 0.034023620244054045, "grad_norm": 4.21875, "learning_rate": 1.6993464052287585e-06, "logits/chosen": -2.714996814727783, "logits/rejected": -2.521692991256714, "logps/chosen": -250.5869598388672, "logps/rejected": -238.06216430664062, "loss": 0.652, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.03312501311302185, "rewards/margins": 0.11112229526042938, "rewards/rejected": -0.07799728214740753, "step": 260 }, { "epoch": 0.03533222102267151, "grad_norm": 3.453125, "learning_rate": 1.7647058823529414e-06, "logits/chosen": -2.701773166656494, "logits/rejected": -2.495032787322998, "logps/chosen": -276.38482666015625, "logps/rejected": -252.5791473388672, "loss": 0.6626, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.03999749571084976, "rewards/margins": 0.05221300199627876, "rewards/rejected": -0.09221049398183823, "step": 270 }, { "epoch": 0.03664082180128897, "grad_norm": 4.59375, "learning_rate": 1.8300653594771242e-06, "logits/chosen": -2.8904521465301514, "logits/rejected": -2.472062110900879, "logps/chosen": -301.0820007324219, "logps/rejected": -227.261474609375, "loss": 0.6562, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.020010793581604958, "rewards/margins": 0.09073401987552643, "rewards/rejected": -0.07072322815656662, "step": 280 }, { "epoch": 0.03794942257990643, "grad_norm": 5.3125, "learning_rate": 1.8954248366013072e-06, "logits/chosen": -2.9029669761657715, "logits/rejected": -2.5922694206237793, "logps/chosen": -293.40606689453125, "logps/rejected": -230.68606567382812, "loss": 0.6707, "rewards/accuracies": 0.5, "rewards/chosen": -0.0753670334815979, "rewards/margins": 0.019820818677544594, "rewards/rejected": -0.09518785774707794, "step": 290 }, { "epoch": 0.0392580233585239, "grad_norm": 4.0, "learning_rate": 1.96078431372549e-06, "logits/chosen": -2.7734737396240234, "logits/rejected": -2.6455962657928467, "logps/chosen": -253.349365234375, "logps/rejected": -248.7515411376953, "loss": 0.6494, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.12462016195058823, "rewards/margins": 0.09560440480709076, "rewards/rejected": -0.2202245444059372, "step": 300 }, { "epoch": 0.0392580233585239, "eval_logits/chosen": -2.134643077850342, "eval_logits/rejected": -2.039261817932129, "eval_logps/chosen": -276.6416320800781, "eval_logps/rejected": -262.8892517089844, "eval_loss": 0.6550637483596802, "eval_rewards/accuracies": 0.6899999976158142, "eval_rewards/chosen": -0.11396902054548264, "eval_rewards/margins": 0.10295076668262482, "eval_rewards/rejected": -0.21691979467868805, "eval_runtime": 693.9672, "eval_samples_per_second": 2.882, "eval_steps_per_second": 0.18, "step": 300 }, { "epoch": 0.04056662413714136, "grad_norm": 4.46875, "learning_rate": 2.0261437908496734e-06, "logits/chosen": -2.7108452320098877, "logits/rejected": -2.684077024459839, "logps/chosen": -277.8359375, "logps/rejected": -296.11920166015625, "loss": 0.6667, "rewards/accuracies": 0.625, "rewards/chosen": -0.11195526272058487, "rewards/margins": 0.07162605971097946, "rewards/rejected": -0.18358133733272552, "step": 310 }, { "epoch": 0.041875224915758825, "grad_norm": 4.8125, "learning_rate": 2.0915032679738565e-06, "logits/chosen": -2.7928097248077393, "logits/rejected": -2.449176788330078, "logps/chosen": -235.67855834960938, "logps/rejected": -216.2328643798828, "loss": 0.6528, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.049292467534542084, "rewards/margins": 0.0989774763584137, "rewards/rejected": -0.14826996624469757, "step": 320 }, { "epoch": 0.04318382569437629, "grad_norm": 4.0, "learning_rate": 2.1568627450980393e-06, "logits/chosen": -2.60710072517395, "logits/rejected": -2.503129720687866, "logps/chosen": -187.3062286376953, "logps/rejected": -173.73367309570312, "loss": 0.6787, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.15142996609210968, "rewards/margins": 0.02292514406144619, "rewards/rejected": -0.17435510456562042, "step": 330 }, { "epoch": 0.04449242647299375, "grad_norm": 5.5625, "learning_rate": 2.222222222222222e-06, "logits/chosen": -2.7713229656219482, "logits/rejected": -2.4837942123413086, "logps/chosen": -262.5021057128906, "logps/rejected": -260.2453918457031, "loss": 0.6506, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18623539805412292, "rewards/margins": 0.0945606678724289, "rewards/rejected": -0.2807961106300354, "step": 340 }, { "epoch": 0.04580102725161121, "grad_norm": 6.0, "learning_rate": 2.2875816993464053e-06, "logits/chosen": -2.528223991394043, "logits/rejected": -2.587984800338745, "logps/chosen": -222.79415893554688, "logps/rejected": -249.3542938232422, "loss": 0.6454, "rewards/accuracies": 0.625, "rewards/chosen": -0.1431507170200348, "rewards/margins": 0.0797526091337204, "rewards/rejected": -0.22290334105491638, "step": 350 }, { "epoch": 0.04710962803022868, "grad_norm": 3.171875, "learning_rate": 2.3529411764705885e-06, "logits/chosen": -2.748408079147339, "logits/rejected": -2.600682497024536, "logps/chosen": -254.2593994140625, "logps/rejected": -279.53094482421875, "loss": 0.6536, "rewards/accuracies": 0.625, "rewards/chosen": -0.08743225038051605, "rewards/margins": 0.09762908518314362, "rewards/rejected": -0.18506133556365967, "step": 360 }, { "epoch": 0.048418228808846143, "grad_norm": 7.21875, "learning_rate": 2.4183006535947716e-06, "logits/chosen": -2.843364715576172, "logits/rejected": -2.6083765029907227, "logps/chosen": -300.34149169921875, "logps/rejected": -291.03045654296875, "loss": 0.6476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.04248816519975662, "rewards/margins": 0.0738508552312851, "rewards/rejected": -0.11633902788162231, "step": 370 }, { "epoch": 0.049726829587463606, "grad_norm": 9.875, "learning_rate": 2.4836601307189544e-06, "logits/chosen": -2.674694538116455, "logits/rejected": -2.6630618572235107, "logps/chosen": -261.94354248046875, "logps/rejected": -286.7895202636719, "loss": 0.62, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08060801774263382, "rewards/margins": 0.15736189484596252, "rewards/rejected": -0.23796990513801575, "step": 380 }, { "epoch": 0.05103543036608107, "grad_norm": 7.3125, "learning_rate": 2.549019607843137e-06, "logits/chosen": -2.7141754627227783, "logits/rejected": -2.576420307159424, "logps/chosen": -253.9996795654297, "logps/rejected": -259.6171875, "loss": 0.6314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.19870714843273163, "rewards/margins": 0.16023008525371552, "rewards/rejected": -0.35893720388412476, "step": 390 }, { "epoch": 0.05234403114469853, "grad_norm": 7.1875, "learning_rate": 2.6143790849673208e-06, "logits/chosen": -2.533029794692993, "logits/rejected": -2.4597856998443604, "logps/chosen": -279.82562255859375, "logps/rejected": -253.9936065673828, "loss": 0.6429, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2200792133808136, "rewards/margins": 0.11525889486074448, "rewards/rejected": -0.33533811569213867, "step": 400 }, { "epoch": 0.05234403114469853, "eval_logits/chosen": -2.0982441902160645, "eval_logits/rejected": -2.0027949810028076, "eval_logps/chosen": -275.4341735839844, "eval_logps/rejected": -266.9226379394531, "eval_loss": 0.6381165385246277, "eval_rewards/accuracies": 0.6990000009536743, "eval_rewards/chosen": -0.10189421474933624, "eval_rewards/margins": 0.1553591638803482, "eval_rewards/rejected": -0.25725337862968445, "eval_runtime": 694.92, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 400 }, { "epoch": 0.05365263192331599, "grad_norm": 4.1875, "learning_rate": 2.6797385620915036e-06, "logits/chosen": -2.735205888748169, "logits/rejected": -2.498277187347412, "logps/chosen": -271.0599060058594, "logps/rejected": -236.83224487304688, "loss": 0.6458, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0746004730463028, "rewards/margins": 0.1512421816587448, "rewards/rejected": -0.2258426696062088, "step": 410 }, { "epoch": 0.054961232701933455, "grad_norm": 11.75, "learning_rate": 2.7450980392156867e-06, "logits/chosen": -2.764172077178955, "logits/rejected": -2.623178720474243, "logps/chosen": -254.3063201904297, "logps/rejected": -279.221923828125, "loss": 0.5953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07089678198099136, "rewards/margins": 0.1841188371181488, "rewards/rejected": -0.25501564145088196, "step": 420 }, { "epoch": 0.056269833480550924, "grad_norm": 3.4375, "learning_rate": 2.8104575163398695e-06, "logits/chosen": -2.7313365936279297, "logits/rejected": -2.6792924404144287, "logps/chosen": -235.04135131835938, "logps/rejected": -275.01837158203125, "loss": 0.6195, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.22802869975566864, "rewards/margins": 0.25247722864151, "rewards/rejected": -0.48050594329833984, "step": 430 }, { "epoch": 0.057578434259168386, "grad_norm": 4.1875, "learning_rate": 2.8758169934640523e-06, "logits/chosen": -2.6332790851593018, "logits/rejected": -2.4834609031677246, "logps/chosen": -310.53094482421875, "logps/rejected": -266.0542297363281, "loss": 0.6139, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13170072436332703, "rewards/margins": 0.2119872272014618, "rewards/rejected": -0.3436879515647888, "step": 440 }, { "epoch": 0.05888703503778585, "grad_norm": 5.28125, "learning_rate": 2.9411764705882355e-06, "logits/chosen": -2.773829221725464, "logits/rejected": -2.738015651702881, "logps/chosen": -326.7082214355469, "logps/rejected": -313.7978820800781, "loss": 0.6435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1770014464855194, "rewards/margins": 0.10400760173797607, "rewards/rejected": -0.28100907802581787, "step": 450 }, { "epoch": 0.06019563581640331, "grad_norm": 5.875, "learning_rate": 3.0065359477124182e-06, "logits/chosen": -2.800029754638672, "logits/rejected": -2.6550393104553223, "logps/chosen": -386.15350341796875, "logps/rejected": -320.4053649902344, "loss": 0.6671, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.224983811378479, "rewards/margins": 0.08370286226272583, "rewards/rejected": -0.30868667364120483, "step": 460 }, { "epoch": 0.06150423659502077, "grad_norm": 6.03125, "learning_rate": 3.071895424836602e-06, "logits/chosen": -2.5105133056640625, "logits/rejected": -2.5750062465667725, "logps/chosen": -351.1463317871094, "logps/rejected": -384.9257507324219, "loss": 0.6259, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1506466418504715, "rewards/margins": 0.2824404239654541, "rewards/rejected": -0.433087021112442, "step": 470 }, { "epoch": 0.06281283737363824, "grad_norm": 6.46875, "learning_rate": 3.1372549019607846e-06, "logits/chosen": -2.66679310798645, "logits/rejected": -2.548027992248535, "logps/chosen": -364.0049743652344, "logps/rejected": -347.79736328125, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": -0.30273300409317017, "rewards/margins": 0.15888556838035583, "rewards/rejected": -0.4616185128688812, "step": 480 }, { "epoch": 0.0641214381522557, "grad_norm": 12.5625, "learning_rate": 3.2026143790849674e-06, "logits/chosen": -2.5177626609802246, "logits/rejected": -2.525653839111328, "logps/chosen": -196.8947296142578, "logps/rejected": -210.3141632080078, "loss": 0.5726, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2424604445695877, "rewards/margins": 0.28150254487991333, "rewards/rejected": -0.5239629149436951, "step": 490 }, { "epoch": 0.06543003893087317, "grad_norm": 7.09375, "learning_rate": 3.2679738562091506e-06, "logits/chosen": -2.6710803508758545, "logits/rejected": -2.6786065101623535, "logps/chosen": -364.4505615234375, "logps/rejected": -350.004150390625, "loss": 0.6276, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.38551443815231323, "rewards/margins": 0.285019189119339, "rewards/rejected": -0.6705336570739746, "step": 500 }, { "epoch": 0.06543003893087317, "eval_logits/chosen": -2.06601619720459, "eval_logits/rejected": -1.974152684211731, "eval_logps/chosen": -322.05047607421875, "eval_logps/rejected": -320.8680725097656, "eval_loss": 0.6315609216690063, "eval_rewards/accuracies": 0.6740000247955322, "eval_rewards/chosen": -0.5680570602416992, "eval_rewards/margins": 0.22865092754364014, "eval_rewards/rejected": -0.7967079281806946, "eval_runtime": 694.9513, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 500 }, { "epoch": 0.06673863970949062, "grad_norm": 6.78125, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -2.68634295463562, "logits/rejected": -2.592205762863159, "logps/chosen": -332.5856018066406, "logps/rejected": -310.7766418457031, "loss": 0.6493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.593994140625, "rewards/margins": 0.22697868943214417, "rewards/rejected": -0.8209728002548218, "step": 510 }, { "epoch": 0.06804724048810809, "grad_norm": 6.25, "learning_rate": 3.398692810457517e-06, "logits/chosen": -2.586400032043457, "logits/rejected": -2.4801104068756104, "logps/chosen": -335.7121276855469, "logps/rejected": -364.86663818359375, "loss": 0.608, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.45729050040245056, "rewards/margins": 0.24585747718811035, "rewards/rejected": -0.7031478881835938, "step": 520 }, { "epoch": 0.06935584126672556, "grad_norm": 7.09375, "learning_rate": 3.4640522875816997e-06, "logits/chosen": -2.4541001319885254, "logits/rejected": -2.4757907390594482, "logps/chosen": -309.0367126464844, "logps/rejected": -338.5701599121094, "loss": 0.5818, "rewards/accuracies": 0.75, "rewards/chosen": -0.2480420172214508, "rewards/margins": 0.39800363779067993, "rewards/rejected": -0.6460456848144531, "step": 530 }, { "epoch": 0.07066444204534301, "grad_norm": 5.1875, "learning_rate": 3.529411764705883e-06, "logits/chosen": -2.6413676738739014, "logits/rejected": -2.627882719039917, "logps/chosen": -260.72222900390625, "logps/rejected": -250.4571990966797, "loss": 0.6137, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.23340921103954315, "rewards/margins": 0.19740113615989685, "rewards/rejected": -0.4308103621006012, "step": 540 }, { "epoch": 0.07197304282396048, "grad_norm": 8.0625, "learning_rate": 3.5947712418300657e-06, "logits/chosen": -2.590179920196533, "logits/rejected": -2.388469696044922, "logps/chosen": -260.7444763183594, "logps/rejected": -249.37210083007812, "loss": 0.6266, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4098880887031555, "rewards/margins": 0.21779771149158478, "rewards/rejected": -0.6276858448982239, "step": 550 }, { "epoch": 0.07328164360257794, "grad_norm": 5.0, "learning_rate": 3.6601307189542484e-06, "logits/chosen": -2.692124605178833, "logits/rejected": -2.6200509071350098, "logps/chosen": -327.5499572753906, "logps/rejected": -355.5756530761719, "loss": 0.6192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3477576673030853, "rewards/margins": 0.25027915835380554, "rewards/rejected": -0.5980368256568909, "step": 560 }, { "epoch": 0.07459024438119541, "grad_norm": 5.25, "learning_rate": 3.7254901960784316e-06, "logits/chosen": -2.704237222671509, "logits/rejected": -2.4131405353546143, "logps/chosen": -299.06121826171875, "logps/rejected": -272.4024963378906, "loss": 0.6138, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2633759081363678, "rewards/margins": 0.38646066188812256, "rewards/rejected": -0.649836540222168, "step": 570 }, { "epoch": 0.07589884515981286, "grad_norm": 12.75, "learning_rate": 3.7908496732026144e-06, "logits/chosen": -2.5342631340026855, "logits/rejected": -2.612247943878174, "logps/chosen": -380.5338134765625, "logps/rejected": -416.96087646484375, "loss": 0.5973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4141547679901123, "rewards/margins": 0.2748681604862213, "rewards/rejected": -0.6890228986740112, "step": 580 }, { "epoch": 0.07720744593843033, "grad_norm": 12.375, "learning_rate": 3.856209150326798e-06, "logits/chosen": -2.5626425743103027, "logits/rejected": -2.4625744819641113, "logps/chosen": -275.0553283691406, "logps/rejected": -270.68896484375, "loss": 0.5999, "rewards/accuracies": 0.625, "rewards/chosen": -0.42746955156326294, "rewards/margins": 0.2816309630870819, "rewards/rejected": -0.7091004848480225, "step": 590 }, { "epoch": 0.0785160467170478, "grad_norm": 12.875, "learning_rate": 3.92156862745098e-06, "logits/chosen": -2.7567574977874756, "logits/rejected": -2.4238028526306152, "logps/chosen": -329.51495361328125, "logps/rejected": -380.87750244140625, "loss": 0.6018, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6669514179229736, "rewards/margins": 0.3520125150680542, "rewards/rejected": -1.0189640522003174, "step": 600 }, { "epoch": 0.0785160467170478, "eval_logits/chosen": -2.046386480331421, "eval_logits/rejected": -1.9554511308670044, "eval_logps/chosen": -324.3074951171875, "eval_logps/rejected": -329.5355529785156, "eval_loss": 0.6195046305656433, "eval_rewards/accuracies": 0.6790000200271606, "eval_rewards/chosen": -0.5906271934509277, "eval_rewards/margins": 0.2927560806274414, "eval_rewards/rejected": -0.8833832740783691, "eval_runtime": 696.8013, "eval_samples_per_second": 2.87, "eval_steps_per_second": 0.179, "step": 600 }, { "epoch": 0.07982464749566526, "grad_norm": 8.6875, "learning_rate": 3.986928104575164e-06, "logits/chosen": -2.6970603466033936, "logits/rejected": -2.519735336303711, "logps/chosen": -326.96746826171875, "logps/rejected": -291.82672119140625, "loss": 0.6218, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6261174082756042, "rewards/margins": 0.27490442991256714, "rewards/rejected": -0.9010218381881714, "step": 610 }, { "epoch": 0.08113324827428273, "grad_norm": 5.625, "learning_rate": 4.052287581699347e-06, "logits/chosen": -2.7524592876434326, "logits/rejected": -2.517402172088623, "logps/chosen": -364.2415466308594, "logps/rejected": -328.5043029785156, "loss": 0.5483, "rewards/accuracies": 0.75, "rewards/chosen": -0.39932483434677124, "rewards/margins": 0.4169841706752777, "rewards/rejected": -0.8163089752197266, "step": 620 }, { "epoch": 0.08244184905290018, "grad_norm": 14.125, "learning_rate": 4.11764705882353e-06, "logits/chosen": -2.652550220489502, "logits/rejected": -2.453728199005127, "logps/chosen": -328.25750732421875, "logps/rejected": -307.7398681640625, "loss": 0.6334, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6340068578720093, "rewards/margins": 0.24111728370189667, "rewards/rejected": -0.8751241564750671, "step": 630 }, { "epoch": 0.08375044983151765, "grad_norm": 7.84375, "learning_rate": 4.183006535947713e-06, "logits/chosen": -2.729841947555542, "logits/rejected": -2.612236738204956, "logps/chosen": -322.3873291015625, "logps/rejected": -340.8619079589844, "loss": 0.6279, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.6506197452545166, "rewards/margins": 0.23128679394721985, "rewards/rejected": -0.8819063901901245, "step": 640 }, { "epoch": 0.0850590506101351, "grad_norm": 10.25, "learning_rate": 4.2483660130718954e-06, "logits/chosen": -2.4534859657287598, "logits/rejected": -2.5504367351531982, "logps/chosen": -300.8963317871094, "logps/rejected": -353.83697509765625, "loss": 0.6072, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.7499805092811584, "rewards/margins": 0.1903887689113617, "rewards/rejected": -0.9403693079948425, "step": 650 }, { "epoch": 0.08636765138875258, "grad_norm": 5.875, "learning_rate": 4.313725490196079e-06, "logits/chosen": -2.5822176933288574, "logits/rejected": -2.543501377105713, "logps/chosen": -358.6070251464844, "logps/rejected": -350.14166259765625, "loss": 0.5927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.8851343989372253, "rewards/margins": 0.2728622853755951, "rewards/rejected": -1.1579968929290771, "step": 660 }, { "epoch": 0.08767625216737004, "grad_norm": 9.125, "learning_rate": 4.379084967320262e-06, "logits/chosen": -2.5809431076049805, "logits/rejected": -2.489884853363037, "logps/chosen": -355.50958251953125, "logps/rejected": -396.1313781738281, "loss": 0.6378, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7190747857093811, "rewards/margins": 0.33461353182792664, "rewards/rejected": -1.0536882877349854, "step": 670 }, { "epoch": 0.0889848529459875, "grad_norm": 6.65625, "learning_rate": 4.444444444444444e-06, "logits/chosen": -2.5445759296417236, "logits/rejected": -2.5626556873321533, "logps/chosen": -293.88848876953125, "logps/rejected": -375.78509521484375, "loss": 0.5175, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.42032137513160706, "rewards/margins": 0.6218963265419006, "rewards/rejected": -1.0422178506851196, "step": 680 }, { "epoch": 0.09029345372460497, "grad_norm": 7.5625, "learning_rate": 4.509803921568628e-06, "logits/chosen": -2.556027889251709, "logits/rejected": -2.4640331268310547, "logps/chosen": -356.29547119140625, "logps/rejected": -355.5421447753906, "loss": 0.6149, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7879628539085388, "rewards/margins": 0.22192111611366272, "rewards/rejected": -1.009883999824524, "step": 690 }, { "epoch": 0.09160205450322242, "grad_norm": 14.25, "learning_rate": 4.5751633986928105e-06, "logits/chosen": -2.219818353652954, "logits/rejected": -2.0704474449157715, "logps/chosen": -377.2328796386719, "logps/rejected": -441.54180908203125, "loss": 0.636, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.189365029335022, "rewards/margins": 0.37216681241989136, "rewards/rejected": -1.5615317821502686, "step": 700 }, { "epoch": 0.09160205450322242, "eval_logits/chosen": -1.9222079515457153, "eval_logits/rejected": -1.8346830606460571, "eval_logps/chosen": -368.23126220703125, "eval_logps/rejected": -386.8725280761719, "eval_loss": 0.5903733372688293, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": -1.02986478805542, "eval_rewards/margins": 0.4268879294395447, "eval_rewards/rejected": -1.4567525386810303, "eval_runtime": 695.3316, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 700 }, { "epoch": 0.0929106552818399, "grad_norm": 17.875, "learning_rate": 4.640522875816994e-06, "logits/chosen": -2.434007167816162, "logits/rejected": -2.183964967727661, "logps/chosen": -316.2218322753906, "logps/rejected": -324.13037109375, "loss": 0.5938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0364434719085693, "rewards/margins": 0.38746702671051025, "rewards/rejected": -1.42391037940979, "step": 710 }, { "epoch": 0.09421925606045736, "grad_norm": 16.125, "learning_rate": 4.705882352941177e-06, "logits/chosen": -2.454352855682373, "logits/rejected": -2.422165870666504, "logps/chosen": -400.0445251464844, "logps/rejected": -410.7084045410156, "loss": 0.5692, "rewards/accuracies": 0.625, "rewards/chosen": -1.1263761520385742, "rewards/margins": 0.3713650107383728, "rewards/rejected": -1.4977412223815918, "step": 720 }, { "epoch": 0.09552785683907482, "grad_norm": 18.125, "learning_rate": 4.77124183006536e-06, "logits/chosen": -2.2171223163604736, "logits/rejected": -2.0979580879211426, "logps/chosen": -470.1795959472656, "logps/rejected": -457.0987854003906, "loss": 0.6134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6794992685317993, "rewards/margins": 0.5177785158157349, "rewards/rejected": -2.197277545928955, "step": 730 }, { "epoch": 0.09683645761769229, "grad_norm": 12.8125, "learning_rate": 4.836601307189543e-06, "logits/chosen": -2.252986431121826, "logits/rejected": -2.1756319999694824, "logps/chosen": -283.9363708496094, "logps/rejected": -362.4868469238281, "loss": 0.5368, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0286294221878052, "rewards/margins": 0.47106271982192993, "rewards/rejected": -1.4996922016143799, "step": 740 }, { "epoch": 0.09814505839630974, "grad_norm": 11.0625, "learning_rate": 4.901960784313726e-06, "logits/chosen": -2.4927029609680176, "logits/rejected": -2.330719232559204, "logps/chosen": -420.7904357910156, "logps/rejected": -386.94873046875, "loss": 0.5645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9117021560668945, "rewards/margins": 0.4068918228149414, "rewards/rejected": -1.318593978881836, "step": 750 }, { "epoch": 0.09945365917492721, "grad_norm": 7.0625, "learning_rate": 4.967320261437909e-06, "logits/chosen": -2.4609673023223877, "logits/rejected": -2.106842517852783, "logps/chosen": -374.54241943359375, "logps/rejected": -388.4040832519531, "loss": 0.5731, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7846518754959106, "rewards/margins": 0.7572110891342163, "rewards/rejected": -1.541862964630127, "step": 760 }, { "epoch": 0.10076225995354467, "grad_norm": 12.5625, "learning_rate": 4.999993476542427e-06, "logits/chosen": -2.446294069290161, "logits/rejected": -2.277003049850464, "logps/chosen": -419.21624755859375, "logps/rejected": -423.41827392578125, "loss": 0.6029, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8882554769515991, "rewards/margins": 0.4341462254524231, "rewards/rejected": -1.3224016427993774, "step": 770 }, { "epoch": 0.10207086073216214, "grad_norm": 12.5, "learning_rate": 4.999941289086112e-06, "logits/chosen": -2.31380033493042, "logits/rejected": -2.1345832347869873, "logps/chosen": -334.14971923828125, "logps/rejected": -361.8697204589844, "loss": 0.5341, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9129344820976257, "rewards/margins": 0.36324501037597656, "rewards/rejected": -1.276179552078247, "step": 780 }, { "epoch": 0.1033794615107796, "grad_norm": 10.4375, "learning_rate": 4.999836915262896e-06, "logits/chosen": -2.0711236000061035, "logits/rejected": -1.9961268901824951, "logps/chosen": -393.3041687011719, "logps/rejected": -402.29833984375, "loss": 0.6095, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4903818368911743, "rewards/margins": 0.3466147184371948, "rewards/rejected": -1.8369964361190796, "step": 790 }, { "epoch": 0.10468806228939706, "grad_norm": 7.125, "learning_rate": 4.999680357251587e-06, "logits/chosen": -2.2682178020477295, "logits/rejected": -2.026916265487671, "logps/chosen": -458.18585205078125, "logps/rejected": -419.8153381347656, "loss": 0.6106, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6476637125015259, "rewards/margins": 0.43079710006713867, "rewards/rejected": -2.078460931777954, "step": 800 }, { "epoch": 0.10468806228939706, "eval_logits/chosen": -1.826926827430725, "eval_logits/rejected": -1.7387028932571411, "eval_logps/chosen": -400.1948547363281, "eval_logps/rejected": -429.45770263671875, "eval_loss": 0.5813018679618835, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -1.3495010137557983, "eval_rewards/margins": 0.5331032276153564, "eval_rewards/rejected": -1.8826041221618652, "eval_runtime": 694.2525, "eval_samples_per_second": 2.881, "eval_steps_per_second": 0.18, "step": 800 }, { "epoch": 0.10599666306801453, "grad_norm": 11.125, "learning_rate": 4.999471618320339e-06, "logits/chosen": -2.2203376293182373, "logits/rejected": -2.1777122020721436, "logps/chosen": -324.2668151855469, "logps/rejected": -390.4042053222656, "loss": 0.6144, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3276973962783813, "rewards/margins": 0.3980697989463806, "rewards/rejected": -1.7257671356201172, "step": 810 }, { "epoch": 0.10730526384663198, "grad_norm": 10.5, "learning_rate": 4.999210702826586e-06, "logits/chosen": -2.3475563526153564, "logits/rejected": -2.3143320083618164, "logps/chosen": -397.86712646484375, "logps/rejected": -435.1482849121094, "loss": 0.6491, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8380275964736938, "rewards/margins": 0.4361690580844879, "rewards/rejected": -1.2741968631744385, "step": 820 }, { "epoch": 0.10861386462524945, "grad_norm": 6.96875, "learning_rate": 4.998897616216947e-06, "logits/chosen": -2.5597987174987793, "logits/rejected": -2.1801209449768066, "logps/chosen": -318.09149169921875, "logps/rejected": -307.72900390625, "loss": 0.5647, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.44484537839889526, "rewards/margins": 0.5076198577880859, "rewards/rejected": -0.9524652361869812, "step": 830 }, { "epoch": 0.10992246540386691, "grad_norm": 13.8125, "learning_rate": 4.998532365027117e-06, "logits/chosen": -2.276048183441162, "logits/rejected": -2.1829562187194824, "logps/chosen": -277.0499572753906, "logps/rejected": -349.51318359375, "loss": 0.5508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8033429384231567, "rewards/margins": 0.4512738585472107, "rewards/rejected": -1.2546168565750122, "step": 840 }, { "epoch": 0.11123106618248438, "grad_norm": 18.375, "learning_rate": 4.9981149568817275e-06, "logits/chosen": -2.4517717361450195, "logits/rejected": -2.336831569671631, "logps/chosen": -358.93292236328125, "logps/rejected": -394.17608642578125, "loss": 0.5933, "rewards/accuracies": 0.625, "rewards/chosen": -0.6200645565986633, "rewards/margins": 0.611357569694519, "rewards/rejected": -1.2314220666885376, "step": 850 }, { "epoch": 0.11253966696110185, "grad_norm": 7.90625, "learning_rate": 4.997645400494192e-06, "logits/chosen": -2.3822641372680664, "logits/rejected": -2.0846621990203857, "logps/chosen": -283.80474853515625, "logps/rejected": -288.5283203125, "loss": 0.5103, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3550966680049896, "rewards/margins": 0.7035893201828003, "rewards/rejected": -1.0586860179901123, "step": 860 }, { "epoch": 0.1138482677397193, "grad_norm": 19.375, "learning_rate": 4.997123705666514e-06, "logits/chosen": -2.5817313194274902, "logits/rejected": -2.4443323612213135, "logps/chosen": -322.55633544921875, "logps/rejected": -331.74627685546875, "loss": 0.6366, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6973295211791992, "rewards/margins": 0.23548975586891174, "rewards/rejected": -0.9328191876411438, "step": 870 }, { "epoch": 0.11515686851833677, "grad_norm": 15.625, "learning_rate": 4.996549883289093e-06, "logits/chosen": -2.4255664348602295, "logits/rejected": -2.1462035179138184, "logps/chosen": -378.1709289550781, "logps/rejected": -354.3892517089844, "loss": 0.5333, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7117990255355835, "rewards/margins": 0.6358679533004761, "rewards/rejected": -1.3476669788360596, "step": 880 }, { "epoch": 0.11646546929695423, "grad_norm": 13.3125, "learning_rate": 4.995923945340495e-06, "logits/chosen": -2.3038673400878906, "logits/rejected": -2.1145660877227783, "logps/chosen": -384.0230407714844, "logps/rejected": -366.76800537109375, "loss": 0.5662, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6253888010978699, "rewards/margins": 0.5550283789634705, "rewards/rejected": -1.1804172992706299, "step": 890 }, { "epoch": 0.1177740700755717, "grad_norm": 6.75, "learning_rate": 4.995245904887195e-06, "logits/chosen": -2.2408289909362793, "logits/rejected": -2.1636033058166504, "logps/chosen": -309.40728759765625, "logps/rejected": -335.7651672363281, "loss": 0.5485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6348327398300171, "rewards/margins": 0.5365878939628601, "rewards/rejected": -1.1714205741882324, "step": 900 }, { "epoch": 0.1177740700755717, "eval_logits/chosen": -1.776275873184204, "eval_logits/rejected": -1.6837424039840698, "eval_logps/chosen": -363.5506286621094, "eval_logps/rejected": -387.5206604003906, "eval_loss": 0.5887011885643005, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": -0.9830588102340698, "eval_rewards/margins": 0.4801746904850006, "eval_rewards/rejected": -1.4632333517074585, "eval_runtime": 695.5276, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 900 }, { "epoch": 0.11908267085418915, "grad_norm": 8.5, "learning_rate": 4.994515776083313e-06, "logits/chosen": -2.247063159942627, "logits/rejected": -2.040231227874756, "logps/chosen": -396.10748291015625, "logps/rejected": -380.2010803222656, "loss": 0.5818, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0978105068206787, "rewards/margins": 0.4304744601249695, "rewards/rejected": -1.5282847881317139, "step": 910 }, { "epoch": 0.12039127163280662, "grad_norm": 40.75, "learning_rate": 4.993733574170316e-06, "logits/chosen": -2.019604444503784, "logits/rejected": -2.072674512863159, "logps/chosen": -332.5871887207031, "logps/rejected": -405.72259521484375, "loss": 0.5236, "rewards/accuracies": 0.75, "rewards/chosen": -1.0697968006134033, "rewards/margins": 0.6795397996902466, "rewards/rejected": -1.7493362426757812, "step": 920 }, { "epoch": 0.12169987241142409, "grad_norm": 6.5, "learning_rate": 4.992899315476696e-06, "logits/chosen": -2.0164577960968018, "logits/rejected": -2.029275417327881, "logps/chosen": -323.4343566894531, "logps/rejected": -357.60015869140625, "loss": 0.5438, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0637717247009277, "rewards/margins": 0.4745445251464844, "rewards/rejected": -1.538316249847412, "step": 930 }, { "epoch": 0.12300847319004155, "grad_norm": 7.28125, "learning_rate": 4.9920130174176354e-06, "logits/chosen": -2.1199328899383545, "logits/rejected": -1.996970772743225, "logps/chosen": -393.02520751953125, "logps/rejected": -407.9481506347656, "loss": 0.5931, "rewards/accuracies": 0.625, "rewards/chosen": -1.3106439113616943, "rewards/margins": 0.5086703300476074, "rewards/rejected": -1.8193143606185913, "step": 940 }, { "epoch": 0.12431707396865901, "grad_norm": 10.0625, "learning_rate": 4.991074698494638e-06, "logits/chosen": -2.2186036109924316, "logits/rejected": -1.9157623052597046, "logps/chosen": -341.947998046875, "logps/rejected": -432.37371826171875, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": -1.0713350772857666, "rewards/margins": 0.5991930961608887, "rewards/rejected": -1.6705281734466553, "step": 950 }, { "epoch": 0.12562567474727648, "grad_norm": 5.03125, "learning_rate": 4.990084378295148e-06, "logits/chosen": -2.022984743118286, "logits/rejected": -2.1141040325164795, "logps/chosen": -304.38726806640625, "logps/rejected": -412.3209533691406, "loss": 0.4921, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8197789192199707, "rewards/margins": 0.9947313070297241, "rewards/rejected": -1.8145103454589844, "step": 960 }, { "epoch": 0.12693427552589392, "grad_norm": 18.0, "learning_rate": 4.989042077492135e-06, "logits/chosen": -1.9898399114608765, "logits/rejected": -1.7887752056121826, "logps/chosen": -371.2681579589844, "logps/rejected": -422.290771484375, "loss": 0.544, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.571070909500122, "rewards/margins": 0.5034223794937134, "rewards/rejected": -2.074493169784546, "step": 970 }, { "epoch": 0.1282428763045114, "grad_norm": 18.375, "learning_rate": 4.987947817843665e-06, "logits/chosen": -2.1672847270965576, "logits/rejected": -2.1956496238708496, "logps/chosen": -399.69384765625, "logps/rejected": -467.14154052734375, "loss": 0.5223, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2246183156967163, "rewards/margins": 0.8070001602172852, "rewards/rejected": -2.031618595123291, "step": 980 }, { "epoch": 0.12955147708312886, "grad_norm": 6.3125, "learning_rate": 4.986801622192453e-06, "logits/chosen": -2.2202255725860596, "logits/rejected": -1.9479494094848633, "logps/chosen": -478.768310546875, "logps/rejected": -475.72613525390625, "loss": 0.5695, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5597856044769287, "rewards/margins": 0.5795885324478149, "rewards/rejected": -2.139374256134033, "step": 990 }, { "epoch": 0.13086007786174633, "grad_norm": 6.5, "learning_rate": 4.985603514465372e-06, "logits/chosen": -2.0669589042663574, "logits/rejected": -2.04697847366333, "logps/chosen": -375.2518310546875, "logps/rejected": -402.5460510253906, "loss": 0.5219, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.244081735610962, "rewards/margins": 0.6269000768661499, "rewards/rejected": -1.8709819316864014, "step": 1000 }, { "epoch": 0.13086007786174633, "eval_logits/chosen": -1.76917564868927, "eval_logits/rejected": -1.6713759899139404, "eval_logps/chosen": -394.4333801269531, "eval_logps/rejected": -431.7823181152344, "eval_loss": 0.5733664035797119, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": -1.2918857336044312, "eval_rewards/margins": 0.6139645576477051, "eval_rewards/rejected": -1.9058502912521362, "eval_runtime": 694.8929, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 1000 }, { "epoch": 0.1321686786403638, "grad_norm": 20.5, "learning_rate": 4.984353519672966e-06, "logits/chosen": -2.2620849609375, "logits/rejected": -2.15934419631958, "logps/chosen": -375.8115539550781, "logps/rejected": -460.546142578125, "loss": 0.5287, "rewards/accuracies": 0.75, "rewards/chosen": -1.2339917421340942, "rewards/margins": 0.8814796209335327, "rewards/rejected": -2.115471363067627, "step": 1010 }, { "epoch": 0.13347727941898124, "grad_norm": 4.65625, "learning_rate": 4.9830516639089226e-06, "logits/chosen": -2.0328898429870605, "logits/rejected": -2.0181996822357178, "logps/chosen": -389.45904541015625, "logps/rejected": -448.26751708984375, "loss": 0.5643, "rewards/accuracies": 0.625, "rewards/chosen": -1.2106788158416748, "rewards/margins": 0.5578769445419312, "rewards/rejected": -1.7685556411743164, "step": 1020 }, { "epoch": 0.1347858801975987, "grad_norm": 5.78125, "learning_rate": 4.9816979743495296e-06, "logits/chosen": -2.1557652950286865, "logits/rejected": -2.241659641265869, "logps/chosen": -353.7569885253906, "logps/rejected": -443.529296875, "loss": 0.612, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7681421041488647, "rewards/margins": 0.6859203577041626, "rewards/rejected": -1.4540624618530273, "step": 1030 }, { "epoch": 0.13609448097621618, "grad_norm": 17.75, "learning_rate": 4.980292479253105e-06, "logits/chosen": -2.3941943645477295, "logits/rejected": -2.3237578868865967, "logps/chosen": -390.670166015625, "logps/rejected": -452.5581970214844, "loss": 0.6326, "rewards/accuracies": 0.75, "rewards/chosen": -0.9859058260917664, "rewards/margins": 0.49169453978538513, "rewards/rejected": -1.477600336074829, "step": 1040 }, { "epoch": 0.13740308175483365, "grad_norm": 15.375, "learning_rate": 4.978835207959414e-06, "logits/chosen": -2.167393922805786, "logits/rejected": -2.225515127182007, "logps/chosen": -301.28094482421875, "logps/rejected": -375.46722412109375, "loss": 0.6572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7368876338005066, "rewards/margins": 0.45300549268722534, "rewards/rejected": -1.1898930072784424, "step": 1050 }, { "epoch": 0.13871168253345112, "grad_norm": 5.90625, "learning_rate": 4.977326190889046e-06, "logits/chosen": -2.0989904403686523, "logits/rejected": -2.10744571685791, "logps/chosen": -305.7406311035156, "logps/rejected": -381.25579833984375, "loss": 0.4961, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5243714451789856, "rewards/margins": 0.7416833639144897, "rewards/rejected": -1.2660547494888306, "step": 1060 }, { "epoch": 0.14002028331206856, "grad_norm": 9.1875, "learning_rate": 4.975765459542788e-06, "logits/chosen": -2.404186248779297, "logits/rejected": -1.9818274974822998, "logps/chosen": -441.0899353027344, "logps/rejected": -398.919921875, "loss": 0.4811, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8376018404960632, "rewards/margins": 0.8651601076126099, "rewards/rejected": -1.7027618885040283, "step": 1070 }, { "epoch": 0.14132888409068603, "grad_norm": 15.3125, "learning_rate": 4.9741530465009665e-06, "logits/chosen": -2.104609489440918, "logits/rejected": -2.057368755340576, "logps/chosen": -322.3277282714844, "logps/rejected": -370.4391174316406, "loss": 0.4454, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9650786519050598, "rewards/margins": 0.7507129311561584, "rewards/rejected": -1.7157914638519287, "step": 1080 }, { "epoch": 0.1426374848693035, "grad_norm": 38.75, "learning_rate": 4.972488985422763e-06, "logits/chosen": -2.1124370098114014, "logits/rejected": -2.0429623126983643, "logps/chosen": -356.8153381347656, "logps/rejected": -454.2369079589844, "loss": 0.6424, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.131348967552185, "rewards/margins": 0.6130356788635254, "rewards/rejected": -1.744384765625, "step": 1090 }, { "epoch": 0.14394608564792097, "grad_norm": 17.375, "learning_rate": 4.970773311045514e-06, "logits/chosen": -1.9918296337127686, "logits/rejected": -1.8689005374908447, "logps/chosen": -398.80426025390625, "logps/rejected": -422.1686096191406, "loss": 0.6123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0254011154174805, "rewards/margins": 0.821550726890564, "rewards/rejected": -1.8469518423080444, "step": 1100 }, { "epoch": 0.14394608564792097, "eval_logits/chosen": -1.7168198823928833, "eval_logits/rejected": -1.615927815437317, "eval_logps/chosen": -374.59619140625, "eval_logps/rejected": -419.5487365722656, "eval_loss": 0.5642207860946655, "eval_rewards/accuracies": 0.7059999704360962, "eval_rewards/chosen": -1.0935142040252686, "eval_rewards/margins": 0.6900003552436829, "eval_rewards/rejected": -1.7835146188735962, "eval_runtime": 695.9388, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 1100 }, { "epoch": 0.14525468642653844, "grad_norm": 11.875, "learning_rate": 4.969006059183984e-06, "logits/chosen": -2.220738649368286, "logits/rejected": -2.179539203643799, "logps/chosen": -370.10064697265625, "logps/rejected": -449.41180419921875, "loss": 0.5202, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1244131326675415, "rewards/margins": 0.6495555639266968, "rewards/rejected": -1.7739686965942383, "step": 1110 }, { "epoch": 0.14656328720515588, "grad_norm": 8.75, "learning_rate": 4.967187266729623e-06, "logits/chosen": -2.217677593231201, "logits/rejected": -2.119666337966919, "logps/chosen": -384.45196533203125, "logps/rejected": -382.54107666015625, "loss": 0.5363, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1153194904327393, "rewards/margins": 0.6102129817008972, "rewards/rejected": -1.7255322933197021, "step": 1120 }, { "epoch": 0.14787188798377335, "grad_norm": 8.9375, "learning_rate": 4.965316971649791e-06, "logits/chosen": -2.2305989265441895, "logits/rejected": -2.216444730758667, "logps/chosen": -451.52117919921875, "logps/rejected": -449.596923828125, "loss": 0.6422, "rewards/accuracies": 0.625, "rewards/chosen": -1.436486005783081, "rewards/margins": 0.3413011431694031, "rewards/rejected": -1.7777869701385498, "step": 1130 }, { "epoch": 0.14918048876239082, "grad_norm": 11.0, "learning_rate": 4.963395212986964e-06, "logits/chosen": -2.3231608867645264, "logits/rejected": -2.097801923751831, "logps/chosen": -472.21368408203125, "logps/rejected": -454.04254150390625, "loss": 0.5467, "rewards/accuracies": 0.5, "rewards/chosen": -1.4883655309677124, "rewards/margins": 0.5178887844085693, "rewards/rejected": -2.0062544345855713, "step": 1140 }, { "epoch": 0.1504890895410083, "grad_norm": 13.6875, "learning_rate": 4.9614220308579285e-06, "logits/chosen": -2.13523006439209, "logits/rejected": -1.9117721319198608, "logps/chosen": -470.48431396484375, "logps/rejected": -471.52642822265625, "loss": 0.5122, "rewards/accuracies": 0.75, "rewards/chosen": -1.264061450958252, "rewards/margins": 0.8524338006973267, "rewards/rejected": -2.116495132446289, "step": 1150 }, { "epoch": 0.15179769031962573, "grad_norm": 6.34375, "learning_rate": 4.9593974664529325e-06, "logits/chosen": -2.5184741020202637, "logits/rejected": -2.238981008529663, "logps/chosen": -452.859375, "logps/rejected": -484.7657165527344, "loss": 0.5432, "rewards/accuracies": 0.75, "rewards/chosen": -1.1028008460998535, "rewards/margins": 0.8372576832771301, "rewards/rejected": -1.9400584697723389, "step": 1160 }, { "epoch": 0.1531062910982432, "grad_norm": 23.25, "learning_rate": 4.957321562034833e-06, "logits/chosen": -2.2394843101501465, "logits/rejected": -2.1320528984069824, "logps/chosen": -414.0975646972656, "logps/rejected": -419.32086181640625, "loss": 0.5877, "rewards/accuracies": 0.75, "rewards/chosen": -1.2223432064056396, "rewards/margins": 0.418790727853775, "rewards/rejected": -1.6411340236663818, "step": 1170 }, { "epoch": 0.15441489187686067, "grad_norm": 8.1875, "learning_rate": 4.955194360938214e-06, "logits/chosen": -2.0289759635925293, "logits/rejected": -1.897950530052185, "logps/chosen": -340.77496337890625, "logps/rejected": -398.953857421875, "loss": 0.5558, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9405407905578613, "rewards/margins": 0.865210235118866, "rewards/rejected": -1.805751085281372, "step": 1180 }, { "epoch": 0.15572349265547814, "grad_norm": 8.0625, "learning_rate": 4.9530159075684735e-06, "logits/chosen": -2.324169158935547, "logits/rejected": -2.324620008468628, "logps/chosen": -385.11883544921875, "logps/rejected": -418.5409240722656, "loss": 0.5828, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.891398549079895, "rewards/margins": 0.51731938123703, "rewards/rejected": -1.4087178707122803, "step": 1190 }, { "epoch": 0.1570320934340956, "grad_norm": 6.25, "learning_rate": 4.950786247400908e-06, "logits/chosen": -2.3489413261413574, "logits/rejected": -2.119709014892578, "logps/chosen": -333.1446228027344, "logps/rejected": -384.34307861328125, "loss": 0.5182, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8438904881477356, "rewards/margins": 0.5933254957199097, "rewards/rejected": -1.43721604347229, "step": 1200 }, { "epoch": 0.1570320934340956, "eval_logits/chosen": -1.6661139726638794, "eval_logits/rejected": -1.558234691619873, "eval_logps/chosen": -340.93695068359375, "eval_logps/rejected": -379.10565185546875, "eval_loss": 0.555971086025238, "eval_rewards/accuracies": 0.7129999995231628, "eval_rewards/chosen": -0.7569221258163452, "eval_rewards/margins": 0.6221617460250854, "eval_rewards/rejected": -1.3790837526321411, "eval_runtime": 695.5592, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 1200 }, { "epoch": 0.15834069421271305, "grad_norm": 6.84375, "learning_rate": 4.948505426979756e-06, "logits/chosen": -2.2319328784942627, "logits/rejected": -2.2286579608917236, "logps/chosen": -354.0445861816406, "logps/rejected": -408.71551513671875, "loss": 0.4878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7091296911239624, "rewards/margins": 0.8003088235855103, "rewards/rejected": -1.5094385147094727, "step": 1210 }, { "epoch": 0.15964929499133051, "grad_norm": 12.5, "learning_rate": 4.946173493917228e-06, "logits/chosen": -1.8814340829849243, "logits/rejected": -1.5862020254135132, "logps/chosen": -301.9439392089844, "logps/rejected": -357.1983337402344, "loss": 0.5441, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9751394391059875, "rewards/margins": 0.5223566889762878, "rewards/rejected": -1.4974961280822754, "step": 1220 }, { "epoch": 0.16095789576994798, "grad_norm": 11.125, "learning_rate": 4.943790496892513e-06, "logits/chosen": -2.02860689163208, "logits/rejected": -1.8663527965545654, "logps/chosen": -442.2098083496094, "logps/rejected": -445.437255859375, "loss": 0.5246, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1174280643463135, "rewards/margins": 0.7037648558616638, "rewards/rejected": -1.821192741394043, "step": 1230 }, { "epoch": 0.16226649654856545, "grad_norm": 22.75, "learning_rate": 4.941356485650762e-06, "logits/chosen": -2.0828185081481934, "logits/rejected": -1.8285248279571533, "logps/chosen": -444.2251892089844, "logps/rejected": -429.3589782714844, "loss": 0.5645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5694831609725952, "rewards/margins": 0.5421696901321411, "rewards/rejected": -2.1116528511047363, "step": 1240 }, { "epoch": 0.16357509732718292, "grad_norm": 18.75, "learning_rate": 4.93887151100205e-06, "logits/chosen": -1.8127317428588867, "logits/rejected": -1.6305725574493408, "logps/chosen": -414.1951599121094, "logps/rejected": -434.92236328125, "loss": 0.5987, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.616551160812378, "rewards/margins": 0.7335783839225769, "rewards/rejected": -2.3501296043395996, "step": 1250 }, { "epoch": 0.16488369810580036, "grad_norm": 10.75, "learning_rate": 4.936335624820313e-06, "logits/chosen": -1.8326165676116943, "logits/rejected": -1.729463815689087, "logps/chosen": -431.1419982910156, "logps/rejected": -560.9285888671875, "loss": 0.4057, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9068603515625, "rewards/margins": 0.9250564575195312, "rewards/rejected": -2.831916570663452, "step": 1260 }, { "epoch": 0.16619229888441783, "grad_norm": 9.875, "learning_rate": 4.933748880042271e-06, "logits/chosen": -1.5461452007293701, "logits/rejected": -1.1593910455703735, "logps/chosen": -475.909912109375, "logps/rejected": -578.0513916015625, "loss": 0.5499, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2510344982147217, "rewards/margins": 1.1949458122253418, "rewards/rejected": -3.4459805488586426, "step": 1270 }, { "epoch": 0.1675008996630353, "grad_norm": 14.375, "learning_rate": 4.931111330666317e-06, "logits/chosen": -1.9146766662597656, "logits/rejected": -1.786077857017517, "logps/chosen": -505.5401306152344, "logps/rejected": -577.0504150390625, "loss": 0.603, "rewards/accuracies": 0.75, "rewards/chosen": -2.1563708782196045, "rewards/margins": 0.8009198307991028, "rewards/rejected": -2.9572906494140625, "step": 1280 }, { "epoch": 0.16880950044165277, "grad_norm": 8.9375, "learning_rate": 4.9284230317513906e-06, "logits/chosen": -1.5630689859390259, "logits/rejected": -1.54221510887146, "logps/chosen": -399.7374572753906, "logps/rejected": -521.4496459960938, "loss": 0.4683, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.00687837600708, "rewards/margins": 1.0640445947647095, "rewards/rejected": -3.070923328399658, "step": 1290 }, { "epoch": 0.1701181012202702, "grad_norm": 17.625, "learning_rate": 4.9256840394158325e-06, "logits/chosen": -1.769784688949585, "logits/rejected": -1.7452151775360107, "logps/chosen": -440.3385314941406, "logps/rejected": -547.9986572265625, "loss": 0.5708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8598865270614624, "rewards/margins": 0.7337313890457153, "rewards/rejected": -2.5936179161071777, "step": 1300 }, { "epoch": 0.1701181012202702, "eval_logits/chosen": -1.40134859085083, "eval_logits/rejected": -1.2951076030731201, "eval_logps/chosen": -434.86798095703125, "eval_logps/rejected": -491.09393310546875, "eval_loss": 0.5491604208946228, "eval_rewards/accuracies": 0.7089999914169312, "eval_rewards/chosen": -1.6962319612503052, "eval_rewards/margins": 0.802734375, "eval_rewards/rejected": -2.4989659786224365, "eval_runtime": 695.8486, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 1300 }, { "epoch": 0.17142670199888768, "grad_norm": 11.8125, "learning_rate": 4.922894410836207e-06, "logits/chosen": -1.7147115468978882, "logits/rejected": -1.5505940914154053, "logps/chosen": -373.08221435546875, "logps/rejected": -412.87677001953125, "loss": 0.5714, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5971529483795166, "rewards/margins": 0.7218188047409058, "rewards/rejected": -2.318972110748291, "step": 1310 }, { "epoch": 0.17273530277750515, "grad_norm": 13.0625, "learning_rate": 4.920054204246116e-06, "logits/chosen": -1.7671616077423096, "logits/rejected": -1.7780691385269165, "logps/chosen": -471.9405822753906, "logps/rejected": -538.1859130859375, "loss": 0.5885, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8693714141845703, "rewards/margins": 0.7509788870811462, "rewards/rejected": -2.6203503608703613, "step": 1320 }, { "epoch": 0.17404390355612262, "grad_norm": 7.96875, "learning_rate": 4.9171634789349744e-06, "logits/chosen": -1.8843683004379272, "logits/rejected": -1.6631240844726562, "logps/chosen": -450.54443359375, "logps/rejected": -448.0086364746094, "loss": 0.5799, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8109347820281982, "rewards/margins": 0.5625116229057312, "rewards/rejected": -2.373446464538574, "step": 1330 }, { "epoch": 0.1753525043347401, "grad_norm": 12.25, "learning_rate": 4.914222295246782e-06, "logits/chosen": -2.0203306674957275, "logits/rejected": -1.8653415441513062, "logps/chosen": -462.81646728515625, "logps/rejected": -471.01727294921875, "loss": 0.5173, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8872382640838623, "rewards/margins": 0.52006995677948, "rewards/rejected": -2.4073081016540527, "step": 1340 }, { "epoch": 0.17666110511335753, "grad_norm": 7.25, "learning_rate": 4.911230714578858e-06, "logits/chosen": -1.7289997339248657, "logits/rejected": -1.5992791652679443, "logps/chosen": -472.552490234375, "logps/rejected": -549.2454833984375, "loss": 0.4853, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0788731575012207, "rewards/margins": 0.9161497950553894, "rewards/rejected": -2.995023012161255, "step": 1350 }, { "epoch": 0.177969705891975, "grad_norm": 10.5, "learning_rate": 4.908188799380558e-06, "logits/chosen": -1.513983964920044, "logits/rejected": -1.4120738506317139, "logps/chosen": -492.3267517089844, "logps/rejected": -480.42388916015625, "loss": 0.5365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4247565269470215, "rewards/margins": 0.6793891191482544, "rewards/rejected": -3.1041455268859863, "step": 1360 }, { "epoch": 0.17927830667059247, "grad_norm": 8.8125, "learning_rate": 4.905096613151975e-06, "logits/chosen": -1.5550816059112549, "logits/rejected": -1.334904670715332, "logps/chosen": -416.26483154296875, "logps/rejected": -501.09442138671875, "loss": 0.4531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.2082018852233887, "rewards/margins": 1.012620210647583, "rewards/rejected": -3.2208218574523926, "step": 1370 }, { "epoch": 0.18058690744920994, "grad_norm": 13.0625, "learning_rate": 4.90195422044261e-06, "logits/chosen": -1.5727694034576416, "logits/rejected": -1.5611732006072998, "logps/chosen": -480.3589782714844, "logps/rejected": -671.0353393554688, "loss": 0.4422, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2664101123809814, "rewards/margins": 1.2940673828125, "rewards/rejected": -3.5604774951934814, "step": 1380 }, { "epoch": 0.1818955082278274, "grad_norm": 12.25, "learning_rate": 4.898761686850028e-06, "logits/chosen": -1.7550901174545288, "logits/rejected": -1.6036916971206665, "logps/chosen": -518.725341796875, "logps/rejected": -571.9803466796875, "loss": 0.538, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.364833354949951, "rewards/margins": 1.0541280508041382, "rewards/rejected": -3.418961763381958, "step": 1390 }, { "epoch": 0.18320410900644485, "grad_norm": 14.1875, "learning_rate": 4.895519079018485e-06, "logits/chosen": -1.2297025918960571, "logits/rejected": -1.191098928451538, "logps/chosen": -533.7628173828125, "logps/rejected": -645.5424194335938, "loss": 0.5412, "rewards/accuracies": 0.75, "rewards/chosen": -3.0234737396240234, "rewards/margins": 0.8556011319160461, "rewards/rejected": -3.8790745735168457, "step": 1400 }, { "epoch": 0.18320410900644485, "eval_logits/chosen": -1.1873856782913208, "eval_logits/rejected": -1.0746079683303833, "eval_logps/chosen": -556.3367309570312, "eval_logps/rejected": -628.5355834960938, "eval_loss": 0.5652905106544495, "eval_rewards/accuracies": 0.7120000123977661, "eval_rewards/chosen": -2.910919427871704, "eval_rewards/margins": 0.9624634981155396, "eval_rewards/rejected": -3.873382806777954, "eval_runtime": 695.9, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 1400 }, { "epoch": 0.18451270978506232, "grad_norm": 8.25, "learning_rate": 4.89222646463754e-06, "logits/chosen": -1.469787359237671, "logits/rejected": -1.3934171199798584, "logps/chosen": -530.5020751953125, "logps/rejected": -592.0689086914062, "loss": 0.5353, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7660608291625977, "rewards/margins": 0.7049436569213867, "rewards/rejected": -3.4710044860839844, "step": 1410 }, { "epoch": 0.1858213105636798, "grad_norm": 13.5, "learning_rate": 4.888883912440642e-06, "logits/chosen": -1.7719188928604126, "logits/rejected": -1.5662219524383545, "logps/chosen": -541.6236572265625, "logps/rejected": -640.8180541992188, "loss": 0.4952, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.249315023422241, "rewards/margins": 1.2302896976470947, "rewards/rejected": -3.479605197906494, "step": 1420 }, { "epoch": 0.18712991134229726, "grad_norm": 7.875, "learning_rate": 4.885491492203688e-06, "logits/chosen": -1.7445310354232788, "logits/rejected": -1.4956071376800537, "logps/chosen": -531.9814453125, "logps/rejected": -579.8106689453125, "loss": 0.4646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.2735629081726074, "rewards/margins": 1.0514878034591675, "rewards/rejected": -3.3250508308410645, "step": 1430 }, { "epoch": 0.18843851212091473, "grad_norm": 12.125, "learning_rate": 4.882049274743578e-06, "logits/chosen": -1.6057186126708984, "logits/rejected": -1.3012301921844482, "logps/chosen": -502.780029296875, "logps/rejected": -552.8013916015625, "loss": 0.5089, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9857542514801025, "rewards/margins": 1.0310163497924805, "rewards/rejected": -3.016770601272583, "step": 1440 }, { "epoch": 0.18974711289953217, "grad_norm": 23.375, "learning_rate": 4.878557331916729e-06, "logits/chosen": -1.2906947135925293, "logits/rejected": -1.2940791845321655, "logps/chosen": -483.41741943359375, "logps/rejected": -596.1483154296875, "loss": 0.5703, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.628056764602661, "rewards/margins": 0.7995930910110474, "rewards/rejected": -3.427649736404419, "step": 1450 }, { "epoch": 0.19105571367814964, "grad_norm": 17.625, "learning_rate": 4.875015736617576e-06, "logits/chosen": -1.3006136417388916, "logits/rejected": -1.2754733562469482, "logps/chosen": -485.7308654785156, "logps/rejected": -512.1370849609375, "loss": 0.5289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.492917537689209, "rewards/margins": 0.6682485342025757, "rewards/rejected": -3.161165952682495, "step": 1460 }, { "epoch": 0.1923643144567671, "grad_norm": 11.6875, "learning_rate": 4.8714245627770515e-06, "logits/chosen": -1.568092703819275, "logits/rejected": -1.2629023790359497, "logps/chosen": -412.546630859375, "logps/rejected": -418.42889404296875, "loss": 0.523, "rewards/accuracies": 0.75, "rewards/chosen": -1.7840602397918701, "rewards/margins": 0.655758261680603, "rewards/rejected": -2.4398183822631836, "step": 1470 }, { "epoch": 0.19367291523538457, "grad_norm": 18.25, "learning_rate": 4.8677838853610445e-06, "logits/chosen": -1.4263960123062134, "logits/rejected": -1.2516647577285767, "logps/chosen": -401.9312744140625, "logps/rejected": -470.92315673828125, "loss": 0.4963, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9050906896591187, "rewards/margins": 1.2092244625091553, "rewards/rejected": -3.1143152713775635, "step": 1480 }, { "epoch": 0.19498151601400202, "grad_norm": 21.5, "learning_rate": 4.864093780368828e-06, "logits/chosen": -1.2440619468688965, "logits/rejected": -1.111897349357605, "logps/chosen": -423.28076171875, "logps/rejected": -545.4578857421875, "loss": 0.4239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6994727849960327, "rewards/margins": 1.295931339263916, "rewards/rejected": -2.9954044818878174, "step": 1490 }, { "epoch": 0.19629011679261948, "grad_norm": 38.25, "learning_rate": 4.860354324831482e-06, "logits/chosen": -1.5215343236923218, "logits/rejected": -1.4310561418533325, "logps/chosen": -491.52447509765625, "logps/rejected": -523.9422607421875, "loss": 0.5234, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.333941698074341, "rewards/margins": 0.7823376655578613, "rewards/rejected": -3.116279125213623, "step": 1500 }, { "epoch": 0.19629011679261948, "eval_logits/chosen": -1.1557797193527222, "eval_logits/rejected": -1.039581537246704, "eval_logps/chosen": -481.9322814941406, "eval_logps/rejected": -562.9971923828125, "eval_loss": 0.558701753616333, "eval_rewards/accuracies": 0.7129999995231628, "eval_rewards/chosen": -2.1668753623962402, "eval_rewards/margins": 1.0511237382888794, "eval_rewards/rejected": -3.21799898147583, "eval_runtime": 695.7127, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 1500 }, { "epoch": 0.19759871757123695, "grad_norm": 12.3125, "learning_rate": 4.856565596810279e-06, "logits/chosen": -1.1147475242614746, "logits/rejected": -0.9326680898666382, "logps/chosen": -485.82525634765625, "logps/rejected": -591.4855346679688, "loss": 0.4781, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3456830978393555, "rewards/margins": 1.2880032062530518, "rewards/rejected": -3.633686065673828, "step": 1510 }, { "epoch": 0.19890731834985442, "grad_norm": 21.25, "learning_rate": 4.852727675395056e-06, "logits/chosen": -1.7259109020233154, "logits/rejected": -1.5380630493164062, "logps/chosen": -502.0819396972656, "logps/rejected": -559.0609741210938, "loss": 0.4859, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7101891040802002, "rewards/margins": 1.3138973712921143, "rewards/rejected": -3.0240862369537354, "step": 1520 }, { "epoch": 0.2002159191284719, "grad_norm": 14.125, "learning_rate": 4.848840640702565e-06, "logits/chosen": -1.6375019550323486, "logits/rejected": -1.3800207376480103, "logps/chosen": -461.59490966796875, "logps/rejected": -524.2827758789062, "loss": 0.5639, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1832072734832764, "rewards/margins": 0.9404889941215515, "rewards/rejected": -3.1236965656280518, "step": 1530 }, { "epoch": 0.20152451990708933, "grad_norm": 14.8125, "learning_rate": 4.844904573874798e-06, "logits/chosen": -1.576744794845581, "logits/rejected": -1.5538368225097656, "logps/chosen": -424.8856506347656, "logps/rejected": -528.5052490234375, "loss": 0.4964, "rewards/accuracies": 0.75, "rewards/chosen": -1.7674299478530884, "rewards/margins": 1.2252588272094727, "rewards/rejected": -2.9926888942718506, "step": 1540 }, { "epoch": 0.2028331206857068, "grad_norm": 12.1875, "learning_rate": 4.840919557077297e-06, "logits/chosen": -1.937974214553833, "logits/rejected": -1.5786306858062744, "logps/chosen": -535.5740966796875, "logps/rejected": -634.5531616210938, "loss": 0.5936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0312094688415527, "rewards/margins": 1.6538267135620117, "rewards/rejected": -3.6850364208221436, "step": 1550 }, { "epoch": 0.20414172146432427, "grad_norm": 15.5625, "learning_rate": 4.836885673497435e-06, "logits/chosen": -1.71206533908844, "logits/rejected": -1.6618907451629639, "logps/chosen": -470.39654541015625, "logps/rejected": -597.3626098632812, "loss": 0.6018, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9266071319580078, "rewards/margins": 1.2596946954727173, "rewards/rejected": -3.1863019466400146, "step": 1560 }, { "epoch": 0.20545032224294174, "grad_norm": 11.75, "learning_rate": 4.832803007342679e-06, "logits/chosen": -1.8466041088104248, "logits/rejected": -1.8102772235870361, "logps/chosen": -415.70147705078125, "logps/rejected": -441.17510986328125, "loss": 0.6309, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8670305013656616, "rewards/margins": 0.4250961244106293, "rewards/rejected": -2.2921266555786133, "step": 1570 }, { "epoch": 0.2067589230215592, "grad_norm": 11.0625, "learning_rate": 4.828671643838839e-06, "logits/chosen": -1.6601440906524658, "logits/rejected": -1.8990176916122437, "logps/chosen": -356.1528625488281, "logps/rejected": -485.62554931640625, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5033555030822754, "rewards/margins": 0.6190498471260071, "rewards/rejected": -2.122405529022217, "step": 1580 }, { "epoch": 0.20806752380017665, "grad_norm": 5.5, "learning_rate": 4.824491669228279e-06, "logits/chosen": -2.021857261657715, "logits/rejected": -1.6990569829940796, "logps/chosen": -355.2151794433594, "logps/rejected": -403.1901550292969, "loss": 0.4869, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3534119129180908, "rewards/margins": 0.8787269592285156, "rewards/rejected": -2.2321391105651855, "step": 1590 }, { "epoch": 0.20937612457879412, "grad_norm": 16.875, "learning_rate": 4.8202631707681245e-06, "logits/chosen": -1.9706346988677979, "logits/rejected": -1.5974613428115845, "logps/chosen": -479.8606872558594, "logps/rejected": -492.86962890625, "loss": 0.4682, "rewards/accuracies": 0.75, "rewards/chosen": -1.911543607711792, "rewards/margins": 0.9140397906303406, "rewards/rejected": -2.8255836963653564, "step": 1600 }, { "epoch": 0.20937612457879412, "eval_logits/chosen": -1.3887721300125122, "eval_logits/rejected": -1.2721153497695923, "eval_logps/chosen": -493.1747741699219, "eval_logps/rejected": -560.3236083984375, "eval_loss": 0.557616651058197, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": -2.2793002128601074, "eval_rewards/margins": 0.9119632840156555, "eval_rewards/rejected": -3.191263198852539, "eval_runtime": 695.5779, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 1600 }, { "epoch": 0.2106847253574116, "grad_norm": 12.375, "learning_rate": 4.815986236728437e-06, "logits/chosen": -1.7245861291885376, "logits/rejected": -1.695064902305603, "logps/chosen": -541.4497680664062, "logps/rejected": -569.51025390625, "loss": 0.7333, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.708329916000366, "rewards/margins": 0.2553494870662689, "rewards/rejected": -2.963679790496826, "step": 1610 }, { "epoch": 0.21199332613602906, "grad_norm": 12.75, "learning_rate": 4.811660956390372e-06, "logits/chosen": -1.6128660440444946, "logits/rejected": -1.5309574604034424, "logps/chosen": -507.1900329589844, "logps/rejected": -578.9901123046875, "loss": 0.5572, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.202852249145508, "rewards/margins": 0.9438586235046387, "rewards/rejected": -3.1467108726501465, "step": 1620 }, { "epoch": 0.21330192691464653, "grad_norm": 16.875, "learning_rate": 4.807287420044319e-06, "logits/chosen": -1.9317996501922607, "logits/rejected": -1.7577718496322632, "logps/chosen": -421.19598388671875, "logps/rejected": -510.766845703125, "loss": 0.5693, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.054368495941162, "rewards/margins": 0.6636547446250916, "rewards/rejected": -2.7180235385894775, "step": 1630 }, { "epoch": 0.21461052769326397, "grad_norm": 6.4375, "learning_rate": 4.802865718988008e-06, "logits/chosen": -1.7789043188095093, "logits/rejected": -1.7528728246688843, "logps/chosen": -442.048828125, "logps/rejected": -490.5374450683594, "loss": 0.5413, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9083898067474365, "rewards/margins": 0.6397808790206909, "rewards/rejected": -2.548171043395996, "step": 1640 }, { "epoch": 0.21591912847188144, "grad_norm": 16.875, "learning_rate": 4.798395945524615e-06, "logits/chosen": -1.842728614807129, "logits/rejected": -1.7828716039657593, "logps/chosen": -451.6410217285156, "logps/rejected": -520.708740234375, "loss": 0.6756, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7983973026275635, "rewards/margins": 0.4851222038269043, "rewards/rejected": -2.2835192680358887, "step": 1650 }, { "epoch": 0.2172277292504989, "grad_norm": 9.6875, "learning_rate": 4.793878192960823e-06, "logits/chosen": -1.7715752124786377, "logits/rejected": -1.7273404598236084, "logps/chosen": -410.7247009277344, "logps/rejected": -500.2601013183594, "loss": 0.5042, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.692538857460022, "rewards/margins": 0.8066767454147339, "rewards/rejected": -2.499215602874756, "step": 1660 }, { "epoch": 0.21853633002911638, "grad_norm": 11.6875, "learning_rate": 4.789312555604887e-06, "logits/chosen": -1.6747548580169678, "logits/rejected": -1.592901349067688, "logps/chosen": -375.0950012207031, "logps/rejected": -456.3650817871094, "loss": 0.5684, "rewards/accuracies": 0.625, "rewards/chosen": -1.738579511642456, "rewards/margins": 0.7269459962844849, "rewards/rejected": -2.4655253887176514, "step": 1670 }, { "epoch": 0.21984493080773382, "grad_norm": 7.84375, "learning_rate": 4.784699128764654e-06, "logits/chosen": -1.8446094989776611, "logits/rejected": -1.5997010469436646, "logps/chosen": -416.3131408691406, "logps/rejected": -491.85272216796875, "loss": 0.4489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3711521625518799, "rewards/margins": 1.2244420051574707, "rewards/rejected": -2.5955939292907715, "step": 1680 }, { "epoch": 0.2211535315863513, "grad_norm": 15.3125, "learning_rate": 4.780038008745581e-06, "logits/chosen": -1.6630077362060547, "logits/rejected": -1.5364792346954346, "logps/chosen": -402.2394104003906, "logps/rejected": -457.0186462402344, "loss": 0.5534, "rewards/accuracies": 0.75, "rewards/chosen": -1.533756136894226, "rewards/margins": 0.8668249249458313, "rewards/rejected": -2.400580883026123, "step": 1690 }, { "epoch": 0.22246213236496876, "grad_norm": 14.9375, "learning_rate": 4.775329292848721e-06, "logits/chosen": -1.7169631719589233, "logits/rejected": -1.5795819759368896, "logps/chosen": -428.14031982421875, "logps/rejected": -480.5162658691406, "loss": 0.5418, "rewards/accuracies": 0.625, "rewards/chosen": -1.5826160907745361, "rewards/margins": 0.4475887417793274, "rewards/rejected": -2.030205011367798, "step": 1700 }, { "epoch": 0.22246213236496876, "eval_logits/chosen": -1.3556407690048218, "eval_logits/rejected": -1.244292140007019, "eval_logps/chosen": -380.1678771972656, "eval_logps/rejected": -428.53924560546875, "eval_loss": 0.5463586449623108, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": -1.149231195449829, "eval_rewards/margins": 0.7241882085800171, "eval_rewards/rejected": -1.8734192848205566, "eval_runtime": 696.2956, "eval_samples_per_second": 2.872, "eval_steps_per_second": 0.18, "step": 1700 }, { "epoch": 0.22377073314358623, "grad_norm": 15.0, "learning_rate": 4.770573079368691e-06, "logits/chosen": -1.709832787513733, "logits/rejected": -1.7700859308242798, "logps/chosen": -428.7425231933594, "logps/rejected": -509.1302795410156, "loss": 0.5339, "rewards/accuracies": 0.75, "rewards/chosen": -1.4129393100738525, "rewards/margins": 0.634269118309021, "rewards/rejected": -2.047208309173584, "step": 1710 }, { "epoch": 0.2250793339222037, "grad_norm": 11.25, "learning_rate": 4.765769467591626e-06, "logits/chosen": -1.4418147802352905, "logits/rejected": -1.5490909814834595, "logps/chosen": -348.7679748535156, "logps/rejected": -516.7386474609375, "loss": 0.508, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3769243955612183, "rewards/margins": 1.078495979309082, "rewards/rejected": -2.4554202556610107, "step": 1720 }, { "epoch": 0.22638793470082114, "grad_norm": 8.875, "learning_rate": 4.760918557793096e-06, "logits/chosen": -1.430027723312378, "logits/rejected": -1.3021106719970703, "logps/chosen": -397.8459777832031, "logps/rejected": -496.3700256347656, "loss": 0.5278, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6415824890136719, "rewards/margins": 1.072715163230896, "rewards/rejected": -2.7142977714538574, "step": 1730 }, { "epoch": 0.2276965354794386, "grad_norm": 7.6875, "learning_rate": 4.756020451236025e-06, "logits/chosen": -1.5673891305923462, "logits/rejected": -1.270593523979187, "logps/chosen": -535.1604614257812, "logps/rejected": -642.0374145507812, "loss": 0.533, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3912014961242676, "rewards/margins": 0.9858331680297852, "rewards/rejected": -3.3770346641540527, "step": 1740 }, { "epoch": 0.22900513625805607, "grad_norm": 15.875, "learning_rate": 4.751075250168569e-06, "logits/chosen": -1.4712398052215576, "logits/rejected": -1.4582046270370483, "logps/chosen": -511.91412353515625, "logps/rejected": -574.1473388671875, "loss": 0.5212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.4587905406951904, "rewards/margins": 0.9689180254936218, "rewards/rejected": -3.427708864212036, "step": 1750 }, { "epoch": 0.23031373703667354, "grad_norm": 12.25, "learning_rate": 4.746083057821981e-06, "logits/chosen": -1.6008590459823608, "logits/rejected": -1.585439920425415, "logps/chosen": -377.8157653808594, "logps/rejected": -493.4844665527344, "loss": 0.5296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6869614124298096, "rewards/margins": 0.9665346145629883, "rewards/rejected": -2.653496026992798, "step": 1760 }, { "epoch": 0.231622337815291, "grad_norm": 7.6875, "learning_rate": 4.741043978408463e-06, "logits/chosen": -1.8323075771331787, "logits/rejected": -1.5091437101364136, "logps/chosen": -421.3490295410156, "logps/rejected": -447.42449951171875, "loss": 0.6054, "rewards/accuracies": 0.625, "rewards/chosen": -1.3869025707244873, "rewards/margins": 0.9462822079658508, "rewards/rejected": -2.3331847190856934, "step": 1770 }, { "epoch": 0.23293093859390845, "grad_norm": 14.875, "learning_rate": 4.735958117118983e-06, "logits/chosen": -1.7147724628448486, "logits/rejected": -1.7283947467803955, "logps/chosen": -358.48480224609375, "logps/rejected": -464.3477478027344, "loss": 0.5646, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4545023441314697, "rewards/margins": 0.6632472276687622, "rewards/rejected": -2.1177496910095215, "step": 1780 }, { "epoch": 0.23423953937252592, "grad_norm": 6.71875, "learning_rate": 4.730825580121084e-06, "logits/chosen": -1.6289749145507812, "logits/rejected": -1.5391643047332764, "logps/chosen": -426.4287109375, "logps/rejected": -477.80438232421875, "loss": 0.4482, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.489031434059143, "rewards/margins": 1.1201632022857666, "rewards/rejected": -2.609194755554199, "step": 1790 }, { "epoch": 0.2355481401511434, "grad_norm": 13.625, "learning_rate": 4.725646474556666e-06, "logits/chosen": -1.5141410827636719, "logits/rejected": -1.3265539407730103, "logps/chosen": -402.4884338378906, "logps/rejected": -551.177001953125, "loss": 0.5283, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.941261887550354, "rewards/margins": 1.3314204216003418, "rewards/rejected": -3.2726821899414062, "step": 1800 }, { "epoch": 0.2355481401511434, "eval_logits/chosen": -1.3165640830993652, "eval_logits/rejected": -1.2144968509674072, "eval_logps/chosen": -468.1107482910156, "eval_logps/rejected": -535.9320678710938, "eval_loss": 0.534599244594574, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": -2.0286598205566406, "eval_rewards/margins": 0.9186881184577942, "eval_rewards/rejected": -2.94734787940979, "eval_runtime": 695.2957, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 1800 }, { "epoch": 0.23685674092976086, "grad_norm": 9.9375, "learning_rate": 4.720420908539748e-06, "logits/chosen": -1.9412696361541748, "logits/rejected": -1.5824329853057861, "logps/chosen": -517.2816772460938, "logps/rejected": -486.09088134765625, "loss": 0.5935, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0832431316375732, "rewards/margins": 0.6357382535934448, "rewards/rejected": -2.7189812660217285, "step": 1810 }, { "epoch": 0.2381653417083783, "grad_norm": 14.5, "learning_rate": 4.715148991154216e-06, "logits/chosen": -1.727787733078003, "logits/rejected": -1.5226155519485474, "logps/chosen": -474.5965270996094, "logps/rejected": -504.1936950683594, "loss": 0.5238, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0656206607818604, "rewards/margins": 0.6738082766532898, "rewards/rejected": -2.739428758621216, "step": 1820 }, { "epoch": 0.23947394248699577, "grad_norm": 10.5625, "learning_rate": 4.709830832451538e-06, "logits/chosen": -1.637353539466858, "logits/rejected": -1.239384651184082, "logps/chosen": -477.70245361328125, "logps/rejected": -536.7779541015625, "loss": 0.5576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0769999027252197, "rewards/margins": 1.1462960243225098, "rewards/rejected": -3.2232959270477295, "step": 1830 }, { "epoch": 0.24078254326561324, "grad_norm": 9.375, "learning_rate": 4.704466543448477e-06, "logits/chosen": -1.6385681629180908, "logits/rejected": -1.5267423391342163, "logps/chosen": -511.4849548339844, "logps/rejected": -577.7381591796875, "loss": 0.5037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1862730979919434, "rewards/margins": 0.9554746747016907, "rewards/rejected": -3.1417479515075684, "step": 1840 }, { "epoch": 0.2420911440442307, "grad_norm": 7.9375, "learning_rate": 4.699056236124762e-06, "logits/chosen": -1.677210807800293, "logits/rejected": -1.590447187423706, "logps/chosen": -458.7686462402344, "logps/rejected": -557.1763305664062, "loss": 0.5467, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1360931396484375, "rewards/margins": 1.0241457223892212, "rewards/rejected": -3.160238742828369, "step": 1850 }, { "epoch": 0.24339974482284818, "grad_norm": 7.84375, "learning_rate": 4.693600023420758e-06, "logits/chosen": -1.7774584293365479, "logits/rejected": -1.691724181175232, "logps/chosen": -432.95892333984375, "logps/rejected": -488.84002685546875, "loss": 0.5346, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8443599939346313, "rewards/margins": 0.6142092347145081, "rewards/rejected": -2.458569049835205, "step": 1860 }, { "epoch": 0.24470834560146562, "grad_norm": 10.5, "learning_rate": 4.688098019235108e-06, "logits/chosen": -1.9044535160064697, "logits/rejected": -1.5404713153839111, "logps/chosen": -470.3871154785156, "logps/rejected": -523.7406005859375, "loss": 0.446, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5809365510940552, "rewards/margins": 1.0278409719467163, "rewards/rejected": -2.6087777614593506, "step": 1870 }, { "epoch": 0.2460169463800831, "grad_norm": 10.5625, "learning_rate": 4.682550338422353e-06, "logits/chosen": -1.4271998405456543, "logits/rejected": -1.274027705192566, "logps/chosen": -468.80950927734375, "logps/rejected": -552.8151245117188, "loss": 0.4827, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1071698665618896, "rewards/margins": 1.0387284755706787, "rewards/rejected": -3.1458983421325684, "step": 1880 }, { "epoch": 0.24732554715870056, "grad_norm": 20.25, "learning_rate": 4.676957096790536e-06, "logits/chosen": -1.2747347354888916, "logits/rejected": -1.1613373756408691, "logps/chosen": -450.5853576660156, "logps/rejected": -555.6519165039062, "loss": 0.5564, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4991939067840576, "rewards/margins": 0.936625599861145, "rewards/rejected": -3.435819625854492, "step": 1890 }, { "epoch": 0.24863414793731803, "grad_norm": 17.125, "learning_rate": 4.671318411098782e-06, "logits/chosen": -1.5479834079742432, "logits/rejected": -1.4726171493530273, "logps/chosen": -461.02569580078125, "logps/rejected": -531.9478759765625, "loss": 0.4953, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.833959937095642, "rewards/margins": 1.1573578119277954, "rewards/rejected": -2.9913179874420166, "step": 1900 }, { "epoch": 0.24863414793731803, "eval_logits/chosen": -1.4329824447631836, "eval_logits/rejected": -1.3297497034072876, "eval_logps/chosen": -430.8226318359375, "eval_logps/rejected": -490.2633056640625, "eval_loss": 0.5347898602485657, "eval_rewards/accuracies": 0.7210000157356262, "eval_rewards/chosen": -1.6557786464691162, "eval_rewards/margins": 0.8348817825317383, "eval_rewards/rejected": -2.4906604290008545, "eval_runtime": 694.1282, "eval_samples_per_second": 2.881, "eval_steps_per_second": 0.18, "step": 1900 }, { "epoch": 0.2499427487159355, "grad_norm": 11.4375, "learning_rate": 4.665634399054864e-06, "logits/chosen": -1.813912034034729, "logits/rejected": -1.6072871685028076, "logps/chosen": -409.1114807128906, "logps/rejected": -426.57342529296875, "loss": 0.616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6140892505645752, "rewards/margins": 0.6223729848861694, "rewards/rejected": -2.236462354660034, "step": 1910 }, { "epoch": 0.25125134949455297, "grad_norm": 13.4375, "learning_rate": 4.659905179312743e-06, "logits/chosen": -1.6905174255371094, "logits/rejected": -1.721874475479126, "logps/chosen": -408.04193115234375, "logps/rejected": -493.72406005859375, "loss": 0.5809, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6957309246063232, "rewards/margins": 0.6325086355209351, "rewards/rejected": -2.3282394409179688, "step": 1920 }, { "epoch": 0.2525599502731704, "grad_norm": 10.0625, "learning_rate": 4.654130871470093e-06, "logits/chosen": -1.9917497634887695, "logits/rejected": -1.5960171222686768, "logps/chosen": -560.64794921875, "logps/rejected": -571.5491943359375, "loss": 0.5265, "rewards/accuracies": 0.75, "rewards/chosen": -2.1901116371154785, "rewards/margins": 1.1089136600494385, "rewards/rejected": -3.299025774002075, "step": 1930 }, { "epoch": 0.25386855105178785, "grad_norm": 16.375, "learning_rate": 4.6483115960658045e-06, "logits/chosen": -1.6111366748809814, "logits/rejected": -1.618922233581543, "logps/chosen": -424.2569274902344, "logps/rejected": -555.4577026367188, "loss": 0.5275, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0012435913085938, "rewards/margins": 1.2415132522583008, "rewards/rejected": -3.2427566051483154, "step": 1940 }, { "epoch": 0.25517715183040535, "grad_norm": 11.1875, "learning_rate": 4.642447474577466e-06, "logits/chosen": -1.993721604347229, "logits/rejected": -1.8044750690460205, "logps/chosen": -500.1758728027344, "logps/rejected": -562.5256958007812, "loss": 0.505, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.127443313598633, "rewards/margins": 0.8999859690666199, "rewards/rejected": -3.0274291038513184, "step": 1950 }, { "epoch": 0.2564857526090228, "grad_norm": 6.0, "learning_rate": 4.636538629418832e-06, "logits/chosen": -1.8985040187835693, "logits/rejected": -1.737816572189331, "logps/chosen": -503.20782470703125, "logps/rejected": -599.769775390625, "loss": 0.5466, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9882752895355225, "rewards/margins": 0.8945485949516296, "rewards/rejected": -2.882823944091797, "step": 1960 }, { "epoch": 0.2577943533876403, "grad_norm": 9.875, "learning_rate": 4.630585183937263e-06, "logits/chosen": -1.9832836389541626, "logits/rejected": -1.8316504955291748, "logps/chosen": -392.4055480957031, "logps/rejected": -480.58514404296875, "loss": 0.4329, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6308002471923828, "rewards/margins": 1.137982726097107, "rewards/rejected": -2.7687830924987793, "step": 1970 }, { "epoch": 0.2591029541662577, "grad_norm": 36.25, "learning_rate": 4.6245872624111535e-06, "logits/chosen": -1.669372797012329, "logits/rejected": -1.680289626121521, "logps/chosen": -421.1304626464844, "logps/rejected": -517.0955810546875, "loss": 0.4788, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8409557342529297, "rewards/margins": 1.0304902791976929, "rewards/rejected": -2.871446132659912, "step": 1980 }, { "epoch": 0.26041155494487517, "grad_norm": 16.875, "learning_rate": 4.618544990047336e-06, "logits/chosen": -1.862768530845642, "logits/rejected": -1.6982202529907227, "logps/chosen": -546.98779296875, "logps/rejected": -560.3564453125, "loss": 0.4876, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6918388605117798, "rewards/margins": 0.9641841053962708, "rewards/rejected": -2.6560230255126953, "step": 1990 }, { "epoch": 0.26172015572349266, "grad_norm": 8.3125, "learning_rate": 4.612458492978473e-06, "logits/chosen": -1.6869895458221436, "logits/rejected": -1.573582410812378, "logps/chosen": -485.7078552246094, "logps/rejected": -580.96533203125, "loss": 0.5594, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2954201698303223, "rewards/margins": 0.857617974281311, "rewards/rejected": -3.1530380249023438, "step": 2000 }, { "epoch": 0.26172015572349266, "eval_logits/chosen": -1.3082515001296997, "eval_logits/rejected": -1.1972744464874268, "eval_logps/chosen": -431.32965087890625, "eval_logps/rejected": -509.7210388183594, "eval_loss": 0.5367922186851501, "eval_rewards/accuracies": 0.7319999933242798, "eval_rewards/chosen": -1.66084885597229, "eval_rewards/margins": 1.0243887901306152, "eval_rewards/rejected": -2.6852378845214844, "eval_runtime": 695.1586, "eval_samples_per_second": 2.877, "eval_steps_per_second": 0.18, "step": 2000 }, { "epoch": 0.2630287565021101, "grad_norm": 7.8125, "learning_rate": 4.606327898260413e-06, "logits/chosen": -1.58231520652771, "logits/rejected": -1.7035701274871826, "logps/chosen": -379.26251220703125, "logps/rejected": -538.2137451171875, "loss": 0.4714, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6309425830841064, "rewards/margins": 1.0421379804611206, "rewards/rejected": -2.6730804443359375, "step": 2010 }, { "epoch": 0.2643373572807276, "grad_norm": 33.5, "learning_rate": 4.600153333869549e-06, "logits/chosen": -1.523889183998108, "logits/rejected": -1.4796324968338013, "logps/chosen": -391.491943359375, "logps/rejected": -477.1109313964844, "loss": 0.5007, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7249523401260376, "rewards/margins": 0.8204771876335144, "rewards/rejected": -2.5454297065734863, "step": 2020 }, { "epoch": 0.26564595805934504, "grad_norm": 17.125, "learning_rate": 4.593934928700141e-06, "logits/chosen": -1.603111982345581, "logits/rejected": -1.4428770542144775, "logps/chosen": -400.0216369628906, "logps/rejected": -526.5708618164062, "loss": 0.4901, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8261020183563232, "rewards/margins": 1.261056661605835, "rewards/rejected": -3.087158679962158, "step": 2030 }, { "epoch": 0.2669545588379625, "grad_norm": 30.875, "learning_rate": 4.587672812561626e-06, "logits/chosen": -1.4515470266342163, "logits/rejected": -1.3571996688842773, "logps/chosen": -500.518798828125, "logps/rejected": -516.5394897460938, "loss": 0.6469, "rewards/accuracies": 0.75, "rewards/chosen": -2.059715747833252, "rewards/margins": 0.9580955505371094, "rewards/rejected": -3.0178115367889404, "step": 2040 }, { "epoch": 0.26826315961658, "grad_norm": 9.1875, "learning_rate": 4.581367116175911e-06, "logits/chosen": -1.7457937002182007, "logits/rejected": -1.5299389362335205, "logps/chosen": -471.89666748046875, "logps/rejected": -587.0848388671875, "loss": 0.5296, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.217672824859619, "rewards/margins": 1.1461336612701416, "rewards/rejected": -3.3638064861297607, "step": 2050 }, { "epoch": 0.2695717603951974, "grad_norm": 7.96875, "learning_rate": 4.5750179711746416e-06, "logits/chosen": -1.7413562536239624, "logits/rejected": -1.5155575275421143, "logps/chosen": -453.27532958984375, "logps/rejected": -506.2584533691406, "loss": 0.5045, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8298585414886475, "rewards/margins": 0.9255337715148926, "rewards/rejected": -2.755392551422119, "step": 2060 }, { "epoch": 0.2708803611738149, "grad_norm": 8.3125, "learning_rate": 4.5686255100964535e-06, "logits/chosen": -1.7482128143310547, "logits/rejected": -1.707728385925293, "logps/chosen": -455.753173828125, "logps/rejected": -560.3676147460938, "loss": 0.4057, "rewards/accuracies": 0.625, "rewards/chosen": -1.8565988540649414, "rewards/margins": 0.931265652179718, "rewards/rejected": -2.7878644466400146, "step": 2070 }, { "epoch": 0.27218896195243236, "grad_norm": 15.25, "learning_rate": 4.562189866384209e-06, "logits/chosen": -1.575791597366333, "logits/rejected": -1.3319919109344482, "logps/chosen": -505.78216552734375, "logps/rejected": -661.2314453125, "loss": 0.5605, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.085358142852783, "rewards/margins": 1.5267078876495361, "rewards/rejected": -3.6120657920837402, "step": 2080 }, { "epoch": 0.2734975627310498, "grad_norm": 10.125, "learning_rate": 4.555711174382209e-06, "logits/chosen": -1.664647102355957, "logits/rejected": -1.5583523511886597, "logps/chosen": -480.11114501953125, "logps/rejected": -615.2495727539062, "loss": 0.4726, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8405815362930298, "rewards/margins": 1.4181773662567139, "rewards/rejected": -3.258759021759033, "step": 2090 }, { "epoch": 0.2748061635096673, "grad_norm": 9.5, "learning_rate": 4.549189569333387e-06, "logits/chosen": -1.9354374408721924, "logits/rejected": -1.7331892251968384, "logps/chosen": -417.3174743652344, "logps/rejected": -546.5352783203125, "loss": 0.4645, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7136468887329102, "rewards/margins": 1.272125482559204, "rewards/rejected": -2.9857726097106934, "step": 2100 }, { "epoch": 0.2748061635096673, "eval_logits/chosen": -1.3679178953170776, "eval_logits/rejected": -1.2530938386917114, "eval_logps/chosen": -433.5981140136719, "eval_logps/rejected": -518.1336059570312, "eval_loss": 0.527845561504364, "eval_rewards/accuracies": 0.7419999837875366, "eval_rewards/chosen": -1.6835333108901978, "eval_rewards/margins": 1.0858298540115356, "eval_rewards/rejected": -2.7693636417388916, "eval_runtime": 694.6418, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 2100 }, { "epoch": 0.27611476428828474, "grad_norm": 20.375, "learning_rate": 4.542625187376491e-06, "logits/chosen": -1.5003135204315186, "logits/rejected": -1.568424940109253, "logps/chosen": -386.8911437988281, "logps/rejected": -521.9318237304688, "loss": 0.5307, "rewards/accuracies": 0.625, "rewards/chosen": -1.7690387964248657, "rewards/margins": 1.1302963495254517, "rewards/rejected": -2.8993351459503174, "step": 2110 }, { "epoch": 0.27742336506690224, "grad_norm": 14.5625, "learning_rate": 4.536018165543239e-06, "logits/chosen": -1.8700469732284546, "logits/rejected": -1.6950562000274658, "logps/chosen": -428.56378173828125, "logps/rejected": -491.93377685546875, "loss": 0.5815, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6707319021224976, "rewards/margins": 0.9030668139457703, "rewards/rejected": -2.573798418045044, "step": 2120 }, { "epoch": 0.2787319658455197, "grad_norm": 8.875, "learning_rate": 4.529368641755453e-06, "logits/chosen": -1.903988242149353, "logits/rejected": -1.7123512029647827, "logps/chosen": -437.4276428222656, "logps/rejected": -528.9296875, "loss": 0.3701, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5356169939041138, "rewards/margins": 1.4532382488250732, "rewards/rejected": -2.9888553619384766, "step": 2130 }, { "epoch": 0.2800405666241371, "grad_norm": 20.25, "learning_rate": 4.522676754822189e-06, "logits/chosen": -1.5987727642059326, "logits/rejected": -1.5117210149765015, "logps/chosen": -500.75811767578125, "logps/rejected": -593.6842041015625, "loss": 0.5285, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3646938800811768, "rewards/margins": 1.1388493776321411, "rewards/rejected": -3.5035431385040283, "step": 2140 }, { "epoch": 0.2813491674027546, "grad_norm": 13.0, "learning_rate": 4.515942644436836e-06, "logits/chosen": -1.3344477415084839, "logits/rejected": -1.444723129272461, "logps/chosen": -418.39385986328125, "logps/rejected": -510.5967712402344, "loss": 0.5309, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2026305198669434, "rewards/margins": 0.6743675470352173, "rewards/rejected": -2.8769984245300293, "step": 2150 }, { "epoch": 0.28265776818137206, "grad_norm": 7.125, "learning_rate": 4.509166451174194e-06, "logits/chosen": -1.6802504062652588, "logits/rejected": -1.5801780223846436, "logps/chosen": -504.3372497558594, "logps/rejected": -594.43505859375, "loss": 0.5967, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.127978801727295, "rewards/margins": 1.331575632095337, "rewards/rejected": -3.4595541954040527, "step": 2160 }, { "epoch": 0.28396636895998956, "grad_norm": 6.46875, "learning_rate": 4.502348316487552e-06, "logits/chosen": -1.908908486366272, "logits/rejected": -1.5761991739273071, "logps/chosen": -512.3312377929688, "logps/rejected": -583.1212158203125, "loss": 0.5406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.450554370880127, "rewards/margins": 1.112906575202942, "rewards/rejected": -3.5634608268737793, "step": 2170 }, { "epoch": 0.285274969738607, "grad_norm": 7.0, "learning_rate": 4.495488382705722e-06, "logits/chosen": -1.519909143447876, "logits/rejected": -1.4418302774429321, "logps/chosen": -521.7791748046875, "logps/rejected": -566.8796997070312, "loss": 0.4693, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.099297046661377, "rewards/margins": 1.2185888290405273, "rewards/rejected": -3.3178858757019043, "step": 2180 }, { "epoch": 0.28658357051722444, "grad_norm": 16.625, "learning_rate": 4.488586793030075e-06, "logits/chosen": -1.5278743505477905, "logits/rejected": -1.3011085987091064, "logps/chosen": -541.6138305664062, "logps/rejected": -574.27978515625, "loss": 0.5376, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7595696449279785, "rewards/margins": 0.6572238802909851, "rewards/rejected": -3.4167933464050293, "step": 2190 }, { "epoch": 0.28789217129584194, "grad_norm": 6.65625, "learning_rate": 4.481643691531551e-06, "logits/chosen": -1.6951812505722046, "logits/rejected": -1.4649896621704102, "logps/chosen": -569.0792236328125, "logps/rejected": -577.2061157226562, "loss": 0.647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8665401935577393, "rewards/margins": 0.8099815249443054, "rewards/rejected": -3.6765217781066895, "step": 2200 }, { "epoch": 0.28789217129584194, "eval_logits/chosen": -1.247178077697754, "eval_logits/rejected": -1.1385564804077148, "eval_logps/chosen": -522.7142944335938, "eval_logps/rejected": -604.4509887695312, "eval_loss": 0.5191652774810791, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.574695348739624, "eval_rewards/margins": 1.0578423738479614, "eval_rewards/rejected": -3.632537364959717, "eval_runtime": 694.4148, "eval_samples_per_second": 2.88, "eval_steps_per_second": 0.18, "step": 2200 }, { "epoch": 0.2892007720744594, "grad_norm": 23.625, "learning_rate": 4.474659223147652e-06, "logits/chosen": -1.6916568279266357, "logits/rejected": -1.4921634197235107, "logps/chosen": -516.3152465820312, "logps/rejected": -542.737060546875, "loss": 0.6691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.532180070877075, "rewards/margins": 0.6819985508918762, "rewards/rejected": -3.2141788005828857, "step": 2210 }, { "epoch": 0.2905093728530769, "grad_norm": 11.1875, "learning_rate": 4.4676335336794125e-06, "logits/chosen": -1.7520105838775635, "logits/rejected": -1.8018484115600586, "logps/chosen": -493.07293701171875, "logps/rejected": -601.4270629882812, "loss": 0.4883, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.025521993637085, "rewards/margins": 0.969825267791748, "rewards/rejected": -2.995347023010254, "step": 2220 }, { "epoch": 0.2918179736316943, "grad_norm": 7.84375, "learning_rate": 4.46056676978836e-06, "logits/chosen": -1.6020281314849854, "logits/rejected": -1.4960014820098877, "logps/chosen": -370.4130859375, "logps/rejected": -502.255859375, "loss": 0.462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.941162347793579, "rewards/margins": 0.9414232969284058, "rewards/rejected": -2.8825855255126953, "step": 2230 }, { "epoch": 0.29312657441031176, "grad_norm": 10.6875, "learning_rate": 4.453459078993453e-06, "logits/chosen": -1.8231468200683594, "logits/rejected": -1.5648926496505737, "logps/chosen": -533.9778442382812, "logps/rejected": -605.9078979492188, "loss": 0.5031, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4999961853027344, "rewards/margins": 1.1722078323364258, "rewards/rejected": -3.672203779220581, "step": 2240 }, { "epoch": 0.29443517518892925, "grad_norm": 9.625, "learning_rate": 4.446310609668001e-06, "logits/chosen": -1.5453827381134033, "logits/rejected": -1.3776289224624634, "logps/chosen": -536.4172973632812, "logps/rejected": -651.4637451171875, "loss": 0.5345, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7681262493133545, "rewards/margins": 1.1350476741790771, "rewards/rejected": -3.9031741619110107, "step": 2250 }, { "epoch": 0.2957437759675467, "grad_norm": 14.625, "learning_rate": 4.439121511036562e-06, "logits/chosen": -1.3796781301498413, "logits/rejected": -1.1373788118362427, "logps/chosen": -530.2355346679688, "logps/rejected": -564.6028442382812, "loss": 0.6039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.660719394683838, "rewards/margins": 0.8497903943061829, "rewards/rejected": -3.510509490966797, "step": 2260 }, { "epoch": 0.29705237674616414, "grad_norm": 26.0, "learning_rate": 4.431891933171839e-06, "logits/chosen": -1.6418488025665283, "logits/rejected": -1.4657208919525146, "logps/chosen": -531.6192626953125, "logps/rejected": -566.5932006835938, "loss": 0.5029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.61444354057312, "rewards/margins": 0.7309584021568298, "rewards/rejected": -3.345402240753174, "step": 2270 }, { "epoch": 0.29836097752478163, "grad_norm": 14.5, "learning_rate": 4.424622026991536e-06, "logits/chosen": -1.4923235177993774, "logits/rejected": -1.172642469406128, "logps/chosen": -568.1429443359375, "logps/rejected": -617.9049072265625, "loss": 0.4839, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8112752437591553, "rewards/margins": 1.1487157344818115, "rewards/rejected": -3.9599907398223877, "step": 2280 }, { "epoch": 0.2996695783033991, "grad_norm": 10.375, "learning_rate": 4.417311944255215e-06, "logits/chosen": -1.6981703042984009, "logits/rejected": -1.2104758024215698, "logps/chosen": -524.5671997070312, "logps/rejected": -652.7542724609375, "loss": 0.5122, "rewards/accuracies": 0.875, "rewards/chosen": -2.322052001953125, "rewards/margins": 1.7539561986923218, "rewards/rejected": -4.076007843017578, "step": 2290 }, { "epoch": 0.3009781790820166, "grad_norm": 12.0625, "learning_rate": 4.409961837561122e-06, "logits/chosen": -1.8415409326553345, "logits/rejected": -1.5724987983703613, "logps/chosen": -487.8397521972656, "logps/rejected": -553.9661254882812, "loss": 0.45, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.173867702484131, "rewards/margins": 1.0187773704528809, "rewards/rejected": -3.1926448345184326, "step": 2300 }, { "epoch": 0.3009781790820166, "eval_logits/chosen": -1.259286880493164, "eval_logits/rejected": -1.1509246826171875, "eval_logps/chosen": -508.93621826171875, "eval_logps/rejected": -598.4382934570312, "eval_loss": 0.5134638547897339, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -2.4369146823883057, "eval_rewards/margins": 1.1354950666427612, "eval_rewards/rejected": -3.5724098682403564, "eval_runtime": 694.8809, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 2300 }, { "epoch": 0.302286779860634, "grad_norm": 8.75, "learning_rate": 4.402571860343006e-06, "logits/chosen": -1.5906555652618408, "logits/rejected": -1.5012412071228027, "logps/chosen": -543.1980590820312, "logps/rejected": -582.337646484375, "loss": 0.5464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.45292329788208, "rewards/margins": 0.9080826640129089, "rewards/rejected": -3.3610057830810547, "step": 2310 }, { "epoch": 0.30359538063925146, "grad_norm": 11.1875, "learning_rate": 4.3951421668669165e-06, "logits/chosen": -1.6357810497283936, "logits/rejected": -1.3863661289215088, "logps/chosen": -509.09124755859375, "logps/rejected": -538.1678466796875, "loss": 0.5476, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4928698539733887, "rewards/margins": 0.8148897886276245, "rewards/rejected": -3.3077597618103027, "step": 2320 }, { "epoch": 0.30490398141786895, "grad_norm": 14.4375, "learning_rate": 4.3876729122279784e-06, "logits/chosen": -1.7357715368270874, "logits/rejected": -1.313657522201538, "logps/chosen": -554.638916015625, "logps/rejected": -574.8712158203125, "loss": 0.4463, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.528414011001587, "rewards/margins": 1.293595790863037, "rewards/rejected": -3.822010040283203, "step": 2330 }, { "epoch": 0.3062125821964864, "grad_norm": 17.625, "learning_rate": 4.3801642523471585e-06, "logits/chosen": -1.4806541204452515, "logits/rejected": -1.3904956579208374, "logps/chosen": -478.3817443847656, "logps/rejected": -605.9808349609375, "loss": 0.5859, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.834592342376709, "rewards/margins": 1.0473196506500244, "rewards/rejected": -3.8819117546081543, "step": 2340 }, { "epoch": 0.3075211829751039, "grad_norm": 14.25, "learning_rate": 4.37261634396801e-06, "logits/chosen": -1.6407520771026611, "logits/rejected": -1.6617047786712646, "logps/chosen": -528.7417602539062, "logps/rejected": -558.1602783203125, "loss": 0.6163, "rewards/accuracies": 0.75, "rewards/chosen": -2.441850185394287, "rewards/margins": 0.5042880177497864, "rewards/rejected": -2.946138381958008, "step": 2350 }, { "epoch": 0.30882978375372133, "grad_norm": 8.4375, "learning_rate": 4.365029344653401e-06, "logits/chosen": -1.6370904445648193, "logits/rejected": -1.638794183731079, "logps/chosen": -444.48516845703125, "logps/rejected": -594.4917602539062, "loss": 0.4876, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9613415002822876, "rewards/margins": 1.289072036743164, "rewards/rejected": -3.250413179397583, "step": 2360 }, { "epoch": 0.3101383845323388, "grad_norm": 10.6875, "learning_rate": 4.35740341278222e-06, "logits/chosen": -1.7189481258392334, "logits/rejected": -1.4945071935653687, "logps/chosen": -447.0577087402344, "logps/rejected": -525.7955322265625, "loss": 0.472, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9188346862792969, "rewards/margins": 1.0461013317108154, "rewards/rejected": -2.964935779571533, "step": 2370 }, { "epoch": 0.31144698531095627, "grad_norm": 24.5, "learning_rate": 4.349738707546079e-06, "logits/chosen": -1.7340679168701172, "logits/rejected": -1.5902605056762695, "logps/chosen": -469.064453125, "logps/rejected": -510.56640625, "loss": 0.5282, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2131800651550293, "rewards/margins": 0.6754418611526489, "rewards/rejected": -2.8886220455169678, "step": 2380 }, { "epoch": 0.3127555860895737, "grad_norm": 23.75, "learning_rate": 4.3420353889459835e-06, "logits/chosen": -1.4752241373062134, "logits/rejected": -1.2497191429138184, "logps/chosen": -469.91326904296875, "logps/rejected": -533.0279541015625, "loss": 0.5951, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.395948886871338, "rewards/margins": 0.785313606262207, "rewards/rejected": -3.181262254714966, "step": 2390 }, { "epoch": 0.3140641868681912, "grad_norm": 7.4375, "learning_rate": 4.334293617788992e-06, "logits/chosen": -1.7004684209823608, "logits/rejected": -1.4957067966461182, "logps/chosen": -440.1708984375, "logps/rejected": -565.8378295898438, "loss": 0.5644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.938124656677246, "rewards/margins": 0.8542143106460571, "rewards/rejected": -2.7923388481140137, "step": 2400 }, { "epoch": 0.3140641868681912, "eval_logits/chosen": -1.2831792831420898, "eval_logits/rejected": -1.174734115600586, "eval_logps/chosen": -453.87249755859375, "eval_logps/rejected": -524.8180541992188, "eval_loss": 0.5071442127227783, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -1.886277437210083, "eval_rewards/margins": 0.9499303102493286, "eval_rewards/rejected": -2.8362081050872803, "eval_runtime": 694.5568, "eval_samples_per_second": 2.88, "eval_steps_per_second": 0.18, "step": 2400 }, { "epoch": 0.31537278764680865, "grad_norm": 12.3125, "learning_rate": 4.326513555684867e-06, "logits/chosen": -1.2551451921463013, "logits/rejected": -1.3377107381820679, "logps/chosen": -398.99981689453125, "logps/rejected": -502.2359924316406, "loss": 0.5076, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6906484365463257, "rewards/margins": 0.9891603589057922, "rewards/rejected": -2.6798088550567627, "step": 2410 }, { "epoch": 0.3166813884254261, "grad_norm": 10.5, "learning_rate": 4.31869536504269e-06, "logits/chosen": -1.7102693319320679, "logits/rejected": -1.3676587343215942, "logps/chosen": -440.43194580078125, "logps/rejected": -485.97125244140625, "loss": 0.4978, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8339275121688843, "rewards/margins": 0.9703128933906555, "rewards/rejected": -2.8042402267456055, "step": 2420 }, { "epoch": 0.3179899892040436, "grad_norm": 11.6875, "learning_rate": 4.310839209067482e-06, "logits/chosen": -1.4648754596710205, "logits/rejected": -1.3403747081756592, "logps/chosen": -483.6075744628906, "logps/rejected": -596.6671142578125, "loss": 0.4564, "rewards/accuracies": 0.75, "rewards/chosen": -2.5591557025909424, "rewards/margins": 1.1008437871932983, "rewards/rejected": -3.659999132156372, "step": 2430 }, { "epoch": 0.31929858998266103, "grad_norm": 10.25, "learning_rate": 4.302945251756788e-06, "logits/chosen": -1.4883077144622803, "logits/rejected": -1.4211909770965576, "logps/chosen": -541.9990844726562, "logps/rejected": -758.8839721679688, "loss": 0.5119, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.67798113822937, "rewards/margins": 1.6004884243011475, "rewards/rejected": -4.278469562530518, "step": 2440 }, { "epoch": 0.3206071907612785, "grad_norm": 25.125, "learning_rate": 4.29501365789726e-06, "logits/chosen": -1.4792486429214478, "logits/rejected": -1.3414353132247925, "logps/chosen": -580.2071533203125, "logps/rejected": -649.1226806640625, "loss": 0.5567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.169093370437622, "rewards/margins": 0.7796719670295715, "rewards/rejected": -3.948765277862549, "step": 2450 }, { "epoch": 0.32191579153989597, "grad_norm": 21.625, "learning_rate": 4.2870445930612135e-06, "logits/chosen": -1.5781670808792114, "logits/rejected": -1.508968472480774, "logps/chosen": -643.0986328125, "logps/rejected": -656.67236328125, "loss": 0.5806, "rewards/accuracies": 0.75, "rewards/chosen": -3.212388515472412, "rewards/margins": 0.7488046288490295, "rewards/rejected": -3.961193561553955, "step": 2460 }, { "epoch": 0.3232243923185134, "grad_norm": 15.75, "learning_rate": 4.279038223603171e-06, "logits/chosen": -1.7879718542099, "logits/rejected": -1.6690057516098022, "logps/chosen": -512.6642456054688, "logps/rejected": -626.6468505859375, "loss": 0.5271, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.5096840858459473, "rewards/margins": 1.2333909273147583, "rewards/rejected": -3.743074893951416, "step": 2470 }, { "epoch": 0.3245329930971309, "grad_norm": 11.6875, "learning_rate": 4.2709947166563906e-06, "logits/chosen": -1.5622562170028687, "logits/rejected": -1.3119258880615234, "logps/chosen": -490.9695739746094, "logps/rejected": -526.5206298828125, "loss": 0.4982, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0977706909179688, "rewards/margins": 0.9809914827346802, "rewards/rejected": -3.0787625312805176, "step": 2480 }, { "epoch": 0.32584159387574835, "grad_norm": 20.5, "learning_rate": 4.262914240129379e-06, "logits/chosen": -1.8752968311309814, "logits/rejected": -1.7451379299163818, "logps/chosen": -508.0210876464844, "logps/rejected": -560.8042602539062, "loss": 0.5474, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.122480869293213, "rewards/margins": 0.6373887062072754, "rewards/rejected": -2.759869337081909, "step": 2490 }, { "epoch": 0.32715019465436584, "grad_norm": 7.5, "learning_rate": 4.254796962702382e-06, "logits/chosen": -1.593274712562561, "logits/rejected": -1.3462018966674805, "logps/chosen": -470.4585876464844, "logps/rejected": -555.4439697265625, "loss": 0.5536, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4339115619659424, "rewards/margins": 0.8689793348312378, "rewards/rejected": -3.3028907775878906, "step": 2500 }, { "epoch": 0.32715019465436584, "eval_logits/chosen": -1.3134056329727173, "eval_logits/rejected": -1.2049833536148071, "eval_logps/chosen": -483.1492004394531, "eval_logps/rejected": -559.2987060546875, "eval_loss": 0.5034111142158508, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -2.179044485092163, "eval_rewards/margins": 1.0019696950912476, "eval_rewards/rejected": -3.181014060974121, "eval_runtime": 693.6057, "eval_samples_per_second": 2.883, "eval_steps_per_second": 0.18, "step": 2500 }, { "epoch": 0.3284587954329833, "grad_norm": 24.75, "learning_rate": 4.246643053823864e-06, "logits/chosen": -1.5785322189331055, "logits/rejected": -1.4893878698349, "logps/chosen": -459.32757568359375, "logps/rejected": -540.9236450195312, "loss": 0.5731, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4732213020324707, "rewards/margins": 0.547541081905365, "rewards/rejected": -3.0207626819610596, "step": 2510 }, { "epoch": 0.3297673962116007, "grad_norm": 14.5625, "learning_rate": 4.238452683706979e-06, "logits/chosen": -1.475465178489685, "logits/rejected": -1.3133914470672607, "logps/chosen": -468.66717529296875, "logps/rejected": -503.279296875, "loss": 0.5401, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6725516319274902, "rewards/margins": 0.5226112604141235, "rewards/rejected": -3.1951632499694824, "step": 2520 }, { "epoch": 0.3310759969902182, "grad_norm": 33.25, "learning_rate": 4.2302260233260025e-06, "logits/chosen": -1.5935661792755127, "logits/rejected": -1.6617271900177002, "logps/chosen": -405.3570861816406, "logps/rejected": -509.3072204589844, "loss": 0.5042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9357690811157227, "rewards/margins": 0.9486289024353027, "rewards/rejected": -2.8843979835510254, "step": 2530 }, { "epoch": 0.33238459776883567, "grad_norm": 7.875, "learning_rate": 4.2219632444127766e-06, "logits/chosen": -1.4289958477020264, "logits/rejected": -1.3024194240570068, "logps/chosen": -530.4345092773438, "logps/rejected": -614.8536376953125, "loss": 0.5493, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.750037670135498, "rewards/margins": 0.8648878335952759, "rewards/rejected": -3.6149253845214844, "step": 2540 }, { "epoch": 0.33369319854745316, "grad_norm": 15.5625, "learning_rate": 4.213664519453115e-06, "logits/chosen": -1.6038662195205688, "logits/rejected": -1.451554536819458, "logps/chosen": -544.4609375, "logps/rejected": -582.7791748046875, "loss": 0.4224, "rewards/accuracies": 0.625, "rewards/chosen": -2.9737436771392822, "rewards/margins": 0.8079301714897156, "rewards/rejected": -3.7816739082336426, "step": 2550 }, { "epoch": 0.3350017993260706, "grad_norm": 26.75, "learning_rate": 4.205330021683208e-06, "logits/chosen": -1.468815565109253, "logits/rejected": -1.3654879331588745, "logps/chosen": -473.6580505371094, "logps/rejected": -543.9722900390625, "loss": 0.5368, "rewards/accuracies": 0.75, "rewards/chosen": -2.6373887062072754, "rewards/margins": 0.9053285717964172, "rewards/rejected": -3.542717456817627, "step": 2560 }, { "epoch": 0.33631040010468805, "grad_norm": 14.9375, "learning_rate": 4.196959925086008e-06, "logits/chosen": -1.3406097888946533, "logits/rejected": -1.2105426788330078, "logps/chosen": -575.873779296875, "logps/rejected": -677.7567138671875, "loss": 0.634, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.098935127258301, "rewards/margins": 1.2820883989334106, "rewards/rejected": -4.38102388381958, "step": 2570 }, { "epoch": 0.33761900088330554, "grad_norm": 16.625, "learning_rate": 4.188554404387588e-06, "logits/chosen": -1.5613000392913818, "logits/rejected": -1.3523452281951904, "logps/chosen": -541.5022583007812, "logps/rejected": -691.5540161132812, "loss": 0.4092, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9433772563934326, "rewards/margins": 1.5698347091674805, "rewards/rejected": -4.51321268081665, "step": 2580 }, { "epoch": 0.338927601661923, "grad_norm": 16.875, "learning_rate": 4.180113635053504e-06, "logits/chosen": -1.5481126308441162, "logits/rejected": -1.4092085361480713, "logps/chosen": -584.2247314453125, "logps/rejected": -653.611328125, "loss": 0.5611, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1284544467926025, "rewards/margins": 0.9510751962661743, "rewards/rejected": -4.079529285430908, "step": 2590 }, { "epoch": 0.3402362024405404, "grad_norm": 22.875, "learning_rate": 4.17163779328513e-06, "logits/chosen": -1.5316965579986572, "logits/rejected": -1.400965690612793, "logps/chosen": -475.49932861328125, "logps/rejected": -614.6868286132812, "loss": 0.5293, "rewards/accuracies": 0.875, "rewards/chosen": -2.616956949234009, "rewards/margins": 1.5317518711090088, "rewards/rejected": -4.148708820343018, "step": 2600 }, { "epoch": 0.3402362024405404, "eval_logits/chosen": -1.1776095628738403, "eval_logits/rejected": -1.0626823902130127, "eval_logps/chosen": -583.497802734375, "eval_logps/rejected": -684.5870971679688, "eval_loss": 0.5129390954971313, "eval_rewards/accuracies": 0.7429999709129333, "eval_rewards/chosen": -3.1825308799743652, "eval_rewards/margins": 1.2513679265975952, "eval_rewards/rejected": -4.43389892578125, "eval_runtime": 692.898, "eval_samples_per_second": 2.886, "eval_steps_per_second": 0.18, "step": 2600 }, { "epoch": 0.3415448032191579, "grad_norm": 24.375, "learning_rate": 4.163127056015975e-06, "logits/chosen": -1.3916542530059814, "logits/rejected": -1.2087299823760986, "logps/chosen": -585.3460083007812, "logps/rejected": -632.4806518554688, "loss": 0.5077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2067363262176514, "rewards/margins": 0.9125640988349915, "rewards/rejected": -4.119300842285156, "step": 2610 }, { "epoch": 0.34285340399777536, "grad_norm": 23.125, "learning_rate": 4.154581600907994e-06, "logits/chosen": -1.4854247570037842, "logits/rejected": -1.2275075912475586, "logps/chosen": -634.7908935546875, "logps/rejected": -791.5166015625, "loss": 0.5193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6149795055389404, "rewards/margins": 1.6714909076690674, "rewards/rejected": -5.286470413208008, "step": 2620 }, { "epoch": 0.34416200477639286, "grad_norm": 10.25, "learning_rate": 4.14600160634788e-06, "logits/chosen": -1.4131571054458618, "logits/rejected": -1.3662068843841553, "logps/chosen": -538.108642578125, "logps/rejected": -696.3258056640625, "loss": 0.5798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0683820247650146, "rewards/margins": 1.5561141967773438, "rewards/rejected": -4.6244964599609375, "step": 2630 }, { "epoch": 0.3454706055550103, "grad_norm": 16.25, "learning_rate": 4.137387251443335e-06, "logits/chosen": -1.705087661743164, "logits/rejected": -1.557076096534729, "logps/chosen": -530.7237548828125, "logps/rejected": -686.1461181640625, "loss": 0.4637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.818674325942993, "rewards/margins": 1.1382681131362915, "rewards/rejected": -3.9569427967071533, "step": 2640 }, { "epoch": 0.34677920633362774, "grad_norm": 15.9375, "learning_rate": 4.128738716019338e-06, "logits/chosen": -1.6315014362335205, "logits/rejected": -1.5417304039001465, "logps/chosen": -497.52459716796875, "logps/rejected": -592.3180541992188, "loss": 0.5597, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5856919288635254, "rewards/margins": 0.9134233593940735, "rewards/rejected": -3.499115467071533, "step": 2650 }, { "epoch": 0.34808780711224524, "grad_norm": 8.9375, "learning_rate": 4.120056180614386e-06, "logits/chosen": -1.5785077810287476, "logits/rejected": -1.3385107517242432, "logps/chosen": -472.36016845703125, "logps/rejected": -481.4932556152344, "loss": 0.6222, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.446000814437866, "rewards/margins": 0.4800782799720764, "rewards/rejected": -2.926079034805298, "step": 2660 }, { "epoch": 0.3493964078908627, "grad_norm": 8.5, "learning_rate": 4.111339826476725e-06, "logits/chosen": -1.8648369312286377, "logits/rejected": -1.7320747375488281, "logps/chosen": -564.6051635742188, "logps/rejected": -640.2154541015625, "loss": 0.5111, "rewards/accuracies": 0.75, "rewards/chosen": -2.6177937984466553, "rewards/margins": 1.0469467639923096, "rewards/rejected": -3.664740800857544, "step": 2670 }, { "epoch": 0.3507050086694802, "grad_norm": 9.9375, "learning_rate": 4.102589835560572e-06, "logits/chosen": -1.6937118768692017, "logits/rejected": -1.5601654052734375, "logps/chosen": -554.29833984375, "logps/rejected": -612.55908203125, "loss": 0.5027, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.524139404296875, "rewards/margins": 0.7383443117141724, "rewards/rejected": -3.262483596801758, "step": 2680 }, { "epoch": 0.3520136094480976, "grad_norm": 7.59375, "learning_rate": 4.09380639052231e-06, "logits/chosen": -1.5890767574310303, "logits/rejected": -1.4847526550292969, "logps/chosen": -562.1860961914062, "logps/rejected": -659.1439208984375, "loss": 0.5809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9070651531219482, "rewards/margins": 0.9249240159988403, "rewards/rejected": -3.831989288330078, "step": 2690 }, { "epoch": 0.35332221022671506, "grad_norm": 9.8125, "learning_rate": 4.084989674716679e-06, "logits/chosen": -1.5505536794662476, "logits/rejected": -1.4631093740463257, "logps/chosen": -547.5584716796875, "logps/rejected": -636.4954833984375, "loss": 0.5843, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8671863079071045, "rewards/margins": 0.9007315635681152, "rewards/rejected": -3.7679176330566406, "step": 2700 }, { "epoch": 0.35332221022671506, "eval_logits/chosen": -1.262353777885437, "eval_logits/rejected": -1.1514339447021484, "eval_logps/chosen": -537.8978881835938, "eval_logps/rejected": -624.2645874023438, "eval_loss": 0.5062245726585388, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -2.7265307903289795, "eval_rewards/margins": 1.1041425466537476, "eval_rewards/rejected": -3.8306729793548584, "eval_runtime": 694.4301, "eval_samples_per_second": 2.88, "eval_steps_per_second": 0.18, "step": 2700 }, { "epoch": 0.35463081100533256, "grad_norm": 11.375, "learning_rate": 4.076139872192949e-06, "logits/chosen": -1.586192011833191, "logits/rejected": -1.4574507474899292, "logps/chosen": -521.71044921875, "logps/rejected": -652.8938598632812, "loss": 0.4882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.864898204803467, "rewards/margins": 0.9544525146484375, "rewards/rejected": -3.819350481033325, "step": 2710 }, { "epoch": 0.35593941178395, "grad_norm": 14.0625, "learning_rate": 4.067257167691074e-06, "logits/chosen": -1.5413246154785156, "logits/rejected": -1.3050501346588135, "logps/chosen": -510.5083923339844, "logps/rejected": -590.1461181640625, "loss": 0.4574, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6288464069366455, "rewards/margins": 1.066295862197876, "rewards/rejected": -3.6951420307159424, "step": 2720 }, { "epoch": 0.3572480125625675, "grad_norm": 10.6875, "learning_rate": 4.05834174663784e-06, "logits/chosen": -1.5661990642547607, "logits/rejected": -1.4986212253570557, "logps/chosen": -507.8670349121094, "logps/rejected": -580.6935424804688, "loss": 0.565, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5862560272216797, "rewards/margins": 0.9425860643386841, "rewards/rejected": -3.5288422107696533, "step": 2730 }, { "epoch": 0.35855661334118494, "grad_norm": 12.5, "learning_rate": 4.0493937951429895e-06, "logits/chosen": -1.9351247549057007, "logits/rejected": -1.8539396524429321, "logps/chosen": -520.7384033203125, "logps/rejected": -611.8561401367188, "loss": 0.517, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.161691427230835, "rewards/margins": 0.8333566784858704, "rewards/rejected": -2.9950482845306396, "step": 2740 }, { "epoch": 0.3598652141198024, "grad_norm": 17.125, "learning_rate": 4.040413499995343e-06, "logits/chosen": -1.9669978618621826, "logits/rejected": -1.6097214221954346, "logps/chosen": -473.77423095703125, "logps/rejected": -546.912841796875, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": -1.856981635093689, "rewards/margins": 1.1666312217712402, "rewards/rejected": -3.0236127376556396, "step": 2750 }, { "epoch": 0.3611738148984199, "grad_norm": 9.4375, "learning_rate": 4.031401048658892e-06, "logits/chosen": -1.881160020828247, "logits/rejected": -1.7074706554412842, "logps/chosen": -496.22369384765625, "logps/rejected": -640.4068603515625, "loss": 0.3763, "rewards/accuracies": 0.875, "rewards/chosen": -2.113619804382324, "rewards/margins": 1.49836266040802, "rewards/rejected": -3.6119823455810547, "step": 2760 }, { "epoch": 0.3624824156770373, "grad_norm": 9.8125, "learning_rate": 4.022356629268894e-06, "logits/chosen": -1.876647710800171, "logits/rejected": -1.4448273181915283, "logps/chosen": -514.8057861328125, "logps/rejected": -588.8934326171875, "loss": 0.4331, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.2392146587371826, "rewards/margins": 1.5038731098175049, "rewards/rejected": -3.7430882453918457, "step": 2770 }, { "epoch": 0.3637910164556548, "grad_norm": 14.9375, "learning_rate": 4.013280430627936e-06, "logits/chosen": -1.754246711730957, "logits/rejected": -1.682611107826233, "logps/chosen": -513.6207275390625, "logps/rejected": -603.0330200195312, "loss": 0.5016, "rewards/accuracies": 0.625, "rewards/chosen": -2.3257832527160645, "rewards/margins": 0.7535048723220825, "rewards/rejected": -3.0792880058288574, "step": 2780 }, { "epoch": 0.36509961723427226, "grad_norm": 11.875, "learning_rate": 4.004172642202002e-06, "logits/chosen": -1.7621902227401733, "logits/rejected": -1.428887963294983, "logps/chosen": -483.3815002441406, "logps/rejected": -518.5245361328125, "loss": 0.5603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.363347291946411, "rewards/margins": 0.8312579393386841, "rewards/rejected": -3.1946051120758057, "step": 2790 }, { "epoch": 0.3664082180128897, "grad_norm": 25.375, "learning_rate": 3.995033454116512e-06, "logits/chosen": -1.6319999694824219, "logits/rejected": -1.4888899326324463, "logps/chosen": -463.6336364746094, "logps/rejected": -562.38623046875, "loss": 0.5032, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.338679552078247, "rewards/margins": 1.1144641637802124, "rewards/rejected": -3.45314359664917, "step": 2800 }, { "epoch": 0.3664082180128897, "eval_logits/chosen": -1.2558997869491577, "eval_logits/rejected": -1.1400363445281982, "eval_logps/chosen": -535.7208251953125, "eval_logps/rejected": -639.2272338867188, "eval_loss": 0.5347518920898438, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -2.7047605514526367, "eval_rewards/margins": 1.2755396366119385, "eval_rewards/rejected": -3.980299711227417, "eval_runtime": 694.4152, "eval_samples_per_second": 2.88, "eval_steps_per_second": 0.18, "step": 2800 }, { "epoch": 0.3677168187915072, "grad_norm": 13.4375, "learning_rate": 3.985863057152355e-06, "logits/chosen": -1.3766752481460571, "logits/rejected": -1.0923089981079102, "logps/chosen": -537.7940673828125, "logps/rejected": -704.500244140625, "loss": 0.3125, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.8585939407348633, "rewards/margins": 1.909102439880371, "rewards/rejected": -4.767696380615234, "step": 2810 }, { "epoch": 0.36902541957012464, "grad_norm": 9.5625, "learning_rate": 3.976661642741908e-06, "logits/chosen": -1.44516122341156, "logits/rejected": -1.3598823547363281, "logps/chosen": -581.0471801757812, "logps/rejected": -698.563720703125, "loss": 0.4271, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0821290016174316, "rewards/margins": 1.561016321182251, "rewards/rejected": -4.6431450843811035, "step": 2820 }, { "epoch": 0.37033402034874213, "grad_norm": 6.15625, "learning_rate": 3.967429402965035e-06, "logits/chosen": -1.5437654256820679, "logits/rejected": -1.4281237125396729, "logps/chosen": -594.0179443359375, "logps/rejected": -674.371826171875, "loss": 0.5687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.883504629135132, "rewards/margins": 1.1815776824951172, "rewards/rejected": -4.065082550048828, "step": 2830 }, { "epoch": 0.3716426211273596, "grad_norm": 25.625, "learning_rate": 3.958166530545085e-06, "logits/chosen": -1.5086722373962402, "logits/rejected": -1.292533278465271, "logps/chosen": -519.2114868164062, "logps/rejected": -743.0427856445312, "loss": 0.4888, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5064895153045654, "rewards/margins": 1.90365731716156, "rewards/rejected": -4.410147190093994, "step": 2840 }, { "epoch": 0.372951221905977, "grad_norm": 18.375, "learning_rate": 3.948873218844863e-06, "logits/chosen": -1.5857677459716797, "logits/rejected": -1.4796470403671265, "logps/chosen": -495.1551818847656, "logps/rejected": -558.568115234375, "loss": 0.471, "rewards/accuracies": 0.75, "rewards/chosen": -2.317107677459717, "rewards/margins": 1.0017577409744263, "rewards/rejected": -3.3188652992248535, "step": 2850 }, { "epoch": 0.3742598226845945, "grad_norm": 7.625, "learning_rate": 3.939549661862592e-06, "logits/chosen": -1.6015307903289795, "logits/rejected": -1.456026554107666, "logps/chosen": -506.06158447265625, "logps/rejected": -642.8414306640625, "loss": 0.4866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0716233253479004, "rewards/margins": 1.368452787399292, "rewards/rejected": -3.4400763511657715, "step": 2860 }, { "epoch": 0.37556842346321195, "grad_norm": 8.75, "learning_rate": 3.930196054227871e-06, "logits/chosen": -1.4082565307617188, "logits/rejected": -1.4461169242858887, "logps/chosen": -527.29736328125, "logps/rejected": -670.1265869140625, "loss": 0.4376, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.413400173187256, "rewards/margins": 1.463064432144165, "rewards/rejected": -3.87646484375, "step": 2870 }, { "epoch": 0.37687702424182945, "grad_norm": 25.5, "learning_rate": 3.920812591197604e-06, "logits/chosen": -1.4948174953460693, "logits/rejected": -1.2370578050613403, "logps/chosen": -450.03253173828125, "logps/rejected": -632.4721069335938, "loss": 0.4853, "rewards/accuracies": 0.75, "rewards/chosen": -2.440441131591797, "rewards/margins": 1.8954674005508423, "rewards/rejected": -4.335907936096191, "step": 2880 }, { "epoch": 0.3781856250204469, "grad_norm": 14.75, "learning_rate": 3.9113994686519305e-06, "logits/chosen": -1.38882577419281, "logits/rejected": -1.2719522714614868, "logps/chosen": -480.1973571777344, "logps/rejected": -628.4170532226562, "loss": 0.6177, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6350064277648926, "rewards/margins": 1.384225606918335, "rewards/rejected": -4.019231796264648, "step": 2890 }, { "epoch": 0.37949422579906433, "grad_norm": 12.1875, "learning_rate": 3.90195688309013e-06, "logits/chosen": -1.4793598651885986, "logits/rejected": -1.20169997215271, "logps/chosen": -474.89874267578125, "logps/rejected": -603.9495239257812, "loss": 0.4179, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.409841537475586, "rewards/margins": 1.694087028503418, "rewards/rejected": -4.103928565979004, "step": 2900 }, { "epoch": 0.37949422579906433, "eval_logits/chosen": -1.2393105030059814, "eval_logits/rejected": -1.122414231300354, "eval_logps/chosen": -532.5025634765625, "eval_logps/rejected": -634.56005859375, "eval_loss": 0.5106415152549744, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.6725778579711914, "eval_rewards/margins": 1.2610502243041992, "eval_rewards/rejected": -3.9336278438568115, "eval_runtime": 696.6569, "eval_samples_per_second": 2.871, "eval_steps_per_second": 0.179, "step": 2900 }, { "epoch": 0.38080282657768183, "grad_norm": 32.0, "learning_rate": 3.892485031626527e-06, "logits/chosen": -1.5317245721817017, "logits/rejected": -1.4611523151397705, "logps/chosen": -559.7987060546875, "logps/rejected": -636.7401123046875, "loss": 0.6876, "rewards/accuracies": 0.625, "rewards/chosen": -3.342151165008545, "rewards/margins": 0.6966226696968079, "rewards/rejected": -4.038773536682129, "step": 2910 }, { "epoch": 0.38211142735629927, "grad_norm": 48.25, "learning_rate": 3.882984111986371e-06, "logits/chosen": -1.4634761810302734, "logits/rejected": -1.241074800491333, "logps/chosen": -495.94830322265625, "logps/rejected": -652.7047119140625, "loss": 0.4448, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.8626575469970703, "rewards/margins": 1.6901658773422241, "rewards/rejected": -4.552823543548584, "step": 2920 }, { "epoch": 0.3834200281349167, "grad_norm": 9.1875, "learning_rate": 3.873454322501711e-06, "logits/chosen": -1.4568628072738647, "logits/rejected": -1.2999298572540283, "logps/chosen": -525.2959594726562, "logps/rejected": -616.8997802734375, "loss": 0.442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3488237857818604, "rewards/margins": 1.093848466873169, "rewards/rejected": -3.4426722526550293, "step": 2930 }, { "epoch": 0.3847286289135342, "grad_norm": 3.515625, "learning_rate": 3.863895862107255e-06, "logits/chosen": -1.5164395570755005, "logits/rejected": -1.6844536066055298, "logps/chosen": -448.1661071777344, "logps/rejected": -633.9898681640625, "loss": 0.4153, "rewards/accuracies": 0.875, "rewards/chosen": -2.6101365089416504, "rewards/margins": 1.464616298675537, "rewards/rejected": -4.0747528076171875, "step": 2940 }, { "epoch": 0.38603722969215165, "grad_norm": 21.125, "learning_rate": 3.854308930336216e-06, "logits/chosen": -1.5274903774261475, "logits/rejected": -1.4654560089111328, "logps/chosen": -591.1707763671875, "logps/rejected": -678.1710815429688, "loss": 0.4993, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2283310890197754, "rewards/margins": 1.2125701904296875, "rewards/rejected": -4.440901756286621, "step": 2950 }, { "epoch": 0.38734583047076915, "grad_norm": 16.5, "learning_rate": 3.844693727316151e-06, "logits/chosen": -1.5820369720458984, "logits/rejected": -1.3588536977767944, "logps/chosen": -584.0565795898438, "logps/rejected": -638.734619140625, "loss": 0.4874, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.015751600265503, "rewards/margins": 1.4503902196884155, "rewards/rejected": -4.466142177581787, "step": 2960 }, { "epoch": 0.3886544312493866, "grad_norm": 12.875, "learning_rate": 3.835050453764779e-06, "logits/chosen": -1.6307380199432373, "logits/rejected": -1.2704654932022095, "logps/chosen": -521.9371337890625, "logps/rejected": -636.7149658203125, "loss": 0.4551, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.3237812519073486, "rewards/margins": 1.6301771402359009, "rewards/rejected": -3.9539589881896973, "step": 2970 }, { "epoch": 0.38996303202800403, "grad_norm": 16.375, "learning_rate": 3.825379310985792e-06, "logits/chosen": -1.5514624118804932, "logits/rejected": -1.5151180028915405, "logps/chosen": -481.9397888183594, "logps/rejected": -687.2791137695312, "loss": 0.5218, "rewards/accuracies": 0.75, "rewards/chosen": -2.5342068672180176, "rewards/margins": 1.7440704107284546, "rewards/rejected": -4.278277397155762, "step": 2980 }, { "epoch": 0.3912716328066215, "grad_norm": 10.8125, "learning_rate": 3.815680500864651e-06, "logits/chosen": -1.7050174474716187, "logits/rejected": -1.4977128505706787, "logps/chosen": -541.2501220703125, "logps/rejected": -677.8220825195312, "loss": 0.4355, "rewards/accuracies": 0.75, "rewards/chosen": -2.3622448444366455, "rewards/margins": 1.2924296855926514, "rewards/rejected": -3.654674530029297, "step": 2990 }, { "epoch": 0.39258023358523897, "grad_norm": 28.625, "learning_rate": 3.80595422586438e-06, "logits/chosen": -1.7262550592422485, "logits/rejected": -1.6192045211791992, "logps/chosen": -535.9850463867188, "logps/rejected": -666.9298095703125, "loss": 0.4537, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4521782398223877, "rewards/margins": 1.3589130640029907, "rewards/rejected": -3.811091661453247, "step": 3000 }, { "epoch": 0.39258023358523897, "eval_logits/chosen": -1.3309398889541626, "eval_logits/rejected": -1.2112724781036377, "eval_logps/chosen": -513.8767700195312, "eval_logps/rejected": -618.0381469726562, "eval_loss": 0.5151113867759705, "eval_rewards/accuracies": 0.7440000176429749, "eval_rewards/chosen": -2.4863200187683105, "eval_rewards/margins": 1.282088041305542, "eval_rewards/rejected": -3.7684082984924316, "eval_runtime": 694.1118, "eval_samples_per_second": 2.881, "eval_steps_per_second": 0.18, "step": 3000 }, { "epoch": 0.39388883436385647, "grad_norm": 21.5, "learning_rate": 3.7962006890213266e-06, "logits/chosen": -1.6716268062591553, "logits/rejected": -1.4528642892837524, "logps/chosen": -490.56817626953125, "logps/rejected": -634.67138671875, "loss": 0.5531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5396547317504883, "rewards/margins": 1.15042245388031, "rewards/rejected": -3.690077304840088, "step": 3010 }, { "epoch": 0.3951974351424739, "grad_norm": 27.75, "learning_rate": 3.7864200939409336e-06, "logits/chosen": -1.6464722156524658, "logits/rejected": -1.6111170053482056, "logps/chosen": -471.7593688964844, "logps/rejected": -558.6244506835938, "loss": 0.5954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5306525230407715, "rewards/margins": 0.8771204948425293, "rewards/rejected": -3.4077727794647217, "step": 3020 }, { "epoch": 0.39650603592109135, "grad_norm": 9.875, "learning_rate": 3.7766126447934857e-06, "logits/chosen": -2.019923686981201, "logits/rejected": -1.917360544204712, "logps/chosen": -404.66070556640625, "logps/rejected": -489.58740234375, "loss": 0.4903, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6666208505630493, "rewards/margins": 1.0508415699005127, "rewards/rejected": -2.7174620628356934, "step": 3030 }, { "epoch": 0.39781463669970885, "grad_norm": 13.5, "learning_rate": 3.766778546309847e-06, "logits/chosen": -1.7427985668182373, "logits/rejected": -1.7341394424438477, "logps/chosen": -476.91064453125, "logps/rejected": -544.718505859375, "loss": 0.5466, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.081979274749756, "rewards/margins": 0.5131665468215942, "rewards/rejected": -2.5951457023620605, "step": 3040 }, { "epoch": 0.3991232374783263, "grad_norm": 9.75, "learning_rate": 3.7569180037771868e-06, "logits/chosen": -1.732908844947815, "logits/rejected": -1.720476508140564, "logps/chosen": -448.0504455566406, "logps/rejected": -535.7593383789062, "loss": 0.5356, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9136587381362915, "rewards/margins": 0.896325945854187, "rewards/rejected": -2.8099846839904785, "step": 3050 }, { "epoch": 0.4004318382569438, "grad_norm": 18.0, "learning_rate": 3.7470312230346955e-06, "logits/chosen": -1.946218729019165, "logits/rejected": -1.8611263036727905, "logps/chosen": -499.7594299316406, "logps/rejected": -538.9730834960938, "loss": 0.6222, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.094381332397461, "rewards/margins": 0.530040979385376, "rewards/rejected": -2.624422311782837, "step": 3060 }, { "epoch": 0.4017404390355612, "grad_norm": 8.375, "learning_rate": 3.7371184104692857e-06, "logits/chosen": -1.6800199747085571, "logits/rejected": -1.5499494075775146, "logps/chosen": -493.4293518066406, "logps/rejected": -565.5674438476562, "loss": 0.4934, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.6385226249694824, "rewards/margins": 0.7409487962722778, "rewards/rejected": -3.3794713020324707, "step": 3070 }, { "epoch": 0.40304903981417867, "grad_norm": 31.0, "learning_rate": 3.727179773011289e-06, "logits/chosen": -1.7404781579971313, "logits/rejected": -1.63278329372406, "logps/chosen": -527.97021484375, "logps/rejected": -564.0224609375, "loss": 0.6184, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.5922653675079346, "rewards/margins": 0.9239760637283325, "rewards/rejected": -3.5162415504455566, "step": 3080 }, { "epoch": 0.40435764059279616, "grad_norm": 24.75, "learning_rate": 3.717215518130127e-06, "logits/chosen": -1.6654255390167236, "logits/rejected": -1.6554895639419556, "logps/chosen": -566.3580932617188, "logps/rejected": -657.6038208007812, "loss": 0.5188, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.0430703163146973, "rewards/margins": 0.7673562169075012, "rewards/rejected": -3.8104262351989746, "step": 3090 }, { "epoch": 0.4056662413714136, "grad_norm": 8.25, "learning_rate": 3.7072258538299923e-06, "logits/chosen": -1.6596405506134033, "logits/rejected": -1.3553379774093628, "logps/chosen": -550.8126220703125, "logps/rejected": -646.3988037109375, "loss": 0.4542, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.925769090652466, "rewards/margins": 1.5615020990371704, "rewards/rejected": -4.487270832061768, "step": 3100 }, { "epoch": 0.4056662413714136, "eval_logits/chosen": -1.2731679677963257, "eval_logits/rejected": -1.1532899141311646, "eval_logps/chosen": -589.3881225585938, "eval_logps/rejected": -696.314453125, "eval_loss": 0.5242880582809448, "eval_rewards/accuracies": 0.7369999885559082, "eval_rewards/chosen": -3.2414333820343018, "eval_rewards/margins": 1.3097381591796875, "eval_rewards/rejected": -4.551171779632568, "eval_runtime": 695.6074, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 3100 }, { "epoch": 0.4069748421500311, "grad_norm": 18.125, "learning_rate": 3.6972109886454933e-06, "logits/chosen": -1.7896296977996826, "logits/rejected": -1.4752140045166016, "logps/chosen": -621.70654296875, "logps/rejected": -679.8302612304688, "loss": 0.5184, "rewards/accuracies": 0.75, "rewards/chosen": -2.9325177669525146, "rewards/margins": 1.6604194641113281, "rewards/rejected": -4.592937469482422, "step": 3110 }, { "epoch": 0.40828344292864854, "grad_norm": 16.625, "learning_rate": 3.687171131637314e-06, "logits/chosen": -1.704087257385254, "logits/rejected": -1.441663384437561, "logps/chosen": -603.4318237304688, "logps/rejected": -671.3712768554688, "loss": 0.514, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1588664054870605, "rewards/margins": 1.1434485912322998, "rewards/rejected": -4.3023152351379395, "step": 3120 }, { "epoch": 0.409592043707266, "grad_norm": 22.25, "learning_rate": 3.677106492387839e-06, "logits/chosen": -1.5973501205444336, "logits/rejected": -1.5867674350738525, "logps/chosen": -491.87078857421875, "logps/rejected": -588.4395751953125, "loss": 0.5147, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5930514335632324, "rewards/margins": 1.1889551877975464, "rewards/rejected": -3.7820067405700684, "step": 3130 }, { "epoch": 0.4109006444858835, "grad_norm": 6.8125, "learning_rate": 3.6670172809967865e-06, "logits/chosen": -1.8411601781845093, "logits/rejected": -1.6065164804458618, "logps/chosen": -502.55487060546875, "logps/rejected": -585.6467895507812, "loss": 0.4883, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3908276557922363, "rewards/margins": 1.2807626724243164, "rewards/rejected": -3.6715903282165527, "step": 3140 }, { "epoch": 0.4122092452645009, "grad_norm": 6.34375, "learning_rate": 3.6569037080768153e-06, "logits/chosen": -1.4937376976013184, "logits/rejected": -1.7071326971054077, "logps/chosen": -468.9966735839844, "logps/rejected": -664.2180786132812, "loss": 0.6253, "rewards/accuracies": 0.625, "rewards/chosen": -2.693213701248169, "rewards/margins": 0.9661498069763184, "rewards/rejected": -3.659363269805908, "step": 3150 }, { "epoch": 0.4135178460431184, "grad_norm": 11.4375, "learning_rate": 3.646765984749137e-06, "logits/chosen": -1.578050136566162, "logits/rejected": -1.4790937900543213, "logps/chosen": -497.9654846191406, "logps/rejected": -592.1451416015625, "loss": 0.4246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6293177604675293, "rewards/margins": 0.8938980102539062, "rewards/rejected": -3.5232155323028564, "step": 3160 }, { "epoch": 0.41482644682173586, "grad_norm": 11.5625, "learning_rate": 3.6366043226391e-06, "logits/chosen": -1.4045484066009521, "logits/rejected": -1.4717546701431274, "logps/chosen": -452.7611389160156, "logps/rejected": -658.9154052734375, "loss": 0.4564, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6457111835479736, "rewards/margins": 1.5280063152313232, "rewards/rejected": -4.173717021942139, "step": 3170 }, { "epoch": 0.4161350476003533, "grad_norm": 10.75, "learning_rate": 3.6264189338717766e-06, "logits/chosen": -1.59109365940094, "logits/rejected": -1.7008126974105835, "logps/chosen": -510.23712158203125, "logps/rejected": -651.5572509765625, "loss": 0.4996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8125271797180176, "rewards/margins": 1.0818402767181396, "rewards/rejected": -3.8943676948547363, "step": 3180 }, { "epoch": 0.4174436483789708, "grad_norm": 24.75, "learning_rate": 3.6162100310675334e-06, "logits/chosen": -1.8583190441131592, "logits/rejected": -1.5220565795898438, "logps/chosen": -571.635986328125, "logps/rejected": -600.9414672851562, "loss": 0.6039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.564141035079956, "rewards/margins": 1.0908578634262085, "rewards/rejected": -3.654998779296875, "step": 3190 }, { "epoch": 0.41875224915758824, "grad_norm": 16.625, "learning_rate": 3.605977827337596e-06, "logits/chosen": -1.574501395225525, "logits/rejected": -1.4324970245361328, "logps/chosen": -534.6776123046875, "logps/rejected": -686.1610107421875, "loss": 0.5944, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9534497261047363, "rewards/margins": 1.2858242988586426, "rewards/rejected": -4.239274024963379, "step": 3200 }, { "epoch": 0.41875224915758824, "eval_logits/chosen": -1.3088926076889038, "eval_logits/rejected": -1.1939375400543213, "eval_logps/chosen": -507.91961669921875, "eval_logps/rejected": -602.1553344726562, "eval_loss": 0.5122220516204834, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -2.4267489910125732, "eval_rewards/margins": 1.1828322410583496, "eval_rewards/rejected": -3.6095809936523438, "eval_runtime": 695.6807, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 3200 }, { "epoch": 0.42006084993620574, "grad_norm": 11.8125, "learning_rate": 3.595722536279595e-06, "logits/chosen": -1.729081392288208, "logits/rejected": -1.7405521869659424, "logps/chosen": -506.41436767578125, "logps/rejected": -632.4987182617188, "loss": 0.4882, "rewards/accuracies": 0.75, "rewards/chosen": -2.3745625019073486, "rewards/margins": 1.1471344232559204, "rewards/rejected": -3.5216965675354004, "step": 3210 }, { "epoch": 0.4213694507148232, "grad_norm": 7.5625, "learning_rate": 3.58544437197311e-06, "logits/chosen": -1.7779086828231812, "logits/rejected": -1.5859578847885132, "logps/chosen": -458.27032470703125, "logps/rejected": -621.0375366210938, "loss": 0.3861, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.180241346359253, "rewards/margins": 1.6710054874420166, "rewards/rejected": -3.8512470722198486, "step": 3220 }, { "epoch": 0.4226780514934406, "grad_norm": 16.75, "learning_rate": 3.5751435489752025e-06, "logits/chosen": -1.823320746421814, "logits/rejected": -1.6837263107299805, "logps/chosen": -566.820068359375, "logps/rejected": -659.8258056640625, "loss": 0.6621, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9997572898864746, "rewards/margins": 1.0704104900360107, "rewards/rejected": -4.070168495178223, "step": 3230 }, { "epoch": 0.4239866522720581, "grad_norm": 16.125, "learning_rate": 3.5648202823159317e-06, "logits/chosen": -1.8026784658432007, "logits/rejected": -1.4404916763305664, "logps/chosen": -622.244384765625, "logps/rejected": -610.4708862304688, "loss": 0.6324, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0311617851257324, "rewards/margins": 0.7349166870117188, "rewards/rejected": -3.766078233718872, "step": 3240 }, { "epoch": 0.42529525305067556, "grad_norm": 20.625, "learning_rate": 3.554474787493873e-06, "logits/chosen": -1.3546491861343384, "logits/rejected": -1.3267197608947754, "logps/chosen": -475.6293029785156, "logps/rejected": -644.6146240234375, "loss": 0.4313, "rewards/accuracies": 0.75, "rewards/chosen": -2.454834461212158, "rewards/margins": 1.6549785137176514, "rewards/rejected": -4.109813213348389, "step": 3250 }, { "epoch": 0.42660385382929306, "grad_norm": 5.6875, "learning_rate": 3.5441072804716125e-06, "logits/chosen": -1.6513841152191162, "logits/rejected": -1.7092317342758179, "logps/chosen": -572.1702270507812, "logps/rejected": -623.626953125, "loss": 0.5335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.773787260055542, "rewards/margins": 0.9913013577461243, "rewards/rejected": -3.7650885581970215, "step": 3260 }, { "epoch": 0.4279124546079105, "grad_norm": 25.375, "learning_rate": 3.5337179776712427e-06, "logits/chosen": -1.7246463298797607, "logits/rejected": -1.6264114379882812, "logps/chosen": -492.4591369628906, "logps/rejected": -596.2656860351562, "loss": 0.7304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.7241177558898926, "rewards/margins": 0.7503716349601746, "rewards/rejected": -3.474489212036133, "step": 3270 }, { "epoch": 0.42922105538652794, "grad_norm": 8.5625, "learning_rate": 3.5233070959698445e-06, "logits/chosen": -1.9887495040893555, "logits/rejected": -1.8792444467544556, "logps/chosen": -513.8070068359375, "logps/rejected": -656.2811279296875, "loss": 0.393, "rewards/accuracies": 0.875, "rewards/chosen": -2.0297443866729736, "rewards/margins": 1.5495188236236572, "rewards/rejected": -3.579263210296631, "step": 3280 }, { "epoch": 0.43052965616514544, "grad_norm": 13.125, "learning_rate": 3.512874852694959e-06, "logits/chosen": -1.709958791732788, "logits/rejected": -1.5588171482086182, "logps/chosen": -496.163330078125, "logps/rejected": -552.5806884765625, "loss": 0.5313, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.375587224960327, "rewards/margins": 0.8190393447875977, "rewards/rejected": -3.194626808166504, "step": 3290 }, { "epoch": 0.4318382569437629, "grad_norm": 10.3125, "learning_rate": 3.5024214656200497e-06, "logits/chosen": -1.6596482992172241, "logits/rejected": -1.4439632892608643, "logps/chosen": -535.1092529296875, "logps/rejected": -518.9168090820312, "loss": 0.6654, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.8239197731018066, "rewards/margins": 0.302544504404068, "rewards/rejected": -3.126464366912842, "step": 3300 }, { "epoch": 0.4318382569437629, "eval_logits/chosen": -1.3095725774765015, "eval_logits/rejected": -1.1881147623062134, "eval_logps/chosen": -469.28533935546875, "eval_logps/rejected": -557.70556640625, "eval_loss": 0.5025055408477783, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -2.040405511856079, "eval_rewards/margins": 1.1246769428253174, "eval_rewards/rejected": -3.1650824546813965, "eval_runtime": 694.8056, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 3300 }, { "epoch": 0.4331468577223803, "grad_norm": 19.625, "learning_rate": 3.491947152959958e-06, "logits/chosen": -1.9054486751556396, "logits/rejected": -1.7244374752044678, "logps/chosen": -452.26458740234375, "logps/rejected": -497.71099853515625, "loss": 0.5405, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0911953449249268, "rewards/margins": 0.9495905041694641, "rewards/rejected": -3.040786027908325, "step": 3310 }, { "epoch": 0.4344554585009978, "grad_norm": 11.6875, "learning_rate": 3.4814521333663497e-06, "logits/chosen": -1.7944968938827515, "logits/rejected": -1.6481037139892578, "logps/chosen": -459.74822998046875, "logps/rejected": -481.28240966796875, "loss": 0.5198, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.026840925216675, "rewards/margins": 0.6192730069160461, "rewards/rejected": -2.646113872528076, "step": 3320 }, { "epoch": 0.43576405927961526, "grad_norm": 8.8125, "learning_rate": 3.4709366259231468e-06, "logits/chosen": -1.8444007635116577, "logits/rejected": -1.4220874309539795, "logps/chosen": -518.8185424804688, "logps/rejected": -541.3088989257812, "loss": 0.4048, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.001307725906372, "rewards/margins": 1.1489601135253906, "rewards/rejected": -3.1502676010131836, "step": 3330 }, { "epoch": 0.43707266005823275, "grad_norm": 13.6875, "learning_rate": 3.460400850141956e-06, "logits/chosen": -1.6114075183868408, "logits/rejected": -1.3459597826004028, "logps/chosen": -507.5619201660156, "logps/rejected": -646.8705444335938, "loss": 0.5523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.628382921218872, "rewards/margins": 1.1458734273910522, "rewards/rejected": -3.774256467819214, "step": 3340 }, { "epoch": 0.4383812608368502, "grad_norm": 9.25, "learning_rate": 3.4498450259574858e-06, "logits/chosen": -1.5650197267532349, "logits/rejected": -1.2937581539154053, "logps/chosen": -520.2511596679688, "logps/rejected": -697.6277465820312, "loss": 0.4513, "rewards/accuracies": 0.875, "rewards/chosen": -2.4966273307800293, "rewards/margins": 1.7321059703826904, "rewards/rejected": -4.228733539581299, "step": 3350 }, { "epoch": 0.43968986161546764, "grad_norm": 17.625, "learning_rate": 3.439269373722957e-06, "logits/chosen": -1.7883548736572266, "logits/rejected": -1.4868028163909912, "logps/chosen": -616.6824340820312, "logps/rejected": -672.1302490234375, "loss": 0.4637, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.858654022216797, "rewards/margins": 1.4091975688934326, "rewards/rejected": -4.267851829528809, "step": 3360 }, { "epoch": 0.44099846239408513, "grad_norm": 16.125, "learning_rate": 3.4286741142055014e-06, "logits/chosen": -1.7326698303222656, "logits/rejected": -1.5673801898956299, "logps/chosen": -526.6082153320312, "logps/rejected": -613.0726318359375, "loss": 0.5093, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3958287239074707, "rewards/margins": 1.4321609735488892, "rewards/rejected": -3.827989101409912, "step": 3370 }, { "epoch": 0.4423070631727026, "grad_norm": 9.875, "learning_rate": 3.4180594685815536e-06, "logits/chosen": -1.890716314315796, "logits/rejected": -1.6409393548965454, "logps/chosen": -537.9542236328125, "logps/rejected": -582.9957885742188, "loss": 0.5903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.820783853530884, "rewards/margins": 0.7240446209907532, "rewards/rejected": -3.544828414916992, "step": 3380 }, { "epoch": 0.44361566395132007, "grad_norm": 9.625, "learning_rate": 3.4074256584322336e-06, "logits/chosen": -1.7863727807998657, "logits/rejected": -1.6485319137573242, "logps/chosen": -489.6431579589844, "logps/rejected": -643.8218994140625, "loss": 0.5473, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.752276659011841, "rewards/margins": 1.697196364402771, "rewards/rejected": -4.449472904205322, "step": 3390 }, { "epoch": 0.4449242647299375, "grad_norm": 22.25, "learning_rate": 3.3967729057387213e-06, "logits/chosen": -1.640373945236206, "logits/rejected": -1.5733720064163208, "logps/chosen": -481.93048095703125, "logps/rejected": -639.561767578125, "loss": 0.4912, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8140573501586914, "rewards/margins": 1.2256587743759155, "rewards/rejected": -4.039715766906738, "step": 3400 }, { "epoch": 0.4449242647299375, "eval_logits/chosen": -1.3499432802200317, "eval_logits/rejected": -1.2282084226608276, "eval_logps/chosen": -553.343505859375, "eval_logps/rejected": -650.1497802734375, "eval_loss": 0.500731348991394, "eval_rewards/accuracies": 0.7559999823570251, "eval_rewards/chosen": -2.8809876441955566, "eval_rewards/margins": 1.2085371017456055, "eval_rewards/rejected": -4.089524745941162, "eval_runtime": 696.0638, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.18, "step": 3400 }, { "epoch": 0.44623286550855495, "grad_norm": 10.125, "learning_rate": 3.386101432877624e-06, "logits/chosen": -1.5949907302856445, "logits/rejected": -1.6778361797332764, "logps/chosen": -505.5342712402344, "logps/rejected": -662.1611328125, "loss": 0.4366, "rewards/accuracies": 0.75, "rewards/chosen": -3.0132851600646973, "rewards/margins": 1.1687366962432861, "rewards/rejected": -4.1820220947265625, "step": 3410 }, { "epoch": 0.44754146628717245, "grad_norm": 17.5, "learning_rate": 3.375411462616332e-06, "logits/chosen": -1.9308531284332275, "logits/rejected": -1.8361985683441162, "logps/chosen": -536.4913940429688, "logps/rejected": -613.3707275390625, "loss": 0.5757, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.0489821434020996, "rewards/margins": 0.8292190432548523, "rewards/rejected": -3.8782010078430176, "step": 3420 }, { "epoch": 0.4488500670657899, "grad_norm": 12.3125, "learning_rate": 3.3647032181083696e-06, "logits/chosen": -1.7581918239593506, "logits/rejected": -1.600229263305664, "logps/chosen": -579.3132934570312, "logps/rejected": -706.5642700195312, "loss": 0.4868, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.120950222015381, "rewards/margins": 1.2254760265350342, "rewards/rejected": -4.346426486968994, "step": 3430 }, { "epoch": 0.4501586678444074, "grad_norm": 10.375, "learning_rate": 3.3539769228887382e-06, "logits/chosen": -1.780623435974121, "logits/rejected": -1.481384515762329, "logps/chosen": -506.5231018066406, "logps/rejected": -612.3474731445312, "loss": 0.5016, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5143914222717285, "rewards/margins": 1.3679008483886719, "rewards/rejected": -3.8822925090789795, "step": 3440 }, { "epoch": 0.45146726862302483, "grad_norm": 9.75, "learning_rate": 3.343232800869247e-06, "logits/chosen": -1.8489558696746826, "logits/rejected": -1.644845724105835, "logps/chosen": -463.353271484375, "logps/rejected": -629.66796875, "loss": 0.466, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4078497886657715, "rewards/margins": 1.2588884830474854, "rewards/rejected": -3.6667380332946777, "step": 3450 }, { "epoch": 0.4527758694016423, "grad_norm": 8.8125, "learning_rate": 3.33247107633384e-06, "logits/chosen": -1.5641610622406006, "logits/rejected": -1.4959639310836792, "logps/chosen": -542.1463623046875, "logps/rejected": -654.1150512695312, "loss": 0.4687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.46083402633667, "rewards/margins": 1.435610294342041, "rewards/rejected": -3.8964438438415527, "step": 3460 }, { "epoch": 0.45408447018025977, "grad_norm": 13.625, "learning_rate": 3.3216919739339155e-06, "logits/chosen": -1.8993631601333618, "logits/rejected": -1.6521832942962646, "logps/chosen": -535.3679809570312, "logps/rejected": -560.9085693359375, "loss": 0.5767, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.4370663166046143, "rewards/margins": 0.6876465678215027, "rewards/rejected": -3.124713182449341, "step": 3470 }, { "epoch": 0.4553930709588772, "grad_norm": 12.125, "learning_rate": 3.310895718683635e-06, "logits/chosen": -1.8658168315887451, "logits/rejected": -1.520081877708435, "logps/chosen": -442.6490173339844, "logps/rejected": -483.6497497558594, "loss": 0.5215, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9672330617904663, "rewards/margins": 1.0211654901504517, "rewards/rejected": -2.988398551940918, "step": 3480 }, { "epoch": 0.4567016717374947, "grad_norm": 9.3125, "learning_rate": 3.3000825359552256e-06, "logits/chosen": -2.064654588699341, "logits/rejected": -1.6796842813491821, "logps/chosen": -482.516357421875, "logps/rejected": -534.9873657226562, "loss": 0.5468, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.250298500061035, "rewards/margins": 0.8945142030715942, "rewards/rejected": -3.144812822341919, "step": 3490 }, { "epoch": 0.45801027251611215, "grad_norm": 8.5625, "learning_rate": 3.2892526514742778e-06, "logits/chosen": -1.9131724834442139, "logits/rejected": -1.847906470298767, "logps/chosen": -453.24285888671875, "logps/rejected": -551.8905029296875, "loss": 0.5173, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.173557996749878, "rewards/margins": 0.7999250888824463, "rewards/rejected": -2.9734833240509033, "step": 3500 }, { "epoch": 0.45801027251611215, "eval_logits/chosen": -1.4178496599197388, "eval_logits/rejected": -1.2988539934158325, "eval_logps/chosen": -469.79180908203125, "eval_logps/rejected": -561.34130859375, "eval_loss": 0.49362248182296753, "eval_rewards/accuracies": 0.7490000128746033, "eval_rewards/chosen": -2.0454704761505127, "eval_rewards/margins": 1.155969500541687, "eval_rewards/rejected": -3.2014400959014893, "eval_runtime": 694.1197, "eval_samples_per_second": 2.881, "eval_steps_per_second": 0.18, "step": 3500 }, { "epoch": 0.4593188732947296, "grad_norm": 20.25, "learning_rate": 3.27840629131503e-06, "logits/chosen": -1.7646443843841553, "logits/rejected": -1.7446094751358032, "logps/chosen": -381.1290588378906, "logps/rejected": -528.4498291015625, "loss": 0.4864, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6455059051513672, "rewards/margins": 1.4792696237564087, "rewards/rejected": -3.1247754096984863, "step": 3510 }, { "epoch": 0.4606274740733471, "grad_norm": 10.8125, "learning_rate": 3.2675436818956522e-06, "logits/chosen": -1.7564834356307983, "logits/rejected": -1.7105242013931274, "logps/chosen": -440.3970642089844, "logps/rejected": -540.5269775390625, "loss": 0.5168, "rewards/accuracies": 0.75, "rewards/chosen": -1.9719520807266235, "rewards/margins": 1.1469027996063232, "rewards/rejected": -3.1188549995422363, "step": 3520 }, { "epoch": 0.46193607485196453, "grad_norm": 10.0, "learning_rate": 3.2566650499735185e-06, "logits/chosen": -1.6547462940216064, "logits/rejected": -1.661376714706421, "logps/chosen": -439.95501708984375, "logps/rejected": -541.7905883789062, "loss": 0.4517, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1423721313476562, "rewards/margins": 0.9662687182426453, "rewards/rejected": -3.1086409091949463, "step": 3530 }, { "epoch": 0.463244675630582, "grad_norm": 19.75, "learning_rate": 3.2457706226404715e-06, "logits/chosen": -2.033189535140991, "logits/rejected": -1.8344275951385498, "logps/chosen": -471.42401123046875, "logps/rejected": -588.0511474609375, "loss": 0.4965, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.229851245880127, "rewards/margins": 1.1928203105926514, "rewards/rejected": -3.42267107963562, "step": 3540 }, { "epoch": 0.46455327640919947, "grad_norm": 7.53125, "learning_rate": 3.2348606273180847e-06, "logits/chosen": -1.8436062335968018, "logits/rejected": -1.7366678714752197, "logps/chosen": -519.3294677734375, "logps/rejected": -599.0130615234375, "loss": 0.5173, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.772615909576416, "rewards/margins": 0.923565685749054, "rewards/rejected": -3.696181535720825, "step": 3550 }, { "epoch": 0.4658618771878169, "grad_norm": 12.3125, "learning_rate": 3.2239352917529165e-06, "logits/chosen": -1.8784258365631104, "logits/rejected": -1.7558300495147705, "logps/chosen": -485.6536560058594, "logps/rejected": -590.959716796875, "loss": 0.5791, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6665985584259033, "rewards/margins": 0.9306347966194153, "rewards/rejected": -3.5972328186035156, "step": 3560 }, { "epoch": 0.4671704779664344, "grad_norm": 32.75, "learning_rate": 3.2129948440117487e-06, "logits/chosen": -1.895696997642517, "logits/rejected": -1.6934400796890259, "logps/chosen": -591.0433349609375, "logps/rejected": -715.1436157226562, "loss": 0.4013, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9297986030578613, "rewards/margins": 1.5708653926849365, "rewards/rejected": -4.500663757324219, "step": 3570 }, { "epoch": 0.46847907874505185, "grad_norm": 17.125, "learning_rate": 3.202039512476833e-06, "logits/chosen": -1.7980642318725586, "logits/rejected": -1.72818922996521, "logps/chosen": -589.6419067382812, "logps/rejected": -685.8646240234375, "loss": 0.5977, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.269117832183838, "rewards/margins": 0.6257120966911316, "rewards/rejected": -3.8948299884796143, "step": 3580 }, { "epoch": 0.46978767952366934, "grad_norm": 18.0, "learning_rate": 3.1910695258411216e-06, "logits/chosen": -1.3696866035461426, "logits/rejected": -1.4093942642211914, "logps/chosen": -548.2889404296875, "logps/rejected": -754.59326171875, "loss": 0.5247, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3609416484832764, "rewards/margins": 1.658307671546936, "rewards/rejected": -5.019248962402344, "step": 3590 }, { "epoch": 0.4710962803022868, "grad_norm": 18.75, "learning_rate": 3.1800851131034904e-06, "logits/chosen": -1.7912166118621826, "logits/rejected": -1.4859238862991333, "logps/chosen": -586.8577270507812, "logps/rejected": -795.7415161132812, "loss": 0.4581, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.261929750442505, "rewards/margins": 1.7295472621917725, "rewards/rejected": -4.991477012634277, "step": 3600 }, { "epoch": 0.4710962803022868, "eval_logits/chosen": -1.2880799770355225, "eval_logits/rejected": -1.1667639017105103, "eval_logps/chosen": -577.3400268554688, "eval_logps/rejected": -692.02099609375, "eval_loss": 0.502166211605072, "eval_rewards/accuracies": 0.7429999709129333, "eval_rewards/chosen": -3.1209521293640137, "eval_rewards/margins": 1.3872848749160767, "eval_rewards/rejected": -4.508236885070801, "eval_runtime": 696.1805, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.18, "step": 3600 }, { "epoch": 0.4724048810809042, "grad_norm": 11.75, "learning_rate": 3.169086503563962e-06, "logits/chosen": -1.6847620010375977, "logits/rejected": -1.5995417833328247, "logps/chosen": -575.2470703125, "logps/rejected": -707.0169067382812, "loss": 0.5695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.015564441680908, "rewards/margins": 1.322938084602356, "rewards/rejected": -4.338501930236816, "step": 3610 }, { "epoch": 0.4737134818595217, "grad_norm": 12.4375, "learning_rate": 3.1580739268189165e-06, "logits/chosen": -1.5064860582351685, "logits/rejected": -1.2185066938400269, "logps/chosen": -518.8358154296875, "logps/rejected": -641.9920654296875, "loss": 0.4479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8274810314178467, "rewards/margins": 1.4161033630371094, "rewards/rejected": -4.243584632873535, "step": 3620 }, { "epoch": 0.47502208263813916, "grad_norm": 19.875, "learning_rate": 3.147047612756302e-06, "logits/chosen": -1.5697287321090698, "logits/rejected": -1.5226030349731445, "logps/chosen": -626.0226440429688, "logps/rejected": -758.9794921875, "loss": 0.5174, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.647606372833252, "rewards/margins": 1.2924569845199585, "rewards/rejected": -4.940062999725342, "step": 3630 }, { "epoch": 0.4763306834167566, "grad_norm": 22.5, "learning_rate": 3.136007791550833e-06, "logits/chosen": -1.6496614217758179, "logits/rejected": -1.516288161277771, "logps/chosen": -590.0843505859375, "logps/rejected": -651.7722778320312, "loss": 0.6352, "rewards/accuracies": 0.625, "rewards/chosen": -3.4532063007354736, "rewards/margins": 0.5210450291633606, "rewards/rejected": -3.9742507934570312, "step": 3640 }, { "epoch": 0.4776392841953741, "grad_norm": 33.75, "learning_rate": 3.1249546936591848e-06, "logits/chosen": -1.5272998809814453, "logits/rejected": -1.5856997966766357, "logps/chosen": -632.7503662109375, "logps/rejected": -745.5396728515625, "loss": 0.5499, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.376429319381714, "rewards/margins": 1.1326038837432861, "rewards/rejected": -4.509033679962158, "step": 3650 }, { "epoch": 0.47894788497399154, "grad_norm": 21.375, "learning_rate": 3.1138885498151843e-06, "logits/chosen": -1.5109302997589111, "logits/rejected": -1.2995421886444092, "logps/chosen": -561.9118041992188, "logps/rejected": -666.4613647460938, "loss": 0.4663, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.372398853302002, "rewards/margins": 1.3209913969039917, "rewards/rejected": -4.693390369415283, "step": 3660 }, { "epoch": 0.48025648575260904, "grad_norm": 19.125, "learning_rate": 3.1028095910249937e-06, "logits/chosen": -1.5637613534927368, "logits/rejected": -1.5185710191726685, "logps/chosen": -599.4356689453125, "logps/rejected": -724.4076538085938, "loss": 0.4873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.381941556930542, "rewards/margins": 1.212660551071167, "rewards/rejected": -4.594602108001709, "step": 3670 }, { "epoch": 0.4815650865312265, "grad_norm": 7.125, "learning_rate": 3.0917180485622895e-06, "logits/chosen": -1.693584680557251, "logits/rejected": -1.5145460367202759, "logps/chosen": -548.9732666015625, "logps/rejected": -667.4913940429688, "loss": 0.5099, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.194112777709961, "rewards/margins": 1.1516354084014893, "rewards/rejected": -4.345747947692871, "step": 3680 }, { "epoch": 0.4828736873098439, "grad_norm": 7.1875, "learning_rate": 3.0806141539634294e-06, "logits/chosen": -1.791388750076294, "logits/rejected": -1.5186954736709595, "logps/chosen": -670.5835571289062, "logps/rejected": -746.3186645507812, "loss": 0.4474, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.498688220977783, "rewards/margins": 1.1772823333740234, "rewards/rejected": -4.675970554351807, "step": 3690 }, { "epoch": 0.4841822880884614, "grad_norm": 12.1875, "learning_rate": 3.069498139022624e-06, "logits/chosen": -1.4912923574447632, "logits/rejected": -1.3451400995254517, "logps/chosen": -574.1414794921875, "logps/rejected": -735.6250610351562, "loss": 0.4583, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.61517333984375, "rewards/margins": 1.559181809425354, "rewards/rejected": -5.174355506896973, "step": 3700 }, { "epoch": 0.4841822880884614, "eval_logits/chosen": -1.231339693069458, "eval_logits/rejected": -1.1082403659820557, "eval_logps/chosen": -634.9663696289062, "eval_logps/rejected": -755.3518676757812, "eval_loss": 0.5078281760215759, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.6972157955169678, "eval_rewards/margins": 1.4443296194076538, "eval_rewards/rejected": -5.141545295715332, "eval_runtime": 695.5395, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 3700 }, { "epoch": 0.48549088886707886, "grad_norm": 21.25, "learning_rate": 3.0583702357870964e-06, "logits/chosen": -1.7290172576904297, "logits/rejected": -1.6435178518295288, "logps/chosen": -645.6229858398438, "logps/rejected": -730.572509765625, "loss": 0.5472, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.7434241771698, "rewards/margins": 0.9138776063919067, "rewards/rejected": -4.657301425933838, "step": 3710 }, { "epoch": 0.48679948964569636, "grad_norm": 10.5, "learning_rate": 3.0472306765522393e-06, "logits/chosen": -1.7862093448638916, "logits/rejected": -1.3456025123596191, "logps/chosen": -739.7753295898438, "logps/rejected": -827.3670654296875, "loss": 0.5608, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.732377290725708, "rewards/margins": 1.8004722595214844, "rewards/rejected": -5.532849311828613, "step": 3720 }, { "epoch": 0.4881080904243138, "grad_norm": 9.625, "learning_rate": 3.0360796938567628e-06, "logits/chosen": -1.7237190008163452, "logits/rejected": -1.347362995147705, "logps/chosen": -659.125732421875, "logps/rejected": -715.3869018554688, "loss": 0.4667, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.4212825298309326, "rewards/margins": 1.6233842372894287, "rewards/rejected": -5.044666767120361, "step": 3730 }, { "epoch": 0.48941669120293124, "grad_norm": 13.5, "learning_rate": 3.0249175204778435e-06, "logits/chosen": -1.6182949542999268, "logits/rejected": -1.394348382949829, "logps/chosen": -571.0550537109375, "logps/rejected": -759.7166748046875, "loss": 0.4055, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.194526195526123, "rewards/margins": 1.9726788997650146, "rewards/rejected": -5.167204856872559, "step": 3740 }, { "epoch": 0.49072529198154874, "grad_norm": 8.375, "learning_rate": 3.0137443894262634e-06, "logits/chosen": -1.5073859691619873, "logits/rejected": -1.252789855003357, "logps/chosen": -642.3861083984375, "logps/rejected": -814.1609497070312, "loss": 0.4954, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5922820568084717, "rewards/margins": 2.0078670978546143, "rewards/rejected": -5.600149631500244, "step": 3750 }, { "epoch": 0.4920338927601662, "grad_norm": 6.625, "learning_rate": 3.0025605339415476e-06, "logits/chosen": -1.5805652141571045, "logits/rejected": -1.468889832496643, "logps/chosen": -583.5850219726562, "logps/rejected": -697.9902954101562, "loss": 0.4048, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.509519100189209, "rewards/margins": 1.546823263168335, "rewards/rejected": -5.056342124938965, "step": 3760 }, { "epoch": 0.4933424935387837, "grad_norm": 15.4375, "learning_rate": 2.9913661874870923e-06, "logits/chosen": -1.8533436059951782, "logits/rejected": -1.537858247756958, "logps/chosen": -560.1351318359375, "logps/rejected": -657.4138793945312, "loss": 0.4871, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.154273748397827, "rewards/margins": 1.509356141090393, "rewards/rejected": -4.66363000869751, "step": 3770 }, { "epoch": 0.4946510943174011, "grad_norm": 17.5, "learning_rate": 2.980161583745294e-06, "logits/chosen": -1.8597667217254639, "logits/rejected": -1.7681039571762085, "logps/chosen": -665.7034301757812, "logps/rejected": -727.350341796875, "loss": 0.5568, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.338721752166748, "rewards/margins": 1.0710680484771729, "rewards/rejected": -4.409789562225342, "step": 3780 }, { "epoch": 0.49595969509601856, "grad_norm": 8.625, "learning_rate": 2.96894695661267e-06, "logits/chosen": -1.8668283224105835, "logits/rejected": -1.6326515674591064, "logps/chosen": -596.162841796875, "logps/rejected": -675.41162109375, "loss": 0.4792, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.962094306945801, "rewards/margins": 1.3406484127044678, "rewards/rejected": -4.302742958068848, "step": 3790 }, { "epoch": 0.49726829587463606, "grad_norm": 13.375, "learning_rate": 2.9577225401949773e-06, "logits/chosen": -1.5791881084442139, "logits/rejected": -1.5485875606536865, "logps/chosen": -540.2509155273438, "logps/rejected": -752.0213623046875, "loss": 0.3869, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9848618507385254, "rewards/margins": 1.9419456720352173, "rewards/rejected": -4.926807403564453, "step": 3800 }, { "epoch": 0.49726829587463606, "eval_logits/chosen": -1.2952755689620972, "eval_logits/rejected": -1.1738520860671997, "eval_logps/chosen": -585.7506713867188, "eval_logps/rejected": -693.0497436523438, "eval_loss": 0.49763408303260803, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -3.205059289932251, "eval_rewards/margins": 1.31346595287323, "eval_rewards/rejected": -4.51852560043335, "eval_runtime": 695.4567, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 3800 }, { "epoch": 0.4985768966532535, "grad_norm": 20.375, "learning_rate": 2.946488568802324e-06, "logits/chosen": -1.6365439891815186, "logits/rejected": -1.5325841903686523, "logps/chosen": -656.7621459960938, "logps/rejected": -687.8626098632812, "loss": 0.5563, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.6412415504455566, "rewards/margins": 0.8097431063652039, "rewards/rejected": -4.450984954833984, "step": 3810 }, { "epoch": 0.499885497431871, "grad_norm": 19.25, "learning_rate": 2.935245276944278e-06, "logits/chosen": -1.489794373512268, "logits/rejected": -1.2377159595489502, "logps/chosen": -632.0386962890625, "logps/rejected": -698.50048828125, "loss": 0.4679, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.783662796020508, "rewards/margins": 1.1051515340805054, "rewards/rejected": -4.8888139724731445, "step": 3820 }, { "epoch": 0.5011940982104884, "grad_norm": 6.15625, "learning_rate": 2.9239928993249723e-06, "logits/chosen": -1.4459822177886963, "logits/rejected": -1.19987154006958, "logps/chosen": -674.6354370117188, "logps/rejected": -782.6141357421875, "loss": 0.4561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.147255897521973, "rewards/margins": 1.5587198734283447, "rewards/rejected": -5.70597505569458, "step": 3830 }, { "epoch": 0.5025026989891059, "grad_norm": 17.875, "learning_rate": 2.912731670838207e-06, "logits/chosen": -1.6370899677276611, "logits/rejected": -1.5937082767486572, "logps/chosen": -682.8154296875, "logps/rejected": -797.2015991210938, "loss": 0.5546, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.10701847076416, "rewards/margins": 1.028357982635498, "rewards/rejected": -5.135376930236816, "step": 3840 }, { "epoch": 0.5038112997677233, "grad_norm": 18.125, "learning_rate": 2.901461826562543e-06, "logits/chosen": -1.7616907358169556, "logits/rejected": -1.6258938312530518, "logps/chosen": -710.2918701171875, "logps/rejected": -815.51416015625, "loss": 0.5263, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.1516008377075195, "rewards/margins": 1.463422417640686, "rewards/rejected": -5.615023136138916, "step": 3850 }, { "epoch": 0.5051199005463408, "grad_norm": 14.6875, "learning_rate": 2.8901836017563966e-06, "logits/chosen": -1.5320640802383423, "logits/rejected": -1.326785683631897, "logps/chosen": -624.3651123046875, "logps/rejected": -741.795654296875, "loss": 0.4679, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8408217430114746, "rewards/margins": 1.2388217449188232, "rewards/rejected": -5.079643726348877, "step": 3860 }, { "epoch": 0.5064285013249583, "grad_norm": 17.875, "learning_rate": 2.8788972318531272e-06, "logits/chosen": -1.623965859413147, "logits/rejected": -1.4727613925933838, "logps/chosen": -597.6658935546875, "logps/rejected": -696.4349365234375, "loss": 0.4348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3530983924865723, "rewards/margins": 1.074458360671997, "rewards/rejected": -4.42755651473999, "step": 3870 }, { "epoch": 0.5077371021035757, "grad_norm": 18.375, "learning_rate": 2.8676029524561255e-06, "logits/chosen": -1.690433144569397, "logits/rejected": -1.7986443042755127, "logps/chosen": -608.478271484375, "logps/rejected": -709.8097534179688, "loss": 0.5891, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.7109780311584473, "rewards/margins": 0.6293495297431946, "rewards/rejected": -4.340327262878418, "step": 3880 }, { "epoch": 0.5090457028821932, "grad_norm": 7.8125, "learning_rate": 2.8563009993338906e-06, "logits/chosen": -1.8673486709594727, "logits/rejected": -1.6094764471054077, "logps/chosen": -616.2351684570312, "logps/rejected": -715.8551635742188, "loss": 0.5278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.904707431793213, "rewards/margins": 1.307159185409546, "rewards/rejected": -4.211866855621338, "step": 3890 }, { "epoch": 0.5103543036608107, "grad_norm": 23.875, "learning_rate": 2.844991608415113e-06, "logits/chosen": -1.7745310068130493, "logits/rejected": -1.5673353672027588, "logps/chosen": -595.7905883789062, "logps/rejected": -615.0700073242188, "loss": 0.56, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.238905429840088, "rewards/margins": 0.7397009134292603, "rewards/rejected": -3.9786064624786377, "step": 3900 }, { "epoch": 0.5103543036608107, "eval_logits/chosen": -1.3363449573516846, "eval_logits/rejected": -1.2195228338241577, "eval_logps/chosen": -561.2437744140625, "eval_logps/rejected": -658.4495849609375, "eval_loss": 0.49360132217407227, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -2.9599900245666504, "eval_rewards/margins": 1.2125335931777954, "eval_rewards/rejected": -4.172523498535156, "eval_runtime": 696.5976, "eval_samples_per_second": 2.871, "eval_steps_per_second": 0.179, "step": 3900 }, { "epoch": 0.5116629044394282, "grad_norm": 8.875, "learning_rate": 2.833675015783746e-06, "logits/chosen": -2.001363754272461, "logits/rejected": -1.818486213684082, "logps/chosen": -607.2144775390625, "logps/rejected": -695.83203125, "loss": 0.3898, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6469178199768066, "rewards/margins": 1.4133679866790771, "rewards/rejected": -4.060286045074463, "step": 3910 }, { "epoch": 0.5129715052180456, "grad_norm": 11.25, "learning_rate": 2.8223514576740784e-06, "logits/chosen": -1.6602163314819336, "logits/rejected": -1.47013258934021, "logps/chosen": -506.7833557128906, "logps/rejected": -596.6107788085938, "loss": 0.4713, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.1107964515686035, "rewards/margins": 0.8096998929977417, "rewards/rejected": -3.9204964637756348, "step": 3920 }, { "epoch": 0.5142801059966631, "grad_norm": 12.0625, "learning_rate": 2.8110211704658073e-06, "logits/chosen": -1.6133801937103271, "logits/rejected": -1.5633536577224731, "logps/chosen": -552.4412841796875, "logps/rejected": -700.2015380859375, "loss": 0.471, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.242786407470703, "rewards/margins": 1.3229143619537354, "rewards/rejected": -4.565701007843018, "step": 3930 }, { "epoch": 0.5155887067752806, "grad_norm": 19.75, "learning_rate": 2.7996843906790955e-06, "logits/chosen": -1.65244460105896, "logits/rejected": -1.5809835195541382, "logps/chosen": -576.1734619140625, "logps/rejected": -670.1942138671875, "loss": 0.5316, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.1658072471618652, "rewards/margins": 1.0999482870101929, "rewards/rejected": -4.265755653381348, "step": 3940 }, { "epoch": 0.516897307553898, "grad_norm": 5.0625, "learning_rate": 2.7883413549696396e-06, "logits/chosen": -1.5287904739379883, "logits/rejected": -1.3863648176193237, "logps/chosen": -568.906494140625, "logps/rejected": -705.6056518554688, "loss": 0.4679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0612263679504395, "rewards/margins": 1.7607532739639282, "rewards/rejected": -4.821978569030762, "step": 3950 }, { "epoch": 0.5182059083325155, "grad_norm": 21.375, "learning_rate": 2.776992300123732e-06, "logits/chosen": -1.4502888917922974, "logits/rejected": -1.6204286813735962, "logps/chosen": -583.1156616210938, "logps/rejected": -744.0510864257812, "loss": 0.6304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.792588710784912, "rewards/margins": 0.8381406664848328, "rewards/rejected": -4.630729675292969, "step": 3960 }, { "epoch": 0.519514509111133, "grad_norm": 17.125, "learning_rate": 2.7656374630533113e-06, "logits/chosen": -1.7738450765609741, "logits/rejected": -1.4661903381347656, "logps/chosen": -607.5355224609375, "logps/rejected": -643.1941528320312, "loss": 0.5896, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3561758995056152, "rewards/margins": 0.73542320728302, "rewards/rejected": -4.091599464416504, "step": 3970 }, { "epoch": 0.5208231098897503, "grad_norm": 10.125, "learning_rate": 2.754277080791021e-06, "logits/chosen": -1.4954447746276855, "logits/rejected": -1.5677697658538818, "logps/chosen": -535.3952026367188, "logps/rejected": -685.271484375, "loss": 0.4817, "rewards/accuracies": 0.75, "rewards/chosen": -2.8813157081604004, "rewards/margins": 1.3852542638778687, "rewards/rejected": -4.266570091247559, "step": 3980 }, { "epoch": 0.5221317106683678, "grad_norm": 17.5, "learning_rate": 2.742911390485262e-06, "logits/chosen": -1.7027981281280518, "logits/rejected": -1.6308120489120483, "logps/chosen": -549.6814575195312, "logps/rejected": -612.7118530273438, "loss": 0.4944, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.848914384841919, "rewards/margins": 1.2997568845748901, "rewards/rejected": -4.1486711502075195, "step": 3990 }, { "epoch": 0.5234403114469853, "grad_norm": 7.90625, "learning_rate": 2.731540629395239e-06, "logits/chosen": -1.771278738975525, "logits/rejected": -1.4638117551803589, "logps/chosen": -675.165283203125, "logps/rejected": -638.8060302734375, "loss": 0.4668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.957197427749634, "rewards/margins": 0.8651722073554993, "rewards/rejected": -3.8223698139190674, "step": 4000 }, { "epoch": 0.5234403114469853, "eval_logits/chosen": -1.3302693367004395, "eval_logits/rejected": -1.2107725143432617, "eval_logps/chosen": -534.3255615234375, "eval_logps/rejected": -635.4434204101562, "eval_loss": 0.49332401156425476, "eval_rewards/accuracies": 0.7519999742507935, "eval_rewards/chosen": -2.690808057785034, "eval_rewards/margins": 1.2516536712646484, "eval_rewards/rejected": -3.9424619674682617, "eval_runtime": 694.9498, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 4000 }, { "epoch": 0.5247489122256028, "grad_norm": 8.0625, "learning_rate": 2.7201650348860115e-06, "logits/chosen": -1.652878999710083, "logits/rejected": -1.6466137170791626, "logps/chosen": -641.9873046875, "logps/rejected": -762.0330200195312, "loss": 0.4978, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.974180221557617, "rewards/margins": 1.0738807916641235, "rewards/rejected": -4.048061370849609, "step": 4010 }, { "epoch": 0.5260575130042202, "grad_norm": 6.875, "learning_rate": 2.7087848444235354e-06, "logits/chosen": -1.564851999282837, "logits/rejected": -1.5103363990783691, "logps/chosen": -526.6390380859375, "logps/rejected": -606.24755859375, "loss": 0.5202, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.8735172748565674, "rewards/margins": 0.7667762041091919, "rewards/rejected": -3.6402931213378906, "step": 4020 }, { "epoch": 0.5273661137828377, "grad_norm": 19.625, "learning_rate": 2.697400295569707e-06, "logits/chosen": -1.738126516342163, "logits/rejected": -1.4919089078903198, "logps/chosen": -464.1680603027344, "logps/rejected": -593.7843627929688, "loss": 0.4752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7494399547576904, "rewards/margins": 1.2229158878326416, "rewards/rejected": -3.972356081008911, "step": 4030 }, { "epoch": 0.5286747145614552, "grad_norm": 18.0, "learning_rate": 2.6860116259774065e-06, "logits/chosen": -1.8343547582626343, "logits/rejected": -1.5537391901016235, "logps/chosen": -605.8026123046875, "logps/rejected": -685.1412353515625, "loss": 0.4311, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0498857498168945, "rewards/margins": 1.3472473621368408, "rewards/rejected": -4.397133827209473, "step": 4040 }, { "epoch": 0.5299833153400726, "grad_norm": 9.25, "learning_rate": 2.674619073385531e-06, "logits/chosen": -1.7113978862762451, "logits/rejected": -1.4989145994186401, "logps/chosen": -602.2650756835938, "logps/rejected": -734.5204467773438, "loss": 0.4337, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0456416606903076, "rewards/margins": 1.5175418853759766, "rewards/rejected": -4.563183784484863, "step": 4050 }, { "epoch": 0.5312919161186901, "grad_norm": 8.625, "learning_rate": 2.663222875614038e-06, "logits/chosen": -1.8348478078842163, "logits/rejected": -1.5215647220611572, "logps/chosen": -678.8057861328125, "logps/rejected": -716.7047729492188, "loss": 0.4416, "rewards/accuracies": 0.875, "rewards/chosen": -3.3776626586914062, "rewards/margins": 1.2653123140335083, "rewards/rejected": -4.642974376678467, "step": 4060 }, { "epoch": 0.5326005168973076, "grad_norm": 11.0625, "learning_rate": 2.6518232705589775e-06, "logits/chosen": -1.339114785194397, "logits/rejected": -1.4878257513046265, "logps/chosen": -564.9835815429688, "logps/rejected": -710.6820678710938, "loss": 0.4077, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.7180285453796387, "rewards/margins": 1.1239169836044312, "rewards/rejected": -4.841945648193359, "step": 4070 }, { "epoch": 0.533909117675925, "grad_norm": 20.625, "learning_rate": 2.640420496187528e-06, "logits/chosen": -1.6984050273895264, "logits/rejected": -1.5305413007736206, "logps/chosen": -766.7525024414062, "logps/rejected": -906.4940185546875, "loss": 0.4759, "rewards/accuracies": 0.75, "rewards/chosen": -4.33902645111084, "rewards/margins": 1.424915075302124, "rewards/rejected": -5.763940811157227, "step": 4080 }, { "epoch": 0.5352177184545425, "grad_norm": 35.75, "learning_rate": 2.629014790533025e-06, "logits/chosen": -1.6293178796768188, "logits/rejected": -1.2749176025390625, "logps/chosen": -687.2681884765625, "logps/rejected": -739.8741455078125, "loss": 0.54, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.327956199645996, "rewards/margins": 0.9499721527099609, "rewards/rejected": -5.277928352355957, "step": 4090 }, { "epoch": 0.53652631923316, "grad_norm": 15.0, "learning_rate": 2.617606391689996e-06, "logits/chosen": -1.5434972047805786, "logits/rejected": -1.283473253250122, "logps/chosen": -648.1460571289062, "logps/rejected": -710.87060546875, "loss": 0.5857, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.354752540588379, "rewards/margins": 0.9358797073364258, "rewards/rejected": -5.290631294250488, "step": 4100 }, { "epoch": 0.53652631923316, "eval_logits/chosen": -1.2196580171585083, "eval_logits/rejected": -1.0955959558486938, "eval_logps/chosen": -664.1624145507812, "eval_logps/rejected": -787.9547119140625, "eval_loss": 0.5041548609733582, "eval_rewards/accuracies": 0.7450000047683716, "eval_rewards/chosen": -3.9891762733459473, "eval_rewards/margins": 1.4783977270126343, "eval_rewards/rejected": -5.467574119567871, "eval_runtime": 695.6622, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 4100 }, { "epoch": 0.5378349200117775, "grad_norm": 21.125, "learning_rate": 2.6061955378091896e-06, "logits/chosen": -1.7529815435409546, "logits/rejected": -1.75946044921875, "logps/chosen": -637.4609985351562, "logps/rejected": -808.1427001953125, "loss": 0.5231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5654518604278564, "rewards/margins": 1.7351741790771484, "rewards/rejected": -5.300625801086426, "step": 4110 }, { "epoch": 0.5391435207903948, "grad_norm": 4.40625, "learning_rate": 2.5947824670926025e-06, "logits/chosen": -1.6849695444107056, "logits/rejected": -1.357318639755249, "logps/chosen": -659.892822265625, "logps/rejected": -769.828857421875, "loss": 0.4174, "rewards/accuracies": 0.75, "rewards/chosen": -3.704875946044922, "rewards/margins": 1.5412250757217407, "rewards/rejected": -5.246101379394531, "step": 4120 }, { "epoch": 0.5404521215690123, "grad_norm": 13.5625, "learning_rate": 2.583367417788508e-06, "logits/chosen": -1.6074358224868774, "logits/rejected": -1.5646072626113892, "logps/chosen": -591.0501708984375, "logps/rejected": -719.7105102539062, "loss": 0.6083, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3567326068878174, "rewards/margins": 1.3733718395233154, "rewards/rejected": -4.730103969573975, "step": 4130 }, { "epoch": 0.5417607223476298, "grad_norm": 15.8125, "learning_rate": 2.5719506281864838e-06, "logits/chosen": -1.8133472204208374, "logits/rejected": -1.8622064590454102, "logps/chosen": -642.7955322265625, "logps/rejected": -787.9906005859375, "loss": 0.4952, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3525757789611816, "rewards/margins": 1.4376331567764282, "rewards/rejected": -4.7902092933654785, "step": 4140 }, { "epoch": 0.5430693231262472, "grad_norm": 10.1875, "learning_rate": 2.5605323366124335e-06, "logits/chosen": -1.631072759628296, "logits/rejected": -1.5486156940460205, "logps/chosen": -613.8638305664062, "logps/rejected": -715.7060546875, "loss": 0.5611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6926982402801514, "rewards/margins": 1.0537121295928955, "rewards/rejected": -4.746410369873047, "step": 4150 }, { "epoch": 0.5443779239048647, "grad_norm": 8.5, "learning_rate": 2.5491127814236172e-06, "logits/chosen": -1.6282669305801392, "logits/rejected": -1.3199020624160767, "logps/chosen": -611.4901123046875, "logps/rejected": -660.4708251953125, "loss": 0.5005, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.5215842723846436, "rewards/margins": 1.1059085130691528, "rewards/rejected": -4.627493381500244, "step": 4160 }, { "epoch": 0.5456865246834822, "grad_norm": 19.625, "learning_rate": 2.537692201003671e-06, "logits/chosen": -1.8528636693954468, "logits/rejected": -1.5919544696807861, "logps/chosen": -668.46826171875, "logps/rejected": -825.2579956054688, "loss": 0.4813, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6828417778015137, "rewards/margins": 1.8348373174667358, "rewards/rejected": -5.517679214477539, "step": 4170 }, { "epoch": 0.5469951254620996, "grad_norm": 14.25, "learning_rate": 2.526270833757635e-06, "logits/chosen": -1.7129522562026978, "logits/rejected": -1.4447823762893677, "logps/chosen": -718.1492309570312, "logps/rejected": -851.3233642578125, "loss": 0.444, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.8317675590515137, "rewards/margins": 1.4836534261703491, "rewards/rejected": -5.315421104431152, "step": 4180 }, { "epoch": 0.5483037262407171, "grad_norm": 11.125, "learning_rate": 2.514848918106971e-06, "logits/chosen": -1.5056499242782593, "logits/rejected": -1.4630565643310547, "logps/chosen": -636.4885864257812, "logps/rejected": -697.7372436523438, "loss": 0.4776, "rewards/accuracies": 0.75, "rewards/chosen": -3.8867645263671875, "rewards/margins": 0.9944122433662415, "rewards/rejected": -4.881176948547363, "step": 4190 }, { "epoch": 0.5496123270193346, "grad_norm": 22.75, "learning_rate": 2.503426692484594e-06, "logits/chosen": -1.6807591915130615, "logits/rejected": -1.51368248462677, "logps/chosen": -573.923095703125, "logps/rejected": -753.5374145507812, "loss": 0.5061, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.537898540496826, "rewards/margins": 1.798473596572876, "rewards/rejected": -5.3363728523254395, "step": 4200 }, { "epoch": 0.5496123270193346, "eval_logits/chosen": -1.268314242362976, "eval_logits/rejected": -1.1463505029678345, "eval_logps/chosen": -637.4093627929688, "eval_logps/rejected": -753.6463012695312, "eval_loss": 0.49350956082344055, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -3.7216458320617676, "eval_rewards/margins": 1.4028441905975342, "eval_rewards/rejected": -5.124490261077881, "eval_runtime": 695.576, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 4200 }, { "epoch": 0.550920927797952, "grad_norm": 23.625, "learning_rate": 2.492004395329883e-06, "logits/chosen": -1.487158179283142, "logits/rejected": -1.3961637020111084, "logps/chosen": -692.4575805664062, "logps/rejected": -737.1976318359375, "loss": 0.5374, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8680713176727295, "rewards/margins": 1.2967933416366577, "rewards/rejected": -5.164864540100098, "step": 4210 }, { "epoch": 0.5522295285765695, "grad_norm": 15.1875, "learning_rate": 2.4805822650837165e-06, "logits/chosen": -1.6531293392181396, "logits/rejected": -1.5040441751480103, "logps/chosen": -594.5394287109375, "logps/rejected": -745.7686767578125, "loss": 0.5086, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.653715133666992, "rewards/margins": 1.2663328647613525, "rewards/rejected": -4.920048236846924, "step": 4220 }, { "epoch": 0.553538129355187, "grad_norm": 11.9375, "learning_rate": 2.4691605401834843e-06, "logits/chosen": -1.8644109964370728, "logits/rejected": -1.6383320093154907, "logps/chosen": -582.258544921875, "logps/rejected": -678.9598388671875, "loss": 0.463, "rewards/accuracies": 0.875, "rewards/chosen": -3.3841350078582764, "rewards/margins": 1.4149315357208252, "rewards/rejected": -4.799066066741943, "step": 4230 }, { "epoch": 0.5548467301338045, "grad_norm": 29.25, "learning_rate": 2.457739459058117e-06, "logits/chosen": -1.6518312692642212, "logits/rejected": -1.326192855834961, "logps/chosen": -544.1250610351562, "logps/rejected": -579.0157470703125, "loss": 0.4815, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3258392810821533, "rewards/margins": 0.9366198778152466, "rewards/rejected": -4.2624592781066895, "step": 4240 }, { "epoch": 0.5561553309124219, "grad_norm": 20.75, "learning_rate": 2.4463192601231054e-06, "logits/chosen": -1.670021653175354, "logits/rejected": -1.5531452894210815, "logps/chosen": -612.171630859375, "logps/rejected": -722.2947998046875, "loss": 0.607, "rewards/accuracies": 0.75, "rewards/chosen": -3.4436404705047607, "rewards/margins": 1.2497693300247192, "rewards/rejected": -4.6934099197387695, "step": 4250 }, { "epoch": 0.5574639316910394, "grad_norm": 13.625, "learning_rate": 2.434900181775524e-06, "logits/chosen": -1.6876760721206665, "logits/rejected": -1.5109765529632568, "logps/chosen": -559.9918212890625, "logps/rejected": -608.171630859375, "loss": 0.5446, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.261996030807495, "rewards/margins": 0.9899412989616394, "rewards/rejected": -4.251937389373779, "step": 4260 }, { "epoch": 0.5587725324696569, "grad_norm": 7.3125, "learning_rate": 2.4234824623890578e-06, "logits/chosen": -1.7460806369781494, "logits/rejected": -1.6814136505126953, "logps/chosen": -564.4183349609375, "logps/rejected": -675.5577392578125, "loss": 0.476, "rewards/accuracies": 0.75, "rewards/chosen": -2.8926351070404053, "rewards/margins": 1.5984983444213867, "rewards/rejected": -4.491133689880371, "step": 4270 }, { "epoch": 0.5600811332482742, "grad_norm": 6.875, "learning_rate": 2.4120663403090193e-06, "logits/chosen": -2.0293831825256348, "logits/rejected": -1.6706844568252563, "logps/chosen": -657.9515380859375, "logps/rejected": -707.3280639648438, "loss": 0.442, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.947570323944092, "rewards/margins": 1.4602850675582886, "rewards/rejected": -4.40785551071167, "step": 4280 }, { "epoch": 0.5613897340268917, "grad_norm": 13.5, "learning_rate": 2.40065205384738e-06, "logits/chosen": -1.8083298206329346, "logits/rejected": -1.5628607273101807, "logps/chosen": -556.0801391601562, "logps/rejected": -572.3630981445312, "loss": 0.4776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.708073854446411, "rewards/margins": 0.9693443179130554, "rewards/rejected": -3.677417755126953, "step": 4290 }, { "epoch": 0.5626983348055092, "grad_norm": 14.0, "learning_rate": 2.389239841277793e-06, "logits/chosen": -1.7907909154891968, "logits/rejected": -1.5201160907745361, "logps/chosen": -531.189208984375, "logps/rejected": -592.7239379882812, "loss": 0.5986, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.0696964263916016, "rewards/margins": 1.1197582483291626, "rewards/rejected": -4.189454555511475, "step": 4300 }, { "epoch": 0.5626983348055092, "eval_logits/chosen": -1.3167318105697632, "eval_logits/rejected": -1.1937830448150635, "eval_logps/chosen": -584.9331665039062, "eval_logps/rejected": -694.7586669921875, "eval_loss": 0.49142369627952576, "eval_rewards/accuracies": 0.7390000224113464, "eval_rewards/chosen": -3.1968846321105957, "eval_rewards/margins": 1.338729739189148, "eval_rewards/rejected": -4.535613536834717, "eval_runtime": 695.2938, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 4300 }, { "epoch": 0.5640069355841266, "grad_norm": 21.75, "learning_rate": 2.3778299408306167e-06, "logits/chosen": -1.8261682987213135, "logits/rejected": -1.5532481670379639, "logps/chosen": -577.5880737304688, "logps/rejected": -651.6314697265625, "loss": 0.5061, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2644424438476562, "rewards/margins": 1.2604084014892578, "rewards/rejected": -4.524850368499756, "step": 4310 }, { "epoch": 0.5653155363627441, "grad_norm": 22.0, "learning_rate": 2.3664225906879452e-06, "logits/chosen": -1.6243183612823486, "logits/rejected": -1.4362423419952393, "logps/chosen": -570.15771484375, "logps/rejected": -660.177734375, "loss": 0.4843, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.049072265625, "rewards/margins": 1.3088899850845337, "rewards/rejected": -4.357962131500244, "step": 4320 }, { "epoch": 0.5666241371413616, "grad_norm": 13.9375, "learning_rate": 2.3550180289786357e-06, "logits/chosen": -1.7643531560897827, "logits/rejected": -1.6266472339630127, "logps/chosen": -523.5103759765625, "logps/rejected": -625.3106079101562, "loss": 0.5294, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.1773743629455566, "rewards/margins": 1.2542011737823486, "rewards/rejected": -4.431574821472168, "step": 4330 }, { "epoch": 0.5679327379199791, "grad_norm": 17.125, "learning_rate": 2.343616493773335e-06, "logits/chosen": -1.5832384824752808, "logits/rejected": -1.3051575422286987, "logps/chosen": -629.3339233398438, "logps/rejected": -728.3126220703125, "loss": 0.5021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7843728065490723, "rewards/margins": 1.0797946453094482, "rewards/rejected": -4.8641676902771, "step": 4340 }, { "epoch": 0.5692413386985965, "grad_norm": 11.6875, "learning_rate": 2.3322182230795127e-06, "logits/chosen": -1.71127188205719, "logits/rejected": -1.5035130977630615, "logps/chosen": -550.8839721679688, "logps/rejected": -616.6287231445312, "loss": 0.5175, "rewards/accuracies": 0.75, "rewards/chosen": -3.1866097450256348, "rewards/margins": 1.0948160886764526, "rewards/rejected": -4.281426429748535, "step": 4350 }, { "epoch": 0.570549939477214, "grad_norm": 13.4375, "learning_rate": 2.320823454836491e-06, "logits/chosen": -2.023163318634033, "logits/rejected": -1.7942478656768799, "logps/chosen": -596.3504638671875, "logps/rejected": -618.583984375, "loss": 0.6033, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.1905455589294434, "rewards/margins": 0.5708644986152649, "rewards/rejected": -3.7614102363586426, "step": 4360 }, { "epoch": 0.5718585402558315, "grad_norm": 23.25, "learning_rate": 2.309432426910478e-06, "logits/chosen": -1.572442650794983, "logits/rejected": -1.5749504566192627, "logps/chosen": -555.4691162109375, "logps/rejected": -653.6903076171875, "loss": 0.4802, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.959886074066162, "rewards/margins": 1.118762493133545, "rewards/rejected": -4.078648567199707, "step": 4370 }, { "epoch": 0.5731671410344489, "grad_norm": 8.0, "learning_rate": 2.298045377089604e-06, "logits/chosen": -1.787316083908081, "logits/rejected": -1.680860161781311, "logps/chosen": -535.1393432617188, "logps/rejected": -632.7705078125, "loss": 0.4973, "rewards/accuracies": 0.75, "rewards/chosen": -2.633354425430298, "rewards/margins": 0.8643769025802612, "rewards/rejected": -3.4977314472198486, "step": 4380 }, { "epoch": 0.5744757418130664, "grad_norm": 10.875, "learning_rate": 2.286662543078955e-06, "logits/chosen": -1.8975722789764404, "logits/rejected": -1.9328582286834717, "logps/chosen": -502.79595947265625, "logps/rejected": -595.9024658203125, "loss": 0.5701, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.5636978149414062, "rewards/margins": 0.8220898509025574, "rewards/rejected": -3.3857874870300293, "step": 4390 }, { "epoch": 0.5757843425916839, "grad_norm": 8.4375, "learning_rate": 2.2752841624956125e-06, "logits/chosen": -1.7494583129882812, "logits/rejected": -1.577609658241272, "logps/chosen": -485.70831298828125, "logps/rejected": -626.1027221679688, "loss": 0.4241, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.372349500656128, "rewards/margins": 1.4121707677841187, "rewards/rejected": -3.784520387649536, "step": 4400 }, { "epoch": 0.5757843425916839, "eval_logits/chosen": -1.3677903413772583, "eval_logits/rejected": -1.2460012435913086, "eval_logps/chosen": -530.1074829101562, "eval_logps/rejected": -630.8524780273438, "eval_loss": 0.4893059730529785, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -2.648627281188965, "eval_rewards/margins": 1.2479248046875, "eval_rewards/rejected": -3.8965518474578857, "eval_runtime": 694.9943, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 4400 }, { "epoch": 0.5770929433703013, "grad_norm": 11.8125, "learning_rate": 2.2639104728636915e-06, "logits/chosen": -1.9307305812835693, "logits/rejected": -1.6584625244140625, "logps/chosen": -528.1661987304688, "logps/rejected": -679.614013671875, "loss": 0.4972, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.706604480743408, "rewards/margins": 1.2442266941070557, "rewards/rejected": -3.950831174850464, "step": 4410 }, { "epoch": 0.5784015441489188, "grad_norm": 5.65625, "learning_rate": 2.252541711609384e-06, "logits/chosen": -1.88007390499115, "logits/rejected": -1.5117027759552002, "logps/chosen": -592.3739624023438, "logps/rejected": -675.0255126953125, "loss": 0.5398, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0205793380737305, "rewards/margins": 1.242754340171814, "rewards/rejected": -4.263333797454834, "step": 4420 }, { "epoch": 0.5797101449275363, "grad_norm": 13.3125, "learning_rate": 2.241178116056002e-06, "logits/chosen": -1.350807547569275, "logits/rejected": -1.479506492614746, "logps/chosen": -555.6202392578125, "logps/rejected": -687.30419921875, "loss": 0.6461, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.246364116668701, "rewards/margins": 1.1261978149414062, "rewards/rejected": -4.372561931610107, "step": 4430 }, { "epoch": 0.5810187457061537, "grad_norm": 15.875, "learning_rate": 2.2298199234190236e-06, "logits/chosen": -1.7819865942001343, "logits/rejected": -1.5578216314315796, "logps/chosen": -680.64013671875, "logps/rejected": -759.0550537109375, "loss": 0.5877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.4645066261291504, "rewards/margins": 1.302476406097412, "rewards/rejected": -4.7669830322265625, "step": 4440 }, { "epoch": 0.5823273464847711, "grad_norm": 19.75, "learning_rate": 2.218467370801138e-06, "logits/chosen": -1.7171590328216553, "logits/rejected": -1.7118908166885376, "logps/chosen": -618.0673828125, "logps/rejected": -738.876220703125, "loss": 0.3884, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9533591270446777, "rewards/margins": 1.4566929340362549, "rewards/rejected": -4.410052299499512, "step": 4450 }, { "epoch": 0.5836359472633886, "grad_norm": 10.5625, "learning_rate": 2.207120695187304e-06, "logits/chosen": -1.5354106426239014, "logits/rejected": -1.478308081626892, "logps/chosen": -538.1929931640625, "logps/rejected": -668.1854248046875, "loss": 0.5793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.144106388092041, "rewards/margins": 1.2017936706542969, "rewards/rejected": -4.345900535583496, "step": 4460 }, { "epoch": 0.5849445480420061, "grad_norm": 19.875, "learning_rate": 2.195780133439794e-06, "logits/chosen": -1.663332223892212, "logits/rejected": -1.4547230005264282, "logps/chosen": -605.9708251953125, "logps/rejected": -764.2218017578125, "loss": 0.4158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2851853370666504, "rewards/margins": 1.4955469369888306, "rewards/rejected": -4.78073263168335, "step": 4470 }, { "epoch": 0.5862531488206235, "grad_norm": 22.875, "learning_rate": 2.1844459222932535e-06, "logits/chosen": -1.8332163095474243, "logits/rejected": -1.743154764175415, "logps/chosen": -580.7257080078125, "logps/rejected": -687.658203125, "loss": 0.6053, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.903289556503296, "rewards/margins": 1.2255370616912842, "rewards/rejected": -4.128827095031738, "step": 4480 }, { "epoch": 0.587561749599241, "grad_norm": 21.125, "learning_rate": 2.17311829834976e-06, "logits/chosen": -1.745027780532837, "logits/rejected": -1.6781517267227173, "logps/chosen": -497.5025939941406, "logps/rejected": -681.0847778320312, "loss": 0.4242, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5435328483581543, "rewards/margins": 1.3284950256347656, "rewards/rejected": -3.872027635574341, "step": 4490 }, { "epoch": 0.5888703503778585, "grad_norm": 3.671875, "learning_rate": 2.1617974980738814e-06, "logits/chosen": -1.915763258934021, "logits/rejected": -1.815779447555542, "logps/chosen": -543.8959350585938, "logps/rejected": -665.0628662109375, "loss": 0.3475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3466744422912598, "rewards/margins": 1.6810474395751953, "rewards/rejected": -4.027721881866455, "step": 4500 }, { "epoch": 0.5888703503778585, "eval_logits/chosen": -1.351818323135376, "eval_logits/rejected": -1.2300559282302856, "eval_logps/chosen": -547.5814208984375, "eval_logps/rejected": -654.3662109375, "eval_loss": 0.48772692680358887, "eval_rewards/accuracies": 0.7570000290870667, "eval_rewards/chosen": -2.82336688041687, "eval_rewards/margins": 1.3083220720291138, "eval_rewards/rejected": -4.131689071655273, "eval_runtime": 695.9918, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 4500 }, { "epoch": 0.5901789511564759, "grad_norm": 15.3125, "learning_rate": 2.150483757787744e-06, "logits/chosen": -1.6354339122772217, "logits/rejected": -1.486480712890625, "logps/chosen": -575.2947387695312, "logps/rejected": -722.3814697265625, "loss": 0.6281, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.221775531768799, "rewards/margins": 1.5928491353988647, "rewards/rejected": -4.814624786376953, "step": 4510 }, { "epoch": 0.5914875519350934, "grad_norm": 16.875, "learning_rate": 2.139177313666093e-06, "logits/chosen": -1.8869001865386963, "logits/rejected": -1.5815259218215942, "logps/chosen": -629.1190795898438, "logps/rejected": -683.7518310546875, "loss": 0.546, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9516806602478027, "rewards/margins": 1.276896595954895, "rewards/rejected": -4.228577136993408, "step": 4520 }, { "epoch": 0.5927961527137109, "grad_norm": 7.40625, "learning_rate": 2.1278784017313688e-06, "logits/chosen": -1.6435188055038452, "logits/rejected": -1.5333058834075928, "logps/chosen": -575.8089599609375, "logps/rejected": -652.0985107421875, "loss": 0.4349, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.914458990097046, "rewards/margins": 1.2834089994430542, "rewards/rejected": -4.1978678703308105, "step": 4530 }, { "epoch": 0.5941047534923283, "grad_norm": 15.1875, "learning_rate": 2.116587257848776e-06, "logits/chosen": -1.7964210510253906, "logits/rejected": -1.6885019540786743, "logps/chosen": -540.5385131835938, "logps/rejected": -650.5846557617188, "loss": 0.5475, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8466668128967285, "rewards/margins": 1.1782000064849854, "rewards/rejected": -4.024867057800293, "step": 4540 }, { "epoch": 0.5954133542709458, "grad_norm": 13.0625, "learning_rate": 2.105304117721361e-06, "logits/chosen": -1.7755730152130127, "logits/rejected": -1.5230886936187744, "logps/chosen": -551.8937377929688, "logps/rejected": -644.3948364257812, "loss": 0.4425, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.6994407176971436, "rewards/margins": 1.225833535194397, "rewards/rejected": -3.92527437210083, "step": 4550 }, { "epoch": 0.5967219550495633, "grad_norm": 14.6875, "learning_rate": 2.0940292168850913e-06, "logits/chosen": -1.752661943435669, "logits/rejected": -1.4747867584228516, "logps/chosen": -502.49395751953125, "logps/rejected": -580.3674926757812, "loss": 0.4879, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.4881184101104736, "rewards/margins": 1.4662063121795654, "rewards/rejected": -3.954324722290039, "step": 4560 }, { "epoch": 0.5980305558281808, "grad_norm": 9.1875, "learning_rate": 2.082762790703939e-06, "logits/chosen": -1.8049396276474, "logits/rejected": -1.708228349685669, "logps/chosen": -524.9848022460938, "logps/rejected": -601.9075317382812, "loss": 0.5125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.6248769760131836, "rewards/margins": 1.0212444067001343, "rewards/rejected": -3.6461215019226074, "step": 4570 }, { "epoch": 0.5993391566067982, "grad_norm": 7.375, "learning_rate": 2.0715050743649674e-06, "logits/chosen": -1.4968044757843018, "logits/rejected": -1.373840570449829, "logps/chosen": -511.71844482421875, "logps/rejected": -639.6237182617188, "loss": 0.439, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8505821228027344, "rewards/margins": 1.574336051940918, "rewards/rejected": -4.424918174743652, "step": 4580 }, { "epoch": 0.6006477573854156, "grad_norm": 14.125, "learning_rate": 2.060256302873421e-06, "logits/chosen": -1.8210341930389404, "logits/rejected": -1.4503523111343384, "logps/chosen": -627.7799072265625, "logps/rejected": -688.2463989257812, "loss": 0.4644, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3093161582946777, "rewards/margins": 1.2572965621948242, "rewards/rejected": -4.56661319732666, "step": 4590 }, { "epoch": 0.6019563581640331, "grad_norm": 18.25, "learning_rate": 2.049016711047822e-06, "logits/chosen": -1.6711362600326538, "logits/rejected": -1.4015945196151733, "logps/chosen": -660.1317138671875, "logps/rejected": -747.46142578125, "loss": 0.4631, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6962978839874268, "rewards/margins": 1.177126169204712, "rewards/rejected": -4.873424053192139, "step": 4600 }, { "epoch": 0.6019563581640331, "eval_logits/chosen": -1.2733272314071655, "eval_logits/rejected": -1.1497946977615356, "eval_logps/chosen": -604.7739868164062, "eval_logps/rejected": -737.2854614257812, "eval_loss": 0.49890589714050293, "eval_rewards/accuracies": 0.7559999823570251, "eval_rewards/chosen": -3.395291566848755, "eval_rewards/margins": 1.5655897855758667, "eval_rewards/rejected": -4.960881233215332, "eval_runtime": 695.3911, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 4600 }, { "epoch": 0.6032649589426505, "grad_norm": 16.125, "learning_rate": 2.037786533515064e-06, "logits/chosen": -1.273638367652893, "logits/rejected": -1.1278702020645142, "logps/chosen": -507.4588317871094, "logps/rejected": -598.3023681640625, "loss": 0.73, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4255547523498535, "rewards/margins": 1.1317102909088135, "rewards/rejected": -4.557264804840088, "step": 4610 }, { "epoch": 0.604573559721268, "grad_norm": 16.875, "learning_rate": 2.02656600470552e-06, "logits/chosen": -1.5374267101287842, "logits/rejected": -1.3159191608428955, "logps/chosen": -571.5723876953125, "logps/rejected": -659.8568725585938, "loss": 0.4941, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.450957775115967, "rewards/margins": 1.3765472173690796, "rewards/rejected": -4.827505111694336, "step": 4620 }, { "epoch": 0.6058821604998855, "grad_norm": 15.4375, "learning_rate": 2.015355358848144e-06, "logits/chosen": -1.4822932481765747, "logits/rejected": -1.222393274307251, "logps/chosen": -557.7679443359375, "logps/rejected": -647.2407836914062, "loss": 0.4325, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.387486696243286, "rewards/margins": 1.3585991859436035, "rewards/rejected": -4.7460856437683105, "step": 4630 }, { "epoch": 0.6071907612785029, "grad_norm": 16.5, "learning_rate": 2.004154829965582e-06, "logits/chosen": -1.7486101388931274, "logits/rejected": -1.5975501537322998, "logps/chosen": -610.554443359375, "logps/rejected": -746.5077514648438, "loss": 0.5777, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.491931915283203, "rewards/margins": 1.1562888622283936, "rewards/rejected": -4.648220539093018, "step": 4640 }, { "epoch": 0.6084993620571204, "grad_norm": 29.625, "learning_rate": 1.99296465186929e-06, "logits/chosen": -1.681537389755249, "logits/rejected": -1.7798118591308594, "logps/chosen": -613.89404296875, "logps/rejected": -705.7528076171875, "loss": 0.6322, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.4162821769714355, "rewards/margins": 0.6566082239151001, "rewards/rejected": -4.072890281677246, "step": 4650 }, { "epoch": 0.6098079628357379, "grad_norm": 26.25, "learning_rate": 1.9817850581546488e-06, "logits/chosen": -1.6236194372177124, "logits/rejected": -1.3453150987625122, "logps/chosen": -602.2049560546875, "logps/rejected": -722.5325927734375, "loss": 0.4565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0354931354522705, "rewards/margins": 1.7028968334197998, "rewards/rejected": -4.7383904457092285, "step": 4660 }, { "epoch": 0.6111165636143554, "grad_norm": 31.25, "learning_rate": 1.970616282196091e-06, "logits/chosen": -1.6052782535552979, "logits/rejected": -1.470177173614502, "logps/chosen": -576.8943481445312, "logps/rejected": -730.1336669921875, "loss": 0.5153, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3035781383514404, "rewards/margins": 1.821986198425293, "rewards/rejected": -5.125565052032471, "step": 4670 }, { "epoch": 0.6124251643929728, "grad_norm": 18.5, "learning_rate": 1.959458557142228e-06, "logits/chosen": -1.525893211364746, "logits/rejected": -1.3604334592819214, "logps/chosen": -582.3936767578125, "logps/rejected": -689.6436767578125, "loss": 0.5331, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4725308418273926, "rewards/margins": 1.2655709981918335, "rewards/rejected": -4.738101959228516, "step": 4680 }, { "epoch": 0.6137337651715903, "grad_norm": 12.4375, "learning_rate": 1.948312115910982e-06, "logits/chosen": -1.654853105545044, "logits/rejected": -1.584294080734253, "logps/chosen": -558.103271484375, "logps/rejected": -751.0806884765625, "loss": 0.4058, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1182861328125, "rewards/margins": 1.7219947576522827, "rewards/rejected": -4.840281009674072, "step": 4690 }, { "epoch": 0.6150423659502078, "grad_norm": 10.875, "learning_rate": 1.937177191184729e-06, "logits/chosen": -1.7909324169158936, "logits/rejected": -1.4596047401428223, "logps/chosen": -571.0664672851562, "logps/rejected": -652.2332763671875, "loss": 0.3956, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9365692138671875, "rewards/margins": 1.5999305248260498, "rewards/rejected": -4.536499500274658, "step": 4700 }, { "epoch": 0.6150423659502078, "eval_logits/chosen": -1.2820336818695068, "eval_logits/rejected": -1.1607590913772583, "eval_logps/chosen": -587.1261596679688, "eval_logps/rejected": -702.0703735351562, "eval_loss": 0.4925045073032379, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.2188150882720947, "eval_rewards/margins": 1.3899158239364624, "eval_rewards/rejected": -4.608730792999268, "eval_runtime": 696.4686, "eval_samples_per_second": 2.872, "eval_steps_per_second": 0.179, "step": 4700 }, { "epoch": 0.6163509667288252, "grad_norm": 19.0, "learning_rate": 1.9260540154054317e-06, "logits/chosen": -1.5857264995574951, "logits/rejected": -1.4326543807983398, "logps/chosen": -577.0807495117188, "logps/rejected": -690.3370361328125, "loss": 0.5408, "rewards/accuracies": 0.75, "rewards/chosen": -3.4582877159118652, "rewards/margins": 1.2375998497009277, "rewards/rejected": -4.695887565612793, "step": 4710 }, { "epoch": 0.6176595675074427, "grad_norm": 30.625, "learning_rate": 1.9149428207697983e-06, "logits/chosen": -1.7012113332748413, "logits/rejected": -1.5527856349945068, "logps/chosen": -561.499267578125, "logps/rejected": -725.7141723632812, "loss": 0.4795, "rewards/accuracies": 0.625, "rewards/chosen": -3.2154877185821533, "rewards/margins": 1.4246195554733276, "rewards/rejected": -4.640107154846191, "step": 4720 }, { "epoch": 0.6189681682860602, "grad_norm": 16.125, "learning_rate": 1.9038438392244262e-06, "logits/chosen": -1.632440209388733, "logits/rejected": -1.5574052333831787, "logps/chosen": -590.62890625, "logps/rejected": -683.996337890625, "loss": 0.44, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3319156169891357, "rewards/margins": 1.0429545640945435, "rewards/rejected": -4.374870300292969, "step": 4730 }, { "epoch": 0.6202767690646775, "grad_norm": 22.125, "learning_rate": 1.8927573024609666e-06, "logits/chosen": -1.6485340595245361, "logits/rejected": -1.4890973567962646, "logps/chosen": -578.7037353515625, "logps/rejected": -786.7863159179688, "loss": 0.4042, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.1073899269104004, "rewards/margins": 2.0495364665985107, "rewards/rejected": -5.156926155090332, "step": 4740 }, { "epoch": 0.621585369843295, "grad_norm": 19.25, "learning_rate": 1.8816834419112845e-06, "logits/chosen": -1.526113510131836, "logits/rejected": -1.4162414073944092, "logps/chosen": -564.99560546875, "logps/rejected": -668.0380249023438, "loss": 0.5056, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.333219528198242, "rewards/margins": 1.2065274715423584, "rewards/rejected": -4.539747714996338, "step": 4750 }, { "epoch": 0.6228939706219125, "grad_norm": 25.0, "learning_rate": 1.8706224887426283e-06, "logits/chosen": -1.3279767036437988, "logits/rejected": -1.2112728357315063, "logps/chosen": -612.1134643554688, "logps/rejected": -748.1138916015625, "loss": 0.3993, "rewards/accuracies": 0.875, "rewards/chosen": -3.645991563796997, "rewards/margins": 1.6721620559692383, "rewards/rejected": -5.3181538581848145, "step": 4760 }, { "epoch": 0.62420257140053, "grad_norm": 15.6875, "learning_rate": 1.8595746738528045e-06, "logits/chosen": -1.5533462762832642, "logits/rejected": -1.4076683521270752, "logps/chosen": -583.8905029296875, "logps/rejected": -688.8970947265625, "loss": 0.4693, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5534520149230957, "rewards/margins": 1.2550474405288696, "rewards/rejected": -4.808499336242676, "step": 4770 }, { "epoch": 0.6255111721791474, "grad_norm": 6.375, "learning_rate": 1.8485402278653584e-06, "logits/chosen": -1.8111467361450195, "logits/rejected": -1.6955211162567139, "logps/chosen": -603.23095703125, "logps/rejected": -721.2127075195312, "loss": 0.5398, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.449007511138916, "rewards/margins": 1.125487208366394, "rewards/rejected": -4.574494361877441, "step": 4780 }, { "epoch": 0.6268197729577649, "grad_norm": 16.125, "learning_rate": 1.8375193811247577e-06, "logits/chosen": -1.5205495357513428, "logits/rejected": -1.3816944360733032, "logps/chosen": -627.7484741210938, "logps/rejected": -714.5665283203125, "loss": 0.483, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5751616954803467, "rewards/margins": 0.9349249005317688, "rewards/rejected": -4.510087013244629, "step": 4790 }, { "epoch": 0.6281283737363824, "grad_norm": 9.625, "learning_rate": 1.826512363691586e-06, "logits/chosen": -1.6378263235092163, "logits/rejected": -1.6450719833374023, "logps/chosen": -599.9519653320312, "logps/rejected": -750.4071044921875, "loss": 0.4484, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3282954692840576, "rewards/margins": 1.3810827732086182, "rewards/rejected": -4.709378242492676, "step": 4800 }, { "epoch": 0.6281283737363824, "eval_logits/chosen": -1.2532418966293335, "eval_logits/rejected": -1.131803035736084, "eval_logps/chosen": -612.3438720703125, "eval_logps/rejected": -734.9601440429688, "eval_loss": 0.49141788482666016, "eval_rewards/accuracies": 0.7559999823570251, "eval_rewards/chosen": -3.4709901809692383, "eval_rewards/margins": 1.4666383266448975, "eval_rewards/rejected": -4.937628746032715, "eval_runtime": 695.8117, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 4800 }, { "epoch": 0.6294369745149998, "grad_norm": 22.0, "learning_rate": 1.8155194053377391e-06, "logits/chosen": -1.79046630859375, "logits/rejected": -1.6563514471054077, "logps/chosen": -635.6248779296875, "logps/rejected": -729.33642578125, "loss": 0.4964, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.6141040325164795, "rewards/margins": 1.248853325843811, "rewards/rejected": -4.86295747756958, "step": 4810 }, { "epoch": 0.6307455752936173, "grad_norm": 18.625, "learning_rate": 1.80454073554163e-06, "logits/chosen": -1.6785757541656494, "logits/rejected": -1.6213468313217163, "logps/chosen": -596.3038330078125, "logps/rejected": -680.255615234375, "loss": 0.6623, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.746147871017456, "rewards/margins": 1.035150170326233, "rewards/rejected": -4.7812981605529785, "step": 4820 }, { "epoch": 0.6320541760722348, "grad_norm": 14.25, "learning_rate": 1.7935765834833966e-06, "logits/chosen": -1.788050889968872, "logits/rejected": -1.4914196729660034, "logps/chosen": -618.23388671875, "logps/rejected": -686.4832153320312, "loss": 0.4351, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.500692844390869, "rewards/margins": 1.209568738937378, "rewards/rejected": -4.710261344909668, "step": 4830 }, { "epoch": 0.6333627768508522, "grad_norm": 24.75, "learning_rate": 1.7826271780401182e-06, "logits/chosen": -1.8286269903182983, "logits/rejected": -1.6340404748916626, "logps/chosen": -532.5169067382812, "logps/rejected": -680.1775512695312, "loss": 0.4989, "rewards/accuracies": 0.75, "rewards/chosen": -2.7815299034118652, "rewards/margins": 1.7420690059661865, "rewards/rejected": -4.523598670959473, "step": 4840 }, { "epoch": 0.6346713776294697, "grad_norm": 7.03125, "learning_rate": 1.7716927477810389e-06, "logits/chosen": -1.6669690608978271, "logits/rejected": -1.5948641300201416, "logps/chosen": -529.2412719726562, "logps/rejected": -654.555908203125, "loss": 0.4122, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.961249828338623, "rewards/margins": 1.6141977310180664, "rewards/rejected": -4.5754475593566895, "step": 4850 }, { "epoch": 0.6359799784080872, "grad_norm": 24.375, "learning_rate": 1.7607735209627953e-06, "logits/chosen": -1.645970106124878, "logits/rejected": -1.5813357830047607, "logps/chosen": -581.43359375, "logps/rejected": -725.5485229492188, "loss": 0.5837, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.132516384124756, "rewards/margins": 1.2668392658233643, "rewards/rejected": -4.399355888366699, "step": 4860 }, { "epoch": 0.6372885791867046, "grad_norm": 30.25, "learning_rate": 1.749869725524651e-06, "logits/chosen": -1.701202154159546, "logits/rejected": -1.5059651136398315, "logps/chosen": -615.5177612304688, "logps/rejected": -684.534912109375, "loss": 0.5163, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.5624091625213623, "rewards/margins": 1.2456378936767578, "rewards/rejected": -4.808046817779541, "step": 4870 }, { "epoch": 0.6385971799653221, "grad_norm": 13.625, "learning_rate": 1.7389815890837392e-06, "logits/chosen": -1.9109199047088623, "logits/rejected": -1.9050347805023193, "logps/chosen": -583.5333251953125, "logps/rejected": -674.4069213867188, "loss": 0.5455, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.326005220413208, "rewards/margins": 0.776532769203186, "rewards/rejected": -4.102538108825684, "step": 4880 }, { "epoch": 0.6399057807439396, "grad_norm": 16.625, "learning_rate": 1.7281093389303105e-06, "logits/chosen": -1.7302379608154297, "logits/rejected": -1.4888559579849243, "logps/chosen": -588.8234252929688, "logps/rejected": -723.6757202148438, "loss": 0.4889, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1230404376983643, "rewards/margins": 1.5676895380020142, "rewards/rejected": -4.690730094909668, "step": 4890 }, { "epoch": 0.641214381522557, "grad_norm": 16.875, "learning_rate": 1.7172532020229899e-06, "logits/chosen": -1.7761768102645874, "logits/rejected": -1.551513910293579, "logps/chosen": -625.2261962890625, "logps/rejected": -702.8818359375, "loss": 0.5111, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3841280937194824, "rewards/margins": 1.1456187963485718, "rewards/rejected": -4.529747486114502, "step": 4900 }, { "epoch": 0.641214381522557, "eval_logits/chosen": -1.3263459205627441, "eval_logits/rejected": -1.204804539680481, "eval_logps/chosen": -576.8660888671875, "eval_logps/rejected": -684.3721923828125, "eval_loss": 0.48334625363349915, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -3.1162123680114746, "eval_rewards/margins": 1.3155369758605957, "eval_rewards/rejected": -4.43174934387207, "eval_runtime": 696.2822, "eval_samples_per_second": 2.872, "eval_steps_per_second": 0.18, "step": 4900 }, { "epoch": 0.6425229823011744, "grad_norm": 18.0, "learning_rate": 1.7064134049840359e-06, "logits/chosen": -1.671931505203247, "logits/rejected": -1.4198347330093384, "logps/chosen": -612.1527099609375, "logps/rejected": -650.12353515625, "loss": 0.429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.1876776218414307, "rewards/margins": 1.2654125690460205, "rewards/rejected": -4.453090190887451, "step": 4910 }, { "epoch": 0.6438315830797919, "grad_norm": 26.5, "learning_rate": 1.6955901740946136e-06, "logits/chosen": -2.0300331115722656, "logits/rejected": -1.8272968530654907, "logps/chosen": -580.4107055664062, "logps/rejected": -663.0615234375, "loss": 0.4521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.977374315261841, "rewards/margins": 1.0503928661346436, "rewards/rejected": -4.027767181396484, "step": 4920 }, { "epoch": 0.6451401838584094, "grad_norm": 17.875, "learning_rate": 1.684783735290067e-06, "logits/chosen": -1.9269481897354126, "logits/rejected": -1.8277524709701538, "logps/chosen": -549.035400390625, "logps/rejected": -684.0709838867188, "loss": 0.5127, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.201995372772217, "rewards/margins": 1.278713345527649, "rewards/rejected": -4.480708122253418, "step": 4930 }, { "epoch": 0.6464487846370268, "grad_norm": 6.28125, "learning_rate": 1.6739943141552079e-06, "logits/chosen": -1.740638017654419, "logits/rejected": -1.7198823690414429, "logps/chosen": -577.2246704101562, "logps/rejected": -688.5287475585938, "loss": 0.5422, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.2243359088897705, "rewards/margins": 0.8095100522041321, "rewards/rejected": -4.033845901489258, "step": 4940 }, { "epoch": 0.6477573854156443, "grad_norm": 10.125, "learning_rate": 1.663222135919601e-06, "logits/chosen": -1.5320097208023071, "logits/rejected": -1.3662582635879517, "logps/chosen": -613.9178466796875, "logps/rejected": -705.2393188476562, "loss": 0.4615, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.754957675933838, "rewards/margins": 1.2337870597839355, "rewards/rejected": -4.988745212554932, "step": 4950 }, { "epoch": 0.6490659861942618, "grad_norm": 19.5, "learning_rate": 1.652467425452865e-06, "logits/chosen": -1.513616681098938, "logits/rejected": -1.3814283609390259, "logps/chosen": -588.7526245117188, "logps/rejected": -703.374267578125, "loss": 0.401, "rewards/accuracies": 0.75, "rewards/chosen": -3.1592321395874023, "rewards/margins": 1.4594871997833252, "rewards/rejected": -4.618719577789307, "step": 4960 }, { "epoch": 0.6503745869728792, "grad_norm": 10.625, "learning_rate": 1.6417304072599787e-06, "logits/chosen": -1.6183624267578125, "logits/rejected": -1.6989692449569702, "logps/chosen": -514.5684814453125, "logps/rejected": -631.8204345703125, "loss": 0.4676, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0991170406341553, "rewards/margins": 1.2609256505966187, "rewards/rejected": -4.360043048858643, "step": 4970 }, { "epoch": 0.6516831877514967, "grad_norm": 22.0, "learning_rate": 1.6310113054765947e-06, "logits/chosen": -1.5698142051696777, "logits/rejected": -1.549896478652954, "logps/chosen": -570.1005249023438, "logps/rejected": -667.5099487304688, "loss": 0.4385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3367397785186768, "rewards/margins": 1.2216688394546509, "rewards/rejected": -4.558408737182617, "step": 4980 }, { "epoch": 0.6529917885301142, "grad_norm": 13.5, "learning_rate": 1.6203103438643591e-06, "logits/chosen": -1.6289364099502563, "logits/rejected": -1.3671115636825562, "logps/chosen": -541.049560546875, "logps/rejected": -697.2430419921875, "loss": 0.5172, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.991550922393799, "rewards/margins": 1.685948133468628, "rewards/rejected": -4.677499294281006, "step": 4990 }, { "epoch": 0.6543003893087317, "grad_norm": 16.75, "learning_rate": 1.6096277458062417e-06, "logits/chosen": -1.8213369846343994, "logits/rejected": -1.501551866531372, "logps/chosen": -627.9816284179688, "logps/rejected": -722.3287353515625, "loss": 0.5135, "rewards/accuracies": 0.75, "rewards/chosen": -3.457343578338623, "rewards/margins": 1.4982343912124634, "rewards/rejected": -4.955577373504639, "step": 5000 }, { "epoch": 0.6543003893087317, "eval_logits/chosen": -1.313575029373169, "eval_logits/rejected": -1.1914212703704834, "eval_logps/chosen": -588.531005859375, "eval_logps/rejected": -703.1549072265625, "eval_loss": 0.4861777424812317, "eval_rewards/accuracies": 0.7519999742507935, "eval_rewards/chosen": -3.2328617572784424, "eval_rewards/margins": 1.3867149353027344, "eval_rewards/rejected": -4.619576930999756, "eval_runtime": 694.6996, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 5000 }, { "epoch": 0.6556089900873491, "grad_norm": 6.84375, "learning_rate": 1.5989637343018705e-06, "logits/chosen": -2.044959306716919, "logits/rejected": -1.7893301248550415, "logps/chosen": -613.00146484375, "logps/rejected": -706.8289794921875, "loss": 0.5081, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9927618503570557, "rewards/margins": 1.3731175661087036, "rewards/rejected": -4.365879535675049, "step": 5010 }, { "epoch": 0.6569175908659666, "grad_norm": 14.6875, "learning_rate": 1.5883185319628824e-06, "logits/chosen": -1.6845659017562866, "logits/rejected": -1.5657196044921875, "logps/chosen": -532.01123046875, "logps/rejected": -637.2589721679688, "loss": 0.3945, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.079472303390503, "rewards/margins": 1.3351237773895264, "rewards/rejected": -4.414595603942871, "step": 5020 }, { "epoch": 0.6582261916445841, "grad_norm": 11.375, "learning_rate": 1.5776923610082695e-06, "logits/chosen": -1.861328125, "logits/rejected": -1.5965458154678345, "logps/chosen": -613.5578002929688, "logps/rejected": -704.1434936523438, "loss": 0.4431, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.045461893081665, "rewards/margins": 1.7077134847640991, "rewards/rejected": -4.753175258636475, "step": 5030 }, { "epoch": 0.6595347924232015, "grad_norm": 14.0, "learning_rate": 1.5670854432597433e-06, "logits/chosen": -1.7268893718719482, "logits/rejected": -1.3195490837097168, "logps/chosen": -623.6607055664062, "logps/rejected": -706.3313598632812, "loss": 0.4694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.310670852661133, "rewards/margins": 1.7838218212127686, "rewards/rejected": -5.0944929122924805, "step": 5040 }, { "epoch": 0.660843393201819, "grad_norm": 28.875, "learning_rate": 1.556498000137104e-06, "logits/chosen": -1.6131718158721924, "logits/rejected": -1.47737717628479, "logps/chosen": -590.0049438476562, "logps/rejected": -719.5170288085938, "loss": 0.4322, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.512928009033203, "rewards/margins": 1.1317293643951416, "rewards/rejected": -4.644658088684082, "step": 5050 }, { "epoch": 0.6621519939804364, "grad_norm": 7.4375, "learning_rate": 1.5459302526536188e-06, "logits/chosen": -1.513589859008789, "logits/rejected": -1.3736741542816162, "logps/chosen": -640.7074584960938, "logps/rejected": -860.7649536132812, "loss": 0.4291, "rewards/accuracies": 0.875, "rewards/chosen": -3.9044322967529297, "rewards/margins": 2.1638665199279785, "rewards/rejected": -6.06829833984375, "step": 5060 }, { "epoch": 0.6634605947590538, "grad_norm": 27.0, "learning_rate": 1.5353824214114075e-06, "logits/chosen": -1.4623088836669922, "logits/rejected": -1.0714373588562012, "logps/chosen": -631.7174682617188, "logps/rejected": -750.935302734375, "loss": 0.419, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.743971347808838, "rewards/margins": 1.6032378673553467, "rewards/rejected": -5.3472089767456055, "step": 5070 }, { "epoch": 0.6647691955376713, "grad_norm": 19.0, "learning_rate": 1.5248547265968373e-06, "logits/chosen": -1.5414018630981445, "logits/rejected": -1.4777626991271973, "logps/chosen": -657.7684326171875, "logps/rejected": -743.1038818359375, "loss": 0.4408, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.8901729583740234, "rewards/margins": 0.9363168478012085, "rewards/rejected": -4.8264899253845215, "step": 5080 }, { "epoch": 0.6660777963162888, "grad_norm": 8.9375, "learning_rate": 1.5143473879759265e-06, "logits/chosen": -1.6996246576309204, "logits/rejected": -1.6308828592300415, "logps/chosen": -611.0842895507812, "logps/rejected": -780.1282348632812, "loss": 0.4484, "rewards/accuracies": 0.75, "rewards/chosen": -3.6226696968078613, "rewards/margins": 1.625772476196289, "rewards/rejected": -5.24844217300415, "step": 5090 }, { "epoch": 0.6673863970949063, "grad_norm": 18.75, "learning_rate": 1.5038606248897586e-06, "logits/chosen": -1.5328372716903687, "logits/rejected": -1.398322343826294, "logps/chosen": -608.200439453125, "logps/rejected": -683.1607055664062, "loss": 0.576, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.526782989501953, "rewards/margins": 1.088318943977356, "rewards/rejected": -4.6151018142700195, "step": 5100 }, { "epoch": 0.6673863970949063, "eval_logits/chosen": -1.3046430349349976, "eval_logits/rejected": -1.1833548545837402, "eval_logps/chosen": -597.7152709960938, "eval_logps/rejected": -727.6852416992188, "eval_loss": 0.495951771736145, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.324704647064209, "eval_rewards/margins": 1.5401750802993774, "eval_rewards/rejected": -4.864879608154297, "eval_runtime": 695.6963, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 5100 }, { "epoch": 0.6686949978735237, "grad_norm": 22.375, "learning_rate": 1.4933946562499008e-06, "logits/chosen": -1.5024693012237549, "logits/rejected": -1.442991018295288, "logps/chosen": -608.5447387695312, "logps/rejected": -671.8112182617188, "loss": 0.3979, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.543973445892334, "rewards/margins": 1.2633062601089478, "rewards/rejected": -4.807279586791992, "step": 5110 }, { "epoch": 0.6700035986521412, "grad_norm": 17.375, "learning_rate": 1.482949700533835e-06, "logits/chosen": -1.4761431217193604, "logits/rejected": -1.3101270198822021, "logps/chosen": -548.8846435546875, "logps/rejected": -643.3927001953125, "loss": 0.5089, "rewards/accuracies": 0.75, "rewards/chosen": -3.232585906982422, "rewards/margins": 1.3561674356460571, "rewards/rejected": -4.588753700256348, "step": 5120 }, { "epoch": 0.6713121994307587, "grad_norm": 21.5, "learning_rate": 1.4725259757803983e-06, "logits/chosen": -1.6628373861312866, "logits/rejected": -1.4277468919754028, "logps/chosen": -662.7272338867188, "logps/rejected": -785.9891357421875, "loss": 0.6543, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.638927936553955, "rewards/margins": 1.4085365533828735, "rewards/rejected": -5.047464847564697, "step": 5130 }, { "epoch": 0.6726208002093761, "grad_norm": 35.75, "learning_rate": 1.4621236995852314e-06, "logits/chosen": -1.6387958526611328, "logits/rejected": -1.2643970251083374, "logps/chosen": -576.0126953125, "logps/rejected": -660.87841796875, "loss": 0.6377, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.4027156829833984, "rewards/margins": 1.4311861991882324, "rewards/rejected": -4.833901882171631, "step": 5140 }, { "epoch": 0.6739294009879936, "grad_norm": 6.375, "learning_rate": 1.4517430890962337e-06, "logits/chosen": -1.7587352991104126, "logits/rejected": -1.5688607692718506, "logps/chosen": -515.070556640625, "logps/rejected": -676.8639526367188, "loss": 0.4257, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.745851993560791, "rewards/margins": 1.6595748662948608, "rewards/rejected": -4.405426979064941, "step": 5150 }, { "epoch": 0.6752380017666111, "grad_norm": 7.28125, "learning_rate": 1.4413843610090342e-06, "logits/chosen": -1.7208397388458252, "logits/rejected": -1.6680753231048584, "logps/chosen": -564.4064331054688, "logps/rejected": -698.8317260742188, "loss": 0.4942, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.28273344039917, "rewards/margins": 1.1407921314239502, "rewards/rejected": -4.423525333404541, "step": 5160 }, { "epoch": 0.6765466025452285, "grad_norm": 30.25, "learning_rate": 1.4310477315624637e-06, "logits/chosen": -1.729223608970642, "logits/rejected": -1.4785102605819702, "logps/chosen": -567.8930053710938, "logps/rejected": -683.0460815429688, "loss": 0.3723, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.998805522918701, "rewards/margins": 1.6298551559448242, "rewards/rejected": -4.628660678863525, "step": 5170 }, { "epoch": 0.677855203323846, "grad_norm": 23.0, "learning_rate": 1.420733416534045e-06, "logits/chosen": -1.7175891399383545, "logits/rejected": -1.6097275018692017, "logps/chosen": -563.64794921875, "logps/rejected": -676.5087280273438, "loss": 0.5056, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1611363887786865, "rewards/margins": 1.2047687768936157, "rewards/rejected": -4.365904808044434, "step": 5180 }, { "epoch": 0.6791638041024635, "grad_norm": 24.5, "learning_rate": 1.410441631235487e-06, "logits/chosen": -1.4644991159439087, "logits/rejected": -1.4646719694137573, "logps/chosen": -549.29931640625, "logps/rejected": -679.59912109375, "loss": 0.4825, "rewards/accuracies": 0.75, "rewards/chosen": -3.4416236877441406, "rewards/margins": 1.157944917678833, "rewards/rejected": -4.599569320678711, "step": 5190 }, { "epoch": 0.6804724048810809, "grad_norm": 10.75, "learning_rate": 1.4001725905081868e-06, "logits/chosen": -1.8738616704940796, "logits/rejected": -1.6489412784576416, "logps/chosen": -652.4998779296875, "logps/rejected": -731.0206298828125, "loss": 0.4551, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3719563484191895, "rewards/margins": 1.2334692478179932, "rewards/rejected": -4.6054253578186035, "step": 5200 }, { "epoch": 0.6804724048810809, "eval_logits/chosen": -1.325324535369873, "eval_logits/rejected": -1.2041271924972534, "eval_logps/chosen": -590.4718627929688, "eval_logps/rejected": -711.451171875, "eval_loss": 0.4903734624385834, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.2522716522216797, "eval_rewards/margins": 1.4502674341201782, "eval_rewards/rejected": -4.702539443969727, "eval_runtime": 696.0182, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.18, "step": 5200 }, { "epoch": 0.6817810056596983, "grad_norm": 10.25, "learning_rate": 1.3899265087187507e-06, "logits/chosen": -1.8293945789337158, "logits/rejected": -1.743165373802185, "logps/chosen": -623.4713745117188, "logps/rejected": -705.6763305664062, "loss": 0.4751, "rewards/accuracies": 0.75, "rewards/chosen": -3.176431179046631, "rewards/margins": 1.169909954071045, "rewards/rejected": -4.346341133117676, "step": 5210 }, { "epoch": 0.6830896064383158, "grad_norm": 12.625, "learning_rate": 1.3797035997545144e-06, "logits/chosen": -1.805241346359253, "logits/rejected": -1.652181625366211, "logps/chosen": -532.2978515625, "logps/rejected": -662.1207885742188, "loss": 0.4233, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8758301734924316, "rewards/margins": 1.3966044187545776, "rewards/rejected": -4.272434234619141, "step": 5220 }, { "epoch": 0.6843982072169333, "grad_norm": 18.875, "learning_rate": 1.3695040770190816e-06, "logits/chosen": -1.7296545505523682, "logits/rejected": -1.5934122800827026, "logps/chosen": -624.9149169921875, "logps/rejected": -696.55615234375, "loss": 0.4846, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4971401691436768, "rewards/margins": 1.0419518947601318, "rewards/rejected": -4.539092063903809, "step": 5230 }, { "epoch": 0.6857068079955507, "grad_norm": 32.25, "learning_rate": 1.3593281534278651e-06, "logits/chosen": -1.4951655864715576, "logits/rejected": -1.4221090078353882, "logps/chosen": -644.0265502929688, "logps/rejected": -704.1845703125, "loss": 0.6526, "rewards/accuracies": 0.75, "rewards/chosen": -3.443441390991211, "rewards/margins": 0.9216500520706177, "rewards/rejected": -4.365091323852539, "step": 5240 }, { "epoch": 0.6870154087741682, "grad_norm": 14.375, "learning_rate": 1.3491760414036478e-06, "logits/chosen": -1.6998094320297241, "logits/rejected": -1.5929067134857178, "logps/chosen": -584.4478759765625, "logps/rejected": -674.9965209960938, "loss": 0.5767, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.463754653930664, "rewards/margins": 0.9070130586624146, "rewards/rejected": -4.370768070220947, "step": 5250 }, { "epoch": 0.6883240095527857, "grad_norm": 13.125, "learning_rate": 1.3390479528721444e-06, "logits/chosen": -1.8953745365142822, "logits/rejected": -1.7963120937347412, "logps/chosen": -604.052734375, "logps/rejected": -710.6583862304688, "loss": 0.4291, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3326525688171387, "rewards/margins": 1.2116256952285767, "rewards/rejected": -4.544278144836426, "step": 5260 }, { "epoch": 0.6896326103314031, "grad_norm": 11.875, "learning_rate": 1.3289440992575756e-06, "logits/chosen": -2.0331919193267822, "logits/rejected": -1.7444026470184326, "logps/chosen": -630.68310546875, "logps/rejected": -760.4340209960938, "loss": 0.3589, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.35612416267395, "rewards/margins": 1.8966643810272217, "rewards/rejected": -5.252788066864014, "step": 5270 }, { "epoch": 0.6909412111100206, "grad_norm": 27.75, "learning_rate": 1.3188646914782616e-06, "logits/chosen": -1.6064008474349976, "logits/rejected": -1.4912612438201904, "logps/chosen": -596.0897216796875, "logps/rejected": -675.34228515625, "loss": 0.5486, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.9225566387176514, "rewards/margins": 0.7468477487564087, "rewards/rejected": -4.66940450668335, "step": 5280 }, { "epoch": 0.6922498118886381, "grad_norm": 7.78125, "learning_rate": 1.3088099399422109e-06, "logits/chosen": -1.8994146585464478, "logits/rejected": -1.5599989891052246, "logps/chosen": -637.171875, "logps/rejected": -736.212646484375, "loss": 0.3849, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9306888580322266, "rewards/margins": 1.6792547702789307, "rewards/rejected": -4.609943866729736, "step": 5290 }, { "epoch": 0.6935584126672555, "grad_norm": 15.75, "learning_rate": 1.2987800545427353e-06, "logits/chosen": -1.7215349674224854, "logits/rejected": -1.6019662618637085, "logps/chosen": -526.7073364257812, "logps/rejected": -726.6863403320312, "loss": 0.4653, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.12886905670166, "rewards/margins": 1.7121937274932861, "rewards/rejected": -4.841062545776367, "step": 5300 }, { "epoch": 0.6935584126672555, "eval_logits/chosen": -1.321199893951416, "eval_logits/rejected": -1.1999045610427856, "eval_logps/chosen": -595.7976684570312, "eval_logps/rejected": -718.723876953125, "eval_loss": 0.4901818633079529, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -3.3055295944213867, "eval_rewards/margins": 1.4697368144989014, "eval_rewards/rejected": -4.775265693664551, "eval_runtime": 695.8218, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 5300 }, { "epoch": 0.694867013445873, "grad_norm": 23.375, "learning_rate": 1.288775244654062e-06, "logits/chosen": -1.7931926250457764, "logits/rejected": -1.66519296169281, "logps/chosen": -656.0794677734375, "logps/rejected": -813.1991577148438, "loss": 0.3848, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1646921634674072, "rewards/margins": 1.8792455196380615, "rewards/rejected": -5.043937683105469, "step": 5310 }, { "epoch": 0.6961756142244905, "grad_norm": 21.25, "learning_rate": 1.2787957191269696e-06, "logits/chosen": -1.6988649368286133, "logits/rejected": -1.4772835969924927, "logps/chosen": -585.1422729492188, "logps/rejected": -699.8489990234375, "loss": 0.5701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.473609447479248, "rewards/margins": 1.1422603130340576, "rewards/rejected": -4.615869045257568, "step": 5320 }, { "epoch": 0.697484215003108, "grad_norm": 13.3125, "learning_rate": 1.2688416862844193e-06, "logits/chosen": -1.6612980365753174, "logits/rejected": -1.5902245044708252, "logps/chosen": -478.5091857910156, "logps/rejected": -628.2425537109375, "loss": 0.4777, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.942793130874634, "rewards/margins": 1.4725706577301025, "rewards/rejected": -4.415363788604736, "step": 5330 }, { "epoch": 0.6987928157817254, "grad_norm": 16.5, "learning_rate": 1.2589133539172193e-06, "logits/chosen": -1.7620331048965454, "logits/rejected": -1.4778059720993042, "logps/chosen": -679.1590576171875, "logps/rejected": -777.8512573242188, "loss": 0.5073, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.713555097579956, "rewards/margins": 1.4506169557571411, "rewards/rejected": -5.164172172546387, "step": 5340 }, { "epoch": 0.7001014165603429, "grad_norm": 16.875, "learning_rate": 1.249010929279672e-06, "logits/chosen": -1.6138865947723389, "logits/rejected": -1.5538960695266724, "logps/chosen": -539.54736328125, "logps/rejected": -673.0003662109375, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": -3.2207045555114746, "rewards/margins": 1.0804541110992432, "rewards/rejected": -4.301158905029297, "step": 5350 }, { "epoch": 0.7014100173389604, "grad_norm": 25.25, "learning_rate": 1.2391346190852603e-06, "logits/chosen": -1.6585410833358765, "logits/rejected": -1.4979883432388306, "logps/chosen": -632.8004760742188, "logps/rejected": -720.1314086914062, "loss": 0.5086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.555495023727417, "rewards/margins": 1.497219204902649, "rewards/rejected": -5.052713871002197, "step": 5360 }, { "epoch": 0.7027186181175777, "grad_norm": 19.875, "learning_rate": 1.2292846295023222e-06, "logits/chosen": -1.9531097412109375, "logits/rejected": -1.8452699184417725, "logps/chosen": -618.2356567382812, "logps/rejected": -728.1348876953125, "loss": 0.543, "rewards/accuracies": 0.75, "rewards/chosen": -3.266091823577881, "rewards/margins": 1.039518117904663, "rewards/rejected": -4.305609703063965, "step": 5370 }, { "epoch": 0.7040272188961952, "grad_norm": 21.75, "learning_rate": 1.2194611661497576e-06, "logits/chosen": -1.6589546203613281, "logits/rejected": -1.6348285675048828, "logps/chosen": -520.0006103515625, "logps/rejected": -677.3866577148438, "loss": 0.468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1622865200042725, "rewards/margins": 1.5004984140396118, "rewards/rejected": -4.662785530090332, "step": 5380 }, { "epoch": 0.7053358196748127, "grad_norm": 32.75, "learning_rate": 1.2096644340927247e-06, "logits/chosen": -1.778672218322754, "logits/rejected": -1.6992524862289429, "logps/chosen": -610.9495849609375, "logps/rejected": -712.8442993164062, "loss": 0.5223, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2283568382263184, "rewards/margins": 1.3184454441070557, "rewards/rejected": -4.546802043914795, "step": 5390 }, { "epoch": 0.7066444204534301, "grad_norm": 37.0, "learning_rate": 1.19989463783837e-06, "logits/chosen": -1.5817681550979614, "logits/rejected": -1.4843850135803223, "logps/chosen": -558.0791015625, "logps/rejected": -696.6058959960938, "loss": 0.5424, "rewards/accuracies": 0.75, "rewards/chosen": -3.145268440246582, "rewards/margins": 1.5337059497833252, "rewards/rejected": -4.678974628448486, "step": 5400 }, { "epoch": 0.7066444204534301, "eval_logits/chosen": -1.3421860933303833, "eval_logits/rejected": -1.2230432033538818, "eval_logps/chosen": -579.6743774414062, "eval_logps/rejected": -698.2103881835938, "eval_loss": 0.4875990152359009, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.144296884536743, "eval_rewards/margins": 1.4258344173431396, "eval_rewards/rejected": -4.570130825042725, "eval_runtime": 695.9818, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 5400 }, { "epoch": 0.7079530212320476, "grad_norm": 13.1875, "learning_rate": 1.1901519813315495e-06, "logits/chosen": -1.8655706644058228, "logits/rejected": -1.7549632787704468, "logps/chosen": -589.7059326171875, "logps/rejected": -656.1508178710938, "loss": 0.5452, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.937856435775757, "rewards/margins": 0.8295459747314453, "rewards/rejected": -3.767401933670044, "step": 5410 }, { "epoch": 0.7092616220106651, "grad_norm": 23.75, "learning_rate": 1.1804366679505798e-06, "logits/chosen": -1.7637218236923218, "logits/rejected": -1.6402696371078491, "logps/chosen": -579.9437866210938, "logps/rejected": -703.2298583984375, "loss": 0.4548, "rewards/accuracies": 0.75, "rewards/chosen": -2.9874448776245117, "rewards/margins": 1.1213775873184204, "rewards/rejected": -4.108822822570801, "step": 5420 }, { "epoch": 0.7105702227892826, "grad_norm": 39.0, "learning_rate": 1.1707489005029877e-06, "logits/chosen": -1.860687255859375, "logits/rejected": -1.6033604145050049, "logps/chosen": -607.3297119140625, "logps/rejected": -583.9635009765625, "loss": 0.6115, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.055037260055542, "rewards/margins": 0.9378668665885925, "rewards/rejected": -3.9929039478302, "step": 5430 }, { "epoch": 0.7118788235679, "grad_norm": 10.9375, "learning_rate": 1.1610888812212749e-06, "logits/chosen": -1.6722767353057861, "logits/rejected": -1.6074368953704834, "logps/chosen": -645.781982421875, "logps/rejected": -762.2787475585938, "loss": 0.5185, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2982890605926514, "rewards/margins": 1.2845749855041504, "rewards/rejected": -4.582863807678223, "step": 5440 }, { "epoch": 0.7131874243465175, "grad_norm": 15.875, "learning_rate": 1.1514568117587035e-06, "logits/chosen": -1.7381616830825806, "logits/rejected": -1.3915202617645264, "logps/chosen": -492.3623046875, "logps/rejected": -663.5822143554688, "loss": 0.327, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.6514556407928467, "rewards/margins": 2.275938034057617, "rewards/rejected": -4.927393913269043, "step": 5450 }, { "epoch": 0.714496025125135, "grad_norm": 9.375, "learning_rate": 1.1418528931850781e-06, "logits/chosen": -1.782294511795044, "logits/rejected": -1.702467679977417, "logps/chosen": -566.8135375976562, "logps/rejected": -722.4124755859375, "loss": 0.429, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.863522529602051, "rewards/margins": 1.7328885793685913, "rewards/rejected": -4.596411228179932, "step": 5460 }, { "epoch": 0.7158046259037524, "grad_norm": 33.75, "learning_rate": 1.1322773259825563e-06, "logits/chosen": -1.7113138437271118, "logits/rejected": -1.6667699813842773, "logps/chosen": -535.2305297851562, "logps/rejected": -655.229248046875, "loss": 0.4466, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.81361985206604, "rewards/margins": 1.382311224937439, "rewards/rejected": -4.195931434631348, "step": 5470 }, { "epoch": 0.7171132266823699, "grad_norm": 14.0, "learning_rate": 1.1227303100414552e-06, "logits/chosen": -2.0493881702423096, "logits/rejected": -1.8330230712890625, "logps/chosen": -596.6600341796875, "logps/rejected": -691.6264038085938, "loss": 0.4735, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.9153642654418945, "rewards/margins": 1.3671013116836548, "rewards/rejected": -4.28246545791626, "step": 5480 }, { "epoch": 0.7184218274609874, "grad_norm": 9.5, "learning_rate": 1.113212044656087e-06, "logits/chosen": -1.5735337734222412, "logits/rejected": -1.576192021369934, "logps/chosen": -497.79400634765625, "logps/rejected": -655.3231811523438, "loss": 0.5475, "rewards/accuracies": 0.75, "rewards/chosen": -2.812741279602051, "rewards/margins": 1.3131688833236694, "rewards/rejected": -4.125910758972168, "step": 5490 }, { "epoch": 0.7197304282396048, "grad_norm": 14.8125, "learning_rate": 1.1037227285205951e-06, "logits/chosen": -1.9826812744140625, "logits/rejected": -1.845492959022522, "logps/chosen": -599.62939453125, "logps/rejected": -704.8819580078125, "loss": 0.5207, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.009406566619873, "rewards/margins": 1.2578799724578857, "rewards/rejected": -4.267286777496338, "step": 5500 }, { "epoch": 0.7197304282396048, "eval_logits/chosen": -1.3513511419296265, "eval_logits/rejected": -1.2321586608886719, "eval_logps/chosen": -572.4053955078125, "eval_logps/rejected": -689.14599609375, "eval_loss": 0.4857368767261505, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.071605920791626, "eval_rewards/margins": 1.4078811407089233, "eval_rewards/rejected": -4.47948694229126, "eval_runtime": 694.5975, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 5500 }, { "epoch": 0.7210390290182223, "grad_norm": 10.1875, "learning_rate": 1.0942625597248028e-06, "logits/chosen": -1.8620054721832275, "logits/rejected": -1.5526872873306274, "logps/chosen": -576.57080078125, "logps/rejected": -691.480712890625, "loss": 0.3856, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.144439220428467, "rewards/margins": 1.655740737915039, "rewards/rejected": -4.800180435180664, "step": 5510 }, { "epoch": 0.7223476297968398, "grad_norm": 14.6875, "learning_rate": 1.0848317357500854e-06, "logits/chosen": -1.8237717151641846, "logits/rejected": -1.7784076929092407, "logps/chosen": -519.8162231445312, "logps/rejected": -657.3223266601562, "loss": 0.4834, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7931079864501953, "rewards/margins": 1.5211684703826904, "rewards/rejected": -4.314276695251465, "step": 5520 }, { "epoch": 0.7236562305754571, "grad_norm": 12.3125, "learning_rate": 1.0754304534652404e-06, "logits/chosen": -1.7132349014282227, "logits/rejected": -1.5195640325546265, "logps/chosen": -570.3129272460938, "logps/rejected": -678.6043701171875, "loss": 0.4986, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.96693754196167, "rewards/margins": 1.48458731174469, "rewards/rejected": -4.4515252113342285, "step": 5530 }, { "epoch": 0.7249648313540746, "grad_norm": 50.0, "learning_rate": 1.0660589091223854e-06, "logits/chosen": -1.6471774578094482, "logits/rejected": -1.481831431388855, "logps/chosen": -615.9620361328125, "logps/rejected": -707.6767578125, "loss": 0.4912, "rewards/accuracies": 0.625, "rewards/chosen": -3.3257815837860107, "rewards/margins": 1.0353418588638306, "rewards/rejected": -4.361123561859131, "step": 5540 }, { "epoch": 0.7262734321326921, "grad_norm": 12.5625, "learning_rate": 1.0567172983528534e-06, "logits/chosen": -1.8069711923599243, "logits/rejected": -1.43010675907135, "logps/chosen": -643.1559448242188, "logps/rejected": -754.8280639648438, "loss": 0.5095, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.392585277557373, "rewards/margins": 1.588444471359253, "rewards/rejected": -4.981030464172363, "step": 5550 }, { "epoch": 0.7275820329113096, "grad_norm": 26.75, "learning_rate": 1.0474058161631168e-06, "logits/chosen": -1.597335696220398, "logits/rejected": -1.6243082284927368, "logps/chosen": -638.9210205078125, "logps/rejected": -776.635498046875, "loss": 0.3842, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.517665147781372, "rewards/margins": 1.4344934225082397, "rewards/rejected": -4.9521589279174805, "step": 5560 }, { "epoch": 0.728890633689927, "grad_norm": 10.0, "learning_rate": 1.0381246569307077e-06, "logits/chosen": -1.720447301864624, "logits/rejected": -1.5852991342544556, "logps/chosen": -553.1087036132812, "logps/rejected": -760.8135986328125, "loss": 0.3923, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0384533405303955, "rewards/margins": 1.9606775045394897, "rewards/rejected": -4.9991302490234375, "step": 5570 }, { "epoch": 0.7301992344685445, "grad_norm": 10.3125, "learning_rate": 1.0288740144001722e-06, "logits/chosen": -1.685927152633667, "logits/rejected": -1.4799247980117798, "logps/chosen": -590.6946411132812, "logps/rejected": -647.121337890625, "loss": 0.5176, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.298147678375244, "rewards/margins": 1.0280678272247314, "rewards/rejected": -4.3262152671813965, "step": 5580 }, { "epoch": 0.731507835247162, "grad_norm": 3.765625, "learning_rate": 1.0196540816790127e-06, "logits/chosen": -1.5072760581970215, "logits/rejected": -1.6032094955444336, "logps/chosen": -501.88720703125, "logps/rejected": -655.1129150390625, "loss": 0.4582, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.039045810699463, "rewards/margins": 1.3204928636550903, "rewards/rejected": -4.359539031982422, "step": 5590 }, { "epoch": 0.7328164360257794, "grad_norm": 9.0, "learning_rate": 1.0104650512336679e-06, "logits/chosen": -1.9047638177871704, "logits/rejected": -1.6786075830459595, "logps/chosen": -623.8545532226562, "logps/rejected": -701.9097900390625, "loss": 0.4543, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1295902729034424, "rewards/margins": 1.3165086507797241, "rewards/rejected": -4.446099281311035, "step": 5600 }, { "epoch": 0.7328164360257794, "eval_logits/chosen": -1.3585971593856812, "eval_logits/rejected": -1.2395100593566895, "eval_logps/chosen": -575.6021118164062, "eval_logps/rejected": -694.3379516601562, "eval_loss": 0.4876547157764435, "eval_rewards/accuracies": 0.7559999823570251, "eval_rewards/chosen": -3.1035735607147217, "eval_rewards/margins": 1.4278333187103271, "eval_rewards/rejected": -4.531406879425049, "eval_runtime": 696.268, "eval_samples_per_second": 2.872, "eval_steps_per_second": 0.18, "step": 5600 }, { "epoch": 0.7341250368043969, "grad_norm": 12.875, "learning_rate": 1.0013071148854861e-06, "logits/chosen": -1.7812055349349976, "logits/rejected": -1.7435989379882812, "logps/chosen": -614.515625, "logps/rejected": -726.997314453125, "loss": 0.5252, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4691758155822754, "rewards/margins": 0.9533065557479858, "rewards/rejected": -4.422482490539551, "step": 5610 }, { "epoch": 0.7354336375830144, "grad_norm": 20.5, "learning_rate": 9.921804638067292e-07, "logits/chosen": -1.7911018133163452, "logits/rejected": -1.589203119277954, "logps/chosen": -585.3538818359375, "logps/rejected": -772.3333740234375, "loss": 0.5143, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.257516384124756, "rewards/margins": 1.73715078830719, "rewards/rejected": -4.994667053222656, "step": 5620 }, { "epoch": 0.7367422383616318, "grad_norm": 18.875, "learning_rate": 9.830852885165749e-07, "logits/chosen": -1.713619589805603, "logits/rejected": -1.4997047185897827, "logps/chosen": -602.16552734375, "logps/rejected": -709.8297119140625, "loss": 0.4886, "rewards/accuracies": 0.75, "rewards/chosen": -3.2702927589416504, "rewards/margins": 1.2002174854278564, "rewards/rejected": -4.470510005950928, "step": 5630 }, { "epoch": 0.7380508391402493, "grad_norm": 31.75, "learning_rate": 9.740217788771453e-07, "logits/chosen": -1.5730093717575073, "logits/rejected": -1.4597349166870117, "logps/chosen": -472.8937072753906, "logps/rejected": -598.8754272460938, "loss": 0.6744, "rewards/accuracies": 0.75, "rewards/chosen": -3.0447349548339844, "rewards/margins": 1.1566188335418701, "rewards/rejected": -4.201354026794434, "step": 5640 }, { "epoch": 0.7393594399188668, "grad_norm": 18.875, "learning_rate": 9.649901240895374e-07, "logits/chosen": -1.711038589477539, "logits/rejected": -1.509921669960022, "logps/chosen": -582.7390747070312, "logps/rejected": -758.6683959960938, "loss": 0.5466, "rewards/accuracies": 0.75, "rewards/chosen": -3.257512331008911, "rewards/margins": 1.6090552806854248, "rewards/rejected": -4.866567611694336, "step": 5650 }, { "epoch": 0.7406680406974843, "grad_norm": 23.375, "learning_rate": 9.559905126898803e-07, "logits/chosen": -1.8508403301239014, "logits/rejected": -1.6330080032348633, "logps/chosen": -582.8112182617188, "logps/rejected": -662.5493774414062, "loss": 0.4985, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0745298862457275, "rewards/margins": 1.3550355434417725, "rewards/rejected": -4.4295654296875, "step": 5660 }, { "epoch": 0.7419766414761017, "grad_norm": 13.125, "learning_rate": 9.470231325453958e-07, "logits/chosen": -1.970990777015686, "logits/rejected": -1.8723955154418945, "logps/chosen": -551.1888427734375, "logps/rejected": -599.9124145507812, "loss": 0.5243, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.160214900970459, "rewards/margins": 0.5584818124771118, "rewards/rejected": -3.7186965942382812, "step": 5670 }, { "epoch": 0.7432852422547191, "grad_norm": 11.6875, "learning_rate": 9.380881708504741e-07, "logits/chosen": -1.7954661846160889, "logits/rejected": -1.6302802562713623, "logps/chosen": -584.0460815429688, "logps/rejected": -657.7605590820312, "loss": 0.5861, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0680689811706543, "rewards/margins": 1.22983717918396, "rewards/rejected": -4.297906398773193, "step": 5680 }, { "epoch": 0.7445938430333366, "grad_norm": 12.9375, "learning_rate": 9.291858141227733e-07, "logits/chosen": -1.7174053192138672, "logits/rejected": -1.5762767791748047, "logps/chosen": -576.0792236328125, "logps/rejected": -720.1968383789062, "loss": 0.4207, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.991060733795166, "rewards/margins": 1.6844059228897095, "rewards/rejected": -4.675466537475586, "step": 5690 }, { "epoch": 0.745902443811954, "grad_norm": 19.5, "learning_rate": 9.203162481993175e-07, "logits/chosen": -1.7058305740356445, "logits/rejected": -1.6296281814575195, "logps/chosen": -567.90966796875, "logps/rejected": -639.2178955078125, "loss": 0.5223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.052389144897461, "rewards/margins": 1.0874216556549072, "rewards/rejected": -4.139810562133789, "step": 5700 }, { "epoch": 0.745902443811954, "eval_logits/chosen": -1.3685537576675415, "eval_logits/rejected": -1.2496318817138672, "eval_logps/chosen": -568.7587890625, "eval_logps/rejected": -683.302734375, "eval_loss": 0.48548778891563416, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.035140037536621, "eval_rewards/margins": 1.3859145641326904, "eval_rewards/rejected": -4.421055316925049, "eval_runtime": 695.6829, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 5700 }, { "epoch": 0.7472110445905715, "grad_norm": 8.0625, "learning_rate": 9.114796582326255e-07, "logits/chosen": -1.8170439004898071, "logits/rejected": -1.7834205627441406, "logps/chosen": -570.89794921875, "logps/rejected": -731.8878784179688, "loss": 0.3608, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.87723970413208, "rewards/margins": 1.4166896343231201, "rewards/rejected": -4.293929100036621, "step": 5710 }, { "epoch": 0.748519645369189, "grad_norm": 20.375, "learning_rate": 9.026762286868373e-07, "logits/chosen": -1.7468593120574951, "logits/rejected": -1.6224712133407593, "logps/chosen": -591.6256713867188, "logps/rejected": -762.6292114257812, "loss": 0.5397, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3939037322998047, "rewards/margins": 1.4484726190567017, "rewards/rejected": -4.842376232147217, "step": 5720 }, { "epoch": 0.7498282461478064, "grad_norm": 18.625, "learning_rate": 8.939061433338722e-07, "logits/chosen": -1.8681215047836304, "logits/rejected": -1.6987602710723877, "logps/chosen": -627.6087036132812, "logps/rejected": -690.0985107421875, "loss": 0.4266, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.094508171081543, "rewards/margins": 1.1115543842315674, "rewards/rejected": -4.206061840057373, "step": 5730 }, { "epoch": 0.7511368469264239, "grad_norm": 11.125, "learning_rate": 8.851695852495867e-07, "logits/chosen": -1.7270786762237549, "logits/rejected": -1.8032896518707275, "logps/chosen": -500.94744873046875, "logps/rejected": -678.7435302734375, "loss": 0.5051, "rewards/accuracies": 0.75, "rewards/chosen": -3.14656925201416, "rewards/margins": 1.3064861297607422, "rewards/rejected": -4.453055381774902, "step": 5740 }, { "epoch": 0.7524454477050414, "grad_norm": 19.0, "learning_rate": 8.764667368099525e-07, "logits/chosen": -1.727224349975586, "logits/rejected": -1.6187747716903687, "logps/chosen": -568.6463623046875, "logps/rejected": -694.8221435546875, "loss": 0.5261, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.4795403480529785, "rewards/margins": 1.3856881856918335, "rewards/rejected": -4.865228176116943, "step": 5750 }, { "epoch": 0.7537540484836589, "grad_norm": 14.1875, "learning_rate": 8.677977796872541e-07, "logits/chosen": -2.06563138961792, "logits/rejected": -1.701287031173706, "logps/chosen": -685.9306640625, "logps/rejected": -758.9682006835938, "loss": 0.4533, "rewards/accuracies": 0.75, "rewards/chosen": -3.540330410003662, "rewards/margins": 1.428117036819458, "rewards/rejected": -4.968447685241699, "step": 5760 }, { "epoch": 0.7550626492622763, "grad_norm": 14.3125, "learning_rate": 8.591628948462913e-07, "logits/chosen": -1.6796232461929321, "logits/rejected": -1.6524699926376343, "logps/chosen": -633.9078369140625, "logps/rejected": -700.4529418945312, "loss": 0.5255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7899765968322754, "rewards/margins": 0.9797791242599487, "rewards/rejected": -4.7697553634643555, "step": 5770 }, { "epoch": 0.7563712500408938, "grad_norm": 13.1875, "learning_rate": 8.505622625406054e-07, "logits/chosen": -1.581960916519165, "logits/rejected": -1.4872361421585083, "logps/chosen": -581.8450927734375, "logps/rejected": -718.3253173828125, "loss": 0.4533, "rewards/accuracies": 0.75, "rewards/chosen": -3.2600536346435547, "rewards/margins": 1.5054900646209717, "rewards/rejected": -4.765543460845947, "step": 5780 }, { "epoch": 0.7576798508195113, "grad_norm": 17.25, "learning_rate": 8.419960623087129e-07, "logits/chosen": -1.6888561248779297, "logits/rejected": -1.6147983074188232, "logps/chosen": -587.061767578125, "logps/rejected": -716.353515625, "loss": 0.5404, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.3572700023651123, "rewards/margins": 1.1649402379989624, "rewards/rejected": -4.522210597991943, "step": 5790 }, { "epoch": 0.7589884515981287, "grad_norm": 13.9375, "learning_rate": 8.334644729703617e-07, "logits/chosen": -1.6837126016616821, "logits/rejected": -1.4583009481430054, "logps/chosen": -599.4341430664062, "logps/rejected": -740.4576416015625, "loss": 0.4744, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4956207275390625, "rewards/margins": 1.3615803718566895, "rewards/rejected": -4.857201099395752, "step": 5800 }, { "epoch": 0.7589884515981287, "eval_logits/chosen": -1.3483850955963135, "eval_logits/rejected": -1.2287226915359497, "eval_logps/chosen": -592.36474609375, "eval_logps/rejected": -711.8522338867188, "eval_loss": 0.48561352491378784, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.2711997032165527, "eval_rewards/margins": 1.4353495836257935, "eval_rewards/rejected": -4.706549167633057, "eval_runtime": 694.5218, "eval_samples_per_second": 2.88, "eval_steps_per_second": 0.18, "step": 5800 }, { "epoch": 0.7602970523767462, "grad_norm": 14.125, "learning_rate": 8.249676726227931e-07, "logits/chosen": -1.7065244913101196, "logits/rejected": -1.4500083923339844, "logps/chosen": -580.0753784179688, "logps/rejected": -643.813232421875, "loss": 0.5239, "rewards/accuracies": 0.75, "rewards/chosen": -3.1306607723236084, "rewards/margins": 1.336342453956604, "rewards/rejected": -4.467003345489502, "step": 5810 }, { "epoch": 0.7616056531553637, "grad_norm": 10.0, "learning_rate": 8.165058386370314e-07, "logits/chosen": -1.6466753482818604, "logits/rejected": -1.4635779857635498, "logps/chosen": -501.7496032714844, "logps/rejected": -696.0513916015625, "loss": 0.4479, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.6400680541992188, "rewards/margins": 2.109905242919922, "rewards/rejected": -4.749973297119141, "step": 5820 }, { "epoch": 0.762914253933981, "grad_norm": 21.0, "learning_rate": 8.080791476541721e-07, "logits/chosen": -1.755419373512268, "logits/rejected": -1.6133639812469482, "logps/chosen": -595.66064453125, "logps/rejected": -717.858154296875, "loss": 0.5144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.412973403930664, "rewards/margins": 1.2137845754623413, "rewards/rejected": -4.626757621765137, "step": 5830 }, { "epoch": 0.7642228547125985, "grad_norm": 13.5625, "learning_rate": 7.996877755817026e-07, "logits/chosen": -1.9321653842926025, "logits/rejected": -1.7781460285186768, "logps/chosen": -562.6571655273438, "logps/rejected": -635.8867797851562, "loss": 0.5404, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1022896766662598, "rewards/margins": 0.8981400728225708, "rewards/rejected": -4.000429630279541, "step": 5840 }, { "epoch": 0.765531455491216, "grad_norm": 14.0, "learning_rate": 7.913318975898238e-07, "logits/chosen": -1.6550018787384033, "logits/rejected": -1.5244039297103882, "logps/chosen": -588.5064697265625, "logps/rejected": -754.6215209960938, "loss": 0.5132, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1954538822174072, "rewards/margins": 1.333221673965454, "rewards/rejected": -4.528675556182861, "step": 5850 }, { "epoch": 0.7668400562698334, "grad_norm": 8.0625, "learning_rate": 7.830116881077992e-07, "logits/chosen": -2.000688314437866, "logits/rejected": -1.8428118228912354, "logps/chosen": -585.1978759765625, "logps/rejected": -678.6551513671875, "loss": 0.5141, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.990133285522461, "rewards/margins": 1.2325167655944824, "rewards/rejected": -4.222650051116943, "step": 5860 }, { "epoch": 0.7681486570484509, "grad_norm": 20.875, "learning_rate": 7.747273208203096e-07, "logits/chosen": -1.6745274066925049, "logits/rejected": -1.4283368587493896, "logps/chosen": -651.7517700195312, "logps/rejected": -825.7810668945312, "loss": 0.5164, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3674492835998535, "rewards/margins": 2.126816511154175, "rewards/rejected": -5.494265556335449, "step": 5870 }, { "epoch": 0.7694572578270684, "grad_norm": 6.84375, "learning_rate": 7.664789686638272e-07, "logits/chosen": -1.3084872961044312, "logits/rejected": -1.2429664134979248, "logps/chosen": -563.489013671875, "logps/rejected": -671.1477661132812, "loss": 0.467, "rewards/accuracies": 0.75, "rewards/chosen": -3.4749913215637207, "rewards/margins": 1.2950966358184814, "rewards/rejected": -4.770088195800781, "step": 5880 }, { "epoch": 0.7707658586056859, "grad_norm": 34.25, "learning_rate": 7.582668038230089e-07, "logits/chosen": -1.7634799480438232, "logits/rejected": -1.6948188543319702, "logps/chosen": -572.4893798828125, "logps/rejected": -629.4076538085938, "loss": 0.4957, "rewards/accuracies": 0.75, "rewards/chosen": -3.0067381858825684, "rewards/margins": 1.2457022666931152, "rewards/rejected": -4.252440452575684, "step": 5890 }, { "epoch": 0.7720744593843033, "grad_norm": 16.125, "learning_rate": 7.500909977271007e-07, "logits/chosen": -1.8120403289794922, "logits/rejected": -1.7461074590682983, "logps/chosen": -573.2329711914062, "logps/rejected": -648.84130859375, "loss": 0.6225, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.4705071449279785, "rewards/margins": 0.7610231041908264, "rewards/rejected": -4.23153018951416, "step": 5900 }, { "epoch": 0.7720744593843033, "eval_logits/chosen": -1.3555693626403809, "eval_logits/rejected": -1.2365773916244507, "eval_logps/chosen": -591.3819580078125, "eval_logps/rejected": -708.8779296875, "eval_loss": 0.48514899611473083, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.261371612548828, "eval_rewards/margins": 1.4154353141784668, "eval_rewards/rejected": -4.676806926727295, "eval_runtime": 696.2462, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.18, "step": 5900 }, { "epoch": 0.7733830601629208, "grad_norm": 12.625, "learning_rate": 7.41951721046357e-07, "logits/chosen": -1.5846604108810425, "logits/rejected": -1.6162481307983398, "logps/chosen": -585.1368408203125, "logps/rejected": -748.6844482421875, "loss": 0.4769, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.304856777191162, "rewards/margins": 1.7695623636245728, "rewards/rejected": -5.074418067932129, "step": 5910 }, { "epoch": 0.7746916609415383, "grad_norm": 8.5625, "learning_rate": 7.338491436884787e-07, "logits/chosen": -1.907774567604065, "logits/rejected": -1.7959816455841064, "logps/chosen": -567.5654296875, "logps/rejected": -662.52734375, "loss": 0.4782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.962843894958496, "rewards/margins": 1.1763789653778076, "rewards/rejected": -4.139223098754883, "step": 5920 }, { "epoch": 0.7760002617201557, "grad_norm": 16.125, "learning_rate": 7.257834347950693e-07, "logits/chosen": -1.6305131912231445, "logits/rejected": -1.5075215101242065, "logps/chosen": -599.8590087890625, "logps/rejected": -701.6593017578125, "loss": 0.5005, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.578688859939575, "rewards/margins": 1.0596659183502197, "rewards/rejected": -4.638354778289795, "step": 5930 }, { "epoch": 0.7773088624987732, "grad_norm": 22.625, "learning_rate": 7.177547627380987e-07, "logits/chosen": -1.7928154468536377, "logits/rejected": -1.655312180519104, "logps/chosen": -652.2453002929688, "logps/rejected": -766.8363037109375, "loss": 0.4666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.7583765983581543, "rewards/margins": 1.1656768321990967, "rewards/rejected": -4.924054145812988, "step": 5940 }, { "epoch": 0.7786174632773907, "grad_norm": 12.8125, "learning_rate": 7.097632951163949e-07, "logits/chosen": -1.8271312713623047, "logits/rejected": -1.6660995483398438, "logps/chosen": -595.4977416992188, "logps/rejected": -697.2222900390625, "loss": 0.4668, "rewards/accuracies": 0.75, "rewards/chosen": -3.0823721885681152, "rewards/margins": 1.3621680736541748, "rewards/rejected": -4.444540500640869, "step": 5950 }, { "epoch": 0.7799260640560081, "grad_norm": 16.875, "learning_rate": 7.018091987521386e-07, "logits/chosen": -1.5156166553497314, "logits/rejected": -1.3137702941894531, "logps/chosen": -612.4935302734375, "logps/rejected": -751.5784301757812, "loss": 0.51, "rewards/accuracies": 0.75, "rewards/chosen": -3.474496364593506, "rewards/margins": 1.3136718273162842, "rewards/rejected": -4.788168430328369, "step": 5960 }, { "epoch": 0.7812346648346256, "grad_norm": 12.75, "learning_rate": 6.93892639687386e-07, "logits/chosen": -1.8230262994766235, "logits/rejected": -1.8296825885772705, "logps/chosen": -525.1339111328125, "logps/rejected": -634.5579223632812, "loss": 0.4895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.1159234046936035, "rewards/margins": 0.848776638507843, "rewards/rejected": -3.9646999835968018, "step": 5970 }, { "epoch": 0.782543265613243, "grad_norm": 5.46875, "learning_rate": 6.860137831806018e-07, "logits/chosen": -1.8564059734344482, "logits/rejected": -1.814237356185913, "logps/chosen": -538.3743896484375, "logps/rejected": -692.922607421875, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": -2.999763011932373, "rewards/margins": 1.707962989807129, "rewards/rejected": -4.707725524902344, "step": 5980 }, { "epoch": 0.7838518663918606, "grad_norm": 11.375, "learning_rate": 6.781727937032054e-07, "logits/chosen": -1.5768226385116577, "logits/rejected": -1.6548197269439697, "logps/chosen": -574.2213745117188, "logps/rejected": -723.91357421875, "loss": 0.4981, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.326831340789795, "rewards/margins": 1.0505492687225342, "rewards/rejected": -4.377380847930908, "step": 5990 }, { "epoch": 0.7851604671704779, "grad_norm": 10.8125, "learning_rate": 6.703698349361437e-07, "logits/chosen": -1.870201826095581, "logits/rejected": -1.7116177082061768, "logps/chosen": -540.0968017578125, "logps/rejected": -695.8267822265625, "loss": 0.411, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0923874378204346, "rewards/margins": 1.4186692237854004, "rewards/rejected": -4.511056423187256, "step": 6000 }, { "epoch": 0.7851604671704779, "eval_logits/chosen": -1.349523663520813, "eval_logits/rejected": -1.2297974824905396, "eval_logps/chosen": -590.9094848632812, "eval_logps/rejected": -708.4647827148438, "eval_loss": 0.48488470911979675, "eval_rewards/accuracies": 0.7570000290870667, "eval_rewards/chosen": -3.2566473484039307, "eval_rewards/margins": 1.416027545928955, "eval_rewards/rejected": -4.672675132751465, "eval_runtime": 695.4278, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 6000 }, { "epoch": 0.7864690679490954, "grad_norm": 7.625, "learning_rate": 6.626050697664682e-07, "logits/chosen": -1.6758708953857422, "logits/rejected": -1.6902267932891846, "logps/chosen": -549.23388671875, "logps/rejected": -721.6221923828125, "loss": 0.5647, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0261292457580566, "rewards/margins": 1.7690401077270508, "rewards/rejected": -4.795168876647949, "step": 6010 }, { "epoch": 0.7877776687277129, "grad_norm": 23.5, "learning_rate": 6.548786602839404e-07, "logits/chosen": -1.8626915216445923, "logits/rejected": -1.6568272113800049, "logps/chosen": -637.2047119140625, "logps/rejected": -684.0416259765625, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.433117628097534, "rewards/margins": 0.7503461837768555, "rewards/rejected": -4.1834635734558105, "step": 6020 }, { "epoch": 0.7890862695063303, "grad_norm": 12.5625, "learning_rate": 6.471907677776426e-07, "logits/chosen": -1.5487608909606934, "logits/rejected": -1.3858025074005127, "logps/chosen": -494.33489990234375, "logps/rejected": -598.2453002929688, "loss": 0.4644, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.2357430458068848, "rewards/margins": 1.2526090145111084, "rewards/rejected": -4.488351821899414, "step": 6030 }, { "epoch": 0.7903948702849478, "grad_norm": 12.0, "learning_rate": 6.39541552732617e-07, "logits/chosen": -1.8124374151229858, "logits/rejected": -1.7155539989471436, "logps/chosen": -602.8327026367188, "logps/rejected": -756.3582763671875, "loss": 0.4991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.384129762649536, "rewards/margins": 1.762467384338379, "rewards/rejected": -5.146596908569336, "step": 6040 }, { "epoch": 0.7917034710635653, "grad_norm": 18.5, "learning_rate": 6.319311748265086e-07, "logits/chosen": -1.7840373516082764, "logits/rejected": -1.6791518926620483, "logps/chosen": -471.4497985839844, "logps/rejected": -658.880859375, "loss": 0.5239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8049871921539307, "rewards/margins": 1.5309184789657593, "rewards/rejected": -4.3359055519104, "step": 6050 }, { "epoch": 0.7930120718421827, "grad_norm": 23.375, "learning_rate": 6.243597929262404e-07, "logits/chosen": -1.7939269542694092, "logits/rejected": -1.5417505502700806, "logps/chosen": -566.4649047851562, "logps/rejected": -688.7721557617188, "loss": 0.4873, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0715301036834717, "rewards/margins": 1.454796552658081, "rewards/rejected": -4.526326656341553, "step": 6060 }, { "epoch": 0.7943206726208002, "grad_norm": 11.75, "learning_rate": 6.168275650846875e-07, "logits/chosen": -1.9410231113433838, "logits/rejected": -1.5476337671279907, "logps/chosen": -592.1102294921875, "logps/rejected": -627.6417846679688, "loss": 0.4892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.0260634422302246, "rewards/margins": 1.2497179508209229, "rewards/rejected": -4.275781154632568, "step": 6070 }, { "epoch": 0.7956292733994177, "grad_norm": 5.9375, "learning_rate": 6.093346485373863e-07, "logits/chosen": -1.8795009851455688, "logits/rejected": -1.6466052532196045, "logps/chosen": -539.6228637695312, "logps/rejected": -595.3223876953125, "loss": 0.4473, "rewards/accuracies": 0.75, "rewards/chosen": -3.3200783729553223, "rewards/margins": 0.9906002879142761, "rewards/rejected": -4.310678958892822, "step": 6080 }, { "epoch": 0.7969378741780352, "grad_norm": 13.8125, "learning_rate": 6.018811996992455e-07, "logits/chosen": -1.556799292564392, "logits/rejected": -1.5874974727630615, "logps/chosen": -501.92462158203125, "logps/rejected": -692.1066284179688, "loss": 0.4398, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9512500762939453, "rewards/margins": 1.5317250490188599, "rewards/rejected": -4.482975006103516, "step": 6090 }, { "epoch": 0.7982464749566526, "grad_norm": 7.4375, "learning_rate": 5.944673741612866e-07, "logits/chosen": -1.742285966873169, "logits/rejected": -1.5538444519042969, "logps/chosen": -613.9192504882812, "logps/rejected": -707.9895629882812, "loss": 0.3609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.2818522453308105, "rewards/margins": 1.5306751728057861, "rewards/rejected": -4.812527656555176, "step": 6100 }, { "epoch": 0.7982464749566526, "eval_logits/chosen": -1.3523858785629272, "eval_logits/rejected": -1.2331287860870361, "eval_logps/chosen": -592.6218872070312, "eval_logps/rejected": -710.9299926757812, "eval_loss": 0.4852555990219116, "eval_rewards/accuracies": 0.7549999952316284, "eval_rewards/chosen": -3.273771047592163, "eval_rewards/margins": 1.4235553741455078, "eval_rewards/rejected": -4.697326183319092, "eval_runtime": 695.1244, "eval_samples_per_second": 2.877, "eval_steps_per_second": 0.18, "step": 6100 }, { "epoch": 0.7995550757352701, "grad_norm": 15.5625, "learning_rate": 5.870933266873916e-07, "logits/chosen": -1.7211681604385376, "logits/rejected": -1.6549797058105469, "logps/chosen": -554.5855712890625, "logps/rejected": -734.6849975585938, "loss": 0.3863, "rewards/accuracies": 0.75, "rewards/chosen": -3.349759578704834, "rewards/margins": 1.427038311958313, "rewards/rejected": -4.776797771453857, "step": 6110 }, { "epoch": 0.8008636765138876, "grad_norm": 12.9375, "learning_rate": 5.797592112110734e-07, "logits/chosen": -1.5681097507476807, "logits/rejected": -1.494216799736023, "logps/chosen": -643.8059692382812, "logps/rejected": -742.4291381835938, "loss": 0.4768, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.5684268474578857, "rewards/margins": 0.7712646722793579, "rewards/rejected": -4.339691162109375, "step": 6120 }, { "epoch": 0.802172277292505, "grad_norm": 12.1875, "learning_rate": 5.724651808322645e-07, "logits/chosen": -1.9596741199493408, "logits/rejected": -1.8974323272705078, "logps/chosen": -645.6394653320312, "logps/rejected": -700.4769897460938, "loss": 0.5938, "rewards/accuracies": 0.75, "rewards/chosen": -3.338953733444214, "rewards/margins": 0.7772713303565979, "rewards/rejected": -4.116225242614746, "step": 6130 }, { "epoch": 0.8034808780711225, "grad_norm": 17.75, "learning_rate": 5.652113878141194e-07, "logits/chosen": -1.8857982158660889, "logits/rejected": -1.8942861557006836, "logps/chosen": -555.5986938476562, "logps/rejected": -708.7097778320312, "loss": 0.4343, "rewards/accuracies": 0.75, "rewards/chosen": -2.85176157951355, "rewards/margins": 1.3535945415496826, "rewards/rejected": -4.205356597900391, "step": 6140 }, { "epoch": 0.80478947884974, "grad_norm": 16.375, "learning_rate": 5.579979835798361e-07, "logits/chosen": -1.7325356006622314, "logits/rejected": -1.6813218593597412, "logps/chosen": -670.2135009765625, "logps/rejected": -704.197509765625, "loss": 0.6118, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.859315872192383, "rewards/margins": 0.5825608968734741, "rewards/rejected": -4.441876411437988, "step": 6150 }, { "epoch": 0.8060980796283573, "grad_norm": 10.125, "learning_rate": 5.508251187094932e-07, "logits/chosen": -1.5644296407699585, "logits/rejected": -1.508806824684143, "logps/chosen": -577.1658935546875, "logps/rejected": -724.7855224609375, "loss": 0.3649, "rewards/accuracies": 0.75, "rewards/chosen": -3.3856117725372314, "rewards/margins": 1.556279182434082, "rewards/rejected": -4.941891193389893, "step": 6160 }, { "epoch": 0.8074066804069748, "grad_norm": 16.25, "learning_rate": 5.436929429369122e-07, "logits/chosen": -1.7199310064315796, "logits/rejected": -1.699110746383667, "logps/chosen": -681.8873291015625, "logps/rejected": -752.0169677734375, "loss": 0.5083, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4851043224334717, "rewards/margins": 1.3659433126449585, "rewards/rejected": -4.851047992706299, "step": 6170 }, { "epoch": 0.8087152811855923, "grad_norm": 11.625, "learning_rate": 5.366016051465245e-07, "logits/chosen": -1.8707669973373413, "logits/rejected": -1.6464478969573975, "logps/chosen": -586.6552734375, "logps/rejected": -702.72021484375, "loss": 0.3996, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.101494312286377, "rewards/margins": 1.3138140439987183, "rewards/rejected": -4.415308475494385, "step": 6180 }, { "epoch": 0.8100238819642097, "grad_norm": 13.0, "learning_rate": 5.295512533702701e-07, "logits/chosen": -1.7169173955917358, "logits/rejected": -1.6458828449249268, "logps/chosen": -654.7156982421875, "logps/rejected": -666.8015747070312, "loss": 0.552, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.502948760986328, "rewards/margins": 0.7677499055862427, "rewards/rejected": -4.2706990242004395, "step": 6190 }, { "epoch": 0.8113324827428272, "grad_norm": 14.375, "learning_rate": 5.225420347845023e-07, "logits/chosen": -1.781553030014038, "logits/rejected": -1.4267860651016235, "logps/chosen": -604.248046875, "logps/rejected": -754.7085571289062, "loss": 0.4411, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.2507729530334473, "rewards/margins": 1.7163217067718506, "rewards/rejected": -4.967094421386719, "step": 6200 }, { "epoch": 0.8113324827428272, "eval_logits/chosen": -1.3411527872085571, "eval_logits/rejected": -1.2212930917739868, "eval_logps/chosen": -599.7124633789062, "eval_logps/rejected": -720.2412719726562, "eval_loss": 0.48532843589782715, "eval_rewards/accuracies": 0.7570000290870667, "eval_rewards/chosen": -3.3446767330169678, "eval_rewards/margins": 1.4457635879516602, "eval_rewards/rejected": -4.790439605712891, "eval_runtime": 694.9281, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.18, "step": 6200 }, { "epoch": 0.8126410835214447, "grad_norm": 12.9375, "learning_rate": 5.155740957069186e-07, "logits/chosen": -2.103508710861206, "logits/rejected": -1.812971830368042, "logps/chosen": -641.5965576171875, "logps/rejected": -688.5982666015625, "loss": 0.638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.949636936187744, "rewards/margins": 1.0416533946990967, "rewards/rejected": -3.9912898540496826, "step": 6210 }, { "epoch": 0.8139496843000622, "grad_norm": 5.40625, "learning_rate": 5.08647581593506e-07, "logits/chosen": -1.8561872243881226, "logits/rejected": -1.6741949319839478, "logps/chosen": -630.53125, "logps/rejected": -754.6694946289062, "loss": 0.4577, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.133953332901001, "rewards/margins": 1.7758582830429077, "rewards/rejected": -4.909811496734619, "step": 6220 }, { "epoch": 0.8152582850786796, "grad_norm": 11.375, "learning_rate": 5.017626370355014e-07, "logits/chosen": -1.5440386533737183, "logits/rejected": -1.3823237419128418, "logps/chosen": -639.8975830078125, "logps/rejected": -818.4393310546875, "loss": 0.6498, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.654470920562744, "rewards/margins": 1.747463583946228, "rewards/rejected": -5.401934623718262, "step": 6230 }, { "epoch": 0.8165668858572971, "grad_norm": 14.0625, "learning_rate": 4.949194057563783e-07, "logits/chosen": -1.8167362213134766, "logits/rejected": -1.6026986837387085, "logps/chosen": -646.869384765625, "logps/rejected": -729.1396484375, "loss": 0.4201, "rewards/accuracies": 0.75, "rewards/chosen": -3.4211266040802, "rewards/margins": 1.407709002494812, "rewards/rejected": -4.828835487365723, "step": 6240 }, { "epoch": 0.8178754866359146, "grad_norm": 8.5, "learning_rate": 4.881180306088418e-07, "logits/chosen": -1.6104791164398193, "logits/rejected": -1.423105239868164, "logps/chosen": -599.9137573242188, "logps/rejected": -715.0670166015625, "loss": 0.5861, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4482688903808594, "rewards/margins": 1.2372363805770874, "rewards/rejected": -4.685505390167236, "step": 6250 }, { "epoch": 0.819184087414532, "grad_norm": 16.5, "learning_rate": 4.813586535718512e-07, "logits/chosen": -1.731877326965332, "logits/rejected": -1.6918220520019531, "logps/chosen": -643.5286865234375, "logps/rejected": -721.25146484375, "loss": 0.4841, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6342430114746094, "rewards/margins": 0.8564447164535522, "rewards/rejected": -4.490687847137451, "step": 6260 }, { "epoch": 0.8204926881931495, "grad_norm": 15.875, "learning_rate": 4.746414157476506e-07, "logits/chosen": -1.720868468284607, "logits/rejected": -1.429315447807312, "logps/chosen": -609.3848266601562, "logps/rejected": -851.71630859375, "loss": 0.4326, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.4502601623535156, "rewards/margins": 2.3626527786254883, "rewards/rejected": -5.812912940979004, "step": 6270 }, { "epoch": 0.821801288971767, "grad_norm": 14.0, "learning_rate": 4.679664573588294e-07, "logits/chosen": -1.8406994342803955, "logits/rejected": -1.4854843616485596, "logps/chosen": -613.1395263671875, "logps/rejected": -785.4437255859375, "loss": 0.4934, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.249269962310791, "rewards/margins": 1.972414255142212, "rewards/rejected": -5.221684455871582, "step": 6280 }, { "epoch": 0.8231098897503843, "grad_norm": 14.8125, "learning_rate": 4.6133391774538903e-07, "logits/chosen": -1.8763707876205444, "logits/rejected": -1.5732877254486084, "logps/chosen": -642.1503295898438, "logps/rejected": -761.8912353515625, "loss": 0.5815, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5468287467956543, "rewards/margins": 1.6580511331558228, "rewards/rejected": -5.2048797607421875, "step": 6290 }, { "epoch": 0.8244184905290018, "grad_norm": 8.5625, "learning_rate": 4.5474393536184214e-07, "logits/chosen": -1.883325219154358, "logits/rejected": -1.7426612377166748, "logps/chosen": -602.3004760742188, "logps/rejected": -696.4586181640625, "loss": 0.5948, "rewards/accuracies": 0.625, "rewards/chosen": -3.4274890422821045, "rewards/margins": 0.8444126844406128, "rewards/rejected": -4.271901607513428, "step": 6300 }, { "epoch": 0.8244184905290018, "eval_logits/chosen": -1.342966914176941, "eval_logits/rejected": -1.2232600450515747, "eval_logps/chosen": -597.1200561523438, "eval_logps/rejected": -716.5390014648438, "eval_loss": 0.484688401222229, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.3187530040740967, "eval_rewards/margins": 1.4346644878387451, "eval_rewards/rejected": -4.753417015075684, "eval_runtime": 693.6699, "eval_samples_per_second": 2.883, "eval_steps_per_second": 0.18, "step": 6300 }, { "epoch": 0.8257270913076193, "grad_norm": 25.625, "learning_rate": 4.4819664777431243e-07, "logits/chosen": -1.7673009634017944, "logits/rejected": -1.6490665674209595, "logps/chosen": -644.4851684570312, "logps/rejected": -764.7232055664062, "loss": 0.5187, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.473984479904175, "rewards/margins": 1.2754685878753662, "rewards/rejected": -4.749452114105225, "step": 6310 }, { "epoch": 0.8270356920862368, "grad_norm": 7.40625, "learning_rate": 4.416921916576722e-07, "logits/chosen": -1.7410767078399658, "logits/rejected": -1.7484163045883179, "logps/chosen": -600.8318481445312, "logps/rejected": -702.2088012695312, "loss": 0.5332, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4084548950195312, "rewards/margins": 1.0161828994750977, "rewards/rejected": -4.424637794494629, "step": 6320 }, { "epoch": 0.8283442928648542, "grad_norm": 15.3125, "learning_rate": 4.352307027926828e-07, "logits/chosen": -1.7138748168945312, "logits/rejected": -1.619607925415039, "logps/chosen": -542.1904296875, "logps/rejected": -711.8982543945312, "loss": 0.3816, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.079141139984131, "rewards/margins": 1.7302086353302002, "rewards/rejected": -4.809349536895752, "step": 6330 }, { "epoch": 0.8296528936434717, "grad_norm": 9.1875, "learning_rate": 4.288123160631624e-07, "logits/chosen": -1.614119529724121, "logits/rejected": -1.220226526260376, "logps/chosen": -580.2674560546875, "logps/rejected": -650.6781005859375, "loss": 0.4661, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.4264721870422363, "rewards/margins": 1.0997949838638306, "rewards/rejected": -4.526267051696777, "step": 6340 }, { "epoch": 0.8309614944220892, "grad_norm": 24.0, "learning_rate": 4.224371654531731e-07, "logits/chosen": -1.7621088027954102, "logits/rejected": -1.6349855661392212, "logps/chosen": -541.7307739257812, "logps/rejected": -649.169677734375, "loss": 0.5314, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.5355591773986816, "rewards/margins": 0.8103141784667969, "rewards/rejected": -4.3458733558654785, "step": 6350 }, { "epoch": 0.8322700952007066, "grad_norm": 14.75, "learning_rate": 4.1610538404421837e-07, "logits/chosen": -1.9768733978271484, "logits/rejected": -1.7775895595550537, "logps/chosen": -590.9364013671875, "logps/rejected": -687.1650390625, "loss": 0.4703, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2747435569763184, "rewards/margins": 0.987260639667511, "rewards/rejected": -4.2620038986206055, "step": 6360 }, { "epoch": 0.8335786959793241, "grad_norm": 10.9375, "learning_rate": 4.098171040124699e-07, "logits/chosen": -1.8198833465576172, "logits/rejected": -1.7766659259796143, "logps/chosen": -601.62060546875, "logps/rejected": -732.4606323242188, "loss": 0.5297, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2697696685791016, "rewards/margins": 1.1922571659088135, "rewards/rejected": -4.462027072906494, "step": 6370 }, { "epoch": 0.8348872967579416, "grad_norm": 10.5, "learning_rate": 4.03572456626006e-07, "logits/chosen": -1.8584043979644775, "logits/rejected": -1.5334441661834717, "logps/chosen": -587.9584350585938, "logps/rejected": -723.7175903320312, "loss": 0.4668, "rewards/accuracies": 0.875, "rewards/chosen": -3.185436725616455, "rewards/margins": 1.9680631160736084, "rewards/rejected": -5.153500080108643, "step": 6380 }, { "epoch": 0.836195897536559, "grad_norm": 20.5, "learning_rate": 3.9737157224207265e-07, "logits/chosen": -1.43854558467865, "logits/rejected": -1.4808661937713623, "logps/chosen": -561.89599609375, "logps/rejected": -745.5228271484375, "loss": 0.4166, "rewards/accuracies": 0.75, "rewards/chosen": -3.6378579139709473, "rewards/margins": 1.4517529010772705, "rewards/rejected": -5.089611053466797, "step": 6390 }, { "epoch": 0.8375044983151765, "grad_norm": 9.9375, "learning_rate": 3.912145803043596e-07, "logits/chosen": -1.7561668157577515, "logits/rejected": -1.858899712562561, "logps/chosen": -656.6492919921875, "logps/rejected": -744.0940551757812, "loss": 0.5653, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3443210124969482, "rewards/margins": 0.8394185900688171, "rewards/rejected": -4.18373966217041, "step": 6400 }, { "epoch": 0.8375044983151765, "eval_logits/chosen": -1.3452728986740112, "eval_logits/rejected": -1.225719928741455, "eval_logps/chosen": -591.9784545898438, "eval_logps/rejected": -710.5149536132812, "eval_loss": 0.4839678406715393, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.267336845397949, "eval_rewards/margins": 1.4258390665054321, "eval_rewards/rejected": -4.693175792694092, "eval_runtime": 694.7064, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 6400 }, { "epoch": 0.838813099093794, "grad_norm": 18.125, "learning_rate": 3.851016093403023e-07, "logits/chosen": -1.7737751007080078, "logits/rejected": -1.7160379886627197, "logps/chosen": -561.7122802734375, "logps/rejected": -660.0095825195312, "loss": 0.4892, "rewards/accuracies": 0.875, "rewards/chosen": -3.0440971851348877, "rewards/margins": 1.3709533214569092, "rewards/rejected": -4.415050506591797, "step": 6410 }, { "epoch": 0.8401216998724115, "grad_norm": 8.75, "learning_rate": 3.7903278695839456e-07, "logits/chosen": -1.6842645406723022, "logits/rejected": -1.6417667865753174, "logps/chosen": -628.8909912109375, "logps/rejected": -711.9481811523438, "loss": 0.493, "rewards/accuracies": 0.75, "rewards/chosen": -3.3538060188293457, "rewards/margins": 1.3485034704208374, "rewards/rejected": -4.702309608459473, "step": 6420 }, { "epoch": 0.8414303006510289, "grad_norm": 12.1875, "learning_rate": 3.7300823984552983e-07, "logits/chosen": -1.5732972621917725, "logits/rejected": -1.5288972854614258, "logps/chosen": -588.18310546875, "logps/rejected": -677.284423828125, "loss": 0.5716, "rewards/accuracies": 0.75, "rewards/chosen": -3.070836067199707, "rewards/margins": 1.3505765199661255, "rewards/rejected": -4.421412467956543, "step": 6430 }, { "epoch": 0.8427389014296464, "grad_norm": 15.1875, "learning_rate": 3.670280937643503e-07, "logits/chosen": -2.0774359703063965, "logits/rejected": -1.9389930963516235, "logps/chosen": -614.8870239257812, "logps/rejected": -670.2506103515625, "loss": 0.4694, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.0947606563568115, "rewards/margins": 0.9043132066726685, "rewards/rejected": -3.9990742206573486, "step": 6440 }, { "epoch": 0.8440475022082639, "grad_norm": 19.375, "learning_rate": 3.610924735506274e-07, "logits/chosen": -1.6063387393951416, "logits/rejected": -1.5002729892730713, "logps/chosen": -577.1217041015625, "logps/rejected": -692.9158935546875, "loss": 0.4903, "rewards/accuracies": 0.75, "rewards/chosen": -3.5257344245910645, "rewards/margins": 1.1305897235870361, "rewards/rejected": -4.65632438659668, "step": 6450 }, { "epoch": 0.8453561029868812, "grad_norm": 15.8125, "learning_rate": 3.5520150311065316e-07, "logits/chosen": -1.6022531986236572, "logits/rejected": -1.581345796585083, "logps/chosen": -616.5570068359375, "logps/rejected": -646.38818359375, "loss": 0.563, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.6431708335876465, "rewards/margins": 0.6079776883125305, "rewards/rejected": -4.251148223876953, "step": 6460 }, { "epoch": 0.8466647037654987, "grad_norm": 10.0, "learning_rate": 3.493553054186527e-07, "logits/chosen": -1.8564069271087646, "logits/rejected": -1.7880780696868896, "logps/chosen": -526.815185546875, "logps/rejected": -699.9705200195312, "loss": 0.4911, "rewards/accuracies": 0.75, "rewards/chosen": -3.188946008682251, "rewards/margins": 1.6704984903335571, "rewards/rejected": -4.859443664550781, "step": 6470 }, { "epoch": 0.8479733045441162, "grad_norm": 26.125, "learning_rate": 3.4355400251421977e-07, "logits/chosen": -1.999938726425171, "logits/rejected": -1.747881531715393, "logps/chosen": -646.3642578125, "logps/rejected": -709.4263916015625, "loss": 0.3791, "rewards/accuracies": 0.75, "rewards/chosen": -3.1137003898620605, "rewards/margins": 1.0349650382995605, "rewards/rejected": -4.148665428161621, "step": 6480 }, { "epoch": 0.8492819053227336, "grad_norm": 15.6875, "learning_rate": 3.3779771549976637e-07, "logits/chosen": -1.7633678913116455, "logits/rejected": -1.6560373306274414, "logps/chosen": -616.8046875, "logps/rejected": -745.5782470703125, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": -3.132735013961792, "rewards/margins": 1.4247593879699707, "rewards/rejected": -4.557494163513184, "step": 6490 }, { "epoch": 0.8505905061013511, "grad_norm": 25.125, "learning_rate": 3.3208656453799783e-07, "logits/chosen": -1.827301025390625, "logits/rejected": -1.564793586730957, "logps/chosen": -613.1212768554688, "logps/rejected": -731.3037109375, "loss": 0.3609, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.134608268737793, "rewards/margins": 1.7288488149642944, "rewards/rejected": -4.863457679748535, "step": 6500 }, { "epoch": 0.8505905061013511, "eval_logits/chosen": -1.341573715209961, "eval_logits/rejected": -1.2216531038284302, "eval_logps/chosen": -592.7427368164062, "eval_logps/rejected": -710.9508056640625, "eval_loss": 0.4838578999042511, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.274980068206787, "eval_rewards/margins": 1.4225554466247559, "eval_rewards/rejected": -4.697535037994385, "eval_runtime": 695.916, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 6500 }, { "epoch": 0.8518991068799686, "grad_norm": 7.375, "learning_rate": 3.2642066884940064e-07, "logits/chosen": -1.5643433332443237, "logits/rejected": -1.4488433599472046, "logps/chosen": -596.9285888671875, "logps/rejected": -697.2652587890625, "loss": 0.4993, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5802104473114014, "rewards/margins": 1.2176687717437744, "rewards/rejected": -4.797879219055176, "step": 6510 }, { "epoch": 0.8532077076585861, "grad_norm": 11.3125, "learning_rate": 3.2080014670975825e-07, "logits/chosen": -1.3365122079849243, "logits/rejected": -1.317875862121582, "logps/chosen": -579.786865234375, "logps/rejected": -677.9715576171875, "loss": 0.5703, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.8925552368164062, "rewards/margins": 0.8097049593925476, "rewards/rejected": -4.7022600173950195, "step": 6520 }, { "epoch": 0.8545163084372035, "grad_norm": 22.0, "learning_rate": 3.152251154476765e-07, "logits/chosen": -1.6946548223495483, "logits/rejected": -1.5316194295883179, "logps/chosen": -559.4827270507812, "logps/rejected": -692.9683227539062, "loss": 0.5023, "rewards/accuracies": 0.75, "rewards/chosen": -3.2069497108459473, "rewards/margins": 1.5908353328704834, "rewards/rejected": -4.797784805297852, "step": 6530 }, { "epoch": 0.855824909215821, "grad_norm": 24.625, "learning_rate": 3.0969569144214147e-07, "logits/chosen": -1.5754806995391846, "logits/rejected": -1.5629639625549316, "logps/chosen": -586.2955932617188, "logps/rejected": -717.8370361328125, "loss": 0.4743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.188948154449463, "rewards/margins": 1.5423780679702759, "rewards/rejected": -4.731326103210449, "step": 6540 }, { "epoch": 0.8571335099944385, "grad_norm": 10.125, "learning_rate": 3.042119901200824e-07, "logits/chosen": -1.74164617061615, "logits/rejected": -1.6781513690948486, "logps/chosen": -578.2874755859375, "logps/rejected": -681.0450439453125, "loss": 0.4623, "rewards/accuracies": 0.875, "rewards/chosen": -3.1909983158111572, "rewards/margins": 1.2629938125610352, "rewards/rejected": -4.453991889953613, "step": 6550 }, { "epoch": 0.8584421107730559, "grad_norm": 13.5, "learning_rate": 2.9877412595396726e-07, "logits/chosen": -1.6603879928588867, "logits/rejected": -1.671364188194275, "logps/chosen": -563.8631591796875, "logps/rejected": -697.9118041992188, "loss": 0.513, "rewards/accuracies": 0.75, "rewards/chosen": -2.9260380268096924, "rewards/margins": 1.39484441280365, "rewards/rejected": -4.320882320404053, "step": 6560 }, { "epoch": 0.8597507115516734, "grad_norm": 11.3125, "learning_rate": 2.933822124594124e-07, "logits/chosen": -1.8294540643692017, "logits/rejected": -1.698460340499878, "logps/chosen": -592.7259521484375, "logps/rejected": -706.1705322265625, "loss": 0.5275, "rewards/accuracies": 0.75, "rewards/chosen": -3.0040464401245117, "rewards/margins": 1.4533369541168213, "rewards/rejected": -4.457383155822754, "step": 6570 }, { "epoch": 0.8610593123302909, "grad_norm": 6.53125, "learning_rate": 2.880363621928106e-07, "logits/chosen": -1.8534778356552124, "logits/rejected": -1.9341338872909546, "logps/chosen": -603.3294677734375, "logps/rejected": -746.0242919921875, "loss": 0.509, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.323899030685425, "rewards/margins": 1.4007208347320557, "rewards/rejected": -4.7246198654174805, "step": 6580 }, { "epoch": 0.8623679131089083, "grad_norm": 6.0625, "learning_rate": 2.82736686748985e-07, "logits/chosen": -1.6648927927017212, "logits/rejected": -1.6062780618667603, "logps/chosen": -495.8475036621094, "logps/rejected": -625.5374755859375, "loss": 0.5708, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0795795917510986, "rewards/margins": 1.245223045349121, "rewards/rejected": -4.324802398681641, "step": 6590 }, { "epoch": 0.8636765138875258, "grad_norm": 8.0, "learning_rate": 2.774832967588556e-07, "logits/chosen": -1.908159613609314, "logits/rejected": -1.6768789291381836, "logps/chosen": -657.7713012695312, "logps/rejected": -789.3167724609375, "loss": 0.4754, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5680668354034424, "rewards/margins": 1.4072271585464478, "rewards/rejected": -4.975294589996338, "step": 6600 }, { "epoch": 0.8636765138875258, "eval_logits/chosen": -1.338658332824707, "eval_logits/rejected": -1.2186274528503418, "eval_logps/chosen": -593.7542114257812, "eval_logps/rejected": -712.2548217773438, "eval_loss": 0.48389020562171936, "eval_rewards/accuracies": 0.7549999952316284, "eval_rewards/chosen": -3.285094738006592, "eval_rewards/margins": 1.425480842590332, "eval_rewards/rejected": -4.710575103759766, "eval_runtime": 694.734, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 6600 }, { "epoch": 0.8649851146661433, "grad_norm": 17.875, "learning_rate": 2.7227630188713326e-07, "logits/chosen": -1.7677122354507446, "logits/rejected": -1.7343463897705078, "logps/chosen": -505.16168212890625, "logps/rejected": -594.2935791015625, "loss": 0.558, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.9479286670684814, "rewards/margins": 1.1722996234893799, "rewards/rejected": -4.120227813720703, "step": 6610 }, { "epoch": 0.8662937154447606, "grad_norm": 30.125, "learning_rate": 2.671158108300284e-07, "logits/chosen": -1.844421148300171, "logits/rejected": -1.5880135297775269, "logps/chosen": -587.9923095703125, "logps/rejected": -678.6886596679688, "loss": 0.4739, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.490217924118042, "rewards/margins": 1.2354539632797241, "rewards/rejected": -4.725671768188477, "step": 6620 }, { "epoch": 0.8676023162233781, "grad_norm": 28.875, "learning_rate": 2.6200193131298376e-07, "logits/chosen": -1.55061936378479, "logits/rejected": -1.4917399883270264, "logps/chosen": -613.244384765625, "logps/rejected": -725.19482421875, "loss": 0.5544, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5699849128723145, "rewards/margins": 1.199249267578125, "rewards/rejected": -4.7692341804504395, "step": 6630 }, { "epoch": 0.8689109170019956, "grad_norm": 22.0, "learning_rate": 2.569347700884217e-07, "logits/chosen": -1.8329975605010986, "logits/rejected": -1.6012241840362549, "logps/chosen": -570.3499755859375, "logps/rejected": -675.1878662109375, "loss": 0.4545, "rewards/accuracies": 0.75, "rewards/chosen": -3.048556089401245, "rewards/margins": 1.419795036315918, "rewards/rejected": -4.468351364135742, "step": 6640 }, { "epoch": 0.8702195177806131, "grad_norm": 11.75, "learning_rate": 2.5191443293352186e-07, "logits/chosen": -1.552416205406189, "logits/rejected": -1.3235156536102295, "logps/chosen": -609.6673583984375, "logps/rejected": -728.4909057617188, "loss": 0.6059, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.4165940284729004, "rewards/margins": 1.5645102262496948, "rewards/rejected": -4.981104373931885, "step": 6650 }, { "epoch": 0.8715281185592305, "grad_norm": 26.5, "learning_rate": 2.469410246480067e-07, "logits/chosen": -2.0214147567749023, "logits/rejected": -1.7325942516326904, "logps/chosen": -604.4178466796875, "logps/rejected": -747.2442626953125, "loss": 0.3338, "rewards/accuracies": 0.875, "rewards/chosen": -3.0971083641052246, "rewards/margins": 2.0269858837127686, "rewards/rejected": -5.124094486236572, "step": 6660 }, { "epoch": 0.872836719337848, "grad_norm": 16.375, "learning_rate": 2.4201464905195955e-07, "logits/chosen": -1.8645200729370117, "logits/rejected": -1.651386022567749, "logps/chosen": -597.8118286132812, "logps/rejected": -696.4151000976562, "loss": 0.4687, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.542731523513794, "rewards/margins": 1.067403793334961, "rewards/rejected": -4.610135555267334, "step": 6670 }, { "epoch": 0.8741453201164655, "grad_norm": 6.125, "learning_rate": 2.3713540898365196e-07, "logits/chosen": -1.6947740316390991, "logits/rejected": -1.7632633447647095, "logps/chosen": -638.929931640625, "logps/rejected": -802.6312255859375, "loss": 0.537, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.500303268432617, "rewards/margins": 1.3233038187026978, "rewards/rejected": -4.823607444763184, "step": 6680 }, { "epoch": 0.8754539208950829, "grad_norm": 35.25, "learning_rate": 2.3230340629740166e-07, "logits/chosen": -1.6549152135849, "logits/rejected": -1.7411181926727295, "logps/chosen": -512.7261962890625, "logps/rejected": -675.0318603515625, "loss": 0.4879, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1180179119110107, "rewards/margins": 1.5280996561050415, "rewards/rejected": -4.6461181640625, "step": 6690 }, { "epoch": 0.8767625216737004, "grad_norm": 11.3125, "learning_rate": 2.2751874186144357e-07, "logits/chosen": -1.6540229320526123, "logits/rejected": -1.5683925151824951, "logps/chosen": -629.4344482421875, "logps/rejected": -772.493408203125, "loss": 0.4108, "rewards/accuracies": 0.75, "rewards/chosen": -3.266979694366455, "rewards/margins": 1.4821598529815674, "rewards/rejected": -4.749139308929443, "step": 6700 }, { "epoch": 0.8767625216737004, "eval_logits/chosen": -1.3437899351119995, "eval_logits/rejected": -1.2242987155914307, "eval_logps/chosen": -592.761474609375, "eval_logps/rejected": -711.0748291015625, "eval_loss": 0.4838009178638458, "eval_rewards/accuracies": 0.7559999823570251, "eval_rewards/chosen": -3.275167226791382, "eval_rewards/margins": 1.4236088991165161, "eval_rewards/rejected": -4.698775291442871, "eval_runtime": 695.7942, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 6700 }, { "epoch": 0.8780711224523179, "grad_norm": 19.375, "learning_rate": 2.227815155558241e-07, "logits/chosen": -1.7928661108016968, "logits/rejected": -1.3385839462280273, "logps/chosen": -642.1571044921875, "logps/rejected": -805.2886962890625, "loss": 0.4881, "rewards/accuracies": 0.75, "rewards/chosen": -3.5391764640808105, "rewards/margins": 1.6910613775253296, "rewards/rejected": -5.230238437652588, "step": 6710 }, { "epoch": 0.8793797232309353, "grad_norm": 13.375, "learning_rate": 2.1809182627031883e-07, "logits/chosen": -1.648118257522583, "logits/rejected": -1.6330856084823608, "logps/chosen": -528.6195068359375, "logps/rejected": -741.1982421875, "loss": 0.3942, "rewards/accuracies": 0.875, "rewards/chosen": -3.071549654006958, "rewards/margins": 1.9424279928207397, "rewards/rejected": -5.013978004455566, "step": 6720 }, { "epoch": 0.8806883240095528, "grad_norm": 55.5, "learning_rate": 2.1344977190236372e-07, "logits/chosen": -1.7590625286102295, "logits/rejected": -1.6691687107086182, "logps/chosen": -646.4742431640625, "logps/rejected": -721.5814208984375, "loss": 0.5416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.6345062255859375, "rewards/margins": 1.1899363994598389, "rewards/rejected": -4.8244428634643555, "step": 6730 }, { "epoch": 0.8819969247881703, "grad_norm": 8.375, "learning_rate": 2.0885544935501656e-07, "logits/chosen": -1.7245979309082031, "logits/rejected": -1.5323925018310547, "logps/chosen": -534.8410034179688, "logps/rejected": -674.4937744140625, "loss": 0.3636, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1353626251220703, "rewards/margins": 1.5699174404144287, "rewards/rejected": -4.705279350280762, "step": 6740 }, { "epoch": 0.8833055255667878, "grad_norm": 8.6875, "learning_rate": 2.0430895453492944e-07, "logits/chosen": -1.785875916481018, "logits/rejected": -1.4311304092407227, "logps/chosen": -644.0568237304688, "logps/rejected": -732.4920043945312, "loss": 0.4282, "rewards/accuracies": 0.75, "rewards/chosen": -3.6695289611816406, "rewards/margins": 1.2204630374908447, "rewards/rejected": -4.8899922370910645, "step": 6750 }, { "epoch": 0.8846141263454051, "grad_norm": 19.0, "learning_rate": 1.9981038235035111e-07, "logits/chosen": -1.7990942001342773, "logits/rejected": -1.608801245689392, "logps/chosen": -531.8355102539062, "logps/rejected": -664.3603515625, "loss": 0.4698, "rewards/accuracies": 0.875, "rewards/chosen": -3.1431050300598145, "rewards/margins": 1.483864426612854, "rewards/rejected": -4.626969337463379, "step": 6760 }, { "epoch": 0.8859227271240226, "grad_norm": 11.1875, "learning_rate": 1.9535982670914112e-07, "logits/chosen": -1.8119480609893799, "logits/rejected": -1.5345662832260132, "logps/chosen": -654.2174682617188, "logps/rejected": -706.6146850585938, "loss": 0.4851, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.4034996032714844, "rewards/margins": 1.320252776145935, "rewards/rejected": -4.723752021789551, "step": 6770 }, { "epoch": 0.8872313279026401, "grad_norm": 10.5625, "learning_rate": 1.9095738051681412e-07, "logits/chosen": -1.780968427658081, "logits/rejected": -1.5744168758392334, "logps/chosen": -553.8974609375, "logps/rejected": -627.0613403320312, "loss": 0.464, "rewards/accuracies": 0.75, "rewards/chosen": -3.444225788116455, "rewards/margins": 0.8738017082214355, "rewards/rejected": -4.318027496337891, "step": 6780 }, { "epoch": 0.8885399286812575, "grad_norm": 19.75, "learning_rate": 1.8660313567459703e-07, "logits/chosen": -1.9318901300430298, "logits/rejected": -1.7172155380249023, "logps/chosen": -638.42236328125, "logps/rejected": -682.0265502929688, "loss": 0.4143, "rewards/accuracies": 0.75, "rewards/chosen": -3.057542324066162, "rewards/margins": 1.1934406757354736, "rewards/rejected": -4.250982761383057, "step": 6790 }, { "epoch": 0.889848529459875, "grad_norm": 8.375, "learning_rate": 1.8229718307751165e-07, "logits/chosen": -1.8133220672607422, "logits/rejected": -1.7552887201309204, "logps/chosen": -634.8387451171875, "logps/rejected": -710.4517822265625, "loss": 0.4527, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.375164747238159, "rewards/margins": 1.0913513898849487, "rewards/rejected": -4.466516017913818, "step": 6800 }, { "epoch": 0.889848529459875, "eval_logits/chosen": -1.3495535850524902, "eval_logits/rejected": -1.2306791543960571, "eval_logps/chosen": -593.5831298828125, "eval_logps/rejected": -712.4506225585938, "eval_loss": 0.4838249385356903, "eval_rewards/accuracies": 0.7570000290870667, "eval_rewards/chosen": -3.28338360786438, "eval_rewards/margins": 1.4291496276855469, "eval_rewards/rejected": -4.712532997131348, "eval_runtime": 696.7897, "eval_samples_per_second": 2.87, "eval_steps_per_second": 0.179, "step": 6800 }, { "epoch": 0.8911571302384925, "grad_norm": 8.875, "learning_rate": 1.7803961261247864e-07, "logits/chosen": -1.8256447315216064, "logits/rejected": -1.5883971452713013, "logps/chosen": -654.5208129882812, "logps/rejected": -733.9072265625, "loss": 0.4687, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3585166931152344, "rewards/margins": 1.4366153478622437, "rewards/rejected": -4.795132160186768, "step": 6810 }, { "epoch": 0.8924657310171099, "grad_norm": 15.625, "learning_rate": 1.7383051315643772e-07, "logits/chosen": -1.7709274291992188, "logits/rejected": -1.524435043334961, "logps/chosen": -550.1386108398438, "logps/rejected": -708.3090209960938, "loss": 0.3631, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.9157252311706543, "rewards/margins": 2.0121233463287354, "rewards/rejected": -4.927848815917969, "step": 6820 }, { "epoch": 0.8937743317957274, "grad_norm": 22.75, "learning_rate": 1.6966997257449685e-07, "logits/chosen": -1.8256194591522217, "logits/rejected": -1.5321848392486572, "logps/chosen": -570.9100341796875, "logps/rejected": -684.6585693359375, "loss": 0.5073, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0833654403686523, "rewards/margins": 1.492516279220581, "rewards/rejected": -4.575881004333496, "step": 6830 }, { "epoch": 0.8950829325743449, "grad_norm": 33.0, "learning_rate": 1.6555807771809375e-07, "logits/chosen": -1.7348064184188843, "logits/rejected": -1.7082138061523438, "logps/chosen": -616.8502807617188, "logps/rejected": -748.4053955078125, "loss": 0.474, "rewards/accuracies": 0.75, "rewards/chosen": -3.4589970111846924, "rewards/margins": 1.4264509677886963, "rewards/rejected": -4.885447978973389, "step": 6840 }, { "epoch": 0.8963915333529624, "grad_norm": 15.8125, "learning_rate": 1.6149491442318617e-07, "logits/chosen": -1.904905080795288, "logits/rejected": -1.5800856351852417, "logps/chosen": -594.7634887695312, "logps/rejected": -725.8511962890625, "loss": 0.4376, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0354695320129395, "rewards/margins": 1.4000697135925293, "rewards/rejected": -4.435539245605469, "step": 6850 }, { "epoch": 0.8977001341315798, "grad_norm": 11.375, "learning_rate": 1.5748056750845786e-07, "logits/chosen": -1.7181541919708252, "logits/rejected": -1.562482476234436, "logps/chosen": -578.9635009765625, "logps/rejected": -708.0796508789062, "loss": 0.4197, "rewards/accuracies": 0.75, "rewards/chosen": -3.3154168128967285, "rewards/margins": 1.2583740949630737, "rewards/rejected": -4.573790550231934, "step": 6860 }, { "epoch": 0.8990087349101973, "grad_norm": 19.875, "learning_rate": 1.5351512077355024e-07, "logits/chosen": -1.7811702489852905, "logits/rejected": -1.4635236263275146, "logps/chosen": -575.0892333984375, "logps/rejected": -647.81298828125, "loss": 0.4907, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2129950523376465, "rewards/margins": 1.1890983581542969, "rewards/rejected": -4.402093887329102, "step": 6870 }, { "epoch": 0.9003173356888148, "grad_norm": 17.625, "learning_rate": 1.4959865699730902e-07, "logits/chosen": -1.7956634759902954, "logits/rejected": -1.7036113739013672, "logps/chosen": -588.7994384765625, "logps/rejected": -707.3984375, "loss": 0.5332, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.237419605255127, "rewards/margins": 1.348104476928711, "rewards/rejected": -4.585524559020996, "step": 6880 }, { "epoch": 0.9016259364674322, "grad_norm": 12.25, "learning_rate": 1.4573125793606202e-07, "logits/chosen": -1.9311128854751587, "logits/rejected": -1.748199224472046, "logps/chosen": -607.0201416015625, "logps/rejected": -689.3958740234375, "loss": 0.4894, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5146842002868652, "rewards/margins": 1.1064493656158447, "rewards/rejected": -4.621133327484131, "step": 6890 }, { "epoch": 0.9029345372460497, "grad_norm": 25.375, "learning_rate": 1.4191300432190634e-07, "logits/chosen": -1.8458545207977295, "logits/rejected": -1.6118013858795166, "logps/chosen": -580.1968994140625, "logps/rejected": -685.7689819335938, "loss": 0.447, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.980220317840576, "rewards/margins": 1.655181646347046, "rewards/rejected": -4.635401725769043, "step": 6900 }, { "epoch": 0.9029345372460497, "eval_logits/chosen": -1.3480076789855957, "eval_logits/rejected": -1.228943109512329, "eval_logps/chosen": -594.08935546875, "eval_logps/rejected": -713.1663208007812, "eval_loss": 0.4839765429496765, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -3.2884459495544434, "eval_rewards/margins": 1.431244134902954, "eval_rewards/rejected": -4.719690322875977, "eval_runtime": 695.7958, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 6900 }, { "epoch": 0.9042431380246672, "grad_norm": 31.5, "learning_rate": 1.381439758610284e-07, "logits/chosen": -1.7899754047393799, "logits/rejected": -1.7772667407989502, "logps/chosen": -586.474365234375, "logps/rejected": -766.3489990234375, "loss": 0.473, "rewards/accuracies": 0.75, "rewards/chosen": -3.35996675491333, "rewards/margins": 1.642038106918335, "rewards/rejected": -5.002005100250244, "step": 6910 }, { "epoch": 0.9055517388032845, "grad_norm": 28.0, "learning_rate": 1.3442425123203596e-07, "logits/chosen": -1.6055282354354858, "logits/rejected": -1.6174838542938232, "logps/chosen": -598.8069458007812, "logps/rejected": -789.2105102539062, "loss": 0.4559, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.52067232131958, "rewards/margins": 1.5137660503387451, "rewards/rejected": -5.034438133239746, "step": 6920 }, { "epoch": 0.906860339581902, "grad_norm": 8.5625, "learning_rate": 1.3075390808431897e-07, "logits/chosen": -1.8285157680511475, "logits/rejected": -1.5711413621902466, "logps/chosen": -572.6306762695312, "logps/rejected": -716.1918334960938, "loss": 0.4591, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0835630893707275, "rewards/margins": 1.659457802772522, "rewards/rejected": -4.743021488189697, "step": 6930 }, { "epoch": 0.9081689403605195, "grad_norm": 16.5, "learning_rate": 1.271330230364262e-07, "logits/chosen": -1.8680012226104736, "logits/rejected": -1.4643158912658691, "logps/chosen": -673.7245483398438, "logps/rejected": -702.3436279296875, "loss": 0.5166, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.461456775665283, "rewards/margins": 1.1683944463729858, "rewards/rejected": -4.629851341247559, "step": 6940 }, { "epoch": 0.9094775411391369, "grad_norm": 10.9375, "learning_rate": 1.2356167167446698e-07, "logits/chosen": -1.7676851749420166, "logits/rejected": -1.7244584560394287, "logps/chosen": -596.0842895507812, "logps/rejected": -694.978271484375, "loss": 0.5403, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3525390625, "rewards/margins": 1.3175437450408936, "rewards/rejected": -4.670083045959473, "step": 6950 }, { "epoch": 0.9107861419177544, "grad_norm": 7.0, "learning_rate": 1.2003992855053326e-07, "logits/chosen": -1.7599279880523682, "logits/rejected": -1.6541827917099, "logps/chosen": -595.4813842773438, "logps/rejected": -751.1033935546875, "loss": 0.5634, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3206565380096436, "rewards/margins": 1.2326548099517822, "rewards/rejected": -4.553311347961426, "step": 6960 }, { "epoch": 0.9120947426963719, "grad_norm": 17.875, "learning_rate": 1.1656786718114239e-07, "logits/chosen": -1.9636318683624268, "logits/rejected": -1.5499491691589355, "logps/chosen": -593.0447387695312, "logps/rejected": -643.0382080078125, "loss": 0.45, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.1098203659057617, "rewards/margins": 1.278965711593628, "rewards/rejected": -4.388786792755127, "step": 6970 }, { "epoch": 0.9134033434749894, "grad_norm": 16.75, "learning_rate": 1.1314556004570487e-07, "logits/chosen": -1.9904924631118774, "logits/rejected": -1.7861770391464233, "logps/chosen": -609.9263305664062, "logps/rejected": -678.1754760742188, "loss": 0.5519, "rewards/accuracies": 0.75, "rewards/chosen": -3.1088786125183105, "rewards/margins": 1.2149235010147095, "rewards/rejected": -4.3238019943237305, "step": 6980 }, { "epoch": 0.9147119442536068, "grad_norm": 15.6875, "learning_rate": 1.0977307858500818e-07, "logits/chosen": -1.8406610488891602, "logits/rejected": -1.4011399745941162, "logps/chosen": -709.1697998046875, "logps/rejected": -695.9796142578125, "loss": 0.5848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.6997153759002686, "rewards/margins": 0.9288283586502075, "rewards/rejected": -4.628544330596924, "step": 6990 }, { "epoch": 0.9160205450322243, "grad_norm": 13.0625, "learning_rate": 1.0645049319972789e-07, "logits/chosen": -1.8441442251205444, "logits/rejected": -1.5748450756072998, "logps/chosen": -599.401123046875, "logps/rejected": -722.9434814453125, "loss": 0.4922, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.8761649131774902, "rewards/margins": 1.7666804790496826, "rewards/rejected": -4.642845153808594, "step": 7000 }, { "epoch": 0.9160205450322243, "eval_logits/chosen": -1.3462055921554565, "eval_logits/rejected": -1.226959228515625, "eval_logps/chosen": -593.9542846679688, "eval_logps/rejected": -712.990478515625, "eval_loss": 0.48405495285987854, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.287095546722412, "eval_rewards/margins": 1.430837631225586, "eval_rewards/rejected": -4.717933177947998, "eval_runtime": 695.3546, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.18, "step": 7000 }, { "epoch": 0.9173291458108418, "grad_norm": 7.9375, "learning_rate": 1.0317787324895634e-07, "logits/chosen": -1.907984972000122, "logits/rejected": -1.6685854196548462, "logps/chosen": -630.0285034179688, "logps/rejected": -662.249755859375, "loss": 0.5157, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.326462984085083, "rewards/margins": 1.1342576742172241, "rewards/rejected": -4.460721015930176, "step": 7010 }, { "epoch": 0.9186377465894592, "grad_norm": 6.28125, "learning_rate": 9.995528704875635e-08, "logits/chosen": -1.7404258251190186, "logits/rejected": -1.5319255590438843, "logps/chosen": -554.45947265625, "logps/rejected": -749.9161987304688, "loss": 0.4066, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0650036334991455, "rewards/margins": 1.6026322841644287, "rewards/rejected": -4.667635917663574, "step": 7020 }, { "epoch": 0.9199463473680767, "grad_norm": 4.4375, "learning_rate": 9.678280187073452e-08, "logits/chosen": -1.9358317852020264, "logits/rejected": -1.683153748512268, "logps/chosen": -571.517578125, "logps/rejected": -720.1722412109375, "loss": 0.5368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0634007453918457, "rewards/margins": 1.5408287048339844, "rewards/rejected": -4.604229927062988, "step": 7030 }, { "epoch": 0.9212549481466942, "grad_norm": 11.6875, "learning_rate": 9.366048394063549e-08, "logits/chosen": -1.9469846487045288, "logits/rejected": -1.9349476099014282, "logps/chosen": -632.7986450195312, "logps/rejected": -759.61962890625, "loss": 0.3675, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1143414974212646, "rewards/margins": 1.3724960088729858, "rewards/rejected": -4.486837863922119, "step": 7040 }, { "epoch": 0.9225635489253116, "grad_norm": 9.6875, "learning_rate": 9.058839843696237e-08, "logits/chosen": -1.4811979532241821, "logits/rejected": -1.4132037162780762, "logps/chosen": -591.6810302734375, "logps/rejected": -711.0726928710938, "loss": 0.489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.425321578979492, "rewards/margins": 1.3321223258972168, "rewards/rejected": -4.757443428039551, "step": 7050 }, { "epoch": 0.9238721497039291, "grad_norm": 17.25, "learning_rate": 8.756660948961299e-08, "logits/chosen": -1.653180480003357, "logits/rejected": -1.5093600749969482, "logps/chosen": -519.7325439453125, "logps/rejected": -677.9019775390625, "loss": 0.4548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0616393089294434, "rewards/margins": 1.6190617084503174, "rewards/rejected": -4.68070125579834, "step": 7060 }, { "epoch": 0.9251807504825466, "grad_norm": 11.4375, "learning_rate": 8.459518017854412e-08, "logits/chosen": -1.589348316192627, "logits/rejected": -1.574836015701294, "logps/chosen": -590.6156005859375, "logps/rejected": -812.993896484375, "loss": 0.4552, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.3252310752868652, "rewards/margins": 1.7264232635498047, "rewards/rejected": -5.05165433883667, "step": 7070 }, { "epoch": 0.926489351261164, "grad_norm": 17.125, "learning_rate": 8.167417253245213e-08, "logits/chosen": -1.8403034210205078, "logits/rejected": -1.6021150350570679, "logps/chosen": -585.3240966796875, "logps/rejected": -800.160888671875, "loss": 0.4153, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.167881727218628, "rewards/margins": 1.7198050022125244, "rewards/rejected": -4.887686252593994, "step": 7080 }, { "epoch": 0.9277979520397814, "grad_norm": 11.6875, "learning_rate": 7.880364752747948e-08, "logits/chosen": -1.8788830041885376, "logits/rejected": -1.7896921634674072, "logps/chosen": -535.162109375, "logps/rejected": -623.135986328125, "loss": 0.4726, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8884317874908447, "rewards/margins": 1.0441008806228638, "rewards/rejected": -3.932532548904419, "step": 7090 }, { "epoch": 0.9291065528183989, "grad_norm": 14.1875, "learning_rate": 7.598366508594245e-08, "logits/chosen": -1.6562168598175049, "logits/rejected": -1.5319846868515015, "logps/chosen": -607.3147583007812, "logps/rejected": -695.52099609375, "loss": 0.5316, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.615985870361328, "rewards/margins": 1.0196460485458374, "rewards/rejected": -4.635632038116455, "step": 7100 }, { "epoch": 0.9291065528183989, "eval_logits/chosen": -1.3461825847625732, "eval_logits/rejected": -1.2269272804260254, "eval_logps/chosen": -593.8109741210938, "eval_logps/rejected": -712.8836059570312, "eval_loss": 0.48397162556648254, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.28566312789917, "eval_rewards/margins": 1.4311997890472412, "eval_rewards/rejected": -4.716862678527832, "eval_runtime": 694.7682, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 7100 }, { "epoch": 0.9304151535970164, "grad_norm": 14.6875, "learning_rate": 7.32142840750788e-08, "logits/chosen": -1.6233108043670654, "logits/rejected": -1.5525808334350586, "logps/chosen": -539.6690673828125, "logps/rejected": -688.342529296875, "loss": 0.5039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.142035484313965, "rewards/margins": 1.5576280355453491, "rewards/rejected": -4.6996636390686035, "step": 7110 }, { "epoch": 0.9317237543756338, "grad_norm": 11.625, "learning_rate": 7.049556230581872e-08, "logits/chosen": -1.9678618907928467, "logits/rejected": -1.7167441844940186, "logps/chosen": -592.3394165039062, "logps/rejected": -655.6724853515625, "loss": 0.504, "rewards/accuracies": 0.75, "rewards/chosen": -3.194664716720581, "rewards/margins": 1.3413351774215698, "rewards/rejected": -4.5360002517700195, "step": 7120 }, { "epoch": 0.9330323551542513, "grad_norm": 15.625, "learning_rate": 6.782755653158085e-08, "logits/chosen": -1.857399344444275, "logits/rejected": -1.7840948104858398, "logps/chosen": -655.7377319335938, "logps/rejected": -766.9114990234375, "loss": 0.541, "rewards/accuracies": 0.75, "rewards/chosen": -3.4340240955352783, "rewards/margins": 1.1490281820297241, "rewards/rejected": -4.583052635192871, "step": 7130 }, { "epoch": 0.9343409559328688, "grad_norm": 14.75, "learning_rate": 6.521032244708375e-08, "logits/chosen": -1.720682144165039, "logits/rejected": -1.541343092918396, "logps/chosen": -524.9146728515625, "logps/rejected": -677.6600341796875, "loss": 0.4257, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0023820400238037, "rewards/margins": 1.6153453588485718, "rewards/rejected": -4.617727279663086, "step": 7140 }, { "epoch": 0.9356495567114862, "grad_norm": 20.125, "learning_rate": 6.264391468718628e-08, "logits/chosen": -1.6618503332138062, "logits/rejected": -1.6115306615829468, "logps/chosen": -640.6536865234375, "logps/rejected": -700.94384765625, "loss": 0.5506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.4430670738220215, "rewards/margins": 1.1901319026947021, "rewards/rejected": -4.6331987380981445, "step": 7150 }, { "epoch": 0.9369581574901037, "grad_norm": 11.8125, "learning_rate": 6.012838682574462e-08, "logits/chosen": -1.7844740152359009, "logits/rejected": -1.6745269298553467, "logps/chosen": -564.6143798828125, "logps/rejected": -648.6922607421875, "loss": 0.4641, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3967483043670654, "rewards/margins": 1.2174148559570312, "rewards/rejected": -4.614163398742676, "step": 7160 }, { "epoch": 0.9382667582687212, "grad_norm": 12.25, "learning_rate": 5.766379137449624e-08, "logits/chosen": -1.7170770168304443, "logits/rejected": -1.7587181329727173, "logps/chosen": -658.6851806640625, "logps/rejected": -731.87646484375, "loss": 0.5109, "rewards/accuracies": 0.625, "rewards/chosen": -3.522146701812744, "rewards/margins": 0.7771105766296387, "rewards/rejected": -4.299256801605225, "step": 7170 }, { "epoch": 0.9395753590473387, "grad_norm": 9.9375, "learning_rate": 5.525017978196295e-08, "logits/chosen": -1.8357822895050049, "logits/rejected": -1.6876189708709717, "logps/chosen": -580.4196166992188, "logps/rejected": -677.5750122070312, "loss": 0.451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.076944351196289, "rewards/margins": 1.3956979513168335, "rewards/rejected": -4.472642421722412, "step": 7180 }, { "epoch": 0.9408839598259561, "grad_norm": 7.3125, "learning_rate": 5.288760243237545e-08, "logits/chosen": -1.6035493612289429, "logits/rejected": -1.584979772567749, "logps/chosen": -513.9579467773438, "logps/rejected": -702.2100830078125, "loss": 0.4281, "rewards/accuracies": 0.75, "rewards/chosen": -3.179330825805664, "rewards/margins": 1.269091010093689, "rewards/rejected": -4.448421955108643, "step": 7190 }, { "epoch": 0.9421925606045736, "grad_norm": 19.0, "learning_rate": 5.0576108644623536e-08, "logits/chosen": -1.7256473302841187, "logits/rejected": -1.6372534036636353, "logps/chosen": -529.20947265625, "logps/rejected": -621.1188354492188, "loss": 0.5271, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3829283714294434, "rewards/margins": 0.9641639590263367, "rewards/rejected": -4.347092628479004, "step": 7200 }, { "epoch": 0.9421925606045736, "eval_logits/chosen": -1.3449697494506836, "eval_logits/rejected": -1.2255713939666748, "eval_logps/chosen": -593.798095703125, "eval_logps/rejected": -712.8786010742188, "eval_loss": 0.4838831424713135, "eval_rewards/accuracies": 0.7570000290870667, "eval_rewards/chosen": -3.2855334281921387, "eval_rewards/margins": 1.4312803745269775, "eval_rewards/rejected": -4.716813564300537, "eval_runtime": 695.703, "eval_samples_per_second": 2.875, "eval_steps_per_second": 0.18, "step": 7200 }, { "epoch": 0.9435011613831911, "grad_norm": 20.0, "learning_rate": 4.8315746671225296e-08, "logits/chosen": -1.814037561416626, "logits/rejected": -1.523754358291626, "logps/chosen": -550.5968627929688, "logps/rejected": -679.8839111328125, "loss": 0.4259, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.319317579269409, "rewards/margins": 1.4991252422332764, "rewards/rejected": -4.818442344665527, "step": 7210 }, { "epoch": 0.9448097621618085, "grad_norm": 14.9375, "learning_rate": 4.6106563697320695e-08, "logits/chosen": -1.8934953212738037, "logits/rejected": -1.8022511005401611, "logps/chosen": -556.5667724609375, "logps/rejected": -634.6939697265625, "loss": 0.4196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.3768115043640137, "rewards/margins": 0.8346894979476929, "rewards/rejected": -4.211501121520996, "step": 7220 }, { "epoch": 0.946118362940426, "grad_norm": 19.75, "learning_rate": 4.394860583968624e-08, "logits/chosen": -1.827109694480896, "logits/rejected": -1.785753607749939, "logps/chosen": -600.801025390625, "logps/rejected": -672.4212646484375, "loss": 0.568, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.569920063018799, "rewards/margins": 0.8448535799980164, "rewards/rejected": -4.414773464202881, "step": 7230 }, { "epoch": 0.9474269637190434, "grad_norm": 11.1875, "learning_rate": 4.1841918145771874e-08, "logits/chosen": -1.7053476572036743, "logits/rejected": -1.4699194431304932, "logps/chosen": -634.6055908203125, "logps/rejected": -658.261962890625, "loss": 0.5192, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.739218235015869, "rewards/margins": 0.7895745635032654, "rewards/rejected": -4.528792858123779, "step": 7240 }, { "epoch": 0.9487355644976608, "grad_norm": 5.59375, "learning_rate": 3.978654459276088e-08, "logits/chosen": -1.889049768447876, "logits/rejected": -1.567402958869934, "logps/chosen": -630.5608520507812, "logps/rejected": -736.9276123046875, "loss": 0.3828, "rewards/accuracies": 0.75, "rewards/chosen": -3.3330883979797363, "rewards/margins": 1.9904181957244873, "rewards/rejected": -5.323506832122803, "step": 7250 }, { "epoch": 0.9500441652762783, "grad_norm": 16.375, "learning_rate": 3.778252808665284e-08, "logits/chosen": -1.73220694065094, "logits/rejected": -1.7230247259140015, "logps/chosen": -584.671875, "logps/rejected": -663.2332153320312, "loss": 0.5825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.342334032058716, "rewards/margins": 0.8437948226928711, "rewards/rejected": -4.186129093170166, "step": 7260 }, { "epoch": 0.9513527660548958, "grad_norm": 8.0625, "learning_rate": 3.5829910461366023e-08, "logits/chosen": -1.7115774154663086, "logits/rejected": -1.4188563823699951, "logps/chosen": -625.66162109375, "logps/rejected": -713.1121826171875, "loss": 0.4107, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1217801570892334, "rewards/margins": 1.632599115371704, "rewards/rejected": -4.754378795623779, "step": 7270 }, { "epoch": 0.9526613668335132, "grad_norm": 17.625, "learning_rate": 3.39287324778656e-08, "logits/chosen": -1.824893593788147, "logits/rejected": -1.7379440069198608, "logps/chosen": -584.72705078125, "logps/rejected": -629.3406372070312, "loss": 0.5637, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0686302185058594, "rewards/margins": 0.8275834321975708, "rewards/rejected": -3.8962130546569824, "step": 7280 }, { "epoch": 0.9539699676121307, "grad_norm": 11.875, "learning_rate": 3.207903382331262e-08, "logits/chosen": -1.971449613571167, "logits/rejected": -1.7553508281707764, "logps/chosen": -546.5452270507812, "logps/rejected": -649.3677978515625, "loss": 0.4349, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.9974820613861084, "rewards/margins": 1.3490869998931885, "rewards/rejected": -4.346569061279297, "step": 7290 }, { "epoch": 0.9552785683907482, "grad_norm": 18.125, "learning_rate": 3.028085311023443e-08, "logits/chosen": -1.9943549633026123, "logits/rejected": -1.7560489177703857, "logps/chosen": -600.3551025390625, "logps/rejected": -693.0902709960938, "loss": 0.48, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.2099266052246094, "rewards/margins": 1.2887191772460938, "rewards/rejected": -4.498645782470703, "step": 7300 }, { "epoch": 0.9552785683907482, "eval_logits/chosen": -1.3494045734405518, "eval_logits/rejected": -1.230515718460083, "eval_logps/chosen": -593.845947265625, "eval_logps/rejected": -712.8812255859375, "eval_loss": 0.4840351343154907, "eval_rewards/accuracies": 0.7570000290870667, "eval_rewards/chosen": -3.2860116958618164, "eval_rewards/margins": 1.4308278560638428, "eval_rewards/rejected": -4.716839790344238, "eval_runtime": 695.9935, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.18, "step": 7300 }, { "epoch": 0.9565871691693657, "grad_norm": 7.4375, "learning_rate": 2.8534227875720576e-08, "logits/chosen": -1.7298179864883423, "logits/rejected": -1.6010799407958984, "logps/chosen": -614.6148071289062, "logps/rejected": -716.4520263671875, "loss": 0.5527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.5706825256347656, "rewards/margins": 0.9880639314651489, "rewards/rejected": -4.558746337890625, "step": 7310 }, { "epoch": 0.9578957699479831, "grad_norm": 18.5, "learning_rate": 2.683919458063705e-08, "logits/chosen": -1.5899951457977295, "logits/rejected": -1.561689019203186, "logps/chosen": -611.9046630859375, "logps/rejected": -658.8997192382812, "loss": 0.6295, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4796204566955566, "rewards/margins": 0.8144383430480957, "rewards/rejected": -4.294058799743652, "step": 7320 }, { "epoch": 0.9592043707266006, "grad_norm": 14.1875, "learning_rate": 2.5195788608866345e-08, "logits/chosen": -1.874956488609314, "logits/rejected": -1.6842498779296875, "logps/chosen": -657.554443359375, "logps/rejected": -760.7843017578125, "loss": 0.5998, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.7501330375671387, "rewards/margins": 1.5091949701309204, "rewards/rejected": -5.2593278884887695, "step": 7330 }, { "epoch": 0.9605129715052181, "grad_norm": 25.25, "learning_rate": 2.3604044266569426e-08, "logits/chosen": -1.8406044244766235, "logits/rejected": -1.5763709545135498, "logps/chosen": -604.3179931640625, "logps/rejected": -689.251953125, "loss": 0.5125, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3379364013671875, "rewards/margins": 1.3984531164169312, "rewards/rejected": -4.73638916015625, "step": 7340 }, { "epoch": 0.9618215722838355, "grad_norm": 33.0, "learning_rate": 2.2063994781468256e-08, "logits/chosen": -1.7047882080078125, "logits/rejected": -1.6373428106307983, "logps/chosen": -606.4323120117188, "logps/rejected": -745.7265625, "loss": 0.4598, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3001811504364014, "rewards/margins": 1.642462968826294, "rewards/rejected": -4.942644119262695, "step": 7350 }, { "epoch": 0.963130173062453, "grad_norm": 21.5, "learning_rate": 2.057567230215246e-08, "logits/chosen": -1.6557362079620361, "logits/rejected": -1.6185033321380615, "logps/chosen": -617.3782348632812, "logps/rejected": -760.5680541992188, "loss": 0.3649, "rewards/accuracies": 0.75, "rewards/chosen": -3.3665225505828857, "rewards/margins": 1.7739441394805908, "rewards/rejected": -5.140466690063477, "step": 7360 }, { "epoch": 0.9644387738410705, "grad_norm": 16.375, "learning_rate": 1.9139107897409303e-08, "logits/chosen": -1.7375398874282837, "logits/rejected": -1.4712655544281006, "logps/chosen": -659.6129150390625, "logps/rejected": -732.2275390625, "loss": 0.5395, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4675514698028564, "rewards/margins": 1.327845811843872, "rewards/rejected": -4.79539680480957, "step": 7370 }, { "epoch": 0.9657473746196878, "grad_norm": 15.25, "learning_rate": 1.7754331555573656e-08, "logits/chosen": -1.8118674755096436, "logits/rejected": -1.5070523023605347, "logps/chosen": -655.1289672851562, "logps/rejected": -703.8294067382812, "loss": 0.5138, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.405998945236206, "rewards/margins": 1.1343212127685547, "rewards/rejected": -4.54032039642334, "step": 7380 }, { "epoch": 0.9670559753983053, "grad_norm": 21.0, "learning_rate": 1.642137218390294e-08, "logits/chosen": -1.5985832214355469, "logits/rejected": -1.4141457080841064, "logps/chosen": -696.5066528320312, "logps/rejected": -766.3292846679688, "loss": 0.5185, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.6868484020233154, "rewards/margins": 1.0530002117156982, "rewards/rejected": -4.7398481369018555, "step": 7390 }, { "epoch": 0.9683645761769228, "grad_norm": 12.25, "learning_rate": 1.514025760797344e-08, "logits/chosen": -1.8637206554412842, "logits/rejected": -1.7864269018173218, "logps/chosen": -659.1239624023438, "logps/rejected": -753.0234985351562, "loss": 0.4415, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.5155022144317627, "rewards/margins": 1.0285392999649048, "rewards/rejected": -4.544041156768799, "step": 7400 }, { "epoch": 0.9683645761769228, "eval_logits/chosen": -1.3390041589736938, "eval_logits/rejected": -1.218957543373108, "eval_logps/chosen": -593.6599731445312, "eval_logps/rejected": -712.6990966796875, "eval_loss": 0.4839739501476288, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.284152030944824, "eval_rewards/margins": 1.4308661222457886, "eval_rewards/rejected": -4.7150187492370605, "eval_runtime": 694.669, "eval_samples_per_second": 2.879, "eval_steps_per_second": 0.18, "step": 7400 }, { "epoch": 0.9696731769555403, "grad_norm": 14.5625, "learning_rate": 1.3911014571098835e-08, "logits/chosen": -1.6002109050750732, "logits/rejected": -1.6819194555282593, "logps/chosen": -542.8758544921875, "logps/rejected": -652.9110107421875, "loss": 0.5392, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.405431032180786, "rewards/margins": 1.0547492504119873, "rewards/rejected": -4.460180282592773, "step": 7410 }, { "epoch": 0.9709817777341577, "grad_norm": 25.625, "learning_rate": 1.2733668733773685e-08, "logits/chosen": -1.6041558980941772, "logits/rejected": -1.500565767288208, "logps/chosen": -561.9778442382812, "logps/rejected": -674.3802490234375, "loss": 0.4576, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3195719718933105, "rewards/margins": 1.3539068698883057, "rewards/rejected": -4.673478603363037, "step": 7420 }, { "epoch": 0.9722903785127752, "grad_norm": 15.5, "learning_rate": 1.160824467313526e-08, "logits/chosen": -1.715045690536499, "logits/rejected": -1.2474496364593506, "logps/chosen": -654.7431640625, "logps/rejected": -674.5675659179688, "loss": 0.6205, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.524798631668091, "rewards/margins": 1.2327511310577393, "rewards/rejected": -4.75754976272583, "step": 7430 }, { "epoch": 0.9735989792913927, "grad_norm": 13.1875, "learning_rate": 1.0534765882453113e-08, "logits/chosen": -1.5365979671478271, "logits/rejected": -1.1959383487701416, "logps/chosen": -587.2747802734375, "logps/rejected": -693.1444091796875, "loss": 0.4396, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.5025527477264404, "rewards/margins": 1.5514488220214844, "rewards/rejected": -5.054001808166504, "step": 7440 }, { "epoch": 0.9749075800700101, "grad_norm": 11.75, "learning_rate": 9.513254770636138e-09, "logits/chosen": -1.5698086023330688, "logits/rejected": -1.4948017597198486, "logps/chosen": -581.23974609375, "logps/rejected": -672.81787109375, "loss": 0.416, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.199650287628174, "rewards/margins": 1.153997778892517, "rewards/rejected": -4.3536481857299805, "step": 7450 }, { "epoch": 0.9762161808486276, "grad_norm": 11.75, "learning_rate": 8.543732661767113e-09, "logits/chosen": -1.7190850973129272, "logits/rejected": -1.5902774333953857, "logps/chosen": -569.220703125, "logps/rejected": -650.7015380859375, "loss": 0.5508, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.315713405609131, "rewards/margins": 1.1711820363998413, "rewards/rejected": -4.48689603805542, "step": 7460 }, { "epoch": 0.9775247816272451, "grad_norm": 9.375, "learning_rate": 7.626219794655553e-09, "logits/chosen": -1.8017486333847046, "logits/rejected": -1.5825135707855225, "logps/chosen": -618.5581665039062, "logps/rejected": -707.8232421875, "loss": 0.4229, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.18318772315979, "rewards/margins": 1.571376919746399, "rewards/rejected": -4.75456428527832, "step": 7470 }, { "epoch": 0.9788333824058625, "grad_norm": 12.0, "learning_rate": 6.7607353224163896e-09, "logits/chosen": -1.7092714309692383, "logits/rejected": -1.4618524312973022, "logps/chosen": -558.8264770507812, "logps/rejected": -667.6316528320312, "loss": 0.5093, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.3021621704101562, "rewards/margins": 1.2308542728424072, "rewards/rejected": -4.533016204833984, "step": 7480 }, { "epoch": 0.98014198318448, "grad_norm": 21.875, "learning_rate": 5.947297312070554e-09, "logits/chosen": -1.9320802688598633, "logits/rejected": -1.6934759616851807, "logps/chosen": -664.6417236328125, "logps/rejected": -742.8804931640625, "loss": 0.4497, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.412024736404419, "rewards/margins": 1.3945446014404297, "rewards/rejected": -4.8065690994262695, "step": 7490 }, { "epoch": 0.9814505839630975, "grad_norm": 20.0, "learning_rate": 5.185922744166128e-09, "logits/chosen": -1.9502627849578857, "logits/rejected": -1.6353776454925537, "logps/chosen": -665.599365234375, "logps/rejected": -701.9202880859375, "loss": 0.4848, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.6686770915985107, "rewards/margins": 0.928602397441864, "rewards/rejected": -4.5972795486450195, "step": 7500 }, { "epoch": 0.9814505839630975, "eval_logits/chosen": -1.3430283069610596, "eval_logits/rejected": -1.223465085029602, "eval_logps/chosen": -593.8440551757812, "eval_logps/rejected": -712.92724609375, "eval_loss": 0.48398536443710327, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -3.2859933376312256, "eval_rewards/margins": 1.43130624294281, "eval_rewards/rejected": -4.717299461364746, "eval_runtime": 695.1547, "eval_samples_per_second": 2.877, "eval_steps_per_second": 0.18, "step": 7500 }, { "epoch": 0.982759184741715, "grad_norm": 13.4375, "learning_rate": 4.476627512425558e-09, "logits/chosen": -1.9058698415756226, "logits/rejected": -1.7457468509674072, "logps/chosen": -593.4295043945312, "logps/rejected": -728.8865966796875, "loss": 0.4764, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.221513032913208, "rewards/margins": 1.6537725925445557, "rewards/rejected": -4.8752851486206055, "step": 7510 }, { "epoch": 0.9840677855203324, "grad_norm": 14.5625, "learning_rate": 3.819426423412875e-09, "logits/chosen": -1.9293270111083984, "logits/rejected": -1.695984125137329, "logps/chosen": -639.421142578125, "logps/rejected": -720.97900390625, "loss": 0.5423, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.3547701835632324, "rewards/margins": 1.4750053882598877, "rewards/rejected": -4.829775810241699, "step": 7520 }, { "epoch": 0.9853763862989499, "grad_norm": 14.4375, "learning_rate": 3.2143331962256053e-09, "logits/chosen": -1.8710296154022217, "logits/rejected": -1.6305862665176392, "logps/chosen": -605.0763549804688, "logps/rejected": -734.7957763671875, "loss": 0.3468, "rewards/accuracies": 0.875, "rewards/chosen": -3.137573719024658, "rewards/margins": 1.5910383462905884, "rewards/rejected": -4.728612422943115, "step": 7530 }, { "epoch": 0.9866849870775674, "grad_norm": 14.0625, "learning_rate": 2.6613604622066635e-09, "logits/chosen": -1.6185529232025146, "logits/rejected": -1.4577267169952393, "logps/chosen": -609.0463256835938, "logps/rejected": -776.7925415039062, "loss": 0.4934, "rewards/accuracies": 0.75, "rewards/chosen": -3.294677734375, "rewards/margins": 1.4553929567337036, "rewards/rejected": -4.7500715255737305, "step": 7540 }, { "epoch": 0.9879935878561847, "grad_norm": 12.5625, "learning_rate": 2.1605197646826228e-09, "logits/chosen": -1.8908064365386963, "logits/rejected": -1.7482397556304932, "logps/chosen": -613.5296020507812, "logps/rejected": -707.9548950195312, "loss": 0.5574, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3217430114746094, "rewards/margins": 0.9488459825515747, "rewards/rejected": -4.270589351654053, "step": 7550 }, { "epoch": 0.9893021886348022, "grad_norm": 7.6875, "learning_rate": 1.711821558721405e-09, "logits/chosen": -1.7171605825424194, "logits/rejected": -1.6740245819091797, "logps/chosen": -604.8802490234375, "logps/rejected": -764.9961547851562, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": -3.381549835205078, "rewards/margins": 1.3844246864318848, "rewards/rejected": -4.765974521636963, "step": 7560 }, { "epoch": 0.9906107894134197, "grad_norm": 15.75, "learning_rate": 1.3152752109149569e-09, "logits/chosen": -1.6451746225357056, "logits/rejected": -1.2619988918304443, "logps/chosen": -618.6143188476562, "logps/rejected": -691.6968994140625, "loss": 0.4604, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.460538387298584, "rewards/margins": 1.4919283390045166, "rewards/rejected": -4.95246696472168, "step": 7570 }, { "epoch": 0.9919193901920371, "grad_norm": 15.9375, "learning_rate": 9.708889991830173e-10, "logits/chosen": -1.6769351959228516, "logits/rejected": -1.7159042358398438, "logps/chosen": -591.6790771484375, "logps/rejected": -642.5636596679688, "loss": 0.4979, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.346444606781006, "rewards/margins": 0.9449462890625, "rewards/rejected": -4.291390895843506, "step": 7580 }, { "epoch": 0.9932279909706546, "grad_norm": 19.25, "learning_rate": 6.786701125999218e-10, "logits/chosen": -1.8857370615005493, "logits/rejected": -1.9316622018814087, "logps/chosen": -600.1328125, "logps/rejected": -710.33251953125, "loss": 0.4434, "rewards/accuracies": 0.625, "rewards/chosen": -3.4482452869415283, "rewards/margins": 0.9919483065605164, "rewards/rejected": -4.440194129943848, "step": 7590 }, { "epoch": 0.9945365917492721, "grad_norm": 9.0, "learning_rate": 4.3862465124638873e-10, "logits/chosen": -1.692299246788025, "logits/rejected": -1.6337816715240479, "logps/chosen": -561.1657104492188, "logps/rejected": -646.8262939453125, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -3.3429362773895264, "rewards/margins": 1.1995925903320312, "rewards/rejected": -4.5425286293029785, "step": 7600 }, { "epoch": 0.9945365917492721, "eval_logits/chosen": -1.3436263799667358, "eval_logits/rejected": -1.2241634130477905, "eval_logps/chosen": -593.8394775390625, "eval_logps/rejected": -712.9290161132812, "eval_loss": 0.4839686453342438, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -3.285947561264038, "eval_rewards/margins": 1.4313703775405884, "eval_rewards/rejected": -4.717318058013916, "eval_runtime": 696.6986, "eval_samples_per_second": 2.871, "eval_steps_per_second": 0.179, "step": 7600 }, { "epoch": 0.9958451925278895, "grad_norm": 12.8125, "learning_rate": 2.507576260799005e-10, "logits/chosen": -1.6740825176239014, "logits/rejected": -1.4562678337097168, "logps/chosen": -625.2222900390625, "logps/rejected": -723.4585571289062, "loss": 0.4974, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.37178111076355, "rewards/margins": 1.403822422027588, "rewards/rejected": -4.775603294372559, "step": 7610 }, { "epoch": 0.997153793306507, "grad_norm": 15.0625, "learning_rate": 1.1507295883145253e-10, "logits/chosen": -1.7838990688323975, "logits/rejected": -1.7511736154556274, "logps/chosen": -600.767333984375, "logps/rejected": -677.8153686523438, "loss": 0.5231, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3161072731018066, "rewards/margins": 1.0001850128173828, "rewards/rejected": -4.3162922859191895, "step": 7620 }, { "epoch": 0.9984623940851245, "grad_norm": 12.9375, "learning_rate": 3.1573481923952156e-11, "logits/chosen": -1.7147305011749268, "logits/rejected": -1.479693055152893, "logps/chosen": -678.3225708007812, "logps/rejected": -770.458251953125, "loss": 0.4975, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.4338126182556152, "rewards/margins": 1.3637737035751343, "rewards/rejected": -4.797586441040039, "step": 7630 }, { "epoch": 0.999770994863742, "grad_norm": 7.375, "learning_rate": 2.609384119889313e-13, "logits/chosen": -1.666529893875122, "logits/rejected": -1.4268560409545898, "logps/chosen": -683.90380859375, "logps/rejected": -767.3264770507812, "loss": 0.4479, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.568126678466797, "rewards/margins": 1.4261468648910522, "rewards/rejected": -4.9942731857299805, "step": 7640 }, { "epoch": 0.9999018549416037, "step": 7641, "total_flos": 0.0, "train_loss": 0.5203102414925643, "train_runtime": 90452.7583, "train_samples_per_second": 0.676, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 7641, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }