{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.964444444444444, "eval_steps": 100, "global_step": 1344, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005925925925925926, "grad_norm": 42.43893981091583, "learning_rate": 3.7037037037037036e-09, "logits/chosen": -1.310781478881836, "logits/rejected": -1.393431305885315, "logps/chosen": -52.985904693603516, "logps/rejected": -57.095699310302734, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.011851851851851851, "grad_norm": 35.96864569231992, "learning_rate": 7.407407407407407e-09, "logits/chosen": -1.4714622497558594, "logits/rejected": -1.5260519981384277, "logps/chosen": -50.45790100097656, "logps/rejected": -54.5156135559082, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.017777777777777778, "grad_norm": 39.18444558531369, "learning_rate": 1.111111111111111e-08, "logits/chosen": -1.4416756629943848, "logits/rejected": -1.3762359619140625, "logps/chosen": -45.7575798034668, "logps/rejected": -60.49638748168945, "loss": 0.6978, "rewards/accuracies": 0.4375, "rewards/chosen": -0.011297130957245827, "rewards/margins": -0.006299828179180622, "rewards/rejected": -0.00499730184674263, "step": 3 }, { "epoch": 0.023703703703703703, "grad_norm": 39.65552735879418, "learning_rate": 1.4814814814814814e-08, "logits/chosen": -1.4207963943481445, "logits/rejected": -1.5604076385498047, "logps/chosen": -37.122886657714844, "logps/rejected": -54.21310043334961, "loss": 0.7077, "rewards/accuracies": 0.5, "rewards/chosen": 0.026990916579961777, "rewards/margins": -0.015975551679730415, "rewards/rejected": 0.042966462671756744, "step": 4 }, { "epoch": 0.02962962962962963, "grad_norm": 36.596040564401264, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -1.4846137762069702, "logits/rejected": -1.4979444742202759, "logps/chosen": -44.22126388549805, "logps/rejected": -55.1204833984375, "loss": 0.6865, "rewards/accuracies": 0.4375, "rewards/chosen": 0.007357514463365078, "rewards/margins": 0.031326450407505035, "rewards/rejected": -0.023968935012817383, "step": 5 }, { "epoch": 0.035555555555555556, "grad_norm": 36.320492249631755, "learning_rate": 2.222222222222222e-08, "logits/chosen": -1.4549423456192017, "logits/rejected": -1.4336495399475098, "logps/chosen": -42.29203414916992, "logps/rejected": -40.219139099121094, "loss": 0.691, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005795812234282494, "rewards/margins": -0.029732275754213333, "rewards/rejected": 0.03552808612585068, "step": 6 }, { "epoch": 0.04148148148148148, "grad_norm": 34.8509598127472, "learning_rate": 2.5925925925925923e-08, "logits/chosen": -1.5144797563552856, "logits/rejected": -1.5598227977752686, "logps/chosen": -46.01911163330078, "logps/rejected": -48.391319274902344, "loss": 0.6861, "rewards/accuracies": 0.5625, "rewards/chosen": -0.021584009751677513, "rewards/margins": -0.008964895270764828, "rewards/rejected": -0.01261911354959011, "step": 7 }, { "epoch": 0.047407407407407405, "grad_norm": 41.834639343315644, "learning_rate": 2.962962962962963e-08, "logits/chosen": -1.5673420429229736, "logits/rejected": -1.5740389823913574, "logps/chosen": -44.32832717895508, "logps/rejected": -58.74522399902344, "loss": 0.7043, "rewards/accuracies": 0.375, "rewards/chosen": -0.01198368240147829, "rewards/margins": -0.015015724115073681, "rewards/rejected": 0.0030320417135953903, "step": 8 }, { "epoch": 0.05333333333333334, "grad_norm": 38.052460837465965, "learning_rate": 3.3333333333333334e-08, "logits/chosen": -1.5391089916229248, "logits/rejected": -1.533707857131958, "logps/chosen": -62.247337341308594, "logps/rejected": -66.13418579101562, "loss": 0.7055, "rewards/accuracies": 0.6875, "rewards/chosen": 0.042546942830085754, "rewards/margins": 0.04441840201616287, "rewards/rejected": -0.0018714680336415768, "step": 9 }, { "epoch": 0.05925925925925926, "grad_norm": 37.13655017506449, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -1.3837740421295166, "logits/rejected": -1.4348113536834717, "logps/chosen": -57.03742599487305, "logps/rejected": -53.65230178833008, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": -0.038426827639341354, "rewards/margins": 0.047683194279670715, "rewards/rejected": -0.08611002564430237, "step": 10 }, { "epoch": 0.06518518518518518, "grad_norm": 41.67417730129325, "learning_rate": 4.0740740740740745e-08, "logits/chosen": -1.591475248336792, "logits/rejected": -1.522184133529663, "logps/chosen": -55.395172119140625, "logps/rejected": -50.715797424316406, "loss": 0.6957, "rewards/accuracies": 0.5, "rewards/chosen": 0.012220479547977448, "rewards/margins": -0.010918429121375084, "rewards/rejected": 0.023138903081417084, "step": 11 }, { "epoch": 0.07111111111111111, "grad_norm": 42.17041570253324, "learning_rate": 4.444444444444444e-08, "logits/chosen": -1.4714893102645874, "logits/rejected": -1.5695397853851318, "logps/chosen": -50.417884826660156, "logps/rejected": -50.72690200805664, "loss": 0.7116, "rewards/accuracies": 0.4375, "rewards/chosen": -0.003867245279252529, "rewards/margins": -0.008899686858057976, "rewards/rejected": 0.005032443441450596, "step": 12 }, { "epoch": 0.07703703703703704, "grad_norm": 34.71785885257626, "learning_rate": 4.814814814814814e-08, "logits/chosen": -1.4874341487884521, "logits/rejected": -1.5213749408721924, "logps/chosen": -38.02253341674805, "logps/rejected": -43.72047424316406, "loss": 0.695, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0014143595471978188, "rewards/margins": -0.03884340822696686, "rewards/rejected": 0.037429049611091614, "step": 13 }, { "epoch": 0.08296296296296296, "grad_norm": 42.675804996783135, "learning_rate": 5.1851851851851846e-08, "logits/chosen": -1.554866075515747, "logits/rejected": -1.5048046112060547, "logps/chosen": -46.5153694152832, "logps/rejected": -63.505584716796875, "loss": 0.7035, "rewards/accuracies": 0.625, "rewards/chosen": 0.020593512803316116, "rewards/margins": 0.015447389334440231, "rewards/rejected": 0.005146123003214598, "step": 14 }, { "epoch": 0.08888888888888889, "grad_norm": 39.68741922264209, "learning_rate": 5.555555555555555e-08, "logits/chosen": -1.4316121339797974, "logits/rejected": -1.4272632598876953, "logps/chosen": -43.43450927734375, "logps/rejected": -52.10813903808594, "loss": 0.6677, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05863571539521217, "rewards/margins": 0.10169437527656555, "rewards/rejected": -0.04305865988135338, "step": 15 }, { "epoch": 0.09481481481481481, "grad_norm": 37.66976673357867, "learning_rate": 5.925925925925926e-08, "logits/chosen": -1.4010733366012573, "logits/rejected": -1.4859191179275513, "logps/chosen": -44.7211799621582, "logps/rejected": -54.39717102050781, "loss": 0.7125, "rewards/accuracies": 0.4375, "rewards/chosen": -0.021378686651587486, "rewards/margins": -0.03166306018829346, "rewards/rejected": 0.010284376330673695, "step": 16 }, { "epoch": 0.10074074074074074, "grad_norm": 40.434320443594345, "learning_rate": 6.296296296296296e-08, "logits/chosen": -1.587019681930542, "logits/rejected": -1.583219289779663, "logps/chosen": -56.18910598754883, "logps/rejected": -54.055110931396484, "loss": 0.69, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02960667759180069, "rewards/margins": 0.0984027162194252, "rewards/rejected": -0.06879603862762451, "step": 17 }, { "epoch": 0.10666666666666667, "grad_norm": 38.09471520711338, "learning_rate": 6.666666666666667e-08, "logits/chosen": -1.4128365516662598, "logits/rejected": -1.4133434295654297, "logps/chosen": -60.68763732910156, "logps/rejected": -49.51874542236328, "loss": 0.6935, "rewards/accuracies": 0.5625, "rewards/chosen": 0.01688566617667675, "rewards/margins": -0.001714322715997696, "rewards/rejected": 0.018599988892674446, "step": 18 }, { "epoch": 0.11259259259259259, "grad_norm": 39.11514807718614, "learning_rate": 7.037037037037038e-08, "logits/chosen": -1.489649772644043, "logits/rejected": -1.4198248386383057, "logps/chosen": -50.41545867919922, "logps/rejected": -52.61773681640625, "loss": 0.6957, "rewards/accuracies": 0.625, "rewards/chosen": 0.07306747138500214, "rewards/margins": 0.03087785467505455, "rewards/rejected": 0.042189620435237885, "step": 19 }, { "epoch": 0.11851851851851852, "grad_norm": 39.97363440787134, "learning_rate": 7.407407407407407e-08, "logits/chosen": -1.3663185834884644, "logits/rejected": -1.535215973854065, "logps/chosen": -43.01957702636719, "logps/rejected": -57.80477523803711, "loss": 0.6968, "rewards/accuracies": 0.3125, "rewards/chosen": -0.0261086318641901, "rewards/margins": -0.05911754071712494, "rewards/rejected": 0.03300891071557999, "step": 20 }, { "epoch": 0.12444444444444444, "grad_norm": 41.07559338556434, "learning_rate": 7.777777777777778e-08, "logits/chosen": -1.3014625310897827, "logits/rejected": -1.3382461071014404, "logps/chosen": -49.614036560058594, "logps/rejected": -62.0330810546875, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": 0.022783983498811722, "rewards/margins": 0.005002446472644806, "rewards/rejected": 0.017781544476747513, "step": 21 }, { "epoch": 0.13037037037037036, "grad_norm": 38.35221350226358, "learning_rate": 8.148148148148149e-08, "logits/chosen": -1.3671640157699585, "logits/rejected": -1.4167208671569824, "logps/chosen": -36.43228530883789, "logps/rejected": -43.55463409423828, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.04212541878223419, "rewards/margins": 0.004528047516942024, "rewards/rejected": 0.03759737312793732, "step": 22 }, { "epoch": 0.1362962962962963, "grad_norm": 38.57262376611147, "learning_rate": 8.518518518518517e-08, "logits/chosen": -1.4059712886810303, "logits/rejected": -1.4144426584243774, "logps/chosen": -43.566017150878906, "logps/rejected": -51.942710876464844, "loss": 0.6972, "rewards/accuracies": 0.4375, "rewards/chosen": -4.487065598368645e-05, "rewards/margins": -0.00031371042132377625, "rewards/rejected": 0.00026884046383202076, "step": 23 }, { "epoch": 0.14222222222222222, "grad_norm": 38.93480155482321, "learning_rate": 8.888888888888888e-08, "logits/chosen": -1.4173849821090698, "logits/rejected": -1.436640977859497, "logps/chosen": -49.499488830566406, "logps/rejected": -61.60590362548828, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": -0.043488118797540665, "rewards/margins": 0.019792892038822174, "rewards/rejected": -0.06328101456165314, "step": 24 }, { "epoch": 0.14814814814814814, "grad_norm": 36.27802321770151, "learning_rate": 9.259259259259258e-08, "logits/chosen": -1.4764326810836792, "logits/rejected": -1.6459614038467407, "logps/chosen": -43.81329345703125, "logps/rejected": -57.16102981567383, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 0.007635999470949173, "rewards/margins": 0.027008555829524994, "rewards/rejected": -0.01937256008386612, "step": 25 }, { "epoch": 0.15407407407407409, "grad_norm": 41.196033400227655, "learning_rate": 9.629629629629629e-08, "logits/chosen": -1.4573525190353394, "logits/rejected": -1.5029797554016113, "logps/chosen": -53.4759521484375, "logps/rejected": -60.73443603515625, "loss": 0.7043, "rewards/accuracies": 0.625, "rewards/chosen": 0.01736607775092125, "rewards/margins": 0.027802232652902603, "rewards/rejected": -0.010436153039336205, "step": 26 }, { "epoch": 0.16, "grad_norm": 44.1756798619816, "learning_rate": 1e-07, "logits/chosen": -1.467996597290039, "logits/rejected": -1.4205275774002075, "logps/chosen": -59.43016815185547, "logps/rejected": -52.35725784301758, "loss": 0.69, "rewards/accuracies": 0.6875, "rewards/chosen": 0.014531444758176804, "rewards/margins": 0.015589211136102676, "rewards/rejected": -0.001057768240571022, "step": 27 }, { "epoch": 0.16592592592592592, "grad_norm": 38.02358706006054, "learning_rate": 1.0370370370370369e-07, "logits/chosen": -1.4217634201049805, "logits/rejected": -1.4778153896331787, "logps/chosen": -45.79845428466797, "logps/rejected": -53.97588348388672, "loss": 0.7112, "rewards/accuracies": 0.25, "rewards/chosen": 0.012649372220039368, "rewards/margins": -0.03449837863445282, "rewards/rejected": 0.04714775085449219, "step": 28 }, { "epoch": 0.17185185185185184, "grad_norm": 36.83123424027711, "learning_rate": 1.074074074074074e-07, "logits/chosen": -1.3519933223724365, "logits/rejected": -1.4353563785552979, "logps/chosen": -33.142181396484375, "logps/rejected": -50.9314079284668, "loss": 0.6873, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014912517741322517, "rewards/margins": 0.017467448487877846, "rewards/rejected": -0.032379962503910065, "step": 29 }, { "epoch": 0.17777777777777778, "grad_norm": 38.82944235868051, "learning_rate": 1.111111111111111e-07, "logits/chosen": -1.3293776512145996, "logits/rejected": -1.4100325107574463, "logps/chosen": -45.714385986328125, "logps/rejected": -62.283958435058594, "loss": 0.688, "rewards/accuracies": 0.5, "rewards/chosen": -0.007679283618927002, "rewards/margins": -0.010211003012955189, "rewards/rejected": 0.002531719394028187, "step": 30 }, { "epoch": 0.1837037037037037, "grad_norm": 41.525322137451916, "learning_rate": 1.148148148148148e-07, "logits/chosen": -1.2476561069488525, "logits/rejected": -1.365776538848877, "logps/chosen": -42.67413330078125, "logps/rejected": -58.3412971496582, "loss": 0.7013, "rewards/accuracies": 0.625, "rewards/chosen": -0.019379708915948868, "rewards/margins": 0.01881086826324463, "rewards/rejected": -0.038190580904483795, "step": 31 }, { "epoch": 0.18962962962962962, "grad_norm": 37.28625199471067, "learning_rate": 1.1851851851851851e-07, "logits/chosen": -1.4711147546768188, "logits/rejected": -1.4542691707611084, "logps/chosen": -57.69325256347656, "logps/rejected": -59.4173698425293, "loss": 0.6773, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008729221299290657, "rewards/margins": 0.050374746322631836, "rewards/rejected": -0.059103965759277344, "step": 32 }, { "epoch": 0.19555555555555557, "grad_norm": 38.71452491388633, "learning_rate": 1.2222222222222222e-07, "logits/chosen": -1.2580482959747314, "logits/rejected": -1.3434240818023682, "logps/chosen": -50.910953521728516, "logps/rejected": -48.82550811767578, "loss": 0.6934, "rewards/accuracies": 0.5625, "rewards/chosen": 0.006866408511996269, "rewards/margins": 0.014877223409712315, "rewards/rejected": -0.008010816760361195, "step": 33 }, { "epoch": 0.20148148148148148, "grad_norm": 35.86254457726259, "learning_rate": 1.2592592592592592e-07, "logits/chosen": -1.5303350687026978, "logits/rejected": -1.5936583280563354, "logps/chosen": -44.84267044067383, "logps/rejected": -43.60268783569336, "loss": 0.6793, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010646509006619453, "rewards/margins": -0.0017105825245380402, "rewards/rejected": 0.012357092462480068, "step": 34 }, { "epoch": 0.2074074074074074, "grad_norm": 40.452247232561945, "learning_rate": 1.2962962962962961e-07, "logits/chosen": -1.4947465658187866, "logits/rejected": -1.5485597848892212, "logps/chosen": -46.03197479248047, "logps/rejected": -60.23851013183594, "loss": 0.7077, "rewards/accuracies": 0.375, "rewards/chosen": -0.008130693808197975, "rewards/margins": -0.0018020663410425186, "rewards/rejected": -0.006328627001494169, "step": 35 }, { "epoch": 0.21333333333333335, "grad_norm": 37.119370880492134, "learning_rate": 1.3333333333333334e-07, "logits/chosen": -1.3620657920837402, "logits/rejected": -1.3718904256820679, "logps/chosen": -42.104820251464844, "logps/rejected": -60.524169921875, "loss": 0.7101, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03648862987756729, "rewards/margins": -0.007312392815947533, "rewards/rejected": 0.04380102455615997, "step": 36 }, { "epoch": 0.21925925925925926, "grad_norm": 36.94520667301027, "learning_rate": 1.3703703703703703e-07, "logits/chosen": -1.5933514833450317, "logits/rejected": -1.5976018905639648, "logps/chosen": -38.13121032714844, "logps/rejected": -50.772254943847656, "loss": 0.694, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006040811538696289, "rewards/margins": 0.006438873242586851, "rewards/rejected": -0.012479686178267002, "step": 37 }, { "epoch": 0.22518518518518518, "grad_norm": 39.60573256682946, "learning_rate": 1.4074074074074075e-07, "logits/chosen": -1.5454288721084595, "logits/rejected": -1.6323232650756836, "logps/chosen": -54.06110382080078, "logps/rejected": -52.358184814453125, "loss": 0.7076, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004916094243526459, "rewards/margins": -0.002387760207056999, "rewards/rejected": -0.0025283321738243103, "step": 38 }, { "epoch": 0.2311111111111111, "grad_norm": 35.63198458544882, "learning_rate": 1.4444444444444442e-07, "logits/chosen": -1.275139331817627, "logits/rejected": -1.3453179597854614, "logps/chosen": -42.013126373291016, "logps/rejected": -48.47417068481445, "loss": 0.7046, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0050153713673353195, "rewards/margins": -0.053548287600278854, "rewards/rejected": 0.048532914370298386, "step": 39 }, { "epoch": 0.23703703703703705, "grad_norm": 38.43712146198748, "learning_rate": 1.4814814814814815e-07, "logits/chosen": -1.41544771194458, "logits/rejected": -1.4644105434417725, "logps/chosen": -51.85929870605469, "logps/rejected": -68.28307342529297, "loss": 0.6958, "rewards/accuracies": 0.5625, "rewards/chosen": 0.028657151386141777, "rewards/margins": -0.009387994185090065, "rewards/rejected": 0.03804514557123184, "step": 40 }, { "epoch": 0.24296296296296296, "grad_norm": 38.20341525404962, "learning_rate": 1.5185185185185184e-07, "logits/chosen": -1.6255518198013306, "logits/rejected": -1.5988264083862305, "logps/chosen": -53.897212982177734, "logps/rejected": -60.99618148803711, "loss": 0.7005, "rewards/accuracies": 0.375, "rewards/chosen": 0.009833859279751778, "rewards/margins": -0.0012347670271992683, "rewards/rejected": 0.011068630963563919, "step": 41 }, { "epoch": 0.24888888888888888, "grad_norm": 39.044305355608955, "learning_rate": 1.5555555555555556e-07, "logits/chosen": -1.3904306888580322, "logits/rejected": -1.4621787071228027, "logps/chosen": -53.870452880859375, "logps/rejected": -57.930389404296875, "loss": 0.6956, "rewards/accuracies": 0.625, "rewards/chosen": 0.030896497890353203, "rewards/margins": 0.02933361567556858, "rewards/rejected": 0.001562881050631404, "step": 42 }, { "epoch": 0.2548148148148148, "grad_norm": 35.3898254807426, "learning_rate": 1.5925925925925926e-07, "logits/chosen": -1.6222436428070068, "logits/rejected": -1.590828776359558, "logps/chosen": -34.60671615600586, "logps/rejected": -40.5382194519043, "loss": 0.6809, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01461029052734375, "rewards/margins": 0.029801419004797935, "rewards/rejected": -0.04441170394420624, "step": 43 }, { "epoch": 0.2607407407407407, "grad_norm": 39.131055743183715, "learning_rate": 1.6296296296296298e-07, "logits/chosen": -1.4192861318588257, "logits/rejected": -1.3798493146896362, "logps/chosen": -39.57558059692383, "logps/rejected": -57.1050910949707, "loss": 0.6817, "rewards/accuracies": 0.625, "rewards/chosen": 0.06627483665943146, "rewards/margins": 0.06393895298242569, "rewards/rejected": 0.0023358799517154694, "step": 44 }, { "epoch": 0.26666666666666666, "grad_norm": 39.36639547497787, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -1.402430772781372, "logits/rejected": -1.4567971229553223, "logps/chosen": -42.92563247680664, "logps/rejected": -59.93791580200195, "loss": 0.7016, "rewards/accuracies": 0.4375, "rewards/chosen": 0.012485409155488014, "rewards/margins": 0.012214185670018196, "rewards/rejected": 0.00027122534811496735, "step": 45 }, { "epoch": 0.2725925925925926, "grad_norm": 38.47445875334048, "learning_rate": 1.7037037037037035e-07, "logits/chosen": -1.4151440858840942, "logits/rejected": -1.458237648010254, "logps/chosen": -55.77102279663086, "logps/rejected": -57.44105911254883, "loss": 0.6955, "rewards/accuracies": 0.5625, "rewards/chosen": 0.015422536060214043, "rewards/margins": 0.024210453033447266, "rewards/rejected": -0.008787919767200947, "step": 46 }, { "epoch": 0.2785185185185185, "grad_norm": 35.573818690899394, "learning_rate": 1.7407407407407407e-07, "logits/chosen": -1.395029067993164, "logits/rejected": -1.434049367904663, "logps/chosen": -61.55660629272461, "logps/rejected": -63.84947967529297, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": 0.03462505340576172, "rewards/margins": 0.07204857468605042, "rewards/rejected": -0.0374235175549984, "step": 47 }, { "epoch": 0.28444444444444444, "grad_norm": 43.30636407908927, "learning_rate": 1.7777777777777776e-07, "logits/chosen": -1.4949756860733032, "logits/rejected": -1.5911140441894531, "logps/chosen": -47.98042297363281, "logps/rejected": -54.70000457763672, "loss": 0.6929, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04974164813756943, "rewards/margins": 0.1031041219830513, "rewards/rejected": -0.053362466394901276, "step": 48 }, { "epoch": 0.2903703703703704, "grad_norm": 39.70891417030015, "learning_rate": 1.8148148148148149e-07, "logits/chosen": -1.4629085063934326, "logits/rejected": -1.4793230295181274, "logps/chosen": -42.80840301513672, "logps/rejected": -49.654842376708984, "loss": 0.693, "rewards/accuracies": 0.625, "rewards/chosen": 0.04174475744366646, "rewards/margins": 0.03507488965988159, "rewards/rejected": 0.006669867318123579, "step": 49 }, { "epoch": 0.2962962962962963, "grad_norm": 37.54848826529638, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -1.2061647176742554, "logits/rejected": -1.3068634271621704, "logps/chosen": -33.46271514892578, "logps/rejected": -49.604408264160156, "loss": 0.6726, "rewards/accuracies": 0.625, "rewards/chosen": 0.014253831468522549, "rewards/margins": 0.03934905678033829, "rewards/rejected": -0.025095226243138313, "step": 50 }, { "epoch": 0.3022222222222222, "grad_norm": 37.46608721291794, "learning_rate": 1.8888888888888888e-07, "logits/chosen": -1.4794844388961792, "logits/rejected": -1.5499848127365112, "logps/chosen": -58.28867721557617, "logps/rejected": -66.0478286743164, "loss": 0.6953, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0060877809301018715, "rewards/margins": -0.03297767788171768, "rewards/rejected": 0.03906545788049698, "step": 51 }, { "epoch": 0.30814814814814817, "grad_norm": 36.44525587089733, "learning_rate": 1.9259259259259257e-07, "logits/chosen": -1.5470339059829712, "logits/rejected": -1.5108013153076172, "logps/chosen": -38.12464141845703, "logps/rejected": -48.32713317871094, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01122739352285862, "rewards/margins": -0.008649253286421299, "rewards/rejected": -0.0025781395379453897, "step": 52 }, { "epoch": 0.31407407407407406, "grad_norm": 34.72154774204845, "learning_rate": 1.962962962962963e-07, "logits/chosen": -1.5386773347854614, "logits/rejected": -1.5055224895477295, "logps/chosen": -41.71277618408203, "logps/rejected": -41.09404754638672, "loss": 0.7031, "rewards/accuracies": 0.4375, "rewards/chosen": -0.019169487059116364, "rewards/margins": -0.00928646419197321, "rewards/rejected": -0.00988302007317543, "step": 53 }, { "epoch": 0.32, "grad_norm": 38.2339131295282, "learning_rate": 2e-07, "logits/chosen": -1.4301159381866455, "logits/rejected": -1.532789945602417, "logps/chosen": -46.67141342163086, "logps/rejected": -68.1341552734375, "loss": 0.6707, "rewards/accuracies": 0.75, "rewards/chosen": 0.04000416025519371, "rewards/margins": 0.05813169479370117, "rewards/rejected": -0.01812753826379776, "step": 54 }, { "epoch": 0.32592592592592595, "grad_norm": 39.43452908201783, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -1.5669282674789429, "logits/rejected": -1.5772104263305664, "logps/chosen": -41.083221435546875, "logps/rejected": -58.070709228515625, "loss": 0.6709, "rewards/accuracies": 0.875, "rewards/chosen": 0.042357515543699265, "rewards/margins": 0.09110362827777863, "rewards/rejected": -0.04874611273407936, "step": 55 }, { "epoch": 0.33185185185185184, "grad_norm": 36.090920523616724, "learning_rate": 2.0740740740740738e-07, "logits/chosen": -1.5565029382705688, "logits/rejected": -1.7220072746276855, "logps/chosen": -61.488285064697266, "logps/rejected": -65.77334594726562, "loss": 0.6651, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08066320419311523, "rewards/margins": 0.11545172333717346, "rewards/rejected": -0.03478851169347763, "step": 56 }, { "epoch": 0.3377777777777778, "grad_norm": 38.58486903890317, "learning_rate": 2.111111111111111e-07, "logits/chosen": -1.534893274307251, "logits/rejected": -1.5569573640823364, "logps/chosen": -48.127193450927734, "logps/rejected": -49.27490997314453, "loss": 0.6916, "rewards/accuracies": 0.4375, "rewards/chosen": 0.017229175195097923, "rewards/margins": -0.015148879960179329, "rewards/rejected": 0.03237805515527725, "step": 57 }, { "epoch": 0.3437037037037037, "grad_norm": 36.94599798372785, "learning_rate": 2.148148148148148e-07, "logits/chosen": -1.4781184196472168, "logits/rejected": -1.5944842100143433, "logps/chosen": -46.216766357421875, "logps/rejected": -53.072357177734375, "loss": 0.6789, "rewards/accuracies": 0.625, "rewards/chosen": 0.0686614066362381, "rewards/margins": 0.07688784599304199, "rewards/rejected": -0.008226440288126469, "step": 58 }, { "epoch": 0.3496296296296296, "grad_norm": 35.29207234594264, "learning_rate": 2.1851851851851852e-07, "logits/chosen": -1.4319119453430176, "logits/rejected": -1.4858628511428833, "logps/chosen": -49.45796203613281, "logps/rejected": -48.20957946777344, "loss": 0.675, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004403375089168549, "rewards/margins": 0.028496291488409042, "rewards/rejected": -0.03289966657757759, "step": 59 }, { "epoch": 0.35555555555555557, "grad_norm": 37.54885156466115, "learning_rate": 2.222222222222222e-07, "logits/chosen": -1.6024725437164307, "logits/rejected": -1.6143323183059692, "logps/chosen": -43.191246032714844, "logps/rejected": -61.6234130859375, "loss": 0.6783, "rewards/accuracies": 0.5, "rewards/chosen": -0.020701315253973007, "rewards/margins": 0.021595098078250885, "rewards/rejected": -0.042296409606933594, "step": 60 }, { "epoch": 0.36148148148148146, "grad_norm": 36.21246088451733, "learning_rate": 2.2592592592592591e-07, "logits/chosen": -1.5574266910552979, "logits/rejected": -1.6002821922302246, "logps/chosen": -40.11199188232422, "logps/rejected": -50.22590255737305, "loss": 0.6813, "rewards/accuracies": 0.8125, "rewards/chosen": 0.029111528769135475, "rewards/margins": 0.06680956482887268, "rewards/rejected": -0.03769803047180176, "step": 61 }, { "epoch": 0.3674074074074074, "grad_norm": 39.21506378340429, "learning_rate": 2.296296296296296e-07, "logits/chosen": -1.4695520401000977, "logits/rejected": -1.459860920906067, "logps/chosen": -43.92028045654297, "logps/rejected": -49.57878112792969, "loss": 0.6819, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008387397974729538, "rewards/margins": 0.02342185750603676, "rewards/rejected": -0.015034460462629795, "step": 62 }, { "epoch": 0.37333333333333335, "grad_norm": 37.33709678012097, "learning_rate": 2.3333333333333333e-07, "logits/chosen": -1.4608687162399292, "logits/rejected": -1.545125126838684, "logps/chosen": -46.558204650878906, "logps/rejected": -52.98792266845703, "loss": 0.6715, "rewards/accuracies": 0.75, "rewards/chosen": 0.03448185697197914, "rewards/margins": 0.08211689442396164, "rewards/rejected": -0.0476350337266922, "step": 63 }, { "epoch": 0.37925925925925924, "grad_norm": 38.347601536557335, "learning_rate": 2.3703703703703703e-07, "logits/chosen": -1.3680189847946167, "logits/rejected": -1.4239660501480103, "logps/chosen": -42.97498321533203, "logps/rejected": -61.106666564941406, "loss": 0.675, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0033024298027157784, "rewards/margins": 0.007563777267932892, "rewards/rejected": -0.004261349327862263, "step": 64 }, { "epoch": 0.3851851851851852, "grad_norm": 36.216386555506176, "learning_rate": 2.407407407407407e-07, "logits/chosen": -1.3454582691192627, "logits/rejected": -1.3198963403701782, "logps/chosen": -47.199485778808594, "logps/rejected": -52.74918746948242, "loss": 0.6865, "rewards/accuracies": 0.75, "rewards/chosen": 0.07009802013635635, "rewards/margins": 0.08980407565832138, "rewards/rejected": -0.019706057384610176, "step": 65 }, { "epoch": 0.39111111111111113, "grad_norm": 35.898175335015566, "learning_rate": 2.4444444444444445e-07, "logits/chosen": -1.3845653533935547, "logits/rejected": -1.3613636493682861, "logps/chosen": -41.01805877685547, "logps/rejected": -50.0711669921875, "loss": 0.6704, "rewards/accuracies": 0.5, "rewards/chosen": -0.007085941731929779, "rewards/margins": -0.010256503708660603, "rewards/rejected": 0.003170562209561467, "step": 66 }, { "epoch": 0.397037037037037, "grad_norm": 41.27409510225558, "learning_rate": 2.4814814814814814e-07, "logits/chosen": -1.4381773471832275, "logits/rejected": -1.4831494092941284, "logps/chosen": -37.221649169921875, "logps/rejected": -44.666282653808594, "loss": 0.6909, "rewards/accuracies": 0.5625, "rewards/chosen": -0.014814566820859909, "rewards/margins": -0.015786362811923027, "rewards/rejected": 0.0009717955254018307, "step": 67 }, { "epoch": 0.40296296296296297, "grad_norm": 36.64194351414389, "learning_rate": 2.5185185185185184e-07, "logits/chosen": -1.4352205991744995, "logits/rejected": -1.5572233200073242, "logps/chosen": -36.161170959472656, "logps/rejected": -60.05598831176758, "loss": 0.6713, "rewards/accuracies": 0.625, "rewards/chosen": 0.05245256423950195, "rewards/margins": 0.08905725926160812, "rewards/rejected": -0.03660469129681587, "step": 68 }, { "epoch": 0.4088888888888889, "grad_norm": 39.90280680196674, "learning_rate": 2.5555555555555553e-07, "logits/chosen": -1.5193519592285156, "logits/rejected": -1.562377691268921, "logps/chosen": -38.338375091552734, "logps/rejected": -47.005638122558594, "loss": 0.6841, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003990625962615013, "rewards/margins": -0.0032351240515708923, "rewards/rejected": -0.0007555019110441208, "step": 69 }, { "epoch": 0.4148148148148148, "grad_norm": 38.199525578292175, "learning_rate": 2.5925925925925923e-07, "logits/chosen": -1.5483940839767456, "logits/rejected": -1.5856201648712158, "logps/chosen": -50.09548568725586, "logps/rejected": -60.07989501953125, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": -0.011604977771639824, "rewards/margins": -0.018983270972967148, "rewards/rejected": 0.007378292270004749, "step": 70 }, { "epoch": 0.42074074074074075, "grad_norm": 40.00092221923499, "learning_rate": 2.629629629629629e-07, "logits/chosen": -1.4090768098831177, "logits/rejected": -1.502636432647705, "logps/chosen": -41.552528381347656, "logps/rejected": -69.09208679199219, "loss": 0.6872, "rewards/accuracies": 0.5625, "rewards/chosen": 0.016017243266105652, "rewards/margins": 0.0428960807621479, "rewards/rejected": -0.026878833770751953, "step": 71 }, { "epoch": 0.4266666666666667, "grad_norm": 35.835263654888294, "learning_rate": 2.6666666666666667e-07, "logits/chosen": -1.520636796951294, "logits/rejected": -1.5853071212768555, "logps/chosen": -57.213993072509766, "logps/rejected": -57.933135986328125, "loss": 0.6606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.032182980328798294, "rewards/margins": 0.08221760392189026, "rewards/rejected": -0.050034623593091965, "step": 72 }, { "epoch": 0.4325925925925926, "grad_norm": 35.23686966256771, "learning_rate": 2.7037037037037037e-07, "logits/chosen": -1.522557258605957, "logits/rejected": -1.5760351419448853, "logps/chosen": -40.514347076416016, "logps/rejected": -55.03413391113281, "loss": 0.6688, "rewards/accuracies": 0.6875, "rewards/chosen": 0.033225104212760925, "rewards/margins": 0.06852450221776962, "rewards/rejected": -0.0352993980050087, "step": 73 }, { "epoch": 0.43851851851851853, "grad_norm": 35.70854416440807, "learning_rate": 2.7407407407407406e-07, "logits/chosen": -1.4880564212799072, "logits/rejected": -1.5454859733581543, "logps/chosen": -42.971046447753906, "logps/rejected": -52.73062515258789, "loss": 0.6735, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04866039752960205, "rewards/margins": 0.0654120221734047, "rewards/rejected": -0.016751624643802643, "step": 74 }, { "epoch": 0.4444444444444444, "grad_norm": 40.60623683148219, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -1.4794213771820068, "logits/rejected": -1.5459836721420288, "logps/chosen": -51.75897979736328, "logps/rejected": -66.92141723632812, "loss": 0.6712, "rewards/accuracies": 0.6875, "rewards/chosen": 0.055379390716552734, "rewards/margins": 0.06052131950855255, "rewards/rejected": -0.005141926929354668, "step": 75 }, { "epoch": 0.45037037037037037, "grad_norm": 35.71596057516897, "learning_rate": 2.814814814814815e-07, "logits/chosen": -1.5502272844314575, "logits/rejected": -1.542878270149231, "logps/chosen": -44.19514846801758, "logps/rejected": -54.45147705078125, "loss": 0.6756, "rewards/accuracies": 0.625, "rewards/chosen": 0.046294547617435455, "rewards/margins": 0.07543890923261642, "rewards/rejected": -0.02914435788989067, "step": 76 }, { "epoch": 0.4562962962962963, "grad_norm": 36.700424780574956, "learning_rate": 2.851851851851852e-07, "logits/chosen": -1.4337775707244873, "logits/rejected": -1.5133496522903442, "logps/chosen": -50.23101043701172, "logps/rejected": -58.330902099609375, "loss": 0.6959, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007759109139442444, "rewards/margins": 0.04830126464366913, "rewards/rejected": -0.04752536118030548, "step": 77 }, { "epoch": 0.4622222222222222, "grad_norm": 40.066966979685844, "learning_rate": 2.8888888888888885e-07, "logits/chosen": -1.5571997165679932, "logits/rejected": -1.5534377098083496, "logps/chosen": -40.768104553222656, "logps/rejected": -54.30720138549805, "loss": 0.6604, "rewards/accuracies": 0.625, "rewards/chosen": 0.00044529326260089874, "rewards/margins": 0.07285849750041962, "rewards/rejected": -0.07241320610046387, "step": 78 }, { "epoch": 0.46814814814814815, "grad_norm": 33.600126630332284, "learning_rate": 2.9259259259259254e-07, "logits/chosen": -1.3403937816619873, "logits/rejected": -1.4642250537872314, "logps/chosen": -50.575172424316406, "logps/rejected": -57.23987579345703, "loss": 0.6658, "rewards/accuracies": 0.5, "rewards/chosen": 0.0686764270067215, "rewards/margins": 0.051285505294799805, "rewards/rejected": 0.017390919849276543, "step": 79 }, { "epoch": 0.4740740740740741, "grad_norm": 34.57158710893072, "learning_rate": 2.962962962962963e-07, "logits/chosen": -1.5943031311035156, "logits/rejected": -1.5197596549987793, "logps/chosen": -49.53266525268555, "logps/rejected": -63.10749816894531, "loss": 0.6404, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1665962040424347, "rewards/margins": 0.20507952570915222, "rewards/rejected": -0.03848333656787872, "step": 80 }, { "epoch": 0.48, "grad_norm": 35.921264310459016, "learning_rate": 3e-07, "logits/chosen": -1.4847490787506104, "logits/rejected": -1.4525268077850342, "logps/chosen": -55.6772346496582, "logps/rejected": -56.0514030456543, "loss": 0.6712, "rewards/accuracies": 0.875, "rewards/chosen": 0.07062974572181702, "rewards/margins": 0.11072392761707306, "rewards/rejected": -0.04009418934583664, "step": 81 }, { "epoch": 0.48592592592592593, "grad_norm": 37.39365265650487, "learning_rate": 3.037037037037037e-07, "logits/chosen": -1.2456742525100708, "logits/rejected": -1.3491311073303223, "logps/chosen": -46.3619499206543, "logps/rejected": -55.591796875, "loss": 0.6565, "rewards/accuracies": 0.75, "rewards/chosen": 0.027526114135980606, "rewards/margins": 0.1402171403169632, "rewards/rejected": -0.11269102245569229, "step": 82 }, { "epoch": 0.4918518518518519, "grad_norm": 33.32492318487895, "learning_rate": 3.074074074074074e-07, "logits/chosen": -1.5503648519515991, "logits/rejected": -1.4737271070480347, "logps/chosen": -47.40400695800781, "logps/rejected": -55.980777740478516, "loss": 0.651, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04232100397348404, "rewards/margins": 0.012534653767943382, "rewards/rejected": 0.029786348342895508, "step": 83 }, { "epoch": 0.49777777777777776, "grad_norm": 36.39089635298574, "learning_rate": 3.111111111111111e-07, "logits/chosen": -1.457646369934082, "logits/rejected": -1.4835176467895508, "logps/chosen": -49.07018280029297, "logps/rejected": -51.966102600097656, "loss": 0.6677, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07979030907154083, "rewards/margins": 0.06654424965381622, "rewards/rejected": 0.013246058486402035, "step": 84 }, { "epoch": 0.5037037037037037, "grad_norm": 35.4105632135552, "learning_rate": 3.148148148148148e-07, "logits/chosen": -1.4854512214660645, "logits/rejected": -1.4701765775680542, "logps/chosen": -46.278377532958984, "logps/rejected": -56.43070983886719, "loss": 0.6584, "rewards/accuracies": 0.625, "rewards/chosen": 0.08042871952056885, "rewards/margins": 0.10214265435934067, "rewards/rejected": -0.021713927388191223, "step": 85 }, { "epoch": 0.5096296296296297, "grad_norm": 36.96501992841717, "learning_rate": 3.185185185185185e-07, "logits/chosen": -1.3575414419174194, "logits/rejected": -1.4066803455352783, "logps/chosen": -54.694374084472656, "logps/rejected": -63.215087890625, "loss": 0.6543, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12399441003799438, "rewards/margins": 0.10921817272901535, "rewards/rejected": 0.014776226133108139, "step": 86 }, { "epoch": 0.5155555555555555, "grad_norm": 35.591944225315494, "learning_rate": 3.222222222222222e-07, "logits/chosen": -1.3541371822357178, "logits/rejected": -1.4538600444793701, "logps/chosen": -45.43539810180664, "logps/rejected": -61.852481842041016, "loss": 0.6623, "rewards/accuracies": 0.5, "rewards/chosen": 0.04965083673596382, "rewards/margins": 0.02898262068629265, "rewards/rejected": 0.02066822536289692, "step": 87 }, { "epoch": 0.5214814814814814, "grad_norm": 34.4083607278657, "learning_rate": 3.2592592592592596e-07, "logits/chosen": -1.3457781076431274, "logits/rejected": -1.4116525650024414, "logps/chosen": -37.99258804321289, "logps/rejected": -58.2638053894043, "loss": 0.6529, "rewards/accuracies": 0.625, "rewards/chosen": 0.10953948646783829, "rewards/margins": 0.05180351808667183, "rewards/rejected": 0.05773596838116646, "step": 88 }, { "epoch": 0.5274074074074074, "grad_norm": 39.99150012269121, "learning_rate": 3.296296296296296e-07, "logits/chosen": -1.4930334091186523, "logits/rejected": -1.468008041381836, "logps/chosen": -52.282691955566406, "logps/rejected": -51.99797439575195, "loss": 0.6597, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02082650549709797, "rewards/margins": 0.06192145496606827, "rewards/rejected": -0.04109494760632515, "step": 89 }, { "epoch": 0.5333333333333333, "grad_norm": 34.13640419254932, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.551436185836792, "logits/rejected": -1.5505115985870361, "logps/chosen": -50.10906982421875, "logps/rejected": -51.82029724121094, "loss": 0.6471, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14836148917675018, "rewards/margins": 0.1573827564716339, "rewards/rejected": -0.009021259844303131, "step": 90 }, { "epoch": 0.5392592592592592, "grad_norm": 36.45897034379589, "learning_rate": 3.37037037037037e-07, "logits/chosen": -1.4737859964370728, "logits/rejected": -1.5076937675476074, "logps/chosen": -48.10447692871094, "logps/rejected": -52.40055847167969, "loss": 0.6466, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14476074278354645, "rewards/margins": 0.14221321046352386, "rewards/rejected": 0.0025475500151515007, "step": 91 }, { "epoch": 0.5451851851851852, "grad_norm": 33.40862505768372, "learning_rate": 3.407407407407407e-07, "logits/chosen": -1.2972910404205322, "logits/rejected": -1.3978445529937744, "logps/chosen": -42.778289794921875, "logps/rejected": -53.14482879638672, "loss": 0.6351, "rewards/accuracies": 0.625, "rewards/chosen": 0.06479823589324951, "rewards/margins": 0.11493370682001114, "rewards/rejected": -0.05013547092676163, "step": 92 }, { "epoch": 0.5511111111111111, "grad_norm": 35.19026968431985, "learning_rate": 3.4444444444444444e-07, "logits/chosen": -1.6557621955871582, "logits/rejected": -1.7111676931381226, "logps/chosen": -48.467952728271484, "logps/rejected": -46.97455596923828, "loss": 0.6568, "rewards/accuracies": 0.3125, "rewards/chosen": 0.07854614406824112, "rewards/margins": -0.029709193855524063, "rewards/rejected": 0.10825533419847488, "step": 93 }, { "epoch": 0.557037037037037, "grad_norm": 33.64323632736495, "learning_rate": 3.4814814814814814e-07, "logits/chosen": -1.4123013019561768, "logits/rejected": -1.5394983291625977, "logps/chosen": -49.88652038574219, "logps/rejected": -44.77579879760742, "loss": 0.6164, "rewards/accuracies": 0.875, "rewards/chosen": 0.10286030173301697, "rewards/margins": 0.14824065566062927, "rewards/rejected": -0.045380350202322006, "step": 94 }, { "epoch": 0.562962962962963, "grad_norm": 38.945535859981476, "learning_rate": 3.5185185185185183e-07, "logits/chosen": -1.3453272581100464, "logits/rejected": -1.3954417705535889, "logps/chosen": -43.751930236816406, "logps/rejected": -54.8797607421875, "loss": 0.6386, "rewards/accuracies": 0.625, "rewards/chosen": 0.17014189064502716, "rewards/margins": 0.1695454716682434, "rewards/rejected": 0.0005964227020740509, "step": 95 }, { "epoch": 0.5688888888888889, "grad_norm": 35.64681034806811, "learning_rate": 3.5555555555555553e-07, "logits/chosen": -1.5312745571136475, "logits/rejected": -1.5875881910324097, "logps/chosen": -57.08066940307617, "logps/rejected": -68.38322448730469, "loss": 0.6137, "rewards/accuracies": 0.75, "rewards/chosen": 0.1007266640663147, "rewards/margins": 0.1999504566192627, "rewards/rejected": -0.0992238000035286, "step": 96 }, { "epoch": 0.5748148148148148, "grad_norm": 39.16907729636469, "learning_rate": 3.592592592592593e-07, "logits/chosen": -1.387762427330017, "logits/rejected": -1.3342286348342896, "logps/chosen": -46.985740661621094, "logps/rejected": -58.00916290283203, "loss": 0.6367, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04014458507299423, "rewards/margins": 0.12731415033340454, "rewards/rejected": -0.08716955780982971, "step": 97 }, { "epoch": 0.5807407407407408, "grad_norm": 37.66257951373822, "learning_rate": 3.6296296296296297e-07, "logits/chosen": -1.6567975282669067, "logits/rejected": -1.6726268529891968, "logps/chosen": -47.0131950378418, "logps/rejected": -54.135921478271484, "loss": 0.6292, "rewards/accuracies": 0.75, "rewards/chosen": 0.09105877578258514, "rewards/margins": 0.16023945808410645, "rewards/rejected": -0.0691806823015213, "step": 98 }, { "epoch": 0.5866666666666667, "grad_norm": 38.05243257959752, "learning_rate": 3.666666666666666e-07, "logits/chosen": -1.4999873638153076, "logits/rejected": -1.5060404539108276, "logps/chosen": -31.823820114135742, "logps/rejected": -53.95246887207031, "loss": 0.6663, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07480905205011368, "rewards/margins": 0.04157290235161781, "rewards/rejected": 0.033236145973205566, "step": 99 }, { "epoch": 0.5925925925925926, "grad_norm": 35.02084859940032, "learning_rate": 3.703703703703703e-07, "logits/chosen": -1.4343098402023315, "logits/rejected": -1.4538511037826538, "logps/chosen": -50.422210693359375, "logps/rejected": -68.70233917236328, "loss": 0.6379, "rewards/accuracies": 0.6875, "rewards/chosen": 0.032022904604673386, "rewards/margins": 0.10641990602016449, "rewards/rejected": -0.0743969976902008, "step": 100 }, { "epoch": 0.5985185185185186, "grad_norm": 34.97282723129252, "learning_rate": 3.7407407407407406e-07, "logits/chosen": -1.4774181842803955, "logits/rejected": -1.4857224225997925, "logps/chosen": -43.08037185668945, "logps/rejected": -58.0771598815918, "loss": 0.6229, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1684502214193344, "rewards/margins": 0.2295190393924713, "rewards/rejected": -0.0610688254237175, "step": 101 }, { "epoch": 0.6044444444444445, "grad_norm": 33.28326626037529, "learning_rate": 3.7777777777777775e-07, "logits/chosen": -1.4652920961380005, "logits/rejected": -1.4563056230545044, "logps/chosen": -40.73276901245117, "logps/rejected": -44.06425476074219, "loss": 0.5857, "rewards/accuracies": 0.875, "rewards/chosen": 0.20217598974704742, "rewards/margins": 0.33883577585220337, "rewards/rejected": -0.13665977120399475, "step": 102 }, { "epoch": 0.6103703703703703, "grad_norm": 32.09719371496252, "learning_rate": 3.8148148148148145e-07, "logits/chosen": -1.7163668870925903, "logits/rejected": -1.633704662322998, "logps/chosen": -44.5764045715332, "logps/rejected": -43.08470153808594, "loss": 0.6316, "rewards/accuracies": 0.875, "rewards/chosen": 0.11517582833766937, "rewards/margins": 0.22480958700180054, "rewards/rejected": -0.10963378101587296, "step": 103 }, { "epoch": 0.6162962962962963, "grad_norm": 34.81825629167962, "learning_rate": 3.8518518518518515e-07, "logits/chosen": -1.4617501497268677, "logits/rejected": -1.5581308603286743, "logps/chosen": -47.830772399902344, "logps/rejected": -57.300086975097656, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.0872277244925499, "rewards/margins": 0.3060223162174225, "rewards/rejected": -0.218794584274292, "step": 104 }, { "epoch": 0.6222222222222222, "grad_norm": 36.39266070401373, "learning_rate": 3.888888888888889e-07, "logits/chosen": -1.5160835981369019, "logits/rejected": -1.6838594675064087, "logps/chosen": -42.137794494628906, "logps/rejected": -65.07337951660156, "loss": 0.6411, "rewards/accuracies": 0.5625, "rewards/chosen": 0.12303625047206879, "rewards/margins": 0.14647573232650757, "rewards/rejected": -0.023439496755599976, "step": 105 }, { "epoch": 0.6281481481481481, "grad_norm": 31.850193009417975, "learning_rate": 3.925925925925926e-07, "logits/chosen": -1.4147698879241943, "logits/rejected": -1.517427921295166, "logps/chosen": -54.854183197021484, "logps/rejected": -51.17685317993164, "loss": 0.586, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11260886490345001, "rewards/margins": 0.28815436363220215, "rewards/rejected": -0.17554549872875214, "step": 106 }, { "epoch": 0.6340740740740741, "grad_norm": 34.65709673375601, "learning_rate": 3.962962962962963e-07, "logits/chosen": -1.3840844631195068, "logits/rejected": -1.4422510862350464, "logps/chosen": -52.301841735839844, "logps/rejected": -68.42982482910156, "loss": 0.5803, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12791098654270172, "rewards/margins": 0.31330111622810364, "rewards/rejected": -0.18539008498191833, "step": 107 }, { "epoch": 0.64, "grad_norm": 32.89951039869555, "learning_rate": 4e-07, "logits/chosen": -1.41471266746521, "logits/rejected": -1.467881202697754, "logps/chosen": -52.74435806274414, "logps/rejected": -60.40370559692383, "loss": 0.5749, "rewards/accuracies": 0.875, "rewards/chosen": 0.07138564437627792, "rewards/margins": 0.2469371259212494, "rewards/rejected": -0.17555147409439087, "step": 108 }, { "epoch": 0.6459259259259259, "grad_norm": 33.493433739647735, "learning_rate": 4.0370370370370373e-07, "logits/chosen": -1.1759183406829834, "logits/rejected": -1.2476868629455566, "logps/chosen": -47.751792907714844, "logps/rejected": -67.92081451416016, "loss": 0.5857, "rewards/accuracies": 0.875, "rewards/chosen": 0.11714056879281998, "rewards/margins": 0.45923230051994324, "rewards/rejected": -0.34209170937538147, "step": 109 }, { "epoch": 0.6518518518518519, "grad_norm": 33.53546295318829, "learning_rate": 4.0740740740740737e-07, "logits/chosen": -1.5570002794265747, "logits/rejected": -1.5066514015197754, "logps/chosen": -50.70806884765625, "logps/rejected": -60.30607223510742, "loss": 0.6235, "rewards/accuracies": 0.625, "rewards/chosen": 0.04970753192901611, "rewards/margins": 0.17559602856636047, "rewards/rejected": -0.12588849663734436, "step": 110 }, { "epoch": 0.6577777777777778, "grad_norm": 32.24746058891704, "learning_rate": 4.1111111111111107e-07, "logits/chosen": -1.5236682891845703, "logits/rejected": -1.4900072813034058, "logps/chosen": -42.65536880493164, "logps/rejected": -52.812469482421875, "loss": 0.6056, "rewards/accuracies": 0.75, "rewards/chosen": 0.12369166314601898, "rewards/margins": 0.23955130577087402, "rewards/rejected": -0.11585965007543564, "step": 111 }, { "epoch": 0.6637037037037037, "grad_norm": 37.062556098993824, "learning_rate": 4.1481481481481476e-07, "logits/chosen": -1.5205527544021606, "logits/rejected": -1.4171969890594482, "logps/chosen": -62.091796875, "logps/rejected": -56.35509490966797, "loss": 0.6062, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14249742031097412, "rewards/margins": 0.30201977491378784, "rewards/rejected": -0.15952235460281372, "step": 112 }, { "epoch": 0.6696296296296296, "grad_norm": 33.200484026739076, "learning_rate": 4.185185185185185e-07, "logits/chosen": -1.4046220779418945, "logits/rejected": -1.4927732944488525, "logps/chosen": -41.91143035888672, "logps/rejected": -46.92932891845703, "loss": 0.612, "rewards/accuracies": 0.625, "rewards/chosen": 0.02149050310254097, "rewards/margins": 0.1793198436498642, "rewards/rejected": -0.15782934427261353, "step": 113 }, { "epoch": 0.6755555555555556, "grad_norm": 35.094056061875875, "learning_rate": 4.222222222222222e-07, "logits/chosen": -1.3244975805282593, "logits/rejected": -1.439337968826294, "logps/chosen": -45.62443542480469, "logps/rejected": -50.95753479003906, "loss": 0.5868, "rewards/accuracies": 0.4375, "rewards/chosen": 0.02482125535607338, "rewards/margins": 0.12921854853630066, "rewards/rejected": -0.10439729690551758, "step": 114 }, { "epoch": 0.6814814814814815, "grad_norm": 32.502983592668585, "learning_rate": 4.259259259259259e-07, "logits/chosen": -1.24369478225708, "logits/rejected": -1.3050041198730469, "logps/chosen": -51.76207733154297, "logps/rejected": -61.87729263305664, "loss": 0.5588, "rewards/accuracies": 0.75, "rewards/chosen": 0.29445695877075195, "rewards/margins": 0.5628924369812012, "rewards/rejected": -0.2684354782104492, "step": 115 }, { "epoch": 0.6874074074074074, "grad_norm": 29.974843969255893, "learning_rate": 4.296296296296296e-07, "logits/chosen": -1.5716416835784912, "logits/rejected": -1.5490162372589111, "logps/chosen": -45.147708892822266, "logps/rejected": -52.94227981567383, "loss": 0.5554, "rewards/accuracies": 0.875, "rewards/chosen": 0.14993491768836975, "rewards/margins": 0.33745861053466797, "rewards/rejected": -0.1875237226486206, "step": 116 }, { "epoch": 0.6933333333333334, "grad_norm": 34.28074065539359, "learning_rate": 4.3333333333333335e-07, "logits/chosen": -1.427602767944336, "logits/rejected": -1.4697047472000122, "logps/chosen": -50.961883544921875, "logps/rejected": -58.997623443603516, "loss": 0.5549, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08439350128173828, "rewards/margins": 0.42768120765686035, "rewards/rejected": -0.34328773617744446, "step": 117 }, { "epoch": 0.6992592592592592, "grad_norm": 31.518026578609504, "learning_rate": 4.3703703703703704e-07, "logits/chosen": -1.4572179317474365, "logits/rejected": -1.3873144388198853, "logps/chosen": -41.546226501464844, "logps/rejected": -49.489315032958984, "loss": 0.5735, "rewards/accuracies": 0.875, "rewards/chosen": 0.05958416312932968, "rewards/margins": 0.34586769342422485, "rewards/rejected": -0.28628358244895935, "step": 118 }, { "epoch": 0.7051851851851851, "grad_norm": 33.033357331372045, "learning_rate": 4.4074074074074074e-07, "logits/chosen": -1.4856173992156982, "logits/rejected": -1.502893090248108, "logps/chosen": -42.551170349121094, "logps/rejected": -47.76485824584961, "loss": 0.5414, "rewards/accuracies": 0.9375, "rewards/chosen": 0.024673819541931152, "rewards/margins": 0.4481239914894104, "rewards/rejected": -0.42345017194747925, "step": 119 }, { "epoch": 0.7111111111111111, "grad_norm": 33.62289241183141, "learning_rate": 4.444444444444444e-07, "logits/chosen": -1.4512724876403809, "logits/rejected": -1.459259033203125, "logps/chosen": -35.332374572753906, "logps/rejected": -48.842464447021484, "loss": 0.5958, "rewards/accuracies": 0.75, "rewards/chosen": 0.10679290443658829, "rewards/margins": 0.1940063238143921, "rewards/rejected": -0.0872134268283844, "step": 120 }, { "epoch": 0.717037037037037, "grad_norm": 33.21896811018362, "learning_rate": 4.4814814814814813e-07, "logits/chosen": -1.4697211980819702, "logits/rejected": -1.5072273015975952, "logps/chosen": -49.4876823425293, "logps/rejected": -63.57400894165039, "loss": 0.5228, "rewards/accuracies": 0.875, "rewards/chosen": 0.07271251082420349, "rewards/margins": 0.3985411524772644, "rewards/rejected": -0.3258286416530609, "step": 121 }, { "epoch": 0.7229629629629629, "grad_norm": 28.810988024096453, "learning_rate": 4.5185185185185183e-07, "logits/chosen": -1.4811208248138428, "logits/rejected": -1.5742110013961792, "logps/chosen": -43.195194244384766, "logps/rejected": -52.73583221435547, "loss": 0.5284, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2379426658153534, "rewards/margins": 0.5730524659156799, "rewards/rejected": -0.3351098299026489, "step": 122 }, { "epoch": 0.7288888888888889, "grad_norm": 33.27927501039274, "learning_rate": 4.555555555555555e-07, "logits/chosen": -1.4409135580062866, "logits/rejected": -1.3866451978683472, "logps/chosen": -54.90428924560547, "logps/rejected": -57.876712799072266, "loss": 0.5717, "rewards/accuracies": 0.625, "rewards/chosen": 0.03284015506505966, "rewards/margins": 0.30528080463409424, "rewards/rejected": -0.272440642118454, "step": 123 }, { "epoch": 0.7348148148148148, "grad_norm": 29.733201192052977, "learning_rate": 4.592592592592592e-07, "logits/chosen": -1.4358739852905273, "logits/rejected": -1.4204962253570557, "logps/chosen": -44.5390739440918, "logps/rejected": -60.877220153808594, "loss": 0.5214, "rewards/accuracies": 0.625, "rewards/chosen": 0.12315364927053452, "rewards/margins": 0.4188612699508667, "rewards/rejected": -0.295707643032074, "step": 124 }, { "epoch": 0.7407407407407407, "grad_norm": 32.2116448331724, "learning_rate": 4.6296296296296297e-07, "logits/chosen": -1.5213638544082642, "logits/rejected": -1.5197651386260986, "logps/chosen": -55.357574462890625, "logps/rejected": -57.78273010253906, "loss": 0.5402, "rewards/accuracies": 0.75, "rewards/chosen": 0.16674299538135529, "rewards/margins": 0.4866538643836975, "rewards/rejected": -0.31991085410118103, "step": 125 }, { "epoch": 0.7466666666666667, "grad_norm": 30.88294089490252, "learning_rate": 4.6666666666666666e-07, "logits/chosen": -1.4147846698760986, "logits/rejected": -1.5049686431884766, "logps/chosen": -52.510929107666016, "logps/rejected": -57.91138458251953, "loss": 0.5606, "rewards/accuracies": 0.625, "rewards/chosen": 0.10760531574487686, "rewards/margins": 0.30788373947143555, "rewards/rejected": -0.20027843117713928, "step": 126 }, { "epoch": 0.7525925925925926, "grad_norm": 30.408789687578963, "learning_rate": 4.7037037037037036e-07, "logits/chosen": -1.3894518613815308, "logits/rejected": -1.4455385208129883, "logps/chosen": -34.467201232910156, "logps/rejected": -48.339447021484375, "loss": 0.5228, "rewards/accuracies": 0.625, "rewards/chosen": 0.10829809308052063, "rewards/margins": 0.5127183794975281, "rewards/rejected": -0.40442025661468506, "step": 127 }, { "epoch": 0.7585185185185185, "grad_norm": 33.008401257466446, "learning_rate": 4.7407407407407405e-07, "logits/chosen": -1.4528629779815674, "logits/rejected": -1.4002412557601929, "logps/chosen": -43.918251037597656, "logps/rejected": -54.43046569824219, "loss": 0.5779, "rewards/accuracies": 0.5625, "rewards/chosen": 0.017867133021354675, "rewards/margins": 0.2733150124549866, "rewards/rejected": -0.2554478645324707, "step": 128 }, { "epoch": 0.7644444444444445, "grad_norm": 30.60766339230095, "learning_rate": 4.777777777777778e-07, "logits/chosen": -1.5056837797164917, "logits/rejected": -1.4879945516586304, "logps/chosen": -52.15769958496094, "logps/rejected": -57.460479736328125, "loss": 0.5182, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2870657444000244, "rewards/margins": 0.5349029898643494, "rewards/rejected": -0.24783721566200256, "step": 129 }, { "epoch": 0.7703703703703704, "grad_norm": 30.92722489889393, "learning_rate": 4.814814814814814e-07, "logits/chosen": -1.4615930318832397, "logits/rejected": -1.4664386510849, "logps/chosen": -44.34745788574219, "logps/rejected": -59.45315170288086, "loss": 0.5232, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05535607412457466, "rewards/margins": 0.354500949382782, "rewards/rejected": -0.2991448640823364, "step": 130 }, { "epoch": 0.7762962962962963, "grad_norm": 31.911195699332875, "learning_rate": 4.851851851851852e-07, "logits/chosen": -1.3491640090942383, "logits/rejected": -1.3923466205596924, "logps/chosen": -60.32138442993164, "logps/rejected": -51.23785400390625, "loss": 0.5426, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02666044607758522, "rewards/margins": 0.32274696230888367, "rewards/rejected": -0.349407434463501, "step": 131 }, { "epoch": 0.7822222222222223, "grad_norm": 32.780173075722395, "learning_rate": 4.888888888888889e-07, "logits/chosen": -1.3976666927337646, "logits/rejected": -1.4585447311401367, "logps/chosen": -53.38674545288086, "logps/rejected": -50.861080169677734, "loss": 0.5465, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13597837090492249, "rewards/margins": 0.38355204463005066, "rewards/rejected": -0.5195304155349731, "step": 132 }, { "epoch": 0.7881481481481482, "grad_norm": 30.99528148041851, "learning_rate": 4.925925925925926e-07, "logits/chosen": -1.4452307224273682, "logits/rejected": -1.5811454057693481, "logps/chosen": -54.36809158325195, "logps/rejected": -64.48612976074219, "loss": 0.4654, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10771436989307404, "rewards/margins": 0.7881489992141724, "rewards/rejected": -0.6804346442222595, "step": 133 }, { "epoch": 0.794074074074074, "grad_norm": 28.418046758024726, "learning_rate": 4.962962962962963e-07, "logits/chosen": -1.3397469520568848, "logits/rejected": -1.2624437808990479, "logps/chosen": -35.42027282714844, "logps/rejected": -47.60633087158203, "loss": 0.5167, "rewards/accuracies": 0.875, "rewards/chosen": 0.1339387148618698, "rewards/margins": 0.6218531131744385, "rewards/rejected": -0.48791444301605225, "step": 134 }, { "epoch": 0.8, "grad_norm": 32.781124547085, "learning_rate": 5e-07, "logits/chosen": -1.512722373008728, "logits/rejected": -1.5071099996566772, "logps/chosen": -58.30361557006836, "logps/rejected": -54.290679931640625, "loss": 0.5407, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05489421263337135, "rewards/margins": 0.5525442957878113, "rewards/rejected": -0.4976501166820526, "step": 135 }, { "epoch": 0.8059259259259259, "grad_norm": 35.08995247021387, "learning_rate": 4.999991559718872e-07, "logits/chosen": -1.6145902872085571, "logits/rejected": -1.6498243808746338, "logps/chosen": -54.22117614746094, "logps/rejected": -68.34226989746094, "loss": 0.5369, "rewards/accuracies": 0.75, "rewards/chosen": -0.058954719454050064, "rewards/margins": 0.47398126125335693, "rewards/rejected": -0.5329359769821167, "step": 136 }, { "epoch": 0.8118518518518518, "grad_norm": 34.00931163362952, "learning_rate": 4.999966238932478e-07, "logits/chosen": -1.4408268928527832, "logits/rejected": -1.3998584747314453, "logps/chosen": -58.43665313720703, "logps/rejected": -54.154052734375, "loss": 0.5401, "rewards/accuracies": 0.75, "rewards/chosen": -0.1059800386428833, "rewards/margins": 0.38337820768356323, "rewards/rejected": -0.48935821652412415, "step": 137 }, { "epoch": 0.8177777777777778, "grad_norm": 28.812193750939542, "learning_rate": 4.999924037811792e-07, "logits/chosen": -1.337760090827942, "logits/rejected": -1.399741768836975, "logps/chosen": -50.198890686035156, "logps/rejected": -69.07243347167969, "loss": 0.4945, "rewards/accuracies": 0.875, "rewards/chosen": 0.10026273131370544, "rewards/margins": 0.8603047132492065, "rewards/rejected": -0.7600419521331787, "step": 138 }, { "epoch": 0.8237037037037037, "grad_norm": 32.004953488834715, "learning_rate": 4.999864956641761e-07, "logits/chosen": -1.2986966371536255, "logits/rejected": -1.4439055919647217, "logps/chosen": -40.57745361328125, "logps/rejected": -45.291465759277344, "loss": 0.5348, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03176772594451904, "rewards/margins": 0.40621811151504517, "rewards/rejected": -0.4379858374595642, "step": 139 }, { "epoch": 0.8296296296296296, "grad_norm": 28.821970631268016, "learning_rate": 4.99978899582132e-07, "logits/chosen": -1.4549857378005981, "logits/rejected": -1.5415047407150269, "logps/chosen": -48.27933883666992, "logps/rejected": -55.57482147216797, "loss": 0.509, "rewards/accuracies": 0.8125, "rewards/chosen": 0.016005441546440125, "rewards/margins": 0.6398062705993652, "rewards/rejected": -0.6238008737564087, "step": 140 }, { "epoch": 0.8355555555555556, "grad_norm": 29.956916160365633, "learning_rate": 4.999696155863368e-07, "logits/chosen": -1.161665678024292, "logits/rejected": -1.2288399934768677, "logps/chosen": -38.48533248901367, "logps/rejected": -47.85722351074219, "loss": 0.5077, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08423295617103577, "rewards/margins": 0.6424591541290283, "rewards/rejected": -0.5582262277603149, "step": 141 }, { "epoch": 0.8414814814814815, "grad_norm": 28.05076670996194, "learning_rate": 4.999586437394786e-07, "logits/chosen": -1.4714339971542358, "logits/rejected": -1.5132161378860474, "logps/chosen": -48.181156158447266, "logps/rejected": -55.47602844238281, "loss": 0.5124, "rewards/accuracies": 0.6875, "rewards/chosen": 0.003453332930803299, "rewards/margins": 0.6579344868659973, "rewards/rejected": -0.654481053352356, "step": 142 }, { "epoch": 0.8474074074074074, "grad_norm": 29.989008234893884, "learning_rate": 4.999459841156414e-07, "logits/chosen": -1.2807202339172363, "logits/rejected": -1.3370921611785889, "logps/chosen": -35.43395233154297, "logps/rejected": -40.46952819824219, "loss": 0.5811, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05782117694616318, "rewards/margins": 0.4030662178993225, "rewards/rejected": -0.34524503350257874, "step": 143 }, { "epoch": 0.8533333333333334, "grad_norm": 29.438739201559326, "learning_rate": 4.999316368003061e-07, "logits/chosen": -1.5066066980361938, "logits/rejected": -1.4261574745178223, "logps/chosen": -59.00090789794922, "logps/rejected": -62.56212615966797, "loss": 0.5071, "rewards/accuracies": 0.875, "rewards/chosen": 0.1349632292985916, "rewards/margins": 0.6473178863525391, "rewards/rejected": -0.5123546123504639, "step": 144 }, { "epoch": 0.8592592592592593, "grad_norm": 32.72521840563668, "learning_rate": 4.999156018903489e-07, "logits/chosen": -1.3574910163879395, "logits/rejected": -1.4152084589004517, "logps/chosen": -55.105262756347656, "logps/rejected": -59.85123062133789, "loss": 0.5444, "rewards/accuracies": 0.625, "rewards/chosen": -0.2306346744298935, "rewards/margins": 0.2394087314605713, "rewards/rejected": -0.47004345059394836, "step": 145 }, { "epoch": 0.8651851851851852, "grad_norm": 30.579443998829742, "learning_rate": 4.998978794940411e-07, "logits/chosen": -1.5495673418045044, "logits/rejected": -1.4402530193328857, "logps/chosen": -55.61601257324219, "logps/rejected": -49.91148376464844, "loss": 0.5247, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11350102722644806, "rewards/margins": 0.6421040296554565, "rewards/rejected": -0.5286029577255249, "step": 146 }, { "epoch": 0.8711111111111111, "grad_norm": 32.28607136824646, "learning_rate": 4.998784697310482e-07, "logits/chosen": -1.4374938011169434, "logits/rejected": -1.5053014755249023, "logps/chosen": -57.0212516784668, "logps/rejected": -50.56647491455078, "loss": 0.5361, "rewards/accuracies": 0.625, "rewards/chosen": 0.07709996402263641, "rewards/margins": 0.6278308629989624, "rewards/rejected": -0.55073082447052, "step": 147 }, { "epoch": 0.8770370370370371, "grad_norm": 28.758614918413993, "learning_rate": 4.998573727324294e-07, "logits/chosen": -1.4719284772872925, "logits/rejected": -1.4712297916412354, "logps/chosen": -47.55272674560547, "logps/rejected": -77.12329864501953, "loss": 0.4827, "rewards/accuracies": 0.875, "rewards/chosen": 0.19793976843357086, "rewards/margins": 0.7996434569358826, "rewards/rejected": -0.6017036437988281, "step": 148 }, { "epoch": 0.882962962962963, "grad_norm": 26.09070483109539, "learning_rate": 4.998345886406365e-07, "logits/chosen": -1.2813467979431152, "logits/rejected": -1.4254480600357056, "logps/chosen": -40.02638626098633, "logps/rejected": -46.91986083984375, "loss": 0.4656, "rewards/accuracies": 0.625, "rewards/chosen": 0.18909288942813873, "rewards/margins": 0.3858364522457123, "rewards/rejected": -0.19674354791641235, "step": 149 }, { "epoch": 0.8888888888888888, "grad_norm": 28.043633258520604, "learning_rate": 4.998101176095128e-07, "logits/chosen": -1.3586623668670654, "logits/rejected": -1.3965615034103394, "logps/chosen": -49.813751220703125, "logps/rejected": -59.23504638671875, "loss": 0.5246, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11088036000728607, "rewards/margins": 0.8094318509101868, "rewards/rejected": -0.6985514760017395, "step": 150 }, { "epoch": 0.8948148148148148, "grad_norm": 28.749826679176486, "learning_rate": 4.997839598042919e-07, "logits/chosen": -1.2615379095077515, "logits/rejected": -1.4461233615875244, "logps/chosen": -49.363128662109375, "logps/rejected": -53.413047790527344, "loss": 0.4865, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1839275360107422, "rewards/margins": 0.7002414464950562, "rewards/rejected": -0.516313910484314, "step": 151 }, { "epoch": 0.9007407407407407, "grad_norm": 35.14940181472167, "learning_rate": 4.997561154015975e-07, "logits/chosen": -1.4611401557922363, "logits/rejected": -1.5102167129516602, "logps/chosen": -44.313316345214844, "logps/rejected": -49.16785430908203, "loss": 0.5811, "rewards/accuracies": 0.5, "rewards/chosen": -0.16626399755477905, "rewards/margins": 0.05043494701385498, "rewards/rejected": -0.21669892966747284, "step": 152 }, { "epoch": 0.9066666666666666, "grad_norm": 28.277476832575395, "learning_rate": 4.997265845894411e-07, "logits/chosen": -1.5449285507202148, "logits/rejected": -1.5440673828125, "logps/chosen": -56.093666076660156, "logps/rejected": -49.94963073730469, "loss": 0.4695, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16555923223495483, "rewards/margins": 0.6077948808670044, "rewards/rejected": -0.4422355890274048, "step": 153 }, { "epoch": 0.9125925925925926, "grad_norm": 29.18744779214114, "learning_rate": 4.996953675672213e-07, "logits/chosen": -1.4645702838897705, "logits/rejected": -1.456922173500061, "logps/chosen": -44.6121826171875, "logps/rejected": -53.01622772216797, "loss": 0.4783, "rewards/accuracies": 0.75, "rewards/chosen": 0.3893875777721405, "rewards/margins": 0.8575869202613831, "rewards/rejected": -0.46819937229156494, "step": 154 }, { "epoch": 0.9185185185185185, "grad_norm": 28.117909866516314, "learning_rate": 4.996624645457227e-07, "logits/chosen": -1.6098130941390991, "logits/rejected": -1.5549395084381104, "logps/chosen": -61.51123046875, "logps/rejected": -54.52252960205078, "loss": 0.4959, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1888391524553299, "rewards/margins": 0.42284536361694336, "rewards/rejected": -0.23400622606277466, "step": 155 }, { "epoch": 0.9244444444444444, "grad_norm": 27.640916240379614, "learning_rate": 4.996278757471138e-07, "logits/chosen": -1.1909761428833008, "logits/rejected": -1.3579903841018677, "logps/chosen": -47.18357849121094, "logps/rejected": -53.34309387207031, "loss": 0.4488, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1449306756258011, "rewards/margins": 0.980364203453064, "rewards/rejected": -0.8354335427284241, "step": 156 }, { "epoch": 0.9303703703703704, "grad_norm": 34.29518160731201, "learning_rate": 4.995916014049461e-07, "logits/chosen": -1.4977542161941528, "logits/rejected": -1.5214853286743164, "logps/chosen": -62.64368438720703, "logps/rejected": -60.37792205810547, "loss": 0.5706, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18178661167621613, "rewards/margins": 0.18183384835720062, "rewards/rejected": -0.36362048983573914, "step": 157 }, { "epoch": 0.9362962962962963, "grad_norm": 28.776345767229973, "learning_rate": 4.995536417641517e-07, "logits/chosen": -1.3088951110839844, "logits/rejected": -1.3918869495391846, "logps/chosen": -42.50482940673828, "logps/rejected": -51.679649353027344, "loss": 0.4839, "rewards/accuracies": 0.875, "rewards/chosen": -0.012760929763317108, "rewards/margins": 0.6891413331031799, "rewards/rejected": -0.7019021511077881, "step": 158 }, { "epoch": 0.9422222222222222, "grad_norm": 27.13077469944883, "learning_rate": 4.99513997081043e-07, "logits/chosen": -1.5389611721038818, "logits/rejected": -1.6600943803787231, "logps/chosen": -43.073604583740234, "logps/rejected": -62.15262222290039, "loss": 0.4599, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08031359314918518, "rewards/margins": 0.6155234575271606, "rewards/rejected": -0.5352098941802979, "step": 159 }, { "epoch": 0.9481481481481482, "grad_norm": 28.815045476308757, "learning_rate": 4.994726676233097e-07, "logits/chosen": -1.3073569536209106, "logits/rejected": -1.3052654266357422, "logps/chosen": -50.287288665771484, "logps/rejected": -63.46202087402344, "loss": 0.4595, "rewards/accuracies": 0.5, "rewards/chosen": -0.18923993408679962, "rewards/margins": 0.39738625288009644, "rewards/rejected": -0.5866261720657349, "step": 160 }, { "epoch": 0.9540740740740741, "grad_norm": 30.804074809488196, "learning_rate": 4.994296536700177e-07, "logits/chosen": -1.2815802097320557, "logits/rejected": -1.3018033504486084, "logps/chosen": -57.259159088134766, "logps/rejected": -71.91606903076172, "loss": 0.4449, "rewards/accuracies": 0.875, "rewards/chosen": 0.25141018629074097, "rewards/margins": 1.3318860530853271, "rewards/rejected": -1.0804758071899414, "step": 161 }, { "epoch": 0.96, "grad_norm": 30.186158732166184, "learning_rate": 4.993849555116066e-07, "logits/chosen": -1.5598193407058716, "logits/rejected": -1.6054404973983765, "logps/chosen": -33.970062255859375, "logps/rejected": -45.955543518066406, "loss": 0.492, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08695819228887558, "rewards/margins": 0.38726359605789185, "rewards/rejected": -0.3003053665161133, "step": 162 }, { "epoch": 0.965925925925926, "grad_norm": 23.80589053234063, "learning_rate": 4.993385734498887e-07, "logits/chosen": -1.3942906856536865, "logits/rejected": -1.3598270416259766, "logps/chosen": -46.04230499267578, "logps/rejected": -64.92222595214844, "loss": 0.4004, "rewards/accuracies": 0.9375, "rewards/chosen": 0.055475056171417236, "rewards/margins": 1.0718060731887817, "rewards/rejected": -1.0163309574127197, "step": 163 }, { "epoch": 0.9718518518518519, "grad_norm": 35.77255758808572, "learning_rate": 4.992905077980461e-07, "logits/chosen": -1.2013689279556274, "logits/rejected": -1.2357840538024902, "logps/chosen": -54.10811996459961, "logps/rejected": -57.42766571044922, "loss": 0.515, "rewards/accuracies": 0.875, "rewards/chosen": -0.21006661653518677, "rewards/margins": 0.9132038354873657, "rewards/rejected": -1.1232705116271973, "step": 164 }, { "epoch": 0.9777777777777777, "grad_norm": 31.178136497144976, "learning_rate": 4.992407588806287e-07, "logits/chosen": -1.5258628129959106, "logits/rejected": -1.5229212045669556, "logps/chosen": -44.14244842529297, "logps/rejected": -53.07622528076172, "loss": 0.5034, "rewards/accuracies": 0.75, "rewards/chosen": 0.16167354583740234, "rewards/margins": 0.8384672403335571, "rewards/rejected": -0.6767936944961548, "step": 165 }, { "epoch": 0.9837037037037037, "grad_norm": 27.540178875328728, "learning_rate": 4.991893270335525e-07, "logits/chosen": -1.2302640676498413, "logits/rejected": -1.263055682182312, "logps/chosen": -36.58562088012695, "logps/rejected": -54.91083526611328, "loss": 0.4367, "rewards/accuracies": 0.75, "rewards/chosen": 0.26324182748794556, "rewards/margins": 1.0756839513778687, "rewards/rejected": -0.8124420642852783, "step": 166 }, { "epoch": 0.9896296296296296, "grad_norm": 29.12805068065961, "learning_rate": 4.991362126040969e-07, "logits/chosen": -1.5359268188476562, "logits/rejected": -1.68173348903656, "logps/chosen": -38.305641174316406, "logps/rejected": -54.66886520385742, "loss": 0.5059, "rewards/accuracies": 0.625, "rewards/chosen": 0.1688043177127838, "rewards/margins": 0.6014243364334106, "rewards/rejected": -0.43262001872062683, "step": 167 }, { "epoch": 0.9955555555555555, "grad_norm": 30.66836706642539, "learning_rate": 4.990814159509024e-07, "logits/chosen": -1.2793900966644287, "logits/rejected": -1.3615354299545288, "logps/chosen": -52.69702911376953, "logps/rejected": -45.728546142578125, "loss": 0.4884, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0890645831823349, "rewards/margins": 0.5669555068016052, "rewards/rejected": -0.6560201048851013, "step": 168 }, { "epoch": 1.0014814814814814, "grad_norm": 29.387874732293323, "learning_rate": 4.990249374439684e-07, "logits/chosen": -1.24739670753479, "logits/rejected": -1.3176863193511963, "logps/chosen": -38.417179107666016, "logps/rejected": -48.35388946533203, "loss": 0.4576, "rewards/accuracies": 0.875, "rewards/chosen": 0.25613975524902344, "rewards/margins": 1.03694486618042, "rewards/rejected": -0.7808051109313965, "step": 169 }, { "epoch": 1.0074074074074073, "grad_norm": 29.586273409908994, "learning_rate": 4.989667774646505e-07, "logits/chosen": -1.222827672958374, "logits/rejected": -1.2669568061828613, "logps/chosen": -56.15669631958008, "logps/rejected": -51.149871826171875, "loss": 0.5138, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16251149773597717, "rewards/margins": 0.9217841029167175, "rewards/rejected": -0.7592726945877075, "step": 170 }, { "epoch": 1.0133333333333334, "grad_norm": 29.651889304281102, "learning_rate": 4.989069364056579e-07, "logits/chosen": -1.3727763891220093, "logits/rejected": -1.366886019706726, "logps/chosen": -48.37831115722656, "logps/rejected": -37.613990783691406, "loss": 0.4543, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05444521829485893, "rewards/margins": 0.8736615777015686, "rewards/rejected": -0.8192163705825806, "step": 171 }, { "epoch": 1.0192592592592593, "grad_norm": 24.451172053007166, "learning_rate": 4.98845414671051e-07, "logits/chosen": -1.58548903465271, "logits/rejected": -1.5485515594482422, "logps/chosen": -46.289329528808594, "logps/rejected": -54.086387634277344, "loss": 0.3671, "rewards/accuracies": 0.875, "rewards/chosen": 0.090296670794487, "rewards/margins": 1.2733991146087646, "rewards/rejected": -1.1831023693084717, "step": 172 }, { "epoch": 1.0251851851851852, "grad_norm": 25.18785787450401, "learning_rate": 4.987822126762382e-07, "logits/chosen": -1.3124250173568726, "logits/rejected": -1.350297212600708, "logps/chosen": -51.7904167175293, "logps/rejected": -57.94340896606445, "loss": 0.3698, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24363113939762115, "rewards/margins": 1.3216978311538696, "rewards/rejected": -1.0780668258666992, "step": 173 }, { "epoch": 1.031111111111111, "grad_norm": 27.160374266755905, "learning_rate": 4.987173308479737e-07, "logits/chosen": -1.3497616052627563, "logits/rejected": -1.3936090469360352, "logps/chosen": -51.94996643066406, "logps/rejected": -61.93647384643555, "loss": 0.4425, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05416693538427353, "rewards/margins": 1.2709107398986816, "rewards/rejected": -1.2167439460754395, "step": 174 }, { "epoch": 1.037037037037037, "grad_norm": 26.184809377938368, "learning_rate": 4.986507696243543e-07, "logits/chosen": -1.4122731685638428, "logits/rejected": -1.4073926210403442, "logps/chosen": -44.906639099121094, "logps/rejected": -60.822059631347656, "loss": 0.4117, "rewards/accuracies": 0.875, "rewards/chosen": 0.015085561200976372, "rewards/margins": 1.0686676502227783, "rewards/rejected": -1.0535821914672852, "step": 175 }, { "epoch": 1.0429629629629629, "grad_norm": 27.884920522386153, "learning_rate": 4.985825294548162e-07, "logits/chosen": -1.2345750331878662, "logits/rejected": -1.263419508934021, "logps/chosen": -57.070350646972656, "logps/rejected": -55.806800842285156, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": 0.012559197843074799, "rewards/margins": 1.3272449970245361, "rewards/rejected": -1.3146858215332031, "step": 176 }, { "epoch": 1.048888888888889, "grad_norm": 29.753523592942816, "learning_rate": 4.985126108001323e-07, "logits/chosen": -1.339246392250061, "logits/rejected": -1.3507680892944336, "logps/chosen": -47.27094268798828, "logps/rejected": -59.008148193359375, "loss": 0.4619, "rewards/accuracies": 0.75, "rewards/chosen": -0.20138312876224518, "rewards/margins": 1.376226544380188, "rewards/rejected": -1.5776095390319824, "step": 177 }, { "epoch": 1.0548148148148149, "grad_norm": 27.788857301475037, "learning_rate": 4.984410141324092e-07, "logits/chosen": -1.2924796342849731, "logits/rejected": -1.3166478872299194, "logps/chosen": -47.26884841918945, "logps/rejected": -58.67825698852539, "loss": 0.4425, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13158157467842102, "rewards/margins": 0.8055871725082397, "rewards/rejected": -0.9371688365936279, "step": 178 }, { "epoch": 1.0607407407407408, "grad_norm": 26.952026837887974, "learning_rate": 4.983677399350838e-07, "logits/chosen": -1.3298072814941406, "logits/rejected": -1.37056565284729, "logps/chosen": -45.015419006347656, "logps/rejected": -68.24140930175781, "loss": 0.4063, "rewards/accuracies": 0.75, "rewards/chosen": -0.010896991938352585, "rewards/margins": 1.1538221836090088, "rewards/rejected": -1.1647191047668457, "step": 179 }, { "epoch": 1.0666666666666667, "grad_norm": 27.254102685735607, "learning_rate": 4.982927887029197e-07, "logits/chosen": -1.3543680906295776, "logits/rejected": -1.4619414806365967, "logps/chosen": -47.35694885253906, "logps/rejected": -63.67597961425781, "loss": 0.4239, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11707471311092377, "rewards/margins": 0.8920746445655823, "rewards/rejected": -0.7749999165534973, "step": 180 }, { "epoch": 1.0725925925925925, "grad_norm": 32.735453727753146, "learning_rate": 4.982161609420047e-07, "logits/chosen": -1.3471641540527344, "logits/rejected": -1.3713692426681519, "logps/chosen": -45.259033203125, "logps/rejected": -72.70370483398438, "loss": 0.461, "rewards/accuracies": 0.875, "rewards/chosen": 0.009991548955440521, "rewards/margins": 1.367063283920288, "rewards/rejected": -1.3570716381072998, "step": 181 }, { "epoch": 1.0785185185185184, "grad_norm": 26.00414349015101, "learning_rate": 4.981378571697466e-07, "logits/chosen": -1.2727996110916138, "logits/rejected": -1.2572351694107056, "logps/chosen": -49.19256591796875, "logps/rejected": -53.22026443481445, "loss": 0.4732, "rewards/accuracies": 0.75, "rewards/chosen": -0.25015491247177124, "rewards/margins": 0.5964915752410889, "rewards/rejected": -0.8466465473175049, "step": 182 }, { "epoch": 1.0844444444444445, "grad_norm": 31.07953663876989, "learning_rate": 4.980578779148702e-07, "logits/chosen": -1.1128710508346558, "logits/rejected": -1.1316419839859009, "logps/chosen": -41.58570861816406, "logps/rejected": -55.04689025878906, "loss": 0.4835, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11291462182998657, "rewards/margins": 0.8946691751480103, "rewards/rejected": -0.7817546129226685, "step": 183 }, { "epoch": 1.0903703703703704, "grad_norm": 25.932575954455, "learning_rate": 4.979762237174131e-07, "logits/chosen": -1.3252081871032715, "logits/rejected": -1.3753544092178345, "logps/chosen": -44.07130813598633, "logps/rejected": -65.07011413574219, "loss": 0.4, "rewards/accuracies": 0.875, "rewards/chosen": -0.3121752440929413, "rewards/margins": 0.870818018913269, "rewards/rejected": -1.1829932928085327, "step": 184 }, { "epoch": 1.0962962962962963, "grad_norm": 27.97479599690778, "learning_rate": 4.978928951287232e-07, "logits/chosen": -1.444221019744873, "logits/rejected": -1.5286879539489746, "logps/chosen": -58.87108612060547, "logps/rejected": -73.10811614990234, "loss": 0.4034, "rewards/accuracies": 0.875, "rewards/chosen": -0.29445308446884155, "rewards/margins": 1.092024326324463, "rewards/rejected": -1.3864774703979492, "step": 185 }, { "epoch": 1.1022222222222222, "grad_norm": 25.653701259430274, "learning_rate": 4.978078927114535e-07, "logits/chosen": -1.3871357440948486, "logits/rejected": -1.356289029121399, "logps/chosen": -38.077735900878906, "logps/rejected": -47.814632415771484, "loss": 0.3951, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3137326240539551, "rewards/margins": 0.9380292892456055, "rewards/rejected": -1.2517619132995605, "step": 186 }, { "epoch": 1.108148148148148, "grad_norm": 31.129655125600156, "learning_rate": 4.977212170395597e-07, "logits/chosen": -1.5308648347854614, "logits/rejected": -1.5554476976394653, "logps/chosen": -53.46440124511719, "logps/rejected": -63.11518096923828, "loss": 0.4521, "rewards/accuracies": 0.75, "rewards/chosen": -0.27529364824295044, "rewards/margins": 1.0567797422409058, "rewards/rejected": -1.3320733308792114, "step": 187 }, { "epoch": 1.114074074074074, "grad_norm": 30.100213679819685, "learning_rate": 4.976328686982954e-07, "logits/chosen": -1.2549948692321777, "logits/rejected": -1.3156468868255615, "logps/chosen": -43.605262756347656, "logps/rejected": -49.80906677246094, "loss": 0.453, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01309351995587349, "rewards/margins": 1.2752408981323242, "rewards/rejected": -1.2621474266052246, "step": 188 }, { "epoch": 1.12, "grad_norm": 28.949873010790967, "learning_rate": 4.975428482842082e-07, "logits/chosen": -1.2587721347808838, "logits/rejected": -1.433422565460205, "logps/chosen": -43.67359161376953, "logps/rejected": -71.4698257446289, "loss": 0.4513, "rewards/accuracies": 0.75, "rewards/chosen": -0.14791209995746613, "rewards/margins": 0.7650717496871948, "rewards/rejected": -0.9129838943481445, "step": 189 }, { "epoch": 1.125925925925926, "grad_norm": 21.901565858294845, "learning_rate": 4.974511564051367e-07, "logits/chosen": -1.3661924600601196, "logits/rejected": -1.418287992477417, "logps/chosen": -42.59568786621094, "logps/rejected": -52.789920806884766, "loss": 0.3271, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16271501779556274, "rewards/margins": 1.124340534210205, "rewards/rejected": -0.9616254568099976, "step": 190 }, { "epoch": 1.1318518518518519, "grad_norm": 27.131664647740493, "learning_rate": 4.973577936802046e-07, "logits/chosen": -1.3555781841278076, "logits/rejected": -1.3374905586242676, "logps/chosen": -50.74223327636719, "logps/rejected": -57.81810760498047, "loss": 0.4049, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07096290588378906, "rewards/margins": 0.9010239243507385, "rewards/rejected": -0.9719868302345276, "step": 191 }, { "epoch": 1.1377777777777778, "grad_norm": 26.914531737531586, "learning_rate": 4.972627607398182e-07, "logits/chosen": -1.3602168560028076, "logits/rejected": -1.4505767822265625, "logps/chosen": -56.723594665527344, "logps/rejected": -67.85716247558594, "loss": 0.4419, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19218865036964417, "rewards/margins": 0.9808802008628845, "rewards/rejected": -1.173068881034851, "step": 192 }, { "epoch": 1.1437037037037037, "grad_norm": 28.445309646618472, "learning_rate": 4.971660582256614e-07, "logits/chosen": -1.4276717901229858, "logits/rejected": -1.4692355394363403, "logps/chosen": -43.50633239746094, "logps/rejected": -47.10520935058594, "loss": 0.3911, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3843527138233185, "rewards/margins": 0.7021951675415039, "rewards/rejected": -1.0865478515625, "step": 193 }, { "epoch": 1.1496296296296296, "grad_norm": 25.24571619509311, "learning_rate": 4.970676867906911e-07, "logits/chosen": -1.3926632404327393, "logits/rejected": -1.4838123321533203, "logps/chosen": -51.921207427978516, "logps/rejected": -66.0678939819336, "loss": 0.3295, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0005776286125183105, "rewards/margins": 1.9195170402526855, "rewards/rejected": -1.9189393520355225, "step": 194 }, { "epoch": 1.1555555555555554, "grad_norm": 25.789717188474583, "learning_rate": 4.969676470991335e-07, "logits/chosen": -1.3526115417480469, "logits/rejected": -1.3458001613616943, "logps/chosen": -46.61028289794922, "logps/rejected": -65.9991455078125, "loss": 0.4138, "rewards/accuracies": 0.875, "rewards/chosen": -0.06992607563734055, "rewards/margins": 1.478632926940918, "rewards/rejected": -1.5485591888427734, "step": 195 }, { "epoch": 1.1614814814814816, "grad_norm": 24.887314733226713, "learning_rate": 4.96865939826479e-07, "logits/chosen": -1.3574655055999756, "logits/rejected": -1.4423139095306396, "logps/chosen": -54.61656951904297, "logps/rejected": -59.239402770996094, "loss": 0.3951, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1115683764219284, "rewards/margins": 1.4205255508422852, "rewards/rejected": -1.53209388256073, "step": 196 }, { "epoch": 1.1674074074074074, "grad_norm": 28.05153572169805, "learning_rate": 4.967625656594781e-07, "logits/chosen": -1.4286890029907227, "logits/rejected": -1.3814754486083984, "logps/chosen": -55.42445755004883, "logps/rejected": -51.32583999633789, "loss": 0.4464, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4502965807914734, "rewards/margins": 0.5142717361450195, "rewards/rejected": -0.9645683765411377, "step": 197 }, { "epoch": 1.1733333333333333, "grad_norm": 23.637547662812796, "learning_rate": 4.966575252961365e-07, "logits/chosen": -1.3578917980194092, "logits/rejected": -1.3719455003738403, "logps/chosen": -49.35991668701172, "logps/rejected": -51.54848098754883, "loss": 0.3677, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2137174755334854, "rewards/margins": 0.9547407627105713, "rewards/rejected": -1.168458342552185, "step": 198 }, { "epoch": 1.1792592592592592, "grad_norm": 26.3291433097316, "learning_rate": 4.9655081944571e-07, "logits/chosen": -1.4350993633270264, "logits/rejected": -1.4053765535354614, "logps/chosen": -43.111515045166016, "logps/rejected": -50.419593811035156, "loss": 0.4421, "rewards/accuracies": 0.875, "rewards/chosen": -0.22111950814723969, "rewards/margins": 0.6849695444107056, "rewards/rejected": -0.9060890674591064, "step": 199 }, { "epoch": 1.1851851851851851, "grad_norm": 28.107972878485548, "learning_rate": 4.964424488287009e-07, "logits/chosen": -1.389237642288208, "logits/rejected": -1.2933259010314941, "logps/chosen": -54.438865661621094, "logps/rejected": -51.278350830078125, "loss": 0.437, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09371478110551834, "rewards/margins": 0.9150485992431641, "rewards/rejected": -1.0087634325027466, "step": 200 }, { "epoch": 1.1911111111111112, "grad_norm": 30.07536501432843, "learning_rate": 4.963324141768518e-07, "logits/chosen": -1.4005461931228638, "logits/rejected": -1.3559218645095825, "logps/chosen": -48.987884521484375, "logps/rejected": -62.02407455444336, "loss": 0.4439, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49109989404678345, "rewards/margins": 0.905448317527771, "rewards/rejected": -1.3965482711791992, "step": 201 }, { "epoch": 1.1970370370370371, "grad_norm": 25.90426712829728, "learning_rate": 4.962207162331414e-07, "logits/chosen": -1.3146004676818848, "logits/rejected": -1.5756001472473145, "logps/chosen": -44.21037292480469, "logps/rejected": -55.566314697265625, "loss": 0.3948, "rewards/accuracies": 0.875, "rewards/chosen": -0.10266552865505219, "rewards/margins": 1.2562565803527832, "rewards/rejected": -1.3589221239089966, "step": 202 }, { "epoch": 1.202962962962963, "grad_norm": 27.074271726368853, "learning_rate": 4.961073557517792e-07, "logits/chosen": -1.1685363054275513, "logits/rejected": -1.2372539043426514, "logps/chosen": -42.1467170715332, "logps/rejected": -44.715858459472656, "loss": 0.4235, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13077600300312042, "rewards/margins": 1.0918397903442383, "rewards/rejected": -0.961063802242279, "step": 203 }, { "epoch": 1.208888888888889, "grad_norm": 27.11020646169495, "learning_rate": 4.95992333498201e-07, "logits/chosen": -1.3404663801193237, "logits/rejected": -1.4096630811691284, "logps/chosen": -50.35668182373047, "logps/rejected": -61.568275451660156, "loss": 0.4222, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15395641326904297, "rewards/margins": 1.2140846252441406, "rewards/rejected": -1.3680410385131836, "step": 204 }, { "epoch": 1.2148148148148148, "grad_norm": 25.823454078758132, "learning_rate": 4.958756502490626e-07, "logits/chosen": -1.3623404502868652, "logits/rejected": -1.361901044845581, "logps/chosen": -42.57477569580078, "logps/rejected": -63.756248474121094, "loss": 0.3732, "rewards/accuracies": 0.875, "rewards/chosen": -0.12022893130779266, "rewards/margins": 1.9076236486434937, "rewards/rejected": -2.0278525352478027, "step": 205 }, { "epoch": 1.2207407407407407, "grad_norm": 23.999886792109795, "learning_rate": 4.957573067922359e-07, "logits/chosen": -1.3816536664962769, "logits/rejected": -1.3499540090560913, "logps/chosen": -42.67706298828125, "logps/rejected": -61.45965576171875, "loss": 0.3764, "rewards/accuracies": 0.875, "rewards/chosen": -0.23336170613765717, "rewards/margins": 1.017636775970459, "rewards/rejected": -1.2509984970092773, "step": 206 }, { "epoch": 1.2266666666666666, "grad_norm": 24.39207189325382, "learning_rate": 4.956373039268021e-07, "logits/chosen": -1.1838045120239258, "logits/rejected": -1.295671820640564, "logps/chosen": -49.812320709228516, "logps/rejected": -56.74592971801758, "loss": 0.3777, "rewards/accuracies": 1.0, "rewards/chosen": -0.1829771101474762, "rewards/margins": 1.9797238111495972, "rewards/rejected": -2.162700891494751, "step": 207 }, { "epoch": 1.2325925925925927, "grad_norm": 24.05256880321274, "learning_rate": 4.955156424630479e-07, "logits/chosen": -1.277360200881958, "logits/rejected": -1.250819206237793, "logps/chosen": -42.28697967529297, "logps/rejected": -58.031089782714844, "loss": 0.3739, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2733173370361328, "rewards/margins": 1.4574378728866577, "rewards/rejected": -1.7307552099227905, "step": 208 }, { "epoch": 1.2385185185185186, "grad_norm": 31.62267871119725, "learning_rate": 4.953923232224586e-07, "logits/chosen": -1.2416062355041504, "logits/rejected": -1.3119093179702759, "logps/chosen": -45.55648422241211, "logps/rejected": -50.12644577026367, "loss": 0.4265, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2814592123031616, "rewards/margins": 1.3618040084838867, "rewards/rejected": -1.643263339996338, "step": 209 }, { "epoch": 1.2444444444444445, "grad_norm": 20.192542093589164, "learning_rate": 4.952673470377137e-07, "logits/chosen": -1.2672306299209595, "logits/rejected": -1.3711098432540894, "logps/chosen": -45.2178840637207, "logps/rejected": -69.5517349243164, "loss": 0.3025, "rewards/accuracies": 0.875, "rewards/chosen": 0.025760654360055923, "rewards/margins": 1.5954328775405884, "rewards/rejected": -1.5696722269058228, "step": 210 }, { "epoch": 1.2503703703703704, "grad_norm": 29.815974642382578, "learning_rate": 4.951407147526803e-07, "logits/chosen": -1.4044413566589355, "logits/rejected": -1.4227166175842285, "logps/chosen": -50.24687194824219, "logps/rejected": -54.735660552978516, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": -0.3008299469947815, "rewards/margins": 1.338200330734253, "rewards/rejected": -1.6390303373336792, "step": 211 }, { "epoch": 1.2562962962962962, "grad_norm": 26.037475671487393, "learning_rate": 4.950124272224082e-07, "logits/chosen": -1.3218319416046143, "logits/rejected": -1.3979120254516602, "logps/chosen": -53.5397834777832, "logps/rejected": -61.23820114135742, "loss": 0.3906, "rewards/accuracies": 0.875, "rewards/chosen": -0.18688765168190002, "rewards/margins": 1.3252506256103516, "rewards/rejected": -1.5121382474899292, "step": 212 }, { "epoch": 1.2622222222222224, "grad_norm": 24.642049438405937, "learning_rate": 4.948824853131236e-07, "logits/chosen": -1.4031530618667603, "logits/rejected": -1.3881592750549316, "logps/chosen": -41.80998992919922, "logps/rejected": -48.333030700683594, "loss": 0.4209, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23013433814048767, "rewards/margins": 0.9226297736167908, "rewards/rejected": -1.152764081954956, "step": 213 }, { "epoch": 1.268148148148148, "grad_norm": 26.73226151907008, "learning_rate": 4.947508899022234e-07, "logits/chosen": -1.3019957542419434, "logits/rejected": -1.3482120037078857, "logps/chosen": -41.626182556152344, "logps/rejected": -45.29669189453125, "loss": 0.3644, "rewards/accuracies": 0.875, "rewards/chosen": -0.22154735028743744, "rewards/margins": 1.0375902652740479, "rewards/rejected": -1.2591376304626465, "step": 214 }, { "epoch": 1.2740740740740741, "grad_norm": 29.166969953835547, "learning_rate": 4.946176418782698e-07, "logits/chosen": -1.3099212646484375, "logits/rejected": -1.2414171695709229, "logps/chosen": -55.92378234863281, "logps/rejected": -71.39763641357422, "loss": 0.3894, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5391191244125366, "rewards/margins": 1.3320989608764648, "rewards/rejected": -1.871217966079712, "step": 215 }, { "epoch": 1.28, "grad_norm": 25.60855418641745, "learning_rate": 4.944827421409829e-07, "logits/chosen": -1.3376116752624512, "logits/rejected": -1.3164410591125488, "logps/chosen": -56.354408264160156, "logps/rejected": -60.64031982421875, "loss": 0.3622, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3232120871543884, "rewards/margins": 1.5070254802703857, "rewards/rejected": -1.830237627029419, "step": 216 }, { "epoch": 1.285925925925926, "grad_norm": 23.201414829534766, "learning_rate": 4.943461916012363e-07, "logits/chosen": -1.2318775653839111, "logits/rejected": -1.352461814880371, "logps/chosen": -50.486663818359375, "logps/rejected": -70.3978042602539, "loss": 0.3015, "rewards/accuracies": 0.875, "rewards/chosen": -0.272786945104599, "rewards/margins": 2.2941455841064453, "rewards/rejected": -2.5669326782226562, "step": 217 }, { "epoch": 1.2918518518518518, "grad_norm": 27.79922817942101, "learning_rate": 4.9420799118105e-07, "logits/chosen": -1.3263601064682007, "logits/rejected": -1.4090485572814941, "logps/chosen": -48.22421646118164, "logps/rejected": -53.179039001464844, "loss": 0.413, "rewards/accuracies": 0.875, "rewards/chosen": -0.4424872100353241, "rewards/margins": 1.7214093208312988, "rewards/rejected": -2.1638965606689453, "step": 218 }, { "epoch": 1.2977777777777777, "grad_norm": 23.11860382817, "learning_rate": 4.940681418135843e-07, "logits/chosen": -1.3301970958709717, "logits/rejected": -1.4387781620025635, "logps/chosen": -36.736915588378906, "logps/rejected": -66.50269317626953, "loss": 0.3156, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3005810081958771, "rewards/margins": 2.1743574142456055, "rewards/rejected": -2.47493839263916, "step": 219 }, { "epoch": 1.3037037037037038, "grad_norm": 24.918955286472, "learning_rate": 4.939266444431335e-07, "logits/chosen": -1.3525760173797607, "logits/rejected": -1.470273733139038, "logps/chosen": -46.7333984375, "logps/rejected": -73.63404083251953, "loss": 0.3665, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24659954011440277, "rewards/margins": 1.7434245347976685, "rewards/rejected": -1.9900240898132324, "step": 220 }, { "epoch": 1.3096296296296297, "grad_norm": 28.581360124769212, "learning_rate": 4.937835000251197e-07, "logits/chosen": -1.4148640632629395, "logits/rejected": -1.4724056720733643, "logps/chosen": -46.71776580810547, "logps/rejected": -69.32954406738281, "loss": 0.4222, "rewards/accuracies": 0.625, "rewards/chosen": -0.426167368888855, "rewards/margins": 1.0898077487945557, "rewards/rejected": -1.515974998474121, "step": 221 }, { "epoch": 1.3155555555555556, "grad_norm": 23.15045697400444, "learning_rate": 4.936387095260863e-07, "logits/chosen": -1.325617790222168, "logits/rejected": -1.3408955335617065, "logps/chosen": -39.602783203125, "logps/rejected": -63.946929931640625, "loss": 0.3595, "rewards/accuracies": 1.0, "rewards/chosen": -0.056969501078128815, "rewards/margins": 2.3829245567321777, "rewards/rejected": -2.439894199371338, "step": 222 }, { "epoch": 1.3214814814814815, "grad_norm": 26.031180737689365, "learning_rate": 4.934922739236912e-07, "logits/chosen": -1.5163429975509644, "logits/rejected": -1.6212719678878784, "logps/chosen": -43.200016021728516, "logps/rejected": -56.9548454284668, "loss": 0.4031, "rewards/accuracies": 0.875, "rewards/chosen": -0.2686010003089905, "rewards/margins": 1.1195085048675537, "rewards/rejected": -1.388109564781189, "step": 223 }, { "epoch": 1.3274074074074074, "grad_norm": 33.46497240818608, "learning_rate": 4.933441942067006e-07, "logits/chosen": -1.3373076915740967, "logits/rejected": -1.3908562660217285, "logps/chosen": -57.834842681884766, "logps/rejected": -68.81277465820312, "loss": 0.5067, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38047823309898376, "rewards/margins": 0.6113041639328003, "rewards/rejected": -0.9917824268341064, "step": 224 }, { "epoch": 1.3333333333333333, "grad_norm": 23.36650582203661, "learning_rate": 4.93194471374982e-07, "logits/chosen": -1.3187856674194336, "logits/rejected": -1.368605375289917, "logps/chosen": -46.856529235839844, "logps/rejected": -61.831233978271484, "loss": 0.3623, "rewards/accuracies": 0.75, "rewards/chosen": 0.002665497362613678, "rewards/margins": 1.1606879234313965, "rewards/rejected": -1.1580225229263306, "step": 225 }, { "epoch": 1.3392592592592591, "grad_norm": 25.411305437774953, "learning_rate": 4.930431064394976e-07, "logits/chosen": -1.3518116474151611, "logits/rejected": -1.386432409286499, "logps/chosen": -51.480796813964844, "logps/rejected": -51.128456115722656, "loss": 0.4065, "rewards/accuracies": 0.9375, "rewards/chosen": -0.33157169818878174, "rewards/margins": 0.9579052925109863, "rewards/rejected": -1.289476990699768, "step": 226 }, { "epoch": 1.3451851851851853, "grad_norm": 26.090560376399463, "learning_rate": 4.928901004222977e-07, "logits/chosen": -1.3473336696624756, "logits/rejected": -1.487202763557434, "logps/chosen": -49.934654235839844, "logps/rejected": -53.528663635253906, "loss": 0.3961, "rewards/accuracies": 0.75, "rewards/chosen": -0.47797858715057373, "rewards/margins": 1.3904893398284912, "rewards/rejected": -1.8684678077697754, "step": 227 }, { "epoch": 1.3511111111111112, "grad_norm": 27.75621449004402, "learning_rate": 4.92735454356513e-07, "logits/chosen": -1.464008092880249, "logits/rejected": -1.5646613836288452, "logps/chosen": -56.57294464111328, "logps/rejected": -68.8311767578125, "loss": 0.3841, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5350128412246704, "rewards/margins": 1.29872465133667, "rewards/rejected": -1.8337376117706299, "step": 228 }, { "epoch": 1.357037037037037, "grad_norm": 25.729715098580982, "learning_rate": 4.925791692863488e-07, "logits/chosen": -1.3492538928985596, "logits/rejected": -1.3906590938568115, "logps/chosen": -40.129737854003906, "logps/rejected": -50.684967041015625, "loss": 0.4227, "rewards/accuracies": 0.875, "rewards/chosen": -0.32827675342559814, "rewards/margins": 1.1863243579864502, "rewards/rejected": -1.5146011114120483, "step": 229 }, { "epoch": 1.362962962962963, "grad_norm": 26.449607937790024, "learning_rate": 4.924212462670768e-07, "logits/chosen": -1.381973385810852, "logits/rejected": -1.3976202011108398, "logps/chosen": -53.29833984375, "logps/rejected": -59.929779052734375, "loss": 0.3469, "rewards/accuracies": 0.875, "rewards/chosen": -0.11365138739347458, "rewards/margins": 1.7214391231536865, "rewards/rejected": -1.8350905179977417, "step": 230 }, { "epoch": 1.3688888888888888, "grad_norm": 23.43023680474791, "learning_rate": 4.922616863650289e-07, "logits/chosen": -1.1934809684753418, "logits/rejected": -1.2369165420532227, "logps/chosen": -50.21738815307617, "logps/rejected": -71.64352416992188, "loss": 0.3587, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4246065616607666, "rewards/margins": 1.7253427505493164, "rewards/rejected": -2.149949073791504, "step": 231 }, { "epoch": 1.374814814814815, "grad_norm": 28.666265083827813, "learning_rate": 4.921004906575896e-07, "logits/chosen": -1.2596248388290405, "logits/rejected": -1.3159894943237305, "logps/chosen": -51.569847106933594, "logps/rejected": -60.562618255615234, "loss": 0.4363, "rewards/accuracies": 0.875, "rewards/chosen": -0.031792208552360535, "rewards/margins": 1.6072309017181396, "rewards/rejected": -1.6390230655670166, "step": 232 }, { "epoch": 1.3807407407407408, "grad_norm": 32.110066137255615, "learning_rate": 4.919376602331883e-07, "logits/chosen": -1.0603680610656738, "logits/rejected": -1.0503628253936768, "logps/chosen": -61.34019470214844, "logps/rejected": -65.20555114746094, "loss": 0.4379, "rewards/accuracies": 1.0, "rewards/chosen": -0.6490658521652222, "rewards/margins": 2.534877300262451, "rewards/rejected": -3.183943271636963, "step": 233 }, { "epoch": 1.3866666666666667, "grad_norm": 25.074079766201187, "learning_rate": 4.917731961912926e-07, "logits/chosen": -1.4197475910186768, "logits/rejected": -1.4189743995666504, "logps/chosen": -43.400482177734375, "logps/rejected": -53.225189208984375, "loss": 0.3515, "rewards/accuracies": 1.0, "rewards/chosen": 0.06946276128292084, "rewards/margins": 1.7199580669403076, "rewards/rejected": -1.6504952907562256, "step": 234 }, { "epoch": 1.3925925925925926, "grad_norm": 26.279570972884244, "learning_rate": 4.91607099642401e-07, "logits/chosen": -1.4161285161972046, "logits/rejected": -1.344247817993164, "logps/chosen": -52.5662841796875, "logps/rejected": -52.22835159301758, "loss": 0.4144, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3079703748226166, "rewards/margins": 1.408158540725708, "rewards/rejected": -1.716128945350647, "step": 235 }, { "epoch": 1.3985185185185185, "grad_norm": 22.71741763432061, "learning_rate": 4.914393717080346e-07, "logits/chosen": -1.3715291023254395, "logits/rejected": -1.4043883085250854, "logps/chosen": -40.7462272644043, "logps/rejected": -51.91321563720703, "loss": 0.3452, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11906649172306061, "rewards/margins": 1.0093883275985718, "rewards/rejected": -1.1284549236297607, "step": 236 }, { "epoch": 1.4044444444444444, "grad_norm": 26.65603294074516, "learning_rate": 4.9127001352073e-07, "logits/chosen": -1.4703233242034912, "logits/rejected": -1.538904070854187, "logps/chosen": -45.97296142578125, "logps/rejected": -65.32240295410156, "loss": 0.3828, "rewards/accuracies": 0.8125, "rewards/chosen": -0.41329696774482727, "rewards/margins": 1.1328431367874146, "rewards/rejected": -1.5461399555206299, "step": 237 }, { "epoch": 1.4103703703703703, "grad_norm": 25.231239292504032, "learning_rate": 4.910990262240321e-07, "logits/chosen": -1.407243251800537, "logits/rejected": -1.4713376760482788, "logps/chosen": -42.070404052734375, "logps/rejected": -47.037384033203125, "loss": 0.3242, "rewards/accuracies": 0.875, "rewards/chosen": -0.1005098894238472, "rewards/margins": 1.4259945154190063, "rewards/rejected": -1.526504397392273, "step": 238 }, { "epoch": 1.4162962962962964, "grad_norm": 24.015207965240965, "learning_rate": 4.909264109724852e-07, "logits/chosen": -1.3257907629013062, "logits/rejected": -1.2598779201507568, "logps/chosen": -41.595149993896484, "logps/rejected": -58.04209899902344, "loss": 0.3351, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09786944091320038, "rewards/margins": 1.5942250490188599, "rewards/rejected": -1.4963555335998535, "step": 239 }, { "epoch": 1.4222222222222223, "grad_norm": 29.87686057856591, "learning_rate": 4.907521689316265e-07, "logits/chosen": -1.4115238189697266, "logits/rejected": -1.3675273656845093, "logps/chosen": -36.79245376586914, "logps/rejected": -73.65255737304688, "loss": 0.4191, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04488750547170639, "rewards/margins": 1.4026293754577637, "rewards/rejected": -1.4475167989730835, "step": 240 }, { "epoch": 1.4281481481481482, "grad_norm": 21.03692614805522, "learning_rate": 4.905763012779775e-07, "logits/chosen": -1.3066779375076294, "logits/rejected": -1.360036015510559, "logps/chosen": -57.45732879638672, "logps/rejected": -75.41363525390625, "loss": 0.281, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6602566838264465, "rewards/margins": 2.5916683673858643, "rewards/rejected": -3.251924991607666, "step": 241 }, { "epoch": 1.434074074074074, "grad_norm": 25.75436647076939, "learning_rate": 4.90398809199036e-07, "logits/chosen": -1.2956441640853882, "logits/rejected": -1.4834749698638916, "logps/chosen": -54.38095474243164, "logps/rejected": -57.828269958496094, "loss": 0.3572, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38308632373809814, "rewards/margins": 1.9507379531860352, "rewards/rejected": -2.333824396133423, "step": 242 }, { "epoch": 1.44, "grad_norm": 29.700920716890522, "learning_rate": 4.902196938932685e-07, "logits/chosen": -1.2027655839920044, "logits/rejected": -1.2516043186187744, "logps/chosen": -42.96432876586914, "logps/rejected": -51.21244812011719, "loss": 0.3837, "rewards/accuracies": 0.75, "rewards/chosen": -0.12649276852607727, "rewards/margins": 1.136317253112793, "rewards/rejected": -1.2628101110458374, "step": 243 }, { "epoch": 1.445925925925926, "grad_norm": 23.713492205204403, "learning_rate": 4.90038956570102e-07, "logits/chosen": -1.4097803831100464, "logits/rejected": -1.418364405632019, "logps/chosen": -56.92813491821289, "logps/rejected": -49.77784729003906, "loss": 0.3147, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1759030818939209, "rewards/margins": 0.9846144914627075, "rewards/rejected": -1.1605175733566284, "step": 244 }, { "epoch": 1.4518518518518517, "grad_norm": 23.75978131455477, "learning_rate": 4.898565984499153e-07, "logits/chosen": -1.3047959804534912, "logits/rejected": -1.3336838483810425, "logps/chosen": -37.61913299560547, "logps/rejected": -66.44524383544922, "loss": 0.3504, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11530294269323349, "rewards/margins": 1.62274968624115, "rewards/rejected": -1.7380526065826416, "step": 245 }, { "epoch": 1.4577777777777778, "grad_norm": 26.759728007939124, "learning_rate": 4.896726207640314e-07, "logits/chosen": -1.3182926177978516, "logits/rejected": -1.2883374691009521, "logps/chosen": -58.83563232421875, "logps/rejected": -50.09530258178711, "loss": 0.3832, "rewards/accuracies": 0.875, "rewards/chosen": -0.3371252119541168, "rewards/margins": 1.3079252243041992, "rewards/rejected": -1.6450505256652832, "step": 246 }, { "epoch": 1.4637037037037037, "grad_norm": 26.290819540180618, "learning_rate": 4.894870247547093e-07, "logits/chosen": -1.2748501300811768, "logits/rejected": -1.4458961486816406, "logps/chosen": -32.43407440185547, "logps/rejected": -57.296913146972656, "loss": 0.3684, "rewards/accuracies": 0.875, "rewards/chosen": -0.279015451669693, "rewards/margins": 1.564210057258606, "rewards/rejected": -1.8432254791259766, "step": 247 }, { "epoch": 1.4696296296296296, "grad_norm": 24.824183393994925, "learning_rate": 4.892998116751348e-07, "logits/chosen": -1.3820867538452148, "logits/rejected": -1.3794804811477661, "logps/chosen": -40.21681594848633, "logps/rejected": -47.595802307128906, "loss": 0.3566, "rewards/accuracies": 0.8125, "rewards/chosen": -0.015096355229616165, "rewards/margins": 0.9623454809188843, "rewards/rejected": -0.9774419665336609, "step": 248 }, { "epoch": 1.4755555555555555, "grad_norm": 21.650609907887063, "learning_rate": 4.891109827894127e-07, "logits/chosen": -1.2940155267715454, "logits/rejected": -1.4335472583770752, "logps/chosen": -74.19985961914062, "logps/rejected": -70.36446380615234, "loss": 0.3129, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0672260969877243, "rewards/margins": 2.0223352909088135, "rewards/rejected": -2.0895614624023438, "step": 249 }, { "epoch": 1.4814814814814814, "grad_norm": 26.35700342159131, "learning_rate": 4.889205393725583e-07, "logits/chosen": -1.1685283184051514, "logits/rejected": -1.2299532890319824, "logps/chosen": -41.14178466796875, "logps/rejected": -58.808467864990234, "loss": 0.3688, "rewards/accuracies": 0.9375, "rewards/chosen": 0.019259896129369736, "rewards/margins": 1.9858262538909912, "rewards/rejected": -1.9665664434432983, "step": 250 }, { "epoch": 1.4874074074074075, "grad_norm": 26.0235689821749, "learning_rate": 4.887284827104881e-07, "logits/chosen": -1.392974615097046, "logits/rejected": -1.505925178527832, "logps/chosen": -36.905818939208984, "logps/rejected": -79.7839584350586, "loss": 0.321, "rewards/accuracies": 1.0, "rewards/chosen": 0.17365923523902893, "rewards/margins": 2.4765334129333496, "rewards/rejected": -2.3028745651245117, "step": 251 }, { "epoch": 1.4933333333333334, "grad_norm": 29.278848450522304, "learning_rate": 4.885348141000122e-07, "logits/chosen": -1.2471200227737427, "logits/rejected": -1.2988287210464478, "logps/chosen": -39.52333450317383, "logps/rejected": -55.919002532958984, "loss": 0.3928, "rewards/accuracies": 1.0, "rewards/chosen": 0.18332645297050476, "rewards/margins": 2.614619731903076, "rewards/rejected": -2.431293249130249, "step": 252 }, { "epoch": 1.4992592592592593, "grad_norm": 28.042790270776827, "learning_rate": 4.883395348488243e-07, "logits/chosen": -1.3611935377120972, "logits/rejected": -1.4138695001602173, "logps/chosen": -57.63732147216797, "logps/rejected": -60.91813278198242, "loss": 0.4144, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10321831703186035, "rewards/margins": 1.9807438850402832, "rewards/rejected": -2.0839624404907227, "step": 253 }, { "epoch": 1.5051851851851852, "grad_norm": 25.03952491402299, "learning_rate": 4.88142646275494e-07, "logits/chosen": -1.1539257764816284, "logits/rejected": -1.1693516969680786, "logps/chosen": -43.72496032714844, "logps/rejected": -55.043235778808594, "loss": 0.304, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12283861637115479, "rewards/margins": 1.9001600742340088, "rewards/rejected": -1.7773215770721436, "step": 254 }, { "epoch": 1.511111111111111, "grad_norm": 29.48314886820695, "learning_rate": 4.879441497094572e-07, "logits/chosen": -1.3410661220550537, "logits/rejected": -1.4779433012008667, "logps/chosen": -42.038639068603516, "logps/rejected": -49.66044616699219, "loss": 0.4142, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0242156982421875, "rewards/margins": 1.087928295135498, "rewards/rejected": -1.0637125968933105, "step": 255 }, { "epoch": 1.5170370370370372, "grad_norm": 28.638225672957248, "learning_rate": 4.877440464910073e-07, "logits/chosen": -1.2495319843292236, "logits/rejected": -1.303146243095398, "logps/chosen": -42.27051544189453, "logps/rejected": -58.957298278808594, "loss": 0.4303, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2611178457736969, "rewards/margins": 0.8532897233963013, "rewards/rejected": -1.1144076585769653, "step": 256 }, { "epoch": 1.5229629629629629, "grad_norm": 27.33028444497588, "learning_rate": 4.875423379712864e-07, "logits/chosen": -1.2766767740249634, "logits/rejected": -1.296708106994629, "logps/chosen": -50.208858489990234, "logps/rejected": -71.75860595703125, "loss": 0.3848, "rewards/accuracies": 1.0, "rewards/chosen": -0.22476840019226074, "rewards/margins": 1.9250941276550293, "rewards/rejected": -2.14986252784729, "step": 257 }, { "epoch": 1.528888888888889, "grad_norm": 24.26952968926095, "learning_rate": 4.873390255122756e-07, "logits/chosen": -1.2495331764221191, "logits/rejected": -1.2845350503921509, "logps/chosen": -47.2353515625, "logps/rejected": -53.62681579589844, "loss": 0.3365, "rewards/accuracies": 0.875, "rewards/chosen": -0.013976313173770905, "rewards/margins": 1.982759714126587, "rewards/rejected": -1.9967360496520996, "step": 258 }, { "epoch": 1.5348148148148149, "grad_norm": 23.570755979542007, "learning_rate": 4.871341104867864e-07, "logits/chosen": -1.4904160499572754, "logits/rejected": -1.452820897102356, "logps/chosen": -50.363956451416016, "logps/rejected": -59.957847595214844, "loss": 0.3653, "rewards/accuracies": 0.75, "rewards/chosen": -0.100559301674366, "rewards/margins": 1.7651526927947998, "rewards/rejected": -1.8657116889953613, "step": 259 }, { "epoch": 1.5407407407407407, "grad_norm": 26.13987494401341, "learning_rate": 4.869275942784511e-07, "logits/chosen": -1.2994194030761719, "logits/rejected": -1.354923129081726, "logps/chosen": -36.94548034667969, "logps/rejected": -54.38296890258789, "loss": 0.3776, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19663068652153015, "rewards/margins": 1.3016669750213623, "rewards/rejected": -1.4982978105545044, "step": 260 }, { "epoch": 1.5466666666666666, "grad_norm": 25.935557135061014, "learning_rate": 4.867194782817137e-07, "logits/chosen": -1.2576755285263062, "logits/rejected": -1.2709205150604248, "logps/chosen": -47.876617431640625, "logps/rejected": -50.99798583984375, "loss": 0.42, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2130255401134491, "rewards/margins": 1.1574770212173462, "rewards/rejected": -0.9444514513015747, "step": 261 }, { "epoch": 1.5525925925925925, "grad_norm": 25.73949104392821, "learning_rate": 4.865097639018202e-07, "logits/chosen": -1.3414067029953003, "logits/rejected": -1.3837270736694336, "logps/chosen": -51.21342468261719, "logps/rejected": -60.94746017456055, "loss": 0.3494, "rewards/accuracies": 0.9375, "rewards/chosen": -0.38993364572525024, "rewards/margins": 1.3145833015441895, "rewards/rejected": -1.704516887664795, "step": 262 }, { "epoch": 1.5585185185185186, "grad_norm": 26.371785681923715, "learning_rate": 4.862984525548091e-07, "logits/chosen": -1.208691954612732, "logits/rejected": -1.32695472240448, "logps/chosen": -41.11842346191406, "logps/rejected": -51.61888885498047, "loss": 0.3775, "rewards/accuracies": 0.75, "rewards/chosen": -0.16439585387706757, "rewards/margins": 1.7332050800323486, "rewards/rejected": -1.8976010084152222, "step": 263 }, { "epoch": 1.5644444444444443, "grad_norm": 23.641754357015362, "learning_rate": 4.860855456675024e-07, "logits/chosen": -1.1456029415130615, "logits/rejected": -1.2467702627182007, "logps/chosen": -44.02134704589844, "logps/rejected": -69.17605590820312, "loss": 0.3844, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015612021088600159, "rewards/margins": 2.685401439666748, "rewards/rejected": -2.6697897911071777, "step": 264 }, { "epoch": 1.5703703703703704, "grad_norm": 31.566770567869234, "learning_rate": 4.85871044677495e-07, "logits/chosen": -1.291245460510254, "logits/rejected": -1.2862074375152588, "logps/chosen": -48.08729553222656, "logps/rejected": -61.76681900024414, "loss": 0.4026, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2082689106464386, "rewards/margins": 1.9027234315872192, "rewards/rejected": -2.110992193222046, "step": 265 }, { "epoch": 1.5762962962962963, "grad_norm": 23.191781424675476, "learning_rate": 4.856549510331461e-07, "logits/chosen": -1.3137812614440918, "logits/rejected": -1.4595973491668701, "logps/chosen": -45.001014709472656, "logps/rejected": -58.4334602355957, "loss": 0.3633, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22389988601207733, "rewards/margins": 1.405003309249878, "rewards/rejected": -1.6289031505584717, "step": 266 }, { "epoch": 1.5822222222222222, "grad_norm": 27.971724212537442, "learning_rate": 4.854372661935684e-07, "logits/chosen": -1.1602957248687744, "logits/rejected": -1.2511422634124756, "logps/chosen": -44.95030212402344, "logps/rejected": -56.273780822753906, "loss": 0.3956, "rewards/accuracies": 0.875, "rewards/chosen": 0.07271359860897064, "rewards/margins": 1.2605903148651123, "rewards/rejected": -1.1878767013549805, "step": 267 }, { "epoch": 1.5881481481481483, "grad_norm": 24.276028981106442, "learning_rate": 4.852179916286189e-07, "logits/chosen": -1.262486457824707, "logits/rejected": -1.4016473293304443, "logps/chosen": -44.76614761352539, "logps/rejected": -56.81776428222656, "loss": 0.3524, "rewards/accuracies": 0.875, "rewards/chosen": -0.07598677277565002, "rewards/margins": 1.7435487508773804, "rewards/rejected": -1.819535493850708, "step": 268 }, { "epoch": 1.594074074074074, "grad_norm": 23.461378245972664, "learning_rate": 4.849971288188889e-07, "logits/chosen": -1.4302575588226318, "logits/rejected": -1.5334992408752441, "logps/chosen": -41.38616180419922, "logps/rejected": -59.964263916015625, "loss": 0.3707, "rewards/accuracies": 0.875, "rewards/chosen": 0.13780353963375092, "rewards/margins": 1.4046378135681152, "rewards/rejected": -1.2668342590332031, "step": 269 }, { "epoch": 1.6, "grad_norm": 24.37626653853462, "learning_rate": 4.847746792556936e-07, "logits/chosen": -1.2448111772537231, "logits/rejected": -1.313913345336914, "logps/chosen": -51.415077209472656, "logps/rejected": -53.74933624267578, "loss": 0.356, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7689206600189209, "rewards/margins": 1.6365911960601807, "rewards/rejected": -2.4055118560791016, "step": 270 }, { "epoch": 1.605925925925926, "grad_norm": 20.090159347357265, "learning_rate": 4.845506444410626e-07, "logits/chosen": -1.2469216585159302, "logits/rejected": -1.2296561002731323, "logps/chosen": -42.26240158081055, "logps/rejected": -45.776145935058594, "loss": 0.2883, "rewards/accuracies": 0.875, "rewards/chosen": -0.3450557589530945, "rewards/margins": 1.5731394290924072, "rewards/rejected": -1.9181952476501465, "step": 271 }, { "epoch": 1.6118518518518519, "grad_norm": 19.791166531137915, "learning_rate": 4.843250258877294e-07, "logits/chosen": -1.2774832248687744, "logits/rejected": -1.3242905139923096, "logps/chosen": -46.619590759277344, "logps/rejected": -50.182044982910156, "loss": 0.2691, "rewards/accuracies": 0.875, "rewards/chosen": -0.07584185898303986, "rewards/margins": 1.252159833908081, "rewards/rejected": -1.328001618385315, "step": 272 }, { "epoch": 1.6177777777777778, "grad_norm": 29.649066059249037, "learning_rate": 4.840978251191211e-07, "logits/chosen": -1.2474559545516968, "logits/rejected": -1.23881196975708, "logps/chosen": -53.4446907043457, "logps/rejected": -52.38796615600586, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": -0.4888032078742981, "rewards/margins": 1.5558326244354248, "rewards/rejected": -2.044635772705078, "step": 273 }, { "epoch": 1.6237037037037036, "grad_norm": 33.28057855157768, "learning_rate": 4.838690436693483e-07, "logits/chosen": -1.412484049797058, "logits/rejected": -1.4253754615783691, "logps/chosen": -75.49126434326172, "logps/rejected": -70.59172058105469, "loss": 0.4191, "rewards/accuracies": 0.875, "rewards/chosen": -0.3724600374698639, "rewards/margins": 1.891104817390442, "rewards/rejected": -2.2635648250579834, "step": 274 }, { "epoch": 1.6296296296296298, "grad_norm": 22.666180377284093, "learning_rate": 4.836386830831951e-07, "logits/chosen": -1.2181977033615112, "logits/rejected": -1.2449318170547485, "logps/chosen": -38.67042922973633, "logps/rejected": -54.85205841064453, "loss": 0.3325, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08570816367864609, "rewards/margins": 1.6117794513702393, "rewards/rejected": -1.526071310043335, "step": 275 }, { "epoch": 1.6355555555555554, "grad_norm": 25.751950738264924, "learning_rate": 4.834067449161077e-07, "logits/chosen": -1.2701869010925293, "logits/rejected": -1.3182072639465332, "logps/chosen": -47.42134094238281, "logps/rejected": -68.39168548583984, "loss": 0.3341, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11203102767467499, "rewards/margins": 2.025148630142212, "rewards/rejected": -2.1371796131134033, "step": 276 }, { "epoch": 1.6414814814814815, "grad_norm": 19.59862580042334, "learning_rate": 4.83173230734185e-07, "logits/chosen": -1.402850866317749, "logits/rejected": -1.4441189765930176, "logps/chosen": -44.067161560058594, "logps/rejected": -74.80207061767578, "loss": 0.2722, "rewards/accuracies": 0.875, "rewards/chosen": 0.06552604585886002, "rewards/margins": 1.8013237714767456, "rewards/rejected": -1.7357978820800781, "step": 277 }, { "epoch": 1.6474074074074074, "grad_norm": 23.22500510703153, "learning_rate": 4.829381421141671e-07, "logits/chosen": -1.2457361221313477, "logits/rejected": -1.3469531536102295, "logps/chosen": -38.14561462402344, "logps/rejected": -52.48051452636719, "loss": 0.3409, "rewards/accuracies": 0.75, "rewards/chosen": -0.2643684148788452, "rewards/margins": 1.3024507761001587, "rewards/rejected": -1.5668190717697144, "step": 278 }, { "epoch": 1.6533333333333333, "grad_norm": 24.46935150462778, "learning_rate": 4.827014806434253e-07, "logits/chosen": -1.3392874002456665, "logits/rejected": -1.398203730583191, "logps/chosen": -69.1489486694336, "logps/rejected": -86.5308837890625, "loss": 0.2984, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17087030410766602, "rewards/margins": 2.4526419639587402, "rewards/rejected": -2.6235122680664062, "step": 279 }, { "epoch": 1.6592592592592592, "grad_norm": 32.66878587176462, "learning_rate": 4.824632479199511e-07, "logits/chosen": -1.2298073768615723, "logits/rejected": -1.2714858055114746, "logps/chosen": -52.639041900634766, "logps/rejected": -57.43061828613281, "loss": 0.4427, "rewards/accuracies": 0.75, "rewards/chosen": -0.4236421585083008, "rewards/margins": 1.339827299118042, "rewards/rejected": -1.7634694576263428, "step": 280 }, { "epoch": 1.665185185185185, "grad_norm": 22.72790160810974, "learning_rate": 4.822234455523453e-07, "logits/chosen": -1.259157419204712, "logits/rejected": -1.3009653091430664, "logps/chosen": -40.12117385864258, "logps/rejected": -60.172664642333984, "loss": 0.3048, "rewards/accuracies": 0.875, "rewards/chosen": -0.30390816926956177, "rewards/margins": 1.9633526802062988, "rewards/rejected": -2.267261028289795, "step": 281 }, { "epoch": 1.6711111111111112, "grad_norm": 27.8520366261411, "learning_rate": 4.819820751598076e-07, "logits/chosen": -1.3906257152557373, "logits/rejected": -1.425363540649414, "logps/chosen": -41.69005584716797, "logps/rejected": -53.399139404296875, "loss": 0.3495, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20817890763282776, "rewards/margins": 1.4239811897277832, "rewards/rejected": -1.632159948348999, "step": 282 }, { "epoch": 1.6770370370370369, "grad_norm": 25.999550920916317, "learning_rate": 4.817391383721249e-07, "logits/chosen": -1.3971692323684692, "logits/rejected": -1.488884687423706, "logps/chosen": -53.06078338623047, "logps/rejected": -69.60707092285156, "loss": 0.357, "rewards/accuracies": 0.8125, "rewards/chosen": -0.41441503167152405, "rewards/margins": 1.7222974300384521, "rewards/rejected": -2.1367125511169434, "step": 283 }, { "epoch": 1.682962962962963, "grad_norm": 31.91920988188716, "learning_rate": 4.814946368296616e-07, "logits/chosen": -1.292490839958191, "logits/rejected": -1.320987582206726, "logps/chosen": -38.88534927368164, "logps/rejected": -49.37236404418945, "loss": 0.4254, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0005872957408428192, "rewards/margins": 1.1653939485549927, "rewards/rejected": -1.1659812927246094, "step": 284 }, { "epoch": 1.6888888888888889, "grad_norm": 28.470589880633216, "learning_rate": 4.812485721833464e-07, "logits/chosen": -1.1484642028808594, "logits/rejected": -1.241657018661499, "logps/chosen": -57.71142578125, "logps/rejected": -87.64554595947266, "loss": 0.3595, "rewards/accuracies": 0.8125, "rewards/chosen": 0.051835060119628906, "rewards/margins": 2.0897717475891113, "rewards/rejected": -2.0379366874694824, "step": 285 }, { "epoch": 1.6948148148148148, "grad_norm": 26.804492840080194, "learning_rate": 4.810009460946635e-07, "logits/chosen": -1.3247426748275757, "logits/rejected": -1.3112623691558838, "logps/chosen": -43.300559997558594, "logps/rejected": -51.48878860473633, "loss": 0.382, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24530044198036194, "rewards/margins": 1.2045056819915771, "rewards/rejected": -1.4498060941696167, "step": 286 }, { "epoch": 1.7007407407407409, "grad_norm": 26.52740471473782, "learning_rate": 4.8075176023564e-07, "logits/chosen": -1.3240617513656616, "logits/rejected": -1.4052027463912964, "logps/chosen": -45.60319519042969, "logps/rejected": -61.580039978027344, "loss": 0.395, "rewards/accuracies": 0.875, "rewards/chosen": -0.03795705735683441, "rewards/margins": 1.6152465343475342, "rewards/rejected": -1.6532034873962402, "step": 287 }, { "epoch": 1.7066666666666666, "grad_norm": 24.11382703660646, "learning_rate": 4.805010162888346e-07, "logits/chosen": -1.2520138025283813, "logits/rejected": -1.4344892501831055, "logps/chosen": -47.9183464050293, "logps/rejected": -53.69255828857422, "loss": 0.3442, "rewards/accuracies": 0.875, "rewards/chosen": 0.4233085811138153, "rewards/margins": 1.2932010889053345, "rewards/rejected": -0.8698925375938416, "step": 288 }, { "epoch": 1.7125925925925927, "grad_norm": 25.438127060122117, "learning_rate": 4.802487159473271e-07, "logits/chosen": -1.211564064025879, "logits/rejected": -1.2627569437026978, "logps/chosen": -52.95191955566406, "logps/rejected": -65.62922668457031, "loss": 0.3494, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3603849411010742, "rewards/margins": 2.6062426567077637, "rewards/rejected": -2.966627597808838, "step": 289 }, { "epoch": 1.7185185185185186, "grad_norm": 29.349396127433405, "learning_rate": 4.799948609147061e-07, "logits/chosen": -1.3695993423461914, "logits/rejected": -1.3942320346832275, "logps/chosen": -47.67075729370117, "logps/rejected": -61.6407585144043, "loss": 0.3805, "rewards/accuracies": 0.75, "rewards/chosen": -0.23102344572544098, "rewards/margins": 1.8437186479568481, "rewards/rejected": -2.074742078781128, "step": 290 }, { "epoch": 1.7244444444444444, "grad_norm": 22.5576058640629, "learning_rate": 4.797394529050577e-07, "logits/chosen": -1.386033296585083, "logits/rejected": -1.4366207122802734, "logps/chosen": -54.97161865234375, "logps/rejected": -60.0572624206543, "loss": 0.3047, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10227298736572266, "rewards/margins": 2.372859001159668, "rewards/rejected": -2.4751322269439697, "step": 291 }, { "epoch": 1.7303703703703703, "grad_norm": 23.961332637171278, "learning_rate": 4.794824936429543e-07, "logits/chosen": -1.2843897342681885, "logits/rejected": -1.2670435905456543, "logps/chosen": -40.86612319946289, "logps/rejected": -46.26717758178711, "loss": 0.298, "rewards/accuracies": 0.75, "rewards/chosen": -0.021604575216770172, "rewards/margins": 1.5374772548675537, "rewards/rejected": -1.55908203125, "step": 292 }, { "epoch": 1.7362962962962962, "grad_norm": 20.82357798003762, "learning_rate": 4.792239848634426e-07, "logits/chosen": -1.2751004695892334, "logits/rejected": -1.2628631591796875, "logps/chosen": -51.31232833862305, "logps/rejected": -57.4508056640625, "loss": 0.3308, "rewards/accuracies": 0.8125, "rewards/chosen": -0.006660893559455872, "rewards/margins": 2.1980137825012207, "rewards/rejected": -2.20467472076416, "step": 293 }, { "epoch": 1.7422222222222223, "grad_norm": 21.618706080176008, "learning_rate": 4.789639283120322e-07, "logits/chosen": -1.345123052597046, "logits/rejected": -1.4260653257369995, "logps/chosen": -39.87286376953125, "logps/rejected": -60.069549560546875, "loss": 0.343, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13270238041877747, "rewards/margins": 2.6959171295166016, "rewards/rejected": -2.5632147789001465, "step": 294 }, { "epoch": 1.748148148148148, "grad_norm": 22.76141790976671, "learning_rate": 4.787023257446832e-07, "logits/chosen": -1.3435921669006348, "logits/rejected": -1.4133026599884033, "logps/chosen": -53.27568435668945, "logps/rejected": -64.22837829589844, "loss": 0.3139, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0206415057182312, "rewards/margins": 2.1336681842803955, "rewards/rejected": -2.1130268573760986, "step": 295 }, { "epoch": 1.7540740740740741, "grad_norm": 26.56123778763129, "learning_rate": 4.784391789277952e-07, "logits/chosen": -1.3392447233200073, "logits/rejected": -1.3046197891235352, "logps/chosen": -40.423377990722656, "logps/rejected": -48.11532211303711, "loss": 0.3649, "rewards/accuracies": 0.75, "rewards/chosen": -0.04688149318099022, "rewards/margins": 1.1583579778671265, "rewards/rejected": -1.2052394151687622, "step": 296 }, { "epoch": 1.76, "grad_norm": 25.807961780535802, "learning_rate": 4.781744896381944e-07, "logits/chosen": -1.411516547203064, "logits/rejected": -1.3581936359405518, "logps/chosen": -59.09212875366211, "logps/rejected": -65.5539779663086, "loss": 0.3824, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4150131344795227, "rewards/margins": 1.1555423736572266, "rewards/rejected": -1.5705554485321045, "step": 297 }, { "epoch": 1.765925925925926, "grad_norm": 19.33578977859486, "learning_rate": 4.779082596631226e-07, "logits/chosen": -1.4044454097747803, "logits/rejected": -1.4459329843521118, "logps/chosen": -44.998008728027344, "logps/rejected": -69.97136688232422, "loss": 0.222, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09062831103801727, "rewards/margins": 2.470029354095459, "rewards/rejected": -2.560657501220703, "step": 298 }, { "epoch": 1.771851851851852, "grad_norm": 26.421795777830678, "learning_rate": 4.776404908002245e-07, "logits/chosen": -1.4593310356140137, "logits/rejected": -1.4886685609817505, "logps/chosen": -41.98119354248047, "logps/rejected": -54.190040588378906, "loss": 0.3293, "rewards/accuracies": 0.8125, "rewards/chosen": 0.015456534922122955, "rewards/margins": 0.9114388823509216, "rewards/rejected": -0.8959822654724121, "step": 299 }, { "epoch": 1.7777777777777777, "grad_norm": 25.388116798218327, "learning_rate": 4.773711848575356e-07, "logits/chosen": -0.882595419883728, "logits/rejected": -0.8482030630111694, "logps/chosen": -60.15789031982422, "logps/rejected": -53.647987365722656, "loss": 0.3177, "rewards/accuracies": 0.875, "rewards/chosen": -0.21632501482963562, "rewards/margins": 1.840660572052002, "rewards/rejected": -2.05698561668396, "step": 300 }, { "epoch": 1.7837037037037038, "grad_norm": 27.44128369149265, "learning_rate": 4.771003436534702e-07, "logits/chosen": -1.155822515487671, "logits/rejected": -1.1653910875320435, "logps/chosen": -44.46099090576172, "logps/rejected": -67.78788757324219, "loss": 0.3193, "rewards/accuracies": 1.0, "rewards/chosen": 0.11511935293674469, "rewards/margins": 2.467637062072754, "rewards/rejected": -2.35251784324646, "step": 301 }, { "epoch": 1.7896296296296297, "grad_norm": 31.0131157657199, "learning_rate": 4.7682796901680906e-07, "logits/chosen": -1.3534022569656372, "logits/rejected": -1.3563964366912842, "logps/chosen": -52.00239944458008, "logps/rejected": -64.85633850097656, "loss": 0.3686, "rewards/accuracies": 0.875, "rewards/chosen": -0.26378777623176575, "rewards/margins": 2.1019082069396973, "rewards/rejected": -2.3656959533691406, "step": 302 }, { "epoch": 1.7955555555555556, "grad_norm": 25.572323066424033, "learning_rate": 4.765540627866869e-07, "logits/chosen": -1.3851449489593506, "logits/rejected": -1.3254812955856323, "logps/chosen": -64.02766418457031, "logps/rejected": -55.131404876708984, "loss": 0.307, "rewards/accuracies": 0.875, "rewards/chosen": -0.37291884422302246, "rewards/margins": 1.6172915697097778, "rewards/rejected": -1.9902105331420898, "step": 303 }, { "epoch": 1.8014814814814815, "grad_norm": 24.973006963104183, "learning_rate": 4.7627862681258027e-07, "logits/chosen": -1.3431549072265625, "logits/rejected": -1.3935474157333374, "logps/chosen": -43.056156158447266, "logps/rejected": -45.55668640136719, "loss": 0.3787, "rewards/accuracies": 0.9375, "rewards/chosen": -0.027092069387435913, "rewards/margins": 1.2533209323883057, "rewards/rejected": -1.2804131507873535, "step": 304 }, { "epoch": 1.8074074074074074, "grad_norm": 23.246647674435096, "learning_rate": 4.7600166295429476e-07, "logits/chosen": -1.082540512084961, "logits/rejected": -1.1362693309783936, "logps/chosen": -40.44417190551758, "logps/rejected": -52.83973693847656, "loss": 0.3181, "rewards/accuracies": 0.875, "rewards/chosen": -0.2112465500831604, "rewards/margins": 1.6178412437438965, "rewards/rejected": -1.829087734222412, "step": 305 }, { "epoch": 1.8133333333333335, "grad_norm": 27.00429197324121, "learning_rate": 4.7572317308195276e-07, "logits/chosen": -1.2468149662017822, "logits/rejected": -1.3040968179702759, "logps/chosen": -46.73735809326172, "logps/rejected": -63.27323913574219, "loss": 0.3206, "rewards/accuracies": 0.75, "rewards/chosen": -0.37287694215774536, "rewards/margins": 1.7324719429016113, "rewards/rejected": -2.105348825454712, "step": 306 }, { "epoch": 1.8192592592592591, "grad_norm": 23.44156041895113, "learning_rate": 4.7544315907598034e-07, "logits/chosen": -1.345916748046875, "logits/rejected": -1.3447480201721191, "logps/chosen": -40.71674728393555, "logps/rejected": -50.70779800415039, "loss": 0.3617, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0339193195104599, "rewards/margins": 1.5760494470596313, "rewards/rejected": -1.5421302318572998, "step": 307 }, { "epoch": 1.8251851851851852, "grad_norm": 25.06604927659884, "learning_rate": 4.7516162282709515e-07, "logits/chosen": -1.147845983505249, "logits/rejected": -1.1910618543624878, "logps/chosen": -45.736656188964844, "logps/rejected": -55.5797004699707, "loss": 0.3699, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0804969072341919, "rewards/margins": 1.4355418682098389, "rewards/rejected": -1.5160387754440308, "step": 308 }, { "epoch": 1.8311111111111111, "grad_norm": 25.46137399084043, "learning_rate": 4.748785662362932e-07, "logits/chosen": -1.3299181461334229, "logits/rejected": -1.3480998277664185, "logps/chosen": -55.302337646484375, "logps/rejected": -79.200439453125, "loss": 0.3278, "rewards/accuracies": 0.75, "rewards/chosen": 0.21703583002090454, "rewards/margins": 1.7550965547561646, "rewards/rejected": -1.5380607843399048, "step": 309 }, { "epoch": 1.837037037037037, "grad_norm": 26.51707060569584, "learning_rate": 4.7459399121483634e-07, "logits/chosen": -1.1016050577163696, "logits/rejected": -1.2106202840805054, "logps/chosen": -53.677574157714844, "logps/rejected": -60.549896240234375, "loss": 0.3819, "rewards/accuracies": 0.9375, "rewards/chosen": -0.054566219449043274, "rewards/margins": 2.2532806396484375, "rewards/rejected": -2.3078465461730957, "step": 310 }, { "epoch": 1.842962962962963, "grad_norm": 26.561068654774267, "learning_rate": 4.74307899684239e-07, "logits/chosen": -1.2419092655181885, "logits/rejected": -1.2967182397842407, "logps/chosen": -53.5194091796875, "logps/rejected": -60.51311111450195, "loss": 0.3916, "rewards/accuracies": 0.9375, "rewards/chosen": -0.191430002450943, "rewards/margins": 1.867770791053772, "rewards/rejected": -2.0592007637023926, "step": 311 }, { "epoch": 1.8488888888888888, "grad_norm": 24.80025359759185, "learning_rate": 4.7402029357625563e-07, "logits/chosen": -1.4073894023895264, "logits/rejected": -1.2945380210876465, "logps/chosen": -51.761009216308594, "logps/rejected": -56.40605926513672, "loss": 0.3481, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22043563425540924, "rewards/margins": 1.9667236804962158, "rewards/rejected": -2.187159538269043, "step": 312 }, { "epoch": 1.854814814814815, "grad_norm": 26.590869331925262, "learning_rate": 4.737311748328673e-07, "logits/chosen": -1.3399579524993896, "logits/rejected": -1.4058165550231934, "logps/chosen": -48.944793701171875, "logps/rejected": -67.84580993652344, "loss": 0.2951, "rewards/accuracies": 0.875, "rewards/chosen": -0.09198684245347977, "rewards/margins": 2.516043186187744, "rewards/rejected": -2.608030080795288, "step": 313 }, { "epoch": 1.8607407407407406, "grad_norm": 21.57321505154053, "learning_rate": 4.7344054540626887e-07, "logits/chosen": -1.2113909721374512, "logits/rejected": -1.29359769821167, "logps/chosen": -40.80859375, "logps/rejected": -51.39629364013672, "loss": 0.2984, "rewards/accuracies": 0.875, "rewards/chosen": 0.013930395245552063, "rewards/margins": 2.274604082107544, "rewards/rejected": -2.2606735229492188, "step": 314 }, { "epoch": 1.8666666666666667, "grad_norm": 24.633388476867335, "learning_rate": 4.731484072588555e-07, "logits/chosen": -1.2730799913406372, "logits/rejected": -1.3082690238952637, "logps/chosen": -49.419525146484375, "logps/rejected": -56.43271255493164, "loss": 0.3089, "rewards/accuracies": 0.75, "rewards/chosen": -0.40381959080696106, "rewards/margins": 2.45341420173645, "rewards/rejected": -2.857233762741089, "step": 315 }, { "epoch": 1.8725925925925926, "grad_norm": 24.457073160874554, "learning_rate": 4.7285476236320976e-07, "logits/chosen": -1.2295560836791992, "logits/rejected": -1.246058464050293, "logps/chosen": -47.56389236450195, "logps/rejected": -63.71388626098633, "loss": 0.3404, "rewards/accuracies": 0.8125, "rewards/chosen": -0.009183228015899658, "rewards/margins": 2.0711443424224854, "rewards/rejected": -2.0803275108337402, "step": 316 }, { "epoch": 1.8785185185185185, "grad_norm": 24.216496534182244, "learning_rate": 4.725596127020879e-07, "logits/chosen": -1.4801841974258423, "logits/rejected": -1.4818217754364014, "logps/chosen": -49.63344955444336, "logps/rejected": -67.68574523925781, "loss": 0.327, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30766502022743225, "rewards/margins": 1.4231786727905273, "rewards/rejected": -1.1155136823654175, "step": 317 }, { "epoch": 1.8844444444444446, "grad_norm": 29.289508668995026, "learning_rate": 4.7226296026840686e-07, "logits/chosen": -1.3967444896697998, "logits/rejected": -1.442480444908142, "logps/chosen": -44.551124572753906, "logps/rejected": -52.382606506347656, "loss": 0.4098, "rewards/accuracies": 0.8125, "rewards/chosen": -0.198195219039917, "rewards/margins": 0.9453842639923096, "rewards/rejected": -1.1435794830322266, "step": 318 }, { "epoch": 1.8903703703703703, "grad_norm": 29.842607290777202, "learning_rate": 4.7196480706523066e-07, "logits/chosen": -1.3752483129501343, "logits/rejected": -1.4549767971038818, "logps/chosen": -45.82021713256836, "logps/rejected": -60.437225341796875, "loss": 0.4177, "rewards/accuracies": 0.875, "rewards/chosen": -0.03907575458288193, "rewards/margins": 1.6668366193771362, "rewards/rejected": -1.7059123516082764, "step": 319 }, { "epoch": 1.8962962962962964, "grad_norm": 27.506534719126677, "learning_rate": 4.716651551057567e-07, "logits/chosen": -1.272362470626831, "logits/rejected": -1.2426154613494873, "logps/chosen": -47.80769348144531, "logps/rejected": -59.07539367675781, "loss": 0.3766, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17932742834091187, "rewards/margins": 1.7639901638031006, "rewards/rejected": -1.9433174133300781, "step": 320 }, { "epoch": 1.9022222222222223, "grad_norm": 23.467742474280602, "learning_rate": 4.7136400641330245e-07, "logits/chosen": -1.2614030838012695, "logits/rejected": -1.359266996383667, "logps/chosen": -39.78665542602539, "logps/rejected": -58.80195617675781, "loss": 0.3148, "rewards/accuracies": 0.875, "rewards/chosen": 0.033597737550735474, "rewards/margins": 2.5896811485290527, "rewards/rejected": -2.5560834407806396, "step": 321 }, { "epoch": 1.9081481481481481, "grad_norm": 26.8700405040657, "learning_rate": 4.710613630212916e-07, "logits/chosen": -1.360331416130066, "logits/rejected": -1.2999292612075806, "logps/chosen": -54.633628845214844, "logps/rejected": -65.06961059570312, "loss": 0.3516, "rewards/accuracies": 0.875, "rewards/chosen": 0.14919951558113098, "rewards/margins": 3.318268299102783, "rewards/rejected": -3.1690688133239746, "step": 322 }, { "epoch": 1.914074074074074, "grad_norm": 25.742617657524853, "learning_rate": 4.707572269732404e-07, "logits/chosen": -1.267808437347412, "logits/rejected": -1.2729883193969727, "logps/chosen": -43.86378860473633, "logps/rejected": -60.72163009643555, "loss": 0.3297, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22709807753562927, "rewards/margins": 1.6642297506332397, "rewards/rejected": -1.4371315240859985, "step": 323 }, { "epoch": 1.92, "grad_norm": 16.874300534646334, "learning_rate": 4.7045160032274376e-07, "logits/chosen": -1.300421953201294, "logits/rejected": -1.3349748849868774, "logps/chosen": -55.42463684082031, "logps/rejected": -74.85099792480469, "loss": 0.2324, "rewards/accuracies": 1.0, "rewards/chosen": -0.36268478631973267, "rewards/margins": 2.777242660522461, "rewards/rejected": -3.139927387237549, "step": 324 }, { "epoch": 1.925925925925926, "grad_norm": 21.141707211169013, "learning_rate": 4.701444851334617e-07, "logits/chosen": -1.3354458808898926, "logits/rejected": -1.3530535697937012, "logps/chosen": -45.7332763671875, "logps/rejected": -46.52414321899414, "loss": 0.2912, "rewards/accuracies": 0.9375, "rewards/chosen": 0.22032663226127625, "rewards/margins": 2.1787269115448, "rewards/rejected": -1.9584002494812012, "step": 325 }, { "epoch": 1.9318518518518517, "grad_norm": 23.948147218514162, "learning_rate": 4.698358834791051e-07, "logits/chosen": -1.2572468519210815, "logits/rejected": -1.3264085054397583, "logps/chosen": -46.600433349609375, "logps/rejected": -62.773555755615234, "loss": 0.2962, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13065075874328613, "rewards/margins": 2.4154248237609863, "rewards/rejected": -2.5460758209228516, "step": 326 }, { "epoch": 1.9377777777777778, "grad_norm": 24.99842201946006, "learning_rate": 4.695257974434215e-07, "logits/chosen": -1.3555095195770264, "logits/rejected": -1.362163782119751, "logps/chosen": -53.40457534790039, "logps/rejected": -53.34534454345703, "loss": 0.3544, "rewards/accuracies": 0.875, "rewards/chosen": -0.1949038803577423, "rewards/margins": 1.9531302452087402, "rewards/rejected": -2.14803409576416, "step": 327 }, { "epoch": 1.9437037037037037, "grad_norm": 20.447626099998985, "learning_rate": 4.6921422912018174e-07, "logits/chosen": -1.222961187362671, "logits/rejected": -1.3553869724273682, "logps/chosen": -38.92687225341797, "logps/rejected": -69.66336059570312, "loss": 0.2428, "rewards/accuracies": 1.0, "rewards/chosen": 0.08813555538654327, "rewards/margins": 2.587808847427368, "rewards/rejected": -2.499673366546631, "step": 328 }, { "epoch": 1.9496296296296296, "grad_norm": 24.203644596916128, "learning_rate": 4.689011806131651e-07, "logits/chosen": -1.2260617017745972, "logits/rejected": -1.2644914388656616, "logps/chosen": -48.461448669433594, "logps/rejected": -53.205650329589844, "loss": 0.3463, "rewards/accuracies": 0.625, "rewards/chosen": -0.5683692693710327, "rewards/margins": 1.0428115129470825, "rewards/rejected": -1.6111807823181152, "step": 329 }, { "epoch": 1.9555555555555557, "grad_norm": 24.924372266570685, "learning_rate": 4.685866540361455e-07, "logits/chosen": -1.095257043838501, "logits/rejected": -1.2506214380264282, "logps/chosen": -43.94602584838867, "logps/rejected": -65.505615234375, "loss": 0.3495, "rewards/accuracies": 0.875, "rewards/chosen": -0.2011902928352356, "rewards/margins": 2.575469493865967, "rewards/rejected": -2.7766599655151367, "step": 330 }, { "epoch": 1.9614814814814814, "grad_norm": 21.28680096956084, "learning_rate": 4.6827065151287726e-07, "logits/chosen": -1.120419979095459, "logits/rejected": -1.2555115222930908, "logps/chosen": -51.23665237426758, "logps/rejected": -65.72523498535156, "loss": 0.2945, "rewards/accuracies": 0.875, "rewards/chosen": -0.3505423069000244, "rewards/margins": 1.8112775087356567, "rewards/rejected": -2.1618199348449707, "step": 331 }, { "epoch": 1.9674074074074075, "grad_norm": 26.217325638823457, "learning_rate": 4.6795317517708037e-07, "logits/chosen": -1.298682451248169, "logits/rejected": -1.3794772624969482, "logps/chosen": -49.35894775390625, "logps/rejected": -53.33086395263672, "loss": 0.3594, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04362444207072258, "rewards/margins": 1.6961524486541748, "rewards/rejected": -1.652527928352356, "step": 332 }, { "epoch": 1.9733333333333334, "grad_norm": 25.481027607863936, "learning_rate": 4.676342271724265e-07, "logits/chosen": -1.204390048980713, "logits/rejected": -1.3311251401901245, "logps/chosen": -39.88072967529297, "logps/rejected": -50.89021682739258, "loss": 0.3892, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02257857657968998, "rewards/margins": 2.2824935913085938, "rewards/rejected": -2.2599148750305176, "step": 333 }, { "epoch": 1.9792592592592593, "grad_norm": 26.32787839414432, "learning_rate": 4.673138096525243e-07, "logits/chosen": -1.3207244873046875, "logits/rejected": -1.3684799671173096, "logps/chosen": -49.37727355957031, "logps/rejected": -68.2453384399414, "loss": 0.3426, "rewards/accuracies": 0.875, "rewards/chosen": -0.12120073288679123, "rewards/margins": 1.4206024408340454, "rewards/rejected": -1.5418031215667725, "step": 334 }, { "epoch": 1.9851851851851852, "grad_norm": 22.17620720442061, "learning_rate": 4.6699192478090495e-07, "logits/chosen": -1.388519287109375, "logits/rejected": -1.4136606454849243, "logps/chosen": -42.03928756713867, "logps/rejected": -63.68479919433594, "loss": 0.2835, "rewards/accuracies": 1.0, "rewards/chosen": -0.16710436344146729, "rewards/margins": 2.781280040740967, "rewards/rejected": -2.9483845233917236, "step": 335 }, { "epoch": 1.991111111111111, "grad_norm": 23.44832441743166, "learning_rate": 4.666685747310074e-07, "logits/chosen": -1.2400968074798584, "logits/rejected": -1.2306945323944092, "logps/chosen": -50.3095703125, "logps/rejected": -66.10108184814453, "loss": 0.269, "rewards/accuracies": 0.875, "rewards/chosen": -0.1821298599243164, "rewards/margins": 2.479905605316162, "rewards/rejected": -2.6620354652404785, "step": 336 }, { "epoch": 1.9970370370370372, "grad_norm": 26.36611150038332, "learning_rate": 4.663437616861641e-07, "logits/chosen": -1.223244547843933, "logits/rejected": -1.3455448150634766, "logps/chosen": -49.680442810058594, "logps/rejected": -54.95825958251953, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 0.02616964653134346, "rewards/margins": 2.6336476802825928, "rewards/rejected": -2.6074776649475098, "step": 337 }, { "epoch": 2.002962962962963, "grad_norm": 21.883276832839393, "learning_rate": 4.660174878395855e-07, "logits/chosen": -1.2061843872070312, "logits/rejected": -1.2987509965896606, "logps/chosen": -46.96556854248047, "logps/rejected": -58.78457260131836, "loss": 0.3064, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11700738966464996, "rewards/margins": 2.5580999851226807, "rewards/rejected": -2.675107479095459, "step": 338 }, { "epoch": 2.008888888888889, "grad_norm": 19.40225522629075, "learning_rate": 4.6568975539434624e-07, "logits/chosen": -1.2043370008468628, "logits/rejected": -1.2271485328674316, "logps/chosen": -40.650550842285156, "logps/rejected": -53.05525207519531, "loss": 0.274, "rewards/accuracies": 1.0, "rewards/chosen": 0.08650882542133331, "rewards/margins": 1.585855484008789, "rewards/rejected": -1.4993466138839722, "step": 339 }, { "epoch": 2.0148148148148146, "grad_norm": 18.552582482952893, "learning_rate": 4.653605665633694e-07, "logits/chosen": -1.258097767829895, "logits/rejected": -1.3163410425186157, "logps/chosen": -52.92136764526367, "logps/rejected": -72.87959289550781, "loss": 0.2445, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22618934512138367, "rewards/margins": 1.7594165802001953, "rewards/rejected": -1.9856061935424805, "step": 340 }, { "epoch": 2.0207407407407407, "grad_norm": 17.05097874274958, "learning_rate": 4.6502992356941193e-07, "logits/chosen": -1.1468836069107056, "logits/rejected": -1.255903959274292, "logps/chosen": -50.84268569946289, "logps/rejected": -70.92694091796875, "loss": 0.2173, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13767805695533752, "rewards/margins": 3.111330270767212, "rewards/rejected": -2.9736523628234863, "step": 341 }, { "epoch": 2.026666666666667, "grad_norm": 18.830186351991596, "learning_rate": 4.6469782864504993e-07, "logits/chosen": -1.1887123584747314, "logits/rejected": -1.2675807476043701, "logps/chosen": -52.629188537597656, "logps/rejected": -63.47866439819336, "loss": 0.2332, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07282394915819168, "rewards/margins": 2.2217636108398438, "rewards/rejected": -2.148939609527588, "step": 342 }, { "epoch": 2.0325925925925925, "grad_norm": 18.786840301991177, "learning_rate": 4.643642840326627e-07, "logits/chosen": -1.1792242527008057, "logits/rejected": -1.296401023864746, "logps/chosen": -37.09912872314453, "logps/rejected": -64.6133804321289, "loss": 0.2325, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18781086802482605, "rewards/margins": 2.0591042041778564, "rewards/rejected": -2.246915102005005, "step": 343 }, { "epoch": 2.0385185185185186, "grad_norm": 22.083870697287118, "learning_rate": 4.6402929198441876e-07, "logits/chosen": -1.3264474868774414, "logits/rejected": -1.3211390972137451, "logps/chosen": -46.68233108520508, "logps/rejected": -55.392520904541016, "loss": 0.2993, "rewards/accuracies": 0.875, "rewards/chosen": -0.011279929429292679, "rewards/margins": 2.057001829147339, "rewards/rejected": -2.068281650543213, "step": 344 }, { "epoch": 2.0444444444444443, "grad_norm": 21.347264429630435, "learning_rate": 4.6369285476225953e-07, "logits/chosen": -1.1508305072784424, "logits/rejected": -1.319612979888916, "logps/chosen": -31.885753631591797, "logps/rejected": -49.32568359375, "loss": 0.2799, "rewards/accuracies": 1.0, "rewards/chosen": 0.35789793729782104, "rewards/margins": 2.335634231567383, "rewards/rejected": -1.977736234664917, "step": 345 }, { "epoch": 2.0503703703703704, "grad_norm": 24.570352764030314, "learning_rate": 4.6335497463788497e-07, "logits/chosen": -1.243829607963562, "logits/rejected": -1.3181369304656982, "logps/chosen": -60.18506622314453, "logps/rejected": -69.44244384765625, "loss": 0.3113, "rewards/accuracies": 0.875, "rewards/chosen": -0.3555411696434021, "rewards/margins": 2.665473461151123, "rewards/rejected": -3.02101469039917, "step": 346 }, { "epoch": 2.0562962962962965, "grad_norm": 17.6853062093381, "learning_rate": 4.6301565389273755e-07, "logits/chosen": -1.5396754741668701, "logits/rejected": -1.5316152572631836, "logps/chosen": -47.59368896484375, "logps/rejected": -55.50246810913086, "loss": 0.2219, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09630143642425537, "rewards/margins": 2.2724692821502686, "rewards/rejected": -2.3687705993652344, "step": 347 }, { "epoch": 2.062222222222222, "grad_norm": 21.180954712617115, "learning_rate": 4.6267489481798736e-07, "logits/chosen": -1.3041589260101318, "logits/rejected": -1.3119125366210938, "logps/chosen": -51.84857177734375, "logps/rejected": -72.58021545410156, "loss": 0.3009, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2827097177505493, "rewards/margins": 3.221679449081421, "rewards/rejected": -2.938969612121582, "step": 348 }, { "epoch": 2.0681481481481483, "grad_norm": 22.28131069737945, "learning_rate": 4.6233269971451627e-07, "logits/chosen": -1.3343658447265625, "logits/rejected": -1.345481276512146, "logps/chosen": -54.08373260498047, "logps/rejected": -62.955909729003906, "loss": 0.2586, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5682754516601562, "rewards/margins": 1.7530314922332764, "rewards/rejected": -2.3213071823120117, "step": 349 }, { "epoch": 2.074074074074074, "grad_norm": 20.48670863420585, "learning_rate": 4.619890708929025e-07, "logits/chosen": -1.255729079246521, "logits/rejected": -1.3206520080566406, "logps/chosen": -46.70282745361328, "logps/rejected": -62.74363708496094, "loss": 0.2786, "rewards/accuracies": 0.875, "rewards/chosen": 0.245827779173851, "rewards/margins": 1.665952444076538, "rewards/rejected": -1.4201246500015259, "step": 350 }, { "epoch": 2.08, "grad_norm": 17.96440002462223, "learning_rate": 4.6164401067340526e-07, "logits/chosen": -1.2545899152755737, "logits/rejected": -1.322774887084961, "logps/chosen": -42.6780891418457, "logps/rejected": -56.45946502685547, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 0.19612331688404083, "rewards/margins": 2.441504955291748, "rewards/rejected": -2.2453818321228027, "step": 351 }, { "epoch": 2.0859259259259257, "grad_norm": 15.24610036802107, "learning_rate": 4.612975213859487e-07, "logits/chosen": -1.2593666315078735, "logits/rejected": -1.2873666286468506, "logps/chosen": -49.45045471191406, "logps/rejected": -74.48650360107422, "loss": 0.2063, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11988449841737747, "rewards/margins": 2.6868491172790527, "rewards/rejected": -2.8067336082458496, "step": 352 }, { "epoch": 2.091851851851852, "grad_norm": 19.39107155679951, "learning_rate": 4.609496053701064e-07, "logits/chosen": -1.0879367589950562, "logits/rejected": -1.1195926666259766, "logps/chosen": -42.81309509277344, "logps/rejected": -66.11750030517578, "loss": 0.2408, "rewards/accuracies": 0.875, "rewards/chosen": -0.026289865374565125, "rewards/margins": 3.345236301422119, "rewards/rejected": -3.3715262413024902, "step": 353 }, { "epoch": 2.097777777777778, "grad_norm": 18.590102731321778, "learning_rate": 4.606002649750855e-07, "logits/chosen": -1.2917572259902954, "logits/rejected": -1.3807308673858643, "logps/chosen": -50.10943603515625, "logps/rejected": -64.49555969238281, "loss": 0.2492, "rewards/accuracies": 0.875, "rewards/chosen": -0.17642168700695038, "rewards/margins": 2.523153781890869, "rewards/rejected": -2.699575424194336, "step": 354 }, { "epoch": 2.1037037037037036, "grad_norm": 19.687014144002955, "learning_rate": 4.6024950255971106e-07, "logits/chosen": -1.2346889972686768, "logits/rejected": -1.3283984661102295, "logps/chosen": -44.39927673339844, "logps/rejected": -67.63725280761719, "loss": 0.2436, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20876558125019073, "rewards/margins": 2.4332196712493896, "rewards/rejected": -2.224454164505005, "step": 355 }, { "epoch": 2.1096296296296297, "grad_norm": 20.92862453942209, "learning_rate": 4.598973204924097e-07, "logits/chosen": -1.226645588874817, "logits/rejected": -1.2606167793273926, "logps/chosen": -41.84648132324219, "logps/rejected": -62.02039337158203, "loss": 0.2865, "rewards/accuracies": 0.9375, "rewards/chosen": 0.004774313420057297, "rewards/margins": 2.0877463817596436, "rewards/rejected": -2.0829720497131348, "step": 356 }, { "epoch": 2.1155555555555554, "grad_norm": 19.003566570766115, "learning_rate": 4.5954372115119395e-07, "logits/chosen": -1.2138569355010986, "logits/rejected": -1.2532055377960205, "logps/chosen": -45.88983154296875, "logps/rejected": -62.843265533447266, "loss": 0.2296, "rewards/accuracies": 1.0, "rewards/chosen": 0.34711575508117676, "rewards/margins": 2.869643449783325, "rewards/rejected": -2.5225276947021484, "step": 357 }, { "epoch": 2.1214814814814815, "grad_norm": 21.94315127503371, "learning_rate": 4.5918870692364606e-07, "logits/chosen": -1.2227267026901245, "logits/rejected": -1.2402318716049194, "logps/chosen": -50.86392593383789, "logps/rejected": -69.76829528808594, "loss": 0.2698, "rewards/accuracies": 1.0, "rewards/chosen": -0.050845518708229065, "rewards/margins": 2.4909873008728027, "rewards/rejected": -2.54183292388916, "step": 358 }, { "epoch": 2.127407407407407, "grad_norm": 19.662389858943495, "learning_rate": 4.5883228020690204e-07, "logits/chosen": -1.335383653640747, "logits/rejected": -1.4468356370925903, "logps/chosen": -55.14125061035156, "logps/rejected": -81.48980712890625, "loss": 0.2557, "rewards/accuracies": 1.0, "rewards/chosen": -0.03477056324481964, "rewards/margins": 2.8970305919647217, "rewards/rejected": -2.9318013191223145, "step": 359 }, { "epoch": 2.1333333333333333, "grad_norm": 16.289737421447924, "learning_rate": 4.5847444340763516e-07, "logits/chosen": -1.258293867111206, "logits/rejected": -1.3521993160247803, "logps/chosen": -42.92197799682617, "logps/rejected": -79.57373809814453, "loss": 0.2154, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12718605995178223, "rewards/margins": 3.129183769226074, "rewards/rejected": -3.2563695907592773, "step": 360 }, { "epoch": 2.1392592592592594, "grad_norm": 19.421173501056757, "learning_rate": 4.5811519894204e-07, "logits/chosen": -1.2771823406219482, "logits/rejected": -1.2784755229949951, "logps/chosen": -45.26205062866211, "logps/rejected": -49.14915084838867, "loss": 0.2966, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3483662009239197, "rewards/margins": 1.337389349937439, "rewards/rejected": -1.6857554912567139, "step": 361 }, { "epoch": 2.145185185185185, "grad_norm": 17.47408354292453, "learning_rate": 4.577545492358159e-07, "logits/chosen": -1.4037187099456787, "logits/rejected": -1.4769129753112793, "logps/chosen": -40.952667236328125, "logps/rejected": -42.973777770996094, "loss": 0.2218, "rewards/accuracies": 1.0, "rewards/chosen": 0.18929442763328552, "rewards/margins": 1.6638569831848145, "rewards/rejected": -1.474562406539917, "step": 362 }, { "epoch": 2.151111111111111, "grad_norm": 19.77727999230194, "learning_rate": 4.573924967241509e-07, "logits/chosen": -1.2610749006271362, "logits/rejected": -1.358107566833496, "logps/chosen": -52.18950653076172, "logps/rejected": -61.858158111572266, "loss": 0.2557, "rewards/accuracies": 0.875, "rewards/chosen": -0.37960347533226013, "rewards/margins": 1.40680730342865, "rewards/rejected": -1.7864106893539429, "step": 363 }, { "epoch": 2.157037037037037, "grad_norm": 21.887529470415178, "learning_rate": 4.5702904385170495e-07, "logits/chosen": -1.1152464151382446, "logits/rejected": -1.1701469421386719, "logps/chosen": -42.043331146240234, "logps/rejected": -60.342838287353516, "loss": 0.2271, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09139405190944672, "rewards/margins": 2.1598100662231445, "rewards/rejected": -2.068415880203247, "step": 364 }, { "epoch": 2.162962962962963, "grad_norm": 20.56491024004271, "learning_rate": 4.566641930725935e-07, "logits/chosen": -1.3306797742843628, "logits/rejected": -1.3579617738723755, "logps/chosen": -46.26102828979492, "logps/rejected": -64.42369079589844, "loss": 0.2496, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09908976405858994, "rewards/margins": 2.0486528873443604, "rewards/rejected": -2.147742509841919, "step": 365 }, { "epoch": 2.168888888888889, "grad_norm": 21.929097513898213, "learning_rate": 4.5629794685037125e-07, "logits/chosen": -1.3259196281433105, "logits/rejected": -1.438348412513733, "logps/chosen": -48.74983596801758, "logps/rejected": -70.9134292602539, "loss": 0.2965, "rewards/accuracies": 0.875, "rewards/chosen": -0.014818083494901657, "rewards/margins": 2.414520263671875, "rewards/rejected": -2.429338216781616, "step": 366 }, { "epoch": 2.1748148148148148, "grad_norm": 21.17363277706632, "learning_rate": 4.5593030765801493e-07, "logits/chosen": -1.1760063171386719, "logits/rejected": -1.3117597103118896, "logps/chosen": -40.7785530090332, "logps/rejected": -55.60898971557617, "loss": 0.2749, "rewards/accuracies": 1.0, "rewards/chosen": -0.1574682742357254, "rewards/margins": 2.237823247909546, "rewards/rejected": -2.395291566848755, "step": 367 }, { "epoch": 2.180740740740741, "grad_norm": 15.52905572804225, "learning_rate": 4.555612779779071e-07, "logits/chosen": -1.177214503288269, "logits/rejected": -1.1642229557037354, "logps/chosen": -50.61836242675781, "logps/rejected": -67.73853302001953, "loss": 0.1967, "rewards/accuracies": 1.0, "rewards/chosen": -0.10218016803264618, "rewards/margins": 2.1828601360321045, "rewards/rejected": -2.2850403785705566, "step": 368 }, { "epoch": 2.1866666666666665, "grad_norm": 18.91258488483491, "learning_rate": 4.551908603018191e-07, "logits/chosen": -1.194425344467163, "logits/rejected": -1.2497589588165283, "logps/chosen": -44.26604461669922, "logps/rejected": -61.95736312866211, "loss": 0.2498, "rewards/accuracies": 1.0, "rewards/chosen": 0.11065736413002014, "rewards/margins": 2.543154239654541, "rewards/rejected": -2.432497024536133, "step": 369 }, { "epoch": 2.1925925925925926, "grad_norm": 21.766572819900112, "learning_rate": 4.548190571308944e-07, "logits/chosen": -1.2378605604171753, "logits/rejected": -1.2948296070098877, "logps/chosen": -47.71161651611328, "logps/rejected": -73.49365234375, "loss": 0.2675, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05222397670149803, "rewards/margins": 2.5048434734344482, "rewards/rejected": -2.557067394256592, "step": 370 }, { "epoch": 2.1985185185185183, "grad_norm": 18.684296079943266, "learning_rate": 4.5444587097563166e-07, "logits/chosen": -1.4498980045318604, "logits/rejected": -1.547486424446106, "logps/chosen": -49.34889221191406, "logps/rejected": -64.73408508300781, "loss": 0.2505, "rewards/accuracies": 1.0, "rewards/chosen": -0.009527906775474548, "rewards/margins": 2.8368988037109375, "rewards/rejected": -2.846426486968994, "step": 371 }, { "epoch": 2.2044444444444444, "grad_norm": 20.657169189715972, "learning_rate": 4.540713043558677e-07, "logits/chosen": -1.2857564687728882, "logits/rejected": -1.3269503116607666, "logps/chosen": -58.65913772583008, "logps/rejected": -69.71886444091797, "loss": 0.2709, "rewards/accuracies": 0.9375, "rewards/chosen": 0.27366358041763306, "rewards/margins": 2.6564712524414062, "rewards/rejected": -2.382807731628418, "step": 372 }, { "epoch": 2.2103703703703705, "grad_norm": 17.83103596373758, "learning_rate": 4.536953598007607e-07, "logits/chosen": -1.4808869361877441, "logits/rejected": -1.4333157539367676, "logps/chosen": -57.45781707763672, "logps/rejected": -56.999839782714844, "loss": 0.2291, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30290722846984863, "rewards/margins": 2.3255772590637207, "rewards/rejected": -2.022670269012451, "step": 373 }, { "epoch": 2.216296296296296, "grad_norm": 15.546827141849722, "learning_rate": 4.533180398487726e-07, "logits/chosen": -1.2495567798614502, "logits/rejected": -1.2979238033294678, "logps/chosen": -63.49386215209961, "logps/rejected": -64.50764465332031, "loss": 0.2212, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29394635558128357, "rewards/margins": 3.0806996822357178, "rewards/rejected": -2.7867531776428223, "step": 374 }, { "epoch": 2.2222222222222223, "grad_norm": 21.86157113722065, "learning_rate": 4.529393470476528e-07, "logits/chosen": -1.3445886373519897, "logits/rejected": -1.3152225017547607, "logps/chosen": -47.706058502197266, "logps/rejected": -49.469703674316406, "loss": 0.2421, "rewards/accuracies": 1.0, "rewards/chosen": 0.056947916746139526, "rewards/margins": 2.0946450233459473, "rewards/rejected": -2.0376970767974854, "step": 375 }, { "epoch": 2.228148148148148, "grad_norm": 17.43370404766248, "learning_rate": 4.525592839544202e-07, "logits/chosen": -1.1432135105133057, "logits/rejected": -1.265343427658081, "logps/chosen": -40.92923355102539, "logps/rejected": -68.56723022460938, "loss": 0.2291, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3592386841773987, "rewards/margins": 3.1753196716308594, "rewards/rejected": -3.5345585346221924, "step": 376 }, { "epoch": 2.234074074074074, "grad_norm": 18.051266700370928, "learning_rate": 4.521778531353462e-07, "logits/chosen": -1.3263722658157349, "logits/rejected": -1.3930516242980957, "logps/chosen": -47.67115020751953, "logps/rejected": -63.065364837646484, "loss": 0.1941, "rewards/accuracies": 1.0, "rewards/chosen": -0.1850857436656952, "rewards/margins": 3.161947250366211, "rewards/rejected": -3.3470327854156494, "step": 377 }, { "epoch": 2.24, "grad_norm": 16.584129376985537, "learning_rate": 4.517950571659376e-07, "logits/chosen": -1.0496330261230469, "logits/rejected": -1.141974687576294, "logps/chosen": -40.75746154785156, "logps/rejected": -60.00761795043945, "loss": 0.2137, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10689018666744232, "rewards/margins": 2.207362651824951, "rewards/rejected": -2.3142528533935547, "step": 378 }, { "epoch": 2.245925925925926, "grad_norm": 21.83474303669417, "learning_rate": 4.5141089863091876e-07, "logits/chosen": -1.3251947164535522, "logits/rejected": -1.4994127750396729, "logps/chosen": -45.369197845458984, "logps/rejected": -60.48664093017578, "loss": 0.2603, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1240503191947937, "rewards/margins": 2.388545513153076, "rewards/rejected": -2.2644951343536377, "step": 379 }, { "epoch": 2.251851851851852, "grad_norm": 16.78610677871395, "learning_rate": 4.5102538012421463e-07, "logits/chosen": -1.26201593875885, "logits/rejected": -1.312106728553772, "logps/chosen": -38.60343933105469, "logps/rejected": -53.056270599365234, "loss": 0.2297, "rewards/accuracies": 1.0, "rewards/chosen": 0.21554449200630188, "rewards/margins": 2.4207348823547363, "rewards/rejected": -2.2051901817321777, "step": 380 }, { "epoch": 2.2577777777777777, "grad_norm": 23.152601443193166, "learning_rate": 4.506385042489328e-07, "logits/chosen": -1.4015882015228271, "logits/rejected": -1.346733808517456, "logps/chosen": -51.458953857421875, "logps/rejected": -56.37825012207031, "loss": 0.31, "rewards/accuracies": 0.875, "rewards/chosen": 0.21772369742393494, "rewards/margins": 1.8389344215393066, "rewards/rejected": -1.6212105751037598, "step": 381 }, { "epoch": 2.2637037037037038, "grad_norm": 21.27171913918321, "learning_rate": 4.5025027361734613e-07, "logits/chosen": -1.1399121284484863, "logits/rejected": -1.2481327056884766, "logps/chosen": -39.44144058227539, "logps/rejected": -61.8831901550293, "loss": 0.3136, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05294826626777649, "rewards/margins": 2.6817171573638916, "rewards/rejected": -2.6287689208984375, "step": 382 }, { "epoch": 2.2696296296296294, "grad_norm": 21.31274885508849, "learning_rate": 4.498606908508753e-07, "logits/chosen": -1.1971873044967651, "logits/rejected": -1.1886755228042603, "logps/chosen": -42.121524810791016, "logps/rejected": -58.5207405090332, "loss": 0.2814, "rewards/accuracies": 1.0, "rewards/chosen": 0.0361473523080349, "rewards/margins": 2.876237630844116, "rewards/rejected": -2.840089797973633, "step": 383 }, { "epoch": 2.2755555555555556, "grad_norm": 19.525267138237183, "learning_rate": 4.4946975858007064e-07, "logits/chosen": -1.2907319068908691, "logits/rejected": -1.3001677989959717, "logps/chosen": -39.252166748046875, "logps/rejected": -56.55052947998047, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 0.35257211327552795, "rewards/margins": 3.527677536010742, "rewards/rejected": -3.175105571746826, "step": 384 }, { "epoch": 2.2814814814814817, "grad_norm": 21.7988692040449, "learning_rate": 4.4907747944459484e-07, "logits/chosen": -1.2917722463607788, "logits/rejected": -1.3734023571014404, "logps/chosen": -52.56479263305664, "logps/rejected": -63.253562927246094, "loss": 0.2781, "rewards/accuracies": 0.875, "rewards/chosen": 0.09430493414402008, "rewards/margins": 2.2183127403259277, "rewards/rejected": -2.1240079402923584, "step": 385 }, { "epoch": 2.2874074074074073, "grad_norm": 15.89301818799247, "learning_rate": 4.486838560932048e-07, "logits/chosen": -1.228432297706604, "logits/rejected": -1.2870917320251465, "logps/chosen": -45.21752166748047, "logps/rejected": -54.30406951904297, "loss": 0.2494, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20035433769226074, "rewards/margins": 1.99539053440094, "rewards/rejected": -2.195744752883911, "step": 386 }, { "epoch": 2.2933333333333334, "grad_norm": 14.328174093261362, "learning_rate": 4.4828889118373394e-07, "logits/chosen": -1.2832486629486084, "logits/rejected": -1.2992146015167236, "logps/chosen": -54.598121643066406, "logps/rejected": -64.07887268066406, "loss": 0.157, "rewards/accuracies": 0.875, "rewards/chosen": 0.2271214723587036, "rewards/margins": 2.856295108795166, "rewards/rejected": -2.629173755645752, "step": 387 }, { "epoch": 2.299259259259259, "grad_norm": 21.241967819688803, "learning_rate": 4.4789258738307413e-07, "logits/chosen": -1.3300862312316895, "logits/rejected": -1.480217456817627, "logps/chosen": -42.869178771972656, "logps/rejected": -63.462066650390625, "loss": 0.2883, "rewards/accuracies": 0.875, "rewards/chosen": 0.06202170252799988, "rewards/margins": 1.9598344564437866, "rewards/rejected": -1.8978126049041748, "step": 388 }, { "epoch": 2.3051851851851852, "grad_norm": 19.822991467494322, "learning_rate": 4.474949473671578e-07, "logits/chosen": -1.0555452108383179, "logits/rejected": -1.042801022529602, "logps/chosen": -38.282432556152344, "logps/rejected": -58.744590759277344, "loss": 0.2191, "rewards/accuracies": 0.875, "rewards/chosen": 0.36787959933280945, "rewards/margins": 3.253455400466919, "rewards/rejected": -2.885575532913208, "step": 389 }, { "epoch": 2.311111111111111, "grad_norm": 21.08659415889429, "learning_rate": 4.4709597382093976e-07, "logits/chosen": -1.3296679258346558, "logits/rejected": -1.3939673900604248, "logps/chosen": -45.78785705566406, "logps/rejected": -65.52828979492188, "loss": 0.2543, "rewards/accuracies": 0.875, "rewards/chosen": 0.6854375004768372, "rewards/margins": 1.8844356536865234, "rewards/rejected": -1.198998212814331, "step": 390 }, { "epoch": 2.317037037037037, "grad_norm": 16.82223616928778, "learning_rate": 4.4669566943837916e-07, "logits/chosen": -1.3169894218444824, "logits/rejected": -1.3499053716659546, "logps/chosen": -46.05174255371094, "logps/rejected": -60.99545669555664, "loss": 0.2095, "rewards/accuracies": 1.0, "rewards/chosen": -0.08141281455755234, "rewards/margins": 2.3928728103637695, "rewards/rejected": -2.47428560256958, "step": 391 }, { "epoch": 2.322962962962963, "grad_norm": 20.79237283373031, "learning_rate": 4.462940369224212e-07, "logits/chosen": -1.3965792655944824, "logits/rejected": -1.3936798572540283, "logps/chosen": -43.78404235839844, "logps/rejected": -59.989688873291016, "loss": 0.2344, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13246512413024902, "rewards/margins": 2.7492973804473877, "rewards/rejected": -2.8817625045776367, "step": 392 }, { "epoch": 2.328888888888889, "grad_norm": 16.04337015452463, "learning_rate": 4.4589107898497885e-07, "logits/chosen": -1.2513833045959473, "logits/rejected": -1.3017098903656006, "logps/chosen": -47.380184173583984, "logps/rejected": -60.824520111083984, "loss": 0.205, "rewards/accuracies": 1.0, "rewards/chosen": -0.05711951479315758, "rewards/margins": 2.9654927253723145, "rewards/rejected": -3.0226120948791504, "step": 393 }, { "epoch": 2.334814814814815, "grad_norm": 22.721297568840754, "learning_rate": 4.454867983469148e-07, "logits/chosen": -1.3320375680923462, "logits/rejected": -1.4008305072784424, "logps/chosen": -45.44051742553711, "logps/rejected": -55.844852447509766, "loss": 0.2664, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1650034487247467, "rewards/margins": 1.6259642839431763, "rewards/rejected": -1.7909677028656006, "step": 394 }, { "epoch": 2.3407407407407406, "grad_norm": 17.321615878576583, "learning_rate": 4.4508119773802294e-07, "logits/chosen": -1.246985912322998, "logits/rejected": -1.2684519290924072, "logps/chosen": -35.195648193359375, "logps/rejected": -53.54356002807617, "loss": 0.2295, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10865418612957001, "rewards/margins": 2.806920051574707, "rewards/rejected": -2.69826602935791, "step": 395 }, { "epoch": 2.3466666666666667, "grad_norm": 16.827821341688068, "learning_rate": 4.4467427989700967e-07, "logits/chosen": -1.0964595079421997, "logits/rejected": -1.1211133003234863, "logps/chosen": -60.25492858886719, "logps/rejected": -70.0510025024414, "loss": 0.2176, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3291707932949066, "rewards/margins": 3.0319859981536865, "rewards/rejected": -3.361156940460205, "step": 396 }, { "epoch": 2.3525925925925923, "grad_norm": 17.57499092020414, "learning_rate": 4.442660475714758e-07, "logits/chosen": -1.3368898630142212, "logits/rejected": -1.3711633682250977, "logps/chosen": -55.84284210205078, "logps/rejected": -67.59239196777344, "loss": 0.2135, "rewards/accuracies": 0.9375, "rewards/chosen": -0.33838677406311035, "rewards/margins": 2.886188507080078, "rewards/rejected": -3.2245752811431885, "step": 397 }, { "epoch": 2.3585185185185185, "grad_norm": 15.652194606879792, "learning_rate": 4.438565035178979e-07, "logits/chosen": -1.3266401290893555, "logits/rejected": -1.2795634269714355, "logps/chosen": -39.78150177001953, "logps/rejected": -52.82093048095703, "loss": 0.2074, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13564905524253845, "rewards/margins": 2.1605172157287598, "rewards/rejected": -2.0248682498931885, "step": 398 }, { "epoch": 2.3644444444444446, "grad_norm": 16.839558145858316, "learning_rate": 4.434456505016094e-07, "logits/chosen": -1.1723679304122925, "logits/rejected": -1.2517800331115723, "logps/chosen": -39.925350189208984, "logps/rejected": -54.610374450683594, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": 0.07945774495601654, "rewards/margins": 2.555344343185425, "rewards/rejected": -2.475886344909668, "step": 399 }, { "epoch": 2.3703703703703702, "grad_norm": 20.294868135471607, "learning_rate": 4.430334912967823e-07, "logits/chosen": -1.2500160932540894, "logits/rejected": -1.350742220878601, "logps/chosen": -47.22673034667969, "logps/rejected": -63.268775939941406, "loss": 0.217, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0041030943393707275, "rewards/margins": 2.526927947998047, "rewards/rejected": -2.522824764251709, "step": 400 }, { "epoch": 2.3762962962962964, "grad_norm": 14.309694923599055, "learning_rate": 4.4262002868640826e-07, "logits/chosen": -1.355212688446045, "logits/rejected": -1.4428194761276245, "logps/chosen": -62.766273498535156, "logps/rejected": -70.3138198852539, "loss": 0.1547, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7476651668548584, "rewards/margins": 2.508098602294922, "rewards/rejected": -3.2557637691497803, "step": 401 }, { "epoch": 2.3822222222222225, "grad_norm": 22.127072690196805, "learning_rate": 4.422052654622799e-07, "logits/chosen": -1.1594799757003784, "logits/rejected": -1.1490657329559326, "logps/chosen": -51.881160736083984, "logps/rejected": -63.252323150634766, "loss": 0.2435, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6946939826011658, "rewards/margins": 3.3599495887756348, "rewards/rejected": -4.054643630981445, "step": 402 }, { "epoch": 2.388148148148148, "grad_norm": 14.586982600230868, "learning_rate": 4.417892044249716e-07, "logits/chosen": -1.1080595254898071, "logits/rejected": -1.2213943004608154, "logps/chosen": -48.669837951660156, "logps/rejected": -62.76885986328125, "loss": 0.1877, "rewards/accuracies": 1.0, "rewards/chosen": -0.2799440622329712, "rewards/margins": 2.3571085929870605, "rewards/rejected": -2.6370527744293213, "step": 403 }, { "epoch": 2.3940740740740742, "grad_norm": 16.407021152533144, "learning_rate": 4.4137184838382125e-07, "logits/chosen": -1.338707447052002, "logits/rejected": -1.4166909456253052, "logps/chosen": -49.7010498046875, "logps/rejected": -63.55725860595703, "loss": 0.2026, "rewards/accuracies": 1.0, "rewards/chosen": 0.03665490448474884, "rewards/margins": 3.0050880908966064, "rewards/rejected": -2.968432903289795, "step": 404 }, { "epoch": 2.4, "grad_norm": 15.47676852300944, "learning_rate": 4.409532001569105e-07, "logits/chosen": -1.0699396133422852, "logits/rejected": -1.1151628494262695, "logps/chosen": -44.824283599853516, "logps/rejected": -58.522796630859375, "loss": 0.1994, "rewards/accuracies": 1.0, "rewards/chosen": -0.2893208861351013, "rewards/margins": 3.242827892303467, "rewards/rejected": -3.532148838043213, "step": 405 }, { "epoch": 2.405925925925926, "grad_norm": 22.374176782743092, "learning_rate": 4.405332625710465e-07, "logits/chosen": -1.202767252922058, "logits/rejected": -1.2067598104476929, "logps/chosen": -50.498321533203125, "logps/rejected": -56.5478401184082, "loss": 0.2553, "rewards/accuracies": 0.875, "rewards/chosen": -0.38981667160987854, "rewards/margins": 2.1452839374542236, "rewards/rejected": -2.5351004600524902, "step": 406 }, { "epoch": 2.4118518518518517, "grad_norm": 16.687053025804047, "learning_rate": 4.401120384617423e-07, "logits/chosen": -1.4677153825759888, "logits/rejected": -1.3985137939453125, "logps/chosen": -61.11507034301758, "logps/rejected": -70.34519958496094, "loss": 0.2365, "rewards/accuracies": 0.875, "rewards/chosen": -0.737166702747345, "rewards/margins": 3.606313943862915, "rewards/rejected": -4.343481063842773, "step": 407 }, { "epoch": 2.417777777777778, "grad_norm": 18.669321736650193, "learning_rate": 4.396895306731977e-07, "logits/chosen": -1.3816975355148315, "logits/rejected": -1.339860200881958, "logps/chosen": -51.137413024902344, "logps/rejected": -51.43387985229492, "loss": 0.2025, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10709303617477417, "rewards/margins": 2.5654587745666504, "rewards/rejected": -2.6725518703460693, "step": 408 }, { "epoch": 2.423703703703704, "grad_norm": 15.749361927163159, "learning_rate": 4.3926574205828037e-07, "logits/chosen": -1.1337440013885498, "logits/rejected": -1.2759745121002197, "logps/chosen": -35.335487365722656, "logps/rejected": -54.63352584838867, "loss": 0.192, "rewards/accuracies": 1.0, "rewards/chosen": 0.07870366424322128, "rewards/margins": 3.1690025329589844, "rewards/rejected": -3.090298652648926, "step": 409 }, { "epoch": 2.4296296296296296, "grad_norm": 18.104317236901334, "learning_rate": 4.388406754785063e-07, "logits/chosen": -1.3435664176940918, "logits/rejected": -1.3172290325164795, "logps/chosen": -41.54008865356445, "logps/rejected": -57.23595428466797, "loss": 0.2439, "rewards/accuracies": 0.875, "rewards/chosen": -0.2125397026538849, "rewards/margins": 2.206397771835327, "rewards/rejected": -2.4189372062683105, "step": 410 }, { "epoch": 2.4355555555555557, "grad_norm": 21.577238479012582, "learning_rate": 4.3841433380402073e-07, "logits/chosen": -1.3005995750427246, "logits/rejected": -1.3782299757003784, "logps/chosen": -48.72734069824219, "logps/rejected": -74.93363952636719, "loss": 0.2632, "rewards/accuracies": 1.0, "rewards/chosen": 0.2786545753479004, "rewards/margins": 3.2695655822753906, "rewards/rejected": -2.9909110069274902, "step": 411 }, { "epoch": 2.4414814814814814, "grad_norm": 14.034858000999158, "learning_rate": 4.379867199135785e-07, "logits/chosen": -1.4155633449554443, "logits/rejected": -1.472240686416626, "logps/chosen": -45.016868591308594, "logps/rejected": -76.23003387451172, "loss": 0.1571, "rewards/accuracies": 1.0, "rewards/chosen": -0.6123547554016113, "rewards/margins": 3.1689391136169434, "rewards/rejected": -3.7812938690185547, "step": 412 }, { "epoch": 2.4474074074074075, "grad_norm": 16.779130305262424, "learning_rate": 4.375578366945246e-07, "logits/chosen": -1.3278855085372925, "logits/rejected": -1.3343737125396729, "logps/chosen": -46.692420959472656, "logps/rejected": -53.395774841308594, "loss": 0.1859, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5343155264854431, "rewards/margins": 2.1621780395507812, "rewards/rejected": -2.696493625640869, "step": 413 }, { "epoch": 2.453333333333333, "grad_norm": 19.140589088785962, "learning_rate": 4.3712768704277524e-07, "logits/chosen": -1.3284047842025757, "logits/rejected": -1.3112947940826416, "logps/chosen": -43.248924255371094, "logps/rejected": -55.926727294921875, "loss": 0.2448, "rewards/accuracies": 0.875, "rewards/chosen": -0.09817357361316681, "rewards/margins": 2.1646008491516113, "rewards/rejected": -2.2627744674682617, "step": 414 }, { "epoch": 2.4592592592592593, "grad_norm": 21.078378047432963, "learning_rate": 4.366962738627975e-07, "logits/chosen": -1.0980134010314941, "logits/rejected": -1.2354460954666138, "logps/chosen": -33.716976165771484, "logps/rejected": -55.4927978515625, "loss": 0.2456, "rewards/accuracies": 1.0, "rewards/chosen": -0.5064057111740112, "rewards/margins": 2.779224395751953, "rewards/rejected": -3.285630226135254, "step": 415 }, { "epoch": 2.4651851851851854, "grad_norm": 17.149088012986873, "learning_rate": 4.3626360006759016e-07, "logits/chosen": -1.3233143091201782, "logits/rejected": -1.318281650543213, "logps/chosen": -57.326744079589844, "logps/rejected": -63.12444305419922, "loss": 0.1736, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13537603616714478, "rewards/margins": 2.3170013427734375, "rewards/rejected": -2.4523775577545166, "step": 416 }, { "epoch": 2.471111111111111, "grad_norm": 16.379890197970408, "learning_rate": 4.3582966857866397e-07, "logits/chosen": -1.3193371295928955, "logits/rejected": -1.3573179244995117, "logps/chosen": -41.1270637512207, "logps/rejected": -57.641868591308594, "loss": 0.2079, "rewards/accuracies": 1.0, "rewards/chosen": 0.10911901295185089, "rewards/margins": 2.449983835220337, "rewards/rejected": -2.340864896774292, "step": 417 }, { "epoch": 2.477037037037037, "grad_norm": 18.867606776179514, "learning_rate": 4.353944823260221e-07, "logits/chosen": -1.1927530765533447, "logits/rejected": -1.2237083911895752, "logps/chosen": -41.90693664550781, "logps/rejected": -67.52421569824219, "loss": 0.2608, "rewards/accuracies": 0.8125, "rewards/chosen": -0.009037256240844727, "rewards/margins": 2.025681495666504, "rewards/rejected": -2.0347187519073486, "step": 418 }, { "epoch": 2.482962962962963, "grad_norm": 19.73233833392236, "learning_rate": 4.3495804424813986e-07, "logits/chosen": -1.3428009748458862, "logits/rejected": -1.4153268337249756, "logps/chosen": -46.6826057434082, "logps/rejected": -60.5880012512207, "loss": 0.2624, "rewards/accuracies": 1.0, "rewards/chosen": -0.07019033282995224, "rewards/margins": 2.512794256210327, "rewards/rejected": -2.582984685897827, "step": 419 }, { "epoch": 2.488888888888889, "grad_norm": 16.547428466719083, "learning_rate": 4.3452035729194534e-07, "logits/chosen": -1.2679708003997803, "logits/rejected": -1.3539072275161743, "logps/chosen": -44.795372009277344, "logps/rejected": -65.70006561279297, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 0.28597304224967957, "rewards/margins": 3.0686075687408447, "rewards/rejected": -2.7826344966888428, "step": 420 }, { "epoch": 2.4948148148148146, "grad_norm": 19.61553554693557, "learning_rate": 4.340814244127993e-07, "logits/chosen": -1.2777574062347412, "logits/rejected": -1.4196518659591675, "logps/chosen": -45.838314056396484, "logps/rejected": -57.400760650634766, "loss": 0.2595, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10272904485464096, "rewards/margins": 1.4262562990188599, "rewards/rejected": -1.5289852619171143, "step": 421 }, { "epoch": 2.5007407407407407, "grad_norm": 17.434275927746988, "learning_rate": 4.3364124857447525e-07, "logits/chosen": -1.1939697265625, "logits/rejected": -1.2483940124511719, "logps/chosen": -52.27749252319336, "logps/rejected": -65.15104675292969, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 0.14658820629119873, "rewards/margins": 2.7353010177612305, "rewards/rejected": -2.588712692260742, "step": 422 }, { "epoch": 2.506666666666667, "grad_norm": 18.989696276439936, "learning_rate": 4.331998327491395e-07, "logits/chosen": -1.390942931175232, "logits/rejected": -1.388514518737793, "logps/chosen": -48.308021545410156, "logps/rejected": -62.73554229736328, "loss": 0.2025, "rewards/accuracies": 0.875, "rewards/chosen": -0.4263012409210205, "rewards/margins": 2.9498748779296875, "rewards/rejected": -3.376175880432129, "step": 423 }, { "epoch": 2.5125925925925925, "grad_norm": 18.052245677061464, "learning_rate": 4.3275717991733097e-07, "logits/chosen": -1.1927298307418823, "logits/rejected": -1.2486273050308228, "logps/chosen": -40.60221862792969, "logps/rejected": -54.09161376953125, "loss": 0.2339, "rewards/accuracies": 0.875, "rewards/chosen": -0.24772867560386658, "rewards/margins": 1.7447564601898193, "rewards/rejected": -1.9924849271774292, "step": 424 }, { "epoch": 2.5185185185185186, "grad_norm": 15.60989150821569, "learning_rate": 4.3231329306794106e-07, "logits/chosen": -1.206120252609253, "logits/rejected": -1.2819563150405884, "logps/chosen": -47.97679138183594, "logps/rejected": -60.327110290527344, "loss": 0.2114, "rewards/accuracies": 1.0, "rewards/chosen": -0.0791320651769638, "rewards/margins": 2.8907041549682617, "rewards/rejected": -2.9698362350463867, "step": 425 }, { "epoch": 2.5244444444444447, "grad_norm": 22.753332037000355, "learning_rate": 4.3186817519819365e-07, "logits/chosen": -1.3056433200836182, "logits/rejected": -1.2404112815856934, "logps/chosen": -48.254608154296875, "logps/rejected": -63.55730056762695, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": -0.5698607563972473, "rewards/margins": 3.0423474311828613, "rewards/rejected": -3.612208366394043, "step": 426 }, { "epoch": 2.5303703703703704, "grad_norm": 19.819465925330856, "learning_rate": 4.314218293136247e-07, "logits/chosen": -1.2659276723861694, "logits/rejected": -1.3557699918746948, "logps/chosen": -40.644039154052734, "logps/rejected": -51.36490249633789, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": -0.053419072180986404, "rewards/margins": 2.2699971199035645, "rewards/rejected": -2.323416233062744, "step": 427 }, { "epoch": 2.536296296296296, "grad_norm": 17.374322567562608, "learning_rate": 4.30974258428062e-07, "logits/chosen": -1.2225278615951538, "logits/rejected": -1.2667872905731201, "logps/chosen": -57.24745178222656, "logps/rejected": -51.737056732177734, "loss": 0.2103, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41055721044540405, "rewards/margins": 2.0105388164520264, "rewards/rejected": -2.421095848083496, "step": 428 }, { "epoch": 2.542222222222222, "grad_norm": 16.50173497546316, "learning_rate": 4.3052546556360486e-07, "logits/chosen": -1.2917060852050781, "logits/rejected": -1.2804986238479614, "logps/chosen": -39.234336853027344, "logps/rejected": -51.28722381591797, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": 0.2359772026538849, "rewards/margins": 2.1480839252471924, "rewards/rejected": -1.9121068716049194, "step": 429 }, { "epoch": 2.5481481481481483, "grad_norm": 18.45028210343722, "learning_rate": 4.300754537506036e-07, "logits/chosen": -1.4156684875488281, "logits/rejected": -1.3991972208023071, "logps/chosen": -48.09461212158203, "logps/rejected": -51.55734634399414, "loss": 0.2199, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08063582330942154, "rewards/margins": 2.727208137512207, "rewards/rejected": -2.6465723514556885, "step": 430 }, { "epoch": 2.554074074074074, "grad_norm": 18.538785529380714, "learning_rate": 4.2962422602763925e-07, "logits/chosen": -1.2535463571548462, "logits/rejected": -1.2664145231246948, "logps/chosen": -39.574153900146484, "logps/rejected": -58.36887741088867, "loss": 0.2514, "rewards/accuracies": 0.9375, "rewards/chosen": 0.023720331490039825, "rewards/margins": 2.1531851291656494, "rewards/rejected": -2.129464626312256, "step": 431 }, { "epoch": 2.56, "grad_norm": 20.53028864801327, "learning_rate": 4.2917178544150284e-07, "logits/chosen": -1.2374775409698486, "logits/rejected": -1.3365933895111084, "logps/chosen": -40.8128547668457, "logps/rejected": -62.45230484008789, "loss": 0.2055, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37315669655799866, "rewards/margins": 2.7972960472106934, "rewards/rejected": -3.170452833175659, "step": 432 }, { "epoch": 2.565925925925926, "grad_norm": 23.60580081011262, "learning_rate": 4.2871813504717497e-07, "logits/chosen": -1.3173975944519043, "logits/rejected": -1.3482545614242554, "logps/chosen": -45.852691650390625, "logps/rejected": -65.40410614013672, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": -0.3781399428844452, "rewards/margins": 2.6333463191986084, "rewards/rejected": -3.011486291885376, "step": 433 }, { "epoch": 2.571851851851852, "grad_norm": 17.646349860883195, "learning_rate": 4.2826327790780505e-07, "logits/chosen": -1.362741470336914, "logits/rejected": -1.388155460357666, "logps/chosen": -48.00885772705078, "logps/rejected": -59.41029739379883, "loss": 0.2175, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17543169856071472, "rewards/margins": 2.6400036811828613, "rewards/rejected": -2.8154356479644775, "step": 434 }, { "epoch": 2.5777777777777775, "grad_norm": 19.358547711268727, "learning_rate": 4.278072170946909e-07, "logits/chosen": -1.3277589082717896, "logits/rejected": -1.3741272687911987, "logps/chosen": -53.56547927856445, "logps/rejected": -63.62765121459961, "loss": 0.2636, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37384939193725586, "rewards/margins": 2.68127703666687, "rewards/rejected": -3.055126190185547, "step": 435 }, { "epoch": 2.5837037037037036, "grad_norm": 14.283524298642329, "learning_rate": 4.273499556872576e-07, "logits/chosen": -1.125074863433838, "logits/rejected": -1.2602410316467285, "logps/chosen": -43.2216796875, "logps/rejected": -65.49564361572266, "loss": 0.1678, "rewards/accuracies": 1.0, "rewards/chosen": -0.21781915426254272, "rewards/margins": 3.0550427436828613, "rewards/rejected": -3.272862195968628, "step": 436 }, { "epoch": 2.5896296296296297, "grad_norm": 14.547779436253572, "learning_rate": 4.2689149677303716e-07, "logits/chosen": -1.3099594116210938, "logits/rejected": -1.4164810180664062, "logps/chosen": -51.39214324951172, "logps/rejected": -58.63938903808594, "loss": 0.1663, "rewards/accuracies": 1.0, "rewards/chosen": 0.15673355758190155, "rewards/margins": 2.696232557296753, "rewards/rejected": -2.539498805999756, "step": 437 }, { "epoch": 2.5955555555555554, "grad_norm": 18.874472104141944, "learning_rate": 4.264318434476472e-07, "logits/chosen": -1.340260624885559, "logits/rejected": -1.4135973453521729, "logps/chosen": -54.49589157104492, "logps/rejected": -62.51300048828125, "loss": 0.2392, "rewards/accuracies": 0.875, "rewards/chosen": -0.20530621707439423, "rewards/margins": 2.566615581512451, "rewards/rejected": -2.7719218730926514, "step": 438 }, { "epoch": 2.6014814814814815, "grad_norm": 24.652912032755857, "learning_rate": 4.2597099881477017e-07, "logits/chosen": -1.4756532907485962, "logits/rejected": -1.5366077423095703, "logps/chosen": -40.621578216552734, "logps/rejected": -55.144046783447266, "loss": 0.2691, "rewards/accuracies": 0.75, "rewards/chosen": -0.39135220646858215, "rewards/margins": 1.317076563835144, "rewards/rejected": -1.7084288597106934, "step": 439 }, { "epoch": 2.6074074074074076, "grad_norm": 16.69781220569815, "learning_rate": 4.2550896598613297e-07, "logits/chosen": -1.3262248039245605, "logits/rejected": -1.401850938796997, "logps/chosen": -32.84125518798828, "logps/rejected": -61.752288818359375, "loss": 0.2153, "rewards/accuracies": 0.875, "rewards/chosen": -0.14606288075447083, "rewards/margins": 2.7544689178466797, "rewards/rejected": -2.9005320072174072, "step": 440 }, { "epoch": 2.6133333333333333, "grad_norm": 22.119565477895417, "learning_rate": 4.25045748081485e-07, "logits/chosen": -1.2764012813568115, "logits/rejected": -1.4300332069396973, "logps/chosen": -37.34137725830078, "logps/rejected": -64.76432800292969, "loss": 0.2216, "rewards/accuracies": 1.0, "rewards/chosen": 0.03927718102931976, "rewards/margins": 3.770249366760254, "rewards/rejected": -3.7309722900390625, "step": 441 }, { "epoch": 2.6192592592592594, "grad_norm": 14.928256593837876, "learning_rate": 4.2458134822857774e-07, "logits/chosen": -1.2378556728363037, "logits/rejected": -1.359390377998352, "logps/chosen": -46.040767669677734, "logps/rejected": -67.20500183105469, "loss": 0.1878, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04216981679201126, "rewards/margins": 3.2073171138763428, "rewards/rejected": -3.2494869232177734, "step": 442 }, { "epoch": 2.625185185185185, "grad_norm": 14.96404630636834, "learning_rate": 4.241157695631435e-07, "logits/chosen": -1.2404184341430664, "logits/rejected": -1.3973965644836426, "logps/chosen": -43.609745025634766, "logps/rejected": -65.57633209228516, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": -0.29699721932411194, "rewards/margins": 3.3562171459198, "rewards/rejected": -3.653214454650879, "step": 443 }, { "epoch": 2.631111111111111, "grad_norm": 20.94939040431925, "learning_rate": 4.2364901522887416e-07, "logits/chosen": -1.3479417562484741, "logits/rejected": -1.3275196552276611, "logps/chosen": -45.53289794921875, "logps/rejected": -62.82244873046875, "loss": 0.2071, "rewards/accuracies": 0.875, "rewards/chosen": -0.2697879672050476, "rewards/margins": 3.2815232276916504, "rewards/rejected": -3.5513112545013428, "step": 444 }, { "epoch": 2.637037037037037, "grad_norm": 21.81972888844552, "learning_rate": 4.2318108837739986e-07, "logits/chosen": -1.0954722166061401, "logits/rejected": -1.2724300622940063, "logps/chosen": -37.40864562988281, "logps/rejected": -53.508148193359375, "loss": 0.2343, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5114908814430237, "rewards/margins": 2.329716682434082, "rewards/rejected": -2.841207504272461, "step": 445 }, { "epoch": 2.642962962962963, "grad_norm": 17.622719951148344, "learning_rate": 4.22711992168268e-07, "logits/chosen": -1.292818546295166, "logits/rejected": -1.4211820363998413, "logps/chosen": -55.30113220214844, "logps/rejected": -62.69170379638672, "loss": 0.2219, "rewards/accuracies": 1.0, "rewards/chosen": -0.12264469265937805, "rewards/margins": 2.33625864982605, "rewards/rejected": -2.4589033126831055, "step": 446 }, { "epoch": 2.648888888888889, "grad_norm": 16.90251346927541, "learning_rate": 4.2224172976892166e-07, "logits/chosen": -1.198624610900879, "logits/rejected": -1.3212577104568481, "logps/chosen": -56.71440887451172, "logps/rejected": -77.87358856201172, "loss": 0.1828, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1206379383802414, "rewards/margins": 3.149707078933716, "rewards/rejected": -3.2703449726104736, "step": 447 }, { "epoch": 2.6548148148148147, "grad_norm": 19.93007489097777, "learning_rate": 4.217703043546783e-07, "logits/chosen": -1.3870353698730469, "logits/rejected": -1.4539594650268555, "logps/chosen": -50.45875930786133, "logps/rejected": -58.48164367675781, "loss": 0.194, "rewards/accuracies": 1.0, "rewards/chosen": -0.5994179248809814, "rewards/margins": 2.5853588581085205, "rewards/rejected": -3.184777021408081, "step": 448 }, { "epoch": 2.660740740740741, "grad_norm": 13.25713353861389, "learning_rate": 4.2129771910870845e-07, "logits/chosen": -1.2270911931991577, "logits/rejected": -1.3550801277160645, "logps/chosen": -44.222511291503906, "logps/rejected": -73.00066375732422, "loss": 0.1563, "rewards/accuracies": 1.0, "rewards/chosen": -0.31280404329299927, "rewards/margins": 3.3302433490753174, "rewards/rejected": -3.643047332763672, "step": 449 }, { "epoch": 2.6666666666666665, "grad_norm": 17.010657621012665, "learning_rate": 4.2082397722201385e-07, "logits/chosen": -1.1666088104248047, "logits/rejected": -1.1798983812332153, "logps/chosen": -34.95051193237305, "logps/rejected": -64.77764129638672, "loss": 0.1893, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23295338451862335, "rewards/margins": 3.2355153560638428, "rewards/rejected": -3.46846866607666, "step": 450 }, { "epoch": 2.6725925925925926, "grad_norm": 16.03990487758234, "learning_rate": 4.2034908189340634e-07, "logits/chosen": -1.2905778884887695, "logits/rejected": -1.2771592140197754, "logps/chosen": -43.270263671875, "logps/rejected": -62.975738525390625, "loss": 0.1909, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37080562114715576, "rewards/margins": 3.1950912475585938, "rewards/rejected": -3.565896987915039, "step": 451 }, { "epoch": 2.6785185185185183, "grad_norm": 16.60951149016189, "learning_rate": 4.19873036329486e-07, "logits/chosen": -1.1631643772125244, "logits/rejected": -1.3125519752502441, "logps/chosen": -52.858795166015625, "logps/rejected": -61.08552169799805, "loss": 0.1714, "rewards/accuracies": 0.875, "rewards/chosen": -0.2712424397468567, "rewards/margins": 2.5944318771362305, "rewards/rejected": -2.8656740188598633, "step": 452 }, { "epoch": 2.6844444444444444, "grad_norm": 18.307768575042843, "learning_rate": 4.1939584374461943e-07, "logits/chosen": -1.3807969093322754, "logits/rejected": -1.4712002277374268, "logps/chosen": -40.716617584228516, "logps/rejected": -50.97564697265625, "loss": 0.2065, "rewards/accuracies": 1.0, "rewards/chosen": 0.5352606773376465, "rewards/margins": 2.2707905769348145, "rewards/rejected": -1.7355297803878784, "step": 453 }, { "epoch": 2.6903703703703705, "grad_norm": 12.19613990182221, "learning_rate": 4.189175073609184e-07, "logits/chosen": -1.210724949836731, "logits/rejected": -1.2001690864562988, "logps/chosen": -50.027584075927734, "logps/rejected": -60.53565979003906, "loss": 0.1483, "rewards/accuracies": 1.0, "rewards/chosen": 0.36551186442375183, "rewards/margins": 2.5059051513671875, "rewards/rejected": -2.1403932571411133, "step": 454 }, { "epoch": 2.696296296296296, "grad_norm": 20.05543126255426, "learning_rate": 4.184380304082177e-07, "logits/chosen": -1.2690619230270386, "logits/rejected": -1.2047336101531982, "logps/chosen": -44.2292366027832, "logps/rejected": -52.65461730957031, "loss": 0.2265, "rewards/accuracies": 0.9375, "rewards/chosen": -0.32682713866233826, "rewards/margins": 2.385038375854492, "rewards/rejected": -2.7118656635284424, "step": 455 }, { "epoch": 2.7022222222222223, "grad_norm": 20.386292279150016, "learning_rate": 4.179574161240536e-07, "logits/chosen": -1.1057473421096802, "logits/rejected": -1.1048200130462646, "logps/chosen": -37.20689392089844, "logps/rejected": -51.49712371826172, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 0.22639192640781403, "rewards/margins": 2.2510790824890137, "rewards/rejected": -2.0246872901916504, "step": 456 }, { "epoch": 2.7081481481481484, "grad_norm": 19.386564533555276, "learning_rate": 4.1747566775364175e-07, "logits/chosen": -1.4120452404022217, "logits/rejected": -1.5171071290969849, "logps/chosen": -34.28341293334961, "logps/rejected": -67.71885681152344, "loss": 0.2456, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5173302292823792, "rewards/margins": 2.899099349975586, "rewards/rejected": -2.3817691802978516, "step": 457 }, { "epoch": 2.714074074074074, "grad_norm": 17.25041625855171, "learning_rate": 4.169927885498556e-07, "logits/chosen": -1.4341471195220947, "logits/rejected": -1.4753046035766602, "logps/chosen": -52.228519439697266, "logps/rejected": -61.93224334716797, "loss": 0.2118, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5279741883277893, "rewards/margins": 3.191676139831543, "rewards/rejected": -3.7196502685546875, "step": 458 }, { "epoch": 2.7199999999999998, "grad_norm": 19.00368041939193, "learning_rate": 4.16508781773204e-07, "logits/chosen": -1.1663175821304321, "logits/rejected": -1.2493047714233398, "logps/chosen": -59.34048843383789, "logps/rejected": -66.43865203857422, "loss": 0.2262, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7447548508644104, "rewards/margins": 3.5409374237060547, "rewards/rejected": -4.28569221496582, "step": 459 }, { "epoch": 2.725925925925926, "grad_norm": 17.264561754976015, "learning_rate": 4.1602365069180976e-07, "logits/chosen": -1.3075733184814453, "logits/rejected": -1.3201042413711548, "logps/chosen": -53.10240936279297, "logps/rejected": -57.974273681640625, "loss": 0.2027, "rewards/accuracies": 1.0, "rewards/chosen": -0.26173388957977295, "rewards/margins": 3.05476975440979, "rewards/rejected": -3.3165035247802734, "step": 460 }, { "epoch": 2.731851851851852, "grad_norm": 18.17486278562164, "learning_rate": 4.155373985813868e-07, "logits/chosen": -1.366817831993103, "logits/rejected": -1.3806540966033936, "logps/chosen": -39.87694549560547, "logps/rejected": -48.488956451416016, "loss": 0.2163, "rewards/accuracies": 1.0, "rewards/chosen": -0.14938339591026306, "rewards/margins": 2.15541410446167, "rewards/rejected": -2.304797410964966, "step": 461 }, { "epoch": 2.7377777777777776, "grad_norm": 19.361148379508926, "learning_rate": 4.150500287252189e-07, "logits/chosen": -1.2390245199203491, "logits/rejected": -1.2137972116470337, "logps/chosen": -52.104366302490234, "logps/rejected": -63.186466217041016, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": -0.6157130002975464, "rewards/margins": 2.714012861251831, "rewards/rejected": -3.329725742340088, "step": 462 }, { "epoch": 2.7437037037037038, "grad_norm": 13.808494148260458, "learning_rate": 4.145615444141369e-07, "logits/chosen": -1.2283602952957153, "logits/rejected": -1.208832025527954, "logps/chosen": -54.112083435058594, "logps/rejected": -54.76646423339844, "loss": 0.1701, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4094162881374359, "rewards/margins": 2.894310474395752, "rewards/rejected": -3.3037266731262207, "step": 463 }, { "epoch": 2.74962962962963, "grad_norm": 19.11789416326404, "learning_rate": 4.1407194894649677e-07, "logits/chosen": -1.2563258409500122, "logits/rejected": -1.2561264038085938, "logps/chosen": -47.32088851928711, "logps/rejected": -69.62883758544922, "loss": 0.2193, "rewards/accuracies": 0.875, "rewards/chosen": -0.12513133883476257, "rewards/margins": 3.2240326404571533, "rewards/rejected": -3.349163770675659, "step": 464 }, { "epoch": 2.7555555555555555, "grad_norm": 19.46290010104755, "learning_rate": 4.135812456281571e-07, "logits/chosen": -1.3946552276611328, "logits/rejected": -1.474714994430542, "logps/chosen": -50.58590316772461, "logps/rejected": -89.3076171875, "loss": 0.2179, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3854854106903076, "rewards/margins": 3.04565691947937, "rewards/rejected": -3.431142568588257, "step": 465 }, { "epoch": 2.7614814814814816, "grad_norm": 21.08647372531421, "learning_rate": 4.1308943777245717e-07, "logits/chosen": -1.209939956665039, "logits/rejected": -1.177263855934143, "logps/chosen": -42.54806900024414, "logps/rejected": -52.786014556884766, "loss": 0.2676, "rewards/accuracies": 0.9375, "rewards/chosen": -0.288261353969574, "rewards/margins": 2.845198631286621, "rewards/rejected": -3.1334598064422607, "step": 466 }, { "epoch": 2.7674074074074073, "grad_norm": 20.73103490558258, "learning_rate": 4.1259652870019426e-07, "logits/chosen": -1.169033169746399, "logits/rejected": -1.1900144815444946, "logps/chosen": -48.64216613769531, "logps/rejected": -56.44643020629883, "loss": 0.2648, "rewards/accuracies": 0.875, "rewards/chosen": -0.49444669485092163, "rewards/margins": 3.2292940616607666, "rewards/rejected": -3.723741054534912, "step": 467 }, { "epoch": 2.7733333333333334, "grad_norm": 16.731518786213478, "learning_rate": 4.121025217396011e-07, "logits/chosen": -1.3166173696517944, "logits/rejected": -1.3515011072158813, "logps/chosen": -41.45418167114258, "logps/rejected": -50.53019714355469, "loss": 0.1668, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45292171835899353, "rewards/margins": 2.9842123985290527, "rewards/rejected": -3.437134265899658, "step": 468 }, { "epoch": 2.779259259259259, "grad_norm": 21.284988595267457, "learning_rate": 4.1160742022632395e-07, "logits/chosen": -1.2805315256118774, "logits/rejected": -1.2572264671325684, "logps/chosen": -43.30008316040039, "logps/rejected": -56.220314025878906, "loss": 0.2376, "rewards/accuracies": 0.875, "rewards/chosen": -0.5084177255630493, "rewards/margins": 1.7679126262664795, "rewards/rejected": -2.2763302326202393, "step": 469 }, { "epoch": 2.785185185185185, "grad_norm": 16.424236630335066, "learning_rate": 4.1111122750339945e-07, "logits/chosen": -1.1565334796905518, "logits/rejected": -1.1801633834838867, "logps/chosen": -56.21332931518555, "logps/rejected": -68.18215942382812, "loss": 0.1819, "rewards/accuracies": 0.9375, "rewards/chosen": -0.310497522354126, "rewards/margins": 3.5792107582092285, "rewards/rejected": -3.8897082805633545, "step": 470 }, { "epoch": 2.7911111111111113, "grad_norm": 17.906723620074274, "learning_rate": 4.106139469212326e-07, "logits/chosen": -1.152573585510254, "logits/rejected": -1.189849615097046, "logps/chosen": -56.43069076538086, "logps/rejected": -69.75574493408203, "loss": 0.2075, "rewards/accuracies": 0.9375, "rewards/chosen": -0.382152795791626, "rewards/margins": 3.191413402557373, "rewards/rejected": -3.573566198348999, "step": 471 }, { "epoch": 2.797037037037037, "grad_norm": 18.98593991817937, "learning_rate": 4.1011558183757374e-07, "logits/chosen": -1.173750400543213, "logits/rejected": -1.2400974035263062, "logps/chosen": -39.005699157714844, "logps/rejected": -63.15963363647461, "loss": 0.2148, "rewards/accuracies": 0.875, "rewards/chosen": -0.7964209914207458, "rewards/margins": 3.4397897720336914, "rewards/rejected": -4.236210346221924, "step": 472 }, { "epoch": 2.802962962962963, "grad_norm": 16.15306648500753, "learning_rate": 4.0961613561749585e-07, "logits/chosen": -1.4924187660217285, "logits/rejected": -1.473052740097046, "logps/chosen": -55.84531784057617, "logps/rejected": -71.32598876953125, "loss": 0.1963, "rewards/accuracies": 1.0, "rewards/chosen": -0.7752455472946167, "rewards/margins": 3.1186671257019043, "rewards/rejected": -3.8939127922058105, "step": 473 }, { "epoch": 2.8088888888888888, "grad_norm": 15.237259647333039, "learning_rate": 4.091156116333723e-07, "logits/chosen": -1.2219537496566772, "logits/rejected": -1.2826833724975586, "logps/chosen": -50.56085968017578, "logps/rejected": -65.05696868896484, "loss": 0.1791, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7408727407455444, "rewards/margins": 2.5775582790374756, "rewards/rejected": -3.3184311389923096, "step": 474 }, { "epoch": 2.814814814814815, "grad_norm": 14.632575495330288, "learning_rate": 4.086140132648534e-07, "logits/chosen": -1.3067800998687744, "logits/rejected": -1.3324289321899414, "logps/chosen": -49.524009704589844, "logps/rejected": -74.86628723144531, "loss": 0.1637, "rewards/accuracies": 0.9375, "rewards/chosen": -0.901452362537384, "rewards/margins": 3.2804508209228516, "rewards/rejected": -4.18190336227417, "step": 475 }, { "epoch": 2.8207407407407405, "grad_norm": 21.595404469848596, "learning_rate": 4.081113438988443e-07, "logits/chosen": -1.3657824993133545, "logits/rejected": -1.3470386266708374, "logps/chosen": -52.59661102294922, "logps/rejected": -66.06396484375, "loss": 0.22, "rewards/accuracies": 1.0, "rewards/chosen": -0.33402401208877563, "rewards/margins": 3.3505983352661133, "rewards/rejected": -3.684622049331665, "step": 476 }, { "epoch": 2.8266666666666667, "grad_norm": 19.687853433851274, "learning_rate": 4.076076069294816e-07, "logits/chosen": -1.3582686185836792, "logits/rejected": -1.4166455268859863, "logps/chosen": -51.02378845214844, "logps/rejected": -74.09524536132812, "loss": 0.1808, "rewards/accuracies": 0.9375, "rewards/chosen": -0.765562117099762, "rewards/margins": 2.352506160736084, "rewards/rejected": -3.1180684566497803, "step": 477 }, { "epoch": 2.8325925925925928, "grad_norm": 17.745281021443276, "learning_rate": 4.071028057581105e-07, "logits/chosen": -1.248734474182129, "logits/rejected": -1.292419672012329, "logps/chosen": -71.39847564697266, "logps/rejected": -67.45188903808594, "loss": 0.2024, "rewards/accuracies": 1.0, "rewards/chosen": -1.0719102621078491, "rewards/margins": 2.9319186210632324, "rewards/rejected": -4.003829002380371, "step": 478 }, { "epoch": 2.8385185185185184, "grad_norm": 16.98519419081863, "learning_rate": 4.065969437932622e-07, "logits/chosen": -1.2369309663772583, "logits/rejected": -1.2602821588516235, "logps/chosen": -61.797019958496094, "logps/rejected": -68.8377456665039, "loss": 0.197, "rewards/accuracies": 0.875, "rewards/chosen": -1.268001914024353, "rewards/margins": 2.377183675765991, "rewards/rejected": -3.6451854705810547, "step": 479 }, { "epoch": 2.8444444444444446, "grad_norm": 17.804384178514177, "learning_rate": 4.0609002445063036e-07, "logits/chosen": -1.3432717323303223, "logits/rejected": -1.2590982913970947, "logps/chosen": -51.91122055053711, "logps/rejected": -61.9157829284668, "loss": 0.2035, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4239550232887268, "rewards/margins": 2.9792933464050293, "rewards/rejected": -3.4032483100891113, "step": 480 }, { "epoch": 2.85037037037037, "grad_norm": 18.264245793315055, "learning_rate": 4.0558205115304846e-07, "logits/chosen": -1.3515952825546265, "logits/rejected": -1.4534411430358887, "logps/chosen": -58.752376556396484, "logps/rejected": -73.56411743164062, "loss": 0.2108, "rewards/accuracies": 0.9375, "rewards/chosen": -0.48288118839263916, "rewards/margins": 2.813925266265869, "rewards/rejected": -3.2968060970306396, "step": 481 }, { "epoch": 2.8562962962962963, "grad_norm": 19.268397927205957, "learning_rate": 4.050730273304663e-07, "logits/chosen": -1.2159310579299927, "logits/rejected": -1.2838042974472046, "logps/chosen": -44.14533233642578, "logps/rejected": -68.81448364257812, "loss": 0.2378, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07248367369174957, "rewards/margins": 3.768887996673584, "rewards/rejected": -3.841372013092041, "step": 482 }, { "epoch": 2.862222222222222, "grad_norm": 11.562310542554206, "learning_rate": 4.045629564199273e-07, "logits/chosen": -1.3160737752914429, "logits/rejected": -1.359323501586914, "logps/chosen": -59.05349349975586, "logps/rejected": -77.77356719970703, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": -0.020738810300827026, "rewards/margins": 4.168304443359375, "rewards/rejected": -4.189043045043945, "step": 483 }, { "epoch": 2.868148148148148, "grad_norm": 20.72487276535475, "learning_rate": 4.04051841865545e-07, "logits/chosen": -1.2002463340759277, "logits/rejected": -1.2717535495758057, "logps/chosen": -54.947025299072266, "logps/rejected": -49.54365539550781, "loss": 0.2478, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12642936408519745, "rewards/margins": 1.9815359115600586, "rewards/rejected": -2.1079654693603516, "step": 484 }, { "epoch": 2.8740740740740742, "grad_norm": 19.870823704086312, "learning_rate": 4.0353968711847974e-07, "logits/chosen": -1.1724351644515991, "logits/rejected": -1.266343116760254, "logps/chosen": -53.72722625732422, "logps/rejected": -71.61346435546875, "loss": 0.2174, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19865109026432037, "rewards/margins": 3.181943416595459, "rewards/rejected": -3.380594491958618, "step": 485 }, { "epoch": 2.88, "grad_norm": 23.156216167074366, "learning_rate": 4.030264956369157e-07, "logits/chosen": -1.1195604801177979, "logits/rejected": -1.1153167486190796, "logps/chosen": -59.811885833740234, "logps/rejected": -68.43907165527344, "loss": 0.1927, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1706525832414627, "rewards/margins": 3.3727633953094482, "rewards/rejected": -3.5434162616729736, "step": 486 }, { "epoch": 2.885925925925926, "grad_norm": 21.673088334063607, "learning_rate": 4.02512270886037e-07, "logits/chosen": -1.2593806982040405, "logits/rejected": -1.341259479522705, "logps/chosen": -59.390708923339844, "logps/rejected": -57.16114807128906, "loss": 0.2189, "rewards/accuracies": 0.8125, "rewards/chosen": -0.01359531283378601, "rewards/margins": 2.733617067337036, "rewards/rejected": -2.7472124099731445, "step": 487 }, { "epoch": 2.891851851851852, "grad_norm": 22.61643853619251, "learning_rate": 4.01997016338005e-07, "logits/chosen": -1.2761664390563965, "logits/rejected": -1.3337078094482422, "logps/chosen": -48.623069763183594, "logps/rejected": -64.20046997070312, "loss": 0.2249, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5526472926139832, "rewards/margins": 3.5587265491485596, "rewards/rejected": -4.1113739013671875, "step": 488 }, { "epoch": 2.897777777777778, "grad_norm": 23.269686558492296, "learning_rate": 4.014807354719342e-07, "logits/chosen": -1.4048537015914917, "logits/rejected": -1.4099326133728027, "logps/chosen": -46.07366180419922, "logps/rejected": -51.41790771484375, "loss": 0.2332, "rewards/accuracies": 0.875, "rewards/chosen": -0.08646069467067719, "rewards/margins": 3.1128947734832764, "rewards/rejected": -3.1993556022644043, "step": 489 }, { "epoch": 2.9037037037037035, "grad_norm": 22.472991628056597, "learning_rate": 4.00963431773869e-07, "logits/chosen": -1.27083420753479, "logits/rejected": -1.3645908832550049, "logps/chosen": -41.91563415527344, "logps/rejected": -57.03092956542969, "loss": 0.3048, "rewards/accuracies": 1.0, "rewards/chosen": 0.1558661013841629, "rewards/margins": 2.5972371101379395, "rewards/rejected": -2.441370964050293, "step": 490 }, { "epoch": 2.9096296296296296, "grad_norm": 18.32253442612832, "learning_rate": 4.0044510873676043e-07, "logits/chosen": -1.3212807178497314, "logits/rejected": -1.336035966873169, "logps/chosen": -58.66569519042969, "logps/rejected": -66.00804138183594, "loss": 0.2009, "rewards/accuracies": 0.875, "rewards/chosen": -0.2158641666173935, "rewards/margins": 3.0606842041015625, "rewards/rejected": -3.2765486240386963, "step": 491 }, { "epoch": 2.9155555555555557, "grad_norm": 16.26649128609091, "learning_rate": 3.9992576986044223e-07, "logits/chosen": -1.209001064300537, "logits/rejected": -1.2842363119125366, "logps/chosen": -52.691688537597656, "logps/rejected": -74.82657623291016, "loss": 0.1554, "rewards/accuracies": 0.9375, "rewards/chosen": -0.620505690574646, "rewards/margins": 3.7764885425567627, "rewards/rejected": -4.396994113922119, "step": 492 }, { "epoch": 2.9214814814814813, "grad_norm": 17.610253600287994, "learning_rate": 3.9940541865160726e-07, "logits/chosen": -1.2993587255477905, "logits/rejected": -1.4684174060821533, "logps/chosen": -50.94914627075195, "logps/rejected": -60.80756378173828, "loss": 0.2051, "rewards/accuracies": 1.0, "rewards/chosen": -0.06892414391040802, "rewards/margins": 2.141951560974121, "rewards/rejected": -2.2108757495880127, "step": 493 }, { "epoch": 2.9274074074074075, "grad_norm": 15.568340315134158, "learning_rate": 3.9888405862378395e-07, "logits/chosen": -1.303976058959961, "logits/rejected": -1.3029195070266724, "logps/chosen": -59.768001556396484, "logps/rejected": -63.35868835449219, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": -0.16178204119205475, "rewards/margins": 2.961404323577881, "rewards/rejected": -3.1231861114501953, "step": 494 }, { "epoch": 2.9333333333333336, "grad_norm": 21.221679180562266, "learning_rate": 3.983616932973124e-07, "logits/chosen": -1.273141622543335, "logits/rejected": -1.334914207458496, "logps/chosen": -44.11486053466797, "logps/rejected": -54.946693420410156, "loss": 0.2123, "rewards/accuracies": 0.875, "rewards/chosen": -0.39918532967567444, "rewards/margins": 1.510643482208252, "rewards/rejected": -1.909828782081604, "step": 495 }, { "epoch": 2.9392592592592592, "grad_norm": 18.3522337117436, "learning_rate": 3.9783832619932076e-07, "logits/chosen": -1.2553412914276123, "logits/rejected": -1.291959524154663, "logps/chosen": -46.14884567260742, "logps/rejected": -56.9810676574707, "loss": 0.219, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5942877531051636, "rewards/margins": 2.607400894165039, "rewards/rejected": -3.201688766479492, "step": 496 }, { "epoch": 2.9451851851851854, "grad_norm": 16.373090407512134, "learning_rate": 3.973139608637015e-07, "logits/chosen": -1.1282140016555786, "logits/rejected": -1.0897331237792969, "logps/chosen": -47.96284484863281, "logps/rejected": -61.69994354248047, "loss": 0.1839, "rewards/accuracies": 1.0, "rewards/chosen": 0.056502558290958405, "rewards/margins": 2.3940765857696533, "rewards/rejected": -2.337573766708374, "step": 497 }, { "epoch": 2.951111111111111, "grad_norm": 17.62403657778311, "learning_rate": 3.9678860083108713e-07, "logits/chosen": -1.2530139684677124, "logits/rejected": -1.2804282903671265, "logps/chosen": -44.19329833984375, "logps/rejected": -63.33118438720703, "loss": 0.2327, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03126450255513191, "rewards/margins": 3.057286500930786, "rewards/rejected": -3.026021957397461, "step": 498 }, { "epoch": 2.957037037037037, "grad_norm": 18.150598366324193, "learning_rate": 3.9626224964882685e-07, "logits/chosen": -1.35397469997406, "logits/rejected": -1.3743972778320312, "logps/chosen": -46.631126403808594, "logps/rejected": -56.481590270996094, "loss": 0.1899, "rewards/accuracies": 0.875, "rewards/chosen": -0.030939504504203796, "rewards/margins": 2.582378625869751, "rewards/rejected": -2.61331844329834, "step": 499 }, { "epoch": 2.962962962962963, "grad_norm": 17.40782024999203, "learning_rate": 3.957349108709623e-07, "logits/chosen": -1.050638198852539, "logits/rejected": -1.0757251977920532, "logps/chosen": -48.303932189941406, "logps/rejected": -63.3568115234375, "loss": 0.1925, "rewards/accuracies": 0.875, "rewards/chosen": -0.251191109418869, "rewards/margins": 2.1156184673309326, "rewards/rejected": -2.366809368133545, "step": 500 }, { "epoch": 2.968888888888889, "grad_norm": 12.57466221715435, "learning_rate": 3.9520658805820335e-07, "logits/chosen": -1.2836908102035522, "logits/rejected": -1.3424688577651978, "logps/chosen": -56.20665740966797, "logps/rejected": -69.99337005615234, "loss": 0.1532, "rewards/accuracies": 1.0, "rewards/chosen": -0.191994309425354, "rewards/margins": 4.468802452087402, "rewards/rejected": -4.660797119140625, "step": 501 }, { "epoch": 2.974814814814815, "grad_norm": 17.441545006331303, "learning_rate": 3.946772847779045e-07, "logits/chosen": -1.265354037284851, "logits/rejected": -1.2552366256713867, "logps/chosen": -44.5283088684082, "logps/rejected": -45.96354675292969, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 0.26437434554100037, "rewards/margins": 2.901440382003784, "rewards/rejected": -2.637066125869751, "step": 502 }, { "epoch": 2.9807407407407407, "grad_norm": 18.271861903291637, "learning_rate": 3.941470046040406e-07, "logits/chosen": -1.3075486421585083, "logits/rejected": -1.3464792966842651, "logps/chosen": -53.6762580871582, "logps/rejected": -57.431610107421875, "loss": 0.1841, "rewards/accuracies": 1.0, "rewards/chosen": 0.07351833581924438, "rewards/margins": 2.5279555320739746, "rewards/rejected": -2.454437017440796, "step": 503 }, { "epoch": 2.986666666666667, "grad_norm": 19.426242018024382, "learning_rate": 3.936157511171826e-07, "logits/chosen": -1.1142933368682861, "logits/rejected": -1.1934661865234375, "logps/chosen": -40.239479064941406, "logps/rejected": -63.111671447753906, "loss": 0.2169, "rewards/accuracies": 1.0, "rewards/chosen": -0.24999532103538513, "rewards/margins": 3.7723803520202637, "rewards/rejected": -4.022375583648682, "step": 504 }, { "epoch": 2.9925925925925925, "grad_norm": 23.27473360631721, "learning_rate": 3.9308352790447354e-07, "logits/chosen": -1.2489938735961914, "logits/rejected": -1.3289551734924316, "logps/chosen": -44.65389633178711, "logps/rejected": -60.861324310302734, "loss": 0.2433, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0023244433104991913, "rewards/margins": 2.8981451988220215, "rewards/rejected": -2.9004695415496826, "step": 505 }, { "epoch": 2.9985185185185186, "grad_norm": 16.78764710351435, "learning_rate": 3.9255033855960414e-07, "logits/chosen": -1.3627394437789917, "logits/rejected": -1.4329453706741333, "logps/chosen": -35.57086181640625, "logps/rejected": -71.59107971191406, "loss": 0.222, "rewards/accuracies": 0.875, "rewards/chosen": -0.22746825218200684, "rewards/margins": 2.67952299118042, "rewards/rejected": -2.906991481781006, "step": 506 }, { "epoch": 3.0044444444444443, "grad_norm": 15.51252784783785, "learning_rate": 3.920161866827889e-07, "logits/chosen": -1.2271157503128052, "logits/rejected": -1.2314724922180176, "logps/chosen": -42.860347747802734, "logps/rejected": -63.78620910644531, "loss": 0.1722, "rewards/accuracies": 0.9375, "rewards/chosen": -0.39318573474884033, "rewards/margins": 3.201770067214966, "rewards/rejected": -3.5949559211730957, "step": 507 }, { "epoch": 3.0103703703703704, "grad_norm": 15.348034664152413, "learning_rate": 3.914810758807414e-07, "logits/chosen": -1.266876459121704, "logits/rejected": -1.201805591583252, "logps/chosen": -39.3177375793457, "logps/rejected": -63.72394561767578, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": 0.21385174989700317, "rewards/margins": 2.7866299152374268, "rewards/rejected": -2.5727782249450684, "step": 508 }, { "epoch": 3.0162962962962965, "grad_norm": 13.694371117456312, "learning_rate": 3.9094500976665025e-07, "logits/chosen": -1.4511996507644653, "logits/rejected": -1.5082242488861084, "logps/chosen": -45.20631408691406, "logps/rejected": -60.80164337158203, "loss": 0.1663, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0012191124260425568, "rewards/margins": 2.3622097969055176, "rewards/rejected": -2.360990524291992, "step": 509 }, { "epoch": 3.022222222222222, "grad_norm": 9.608073686288138, "learning_rate": 3.904079919601542e-07, "logits/chosen": -1.1920677423477173, "logits/rejected": -1.1335797309875488, "logps/chosen": -48.81990432739258, "logps/rejected": -70.8568115234375, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": -0.3464869260787964, "rewards/margins": 4.490400791168213, "rewards/rejected": -4.836887359619141, "step": 510 }, { "epoch": 3.0281481481481483, "grad_norm": 10.659649188493658, "learning_rate": 3.898700260873182e-07, "logits/chosen": -1.3865935802459717, "logits/rejected": -1.4555929899215698, "logps/chosen": -46.135467529296875, "logps/rejected": -54.744056701660156, "loss": 0.1269, "rewards/accuracies": 1.0, "rewards/chosen": 0.39444661140441895, "rewards/margins": 2.6518468856811523, "rewards/rejected": -2.2574002742767334, "step": 511 }, { "epoch": 3.034074074074074, "grad_norm": 14.922676910008562, "learning_rate": 3.893311157806091e-07, "logits/chosen": -1.30184805393219, "logits/rejected": -1.3149746656417847, "logps/chosen": -54.556312561035156, "logps/rejected": -60.76716995239258, "loss": 0.1527, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7763004302978516, "rewards/margins": 2.673186779022217, "rewards/rejected": -3.4494872093200684, "step": 512 }, { "epoch": 3.04, "grad_norm": 13.25601338827941, "learning_rate": 3.887912646788703e-07, "logits/chosen": -1.3052939176559448, "logits/rejected": -1.2798081636428833, "logps/chosen": -46.84373474121094, "logps/rejected": -72.25546264648438, "loss": 0.152, "rewards/accuracies": 0.9375, "rewards/chosen": -0.09142529964447021, "rewards/margins": 3.686022996902466, "rewards/rejected": -3.7774481773376465, "step": 513 }, { "epoch": 3.0459259259259257, "grad_norm": 11.896910317905705, "learning_rate": 3.882504764272979e-07, "logits/chosen": -1.1876428127288818, "logits/rejected": -1.219206690788269, "logps/chosen": -48.874366760253906, "logps/rejected": -79.81904602050781, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": -0.36831003427505493, "rewards/margins": 2.9127964973449707, "rewards/rejected": -3.281106472015381, "step": 514 }, { "epoch": 3.051851851851852, "grad_norm": 11.704556833201291, "learning_rate": 3.8770875467741577e-07, "logits/chosen": -1.3152544498443604, "logits/rejected": -1.3558157682418823, "logps/chosen": -47.62205505371094, "logps/rejected": -78.95779418945312, "loss": 0.1424, "rewards/accuracies": 1.0, "rewards/chosen": 0.07077004760503769, "rewards/margins": 3.3393805027008057, "rewards/rejected": -3.2686104774475098, "step": 515 }, { "epoch": 3.057777777777778, "grad_norm": 11.04510617201905, "learning_rate": 3.871661030870511e-07, "logits/chosen": -1.2623419761657715, "logits/rejected": -1.2585841417312622, "logps/chosen": -53.519046783447266, "logps/rejected": -77.24893188476562, "loss": 0.1223, "rewards/accuracies": 1.0, "rewards/chosen": -0.017071455717086792, "rewards/margins": 4.162660598754883, "rewards/rejected": -4.179732322692871, "step": 516 }, { "epoch": 3.0637037037037036, "grad_norm": 13.147383461961375, "learning_rate": 3.866225253203093e-07, "logits/chosen": -1.2138491868972778, "logits/rejected": -1.2417991161346436, "logps/chosen": -52.79689025878906, "logps/rejected": -65.50883483886719, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": -0.27549540996551514, "rewards/margins": 3.84724497795105, "rewards/rejected": -4.122740745544434, "step": 517 }, { "epoch": 3.0696296296296297, "grad_norm": 12.707877291174364, "learning_rate": 3.8607802504754984e-07, "logits/chosen": -1.1168802976608276, "logits/rejected": -1.300939679145813, "logps/chosen": -55.39178466796875, "logps/rejected": -69.42161560058594, "loss": 0.1352, "rewards/accuracies": 1.0, "rewards/chosen": -0.18456672132015228, "rewards/margins": 3.3074283599853516, "rewards/rejected": -3.491994857788086, "step": 518 }, { "epoch": 3.0755555555555554, "grad_norm": 12.295407374509045, "learning_rate": 3.85532605945361e-07, "logits/chosen": -1.1297457218170166, "logits/rejected": -1.1549437046051025, "logps/chosen": -58.893798828125, "logps/rejected": -63.923583984375, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": 0.11424604058265686, "rewards/margins": 2.7826895713806152, "rewards/rejected": -2.668443441390991, "step": 519 }, { "epoch": 3.0814814814814815, "grad_norm": 14.285195292921095, "learning_rate": 3.849862716965352e-07, "logits/chosen": -1.2393442392349243, "logits/rejected": -1.2823017835617065, "logps/chosen": -54.41102600097656, "logps/rejected": -80.21492004394531, "loss": 0.134, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5927256345748901, "rewards/margins": 4.974529266357422, "rewards/rejected": -5.567255020141602, "step": 520 }, { "epoch": 3.0874074074074076, "grad_norm": 15.778972893153691, "learning_rate": 3.8443902599004406e-07, "logits/chosen": -1.4329743385314941, "logits/rejected": -1.4719727039337158, "logps/chosen": -40.67536163330078, "logps/rejected": -61.614952087402344, "loss": 0.1758, "rewards/accuracies": 0.9375, "rewards/chosen": 0.005242794752120972, "rewards/margins": 2.597073554992676, "rewards/rejected": -2.5918307304382324, "step": 521 }, { "epoch": 3.0933333333333333, "grad_norm": 11.518639728252891, "learning_rate": 3.8389087252101395e-07, "logits/chosen": -1.2456588745117188, "logits/rejected": -1.2938756942749023, "logps/chosen": -44.22534942626953, "logps/rejected": -56.13856506347656, "loss": 0.1464, "rewards/accuracies": 1.0, "rewards/chosen": -0.9335315227508545, "rewards/margins": 2.4933667182922363, "rewards/rejected": -3.426898241043091, "step": 522 }, { "epoch": 3.0992592592592594, "grad_norm": 14.585493614282901, "learning_rate": 3.833418149907001e-07, "logits/chosen": -1.246716022491455, "logits/rejected": -1.2304835319519043, "logps/chosen": -63.897918701171875, "logps/rejected": -69.18603515625, "loss": 0.1847, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5471602082252502, "rewards/margins": 3.732401132583618, "rewards/rejected": -4.2795610427856445, "step": 523 }, { "epoch": 3.105185185185185, "grad_norm": 12.956893274097098, "learning_rate": 3.827918571064626e-07, "logits/chosen": -1.2984254360198975, "logits/rejected": -1.4002485275268555, "logps/chosen": -48.713111877441406, "logps/rejected": -60.62818145751953, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 0.053312137722969055, "rewards/margins": 2.424069881439209, "rewards/rejected": -2.370757818222046, "step": 524 }, { "epoch": 3.111111111111111, "grad_norm": 13.250412997131157, "learning_rate": 3.822410025817406e-07, "logits/chosen": -1.3015064001083374, "logits/rejected": -1.3865606784820557, "logps/chosen": -45.77219009399414, "logps/rejected": -60.43882751464844, "loss": 0.1451, "rewards/accuracies": 0.9375, "rewards/chosen": -0.16919779777526855, "rewards/margins": 2.377150297164917, "rewards/rejected": -2.5463480949401855, "step": 525 }, { "epoch": 3.117037037037037, "grad_norm": 12.410375375745177, "learning_rate": 3.816892551360279e-07, "logits/chosen": -1.3042594194412231, "logits/rejected": -1.4607019424438477, "logps/chosen": -54.673851013183594, "logps/rejected": -97.9752426147461, "loss": 0.1133, "rewards/accuracies": 1.0, "rewards/chosen": -0.16998320817947388, "rewards/margins": 5.217333793640137, "rewards/rejected": -5.387316703796387, "step": 526 }, { "epoch": 3.122962962962963, "grad_norm": 13.928112395373121, "learning_rate": 3.8113661849484723e-07, "logits/chosen": -1.2873992919921875, "logits/rejected": -1.3018040657043457, "logps/chosen": -47.664215087890625, "logps/rejected": -62.4625129699707, "loss": 0.1334, "rewards/accuracies": 0.875, "rewards/chosen": -0.14091503620147705, "rewards/margins": 2.199439764022827, "rewards/rejected": -2.3403549194335938, "step": 527 }, { "epoch": 3.128888888888889, "grad_norm": 12.75367411003303, "learning_rate": 3.805830963897256e-07, "logits/chosen": -1.0977472066879272, "logits/rejected": -1.1910130977630615, "logps/chosen": -49.190704345703125, "logps/rejected": -104.37145233154297, "loss": 0.1236, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3834270238876343, "rewards/margins": 5.602377414703369, "rewards/rejected": -5.985805034637451, "step": 528 }, { "epoch": 3.1348148148148147, "grad_norm": 14.210812786624423, "learning_rate": 3.8002869255816873e-07, "logits/chosen": -1.2921619415283203, "logits/rejected": -1.3322010040283203, "logps/chosen": -62.789669036865234, "logps/rejected": -69.41814422607422, "loss": 0.1635, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5743184685707092, "rewards/margins": 3.2138500213623047, "rewards/rejected": -3.788168430328369, "step": 529 }, { "epoch": 3.140740740740741, "grad_norm": 10.800120698981736, "learning_rate": 3.7947341074363593e-07, "logits/chosen": -1.2333744764328003, "logits/rejected": -1.1910183429718018, "logps/chosen": -51.27862548828125, "logps/rejected": -69.56024169921875, "loss": 0.098, "rewards/accuracies": 1.0, "rewards/chosen": -0.18978382647037506, "rewards/margins": 3.8493576049804688, "rewards/rejected": -4.0391411781311035, "step": 530 }, { "epoch": 3.1466666666666665, "grad_norm": 12.837286803004636, "learning_rate": 3.7891725469551485e-07, "logits/chosen": -1.2333027124404907, "logits/rejected": -1.3354871273040771, "logps/chosen": -39.167945861816406, "logps/rejected": -56.154727935791016, "loss": 0.1721, "rewards/accuracies": 0.875, "rewards/chosen": -0.12276704609394073, "rewards/margins": 4.041677474975586, "rewards/rejected": -4.164444446563721, "step": 531 }, { "epoch": 3.1525925925925926, "grad_norm": 9.733353466294954, "learning_rate": 3.783602281690963e-07, "logits/chosen": -1.1893879175186157, "logits/rejected": -1.185511589050293, "logps/chosen": -41.93305969238281, "logps/rejected": -70.7315444946289, "loss": 0.1172, "rewards/accuracies": 1.0, "rewards/chosen": 0.0085669606924057, "rewards/margins": 3.666670799255371, "rewards/rejected": -3.6581034660339355, "step": 532 }, { "epoch": 3.1585185185185187, "grad_norm": 11.565308138418079, "learning_rate": 3.7780233492554856e-07, "logits/chosen": -1.1588068008422852, "logits/rejected": -1.2201921939849854, "logps/chosen": -45.38616180419922, "logps/rejected": -61.572021484375, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 0.13040557503700256, "rewards/margins": 4.217648029327393, "rewards/rejected": -4.087242126464844, "step": 533 }, { "epoch": 3.1644444444444444, "grad_norm": 12.889495915162684, "learning_rate": 3.7724357873189244e-07, "logits/chosen": -1.2265737056732178, "logits/rejected": -1.232682466506958, "logps/chosen": -46.15911102294922, "logps/rejected": -54.5587043762207, "loss": 0.1355, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24472272396087646, "rewards/margins": 3.2570977210998535, "rewards/rejected": -3.5018205642700195, "step": 534 }, { "epoch": 3.1703703703703705, "grad_norm": 12.45954730910451, "learning_rate": 3.766839633609753e-07, "logits/chosen": -1.2919063568115234, "logits/rejected": -1.3215129375457764, "logps/chosen": -50.997650146484375, "logps/rejected": -54.30298614501953, "loss": 0.1395, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3067464828491211, "rewards/margins": 3.446145534515381, "rewards/rejected": -3.1393988132476807, "step": 535 }, { "epoch": 3.176296296296296, "grad_norm": 11.473128142055426, "learning_rate": 3.761234925914459e-07, "logits/chosen": -1.332575798034668, "logits/rejected": -1.3540210723876953, "logps/chosen": -49.996734619140625, "logps/rejected": -66.5724105834961, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": 0.3274403214454651, "rewards/margins": 3.3128936290740967, "rewards/rejected": -2.9854533672332764, "step": 536 }, { "epoch": 3.1822222222222223, "grad_norm": 11.657192523402523, "learning_rate": 3.755621702077292e-07, "logits/chosen": -1.2775133848190308, "logits/rejected": -1.3381080627441406, "logps/chosen": -49.6759033203125, "logps/rejected": -67.08674621582031, "loss": 0.1286, "rewards/accuracies": 1.0, "rewards/chosen": -0.738972544670105, "rewards/margins": 4.289883613586426, "rewards/rejected": -5.02885627746582, "step": 537 }, { "epoch": 3.188148148148148, "grad_norm": 11.299765875660642, "learning_rate": 3.75e-07, "logits/chosen": -1.3044968843460083, "logits/rejected": -1.3229069709777832, "logps/chosen": -44.54370880126953, "logps/rejected": -59.271095275878906, "loss": 0.1366, "rewards/accuracies": 1.0, "rewards/chosen": -0.5054673552513123, "rewards/margins": 3.5005180835723877, "rewards/rejected": -4.005985260009766, "step": 538 }, { "epoch": 3.194074074074074, "grad_norm": 14.278579624361138, "learning_rate": 3.7443698576415795e-07, "logits/chosen": -1.2247653007507324, "logits/rejected": -1.2305885553359985, "logps/chosen": -67.3138656616211, "logps/rejected": -70.22055053710938, "loss": 0.1701, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1986965537071228, "rewards/margins": 3.648164749145508, "rewards/rejected": -3.4494681358337402, "step": 539 }, { "epoch": 3.2, "grad_norm": 11.732507351761482, "learning_rate": 3.738731313018019e-07, "logits/chosen": -1.119322657585144, "logits/rejected": -1.2076921463012695, "logps/chosen": -50.44218063354492, "logps/rejected": -62.570045471191406, "loss": 0.1377, "rewards/accuracies": 1.0, "rewards/chosen": 0.37010127305984497, "rewards/margins": 3.5672857761383057, "rewards/rejected": -3.1971845626831055, "step": 540 }, { "epoch": 3.205925925925926, "grad_norm": 15.017170781434197, "learning_rate": 3.7330844042020384e-07, "logits/chosen": -1.229867696762085, "logits/rejected": -1.2855830192565918, "logps/chosen": -48.04343795776367, "logps/rejected": -67.48896026611328, "loss": 0.1687, "rewards/accuracies": 0.875, "rewards/chosen": 0.3234538435935974, "rewards/margins": 2.7441210746765137, "rewards/rejected": -2.4206674098968506, "step": 541 }, { "epoch": 3.211851851851852, "grad_norm": 15.007372624369156, "learning_rate": 3.727429169322837e-07, "logits/chosen": -1.2071201801300049, "logits/rejected": -1.2570191621780396, "logps/chosen": -39.118656158447266, "logps/rejected": -54.70764923095703, "loss": 0.1644, "rewards/accuracies": 1.0, "rewards/chosen": -0.36125683784484863, "rewards/margins": 2.9558732509613037, "rewards/rejected": -3.3171298503875732, "step": 542 }, { "epoch": 3.2177777777777776, "grad_norm": 11.308523940892284, "learning_rate": 3.721765646565833e-07, "logits/chosen": -1.3717918395996094, "logits/rejected": -1.4021085500717163, "logps/chosen": -47.05071258544922, "logps/rejected": -76.36489868164062, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": -0.06609348952770233, "rewards/margins": 3.5705413818359375, "rewards/rejected": -3.636634588241577, "step": 543 }, { "epoch": 3.2237037037037037, "grad_norm": 11.281320443508063, "learning_rate": 3.7160938741724057e-07, "logits/chosen": -1.2979824542999268, "logits/rejected": -1.3822671175003052, "logps/chosen": -49.72755432128906, "logps/rejected": -57.613616943359375, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": -0.24122706055641174, "rewards/margins": 3.2609100341796875, "rewards/rejected": -3.5021369457244873, "step": 544 }, { "epoch": 3.2296296296296294, "grad_norm": 12.113133814789371, "learning_rate": 3.7104138904396374e-07, "logits/chosen": -1.153433918952942, "logits/rejected": -1.2310869693756104, "logps/chosen": -55.73785400390625, "logps/rejected": -67.78173065185547, "loss": 0.1102, "rewards/accuracies": 1.0, "rewards/chosen": 0.09717914462089539, "rewards/margins": 4.385678768157959, "rewards/rejected": -4.288499355316162, "step": 545 }, { "epoch": 3.2355555555555555, "grad_norm": 11.882489459481814, "learning_rate": 3.704725733720055e-07, "logits/chosen": -1.076989769935608, "logits/rejected": -1.236989140510559, "logps/chosen": -54.22359848022461, "logps/rejected": -83.63075256347656, "loss": 0.1262, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09978058934211731, "rewards/margins": 3.196918249130249, "rewards/rejected": -3.097137689590454, "step": 546 }, { "epoch": 3.2414814814814816, "grad_norm": 13.358445845497405, "learning_rate": 3.699029442421374e-07, "logits/chosen": -1.0463993549346924, "logits/rejected": -1.1902413368225098, "logps/chosen": -48.361671447753906, "logps/rejected": -65.42289733886719, "loss": 0.1674, "rewards/accuracies": 1.0, "rewards/chosen": 0.02705952525138855, "rewards/margins": 3.5172808170318604, "rewards/rejected": -3.4902215003967285, "step": 547 }, { "epoch": 3.2474074074074073, "grad_norm": 15.253492858367455, "learning_rate": 3.693325055006232e-07, "logits/chosen": -1.225471019744873, "logits/rejected": -1.3131675720214844, "logps/chosen": -39.747589111328125, "logps/rejected": -58.30244064331055, "loss": 0.1664, "rewards/accuracies": 1.0, "rewards/chosen": -0.2591007649898529, "rewards/margins": 2.7843737602233887, "rewards/rejected": -3.0434746742248535, "step": 548 }, { "epoch": 3.2533333333333334, "grad_norm": 10.441244482245853, "learning_rate": 3.6876126099919373e-07, "logits/chosen": -1.1871273517608643, "logits/rejected": -1.192123293876648, "logps/chosen": -40.08260726928711, "logps/rejected": -58.24555969238281, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": -0.014481008052825928, "rewards/margins": 3.1415109634399414, "rewards/rejected": -3.155992031097412, "step": 549 }, { "epoch": 3.259259259259259, "grad_norm": 13.406182971514847, "learning_rate": 3.681892145950203e-07, "logits/chosen": -1.1578247547149658, "logits/rejected": -1.1346076726913452, "logps/chosen": -46.344505310058594, "logps/rejected": -69.93978881835938, "loss": 0.1418, "rewards/accuracies": 1.0, "rewards/chosen": -0.10121719539165497, "rewards/margins": 3.64451003074646, "rewards/rejected": -3.745727062225342, "step": 550 }, { "epoch": 3.265185185185185, "grad_norm": 8.97913278009079, "learning_rate": 3.6761637015068893e-07, "logits/chosen": -1.1909668445587158, "logits/rejected": -1.2579350471496582, "logps/chosen": -55.003353118896484, "logps/rejected": -74.0880126953125, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": -0.34160882234573364, "rewards/margins": 4.481330871582031, "rewards/rejected": -4.822939872741699, "step": 551 }, { "epoch": 3.2711111111111113, "grad_norm": 14.211347383837909, "learning_rate": 3.67042731534174e-07, "logits/chosen": -1.1668155193328857, "logits/rejected": -1.2287236452102661, "logps/chosen": -44.797523498535156, "logps/rejected": -67.50494384765625, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": 0.12582820653915405, "rewards/margins": 3.9675216674804688, "rewards/rejected": -3.841693639755249, "step": 552 }, { "epoch": 3.277037037037037, "grad_norm": 12.272261458182058, "learning_rate": 3.6646830261881263e-07, "logits/chosen": -1.38411545753479, "logits/rejected": -1.3050222396850586, "logps/chosen": -63.80104446411133, "logps/rejected": -86.18976593017578, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 0.24508556723594666, "rewards/margins": 4.132302761077881, "rewards/rejected": -3.8872170448303223, "step": 553 }, { "epoch": 3.282962962962963, "grad_norm": 12.169816292887981, "learning_rate": 3.6589308728327797e-07, "logits/chosen": -1.3022029399871826, "logits/rejected": -1.2084033489227295, "logps/chosen": -57.94591522216797, "logps/rejected": -66.00154113769531, "loss": 0.1431, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10055610537528992, "rewards/margins": 3.479001045227051, "rewards/rejected": -3.579557418823242, "step": 554 }, { "epoch": 3.2888888888888888, "grad_norm": 15.04773541364506, "learning_rate": 3.653170894115533e-07, "logits/chosen": -1.208168625831604, "logits/rejected": -1.172544002532959, "logps/chosen": -45.36444854736328, "logps/rejected": -58.140079498291016, "loss": 0.1705, "rewards/accuracies": 1.0, "rewards/chosen": 0.28003713488578796, "rewards/margins": 3.3673346042633057, "rewards/rejected": -3.0872974395751953, "step": 555 }, { "epoch": 3.294814814814815, "grad_norm": 12.022675358094785, "learning_rate": 3.6474031289290586e-07, "logits/chosen": -1.2167534828186035, "logits/rejected": -1.2709438800811768, "logps/chosen": -38.76374053955078, "logps/rejected": -63.62835693359375, "loss": 0.1215, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2464836686849594, "rewards/margins": 3.099431276321411, "rewards/rejected": -2.852947473526001, "step": 556 }, { "epoch": 3.300740740740741, "grad_norm": 15.48226246819183, "learning_rate": 3.641627616218603e-07, "logits/chosen": -1.17167329788208, "logits/rejected": -1.1303136348724365, "logps/chosen": -51.64458465576172, "logps/rejected": -52.03526306152344, "loss": 0.1601, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07310710102319717, "rewards/margins": 2.5615978240966797, "rewards/rejected": -2.634705066680908, "step": 557 }, { "epoch": 3.3066666666666666, "grad_norm": 12.763142895021037, "learning_rate": 3.6358443949817283e-07, "logits/chosen": -1.2829394340515137, "logits/rejected": -1.3682385683059692, "logps/chosen": -68.91250610351562, "logps/rejected": -64.79576110839844, "loss": 0.1504, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4208144545555115, "rewards/margins": 2.884341239929199, "rewards/rejected": -3.3051557540893555, "step": 558 }, { "epoch": 3.3125925925925928, "grad_norm": 10.699144281049568, "learning_rate": 3.630053504268046e-07, "logits/chosen": -1.2629507780075073, "logits/rejected": -1.2774531841278076, "logps/chosen": -56.757015228271484, "logps/rejected": -52.96363067626953, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 0.004754260182380676, "rewards/margins": 2.7854652404785156, "rewards/rejected": -2.7807109355926514, "step": 559 }, { "epoch": 3.3185185185185184, "grad_norm": 12.32948309194096, "learning_rate": 3.62425498317895e-07, "logits/chosen": -1.3774765729904175, "logits/rejected": -1.551738977432251, "logps/chosen": -46.59605026245117, "logps/rejected": -66.0199203491211, "loss": 0.1435, "rewards/accuracies": 1.0, "rewards/chosen": 0.2962900400161743, "rewards/margins": 3.7435803413391113, "rewards/rejected": -3.4472906589508057, "step": 560 }, { "epoch": 3.3244444444444445, "grad_norm": 10.068740616602673, "learning_rate": 3.6184488708673597e-07, "logits/chosen": -1.3757305145263672, "logits/rejected": -1.4594712257385254, "logps/chosen": -47.44297409057617, "logps/rejected": -66.62445068359375, "loss": 0.1161, "rewards/accuracies": 1.0, "rewards/chosen": -0.8080825805664062, "rewards/margins": 2.998539686203003, "rewards/rejected": -3.8066225051879883, "step": 561 }, { "epoch": 3.33037037037037, "grad_norm": 11.189446984327448, "learning_rate": 3.6126352065374517e-07, "logits/chosen": -1.2296010255813599, "logits/rejected": -1.3213616609573364, "logps/chosen": -52.5794792175293, "logps/rejected": -68.11521911621094, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": -0.08329074084758759, "rewards/margins": 3.181485652923584, "rewards/rejected": -3.2647767066955566, "step": 562 }, { "epoch": 3.3362962962962963, "grad_norm": 10.767209581812672, "learning_rate": 3.6068140294443943e-07, "logits/chosen": -1.299060583114624, "logits/rejected": -1.3229368925094604, "logps/chosen": -45.46562194824219, "logps/rejected": -58.185638427734375, "loss": 0.1147, "rewards/accuracies": 1.0, "rewards/chosen": -0.24200953543186188, "rewards/margins": 2.517746925354004, "rewards/rejected": -2.759756565093994, "step": 563 }, { "epoch": 3.3422222222222224, "grad_norm": 13.067270633778744, "learning_rate": 3.6009853788940856e-07, "logits/chosen": -1.3421900272369385, "logits/rejected": -1.3259530067443848, "logps/chosen": -48.29127502441406, "logps/rejected": -53.28279113769531, "loss": 0.1491, "rewards/accuracies": 0.875, "rewards/chosen": -0.6294746398925781, "rewards/margins": 2.588189125061035, "rewards/rejected": -3.2176637649536133, "step": 564 }, { "epoch": 3.348148148148148, "grad_norm": 15.895189856288384, "learning_rate": 3.595149294242884e-07, "logits/chosen": -1.0856770277023315, "logits/rejected": -1.1430692672729492, "logps/chosen": -46.576560974121094, "logps/rejected": -59.8853759765625, "loss": 0.1596, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09673969447612762, "rewards/margins": 3.5400326251983643, "rewards/rejected": -3.4432926177978516, "step": 565 }, { "epoch": 3.354074074074074, "grad_norm": 11.329858917681372, "learning_rate": 3.589305814897346e-07, "logits/chosen": -1.1583616733551025, "logits/rejected": -1.3731303215026855, "logps/chosen": -48.65069580078125, "logps/rejected": -69.706787109375, "loss": 0.1231, "rewards/accuracies": 1.0, "rewards/chosen": -0.2624179422855377, "rewards/margins": 4.3107147216796875, "rewards/rejected": -4.5731329917907715, "step": 566 }, { "epoch": 3.36, "grad_norm": 10.945781896079705, "learning_rate": 3.5834549803139586e-07, "logits/chosen": -1.179014801979065, "logits/rejected": -1.225006103515625, "logps/chosen": -47.45325469970703, "logps/rejected": -54.36988067626953, "loss": 0.1138, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8625340461730957, "rewards/margins": 3.055980682373047, "rewards/rejected": -3.9185147285461426, "step": 567 }, { "epoch": 3.365925925925926, "grad_norm": 13.426424155571501, "learning_rate": 3.5775968299988725e-07, "logits/chosen": -1.2716865539550781, "logits/rejected": -1.3909945487976074, "logps/chosen": -47.59417724609375, "logps/rejected": -87.17699432373047, "loss": 0.1201, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4128919243812561, "rewards/margins": 4.454270362854004, "rewards/rejected": -4.867162227630615, "step": 568 }, { "epoch": 3.3718518518518517, "grad_norm": 12.610504058042062, "learning_rate": 3.571731403507635e-07, "logits/chosen": -1.1148130893707275, "logits/rejected": -1.1845794916152954, "logps/chosen": -41.87155532836914, "logps/rejected": -59.9697380065918, "loss": 0.1297, "rewards/accuracies": 1.0, "rewards/chosen": 0.17576763033866882, "rewards/margins": 3.7137932777404785, "rewards/rejected": -3.5380258560180664, "step": 569 }, { "epoch": 3.3777777777777778, "grad_norm": 14.308781328176961, "learning_rate": 3.565858740444927e-07, "logits/chosen": -1.2425477504730225, "logits/rejected": -1.251265525817871, "logps/chosen": -37.937618255615234, "logps/rejected": -47.25873565673828, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": -0.5201172828674316, "rewards/margins": 1.7741285562515259, "rewards/rejected": -2.294245719909668, "step": 570 }, { "epoch": 3.383703703703704, "grad_norm": 9.85987353052267, "learning_rate": 3.559978880464289e-07, "logits/chosen": -1.3315365314483643, "logits/rejected": -1.2916427850723267, "logps/chosen": -52.93934631347656, "logps/rejected": -55.17170333862305, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": -0.1882355660200119, "rewards/margins": 3.6535511016845703, "rewards/rejected": -3.8417863845825195, "step": 571 }, { "epoch": 3.3896296296296295, "grad_norm": 13.676050112137077, "learning_rate": 3.5540918632678583e-07, "logits/chosen": -1.327182412147522, "logits/rejected": -1.3504629135131836, "logps/chosen": -55.77523422241211, "logps/rejected": -66.50366973876953, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": -0.11458379030227661, "rewards/margins": 3.909151554107666, "rewards/rejected": -4.023735523223877, "step": 572 }, { "epoch": 3.3955555555555557, "grad_norm": 9.016347803722182, "learning_rate": 3.5481977286060995e-07, "logits/chosen": -1.2306737899780273, "logits/rejected": -1.2826849222183228, "logps/chosen": -52.68648910522461, "logps/rejected": -74.59066772460938, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": -0.9488479495048523, "rewards/margins": 4.810406684875488, "rewards/rejected": -5.7592549324035645, "step": 573 }, { "epoch": 3.4014814814814813, "grad_norm": 16.730180888421796, "learning_rate": 3.542296516277535e-07, "logits/chosen": -1.16511869430542, "logits/rejected": -1.2019627094268799, "logps/chosen": -56.49937438964844, "logps/rejected": -63.08637619018555, "loss": 0.1583, "rewards/accuracies": 0.75, "rewards/chosen": -0.2525779604911804, "rewards/margins": 2.9084134101867676, "rewards/rejected": -3.1609911918640137, "step": 574 }, { "epoch": 3.4074074074074074, "grad_norm": 12.148498309953794, "learning_rate": 3.5363882661284767e-07, "logits/chosen": -1.2179135084152222, "logits/rejected": -1.2052103281021118, "logps/chosen": -43.90802764892578, "logps/rejected": -49.14393997192383, "loss": 0.128, "rewards/accuracies": 1.0, "rewards/chosen": -0.3902662992477417, "rewards/margins": 2.4715981483459473, "rewards/rejected": -2.8618645668029785, "step": 575 }, { "epoch": 3.413333333333333, "grad_norm": 12.367099531026179, "learning_rate": 3.53047301805276e-07, "logits/chosen": -1.27561354637146, "logits/rejected": -1.3109219074249268, "logps/chosen": -55.32654571533203, "logps/rejected": -61.15959167480469, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 0.42814308404922485, "rewards/margins": 3.9681081771850586, "rewards/rejected": -3.5399651527404785, "step": 576 }, { "epoch": 3.419259259259259, "grad_norm": 10.977571285480575, "learning_rate": 3.5245508119914683e-07, "logits/chosen": -1.0619442462921143, "logits/rejected": -1.0826367139816284, "logps/chosen": -50.80113983154297, "logps/rejected": -60.27879333496094, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 0.31538814306259155, "rewards/margins": 4.127216339111328, "rewards/rejected": -3.811828136444092, "step": 577 }, { "epoch": 3.4251851851851853, "grad_norm": 15.088444665057558, "learning_rate": 3.518621687932671e-07, "logits/chosen": -1.2352567911148071, "logits/rejected": -1.2788387537002563, "logps/chosen": -50.68989562988281, "logps/rejected": -69.35728454589844, "loss": 0.158, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3294627368450165, "rewards/margins": 3.7237389087677, "rewards/rejected": -4.053201675415039, "step": 578 }, { "epoch": 3.431111111111111, "grad_norm": 12.344161330524548, "learning_rate": 3.5126856859111464e-07, "logits/chosen": -1.357169270515442, "logits/rejected": -1.3112940788269043, "logps/chosen": -53.08173370361328, "logps/rejected": -67.811279296875, "loss": 0.1136, "rewards/accuracies": 0.875, "rewards/chosen": 0.38673120737075806, "rewards/margins": 3.6063897609710693, "rewards/rejected": -3.219658613204956, "step": 579 }, { "epoch": 3.437037037037037, "grad_norm": 13.39088643319056, "learning_rate": 3.5067428460081157e-07, "logits/chosen": -1.0850348472595215, "logits/rejected": -1.1661014556884766, "logps/chosen": -39.36518859863281, "logps/rejected": -50.459564208984375, "loss": 0.1553, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7768265604972839, "rewards/margins": 3.601526975631714, "rewards/rejected": -2.824700355529785, "step": 580 }, { "epoch": 3.442962962962963, "grad_norm": 11.46739020103962, "learning_rate": 3.5007932083509687e-07, "logits/chosen": -1.295800805091858, "logits/rejected": -1.3909955024719238, "logps/chosen": -52.019142150878906, "logps/rejected": -76.2337875366211, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": -0.06341786682605743, "rewards/margins": 3.9215664863586426, "rewards/rejected": -3.9849839210510254, "step": 581 }, { "epoch": 3.448888888888889, "grad_norm": 9.915174802293706, "learning_rate": 3.494836813112998e-07, "logits/chosen": -1.1923561096191406, "logits/rejected": -1.2022353410720825, "logps/chosen": -47.116416931152344, "logps/rejected": -54.7679443359375, "loss": 0.12, "rewards/accuracies": 1.0, "rewards/chosen": -0.3462223708629608, "rewards/margins": 2.759658098220825, "rewards/rejected": -3.1058802604675293, "step": 582 }, { "epoch": 3.454814814814815, "grad_norm": 10.129137706661414, "learning_rate": 3.488873700513124e-07, "logits/chosen": -1.1396245956420898, "logits/rejected": -1.1778151988983154, "logps/chosen": -46.93259811401367, "logps/rejected": -72.2052001953125, "loss": 0.1199, "rewards/accuracies": 1.0, "rewards/chosen": -0.08462900668382645, "rewards/margins": 4.626044273376465, "rewards/rejected": -4.710673809051514, "step": 583 }, { "epoch": 3.4607407407407407, "grad_norm": 12.71575358091169, "learning_rate": 3.482903910815625e-07, "logits/chosen": -1.2865262031555176, "logits/rejected": -1.4614137411117554, "logps/chosen": -46.11248016357422, "logps/rejected": -76.97091674804688, "loss": 0.1249, "rewards/accuracies": 1.0, "rewards/chosen": -0.3348883390426636, "rewards/margins": 3.5430636405944824, "rewards/rejected": -3.8779520988464355, "step": 584 }, { "epoch": 3.466666666666667, "grad_norm": 12.299766608320654, "learning_rate": 3.476927484329862e-07, "logits/chosen": -1.232077717781067, "logits/rejected": -1.4356149435043335, "logps/chosen": -51.55973434448242, "logps/rejected": -55.99464416503906, "loss": 0.1568, "rewards/accuracies": 1.0, "rewards/chosen": -0.1431867480278015, "rewards/margins": 2.7733068466186523, "rewards/rejected": -2.9164934158325195, "step": 585 }, { "epoch": 3.4725925925925925, "grad_norm": 10.95922060452019, "learning_rate": 3.4709444614100113e-07, "logits/chosen": -1.0688925981521606, "logits/rejected": -1.2096281051635742, "logps/chosen": -48.453758239746094, "logps/rejected": -60.63860321044922, "loss": 0.1153, "rewards/accuracies": 1.0, "rewards/chosen": 0.18309545516967773, "rewards/margins": 4.147503852844238, "rewards/rejected": -3.9644083976745605, "step": 586 }, { "epoch": 3.4785185185185186, "grad_norm": 11.54515377570644, "learning_rate": 3.46495488245479e-07, "logits/chosen": -1.049604892730713, "logits/rejected": -1.170586109161377, "logps/chosen": -31.83830451965332, "logps/rejected": -60.98114776611328, "loss": 0.1006, "rewards/accuracies": 1.0, "rewards/chosen": -0.018343007192015648, "rewards/margins": 4.682507514953613, "rewards/rejected": -4.700850963592529, "step": 587 }, { "epoch": 3.4844444444444447, "grad_norm": 9.908784563066144, "learning_rate": 3.4589587879071814e-07, "logits/chosen": -1.207253098487854, "logits/rejected": -1.2515345811843872, "logps/chosen": -36.640602111816406, "logps/rejected": -65.44203186035156, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 0.08374863117933273, "rewards/margins": 5.360215187072754, "rewards/rejected": -5.276466369628906, "step": 588 }, { "epoch": 3.4903703703703703, "grad_norm": 13.154285168155294, "learning_rate": 3.452956218254165e-07, "logits/chosen": -1.159376859664917, "logits/rejected": -1.3010073900222778, "logps/chosen": -69.1679916381836, "logps/rejected": -80.59864807128906, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": -0.6153389811515808, "rewards/margins": 3.839124917984009, "rewards/rejected": -4.454463958740234, "step": 589 }, { "epoch": 3.4962962962962965, "grad_norm": 13.385750783978704, "learning_rate": 3.44694721402644e-07, "logits/chosen": -1.1509666442871094, "logits/rejected": -1.2663698196411133, "logps/chosen": -52.61823272705078, "logps/rejected": -61.116004943847656, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": -0.3829859495162964, "rewards/margins": 4.054986000061035, "rewards/rejected": -4.437971591949463, "step": 590 }, { "epoch": 3.502222222222222, "grad_norm": 12.780642607852641, "learning_rate": 3.440931815798156e-07, "logits/chosen": -1.2461998462677002, "logits/rejected": -1.3316093683242798, "logps/chosen": -43.889793395996094, "logps/rejected": -48.703697204589844, "loss": 0.1228, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2379981130361557, "rewards/margins": 3.194875478744507, "rewards/rejected": -3.4328739643096924, "step": 591 }, { "epoch": 3.5081481481481482, "grad_norm": 11.675294006646098, "learning_rate": 3.434910064186633e-07, "logits/chosen": -1.3106170892715454, "logits/rejected": -1.3728466033935547, "logps/chosen": -62.90284729003906, "logps/rejected": -79.64160919189453, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": -0.06035172939300537, "rewards/margins": 4.127461910247803, "rewards/rejected": -4.187813758850098, "step": 592 }, { "epoch": 3.514074074074074, "grad_norm": 12.545333065112036, "learning_rate": 3.428881999852093e-07, "logits/chosen": -1.2505770921707153, "logits/rejected": -1.1669187545776367, "logps/chosen": -61.93037033081055, "logps/rejected": -59.22869873046875, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": -0.6076765060424805, "rewards/margins": 3.8657004833221436, "rewards/rejected": -4.473376750946045, "step": 593 }, { "epoch": 3.52, "grad_norm": 12.039562363809202, "learning_rate": 3.4228476634973836e-07, "logits/chosen": -1.4041945934295654, "logits/rejected": -1.3803038597106934, "logps/chosen": -37.459320068359375, "logps/rejected": -47.55082321166992, "loss": 0.1355, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13520795106887817, "rewards/margins": 2.7191033363342285, "rewards/rejected": -2.854311466217041, "step": 594 }, { "epoch": 3.525925925925926, "grad_norm": 10.649784093665557, "learning_rate": 3.4168070958676985e-07, "logits/chosen": -1.319873571395874, "logits/rejected": -1.3600068092346191, "logps/chosen": -37.87757873535156, "logps/rejected": -63.027076721191406, "loss": 0.0997, "rewards/accuracies": 0.9375, "rewards/chosen": -0.26689252257347107, "rewards/margins": 2.804678440093994, "rewards/rejected": -3.071570873260498, "step": 595 }, { "epoch": 3.531851851851852, "grad_norm": 13.340100702360116, "learning_rate": 3.41076033775031e-07, "logits/chosen": -1.3849197626113892, "logits/rejected": -1.2956639528274536, "logps/chosen": -52.903194427490234, "logps/rejected": -65.97325134277344, "loss": 0.1414, "rewards/accuracies": 0.875, "rewards/chosen": -0.3340446949005127, "rewards/margins": 3.176222562789917, "rewards/rejected": -3.5102672576904297, "step": 596 }, { "epoch": 3.537777777777778, "grad_norm": 11.56286685664957, "learning_rate": 3.404707429974289e-07, "logits/chosen": -1.1817491054534912, "logits/rejected": -1.163534164428711, "logps/chosen": -49.18204116821289, "logps/rejected": -59.26782989501953, "loss": 0.111, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37444692850112915, "rewards/margins": 4.150087833404541, "rewards/rejected": -4.524535179138184, "step": 597 }, { "epoch": 3.5437037037037036, "grad_norm": 12.010067381486293, "learning_rate": 3.3986484134102294e-07, "logits/chosen": -1.245498776435852, "logits/rejected": -1.3706715106964111, "logps/chosen": -37.453086853027344, "logps/rejected": -47.116939544677734, "loss": 0.1253, "rewards/accuracies": 0.9375, "rewards/chosen": -0.28796306252479553, "rewards/margins": 3.1174545288085938, "rewards/rejected": -3.4054174423217773, "step": 598 }, { "epoch": 3.5496296296296297, "grad_norm": 8.683364146472007, "learning_rate": 3.392583328969975e-07, "logits/chosen": -1.2362431287765503, "logits/rejected": -1.2151918411254883, "logps/chosen": -50.436302185058594, "logps/rejected": -57.43449020385742, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": -0.2915821969509125, "rewards/margins": 2.8281185626983643, "rewards/rejected": -3.1197006702423096, "step": 599 }, { "epoch": 3.5555555555555554, "grad_norm": 10.085808487260866, "learning_rate": 3.3865122176063385e-07, "logits/chosen": -1.21660578250885, "logits/rejected": -1.2353140115737915, "logps/chosen": -71.79434967041016, "logps/rejected": -83.12904357910156, "loss": 0.1073, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2442128658294678, "rewards/margins": 4.630167484283447, "rewards/rejected": -5.874380588531494, "step": 600 }, { "epoch": 3.5614814814814815, "grad_norm": 11.573039808812974, "learning_rate": 3.380435120312831e-07, "logits/chosen": -1.3292632102966309, "logits/rejected": -1.4217140674591064, "logps/chosen": -38.365882873535156, "logps/rejected": -79.41341400146484, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 0.07226824760437012, "rewards/margins": 4.579197883605957, "rewards/rejected": -4.506929874420166, "step": 601 }, { "epoch": 3.5674074074074076, "grad_norm": 9.394842443174891, "learning_rate": 3.374352078123379e-07, "logits/chosen": -1.1822998523712158, "logits/rejected": -1.241405725479126, "logps/chosen": -53.02170944213867, "logps/rejected": -79.73939514160156, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": -0.29872894287109375, "rewards/margins": 4.012335300445557, "rewards/rejected": -4.31106424331665, "step": 602 }, { "epoch": 3.5733333333333333, "grad_norm": 11.84291739497059, "learning_rate": 3.36826313211205e-07, "logits/chosen": -1.2073980569839478, "logits/rejected": -1.280730962753296, "logps/chosen": -43.82855987548828, "logps/rejected": -68.55513000488281, "loss": 0.112, "rewards/accuracies": 1.0, "rewards/chosen": -0.4863467216491699, "rewards/margins": 4.065818786621094, "rewards/rejected": -4.552165508270264, "step": 603 }, { "epoch": 3.5792592592592594, "grad_norm": 12.828020094911588, "learning_rate": 3.36216832339278e-07, "logits/chosen": -1.3106424808502197, "logits/rejected": -1.392452597618103, "logps/chosen": -59.96405792236328, "logps/rejected": -78.44798278808594, "loss": 0.1075, "rewards/accuracies": 1.0, "rewards/chosen": -0.5146230459213257, "rewards/margins": 5.168149948120117, "rewards/rejected": -5.682773113250732, "step": 604 }, { "epoch": 3.585185185185185, "grad_norm": 13.845119069512217, "learning_rate": 3.3560676931190866e-07, "logits/chosen": -1.3275146484375, "logits/rejected": -1.4015703201293945, "logps/chosen": -71.70769500732422, "logps/rejected": -89.53176879882812, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": -0.06229984760284424, "rewards/margins": 4.775994777679443, "rewards/rejected": -4.838294982910156, "step": 605 }, { "epoch": 3.591111111111111, "grad_norm": 17.106298908100317, "learning_rate": 3.3499612824837976e-07, "logits/chosen": -1.2026786804199219, "logits/rejected": -1.1629083156585693, "logps/chosen": -53.834800720214844, "logps/rejected": -69.13336944580078, "loss": 0.1756, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4738187789916992, "rewards/margins": 3.8776614665985107, "rewards/rejected": -4.351480484008789, "step": 606 }, { "epoch": 3.597037037037037, "grad_norm": 10.52043087004134, "learning_rate": 3.343849132718771e-07, "logits/chosen": -1.3493577241897583, "logits/rejected": -1.3847932815551758, "logps/chosen": -43.97184753417969, "logps/rejected": -59.462989807128906, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 0.44303974509239197, "rewards/margins": 3.0357778072357178, "rewards/rejected": -2.592738151550293, "step": 607 }, { "epoch": 3.602962962962963, "grad_norm": 10.291205342582417, "learning_rate": 3.337731285094616e-07, "logits/chosen": -1.2725692987442017, "logits/rejected": -1.3912580013275146, "logps/chosen": -44.75775146484375, "logps/rejected": -63.437007904052734, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/chosen": -0.8086339235305786, "rewards/margins": 3.172126293182373, "rewards/rejected": -3.980760097503662, "step": 608 }, { "epoch": 3.608888888888889, "grad_norm": 10.143519608754804, "learning_rate": 3.3316077809204163e-07, "logits/chosen": -1.17635178565979, "logits/rejected": -1.2556065320968628, "logps/chosen": -58.35077667236328, "logps/rejected": -73.878173828125, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 0.04152454063296318, "rewards/margins": 4.9238481521606445, "rewards/rejected": -4.882323265075684, "step": 609 }, { "epoch": 3.6148148148148147, "grad_norm": 10.812544039186557, "learning_rate": 3.3254786615434495e-07, "logits/chosen": -1.4478641748428345, "logits/rejected": -1.514793872833252, "logps/chosen": -40.245880126953125, "logps/rejected": -53.71836471557617, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 0.2759896516799927, "rewards/margins": 3.572756767272949, "rewards/rejected": -3.296767234802246, "step": 610 }, { "epoch": 3.620740740740741, "grad_norm": 11.075484357385585, "learning_rate": 3.319343968348908e-07, "logits/chosen": -1.2261667251586914, "logits/rejected": -1.2476285696029663, "logps/chosen": -43.76851272583008, "logps/rejected": -69.57899475097656, "loss": 0.0967, "rewards/accuracies": 1.0, "rewards/chosen": -0.36359500885009766, "rewards/margins": 4.579384803771973, "rewards/rejected": -4.94297981262207, "step": 611 }, { "epoch": 3.626666666666667, "grad_norm": 10.278190082484747, "learning_rate": 3.3132037427596186e-07, "logits/chosen": -1.0067546367645264, "logits/rejected": -1.0119884014129639, "logps/chosen": -35.96860885620117, "logps/rejected": -54.944618225097656, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 0.2067289650440216, "rewards/margins": 3.7989470958709717, "rewards/rejected": -3.5922181606292725, "step": 612 }, { "epoch": 3.6325925925925926, "grad_norm": 14.799071682203722, "learning_rate": 3.3070580262357676e-07, "logits/chosen": -1.248012900352478, "logits/rejected": -1.3041200637817383, "logps/chosen": -48.836002349853516, "logps/rejected": -60.00675964355469, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": 0.29592806100845337, "rewards/margins": 2.614812135696411, "rewards/rejected": -2.3188841342926025, "step": 613 }, { "epoch": 3.6385185185185183, "grad_norm": 9.398132314598168, "learning_rate": 3.3009068602746135e-07, "logits/chosen": -1.1975395679473877, "logits/rejected": -1.2507511377334595, "logps/chosen": -53.41824722290039, "logps/rejected": -82.38655090332031, "loss": 0.1, "rewards/accuracies": 1.0, "rewards/chosen": -0.4321832060813904, "rewards/margins": 4.6858344078063965, "rewards/rejected": -5.11801815032959, "step": 614 }, { "epoch": 3.6444444444444444, "grad_norm": 16.544980874132325, "learning_rate": 3.294750286410213e-07, "logits/chosen": -1.1811959743499756, "logits/rejected": -1.2203879356384277, "logps/chosen": -45.3848876953125, "logps/rejected": -59.8250732421875, "loss": 0.1759, "rewards/accuracies": 0.9375, "rewards/chosen": -0.34930887818336487, "rewards/margins": 3.638972282409668, "rewards/rejected": -3.98828125, "step": 615 }, { "epoch": 3.6503703703703705, "grad_norm": 11.09187739842382, "learning_rate": 3.288588346213139e-07, "logits/chosen": -1.1899372339248657, "logits/rejected": -1.3012945652008057, "logps/chosen": -51.77571487426758, "logps/rejected": -59.12542724609375, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": -0.2844125032424927, "rewards/margins": 3.2368106842041016, "rewards/rejected": -3.521223306655884, "step": 616 }, { "epoch": 3.656296296296296, "grad_norm": 12.486021087724694, "learning_rate": 3.282421081290195e-07, "logits/chosen": -1.3653810024261475, "logits/rejected": -1.4098625183105469, "logps/chosen": -55.62464904785156, "logps/rejected": -74.34550476074219, "loss": 0.1219, "rewards/accuracies": 1.0, "rewards/chosen": 0.2354530394077301, "rewards/margins": 2.5498616695404053, "rewards/rejected": -2.314408779144287, "step": 617 }, { "epoch": 3.6622222222222223, "grad_norm": 14.642400078478088, "learning_rate": 3.2762485332841404e-07, "logits/chosen": -1.360192060470581, "logits/rejected": -1.3391731977462769, "logps/chosen": -41.53746032714844, "logps/rejected": -57.24707794189453, "loss": 0.1429, "rewards/accuracies": 1.0, "rewards/chosen": 0.11149264872074127, "rewards/margins": 2.653454065322876, "rewards/rejected": -2.541961431503296, "step": 618 }, { "epoch": 3.6681481481481484, "grad_norm": 12.180997641032809, "learning_rate": 3.27007074387341e-07, "logits/chosen": -1.3639566898345947, "logits/rejected": -1.4342676401138306, "logps/chosen": -52.15316390991211, "logps/rejected": -61.58177947998047, "loss": 0.1335, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3126819133758545, "rewards/margins": 3.2332041263580322, "rewards/rejected": -2.9205222129821777, "step": 619 }, { "epoch": 3.674074074074074, "grad_norm": 10.139745307420647, "learning_rate": 3.2638877547718263e-07, "logits/chosen": -1.2260799407958984, "logits/rejected": -1.3334007263183594, "logps/chosen": -47.63837814331055, "logps/rejected": -63.36333465576172, "loss": 0.1099, "rewards/accuracies": 1.0, "rewards/chosen": -0.7048265933990479, "rewards/margins": 3.433518409729004, "rewards/rejected": -4.138344764709473, "step": 620 }, { "epoch": 3.68, "grad_norm": 12.893287201159424, "learning_rate": 3.2576996077283217e-07, "logits/chosen": -1.0819010734558105, "logits/rejected": -1.1939318180084229, "logps/chosen": -46.48964309692383, "logps/rejected": -62.650733947753906, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": -0.2993219196796417, "rewards/margins": 4.32916259765625, "rewards/rejected": -4.62848424911499, "step": 621 }, { "epoch": 3.685925925925926, "grad_norm": 12.489361170148916, "learning_rate": 3.251506344526658e-07, "logits/chosen": -1.1828008890151978, "logits/rejected": -1.2193061113357544, "logps/chosen": -42.48404312133789, "logps/rejected": -69.676513671875, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": -0.25544118881225586, "rewards/margins": 3.554778575897217, "rewards/rejected": -3.8102197647094727, "step": 622 }, { "epoch": 3.691851851851852, "grad_norm": 11.325882270181273, "learning_rate": 3.2453080069851403e-07, "logits/chosen": -1.1413886547088623, "logits/rejected": -1.2398507595062256, "logps/chosen": -52.085227966308594, "logps/rejected": -63.648101806640625, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 0.4174678325653076, "rewards/margins": 4.257204055786133, "rewards/rejected": -3.839736223220825, "step": 623 }, { "epoch": 3.6977777777777776, "grad_norm": 9.234591711774305, "learning_rate": 3.239104636956337e-07, "logits/chosen": -1.1018732786178589, "logits/rejected": -1.2757089138031006, "logps/chosen": -57.293800354003906, "logps/rejected": -78.91786193847656, "loss": 0.0852, "rewards/accuracies": 0.9375, "rewards/chosen": -0.21278901398181915, "rewards/margins": 3.945615291595459, "rewards/rejected": -4.1584038734436035, "step": 624 }, { "epoch": 3.7037037037037037, "grad_norm": 10.3979498655452, "learning_rate": 3.2328962763267993e-07, "logits/chosen": -1.3757495880126953, "logits/rejected": -1.4409973621368408, "logps/chosen": -47.846397399902344, "logps/rejected": -62.478065490722656, "loss": 0.0908, "rewards/accuracies": 1.0, "rewards/chosen": -0.21286453306674957, "rewards/margins": 3.6221718788146973, "rewards/rejected": -3.8350367546081543, "step": 625 }, { "epoch": 3.70962962962963, "grad_norm": 15.014189972537254, "learning_rate": 3.2266829670167736e-07, "logits/chosen": -1.10061514377594, "logits/rejected": -1.373170018196106, "logps/chosen": -51.52172088623047, "logps/rejected": -87.5226058959961, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": -0.577337920665741, "rewards/margins": 4.87119197845459, "rewards/rejected": -5.4485297203063965, "step": 626 }, { "epoch": 3.7155555555555555, "grad_norm": 11.529836335453567, "learning_rate": 3.2204647509799216e-07, "logits/chosen": -1.3851773738861084, "logits/rejected": -1.317387580871582, "logps/chosen": -67.03350830078125, "logps/rejected": -73.63191223144531, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": -0.7230323553085327, "rewards/margins": 3.9368014335632324, "rewards/rejected": -4.659833908081055, "step": 627 }, { "epoch": 3.7214814814814816, "grad_norm": 11.381704318346536, "learning_rate": 3.2142416702030365e-07, "logits/chosen": -1.4134955406188965, "logits/rejected": -1.4581289291381836, "logps/chosen": -37.11763000488281, "logps/rejected": -64.74070739746094, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 0.05463176220655441, "rewards/margins": 3.725959062576294, "rewards/rejected": -3.6713271141052246, "step": 628 }, { "epoch": 3.7274074074074073, "grad_norm": 10.461787154565728, "learning_rate": 3.2080137667057595e-07, "logits/chosen": -1.2077778577804565, "logits/rejected": -1.2291464805603027, "logps/chosen": -43.65045166015625, "logps/rejected": -48.41703796386719, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 0.25063371658325195, "rewards/margins": 2.5582003593444824, "rewards/rejected": -2.3075666427612305, "step": 629 }, { "epoch": 3.7333333333333334, "grad_norm": 12.686465349682216, "learning_rate": 3.201781082540297e-07, "logits/chosen": -1.1312532424926758, "logits/rejected": -1.148064374923706, "logps/chosen": -39.60634231567383, "logps/rejected": -57.44070816040039, "loss": 0.1388, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3378147482872009, "rewards/margins": 2.453248977661133, "rewards/rejected": -2.7910635471343994, "step": 630 }, { "epoch": 3.739259259259259, "grad_norm": 11.96368647028074, "learning_rate": 3.1955436597911315e-07, "logits/chosen": -1.215735912322998, "logits/rejected": -1.3225897550582886, "logps/chosen": -50.87785339355469, "logps/rejected": -59.81559753417969, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 0.0800708532333374, "rewards/margins": 4.3443708419799805, "rewards/rejected": -4.2642998695373535, "step": 631 }, { "epoch": 3.745185185185185, "grad_norm": 16.889261383609924, "learning_rate": 3.1893015405747467e-07, "logits/chosen": -1.0199214220046997, "logits/rejected": -1.0140265226364136, "logps/chosen": -41.95586395263672, "logps/rejected": -54.95637130737305, "loss": 0.1802, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7874963283538818, "rewards/margins": 3.881711959838867, "rewards/rejected": -4.669208526611328, "step": 632 }, { "epoch": 3.7511111111111113, "grad_norm": 13.116077952649336, "learning_rate": 3.183054767039333e-07, "logits/chosen": -1.2483025789260864, "logits/rejected": -1.2860000133514404, "logps/chosen": -66.91580963134766, "logps/rejected": -68.25605010986328, "loss": 0.1365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4118862748146057, "rewards/margins": 3.3932361602783203, "rewards/rejected": -3.8051223754882812, "step": 633 }, { "epoch": 3.757037037037037, "grad_norm": 12.677949496539178, "learning_rate": 3.176803381364512e-07, "logits/chosen": -1.021579384803772, "logits/rejected": -1.080379843711853, "logps/chosen": -46.37247848510742, "logps/rejected": -74.96723175048828, "loss": 0.1178, "rewards/accuracies": 1.0, "rewards/chosen": -0.666985273361206, "rewards/margins": 4.058359146118164, "rewards/rejected": -4.725344181060791, "step": 634 }, { "epoch": 3.762962962962963, "grad_norm": 12.274361965505955, "learning_rate": 3.170547425761046e-07, "logits/chosen": -1.149162769317627, "logits/rejected": -1.1602625846862793, "logps/chosen": -44.212005615234375, "logps/rejected": -62.73664855957031, "loss": 0.1459, "rewards/accuracies": 1.0, "rewards/chosen": -0.05275583267211914, "rewards/margins": 4.03060245513916, "rewards/rejected": -4.0833587646484375, "step": 635 }, { "epoch": 3.7688888888888887, "grad_norm": 13.580028480959736, "learning_rate": 3.164286942470553e-07, "logits/chosen": -1.3504812717437744, "logits/rejected": -1.347651481628418, "logps/chosen": -46.518310546875, "logps/rejected": -79.4151840209961, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": -0.36658138036727905, "rewards/margins": 3.574552536010742, "rewards/rejected": -3.941133975982666, "step": 636 }, { "epoch": 3.774814814814815, "grad_norm": 15.640271201233753, "learning_rate": 3.1580219737652254e-07, "logits/chosen": -1.3655234575271606, "logits/rejected": -1.4588322639465332, "logps/chosen": -47.00981521606445, "logps/rejected": -64.0774917602539, "loss": 0.1707, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13352152705192566, "rewards/margins": 4.234663009643555, "rewards/rejected": -4.368184566497803, "step": 637 }, { "epoch": 3.7807407407407405, "grad_norm": 8.430239195551694, "learning_rate": 3.1517525619475394e-07, "logits/chosen": -1.3756463527679443, "logits/rejected": -1.388979196548462, "logps/chosen": -38.930641174316406, "logps/rejected": -53.12352752685547, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 0.0509943813085556, "rewards/margins": 3.156320571899414, "rewards/rejected": -3.1053261756896973, "step": 638 }, { "epoch": 3.7866666666666666, "grad_norm": 15.973788029035022, "learning_rate": 3.145478749349974e-07, "logits/chosen": -1.325491189956665, "logits/rejected": -1.391890048980713, "logps/chosen": -58.373809814453125, "logps/rejected": -72.70347595214844, "loss": 0.1228, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7778370380401611, "rewards/margins": 3.0197646617889404, "rewards/rejected": -3.7976019382476807, "step": 639 }, { "epoch": 3.7925925925925927, "grad_norm": 9.758280669090905, "learning_rate": 3.139200578334724e-07, "logits/chosen": -1.0628029108047485, "logits/rejected": -1.0790042877197266, "logps/chosen": -53.540550231933594, "logps/rejected": -68.91120147705078, "loss": 0.1044, "rewards/accuracies": 1.0, "rewards/chosen": -0.7394209504127502, "rewards/margins": 3.1302340030670166, "rewards/rejected": -3.869655132293701, "step": 640 }, { "epoch": 3.7985185185185184, "grad_norm": 12.675841938430084, "learning_rate": 3.132918091293411e-07, "logits/chosen": -1.3551039695739746, "logits/rejected": -1.3326640129089355, "logps/chosen": -47.03355407714844, "logps/rejected": -60.826377868652344, "loss": 0.1353, "rewards/accuracies": 0.875, "rewards/chosen": -0.5289521813392639, "rewards/margins": 2.3704404830932617, "rewards/rejected": -2.899392604827881, "step": 641 }, { "epoch": 3.8044444444444445, "grad_norm": 10.604427627983807, "learning_rate": 3.126631330646801e-07, "logits/chosen": -1.3293988704681396, "logits/rejected": -1.3584859371185303, "logps/chosen": -41.8519287109375, "logps/rejected": -55.53318786621094, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 0.26372450590133667, "rewards/margins": 3.610297679901123, "rewards/rejected": -3.3465731143951416, "step": 642 }, { "epoch": 3.8103703703703706, "grad_norm": 11.285352917600749, "learning_rate": 3.120340338844516e-07, "logits/chosen": -1.3401509523391724, "logits/rejected": -1.400542974472046, "logps/chosen": -46.122562408447266, "logps/rejected": -58.14826965332031, "loss": 0.1172, "rewards/accuracies": 1.0, "rewards/chosen": -0.2642550468444824, "rewards/margins": 4.115180015563965, "rewards/rejected": -4.379435062408447, "step": 643 }, { "epoch": 3.8162962962962963, "grad_norm": 13.343593594001085, "learning_rate": 3.1140451583647464e-07, "logits/chosen": -1.2204161882400513, "logits/rejected": -1.2906641960144043, "logps/chosen": -45.432891845703125, "logps/rejected": -67.70313262939453, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": -0.40972137451171875, "rewards/margins": 5.800241470336914, "rewards/rejected": -6.209963321685791, "step": 644 }, { "epoch": 3.822222222222222, "grad_norm": 12.873863711524187, "learning_rate": 3.1077458317139677e-07, "logits/chosen": -1.3262977600097656, "logits/rejected": -1.30996835231781, "logps/chosen": -43.31980895996094, "logps/rejected": -52.978004455566406, "loss": 0.1635, "rewards/accuracies": 1.0, "rewards/chosen": 0.00339663028717041, "rewards/margins": 3.1779003143310547, "rewards/rejected": -3.1745035648345947, "step": 645 }, { "epoch": 3.828148148148148, "grad_norm": 12.300873984167348, "learning_rate": 3.1014424014266494e-07, "logits/chosen": -1.0281978845596313, "logits/rejected": -1.1137367486953735, "logps/chosen": -37.203250885009766, "logps/rejected": -54.28007507324219, "loss": 0.1485, "rewards/accuracies": 0.875, "rewards/chosen": -0.59632408618927, "rewards/margins": 4.16163969039917, "rewards/rejected": -4.757964134216309, "step": 646 }, { "epoch": 3.834074074074074, "grad_norm": 9.697607033127737, "learning_rate": 3.095134910064971e-07, "logits/chosen": -1.3984662294387817, "logits/rejected": -1.4173474311828613, "logps/chosen": -59.23346710205078, "logps/rejected": -51.855438232421875, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": -0.7792799472808838, "rewards/margins": 3.3985934257507324, "rewards/rejected": -4.177873134613037, "step": 647 }, { "epoch": 3.84, "grad_norm": 12.01232429084738, "learning_rate": 3.0888234002185325e-07, "logits/chosen": -1.2062727212905884, "logits/rejected": -1.2513091564178467, "logps/chosen": -36.36381530761719, "logps/rejected": -55.708404541015625, "loss": 0.1198, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009294629096984863, "rewards/margins": 3.4298362731933594, "rewards/rejected": -3.439131021499634, "step": 648 }, { "epoch": 3.845925925925926, "grad_norm": 15.137381506982647, "learning_rate": 3.082507914504068e-07, "logits/chosen": -1.3288854360580444, "logits/rejected": -1.3593389987945557, "logps/chosen": -56.4027099609375, "logps/rejected": -69.04844665527344, "loss": 0.1363, "rewards/accuracies": 1.0, "rewards/chosen": -0.6180347204208374, "rewards/margins": 3.7794382572174072, "rewards/rejected": -4.397473335266113, "step": 649 }, { "epoch": 3.851851851851852, "grad_norm": 15.831659643387738, "learning_rate": 3.0761884955651563e-07, "logits/chosen": -1.3083950281143188, "logits/rejected": -1.2522752285003662, "logps/chosen": -56.29924774169922, "logps/rejected": -51.787841796875, "loss": 0.1859, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2072955071926117, "rewards/margins": 2.5591542720794678, "rewards/rejected": -2.7664496898651123, "step": 650 }, { "epoch": 3.8577777777777778, "grad_norm": 11.29674521911131, "learning_rate": 3.069865186071938e-07, "logits/chosen": -1.3050025701522827, "logits/rejected": -1.3897082805633545, "logps/chosen": -46.678955078125, "logps/rejected": -64.03926086425781, "loss": 0.1305, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24213455617427826, "rewards/margins": 4.125871658325195, "rewards/rejected": -3.8837366104125977, "step": 651 }, { "epoch": 3.863703703703704, "grad_norm": 10.624392101707619, "learning_rate": 3.0635380287208184e-07, "logits/chosen": -1.1970927715301514, "logits/rejected": -1.2164897918701172, "logps/chosen": -48.83156967163086, "logps/rejected": -64.98463439941406, "loss": 0.1034, "rewards/accuracies": 1.0, "rewards/chosen": -0.283173143863678, "rewards/margins": 4.483406066894531, "rewards/rejected": -4.7665791511535645, "step": 652 }, { "epoch": 3.8696296296296295, "grad_norm": 11.033803801081728, "learning_rate": 3.057207066234188e-07, "logits/chosen": -1.3635876178741455, "logits/rejected": -1.3974814414978027, "logps/chosen": -46.53936004638672, "logps/rejected": -53.70029830932617, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": -0.036083512008190155, "rewards/margins": 3.1924872398376465, "rewards/rejected": -3.2285706996917725, "step": 653 }, { "epoch": 3.8755555555555556, "grad_norm": 13.60287696207738, "learning_rate": 3.0508723413601296e-07, "logits/chosen": -1.3430231809616089, "logits/rejected": -1.4458032846450806, "logps/chosen": -53.1356201171875, "logps/rejected": -65.16506958007812, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": -0.2436259686946869, "rewards/margins": 3.3348610401153564, "rewards/rejected": -3.578486919403076, "step": 654 }, { "epoch": 3.8814814814814813, "grad_norm": 14.148324264131578, "learning_rate": 3.0445338968721283e-07, "logits/chosen": -1.2519584894180298, "logits/rejected": -1.3373234272003174, "logps/chosen": -57.95303726196289, "logps/rejected": -75.46018981933594, "loss": 0.1418, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0708034485578537, "rewards/margins": 3.977604389190674, "rewards/rejected": -3.9068009853363037, "step": 655 }, { "epoch": 3.8874074074074074, "grad_norm": 11.954874367510866, "learning_rate": 3.0381917755687896e-07, "logits/chosen": -1.1269997358322144, "logits/rejected": -1.0959970951080322, "logps/chosen": -49.12158966064453, "logps/rejected": -68.00682067871094, "loss": 0.1457, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9364160299301147, "rewards/margins": 3.3399736881256104, "rewards/rejected": -4.2763895988464355, "step": 656 }, { "epoch": 3.8933333333333335, "grad_norm": 12.29898950820709, "learning_rate": 3.0318460202735415e-07, "logits/chosen": -1.283673882484436, "logits/rejected": -1.347425937652588, "logps/chosen": -45.70671081542969, "logps/rejected": -56.07836151123047, "loss": 0.1389, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2212280035018921, "rewards/margins": 3.2367424964904785, "rewards/rejected": -3.457970380783081, "step": 657 }, { "epoch": 3.899259259259259, "grad_norm": 13.788350104255459, "learning_rate": 3.025496673834351e-07, "logits/chosen": -1.26125967502594, "logits/rejected": -1.2926025390625, "logps/chosen": -51.297176361083984, "logps/rejected": -61.652164459228516, "loss": 0.134, "rewards/accuracies": 1.0, "rewards/chosen": -0.7250484228134155, "rewards/margins": 3.813504457473755, "rewards/rejected": -4.538553237915039, "step": 658 }, { "epoch": 3.9051851851851853, "grad_norm": 12.893745092209024, "learning_rate": 3.0191437791234335e-07, "logits/chosen": -1.1480952501296997, "logits/rejected": -1.1113182306289673, "logps/chosen": -46.87205123901367, "logps/rejected": -63.754608154296875, "loss": 0.1155, "rewards/accuracies": 1.0, "rewards/chosen": 0.06853310763835907, "rewards/margins": 4.413917541503906, "rewards/rejected": -4.34538459777832, "step": 659 }, { "epoch": 3.911111111111111, "grad_norm": 12.585185044373889, "learning_rate": 3.0127873790369625e-07, "logits/chosen": -1.0948622226715088, "logits/rejected": -1.1531262397766113, "logps/chosen": -38.73649215698242, "logps/rejected": -46.320533752441406, "loss": 0.1333, "rewards/accuracies": 1.0, "rewards/chosen": -0.4145849943161011, "rewards/margins": 2.6738719940185547, "rewards/rejected": -3.0884571075439453, "step": 660 }, { "epoch": 3.917037037037037, "grad_norm": 9.634291710166424, "learning_rate": 3.006427516494781e-07, "logits/chosen": -1.281577229499817, "logits/rejected": -1.2957377433776855, "logps/chosen": -38.95976257324219, "logps/rejected": -58.57207107543945, "loss": 0.1125, "rewards/accuracies": 0.875, "rewards/chosen": 0.2174490988254547, "rewards/margins": 3.0695672035217285, "rewards/rejected": -2.8521180152893066, "step": 661 }, { "epoch": 3.9229629629629628, "grad_norm": 12.649837995937807, "learning_rate": 3.000064234440111e-07, "logits/chosen": -1.3475849628448486, "logits/rejected": -1.374518871307373, "logps/chosen": -50.88645935058594, "logps/rejected": -61.86933898925781, "loss": 0.1177, "rewards/accuracies": 1.0, "rewards/chosen": 0.008303865790367126, "rewards/margins": 3.0162546634674072, "rewards/rejected": -3.007950782775879, "step": 662 }, { "epoch": 3.928888888888889, "grad_norm": 9.838791281025717, "learning_rate": 2.9936975758392644e-07, "logits/chosen": -1.3352168798446655, "logits/rejected": -1.3545887470245361, "logps/chosen": -60.31529235839844, "logps/rejected": -62.78803634643555, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": -0.2238210290670395, "rewards/margins": 3.8718276023864746, "rewards/rejected": -4.095648765563965, "step": 663 }, { "epoch": 3.934814814814815, "grad_norm": 9.9056351234486, "learning_rate": 2.9873275836813526e-07, "logits/chosen": -1.2591410875320435, "logits/rejected": -1.2903554439544678, "logps/chosen": -53.714942932128906, "logps/rejected": -59.55838394165039, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": -0.37991151213645935, "rewards/margins": 3.501067638397217, "rewards/rejected": -3.880979061126709, "step": 664 }, { "epoch": 3.9407407407407407, "grad_norm": 13.479218611455286, "learning_rate": 2.980954300977995e-07, "logits/chosen": -1.2214405536651611, "logits/rejected": -1.2860974073410034, "logps/chosen": -52.64996337890625, "logps/rejected": -71.29141235351562, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": -0.941660463809967, "rewards/margins": 4.051599025726318, "rewards/rejected": -4.993259429931641, "step": 665 }, { "epoch": 3.9466666666666668, "grad_norm": 12.022458671134432, "learning_rate": 2.974577770763028e-07, "logits/chosen": -1.1799646615982056, "logits/rejected": -1.3523240089416504, "logps/chosen": -55.82897186279297, "logps/rejected": -89.32501220703125, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": -0.29032182693481445, "rewards/margins": 5.433279514312744, "rewards/rejected": -5.7236008644104, "step": 666 }, { "epoch": 3.9525925925925924, "grad_norm": 12.03264049240042, "learning_rate": 2.96819803609222e-07, "logits/chosen": -1.172480821609497, "logits/rejected": -1.2781836986541748, "logps/chosen": -41.319862365722656, "logps/rejected": -52.289878845214844, "loss": 0.1168, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18020282685756683, "rewards/margins": 3.568021297454834, "rewards/rejected": -3.3878188133239746, "step": 667 }, { "epoch": 3.9585185185185185, "grad_norm": 14.764711496945134, "learning_rate": 2.9618151400429735e-07, "logits/chosen": -1.314758062362671, "logits/rejected": -1.427316665649414, "logps/chosen": -48.17366409301758, "logps/rejected": -61.67540740966797, "loss": 0.1463, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2936151623725891, "rewards/margins": 3.672703504562378, "rewards/rejected": -3.9663188457489014, "step": 668 }, { "epoch": 3.964444444444444, "grad_norm": 14.654996509659126, "learning_rate": 2.955429125714038e-07, "logits/chosen": -1.1277893781661987, "logits/rejected": -1.2207601070404053, "logps/chosen": -42.69140625, "logps/rejected": -68.8827896118164, "loss": 0.1258, "rewards/accuracies": 0.875, "rewards/chosen": -0.17935360968112946, "rewards/margins": 3.781348705291748, "rewards/rejected": -3.960702419281006, "step": 669 }, { "epoch": 3.9703703703703703, "grad_norm": 10.996670224142063, "learning_rate": 2.949040036225218e-07, "logits/chosen": -1.1256000995635986, "logits/rejected": -1.1515730619430542, "logps/chosen": -58.76976776123047, "logps/rejected": -68.46432495117188, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": -1.126528024673462, "rewards/margins": 4.9552130699157715, "rewards/rejected": -6.0817413330078125, "step": 670 }, { "epoch": 3.9762962962962964, "grad_norm": 12.15346985049749, "learning_rate": 2.9426479147170836e-07, "logits/chosen": -1.2197206020355225, "logits/rejected": -1.3296701908111572, "logps/chosen": -37.83189392089844, "logps/rejected": -61.467247009277344, "loss": 0.1556, "rewards/accuracies": 1.0, "rewards/chosen": -0.36297494173049927, "rewards/margins": 4.596902370452881, "rewards/rejected": -4.959877014160156, "step": 671 }, { "epoch": 3.982222222222222, "grad_norm": 12.097195045066714, "learning_rate": 2.9362528043506767e-07, "logits/chosen": -1.2236287593841553, "logits/rejected": -1.334072232246399, "logps/chosen": -64.24101257324219, "logps/rejected": -72.81991577148438, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": -0.9826846718788147, "rewards/margins": 4.875625133514404, "rewards/rejected": -5.858309745788574, "step": 672 }, { "epoch": 3.988148148148148, "grad_norm": 10.99016588436982, "learning_rate": 2.929854748307221e-07, "logits/chosen": -1.1742664575576782, "logits/rejected": -1.1747915744781494, "logps/chosen": -50.49982452392578, "logps/rejected": -60.32571029663086, "loss": 0.1164, "rewards/accuracies": 1.0, "rewards/chosen": 0.032082848250865936, "rewards/margins": 4.788650989532471, "rewards/rejected": -4.756568431854248, "step": 673 }, { "epoch": 3.9940740740740743, "grad_norm": 10.85735796088607, "learning_rate": 2.923453789787828e-07, "logits/chosen": -1.24151611328125, "logits/rejected": -1.2867331504821777, "logps/chosen": -50.32501220703125, "logps/rejected": -67.8655776977539, "loss": 0.1221, "rewards/accuracies": 1.0, "rewards/chosen": -0.40465599298477173, "rewards/margins": 4.1814141273498535, "rewards/rejected": -4.5860700607299805, "step": 674 }, { "epoch": 4.0, "grad_norm": 13.13554967783866, "learning_rate": 2.9170499720132106e-07, "logits/chosen": -1.1751290559768677, "logits/rejected": -1.1486109495162964, "logps/chosen": -58.797637939453125, "logps/rejected": -80.00987243652344, "loss": 0.1007, "rewards/accuracies": 0.875, "rewards/chosen": -0.1894952356815338, "rewards/margins": 3.617033004760742, "rewards/rejected": -3.8065285682678223, "step": 675 }, { "epoch": 4.005925925925926, "grad_norm": 8.09939021223856, "learning_rate": 2.9106433382233877e-07, "logits/chosen": -1.2218327522277832, "logits/rejected": -1.330657720565796, "logps/chosen": -39.75477981567383, "logps/rejected": -58.90275573730469, "loss": 0.078, "rewards/accuracies": 1.0, "rewards/chosen": 0.026934266090393066, "rewards/margins": 3.75627064704895, "rewards/rejected": -3.7293362617492676, "step": 676 }, { "epoch": 4.011851851851852, "grad_norm": 8.504218288919116, "learning_rate": 2.90423393167739e-07, "logits/chosen": -0.9554922580718994, "logits/rejected": -1.1671780347824097, "logps/chosen": -63.793670654296875, "logps/rejected": -89.34516906738281, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": -0.525771975517273, "rewards/margins": 6.577359199523926, "rewards/rejected": -7.103131294250488, "step": 677 }, { "epoch": 4.017777777777778, "grad_norm": 5.984815722074786, "learning_rate": 2.897821795652972e-07, "logits/chosen": -1.021341323852539, "logits/rejected": -1.1111960411071777, "logps/chosen": -44.92298889160156, "logps/rejected": -78.0810775756836, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 0.19972413778305054, "rewards/margins": 5.106929779052734, "rewards/rejected": -4.907205581665039, "step": 678 }, { "epoch": 4.023703703703704, "grad_norm": 8.449296561459144, "learning_rate": 2.891406973446319e-07, "logits/chosen": -1.2100530862808228, "logits/rejected": -1.263127088546753, "logps/chosen": -64.97561645507812, "logps/rejected": -73.60565185546875, "loss": 0.0919, "rewards/accuracies": 1.0, "rewards/chosen": -0.5560375452041626, "rewards/margins": 3.675342321395874, "rewards/rejected": -4.231379985809326, "step": 679 }, { "epoch": 4.029629629629629, "grad_norm": 7.8329834369019355, "learning_rate": 2.8849895083717536e-07, "logits/chosen": -1.300388216972351, "logits/rejected": -1.2571924924850464, "logps/chosen": -52.756065368652344, "logps/rejected": -63.69276428222656, "loss": 0.0679, "rewards/accuracies": 1.0, "rewards/chosen": -1.5070255994796753, "rewards/margins": 4.4201836585998535, "rewards/rejected": -5.927209377288818, "step": 680 }, { "epoch": 4.035555555555556, "grad_norm": 8.049817495886364, "learning_rate": 2.8785694437614416e-07, "logits/chosen": -1.0563764572143555, "logits/rejected": -1.1485258340835571, "logps/chosen": -44.871803283691406, "logps/rejected": -60.164154052734375, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": -0.7167099714279175, "rewards/margins": 3.6469597816467285, "rewards/rejected": -4.363670349121094, "step": 681 }, { "epoch": 4.0414814814814815, "grad_norm": 9.430046973217774, "learning_rate": 2.872146822965105e-07, "logits/chosen": -1.23280930519104, "logits/rejected": -1.1943401098251343, "logps/chosen": -43.12926483154297, "logps/rejected": -64.86984252929688, "loss": 0.093, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12739908695220947, "rewards/margins": 4.2020697593688965, "rewards/rejected": -4.329468727111816, "step": 682 }, { "epoch": 4.047407407407407, "grad_norm": 7.097112610215551, "learning_rate": 2.865721689349722e-07, "logits/chosen": -1.2696402072906494, "logits/rejected": -1.2817845344543457, "logps/chosen": -51.86808776855469, "logps/rejected": -79.98898315429688, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -1.117104172706604, "rewards/margins": 5.00604248046875, "rewards/rejected": -6.123147010803223, "step": 683 }, { "epoch": 4.053333333333334, "grad_norm": 9.052336019738735, "learning_rate": 2.8592940862992415e-07, "logits/chosen": -1.1059024333953857, "logits/rejected": -1.173060655593872, "logps/chosen": -45.80156326293945, "logps/rejected": -64.51785278320312, "loss": 0.0814, "rewards/accuracies": 1.0, "rewards/chosen": -0.24664664268493652, "rewards/margins": 3.944058418273926, "rewards/rejected": -4.190705299377441, "step": 684 }, { "epoch": 4.059259259259259, "grad_norm": 10.05155868088065, "learning_rate": 2.8528640572142835e-07, "logits/chosen": -1.23401939868927, "logits/rejected": -1.380190134048462, "logps/chosen": -39.44499969482422, "logps/rejected": -53.958587646484375, "loss": 0.0975, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8200135231018066, "rewards/margins": 3.4660472869873047, "rewards/rejected": -4.286060333251953, "step": 685 }, { "epoch": 4.065185185185185, "grad_norm": 8.266099363803365, "learning_rate": 2.846431645511851e-07, "logits/chosen": -1.3201549053192139, "logits/rejected": -1.3402879238128662, "logps/chosen": -40.695865631103516, "logps/rejected": -61.442386627197266, "loss": 0.0861, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18501725792884827, "rewards/margins": 4.403324604034424, "rewards/rejected": -4.588342189788818, "step": 686 }, { "epoch": 4.071111111111111, "grad_norm": 10.616884607002742, "learning_rate": 2.839996894625037e-07, "logits/chosen": -1.1398805379867554, "logits/rejected": -1.3630214929580688, "logps/chosen": -42.72355270385742, "logps/rejected": -79.57286071777344, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": -0.5211660861968994, "rewards/margins": 5.314165115356445, "rewards/rejected": -5.835331439971924, "step": 687 }, { "epoch": 4.077037037037037, "grad_norm": 7.9070372912181375, "learning_rate": 2.8335598480027224e-07, "logits/chosen": -1.2060840129852295, "logits/rejected": -1.2967393398284912, "logps/chosen": -57.877464294433594, "logps/rejected": -65.47186279296875, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": -0.24055880308151245, "rewards/margins": 3.2056002616882324, "rewards/rejected": -3.4461588859558105, "step": 688 }, { "epoch": 4.082962962962963, "grad_norm": 6.337597848363227, "learning_rate": 2.8271205491092963e-07, "logits/chosen": -1.240246295928955, "logits/rejected": -1.3377811908721924, "logps/chosen": -43.28995895385742, "logps/rejected": -72.51390838623047, "loss": 0.0714, "rewards/accuracies": 1.0, "rewards/chosen": -0.404224157333374, "rewards/margins": 5.710890293121338, "rewards/rejected": -6.115114688873291, "step": 689 }, { "epoch": 4.088888888888889, "grad_norm": 10.078954126450014, "learning_rate": 2.820679041424352e-07, "logits/chosen": -1.2308160066604614, "logits/rejected": -1.246185541152954, "logps/chosen": -31.00168228149414, "logps/rejected": -46.25737380981445, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": -0.44722241163253784, "rewards/margins": 2.568000078201294, "rewards/rejected": -3.0152225494384766, "step": 690 }, { "epoch": 4.094814814814815, "grad_norm": 10.105653150868449, "learning_rate": 2.814235368442398e-07, "logits/chosen": -1.229535698890686, "logits/rejected": -1.20412015914917, "logps/chosen": -56.288055419921875, "logps/rejected": -72.79161071777344, "loss": 0.1171, "rewards/accuracies": 1.0, "rewards/chosen": -0.5891921520233154, "rewards/margins": 3.7033348083496094, "rewards/rejected": -4.292527198791504, "step": 691 }, { "epoch": 4.100740740740741, "grad_norm": 6.316862928402733, "learning_rate": 2.8077895736725647e-07, "logits/chosen": -1.2391550540924072, "logits/rejected": -1.383090853691101, "logps/chosen": -48.66514587402344, "logps/rejected": -71.63360595703125, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": -0.3529631793498993, "rewards/margins": 4.131495952606201, "rewards/rejected": -4.484459400177002, "step": 692 }, { "epoch": 4.1066666666666665, "grad_norm": 8.936104075687247, "learning_rate": 2.801341700638307e-07, "logits/chosen": -1.1700717210769653, "logits/rejected": -1.2463778257369995, "logps/chosen": -58.735984802246094, "logps/rejected": -69.45088195800781, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": -0.5931496620178223, "rewards/margins": 4.663151741027832, "rewards/rejected": -5.256301403045654, "step": 693 }, { "epoch": 4.112592592592593, "grad_norm": 7.921423465017035, "learning_rate": 2.7948917928771153e-07, "logits/chosen": -1.1677509546279907, "logits/rejected": -1.2224948406219482, "logps/chosen": -49.49176025390625, "logps/rejected": -62.047096252441406, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": -0.2260219305753708, "rewards/margins": 4.291070461273193, "rewards/rejected": -4.517092704772949, "step": 694 }, { "epoch": 4.118518518518519, "grad_norm": 9.635718560557772, "learning_rate": 2.7884398939402156e-07, "logits/chosen": -1.1603374481201172, "logits/rejected": -1.206560730934143, "logps/chosen": -44.97281265258789, "logps/rejected": -54.71010208129883, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": -0.5538350343704224, "rewards/margins": 3.9752297401428223, "rewards/rejected": -4.529065132141113, "step": 695 }, { "epoch": 4.124444444444444, "grad_norm": 9.354746925229785, "learning_rate": 2.78198604739228e-07, "logits/chosen": -1.0650126934051514, "logits/rejected": -1.0606831312179565, "logps/chosen": -50.06618881225586, "logps/rejected": -48.473289489746094, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 0.2048884630203247, "rewards/margins": 2.942831516265869, "rewards/rejected": -2.737943172454834, "step": 696 }, { "epoch": 4.13037037037037, "grad_norm": 9.653581475922081, "learning_rate": 2.7755302968111346e-07, "logits/chosen": -1.244917392730713, "logits/rejected": -1.2490586042404175, "logps/chosen": -59.654884338378906, "logps/rejected": -84.32585906982422, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": -0.5680751800537109, "rewards/margins": 5.05828857421875, "rewards/rejected": -5.626363754272461, "step": 697 }, { "epoch": 4.136296296296297, "grad_norm": 9.124932825116472, "learning_rate": 2.7690726857874564e-07, "logits/chosen": -1.2023653984069824, "logits/rejected": -1.243130087852478, "logps/chosen": -46.383155822753906, "logps/rejected": -58.44104766845703, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": -0.030015796422958374, "rewards/margins": 3.910349130630493, "rewards/rejected": -3.9403653144836426, "step": 698 }, { "epoch": 4.142222222222222, "grad_norm": 11.768103625307159, "learning_rate": 2.7626132579244893e-07, "logits/chosen": -1.333091139793396, "logits/rejected": -1.287567377090454, "logps/chosen": -45.71503829956055, "logps/rejected": -62.3424072265625, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": -0.7412693500518799, "rewards/margins": 3.5215086936950684, "rewards/rejected": -4.262778282165527, "step": 699 }, { "epoch": 4.148148148148148, "grad_norm": 7.69430929680938, "learning_rate": 2.756152056837743e-07, "logits/chosen": -1.271039605140686, "logits/rejected": -1.2991083860397339, "logps/chosen": -48.57284927368164, "logps/rejected": -51.01499938964844, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": -0.21987931430339813, "rewards/margins": 3.371474266052246, "rewards/rejected": -3.59135365486145, "step": 700 }, { "epoch": 4.1540740740740745, "grad_norm": 6.168894895964799, "learning_rate": 2.749689126154698e-07, "logits/chosen": -1.156360149383545, "logits/rejected": -1.262413740158081, "logps/chosen": -37.52547073364258, "logps/rejected": -55.548492431640625, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": -0.12576079368591309, "rewards/margins": 4.342263698577881, "rewards/rejected": -4.468024253845215, "step": 701 }, { "epoch": 4.16, "grad_norm": 7.518876365674256, "learning_rate": 2.743224509514519e-07, "logits/chosen": -1.1882476806640625, "logits/rejected": -1.2636778354644775, "logps/chosen": -46.34234619140625, "logps/rejected": -67.68272399902344, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": -0.6115562915802002, "rewards/margins": 4.2533135414123535, "rewards/rejected": -4.864870071411133, "step": 702 }, { "epoch": 4.165925925925926, "grad_norm": 7.326989288396403, "learning_rate": 2.73675825056775e-07, "logits/chosen": -1.2471364736557007, "logits/rejected": -1.2816392183303833, "logps/chosen": -55.45796203613281, "logps/rejected": -60.71422576904297, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 0.013647403568029404, "rewards/margins": 4.265345573425293, "rewards/rejected": -4.251698017120361, "step": 703 }, { "epoch": 4.1718518518518515, "grad_norm": 11.237707075205611, "learning_rate": 2.730290392976025e-07, "logits/chosen": -1.2631371021270752, "logits/rejected": -1.3731496334075928, "logps/chosen": -58.124061584472656, "logps/rejected": -62.4581413269043, "loss": 0.1116, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3890116512775421, "rewards/margins": 3.304945230484009, "rewards/rejected": -3.6939570903778076, "step": 704 }, { "epoch": 4.177777777777778, "grad_norm": 7.057187300346005, "learning_rate": 2.723820980411774e-07, "logits/chosen": -1.063293695449829, "logits/rejected": -1.1065738201141357, "logps/chosen": -42.29499053955078, "logps/rejected": -54.36339569091797, "loss": 0.0761, "rewards/accuracies": 1.0, "rewards/chosen": -0.11712778359651566, "rewards/margins": 3.8328468799591064, "rewards/rejected": -3.94997501373291, "step": 705 }, { "epoch": 4.183703703703704, "grad_norm": 6.548904651880563, "learning_rate": 2.7173500565579256e-07, "logits/chosen": -1.1943180561065674, "logits/rejected": -1.3050909042358398, "logps/chosen": -60.831878662109375, "logps/rejected": -85.12651062011719, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -1.325363278388977, "rewards/margins": 5.975900173187256, "rewards/rejected": -7.301263332366943, "step": 706 }, { "epoch": 4.189629629629629, "grad_norm": 5.710679075771105, "learning_rate": 2.7108776651076116e-07, "logits/chosen": -1.161668062210083, "logits/rejected": -1.3700096607208252, "logps/chosen": -39.77562713623047, "logps/rejected": -61.1549072265625, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": -0.005037426948547363, "rewards/margins": 5.042215347290039, "rewards/rejected": -5.047252655029297, "step": 707 }, { "epoch": 4.195555555555556, "grad_norm": 7.889770903073412, "learning_rate": 2.704403849763878e-07, "logits/chosen": -1.053623080253601, "logits/rejected": -1.0846974849700928, "logps/chosen": -52.68038558959961, "logps/rejected": -68.91546630859375, "loss": 0.0896, "rewards/accuracies": 1.0, "rewards/chosen": -0.3577325940132141, "rewards/margins": 3.664928436279297, "rewards/rejected": -4.022661209106445, "step": 708 }, { "epoch": 4.201481481481482, "grad_norm": 7.382296529185899, "learning_rate": 2.697928654239378e-07, "logits/chosen": -1.067875623703003, "logits/rejected": -1.1225016117095947, "logps/chosen": -41.09888458251953, "logps/rejected": -51.2681770324707, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": -0.025173917412757874, "rewards/margins": 3.3625717163085938, "rewards/rejected": -3.3877458572387695, "step": 709 }, { "epoch": 4.207407407407407, "grad_norm": 9.009216392494261, "learning_rate": 2.6914521222560907e-07, "logits/chosen": -1.2619165182113647, "logits/rejected": -1.2694003582000732, "logps/chosen": -55.56476593017578, "logps/rejected": -77.80066680908203, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": -0.1241753101348877, "rewards/margins": 5.0723724365234375, "rewards/rejected": -5.196547508239746, "step": 710 }, { "epoch": 4.213333333333333, "grad_norm": 9.523713560320529, "learning_rate": 2.6849742975450163e-07, "logits/chosen": -1.2838767766952515, "logits/rejected": -1.2139889001846313, "logps/chosen": -53.205867767333984, "logps/rejected": -66.61186218261719, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": -0.016508355736732483, "rewards/margins": 3.4612367153167725, "rewards/rejected": -3.4777450561523438, "step": 711 }, { "epoch": 4.2192592592592595, "grad_norm": 9.4854359957746, "learning_rate": 2.6784952238458824e-07, "logits/chosen": -1.312174677848816, "logits/rejected": -1.3309844732284546, "logps/chosen": -50.24847412109375, "logps/rejected": -67.9576187133789, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": -0.5955944061279297, "rewards/margins": 4.73298978805542, "rewards/rejected": -5.32858419418335, "step": 712 }, { "epoch": 4.225185185185185, "grad_norm": 9.597811135040962, "learning_rate": 2.672014944906854e-07, "logits/chosen": -1.288494348526001, "logits/rejected": -1.4439034461975098, "logps/chosen": -52.817901611328125, "logps/rejected": -83.7956314086914, "loss": 0.0988, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11418376863002777, "rewards/margins": 5.282074928283691, "rewards/rejected": -5.39625883102417, "step": 713 }, { "epoch": 4.231111111111111, "grad_norm": 7.492613961615437, "learning_rate": 2.665533504484231e-07, "logits/chosen": -1.1451681852340698, "logits/rejected": -1.2639843225479126, "logps/chosen": -46.51567077636719, "logps/rejected": -61.77153396606445, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": -1.22086501121521, "rewards/margins": 4.6231303215026855, "rewards/rejected": -5.843995571136475, "step": 714 }, { "epoch": 4.237037037037037, "grad_norm": 7.256767586610158, "learning_rate": 2.6590509463421573e-07, "logits/chosen": -1.2956974506378174, "logits/rejected": -1.3932404518127441, "logps/chosen": -41.55891418457031, "logps/rejected": -68.27203369140625, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": -0.3283975124359131, "rewards/margins": 5.13521146774292, "rewards/rejected": -5.463608741760254, "step": 715 }, { "epoch": 4.242962962962963, "grad_norm": 5.716493770660752, "learning_rate": 2.6525673142523217e-07, "logits/chosen": -1.1941766738891602, "logits/rejected": -1.291650652885437, "logps/chosen": -59.82276153564453, "logps/rejected": -88.97959899902344, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -0.6497074365615845, "rewards/margins": 6.625558853149414, "rewards/rejected": -7.275265693664551, "step": 716 }, { "epoch": 4.248888888888889, "grad_norm": 7.11703069263412, "learning_rate": 2.646082651993668e-07, "logits/chosen": -1.1893032789230347, "logits/rejected": -1.1735705137252808, "logps/chosen": -50.201759338378906, "logps/rejected": -59.90876007080078, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 0.21657828986644745, "rewards/margins": 4.5485429763793945, "rewards/rejected": -4.33196496963501, "step": 717 }, { "epoch": 4.254814814814814, "grad_norm": 7.311353948110829, "learning_rate": 2.6395970033520944e-07, "logits/chosen": -1.3677504062652588, "logits/rejected": -1.3873264789581299, "logps/chosen": -53.338356018066406, "logps/rejected": -62.122920989990234, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 0.33095160126686096, "rewards/margins": 3.561379909515381, "rewards/rejected": -3.2304282188415527, "step": 718 }, { "epoch": 4.260740740740741, "grad_norm": 6.514828833392831, "learning_rate": 2.6331104121201575e-07, "logits/chosen": -1.1285260915756226, "logits/rejected": -1.2510693073272705, "logps/chosen": -56.24557876586914, "logps/rejected": -88.39117431640625, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": -1.019626259803772, "rewards/margins": 4.762977123260498, "rewards/rejected": -5.7826032638549805, "step": 719 }, { "epoch": 4.266666666666667, "grad_norm": 8.469413160834886, "learning_rate": 2.626622922096782e-07, "logits/chosen": -1.225628137588501, "logits/rejected": -1.2865465879440308, "logps/chosen": -51.26776885986328, "logps/rejected": -72.69799041748047, "loss": 0.0829, "rewards/accuracies": 1.0, "rewards/chosen": -0.32359927892684937, "rewards/margins": 4.169054985046387, "rewards/rejected": -4.492654323577881, "step": 720 }, { "epoch": 4.272592592592592, "grad_norm": 8.233090300808394, "learning_rate": 2.6201345770869584e-07, "logits/chosen": -1.178969383239746, "logits/rejected": -1.201488971710205, "logps/chosen": -40.95018005371094, "logps/rejected": -62.00858688354492, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 0.04447879642248154, "rewards/margins": 4.163085460662842, "rewards/rejected": -4.1186065673828125, "step": 721 }, { "epoch": 4.278518518518519, "grad_norm": 8.262601902648214, "learning_rate": 2.6136454209014513e-07, "logits/chosen": -1.2628636360168457, "logits/rejected": -1.339663028717041, "logps/chosen": -53.15192794799805, "logps/rejected": -67.0715560913086, "loss": 0.0979, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6662226319313049, "rewards/margins": 3.948902130126953, "rewards/rejected": -4.615124702453613, "step": 722 }, { "epoch": 4.2844444444444445, "grad_norm": 7.476616976809312, "learning_rate": 2.6071554973565036e-07, "logits/chosen": -1.1416479349136353, "logits/rejected": -1.1721899509429932, "logps/chosen": -39.949527740478516, "logps/rejected": -50.61048126220703, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": -0.34017127752304077, "rewards/margins": 3.2426137924194336, "rewards/rejected": -3.582785129547119, "step": 723 }, { "epoch": 4.29037037037037, "grad_norm": 9.478491591850005, "learning_rate": 2.600664850273538e-07, "logits/chosen": -1.3482637405395508, "logits/rejected": -1.398117184638977, "logps/chosen": -58.73908233642578, "logps/rejected": -65.84282684326172, "loss": 0.0998, "rewards/accuracies": 1.0, "rewards/chosen": -0.41919857263565063, "rewards/margins": 3.3531155586242676, "rewards/rejected": -3.7723135948181152, "step": 724 }, { "epoch": 4.296296296296296, "grad_norm": 10.540861291398153, "learning_rate": 2.594173523478864e-07, "logits/chosen": -1.4997618198394775, "logits/rejected": -1.5314929485321045, "logps/chosen": -41.16127014160156, "logps/rejected": -64.81993865966797, "loss": 0.1001, "rewards/accuracies": 1.0, "rewards/chosen": -0.14836309850215912, "rewards/margins": 4.376749038696289, "rewards/rejected": -4.525112152099609, "step": 725 }, { "epoch": 4.302222222222222, "grad_norm": 11.07779744891079, "learning_rate": 2.587681560803379e-07, "logits/chosen": -1.1978309154510498, "logits/rejected": -1.2351629734039307, "logps/chosen": -50.8704948425293, "logps/rejected": -63.83390808105469, "loss": 0.1212, "rewards/accuracies": 1.0, "rewards/chosen": 0.03955966234207153, "rewards/margins": 3.869476318359375, "rewards/rejected": -3.829916477203369, "step": 726 }, { "epoch": 4.308148148148148, "grad_norm": 8.191167235944452, "learning_rate": 2.5811890060822754e-07, "logits/chosen": -1.0349016189575195, "logits/rejected": -1.0574527978897095, "logps/chosen": -63.94342803955078, "logps/rejected": -69.69291687011719, "loss": 0.0729, "rewards/accuracies": 1.0, "rewards/chosen": -0.6818037629127502, "rewards/margins": 4.685443878173828, "rewards/rejected": -5.367248058319092, "step": 727 }, { "epoch": 4.314074074074074, "grad_norm": 6.934328643488799, "learning_rate": 2.574695903154744e-07, "logits/chosen": -1.2235246896743774, "logits/rejected": -1.323827862739563, "logps/chosen": -56.161659240722656, "logps/rejected": -62.984310150146484, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": -0.3723098635673523, "rewards/margins": 4.63085412979126, "rewards/rejected": -5.003163814544678, "step": 728 }, { "epoch": 4.32, "grad_norm": 8.367680585727957, "learning_rate": 2.5682022958636753e-07, "logits/chosen": -1.2532156705856323, "logits/rejected": -1.2759723663330078, "logps/chosen": -41.34139633178711, "logps/rejected": -65.8352279663086, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": -0.27511006593704224, "rewards/margins": 4.3679704666137695, "rewards/rejected": -4.643080711364746, "step": 729 }, { "epoch": 4.325925925925926, "grad_norm": 7.841973350120815, "learning_rate": 2.5617082280553655e-07, "logits/chosen": -1.1496574878692627, "logits/rejected": -1.1879431009292603, "logps/chosen": -46.50282287597656, "logps/rejected": -60.425148010253906, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/chosen": -0.7109239101409912, "rewards/margins": 2.608046770095825, "rewards/rejected": -3.3189704418182373, "step": 730 }, { "epoch": 4.331851851851852, "grad_norm": 7.2321529113116165, "learning_rate": 2.5552137435792215e-07, "logits/chosen": -1.3128197193145752, "logits/rejected": -1.427548885345459, "logps/chosen": -57.724605560302734, "logps/rejected": -60.89788818359375, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 0.1999157816171646, "rewards/margins": 4.722375869750977, "rewards/rejected": -4.522459983825684, "step": 731 }, { "epoch": 4.337777777777778, "grad_norm": 9.163982491493682, "learning_rate": 2.5487188862874633e-07, "logits/chosen": -1.171358346939087, "logits/rejected": -1.3380895853042603, "logps/chosen": -39.11927795410156, "logps/rejected": -61.229248046875, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 0.14069584012031555, "rewards/margins": 4.519740104675293, "rewards/rejected": -4.379044055938721, "step": 732 }, { "epoch": 4.343703703703704, "grad_norm": 7.364719899880132, "learning_rate": 2.542223700034827e-07, "logits/chosen": -1.0924508571624756, "logits/rejected": -1.2838900089263916, "logps/chosen": -38.81422424316406, "logps/rejected": -69.35069274902344, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": -0.031546324491500854, "rewards/margins": 5.743241786956787, "rewards/rejected": -5.774788856506348, "step": 733 }, { "epoch": 4.3496296296296295, "grad_norm": 6.448101416437852, "learning_rate": 2.535728228678273e-07, "logits/chosen": -0.9950582385063171, "logits/rejected": -1.1089781522750854, "logps/chosen": -42.76914978027344, "logps/rejected": -63.94660568237305, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": -0.29362452030181885, "rewards/margins": 4.061101913452148, "rewards/rejected": -4.354726314544678, "step": 734 }, { "epoch": 4.355555555555555, "grad_norm": 7.151314050055646, "learning_rate": 2.529232516076684e-07, "logits/chosen": -1.181514024734497, "logits/rejected": -1.3054252862930298, "logps/chosen": -40.351993560791016, "logps/rejected": -58.15304183959961, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": -0.06079056113958359, "rewards/margins": 3.7480669021606445, "rewards/rejected": -3.8088576793670654, "step": 735 }, { "epoch": 4.361481481481482, "grad_norm": 7.714066130134251, "learning_rate": 2.522736606090572e-07, "logits/chosen": -1.249624252319336, "logits/rejected": -1.2443642616271973, "logps/chosen": -51.097660064697266, "logps/rejected": -64.75337219238281, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": -0.07368945330381393, "rewards/margins": 3.9429399967193604, "rewards/rejected": -4.016629695892334, "step": 736 }, { "epoch": 4.367407407407407, "grad_norm": 5.457407893062819, "learning_rate": 2.5162405425817804e-07, "logits/chosen": -1.201377511024475, "logits/rejected": -1.2646445035934448, "logps/chosen": -44.47233963012695, "logps/rejected": -71.8295669555664, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": -0.576902449131012, "rewards/margins": 5.617945671081543, "rewards/rejected": -6.194847106933594, "step": 737 }, { "epoch": 4.373333333333333, "grad_norm": 5.615875455951263, "learning_rate": 2.5097443694131944e-07, "logits/chosen": -1.2609564065933228, "logits/rejected": -1.3528251647949219, "logps/chosen": -50.52897262573242, "logps/rejected": -85.29624938964844, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017972737550735474, "rewards/margins": 5.539052486419678, "rewards/rejected": -5.540849685668945, "step": 738 }, { "epoch": 4.37925925925926, "grad_norm": 7.915562673196729, "learning_rate": 2.503248130448434e-07, "logits/chosen": -1.066528081893921, "logits/rejected": -1.1395859718322754, "logps/chosen": -48.640525817871094, "logps/rejected": -67.96998596191406, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": -0.943220317363739, "rewards/margins": 5.604055881500244, "rewards/rejected": -6.547276020050049, "step": 739 }, { "epoch": 4.385185185185185, "grad_norm": 8.204759003681545, "learning_rate": 2.496751869551567e-07, "logits/chosen": -1.2988148927688599, "logits/rejected": -1.2978628873825073, "logps/chosen": -60.13380813598633, "logps/rejected": -73.75167846679688, "loss": 0.0846, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7862513065338135, "rewards/margins": 3.2615106105804443, "rewards/rejected": -4.047761917114258, "step": 740 }, { "epoch": 4.391111111111111, "grad_norm": 7.2931351209323765, "learning_rate": 2.4902556305868064e-07, "logits/chosen": -1.398193120956421, "logits/rejected": -1.4105606079101562, "logps/chosen": -47.56850814819336, "logps/rejected": -72.68304443359375, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": -0.7310683131217957, "rewards/margins": 5.0538201332092285, "rewards/rejected": -5.784888744354248, "step": 741 }, { "epoch": 4.397037037037037, "grad_norm": 8.125782342021866, "learning_rate": 2.4837594574182194e-07, "logits/chosen": -1.4070849418640137, "logits/rejected": -1.404382348060608, "logps/chosen": -49.6085319519043, "logps/rejected": -60.756046295166016, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": -0.6858243346214294, "rewards/margins": 3.5082602500915527, "rewards/rejected": -4.194085121154785, "step": 742 }, { "epoch": 4.402962962962963, "grad_norm": 8.999809351748363, "learning_rate": 2.477263393909429e-07, "logits/chosen": -1.2002407312393188, "logits/rejected": -1.31519615650177, "logps/chosen": -48.29846954345703, "logps/rejected": -67.71229553222656, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": -0.3317267596721649, "rewards/margins": 4.04105806350708, "rewards/rejected": -4.3727850914001465, "step": 743 }, { "epoch": 4.408888888888889, "grad_norm": 9.416772169128135, "learning_rate": 2.4707674839233165e-07, "logits/chosen": -1.0937350988388062, "logits/rejected": -1.089403510093689, "logps/chosen": -45.1909065246582, "logps/rejected": -65.4791488647461, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -0.8421599268913269, "rewards/margins": 5.240363121032715, "rewards/rejected": -6.082523345947266, "step": 744 }, { "epoch": 4.4148148148148145, "grad_norm": 7.888974500578905, "learning_rate": 2.4642717713217266e-07, "logits/chosen": -1.3409512042999268, "logits/rejected": -1.400418758392334, "logps/chosen": -55.626731872558594, "logps/rejected": -68.84590911865234, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": -1.1184844970703125, "rewards/margins": 5.060961723327637, "rewards/rejected": -6.179446697235107, "step": 745 }, { "epoch": 4.420740740740741, "grad_norm": 7.559761044306629, "learning_rate": 2.4577762999651727e-07, "logits/chosen": -1.343059778213501, "logits/rejected": -1.4584920406341553, "logps/chosen": -43.617576599121094, "logps/rejected": -74.24542236328125, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": -0.6335965991020203, "rewards/margins": 5.247225284576416, "rewards/rejected": -5.88082218170166, "step": 746 }, { "epoch": 4.426666666666667, "grad_norm": 6.632838110103838, "learning_rate": 2.451281113712537e-07, "logits/chosen": -1.1152293682098389, "logits/rejected": -1.1278988122940063, "logps/chosen": -45.689693450927734, "logps/rejected": -64.86377716064453, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": -0.05360722541809082, "rewards/margins": 4.151103496551514, "rewards/rejected": -4.204710960388184, "step": 747 }, { "epoch": 4.432592592592592, "grad_norm": 11.658937284533938, "learning_rate": 2.4447862564207783e-07, "logits/chosen": -1.2127296924591064, "logits/rejected": -1.1536256074905396, "logps/chosen": -44.75196838378906, "logps/rejected": -76.20709991455078, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": -0.8044648766517639, "rewards/margins": 4.883936882019043, "rewards/rejected": -5.688401222229004, "step": 748 }, { "epoch": 4.438518518518519, "grad_norm": 8.256649604861265, "learning_rate": 2.438291771944635e-07, "logits/chosen": -1.3526889085769653, "logits/rejected": -1.3918815851211548, "logps/chosen": -35.526405334472656, "logps/rejected": -54.13592529296875, "loss": 0.0901, "rewards/accuracies": 1.0, "rewards/chosen": 0.2358112633228302, "rewards/margins": 3.4109444618225098, "rewards/rejected": -3.175133228302002, "step": 749 }, { "epoch": 4.444444444444445, "grad_norm": 7.328075255957023, "learning_rate": 2.431797704136325e-07, "logits/chosen": -1.359628677368164, "logits/rejected": -1.4804301261901855, "logps/chosen": -39.079750061035156, "logps/rejected": -73.90196228027344, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": -0.36313414573669434, "rewards/margins": 4.543010711669922, "rewards/rejected": -4.906144618988037, "step": 750 }, { "epoch": 4.45037037037037, "grad_norm": 7.689492001307305, "learning_rate": 2.425304096845256e-07, "logits/chosen": -1.3346648216247559, "logits/rejected": -1.3804266452789307, "logps/chosen": -79.88153839111328, "logps/rejected": -85.34058380126953, "loss": 0.0637, "rewards/accuracies": 1.0, "rewards/chosen": -1.5813403129577637, "rewards/margins": 5.533379554748535, "rewards/rejected": -7.114720344543457, "step": 751 }, { "epoch": 4.456296296296296, "grad_norm": 6.295627131379865, "learning_rate": 2.4188109939177244e-07, "logits/chosen": -1.2872998714447021, "logits/rejected": -1.3642802238464355, "logps/chosen": -50.76905059814453, "logps/rejected": -71.48062896728516, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": 0.1857454478740692, "rewards/margins": 4.2039079666137695, "rewards/rejected": -4.018162250518799, "step": 752 }, { "epoch": 4.4622222222222225, "grad_norm": 7.7337867144161505, "learning_rate": 2.412318439196621e-07, "logits/chosen": -1.2910356521606445, "logits/rejected": -1.280400276184082, "logps/chosen": -53.59968566894531, "logps/rejected": -50.8829231262207, "loss": 0.0733, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3582778573036194, "rewards/margins": 3.956660270690918, "rewards/rejected": -4.314938068389893, "step": 753 }, { "epoch": 4.468148148148148, "grad_norm": 5.6372956740874836, "learning_rate": 2.405826476521137e-07, "logits/chosen": -1.0210390090942383, "logits/rejected": -1.04572331905365, "logps/chosen": -43.65760040283203, "logps/rejected": -73.2513656616211, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": -0.07161092013120651, "rewards/margins": 6.007022380828857, "rewards/rejected": -6.0786333084106445, "step": 754 }, { "epoch": 4.474074074074074, "grad_norm": 6.989037730437913, "learning_rate": 2.399335149726463e-07, "logits/chosen": -1.4408361911773682, "logits/rejected": -1.4506512880325317, "logps/chosen": -42.89796447753906, "logps/rejected": -52.081092834472656, "loss": 0.0699, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2379654198884964, "rewards/margins": 3.7549290657043457, "rewards/rejected": -3.516963481903076, "step": 755 }, { "epoch": 4.48, "grad_norm": 10.371093389622446, "learning_rate": 2.392844502643497e-07, "logits/chosen": -1.347337007522583, "logits/rejected": -1.3378440141677856, "logps/chosen": -47.60035705566406, "logps/rejected": -67.4775619506836, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": -0.2960338294506073, "rewards/margins": 4.631132125854492, "rewards/rejected": -4.927165508270264, "step": 756 }, { "epoch": 4.485925925925926, "grad_norm": 6.9101499487233236, "learning_rate": 2.3863545790985485e-07, "logits/chosen": -1.2323015928268433, "logits/rejected": -1.2121610641479492, "logps/chosen": -53.849029541015625, "logps/rejected": -69.28178405761719, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -0.5624325275421143, "rewards/margins": 5.044997215270996, "rewards/rejected": -5.607429504394531, "step": 757 }, { "epoch": 4.491851851851852, "grad_norm": 7.08379891264837, "learning_rate": 2.379865422913042e-07, "logits/chosen": -1.387258768081665, "logits/rejected": -1.3943510055541992, "logps/chosen": -40.565216064453125, "logps/rejected": -62.537567138671875, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -0.456316202878952, "rewards/margins": 3.579047918319702, "rewards/rejected": -4.035364151000977, "step": 758 }, { "epoch": 4.497777777777777, "grad_norm": 7.22602339985227, "learning_rate": 2.3733770779032184e-07, "logits/chosen": -1.1654456853866577, "logits/rejected": -1.251575231552124, "logps/chosen": -50.35562515258789, "logps/rejected": -58.588584899902344, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": -0.7475253939628601, "rewards/margins": 4.376987934112549, "rewards/rejected": -5.124513626098633, "step": 759 }, { "epoch": 4.503703703703704, "grad_norm": 7.976245827333884, "learning_rate": 2.3668895878798423e-07, "logits/chosen": -1.2458069324493408, "logits/rejected": -1.286975383758545, "logps/chosen": -39.373497009277344, "logps/rejected": -55.45878219604492, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 0.21001428365707397, "rewards/margins": 2.6100215911865234, "rewards/rejected": -2.4000072479248047, "step": 760 }, { "epoch": 4.50962962962963, "grad_norm": 5.6749553128019885, "learning_rate": 2.360402996647906e-07, "logits/chosen": -1.0600789785385132, "logits/rejected": -1.1194578409194946, "logps/chosen": -54.22690200805664, "logps/rejected": -84.67164611816406, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": -1.4948400259017944, "rewards/margins": 6.4912896156311035, "rewards/rejected": -7.9861297607421875, "step": 761 }, { "epoch": 4.515555555555555, "grad_norm": 8.132240227652444, "learning_rate": 2.3539173480063318e-07, "logits/chosen": -1.2046082019805908, "logits/rejected": -1.321290135383606, "logps/chosen": -46.84300231933594, "logps/rejected": -61.15909194946289, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": -0.3092740774154663, "rewards/margins": 3.509507179260254, "rewards/rejected": -3.8187813758850098, "step": 762 }, { "epoch": 4.521481481481482, "grad_norm": 9.233829676880644, "learning_rate": 2.3474326857476783e-07, "logits/chosen": -1.122426986694336, "logits/rejected": -1.3011819124221802, "logps/chosen": -41.219139099121094, "logps/rejected": -60.89826965332031, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": -0.3601323366165161, "rewards/margins": 4.265294075012207, "rewards/rejected": -4.625426292419434, "step": 763 }, { "epoch": 4.5274074074074075, "grad_norm": 7.528293156397691, "learning_rate": 2.340949053657843e-07, "logits/chosen": -1.3578842878341675, "logits/rejected": -1.2270824909210205, "logps/chosen": -53.93731689453125, "logps/rejected": -68.09098815917969, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 0.41589680314064026, "rewards/margins": 5.329880714416504, "rewards/rejected": -4.913984298706055, "step": 764 }, { "epoch": 4.533333333333333, "grad_norm": 8.611181849954223, "learning_rate": 2.3344664955157685e-07, "logits/chosen": -0.9866389036178589, "logits/rejected": -1.0442872047424316, "logps/chosen": -34.870948791503906, "logps/rejected": -60.437259674072266, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": -0.3536360263824463, "rewards/margins": 4.980638027191162, "rewards/rejected": -5.3342742919921875, "step": 765 }, { "epoch": 4.539259259259259, "grad_norm": 8.391925166272058, "learning_rate": 2.3279850550931458e-07, "logits/chosen": -1.1695847511291504, "logits/rejected": -1.2806165218353271, "logps/chosen": -49.50543212890625, "logps/rejected": -69.00588989257812, "loss": 0.0673, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20481249690055847, "rewards/margins": 5.360518455505371, "rewards/rejected": -5.56533145904541, "step": 766 }, { "epoch": 4.545185185185185, "grad_norm": 10.596638977003911, "learning_rate": 2.3215047761541172e-07, "logits/chosen": -1.15887451171875, "logits/rejected": -1.2061165571212769, "logps/chosen": -37.335731506347656, "logps/rejected": -65.72675323486328, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": -0.2207244336605072, "rewards/margins": 4.417181968688965, "rewards/rejected": -4.637906074523926, "step": 767 }, { "epoch": 4.551111111111111, "grad_norm": 8.380077519111088, "learning_rate": 2.3150257024549845e-07, "logits/chosen": -1.244296669960022, "logits/rejected": -1.2550981044769287, "logps/chosen": -37.77351379394531, "logps/rejected": -57.30008316040039, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": -0.8602774739265442, "rewards/margins": 4.512520790100098, "rewards/rejected": -5.372798442840576, "step": 768 }, { "epoch": 4.557037037037037, "grad_norm": 6.4514979238938475, "learning_rate": 2.3085478777439096e-07, "logits/chosen": -1.166955590248108, "logits/rejected": -1.2839741706848145, "logps/chosen": -50.53374481201172, "logps/rejected": -60.122703552246094, "loss": 0.0571, "rewards/accuracies": 1.0, "rewards/chosen": -0.5598335266113281, "rewards/margins": 4.1827216148376465, "rewards/rejected": -4.742555141448975, "step": 769 }, { "epoch": 4.562962962962963, "grad_norm": 8.023421744646246, "learning_rate": 2.302071345760622e-07, "logits/chosen": -1.3408299684524536, "logits/rejected": -1.3236316442489624, "logps/chosen": -65.58592224121094, "logps/rejected": -64.16624450683594, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -0.36367684602737427, "rewards/margins": 4.246555328369141, "rewards/rejected": -4.610232353210449, "step": 770 }, { "epoch": 4.568888888888889, "grad_norm": 7.446315879947793, "learning_rate": 2.2955961502361232e-07, "logits/chosen": -1.3255586624145508, "logits/rejected": -1.3266708850860596, "logps/chosen": -47.81196975708008, "logps/rejected": -57.09688186645508, "loss": 0.0806, "rewards/accuracies": 0.9375, "rewards/chosen": -0.008279271423816681, "rewards/margins": 3.692826747894287, "rewards/rejected": -3.701106071472168, "step": 771 }, { "epoch": 4.574814814814815, "grad_norm": 7.029370466443028, "learning_rate": 2.2891223348923882e-07, "logits/chosen": -1.1787618398666382, "logits/rejected": -1.282931923866272, "logps/chosen": -58.13561248779297, "logps/rejected": -81.64736938476562, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": -0.6242808103561401, "rewards/margins": 5.2768659591674805, "rewards/rejected": -5.901146411895752, "step": 772 }, { "epoch": 4.58074074074074, "grad_norm": 7.712344599626733, "learning_rate": 2.2826499434420745e-07, "logits/chosen": -0.9917160272598267, "logits/rejected": -1.097461462020874, "logps/chosen": -45.970176696777344, "logps/rejected": -62.3052978515625, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": -0.7307544350624084, "rewards/margins": 4.317347049713135, "rewards/rejected": -5.048101425170898, "step": 773 }, { "epoch": 4.586666666666667, "grad_norm": 9.525692855012032, "learning_rate": 2.2761790195882261e-07, "logits/chosen": -1.1223633289337158, "logits/rejected": -1.1890718936920166, "logps/chosen": -41.621124267578125, "logps/rejected": -69.78456115722656, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": -0.1493055820465088, "rewards/margins": 4.050914764404297, "rewards/rejected": -4.200220108032227, "step": 774 }, { "epoch": 4.592592592592593, "grad_norm": 8.524809696419899, "learning_rate": 2.2697096070239748e-07, "logits/chosen": -1.2589701414108276, "logits/rejected": -1.205632209777832, "logps/chosen": -66.525146484375, "logps/rejected": -65.48693084716797, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": -0.6359963417053223, "rewards/margins": 3.971651554107666, "rewards/rejected": -4.607647895812988, "step": 775 }, { "epoch": 4.598518518518518, "grad_norm": 10.195817407678827, "learning_rate": 2.2632417494322503e-07, "logits/chosen": -1.2319070100784302, "logits/rejected": -1.2836008071899414, "logps/chosen": -52.31074523925781, "logps/rejected": -67.07086181640625, "loss": 0.082, "rewards/accuracies": 1.0, "rewards/chosen": 0.6152889132499695, "rewards/margins": 4.884402275085449, "rewards/rejected": -4.269113063812256, "step": 776 }, { "epoch": 4.604444444444445, "grad_norm": 6.88306364808887, "learning_rate": 2.2567754904854809e-07, "logits/chosen": -1.2738016843795776, "logits/rejected": -1.3152220249176025, "logps/chosen": -51.25276184082031, "logps/rejected": -66.99971008300781, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 0.15602001547813416, "rewards/margins": 4.498040199279785, "rewards/rejected": -4.342020511627197, "step": 777 }, { "epoch": 4.6103703703703705, "grad_norm": 9.223754328006974, "learning_rate": 2.2503108738453014e-07, "logits/chosen": -1.1845307350158691, "logits/rejected": -1.1550822257995605, "logps/chosen": -40.183563232421875, "logps/rejected": -61.688209533691406, "loss": 0.0811, "rewards/accuracies": 0.9375, "rewards/chosen": 0.43673551082611084, "rewards/margins": 3.9823086261749268, "rewards/rejected": -3.5455729961395264, "step": 778 }, { "epoch": 4.616296296296296, "grad_norm": 6.540828709467761, "learning_rate": 2.243847943162257e-07, "logits/chosen": -1.2384378910064697, "logits/rejected": -1.1350700855255127, "logps/chosen": -60.619468688964844, "logps/rejected": -60.44232177734375, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": -0.8905364274978638, "rewards/margins": 4.085331916809082, "rewards/rejected": -4.975867748260498, "step": 779 }, { "epoch": 4.622222222222222, "grad_norm": 8.735745587389715, "learning_rate": 2.23738674207551e-07, "logits/chosen": -1.2693932056427002, "logits/rejected": -1.3459255695343018, "logps/chosen": -43.43098831176758, "logps/rejected": -78.14689636230469, "loss": 0.0891, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10692919790744781, "rewards/margins": 6.120091438293457, "rewards/rejected": -6.013162612915039, "step": 780 }, { "epoch": 4.628148148148148, "grad_norm": 8.400037471171428, "learning_rate": 2.230927314212543e-07, "logits/chosen": -1.2819609642028809, "logits/rejected": -1.2576347589492798, "logps/chosen": -48.470706939697266, "logps/rejected": -60.44685363769531, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": -0.6669830083847046, "rewards/margins": 4.601341724395752, "rewards/rejected": -5.268324851989746, "step": 781 }, { "epoch": 4.634074074074074, "grad_norm": 7.613491024351818, "learning_rate": 2.2244697031888655e-07, "logits/chosen": -1.2104326486587524, "logits/rejected": -1.2761129140853882, "logps/chosen": -49.90730285644531, "logps/rejected": -66.0811767578125, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": -0.10209351778030396, "rewards/margins": 3.9792842864990234, "rewards/rejected": -4.081377983093262, "step": 782 }, { "epoch": 4.64, "grad_norm": 5.297293025171356, "learning_rate": 2.21801395260772e-07, "logits/chosen": -1.0075641870498657, "logits/rejected": -1.131706953048706, "logps/chosen": -47.31157684326172, "logps/rejected": -73.90620422363281, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.848071813583374, "rewards/margins": 6.792016983032227, "rewards/rejected": -7.64008903503418, "step": 783 }, { "epoch": 4.645925925925926, "grad_norm": 10.09311813131246, "learning_rate": 2.2115601060597852e-07, "logits/chosen": -1.0886931419372559, "logits/rejected": -1.167771577835083, "logps/chosen": -55.40933609008789, "logps/rejected": -62.43598937988281, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": -0.5595874786376953, "rewards/margins": 4.344570159912109, "rewards/rejected": -4.904156684875488, "step": 784 }, { "epoch": 4.651851851851852, "grad_norm": 7.293624381026964, "learning_rate": 2.2051082071228852e-07, "logits/chosen": -1.3670647144317627, "logits/rejected": -1.4086506366729736, "logps/chosen": -44.550655364990234, "logps/rejected": -54.24536895751953, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 0.411616712808609, "rewards/margins": 3.4762425422668457, "rewards/rejected": -3.0646257400512695, "step": 785 }, { "epoch": 4.657777777777778, "grad_norm": 8.356926519592564, "learning_rate": 2.1986582993616925e-07, "logits/chosen": -1.0711511373519897, "logits/rejected": -1.1973775625228882, "logps/chosen": -44.04728317260742, "logps/rejected": -72.26802062988281, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": -0.4325738251209259, "rewards/margins": 5.791550636291504, "rewards/rejected": -6.224124908447266, "step": 786 }, { "epoch": 4.663703703703703, "grad_norm": 7.361972820362379, "learning_rate": 2.192210426327435e-07, "logits/chosen": -1.2996392250061035, "logits/rejected": -1.3320250511169434, "logps/chosen": -49.28330993652344, "logps/rejected": -63.363739013671875, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": 0.45627862215042114, "rewards/margins": 5.045790195465088, "rewards/rejected": -4.589511394500732, "step": 787 }, { "epoch": 4.66962962962963, "grad_norm": 9.713435992349082, "learning_rate": 2.185764631557602e-07, "logits/chosen": -1.1986274719238281, "logits/rejected": -1.2022976875305176, "logps/chosen": -40.9835205078125, "logps/rejected": -68.665283203125, "loss": 0.1043, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37996906042099, "rewards/margins": 3.0767459869384766, "rewards/rejected": -3.4567153453826904, "step": 788 }, { "epoch": 4.6755555555555555, "grad_norm": 6.6923161913800255, "learning_rate": 2.1793209585756482e-07, "logits/chosen": -1.2109483480453491, "logits/rejected": -1.205720067024231, "logps/chosen": -78.32020568847656, "logps/rejected": -94.1859130859375, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": -0.6772147417068481, "rewards/margins": 6.811749458312988, "rewards/rejected": -7.488964557647705, "step": 789 }, { "epoch": 4.681481481481481, "grad_norm": 6.808448924207019, "learning_rate": 2.1728794508907038e-07, "logits/chosen": -1.247983455657959, "logits/rejected": -1.3049252033233643, "logps/chosen": -46.73481750488281, "logps/rejected": -105.78466796875, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": -0.5628355741500854, "rewards/margins": 5.637338161468506, "rewards/rejected": -6.200174331665039, "step": 790 }, { "epoch": 4.687407407407408, "grad_norm": 6.957507315180945, "learning_rate": 2.1664401519972774e-07, "logits/chosen": -1.09550940990448, "logits/rejected": -1.099479079246521, "logps/chosen": -62.40776443481445, "logps/rejected": -72.61018371582031, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": -0.5545617938041687, "rewards/margins": 5.671229362487793, "rewards/rejected": -6.225790977478027, "step": 791 }, { "epoch": 4.693333333333333, "grad_norm": 7.833525102618103, "learning_rate": 2.1600031053749634e-07, "logits/chosen": -1.2867189645767212, "logits/rejected": -1.2888667583465576, "logps/chosen": -55.19492721557617, "logps/rejected": -66.07937622070312, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": -0.6458514332771301, "rewards/margins": 3.38252854347229, "rewards/rejected": -4.028380393981934, "step": 792 }, { "epoch": 4.699259259259259, "grad_norm": 9.046276570843531, "learning_rate": 2.1535683544881478e-07, "logits/chosen": -1.3059468269348145, "logits/rejected": -1.4791343212127686, "logps/chosen": -42.17692947387695, "logps/rejected": -58.22074508666992, "loss": 0.0714, "rewards/accuracies": 1.0, "rewards/chosen": 0.08928295224905014, "rewards/margins": 4.443890571594238, "rewards/rejected": -4.354607582092285, "step": 793 }, { "epoch": 4.705185185185185, "grad_norm": 6.830679137241234, "learning_rate": 2.147135942785716e-07, "logits/chosen": -1.247341513633728, "logits/rejected": -1.2370492219924927, "logps/chosen": -43.17301559448242, "logps/rejected": -57.20549011230469, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": -0.9720429182052612, "rewards/margins": 3.6760683059692383, "rewards/rejected": -4.648111343383789, "step": 794 }, { "epoch": 4.711111111111111, "grad_norm": 7.0136622973775715, "learning_rate": 2.1407059137007583e-07, "logits/chosen": -1.170547366142273, "logits/rejected": -1.2341359853744507, "logps/chosen": -53.60289764404297, "logps/rejected": -57.69877624511719, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 0.0990561991930008, "rewards/margins": 4.3058180809021, "rewards/rejected": -4.206761837005615, "step": 795 }, { "epoch": 4.717037037037037, "grad_norm": 8.70239443133945, "learning_rate": 2.1342783106502777e-07, "logits/chosen": -1.1409107446670532, "logits/rejected": -1.1933441162109375, "logps/chosen": -53.885986328125, "logps/rejected": -81.279052734375, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": -0.367448627948761, "rewards/margins": 5.0045485496521, "rewards/rejected": -5.371996879577637, "step": 796 }, { "epoch": 4.722962962962963, "grad_norm": 9.48634525371379, "learning_rate": 2.1278531770348963e-07, "logits/chosen": -1.1792958974838257, "logits/rejected": -1.2433199882507324, "logps/chosen": -53.308433532714844, "logps/rejected": -67.98117065429688, "loss": 0.0702, "rewards/accuracies": 1.0, "rewards/chosen": 0.10387498140335083, "rewards/margins": 4.540947914123535, "rewards/rejected": -4.43707275390625, "step": 797 }, { "epoch": 4.728888888888889, "grad_norm": 7.014370618377176, "learning_rate": 2.121430556238559e-07, "logits/chosen": -1.0922884941101074, "logits/rejected": -1.1718698740005493, "logps/chosen": -42.14895248413086, "logps/rejected": -60.662933349609375, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 0.05287043750286102, "rewards/margins": 6.049391746520996, "rewards/rejected": -5.996521472930908, "step": 798 }, { "epoch": 4.734814814814815, "grad_norm": 6.411956467587046, "learning_rate": 2.115010491628247e-07, "logits/chosen": -1.1495752334594727, "logits/rejected": -1.2174811363220215, "logps/chosen": -39.45281982421875, "logps/rejected": -59.280311584472656, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": 0.009924978017807007, "rewards/margins": 4.528892993927002, "rewards/rejected": -4.518968105316162, "step": 799 }, { "epoch": 4.7407407407407405, "grad_norm": 6.922372990111659, "learning_rate": 2.1085930265536808e-07, "logits/chosen": -1.2290149927139282, "logits/rejected": -1.2082287073135376, "logps/chosen": -35.51820373535156, "logps/rejected": -54.339359283447266, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -0.14226499199867249, "rewards/margins": 3.9152727127075195, "rewards/rejected": -4.057538032531738, "step": 800 }, { "epoch": 4.746666666666667, "grad_norm": 8.936768807671625, "learning_rate": 2.1021782043470278e-07, "logits/chosen": -1.1941179037094116, "logits/rejected": -1.2901239395141602, "logps/chosen": -60.116851806640625, "logps/rejected": -72.54811096191406, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": -0.2554321885108948, "rewards/margins": 4.877737998962402, "rewards/rejected": -5.133170127868652, "step": 801 }, { "epoch": 4.752592592592593, "grad_norm": 7.721368969119234, "learning_rate": 2.0957660683226103e-07, "logits/chosen": -1.2028778791427612, "logits/rejected": -1.2239680290222168, "logps/chosen": -44.84784698486328, "logps/rejected": -61.98634338378906, "loss": 0.0817, "rewards/accuracies": 0.9375, "rewards/chosen": -0.28557777404785156, "rewards/margins": 4.31488037109375, "rewards/rejected": -4.600458145141602, "step": 802 }, { "epoch": 4.758518518518518, "grad_norm": 8.44095587390649, "learning_rate": 2.0893566617766126e-07, "logits/chosen": -1.3699320554733276, "logits/rejected": -1.3465888500213623, "logps/chosen": -54.631710052490234, "logps/rejected": -57.47709274291992, "loss": 0.0724, "rewards/accuracies": 0.9375, "rewards/chosen": 0.38626307249069214, "rewards/margins": 3.516538619995117, "rewards/rejected": -3.1302754878997803, "step": 803 }, { "epoch": 4.764444444444445, "grad_norm": 8.026629554127119, "learning_rate": 2.0829500279867891e-07, "logits/chosen": -1.2593369483947754, "logits/rejected": -1.3323007822036743, "logps/chosen": -33.00196075439453, "logps/rejected": -60.59766387939453, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": 0.5274396538734436, "rewards/margins": 4.80533504486084, "rewards/rejected": -4.277894973754883, "step": 804 }, { "epoch": 4.770370370370371, "grad_norm": 5.545357033235097, "learning_rate": 2.0765462102121719e-07, "logits/chosen": -1.2611300945281982, "logits/rejected": -1.2821089029312134, "logps/chosen": -37.6617431640625, "logps/rejected": -56.39625930786133, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": -0.23814518749713898, "rewards/margins": 3.720951795578003, "rewards/rejected": -3.959096908569336, "step": 805 }, { "epoch": 4.776296296296296, "grad_norm": 9.33283152045016, "learning_rate": 2.0701452516927797e-07, "logits/chosen": -1.1440215110778809, "logits/rejected": -1.1519482135772705, "logps/chosen": -54.63264846801758, "logps/rejected": -75.48451232910156, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": -0.687347948551178, "rewards/margins": 5.244063377380371, "rewards/rejected": -5.931410789489746, "step": 806 }, { "epoch": 4.782222222222222, "grad_norm": 6.837505033247797, "learning_rate": 2.0637471956493234e-07, "logits/chosen": -1.146953821182251, "logits/rejected": -1.2953850030899048, "logps/chosen": -36.457977294921875, "logps/rejected": -64.86138916015625, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": -0.5366591215133667, "rewards/margins": 4.506161689758301, "rewards/rejected": -5.042820453643799, "step": 807 }, { "epoch": 4.7881481481481485, "grad_norm": 8.388100188306844, "learning_rate": 2.0573520852829164e-07, "logits/chosen": -1.0507543087005615, "logits/rejected": -1.0924817323684692, "logps/chosen": -39.18290710449219, "logps/rejected": -56.02238082885742, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": -0.6681569814682007, "rewards/margins": 4.3059844970703125, "rewards/rejected": -4.974141597747803, "step": 808 }, { "epoch": 4.794074074074074, "grad_norm": 8.473188918963196, "learning_rate": 2.0509599637747818e-07, "logits/chosen": -1.3122702836990356, "logits/rejected": -1.2597553730010986, "logps/chosen": -44.254432678222656, "logps/rejected": -61.42188262939453, "loss": 0.064, "rewards/accuracies": 1.0, "rewards/chosen": -0.7810443639755249, "rewards/margins": 4.892861843109131, "rewards/rejected": -5.673906326293945, "step": 809 }, { "epoch": 4.8, "grad_norm": 7.661013362311576, "learning_rate": 2.0445708742859625e-07, "logits/chosen": -1.1926181316375732, "logits/rejected": -1.1875218152999878, "logps/chosen": -56.58489990234375, "logps/rejected": -69.30337524414062, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": -0.6484168767929077, "rewards/margins": 4.6451520919799805, "rewards/rejected": -5.2935686111450195, "step": 810 }, { "epoch": 4.805925925925926, "grad_norm": 8.740225258601434, "learning_rate": 2.0381848599570273e-07, "logits/chosen": -1.1209958791732788, "logits/rejected": -1.1884212493896484, "logps/chosen": -35.961971282958984, "logps/rejected": -50.096317291259766, "loss": 0.0905, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1351906806230545, "rewards/margins": 4.218876361846924, "rewards/rejected": -4.083685398101807, "step": 811 }, { "epoch": 4.811851851851852, "grad_norm": 8.158970785472667, "learning_rate": 2.0318019639077803e-07, "logits/chosen": -1.2397578954696655, "logits/rejected": -1.3928254842758179, "logps/chosen": -42.682044982910156, "logps/rejected": -72.9716567993164, "loss": 0.0751, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0666639506816864, "rewards/margins": 4.914846420288086, "rewards/rejected": -4.981510162353516, "step": 812 }, { "epoch": 4.817777777777778, "grad_norm": 9.478210960221906, "learning_rate": 2.0254222292369724e-07, "logits/chosen": -1.176468849182129, "logits/rejected": -1.2728384733200073, "logps/chosen": -36.69878387451172, "logps/rejected": -62.156524658203125, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": -0.6981914043426514, "rewards/margins": 3.4020438194274902, "rewards/rejected": -4.1002349853515625, "step": 813 }, { "epoch": 4.823703703703703, "grad_norm": 7.439639636736858, "learning_rate": 2.0190456990220055e-07, "logits/chosen": -1.2218072414398193, "logits/rejected": -1.2740261554718018, "logps/chosen": -43.83546447753906, "logps/rejected": -67.54747009277344, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 0.2251005917787552, "rewards/margins": 5.161179542541504, "rewards/rejected": -4.936079025268555, "step": 814 }, { "epoch": 4.82962962962963, "grad_norm": 7.5097409856257595, "learning_rate": 2.0126724163186474e-07, "logits/chosen": -1.116912603378296, "logits/rejected": -1.166030764579773, "logps/chosen": -49.36180877685547, "logps/rejected": -60.51682662963867, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -0.238020122051239, "rewards/margins": 4.75913143157959, "rewards/rejected": -4.9971513748168945, "step": 815 }, { "epoch": 4.835555555555556, "grad_norm": 7.191731456701853, "learning_rate": 2.006302424160735e-07, "logits/chosen": -1.2778334617614746, "logits/rejected": -1.3720132112503052, "logps/chosen": -44.0409049987793, "logps/rejected": -60.05315399169922, "loss": 0.0666, "rewards/accuracies": 1.0, "rewards/chosen": -0.5977808237075806, "rewards/margins": 4.33931303024292, "rewards/rejected": -4.937093734741211, "step": 816 }, { "epoch": 4.841481481481481, "grad_norm": 8.418630189436973, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -1.2416424751281738, "logits/rejected": -1.2487435340881348, "logps/chosen": -53.07795715332031, "logps/rejected": -70.923095703125, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": -0.4244321286678314, "rewards/margins": 4.733457565307617, "rewards/rejected": -5.1578898429870605, "step": 817 }, { "epoch": 4.847407407407408, "grad_norm": 9.81718746931335, "learning_rate": 1.9935724835052196e-07, "logits/chosen": -1.1533488035202026, "logits/rejected": -1.120931625366211, "logps/chosen": -58.67433166503906, "logps/rejected": -74.95533752441406, "loss": 0.0906, "rewards/accuracies": 1.0, "rewards/chosen": -0.7449749112129211, "rewards/margins": 6.310980796813965, "rewards/rejected": -7.055954933166504, "step": 818 }, { "epoch": 4.8533333333333335, "grad_norm": 8.115955559787736, "learning_rate": 1.987212620963038e-07, "logits/chosen": -1.2583997249603271, "logits/rejected": -1.3349251747131348, "logps/chosen": -53.448116302490234, "logps/rejected": -75.52993774414062, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": -0.15489783883094788, "rewards/margins": 5.193795680999756, "rewards/rejected": -5.34869384765625, "step": 819 }, { "epoch": 4.859259259259259, "grad_norm": 6.723480724558627, "learning_rate": 1.9808562208765663e-07, "logits/chosen": -1.2606794834136963, "logits/rejected": -1.2723817825317383, "logps/chosen": -42.972747802734375, "logps/rejected": -66.92098999023438, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": -0.50331050157547, "rewards/margins": 4.977799415588379, "rewards/rejected": -5.481109619140625, "step": 820 }, { "epoch": 4.865185185185185, "grad_norm": 6.72884777182804, "learning_rate": 1.9745033261656486e-07, "logits/chosen": -1.3054922819137573, "logits/rejected": -1.2133138179779053, "logps/chosen": -53.268924713134766, "logps/rejected": -73.72822570800781, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": -0.7263810634613037, "rewards/margins": 4.860612392425537, "rewards/rejected": -5.586993217468262, "step": 821 }, { "epoch": 4.871111111111111, "grad_norm": 6.7682757332113495, "learning_rate": 1.9681539797264578e-07, "logits/chosen": -1.1518858671188354, "logits/rejected": -1.1976454257965088, "logps/chosen": -61.2183837890625, "logps/rejected": -79.68696594238281, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -0.49593448638916016, "rewards/margins": 5.0298919677734375, "rewards/rejected": -5.525826454162598, "step": 822 }, { "epoch": 4.877037037037037, "grad_norm": 10.44828841212963, "learning_rate": 1.96180822443121e-07, "logits/chosen": -1.2402451038360596, "logits/rejected": -1.3201285600662231, "logps/chosen": -52.995582580566406, "logps/rejected": -63.573150634765625, "loss": 0.101, "rewards/accuracies": 1.0, "rewards/chosen": -0.16973279416561127, "rewards/margins": 4.808659553527832, "rewards/rejected": -4.978392601013184, "step": 823 }, { "epoch": 4.882962962962963, "grad_norm": 11.841416570152811, "learning_rate": 1.955466103127871e-07, "logits/chosen": -1.0633020401000977, "logits/rejected": -1.1554279327392578, "logps/chosen": -41.22328186035156, "logps/rejected": -64.04997253417969, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": -0.6372347474098206, "rewards/margins": 4.569708347320557, "rewards/rejected": -5.206943511962891, "step": 824 }, { "epoch": 4.888888888888889, "grad_norm": 9.119245176045158, "learning_rate": 1.9491276586398715e-07, "logits/chosen": -0.972596287727356, "logits/rejected": -1.0817652940750122, "logps/chosen": -40.38216018676758, "logps/rejected": -71.95773315429688, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": -0.8263460397720337, "rewards/margins": 5.990384578704834, "rewards/rejected": -6.816730499267578, "step": 825 }, { "epoch": 4.894814814814815, "grad_norm": 7.084413600553766, "learning_rate": 1.9427929337658126e-07, "logits/chosen": -1.2040290832519531, "logits/rejected": -1.383630633354187, "logps/chosen": -40.241455078125, "logps/rejected": -61.860687255859375, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -0.3590248227119446, "rewards/margins": 4.8489580154418945, "rewards/rejected": -5.207983493804932, "step": 826 }, { "epoch": 4.900740740740741, "grad_norm": 7.182863168982642, "learning_rate": 1.9364619712791819e-07, "logits/chosen": -1.2070741653442383, "logits/rejected": -1.2761895656585693, "logps/chosen": -45.865875244140625, "logps/rejected": -65.07974243164062, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -0.03775513172149658, "rewards/margins": 4.42209529876709, "rewards/rejected": -4.459850788116455, "step": 827 }, { "epoch": 4.906666666666666, "grad_norm": 7.946011876870162, "learning_rate": 1.9301348139280627e-07, "logits/chosen": -1.175967812538147, "logits/rejected": -1.2586506605148315, "logps/chosen": -56.91388702392578, "logps/rejected": -59.17463684082031, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 0.16037920117378235, "rewards/margins": 3.1888394355773926, "rewards/rejected": -3.0284600257873535, "step": 828 }, { "epoch": 4.912592592592593, "grad_norm": 9.179383834460381, "learning_rate": 1.9238115044348434e-07, "logits/chosen": -1.1928423643112183, "logits/rejected": -1.2084673643112183, "logps/chosen": -67.9603500366211, "logps/rejected": -81.65391540527344, "loss": 0.0577, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1263477802276611, "rewards/margins": 5.509244918823242, "rewards/rejected": -6.635592937469482, "step": 829 }, { "epoch": 4.9185185185185185, "grad_norm": 9.264958893547135, "learning_rate": 1.9174920854959322e-07, "logits/chosen": -1.1068739891052246, "logits/rejected": -1.2803871631622314, "logps/chosen": -32.360023498535156, "logps/rejected": -56.196678161621094, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": -0.40388235449790955, "rewards/margins": 5.12398624420166, "rewards/rejected": -5.527868270874023, "step": 830 }, { "epoch": 4.924444444444444, "grad_norm": 10.908563975596744, "learning_rate": 1.9111765997814678e-07, "logits/chosen": -1.29888117313385, "logits/rejected": -1.3771575689315796, "logps/chosen": -44.184505462646484, "logps/rejected": -59.10567092895508, "loss": 0.0962, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19967499375343323, "rewards/margins": 4.624224662780762, "rewards/rejected": -4.424549579620361, "step": 831 }, { "epoch": 4.930370370370371, "grad_norm": 8.456676479609067, "learning_rate": 1.904865089935029e-07, "logits/chosen": -1.2636172771453857, "logits/rejected": -1.2544087171554565, "logps/chosen": -41.338768005371094, "logps/rejected": -61.473453521728516, "loss": 0.0762, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11037924885749817, "rewards/margins": 4.756615161895752, "rewards/rejected": -4.866994857788086, "step": 832 }, { "epoch": 4.936296296296296, "grad_norm": 6.202900054839411, "learning_rate": 1.8985575985733507e-07, "logits/chosen": -1.2785180807113647, "logits/rejected": -1.217121958732605, "logps/chosen": -48.604339599609375, "logps/rejected": -70.71717834472656, "loss": 0.0703, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3400578498840332, "rewards/margins": 4.592471599578857, "rewards/rejected": -4.932529449462891, "step": 833 }, { "epoch": 4.942222222222222, "grad_norm": 6.624373897036038, "learning_rate": 1.8922541682860326e-07, "logits/chosen": -1.138252854347229, "logits/rejected": -1.2604851722717285, "logps/chosen": -36.05076217651367, "logps/rejected": -55.33295440673828, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 0.23800107836723328, "rewards/margins": 4.528337478637695, "rewards/rejected": -4.290336608886719, "step": 834 }, { "epoch": 4.948148148148148, "grad_norm": 5.620900100070084, "learning_rate": 1.8859548416352536e-07, "logits/chosen": -1.324847936630249, "logits/rejected": -1.431929111480713, "logps/chosen": -44.402740478515625, "logps/rejected": -68.83207702636719, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.13397502899169922, "rewards/margins": 5.259936332702637, "rewards/rejected": -5.1259613037109375, "step": 835 }, { "epoch": 4.954074074074074, "grad_norm": 6.985083953063934, "learning_rate": 1.8796596611554838e-07, "logits/chosen": -1.3173155784606934, "logits/rejected": -1.1883106231689453, "logps/chosen": -49.2961311340332, "logps/rejected": -58.56343078613281, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": -0.21595799922943115, "rewards/margins": 4.060945510864258, "rewards/rejected": -4.27690315246582, "step": 836 }, { "epoch": 4.96, "grad_norm": 8.705680214772553, "learning_rate": 1.8733686693531982e-07, "logits/chosen": -1.1338430643081665, "logits/rejected": -1.2737352848052979, "logps/chosen": -44.42845153808594, "logps/rejected": -79.81404876708984, "loss": 0.0821, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1882634162902832, "rewards/margins": 5.241169452667236, "rewards/rejected": -6.429432392120361, "step": 837 }, { "epoch": 4.965925925925926, "grad_norm": 8.370685133521045, "learning_rate": 1.8670819087065882e-07, "logits/chosen": -1.1332958936691284, "logits/rejected": -1.1995506286621094, "logps/chosen": -49.437835693359375, "logps/rejected": -59.27742004394531, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": -0.6721725463867188, "rewards/margins": 4.094503879547119, "rewards/rejected": -4.76667594909668, "step": 838 }, { "epoch": 4.971851851851852, "grad_norm": 4.897226742571563, "learning_rate": 1.8607994216652756e-07, "logits/chosen": -1.0724759101867676, "logits/rejected": -1.1880738735198975, "logps/chosen": -40.61245346069336, "logps/rejected": -72.87381744384766, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -0.5802782773971558, "rewards/margins": 6.521996021270752, "rewards/rejected": -7.102274417877197, "step": 839 }, { "epoch": 4.977777777777778, "grad_norm": 6.480414219345178, "learning_rate": 1.8545212506500257e-07, "logits/chosen": -1.1149649620056152, "logits/rejected": -1.2812896966934204, "logps/chosen": -49.6659049987793, "logps/rejected": -59.63869094848633, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": -0.08736288547515869, "rewards/margins": 5.593204498291016, "rewards/rejected": -5.680567741394043, "step": 840 }, { "epoch": 4.9837037037037035, "grad_norm": 6.1186920179539035, "learning_rate": 1.848247438052461e-07, "logits/chosen": -1.209456443786621, "logits/rejected": -1.149521827697754, "logps/chosen": -60.91209411621094, "logps/rejected": -81.8936767578125, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": -0.6175415515899658, "rewards/margins": 6.043704986572266, "rewards/rejected": -6.661246299743652, "step": 841 }, { "epoch": 4.989629629629629, "grad_norm": 9.310621643988044, "learning_rate": 1.8419780262347754e-07, "logits/chosen": -1.2068432569503784, "logits/rejected": -1.233940839767456, "logps/chosen": -50.332767486572266, "logps/rejected": -68.87258911132812, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": -1.5439462661743164, "rewards/margins": 5.3185715675354, "rewards/rejected": -6.862517833709717, "step": 842 }, { "epoch": 4.995555555555556, "grad_norm": 8.427374147972621, "learning_rate": 1.835713057529447e-07, "logits/chosen": -1.1492252349853516, "logits/rejected": -1.2668813467025757, "logps/chosen": -40.98643112182617, "logps/rejected": -84.24874877929688, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": 0.22349956631660461, "rewards/margins": 5.515828609466553, "rewards/rejected": -5.292328834533691, "step": 843 }, { "epoch": 5.001481481481481, "grad_norm": 7.137776049040377, "learning_rate": 1.8294525742389545e-07, "logits/chosen": -1.3387656211853027, "logits/rejected": -1.2699817419052124, "logps/chosen": -51.69484329223633, "logps/rejected": -54.29808807373047, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": -0.3511824607849121, "rewards/margins": 3.246230363845825, "rewards/rejected": -3.5974128246307373, "step": 844 }, { "epoch": 5.007407407407407, "grad_norm": 5.743840005240809, "learning_rate": 1.8231966186354881e-07, "logits/chosen": -1.1772220134735107, "logits/rejected": -1.1998919248580933, "logps/chosen": -50.18497848510742, "logps/rejected": -67.80682373046875, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": -0.5682530403137207, "rewards/margins": 4.094499111175537, "rewards/rejected": -4.662752151489258, "step": 845 }, { "epoch": 5.013333333333334, "grad_norm": 6.795618286709746, "learning_rate": 1.8169452329606666e-07, "logits/chosen": -1.2579567432403564, "logits/rejected": -1.2464933395385742, "logps/chosen": -45.38380813598633, "logps/rejected": -79.08517456054688, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": -0.8201035261154175, "rewards/margins": 4.0954766273498535, "rewards/rejected": -4.915580749511719, "step": 846 }, { "epoch": 5.019259259259259, "grad_norm": 6.068510959671392, "learning_rate": 1.810698459425254e-07, "logits/chosen": -1.1743278503417969, "logits/rejected": -1.2346994876861572, "logps/chosen": -41.45006561279297, "logps/rejected": -52.600372314453125, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 0.1332920342683792, "rewards/margins": 4.325839042663574, "rewards/rejected": -4.192546844482422, "step": 847 }, { "epoch": 5.025185185185185, "grad_norm": 6.070290501886953, "learning_rate": 1.8044563402088682e-07, "logits/chosen": -1.0933340787887573, "logits/rejected": -1.0968542098999023, "logps/chosen": -43.97517013549805, "logps/rejected": -68.6432113647461, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": -0.2295885682106018, "rewards/margins": 5.160776615142822, "rewards/rejected": -5.3903656005859375, "step": 848 }, { "epoch": 5.0311111111111115, "grad_norm": 6.383699093215017, "learning_rate": 1.7982189174597033e-07, "logits/chosen": -1.219502568244934, "logits/rejected": -1.1766386032104492, "logps/chosen": -55.12873077392578, "logps/rejected": -68.77888488769531, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": -0.4659353494644165, "rewards/margins": 5.554776191711426, "rewards/rejected": -6.020711898803711, "step": 849 }, { "epoch": 5.037037037037037, "grad_norm": 6.024482599289236, "learning_rate": 1.7919862332942398e-07, "logits/chosen": -1.3594261407852173, "logits/rejected": -1.3427817821502686, "logps/chosen": -49.3066291809082, "logps/rejected": -58.31121826171875, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": -0.20746520161628723, "rewards/margins": 4.219393253326416, "rewards/rejected": -4.426857948303223, "step": 850 }, { "epoch": 5.042962962962963, "grad_norm": 6.112060590166798, "learning_rate": 1.785758329796963e-07, "logits/chosen": -1.1953470706939697, "logits/rejected": -1.2780898809432983, "logps/chosen": -42.03157424926758, "logps/rejected": -65.34711456298828, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -0.37147438526153564, "rewards/margins": 4.750897407531738, "rewards/rejected": -5.122371673583984, "step": 851 }, { "epoch": 5.0488888888888885, "grad_norm": 5.568282892153548, "learning_rate": 1.779535249020078e-07, "logits/chosen": -1.2823762893676758, "logits/rejected": -1.4365592002868652, "logps/chosen": -41.219505310058594, "logps/rejected": -55.70688247680664, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 0.6002290844917297, "rewards/margins": 4.6371564865112305, "rewards/rejected": -4.036927223205566, "step": 852 }, { "epoch": 5.054814814814815, "grad_norm": 4.2104157519083625, "learning_rate": 1.7733170329832262e-07, "logits/chosen": -1.0824943780899048, "logits/rejected": -1.1701000928878784, "logps/chosen": -37.631195068359375, "logps/rejected": -56.78057861328125, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -0.6821134090423584, "rewards/margins": 4.264049053192139, "rewards/rejected": -4.946162223815918, "step": 853 }, { "epoch": 5.060740740740741, "grad_norm": 6.659130886986735, "learning_rate": 1.7671037236732012e-07, "logits/chosen": -1.1604373455047607, "logits/rejected": -1.2753492593765259, "logps/chosen": -55.01195526123047, "logps/rejected": -76.13126373291016, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -0.37604719400405884, "rewards/margins": 6.232474327087402, "rewards/rejected": -6.608521461486816, "step": 854 }, { "epoch": 5.066666666666666, "grad_norm": 5.3537705230979125, "learning_rate": 1.760895363043663e-07, "logits/chosen": -1.1752749681472778, "logits/rejected": -1.262036681175232, "logps/chosen": -53.43190002441406, "logps/rejected": -74.62548065185547, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": -0.6357789635658264, "rewards/margins": 4.767390727996826, "rewards/rejected": -5.403169631958008, "step": 855 }, { "epoch": 5.072592592592593, "grad_norm": 3.8068601009017713, "learning_rate": 1.7546919930148603e-07, "logits/chosen": -1.0931473970413208, "logits/rejected": -1.1400827169418335, "logps/chosen": -75.83396911621094, "logps/rejected": -78.3897705078125, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -0.5985426306724548, "rewards/margins": 4.9465861320495605, "rewards/rejected": -5.545129299163818, "step": 856 }, { "epoch": 5.078518518518519, "grad_norm": 6.252936627705325, "learning_rate": 1.748493655473342e-07, "logits/chosen": -1.373226523399353, "logits/rejected": -1.4498156309127808, "logps/chosen": -49.392845153808594, "logps/rejected": -58.926536560058594, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": -0.544809103012085, "rewards/margins": 4.060660362243652, "rewards/rejected": -4.605469703674316, "step": 857 }, { "epoch": 5.084444444444444, "grad_norm": 7.446095832569485, "learning_rate": 1.742300392271678e-07, "logits/chosen": -1.1426042318344116, "logits/rejected": -1.2104812860488892, "logps/chosen": -38.60154724121094, "logps/rejected": -66.51348876953125, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 0.08904124796390533, "rewards/margins": 4.445226669311523, "rewards/rejected": -4.356184959411621, "step": 858 }, { "epoch": 5.09037037037037, "grad_norm": 4.401887258420257, "learning_rate": 1.7361122452281737e-07, "logits/chosen": -1.3515548706054688, "logits/rejected": -1.4137619733810425, "logps/chosen": -46.939727783203125, "logps/rejected": -59.44325256347656, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.899919867515564, "rewards/margins": 3.3372130393981934, "rewards/rejected": -4.237132549285889, "step": 859 }, { "epoch": 5.0962962962962965, "grad_norm": 5.7853569559732625, "learning_rate": 1.72992925612659e-07, "logits/chosen": -1.2066540718078613, "logits/rejected": -1.2842992544174194, "logps/chosen": -47.825103759765625, "logps/rejected": -70.65608215332031, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": -0.37702682614326477, "rewards/margins": 5.337638854980469, "rewards/rejected": -5.71466588973999, "step": 860 }, { "epoch": 5.102222222222222, "grad_norm": 5.4362349224675315, "learning_rate": 1.7237514667158596e-07, "logits/chosen": -1.3022295236587524, "logits/rejected": -1.3546056747436523, "logps/chosen": -50.296424865722656, "logps/rejected": -66.29060363769531, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -0.1895902454853058, "rewards/margins": 5.409600257873535, "rewards/rejected": -5.5991902351379395, "step": 861 }, { "epoch": 5.108148148148148, "grad_norm": 5.875751684828407, "learning_rate": 1.7175789187098055e-07, "logits/chosen": -1.044024109840393, "logits/rejected": -1.1330406665802002, "logps/chosen": -36.35478973388672, "logps/rejected": -62.587158203125, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 0.1854926198720932, "rewards/margins": 5.976263046264648, "rewards/rejected": -5.790770530700684, "step": 862 }, { "epoch": 5.114074074074074, "grad_norm": 5.519639345673725, "learning_rate": 1.7114116537868612e-07, "logits/chosen": -1.2666987180709839, "logits/rejected": -1.2495226860046387, "logps/chosen": -46.67084503173828, "logps/rejected": -58.346153259277344, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": 0.046361494809389114, "rewards/margins": 4.0351457595825195, "rewards/rejected": -3.9887845516204834, "step": 863 }, { "epoch": 5.12, "grad_norm": 5.611856872848983, "learning_rate": 1.705249713589786e-07, "logits/chosen": -1.2212047576904297, "logits/rejected": -1.3220913410186768, "logps/chosen": -53.67855453491211, "logps/rejected": -86.76349639892578, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -0.5181419849395752, "rewards/margins": 5.356942653656006, "rewards/rejected": -5.875084400177002, "step": 864 }, { "epoch": 5.125925925925926, "grad_norm": 5.1599714002027826, "learning_rate": 1.699093139725386e-07, "logits/chosen": -1.1640803813934326, "logits/rejected": -1.254544973373413, "logps/chosen": -58.018028259277344, "logps/rejected": -67.485595703125, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -0.989666223526001, "rewards/margins": 4.372481346130371, "rewards/rejected": -5.362147331237793, "step": 865 }, { "epoch": 5.131851851851851, "grad_norm": 6.291834529842612, "learning_rate": 1.6929419737642322e-07, "logits/chosen": -1.209142804145813, "logits/rejected": -1.2099839448928833, "logps/chosen": -47.084754943847656, "logps/rejected": -69.58710479736328, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.44162529706954956, "rewards/margins": 5.258059501647949, "rewards/rejected": -5.699685096740723, "step": 866 }, { "epoch": 5.137777777777778, "grad_norm": 4.29389402322372, "learning_rate": 1.686796257240381e-07, "logits/chosen": -1.2211939096450806, "logits/rejected": -1.2630951404571533, "logps/chosen": -43.643333435058594, "logps/rejected": -70.00041961669922, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 0.21772587299346924, "rewards/margins": 4.932277679443359, "rewards/rejected": -4.7145514488220215, "step": 867 }, { "epoch": 5.143703703703704, "grad_norm": 6.70678025622613, "learning_rate": 1.680656031651093e-07, "logits/chosen": -1.1878728866577148, "logits/rejected": -1.1749542951583862, "logps/chosen": -51.935516357421875, "logps/rejected": -71.19942474365234, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -0.9229670166969299, "rewards/margins": 4.8038554191589355, "rewards/rejected": -5.726822376251221, "step": 868 }, { "epoch": 5.149629629629629, "grad_norm": 4.358134041783962, "learning_rate": 1.6745213384565516e-07, "logits/chosen": -1.379859447479248, "logits/rejected": -1.3122217655181885, "logps/chosen": -52.930946350097656, "logps/rejected": -81.54460144042969, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -1.141561508178711, "rewards/margins": 5.625734806060791, "rewards/rejected": -6.767295837402344, "step": 869 }, { "epoch": 5.155555555555556, "grad_norm": 3.763065225153427, "learning_rate": 1.6683922190795845e-07, "logits/chosen": -1.0610485076904297, "logits/rejected": -1.186012864112854, "logps/chosen": -41.94168472290039, "logps/rejected": -75.10829162597656, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8855224847793579, "rewards/margins": 6.992925643920898, "rewards/rejected": -7.878448009490967, "step": 870 }, { "epoch": 5.161481481481482, "grad_norm": 6.137238349957374, "learning_rate": 1.6622687149053844e-07, "logits/chosen": -1.1331788301467896, "logits/rejected": -1.2375328540802002, "logps/chosen": -51.34503173828125, "logps/rejected": -75.72793579101562, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": -1.1209805011749268, "rewards/margins": 4.683239936828613, "rewards/rejected": -5.804220676422119, "step": 871 }, { "epoch": 5.167407407407407, "grad_norm": 5.621964866813937, "learning_rate": 1.6561508672812295e-07, "logits/chosen": -1.2639954090118408, "logits/rejected": -1.3647345304489136, "logps/chosen": -56.51701354980469, "logps/rejected": -70.59258270263672, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -0.37653955817222595, "rewards/margins": 3.593329906463623, "rewards/rejected": -3.9698691368103027, "step": 872 }, { "epoch": 5.173333333333334, "grad_norm": 9.958380211596328, "learning_rate": 1.650038717516203e-07, "logits/chosen": -1.298003077507019, "logits/rejected": -1.276810646057129, "logps/chosen": -60.997528076171875, "logps/rejected": -57.12252426147461, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": -1.4214093685150146, "rewards/margins": 3.4771430492401123, "rewards/rejected": -4.898552417755127, "step": 873 }, { "epoch": 5.1792592592592595, "grad_norm": 6.568096407056915, "learning_rate": 1.6439323068809137e-07, "logits/chosen": -1.2642948627471924, "logits/rejected": -1.278181791305542, "logps/chosen": -49.35980987548828, "logps/rejected": -68.7181625366211, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 0.15632173418998718, "rewards/margins": 4.260412216186523, "rewards/rejected": -4.104090690612793, "step": 874 }, { "epoch": 5.185185185185185, "grad_norm": 5.324980729784364, "learning_rate": 1.6378316766072196e-07, "logits/chosen": -1.0710474252700806, "logits/rejected": -1.2427352666854858, "logps/chosen": -52.37470626831055, "logps/rejected": -72.27552795410156, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": -0.8807699680328369, "rewards/margins": 5.8853373527526855, "rewards/rejected": -6.766107082366943, "step": 875 }, { "epoch": 5.191111111111111, "grad_norm": 7.891108260191516, "learning_rate": 1.6317368678879496e-07, "logits/chosen": -1.2832369804382324, "logits/rejected": -1.3103740215301514, "logps/chosen": -45.93217468261719, "logps/rejected": -55.650115966796875, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": -0.7958899736404419, "rewards/margins": 4.188290596008301, "rewards/rejected": -4.984180450439453, "step": 876 }, { "epoch": 5.197037037037037, "grad_norm": 6.184649944460298, "learning_rate": 1.6256479218766212e-07, "logits/chosen": -1.2717363834381104, "logits/rejected": -1.3072198629379272, "logps/chosen": -55.998291015625, "logps/rejected": -85.4422607421875, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": -0.28671860694885254, "rewards/margins": 5.818790912628174, "rewards/rejected": -6.105508804321289, "step": 877 }, { "epoch": 5.202962962962963, "grad_norm": 5.348342562947095, "learning_rate": 1.6195648796871687e-07, "logits/chosen": -1.004050374031067, "logits/rejected": -1.1158876419067383, "logps/chosen": -36.67620849609375, "logps/rejected": -66.01150512695312, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": -0.5501542091369629, "rewards/margins": 6.020908355712891, "rewards/rejected": -6.571062088012695, "step": 878 }, { "epoch": 5.208888888888889, "grad_norm": 5.533806185221065, "learning_rate": 1.6134877823936607e-07, "logits/chosen": -1.1981170177459717, "logits/rejected": -1.2208610773086548, "logps/chosen": -58.91746520996094, "logps/rejected": -80.06623840332031, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": -0.9180342555046082, "rewards/margins": 4.734535217285156, "rewards/rejected": -5.652569770812988, "step": 879 }, { "epoch": 5.214814814814815, "grad_norm": 4.919679414691683, "learning_rate": 1.6074166710300247e-07, "logits/chosen": -0.9741761684417725, "logits/rejected": -1.0934596061706543, "logps/chosen": -53.64191818237305, "logps/rejected": -64.29776000976562, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -0.5024899840354919, "rewards/margins": 5.4552178382873535, "rewards/rejected": -5.957707405090332, "step": 880 }, { "epoch": 5.220740740740741, "grad_norm": 5.883827857125823, "learning_rate": 1.60135158658977e-07, "logits/chosen": -1.2886649370193481, "logits/rejected": -1.3470699787139893, "logps/chosen": -64.49275207519531, "logps/rejected": -99.09591674804688, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": -2.056515693664551, "rewards/margins": 5.756255626678467, "rewards/rejected": -7.812771320343018, "step": 881 }, { "epoch": 5.226666666666667, "grad_norm": 5.4563657240893715, "learning_rate": 1.5952925700257115e-07, "logits/chosen": -1.195138931274414, "logits/rejected": -1.2933731079101562, "logps/chosen": -47.062171936035156, "logps/rejected": -64.63616943359375, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -0.8991705179214478, "rewards/margins": 4.7366509437561035, "rewards/rejected": -5.635821342468262, "step": 882 }, { "epoch": 5.232592592592592, "grad_norm": 7.636546778330124, "learning_rate": 1.5892396622496905e-07, "logits/chosen": -1.0840702056884766, "logits/rejected": -1.0799250602722168, "logps/chosen": -64.66129302978516, "logps/rejected": -89.11322021484375, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -1.2371934652328491, "rewards/margins": 6.809388637542725, "rewards/rejected": -8.046582221984863, "step": 883 }, { "epoch": 5.238518518518519, "grad_norm": 4.897765269920654, "learning_rate": 1.5831929041323023e-07, "logits/chosen": -1.311830997467041, "logits/rejected": -1.3064095973968506, "logps/chosen": -60.29758834838867, "logps/rejected": -73.54698181152344, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": -0.9659146070480347, "rewards/margins": 5.142203330993652, "rewards/rejected": -6.108118057250977, "step": 884 }, { "epoch": 5.2444444444444445, "grad_norm": 5.19071852044687, "learning_rate": 1.5771523365026175e-07, "logits/chosen": -0.986069917678833, "logits/rejected": -1.048060655593872, "logps/chosen": -36.13816452026367, "logps/rejected": -66.58907318115234, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": -0.23351190984249115, "rewards/margins": 4.650994777679443, "rewards/rejected": -4.8845062255859375, "step": 885 }, { "epoch": 5.25037037037037, "grad_norm": 5.460247088889271, "learning_rate": 1.5711180001479068e-07, "logits/chosen": -1.0913227796554565, "logits/rejected": -1.1225485801696777, "logps/chosen": -36.94963073730469, "logps/rejected": -59.24100112915039, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -0.1481831967830658, "rewards/margins": 3.815484046936035, "rewards/rejected": -3.963667392730713, "step": 886 }, { "epoch": 5.256296296296297, "grad_norm": 4.980602595420538, "learning_rate": 1.5650899358133667e-07, "logits/chosen": -1.2710387706756592, "logits/rejected": -1.3214951753616333, "logps/chosen": -56.805091857910156, "logps/rejected": -69.06848907470703, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": -0.6180689930915833, "rewards/margins": 4.681292533874512, "rewards/rejected": -5.2993621826171875, "step": 887 }, { "epoch": 5.262222222222222, "grad_norm": 5.992859843422505, "learning_rate": 1.5590681842018443e-07, "logits/chosen": -1.303534984588623, "logits/rejected": -1.3712739944458008, "logps/chosen": -66.29510498046875, "logps/rejected": -75.78311157226562, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": -0.9916107654571533, "rewards/margins": 5.067993640899658, "rewards/rejected": -6.059604644775391, "step": 888 }, { "epoch": 5.268148148148148, "grad_norm": 4.327858487400165, "learning_rate": 1.5530527859735599e-07, "logits/chosen": -1.2532622814178467, "logits/rejected": -1.4161229133605957, "logps/chosen": -47.307518005371094, "logps/rejected": -74.99484252929688, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": -0.8368775844573975, "rewards/margins": 6.0908203125, "rewards/rejected": -6.92769718170166, "step": 889 }, { "epoch": 5.274074074074074, "grad_norm": 5.2410704858507104, "learning_rate": 1.5470437817458355e-07, "logits/chosen": -1.3019599914550781, "logits/rejected": -1.407655954360962, "logps/chosen": -48.766719818115234, "logps/rejected": -78.3816909790039, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": -0.5846667289733887, "rewards/margins": 4.5074005126953125, "rewards/rejected": -5.092066764831543, "step": 890 }, { "epoch": 5.28, "grad_norm": 6.530618142623513, "learning_rate": 1.5410412120928186e-07, "logits/chosen": -1.243187665939331, "logits/rejected": -1.2798230648040771, "logps/chosen": -62.167999267578125, "logps/rejected": -79.63477325439453, "loss": 0.0622, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1904207468032837, "rewards/margins": 5.07986307144165, "rewards/rejected": -6.2702836990356445, "step": 891 }, { "epoch": 5.285925925925926, "grad_norm": 4.48460068251217, "learning_rate": 1.53504511754521e-07, "logits/chosen": -1.139565110206604, "logits/rejected": -1.1655480861663818, "logps/chosen": -53.463279724121094, "logps/rejected": -77.63116455078125, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -0.6475785970687866, "rewards/margins": 5.736711025238037, "rewards/rejected": -6.384289741516113, "step": 892 }, { "epoch": 5.291851851851852, "grad_norm": 6.637625995278517, "learning_rate": 1.5290555385899877e-07, "logits/chosen": -1.0791090726852417, "logits/rejected": -1.119141936302185, "logps/chosen": -52.37824249267578, "logps/rejected": -72.08952331542969, "loss": 0.0699, "rewards/accuracies": 1.0, "rewards/chosen": -1.1510322093963623, "rewards/margins": 4.51447057723999, "rewards/rejected": -5.66550350189209, "step": 893 }, { "epoch": 5.297777777777778, "grad_norm": 4.306211675643785, "learning_rate": 1.5230725156701373e-07, "logits/chosen": -1.258397102355957, "logits/rejected": -1.2849924564361572, "logps/chosen": -54.06769561767578, "logps/rejected": -77.40863037109375, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": -0.4762367904186249, "rewards/margins": 5.966217041015625, "rewards/rejected": -6.4424543380737305, "step": 894 }, { "epoch": 5.303703703703704, "grad_norm": 4.847603431390897, "learning_rate": 1.517096089184375e-07, "logits/chosen": -1.2337533235549927, "logits/rejected": -1.267514705657959, "logps/chosen": -65.87496948242188, "logps/rejected": -65.5901870727539, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -0.30669963359832764, "rewards/margins": 4.703336715698242, "rewards/rejected": -5.010036468505859, "step": 895 }, { "epoch": 5.3096296296296295, "grad_norm": 5.370548257891294, "learning_rate": 1.5111262994868756e-07, "logits/chosen": -1.127760648727417, "logits/rejected": -1.247991919517517, "logps/chosen": -47.134883880615234, "logps/rejected": -64.86996459960938, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 0.2545168399810791, "rewards/margins": 4.356789588928223, "rewards/rejected": -4.1022725105285645, "step": 896 }, { "epoch": 5.315555555555555, "grad_norm": 5.683501867833103, "learning_rate": 1.5051631868870019e-07, "logits/chosen": -1.057781457901001, "logits/rejected": -1.1627904176712036, "logps/chosen": -47.290008544921875, "logps/rejected": -76.31295776367188, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1723828315734863, "rewards/margins": 5.7342023849487305, "rewards/rejected": -6.906585693359375, "step": 897 }, { "epoch": 5.321481481481482, "grad_norm": 4.515876065020696, "learning_rate": 1.499206791649032e-07, "logits/chosen": -1.2790859937667847, "logits/rejected": -1.1949933767318726, "logps/chosen": -51.798484802246094, "logps/rejected": -69.213134765625, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -0.10272088646888733, "rewards/margins": 6.101094722747803, "rewards/rejected": -6.203815937042236, "step": 898 }, { "epoch": 5.327407407407407, "grad_norm": 7.152221030044184, "learning_rate": 1.4932571539918854e-07, "logits/chosen": -1.2066757678985596, "logits/rejected": -1.2475630044937134, "logps/chosen": -59.380523681640625, "logps/rejected": -74.11068725585938, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -0.9633429050445557, "rewards/margins": 4.7660136222839355, "rewards/rejected": -5.72935676574707, "step": 899 }, { "epoch": 5.333333333333333, "grad_norm": 3.7672921680241673, "learning_rate": 1.4873143140888537e-07, "logits/chosen": -1.1228379011154175, "logits/rejected": -1.1561514139175415, "logps/chosen": -54.204586029052734, "logps/rejected": -87.96161651611328, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -1.6686967611312866, "rewards/margins": 6.6950249671936035, "rewards/rejected": -8.36372184753418, "step": 900 }, { "epoch": 5.33925925925926, "grad_norm": 6.914564087559101, "learning_rate": 1.481378312067329e-07, "logits/chosen": -1.0900871753692627, "logits/rejected": -1.0513169765472412, "logps/chosen": -55.293128967285156, "logps/rejected": -67.97708892822266, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": -0.9693081974983215, "rewards/margins": 5.633920669555664, "rewards/rejected": -6.60322904586792, "step": 901 }, { "epoch": 5.345185185185185, "grad_norm": 4.180855150429028, "learning_rate": 1.4754491880085317e-07, "logits/chosen": -1.1942942142486572, "logits/rejected": -1.267154574394226, "logps/chosen": -49.460086822509766, "logps/rejected": -82.92127990722656, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -1.0170201063156128, "rewards/margins": 5.091777324676514, "rewards/rejected": -6.108797073364258, "step": 902 }, { "epoch": 5.351111111111111, "grad_norm": 5.9840055586095415, "learning_rate": 1.4695269819472403e-07, "logits/chosen": -1.1321901082992554, "logits/rejected": -1.2262929677963257, "logps/chosen": -64.43628692626953, "logps/rejected": -62.02772521972656, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": -0.8415505290031433, "rewards/margins": 4.008552551269531, "rewards/rejected": -4.850103378295898, "step": 903 }, { "epoch": 5.357037037037037, "grad_norm": 5.661485291376627, "learning_rate": 1.463611733871523e-07, "logits/chosen": -1.2824634313583374, "logits/rejected": -1.416248083114624, "logps/chosen": -53.39885711669922, "logps/rejected": -84.84522247314453, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": -0.37042808532714844, "rewards/margins": 4.654745101928711, "rewards/rejected": -5.025173664093018, "step": 904 }, { "epoch": 5.362962962962963, "grad_norm": 5.2077909388446875, "learning_rate": 1.457703483722466e-07, "logits/chosen": -1.1724047660827637, "logits/rejected": -1.262000560760498, "logps/chosen": -46.26305389404297, "logps/rejected": -69.90074920654297, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 0.14402145147323608, "rewards/margins": 5.816695213317871, "rewards/rejected": -5.672673225402832, "step": 905 }, { "epoch": 5.368888888888889, "grad_norm": 4.801632413673207, "learning_rate": 1.4518022713938998e-07, "logits/chosen": -1.118366003036499, "logits/rejected": -1.2085130214691162, "logps/chosen": -48.222755432128906, "logps/rejected": -72.53038024902344, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": -0.36291417479515076, "rewards/margins": 4.7783098220825195, "rewards/rejected": -5.141223430633545, "step": 906 }, { "epoch": 5.3748148148148145, "grad_norm": 5.6157264725803975, "learning_rate": 1.4459081367321407e-07, "logits/chosen": -1.1638175249099731, "logits/rejected": -1.247123122215271, "logps/chosen": -44.200462341308594, "logps/rejected": -58.13221740722656, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -1.0433942079544067, "rewards/margins": 4.555025100708008, "rewards/rejected": -5.598419189453125, "step": 907 }, { "epoch": 5.380740740740741, "grad_norm": 6.531962348628216, "learning_rate": 1.4400211195357103e-07, "logits/chosen": -1.04237961769104, "logits/rejected": -1.0963478088378906, "logps/chosen": -61.242576599121094, "logps/rejected": -68.6133804321289, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": -1.6439560651779175, "rewards/margins": 5.466358184814453, "rewards/rejected": -7.11031436920166, "step": 908 }, { "epoch": 5.386666666666667, "grad_norm": 5.540692876499104, "learning_rate": 1.4341412595550724e-07, "logits/chosen": -1.211525321006775, "logits/rejected": -1.202690839767456, "logps/chosen": -40.24979782104492, "logps/rejected": -71.52658081054688, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": -0.8724290728569031, "rewards/margins": 6.074644088745117, "rewards/rejected": -6.947073459625244, "step": 909 }, { "epoch": 5.392592592592592, "grad_norm": 5.924738983593196, "learning_rate": 1.428268596492364e-07, "logits/chosen": -1.0326358079910278, "logits/rejected": -1.1596226692199707, "logps/chosen": -40.392730712890625, "logps/rejected": -66.74359130859375, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -0.2823147177696228, "rewards/margins": 5.335864543914795, "rewards/rejected": -5.6181793212890625, "step": 910 }, { "epoch": 5.398518518518518, "grad_norm": 5.614965106603697, "learning_rate": 1.4224031700011286e-07, "logits/chosen": -1.1185742616653442, "logits/rejected": -1.1819273233413696, "logps/chosen": -45.5130500793457, "logps/rejected": -71.71981048583984, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -0.864221453666687, "rewards/margins": 6.3592610359191895, "rewards/rejected": -7.223482131958008, "step": 911 }, { "epoch": 5.404444444444445, "grad_norm": 6.016423389484806, "learning_rate": 1.416545019686042e-07, "logits/chosen": -1.144727110862732, "logits/rejected": -1.3016068935394287, "logps/chosen": -50.52497482299805, "logps/rejected": -76.85518646240234, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": -1.2942097187042236, "rewards/margins": 5.873856544494629, "rewards/rejected": -7.168066024780273, "step": 912 }, { "epoch": 5.41037037037037, "grad_norm": 4.326351374076242, "learning_rate": 1.4106941851026544e-07, "logits/chosen": -1.0008817911148071, "logits/rejected": -1.0063378810882568, "logps/chosen": -52.42710876464844, "logps/rejected": -80.53175354003906, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -1.7073206901550293, "rewards/margins": 6.483597278594971, "rewards/rejected": -8.19091796875, "step": 913 }, { "epoch": 5.416296296296296, "grad_norm": 6.052332153532867, "learning_rate": 1.4048507057571164e-07, "logits/chosen": -1.036699891090393, "logits/rejected": -0.999764084815979, "logps/chosen": -53.56409454345703, "logps/rejected": -65.7117919921875, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": -0.8550149202346802, "rewards/margins": 5.08674430847168, "rewards/rejected": -5.9417595863342285, "step": 914 }, { "epoch": 5.4222222222222225, "grad_norm": 5.180494758421336, "learning_rate": 1.3990146211059139e-07, "logits/chosen": -1.1877224445343018, "logits/rejected": -1.303817868232727, "logps/chosen": -53.484046936035156, "logps/rejected": -61.8600959777832, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": -0.9356443881988525, "rewards/margins": 4.698094367980957, "rewards/rejected": -5.633738994598389, "step": 915 }, { "epoch": 5.428148148148148, "grad_norm": 5.380172307220296, "learning_rate": 1.3931859705556052e-07, "logits/chosen": -1.2037510871887207, "logits/rejected": -1.2069494724273682, "logps/chosen": -43.905364990234375, "logps/rejected": -61.135196685791016, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": 0.2278020977973938, "rewards/margins": 4.547643661499023, "rewards/rejected": -4.3198418617248535, "step": 916 }, { "epoch": 5.434074074074074, "grad_norm": 5.229207365948886, "learning_rate": 1.387364793462548e-07, "logits/chosen": -1.1884421110153198, "logits/rejected": -1.128300428390503, "logps/chosen": -53.02272415161133, "logps/rejected": -80.40774536132812, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": -0.2592935562133789, "rewards/margins": 4.70393705368042, "rewards/rejected": -4.963230609893799, "step": 917 }, { "epoch": 5.44, "grad_norm": 5.447576108491131, "learning_rate": 1.38155112913264e-07, "logits/chosen": -1.1345170736312866, "logits/rejected": -1.139125108718872, "logps/chosen": -46.506019592285156, "logps/rejected": -63.89093017578125, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": -0.2819477915763855, "rewards/margins": 4.921759128570557, "rewards/rejected": -5.203706741333008, "step": 918 }, { "epoch": 5.445925925925926, "grad_norm": 6.124448141336304, "learning_rate": 1.37574501682105e-07, "logits/chosen": -1.2564316987991333, "logits/rejected": -1.2532209157943726, "logps/chosen": -52.18721008300781, "logps/rejected": -81.69747924804688, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": -1.3770983219146729, "rewards/margins": 6.158283233642578, "rewards/rejected": -7.535381317138672, "step": 919 }, { "epoch": 5.451851851851852, "grad_norm": 7.410616675601531, "learning_rate": 1.369946495731954e-07, "logits/chosen": -1.2974275350570679, "logits/rejected": -1.3738926649093628, "logps/chosen": -54.73295974731445, "logps/rejected": -65.42332458496094, "loss": 0.0601, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8449416756629944, "rewards/margins": 4.6002397537231445, "rewards/rejected": -5.445181369781494, "step": 920 }, { "epoch": 5.457777777777777, "grad_norm": 6.234144569472265, "learning_rate": 1.3641556050182707e-07, "logits/chosen": -1.0465418100357056, "logits/rejected": -1.1629109382629395, "logps/chosen": -39.9510498046875, "logps/rejected": -63.465293884277344, "loss": 0.0668, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08770129829645157, "rewards/margins": 4.9500956535339355, "rewards/rejected": -5.037796974182129, "step": 921 }, { "epoch": 5.463703703703704, "grad_norm": 6.996076426450947, "learning_rate": 1.3583723837813964e-07, "logits/chosen": -1.2552739381790161, "logits/rejected": -1.2882190942764282, "logps/chosen": -56.15148162841797, "logps/rejected": -80.41363525390625, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": -1.3065168857574463, "rewards/margins": 6.144155502319336, "rewards/rejected": -7.450672626495361, "step": 922 }, { "epoch": 5.46962962962963, "grad_norm": 4.924640654272727, "learning_rate": 1.3525968710709415e-07, "logits/chosen": -1.302824854850769, "logits/rejected": -1.3598288297653198, "logps/chosen": -48.679534912109375, "logps/rejected": -71.34368896484375, "loss": 0.0382, "rewards/accuracies": 1.0, "rewards/chosen": -0.7547460794448853, "rewards/margins": 5.851616859436035, "rewards/rejected": -6.606363296508789, "step": 923 }, { "epoch": 5.475555555555555, "grad_norm": 5.2437105146313385, "learning_rate": 1.346829105884467e-07, "logits/chosen": -1.0821112394332886, "logits/rejected": -1.092231273651123, "logps/chosen": -45.19500732421875, "logps/rejected": -68.84188079833984, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -0.9143983125686646, "rewards/margins": 5.41214656829834, "rewards/rejected": -6.326544761657715, "step": 924 }, { "epoch": 5.481481481481482, "grad_norm": 5.688009304833448, "learning_rate": 1.3410691271672206e-07, "logits/chosen": -1.2743799686431885, "logits/rejected": -1.279219627380371, "logps/chosen": -46.0670051574707, "logps/rejected": -64.08885192871094, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -0.5631405115127563, "rewards/margins": 5.1333723068237305, "rewards/rejected": -5.6965131759643555, "step": 925 }, { "epoch": 5.4874074074074075, "grad_norm": 8.374452505500786, "learning_rate": 1.335316973811874e-07, "logits/chosen": -1.0590918064117432, "logits/rejected": -1.244085431098938, "logps/chosen": -42.937896728515625, "logps/rejected": -62.34235763549805, "loss": 0.0643, "rewards/accuracies": 0.9375, "rewards/chosen": -0.014899902045726776, "rewards/margins": 4.596308708190918, "rewards/rejected": -4.611208438873291, "step": 926 }, { "epoch": 5.493333333333333, "grad_norm": 4.931540791985558, "learning_rate": 1.32957268465826e-07, "logits/chosen": -1.1672147512435913, "logits/rejected": -1.2030147314071655, "logps/chosen": -51.3866081237793, "logps/rejected": -70.0325927734375, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -1.1257150173187256, "rewards/margins": 5.762474060058594, "rewards/rejected": -6.88818883895874, "step": 927 }, { "epoch": 5.499259259259259, "grad_norm": 3.7190108460368387, "learning_rate": 1.3238362984931113e-07, "logits/chosen": -1.1120619773864746, "logits/rejected": -1.2704691886901855, "logps/chosen": -40.81899642944336, "logps/rejected": -76.57392883300781, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -1.1256623268127441, "rewards/margins": 6.532195568084717, "rewards/rejected": -7.657857894897461, "step": 928 }, { "epoch": 5.505185185185185, "grad_norm": 4.451300821349891, "learning_rate": 1.318107854049797e-07, "logits/chosen": -1.044045329093933, "logits/rejected": -1.211983323097229, "logps/chosen": -43.968040466308594, "logps/rejected": -58.34918212890625, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -0.7015755772590637, "rewards/margins": 5.835867881774902, "rewards/rejected": -6.537443161010742, "step": 929 }, { "epoch": 5.511111111111111, "grad_norm": 6.226765209722212, "learning_rate": 1.3123873900080628e-07, "logits/chosen": -1.0990855693817139, "logits/rejected": -1.1984244585037231, "logps/chosen": -44.29381561279297, "logps/rejected": -56.79447555541992, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": -0.9139587879180908, "rewards/margins": 4.626317977905273, "rewards/rejected": -5.540277004241943, "step": 930 }, { "epoch": 5.517037037037037, "grad_norm": 5.35157163971713, "learning_rate": 1.306674944993768e-07, "logits/chosen": -1.1949620246887207, "logits/rejected": -1.2123682498931885, "logps/chosen": -47.544219970703125, "logps/rejected": -73.42807006835938, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": -0.5051659941673279, "rewards/margins": 6.832710266113281, "rewards/rejected": -7.337876796722412, "step": 931 }, { "epoch": 5.522962962962963, "grad_norm": 4.7362735283631165, "learning_rate": 1.3009705575786268e-07, "logits/chosen": -1.134218692779541, "logits/rejected": -1.2343950271606445, "logps/chosen": -43.53325653076172, "logps/rejected": -59.55531311035156, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -0.904568612575531, "rewards/margins": 4.297881603240967, "rewards/rejected": -5.202449798583984, "step": 932 }, { "epoch": 5.528888888888889, "grad_norm": 4.89046550004791, "learning_rate": 1.295274266279945e-07, "logits/chosen": -1.239561676979065, "logits/rejected": -1.3123736381530762, "logps/chosen": -54.36166000366211, "logps/rejected": -72.64697265625, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": -0.6119088530540466, "rewards/margins": 6.145787715911865, "rewards/rejected": -6.757696628570557, "step": 933 }, { "epoch": 5.534814814814815, "grad_norm": 4.619990490602159, "learning_rate": 1.2895861095603632e-07, "logits/chosen": -1.1459276676177979, "logits/rejected": -1.1653519868850708, "logps/chosen": -35.19683074951172, "logps/rejected": -53.685089111328125, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 0.3016229569911957, "rewards/margins": 4.380598545074463, "rewards/rejected": -4.078975677490234, "step": 934 }, { "epoch": 5.540740740740741, "grad_norm": 4.993312240195513, "learning_rate": 1.2839061258275946e-07, "logits/chosen": -1.1333314180374146, "logits/rejected": -1.2495033740997314, "logps/chosen": -41.614891052246094, "logps/rejected": -62.70551300048828, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -0.1989545226097107, "rewards/margins": 5.417413711547852, "rewards/rejected": -5.616368293762207, "step": 935 }, { "epoch": 5.546666666666667, "grad_norm": 4.271588493947499, "learning_rate": 1.2782343534341665e-07, "logits/chosen": -1.2903553247451782, "logits/rejected": -1.2682414054870605, "logps/chosen": -50.89227294921875, "logps/rejected": -61.375526428222656, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": -0.728342592716217, "rewards/margins": 4.288450241088867, "rewards/rejected": -5.0167927742004395, "step": 936 }, { "epoch": 5.5525925925925925, "grad_norm": 7.292093111520975, "learning_rate": 1.2725708306771618e-07, "logits/chosen": -1.1426489353179932, "logits/rejected": -1.0802648067474365, "logps/chosen": -47.69855880737305, "logps/rejected": -62.599212646484375, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": -0.5614185333251953, "rewards/margins": 4.699395179748535, "rewards/rejected": -5.2608137130737305, "step": 937 }, { "epoch": 5.558518518518518, "grad_norm": 6.908638237299102, "learning_rate": 1.266915595797961e-07, "logits/chosen": -1.126725196838379, "logits/rejected": -1.1588492393493652, "logps/chosen": -43.83253479003906, "logps/rejected": -71.14010620117188, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -0.5733367800712585, "rewards/margins": 5.2426252365112305, "rewards/rejected": -5.815961837768555, "step": 938 }, { "epoch": 5.564444444444445, "grad_norm": 4.629799808254051, "learning_rate": 1.2612686869819817e-07, "logits/chosen": -1.2065681219100952, "logits/rejected": -1.3164688348770142, "logps/chosen": -38.721435546875, "logps/rejected": -68.58113098144531, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -0.1604537069797516, "rewards/margins": 6.0843706130981445, "rewards/rejected": -6.244824409484863, "step": 939 }, { "epoch": 5.57037037037037, "grad_norm": 4.522577258438449, "learning_rate": 1.2556301423584208e-07, "logits/chosen": -1.2485311031341553, "logits/rejected": -1.2570372819900513, "logps/chosen": -54.931739807128906, "logps/rejected": -79.12727355957031, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": -0.7012864351272583, "rewards/margins": 4.642655849456787, "rewards/rejected": -5.343942642211914, "step": 940 }, { "epoch": 5.576296296296296, "grad_norm": 4.813395455443031, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -1.2301363945007324, "logits/rejected": -1.2636702060699463, "logps/chosen": -44.12531280517578, "logps/rejected": -66.81246948242188, "loss": 0.0369, "rewards/accuracies": 1.0, "rewards/chosen": -0.8484854698181152, "rewards/margins": 4.948280334472656, "rewards/rejected": -5.7967658042907715, "step": 941 }, { "epoch": 5.582222222222223, "grad_norm": 5.798154822473784, "learning_rate": 1.2443782979227082e-07, "logits/chosen": -1.2446839809417725, "logits/rejected": -1.344369888305664, "logps/chosen": -39.720298767089844, "logps/rejected": -61.465423583984375, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": -0.929324746131897, "rewards/margins": 5.307143211364746, "rewards/rejected": -6.2364678382873535, "step": 942 }, { "epoch": 5.588148148148148, "grad_norm": 3.2413652931315067, "learning_rate": 1.2387650740855406e-07, "logits/chosen": -1.2844980955123901, "logits/rejected": -1.3366330862045288, "logps/chosen": -47.887725830078125, "logps/rejected": -60.49043273925781, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -0.8668982982635498, "rewards/margins": 4.607653617858887, "rewards/rejected": -5.474552154541016, "step": 943 }, { "epoch": 5.594074074074074, "grad_norm": 4.6399365768250425, "learning_rate": 1.2331603663902475e-07, "logits/chosen": -1.1947689056396484, "logits/rejected": -1.1636683940887451, "logps/chosen": -58.632686614990234, "logps/rejected": -75.03528594970703, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -0.8372504711151123, "rewards/margins": 6.475705146789551, "rewards/rejected": -7.312955379486084, "step": 944 }, { "epoch": 5.6, "grad_norm": 5.71549500582046, "learning_rate": 1.2275642126810762e-07, "logits/chosen": -1.1459275484085083, "logits/rejected": -1.1692252159118652, "logps/chosen": -46.13181686401367, "logps/rejected": -55.8668212890625, "loss": 0.0577, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7316948771476746, "rewards/margins": 3.8514623641967773, "rewards/rejected": -4.583157539367676, "step": 945 }, { "epoch": 5.605925925925926, "grad_norm": 8.27586438727282, "learning_rate": 1.2219766507445144e-07, "logits/chosen": -1.1214922666549683, "logits/rejected": -1.1562455892562866, "logps/chosen": -47.02653884887695, "logps/rejected": -87.65753173828125, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": -0.2972184419631958, "rewards/margins": 7.0828657150268555, "rewards/rejected": -7.38008451461792, "step": 946 }, { "epoch": 5.611851851851852, "grad_norm": 6.271959445561895, "learning_rate": 1.2163977183090368e-07, "logits/chosen": -1.10219144821167, "logits/rejected": -1.108323097229004, "logps/chosen": -49.387237548828125, "logps/rejected": -77.21356201171875, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": -0.7970916628837585, "rewards/margins": 6.1644792556762695, "rewards/rejected": -6.961570739746094, "step": 947 }, { "epoch": 5.6177777777777775, "grad_norm": 7.1120046006269035, "learning_rate": 1.210827453044851e-07, "logits/chosen": -1.2681684494018555, "logits/rejected": -1.2564421892166138, "logps/chosen": -50.25490951538086, "logps/rejected": -75.11100769042969, "loss": 0.0784, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9637770056724548, "rewards/margins": 4.389342308044434, "rewards/rejected": -5.353118896484375, "step": 948 }, { "epoch": 5.623703703703704, "grad_norm": 5.399880404079545, "learning_rate": 1.2052658925636405e-07, "logits/chosen": -1.181290864944458, "logits/rejected": -1.2599127292633057, "logps/chosen": -40.77113342285156, "logps/rejected": -52.629150390625, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -1.0298051834106445, "rewards/margins": 3.4283018112182617, "rewards/rejected": -4.458106994628906, "step": 949 }, { "epoch": 5.62962962962963, "grad_norm": 6.411458192010983, "learning_rate": 1.1997130744183124e-07, "logits/chosen": -1.130419373512268, "logits/rejected": -1.2022407054901123, "logps/chosen": -71.73152160644531, "logps/rejected": -93.64906311035156, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": -1.6727749109268188, "rewards/margins": 6.842751502990723, "rewards/rejected": -8.515525817871094, "step": 950 }, { "epoch": 5.635555555555555, "grad_norm": 5.677902192188492, "learning_rate": 1.194169036102743e-07, "logits/chosen": -1.116295576095581, "logits/rejected": -1.1786460876464844, "logps/chosen": -44.79496765136719, "logps/rejected": -78.12545776367188, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": -0.8381794691085815, "rewards/margins": 6.8596415519714355, "rewards/rejected": -7.697821140289307, "step": 951 }, { "epoch": 5.641481481481481, "grad_norm": 5.757208376877495, "learning_rate": 1.1886338150515268e-07, "logits/chosen": -1.2456589937210083, "logits/rejected": -1.339970350265503, "logps/chosen": -54.900333404541016, "logps/rejected": -90.55741882324219, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": -1.918721318244934, "rewards/margins": 8.1624116897583, "rewards/rejected": -10.081131935119629, "step": 952 }, { "epoch": 5.647407407407408, "grad_norm": 4.89301134351496, "learning_rate": 1.1831074486397217e-07, "logits/chosen": -1.0892047882080078, "logits/rejected": -1.1265947818756104, "logps/chosen": -50.045753479003906, "logps/rejected": -75.69743347167969, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": -0.5692238211631775, "rewards/margins": 6.327653884887695, "rewards/rejected": -6.896877765655518, "step": 953 }, { "epoch": 5.653333333333333, "grad_norm": 6.301689678260451, "learning_rate": 1.1775899741825945e-07, "logits/chosen": -1.0384539365768433, "logits/rejected": -1.1424777507781982, "logps/chosen": -52.425750732421875, "logps/rejected": -87.0918960571289, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": -1.1157212257385254, "rewards/margins": 6.278460502624512, "rewards/rejected": -7.394181251525879, "step": 954 }, { "epoch": 5.659259259259259, "grad_norm": 7.4706943420437115, "learning_rate": 1.172081428935375e-07, "logits/chosen": -0.9616471529006958, "logits/rejected": -1.08530592918396, "logps/chosen": -49.213619232177734, "logps/rejected": -63.890655517578125, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": -1.0542219877243042, "rewards/margins": 5.608872413635254, "rewards/rejected": -6.663094520568848, "step": 955 }, { "epoch": 5.6651851851851855, "grad_norm": 7.815420541527748, "learning_rate": 1.1665818500929986e-07, "logits/chosen": -1.186784029006958, "logits/rejected": -1.2369247674942017, "logps/chosen": -62.02526092529297, "logps/rejected": -81.75930786132812, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": 0.15900936722755432, "rewards/margins": 4.918282508850098, "rewards/rejected": -4.759273529052734, "step": 956 }, { "epoch": 5.671111111111111, "grad_norm": 7.116948617235036, "learning_rate": 1.1610912747898605e-07, "logits/chosen": -1.1347780227661133, "logits/rejected": -1.2619976997375488, "logps/chosen": -49.45901870727539, "logps/rejected": -68.4981689453125, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -0.9228365421295166, "rewards/margins": 4.178586006164551, "rewards/rejected": -5.101422309875488, "step": 957 }, { "epoch": 5.677037037037037, "grad_norm": 6.492425589496003, "learning_rate": 1.1556097400995585e-07, "logits/chosen": -1.2856743335723877, "logits/rejected": -1.3788843154907227, "logps/chosen": -68.00703430175781, "logps/rejected": -79.57150268554688, "loss": 0.0558, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8615601062774658, "rewards/margins": 5.724004745483398, "rewards/rejected": -6.585564613342285, "step": 958 }, { "epoch": 5.6829629629629625, "grad_norm": 5.135996060670731, "learning_rate": 1.1501372830346482e-07, "logits/chosen": -1.0250344276428223, "logits/rejected": -1.0303955078125, "logps/chosen": -40.95635986328125, "logps/rejected": -55.43898010253906, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 0.6365801095962524, "rewards/margins": 4.479146957397461, "rewards/rejected": -3.842566967010498, "step": 959 }, { "epoch": 5.688888888888889, "grad_norm": 5.129604924243458, "learning_rate": 1.1446739405463899e-07, "logits/chosen": -1.0779722929000854, "logits/rejected": -1.1054648160934448, "logps/chosen": -35.89310073852539, "logps/rejected": -54.29434585571289, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -0.5512725114822388, "rewards/margins": 4.163705348968506, "rewards/rejected": -4.714978218078613, "step": 960 }, { "epoch": 5.694814814814815, "grad_norm": 4.8560437167278625, "learning_rate": 1.1392197495245015e-07, "logits/chosen": -1.155285120010376, "logits/rejected": -1.2099722623825073, "logps/chosen": -42.74791717529297, "logps/rejected": -57.803279876708984, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -0.29253920912742615, "rewards/margins": 3.9843411445617676, "rewards/rejected": -4.276880264282227, "step": 961 }, { "epoch": 5.70074074074074, "grad_norm": 5.7930938450559415, "learning_rate": 1.1337747467969069e-07, "logits/chosen": -1.085658311843872, "logits/rejected": -1.0948498249053955, "logps/chosen": -49.48651123046875, "logps/rejected": -76.69400024414062, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": -1.2173595428466797, "rewards/margins": 6.082059860229492, "rewards/rejected": -7.299419403076172, "step": 962 }, { "epoch": 5.706666666666667, "grad_norm": 5.30493601375962, "learning_rate": 1.1283389691294893e-07, "logits/chosen": -1.1731464862823486, "logits/rejected": -1.141379475593567, "logps/chosen": -64.22185516357422, "logps/rejected": -83.61888122558594, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -0.7661117315292358, "rewards/margins": 5.380878448486328, "rewards/rejected": -6.1469902992248535, "step": 963 }, { "epoch": 5.712592592592593, "grad_norm": 6.403497269781013, "learning_rate": 1.1229124532258421e-07, "logits/chosen": -1.0684443712234497, "logits/rejected": -1.0762279033660889, "logps/chosen": -58.415157318115234, "logps/rejected": -75.17643737792969, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": -1.5389410257339478, "rewards/margins": 5.100183486938477, "rewards/rejected": -6.639124870300293, "step": 964 }, { "epoch": 5.718518518518518, "grad_norm": 4.039319682323648, "learning_rate": 1.1174952357270212e-07, "logits/chosen": -1.3112658262252808, "logits/rejected": -1.2177374362945557, "logps/chosen": -52.77716827392578, "logps/rejected": -66.23959350585938, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": -0.6938422918319702, "rewards/margins": 4.566688537597656, "rewards/rejected": -5.260530471801758, "step": 965 }, { "epoch": 5.724444444444444, "grad_norm": 6.537363002137338, "learning_rate": 1.112087353211297e-07, "logits/chosen": -1.2479323148727417, "logits/rejected": -1.34829843044281, "logps/chosen": -44.44136047363281, "logps/rejected": -64.08821105957031, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": -0.6463432312011719, "rewards/margins": 4.6674041748046875, "rewards/rejected": -5.313746929168701, "step": 966 }, { "epoch": 5.730370370370371, "grad_norm": 7.6812486979953984, "learning_rate": 1.1066888421939092e-07, "logits/chosen": -1.1662806272506714, "logits/rejected": -1.200300931930542, "logps/chosen": -54.65203857421875, "logps/rejected": -85.63590240478516, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 0.2911415100097656, "rewards/margins": 6.0351667404174805, "rewards/rejected": -5.744025230407715, "step": 967 }, { "epoch": 5.736296296296296, "grad_norm": 6.1369264649099415, "learning_rate": 1.1012997391268177e-07, "logits/chosen": -1.1206430196762085, "logits/rejected": -1.1232975721359253, "logps/chosen": -44.27980041503906, "logps/rejected": -59.34086608886719, "loss": 0.0601, "rewards/accuracies": 1.0, "rewards/chosen": -0.4602680504322052, "rewards/margins": 4.129044055938721, "rewards/rejected": -4.5893120765686035, "step": 968 }, { "epoch": 5.742222222222222, "grad_norm": 6.212271022570864, "learning_rate": 1.095920080398459e-07, "logits/chosen": -1.1898860931396484, "logits/rejected": -1.193435549736023, "logps/chosen": -50.994625091552734, "logps/rejected": -70.28504943847656, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -1.0652556419372559, "rewards/margins": 5.318528175354004, "rewards/rejected": -6.383784294128418, "step": 969 }, { "epoch": 5.7481481481481485, "grad_norm": 5.897763958041394, "learning_rate": 1.0905499023334979e-07, "logits/chosen": -1.2726876735687256, "logits/rejected": -1.3368113040924072, "logps/chosen": -55.83734130859375, "logps/rejected": -68.83439636230469, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": -0.4541219472885132, "rewards/margins": 6.1417083740234375, "rewards/rejected": -6.595829963684082, "step": 970 }, { "epoch": 5.754074074074074, "grad_norm": 5.570072178828081, "learning_rate": 1.0851892411925856e-07, "logits/chosen": -1.1539332866668701, "logits/rejected": -1.237091064453125, "logps/chosen": -55.219852447509766, "logps/rejected": -73.31748962402344, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -0.5192880630493164, "rewards/margins": 4.689527988433838, "rewards/rejected": -5.208816051483154, "step": 971 }, { "epoch": 5.76, "grad_norm": 5.439743506962176, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -1.239844799041748, "logits/rejected": -1.2221330404281616, "logps/chosen": -47.03329849243164, "logps/rejected": -74.15437316894531, "loss": 0.0454, "rewards/accuracies": 0.9375, "rewards/chosen": -0.49033457040786743, "rewards/margins": 5.557613849639893, "rewards/rejected": -6.047948837280273, "step": 972 }, { "epoch": 5.7659259259259255, "grad_norm": 7.484832245837122, "learning_rate": 1.0744966144039588e-07, "logits/chosen": -1.0188753604888916, "logits/rejected": -1.1094659566879272, "logps/chosen": -50.33880615234375, "logps/rejected": -78.59779357910156, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8958476781845093, "rewards/margins": 6.123520851135254, "rewards/rejected": -7.019368648529053, "step": 973 }, { "epoch": 5.771851851851852, "grad_norm": 8.200396955576231, "learning_rate": 1.0691647209552654e-07, "logits/chosen": -1.0882221460342407, "logits/rejected": -1.1707772016525269, "logps/chosen": -41.45745849609375, "logps/rejected": -60.8970947265625, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -0.44809460639953613, "rewards/margins": 3.9365527629852295, "rewards/rejected": -4.384647369384766, "step": 974 }, { "epoch": 5.777777777777778, "grad_norm": 4.498048165253478, "learning_rate": 1.0638424888281744e-07, "logits/chosen": -1.144453763961792, "logits/rejected": -1.2178853750228882, "logps/chosen": -55.933860778808594, "logps/rejected": -81.9764175415039, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -0.8853988647460938, "rewards/margins": 6.145270824432373, "rewards/rejected": -7.030670166015625, "step": 975 }, { "epoch": 5.783703703703703, "grad_norm": 4.5687275866561174, "learning_rate": 1.0585299539595943e-07, "logits/chosen": -1.2651833295822144, "logits/rejected": -1.1797913312911987, "logps/chosen": -62.95174026489258, "logps/rejected": -73.2163314819336, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -1.0579339265823364, "rewards/margins": 5.187211036682129, "rewards/rejected": -6.245144367218018, "step": 976 }, { "epoch": 5.78962962962963, "grad_norm": 4.757815127421331, "learning_rate": 1.0532271522209551e-07, "logits/chosen": -0.972952663898468, "logits/rejected": -1.0780919790267944, "logps/chosen": -43.577178955078125, "logps/rejected": -72.53720092773438, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": -1.3548003435134888, "rewards/margins": 5.889094829559326, "rewards/rejected": -7.243895530700684, "step": 977 }, { "epoch": 5.795555555555556, "grad_norm": 4.532083769578922, "learning_rate": 1.0479341194179658e-07, "logits/chosen": -1.2063857316970825, "logits/rejected": -1.2588369846343994, "logps/chosen": -44.20055389404297, "logps/rejected": -79.2449722290039, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.3380606472492218, "rewards/margins": 5.87565803527832, "rewards/rejected": -6.213718414306641, "step": 978 }, { "epoch": 5.801481481481481, "grad_norm": 5.179109139924423, "learning_rate": 1.0426508912903764e-07, "logits/chosen": -1.071878433227539, "logits/rejected": -1.1827893257141113, "logps/chosen": -47.58728790283203, "logps/rejected": -66.75723266601562, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -1.366632342338562, "rewards/margins": 5.255701065063477, "rewards/rejected": -6.622333526611328, "step": 979 }, { "epoch": 5.807407407407408, "grad_norm": 6.028226327831648, "learning_rate": 1.0373775035117305e-07, "logits/chosen": -1.0318422317504883, "logits/rejected": -1.0840988159179688, "logps/chosen": -38.939598083496094, "logps/rejected": -65.85391998291016, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -0.26034343242645264, "rewards/margins": 5.531552314758301, "rewards/rejected": -5.791895389556885, "step": 980 }, { "epoch": 5.8133333333333335, "grad_norm": 5.199707849563059, "learning_rate": 1.0321139916891281e-07, "logits/chosen": -1.1908127069473267, "logits/rejected": -1.2790547609329224, "logps/chosen": -53.803672790527344, "logps/rejected": -98.4791259765625, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": -1.1615757942199707, "rewards/margins": 6.412900924682617, "rewards/rejected": -7.574476718902588, "step": 981 }, { "epoch": 5.819259259259259, "grad_norm": 4.056908323841326, "learning_rate": 1.0268603913629858e-07, "logits/chosen": -1.0693111419677734, "logits/rejected": -1.2135943174362183, "logps/chosen": -43.74428939819336, "logps/rejected": -59.94146728515625, "loss": 0.0352, "rewards/accuracies": 1.0, "rewards/chosen": -0.10237783193588257, "rewards/margins": 5.640899658203125, "rewards/rejected": -5.743277549743652, "step": 982 }, { "epoch": 5.825185185185185, "grad_norm": 6.342627728372702, "learning_rate": 1.0216167380067927e-07, "logits/chosen": -1.144514560699463, "logits/rejected": -1.2700378894805908, "logps/chosen": -38.817726135253906, "logps/rejected": -63.40654754638672, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": -1.0137901306152344, "rewards/margins": 5.49804162979126, "rewards/rejected": -6.511831283569336, "step": 983 }, { "epoch": 5.831111111111111, "grad_norm": 5.951676057587321, "learning_rate": 1.0163830670268767e-07, "logits/chosen": -1.1670303344726562, "logits/rejected": -1.167630910873413, "logps/chosen": -52.73843765258789, "logps/rejected": -75.77318572998047, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": -0.005964726209640503, "rewards/margins": 5.647053241729736, "rewards/rejected": -5.653018474578857, "step": 984 }, { "epoch": 5.837037037037037, "grad_norm": 5.711969478515697, "learning_rate": 1.0111594137621613e-07, "logits/chosen": -1.143544316291809, "logits/rejected": -1.0991742610931396, "logps/chosen": -59.350929260253906, "logps/rejected": -78.52423095703125, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": -1.3191605806350708, "rewards/margins": 6.052453994750977, "rewards/rejected": -7.371614456176758, "step": 985 }, { "epoch": 5.842962962962963, "grad_norm": 9.323420493737709, "learning_rate": 1.0059458134839277e-07, "logits/chosen": -1.0608803033828735, "logits/rejected": -1.148100733757019, "logps/chosen": -49.37598419189453, "logps/rejected": -72.11686706542969, "loss": 0.0605, "rewards/accuracies": 0.9375, "rewards/chosen": -0.662987470626831, "rewards/margins": 5.027313232421875, "rewards/rejected": -5.690300464630127, "step": 986 }, { "epoch": 5.848888888888889, "grad_norm": 6.136186486936588, "learning_rate": 1.0007423013955782e-07, "logits/chosen": -1.3598471879959106, "logits/rejected": -1.4275630712509155, "logps/chosen": -47.69789123535156, "logps/rejected": -68.75079345703125, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": -0.5667755007743835, "rewards/margins": 4.388944625854492, "rewards/rejected": -4.9557204246521, "step": 987 }, { "epoch": 5.854814814814815, "grad_norm": 3.230703616513641, "learning_rate": 9.955489126323954e-08, "logits/chosen": -1.2398353815078735, "logits/rejected": -1.2649370431900024, "logps/chosen": -41.775108337402344, "logps/rejected": -64.36259460449219, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -0.4918060302734375, "rewards/margins": 6.183435440063477, "rewards/rejected": -6.6752424240112305, "step": 988 }, { "epoch": 5.860740740740741, "grad_norm": 4.91314701899232, "learning_rate": 9.903656822613099e-08, "logits/chosen": -1.2056286334991455, "logits/rejected": -1.2726476192474365, "logps/chosen": -48.42527389526367, "logps/rejected": -73.99102783203125, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -0.07021550834178925, "rewards/margins": 6.695013999938965, "rewards/rejected": -6.765229225158691, "step": 989 }, { "epoch": 5.866666666666667, "grad_norm": 4.380834180173861, "learning_rate": 9.851926452806583e-08, "logits/chosen": -1.1519975662231445, "logits/rejected": -1.2040634155273438, "logps/chosen": -53.68801498413086, "logps/rejected": -70.1108627319336, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -1.292681336402893, "rewards/margins": 4.510378360748291, "rewards/rejected": -5.803060054779053, "step": 990 }, { "epoch": 5.872592592592593, "grad_norm": 3.8066981796535857, "learning_rate": 9.800298366199497e-08, "logits/chosen": -1.1126823425292969, "logits/rejected": -1.0786277055740356, "logps/chosen": -57.135101318359375, "logps/rejected": -78.73320770263672, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -1.2120319604873657, "rewards/margins": 5.7933454513549805, "rewards/rejected": -7.005376815795898, "step": 991 }, { "epoch": 5.8785185185185185, "grad_norm": 5.380083654103537, "learning_rate": 9.748772911396291e-08, "logits/chosen": -1.1817781925201416, "logits/rejected": -1.1358152627944946, "logps/chosen": -40.186588287353516, "logps/rejected": -59.40968322753906, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -0.05524274706840515, "rewards/margins": 5.243034362792969, "rewards/rejected": -5.298277378082275, "step": 992 }, { "epoch": 5.884444444444444, "grad_norm": 3.9806748355024304, "learning_rate": 9.697350436308427e-08, "logits/chosen": -1.411184310913086, "logits/rejected": -1.4651869535446167, "logps/chosen": -42.275794982910156, "logps/rejected": -65.55119323730469, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -0.8196922540664673, "rewards/margins": 5.605785369873047, "rewards/rejected": -6.425477504730225, "step": 993 }, { "epoch": 5.890370370370371, "grad_norm": 5.35201952318648, "learning_rate": 9.646031288152021e-08, "logits/chosen": -1.310591459274292, "logits/rejected": -1.4033873081207275, "logps/chosen": -44.234039306640625, "logps/rejected": -80.975341796875, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -0.5743755102157593, "rewards/margins": 6.1121320724487305, "rewards/rejected": -6.686507225036621, "step": 994 }, { "epoch": 5.896296296296296, "grad_norm": 4.207629470079585, "learning_rate": 9.5948158134455e-08, "logits/chosen": -1.4490768909454346, "logits/rejected": -1.4172837734222412, "logps/chosen": -62.98780059814453, "logps/rejected": -65.28123474121094, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": -0.4620938301086426, "rewards/margins": 5.255311489105225, "rewards/rejected": -5.717405319213867, "step": 995 }, { "epoch": 5.902222222222222, "grad_norm": 4.720779096307494, "learning_rate": 9.543704358007279e-08, "logits/chosen": -0.9672622084617615, "logits/rejected": -1.0174751281738281, "logps/chosen": -41.439727783203125, "logps/rejected": -62.13349151611328, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -0.340609073638916, "rewards/margins": 4.7247114181518555, "rewards/rejected": -5.06532096862793, "step": 996 }, { "epoch": 5.908148148148149, "grad_norm": 3.674799578091533, "learning_rate": 9.492697266953373e-08, "logits/chosen": -1.3102741241455078, "logits/rejected": -1.235027551651001, "logps/chosen": -46.665985107421875, "logps/rejected": -61.3996696472168, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": 0.2069341540336609, "rewards/margins": 3.999211072921753, "rewards/rejected": -3.7922770977020264, "step": 997 }, { "epoch": 5.914074074074074, "grad_norm": 4.835303274377117, "learning_rate": 9.44179488469516e-08, "logits/chosen": -1.2475138902664185, "logits/rejected": -1.2858538627624512, "logps/chosen": -43.746517181396484, "logps/rejected": -91.54032897949219, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -0.967775285243988, "rewards/margins": 6.45581579208374, "rewards/rejected": -7.423590660095215, "step": 998 }, { "epoch": 5.92, "grad_norm": 6.6521399100097245, "learning_rate": 9.390997554936964e-08, "logits/chosen": -1.1728459596633911, "logits/rejected": -1.151499629020691, "logps/chosen": -53.31846618652344, "logps/rejected": -80.39374542236328, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -1.1168384552001953, "rewards/margins": 6.801609992980957, "rewards/rejected": -7.918448448181152, "step": 999 }, { "epoch": 5.925925925925926, "grad_norm": 4.697878592355873, "learning_rate": 9.340305620673778e-08, "logits/chosen": -1.1270843744277954, "logits/rejected": -1.12493896484375, "logps/chosen": -60.640384674072266, "logps/rejected": -71.61137390136719, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -1.3440361022949219, "rewards/margins": 5.394672870635986, "rewards/rejected": -6.738708972930908, "step": 1000 }, { "epoch": 5.931851851851852, "grad_norm": 7.0928652351773085, "learning_rate": 9.289719424188947e-08, "logits/chosen": -1.0911977291107178, "logits/rejected": -1.1739208698272705, "logps/chosen": -58.13080978393555, "logps/rejected": -69.92738342285156, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -0.24996742606163025, "rewards/margins": 5.095523834228516, "rewards/rejected": -5.345491409301758, "step": 1001 }, { "epoch": 5.937777777777778, "grad_norm": 7.277382234508979, "learning_rate": 9.239239307051841e-08, "logits/chosen": -1.3850924968719482, "logits/rejected": -1.2906146049499512, "logps/chosen": -43.749080657958984, "logps/rejected": -54.221092224121094, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 0.44107675552368164, "rewards/margins": 4.355238437652588, "rewards/rejected": -3.914161443710327, "step": 1002 }, { "epoch": 5.9437037037037035, "grad_norm": 5.0779449589504715, "learning_rate": 9.18886561011557e-08, "logits/chosen": -1.0621355772018433, "logits/rejected": -0.958572506904602, "logps/chosen": -48.210506439208984, "logps/rejected": -69.44853210449219, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": -0.863682210445404, "rewards/margins": 5.017454624176025, "rewards/rejected": -5.881137371063232, "step": 1003 }, { "epoch": 5.94962962962963, "grad_norm": 6.195778586468951, "learning_rate": 9.13859867351466e-08, "logits/chosen": -1.1037704944610596, "logits/rejected": -1.104474425315857, "logps/chosen": -63.77961730957031, "logps/rejected": -65.49559020996094, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -0.5683590769767761, "rewards/margins": 5.2069854736328125, "rewards/rejected": -5.7753448486328125, "step": 1004 }, { "epoch": 5.955555555555556, "grad_norm": 3.8602707820687625, "learning_rate": 9.088438836662777e-08, "logits/chosen": -1.1881828308105469, "logits/rejected": -1.3121764659881592, "logps/chosen": -51.87583541870117, "logps/rejected": -79.06917572021484, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -1.2786531448364258, "rewards/margins": 5.352337837219238, "rewards/rejected": -6.630990982055664, "step": 1005 }, { "epoch": 5.961481481481481, "grad_norm": 4.5867586019012165, "learning_rate": 9.038386438250414e-08, "logits/chosen": -1.241044044494629, "logits/rejected": -1.30616295337677, "logps/chosen": -40.884727478027344, "logps/rejected": -58.961097717285156, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 0.003995433449745178, "rewards/margins": 4.632888317108154, "rewards/rejected": -4.62889289855957, "step": 1006 }, { "epoch": 5.967407407407407, "grad_norm": 5.555139669607316, "learning_rate": 8.988441816242629e-08, "logits/chosen": -1.358335018157959, "logits/rejected": -1.4460186958312988, "logps/chosen": -49.65517807006836, "logps/rejected": -72.40342712402344, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": -1.2420268058776855, "rewards/margins": 5.1783857345581055, "rewards/rejected": -6.420413017272949, "step": 1007 }, { "epoch": 5.973333333333334, "grad_norm": 4.927040690223158, "learning_rate": 8.938605307876736e-08, "logits/chosen": -1.092087745666504, "logits/rejected": -1.1519795656204224, "logps/chosen": -37.996971130371094, "logps/rejected": -58.567012786865234, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -0.3948993682861328, "rewards/margins": 4.651831150054932, "rewards/rejected": -5.0467305183410645, "step": 1008 }, { "epoch": 5.979259259259259, "grad_norm": 4.489805022758967, "learning_rate": 8.888877249660052e-08, "logits/chosen": -1.2722234725952148, "logits/rejected": -1.286690354347229, "logps/chosen": -54.14191436767578, "logps/rejected": -73.30915832519531, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": -0.989962100982666, "rewards/margins": 5.260371685028076, "rewards/rejected": -6.250333786010742, "step": 1009 }, { "epoch": 5.985185185185185, "grad_norm": 5.465649432608913, "learning_rate": 8.839257977367609e-08, "logits/chosen": -1.0653859376907349, "logits/rejected": -1.1169452667236328, "logps/chosen": -45.828392028808594, "logps/rejected": -79.25332641601562, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8341926336288452, "rewards/margins": 6.369889259338379, "rewards/rejected": -7.2040815353393555, "step": 1010 }, { "epoch": 5.9911111111111115, "grad_norm": 6.569969651329867, "learning_rate": 8.789747826039893e-08, "logits/chosen": -0.9660034775733948, "logits/rejected": -1.1117826700210571, "logps/chosen": -47.17693328857422, "logps/rejected": -61.68070983886719, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": -0.6077823638916016, "rewards/margins": 4.161911487579346, "rewards/rejected": -4.769693851470947, "step": 1011 }, { "epoch": 5.997037037037037, "grad_norm": 6.425246591344191, "learning_rate": 8.74034712998058e-08, "logits/chosen": -1.1601378917694092, "logits/rejected": -1.3277003765106201, "logps/chosen": -45.462860107421875, "logps/rejected": -79.97228240966797, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": -0.8078010678291321, "rewards/margins": 6.313694000244141, "rewards/rejected": -7.121495246887207, "step": 1012 }, { "epoch": 6.002962962962963, "grad_norm": 4.502315709571971, "learning_rate": 8.69105622275428e-08, "logits/chosen": -1.1005038022994995, "logits/rejected": -1.072653889656067, "logps/chosen": -41.478179931640625, "logps/rejected": -64.40869140625, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": -0.2710479497909546, "rewards/margins": 6.673520088195801, "rewards/rejected": -6.944567680358887, "step": 1013 }, { "epoch": 6.0088888888888885, "grad_norm": 4.379560632717497, "learning_rate": 8.641875437184287e-08, "logits/chosen": -1.1290507316589355, "logits/rejected": -1.2435288429260254, "logps/chosen": -38.80870056152344, "logps/rejected": -75.89081573486328, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -0.28928327560424805, "rewards/margins": 7.293491363525391, "rewards/rejected": -7.582775115966797, "step": 1014 }, { "epoch": 6.014814814814815, "grad_norm": 3.2767704731233396, "learning_rate": 8.592805105350326e-08, "logits/chosen": -1.3261024951934814, "logits/rejected": -1.39167058467865, "logps/chosen": -49.11321258544922, "logps/rejected": -67.26222229003906, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -0.6615856289863586, "rewards/margins": 5.151736259460449, "rewards/rejected": -5.813322067260742, "step": 1015 }, { "epoch": 6.020740740740741, "grad_norm": 4.906372024322152, "learning_rate": 8.543845558586307e-08, "logits/chosen": -1.265272617340088, "logits/rejected": -1.2793196439743042, "logps/chosen": -44.430320739746094, "logps/rejected": -63.15349578857422, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": -0.17215143144130707, "rewards/margins": 4.199959754943848, "rewards/rejected": -4.3721113204956055, "step": 1016 }, { "epoch": 6.026666666666666, "grad_norm": 3.538919277162638, "learning_rate": 8.494997127478109e-08, "logits/chosen": -0.9269475340843201, "logits/rejected": -0.994177520275116, "logps/chosen": -46.4836540222168, "logps/rejected": -71.3299789428711, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -0.8119341135025024, "rewards/margins": 6.298572540283203, "rewards/rejected": -7.110507965087891, "step": 1017 }, { "epoch": 6.032592592592593, "grad_norm": 3.4963380275486187, "learning_rate": 8.44626014186132e-08, "logits/chosen": -1.102508544921875, "logits/rejected": -1.2094018459320068, "logps/chosen": -43.70540237426758, "logps/rejected": -62.176700592041016, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.613059401512146, "rewards/margins": 5.6967453956604, "rewards/rejected": -6.309804916381836, "step": 1018 }, { "epoch": 6.038518518518519, "grad_norm": 4.029066081775569, "learning_rate": 8.39763493081902e-08, "logits/chosen": -1.01422917842865, "logits/rejected": -1.1143798828125, "logps/chosen": -44.69683837890625, "logps/rejected": -65.62399291992188, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": -0.6260117292404175, "rewards/margins": 5.6429572105407715, "rewards/rejected": -6.2689690589904785, "step": 1019 }, { "epoch": 6.044444444444444, "grad_norm": 6.291140101689275, "learning_rate": 8.349121822679589e-08, "logits/chosen": -1.3785192966461182, "logits/rejected": -1.3688278198242188, "logps/chosen": -51.58811950683594, "logps/rejected": -67.65833282470703, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -0.22203266620635986, "rewards/margins": 5.307156562805176, "rewards/rejected": -5.529188632965088, "step": 1020 }, { "epoch": 6.05037037037037, "grad_norm": 3.7693925741310186, "learning_rate": 8.300721145014434e-08, "logits/chosen": -1.0719438791275024, "logits/rejected": -1.1484143733978271, "logps/chosen": -43.041751861572266, "logps/rejected": -55.159175872802734, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -1.5907548666000366, "rewards/margins": 4.366609573364258, "rewards/rejected": -5.957364082336426, "step": 1021 }, { "epoch": 6.0562962962962965, "grad_norm": 4.2294470997905265, "learning_rate": 8.252433224635816e-08, "logits/chosen": -1.1875334978103638, "logits/rejected": -1.225481629371643, "logps/chosen": -49.98930740356445, "logps/rejected": -76.53312683105469, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.84998619556427, "rewards/margins": 5.482215881347656, "rewards/rejected": -6.332201957702637, "step": 1022 }, { "epoch": 6.062222222222222, "grad_norm": 6.556056582379274, "learning_rate": 8.204258387594634e-08, "logits/chosen": -1.1337170600891113, "logits/rejected": -1.1994110345840454, "logps/chosen": -56.70390319824219, "logps/rejected": -67.9212646484375, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": -1.0490467548370361, "rewards/margins": 7.023098945617676, "rewards/rejected": -8.072145462036133, "step": 1023 }, { "epoch": 6.068148148148148, "grad_norm": 5.0733621236540225, "learning_rate": 8.15619695917823e-08, "logits/chosen": -1.1547056436538696, "logits/rejected": -1.1759554147720337, "logps/chosen": -60.92186737060547, "logps/rejected": -63.660179138183594, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": -1.3117988109588623, "rewards/margins": 5.863153457641602, "rewards/rejected": -7.174952507019043, "step": 1024 }, { "epoch": 6.074074074074074, "grad_norm": 4.256177047999337, "learning_rate": 8.108249263908163e-08, "logits/chosen": -1.3581020832061768, "logits/rejected": -1.4121311902999878, "logps/chosen": -57.44062042236328, "logps/rejected": -81.3388671875, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -1.617480993270874, "rewards/margins": 5.197127342224121, "rewards/rejected": -6.814608573913574, "step": 1025 }, { "epoch": 6.08, "grad_norm": 3.773024165408493, "learning_rate": 8.060415625538059e-08, "logits/chosen": -1.1411099433898926, "logits/rejected": -1.2848527431488037, "logps/chosen": -44.507415771484375, "logps/rejected": -66.17939758300781, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -0.23141825199127197, "rewards/margins": 4.467518329620361, "rewards/rejected": -4.698936462402344, "step": 1026 }, { "epoch": 6.085925925925926, "grad_norm": 5.309682508460407, "learning_rate": 8.012696367051409e-08, "logits/chosen": -1.2690435647964478, "logits/rejected": -1.2943583726882935, "logps/chosen": -48.598609924316406, "logps/rejected": -69.47331237792969, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -0.09322678297758102, "rewards/margins": 5.5550360679626465, "rewards/rejected": -5.648262977600098, "step": 1027 }, { "epoch": 6.091851851851851, "grad_norm": 4.904045381796863, "learning_rate": 7.965091810659369e-08, "logits/chosen": -1.02296781539917, "logits/rejected": -1.1315923929214478, "logps/chosen": -45.45520782470703, "logps/rejected": -67.32499694824219, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -0.799096941947937, "rewards/margins": 4.296487808227539, "rewards/rejected": -5.095584869384766, "step": 1028 }, { "epoch": 6.097777777777778, "grad_norm": 3.1264893216428313, "learning_rate": 7.917602277798612e-08, "logits/chosen": -1.0046404600143433, "logits/rejected": -1.1222795248031616, "logps/chosen": -49.771968841552734, "logps/rejected": -76.62921142578125, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": -1.469895839691162, "rewards/margins": 6.9465131759643555, "rewards/rejected": -8.41640853881836, "step": 1029 }, { "epoch": 6.103703703703704, "grad_norm": 4.517176589514016, "learning_rate": 7.870228089129155e-08, "logits/chosen": -0.9964554905891418, "logits/rejected": -1.0924674272537231, "logps/chosen": -41.06652069091797, "logps/rejected": -58.19047164916992, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 0.026089489459991455, "rewards/margins": 5.98679256439209, "rewards/rejected": -5.960702896118164, "step": 1030 }, { "epoch": 6.109629629629629, "grad_norm": 3.8344412940587396, "learning_rate": 7.822969564532167e-08, "logits/chosen": -1.2939468622207642, "logits/rejected": -1.306887149810791, "logps/chosen": -47.0091667175293, "logps/rejected": -73.51778411865234, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": -1.0579248666763306, "rewards/margins": 6.318962574005127, "rewards/rejected": -7.37688684463501, "step": 1031 }, { "epoch": 6.115555555555556, "grad_norm": 9.246762363812655, "learning_rate": 7.775827023107834e-08, "logits/chosen": -0.9924752712249756, "logits/rejected": -1.117583155632019, "logps/chosen": -46.18872833251953, "logps/rejected": -73.19407653808594, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": -0.8105946779251099, "rewards/margins": 5.830495834350586, "rewards/rejected": -6.641090393066406, "step": 1032 }, { "epoch": 6.1214814814814815, "grad_norm": 3.882709661161108, "learning_rate": 7.728800783173201e-08, "logits/chosen": -1.0195491313934326, "logits/rejected": -1.1015743017196655, "logps/chosen": -43.58595275878906, "logps/rejected": -79.01959228515625, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -0.06294581294059753, "rewards/margins": 6.317727565765381, "rewards/rejected": -6.380673408508301, "step": 1033 }, { "epoch": 6.127407407407407, "grad_norm": 3.9425021805483746, "learning_rate": 7.681891162260015e-08, "logits/chosen": -0.8941872715950012, "logits/rejected": -0.953436553478241, "logps/chosen": -40.192047119140625, "logps/rejected": -60.56889343261719, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": 0.19415129721164703, "rewards/margins": 5.351212501525879, "rewards/rejected": -5.157060623168945, "step": 1034 }, { "epoch": 6.133333333333334, "grad_norm": 4.311648687445995, "learning_rate": 7.635098477112587e-08, "logits/chosen": -1.4252240657806396, "logits/rejected": -1.3934770822525024, "logps/chosen": -45.87331771850586, "logps/rejected": -68.5013427734375, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -0.39978134632110596, "rewards/margins": 5.793413162231445, "rewards/rejected": -6.193194389343262, "step": 1035 }, { "epoch": 6.139259259259259, "grad_norm": 3.759043416715931, "learning_rate": 7.588423043685646e-08, "logits/chosen": -1.2668523788452148, "logits/rejected": -1.2626163959503174, "logps/chosen": -48.48180389404297, "logps/rejected": -65.43073272705078, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 0.12602674961090088, "rewards/margins": 5.682659149169922, "rewards/rejected": -5.556632995605469, "step": 1036 }, { "epoch": 6.145185185185185, "grad_norm": 5.002265911936616, "learning_rate": 7.541865177142223e-08, "logits/chosen": -1.119709849357605, "logits/rejected": -1.2077200412750244, "logps/chosen": -48.262115478515625, "logps/rejected": -76.51467895507812, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -0.7838220596313477, "rewards/margins": 7.046172142028809, "rewards/rejected": -7.829994201660156, "step": 1037 }, { "epoch": 6.151111111111111, "grad_norm": 2.6511130353379184, "learning_rate": 7.4954251918515e-08, "logits/chosen": -1.2835824489593506, "logits/rejected": -1.228615403175354, "logps/chosen": -47.74272918701172, "logps/rejected": -67.5928955078125, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -0.7985275983810425, "rewards/margins": 5.398589134216309, "rewards/rejected": -6.197115898132324, "step": 1038 }, { "epoch": 6.157037037037037, "grad_norm": 4.420164019917996, "learning_rate": 7.449103401386702e-08, "logits/chosen": -1.1633448600769043, "logits/rejected": -1.3595736026763916, "logps/chosen": -40.677024841308594, "logps/rejected": -75.61985778808594, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -1.0207406282424927, "rewards/margins": 6.18635368347168, "rewards/rejected": -7.207094669342041, "step": 1039 }, { "epoch": 6.162962962962963, "grad_norm": 4.476071391068969, "learning_rate": 7.402900118522978e-08, "logits/chosen": -1.0822563171386719, "logits/rejected": -1.2434269189834595, "logps/chosen": -41.48609924316406, "logps/rejected": -77.5089111328125, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -1.56549072265625, "rewards/margins": 7.847236633300781, "rewards/rejected": -9.412727355957031, "step": 1040 }, { "epoch": 6.168888888888889, "grad_norm": 5.506742522245201, "learning_rate": 7.356815655235286e-08, "logits/chosen": -1.2080464363098145, "logits/rejected": -1.267046570777893, "logps/chosen": -50.45636749267578, "logps/rejected": -74.14933013916016, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -1.737205982208252, "rewards/margins": 5.574244022369385, "rewards/rejected": -7.311450004577637, "step": 1041 }, { "epoch": 6.174814814814815, "grad_norm": 5.666447335256339, "learning_rate": 7.310850322696283e-08, "logits/chosen": -1.1826661825180054, "logits/rejected": -1.2252609729766846, "logps/chosen": -41.64863586425781, "logps/rejected": -55.108455657958984, "loss": 0.057, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15391169488430023, "rewards/margins": 5.0056657791137695, "rewards/rejected": -5.159577369689941, "step": 1042 }, { "epoch": 6.180740740740741, "grad_norm": 4.852046866334851, "learning_rate": 7.265004431274236e-08, "logits/chosen": -1.0628266334533691, "logits/rejected": -1.0478066205978394, "logps/chosen": -46.114532470703125, "logps/rejected": -61.23189163208008, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": -0.26190853118896484, "rewards/margins": 6.462977409362793, "rewards/rejected": -6.724886417388916, "step": 1043 }, { "epoch": 6.1866666666666665, "grad_norm": 4.455385489543663, "learning_rate": 7.219278290530909e-08, "logits/chosen": -1.1407748460769653, "logits/rejected": -1.1786659955978394, "logps/chosen": -54.615352630615234, "logps/rejected": -64.38876342773438, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.6010631322860718, "rewards/margins": 4.484706878662109, "rewards/rejected": -5.085770606994629, "step": 1044 }, { "epoch": 6.192592592592592, "grad_norm": 4.3493524620098425, "learning_rate": 7.173672209219494e-08, "logits/chosen": -1.0870596170425415, "logits/rejected": -1.1793104410171509, "logps/chosen": -49.675926208496094, "logps/rejected": -77.1938705444336, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": -0.244964599609375, "rewards/margins": 5.766213417053223, "rewards/rejected": -6.011178016662598, "step": 1045 }, { "epoch": 6.198518518518519, "grad_norm": 4.241173092953691, "learning_rate": 7.128186495282507e-08, "logits/chosen": -0.9993229508399963, "logits/rejected": -1.1206510066986084, "logps/chosen": -42.55421447753906, "logps/rejected": -68.5854721069336, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -0.45122551918029785, "rewards/margins": 4.571963787078857, "rewards/rejected": -5.023189544677734, "step": 1046 }, { "epoch": 6.204444444444444, "grad_norm": 4.926385819576706, "learning_rate": 7.082821455849717e-08, "logits/chosen": -0.9856117963790894, "logits/rejected": -1.0857964754104614, "logps/chosen": -51.458953857421875, "logps/rejected": -74.82815551757812, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -0.9152073264122009, "rewards/margins": 5.840726852416992, "rewards/rejected": -6.755934238433838, "step": 1047 }, { "epoch": 6.21037037037037, "grad_norm": 4.207958288778069, "learning_rate": 7.037577397236074e-08, "logits/chosen": -1.2903097867965698, "logits/rejected": -1.193856120109558, "logps/chosen": -55.63256072998047, "logps/rejected": -82.83590698242188, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -1.2452365159988403, "rewards/margins": 6.734645366668701, "rewards/rejected": -7.979881763458252, "step": 1048 }, { "epoch": 6.216296296296297, "grad_norm": 3.6399032890233083, "learning_rate": 6.992454624939636e-08, "logits/chosen": -1.0144593715667725, "logits/rejected": -1.1578352451324463, "logps/chosen": -47.56209182739258, "logps/rejected": -81.01909637451172, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -0.9345325827598572, "rewards/margins": 5.261068344116211, "rewards/rejected": -6.195600509643555, "step": 1049 }, { "epoch": 6.222222222222222, "grad_norm": 4.564684222290065, "learning_rate": 6.947453443639514e-08, "logits/chosen": -1.2124285697937012, "logits/rejected": -1.231855869293213, "logps/chosen": -48.263282775878906, "logps/rejected": -74.08621978759766, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": -0.6651404500007629, "rewards/margins": 5.952767372131348, "rewards/rejected": -6.617908477783203, "step": 1050 }, { "epoch": 6.228148148148148, "grad_norm": 5.891948862633519, "learning_rate": 6.902574157193794e-08, "logits/chosen": -1.2942461967468262, "logits/rejected": -1.3647191524505615, "logps/chosen": -57.16059494018555, "logps/rejected": -66.65109252929688, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": -1.2253351211547852, "rewards/margins": 4.405270576477051, "rewards/rejected": -5.630605697631836, "step": 1051 }, { "epoch": 6.234074074074074, "grad_norm": 4.906829136136163, "learning_rate": 6.857817068637526e-08, "logits/chosen": -1.347945213317871, "logits/rejected": -1.3540549278259277, "logps/chosen": -62.06746292114258, "logps/rejected": -66.86209106445312, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": -0.7687455415725708, "rewards/margins": 6.335521697998047, "rewards/rejected": -7.10426664352417, "step": 1052 }, { "epoch": 6.24, "grad_norm": 5.057844402832865, "learning_rate": 6.81318248018064e-08, "logits/chosen": -1.0574407577514648, "logits/rejected": -1.1014264822006226, "logps/chosen": -52.07094955444336, "logps/rejected": -88.19418334960938, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -0.8869056701660156, "rewards/margins": 6.524155616760254, "rewards/rejected": -7.4110612869262695, "step": 1053 }, { "epoch": 6.245925925925926, "grad_norm": 4.673828872705199, "learning_rate": 6.7686706932059e-08, "logits/chosen": -1.191150188446045, "logits/rejected": -1.2414634227752686, "logps/chosen": -47.04499816894531, "logps/rejected": -66.87259674072266, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": -0.42619264125823975, "rewards/margins": 4.806652069091797, "rewards/rejected": -5.232844352722168, "step": 1054 }, { "epoch": 6.2518518518518515, "grad_norm": 5.211512132893841, "learning_rate": 6.72428200826691e-08, "logits/chosen": -1.1251758337020874, "logits/rejected": -1.1461089849472046, "logps/chosen": -59.33203125, "logps/rejected": -76.55050659179688, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -0.4580373764038086, "rewards/margins": 5.884695529937744, "rewards/rejected": -6.342732906341553, "step": 1055 }, { "epoch": 6.257777777777778, "grad_norm": 7.15758851283485, "learning_rate": 6.680016725086052e-08, "logits/chosen": -1.162949800491333, "logits/rejected": -1.1469281911849976, "logps/chosen": -48.195289611816406, "logps/rejected": -74.478271484375, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": -0.9580037593841553, "rewards/margins": 6.0397186279296875, "rewards/rejected": -6.997722148895264, "step": 1056 }, { "epoch": 6.263703703703704, "grad_norm": 3.9471224321859206, "learning_rate": 6.635875142552475e-08, "logits/chosen": -1.0656099319458008, "logits/rejected": -1.1877268552780151, "logps/chosen": -50.106781005859375, "logps/rejected": -83.21324920654297, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -0.9684824347496033, "rewards/margins": 6.485328674316406, "rewards/rejected": -7.453810691833496, "step": 1057 }, { "epoch": 6.269629629629629, "grad_norm": 5.0963421834197575, "learning_rate": 6.591857558720071e-08, "logits/chosen": -1.1995526552200317, "logits/rejected": -1.2433815002441406, "logps/chosen": -43.03889083862305, "logps/rejected": -59.46638488769531, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -0.2931675314903259, "rewards/margins": 5.359686851501465, "rewards/rejected": -5.6528544425964355, "step": 1058 }, { "epoch": 6.275555555555556, "grad_norm": 6.1786398121643495, "learning_rate": 6.547964270805467e-08, "logits/chosen": -1.2752354145050049, "logits/rejected": -1.2794020175933838, "logps/chosen": -40.54266357421875, "logps/rejected": -68.0904541015625, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -0.13839620351791382, "rewards/margins": 5.956443786621094, "rewards/rejected": -6.094839096069336, "step": 1059 }, { "epoch": 6.281481481481482, "grad_norm": 6.112280975588385, "learning_rate": 6.504195575186008e-08, "logits/chosen": -1.1097819805145264, "logits/rejected": -1.1845353841781616, "logps/chosen": -48.27924346923828, "logps/rejected": -76.77072143554688, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": -0.9963631629943848, "rewards/margins": 5.359070301055908, "rewards/rejected": -6.355433464050293, "step": 1060 }, { "epoch": 6.287407407407407, "grad_norm": 4.518898554024413, "learning_rate": 6.460551767397784e-08, "logits/chosen": -1.1713427305221558, "logits/rejected": -1.3381168842315674, "logps/chosen": -47.503623962402344, "logps/rejected": -72.43147277832031, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -0.6597295999526978, "rewards/margins": 5.930143356323242, "rewards/rejected": -6.58987283706665, "step": 1061 }, { "epoch": 6.293333333333333, "grad_norm": 4.096124285194584, "learning_rate": 6.417033142133593e-08, "logits/chosen": -1.155659794807434, "logits/rejected": -1.1853158473968506, "logps/chosen": -39.073097229003906, "logps/rejected": -67.7274398803711, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -0.7619829177856445, "rewards/margins": 5.485907554626465, "rewards/rejected": -6.247889995574951, "step": 1062 }, { "epoch": 6.29925925925926, "grad_norm": 3.250336129808355, "learning_rate": 6.37363999324098e-08, "logits/chosen": -0.7996464371681213, "logits/rejected": -0.8508714437484741, "logps/chosen": -40.68121337890625, "logps/rejected": -66.40437316894531, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -1.070540428161621, "rewards/margins": 6.3079142570495605, "rewards/rejected": -7.378454685211182, "step": 1063 }, { "epoch": 6.305185185185185, "grad_norm": 5.052049782134948, "learning_rate": 6.330372613720247e-08, "logits/chosen": -1.282618761062622, "logits/rejected": -1.2823154926300049, "logps/chosen": -46.28981399536133, "logps/rejected": -63.936187744140625, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": -1.0241888761520386, "rewards/margins": 4.308932781219482, "rewards/rejected": -5.3331217765808105, "step": 1064 }, { "epoch": 6.311111111111111, "grad_norm": 3.259889226299206, "learning_rate": 6.28723129572247e-08, "logits/chosen": -1.1003174781799316, "logits/rejected": -1.2158787250518799, "logps/chosen": -58.42970275878906, "logps/rejected": -75.01763153076172, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6939321756362915, "rewards/margins": 5.769342422485352, "rewards/rejected": -6.4632744789123535, "step": 1065 }, { "epoch": 6.3170370370370375, "grad_norm": 5.479924725053419, "learning_rate": 6.244216330547533e-08, "logits/chosen": -1.1541452407836914, "logits/rejected": -1.146990180015564, "logps/chosen": -42.12281036376953, "logps/rejected": -54.60337829589844, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -1.3140569925308228, "rewards/margins": 5.23880672454834, "rewards/rejected": -6.552864074707031, "step": 1066 }, { "epoch": 6.322962962962963, "grad_norm": 5.728921555274712, "learning_rate": 6.201328008642159e-08, "logits/chosen": -1.1909137964248657, "logits/rejected": -1.1429741382598877, "logps/chosen": -47.83819580078125, "logps/rejected": -60.23460388183594, "loss": 0.0491, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9541853666305542, "rewards/margins": 5.615292549133301, "rewards/rejected": -6.5694780349731445, "step": 1067 }, { "epoch": 6.328888888888889, "grad_norm": 6.125870321351669, "learning_rate": 6.158566619597932e-08, "logits/chosen": -1.114587426185608, "logits/rejected": -1.1817578077316284, "logps/chosen": -40.60179901123047, "logps/rejected": -65.0115966796875, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -1.0992913246154785, "rewards/margins": 5.84259033203125, "rewards/rejected": -6.9418816566467285, "step": 1068 }, { "epoch": 6.3348148148148145, "grad_norm": 4.240998896145259, "learning_rate": 6.115932452149372e-08, "logits/chosen": -1.162363886833191, "logits/rejected": -1.239659309387207, "logps/chosen": -42.08208465576172, "logps/rejected": -65.01765441894531, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.07813447713851929, "rewards/margins": 6.08473014831543, "rewards/rejected": -6.162865161895752, "step": 1069 }, { "epoch": 6.340740740740741, "grad_norm": 5.299621800363426, "learning_rate": 6.07342579417196e-08, "logits/chosen": -1.1256684064865112, "logits/rejected": -1.1961945295333862, "logps/chosen": -44.97161102294922, "logps/rejected": -65.80847930908203, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": -0.6299454569816589, "rewards/margins": 5.447868347167969, "rewards/rejected": -6.077814102172852, "step": 1070 }, { "epoch": 6.346666666666667, "grad_norm": 3.9266127213154496, "learning_rate": 6.031046932680229e-08, "logits/chosen": -1.1711387634277344, "logits/rejected": -1.1314812898635864, "logps/chosen": -51.13462829589844, "logps/rejected": -77.2643814086914, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": -0.340965211391449, "rewards/margins": 6.144353866577148, "rewards/rejected": -6.485319137573242, "step": 1071 }, { "epoch": 6.352592592592592, "grad_norm": 3.437726901220671, "learning_rate": 5.988796153825768e-08, "logits/chosen": -1.3358840942382812, "logits/rejected": -1.359967589378357, "logps/chosen": -62.960697174072266, "logps/rejected": -75.00189208984375, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -1.8468966484069824, "rewards/margins": 5.507348537445068, "rewards/rejected": -7.354245185852051, "step": 1072 }, { "epoch": 6.358518518518519, "grad_norm": 6.198221610149886, "learning_rate": 5.9466737428953444e-08, "logits/chosen": -1.0683010816574097, "logits/rejected": -1.150133490562439, "logps/chosen": -50.889564514160156, "logps/rejected": -95.1012954711914, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": -1.7292721271514893, "rewards/margins": 7.153807640075684, "rewards/rejected": -8.883079528808594, "step": 1073 }, { "epoch": 6.364444444444445, "grad_norm": 4.926481221358003, "learning_rate": 5.9046799843089464e-08, "logits/chosen": -1.240066409111023, "logits/rejected": -1.2892934083938599, "logps/chosen": -43.26491928100586, "logps/rejected": -57.51011276245117, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.2330174446105957, "rewards/margins": 4.55935001373291, "rewards/rejected": -4.792367458343506, "step": 1074 }, { "epoch": 6.37037037037037, "grad_norm": 4.627848653758448, "learning_rate": 5.862815161617879e-08, "logits/chosen": -1.0937652587890625, "logits/rejected": -1.1810989379882812, "logps/chosen": -55.17090606689453, "logps/rejected": -80.78233337402344, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": -0.8725177645683289, "rewards/margins": 5.987633228302002, "rewards/rejected": -6.860151290893555, "step": 1075 }, { "epoch": 6.376296296296296, "grad_norm": 4.425158565324127, "learning_rate": 5.8210795575028395e-08, "logits/chosen": -1.127935767173767, "logits/rejected": -1.186619758605957, "logps/chosen": -50.21623611450195, "logps/rejected": -81.5506820678711, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": -1.0250823497772217, "rewards/margins": 5.458142280578613, "rewards/rejected": -6.483224868774414, "step": 1076 }, { "epoch": 6.3822222222222225, "grad_norm": 3.213290153468062, "learning_rate": 5.7794734537720156e-08, "logits/chosen": -1.2193506956100464, "logits/rejected": -1.257124423980713, "logps/chosen": -63.197174072265625, "logps/rejected": -71.59033966064453, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": -1.2621850967407227, "rewards/margins": 4.87045955657959, "rewards/rejected": -6.1326446533203125, "step": 1077 }, { "epoch": 6.388148148148148, "grad_norm": 2.972357542462106, "learning_rate": 5.7379971313591736e-08, "logits/chosen": -1.2130753993988037, "logits/rejected": -1.1798851490020752, "logps/chosen": -63.9143180847168, "logps/rejected": -88.82057189941406, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -1.3516303300857544, "rewards/margins": 7.052016735076904, "rewards/rejected": -8.403646469116211, "step": 1078 }, { "epoch": 6.394074074074074, "grad_norm": 5.719397621364952, "learning_rate": 5.69665087032177e-08, "logits/chosen": -1.0475133657455444, "logits/rejected": -1.0087506771087646, "logps/chosen": -48.91676330566406, "logps/rejected": -60.33573913574219, "loss": 0.0531, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9253264665603638, "rewards/margins": 3.923841714859009, "rewards/rejected": -4.84916877746582, "step": 1079 }, { "epoch": 6.4, "grad_norm": 5.636018568126882, "learning_rate": 5.6554349498390606e-08, "logits/chosen": -1.2317850589752197, "logits/rejected": -1.3244078159332275, "logps/chosen": -46.108726501464844, "logps/rejected": -68.44413757324219, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": -0.5189595222473145, "rewards/margins": 5.146785736083984, "rewards/rejected": -5.665745258331299, "step": 1080 }, { "epoch": 6.405925925925926, "grad_norm": 4.154215326167201, "learning_rate": 5.614349648210212e-08, "logits/chosen": -1.2623381614685059, "logits/rejected": -1.3493742942810059, "logps/chosen": -55.09934616088867, "logps/rejected": -68.43744659423828, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -1.1642762422561646, "rewards/margins": 5.039634704589844, "rewards/rejected": -6.203911304473877, "step": 1081 }, { "epoch": 6.411851851851852, "grad_norm": 4.482983957523619, "learning_rate": 5.573395242852416e-08, "logits/chosen": -1.0360773801803589, "logits/rejected": -1.1870149374008179, "logps/chosen": -47.35795974731445, "logps/rejected": -87.30964660644531, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.9789612293243408, "rewards/margins": 6.068577766418457, "rewards/rejected": -7.047539234161377, "step": 1082 }, { "epoch": 6.417777777777777, "grad_norm": 4.094917827882695, "learning_rate": 5.532572010299033e-08, "logits/chosen": -1.233745813369751, "logits/rejected": -1.318791389465332, "logps/chosen": -44.077110290527344, "logps/rejected": -67.19015502929688, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": -0.031197786331176758, "rewards/margins": 5.726001262664795, "rewards/rejected": -5.757198810577393, "step": 1083 }, { "epoch": 6.423703703703704, "grad_norm": 5.302432797807315, "learning_rate": 5.4918802261977067e-08, "logits/chosen": -1.0582901239395142, "logits/rejected": -1.1042211055755615, "logps/chosen": -48.33747100830078, "logps/rejected": -63.80529022216797, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": -0.7692410349845886, "rewards/margins": 5.0125932693481445, "rewards/rejected": -5.781834125518799, "step": 1084 }, { "epoch": 6.42962962962963, "grad_norm": 4.723133012663554, "learning_rate": 5.451320165308518e-08, "logits/chosen": -1.0730984210968018, "logits/rejected": -1.1216435432434082, "logps/chosen": -50.3352165222168, "logps/rejected": -73.663818359375, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -1.752791404724121, "rewards/margins": 6.158186435699463, "rewards/rejected": -7.910977840423584, "step": 1085 }, { "epoch": 6.435555555555555, "grad_norm": 3.9126904954362978, "learning_rate": 5.410892101502118e-08, "logits/chosen": -1.0621589422225952, "logits/rejected": -1.1279480457305908, "logps/chosen": -49.674407958984375, "logps/rejected": -69.31290435791016, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -1.3062057495117188, "rewards/margins": 6.739208221435547, "rewards/rejected": -8.045413970947266, "step": 1086 }, { "epoch": 6.441481481481482, "grad_norm": 4.040968337186127, "learning_rate": 5.370596307757885e-08, "logits/chosen": -1.2488672733306885, "logits/rejected": -1.2570903301239014, "logps/chosen": -44.28248596191406, "logps/rejected": -78.1733627319336, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -0.5499465465545654, "rewards/margins": 6.733985424041748, "rewards/rejected": -7.283932209014893, "step": 1087 }, { "epoch": 6.4474074074074075, "grad_norm": 5.338352284785468, "learning_rate": 5.330433056162084e-08, "logits/chosen": -1.4356791973114014, "logits/rejected": -1.409859538078308, "logps/chosen": -47.546478271484375, "logps/rejected": -56.075843811035156, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": -0.6689544916152954, "rewards/margins": 4.289015769958496, "rewards/rejected": -4.95797061920166, "step": 1088 }, { "epoch": 6.453333333333333, "grad_norm": 6.237433249524388, "learning_rate": 5.29040261790602e-08, "logits/chosen": -1.2191461324691772, "logits/rejected": -1.2319748401641846, "logps/chosen": -64.38487243652344, "logps/rejected": -80.97091674804688, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": -0.5654529333114624, "rewards/margins": 6.002832412719727, "rewards/rejected": -6.568285942077637, "step": 1089 }, { "epoch": 6.459259259259259, "grad_norm": 5.076848534654584, "learning_rate": 5.2505052632842187e-08, "logits/chosen": -1.0852857828140259, "logits/rejected": -1.1707793474197388, "logps/chosen": -49.97313690185547, "logps/rejected": -56.366294860839844, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": -0.5606622695922852, "rewards/margins": 4.890949726104736, "rewards/rejected": -5.451611518859863, "step": 1090 }, { "epoch": 6.465185185185185, "grad_norm": 3.027156656004254, "learning_rate": 5.210741261692586e-08, "logits/chosen": -1.139082670211792, "logits/rejected": -1.4209345579147339, "logps/chosen": -41.075599670410156, "logps/rejected": -70.40779113769531, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -0.8302870988845825, "rewards/margins": 6.642550468444824, "rewards/rejected": -7.472837924957275, "step": 1091 }, { "epoch": 6.471111111111111, "grad_norm": 4.322364950918871, "learning_rate": 5.171110881626603e-08, "logits/chosen": -1.0543513298034668, "logits/rejected": -1.0842859745025635, "logps/chosen": -45.20751190185547, "logps/rejected": -62.549015045166016, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": -0.9526963829994202, "rewards/margins": 5.437678337097168, "rewards/rejected": -6.390375137329102, "step": 1092 }, { "epoch": 6.477037037037037, "grad_norm": 4.851644975850725, "learning_rate": 5.1316143906795175e-08, "logits/chosen": -1.1379737854003906, "logits/rejected": -1.185952067375183, "logps/chosen": -56.87077331542969, "logps/rejected": -76.13067626953125, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -2.251248598098755, "rewards/margins": 6.4593071937561035, "rewards/rejected": -8.710555076599121, "step": 1093 }, { "epoch": 6.482962962962963, "grad_norm": 4.68168860240618, "learning_rate": 5.092252055540513e-08, "logits/chosen": -1.1433980464935303, "logits/rejected": -1.233651876449585, "logps/chosen": -51.71131134033203, "logps/rejected": -67.805908203125, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.38715383410453796, "rewards/margins": 4.533756732940674, "rewards/rejected": -4.920910835266113, "step": 1094 }, { "epoch": 6.488888888888889, "grad_norm": 5.654048431405952, "learning_rate": 5.053024141992934e-08, "logits/chosen": -1.207375407218933, "logits/rejected": -1.3669353723526, "logps/chosen": -39.886138916015625, "logps/rejected": -50.32880783081055, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": -0.5060518980026245, "rewards/margins": 3.433199882507324, "rewards/rejected": -3.9392518997192383, "step": 1095 }, { "epoch": 6.494814814814815, "grad_norm": 5.999658315264965, "learning_rate": 5.013930914912476e-08, "logits/chosen": -1.114271879196167, "logits/rejected": -1.1348860263824463, "logps/chosen": -34.48630142211914, "logps/rejected": -63.001434326171875, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": -0.942639946937561, "rewards/margins": 5.111496925354004, "rewards/rejected": -6.054136276245117, "step": 1096 }, { "epoch": 6.50074074074074, "grad_norm": 4.615994178463875, "learning_rate": 4.97497263826539e-08, "logits/chosen": -1.0940725803375244, "logits/rejected": -1.2616147994995117, "logps/chosen": -40.44831085205078, "logps/rejected": -75.76023864746094, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -0.6569719910621643, "rewards/margins": 6.785313606262207, "rewards/rejected": -7.442286014556885, "step": 1097 }, { "epoch": 6.506666666666667, "grad_norm": 3.928446346908786, "learning_rate": 4.936149575106727e-08, "logits/chosen": -1.2776455879211426, "logits/rejected": -1.3003250360488892, "logps/chosen": -58.20839309692383, "logps/rejected": -66.91224670410156, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -0.9391154050827026, "rewards/margins": 4.6472015380859375, "rewards/rejected": -5.58631706237793, "step": 1098 }, { "epoch": 6.5125925925925925, "grad_norm": 4.992697059243111, "learning_rate": 4.897461987578541e-08, "logits/chosen": -1.1780474185943604, "logits/rejected": -1.1831226348876953, "logps/chosen": -36.937774658203125, "logps/rejected": -58.36042404174805, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": 0.03631478548049927, "rewards/margins": 5.19421911239624, "rewards/rejected": -5.157904624938965, "step": 1099 }, { "epoch": 6.518518518518518, "grad_norm": 4.0438360078549875, "learning_rate": 4.8589101369081235e-08, "logits/chosen": -1.0455366373062134, "logits/rejected": -1.1880236864089966, "logps/chosen": -48.7999267578125, "logps/rejected": -72.28286743164062, "loss": 0.032, "rewards/accuracies": 1.0, "rewards/chosen": -0.7477136850357056, "rewards/margins": 4.7201337814331055, "rewards/rejected": -5.4678473472595215, "step": 1100 }, { "epoch": 6.524444444444445, "grad_norm": 5.630636162552549, "learning_rate": 4.8204942834062373e-08, "logits/chosen": -1.1372640132904053, "logits/rejected": -1.1603807210922241, "logps/chosen": -37.56422805786133, "logps/rejected": -57.50593948364258, "loss": 0.056, "rewards/accuracies": 1.0, "rewards/chosen": -0.9024946093559265, "rewards/margins": 4.253570556640625, "rewards/rejected": -5.156064987182617, "step": 1101 }, { "epoch": 6.53037037037037, "grad_norm": 4.3142630964680135, "learning_rate": 4.7822146864653744e-08, "logits/chosen": -1.2611795663833618, "logits/rejected": -1.3063312768936157, "logps/chosen": -57.0969123840332, "logps/rejected": -77.35307312011719, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -0.8758964538574219, "rewards/margins": 6.118692398071289, "rewards/rejected": -6.994588851928711, "step": 1102 }, { "epoch": 6.536296296296296, "grad_norm": 3.921713940170822, "learning_rate": 4.744071604557978e-08, "logits/chosen": -1.0361847877502441, "logits/rejected": -1.1400340795516968, "logps/chosen": -43.746986389160156, "logps/rejected": -58.988487243652344, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -0.35746294260025024, "rewards/margins": 4.024482727050781, "rewards/rejected": -4.381945610046387, "step": 1103 }, { "epoch": 6.542222222222223, "grad_norm": 3.209360528145483, "learning_rate": 4.706065295234718e-08, "logits/chosen": -1.0801401138305664, "logits/rejected": -1.191404938697815, "logps/chosen": -54.80885696411133, "logps/rejected": -87.4782485961914, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": -1.2579131126403809, "rewards/margins": 6.139656066894531, "rewards/rejected": -7.397568702697754, "step": 1104 }, { "epoch": 6.548148148148148, "grad_norm": 3.613238422110268, "learning_rate": 4.668196015122736e-08, "logits/chosen": -1.3278753757476807, "logits/rejected": -1.2881349325180054, "logps/chosen": -46.13524627685547, "logps/rejected": -61.586280822753906, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -0.36166372895240784, "rewards/margins": 4.425073146820068, "rewards/rejected": -4.786736488342285, "step": 1105 }, { "epoch": 6.554074074074074, "grad_norm": 5.514027809117569, "learning_rate": 4.630464019923932e-08, "logits/chosen": -1.19566810131073, "logits/rejected": -1.2912870645523071, "logps/chosen": -51.66880798339844, "logps/rejected": -61.59565734863281, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.7243253588676453, "rewards/margins": 4.674445152282715, "rewards/rejected": -5.398770809173584, "step": 1106 }, { "epoch": 6.5600000000000005, "grad_norm": 2.6348077760574973, "learning_rate": 4.5928695644132266e-08, "logits/chosen": -1.0767490863800049, "logits/rejected": -1.175428032875061, "logps/chosen": -44.48492431640625, "logps/rejected": -67.7999496459961, "loss": 0.0213, "rewards/accuracies": 1.0, "rewards/chosen": -1.3176645040512085, "rewards/margins": 6.277517318725586, "rewards/rejected": -7.595181465148926, "step": 1107 }, { "epoch": 6.565925925925926, "grad_norm": 3.981264192762706, "learning_rate": 4.5554129024368334e-08, "logits/chosen": -1.176161289215088, "logits/rejected": -1.1765999794006348, "logps/chosen": -47.69729232788086, "logps/rejected": -73.8935546875, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -1.450006365776062, "rewards/margins": 6.2602643966674805, "rewards/rejected": -7.710270881652832, "step": 1108 }, { "epoch": 6.571851851851852, "grad_norm": 4.583614175137326, "learning_rate": 4.5180942869105594e-08, "logits/chosen": -0.8451076745986938, "logits/rejected": -0.958713948726654, "logps/chosen": -52.66331100463867, "logps/rejected": -64.74360656738281, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -1.3362882137298584, "rewards/margins": 5.263598442077637, "rewards/rejected": -6.599886894226074, "step": 1109 }, { "epoch": 6.5777777777777775, "grad_norm": 3.409875007519284, "learning_rate": 4.480913969818098e-08, "logits/chosen": -1.1373850107192993, "logits/rejected": -1.2118630409240723, "logps/chosen": -48.01524353027344, "logps/rejected": -75.55987548828125, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -1.3638169765472412, "rewards/margins": 6.17181921005249, "rewards/rejected": -7.535635948181152, "step": 1110 }, { "epoch": 6.583703703703704, "grad_norm": 4.606324800785139, "learning_rate": 4.4438722022092925e-08, "logits/chosen": -0.9775791764259338, "logits/rejected": -1.181314468383789, "logps/chosen": -47.213096618652344, "logps/rejected": -72.06116485595703, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -1.4981868267059326, "rewards/margins": 6.857434272766113, "rewards/rejected": -8.355621337890625, "step": 1111 }, { "epoch": 6.58962962962963, "grad_norm": 3.7424781322340253, "learning_rate": 4.406969234198507e-08, "logits/chosen": -1.1918004751205444, "logits/rejected": -1.1910879611968994, "logps/chosen": -52.43225860595703, "logps/rejected": -81.70747375488281, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": -1.601660132408142, "rewards/margins": 7.3738813400268555, "rewards/rejected": -8.975541114807129, "step": 1112 }, { "epoch": 6.595555555555555, "grad_norm": 6.664166725738044, "learning_rate": 4.370205314962872e-08, "logits/chosen": -1.107804298400879, "logits/rejected": -1.1111526489257812, "logps/chosen": -51.81007385253906, "logps/rejected": -62.18156051635742, "loss": 0.0454, "rewards/accuracies": 1.0, "rewards/chosen": -0.7900987267494202, "rewards/margins": 4.66523551940918, "rewards/rejected": -5.455334186553955, "step": 1113 }, { "epoch": 6.601481481481482, "grad_norm": 3.960910024090369, "learning_rate": 4.333580692740643e-08, "logits/chosen": -0.9758756160736084, "logits/rejected": -1.160149335861206, "logps/chosen": -32.102787017822266, "logps/rejected": -63.1169548034668, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -0.2916122078895569, "rewards/margins": 5.569136619567871, "rewards/rejected": -5.860749244689941, "step": 1114 }, { "epoch": 6.607407407407408, "grad_norm": 5.435914920357414, "learning_rate": 4.2970956148295075e-08, "logits/chosen": -1.217504620552063, "logits/rejected": -1.211649775505066, "logps/chosen": -38.41328430175781, "logps/rejected": -60.064430236816406, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": -0.10607568919658661, "rewards/margins": 5.275820255279541, "rewards/rejected": -5.381896018981934, "step": 1115 }, { "epoch": 6.613333333333333, "grad_norm": 3.2372352578704504, "learning_rate": 4.260750327584911e-08, "logits/chosen": -1.2909510135650635, "logits/rejected": -1.3777996301651, "logps/chosen": -48.34864807128906, "logps/rejected": -63.35348129272461, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -0.2591802179813385, "rewards/margins": 5.086357116699219, "rewards/rejected": -5.3455376625061035, "step": 1116 }, { "epoch": 6.619259259259259, "grad_norm": 4.5060587727117865, "learning_rate": 4.2245450764184095e-08, "logits/chosen": -0.9973583221435547, "logits/rejected": -1.065267562866211, "logps/chosen": -54.38595962524414, "logps/rejected": -79.28496551513672, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.91830974817276, "rewards/margins": 6.978662490844727, "rewards/rejected": -7.896972179412842, "step": 1117 }, { "epoch": 6.6251851851851855, "grad_norm": 4.97344196566781, "learning_rate": 4.188480105796005e-08, "logits/chosen": -1.1831059455871582, "logits/rejected": -1.311706304550171, "logps/chosen": -43.16436004638672, "logps/rejected": -64.7727279663086, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.9032515287399292, "rewards/margins": 5.030428409576416, "rewards/rejected": -5.933679580688477, "step": 1118 }, { "epoch": 6.631111111111111, "grad_norm": 5.224006568746007, "learning_rate": 4.1525556592364843e-08, "logits/chosen": -1.2230312824249268, "logits/rejected": -1.332690715789795, "logps/chosen": -62.0909423828125, "logps/rejected": -71.77101135253906, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": -1.2376177310943604, "rewards/margins": 4.808589935302734, "rewards/rejected": -6.046207904815674, "step": 1119 }, { "epoch": 6.637037037037037, "grad_norm": 2.4418988617214876, "learning_rate": 4.116771979309797e-08, "logits/chosen": -1.0453990697860718, "logits/rejected": -1.198245882987976, "logps/chosen": -37.76801300048828, "logps/rejected": -78.61469268798828, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -0.9890796542167664, "rewards/margins": 6.903560638427734, "rewards/rejected": -7.892640113830566, "step": 1120 }, { "epoch": 6.642962962962963, "grad_norm": 4.550581764599643, "learning_rate": 4.081129307635389e-08, "logits/chosen": -1.2914384603500366, "logits/rejected": -1.3126540184020996, "logps/chosen": -41.57066345214844, "logps/rejected": -60.63642883300781, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -0.47267046570777893, "rewards/margins": 4.954557418823242, "rewards/rejected": -5.427227973937988, "step": 1121 }, { "epoch": 6.648888888888889, "grad_norm": 3.4786713866790477, "learning_rate": 4.045627884880606e-08, "logits/chosen": -1.0296213626861572, "logits/rejected": -1.0632151365280151, "logps/chosen": -51.90285873413086, "logps/rejected": -85.03942108154297, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -0.3325456380844116, "rewards/margins": 6.909428596496582, "rewards/rejected": -7.241974830627441, "step": 1122 }, { "epoch": 6.654814814814815, "grad_norm": 4.9375617411503505, "learning_rate": 4.010267950759025e-08, "logits/chosen": -1.0786218643188477, "logits/rejected": -1.2250795364379883, "logps/chosen": -50.944637298583984, "logps/rejected": -80.50453186035156, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": -0.01511731743812561, "rewards/margins": 6.7941389083862305, "rewards/rejected": -6.809256553649902, "step": 1123 }, { "epoch": 6.66074074074074, "grad_norm": 3.1662482794291806, "learning_rate": 3.9750497440288935e-08, "logits/chosen": -1.2261343002319336, "logits/rejected": -1.1729711294174194, "logps/chosen": -60.549949645996094, "logps/rejected": -69.66276550292969, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": -1.525669813156128, "rewards/margins": 5.298177719116211, "rewards/rejected": -6.823847770690918, "step": 1124 }, { "epoch": 6.666666666666667, "grad_norm": 5.42672364034126, "learning_rate": 3.9399735024914475e-08, "logits/chosen": -1.334367036819458, "logits/rejected": -1.282650113105774, "logps/chosen": -44.250877380371094, "logps/rejected": -54.64846420288086, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": -0.5066497921943665, "rewards/margins": 4.384808540344238, "rewards/rejected": -4.891458511352539, "step": 1125 }, { "epoch": 6.672592592592593, "grad_norm": 3.901638006008318, "learning_rate": 3.905039462989365e-08, "logits/chosen": -1.0305792093276978, "logits/rejected": -1.0763057470321655, "logps/chosen": -56.9117431640625, "logps/rejected": -71.73992919921875, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": -1.381123661994934, "rewards/margins": 6.097377300262451, "rewards/rejected": -7.478500843048096, "step": 1126 }, { "epoch": 6.678518518518518, "grad_norm": 4.314515236806422, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -0.970765233039856, "logits/rejected": -0.9876160621643066, "logps/chosen": -38.278194427490234, "logps/rejected": -62.56399917602539, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 0.34831175208091736, "rewards/margins": 4.976291656494141, "rewards/rejected": -4.6279802322387695, "step": 1127 }, { "epoch": 6.684444444444445, "grad_norm": 4.337826544829072, "learning_rate": 3.835598932659476e-08, "logits/chosen": -1.2097077369689941, "logits/rejected": -1.2699313163757324, "logps/chosen": -52.94355773925781, "logps/rejected": -80.71701049804688, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -0.8386217355728149, "rewards/margins": 6.98582649230957, "rewards/rejected": -7.824448585510254, "step": 1128 }, { "epoch": 6.6903703703703705, "grad_norm": 4.042784792768212, "learning_rate": 3.801092910709749e-08, "logits/chosen": -1.3506526947021484, "logits/rejected": -1.4134248495101929, "logps/chosen": -48.646488189697266, "logps/rejected": -74.74803924560547, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": -0.7620642185211182, "rewards/margins": 5.172915935516357, "rewards/rejected": -5.934980392456055, "step": 1129 }, { "epoch": 6.696296296296296, "grad_norm": 3.4781757684910475, "learning_rate": 3.766730028548376e-08, "logits/chosen": -1.2849725484848022, "logits/rejected": -1.3207203149795532, "logps/chosen": -52.43779754638672, "logps/rejected": -69.90351867675781, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -1.533517599105835, "rewards/margins": 6.085549354553223, "rewards/rejected": -7.619067192077637, "step": 1130 }, { "epoch": 6.702222222222222, "grad_norm": 3.4010729959792934, "learning_rate": 3.732510518201265e-08, "logits/chosen": -0.9972199201583862, "logits/rejected": -0.9870040416717529, "logps/chosen": -60.04631805419922, "logps/rejected": -71.98896026611328, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -1.5988143682479858, "rewards/margins": 5.482958793640137, "rewards/rejected": -7.081772804260254, "step": 1131 }, { "epoch": 6.708148148148148, "grad_norm": 4.71128938913975, "learning_rate": 3.698434610726245e-08, "logits/chosen": -1.2611842155456543, "logits/rejected": -1.335821509361267, "logps/chosen": -46.10606384277344, "logps/rejected": -79.79691314697266, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -0.6949688792228699, "rewards/margins": 6.835414886474609, "rewards/rejected": -7.530384063720703, "step": 1132 }, { "epoch": 6.714074074074074, "grad_norm": 5.831314985346632, "learning_rate": 3.6645025362115e-08, "logits/chosen": -1.0676662921905518, "logits/rejected": -1.211071491241455, "logps/chosen": -57.91938400268555, "logps/rejected": -74.5068359375, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": -0.2757381796836853, "rewards/margins": 6.2339253425598145, "rewards/rejected": -6.5096635818481445, "step": 1133 }, { "epoch": 6.72, "grad_norm": 4.274701107831341, "learning_rate": 3.630714523774042e-08, "logits/chosen": -1.0759488344192505, "logits/rejected": -1.2164721488952637, "logps/chosen": -53.31672286987305, "logps/rejected": -85.29515838623047, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -1.1152231693267822, "rewards/margins": 6.755128860473633, "rewards/rejected": -7.870352268218994, "step": 1134 }, { "epoch": 6.725925925925926, "grad_norm": 3.4258860662014246, "learning_rate": 3.597070801558122e-08, "logits/chosen": -1.2027193307876587, "logits/rejected": -1.2718777656555176, "logps/chosen": -43.425384521484375, "logps/rejected": -81.82745361328125, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -1.1736352443695068, "rewards/margins": 7.256280899047852, "rewards/rejected": -8.429915428161621, "step": 1135 }, { "epoch": 6.731851851851852, "grad_norm": 5.044131099305807, "learning_rate": 3.563571596733722e-08, "logits/chosen": -1.3531447649002075, "logits/rejected": -1.4350122213363647, "logps/chosen": -47.775699615478516, "logps/rejected": -75.65559387207031, "loss": 0.0449, "rewards/accuracies": 0.9375, "rewards/chosen": -0.998325765132904, "rewards/margins": 4.952122688293457, "rewards/rejected": -5.950448036193848, "step": 1136 }, { "epoch": 6.737777777777778, "grad_norm": 3.879818573961412, "learning_rate": 3.530217135495006e-08, "logits/chosen": -1.0087562799453735, "logits/rejected": -1.107908844947815, "logps/chosen": -40.937007904052734, "logps/rejected": -78.79791259765625, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -0.9164285659790039, "rewards/margins": 6.570706367492676, "rewards/rejected": -7.48713493347168, "step": 1137 }, { "epoch": 6.743703703703703, "grad_norm": 3.8451358515328966, "learning_rate": 3.4970076430588027e-08, "logits/chosen": -0.9816167950630188, "logits/rejected": -1.1667194366455078, "logps/chosen": -36.36294937133789, "logps/rejected": -83.9415054321289, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -0.2798038423061371, "rewards/margins": 6.795148849487305, "rewards/rejected": -7.074953079223633, "step": 1138 }, { "epoch": 6.74962962962963, "grad_norm": 4.665882979218104, "learning_rate": 3.463943343663065e-08, "logits/chosen": -1.1199207305908203, "logits/rejected": -1.1523513793945312, "logps/chosen": -57.47267150878906, "logps/rejected": -87.13018798828125, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -1.884210467338562, "rewards/margins": 6.462403297424316, "rewards/rejected": -8.346613883972168, "step": 1139 }, { "epoch": 6.7555555555555555, "grad_norm": 4.597748052381748, "learning_rate": 3.4310244605653795e-08, "logits/chosen": -1.1553621292114258, "logits/rejected": -1.3642417192459106, "logps/chosen": -57.56583023071289, "logps/rejected": -76.71488952636719, "loss": 0.0353, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8032940626144409, "rewards/margins": 6.201355934143066, "rewards/rejected": -7.004650115966797, "step": 1140 }, { "epoch": 6.761481481481481, "grad_norm": 3.8361199615022836, "learning_rate": 3.3982512160414505e-08, "logits/chosen": -1.1486549377441406, "logits/rejected": -1.2162824869155884, "logps/chosen": -52.28083801269531, "logps/rejected": -80.21273803710938, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -0.817528247833252, "rewards/margins": 6.002938747406006, "rewards/rejected": -6.820467948913574, "step": 1141 }, { "epoch": 6.767407407407408, "grad_norm": 3.1360553065274446, "learning_rate": 3.365623831383599e-08, "logits/chosen": -1.184605598449707, "logits/rejected": -1.286274790763855, "logps/chosen": -51.320716857910156, "logps/rejected": -74.52813720703125, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -1.1115386486053467, "rewards/margins": 6.255901336669922, "rewards/rejected": -7.367440223693848, "step": 1142 }, { "epoch": 6.773333333333333, "grad_norm": 4.915326465375198, "learning_rate": 3.3331425268992547e-08, "logits/chosen": -1.0797728300094604, "logits/rejected": -1.0414516925811768, "logps/chosen": -40.94459915161133, "logps/rejected": -57.521244049072266, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": -0.39861059188842773, "rewards/margins": 5.553493499755859, "rewards/rejected": -5.952103614807129, "step": 1143 }, { "epoch": 6.779259259259259, "grad_norm": 4.6607098966794895, "learning_rate": 3.3008075219095045e-08, "logits/chosen": -1.36018705368042, "logits/rejected": -1.3997021913528442, "logps/chosen": -56.11627197265625, "logps/rejected": -75.92520141601562, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -0.5371124148368835, "rewards/margins": 5.539422512054443, "rewards/rejected": -6.076535224914551, "step": 1144 }, { "epoch": 6.785185185185185, "grad_norm": 4.308322609065028, "learning_rate": 3.268619034747566e-08, "logits/chosen": -1.0334553718566895, "logits/rejected": -1.0797995328903198, "logps/chosen": -44.05830764770508, "logps/rejected": -73.79558563232422, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -1.4985544681549072, "rewards/margins": 6.261322498321533, "rewards/rejected": -7.7598772048950195, "step": 1145 }, { "epoch": 6.791111111111111, "grad_norm": 2.806649138401611, "learning_rate": 3.236577282757347e-08, "logits/chosen": -1.1216260194778442, "logits/rejected": -1.196481704711914, "logps/chosen": -52.37408447265625, "logps/rejected": -60.706138610839844, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -0.9106160402297974, "rewards/margins": 4.475264072418213, "rewards/rejected": -5.385880470275879, "step": 1146 }, { "epoch": 6.797037037037037, "grad_norm": 4.0319117047594295, "learning_rate": 3.204682482291959e-08, "logits/chosen": -1.2100169658660889, "logits/rejected": -1.2417694330215454, "logps/chosen": -48.793582916259766, "logps/rejected": -58.74952697753906, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": -0.7654832601547241, "rewards/margins": 4.446800231933594, "rewards/rejected": -5.212283611297607, "step": 1147 }, { "epoch": 6.802962962962963, "grad_norm": 4.9909518633601255, "learning_rate": 3.172934848712272e-08, "logits/chosen": -1.1236099004745483, "logits/rejected": -1.1841087341308594, "logps/chosen": -38.755008697509766, "logps/rejected": -70.70816040039062, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -0.653393030166626, "rewards/margins": 5.56360387802124, "rewards/rejected": -6.216997146606445, "step": 1148 }, { "epoch": 6.808888888888889, "grad_norm": 3.4532209121659263, "learning_rate": 3.141334596385447e-08, "logits/chosen": -1.0277100801467896, "logits/rejected": -1.0570564270019531, "logps/chosen": -46.61890411376953, "logps/rejected": -69.73439025878906, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -0.9495557546615601, "rewards/margins": 6.949012756347656, "rewards/rejected": -7.898568153381348, "step": 1149 }, { "epoch": 6.814814814814815, "grad_norm": 5.871538897220252, "learning_rate": 3.109881938683492e-08, "logits/chosen": -1.1937267780303955, "logits/rejected": -1.285079002380371, "logps/chosen": -36.37516784667969, "logps/rejected": -74.77618408203125, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.38637667894363403, "rewards/margins": 7.374358177185059, "rewards/rejected": -7.760734558105469, "step": 1150 }, { "epoch": 6.8207407407407405, "grad_norm": 7.088095822908057, "learning_rate": 3.078577087981832e-08, "logits/chosen": -1.113420009613037, "logits/rejected": -1.0938458442687988, "logps/chosen": -50.963314056396484, "logps/rejected": -84.46855163574219, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": -1.373986840248108, "rewards/margins": 6.448185920715332, "rewards/rejected": -7.822172164916992, "step": 1151 }, { "epoch": 6.826666666666666, "grad_norm": 4.222520378565838, "learning_rate": 3.047420255657851e-08, "logits/chosen": -1.3878954648971558, "logits/rejected": -1.4309817552566528, "logps/chosen": -47.81982421875, "logps/rejected": -63.69879913330078, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -0.8947990536689758, "rewards/margins": 5.389825344085693, "rewards/rejected": -6.2846245765686035, "step": 1152 }, { "epoch": 6.832592592592593, "grad_norm": 4.126842704006626, "learning_rate": 3.016411652089493e-08, "logits/chosen": -1.2584927082061768, "logits/rejected": -1.3139593601226807, "logps/chosen": -45.37017822265625, "logps/rejected": -72.07698059082031, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -1.2568224668502808, "rewards/margins": 5.785853385925293, "rewards/rejected": -7.042675971984863, "step": 1153 }, { "epoch": 6.838518518518518, "grad_norm": 3.3946248488291246, "learning_rate": 2.985551486653823e-08, "logits/chosen": -1.1920702457427979, "logits/rejected": -1.3103349208831787, "logps/chosen": -46.732398986816406, "logps/rejected": -82.87397766113281, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -0.6517075300216675, "rewards/margins": 7.334443092346191, "rewards/rejected": -7.986151218414307, "step": 1154 }, { "epoch": 6.844444444444444, "grad_norm": 3.4037349577235188, "learning_rate": 2.954839967725617e-08, "logits/chosen": -1.303606390953064, "logits/rejected": -1.3001573085784912, "logps/chosen": -58.52591323852539, "logps/rejected": -64.23796081542969, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": -1.1647790670394897, "rewards/margins": 5.34108829498291, "rewards/rejected": -6.5058674812316895, "step": 1155 }, { "epoch": 6.850370370370371, "grad_norm": 5.007534848746155, "learning_rate": 2.924277302675962e-08, "logits/chosen": -1.1423561573028564, "logits/rejected": -1.1317256689071655, "logps/chosen": -52.439727783203125, "logps/rejected": -66.79461669921875, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": -0.7898544073104858, "rewards/margins": 5.347036361694336, "rewards/rejected": -6.136890888214111, "step": 1156 }, { "epoch": 6.856296296296296, "grad_norm": 4.863091162067616, "learning_rate": 2.893863697870841e-08, "logits/chosen": -1.1290535926818848, "logits/rejected": -1.1492767333984375, "logps/chosen": -51.96717071533203, "logps/rejected": -61.89137268066406, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -0.8053914308547974, "rewards/margins": 4.911278247833252, "rewards/rejected": -5.71666955947876, "step": 1157 }, { "epoch": 6.862222222222222, "grad_norm": 4.046600505355742, "learning_rate": 2.863599358669755e-08, "logits/chosen": -1.3739553689956665, "logits/rejected": -1.424708366394043, "logps/chosen": -55.446754455566406, "logps/rejected": -78.931884765625, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": -0.08441561460494995, "rewards/margins": 6.4412431716918945, "rewards/rejected": -6.52565860748291, "step": 1158 }, { "epoch": 6.868148148148148, "grad_norm": 4.797543953031611, "learning_rate": 2.8334844894243287e-08, "logits/chosen": -1.2137575149536133, "logits/rejected": -1.211321234703064, "logps/chosen": -74.98422241210938, "logps/rejected": -88.64210510253906, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": -1.568566918373108, "rewards/margins": 5.5392656326293945, "rewards/rejected": -7.107832431793213, "step": 1159 }, { "epoch": 6.874074074074074, "grad_norm": 4.909249345613524, "learning_rate": 2.803519293476936e-08, "logits/chosen": -1.3701822757720947, "logits/rejected": -1.3919110298156738, "logps/chosen": -51.087623596191406, "logps/rejected": -68.07141876220703, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": -0.7531724572181702, "rewards/margins": 6.090072154998779, "rewards/rejected": -6.843244552612305, "step": 1160 }, { "epoch": 6.88, "grad_norm": 4.722536988352096, "learning_rate": 2.7737039731593138e-08, "logits/chosen": -1.246882677078247, "logits/rejected": -1.3100802898406982, "logps/chosen": -63.127540588378906, "logps/rejected": -75.79826354980469, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -0.8006022572517395, "rewards/margins": 4.3295793533325195, "rewards/rejected": -5.130181789398193, "step": 1161 }, { "epoch": 6.885925925925926, "grad_norm": 3.4791404660178036, "learning_rate": 2.7440387297912122e-08, "logits/chosen": -1.0389580726623535, "logits/rejected": -1.0971720218658447, "logps/chosen": -41.66819763183594, "logps/rejected": -66.16861724853516, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": 0.5657974481582642, "rewards/margins": 6.39125394821167, "rewards/rejected": -5.825456619262695, "step": 1162 }, { "epoch": 6.891851851851852, "grad_norm": 3.411294212428707, "learning_rate": 2.7145237636790276e-08, "logits/chosen": -1.1562262773513794, "logits/rejected": -1.1969718933105469, "logps/chosen": -48.71192932128906, "logps/rejected": -68.86924743652344, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -0.32160070538520813, "rewards/margins": 5.062057018280029, "rewards/rejected": -5.383657455444336, "step": 1163 }, { "epoch": 6.897777777777778, "grad_norm": 4.292288857190118, "learning_rate": 2.685159274114443e-08, "logits/chosen": -1.2545537948608398, "logits/rejected": -1.2724109888076782, "logps/chosen": -44.735939025878906, "logps/rejected": -63.776039123535156, "loss": 0.0404, "rewards/accuracies": 1.0, "rewards/chosen": -0.2329655885696411, "rewards/margins": 5.270541191101074, "rewards/rejected": -5.503507137298584, "step": 1164 }, { "epoch": 6.9037037037037035, "grad_norm": 4.226084967042107, "learning_rate": 2.6559454593731072e-08, "logits/chosen": -1.0003001689910889, "logits/rejected": -1.0545388460159302, "logps/chosen": -50.38365936279297, "logps/rejected": -84.3214111328125, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -1.7823373079299927, "rewards/margins": 6.4346923828125, "rewards/rejected": -8.217029571533203, "step": 1165 }, { "epoch": 6.90962962962963, "grad_norm": 2.3928241797464476, "learning_rate": 2.6268825167132636e-08, "logits/chosen": -1.0937495231628418, "logits/rejected": -1.0489561557769775, "logps/chosen": -47.507728576660156, "logps/rejected": -68.5752944946289, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -0.18306052684783936, "rewards/margins": 5.563224792480469, "rewards/rejected": -5.746285915374756, "step": 1166 }, { "epoch": 6.915555555555556, "grad_norm": 3.6898784855448032, "learning_rate": 2.5979706423744392e-08, "logits/chosen": -1.246962547302246, "logits/rejected": -1.2974610328674316, "logps/chosen": -48.59234619140625, "logps/rejected": -62.70619201660156, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -1.167926549911499, "rewards/margins": 5.319080352783203, "rewards/rejected": -6.487007141113281, "step": 1167 }, { "epoch": 6.921481481481481, "grad_norm": 5.41093492527125, "learning_rate": 2.5692100315761023e-08, "logits/chosen": -1.123171329498291, "logits/rejected": -1.0399839878082275, "logps/chosen": -74.13027954101562, "logps/rejected": -95.59664916992188, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": -1.8181382417678833, "rewards/margins": 5.603046417236328, "rewards/rejected": -7.421184539794922, "step": 1168 }, { "epoch": 6.927407407407408, "grad_norm": 2.4175779715247523, "learning_rate": 2.5406008785163717e-08, "logits/chosen": -1.2619564533233643, "logits/rejected": -1.2582941055297852, "logps/chosen": -58.20111846923828, "logps/rejected": -74.21757507324219, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -1.2906043529510498, "rewards/margins": 5.086883544921875, "rewards/rejected": -6.377488136291504, "step": 1169 }, { "epoch": 6.933333333333334, "grad_norm": 3.144038000067894, "learning_rate": 2.512143376370682e-08, "logits/chosen": -1.0144656896591187, "logits/rejected": -1.139504075050354, "logps/chosen": -40.94801330566406, "logps/rejected": -58.50607681274414, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 0.32384026050567627, "rewards/margins": 4.669650554656982, "rewards/rejected": -4.345810413360596, "step": 1170 }, { "epoch": 6.939259259259259, "grad_norm": 5.174012697298315, "learning_rate": 2.4838377172904907e-08, "logits/chosen": -1.117937684059143, "logits/rejected": -1.0629061460494995, "logps/chosen": -57.18743896484375, "logps/rejected": -70.98674011230469, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": -0.9007591009140015, "rewards/margins": 5.176748275756836, "rewards/rejected": -6.077507495880127, "step": 1171 }, { "epoch": 6.945185185185185, "grad_norm": 3.0198485141881424, "learning_rate": 2.455684092401969e-08, "logits/chosen": -1.0965194702148438, "logits/rejected": -1.2074683904647827, "logps/chosen": -33.904380798339844, "logps/rejected": -64.75767517089844, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 0.016904570162296295, "rewards/margins": 6.192440986633301, "rewards/rejected": -6.175536632537842, "step": 1172 }, { "epoch": 6.9511111111111115, "grad_norm": 2.978789862112312, "learning_rate": 2.4276826918047277e-08, "logits/chosen": -1.2849714756011963, "logits/rejected": -1.372328519821167, "logps/chosen": -62.24251937866211, "logps/rejected": -86.72196960449219, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -0.981757402420044, "rewards/margins": 6.014976978302002, "rewards/rejected": -6.996734619140625, "step": 1173 }, { "epoch": 6.957037037037037, "grad_norm": 4.507694763531846, "learning_rate": 2.399833704570517e-08, "logits/chosen": -1.2926836013793945, "logits/rejected": -1.305148959159851, "logps/chosen": -41.6738395690918, "logps/rejected": -58.01539611816406, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": -0.233668252825737, "rewards/margins": 4.863929748535156, "rewards/rejected": -5.097597599029541, "step": 1174 }, { "epoch": 6.962962962962963, "grad_norm": 3.542058417329551, "learning_rate": 2.372137318741968e-08, "logits/chosen": -1.116047739982605, "logits/rejected": -1.0878998041152954, "logps/chosen": -61.21372985839844, "logps/rejected": -87.04780578613281, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -0.5796356201171875, "rewards/margins": 5.676990509033203, "rewards/rejected": -6.256626129150391, "step": 1175 }, { "epoch": 6.968888888888889, "grad_norm": 3.7533160256479423, "learning_rate": 2.3445937213313062e-08, "logits/chosen": -1.0332417488098145, "logits/rejected": -1.062882900238037, "logps/chosen": -67.98162841796875, "logps/rejected": -80.46829986572266, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -1.066481113433838, "rewards/margins": 6.786133289337158, "rewards/rejected": -7.852613925933838, "step": 1176 }, { "epoch": 6.974814814814815, "grad_norm": 3.7587867716967054, "learning_rate": 2.3172030983190926e-08, "logits/chosen": -1.0796126127243042, "logits/rejected": -1.1577988862991333, "logps/chosen": -44.99909210205078, "logps/rejected": -59.904823303222656, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": -0.5940638780593872, "rewards/margins": 5.257760047912598, "rewards/rejected": -5.851823806762695, "step": 1177 }, { "epoch": 6.980740740740741, "grad_norm": 4.682551019970592, "learning_rate": 2.2899656346529768e-08, "logits/chosen": -1.3066891431808472, "logits/rejected": -1.3407034873962402, "logps/chosen": -51.310062408447266, "logps/rejected": -53.79896926879883, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -1.3059674501419067, "rewards/margins": 3.983776330947876, "rewards/rejected": -5.289743423461914, "step": 1178 }, { "epoch": 6.986666666666666, "grad_norm": 4.375765145580466, "learning_rate": 2.2628815142464342e-08, "logits/chosen": -1.1559022665023804, "logits/rejected": -1.1550366878509521, "logps/chosen": -47.062721252441406, "logps/rejected": -75.09408569335938, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -1.0067358016967773, "rewards/margins": 6.008578777313232, "rewards/rejected": -7.015315055847168, "step": 1179 }, { "epoch": 6.992592592592593, "grad_norm": 4.4626280624260435, "learning_rate": 2.2359509199775446e-08, "logits/chosen": -0.8305215835571289, "logits/rejected": -0.870010495185852, "logps/chosen": -58.898162841796875, "logps/rejected": -82.0947494506836, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5731546878814697, "rewards/margins": 7.211018085479736, "rewards/rejected": -8.784172058105469, "step": 1180 }, { "epoch": 6.998518518518519, "grad_norm": 5.073247622191449, "learning_rate": 2.2091740336877358e-08, "logits/chosen": -1.0570170879364014, "logits/rejected": -1.1461822986602783, "logps/chosen": -52.41472625732422, "logps/rejected": -97.20932006835938, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -1.4183259010314941, "rewards/margins": 6.313093185424805, "rewards/rejected": -7.731419563293457, "step": 1181 }, { "epoch": 7.004444444444444, "grad_norm": 3.8474555895231273, "learning_rate": 2.1825510361805576e-08, "logits/chosen": -1.2037731409072876, "logits/rejected": -1.2693171501159668, "logps/chosen": -41.03017807006836, "logps/rejected": -64.76742553710938, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -0.069698765873909, "rewards/margins": 6.444377899169922, "rewards/rejected": -6.5140767097473145, "step": 1182 }, { "epoch": 7.010370370370371, "grad_norm": 2.9881585778020154, "learning_rate": 2.156082107220486e-08, "logits/chosen": -1.073196530342102, "logits/rejected": -1.1584742069244385, "logps/chosen": -45.10144805908203, "logps/rejected": -64.95097351074219, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -1.0941698551177979, "rewards/margins": 5.135311126708984, "rewards/rejected": -6.2294816970825195, "step": 1183 }, { "epoch": 7.0162962962962965, "grad_norm": 5.276512414256135, "learning_rate": 2.129767425531673e-08, "logits/chosen": -1.321061611175537, "logits/rejected": -1.3409541845321655, "logps/chosen": -58.39825439453125, "logps/rejected": -74.25123596191406, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -1.1586742401123047, "rewards/margins": 5.59089469909668, "rewards/rejected": -6.749568939208984, "step": 1184 }, { "epoch": 7.022222222222222, "grad_norm": 4.469226836673804, "learning_rate": 2.1036071687967783e-08, "logits/chosen": -1.1318987607955933, "logits/rejected": -1.1667543649673462, "logps/chosen": -56.4000244140625, "logps/rejected": -64.8052978515625, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -1.6219148635864258, "rewards/margins": 4.586970806121826, "rewards/rejected": -6.208885669708252, "step": 1185 }, { "epoch": 7.028148148148148, "grad_norm": 4.6713640086194586, "learning_rate": 2.077601513655733e-08, "logits/chosen": -1.2517354488372803, "logits/rejected": -1.342995047569275, "logps/chosen": -41.56428146362305, "logps/rejected": -54.71381378173828, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -0.5574272871017456, "rewards/margins": 4.39409875869751, "rewards/rejected": -4.951525688171387, "step": 1186 }, { "epoch": 7.034074074074074, "grad_norm": 3.914809073017115, "learning_rate": 2.0517506357045715e-08, "logits/chosen": -1.1398996114730835, "logits/rejected": -1.226372241973877, "logps/chosen": -49.662784576416016, "logps/rejected": -89.25350952148438, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -0.5754448175430298, "rewards/margins": 6.921874046325684, "rewards/rejected": -7.497318267822266, "step": 1187 }, { "epoch": 7.04, "grad_norm": 4.107490647117851, "learning_rate": 2.0260547094942348e-08, "logits/chosen": -1.140151858329773, "logits/rejected": -1.2162840366363525, "logps/chosen": -44.47743225097656, "logps/rejected": -61.597591400146484, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": -0.7503057718276978, "rewards/margins": 5.507844924926758, "rewards/rejected": -6.258151054382324, "step": 1188 }, { "epoch": 7.045925925925926, "grad_norm": 6.6860011316021275, "learning_rate": 2.0005139085293942e-08, "logits/chosen": -1.1185882091522217, "logits/rejected": -1.1940538883209229, "logps/chosen": -61.718780517578125, "logps/rejected": -80.43623352050781, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -0.8451882004737854, "rewards/margins": 6.592974662780762, "rewards/rejected": -7.4381632804870605, "step": 1189 }, { "epoch": 7.051851851851852, "grad_norm": 4.366270256335024, "learning_rate": 1.9751284052672873e-08, "logits/chosen": -1.0844464302062988, "logits/rejected": -1.151923418045044, "logps/chosen": -45.92787170410156, "logps/rejected": -66.75313568115234, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.3969569206237793, "rewards/margins": 5.534310817718506, "rewards/rejected": -5.931267738342285, "step": 1190 }, { "epoch": 7.057777777777778, "grad_norm": 4.510310947071034, "learning_rate": 1.9498983711165345e-08, "logits/chosen": -0.9310641884803772, "logits/rejected": -0.9676129221916199, "logps/chosen": -44.20957565307617, "logps/rejected": -80.73597717285156, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -1.5753066539764404, "rewards/margins": 7.630600929260254, "rewards/rejected": -9.205906867980957, "step": 1191 }, { "epoch": 7.063703703703704, "grad_norm": 3.702033296356849, "learning_rate": 1.9248239764360048e-08, "logits/chosen": -1.0506380796432495, "logits/rejected": -1.1215331554412842, "logps/chosen": -60.218170166015625, "logps/rejected": -61.922786712646484, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": -0.5812512040138245, "rewards/margins": 4.8047709465026855, "rewards/rejected": -5.386022567749023, "step": 1192 }, { "epoch": 7.069629629629629, "grad_norm": 3.9221091135639763, "learning_rate": 1.899905390533649e-08, "logits/chosen": -1.0444316864013672, "logits/rejected": -1.1364364624023438, "logps/chosen": -49.269317626953125, "logps/rejected": -74.48098754882812, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -0.8035168647766113, "rewards/margins": 5.918756008148193, "rewards/rejected": -6.722272872924805, "step": 1193 }, { "epoch": 7.075555555555556, "grad_norm": 3.475733014246764, "learning_rate": 1.8751427816653618e-08, "logits/chosen": -1.145141839981079, "logits/rejected": -1.1588716506958008, "logps/chosen": -39.95106506347656, "logps/rejected": -55.05951690673828, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.6812821626663208, "rewards/margins": 5.355863571166992, "rewards/rejected": -6.037146091461182, "step": 1194 }, { "epoch": 7.0814814814814815, "grad_norm": 5.461483104500958, "learning_rate": 1.8505363170338517e-08, "logits/chosen": -1.1386044025421143, "logits/rejected": -1.211320161819458, "logps/chosen": -54.65919494628906, "logps/rejected": -76.21634674072266, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -0.6187160015106201, "rewards/margins": 6.0350341796875, "rewards/rejected": -6.653749942779541, "step": 1195 }, { "epoch": 7.087407407407407, "grad_norm": 3.537228852616376, "learning_rate": 1.826086162787499e-08, "logits/chosen": -1.234696388244629, "logits/rejected": -1.2426447868347168, "logps/chosen": -49.69206237792969, "logps/rejected": -59.662872314453125, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -0.455064058303833, "rewards/margins": 4.351738452911377, "rewards/rejected": -4.806802272796631, "step": 1196 }, { "epoch": 7.093333333333334, "grad_norm": 4.822202350694429, "learning_rate": 1.8017924840192433e-08, "logits/chosen": -1.178159475326538, "logits/rejected": -1.178950548171997, "logps/chosen": -49.88624572753906, "logps/rejected": -64.72603607177734, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": -0.6928927898406982, "rewards/margins": 5.600357532501221, "rewards/rejected": -6.29325008392334, "step": 1197 }, { "epoch": 7.099259259259259, "grad_norm": 2.6458497013464735, "learning_rate": 1.7776554447654717e-08, "logits/chosen": -1.0108141899108887, "logits/rejected": -1.0034370422363281, "logps/chosen": -46.219276428222656, "logps/rejected": -61.129150390625, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -0.47822344303131104, "rewards/margins": 5.80910062789917, "rewards/rejected": -6.287323951721191, "step": 1198 }, { "epoch": 7.105185185185185, "grad_norm": 4.366914015353461, "learning_rate": 1.7536752080048955e-08, "logits/chosen": -1.1089617013931274, "logits/rejected": -1.0810322761535645, "logps/chosen": -54.921653747558594, "logps/rejected": -78.62164306640625, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -1.7407786846160889, "rewards/margins": 5.562222480773926, "rewards/rejected": -7.303001403808594, "step": 1199 }, { "epoch": 7.111111111111111, "grad_norm": 4.761671953709213, "learning_rate": 1.7298519356574726e-08, "logits/chosen": -1.31987726688385, "logits/rejected": -1.3181467056274414, "logps/chosen": -50.47599792480469, "logps/rejected": -63.006614685058594, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -0.9234836101531982, "rewards/margins": 4.418302536010742, "rewards/rejected": -5.3417863845825195, "step": 1200 }, { "epoch": 7.117037037037037, "grad_norm": 4.101005539222599, "learning_rate": 1.706185788583289e-08, "logits/chosen": -1.2375043630599976, "logits/rejected": -1.2391877174377441, "logps/chosen": -51.08970260620117, "logps/rejected": -73.64395141601562, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": -0.4969186782836914, "rewards/margins": 6.408106803894043, "rewards/rejected": -6.905025959014893, "step": 1201 }, { "epoch": 7.122962962962963, "grad_norm": 5.073736905951011, "learning_rate": 1.6826769265815e-08, "logits/chosen": -1.2112133502960205, "logits/rejected": -1.2583060264587402, "logps/chosen": -43.31800842285156, "logps/rejected": -69.34353637695312, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.5988361835479736, "rewards/margins": 5.34016227722168, "rewards/rejected": -5.938999176025391, "step": 1202 }, { "epoch": 7.128888888888889, "grad_norm": 4.8290789301264, "learning_rate": 1.6593255083892228e-08, "logits/chosen": -1.1223499774932861, "logits/rejected": -1.1824976205825806, "logps/chosen": -53.52474594116211, "logps/rejected": -82.81754302978516, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": -1.459550142288208, "rewards/margins": 6.309607028961182, "rewards/rejected": -7.7691569328308105, "step": 1203 }, { "epoch": 7.134814814814815, "grad_norm": 3.433072476835311, "learning_rate": 1.6361316916804896e-08, "logits/chosen": -1.1100119352340698, "logits/rejected": -1.1950923204421997, "logps/chosen": -48.51715087890625, "logps/rejected": -69.97017669677734, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 0.13577726483345032, "rewards/margins": 5.7283935546875, "rewards/rejected": -5.592616081237793, "step": 1204 }, { "epoch": 7.140740740740741, "grad_norm": 2.8445682548846714, "learning_rate": 1.6130956330651646e-08, "logits/chosen": -1.1339422464370728, "logits/rejected": -1.1973767280578613, "logps/chosen": -38.242767333984375, "logps/rejected": -55.585391998291016, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": -0.7097413539886475, "rewards/margins": 5.838712692260742, "rewards/rejected": -6.548454284667969, "step": 1205 }, { "epoch": 7.1466666666666665, "grad_norm": 3.6214945778065273, "learning_rate": 1.5902174880878916e-08, "logits/chosen": -1.0982111692428589, "logits/rejected": -1.2695142030715942, "logps/chosen": -42.53776550292969, "logps/rejected": -65.62220001220703, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -0.09825128316879272, "rewards/margins": 5.4457688331604, "rewards/rejected": -5.54401969909668, "step": 1206 }, { "epoch": 7.152592592592592, "grad_norm": 3.5602534628119087, "learning_rate": 1.567497411227059e-08, "logits/chosen": -1.3598968982696533, "logits/rejected": -1.3619459867477417, "logps/chosen": -59.65229797363281, "logps/rejected": -74.8875961303711, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -1.338700771331787, "rewards/margins": 6.547931671142578, "rewards/rejected": -7.886631965637207, "step": 1207 }, { "epoch": 7.158518518518519, "grad_norm": 5.162761564983183, "learning_rate": 1.5449355558937337e-08, "logits/chosen": -1.2608649730682373, "logits/rejected": -1.166750192642212, "logps/chosen": -61.810123443603516, "logps/rejected": -73.02722930908203, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": -1.4834518432617188, "rewards/margins": 5.531788349151611, "rewards/rejected": -7.01524019241333, "step": 1208 }, { "epoch": 7.164444444444444, "grad_norm": 6.5270072472063525, "learning_rate": 1.5225320744306408e-08, "logits/chosen": -1.0810682773590088, "logits/rejected": -1.2014857530593872, "logps/chosen": -42.50169372558594, "logps/rejected": -72.0819320678711, "loss": 0.0589, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6136506795883179, "rewards/margins": 6.531806468963623, "rewards/rejected": -7.145456790924072, "step": 1209 }, { "epoch": 7.17037037037037, "grad_norm": 3.940199460474413, "learning_rate": 1.5002871181111153e-08, "logits/chosen": -1.1244795322418213, "logits/rejected": -1.2221897840499878, "logps/chosen": -46.386444091796875, "logps/rejected": -64.42037963867188, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": -1.1717915534973145, "rewards/margins": 5.057167053222656, "rewards/rejected": -6.228959083557129, "step": 1210 }, { "epoch": 7.176296296296297, "grad_norm": 4.069926138117332, "learning_rate": 1.4782008371381105e-08, "logits/chosen": -1.1110057830810547, "logits/rejected": -1.1199915409088135, "logps/chosen": -50.08976364135742, "logps/rejected": -73.6676254272461, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -2.0431034564971924, "rewards/margins": 5.300725936889648, "rewards/rejected": -7.34382963180542, "step": 1211 }, { "epoch": 7.182222222222222, "grad_norm": 4.768911394055649, "learning_rate": 1.4562733806431666e-08, "logits/chosen": -1.1371710300445557, "logits/rejected": -1.261394739151001, "logps/chosen": -40.512229919433594, "logps/rejected": -63.53961181640625, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 0.15543416142463684, "rewards/margins": 5.225121974945068, "rewards/rejected": -5.069687366485596, "step": 1212 }, { "epoch": 7.188148148148148, "grad_norm": 3.9377568820612456, "learning_rate": 1.434504896685393e-08, "logits/chosen": -1.144298791885376, "logits/rejected": -1.2841874361038208, "logps/chosen": -48.18315505981445, "logps/rejected": -60.86648178100586, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -0.1481950581073761, "rewards/margins": 4.970972537994385, "rewards/rejected": -5.119167327880859, "step": 1213 }, { "epoch": 7.194074074074074, "grad_norm": 3.9266549802613646, "learning_rate": 1.4128955322504965e-08, "logits/chosen": -1.174392580986023, "logits/rejected": -1.1449965238571167, "logps/chosen": -54.98040771484375, "logps/rejected": -75.45690155029297, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -0.3021417260169983, "rewards/margins": 4.63749885559082, "rewards/rejected": -4.939640998840332, "step": 1214 }, { "epoch": 7.2, "grad_norm": 3.190383440781884, "learning_rate": 1.3914454332497604e-08, "logits/chosen": -0.9599422216415405, "logits/rejected": -1.0563316345214844, "logps/chosen": -47.49116897583008, "logps/rejected": -60.74289321899414, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -0.8696696758270264, "rewards/margins": 6.098237037658691, "rewards/rejected": -6.967906951904297, "step": 1215 }, { "epoch": 7.205925925925926, "grad_norm": 4.429967683063817, "learning_rate": 1.3701547445190836e-08, "logits/chosen": -1.3367496728897095, "logits/rejected": -1.3097389936447144, "logps/chosen": -61.743629455566406, "logps/rejected": -101.25115966796875, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -1.5083428621292114, "rewards/margins": 6.84088134765625, "rewards/rejected": -8.349224090576172, "step": 1216 }, { "epoch": 7.2118518518518515, "grad_norm": 4.134451946730424, "learning_rate": 1.3490236098179813e-08, "logits/chosen": -1.0464622974395752, "logits/rejected": -1.1230379343032837, "logps/chosen": -56.35862731933594, "logps/rejected": -86.72453308105469, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -1.0451548099517822, "rewards/margins": 6.443534851074219, "rewards/rejected": -7.488689422607422, "step": 1217 }, { "epoch": 7.217777777777778, "grad_norm": 4.322416018243353, "learning_rate": 1.3280521718286253e-08, "logits/chosen": -1.0510904788970947, "logits/rejected": -1.151281714439392, "logps/chosen": -46.579933166503906, "logps/rejected": -64.40127563476562, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -1.2190048694610596, "rewards/margins": 4.525186538696289, "rewards/rejected": -5.744192123413086, "step": 1218 }, { "epoch": 7.223703703703704, "grad_norm": 4.623060793873189, "learning_rate": 1.3072405721548857e-08, "logits/chosen": -1.1600358486175537, "logits/rejected": -1.2311742305755615, "logps/chosen": -61.91407775878906, "logps/rejected": -62.87279510498047, "loss": 0.0493, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4685288965702057, "rewards/margins": 4.3588385581970215, "rewards/rejected": -4.827367305755615, "step": 1219 }, { "epoch": 7.229629629629629, "grad_norm": 3.015880673644393, "learning_rate": 1.2865889513213628e-08, "logits/chosen": -1.3033242225646973, "logits/rejected": -1.2112292051315308, "logps/chosen": -47.12859344482422, "logps/rejected": -62.87380599975586, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -0.9500541687011719, "rewards/margins": 5.1967973709106445, "rewards/rejected": -6.146851539611816, "step": 1220 }, { "epoch": 7.235555555555556, "grad_norm": 8.743988585405903, "learning_rate": 1.2660974487724407e-08, "logits/chosen": -1.0260766744613647, "logits/rejected": -1.1828255653381348, "logps/chosen": -47.46949768066406, "logps/rejected": -71.39844512939453, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": -1.4114501476287842, "rewards/margins": 6.981899738311768, "rewards/rejected": -8.393349647521973, "step": 1221 }, { "epoch": 7.241481481481482, "grad_norm": 3.606950533677255, "learning_rate": 1.2457662028713594e-08, "logits/chosen": -1.1686749458312988, "logits/rejected": -1.2729439735412598, "logps/chosen": -38.59318923950195, "logps/rejected": -68.04505920410156, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": -0.5534713268280029, "rewards/margins": 5.194384574890137, "rewards/rejected": -5.747856140136719, "step": 1222 }, { "epoch": 7.247407407407407, "grad_norm": 4.021604241001502, "learning_rate": 1.2255953508992612e-08, "logits/chosen": -1.107062816619873, "logits/rejected": -1.186793565750122, "logps/chosen": -50.838260650634766, "logps/rejected": -82.80189514160156, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -2.042785167694092, "rewards/margins": 6.423619270324707, "rewards/rejected": -8.466404914855957, "step": 1223 }, { "epoch": 7.253333333333333, "grad_norm": 3.446655110015099, "learning_rate": 1.205585029054279e-08, "logits/chosen": -1.2475371360778809, "logits/rejected": -1.3144434690475464, "logps/chosen": -52.76904296875, "logps/rejected": -75.10376739501953, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": -1.8716415166854858, "rewards/margins": 6.061808109283447, "rewards/rejected": -7.9334492683410645, "step": 1224 }, { "epoch": 7.2592592592592595, "grad_norm": 5.250885428095321, "learning_rate": 1.1857353724505942e-08, "logits/chosen": -1.0257887840270996, "logits/rejected": -1.0964391231536865, "logps/chosen": -60.26069641113281, "logps/rejected": -83.70278930664062, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -1.4667550325393677, "rewards/margins": 6.692720413208008, "rewards/rejected": -8.159475326538086, "step": 1225 }, { "epoch": 7.265185185185185, "grad_norm": 3.610830532984277, "learning_rate": 1.1660465151175664e-08, "logits/chosen": -1.0537807941436768, "logits/rejected": -1.062589406967163, "logps/chosen": -47.4190559387207, "logps/rejected": -73.4510498046875, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -1.360797643661499, "rewards/margins": 6.665514945983887, "rewards/rejected": -8.026312828063965, "step": 1226 }, { "epoch": 7.271111111111111, "grad_norm": 3.880057572525301, "learning_rate": 1.1465185899987794e-08, "logits/chosen": -1.1795921325683594, "logits/rejected": -1.1620832681655884, "logps/chosen": -46.04042053222656, "logps/rejected": -67.0638198852539, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.7089900970458984, "rewards/margins": 5.227532863616943, "rewards/rejected": -5.936522960662842, "step": 1227 }, { "epoch": 7.277037037037037, "grad_norm": 2.614232435052489, "learning_rate": 1.1271517289511783e-08, "logits/chosen": -1.2677268981933594, "logits/rejected": -1.2671798467636108, "logps/chosen": -45.25715255737305, "logps/rejected": -61.537899017333984, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -0.3847803771495819, "rewards/margins": 4.929067134857178, "rewards/rejected": -5.313847541809082, "step": 1228 }, { "epoch": 7.282962962962963, "grad_norm": 3.9669190076578276, "learning_rate": 1.1079460627441666e-08, "logits/chosen": -1.1752060651779175, "logits/rejected": -1.3029592037200928, "logps/chosen": -32.447357177734375, "logps/rejected": -65.22743225097656, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": 0.12089192867279053, "rewards/margins": 5.483150482177734, "rewards/rejected": -5.3622589111328125, "step": 1229 }, { "epoch": 7.288888888888889, "grad_norm": 2.4583403700175124, "learning_rate": 1.0889017210587215e-08, "logits/chosen": -1.0788657665252686, "logits/rejected": -1.1975395679473877, "logps/chosen": -48.47402572631836, "logps/rejected": -100.1389389038086, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -0.9301841259002686, "rewards/margins": 8.300827026367188, "rewards/rejected": -9.231012344360352, "step": 1230 }, { "epoch": 7.294814814814814, "grad_norm": 6.2132480824108, "learning_rate": 1.0700188324865189e-08, "logits/chosen": -1.0446033477783203, "logits/rejected": -1.076178789138794, "logps/chosen": -57.706825256347656, "logps/rejected": -82.1580810546875, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": 0.013994604349136353, "rewards/margins": 6.870332717895508, "rewards/rejected": -6.856338024139404, "step": 1231 }, { "epoch": 7.300740740740741, "grad_norm": 3.456394568063387, "learning_rate": 1.0512975245290685e-08, "logits/chosen": -1.0236254930496216, "logits/rejected": -1.1054068803787231, "logps/chosen": -32.16044998168945, "logps/rejected": -55.15676498413086, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 0.17881041765213013, "rewards/margins": 4.75008487701416, "rewards/rejected": -4.571274280548096, "step": 1232 }, { "epoch": 7.306666666666667, "grad_norm": 3.5615300664844183, "learning_rate": 1.0327379235968548e-08, "logits/chosen": -1.2441649436950684, "logits/rejected": -1.2749804258346558, "logps/chosen": -39.167240142822266, "logps/rejected": -60.43924331665039, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -0.48208075761795044, "rewards/margins": 5.207080841064453, "rewards/rejected": -5.689162254333496, "step": 1233 }, { "epoch": 7.312592592592592, "grad_norm": 5.390607432659595, "learning_rate": 1.0143401550084751e-08, "logits/chosen": -1.1963176727294922, "logits/rejected": -1.3843016624450684, "logps/chosen": -40.927879333496094, "logps/rejected": -83.95448303222656, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": -0.12738998234272003, "rewards/margins": 5.321667671203613, "rewards/rejected": -5.449057579040527, "step": 1234 }, { "epoch": 7.318518518518519, "grad_norm": 3.142358405413608, "learning_rate": 9.961043429898036e-09, "logits/chosen": -1.145086407661438, "logits/rejected": -1.2318087816238403, "logps/chosen": -68.139892578125, "logps/rejected": -79.01605987548828, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -1.054990291595459, "rewards/margins": 6.7228546142578125, "rewards/rejected": -7.7778449058532715, "step": 1235 }, { "epoch": 7.3244444444444445, "grad_norm": 3.789600903060908, "learning_rate": 9.780306106731418e-09, "logits/chosen": -0.9109461903572083, "logits/rejected": -1.087510585784912, "logps/chosen": -46.758087158203125, "logps/rejected": -79.30746459960938, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.5821919441223145, "rewards/margins": 6.765468120574951, "rewards/rejected": -7.347660064697266, "step": 1236 }, { "epoch": 7.33037037037037, "grad_norm": 6.659386685161879, "learning_rate": 9.601190800963942e-09, "logits/chosen": -1.1415681838989258, "logits/rejected": -1.2213099002838135, "logps/chosen": -37.742740631103516, "logps/rejected": -59.03749084472656, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": -0.3648865222930908, "rewards/margins": 4.198371887207031, "rewards/rejected": -4.563258171081543, "step": 1237 }, { "epoch": 7.336296296296297, "grad_norm": 4.090989653775852, "learning_rate": 9.423698722022505e-09, "logits/chosen": -1.0874037742614746, "logits/rejected": -1.2385517358779907, "logps/chosen": -55.656105041503906, "logps/rejected": -89.57843017578125, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -2.0636582374572754, "rewards/margins": 7.275221824645996, "rewards/rejected": -9.33888053894043, "step": 1238 }, { "epoch": 7.342222222222222, "grad_norm": 4.360164582892418, "learning_rate": 9.247831068373458e-09, "logits/chosen": -1.3357502222061157, "logits/rejected": -1.3364484310150146, "logps/chosen": -51.02233123779297, "logps/rejected": -69.96102905273438, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": -0.8885281682014465, "rewards/margins": 5.597801685333252, "rewards/rejected": -6.486330032348633, "step": 1239 }, { "epoch": 7.348148148148148, "grad_norm": 4.002958863084062, "learning_rate": 9.073589027514789e-09, "logits/chosen": -1.2340748310089111, "logits/rejected": -1.258064866065979, "logps/chosen": -50.71947479248047, "logps/rejected": -84.90157318115234, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": -0.6332271695137024, "rewards/margins": 7.2277679443359375, "rewards/rejected": -7.860995769500732, "step": 1240 }, { "epoch": 7.354074074074074, "grad_norm": 4.157943643516201, "learning_rate": 8.900973775967963e-09, "logits/chosen": -1.167553186416626, "logits/rejected": -1.210451364517212, "logps/chosen": -41.46842575073242, "logps/rejected": -53.83827209472656, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 0.1300402283668518, "rewards/margins": 4.080387592315674, "rewards/rejected": -3.950347423553467, "step": 1241 }, { "epoch": 7.36, "grad_norm": 4.4053174745313495, "learning_rate": 8.729986479269924e-09, "logits/chosen": -1.1766040325164795, "logits/rejected": -1.243590235710144, "logps/chosen": -52.42701721191406, "logps/rejected": -71.56402587890625, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7700093984603882, "rewards/margins": 5.737342357635498, "rewards/rejected": -6.507350921630859, "step": 1242 }, { "epoch": 7.365925925925926, "grad_norm": 3.8587503094386824, "learning_rate": 8.56062829196541e-09, "logits/chosen": -1.2487757205963135, "logits/rejected": -1.2560317516326904, "logps/chosen": -56.89815902709961, "logps/rejected": -73.01307678222656, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": -0.6755393147468567, "rewards/margins": 5.386079788208008, "rewards/rejected": -6.061618804931641, "step": 1243 }, { "epoch": 7.371851851851852, "grad_norm": 3.570997078057821, "learning_rate": 8.392900357598959e-09, "logits/chosen": -1.1035842895507812, "logits/rejected": -1.1954938173294067, "logps/chosen": -60.9942741394043, "logps/rejected": -75.77302551269531, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -1.047473669052124, "rewards/margins": 5.5704545974731445, "rewards/rejected": -6.617927551269531, "step": 1244 }, { "epoch": 7.377777777777778, "grad_norm": 5.297807507787032, "learning_rate": 8.2268038087073e-09, "logits/chosen": -1.2557616233825684, "logits/rejected": -1.2342597246170044, "logps/chosen": -60.55089569091797, "logps/rejected": -61.7495002746582, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -0.8329823017120361, "rewards/margins": 5.138329029083252, "rewards/rejected": -5.971311569213867, "step": 1245 }, { "epoch": 7.383703703703704, "grad_norm": 2.929655935601638, "learning_rate": 8.062339766811726e-09, "logits/chosen": -1.1982896327972412, "logits/rejected": -1.205052375793457, "logps/chosen": -64.48491668701172, "logps/rejected": -83.77357482910156, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -0.9903809428215027, "rewards/margins": 6.561237335205078, "rewards/rejected": -7.551617622375488, "step": 1246 }, { "epoch": 7.3896296296296295, "grad_norm": 4.054956069159028, "learning_rate": 7.899509342410376e-09, "logits/chosen": -1.23246431350708, "logits/rejected": -1.4280232191085815, "logps/chosen": -41.71550369262695, "logps/rejected": -79.24267578125, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -0.337856262922287, "rewards/margins": 6.026522636413574, "rewards/rejected": -6.364378929138184, "step": 1247 }, { "epoch": 7.395555555555555, "grad_norm": 4.307744014945286, "learning_rate": 7.738313634970962e-09, "logits/chosen": -1.00555419921875, "logits/rejected": -1.0613325834274292, "logps/chosen": -48.21980285644531, "logps/rejected": -65.70577239990234, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -0.42074263095855713, "rewards/margins": 5.744640350341797, "rewards/rejected": -6.165383338928223, "step": 1248 }, { "epoch": 7.401481481481482, "grad_norm": 3.475525226093238, "learning_rate": 7.578753732923132e-09, "logits/chosen": -1.1451385021209717, "logits/rejected": -1.202643871307373, "logps/chosen": -56.249755859375, "logps/rejected": -86.0700454711914, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": -0.8659681677818298, "rewards/margins": 7.697194576263428, "rewards/rejected": -8.563162803649902, "step": 1249 }, { "epoch": 7.407407407407407, "grad_norm": 3.536162845294115, "learning_rate": 7.4208307136512385e-09, "logits/chosen": -1.149498701095581, "logits/rejected": -1.264514446258545, "logps/chosen": -42.952056884765625, "logps/rejected": -64.82267761230469, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": -1.1783785820007324, "rewards/margins": 6.570066928863525, "rewards/rejected": -7.748445510864258, "step": 1250 }, { "epoch": 7.413333333333333, "grad_norm": 3.945679434147782, "learning_rate": 7.2645456434869965e-09, "logits/chosen": -1.0817800760269165, "logits/rejected": -1.0875228643417358, "logps/chosen": -58.667335510253906, "logps/rejected": -83.24259948730469, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -2.176339626312256, "rewards/margins": 5.792956829071045, "rewards/rejected": -7.969296455383301, "step": 1251 }, { "epoch": 7.41925925925926, "grad_norm": 3.7678107207474065, "learning_rate": 7.109899577702389e-09, "logits/chosen": -1.4233471155166626, "logits/rejected": -1.5132304430007935, "logps/chosen": -47.02076721191406, "logps/rejected": -69.56475830078125, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 0.005723677575588226, "rewards/margins": 5.980765342712402, "rewards/rejected": -5.975041389465332, "step": 1252 }, { "epoch": 7.425185185185185, "grad_norm": 3.9150516093462664, "learning_rate": 6.956893560502358e-09, "logits/chosen": -1.2314457893371582, "logits/rejected": -1.4367800951004028, "logps/chosen": -42.025428771972656, "logps/rejected": -72.81390380859375, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -0.3668340742588043, "rewards/margins": 5.217932224273682, "rewards/rejected": -5.584766387939453, "step": 1253 }, { "epoch": 7.431111111111111, "grad_norm": 3.124760931056755, "learning_rate": 6.805528625018014e-09, "logits/chosen": -1.147788405418396, "logits/rejected": -1.1536781787872314, "logps/chosen": -55.166847229003906, "logps/rejected": -79.21180725097656, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -0.9568672180175781, "rewards/margins": 6.495437145233154, "rewards/rejected": -7.452304840087891, "step": 1254 }, { "epoch": 7.437037037037037, "grad_norm": 3.5413800666688418, "learning_rate": 6.655805793299413e-09, "logits/chosen": -1.1663146018981934, "logits/rejected": -1.2886245250701904, "logps/chosen": -48.67404556274414, "logps/rejected": -72.31990814208984, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": -1.2224462032318115, "rewards/margins": 5.77690315246582, "rewards/rejected": -6.999349117279053, "step": 1255 }, { "epoch": 7.442962962962963, "grad_norm": 3.274524380079391, "learning_rate": 6.5077260763087836e-09, "logits/chosen": -1.1297998428344727, "logits/rejected": -1.202750325202942, "logps/chosen": -44.121734619140625, "logps/rejected": -64.97763061523438, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": -0.33169564604759216, "rewards/margins": 4.686792373657227, "rewards/rejected": -5.018487930297852, "step": 1256 }, { "epoch": 7.448888888888889, "grad_norm": 4.78799236470072, "learning_rate": 6.361290473913705e-09, "logits/chosen": -1.006934404373169, "logits/rejected": -1.006296992301941, "logps/chosen": -52.901729583740234, "logps/rejected": -78.00100708007812, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -0.6054511070251465, "rewards/margins": 6.756753444671631, "rewards/rejected": -7.362204551696777, "step": 1257 }, { "epoch": 7.454814814814815, "grad_norm": 5.170167006660229, "learning_rate": 6.216499974880274e-09, "logits/chosen": -1.2134301662445068, "logits/rejected": -1.2395720481872559, "logps/chosen": -53.94024658203125, "logps/rejected": -62.0114860534668, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": -0.7245234251022339, "rewards/margins": 6.406597137451172, "rewards/rejected": -7.131120681762695, "step": 1258 }, { "epoch": 7.460740740740741, "grad_norm": 4.535250486934256, "learning_rate": 6.073355556866527e-09, "logits/chosen": -1.114149570465088, "logits/rejected": -1.1737488508224487, "logps/chosen": -68.56315612792969, "logps/rejected": -67.39340209960938, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": -0.6941238641738892, "rewards/margins": 6.043018341064453, "rewards/rejected": -6.737142562866211, "step": 1259 }, { "epoch": 7.466666666666667, "grad_norm": 4.505965598131084, "learning_rate": 5.9318581864157555e-09, "logits/chosen": -0.9749269485473633, "logits/rejected": -1.096606731414795, "logps/chosen": -50.809974670410156, "logps/rejected": -66.96884155273438, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": -1.3738820552825928, "rewards/margins": 5.837920188903809, "rewards/rejected": -7.2118024826049805, "step": 1260 }, { "epoch": 7.4725925925925925, "grad_norm": 3.0329171414952283, "learning_rate": 5.792008818950034e-09, "logits/chosen": -1.2703770399093628, "logits/rejected": -1.2322120666503906, "logps/chosen": -45.22857666015625, "logps/rejected": -74.27377319335938, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -1.1143908500671387, "rewards/margins": 6.21928071975708, "rewards/rejected": -7.3336710929870605, "step": 1261 }, { "epoch": 7.478518518518518, "grad_norm": 4.4357521903959425, "learning_rate": 5.653808398763726e-09, "logits/chosen": -1.2081799507141113, "logits/rejected": -1.046402931213379, "logps/chosen": -40.83061218261719, "logps/rejected": -50.72254943847656, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 0.3177671432495117, "rewards/margins": 4.231396675109863, "rewards/rejected": -3.9136292934417725, "step": 1262 }, { "epoch": 7.484444444444445, "grad_norm": 3.257895575515831, "learning_rate": 5.5172578590171606e-09, "logits/chosen": -1.1244299411773682, "logits/rejected": -1.2354055643081665, "logps/chosen": -36.96542739868164, "logps/rejected": -56.44902801513672, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -0.17362168431282043, "rewards/margins": 4.583876609802246, "rewards/rejected": -4.757498264312744, "step": 1263 }, { "epoch": 7.49037037037037, "grad_norm": 6.81659423219458, "learning_rate": 5.382358121730296e-09, "logits/chosen": -1.1702152490615845, "logits/rejected": -1.188597559928894, "logps/chosen": -42.586761474609375, "logps/rejected": -65.45720672607422, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": -0.10263597965240479, "rewards/margins": 5.135069370269775, "rewards/rejected": -5.237705230712891, "step": 1264 }, { "epoch": 7.496296296296296, "grad_norm": 4.129282172875423, "learning_rate": 5.249110097776482e-09, "logits/chosen": -1.1987591981887817, "logits/rejected": -1.331768274307251, "logps/chosen": -53.300804138183594, "logps/rejected": -73.4027099609375, "loss": 0.0318, "rewards/accuracies": 1.0, "rewards/chosen": -0.9778902530670166, "rewards/margins": 6.012190818786621, "rewards/rejected": -6.990081310272217, "step": 1265 }, { "epoch": 7.502222222222223, "grad_norm": 3.830093081169558, "learning_rate": 5.117514686876378e-09, "logits/chosen": -0.9744676351547241, "logits/rejected": -1.0355535745620728, "logps/chosen": -41.61549758911133, "logps/rejected": -81.38633728027344, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -0.6650229692459106, "rewards/margins": 7.193120956420898, "rewards/rejected": -7.858143329620361, "step": 1266 }, { "epoch": 7.508148148148148, "grad_norm": 3.0439991616018536, "learning_rate": 4.987572777591764e-09, "logits/chosen": -1.0361356735229492, "logits/rejected": -1.044396162033081, "logps/chosen": -53.80352783203125, "logps/rejected": -73.08494567871094, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -1.4982792139053345, "rewards/margins": 6.756043910980225, "rewards/rejected": -8.25432300567627, "step": 1267 }, { "epoch": 7.514074074074074, "grad_norm": 4.075110447550201, "learning_rate": 4.859285247319656e-09, "logits/chosen": -0.9112200736999512, "logits/rejected": -1.039853572845459, "logps/chosen": -42.56827926635742, "logps/rejected": -66.28154754638672, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -1.0129791498184204, "rewards/margins": 5.994237422943115, "rewards/rejected": -7.007215976715088, "step": 1268 }, { "epoch": 7.52, "grad_norm": 3.4429810335067987, "learning_rate": 4.732652962286282e-09, "logits/chosen": -1.154706358909607, "logits/rejected": -1.2327191829681396, "logps/chosen": -49.44975280761719, "logps/rejected": -87.4102783203125, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -1.3737388849258423, "rewards/margins": 5.274635314941406, "rewards/rejected": -6.648374080657959, "step": 1269 }, { "epoch": 7.525925925925926, "grad_norm": 3.4529101663641466, "learning_rate": 4.607676777541342e-09, "logits/chosen": -1.2047605514526367, "logits/rejected": -1.2406158447265625, "logps/chosen": -49.28171157836914, "logps/rejected": -71.44245910644531, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -2.400432586669922, "rewards/margins": 5.764103889465332, "rewards/rejected": -8.164535522460938, "step": 1270 }, { "epoch": 7.531851851851852, "grad_norm": 5.798561624464363, "learning_rate": 4.4843575369521155e-09, "logits/chosen": -1.054225206375122, "logits/rejected": -1.1656428575515747, "logps/chosen": -76.8630142211914, "logps/rejected": -89.7891845703125, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": -2.780390739440918, "rewards/margins": 6.693795680999756, "rewards/rejected": -9.474185943603516, "step": 1271 }, { "epoch": 7.5377777777777775, "grad_norm": 4.78754195364835, "learning_rate": 4.362696073197863e-09, "logits/chosen": -1.1592118740081787, "logits/rejected": -1.1719485521316528, "logps/chosen": -48.26700973510742, "logps/rejected": -58.30637741088867, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": -0.4617045223712921, "rewards/margins": 4.804683685302734, "rewards/rejected": -5.266387939453125, "step": 1272 }, { "epoch": 7.543703703703704, "grad_norm": 4.792872315009618, "learning_rate": 4.242693207764159e-09, "logits/chosen": -1.2573219537734985, "logits/rejected": -1.3435938358306885, "logps/chosen": -53.14029312133789, "logps/rejected": -70.92604064941406, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -0.6087773442268372, "rewards/margins": 5.773610591888428, "rewards/rejected": -6.382387638092041, "step": 1273 }, { "epoch": 7.54962962962963, "grad_norm": 5.643117139650082, "learning_rate": 4.12434975093734e-09, "logits/chosen": -1.1559739112854004, "logits/rejected": -1.184909701347351, "logps/chosen": -47.83567810058594, "logps/rejected": -61.247657775878906, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": 0.013514861464500427, "rewards/margins": 5.217415809631348, "rewards/rejected": -5.203901290893555, "step": 1274 }, { "epoch": 7.555555555555555, "grad_norm": 2.517671598871281, "learning_rate": 4.007666501799012e-09, "logits/chosen": -1.1557645797729492, "logits/rejected": -1.2193787097930908, "logps/chosen": -48.145084381103516, "logps/rejected": -79.78754425048828, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -0.46319612860679626, "rewards/margins": 5.226775169372559, "rewards/rejected": -5.689971446990967, "step": 1275 }, { "epoch": 7.561481481481481, "grad_norm": 5.050595644131196, "learning_rate": 3.89264424822075e-09, "logits/chosen": -1.086578607559204, "logits/rejected": -1.21217942237854, "logps/chosen": -50.246849060058594, "logps/rejected": -81.24800872802734, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -1.6358373165130615, "rewards/margins": 5.405806064605713, "rewards/rejected": -7.041643142700195, "step": 1276 }, { "epoch": 7.567407407407408, "grad_norm": 2.7619258267022118, "learning_rate": 3.779283766858682e-09, "logits/chosen": -1.2855738401412964, "logits/rejected": -1.2698637247085571, "logps/chosen": -36.27191925048828, "logps/rejected": -65.68701171875, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": -0.6610238552093506, "rewards/margins": 4.722266674041748, "rewards/rejected": -5.3832902908325195, "step": 1277 }, { "epoch": 7.573333333333333, "grad_norm": 3.396277904250881, "learning_rate": 3.667585823148217e-09, "logits/chosen": -1.1778568029403687, "logits/rejected": -1.2962253093719482, "logps/chosen": -55.17719268798828, "logps/rejected": -69.75767517089844, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -1.281670093536377, "rewards/margins": 5.099809646606445, "rewards/rejected": -6.381479740142822, "step": 1278 }, { "epoch": 7.579259259259259, "grad_norm": 4.09050137430788, "learning_rate": 3.5575511712990504e-09, "logits/chosen": -1.24432373046875, "logits/rejected": -1.2919775247573853, "logps/chosen": -57.73225402832031, "logps/rejected": -78.25653076171875, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -1.2200543880462646, "rewards/margins": 5.297441482543945, "rewards/rejected": -6.517495155334473, "step": 1279 }, { "epoch": 7.5851851851851855, "grad_norm": 4.522465031479605, "learning_rate": 3.4491805542899155e-09, "logits/chosen": -1.247235894203186, "logits/rejected": -1.3041255474090576, "logps/chosen": -44.53873062133789, "logps/rejected": -68.25907897949219, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -0.625839114189148, "rewards/margins": 4.951447010040283, "rewards/rejected": -5.5772857666015625, "step": 1280 }, { "epoch": 7.591111111111111, "grad_norm": 4.526740048559791, "learning_rate": 3.342474703863507e-09, "logits/chosen": -1.3184285163879395, "logits/rejected": -1.3301334381103516, "logps/chosen": -52.55039978027344, "logps/rejected": -82.96316528320312, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": -1.1241881847381592, "rewards/margins": 6.9125471115112305, "rewards/rejected": -8.036735534667969, "step": 1281 }, { "epoch": 7.597037037037037, "grad_norm": 4.89773536148764, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -1.1311254501342773, "logits/rejected": -1.1638489961624146, "logps/chosen": -44.68597412109375, "logps/rejected": -67.75846099853516, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -0.7200304865837097, "rewards/margins": 5.980922222137451, "rewards/rejected": -6.700952529907227, "step": 1282 }, { "epoch": 7.6029629629629625, "grad_norm": 2.5581980505049606, "learning_rate": 3.1340601735209137e-09, "logits/chosen": -1.3053855895996094, "logits/rejected": -1.351278305053711, "logps/chosen": -47.90487289428711, "logps/rejected": -71.97268676757812, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -0.7411696910858154, "rewards/margins": 5.495450973510742, "rewards/rejected": -6.2366204261779785, "step": 1283 }, { "epoch": 7.608888888888889, "grad_norm": 3.7529698234479163, "learning_rate": 3.0323529008664807e-09, "logits/chosen": -1.1116161346435547, "logits/rejected": -1.2262192964553833, "logps/chosen": -51.4724006652832, "logps/rejected": -72.905029296875, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -0.7296598553657532, "rewards/margins": 6.045794486999512, "rewards/rejected": -6.775454521179199, "step": 1284 }, { "epoch": 7.614814814814815, "grad_norm": 4.194004453718564, "learning_rate": 2.9323132093088954e-09, "logits/chosen": -0.99857497215271, "logits/rejected": -1.1457947492599487, "logps/chosen": -40.48921203613281, "logps/rejected": -64.35589599609375, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -0.7001577019691467, "rewards/margins": 6.241533279418945, "rewards/rejected": -6.941690921783447, "step": 1285 }, { "epoch": 7.62074074074074, "grad_norm": 4.078023392295576, "learning_rate": 2.833941774338655e-09, "logits/chosen": -1.1729825735092163, "logits/rejected": -1.300710916519165, "logps/chosen": -44.66774368286133, "logps/rejected": -74.90933227539062, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.47788554430007935, "rewards/margins": 7.087083339691162, "rewards/rejected": -7.564969062805176, "step": 1286 }, { "epoch": 7.626666666666667, "grad_norm": 3.8564092032512263, "learning_rate": 2.7372392601817675e-09, "logits/chosen": -1.2528834342956543, "logits/rejected": -1.1977287530899048, "logps/chosen": -51.391719818115234, "logps/rejected": -76.97032928466797, "loss": 0.0336, "rewards/accuracies": 1.0, "rewards/chosen": -0.8206769227981567, "rewards/margins": 6.537014961242676, "rewards/rejected": -7.357692718505859, "step": 1287 }, { "epoch": 7.632592592592593, "grad_norm": 4.116461804075974, "learning_rate": 2.6422063197953926e-09, "logits/chosen": -1.3762859106063843, "logits/rejected": -1.4735721349716187, "logps/chosen": -54.44892883300781, "logps/rejected": -75.15809631347656, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": -0.953680157661438, "rewards/margins": 5.054767608642578, "rewards/rejected": -6.008447170257568, "step": 1288 }, { "epoch": 7.638518518518518, "grad_norm": 3.935374520816731, "learning_rate": 2.548843594863348e-09, "logits/chosen": -1.0619758367538452, "logits/rejected": -1.0472029447555542, "logps/chosen": -56.92826461791992, "logps/rejected": -77.54266357421875, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -1.1233818531036377, "rewards/margins": 6.907271385192871, "rewards/rejected": -8.03065299987793, "step": 1289 }, { "epoch": 7.644444444444445, "grad_norm": 5.08850792852047, "learning_rate": 2.4571517157916944e-09, "logits/chosen": -1.102142095565796, "logits/rejected": -1.2241641283035278, "logps/chosen": -38.644996643066406, "logps/rejected": -68.89688110351562, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": -0.49043506383895874, "rewards/margins": 5.104089736938477, "rewards/rejected": -5.594525337219238, "step": 1290 }, { "epoch": 7.6503703703703705, "grad_norm": 4.098051438580383, "learning_rate": 2.3671313017046557e-09, "logits/chosen": -1.258387804031372, "logits/rejected": -1.3357480764389038, "logps/chosen": -55.586822509765625, "logps/rejected": -69.7269287109375, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -1.8577889204025269, "rewards/margins": 6.0642290115356445, "rewards/rejected": -7.922017574310303, "step": 1291 }, { "epoch": 7.656296296296296, "grad_norm": 3.4376186407034077, "learning_rate": 2.27878296044029e-09, "logits/chosen": -1.2755929231643677, "logits/rejected": -1.3270050287246704, "logps/chosen": -50.85591125488281, "logps/rejected": -72.466796875, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -0.7081777453422546, "rewards/margins": 5.251181602478027, "rewards/rejected": -5.959359169006348, "step": 1292 }, { "epoch": 7.662222222222223, "grad_norm": 3.023785417947227, "learning_rate": 2.1921072885464633e-09, "logits/chosen": -1.2608731985092163, "logits/rejected": -1.3410117626190186, "logps/chosen": -44.24419021606445, "logps/rejected": -62.62995147705078, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -0.6164356470108032, "rewards/margins": 4.980905532836914, "rewards/rejected": -5.597341537475586, "step": 1293 }, { "epoch": 7.668148148148148, "grad_norm": 3.9892458828568556, "learning_rate": 2.1071048712768545e-09, "logits/chosen": -1.1189539432525635, "logits/rejected": -1.13387131690979, "logps/chosen": -44.46137619018555, "logps/rejected": -64.4898452758789, "loss": 0.0292, "rewards/accuracies": 1.0, "rewards/chosen": -0.322096586227417, "rewards/margins": 5.642068862915039, "rewards/rejected": -5.964165687561035, "step": 1294 }, { "epoch": 7.674074074074074, "grad_norm": 3.060773123486522, "learning_rate": 2.0237762825868752e-09, "logits/chosen": -1.179800271987915, "logits/rejected": -1.2812291383743286, "logps/chosen": -57.156944274902344, "logps/rejected": -73.54817199707031, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -1.3885247707366943, "rewards/margins": 6.368008613586426, "rewards/rejected": -7.756533145904541, "step": 1295 }, { "epoch": 7.68, "grad_norm": 3.6760645913887284, "learning_rate": 1.9421220851298657e-09, "logits/chosen": -1.1212431192398071, "logits/rejected": -1.2247122526168823, "logps/chosen": -49.63182830810547, "logps/rejected": -77.47969055175781, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -1.0002086162567139, "rewards/margins": 5.092729568481445, "rewards/rejected": -6.092937469482422, "step": 1296 }, { "epoch": 7.685925925925926, "grad_norm": 3.9251535192169014, "learning_rate": 1.8621428302533492e-09, "logits/chosen": -1.1562796831130981, "logits/rejected": -1.1696228981018066, "logps/chosen": -50.657447814941406, "logps/rejected": -73.50979614257812, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -1.1022233963012695, "rewards/margins": 5.4585466384887695, "rewards/rejected": -6.560770034790039, "step": 1297 }, { "epoch": 7.691851851851852, "grad_norm": 2.8265644326242656, "learning_rate": 1.7838390579952567e-09, "logits/chosen": -1.055465579032898, "logits/rejected": -1.1630399227142334, "logps/chosen": -48.33147048950195, "logps/rejected": -65.17540740966797, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -0.8744292855262756, "rewards/margins": 6.498607635498047, "rewards/rejected": -7.373037338256836, "step": 1298 }, { "epoch": 7.697777777777778, "grad_norm": 4.040001168249482, "learning_rate": 1.7072112970802633e-09, "logits/chosen": -1.2334121465682983, "logits/rejected": -1.3498921394348145, "logps/chosen": -43.995628356933594, "logps/rejected": -73.16790008544922, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -0.6232740879058838, "rewards/margins": 5.393784523010254, "rewards/rejected": -6.017059326171875, "step": 1299 }, { "epoch": 7.703703703703704, "grad_norm": 4.734494032620544, "learning_rate": 1.6322600649162354e-09, "logits/chosen": -1.0275055170059204, "logits/rejected": -1.0896284580230713, "logps/chosen": -49.858680725097656, "logps/rejected": -62.68039321899414, "loss": 0.048, "rewards/accuracies": 1.0, "rewards/chosen": -0.7178724408149719, "rewards/margins": 4.1225457191467285, "rewards/rejected": -4.840417861938477, "step": 1300 }, { "epoch": 7.70962962962963, "grad_norm": 6.242524488086293, "learning_rate": 1.5589858675907618e-09, "logits/chosen": -1.3054200410842896, "logits/rejected": -1.3421804904937744, "logps/chosen": -52.420509338378906, "logps/rejected": -70.79682159423828, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": -0.030725568532943726, "rewards/margins": 4.999563217163086, "rewards/rejected": -5.0302886962890625, "step": 1301 }, { "epoch": 7.7155555555555555, "grad_norm": 4.116512529426615, "learning_rate": 1.4873891998677112e-09, "logits/chosen": -1.1158173084259033, "logits/rejected": -1.3114423751831055, "logps/chosen": -43.815330505371094, "logps/rejected": -63.80662155151367, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": -0.29764774441719055, "rewards/margins": 4.472417831420898, "rewards/rejected": -4.770065784454346, "step": 1302 }, { "epoch": 7.721481481481481, "grad_norm": 5.115315312723463, "learning_rate": 1.4174705451838743e-09, "logits/chosen": -1.3780591487884521, "logits/rejected": -1.323752999305725, "logps/chosen": -50.43423843383789, "logps/rejected": -63.291046142578125, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": -0.10267294943332672, "rewards/margins": 4.516208648681641, "rewards/rejected": -4.618881702423096, "step": 1303 }, { "epoch": 7.727407407407408, "grad_norm": 3.1452341302331384, "learning_rate": 1.3492303756457158e-09, "logits/chosen": -1.3025587797164917, "logits/rejected": -1.4482269287109375, "logps/chosen": -52.70558166503906, "logps/rejected": -90.87263488769531, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -2.543006181716919, "rewards/margins": 7.467383861541748, "rewards/rejected": -10.010390281677246, "step": 1304 }, { "epoch": 7.733333333333333, "grad_norm": 3.6249017461365907, "learning_rate": 1.2826691520262112e-09, "logits/chosen": -1.281097650527954, "logits/rejected": -1.2951496839523315, "logps/chosen": -43.05864334106445, "logps/rejected": -68.95799255371094, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -0.8746381998062134, "rewards/margins": 5.1224164962768555, "rewards/rejected": -5.9970550537109375, "step": 1305 }, { "epoch": 7.739259259259259, "grad_norm": 4.401779096214971, "learning_rate": 1.2177873237617375e-09, "logits/chosen": -1.2022982835769653, "logits/rejected": -1.216440200805664, "logps/chosen": -62.892704010009766, "logps/rejected": -64.79084777832031, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": -1.37397301197052, "rewards/margins": 5.186420440673828, "rewards/rejected": -6.560393810272217, "step": 1306 }, { "epoch": 7.745185185185186, "grad_norm": 4.374384420762003, "learning_rate": 1.1545853289489927e-09, "logits/chosen": -1.10579252243042, "logits/rejected": -1.1660892963409424, "logps/chosen": -39.71818542480469, "logps/rejected": -54.736663818359375, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -0.8321702480316162, "rewards/margins": 4.650611877441406, "rewards/rejected": -5.482782363891602, "step": 1307 }, { "epoch": 7.751111111111111, "grad_norm": 4.6211551819830445, "learning_rate": 1.0930635943420253e-09, "logits/chosen": -1.1341114044189453, "logits/rejected": -1.2060933113098145, "logps/chosen": -41.124813079833984, "logps/rejected": -74.39026641845703, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -0.612218976020813, "rewards/margins": 6.23983097076416, "rewards/rejected": -6.852049827575684, "step": 1308 }, { "epoch": 7.757037037037037, "grad_norm": 3.088853964662203, "learning_rate": 1.0332225353494318e-09, "logits/chosen": -1.0827617645263672, "logits/rejected": -1.1487969160079956, "logps/chosen": -60.728294372558594, "logps/rejected": -70.6673355102539, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -0.9318689703941345, "rewards/margins": 5.681600570678711, "rewards/rejected": -6.61346960067749, "step": 1309 }, { "epoch": 7.762962962962963, "grad_norm": 3.895796986682556, "learning_rate": 9.750625560315528e-10, "logits/chosen": -1.2412290573120117, "logits/rejected": -1.1642299890518188, "logps/chosen": -56.81752014160156, "logps/rejected": -69.71331787109375, "loss": 0.0306, "rewards/accuracies": 1.0, "rewards/chosen": -0.648478627204895, "rewards/margins": 5.765992164611816, "rewards/rejected": -6.414470672607422, "step": 1310 }, { "epoch": 7.768888888888889, "grad_norm": 4.639857829007214, "learning_rate": 9.185840490975594e-10, "logits/chosen": -0.9799739122390747, "logits/rejected": -0.985424816608429, "logps/chosen": -50.25916290283203, "logps/rejected": -67.01441192626953, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -0.46648460626602173, "rewards/margins": 6.257501602172852, "rewards/rejected": -6.723986625671387, "step": 1311 }, { "epoch": 7.774814814814815, "grad_norm": 4.5476728089506535, "learning_rate": 8.637873959031206e-10, "logits/chosen": -1.2363190650939941, "logits/rejected": -1.3365875482559204, "logps/chosen": -44.06676483154297, "logps/rejected": -64.12844848632812, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -0.49657464027404785, "rewards/margins": 4.909327507019043, "rewards/rejected": -5.405901908874512, "step": 1312 }, { "epoch": 7.7807407407407405, "grad_norm": 2.8518153992439164, "learning_rate": 8.106729664475176e-10, "logits/chosen": -1.3071974515914917, "logits/rejected": -1.4458099603652954, "logps/chosen": -40.395233154296875, "logps/rejected": -70.80223083496094, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -0.7430353760719299, "rewards/margins": 5.836602210998535, "rewards/rejected": -6.579637050628662, "step": 1313 }, { "epoch": 7.786666666666667, "grad_norm": 4.626148207060677, "learning_rate": 7.592411193713122e-10, "logits/chosen": -1.2434099912643433, "logits/rejected": -1.327384352684021, "logps/chosen": -55.95722198486328, "logps/rejected": -87.88571166992188, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -1.251700520515442, "rewards/margins": 5.727961540222168, "rewards/rejected": -6.97966194152832, "step": 1314 }, { "epoch": 7.792592592592593, "grad_norm": 3.873445572325953, "learning_rate": 7.094922019539318e-10, "logits/chosen": -1.0838680267333984, "logits/rejected": -1.211837649345398, "logps/chosen": -36.55458450317383, "logps/rejected": -60.68565368652344, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -0.6588367223739624, "rewards/margins": 5.787833213806152, "rewards/rejected": -6.446670055389404, "step": 1315 }, { "epoch": 7.798518518518518, "grad_norm": 3.5831381543129233, "learning_rate": 6.61426550111227e-10, "logits/chosen": -1.0004241466522217, "logits/rejected": -1.0982451438903809, "logps/chosen": -43.107383728027344, "logps/rejected": -79.90531921386719, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -0.6238442063331604, "rewards/margins": 6.600798606872559, "rewards/rejected": -7.224643707275391, "step": 1316 }, { "epoch": 7.804444444444444, "grad_norm": 4.434687124407557, "learning_rate": 6.150444883933348e-10, "logits/chosen": -1.3124284744262695, "logits/rejected": -1.3378134965896606, "logps/chosen": -51.23555374145508, "logps/rejected": -83.02464294433594, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -0.7402288317680359, "rewards/margins": 6.272833824157715, "rewards/rejected": -7.013063430786133, "step": 1317 }, { "epoch": 7.810370370370371, "grad_norm": 3.7031935136455956, "learning_rate": 5.703463299823186e-10, "logits/chosen": -1.2910884618759155, "logits/rejected": -1.3359012603759766, "logps/chosen": -39.01679992675781, "logps/rejected": -87.91451263427734, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": -0.3888285160064697, "rewards/margins": 8.184977531433105, "rewards/rejected": -8.573806762695312, "step": 1318 }, { "epoch": 7.816296296296296, "grad_norm": 3.280126945946635, "learning_rate": 5.27332376690226e-10, "logits/chosen": -1.2180700302124023, "logits/rejected": -1.213331937789917, "logps/chosen": -47.62986755371094, "logps/rejected": -80.37245178222656, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -0.8481385707855225, "rewards/margins": 5.588566303253174, "rewards/rejected": -6.436704635620117, "step": 1319 }, { "epoch": 7.822222222222222, "grad_norm": 4.285051151960186, "learning_rate": 4.860029189569237e-10, "logits/chosen": -1.158015251159668, "logits/rejected": -1.0862349271774292, "logps/chosen": -57.20246124267578, "logps/rejected": -61.53093719482422, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.9121618270874023, "rewards/margins": 5.656917572021484, "rewards/rejected": -6.569079399108887, "step": 1320 }, { "epoch": 7.8281481481481485, "grad_norm": 3.161324387187779, "learning_rate": 4.463582358482376e-10, "logits/chosen": -0.9615970849990845, "logits/rejected": -1.0251832008361816, "logps/chosen": -48.84516525268555, "logps/rejected": -81.93156433105469, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -0.4886060655117035, "rewards/margins": 6.42310905456543, "rewards/rejected": -6.911715507507324, "step": 1321 }, { "epoch": 7.834074074074074, "grad_norm": 3.873331978451794, "learning_rate": 4.083985950539548e-10, "logits/chosen": -1.2927745580673218, "logits/rejected": -1.2420415878295898, "logps/chosen": -56.42285919189453, "logps/rejected": -77.27031707763672, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -1.0932049751281738, "rewards/margins": 6.1244635581970215, "rewards/rejected": -7.217668533325195, "step": 1322 }, { "epoch": 7.84, "grad_norm": 2.9511525570711648, "learning_rate": 3.721242528861024e-10, "logits/chosen": -1.0464425086975098, "logits/rejected": -1.0884580612182617, "logps/chosen": -46.888404846191406, "logps/rejected": -66.87775421142578, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": -0.9894177913665771, "rewards/margins": 5.856396675109863, "rewards/rejected": -6.845814228057861, "step": 1323 }, { "epoch": 7.8459259259259255, "grad_norm": 2.9263330192188524, "learning_rate": 3.3753545427722687e-10, "logits/chosen": -1.2160556316375732, "logits/rejected": -1.2635902166366577, "logps/chosen": -50.17317199707031, "logps/rejected": -78.7654037475586, "loss": 0.0252, "rewards/accuracies": 1.0, "rewards/chosen": -0.6943256258964539, "rewards/margins": 5.114144802093506, "rewards/rejected": -5.808470249176025, "step": 1324 }, { "epoch": 7.851851851851852, "grad_norm": 4.191017644309632, "learning_rate": 3.0463243277864534e-10, "logits/chosen": -1.2540899515151978, "logits/rejected": -1.260279893875122, "logps/chosen": -50.96095657348633, "logps/rejected": -63.11212158203125, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -0.22250595688819885, "rewards/margins": 5.157078266143799, "rewards/rejected": -5.379584312438965, "step": 1325 }, { "epoch": 7.857777777777778, "grad_norm": 3.78918431544521, "learning_rate": 2.734154105589748e-10, "logits/chosen": -1.2464838027954102, "logits/rejected": -1.232421875, "logps/chosen": -39.673423767089844, "logps/rejected": -56.34529113769531, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -0.18317142128944397, "rewards/margins": 5.008199691772461, "rewards/rejected": -5.191371440887451, "step": 1326 }, { "epoch": 7.863703703703703, "grad_norm": 3.794118871416381, "learning_rate": 2.4388459840257724e-10, "logits/chosen": -1.1672354936599731, "logits/rejected": -1.2810213565826416, "logps/chosen": -44.134117126464844, "logps/rejected": -68.70030975341797, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -0.8252700567245483, "rewards/margins": 6.5332794189453125, "rewards/rejected": -7.35854959487915, "step": 1327 }, { "epoch": 7.86962962962963, "grad_norm": 5.5075916247068655, "learning_rate": 2.1604019570811704e-10, "logits/chosen": -1.1713049411773682, "logits/rejected": -1.2617361545562744, "logps/chosen": -54.49160385131836, "logps/rejected": -71.02827453613281, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": -1.0777705907821655, "rewards/margins": 6.524770736694336, "rewards/rejected": -7.602540969848633, "step": 1328 }, { "epoch": 7.875555555555556, "grad_norm": 4.0258473530695, "learning_rate": 1.8988239048725595e-10, "logits/chosen": -1.265380859375, "logits/rejected": -1.3449554443359375, "logps/chosen": -49.377262115478516, "logps/rejected": -76.43204498291016, "loss": 0.0301, "rewards/accuracies": 1.0, "rewards/chosen": -1.4428175687789917, "rewards/margins": 6.493406295776367, "rewards/rejected": -7.936223983764648, "step": 1329 }, { "epoch": 7.881481481481481, "grad_norm": 3.4526817942675954, "learning_rate": 1.6541135936343208e-10, "logits/chosen": -1.0449414253234863, "logits/rejected": -1.052280068397522, "logps/chosen": -59.01203536987305, "logps/rejected": -109.54611206054688, "loss": 0.0267, "rewards/accuracies": 1.0, "rewards/chosen": -1.199272632598877, "rewards/margins": 7.87403678894043, "rewards/rejected": -9.073308944702148, "step": 1330 }, { "epoch": 7.887407407407407, "grad_norm": 3.4515341264688244, "learning_rate": 1.426272675704998e-10, "logits/chosen": -1.0453736782073975, "logits/rejected": -1.174626350402832, "logps/chosen": -51.22376251220703, "logps/rejected": -77.333740234375, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": -0.14971916377544403, "rewards/margins": 7.365710735321045, "rewards/rejected": -7.515429496765137, "step": 1331 }, { "epoch": 7.8933333333333335, "grad_norm": 4.2431530547055685, "learning_rate": 1.2153026895178608e-10, "logits/chosen": -1.2558284997940063, "logits/rejected": -1.230025053024292, "logps/chosen": -61.31620788574219, "logps/rejected": -71.90235137939453, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.9064638614654541, "rewards/margins": 6.033405303955078, "rewards/rejected": -6.939869403839111, "step": 1332 }, { "epoch": 7.899259259259259, "grad_norm": 4.070328981967792, "learning_rate": 1.0212050595895249e-10, "logits/chosen": -1.1583914756774902, "logits/rejected": -1.1680545806884766, "logps/chosen": -50.851749420166016, "logps/rejected": -60.996482849121094, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 0.11362215876579285, "rewards/margins": 5.627436637878418, "rewards/rejected": -5.513814926147461, "step": 1333 }, { "epoch": 7.905185185185185, "grad_norm": 3.568567101282868, "learning_rate": 8.439810965113481e-11, "logits/chosen": -1.3088608980178833, "logits/rejected": -1.3825346231460571, "logps/chosen": -40.22638702392578, "logps/rejected": -64.48670959472656, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -0.6336486339569092, "rewards/margins": 4.98115873336792, "rewards/rejected": -5.61480712890625, "step": 1334 }, { "epoch": 7.911111111111111, "grad_norm": 3.0558534132855786, "learning_rate": 6.836319969388827e-11, "logits/chosen": -1.2456482648849487, "logits/rejected": -1.3389393091201782, "logps/chosen": -49.01996612548828, "logps/rejected": -72.98611450195312, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": -0.6034674644470215, "rewards/margins": 5.329452037811279, "rewards/rejected": -5.932919502258301, "step": 1335 }, { "epoch": 7.917037037037037, "grad_norm": 3.3169742529023445, "learning_rate": 5.4015884358549204e-11, "logits/chosen": -1.1440975666046143, "logits/rejected": -1.1358253955841064, "logps/chosen": -50.89934539794922, "logps/rejected": -66.2007064819336, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -0.8520979881286621, "rewards/margins": 6.465874195098877, "rewards/rejected": -7.317971229553223, "step": 1336 }, { "epoch": 7.922962962962963, "grad_norm": 4.224810833586431, "learning_rate": 4.135626052143015e-11, "logits/chosen": -1.095085620880127, "logits/rejected": -1.1812586784362793, "logps/chosen": -46.71308898925781, "logps/rejected": -73.55007934570312, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -1.1982324123382568, "rewards/margins": 6.405635356903076, "rewards/rejected": -7.603867530822754, "step": 1337 }, { "epoch": 7.928888888888888, "grad_norm": 5.206045992657888, "learning_rate": 3.0384413663125944e-11, "logits/chosen": -1.204892635345459, "logits/rejected": -1.2737915515899658, "logps/chosen": -50.526206970214844, "logps/rejected": -62.54721450805664, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": -0.9793308973312378, "rewards/margins": 3.79130482673645, "rewards/rejected": -4.770635604858398, "step": 1338 }, { "epoch": 7.934814814814815, "grad_norm": 4.090212912205461, "learning_rate": 2.110041786804184e-11, "logits/chosen": -1.1907331943511963, "logits/rejected": -1.2384510040283203, "logps/chosen": -58.870052337646484, "logps/rejected": -83.39106750488281, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -1.765225887298584, "rewards/margins": 6.5894670486450195, "rewards/rejected": -8.354693412780762, "step": 1339 }, { "epoch": 7.940740740740741, "grad_norm": 3.288258776425581, "learning_rate": 1.350433582381072e-11, "logits/chosen": -1.0594743490219116, "logits/rejected": -1.071734070777893, "logps/chosen": -42.901512145996094, "logps/rejected": -69.27069091796875, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -0.2702757716178894, "rewards/margins": 5.44796895980835, "rewards/rejected": -5.718244552612305, "step": 1340 }, { "epoch": 7.946666666666666, "grad_norm": 4.191786397560346, "learning_rate": 7.596218820876688e-12, "logits/chosen": -1.2260366678237915, "logits/rejected": -1.3320492506027222, "logps/chosen": -66.5378646850586, "logps/rejected": -68.939208984375, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -1.2590943574905396, "rewards/margins": 5.165265083312988, "rewards/rejected": -6.424359321594238, "step": 1341 }, { "epoch": 7.952592592592593, "grad_norm": 3.1073122257922434, "learning_rate": 3.376106752134289e-12, "logits/chosen": -1.0636171102523804, "logits/rejected": -1.1753029823303223, "logps/chosen": -33.64069747924805, "logps/rejected": -56.730262756347656, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": 0.11559783667325974, "rewards/margins": 5.496131420135498, "rewards/rejected": -5.380533695220947, "step": 1342 }, { "epoch": 7.9585185185185185, "grad_norm": 3.4153108889579182, "learning_rate": 8.440281127897186e-13, "logits/chosen": -1.0520870685577393, "logits/rejected": -1.1678812503814697, "logps/chosen": -58.156349182128906, "logps/rejected": -92.31648254394531, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -0.6064121127128601, "rewards/margins": 8.176019668579102, "rewards/rejected": -8.782432556152344, "step": 1343 }, { "epoch": 7.964444444444444, "grad_norm": 3.2275881385738243, "learning_rate": 0.0, "logits/chosen": -0.9758695363998413, "logits/rejected": -0.9926738142967224, "logps/chosen": -50.735172271728516, "logps/rejected": -66.82237243652344, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -0.9227455258369446, "rewards/margins": 5.92061710357666, "rewards/rejected": -6.843362331390381, "step": 1344 }, { "epoch": 7.964444444444444, "step": 1344, "total_flos": 0.0, "train_loss": 0.19260043763954723, "train_runtime": 3828.823, "train_samples_per_second": 22.555, "train_steps_per_second": 0.351 } ], "logging_steps": 1, "max_steps": 1344, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }