diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20202 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.964444444444444, + "eval_steps": 100, + "global_step": 1344, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005925925925925926, + "grad_norm": 42.43893981091583, + "learning_rate": 3.7037037037037036e-09, + "logits/chosen": -1.310781478881836, + "logits/rejected": -1.393431305885315, + "logps/chosen": -52.985904693603516, + "logps/rejected": -57.095699310302734, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.011851851851851851, + "grad_norm": 35.96864569231992, + "learning_rate": 7.407407407407407e-09, + "logits/chosen": -1.4714622497558594, + "logits/rejected": -1.5260519981384277, + "logps/chosen": -50.45790100097656, + "logps/rejected": -54.5156135559082, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 39.18444558531369, + "learning_rate": 1.111111111111111e-08, + "logits/chosen": -1.4416756629943848, + "logits/rejected": -1.3762359619140625, + "logps/chosen": -45.7575798034668, + "logps/rejected": -60.49638748168945, + "loss": 0.6978, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.011297130957245827, + "rewards/margins": -0.006299828179180622, + "rewards/rejected": -0.00499730184674263, + "step": 3 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 39.65552735879418, + "learning_rate": 1.4814814814814814e-08, + "logits/chosen": -1.4207963943481445, + "logits/rejected": -1.5604076385498047, + "logps/chosen": -37.122886657714844, + "logps/rejected": -54.21310043334961, + "loss": 0.7077, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.026990916579961777, + "rewards/margins": -0.015975551679730415, + "rewards/rejected": 0.042966462671756744, + "step": 4 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 36.596040564401264, + "learning_rate": 1.8518518518518518e-08, + "logits/chosen": -1.4846137762069702, + "logits/rejected": -1.4979444742202759, + "logps/chosen": -44.22126388549805, + "logps/rejected": -55.1204833984375, + "loss": 0.6865, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.007357514463365078, + "rewards/margins": 0.031326450407505035, + "rewards/rejected": -0.023968935012817383, + "step": 5 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 36.320492249631755, + "learning_rate": 2.222222222222222e-08, + "logits/chosen": -1.4549423456192017, + "logits/rejected": -1.4336495399475098, + "logps/chosen": -42.29203414916992, + "logps/rejected": -40.219139099121094, + "loss": 0.691, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.005795812234282494, + "rewards/margins": -0.029732275754213333, + "rewards/rejected": 0.03552808612585068, + "step": 6 + }, + { + "epoch": 0.04148148148148148, + "grad_norm": 34.8509598127472, + "learning_rate": 2.5925925925925923e-08, + "logits/chosen": -1.5144797563552856, + "logits/rejected": -1.5598227977752686, + "logps/chosen": -46.01911163330078, + "logps/rejected": -48.391319274902344, + "loss": 0.6861, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.021584009751677513, + "rewards/margins": -0.008964895270764828, + "rewards/rejected": -0.01261911354959011, + "step": 7 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 41.834639343315644, + "learning_rate": 2.962962962962963e-08, + "logits/chosen": -1.5673420429229736, + "logits/rejected": -1.5740389823913574, + "logps/chosen": -44.32832717895508, + "logps/rejected": -58.74522399902344, + "loss": 0.7043, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01198368240147829, + "rewards/margins": -0.015015724115073681, + "rewards/rejected": 0.0030320417135953903, + "step": 8 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 38.052460837465965, + "learning_rate": 3.3333333333333334e-08, + "logits/chosen": -1.5391089916229248, + "logits/rejected": -1.533707857131958, + "logps/chosen": -62.247337341308594, + "logps/rejected": -66.13418579101562, + "loss": 0.7055, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.042546942830085754, + "rewards/margins": 0.04441840201616287, + "rewards/rejected": -0.0018714680336415768, + "step": 9 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 37.13655017506449, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -1.3837740421295166, + "logits/rejected": -1.4348113536834717, + "logps/chosen": -57.03742599487305, + "logps/rejected": -53.65230178833008, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.038426827639341354, + "rewards/margins": 0.047683194279670715, + "rewards/rejected": -0.08611002564430237, + "step": 10 + }, + { + "epoch": 0.06518518518518518, + "grad_norm": 41.67417730129325, + "learning_rate": 4.0740740740740745e-08, + "logits/chosen": -1.591475248336792, + "logits/rejected": -1.522184133529663, + "logps/chosen": -55.395172119140625, + "logps/rejected": -50.715797424316406, + "loss": 0.6957, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012220479547977448, + "rewards/margins": -0.010918429121375084, + "rewards/rejected": 0.023138903081417084, + "step": 11 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 42.17041570253324, + "learning_rate": 4.444444444444444e-08, + "logits/chosen": -1.4714893102645874, + "logits/rejected": -1.5695397853851318, + "logps/chosen": -50.417884826660156, + "logps/rejected": -50.72690200805664, + "loss": 0.7116, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.003867245279252529, + "rewards/margins": -0.008899686858057976, + "rewards/rejected": 0.005032443441450596, + "step": 12 + }, + { + "epoch": 0.07703703703703704, + "grad_norm": 34.71785885257626, + "learning_rate": 4.814814814814814e-08, + "logits/chosen": -1.4874341487884521, + "logits/rejected": -1.5213749408721924, + "logps/chosen": -38.02253341674805, + "logps/rejected": -43.72047424316406, + "loss": 0.695, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0014143595471978188, + "rewards/margins": -0.03884340822696686, + "rewards/rejected": 0.037429049611091614, + "step": 13 + }, + { + "epoch": 0.08296296296296296, + "grad_norm": 42.675804996783135, + "learning_rate": 5.1851851851851846e-08, + "logits/chosen": -1.554866075515747, + "logits/rejected": -1.5048046112060547, + "logps/chosen": -46.5153694152832, + "logps/rejected": -63.505584716796875, + "loss": 0.7035, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020593512803316116, + "rewards/margins": 0.015447389334440231, + "rewards/rejected": 0.005146123003214598, + "step": 14 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 39.68741922264209, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -1.4316121339797974, + "logits/rejected": -1.4272632598876953, + "logps/chosen": -43.43450927734375, + "logps/rejected": -52.10813903808594, + "loss": 0.6677, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05863571539521217, + "rewards/margins": 0.10169437527656555, + "rewards/rejected": -0.04305865988135338, + "step": 15 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 37.66976673357867, + "learning_rate": 5.925925925925926e-08, + "logits/chosen": -1.4010733366012573, + "logits/rejected": -1.4859191179275513, + "logps/chosen": -44.7211799621582, + "logps/rejected": -54.39717102050781, + "loss": 0.7125, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.021378686651587486, + "rewards/margins": -0.03166306018829346, + "rewards/rejected": 0.010284376330673695, + "step": 16 + }, + { + "epoch": 0.10074074074074074, + "grad_norm": 40.434320443594345, + "learning_rate": 6.296296296296296e-08, + "logits/chosen": -1.587019681930542, + "logits/rejected": -1.583219289779663, + "logps/chosen": -56.18910598754883, + "logps/rejected": -54.055110931396484, + "loss": 0.69, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02960667759180069, + "rewards/margins": 0.0984027162194252, + "rewards/rejected": -0.06879603862762451, + "step": 17 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 38.09471520711338, + "learning_rate": 6.666666666666667e-08, + "logits/chosen": -1.4128365516662598, + "logits/rejected": -1.4133434295654297, + "logps/chosen": -60.68763732910156, + "logps/rejected": -49.51874542236328, + "loss": 0.6935, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.01688566617667675, + "rewards/margins": -0.001714322715997696, + "rewards/rejected": 0.018599988892674446, + "step": 18 + }, + { + "epoch": 0.11259259259259259, + "grad_norm": 39.11514807718614, + "learning_rate": 7.037037037037038e-08, + "logits/chosen": -1.489649772644043, + "logits/rejected": -1.4198248386383057, + "logps/chosen": -50.41545867919922, + "logps/rejected": -52.61773681640625, + "loss": 0.6957, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07306747138500214, + "rewards/margins": 0.03087785467505455, + "rewards/rejected": 0.042189620435237885, + "step": 19 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 39.97363440787134, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -1.3663185834884644, + "logits/rejected": -1.535215973854065, + "logps/chosen": -43.01957702636719, + "logps/rejected": -57.80477523803711, + "loss": 0.6968, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0261086318641901, + "rewards/margins": -0.05911754071712494, + "rewards/rejected": 0.03300891071557999, + "step": 20 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 41.07559338556434, + "learning_rate": 7.777777777777778e-08, + "logits/chosen": -1.3014625310897827, + "logits/rejected": -1.3382461071014404, + "logps/chosen": -49.614036560058594, + "logps/rejected": -62.0330810546875, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.022783983498811722, + "rewards/margins": 0.005002446472644806, + "rewards/rejected": 0.017781544476747513, + "step": 21 + }, + { + "epoch": 0.13037037037037036, + "grad_norm": 38.35221350226358, + "learning_rate": 8.148148148148149e-08, + "logits/chosen": -1.3671640157699585, + "logits/rejected": -1.4167208671569824, + "logps/chosen": -36.43228530883789, + "logps/rejected": -43.55463409423828, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04212541878223419, + "rewards/margins": 0.004528047516942024, + "rewards/rejected": 0.03759737312793732, + "step": 22 + }, + { + "epoch": 0.1362962962962963, + "grad_norm": 38.57262376611147, + "learning_rate": 8.518518518518517e-08, + "logits/chosen": -1.4059712886810303, + "logits/rejected": -1.4144426584243774, + "logps/chosen": -43.566017150878906, + "logps/rejected": -51.942710876464844, + "loss": 0.6972, + "rewards/accuracies": 0.4375, + "rewards/chosen": -4.487065598368645e-05, + "rewards/margins": -0.00031371042132377625, + "rewards/rejected": 0.00026884046383202076, + "step": 23 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 38.93480155482321, + "learning_rate": 8.888888888888888e-08, + "logits/chosen": -1.4173849821090698, + "logits/rejected": -1.436640977859497, + "logps/chosen": -49.499488830566406, + "logps/rejected": -61.60590362548828, + "loss": 0.6928, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.043488118797540665, + "rewards/margins": 0.019792892038822174, + "rewards/rejected": -0.06328101456165314, + "step": 24 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 36.27802321770151, + "learning_rate": 9.259259259259258e-08, + "logits/chosen": -1.4764326810836792, + "logits/rejected": -1.6459614038467407, + "logps/chosen": -43.81329345703125, + "logps/rejected": -57.16102981567383, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007635999470949173, + "rewards/margins": 0.027008555829524994, + "rewards/rejected": -0.01937256008386612, + "step": 25 + }, + { + "epoch": 0.15407407407407409, + "grad_norm": 41.196033400227655, + "learning_rate": 9.629629629629629e-08, + "logits/chosen": -1.4573525190353394, + "logits/rejected": -1.5029797554016113, + "logps/chosen": -53.4759521484375, + "logps/rejected": -60.73443603515625, + "loss": 0.7043, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01736607775092125, + "rewards/margins": 0.027802232652902603, + "rewards/rejected": -0.010436153039336205, + "step": 26 + }, + { + "epoch": 0.16, + "grad_norm": 44.1756798619816, + "learning_rate": 1e-07, + "logits/chosen": -1.467996597290039, + "logits/rejected": -1.4205275774002075, + "logps/chosen": -59.43016815185547, + "logps/rejected": -52.35725784301758, + "loss": 0.69, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.014531444758176804, + "rewards/margins": 0.015589211136102676, + "rewards/rejected": -0.001057768240571022, + "step": 27 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 38.02358706006054, + "learning_rate": 1.0370370370370369e-07, + "logits/chosen": -1.4217634201049805, + "logits/rejected": -1.4778153896331787, + "logps/chosen": -45.79845428466797, + "logps/rejected": -53.97588348388672, + "loss": 0.7112, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.012649372220039368, + "rewards/margins": -0.03449837863445282, + "rewards/rejected": 0.04714775085449219, + "step": 28 + }, + { + "epoch": 0.17185185185185184, + "grad_norm": 36.83123424027711, + "learning_rate": 1.074074074074074e-07, + "logits/chosen": -1.3519933223724365, + "logits/rejected": -1.4353563785552979, + "logps/chosen": -33.142181396484375, + "logps/rejected": -50.9314079284668, + "loss": 0.6873, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.014912517741322517, + "rewards/margins": 0.017467448487877846, + "rewards/rejected": -0.032379962503910065, + "step": 29 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 38.82944235868051, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": -1.3293776512145996, + "logits/rejected": -1.4100325107574463, + "logps/chosen": -45.714385986328125, + "logps/rejected": -62.283958435058594, + "loss": 0.688, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007679283618927002, + "rewards/margins": -0.010211003012955189, + "rewards/rejected": 0.002531719394028187, + "step": 30 + }, + { + "epoch": 0.1837037037037037, + "grad_norm": 41.525322137451916, + "learning_rate": 1.148148148148148e-07, + "logits/chosen": -1.2476561069488525, + "logits/rejected": -1.365776538848877, + "logps/chosen": -42.67413330078125, + "logps/rejected": -58.3412971496582, + "loss": 0.7013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019379708915948868, + "rewards/margins": 0.01881086826324463, + "rewards/rejected": -0.038190580904483795, + "step": 31 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 37.28625199471067, + "learning_rate": 1.1851851851851851e-07, + "logits/chosen": -1.4711147546768188, + "logits/rejected": -1.4542691707611084, + "logps/chosen": -57.69325256347656, + "logps/rejected": -59.4173698425293, + "loss": 0.6773, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008729221299290657, + "rewards/margins": 0.050374746322631836, + "rewards/rejected": -0.059103965759277344, + "step": 32 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 38.71452491388633, + "learning_rate": 1.2222222222222222e-07, + "logits/chosen": -1.2580482959747314, + "logits/rejected": -1.3434240818023682, + "logps/chosen": -50.910953521728516, + "logps/rejected": -48.82550811767578, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006866408511996269, + "rewards/margins": 0.014877223409712315, + "rewards/rejected": -0.008010816760361195, + "step": 33 + }, + { + "epoch": 0.20148148148148148, + "grad_norm": 35.86254457726259, + "learning_rate": 1.2592592592592592e-07, + "logits/chosen": -1.5303350687026978, + "logits/rejected": -1.5936583280563354, + "logps/chosen": -44.84267044067383, + "logps/rejected": -43.60268783569336, + "loss": 0.6793, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010646509006619453, + "rewards/margins": -0.0017105825245380402, + "rewards/rejected": 0.012357092462480068, + "step": 34 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 40.452247232561945, + "learning_rate": 1.2962962962962961e-07, + "logits/chosen": -1.4947465658187866, + "logits/rejected": -1.5485597848892212, + "logps/chosen": -46.03197479248047, + "logps/rejected": -60.23851013183594, + "loss": 0.7077, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008130693808197975, + "rewards/margins": -0.0018020663410425186, + "rewards/rejected": -0.006328627001494169, + "step": 35 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 37.119370880492134, + "learning_rate": 1.3333333333333334e-07, + "logits/chosen": -1.3620657920837402, + "logits/rejected": -1.3718904256820679, + "logps/chosen": -42.104820251464844, + "logps/rejected": -60.524169921875, + "loss": 0.7101, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03648862987756729, + "rewards/margins": -0.007312392815947533, + "rewards/rejected": 0.04380102455615997, + "step": 36 + }, + { + "epoch": 0.21925925925925926, + "grad_norm": 36.94520667301027, + "learning_rate": 1.3703703703703703e-07, + "logits/chosen": -1.5933514833450317, + "logits/rejected": -1.5976018905639648, + "logps/chosen": -38.13121032714844, + "logps/rejected": -50.772254943847656, + "loss": 0.694, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.006040811538696289, + "rewards/margins": 0.006438873242586851, + "rewards/rejected": -0.012479686178267002, + "step": 37 + }, + { + "epoch": 0.22518518518518518, + "grad_norm": 39.60573256682946, + "learning_rate": 1.4074074074074075e-07, + "logits/chosen": -1.5454288721084595, + "logits/rejected": -1.6323232650756836, + "logps/chosen": -54.06110382080078, + "logps/rejected": -52.358184814453125, + "loss": 0.7076, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004916094243526459, + "rewards/margins": -0.002387760207056999, + "rewards/rejected": -0.0025283321738243103, + "step": 38 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 35.63198458544882, + "learning_rate": 1.4444444444444442e-07, + "logits/chosen": -1.275139331817627, + "logits/rejected": -1.3453179597854614, + "logps/chosen": -42.013126373291016, + "logps/rejected": -48.47417068481445, + "loss": 0.7046, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0050153713673353195, + "rewards/margins": -0.053548287600278854, + "rewards/rejected": 0.048532914370298386, + "step": 39 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 38.43712146198748, + "learning_rate": 1.4814814814814815e-07, + "logits/chosen": -1.41544771194458, + "logits/rejected": -1.4644105434417725, + "logps/chosen": -51.85929870605469, + "logps/rejected": -68.28307342529297, + "loss": 0.6958, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.028657151386141777, + "rewards/margins": -0.009387994185090065, + "rewards/rejected": 0.03804514557123184, + "step": 40 + }, + { + "epoch": 0.24296296296296296, + "grad_norm": 38.20341525404962, + "learning_rate": 1.5185185185185184e-07, + "logits/chosen": -1.6255518198013306, + "logits/rejected": -1.5988264083862305, + "logps/chosen": -53.897212982177734, + "logps/rejected": -60.99618148803711, + "loss": 0.7005, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009833859279751778, + "rewards/margins": -0.0012347670271992683, + "rewards/rejected": 0.011068630963563919, + "step": 41 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 39.044305355608955, + "learning_rate": 1.5555555555555556e-07, + "logits/chosen": -1.3904306888580322, + "logits/rejected": -1.4621787071228027, + "logps/chosen": -53.870452880859375, + "logps/rejected": -57.930389404296875, + "loss": 0.6956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030896497890353203, + "rewards/margins": 0.02933361567556858, + "rewards/rejected": 0.001562881050631404, + "step": 42 + }, + { + "epoch": 0.2548148148148148, + "grad_norm": 35.3898254807426, + "learning_rate": 1.5925925925925926e-07, + "logits/chosen": -1.6222436428070068, + "logits/rejected": -1.590828776359558, + "logps/chosen": -34.60671615600586, + "logps/rejected": -40.5382194519043, + "loss": 0.6809, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.01461029052734375, + "rewards/margins": 0.029801419004797935, + "rewards/rejected": -0.04441170394420624, + "step": 43 + }, + { + "epoch": 0.2607407407407407, + "grad_norm": 39.131055743183715, + "learning_rate": 1.6296296296296298e-07, + "logits/chosen": -1.4192861318588257, + "logits/rejected": -1.3798493146896362, + "logps/chosen": -39.57558059692383, + "logps/rejected": -57.1050910949707, + "loss": 0.6817, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06627483665943146, + "rewards/margins": 0.06393895298242569, + "rewards/rejected": 0.0023358799517154694, + "step": 44 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 39.36639547497787, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -1.402430772781372, + "logits/rejected": -1.4567971229553223, + "logps/chosen": -42.92563247680664, + "logps/rejected": -59.93791580200195, + "loss": 0.7016, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.012485409155488014, + "rewards/margins": 0.012214185670018196, + "rewards/rejected": 0.00027122534811496735, + "step": 45 + }, + { + "epoch": 0.2725925925925926, + "grad_norm": 38.47445875334048, + "learning_rate": 1.7037037037037035e-07, + "logits/chosen": -1.4151440858840942, + "logits/rejected": -1.458237648010254, + "logps/chosen": -55.77102279663086, + "logps/rejected": -57.44105911254883, + "loss": 0.6955, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.015422536060214043, + "rewards/margins": 0.024210453033447266, + "rewards/rejected": -0.008787919767200947, + "step": 46 + }, + { + "epoch": 0.2785185185185185, + "grad_norm": 35.573818690899394, + "learning_rate": 1.7407407407407407e-07, + "logits/chosen": -1.395029067993164, + "logits/rejected": -1.434049367904663, + "logps/chosen": -61.55660629272461, + "logps/rejected": -63.84947967529297, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03462505340576172, + "rewards/margins": 0.07204857468605042, + "rewards/rejected": -0.0374235175549984, + "step": 47 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 43.30636407908927, + "learning_rate": 1.7777777777777776e-07, + "logits/chosen": -1.4949756860733032, + "logits/rejected": -1.5911140441894531, + "logps/chosen": -47.98042297363281, + "logps/rejected": -54.70000457763672, + "loss": 0.6929, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04974164813756943, + "rewards/margins": 0.1031041219830513, + "rewards/rejected": -0.053362466394901276, + "step": 48 + }, + { + "epoch": 0.2903703703703704, + "grad_norm": 39.70891417030015, + "learning_rate": 1.8148148148148149e-07, + "logits/chosen": -1.4629085063934326, + "logits/rejected": -1.4793230295181274, + "logps/chosen": -42.80840301513672, + "logps/rejected": -49.654842376708984, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04174475744366646, + "rewards/margins": 0.03507488965988159, + "rewards/rejected": 0.006669867318123579, + "step": 49 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 37.54848826529638, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -1.2061647176742554, + "logits/rejected": -1.3068634271621704, + "logps/chosen": -33.46271514892578, + "logps/rejected": -49.604408264160156, + "loss": 0.6726, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014253831468522549, + "rewards/margins": 0.03934905678033829, + "rewards/rejected": -0.025095226243138313, + "step": 50 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 37.46608721291794, + "learning_rate": 1.8888888888888888e-07, + "logits/chosen": -1.4794844388961792, + "logits/rejected": -1.5499848127365112, + "logps/chosen": -58.28867721557617, + "logps/rejected": -66.0478286743164, + "loss": 0.6953, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0060877809301018715, + "rewards/margins": -0.03297767788171768, + "rewards/rejected": 0.03906545788049698, + "step": 51 + }, + { + "epoch": 0.30814814814814817, + "grad_norm": 36.44525587089733, + "learning_rate": 1.9259259259259257e-07, + "logits/chosen": -1.5470339059829712, + "logits/rejected": -1.5108013153076172, + "logps/chosen": -38.12464141845703, + "logps/rejected": -48.32713317871094, + "loss": 0.6925, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01122739352285862, + "rewards/margins": -0.008649253286421299, + "rewards/rejected": -0.0025781395379453897, + "step": 52 + }, + { + "epoch": 0.31407407407407406, + "grad_norm": 34.72154774204845, + "learning_rate": 1.962962962962963e-07, + "logits/chosen": -1.5386773347854614, + "logits/rejected": -1.5055224895477295, + "logps/chosen": -41.71277618408203, + "logps/rejected": -41.09404754638672, + "loss": 0.7031, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.019169487059116364, + "rewards/margins": -0.00928646419197321, + "rewards/rejected": -0.00988302007317543, + "step": 53 + }, + { + "epoch": 0.32, + "grad_norm": 38.2339131295282, + "learning_rate": 2e-07, + "logits/chosen": -1.4301159381866455, + "logits/rejected": -1.532789945602417, + "logps/chosen": -46.67141342163086, + "logps/rejected": -68.1341552734375, + "loss": 0.6707, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04000416025519371, + "rewards/margins": 0.05813169479370117, + "rewards/rejected": -0.01812753826379776, + "step": 54 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 39.43452908201783, + "learning_rate": 2.0370370370370369e-07, + "logits/chosen": -1.5669282674789429, + "logits/rejected": -1.5772104263305664, + "logps/chosen": -41.083221435546875, + "logps/rejected": -58.070709228515625, + "loss": 0.6709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.042357515543699265, + "rewards/margins": 0.09110362827777863, + "rewards/rejected": -0.04874611273407936, + "step": 55 + }, + { + "epoch": 0.33185185185185184, + "grad_norm": 36.090920523616724, + "learning_rate": 2.0740740740740738e-07, + "logits/chosen": -1.5565029382705688, + "logits/rejected": -1.7220072746276855, + "logps/chosen": -61.488285064697266, + "logps/rejected": -65.77334594726562, + "loss": 0.6651, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08066320419311523, + "rewards/margins": 0.11545172333717346, + "rewards/rejected": -0.03478851169347763, + "step": 56 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 38.58486903890317, + "learning_rate": 2.111111111111111e-07, + "logits/chosen": -1.534893274307251, + "logits/rejected": -1.5569573640823364, + "logps/chosen": -48.127193450927734, + "logps/rejected": -49.27490997314453, + "loss": 0.6916, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.017229175195097923, + "rewards/margins": -0.015148879960179329, + "rewards/rejected": 0.03237805515527725, + "step": 57 + }, + { + "epoch": 0.3437037037037037, + "grad_norm": 36.94599798372785, + "learning_rate": 2.148148148148148e-07, + "logits/chosen": -1.4781184196472168, + "logits/rejected": -1.5944842100143433, + "logps/chosen": -46.216766357421875, + "logps/rejected": -53.072357177734375, + "loss": 0.6789, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0686614066362381, + "rewards/margins": 0.07688784599304199, + "rewards/rejected": -0.008226440288126469, + "step": 58 + }, + { + "epoch": 0.3496296296296296, + "grad_norm": 35.29207234594264, + "learning_rate": 2.1851851851851852e-07, + "logits/chosen": -1.4319119453430176, + "logits/rejected": -1.4858628511428833, + "logps/chosen": -49.45796203613281, + "logps/rejected": -48.20957946777344, + "loss": 0.675, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.004403375089168549, + "rewards/margins": 0.028496291488409042, + "rewards/rejected": -0.03289966657757759, + "step": 59 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 37.54885156466115, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -1.6024725437164307, + "logits/rejected": -1.6143323183059692, + "logps/chosen": -43.191246032714844, + "logps/rejected": -61.6234130859375, + "loss": 0.6783, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020701315253973007, + "rewards/margins": 0.021595098078250885, + "rewards/rejected": -0.042296409606933594, + "step": 60 + }, + { + "epoch": 0.36148148148148146, + "grad_norm": 36.21246088451733, + "learning_rate": 2.2592592592592591e-07, + "logits/chosen": -1.5574266910552979, + "logits/rejected": -1.6002821922302246, + "logps/chosen": -40.11199188232422, + "logps/rejected": -50.22590255737305, + "loss": 0.6813, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.029111528769135475, + "rewards/margins": 0.06680956482887268, + "rewards/rejected": -0.03769803047180176, + "step": 61 + }, + { + "epoch": 0.3674074074074074, + "grad_norm": 39.21506378340429, + "learning_rate": 2.296296296296296e-07, + "logits/chosen": -1.4695520401000977, + "logits/rejected": -1.459860920906067, + "logps/chosen": -43.92028045654297, + "logps/rejected": -49.57878112792969, + "loss": 0.6819, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.008387397974729538, + "rewards/margins": 0.02342185750603676, + "rewards/rejected": -0.015034460462629795, + "step": 62 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 37.33709678012097, + "learning_rate": 2.3333333333333333e-07, + "logits/chosen": -1.4608687162399292, + "logits/rejected": -1.545125126838684, + "logps/chosen": -46.558204650878906, + "logps/rejected": -52.98792266845703, + "loss": 0.6715, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03448185697197914, + "rewards/margins": 0.08211689442396164, + "rewards/rejected": -0.0476350337266922, + "step": 63 + }, + { + "epoch": 0.37925925925925924, + "grad_norm": 38.347601536557335, + "learning_rate": 2.3703703703703703e-07, + "logits/chosen": -1.3680189847946167, + "logits/rejected": -1.4239660501480103, + "logps/chosen": -42.97498321533203, + "logps/rejected": -61.106666564941406, + "loss": 0.675, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0033024298027157784, + "rewards/margins": 0.007563777267932892, + "rewards/rejected": -0.004261349327862263, + "step": 64 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 36.216386555506176, + "learning_rate": 2.407407407407407e-07, + "logits/chosen": -1.3454582691192627, + "logits/rejected": -1.3198963403701782, + "logps/chosen": -47.199485778808594, + "logps/rejected": -52.74918746948242, + "loss": 0.6865, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07009802013635635, + "rewards/margins": 0.08980407565832138, + "rewards/rejected": -0.019706057384610176, + "step": 65 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 35.898175335015566, + "learning_rate": 2.4444444444444445e-07, + "logits/chosen": -1.3845653533935547, + "logits/rejected": -1.3613636493682861, + "logps/chosen": -41.01805877685547, + "logps/rejected": -50.0711669921875, + "loss": 0.6704, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007085941731929779, + "rewards/margins": -0.010256503708660603, + "rewards/rejected": 0.003170562209561467, + "step": 66 + }, + { + "epoch": 0.397037037037037, + "grad_norm": 41.27409510225558, + "learning_rate": 2.4814814814814814e-07, + "logits/chosen": -1.4381773471832275, + "logits/rejected": -1.4831494092941284, + "logps/chosen": -37.221649169921875, + "logps/rejected": -44.666282653808594, + "loss": 0.6909, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.014814566820859909, + "rewards/margins": -0.015786362811923027, + "rewards/rejected": 0.0009717955254018307, + "step": 67 + }, + { + "epoch": 0.40296296296296297, + "grad_norm": 36.64194351414389, + "learning_rate": 2.5185185185185184e-07, + "logits/chosen": -1.4352205991744995, + "logits/rejected": -1.5572233200073242, + "logps/chosen": -36.161170959472656, + "logps/rejected": -60.05598831176758, + "loss": 0.6713, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05245256423950195, + "rewards/margins": 0.08905725926160812, + "rewards/rejected": -0.03660469129681587, + "step": 68 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 39.90280680196674, + "learning_rate": 2.5555555555555553e-07, + "logits/chosen": -1.5193519592285156, + "logits/rejected": -1.562377691268921, + "logps/chosen": -38.338375091552734, + "logps/rejected": -47.005638122558594, + "loss": 0.6841, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003990625962615013, + "rewards/margins": -0.0032351240515708923, + "rewards/rejected": -0.0007555019110441208, + "step": 69 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 38.199525578292175, + "learning_rate": 2.5925925925925923e-07, + "logits/chosen": -1.5483940839767456, + "logits/rejected": -1.5856201648712158, + "logps/chosen": -50.09548568725586, + "logps/rejected": -60.07989501953125, + "loss": 0.6807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.011604977771639824, + "rewards/margins": -0.018983270972967148, + "rewards/rejected": 0.007378292270004749, + "step": 70 + }, + { + "epoch": 0.42074074074074075, + "grad_norm": 40.00092221923499, + "learning_rate": 2.629629629629629e-07, + "logits/chosen": -1.4090768098831177, + "logits/rejected": -1.502636432647705, + "logps/chosen": -41.552528381347656, + "logps/rejected": -69.09208679199219, + "loss": 0.6872, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.016017243266105652, + "rewards/margins": 0.0428960807621479, + "rewards/rejected": -0.026878833770751953, + "step": 71 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 35.835263654888294, + "learning_rate": 2.6666666666666667e-07, + "logits/chosen": -1.520636796951294, + "logits/rejected": -1.5853071212768555, + "logps/chosen": -57.213993072509766, + "logps/rejected": -57.933135986328125, + "loss": 0.6606, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.032182980328798294, + "rewards/margins": 0.08221760392189026, + "rewards/rejected": -0.050034623593091965, + "step": 72 + }, + { + "epoch": 0.4325925925925926, + "grad_norm": 35.23686966256771, + "learning_rate": 2.7037037037037037e-07, + "logits/chosen": -1.522557258605957, + "logits/rejected": -1.5760351419448853, + "logps/chosen": -40.514347076416016, + "logps/rejected": -55.03413391113281, + "loss": 0.6688, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.033225104212760925, + "rewards/margins": 0.06852450221776962, + "rewards/rejected": -0.0352993980050087, + "step": 73 + }, + { + "epoch": 0.43851851851851853, + "grad_norm": 35.70854416440807, + "learning_rate": 2.7407407407407406e-07, + "logits/chosen": -1.4880564212799072, + "logits/rejected": -1.5454859733581543, + "logps/chosen": -42.971046447753906, + "logps/rejected": -52.73062515258789, + "loss": 0.6735, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04866039752960205, + "rewards/margins": 0.0654120221734047, + "rewards/rejected": -0.016751624643802643, + "step": 74 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 40.60623683148219, + "learning_rate": 2.7777777777777776e-07, + "logits/chosen": -1.4794213771820068, + "logits/rejected": -1.5459836721420288, + "logps/chosen": -51.75897979736328, + "logps/rejected": -66.92141723632812, + "loss": 0.6712, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.055379390716552734, + "rewards/margins": 0.06052131950855255, + "rewards/rejected": -0.005141926929354668, + "step": 75 + }, + { + "epoch": 0.45037037037037037, + "grad_norm": 35.71596057516897, + "learning_rate": 2.814814814814815e-07, + "logits/chosen": -1.5502272844314575, + "logits/rejected": -1.542878270149231, + "logps/chosen": -44.19514846801758, + "logps/rejected": -54.45147705078125, + "loss": 0.6756, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.046294547617435455, + "rewards/margins": 0.07543890923261642, + "rewards/rejected": -0.02914435788989067, + "step": 76 + }, + { + "epoch": 0.4562962962962963, + "grad_norm": 36.700424780574956, + "learning_rate": 2.851851851851852e-07, + "logits/chosen": -1.4337775707244873, + "logits/rejected": -1.5133496522903442, + "logps/chosen": -50.23101043701172, + "logps/rejected": -58.330902099609375, + "loss": 0.6959, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0007759109139442444, + "rewards/margins": 0.04830126464366913, + "rewards/rejected": -0.04752536118030548, + "step": 77 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 40.066966979685844, + "learning_rate": 2.8888888888888885e-07, + "logits/chosen": -1.5571997165679932, + "logits/rejected": -1.5534377098083496, + "logps/chosen": -40.768104553222656, + "logps/rejected": -54.30720138549805, + "loss": 0.6604, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00044529326260089874, + "rewards/margins": 0.07285849750041962, + "rewards/rejected": -0.07241320610046387, + "step": 78 + }, + { + "epoch": 0.46814814814814815, + "grad_norm": 33.600126630332284, + "learning_rate": 2.9259259259259254e-07, + "logits/chosen": -1.3403937816619873, + "logits/rejected": -1.4642250537872314, + "logps/chosen": -50.575172424316406, + "logps/rejected": -57.23987579345703, + "loss": 0.6658, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0686764270067215, + "rewards/margins": 0.051285505294799805, + "rewards/rejected": 0.017390919849276543, + "step": 79 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 34.57158710893072, + "learning_rate": 2.962962962962963e-07, + "logits/chosen": -1.5943031311035156, + "logits/rejected": -1.5197596549987793, + "logps/chosen": -49.53266525268555, + "logps/rejected": -63.10749816894531, + "loss": 0.6404, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1665962040424347, + "rewards/margins": 0.20507952570915222, + "rewards/rejected": -0.03848333656787872, + "step": 80 + }, + { + "epoch": 0.48, + "grad_norm": 35.921264310459016, + "learning_rate": 3e-07, + "logits/chosen": -1.4847490787506104, + "logits/rejected": -1.4525268077850342, + "logps/chosen": -55.6772346496582, + "logps/rejected": -56.0514030456543, + "loss": 0.6712, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07062974572181702, + "rewards/margins": 0.11072392761707306, + "rewards/rejected": -0.04009418934583664, + "step": 81 + }, + { + "epoch": 0.48592592592592593, + "grad_norm": 37.39365265650487, + "learning_rate": 3.037037037037037e-07, + "logits/chosen": -1.2456742525100708, + "logits/rejected": -1.3491311073303223, + "logps/chosen": -46.3619499206543, + "logps/rejected": -55.591796875, + "loss": 0.6565, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027526114135980606, + "rewards/margins": 0.1402171403169632, + "rewards/rejected": -0.11269102245569229, + "step": 82 + }, + { + "epoch": 0.4918518518518519, + "grad_norm": 33.32492318487895, + "learning_rate": 3.074074074074074e-07, + "logits/chosen": -1.5503648519515991, + "logits/rejected": -1.4737271070480347, + "logps/chosen": -47.40400695800781, + "logps/rejected": -55.980777740478516, + "loss": 0.651, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04232100397348404, + "rewards/margins": 0.012534653767943382, + "rewards/rejected": 0.029786348342895508, + "step": 83 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 36.39089635298574, + "learning_rate": 3.111111111111111e-07, + "logits/chosen": -1.457646369934082, + "logits/rejected": -1.4835176467895508, + "logps/chosen": -49.07018280029297, + "logps/rejected": -51.966102600097656, + "loss": 0.6677, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07979030907154083, + "rewards/margins": 0.06654424965381622, + "rewards/rejected": 0.013246058486402035, + "step": 84 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 35.4105632135552, + "learning_rate": 3.148148148148148e-07, + "logits/chosen": -1.4854512214660645, + "logits/rejected": -1.4701765775680542, + "logps/chosen": -46.278377532958984, + "logps/rejected": -56.43070983886719, + "loss": 0.6584, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08042871952056885, + "rewards/margins": 0.10214265435934067, + "rewards/rejected": -0.021713927388191223, + "step": 85 + }, + { + "epoch": 0.5096296296296297, + "grad_norm": 36.96501992841717, + "learning_rate": 3.185185185185185e-07, + "logits/chosen": -1.3575414419174194, + "logits/rejected": -1.4066803455352783, + "logps/chosen": -54.694374084472656, + "logps/rejected": -63.215087890625, + "loss": 0.6543, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12399441003799438, + "rewards/margins": 0.10921817272901535, + "rewards/rejected": 0.014776226133108139, + "step": 86 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 35.591944225315494, + "learning_rate": 3.222222222222222e-07, + "logits/chosen": -1.3541371822357178, + "logits/rejected": -1.4538600444793701, + "logps/chosen": -45.43539810180664, + "logps/rejected": -61.852481842041016, + "loss": 0.6623, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04965083673596382, + "rewards/margins": 0.02898262068629265, + "rewards/rejected": 0.02066822536289692, + "step": 87 + }, + { + "epoch": 0.5214814814814814, + "grad_norm": 34.4083607278657, + "learning_rate": 3.2592592592592596e-07, + "logits/chosen": -1.3457781076431274, + "logits/rejected": -1.4116525650024414, + "logps/chosen": -37.99258804321289, + "logps/rejected": -58.2638053894043, + "loss": 0.6529, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10953948646783829, + "rewards/margins": 0.05180351808667183, + "rewards/rejected": 0.05773596838116646, + "step": 88 + }, + { + "epoch": 0.5274074074074074, + "grad_norm": 39.99150012269121, + "learning_rate": 3.296296296296296e-07, + "logits/chosen": -1.4930334091186523, + "logits/rejected": -1.468008041381836, + "logps/chosen": -52.282691955566406, + "logps/rejected": -51.99797439575195, + "loss": 0.6597, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02082650549709797, + "rewards/margins": 0.06192145496606827, + "rewards/rejected": -0.04109494760632515, + "step": 89 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 34.13640419254932, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -1.551436185836792, + "logits/rejected": -1.5505115985870361, + "logps/chosen": -50.10906982421875, + "logps/rejected": -51.82029724121094, + "loss": 0.6471, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14836148917675018, + "rewards/margins": 0.1573827564716339, + "rewards/rejected": -0.009021259844303131, + "step": 90 + }, + { + "epoch": 0.5392592592592592, + "grad_norm": 36.45897034379589, + "learning_rate": 3.37037037037037e-07, + "logits/chosen": -1.4737859964370728, + "logits/rejected": -1.5076937675476074, + "logps/chosen": -48.10447692871094, + "logps/rejected": -52.40055847167969, + "loss": 0.6466, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.14476074278354645, + "rewards/margins": 0.14221321046352386, + "rewards/rejected": 0.0025475500151515007, + "step": 91 + }, + { + "epoch": 0.5451851851851852, + "grad_norm": 33.40862505768372, + "learning_rate": 3.407407407407407e-07, + "logits/chosen": -1.2972910404205322, + "logits/rejected": -1.3978445529937744, + "logps/chosen": -42.778289794921875, + "logps/rejected": -53.14482879638672, + "loss": 0.6351, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06479823589324951, + "rewards/margins": 0.11493370682001114, + "rewards/rejected": -0.05013547092676163, + "step": 92 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 35.19026968431985, + "learning_rate": 3.4444444444444444e-07, + "logits/chosen": -1.6557621955871582, + "logits/rejected": -1.7111676931381226, + "logps/chosen": -48.467952728271484, + "logps/rejected": -46.97455596923828, + "loss": 0.6568, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.07854614406824112, + "rewards/margins": -0.029709193855524063, + "rewards/rejected": 0.10825533419847488, + "step": 93 + }, + { + "epoch": 0.557037037037037, + "grad_norm": 33.64323632736495, + "learning_rate": 3.4814814814814814e-07, + "logits/chosen": -1.4123013019561768, + "logits/rejected": -1.5394983291625977, + "logps/chosen": -49.88652038574219, + "logps/rejected": -44.77579879760742, + "loss": 0.6164, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10286030173301697, + "rewards/margins": 0.14824065566062927, + "rewards/rejected": -0.045380350202322006, + "step": 94 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 38.945535859981476, + "learning_rate": 3.5185185185185183e-07, + "logits/chosen": -1.3453272581100464, + "logits/rejected": -1.3954417705535889, + "logps/chosen": -43.751930236816406, + "logps/rejected": -54.8797607421875, + "loss": 0.6386, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17014189064502716, + "rewards/margins": 0.1695454716682434, + "rewards/rejected": 0.0005964227020740509, + "step": 95 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 35.64681034806811, + "learning_rate": 3.5555555555555553e-07, + "logits/chosen": -1.5312745571136475, + "logits/rejected": -1.5875881910324097, + "logps/chosen": -57.08066940307617, + "logps/rejected": -68.38322448730469, + "loss": 0.6137, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1007266640663147, + "rewards/margins": 0.1999504566192627, + "rewards/rejected": -0.0992238000035286, + "step": 96 + }, + { + "epoch": 0.5748148148148148, + "grad_norm": 39.16907729636469, + "learning_rate": 3.592592592592593e-07, + "logits/chosen": -1.387762427330017, + "logits/rejected": -1.3342286348342896, + "logps/chosen": -46.985740661621094, + "logps/rejected": -58.00916290283203, + "loss": 0.6367, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04014458507299423, + "rewards/margins": 0.12731415033340454, + "rewards/rejected": -0.08716955780982971, + "step": 97 + }, + { + "epoch": 0.5807407407407408, + "grad_norm": 37.66257951373822, + "learning_rate": 3.6296296296296297e-07, + "logits/chosen": -1.6567975282669067, + "logits/rejected": -1.6726268529891968, + "logps/chosen": -47.0131950378418, + "logps/rejected": -54.135921478271484, + "loss": 0.6292, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09105877578258514, + "rewards/margins": 0.16023945808410645, + "rewards/rejected": -0.0691806823015213, + "step": 98 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 38.05243257959752, + "learning_rate": 3.666666666666666e-07, + "logits/chosen": -1.4999873638153076, + "logits/rejected": -1.5060404539108276, + "logps/chosen": -31.823820114135742, + "logps/rejected": -53.95246887207031, + "loss": 0.6663, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07480905205011368, + "rewards/margins": 0.04157290235161781, + "rewards/rejected": 0.033236145973205566, + "step": 99 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 35.02084859940032, + "learning_rate": 3.703703703703703e-07, + "logits/chosen": -1.4343098402023315, + "logits/rejected": -1.4538511037826538, + "logps/chosen": -50.422210693359375, + "logps/rejected": -68.70233917236328, + "loss": 0.6379, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.032022904604673386, + "rewards/margins": 0.10641990602016449, + "rewards/rejected": -0.0743969976902008, + "step": 100 + }, + { + "epoch": 0.5985185185185186, + "grad_norm": 34.97282723129252, + "learning_rate": 3.7407407407407406e-07, + "logits/chosen": -1.4774181842803955, + "logits/rejected": -1.4857224225997925, + "logps/chosen": -43.08037185668945, + "logps/rejected": -58.0771598815918, + "loss": 0.6229, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1684502214193344, + "rewards/margins": 0.2295190393924713, + "rewards/rejected": -0.0610688254237175, + "step": 101 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 33.28326626037529, + "learning_rate": 3.7777777777777775e-07, + "logits/chosen": -1.4652920961380005, + "logits/rejected": -1.4563056230545044, + "logps/chosen": -40.73276901245117, + "logps/rejected": -44.06425476074219, + "loss": 0.5857, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20217598974704742, + "rewards/margins": 0.33883577585220337, + "rewards/rejected": -0.13665977120399475, + "step": 102 + }, + { + "epoch": 0.6103703703703703, + "grad_norm": 32.09719371496252, + "learning_rate": 3.8148148148148145e-07, + "logits/chosen": -1.7163668870925903, + "logits/rejected": -1.633704662322998, + "logps/chosen": -44.5764045715332, + "logps/rejected": -43.08470153808594, + "loss": 0.6316, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11517582833766937, + "rewards/margins": 0.22480958700180054, + "rewards/rejected": -0.10963378101587296, + "step": 103 + }, + { + "epoch": 0.6162962962962963, + "grad_norm": 34.81825629167962, + "learning_rate": 3.8518518518518515e-07, + "logits/chosen": -1.4617501497268677, + "logits/rejected": -1.5581308603286743, + "logps/chosen": -47.830772399902344, + "logps/rejected": -57.300086975097656, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0872277244925499, + "rewards/margins": 0.3060223162174225, + "rewards/rejected": -0.218794584274292, + "step": 104 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 36.39266070401373, + "learning_rate": 3.888888888888889e-07, + "logits/chosen": -1.5160835981369019, + "logits/rejected": -1.6838594675064087, + "logps/chosen": -42.137794494628906, + "logps/rejected": -65.07337951660156, + "loss": 0.6411, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12303625047206879, + "rewards/margins": 0.14647573232650757, + "rewards/rejected": -0.023439496755599976, + "step": 105 + }, + { + "epoch": 0.6281481481481481, + "grad_norm": 31.850193009417975, + "learning_rate": 3.925925925925926e-07, + "logits/chosen": -1.4147698879241943, + "logits/rejected": -1.517427921295166, + "logps/chosen": -54.854183197021484, + "logps/rejected": -51.17685317993164, + "loss": 0.586, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11260886490345001, + "rewards/margins": 0.28815436363220215, + "rewards/rejected": -0.17554549872875214, + "step": 106 + }, + { + "epoch": 0.6340740740740741, + "grad_norm": 34.65709673375601, + "learning_rate": 3.962962962962963e-07, + "logits/chosen": -1.3840844631195068, + "logits/rejected": -1.4422510862350464, + "logps/chosen": -52.301841735839844, + "logps/rejected": -68.42982482910156, + "loss": 0.5803, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.12791098654270172, + "rewards/margins": 0.31330111622810364, + "rewards/rejected": -0.18539008498191833, + "step": 107 + }, + { + "epoch": 0.64, + "grad_norm": 32.89951039869555, + "learning_rate": 4e-07, + "logits/chosen": -1.41471266746521, + "logits/rejected": -1.467881202697754, + "logps/chosen": -52.74435806274414, + "logps/rejected": -60.40370559692383, + "loss": 0.5749, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07138564437627792, + "rewards/margins": 0.2469371259212494, + "rewards/rejected": -0.17555147409439087, + "step": 108 + }, + { + "epoch": 0.6459259259259259, + "grad_norm": 33.493433739647735, + "learning_rate": 4.0370370370370373e-07, + "logits/chosen": -1.1759183406829834, + "logits/rejected": -1.2476868629455566, + "logps/chosen": -47.751792907714844, + "logps/rejected": -67.92081451416016, + "loss": 0.5857, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11714056879281998, + "rewards/margins": 0.45923230051994324, + "rewards/rejected": -0.34209170937538147, + "step": 109 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 33.53546295318829, + "learning_rate": 4.0740740740740737e-07, + "logits/chosen": -1.5570002794265747, + "logits/rejected": -1.5066514015197754, + "logps/chosen": -50.70806884765625, + "logps/rejected": -60.30607223510742, + "loss": 0.6235, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04970753192901611, + "rewards/margins": 0.17559602856636047, + "rewards/rejected": -0.12588849663734436, + "step": 110 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 32.24746058891704, + "learning_rate": 4.1111111111111107e-07, + "logits/chosen": -1.5236682891845703, + "logits/rejected": -1.4900072813034058, + "logps/chosen": -42.65536880493164, + "logps/rejected": -52.812469482421875, + "loss": 0.6056, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12369166314601898, + "rewards/margins": 0.23955130577087402, + "rewards/rejected": -0.11585965007543564, + "step": 111 + }, + { + "epoch": 0.6637037037037037, + "grad_norm": 37.062556098993824, + "learning_rate": 4.1481481481481476e-07, + "logits/chosen": -1.5205527544021606, + "logits/rejected": -1.4171969890594482, + "logps/chosen": -62.091796875, + "logps/rejected": -56.35509490966797, + "loss": 0.6062, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14249742031097412, + "rewards/margins": 0.30201977491378784, + "rewards/rejected": -0.15952235460281372, + "step": 112 + }, + { + "epoch": 0.6696296296296296, + "grad_norm": 33.200484026739076, + "learning_rate": 4.185185185185185e-07, + "logits/chosen": -1.4046220779418945, + "logits/rejected": -1.4927732944488525, + "logps/chosen": -41.91143035888672, + "logps/rejected": -46.92932891845703, + "loss": 0.612, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02149050310254097, + "rewards/margins": 0.1793198436498642, + "rewards/rejected": -0.15782934427261353, + "step": 113 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 35.094056061875875, + "learning_rate": 4.222222222222222e-07, + "logits/chosen": -1.3244975805282593, + "logits/rejected": -1.439337968826294, + "logps/chosen": -45.62443542480469, + "logps/rejected": -50.95753479003906, + "loss": 0.5868, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.02482125535607338, + "rewards/margins": 0.12921854853630066, + "rewards/rejected": -0.10439729690551758, + "step": 114 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 32.502983592668585, + "learning_rate": 4.259259259259259e-07, + "logits/chosen": -1.24369478225708, + "logits/rejected": -1.3050041198730469, + "logps/chosen": -51.76207733154297, + "logps/rejected": -61.87729263305664, + "loss": 0.5588, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29445695877075195, + "rewards/margins": 0.5628924369812012, + "rewards/rejected": -0.2684354782104492, + "step": 115 + }, + { + "epoch": 0.6874074074074074, + "grad_norm": 29.974843969255893, + "learning_rate": 4.296296296296296e-07, + "logits/chosen": -1.5716416835784912, + "logits/rejected": -1.5490162372589111, + "logps/chosen": -45.147708892822266, + "logps/rejected": -52.94227981567383, + "loss": 0.5554, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14993491768836975, + "rewards/margins": 0.33745861053466797, + "rewards/rejected": -0.1875237226486206, + "step": 116 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 34.28074065539359, + "learning_rate": 4.3333333333333335e-07, + "logits/chosen": -1.427602767944336, + "logits/rejected": -1.4697047472000122, + "logps/chosen": -50.961883544921875, + "logps/rejected": -58.997623443603516, + "loss": 0.5549, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08439350128173828, + "rewards/margins": 0.42768120765686035, + "rewards/rejected": -0.34328773617744446, + "step": 117 + }, + { + "epoch": 0.6992592592592592, + "grad_norm": 31.518026578609504, + "learning_rate": 4.3703703703703704e-07, + "logits/chosen": -1.4572179317474365, + "logits/rejected": -1.3873144388198853, + "logps/chosen": -41.546226501464844, + "logps/rejected": -49.489315032958984, + "loss": 0.5735, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05958416312932968, + "rewards/margins": 0.34586769342422485, + "rewards/rejected": -0.28628358244895935, + "step": 118 + }, + { + "epoch": 0.7051851851851851, + "grad_norm": 33.033357331372045, + "learning_rate": 4.4074074074074074e-07, + "logits/chosen": -1.4856173992156982, + "logits/rejected": -1.502893090248108, + "logps/chosen": -42.551170349121094, + "logps/rejected": -47.76485824584961, + "loss": 0.5414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.024673819541931152, + "rewards/margins": 0.4481239914894104, + "rewards/rejected": -0.42345017194747925, + "step": 119 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 33.62289241183141, + "learning_rate": 4.444444444444444e-07, + "logits/chosen": -1.4512724876403809, + "logits/rejected": -1.459259033203125, + "logps/chosen": -35.332374572753906, + "logps/rejected": -48.842464447021484, + "loss": 0.5958, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10679290443658829, + "rewards/margins": 0.1940063238143921, + "rewards/rejected": -0.0872134268283844, + "step": 120 + }, + { + "epoch": 0.717037037037037, + "grad_norm": 33.21896811018362, + "learning_rate": 4.4814814814814813e-07, + "logits/chosen": -1.4697211980819702, + "logits/rejected": -1.5072273015975952, + "logps/chosen": -49.4876823425293, + "logps/rejected": -63.57400894165039, + "loss": 0.5228, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07271251082420349, + "rewards/margins": 0.3985411524772644, + "rewards/rejected": -0.3258286416530609, + "step": 121 + }, + { + "epoch": 0.7229629629629629, + "grad_norm": 28.810988024096453, + "learning_rate": 4.5185185185185183e-07, + "logits/chosen": -1.4811208248138428, + "logits/rejected": -1.5742110013961792, + "logps/chosen": -43.195194244384766, + "logps/rejected": -52.73583221435547, + "loss": 0.5284, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2379426658153534, + "rewards/margins": 0.5730524659156799, + "rewards/rejected": -0.3351098299026489, + "step": 122 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 33.27927501039274, + "learning_rate": 4.555555555555555e-07, + "logits/chosen": -1.4409135580062866, + "logits/rejected": -1.3866451978683472, + "logps/chosen": -54.90428924560547, + "logps/rejected": -57.876712799072266, + "loss": 0.5717, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03284015506505966, + "rewards/margins": 0.30528080463409424, + "rewards/rejected": -0.272440642118454, + "step": 123 + }, + { + "epoch": 0.7348148148148148, + "grad_norm": 29.733201192052977, + "learning_rate": 4.592592592592592e-07, + "logits/chosen": -1.4358739852905273, + "logits/rejected": -1.4204962253570557, + "logps/chosen": -44.5390739440918, + "logps/rejected": -60.877220153808594, + "loss": 0.5214, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12315364927053452, + "rewards/margins": 0.4188612699508667, + "rewards/rejected": -0.295707643032074, + "step": 124 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 32.2116448331724, + "learning_rate": 4.6296296296296297e-07, + "logits/chosen": -1.5213638544082642, + "logits/rejected": -1.5197651386260986, + "logps/chosen": -55.357574462890625, + "logps/rejected": -57.78273010253906, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16674299538135529, + "rewards/margins": 0.4866538643836975, + "rewards/rejected": -0.31991085410118103, + "step": 125 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 30.88294089490252, + "learning_rate": 4.6666666666666666e-07, + "logits/chosen": -1.4147846698760986, + "logits/rejected": -1.5049686431884766, + "logps/chosen": -52.510929107666016, + "logps/rejected": -57.91138458251953, + "loss": 0.5606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10760531574487686, + "rewards/margins": 0.30788373947143555, + "rewards/rejected": -0.20027843117713928, + "step": 126 + }, + { + "epoch": 0.7525925925925926, + "grad_norm": 30.408789687578963, + "learning_rate": 4.7037037037037036e-07, + "logits/chosen": -1.3894518613815308, + "logits/rejected": -1.4455385208129883, + "logps/chosen": -34.467201232910156, + "logps/rejected": -48.339447021484375, + "loss": 0.5228, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10829809308052063, + "rewards/margins": 0.5127183794975281, + "rewards/rejected": -0.40442025661468506, + "step": 127 + }, + { + "epoch": 0.7585185185185185, + "grad_norm": 33.008401257466446, + "learning_rate": 4.7407407407407405e-07, + "logits/chosen": -1.4528629779815674, + "logits/rejected": -1.4002412557601929, + "logps/chosen": -43.918251037597656, + "logps/rejected": -54.43046569824219, + "loss": 0.5779, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.017867133021354675, + "rewards/margins": 0.2733150124549866, + "rewards/rejected": -0.2554478645324707, + "step": 128 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 30.60766339230095, + "learning_rate": 4.777777777777778e-07, + "logits/chosen": -1.5056837797164917, + "logits/rejected": -1.4879945516586304, + "logps/chosen": -52.15769958496094, + "logps/rejected": -57.460479736328125, + "loss": 0.5182, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2870657444000244, + "rewards/margins": 0.5349029898643494, + "rewards/rejected": -0.24783721566200256, + "step": 129 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 30.92722489889393, + "learning_rate": 4.814814814814814e-07, + "logits/chosen": -1.4615930318832397, + "logits/rejected": -1.4664386510849, + "logps/chosen": -44.34745788574219, + "logps/rejected": -59.45315170288086, + "loss": 0.5232, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05535607412457466, + "rewards/margins": 0.354500949382782, + "rewards/rejected": -0.2991448640823364, + "step": 130 + }, + { + "epoch": 0.7762962962962963, + "grad_norm": 31.911195699332875, + "learning_rate": 4.851851851851852e-07, + "logits/chosen": -1.3491640090942383, + "logits/rejected": -1.3923466205596924, + "logps/chosen": -60.32138442993164, + "logps/rejected": -51.23785400390625, + "loss": 0.5426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02666044607758522, + "rewards/margins": 0.32274696230888367, + "rewards/rejected": -0.349407434463501, + "step": 131 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 32.780173075722395, + "learning_rate": 4.888888888888889e-07, + "logits/chosen": -1.3976666927337646, + "logits/rejected": -1.4585447311401367, + "logps/chosen": -53.38674545288086, + "logps/rejected": -50.861080169677734, + "loss": 0.5465, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13597837090492249, + "rewards/margins": 0.38355204463005066, + "rewards/rejected": -0.5195304155349731, + "step": 132 + }, + { + "epoch": 0.7881481481481482, + "grad_norm": 30.99528148041851, + "learning_rate": 4.925925925925926e-07, + "logits/chosen": -1.4452307224273682, + "logits/rejected": -1.5811454057693481, + "logps/chosen": -54.36809158325195, + "logps/rejected": -64.48612976074219, + "loss": 0.4654, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10771436989307404, + "rewards/margins": 0.7881489992141724, + "rewards/rejected": -0.6804346442222595, + "step": 133 + }, + { + "epoch": 0.794074074074074, + "grad_norm": 28.418046758024726, + "learning_rate": 4.962962962962963e-07, + "logits/chosen": -1.3397469520568848, + "logits/rejected": -1.2624437808990479, + "logps/chosen": -35.42027282714844, + "logps/rejected": -47.60633087158203, + "loss": 0.5167, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1339387148618698, + "rewards/margins": 0.6218531131744385, + "rewards/rejected": -0.48791444301605225, + "step": 134 + }, + { + "epoch": 0.8, + "grad_norm": 32.781124547085, + "learning_rate": 5e-07, + "logits/chosen": -1.512722373008728, + "logits/rejected": -1.5071099996566772, + "logps/chosen": -58.30361557006836, + "logps/rejected": -54.290679931640625, + "loss": 0.5407, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05489421263337135, + "rewards/margins": 0.5525442957878113, + "rewards/rejected": -0.4976501166820526, + "step": 135 + }, + { + "epoch": 0.8059259259259259, + "grad_norm": 35.08995247021387, + "learning_rate": 4.999991559718872e-07, + "logits/chosen": -1.6145902872085571, + "logits/rejected": -1.6498243808746338, + "logps/chosen": -54.22117614746094, + "logps/rejected": -68.34226989746094, + "loss": 0.5369, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.058954719454050064, + "rewards/margins": 0.47398126125335693, + "rewards/rejected": -0.5329359769821167, + "step": 136 + }, + { + "epoch": 0.8118518518518518, + "grad_norm": 34.00931163362952, + "learning_rate": 4.999966238932478e-07, + "logits/chosen": -1.4408268928527832, + "logits/rejected": -1.3998584747314453, + "logps/chosen": -58.43665313720703, + "logps/rejected": -54.154052734375, + "loss": 0.5401, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1059800386428833, + "rewards/margins": 0.38337820768356323, + "rewards/rejected": -0.48935821652412415, + "step": 137 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 28.812193750939542, + "learning_rate": 4.999924037811792e-07, + "logits/chosen": -1.337760090827942, + "logits/rejected": -1.399741768836975, + "logps/chosen": -50.198890686035156, + "logps/rejected": -69.07243347167969, + "loss": 0.4945, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10026273131370544, + "rewards/margins": 0.8603047132492065, + "rewards/rejected": -0.7600419521331787, + "step": 138 + }, + { + "epoch": 0.8237037037037037, + "grad_norm": 32.004953488834715, + "learning_rate": 4.999864956641761e-07, + "logits/chosen": -1.2986966371536255, + "logits/rejected": -1.4439055919647217, + "logps/chosen": -40.57745361328125, + "logps/rejected": -45.291465759277344, + "loss": 0.5348, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03176772594451904, + "rewards/margins": 0.40621811151504517, + "rewards/rejected": -0.4379858374595642, + "step": 139 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 28.821970631268016, + "learning_rate": 4.99978899582132e-07, + "logits/chosen": -1.4549857378005981, + "logits/rejected": -1.5415047407150269, + "logps/chosen": -48.27933883666992, + "logps/rejected": -55.57482147216797, + "loss": 0.509, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.016005441546440125, + "rewards/margins": 0.6398062705993652, + "rewards/rejected": -0.6238008737564087, + "step": 140 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 29.956916160365633, + "learning_rate": 4.999696155863368e-07, + "logits/chosen": -1.161665678024292, + "logits/rejected": -1.2288399934768677, + "logps/chosen": -38.48533248901367, + "logps/rejected": -47.85722351074219, + "loss": 0.5077, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08423295617103577, + "rewards/margins": 0.6424591541290283, + "rewards/rejected": -0.5582262277603149, + "step": 141 + }, + { + "epoch": 0.8414814814814815, + "grad_norm": 28.05076670996194, + "learning_rate": 4.999586437394786e-07, + "logits/chosen": -1.4714339971542358, + "logits/rejected": -1.5132161378860474, + "logps/chosen": -48.181156158447266, + "logps/rejected": -55.47602844238281, + "loss": 0.5124, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.003453332930803299, + "rewards/margins": 0.6579344868659973, + "rewards/rejected": -0.654481053352356, + "step": 142 + }, + { + "epoch": 0.8474074074074074, + "grad_norm": 29.989008234893884, + "learning_rate": 4.999459841156414e-07, + "logits/chosen": -1.2807202339172363, + "logits/rejected": -1.3370921611785889, + "logps/chosen": -35.43395233154297, + "logps/rejected": -40.46952819824219, + "loss": 0.5811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05782117694616318, + "rewards/margins": 0.4030662178993225, + "rewards/rejected": -0.34524503350257874, + "step": 143 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 29.438739201559326, + "learning_rate": 4.999316368003061e-07, + "logits/chosen": -1.5066066980361938, + "logits/rejected": -1.4261574745178223, + "logps/chosen": -59.00090789794922, + "logps/rejected": -62.56212615966797, + "loss": 0.5071, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1349632292985916, + "rewards/margins": 0.6473178863525391, + "rewards/rejected": -0.5123546123504639, + "step": 144 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 32.72521840563668, + "learning_rate": 4.999156018903489e-07, + "logits/chosen": -1.3574910163879395, + "logits/rejected": -1.4152084589004517, + "logps/chosen": -55.105262756347656, + "logps/rejected": -59.85123062133789, + "loss": 0.5444, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2306346744298935, + "rewards/margins": 0.2394087314605713, + "rewards/rejected": -0.47004345059394836, + "step": 145 + }, + { + "epoch": 0.8651851851851852, + "grad_norm": 30.579443998829742, + "learning_rate": 4.998978794940411e-07, + "logits/chosen": -1.5495673418045044, + "logits/rejected": -1.4402530193328857, + "logps/chosen": -55.61601257324219, + "logps/rejected": -49.91148376464844, + "loss": 0.5247, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11350102722644806, + "rewards/margins": 0.6421040296554565, + "rewards/rejected": -0.5286029577255249, + "step": 146 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 32.28607136824646, + "learning_rate": 4.998784697310482e-07, + "logits/chosen": -1.4374938011169434, + "logits/rejected": -1.5053014755249023, + "logps/chosen": -57.0212516784668, + "logps/rejected": -50.56647491455078, + "loss": 0.5361, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07709996402263641, + "rewards/margins": 0.6278308629989624, + "rewards/rejected": -0.55073082447052, + "step": 147 + }, + { + "epoch": 0.8770370370370371, + "grad_norm": 28.758614918413993, + "learning_rate": 4.998573727324294e-07, + "logits/chosen": -1.4719284772872925, + "logits/rejected": -1.4712297916412354, + "logps/chosen": -47.55272674560547, + "logps/rejected": -77.12329864501953, + "loss": 0.4827, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19793976843357086, + "rewards/margins": 0.7996434569358826, + "rewards/rejected": -0.6017036437988281, + "step": 148 + }, + { + "epoch": 0.882962962962963, + "grad_norm": 26.09070483109539, + "learning_rate": 4.998345886406365e-07, + "logits/chosen": -1.2813467979431152, + "logits/rejected": -1.4254480600357056, + "logps/chosen": -40.02638626098633, + "logps/rejected": -46.91986083984375, + "loss": 0.4656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18909288942813873, + "rewards/margins": 0.3858364522457123, + "rewards/rejected": -0.19674354791641235, + "step": 149 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 28.043633258520604, + "learning_rate": 4.998101176095128e-07, + "logits/chosen": -1.3586623668670654, + "logits/rejected": -1.3965615034103394, + "logps/chosen": -49.813751220703125, + "logps/rejected": -59.23504638671875, + "loss": 0.5246, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11088036000728607, + "rewards/margins": 0.8094318509101868, + "rewards/rejected": -0.6985514760017395, + "step": 150 + }, + { + "epoch": 0.8948148148148148, + "grad_norm": 28.749826679176486, + "learning_rate": 4.997839598042919e-07, + "logits/chosen": -1.2615379095077515, + "logits/rejected": -1.4461233615875244, + "logps/chosen": -49.363128662109375, + "logps/rejected": -53.413047790527344, + "loss": 0.4865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1839275360107422, + "rewards/margins": 0.7002414464950562, + "rewards/rejected": -0.516313910484314, + "step": 151 + }, + { + "epoch": 0.9007407407407407, + "grad_norm": 35.14940181472167, + "learning_rate": 4.997561154015975e-07, + "logits/chosen": -1.4611401557922363, + "logits/rejected": -1.5102167129516602, + "logps/chosen": -44.313316345214844, + "logps/rejected": -49.16785430908203, + "loss": 0.5811, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16626399755477905, + "rewards/margins": 0.05043494701385498, + "rewards/rejected": -0.21669892966747284, + "step": 152 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 28.277476832575395, + "learning_rate": 4.997265845894411e-07, + "logits/chosen": -1.5449285507202148, + "logits/rejected": -1.5440673828125, + "logps/chosen": -56.093666076660156, + "logps/rejected": -49.94963073730469, + "loss": 0.4695, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16555923223495483, + "rewards/margins": 0.6077948808670044, + "rewards/rejected": -0.4422355890274048, + "step": 153 + }, + { + "epoch": 0.9125925925925926, + "grad_norm": 29.18744779214114, + "learning_rate": 4.996953675672213e-07, + "logits/chosen": -1.4645702838897705, + "logits/rejected": -1.456922173500061, + "logps/chosen": -44.6121826171875, + "logps/rejected": -53.01622772216797, + "loss": 0.4783, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3893875777721405, + "rewards/margins": 0.8575869202613831, + "rewards/rejected": -0.46819937229156494, + "step": 154 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 28.117909866516314, + "learning_rate": 4.996624645457227e-07, + "logits/chosen": -1.6098130941390991, + "logits/rejected": -1.5549395084381104, + "logps/chosen": -61.51123046875, + "logps/rejected": -54.52252960205078, + "loss": 0.4959, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1888391524553299, + "rewards/margins": 0.42284536361694336, + "rewards/rejected": -0.23400622606277466, + "step": 155 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 27.640916240379614, + "learning_rate": 4.996278757471138e-07, + "logits/chosen": -1.1909761428833008, + "logits/rejected": -1.3579903841018677, + "logps/chosen": -47.18357849121094, + "logps/rejected": -53.34309387207031, + "loss": 0.4488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1449306756258011, + "rewards/margins": 0.980364203453064, + "rewards/rejected": -0.8354335427284241, + "step": 156 + }, + { + "epoch": 0.9303703703703704, + "grad_norm": 34.29518160731201, + "learning_rate": 4.995916014049461e-07, + "logits/chosen": -1.4977542161941528, + "logits/rejected": -1.5214853286743164, + "logps/chosen": -62.64368438720703, + "logps/rejected": -60.37792205810547, + "loss": 0.5706, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18178661167621613, + "rewards/margins": 0.18183384835720062, + "rewards/rejected": -0.36362048983573914, + "step": 157 + }, + { + "epoch": 0.9362962962962963, + "grad_norm": 28.776345767229973, + "learning_rate": 4.995536417641517e-07, + "logits/chosen": -1.3088951110839844, + "logits/rejected": -1.3918869495391846, + "logps/chosen": -42.50482940673828, + "logps/rejected": -51.679649353027344, + "loss": 0.4839, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.012760929763317108, + "rewards/margins": 0.6891413331031799, + "rewards/rejected": -0.7019021511077881, + "step": 158 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 27.13077469944883, + "learning_rate": 4.99513997081043e-07, + "logits/chosen": -1.5389611721038818, + "logits/rejected": -1.6600943803787231, + "logps/chosen": -43.073604583740234, + "logps/rejected": -62.15262222290039, + "loss": 0.4599, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08031359314918518, + "rewards/margins": 0.6155234575271606, + "rewards/rejected": -0.5352098941802979, + "step": 159 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 28.815045476308757, + "learning_rate": 4.994726676233097e-07, + "logits/chosen": -1.3073569536209106, + "logits/rejected": -1.3052654266357422, + "logps/chosen": -50.287288665771484, + "logps/rejected": -63.46202087402344, + "loss": 0.4595, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18923993408679962, + "rewards/margins": 0.39738625288009644, + "rewards/rejected": -0.5866261720657349, + "step": 160 + }, + { + "epoch": 0.9540740740740741, + "grad_norm": 30.804074809488196, + "learning_rate": 4.994296536700177e-07, + "logits/chosen": -1.2815802097320557, + "logits/rejected": -1.3018033504486084, + "logps/chosen": -57.259159088134766, + "logps/rejected": -71.91606903076172, + "loss": 0.4449, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25141018629074097, + "rewards/margins": 1.3318860530853271, + "rewards/rejected": -1.0804758071899414, + "step": 161 + }, + { + "epoch": 0.96, + "grad_norm": 30.186158732166184, + "learning_rate": 4.993849555116066e-07, + "logits/chosen": -1.5598193407058716, + "logits/rejected": -1.6054404973983765, + "logps/chosen": -33.970062255859375, + "logps/rejected": -45.955543518066406, + "loss": 0.492, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08695819228887558, + "rewards/margins": 0.38726359605789185, + "rewards/rejected": -0.3003053665161133, + "step": 162 + }, + { + "epoch": 0.965925925925926, + "grad_norm": 23.80589053234063, + "learning_rate": 4.993385734498887e-07, + "logits/chosen": -1.3942906856536865, + "logits/rejected": -1.3598270416259766, + "logps/chosen": -46.04230499267578, + "logps/rejected": -64.92222595214844, + "loss": 0.4004, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.055475056171417236, + "rewards/margins": 1.0718060731887817, + "rewards/rejected": -1.0163309574127197, + "step": 163 + }, + { + "epoch": 0.9718518518518519, + "grad_norm": 35.77255758808572, + "learning_rate": 4.992905077980461e-07, + "logits/chosen": -1.2013689279556274, + "logits/rejected": -1.2357840538024902, + "logps/chosen": -54.10811996459961, + "logps/rejected": -57.42766571044922, + "loss": 0.515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21006661653518677, + "rewards/margins": 0.9132038354873657, + "rewards/rejected": -1.1232705116271973, + "step": 164 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 31.178136497144976, + "learning_rate": 4.992407588806287e-07, + "logits/chosen": -1.5258628129959106, + "logits/rejected": -1.5229212045669556, + "logps/chosen": -44.14244842529297, + "logps/rejected": -53.07622528076172, + "loss": 0.5034, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16167354583740234, + "rewards/margins": 0.8384672403335571, + "rewards/rejected": -0.6767936944961548, + "step": 165 + }, + { + "epoch": 0.9837037037037037, + "grad_norm": 27.540178875328728, + "learning_rate": 4.991893270335525e-07, + "logits/chosen": -1.2302640676498413, + "logits/rejected": -1.263055682182312, + "logps/chosen": -36.58562088012695, + "logps/rejected": -54.91083526611328, + "loss": 0.4367, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26324182748794556, + "rewards/margins": 1.0756839513778687, + "rewards/rejected": -0.8124420642852783, + "step": 166 + }, + { + "epoch": 0.9896296296296296, + "grad_norm": 29.12805068065961, + "learning_rate": 4.991362126040969e-07, + "logits/chosen": -1.5359268188476562, + "logits/rejected": -1.68173348903656, + "logps/chosen": -38.305641174316406, + "logps/rejected": -54.66886520385742, + "loss": 0.5059, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1688043177127838, + "rewards/margins": 0.6014243364334106, + "rewards/rejected": -0.43262001872062683, + "step": 167 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 30.66836706642539, + "learning_rate": 4.990814159509024e-07, + "logits/chosen": -1.2793900966644287, + "logits/rejected": -1.3615354299545288, + "logps/chosen": -52.69702911376953, + "logps/rejected": -45.728546142578125, + "loss": 0.4884, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0890645831823349, + "rewards/margins": 0.5669555068016052, + "rewards/rejected": -0.6560201048851013, + "step": 168 + }, + { + "epoch": 1.0014814814814814, + "grad_norm": 29.387874732293323, + "learning_rate": 4.990249374439684e-07, + "logits/chosen": -1.24739670753479, + "logits/rejected": -1.3176863193511963, + "logps/chosen": -38.417179107666016, + "logps/rejected": -48.35388946533203, + "loss": 0.4576, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25613975524902344, + "rewards/margins": 1.03694486618042, + "rewards/rejected": -0.7808051109313965, + "step": 169 + }, + { + "epoch": 1.0074074074074073, + "grad_norm": 29.586273409908994, + "learning_rate": 4.989667774646505e-07, + "logits/chosen": -1.222827672958374, + "logits/rejected": -1.2669568061828613, + "logps/chosen": -56.15669631958008, + "logps/rejected": -51.149871826171875, + "loss": 0.5138, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16251149773597717, + "rewards/margins": 0.9217841029167175, + "rewards/rejected": -0.7592726945877075, + "step": 170 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 29.651889304281102, + "learning_rate": 4.989069364056579e-07, + "logits/chosen": -1.3727763891220093, + "logits/rejected": -1.366886019706726, + "logps/chosen": -48.37831115722656, + "logps/rejected": -37.613990783691406, + "loss": 0.4543, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05444521829485893, + "rewards/margins": 0.8736615777015686, + "rewards/rejected": -0.8192163705825806, + "step": 171 + }, + { + "epoch": 1.0192592592592593, + "grad_norm": 24.451172053007166, + "learning_rate": 4.98845414671051e-07, + "logits/chosen": -1.58548903465271, + "logits/rejected": -1.5485515594482422, + "logps/chosen": -46.289329528808594, + "logps/rejected": -54.086387634277344, + "loss": 0.3671, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.090296670794487, + "rewards/margins": 1.2733991146087646, + "rewards/rejected": -1.1831023693084717, + "step": 172 + }, + { + "epoch": 1.0251851851851852, + "grad_norm": 25.18785787450401, + "learning_rate": 4.987822126762382e-07, + "logits/chosen": -1.3124250173568726, + "logits/rejected": -1.350297212600708, + "logps/chosen": -51.7904167175293, + "logps/rejected": -57.94340896606445, + "loss": 0.3698, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24363113939762115, + "rewards/margins": 1.3216978311538696, + "rewards/rejected": -1.0780668258666992, + "step": 173 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 27.160374266755905, + "learning_rate": 4.987173308479737e-07, + "logits/chosen": -1.3497616052627563, + "logits/rejected": -1.3936090469360352, + "logps/chosen": -51.94996643066406, + "logps/rejected": -61.93647384643555, + "loss": 0.4425, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05416693538427353, + "rewards/margins": 1.2709107398986816, + "rewards/rejected": -1.2167439460754395, + "step": 174 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 26.184809377938368, + "learning_rate": 4.986507696243543e-07, + "logits/chosen": -1.4122731685638428, + "logits/rejected": -1.4073926210403442, + "logps/chosen": -44.906639099121094, + "logps/rejected": -60.822059631347656, + "loss": 0.4117, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015085561200976372, + "rewards/margins": 1.0686676502227783, + "rewards/rejected": -1.0535821914672852, + "step": 175 + }, + { + "epoch": 1.0429629629629629, + "grad_norm": 27.884920522386153, + "learning_rate": 4.985825294548162e-07, + "logits/chosen": -1.2345750331878662, + "logits/rejected": -1.263419508934021, + "logps/chosen": -57.070350646972656, + "logps/rejected": -55.806800842285156, + "loss": 0.4092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012559197843074799, + "rewards/margins": 1.3272449970245361, + "rewards/rejected": -1.3146858215332031, + "step": 176 + }, + { + "epoch": 1.048888888888889, + "grad_norm": 29.753523592942816, + "learning_rate": 4.985126108001323e-07, + "logits/chosen": -1.339246392250061, + "logits/rejected": -1.3507680892944336, + "logps/chosen": -47.27094268798828, + "logps/rejected": -59.008148193359375, + "loss": 0.4619, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20138312876224518, + "rewards/margins": 1.376226544380188, + "rewards/rejected": -1.5776095390319824, + "step": 177 + }, + { + "epoch": 1.0548148148148149, + "grad_norm": 27.788857301475037, + "learning_rate": 4.984410141324092e-07, + "logits/chosen": -1.2924796342849731, + "logits/rejected": -1.3166478872299194, + "logps/chosen": -47.26884841918945, + "logps/rejected": -58.67825698852539, + "loss": 0.4425, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13158157467842102, + "rewards/margins": 0.8055871725082397, + "rewards/rejected": -0.9371688365936279, + "step": 178 + }, + { + "epoch": 1.0607407407407408, + "grad_norm": 26.952026837887974, + "learning_rate": 4.983677399350838e-07, + "logits/chosen": -1.3298072814941406, + "logits/rejected": -1.37056565284729, + "logps/chosen": -45.015419006347656, + "logps/rejected": -68.24140930175781, + "loss": 0.4063, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.010896991938352585, + "rewards/margins": 1.1538221836090088, + "rewards/rejected": -1.1647191047668457, + "step": 179 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 27.254102685735607, + "learning_rate": 4.982927887029197e-07, + "logits/chosen": -1.3543680906295776, + "logits/rejected": -1.4619414806365967, + "logps/chosen": -47.35694885253906, + "logps/rejected": -63.67597961425781, + "loss": 0.4239, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11707471311092377, + "rewards/margins": 0.8920746445655823, + "rewards/rejected": -0.7749999165534973, + "step": 180 + }, + { + "epoch": 1.0725925925925925, + "grad_norm": 32.735453727753146, + "learning_rate": 4.982161609420047e-07, + "logits/chosen": -1.3471641540527344, + "logits/rejected": -1.3713692426681519, + "logps/chosen": -45.259033203125, + "logps/rejected": -72.70370483398438, + "loss": 0.461, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.009991548955440521, + "rewards/margins": 1.367063283920288, + "rewards/rejected": -1.3570716381072998, + "step": 181 + }, + { + "epoch": 1.0785185185185184, + "grad_norm": 26.00414349015101, + "learning_rate": 4.981378571697466e-07, + "logits/chosen": -1.2727996110916138, + "logits/rejected": -1.2572351694107056, + "logps/chosen": -49.19256591796875, + "logps/rejected": -53.22026443481445, + "loss": 0.4732, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25015491247177124, + "rewards/margins": 0.5964915752410889, + "rewards/rejected": -0.8466465473175049, + "step": 182 + }, + { + "epoch": 1.0844444444444445, + "grad_norm": 31.07953663876989, + "learning_rate": 4.980578779148702e-07, + "logits/chosen": -1.1128710508346558, + "logits/rejected": -1.1316419839859009, + "logps/chosen": -41.58570861816406, + "logps/rejected": -55.04689025878906, + "loss": 0.4835, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11291462182998657, + "rewards/margins": 0.8946691751480103, + "rewards/rejected": -0.7817546129226685, + "step": 183 + }, + { + "epoch": 1.0903703703703704, + "grad_norm": 25.932575954455, + "learning_rate": 4.979762237174131e-07, + "logits/chosen": -1.3252081871032715, + "logits/rejected": -1.3753544092178345, + "logps/chosen": -44.07130813598633, + "logps/rejected": -65.07011413574219, + "loss": 0.4, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3121752440929413, + "rewards/margins": 0.870818018913269, + "rewards/rejected": -1.1829932928085327, + "step": 184 + }, + { + "epoch": 1.0962962962962963, + "grad_norm": 27.97479599690778, + "learning_rate": 4.978928951287232e-07, + "logits/chosen": -1.444221019744873, + "logits/rejected": -1.5286879539489746, + "logps/chosen": -58.87108612060547, + "logps/rejected": -73.10811614990234, + "loss": 0.4034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29445308446884155, + "rewards/margins": 1.092024326324463, + "rewards/rejected": -1.3864774703979492, + "step": 185 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 25.653701259430274, + "learning_rate": 4.978078927114535e-07, + "logits/chosen": -1.3871357440948486, + "logits/rejected": -1.356289029121399, + "logps/chosen": -38.077735900878906, + "logps/rejected": -47.814632415771484, + "loss": 0.3951, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3137326240539551, + "rewards/margins": 0.9380292892456055, + "rewards/rejected": -1.2517619132995605, + "step": 186 + }, + { + "epoch": 1.108148148148148, + "grad_norm": 31.129655125600156, + "learning_rate": 4.977212170395597e-07, + "logits/chosen": -1.5308648347854614, + "logits/rejected": -1.5554476976394653, + "logps/chosen": -53.46440124511719, + "logps/rejected": -63.11518096923828, + "loss": 0.4521, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.27529364824295044, + "rewards/margins": 1.0567797422409058, + "rewards/rejected": -1.3320733308792114, + "step": 187 + }, + { + "epoch": 1.114074074074074, + "grad_norm": 30.100213679819685, + "learning_rate": 4.976328686982954e-07, + "logits/chosen": -1.2549948692321777, + "logits/rejected": -1.3156468868255615, + "logps/chosen": -43.605262756347656, + "logps/rejected": -49.80906677246094, + "loss": 0.453, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.01309351995587349, + "rewards/margins": 1.2752408981323242, + "rewards/rejected": -1.2621474266052246, + "step": 188 + }, + { + "epoch": 1.12, + "grad_norm": 28.949873010790967, + "learning_rate": 4.975428482842082e-07, + "logits/chosen": -1.2587721347808838, + "logits/rejected": -1.433422565460205, + "logps/chosen": -43.67359161376953, + "logps/rejected": -71.4698257446289, + "loss": 0.4513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14791209995746613, + "rewards/margins": 0.7650717496871948, + "rewards/rejected": -0.9129838943481445, + "step": 189 + }, + { + "epoch": 1.125925925925926, + "grad_norm": 21.901565858294845, + "learning_rate": 4.974511564051367e-07, + "logits/chosen": -1.3661924600601196, + "logits/rejected": -1.418287992477417, + "logps/chosen": -42.59568786621094, + "logps/rejected": -52.789920806884766, + "loss": 0.3271, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16271501779556274, + "rewards/margins": 1.124340534210205, + "rewards/rejected": -0.9616254568099976, + "step": 190 + }, + { + "epoch": 1.1318518518518519, + "grad_norm": 27.131664647740493, + "learning_rate": 4.973577936802046e-07, + "logits/chosen": -1.3555781841278076, + "logits/rejected": -1.3374905586242676, + "logps/chosen": -50.74223327636719, + "logps/rejected": -57.81810760498047, + "loss": 0.4049, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07096290588378906, + "rewards/margins": 0.9010239243507385, + "rewards/rejected": -0.9719868302345276, + "step": 191 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 26.914531737531586, + "learning_rate": 4.972627607398182e-07, + "logits/chosen": -1.3602168560028076, + "logits/rejected": -1.4505767822265625, + "logps/chosen": -56.723594665527344, + "logps/rejected": -67.85716247558594, + "loss": 0.4419, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19218865036964417, + "rewards/margins": 0.9808802008628845, + "rewards/rejected": -1.173068881034851, + "step": 192 + }, + { + "epoch": 1.1437037037037037, + "grad_norm": 28.445309646618472, + "learning_rate": 4.971660582256614e-07, + "logits/chosen": -1.4276717901229858, + "logits/rejected": -1.4692355394363403, + "logps/chosen": -43.50633239746094, + "logps/rejected": -47.10520935058594, + "loss": 0.3911, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3843527138233185, + "rewards/margins": 0.7021951675415039, + "rewards/rejected": -1.0865478515625, + "step": 193 + }, + { + "epoch": 1.1496296296296296, + "grad_norm": 25.24571619509311, + "learning_rate": 4.970676867906911e-07, + "logits/chosen": -1.3926632404327393, + "logits/rejected": -1.4838123321533203, + "logps/chosen": -51.921207427978516, + "logps/rejected": -66.0678939819336, + "loss": 0.3295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0005776286125183105, + "rewards/margins": 1.9195170402526855, + "rewards/rejected": -1.9189393520355225, + "step": 194 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 25.789717188474583, + "learning_rate": 4.969676470991335e-07, + "logits/chosen": -1.3526115417480469, + "logits/rejected": -1.3458001613616943, + "logps/chosen": -46.61028289794922, + "logps/rejected": -65.9991455078125, + "loss": 0.4138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06992607563734055, + "rewards/margins": 1.478632926940918, + "rewards/rejected": -1.5485591888427734, + "step": 195 + }, + { + "epoch": 1.1614814814814816, + "grad_norm": 24.887314733226713, + "learning_rate": 4.96865939826479e-07, + "logits/chosen": -1.3574655055999756, + "logits/rejected": -1.4423139095306396, + "logps/chosen": -54.61656951904297, + "logps/rejected": -59.239402770996094, + "loss": 0.3951, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1115683764219284, + "rewards/margins": 1.4205255508422852, + "rewards/rejected": -1.53209388256073, + "step": 196 + }, + { + "epoch": 1.1674074074074074, + "grad_norm": 28.05153572169805, + "learning_rate": 4.967625656594781e-07, + "logits/chosen": -1.4286890029907227, + "logits/rejected": -1.3814754486083984, + "logps/chosen": -55.42445755004883, + "logps/rejected": -51.32583999633789, + "loss": 0.4464, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4502965807914734, + "rewards/margins": 0.5142717361450195, + "rewards/rejected": -0.9645683765411377, + "step": 197 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 23.637547662812796, + "learning_rate": 4.966575252961365e-07, + "logits/chosen": -1.3578917980194092, + "logits/rejected": -1.3719455003738403, + "logps/chosen": -49.35991668701172, + "logps/rejected": -51.54848098754883, + "loss": 0.3677, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2137174755334854, + "rewards/margins": 0.9547407627105713, + "rewards/rejected": -1.168458342552185, + "step": 198 + }, + { + "epoch": 1.1792592592592592, + "grad_norm": 26.3291433097316, + "learning_rate": 4.9655081944571e-07, + "logits/chosen": -1.4350993633270264, + "logits/rejected": -1.4053765535354614, + "logps/chosen": -43.111515045166016, + "logps/rejected": -50.419593811035156, + "loss": 0.4421, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22111950814723969, + "rewards/margins": 0.6849695444107056, + "rewards/rejected": -0.9060890674591064, + "step": 199 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 28.107972878485548, + "learning_rate": 4.964424488287009e-07, + "logits/chosen": -1.389237642288208, + "logits/rejected": -1.2933259010314941, + "logps/chosen": -54.438865661621094, + "logps/rejected": -51.278350830078125, + "loss": 0.437, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09371478110551834, + "rewards/margins": 0.9150485992431641, + "rewards/rejected": -1.0087634325027466, + "step": 200 + }, + { + "epoch": 1.1911111111111112, + "grad_norm": 30.07536501432843, + "learning_rate": 4.963324141768518e-07, + "logits/chosen": -1.4005461931228638, + "logits/rejected": -1.3559218645095825, + "logps/chosen": -48.987884521484375, + "logps/rejected": -62.02407455444336, + "loss": 0.4439, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.49109989404678345, + "rewards/margins": 0.905448317527771, + "rewards/rejected": -1.3965482711791992, + "step": 201 + }, + { + "epoch": 1.1970370370370371, + "grad_norm": 25.90426712829728, + "learning_rate": 4.962207162331414e-07, + "logits/chosen": -1.3146004676818848, + "logits/rejected": -1.5756001472473145, + "logps/chosen": -44.21037292480469, + "logps/rejected": -55.566314697265625, + "loss": 0.3948, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10266552865505219, + "rewards/margins": 1.2562565803527832, + "rewards/rejected": -1.3589221239089966, + "step": 202 + }, + { + "epoch": 1.202962962962963, + "grad_norm": 27.074271726368853, + "learning_rate": 4.961073557517792e-07, + "logits/chosen": -1.1685363054275513, + "logits/rejected": -1.2372539043426514, + "logps/chosen": -42.1467170715332, + "logps/rejected": -44.715858459472656, + "loss": 0.4235, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13077600300312042, + "rewards/margins": 1.0918397903442383, + "rewards/rejected": -0.961063802242279, + "step": 203 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 27.11020646169495, + "learning_rate": 4.95992333498201e-07, + "logits/chosen": -1.3404663801193237, + "logits/rejected": -1.4096630811691284, + "logps/chosen": -50.35668182373047, + "logps/rejected": -61.568275451660156, + "loss": 0.4222, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15395641326904297, + "rewards/margins": 1.2140846252441406, + "rewards/rejected": -1.3680410385131836, + "step": 204 + }, + { + "epoch": 1.2148148148148148, + "grad_norm": 25.823454078758132, + "learning_rate": 4.958756502490626e-07, + "logits/chosen": -1.3623404502868652, + "logits/rejected": -1.361901044845581, + "logps/chosen": -42.57477569580078, + "logps/rejected": -63.756248474121094, + "loss": 0.3732, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12022893130779266, + "rewards/margins": 1.9076236486434937, + "rewards/rejected": -2.0278525352478027, + "step": 205 + }, + { + "epoch": 1.2207407407407407, + "grad_norm": 23.999886792109795, + "learning_rate": 4.957573067922359e-07, + "logits/chosen": -1.3816536664962769, + "logits/rejected": -1.3499540090560913, + "logps/chosen": -42.67706298828125, + "logps/rejected": -61.45965576171875, + "loss": 0.3764, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23336170613765717, + "rewards/margins": 1.017636775970459, + "rewards/rejected": -1.2509984970092773, + "step": 206 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 24.39207189325382, + "learning_rate": 4.956373039268021e-07, + "logits/chosen": -1.1838045120239258, + "logits/rejected": -1.295671820640564, + "logps/chosen": -49.812320709228516, + "logps/rejected": -56.74592971801758, + "loss": 0.3777, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1829771101474762, + "rewards/margins": 1.9797238111495972, + "rewards/rejected": -2.162700891494751, + "step": 207 + }, + { + "epoch": 1.2325925925925927, + "grad_norm": 24.05256880321274, + "learning_rate": 4.955156424630479e-07, + "logits/chosen": -1.277360200881958, + "logits/rejected": -1.250819206237793, + "logps/chosen": -42.28697967529297, + "logps/rejected": -58.031089782714844, + "loss": 0.3739, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2733173370361328, + "rewards/margins": 1.4574378728866577, + "rewards/rejected": -1.7307552099227905, + "step": 208 + }, + { + "epoch": 1.2385185185185186, + "grad_norm": 31.62267871119725, + "learning_rate": 4.953923232224586e-07, + "logits/chosen": -1.2416062355041504, + "logits/rejected": -1.3119093179702759, + "logps/chosen": -45.55648422241211, + "logps/rejected": -50.12644577026367, + "loss": 0.4265, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2814592123031616, + "rewards/margins": 1.3618040084838867, + "rewards/rejected": -1.643263339996338, + "step": 209 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 20.192542093589164, + "learning_rate": 4.952673470377137e-07, + "logits/chosen": -1.2672306299209595, + "logits/rejected": -1.3711098432540894, + "logps/chosen": -45.2178840637207, + "logps/rejected": -69.5517349243164, + "loss": 0.3025, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.025760654360055923, + "rewards/margins": 1.5954328775405884, + "rewards/rejected": -1.5696722269058228, + "step": 210 + }, + { + "epoch": 1.2503703703703704, + "grad_norm": 29.815974642382578, + "learning_rate": 4.951407147526803e-07, + "logits/chosen": -1.4044413566589355, + "logits/rejected": -1.4227166175842285, + "logps/chosen": -50.24687194824219, + "logps/rejected": -54.735660552978516, + "loss": 0.3936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3008299469947815, + "rewards/margins": 1.338200330734253, + "rewards/rejected": -1.6390303373336792, + "step": 211 + }, + { + "epoch": 1.2562962962962962, + "grad_norm": 26.037475671487393, + "learning_rate": 4.950124272224082e-07, + "logits/chosen": -1.3218319416046143, + "logits/rejected": -1.3979120254516602, + "logps/chosen": -53.5397834777832, + "logps/rejected": -61.23820114135742, + "loss": 0.3906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18688765168190002, + "rewards/margins": 1.3252506256103516, + "rewards/rejected": -1.5121382474899292, + "step": 212 + }, + { + "epoch": 1.2622222222222224, + "grad_norm": 24.642049438405937, + "learning_rate": 4.948824853131236e-07, + "logits/chosen": -1.4031530618667603, + "logits/rejected": -1.3881592750549316, + "logps/chosen": -41.80998992919922, + "logps/rejected": -48.333030700683594, + "loss": 0.4209, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23013433814048767, + "rewards/margins": 0.9226297736167908, + "rewards/rejected": -1.152764081954956, + "step": 213 + }, + { + "epoch": 1.268148148148148, + "grad_norm": 26.73226151907008, + "learning_rate": 4.947508899022234e-07, + "logits/chosen": -1.3019957542419434, + "logits/rejected": -1.3482120037078857, + "logps/chosen": -41.626182556152344, + "logps/rejected": -45.29669189453125, + "loss": 0.3644, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22154735028743744, + "rewards/margins": 1.0375902652740479, + "rewards/rejected": -1.2591376304626465, + "step": 214 + }, + { + "epoch": 1.2740740740740741, + "grad_norm": 29.166969953835547, + "learning_rate": 4.946176418782698e-07, + "logits/chosen": -1.3099212646484375, + "logits/rejected": -1.2414171695709229, + "logps/chosen": -55.92378234863281, + "logps/rejected": -71.39763641357422, + "loss": 0.3894, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5391191244125366, + "rewards/margins": 1.3320989608764648, + "rewards/rejected": -1.871217966079712, + "step": 215 + }, + { + "epoch": 1.28, + "grad_norm": 25.60855418641745, + "learning_rate": 4.944827421409829e-07, + "logits/chosen": -1.3376116752624512, + "logits/rejected": -1.3164410591125488, + "logps/chosen": -56.354408264160156, + "logps/rejected": -60.64031982421875, + "loss": 0.3622, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3232120871543884, + "rewards/margins": 1.5070254802703857, + "rewards/rejected": -1.830237627029419, + "step": 216 + }, + { + "epoch": 1.285925925925926, + "grad_norm": 23.201414829534766, + "learning_rate": 4.943461916012363e-07, + "logits/chosen": -1.2318775653839111, + "logits/rejected": -1.352461814880371, + "logps/chosen": -50.486663818359375, + "logps/rejected": -70.3978042602539, + "loss": 0.3015, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.272786945104599, + "rewards/margins": 2.2941455841064453, + "rewards/rejected": -2.5669326782226562, + "step": 217 + }, + { + "epoch": 1.2918518518518518, + "grad_norm": 27.79922817942101, + "learning_rate": 4.9420799118105e-07, + "logits/chosen": -1.3263601064682007, + "logits/rejected": -1.4090485572814941, + "logps/chosen": -48.22421646118164, + "logps/rejected": -53.179039001464844, + "loss": 0.413, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4424872100353241, + "rewards/margins": 1.7214093208312988, + "rewards/rejected": -2.1638965606689453, + "step": 218 + }, + { + "epoch": 1.2977777777777777, + "grad_norm": 23.11860382817, + "learning_rate": 4.940681418135843e-07, + "logits/chosen": -1.3301970958709717, + "logits/rejected": -1.4387781620025635, + "logps/chosen": -36.736915588378906, + "logps/rejected": -66.50269317626953, + "loss": 0.3156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3005810081958771, + "rewards/margins": 2.1743574142456055, + "rewards/rejected": -2.47493839263916, + "step": 219 + }, + { + "epoch": 1.3037037037037038, + "grad_norm": 24.918955286472, + "learning_rate": 4.939266444431335e-07, + "logits/chosen": -1.3525760173797607, + "logits/rejected": -1.470273733139038, + "logps/chosen": -46.7333984375, + "logps/rejected": -73.63404083251953, + "loss": 0.3665, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24659954011440277, + "rewards/margins": 1.7434245347976685, + "rewards/rejected": -1.9900240898132324, + "step": 220 + }, + { + "epoch": 1.3096296296296297, + "grad_norm": 28.581360124769212, + "learning_rate": 4.937835000251197e-07, + "logits/chosen": -1.4148640632629395, + "logits/rejected": -1.4724056720733643, + "logps/chosen": -46.71776580810547, + "logps/rejected": -69.32954406738281, + "loss": 0.4222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.426167368888855, + "rewards/margins": 1.0898077487945557, + "rewards/rejected": -1.515974998474121, + "step": 221 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 23.15045697400444, + "learning_rate": 4.936387095260863e-07, + "logits/chosen": -1.325617790222168, + "logits/rejected": -1.3408955335617065, + "logps/chosen": -39.602783203125, + "logps/rejected": -63.946929931640625, + "loss": 0.3595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.056969501078128815, + "rewards/margins": 2.3829245567321777, + "rewards/rejected": -2.439894199371338, + "step": 222 + }, + { + "epoch": 1.3214814814814815, + "grad_norm": 26.031180737689365, + "learning_rate": 4.934922739236912e-07, + "logits/chosen": -1.5163429975509644, + "logits/rejected": -1.6212719678878784, + "logps/chosen": -43.200016021728516, + "logps/rejected": -56.9548454284668, + "loss": 0.4031, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2686010003089905, + "rewards/margins": 1.1195085048675537, + "rewards/rejected": -1.388109564781189, + "step": 223 + }, + { + "epoch": 1.3274074074074074, + "grad_norm": 33.46497240818608, + "learning_rate": 4.933441942067006e-07, + "logits/chosen": -1.3373076915740967, + "logits/rejected": -1.3908562660217285, + "logps/chosen": -57.834842681884766, + "logps/rejected": -68.81277465820312, + "loss": 0.5067, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.38047823309898376, + "rewards/margins": 0.6113041639328003, + "rewards/rejected": -0.9917824268341064, + "step": 224 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 23.36650582203661, + "learning_rate": 4.93194471374982e-07, + "logits/chosen": -1.3187856674194336, + "logits/rejected": -1.368605375289917, + "logps/chosen": -46.856529235839844, + "logps/rejected": -61.831233978271484, + "loss": 0.3623, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002665497362613678, + "rewards/margins": 1.1606879234313965, + "rewards/rejected": -1.1580225229263306, + "step": 225 + }, + { + "epoch": 1.3392592592592591, + "grad_norm": 25.411305437774953, + "learning_rate": 4.930431064394976e-07, + "logits/chosen": -1.3518116474151611, + "logits/rejected": -1.386432409286499, + "logps/chosen": -51.480796813964844, + "logps/rejected": -51.128456115722656, + "loss": 0.4065, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.33157169818878174, + "rewards/margins": 0.9579052925109863, + "rewards/rejected": -1.289476990699768, + "step": 226 + }, + { + "epoch": 1.3451851851851853, + "grad_norm": 26.090560376399463, + "learning_rate": 4.928901004222977e-07, + "logits/chosen": -1.3473336696624756, + "logits/rejected": -1.487202763557434, + "logps/chosen": -49.934654235839844, + "logps/rejected": -53.528663635253906, + "loss": 0.3961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47797858715057373, + "rewards/margins": 1.3904893398284912, + "rewards/rejected": -1.8684678077697754, + "step": 227 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 27.75621449004402, + "learning_rate": 4.92735454356513e-07, + "logits/chosen": -1.464008092880249, + "logits/rejected": -1.5646613836288452, + "logps/chosen": -56.57294464111328, + "logps/rejected": -68.8311767578125, + "loss": 0.3841, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5350128412246704, + "rewards/margins": 1.29872465133667, + "rewards/rejected": -1.8337376117706299, + "step": 228 + }, + { + "epoch": 1.357037037037037, + "grad_norm": 25.729715098580982, + "learning_rate": 4.925791692863488e-07, + "logits/chosen": -1.3492538928985596, + "logits/rejected": -1.3906590938568115, + "logps/chosen": -40.129737854003906, + "logps/rejected": -50.684967041015625, + "loss": 0.4227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32827675342559814, + "rewards/margins": 1.1863243579864502, + "rewards/rejected": -1.5146011114120483, + "step": 229 + }, + { + "epoch": 1.362962962962963, + "grad_norm": 26.449607937790024, + "learning_rate": 4.924212462670768e-07, + "logits/chosen": -1.381973385810852, + "logits/rejected": -1.3976202011108398, + "logps/chosen": -53.29833984375, + "logps/rejected": -59.929779052734375, + "loss": 0.3469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11365138739347458, + "rewards/margins": 1.7214391231536865, + "rewards/rejected": -1.8350905179977417, + "step": 230 + }, + { + "epoch": 1.3688888888888888, + "grad_norm": 23.43023680474791, + "learning_rate": 4.922616863650289e-07, + "logits/chosen": -1.1934809684753418, + "logits/rejected": -1.2369165420532227, + "logps/chosen": -50.21738815307617, + "logps/rejected": -71.64352416992188, + "loss": 0.3587, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4246065616607666, + "rewards/margins": 1.7253427505493164, + "rewards/rejected": -2.149949073791504, + "step": 231 + }, + { + "epoch": 1.374814814814815, + "grad_norm": 28.666265083827813, + "learning_rate": 4.921004906575896e-07, + "logits/chosen": -1.2596248388290405, + "logits/rejected": -1.3159894943237305, + "logps/chosen": -51.569847106933594, + "logps/rejected": -60.562618255615234, + "loss": 0.4363, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.031792208552360535, + "rewards/margins": 1.6072309017181396, + "rewards/rejected": -1.6390230655670166, + "step": 232 + }, + { + "epoch": 1.3807407407407408, + "grad_norm": 32.110066137255615, + "learning_rate": 4.919376602331883e-07, + "logits/chosen": -1.0603680610656738, + "logits/rejected": -1.0503628253936768, + "logps/chosen": -61.34019470214844, + "logps/rejected": -65.20555114746094, + "loss": 0.4379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6490658521652222, + "rewards/margins": 2.534877300262451, + "rewards/rejected": -3.183943271636963, + "step": 233 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 25.074079766201187, + "learning_rate": 4.917731961912926e-07, + "logits/chosen": -1.4197475910186768, + "logits/rejected": -1.4189743995666504, + "logps/chosen": -43.400482177734375, + "logps/rejected": -53.225189208984375, + "loss": 0.3515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06946276128292084, + "rewards/margins": 1.7199580669403076, + "rewards/rejected": -1.6504952907562256, + "step": 234 + }, + { + "epoch": 1.3925925925925926, + "grad_norm": 26.279570972884244, + "learning_rate": 4.91607099642401e-07, + "logits/chosen": -1.4161285161972046, + "logits/rejected": -1.344247817993164, + "logps/chosen": -52.5662841796875, + "logps/rejected": -52.22835159301758, + "loss": 0.4144, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3079703748226166, + "rewards/margins": 1.408158540725708, + "rewards/rejected": -1.716128945350647, + "step": 235 + }, + { + "epoch": 1.3985185185185185, + "grad_norm": 22.71741763432061, + "learning_rate": 4.914393717080346e-07, + "logits/chosen": -1.3715291023254395, + "logits/rejected": -1.4043883085250854, + "logps/chosen": -40.7462272644043, + "logps/rejected": -51.91321563720703, + "loss": 0.3452, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11906649172306061, + "rewards/margins": 1.0093883275985718, + "rewards/rejected": -1.1284549236297607, + "step": 236 + }, + { + "epoch": 1.4044444444444444, + "grad_norm": 26.65603294074516, + "learning_rate": 4.9127001352073e-07, + "logits/chosen": -1.4703233242034912, + "logits/rejected": -1.538904070854187, + "logps/chosen": -45.97296142578125, + "logps/rejected": -65.32240295410156, + "loss": 0.3828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.41329696774482727, + "rewards/margins": 1.1328431367874146, + "rewards/rejected": -1.5461399555206299, + "step": 237 + }, + { + "epoch": 1.4103703703703703, + "grad_norm": 25.231239292504032, + "learning_rate": 4.910990262240321e-07, + "logits/chosen": -1.407243251800537, + "logits/rejected": -1.4713376760482788, + "logps/chosen": -42.070404052734375, + "logps/rejected": -47.037384033203125, + "loss": 0.3242, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1005098894238472, + "rewards/margins": 1.4259945154190063, + "rewards/rejected": -1.526504397392273, + "step": 238 + }, + { + "epoch": 1.4162962962962964, + "grad_norm": 24.015207965240965, + "learning_rate": 4.909264109724852e-07, + "logits/chosen": -1.3257907629013062, + "logits/rejected": -1.2598779201507568, + "logps/chosen": -41.595149993896484, + "logps/rejected": -58.04209899902344, + "loss": 0.3351, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09786944091320038, + "rewards/margins": 1.5942250490188599, + "rewards/rejected": -1.4963555335998535, + "step": 239 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 29.87686057856591, + "learning_rate": 4.907521689316265e-07, + "logits/chosen": -1.4115238189697266, + "logits/rejected": -1.3675273656845093, + "logps/chosen": -36.79245376586914, + "logps/rejected": -73.65255737304688, + "loss": 0.4191, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04488750547170639, + "rewards/margins": 1.4026293754577637, + "rewards/rejected": -1.4475167989730835, + "step": 240 + }, + { + "epoch": 1.4281481481481482, + "grad_norm": 21.03692614805522, + "learning_rate": 4.905763012779775e-07, + "logits/chosen": -1.3066779375076294, + "logits/rejected": -1.360036015510559, + "logps/chosen": -57.45732879638672, + "logps/rejected": -75.41363525390625, + "loss": 0.281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6602566838264465, + "rewards/margins": 2.5916683673858643, + "rewards/rejected": -3.251924991607666, + "step": 241 + }, + { + "epoch": 1.434074074074074, + "grad_norm": 25.75436647076939, + "learning_rate": 4.90398809199036e-07, + "logits/chosen": -1.2956441640853882, + "logits/rejected": -1.4834749698638916, + "logps/chosen": -54.38095474243164, + "logps/rejected": -57.828269958496094, + "loss": 0.3572, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.38308632373809814, + "rewards/margins": 1.9507379531860352, + "rewards/rejected": -2.333824396133423, + "step": 242 + }, + { + "epoch": 1.44, + "grad_norm": 29.700920716890522, + "learning_rate": 4.902196938932685e-07, + "logits/chosen": -1.2027655839920044, + "logits/rejected": -1.2516043186187744, + "logps/chosen": -42.96432876586914, + "logps/rejected": -51.21244812011719, + "loss": 0.3837, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12649276852607727, + "rewards/margins": 1.136317253112793, + "rewards/rejected": -1.2628101110458374, + "step": 243 + }, + { + "epoch": 1.445925925925926, + "grad_norm": 23.713492205204403, + "learning_rate": 4.90038956570102e-07, + "logits/chosen": -1.4097803831100464, + "logits/rejected": -1.418364405632019, + "logps/chosen": -56.92813491821289, + "logps/rejected": -49.77784729003906, + "loss": 0.3147, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1759030818939209, + "rewards/margins": 0.9846144914627075, + "rewards/rejected": -1.1605175733566284, + "step": 244 + }, + { + "epoch": 1.4518518518518517, + "grad_norm": 23.75978131455477, + "learning_rate": 4.898565984499153e-07, + "logits/chosen": -1.3047959804534912, + "logits/rejected": -1.3336838483810425, + "logps/chosen": -37.61913299560547, + "logps/rejected": -66.44524383544922, + "loss": 0.3504, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11530294269323349, + "rewards/margins": 1.62274968624115, + "rewards/rejected": -1.7380526065826416, + "step": 245 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 26.759728007939124, + "learning_rate": 4.896726207640314e-07, + "logits/chosen": -1.3182926177978516, + "logits/rejected": -1.2883374691009521, + "logps/chosen": -58.83563232421875, + "logps/rejected": -50.09530258178711, + "loss": 0.3832, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3371252119541168, + "rewards/margins": 1.3079252243041992, + "rewards/rejected": -1.6450505256652832, + "step": 246 + }, + { + "epoch": 1.4637037037037037, + "grad_norm": 26.290819540180618, + "learning_rate": 4.894870247547093e-07, + "logits/chosen": -1.2748501300811768, + "logits/rejected": -1.4458961486816406, + "logps/chosen": -32.43407440185547, + "logps/rejected": -57.296913146972656, + "loss": 0.3684, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.279015451669693, + "rewards/margins": 1.564210057258606, + "rewards/rejected": -1.8432254791259766, + "step": 247 + }, + { + "epoch": 1.4696296296296296, + "grad_norm": 24.824183393994925, + "learning_rate": 4.892998116751348e-07, + "logits/chosen": -1.3820867538452148, + "logits/rejected": -1.3794804811477661, + "logps/chosen": -40.21681594848633, + "logps/rejected": -47.595802307128906, + "loss": 0.3566, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.015096355229616165, + "rewards/margins": 0.9623454809188843, + "rewards/rejected": -0.9774419665336609, + "step": 248 + }, + { + "epoch": 1.4755555555555555, + "grad_norm": 21.650609907887063, + "learning_rate": 4.891109827894127e-07, + "logits/chosen": -1.2940155267715454, + "logits/rejected": -1.4335472583770752, + "logps/chosen": -74.19985961914062, + "logps/rejected": -70.36446380615234, + "loss": 0.3129, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0672260969877243, + "rewards/margins": 2.0223352909088135, + "rewards/rejected": -2.0895614624023438, + "step": 249 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 26.35700342159131, + "learning_rate": 4.889205393725583e-07, + "logits/chosen": -1.1685283184051514, + "logits/rejected": -1.2299532890319824, + "logps/chosen": -41.14178466796875, + "logps/rejected": -58.808467864990234, + "loss": 0.3688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.019259896129369736, + "rewards/margins": 1.9858262538909912, + "rewards/rejected": -1.9665664434432983, + "step": 250 + }, + { + "epoch": 1.4874074074074075, + "grad_norm": 26.0235689821749, + "learning_rate": 4.887284827104881e-07, + "logits/chosen": -1.392974615097046, + "logits/rejected": -1.505925178527832, + "logps/chosen": -36.905818939208984, + "logps/rejected": -79.7839584350586, + "loss": 0.321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17365923523902893, + "rewards/margins": 2.4765334129333496, + "rewards/rejected": -2.3028745651245117, + "step": 251 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 29.278848450522304, + "learning_rate": 4.885348141000122e-07, + "logits/chosen": -1.2471200227737427, + "logits/rejected": -1.2988287210464478, + "logps/chosen": -39.52333450317383, + "logps/rejected": -55.919002532958984, + "loss": 0.3928, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18332645297050476, + "rewards/margins": 2.614619731903076, + "rewards/rejected": -2.431293249130249, + "step": 252 + }, + { + "epoch": 1.4992592592592593, + "grad_norm": 28.042790270776827, + "learning_rate": 4.883395348488243e-07, + "logits/chosen": -1.3611935377120972, + "logits/rejected": -1.4138695001602173, + "logps/chosen": -57.63732147216797, + "logps/rejected": -60.91813278198242, + "loss": 0.4144, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10321831703186035, + "rewards/margins": 1.9807438850402832, + "rewards/rejected": -2.0839624404907227, + "step": 253 + }, + { + "epoch": 1.5051851851851852, + "grad_norm": 25.03952491402299, + "learning_rate": 4.88142646275494e-07, + "logits/chosen": -1.1539257764816284, + "logits/rejected": -1.1693516969680786, + "logps/chosen": -43.72496032714844, + "logps/rejected": -55.043235778808594, + "loss": 0.304, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12283861637115479, + "rewards/margins": 1.9001600742340088, + "rewards/rejected": -1.7773215770721436, + "step": 254 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 29.48314886820695, + "learning_rate": 4.879441497094572e-07, + "logits/chosen": -1.3410661220550537, + "logits/rejected": -1.4779433012008667, + "logps/chosen": -42.038639068603516, + "logps/rejected": -49.66044616699219, + "loss": 0.4142, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0242156982421875, + "rewards/margins": 1.087928295135498, + "rewards/rejected": -1.0637125968933105, + "step": 255 + }, + { + "epoch": 1.5170370370370372, + "grad_norm": 28.638225672957248, + "learning_rate": 4.877440464910073e-07, + "logits/chosen": -1.2495319843292236, + "logits/rejected": -1.303146243095398, + "logps/chosen": -42.27051544189453, + "logps/rejected": -58.957298278808594, + "loss": 0.4303, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2611178457736969, + "rewards/margins": 0.8532897233963013, + "rewards/rejected": -1.1144076585769653, + "step": 256 + }, + { + "epoch": 1.5229629629629629, + "grad_norm": 27.33028444497588, + "learning_rate": 4.875423379712864e-07, + "logits/chosen": -1.2766767740249634, + "logits/rejected": -1.296708106994629, + "logps/chosen": -50.208858489990234, + "logps/rejected": -71.75860595703125, + "loss": 0.3848, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22476840019226074, + "rewards/margins": 1.9250941276550293, + "rewards/rejected": -2.14986252784729, + "step": 257 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 24.26952968926095, + "learning_rate": 4.873390255122756e-07, + "logits/chosen": -1.2495331764221191, + "logits/rejected": -1.2845350503921509, + "logps/chosen": -47.2353515625, + "logps/rejected": -53.62681579589844, + "loss": 0.3365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013976313173770905, + "rewards/margins": 1.982759714126587, + "rewards/rejected": -1.9967360496520996, + "step": 258 + }, + { + "epoch": 1.5348148148148149, + "grad_norm": 23.570755979542007, + "learning_rate": 4.871341104867864e-07, + "logits/chosen": -1.4904160499572754, + "logits/rejected": -1.452820897102356, + "logps/chosen": -50.363956451416016, + "logps/rejected": -59.957847595214844, + "loss": 0.3653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.100559301674366, + "rewards/margins": 1.7651526927947998, + "rewards/rejected": -1.8657116889953613, + "step": 259 + }, + { + "epoch": 1.5407407407407407, + "grad_norm": 26.13987494401341, + "learning_rate": 4.869275942784511e-07, + "logits/chosen": -1.2994194030761719, + "logits/rejected": -1.354923129081726, + "logps/chosen": -36.94548034667969, + "logps/rejected": -54.38296890258789, + "loss": 0.3776, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.19663068652153015, + "rewards/margins": 1.3016669750213623, + "rewards/rejected": -1.4982978105545044, + "step": 260 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 25.935557135061014, + "learning_rate": 4.867194782817137e-07, + "logits/chosen": -1.2576755285263062, + "logits/rejected": -1.2709205150604248, + "logps/chosen": -47.876617431640625, + "logps/rejected": -50.99798583984375, + "loss": 0.42, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2130255401134491, + "rewards/margins": 1.1574770212173462, + "rewards/rejected": -0.9444514513015747, + "step": 261 + }, + { + "epoch": 1.5525925925925925, + "grad_norm": 25.73949104392821, + "learning_rate": 4.865097639018202e-07, + "logits/chosen": -1.3414067029953003, + "logits/rejected": -1.3837270736694336, + "logps/chosen": -51.21342468261719, + "logps/rejected": -60.94746017456055, + "loss": 0.3494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.38993364572525024, + "rewards/margins": 1.3145833015441895, + "rewards/rejected": -1.704516887664795, + "step": 262 + }, + { + "epoch": 1.5585185185185186, + "grad_norm": 26.371785681923715, + "learning_rate": 4.862984525548091e-07, + "logits/chosen": -1.208691954612732, + "logits/rejected": -1.32695472240448, + "logps/chosen": -41.11842346191406, + "logps/rejected": -51.61888885498047, + "loss": 0.3775, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16439585387706757, + "rewards/margins": 1.7332050800323486, + "rewards/rejected": -1.8976010084152222, + "step": 263 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 23.641754357015362, + "learning_rate": 4.860855456675024e-07, + "logits/chosen": -1.1456029415130615, + "logits/rejected": -1.2467702627182007, + "logps/chosen": -44.02134704589844, + "logps/rejected": -69.17605590820312, + "loss": 0.3844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.015612021088600159, + "rewards/margins": 2.685401439666748, + "rewards/rejected": -2.6697897911071777, + "step": 264 + }, + { + "epoch": 1.5703703703703704, + "grad_norm": 31.566770567869234, + "learning_rate": 4.85871044677495e-07, + "logits/chosen": -1.291245460510254, + "logits/rejected": -1.2862074375152588, + "logps/chosen": -48.08729553222656, + "logps/rejected": -61.76681900024414, + "loss": 0.4026, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2082689106464386, + "rewards/margins": 1.9027234315872192, + "rewards/rejected": -2.110992193222046, + "step": 265 + }, + { + "epoch": 1.5762962962962963, + "grad_norm": 23.191781424675476, + "learning_rate": 4.856549510331461e-07, + "logits/chosen": -1.3137812614440918, + "logits/rejected": -1.4595973491668701, + "logps/chosen": -45.001014709472656, + "logps/rejected": -58.4334602355957, + "loss": 0.3633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22389988601207733, + "rewards/margins": 1.405003309249878, + "rewards/rejected": -1.6289031505584717, + "step": 266 + }, + { + "epoch": 1.5822222222222222, + "grad_norm": 27.971724212537442, + "learning_rate": 4.854372661935684e-07, + "logits/chosen": -1.1602957248687744, + "logits/rejected": -1.2511422634124756, + "logps/chosen": -44.95030212402344, + "logps/rejected": -56.273780822753906, + "loss": 0.3956, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07271359860897064, + "rewards/margins": 1.2605903148651123, + "rewards/rejected": -1.1878767013549805, + "step": 267 + }, + { + "epoch": 1.5881481481481483, + "grad_norm": 24.276028981106442, + "learning_rate": 4.852179916286189e-07, + "logits/chosen": -1.262486457824707, + "logits/rejected": -1.4016473293304443, + "logps/chosen": -44.76614761352539, + "logps/rejected": -56.81776428222656, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07598677277565002, + "rewards/margins": 1.7435487508773804, + "rewards/rejected": -1.819535493850708, + "step": 268 + }, + { + "epoch": 1.594074074074074, + "grad_norm": 23.461378245972664, + "learning_rate": 4.849971288188889e-07, + "logits/chosen": -1.4302575588226318, + "logits/rejected": -1.5334992408752441, + "logps/chosen": -41.38616180419922, + "logps/rejected": -59.964263916015625, + "loss": 0.3707, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13780353963375092, + "rewards/margins": 1.4046378135681152, + "rewards/rejected": -1.2668342590332031, + "step": 269 + }, + { + "epoch": 1.6, + "grad_norm": 24.37626653853462, + "learning_rate": 4.847746792556936e-07, + "logits/chosen": -1.2448111772537231, + "logits/rejected": -1.313913345336914, + "logps/chosen": -51.415077209472656, + "logps/rejected": -53.74933624267578, + "loss": 0.356, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7689206600189209, + "rewards/margins": 1.6365911960601807, + "rewards/rejected": -2.4055118560791016, + "step": 270 + }, + { + "epoch": 1.605925925925926, + "grad_norm": 20.090159347357265, + "learning_rate": 4.845506444410626e-07, + "logits/chosen": -1.2469216585159302, + "logits/rejected": -1.2296561002731323, + "logps/chosen": -42.26240158081055, + "logps/rejected": -45.776145935058594, + "loss": 0.2883, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3450557589530945, + "rewards/margins": 1.5731394290924072, + "rewards/rejected": -1.9181952476501465, + "step": 271 + }, + { + "epoch": 1.6118518518518519, + "grad_norm": 19.791166531137915, + "learning_rate": 4.843250258877294e-07, + "logits/chosen": -1.2774832248687744, + "logits/rejected": -1.3242905139923096, + "logps/chosen": -46.619590759277344, + "logps/rejected": -50.182044982910156, + "loss": 0.2691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07584185898303986, + "rewards/margins": 1.252159833908081, + "rewards/rejected": -1.328001618385315, + "step": 272 + }, + { + "epoch": 1.6177777777777778, + "grad_norm": 29.649066059249037, + "learning_rate": 4.840978251191211e-07, + "logits/chosen": -1.2474559545516968, + "logits/rejected": -1.23881196975708, + "logps/chosen": -53.4446907043457, + "logps/rejected": -52.38796615600586, + "loss": 0.4066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4888032078742981, + "rewards/margins": 1.5558326244354248, + "rewards/rejected": -2.044635772705078, + "step": 273 + }, + { + "epoch": 1.6237037037037036, + "grad_norm": 33.28057855157768, + "learning_rate": 4.838690436693483e-07, + "logits/chosen": -1.412484049797058, + "logits/rejected": -1.4253754615783691, + "logps/chosen": -75.49126434326172, + "logps/rejected": -70.59172058105469, + "loss": 0.4191, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3724600374698639, + "rewards/margins": 1.891104817390442, + "rewards/rejected": -2.2635648250579834, + "step": 274 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 22.666180377284093, + "learning_rate": 4.836386830831951e-07, + "logits/chosen": -1.2181977033615112, + "logits/rejected": -1.2449318170547485, + "logps/chosen": -38.67042922973633, + "logps/rejected": -54.85205841064453, + "loss": 0.3325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08570816367864609, + "rewards/margins": 1.6117794513702393, + "rewards/rejected": -1.526071310043335, + "step": 275 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 25.751950738264924, + "learning_rate": 4.834067449161077e-07, + "logits/chosen": -1.2701869010925293, + "logits/rejected": -1.3182072639465332, + "logps/chosen": -47.42134094238281, + "logps/rejected": -68.39168548583984, + "loss": 0.3341, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11203102767467499, + "rewards/margins": 2.025148630142212, + "rewards/rejected": -2.1371796131134033, + "step": 276 + }, + { + "epoch": 1.6414814814814815, + "grad_norm": 19.59862580042334, + "learning_rate": 4.83173230734185e-07, + "logits/chosen": -1.402850866317749, + "logits/rejected": -1.4441189765930176, + "logps/chosen": -44.067161560058594, + "logps/rejected": -74.80207061767578, + "loss": 0.2722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06552604585886002, + "rewards/margins": 1.8013237714767456, + "rewards/rejected": -1.7357978820800781, + "step": 277 + }, + { + "epoch": 1.6474074074074074, + "grad_norm": 23.22500510703153, + "learning_rate": 4.829381421141671e-07, + "logits/chosen": -1.2457361221313477, + "logits/rejected": -1.3469531536102295, + "logps/chosen": -38.14561462402344, + "logps/rejected": -52.48051452636719, + "loss": 0.3409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2643684148788452, + "rewards/margins": 1.3024507761001587, + "rewards/rejected": -1.5668190717697144, + "step": 278 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 24.46935150462778, + "learning_rate": 4.827014806434253e-07, + "logits/chosen": -1.3392874002456665, + "logits/rejected": -1.398203730583191, + "logps/chosen": -69.1489486694336, + "logps/rejected": -86.5308837890625, + "loss": 0.2984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17087030410766602, + "rewards/margins": 2.4526419639587402, + "rewards/rejected": -2.6235122680664062, + "step": 279 + }, + { + "epoch": 1.6592592592592592, + "grad_norm": 32.66878587176462, + "learning_rate": 4.824632479199511e-07, + "logits/chosen": -1.2298073768615723, + "logits/rejected": -1.2714858055114746, + "logps/chosen": -52.639041900634766, + "logps/rejected": -57.43061828613281, + "loss": 0.4427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4236421585083008, + "rewards/margins": 1.339827299118042, + "rewards/rejected": -1.7634694576263428, + "step": 280 + }, + { + "epoch": 1.665185185185185, + "grad_norm": 22.72790160810974, + "learning_rate": 4.822234455523453e-07, + "logits/chosen": -1.259157419204712, + "logits/rejected": -1.3009653091430664, + "logps/chosen": -40.12117385864258, + "logps/rejected": -60.172664642333984, + "loss": 0.3048, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30390816926956177, + "rewards/margins": 1.9633526802062988, + "rewards/rejected": -2.267261028289795, + "step": 281 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 27.8520366261411, + "learning_rate": 4.819820751598076e-07, + "logits/chosen": -1.3906257152557373, + "logits/rejected": -1.425363540649414, + "logps/chosen": -41.69005584716797, + "logps/rejected": -53.399139404296875, + "loss": 0.3495, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20817890763282776, + "rewards/margins": 1.4239811897277832, + "rewards/rejected": -1.632159948348999, + "step": 282 + }, + { + "epoch": 1.6770370370370369, + "grad_norm": 25.999550920916317, + "learning_rate": 4.817391383721249e-07, + "logits/chosen": -1.3971692323684692, + "logits/rejected": -1.488884687423706, + "logps/chosen": -53.06078338623047, + "logps/rejected": -69.60707092285156, + "loss": 0.357, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.41441503167152405, + "rewards/margins": 1.7222974300384521, + "rewards/rejected": -2.1367125511169434, + "step": 283 + }, + { + "epoch": 1.682962962962963, + "grad_norm": 31.91920988188716, + "learning_rate": 4.814946368296616e-07, + "logits/chosen": -1.292490839958191, + "logits/rejected": -1.320987582206726, + "logps/chosen": -38.88534927368164, + "logps/rejected": -49.37236404418945, + "loss": 0.4254, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0005872957408428192, + "rewards/margins": 1.1653939485549927, + "rewards/rejected": -1.1659812927246094, + "step": 284 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 28.470589880633216, + "learning_rate": 4.812485721833464e-07, + "logits/chosen": -1.1484642028808594, + "logits/rejected": -1.241657018661499, + "logps/chosen": -57.71142578125, + "logps/rejected": -87.64554595947266, + "loss": 0.3595, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.051835060119628906, + "rewards/margins": 2.0897717475891113, + "rewards/rejected": -2.0379366874694824, + "step": 285 + }, + { + "epoch": 1.6948148148148148, + "grad_norm": 26.804492840080194, + "learning_rate": 4.810009460946635e-07, + "logits/chosen": -1.3247426748275757, + "logits/rejected": -1.3112623691558838, + "logps/chosen": -43.300559997558594, + "logps/rejected": -51.48878860473633, + "loss": 0.382, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.24530044198036194, + "rewards/margins": 1.2045056819915771, + "rewards/rejected": -1.4498060941696167, + "step": 286 + }, + { + "epoch": 1.7007407407407409, + "grad_norm": 26.52740471473782, + "learning_rate": 4.8075176023564e-07, + "logits/chosen": -1.3240617513656616, + "logits/rejected": -1.4052027463912964, + "logps/chosen": -45.60319519042969, + "logps/rejected": -61.580039978027344, + "loss": 0.395, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03795705735683441, + "rewards/margins": 1.6152465343475342, + "rewards/rejected": -1.6532034873962402, + "step": 287 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 24.11382703660646, + "learning_rate": 4.805010162888346e-07, + "logits/chosen": -1.2520138025283813, + "logits/rejected": -1.4344892501831055, + "logps/chosen": -47.9183464050293, + "logps/rejected": -53.69255828857422, + "loss": 0.3442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4233085811138153, + "rewards/margins": 1.2932010889053345, + "rewards/rejected": -0.8698925375938416, + "step": 288 + }, + { + "epoch": 1.7125925925925927, + "grad_norm": 25.438127060122117, + "learning_rate": 4.802487159473271e-07, + "logits/chosen": -1.211564064025879, + "logits/rejected": -1.2627569437026978, + "logps/chosen": -52.95191955566406, + "logps/rejected": -65.62922668457031, + "loss": 0.3494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3603849411010742, + "rewards/margins": 2.6062426567077637, + "rewards/rejected": -2.966627597808838, + "step": 289 + }, + { + "epoch": 1.7185185185185186, + "grad_norm": 29.349396127433405, + "learning_rate": 4.799948609147061e-07, + "logits/chosen": -1.3695993423461914, + "logits/rejected": -1.3942320346832275, + "logps/chosen": -47.67075729370117, + "logps/rejected": -61.6407585144043, + "loss": 0.3805, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23102344572544098, + "rewards/margins": 1.8437186479568481, + "rewards/rejected": -2.074742078781128, + "step": 290 + }, + { + "epoch": 1.7244444444444444, + "grad_norm": 22.5576058640629, + "learning_rate": 4.797394529050577e-07, + "logits/chosen": -1.386033296585083, + "logits/rejected": -1.4366207122802734, + "logps/chosen": -54.97161865234375, + "logps/rejected": -60.0572624206543, + "loss": 0.3047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10227298736572266, + "rewards/margins": 2.372859001159668, + "rewards/rejected": -2.4751322269439697, + "step": 291 + }, + { + "epoch": 1.7303703703703703, + "grad_norm": 23.961332637171278, + "learning_rate": 4.794824936429543e-07, + "logits/chosen": -1.2843897342681885, + "logits/rejected": -1.2670435905456543, + "logps/chosen": -40.86612319946289, + "logps/rejected": -46.26717758178711, + "loss": 0.298, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021604575216770172, + "rewards/margins": 1.5374772548675537, + "rewards/rejected": -1.55908203125, + "step": 292 + }, + { + "epoch": 1.7362962962962962, + "grad_norm": 20.82357798003762, + "learning_rate": 4.792239848634426e-07, + "logits/chosen": -1.2751004695892334, + "logits/rejected": -1.2628631591796875, + "logps/chosen": -51.31232833862305, + "logps/rejected": -57.4508056640625, + "loss": 0.3308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.006660893559455872, + "rewards/margins": 2.1980137825012207, + "rewards/rejected": -2.20467472076416, + "step": 293 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 21.618706080176008, + "learning_rate": 4.789639283120322e-07, + "logits/chosen": -1.345123052597046, + "logits/rejected": -1.4260653257369995, + "logps/chosen": -39.87286376953125, + "logps/rejected": -60.069549560546875, + "loss": 0.343, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13270238041877747, + "rewards/margins": 2.6959171295166016, + "rewards/rejected": -2.5632147789001465, + "step": 294 + }, + { + "epoch": 1.748148148148148, + "grad_norm": 22.76141790976671, + "learning_rate": 4.787023257446832e-07, + "logits/chosen": -1.3435921669006348, + "logits/rejected": -1.4133026599884033, + "logps/chosen": -53.27568435668945, + "logps/rejected": -64.22837829589844, + "loss": 0.3139, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0206415057182312, + "rewards/margins": 2.1336681842803955, + "rewards/rejected": -2.1130268573760986, + "step": 295 + }, + { + "epoch": 1.7540740740740741, + "grad_norm": 26.56123778763129, + "learning_rate": 4.784391789277952e-07, + "logits/chosen": -1.3392447233200073, + "logits/rejected": -1.3046197891235352, + "logps/chosen": -40.423377990722656, + "logps/rejected": -48.11532211303711, + "loss": 0.3649, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04688149318099022, + "rewards/margins": 1.1583579778671265, + "rewards/rejected": -1.2052394151687622, + "step": 296 + }, + { + "epoch": 1.76, + "grad_norm": 25.807961780535802, + "learning_rate": 4.781744896381944e-07, + "logits/chosen": -1.411516547203064, + "logits/rejected": -1.3581936359405518, + "logps/chosen": -59.09212875366211, + "logps/rejected": -65.5539779663086, + "loss": 0.3824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4150131344795227, + "rewards/margins": 1.1555423736572266, + "rewards/rejected": -1.5705554485321045, + "step": 297 + }, + { + "epoch": 1.765925925925926, + "grad_norm": 19.33578977859486, + "learning_rate": 4.779082596631226e-07, + "logits/chosen": -1.4044454097747803, + "logits/rejected": -1.4459329843521118, + "logps/chosen": -44.998008728027344, + "logps/rejected": -69.97136688232422, + "loss": 0.222, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09062831103801727, + "rewards/margins": 2.470029354095459, + "rewards/rejected": -2.560657501220703, + "step": 298 + }, + { + "epoch": 1.771851851851852, + "grad_norm": 26.421795777830678, + "learning_rate": 4.776404908002245e-07, + "logits/chosen": -1.4593310356140137, + "logits/rejected": -1.4886685609817505, + "logps/chosen": -41.98119354248047, + "logps/rejected": -54.190040588378906, + "loss": 0.3293, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.015456534922122955, + "rewards/margins": 0.9114388823509216, + "rewards/rejected": -0.8959822654724121, + "step": 299 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 25.388116798218327, + "learning_rate": 4.773711848575356e-07, + "logits/chosen": -0.882595419883728, + "logits/rejected": -0.8482030630111694, + "logps/chosen": -60.15789031982422, + "logps/rejected": -53.647987365722656, + "loss": 0.3177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21632501482963562, + "rewards/margins": 1.840660572052002, + "rewards/rejected": -2.05698561668396, + "step": 300 + }, + { + "epoch": 1.7837037037037038, + "grad_norm": 27.44128369149265, + "learning_rate": 4.771003436534702e-07, + "logits/chosen": -1.155822515487671, + "logits/rejected": -1.1653910875320435, + "logps/chosen": -44.46099090576172, + "logps/rejected": -67.78788757324219, + "loss": 0.3193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11511935293674469, + "rewards/margins": 2.467637062072754, + "rewards/rejected": -2.35251784324646, + "step": 301 + }, + { + "epoch": 1.7896296296296297, + "grad_norm": 31.0131157657199, + "learning_rate": 4.7682796901680906e-07, + "logits/chosen": -1.3534022569656372, + "logits/rejected": -1.3563964366912842, + "logps/chosen": -52.00239944458008, + "logps/rejected": -64.85633850097656, + "loss": 0.3686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26378777623176575, + "rewards/margins": 2.1019082069396973, + "rewards/rejected": -2.3656959533691406, + "step": 302 + }, + { + "epoch": 1.7955555555555556, + "grad_norm": 25.572323066424033, + "learning_rate": 4.765540627866869e-07, + "logits/chosen": -1.3851449489593506, + "logits/rejected": -1.3254812955856323, + "logps/chosen": -64.02766418457031, + "logps/rejected": -55.131404876708984, + "loss": 0.307, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37291884422302246, + "rewards/margins": 1.6172915697097778, + "rewards/rejected": -1.9902105331420898, + "step": 303 + }, + { + "epoch": 1.8014814814814815, + "grad_norm": 24.973006963104183, + "learning_rate": 4.7627862681258027e-07, + "logits/chosen": -1.3431549072265625, + "logits/rejected": -1.3935474157333374, + "logps/chosen": -43.056156158447266, + "logps/rejected": -45.55668640136719, + "loss": 0.3787, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.027092069387435913, + "rewards/margins": 1.2533209323883057, + "rewards/rejected": -1.2804131507873535, + "step": 304 + }, + { + "epoch": 1.8074074074074074, + "grad_norm": 23.246647674435096, + "learning_rate": 4.7600166295429476e-07, + "logits/chosen": -1.082540512084961, + "logits/rejected": -1.1362693309783936, + "logps/chosen": -40.44417190551758, + "logps/rejected": -52.83973693847656, + "loss": 0.3181, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2112465500831604, + "rewards/margins": 1.6178412437438965, + "rewards/rejected": -1.829087734222412, + "step": 305 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 27.00429197324121, + "learning_rate": 4.7572317308195276e-07, + "logits/chosen": -1.2468149662017822, + "logits/rejected": -1.3040968179702759, + "logps/chosen": -46.73735809326172, + "logps/rejected": -63.27323913574219, + "loss": 0.3206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.37287694215774536, + "rewards/margins": 1.7324719429016113, + "rewards/rejected": -2.105348825454712, + "step": 306 + }, + { + "epoch": 1.8192592592592591, + "grad_norm": 23.44156041895113, + "learning_rate": 4.7544315907598034e-07, + "logits/chosen": -1.345916748046875, + "logits/rejected": -1.3447480201721191, + "logps/chosen": -40.71674728393555, + "logps/rejected": -50.70779800415039, + "loss": 0.3617, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0339193195104599, + "rewards/margins": 1.5760494470596313, + "rewards/rejected": -1.5421302318572998, + "step": 307 + }, + { + "epoch": 1.8251851851851852, + "grad_norm": 25.06604927659884, + "learning_rate": 4.7516162282709515e-07, + "logits/chosen": -1.147845983505249, + "logits/rejected": -1.1910618543624878, + "logps/chosen": -45.736656188964844, + "logps/rejected": -55.5797004699707, + "loss": 0.3699, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0804969072341919, + "rewards/margins": 1.4355418682098389, + "rewards/rejected": -1.5160387754440308, + "step": 308 + }, + { + "epoch": 1.8311111111111111, + "grad_norm": 25.46137399084043, + "learning_rate": 4.748785662362932e-07, + "logits/chosen": -1.3299181461334229, + "logits/rejected": -1.3480998277664185, + "logps/chosen": -55.302337646484375, + "logps/rejected": -79.200439453125, + "loss": 0.3278, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21703583002090454, + "rewards/margins": 1.7550965547561646, + "rewards/rejected": -1.5380607843399048, + "step": 309 + }, + { + "epoch": 1.837037037037037, + "grad_norm": 26.51707060569584, + "learning_rate": 4.7459399121483634e-07, + "logits/chosen": -1.1016050577163696, + "logits/rejected": -1.2106202840805054, + "logps/chosen": -53.677574157714844, + "logps/rejected": -60.549896240234375, + "loss": 0.3819, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.054566219449043274, + "rewards/margins": 2.2532806396484375, + "rewards/rejected": -2.3078465461730957, + "step": 310 + }, + { + "epoch": 1.842962962962963, + "grad_norm": 26.561068654774267, + "learning_rate": 4.74307899684239e-07, + "logits/chosen": -1.2419092655181885, + "logits/rejected": -1.2967182397842407, + "logps/chosen": -53.5194091796875, + "logps/rejected": -60.51311111450195, + "loss": 0.3916, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.191430002450943, + "rewards/margins": 1.867770791053772, + "rewards/rejected": -2.0592007637023926, + "step": 311 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 24.80025359759185, + "learning_rate": 4.7402029357625563e-07, + "logits/chosen": -1.4073894023895264, + "logits/rejected": -1.2945380210876465, + "logps/chosen": -51.761009216308594, + "logps/rejected": -56.40605926513672, + "loss": 0.3481, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22043563425540924, + "rewards/margins": 1.9667236804962158, + "rewards/rejected": -2.187159538269043, + "step": 312 + }, + { + "epoch": 1.854814814814815, + "grad_norm": 26.590869331925262, + "learning_rate": 4.737311748328673e-07, + "logits/chosen": -1.3399579524993896, + "logits/rejected": -1.4058165550231934, + "logps/chosen": -48.944793701171875, + "logps/rejected": -67.84580993652344, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09198684245347977, + "rewards/margins": 2.516043186187744, + "rewards/rejected": -2.608030080795288, + "step": 313 + }, + { + "epoch": 1.8607407407407406, + "grad_norm": 21.57321505154053, + "learning_rate": 4.7344054540626887e-07, + "logits/chosen": -1.2113909721374512, + "logits/rejected": -1.29359769821167, + "logps/chosen": -40.80859375, + "logps/rejected": -51.39629364013672, + "loss": 0.2984, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.013930395245552063, + "rewards/margins": 2.274604082107544, + "rewards/rejected": -2.2606735229492188, + "step": 314 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 24.633388476867335, + "learning_rate": 4.731484072588555e-07, + "logits/chosen": -1.2730799913406372, + "logits/rejected": -1.3082690238952637, + "logps/chosen": -49.419525146484375, + "logps/rejected": -56.43271255493164, + "loss": 0.3089, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40381959080696106, + "rewards/margins": 2.45341420173645, + "rewards/rejected": -2.857233762741089, + "step": 315 + }, + { + "epoch": 1.8725925925925926, + "grad_norm": 24.457073160874554, + "learning_rate": 4.7285476236320976e-07, + "logits/chosen": -1.2295560836791992, + "logits/rejected": -1.246058464050293, + "logps/chosen": -47.56389236450195, + "logps/rejected": -63.71388626098633, + "loss": 0.3404, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.009183228015899658, + "rewards/margins": 2.0711443424224854, + "rewards/rejected": -2.0803275108337402, + "step": 316 + }, + { + "epoch": 1.8785185185185185, + "grad_norm": 24.216496534182244, + "learning_rate": 4.725596127020879e-07, + "logits/chosen": -1.4801841974258423, + "logits/rejected": -1.4818217754364014, + "logps/chosen": -49.63344955444336, + "logps/rejected": -67.68574523925781, + "loss": 0.327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30766502022743225, + "rewards/margins": 1.4231786727905273, + "rewards/rejected": -1.1155136823654175, + "step": 317 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 29.289508668995026, + "learning_rate": 4.7226296026840686e-07, + "logits/chosen": -1.3967444896697998, + "logits/rejected": -1.442480444908142, + "logps/chosen": -44.551124572753906, + "logps/rejected": -52.382606506347656, + "loss": 0.4098, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.198195219039917, + "rewards/margins": 0.9453842639923096, + "rewards/rejected": -1.1435794830322266, + "step": 318 + }, + { + "epoch": 1.8903703703703703, + "grad_norm": 29.842607290777202, + "learning_rate": 4.7196480706523066e-07, + "logits/chosen": -1.3752483129501343, + "logits/rejected": -1.4549767971038818, + "logps/chosen": -45.82021713256836, + "logps/rejected": -60.437225341796875, + "loss": 0.4177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03907575458288193, + "rewards/margins": 1.6668366193771362, + "rewards/rejected": -1.7059123516082764, + "step": 319 + }, + { + "epoch": 1.8962962962962964, + "grad_norm": 27.506534719126677, + "learning_rate": 4.716651551057567e-07, + "logits/chosen": -1.272362470626831, + "logits/rejected": -1.2426154613494873, + "logps/chosen": -47.80769348144531, + "logps/rejected": -59.07539367675781, + "loss": 0.3766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17932742834091187, + "rewards/margins": 1.7639901638031006, + "rewards/rejected": -1.9433174133300781, + "step": 320 + }, + { + "epoch": 1.9022222222222223, + "grad_norm": 23.467742474280602, + "learning_rate": 4.7136400641330245e-07, + "logits/chosen": -1.2614030838012695, + "logits/rejected": -1.359266996383667, + "logps/chosen": -39.78665542602539, + "logps/rejected": -58.80195617675781, + "loss": 0.3148, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.033597737550735474, + "rewards/margins": 2.5896811485290527, + "rewards/rejected": -2.5560834407806396, + "step": 321 + }, + { + "epoch": 1.9081481481481481, + "grad_norm": 26.8700405040657, + "learning_rate": 4.710613630212916e-07, + "logits/chosen": -1.360331416130066, + "logits/rejected": -1.2999292612075806, + "logps/chosen": -54.633628845214844, + "logps/rejected": -65.06961059570312, + "loss": 0.3516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14919951558113098, + "rewards/margins": 3.318268299102783, + "rewards/rejected": -3.1690688133239746, + "step": 322 + }, + { + "epoch": 1.914074074074074, + "grad_norm": 25.742617657524853, + "learning_rate": 4.707572269732404e-07, + "logits/chosen": -1.267808437347412, + "logits/rejected": -1.2729883193969727, + "logps/chosen": -43.86378860473633, + "logps/rejected": -60.72163009643555, + "loss": 0.3297, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22709807753562927, + "rewards/margins": 1.6642297506332397, + "rewards/rejected": -1.4371315240859985, + "step": 323 + }, + { + "epoch": 1.92, + "grad_norm": 16.874300534646334, + "learning_rate": 4.7045160032274376e-07, + "logits/chosen": -1.300421953201294, + "logits/rejected": -1.3349748849868774, + "logps/chosen": -55.42463684082031, + "logps/rejected": -74.85099792480469, + "loss": 0.2324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36268478631973267, + "rewards/margins": 2.777242660522461, + "rewards/rejected": -3.139927387237549, + "step": 324 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 21.141707211169013, + "learning_rate": 4.701444851334617e-07, + "logits/chosen": -1.3354458808898926, + "logits/rejected": -1.3530535697937012, + "logps/chosen": -45.7332763671875, + "logps/rejected": -46.52414321899414, + "loss": 0.2912, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22032663226127625, + "rewards/margins": 2.1787269115448, + "rewards/rejected": -1.9584002494812012, + "step": 325 + }, + { + "epoch": 1.9318518518518517, + "grad_norm": 23.948147218514162, + "learning_rate": 4.698358834791051e-07, + "logits/chosen": -1.2572468519210815, + "logits/rejected": -1.3264085054397583, + "logps/chosen": -46.600433349609375, + "logps/rejected": -62.773555755615234, + "loss": 0.2962, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13065075874328613, + "rewards/margins": 2.4154248237609863, + "rewards/rejected": -2.5460758209228516, + "step": 326 + }, + { + "epoch": 1.9377777777777778, + "grad_norm": 24.99842201946006, + "learning_rate": 4.695257974434215e-07, + "logits/chosen": -1.3555095195770264, + "logits/rejected": -1.362163782119751, + "logps/chosen": -53.40457534790039, + "logps/rejected": -53.34534454345703, + "loss": 0.3544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1949038803577423, + "rewards/margins": 1.9531302452087402, + "rewards/rejected": -2.14803409576416, + "step": 327 + }, + { + "epoch": 1.9437037037037037, + "grad_norm": 20.447626099998985, + "learning_rate": 4.6921422912018174e-07, + "logits/chosen": -1.222961187362671, + "logits/rejected": -1.3553869724273682, + "logps/chosen": -38.92687225341797, + "logps/rejected": -69.66336059570312, + "loss": 0.2428, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08813555538654327, + "rewards/margins": 2.587808847427368, + "rewards/rejected": -2.499673366546631, + "step": 328 + }, + { + "epoch": 1.9496296296296296, + "grad_norm": 24.203644596916128, + "learning_rate": 4.689011806131651e-07, + "logits/chosen": -1.2260617017745972, + "logits/rejected": -1.2644914388656616, + "logps/chosen": -48.461448669433594, + "logps/rejected": -53.205650329589844, + "loss": 0.3463, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5683692693710327, + "rewards/margins": 1.0428115129470825, + "rewards/rejected": -1.6111807823181152, + "step": 329 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 24.924372266570685, + "learning_rate": 4.685866540361455e-07, + "logits/chosen": -1.095257043838501, + "logits/rejected": -1.2506214380264282, + "logps/chosen": -43.94602584838867, + "logps/rejected": -65.505615234375, + "loss": 0.3495, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2011902928352356, + "rewards/margins": 2.575469493865967, + "rewards/rejected": -2.7766599655151367, + "step": 330 + }, + { + "epoch": 1.9614814814814814, + "grad_norm": 21.28680096956084, + "learning_rate": 4.6827065151287726e-07, + "logits/chosen": -1.120419979095459, + "logits/rejected": -1.2555115222930908, + "logps/chosen": -51.23665237426758, + "logps/rejected": -65.72523498535156, + "loss": 0.2945, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3505423069000244, + "rewards/margins": 1.8112775087356567, + "rewards/rejected": -2.1618199348449707, + "step": 331 + }, + { + "epoch": 1.9674074074074075, + "grad_norm": 26.217325638823457, + "learning_rate": 4.6795317517708037e-07, + "logits/chosen": -1.298682451248169, + "logits/rejected": -1.3794772624969482, + "logps/chosen": -49.35894775390625, + "logps/rejected": -53.33086395263672, + "loss": 0.3594, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04362444207072258, + "rewards/margins": 1.6961524486541748, + "rewards/rejected": -1.652527928352356, + "step": 332 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 25.481027607863936, + "learning_rate": 4.676342271724265e-07, + "logits/chosen": -1.204390048980713, + "logits/rejected": -1.3311251401901245, + "logps/chosen": -39.88072967529297, + "logps/rejected": -50.89021682739258, + "loss": 0.3892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02257857657968998, + "rewards/margins": 2.2824935913085938, + "rewards/rejected": -2.2599148750305176, + "step": 333 + }, + { + "epoch": 1.9792592592592593, + "grad_norm": 26.32787839414432, + "learning_rate": 4.673138096525243e-07, + "logits/chosen": -1.3207244873046875, + "logits/rejected": -1.3684799671173096, + "logps/chosen": -49.37727355957031, + "logps/rejected": -68.2453384399414, + "loss": 0.3426, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12120073288679123, + "rewards/margins": 1.4206024408340454, + "rewards/rejected": -1.5418031215667725, + "step": 334 + }, + { + "epoch": 1.9851851851851852, + "grad_norm": 22.17620720442061, + "learning_rate": 4.6699192478090495e-07, + "logits/chosen": -1.388519287109375, + "logits/rejected": -1.4136606454849243, + "logps/chosen": -42.03928756713867, + "logps/rejected": -63.68479919433594, + "loss": 0.2835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16710436344146729, + "rewards/margins": 2.781280040740967, + "rewards/rejected": -2.9483845233917236, + "step": 335 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 23.44832441743166, + "learning_rate": 4.666685747310074e-07, + "logits/chosen": -1.2400968074798584, + "logits/rejected": -1.2306945323944092, + "logps/chosen": -50.3095703125, + "logps/rejected": -66.10108184814453, + "loss": 0.269, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1821298599243164, + "rewards/margins": 2.479905605316162, + "rewards/rejected": -2.6620354652404785, + "step": 336 + }, + { + "epoch": 1.9970370370370372, + "grad_norm": 26.36611150038332, + "learning_rate": 4.663437616861641e-07, + "logits/chosen": -1.223244547843933, + "logits/rejected": -1.3455448150634766, + "logps/chosen": -49.680442810058594, + "logps/rejected": -54.95825958251953, + "loss": 0.3365, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02616964653134346, + "rewards/margins": 2.6336476802825928, + "rewards/rejected": -2.6074776649475098, + "step": 337 + }, + { + "epoch": 2.002962962962963, + "grad_norm": 21.883276832839393, + "learning_rate": 4.660174878395855e-07, + "logits/chosen": -1.2061843872070312, + "logits/rejected": -1.2987509965896606, + "logps/chosen": -46.96556854248047, + "logps/rejected": -58.78457260131836, + "loss": 0.3064, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11700738966464996, + "rewards/margins": 2.5580999851226807, + "rewards/rejected": -2.675107479095459, + "step": 338 + }, + { + "epoch": 2.008888888888889, + "grad_norm": 19.40225522629075, + "learning_rate": 4.6568975539434624e-07, + "logits/chosen": -1.2043370008468628, + "logits/rejected": -1.2271485328674316, + "logps/chosen": -40.650550842285156, + "logps/rejected": -53.05525207519531, + "loss": 0.274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08650882542133331, + "rewards/margins": 1.585855484008789, + "rewards/rejected": -1.4993466138839722, + "step": 339 + }, + { + "epoch": 2.0148148148148146, + "grad_norm": 18.552582482952893, + "learning_rate": 4.653605665633694e-07, + "logits/chosen": -1.258097767829895, + "logits/rejected": -1.3163410425186157, + "logps/chosen": -52.92136764526367, + "logps/rejected": -72.87959289550781, + "loss": 0.2445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22618934512138367, + "rewards/margins": 1.7594165802001953, + "rewards/rejected": -1.9856061935424805, + "step": 340 + }, + { + "epoch": 2.0207407407407407, + "grad_norm": 17.05097874274958, + "learning_rate": 4.6502992356941193e-07, + "logits/chosen": -1.1468836069107056, + "logits/rejected": -1.255903959274292, + "logps/chosen": -50.84268569946289, + "logps/rejected": -70.92694091796875, + "loss": 0.2173, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13767805695533752, + "rewards/margins": 3.111330270767212, + "rewards/rejected": -2.9736523628234863, + "step": 341 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 18.830186351991596, + "learning_rate": 4.6469782864504993e-07, + "logits/chosen": -1.1887123584747314, + "logits/rejected": -1.2675807476043701, + "logps/chosen": -52.629188537597656, + "logps/rejected": -63.47866439819336, + "loss": 0.2332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07282394915819168, + "rewards/margins": 2.2217636108398438, + "rewards/rejected": -2.148939609527588, + "step": 342 + }, + { + "epoch": 2.0325925925925925, + "grad_norm": 18.786840301991177, + "learning_rate": 4.643642840326627e-07, + "logits/chosen": -1.1792242527008057, + "logits/rejected": -1.296401023864746, + "logps/chosen": -37.09912872314453, + "logps/rejected": -64.6133804321289, + "loss": 0.2325, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.18781086802482605, + "rewards/margins": 2.0591042041778564, + "rewards/rejected": -2.246915102005005, + "step": 343 + }, + { + "epoch": 2.0385185185185186, + "grad_norm": 22.083870697287118, + "learning_rate": 4.6402929198441876e-07, + "logits/chosen": -1.3264474868774414, + "logits/rejected": -1.3211390972137451, + "logps/chosen": -46.68233108520508, + "logps/rejected": -55.392520904541016, + "loss": 0.2993, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011279929429292679, + "rewards/margins": 2.057001829147339, + "rewards/rejected": -2.068281650543213, + "step": 344 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 21.347264429630435, + "learning_rate": 4.6369285476225953e-07, + "logits/chosen": -1.1508305072784424, + "logits/rejected": -1.319612979888916, + "logps/chosen": -31.885753631591797, + "logps/rejected": -49.32568359375, + "loss": 0.2799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35789793729782104, + "rewards/margins": 2.335634231567383, + "rewards/rejected": -1.977736234664917, + "step": 345 + }, + { + "epoch": 2.0503703703703704, + "grad_norm": 24.570352764030314, + "learning_rate": 4.6335497463788497e-07, + "logits/chosen": -1.243829607963562, + "logits/rejected": -1.3181369304656982, + "logps/chosen": -60.18506622314453, + "logps/rejected": -69.44244384765625, + "loss": 0.3113, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3555411696434021, + "rewards/margins": 2.665473461151123, + "rewards/rejected": -3.02101469039917, + "step": 346 + }, + { + "epoch": 2.0562962962962965, + "grad_norm": 17.6853062093381, + "learning_rate": 4.6301565389273755e-07, + "logits/chosen": -1.5396754741668701, + "logits/rejected": -1.5316152572631836, + "logps/chosen": -47.59368896484375, + "logps/rejected": -55.50246810913086, + "loss": 0.2219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09630143642425537, + "rewards/margins": 2.2724692821502686, + "rewards/rejected": -2.3687705993652344, + "step": 347 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 21.180954712617115, + "learning_rate": 4.6267489481798736e-07, + "logits/chosen": -1.3041589260101318, + "logits/rejected": -1.3119125366210938, + "logps/chosen": -51.84857177734375, + "logps/rejected": -72.58021545410156, + "loss": 0.3009, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2827097177505493, + "rewards/margins": 3.221679449081421, + "rewards/rejected": -2.938969612121582, + "step": 348 + }, + { + "epoch": 2.0681481481481483, + "grad_norm": 22.28131069737945, + "learning_rate": 4.6233269971451627e-07, + "logits/chosen": -1.3343658447265625, + "logits/rejected": -1.345481276512146, + "logps/chosen": -54.08373260498047, + "logps/rejected": -62.955909729003906, + "loss": 0.2586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5682754516601562, + "rewards/margins": 1.7530314922332764, + "rewards/rejected": -2.3213071823120117, + "step": 349 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 20.48670863420585, + "learning_rate": 4.619890708929025e-07, + "logits/chosen": -1.255729079246521, + "logits/rejected": -1.3206520080566406, + "logps/chosen": -46.70282745361328, + "logps/rejected": -62.74363708496094, + "loss": 0.2786, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.245827779173851, + "rewards/margins": 1.665952444076538, + "rewards/rejected": -1.4201246500015259, + "step": 350 + }, + { + "epoch": 2.08, + "grad_norm": 17.96440002462223, + "learning_rate": 4.6164401067340526e-07, + "logits/chosen": -1.2545899152755737, + "logits/rejected": -1.322774887084961, + "logps/chosen": -42.6780891418457, + "logps/rejected": -56.45946502685547, + "loss": 0.2144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19612331688404083, + "rewards/margins": 2.441504955291748, + "rewards/rejected": -2.2453818321228027, + "step": 351 + }, + { + "epoch": 2.0859259259259257, + "grad_norm": 15.24610036802107, + "learning_rate": 4.612975213859487e-07, + "logits/chosen": -1.2593666315078735, + "logits/rejected": -1.2873666286468506, + "logps/chosen": -49.45045471191406, + "logps/rejected": -74.48650360107422, + "loss": 0.2063, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11988449841737747, + "rewards/margins": 2.6868491172790527, + "rewards/rejected": -2.8067336082458496, + "step": 352 + }, + { + "epoch": 2.091851851851852, + "grad_norm": 19.39107155679951, + "learning_rate": 4.609496053701064e-07, + "logits/chosen": -1.0879367589950562, + "logits/rejected": -1.1195926666259766, + "logps/chosen": -42.81309509277344, + "logps/rejected": -66.11750030517578, + "loss": 0.2408, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.026289865374565125, + "rewards/margins": 3.345236301422119, + "rewards/rejected": -3.3715262413024902, + "step": 353 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 18.590102731321778, + "learning_rate": 4.606002649750855e-07, + "logits/chosen": -1.2917572259902954, + "logits/rejected": -1.3807308673858643, + "logps/chosen": -50.10943603515625, + "logps/rejected": -64.49555969238281, + "loss": 0.2492, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17642168700695038, + "rewards/margins": 2.523153781890869, + "rewards/rejected": -2.699575424194336, + "step": 354 + }, + { + "epoch": 2.1037037037037036, + "grad_norm": 19.687014144002955, + "learning_rate": 4.6024950255971106e-07, + "logits/chosen": -1.2346889972686768, + "logits/rejected": -1.3283984661102295, + "logps/chosen": -44.39927673339844, + "logps/rejected": -67.63725280761719, + "loss": 0.2436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20876558125019073, + "rewards/margins": 2.4332196712493896, + "rewards/rejected": -2.224454164505005, + "step": 355 + }, + { + "epoch": 2.1096296296296297, + "grad_norm": 20.92862453942209, + "learning_rate": 4.598973204924097e-07, + "logits/chosen": -1.226645588874817, + "logits/rejected": -1.2606167793273926, + "logps/chosen": -41.84648132324219, + "logps/rejected": -62.02039337158203, + "loss": 0.2865, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.004774313420057297, + "rewards/margins": 2.0877463817596436, + "rewards/rejected": -2.0829720497131348, + "step": 356 + }, + { + "epoch": 2.1155555555555554, + "grad_norm": 19.003566570766115, + "learning_rate": 4.5954372115119395e-07, + "logits/chosen": -1.2138569355010986, + "logits/rejected": -1.2532055377960205, + "logps/chosen": -45.88983154296875, + "logps/rejected": -62.843265533447266, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34711575508117676, + "rewards/margins": 2.869643449783325, + "rewards/rejected": -2.5225276947021484, + "step": 357 + }, + { + "epoch": 2.1214814814814815, + "grad_norm": 21.94315127503371, + "learning_rate": 4.5918870692364606e-07, + "logits/chosen": -1.2227267026901245, + "logits/rejected": -1.2402318716049194, + "logps/chosen": -50.86392593383789, + "logps/rejected": -69.76829528808594, + "loss": 0.2698, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.050845518708229065, + "rewards/margins": 2.4909873008728027, + "rewards/rejected": -2.54183292388916, + "step": 358 + }, + { + "epoch": 2.127407407407407, + "grad_norm": 19.662389858943495, + "learning_rate": 4.5883228020690204e-07, + "logits/chosen": -1.335383653640747, + "logits/rejected": -1.4468356370925903, + "logps/chosen": -55.14125061035156, + "logps/rejected": -81.48980712890625, + "loss": 0.2557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03477056324481964, + "rewards/margins": 2.8970305919647217, + "rewards/rejected": -2.9318013191223145, + "step": 359 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 16.289737421447924, + "learning_rate": 4.5847444340763516e-07, + "logits/chosen": -1.258293867111206, + "logits/rejected": -1.3521993160247803, + "logps/chosen": -42.92197799682617, + "logps/rejected": -79.57373809814453, + "loss": 0.2154, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12718605995178223, + "rewards/margins": 3.129183769226074, + "rewards/rejected": -3.2563695907592773, + "step": 360 + }, + { + "epoch": 2.1392592592592594, + "grad_norm": 19.421173501056757, + "learning_rate": 4.5811519894204e-07, + "logits/chosen": -1.2771823406219482, + "logits/rejected": -1.2784755229949951, + "logps/chosen": -45.26205062866211, + "logps/rejected": -49.14915084838867, + "loss": 0.2966, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3483662009239197, + "rewards/margins": 1.337389349937439, + "rewards/rejected": -1.6857554912567139, + "step": 361 + }, + { + "epoch": 2.145185185185185, + "grad_norm": 17.47408354292453, + "learning_rate": 4.577545492358159e-07, + "logits/chosen": -1.4037187099456787, + "logits/rejected": -1.4769129753112793, + "logps/chosen": -40.952667236328125, + "logps/rejected": -42.973777770996094, + "loss": 0.2218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18929442763328552, + "rewards/margins": 1.6638569831848145, + "rewards/rejected": -1.474562406539917, + "step": 362 + }, + { + "epoch": 2.151111111111111, + "grad_norm": 19.77727999230194, + "learning_rate": 4.573924967241509e-07, + "logits/chosen": -1.2610749006271362, + "logits/rejected": -1.358107566833496, + "logps/chosen": -52.18950653076172, + "logps/rejected": -61.858158111572266, + "loss": 0.2557, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.37960347533226013, + "rewards/margins": 1.40680730342865, + "rewards/rejected": -1.7864106893539429, + "step": 363 + }, + { + "epoch": 2.157037037037037, + "grad_norm": 21.887529470415178, + "learning_rate": 4.5702904385170495e-07, + "logits/chosen": -1.1152464151382446, + "logits/rejected": -1.1701469421386719, + "logps/chosen": -42.043331146240234, + "logps/rejected": -60.342838287353516, + "loss": 0.2271, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09139405190944672, + "rewards/margins": 2.1598100662231445, + "rewards/rejected": -2.068415880203247, + "step": 364 + }, + { + "epoch": 2.162962962962963, + "grad_norm": 20.56491024004271, + "learning_rate": 4.566641930725935e-07, + "logits/chosen": -1.3306797742843628, + "logits/rejected": -1.3579617738723755, + "logps/chosen": -46.26102828979492, + "logps/rejected": -64.42369079589844, + "loss": 0.2496, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.09908976405858994, + "rewards/margins": 2.0486528873443604, + "rewards/rejected": -2.147742509841919, + "step": 365 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 21.929097513898213, + "learning_rate": 4.5629794685037125e-07, + "logits/chosen": -1.3259196281433105, + "logits/rejected": -1.438348412513733, + "logps/chosen": -48.74983596801758, + "logps/rejected": -70.9134292602539, + "loss": 0.2965, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014818083494901657, + "rewards/margins": 2.414520263671875, + "rewards/rejected": -2.429338216781616, + "step": 366 + }, + { + "epoch": 2.1748148148148148, + "grad_norm": 21.17363277706632, + "learning_rate": 4.5593030765801493e-07, + "logits/chosen": -1.1760063171386719, + "logits/rejected": -1.3117597103118896, + "logps/chosen": -40.7785530090332, + "logps/rejected": -55.60898971557617, + "loss": 0.2749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1574682742357254, + "rewards/margins": 2.237823247909546, + "rewards/rejected": -2.395291566848755, + "step": 367 + }, + { + "epoch": 2.180740740740741, + "grad_norm": 15.52905572804225, + "learning_rate": 4.555612779779071e-07, + "logits/chosen": -1.177214503288269, + "logits/rejected": -1.1642229557037354, + "logps/chosen": -50.61836242675781, + "logps/rejected": -67.73853302001953, + "loss": 0.1967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10218016803264618, + "rewards/margins": 2.1828601360321045, + "rewards/rejected": -2.2850403785705566, + "step": 368 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 18.91258488483491, + "learning_rate": 4.551908603018191e-07, + "logits/chosen": -1.194425344467163, + "logits/rejected": -1.2497589588165283, + "logps/chosen": -44.26604461669922, + "logps/rejected": -61.95736312866211, + "loss": 0.2498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11065736413002014, + "rewards/margins": 2.543154239654541, + "rewards/rejected": -2.432497024536133, + "step": 369 + }, + { + "epoch": 2.1925925925925926, + "grad_norm": 21.766572819900112, + "learning_rate": 4.548190571308944e-07, + "logits/chosen": -1.2378605604171753, + "logits/rejected": -1.2948296070098877, + "logps/chosen": -47.71161651611328, + "logps/rejected": -73.49365234375, + "loss": 0.2675, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05222397670149803, + "rewards/margins": 2.5048434734344482, + "rewards/rejected": -2.557067394256592, + "step": 370 + }, + { + "epoch": 2.1985185185185183, + "grad_norm": 18.684296079943266, + "learning_rate": 4.5444587097563166e-07, + "logits/chosen": -1.4498980045318604, + "logits/rejected": -1.547486424446106, + "logps/chosen": -49.34889221191406, + "logps/rejected": -64.73408508300781, + "loss": 0.2505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009527906775474548, + "rewards/margins": 2.8368988037109375, + "rewards/rejected": -2.846426486968994, + "step": 371 + }, + { + "epoch": 2.2044444444444444, + "grad_norm": 20.657169189715972, + "learning_rate": 4.540713043558677e-07, + "logits/chosen": -1.2857564687728882, + "logits/rejected": -1.3269503116607666, + "logps/chosen": -58.65913772583008, + "logps/rejected": -69.71886444091797, + "loss": 0.2709, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27366358041763306, + "rewards/margins": 2.6564712524414062, + "rewards/rejected": -2.382807731628418, + "step": 372 + }, + { + "epoch": 2.2103703703703705, + "grad_norm": 17.83103596373758, + "learning_rate": 4.536953598007607e-07, + "logits/chosen": -1.4808869361877441, + "logits/rejected": -1.4333157539367676, + "logps/chosen": -57.45781707763672, + "logps/rejected": -56.999839782714844, + "loss": 0.2291, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30290722846984863, + "rewards/margins": 2.3255772590637207, + "rewards/rejected": -2.022670269012451, + "step": 373 + }, + { + "epoch": 2.216296296296296, + "grad_norm": 15.546827141849722, + "learning_rate": 4.533180398487726e-07, + "logits/chosen": -1.2495567798614502, + "logits/rejected": -1.2979238033294678, + "logps/chosen": -63.49386215209961, + "logps/rejected": -64.50764465332031, + "loss": 0.2212, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.29394635558128357, + "rewards/margins": 3.0806996822357178, + "rewards/rejected": -2.7867531776428223, + "step": 374 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 21.86157113722065, + "learning_rate": 4.529393470476528e-07, + "logits/chosen": -1.3445886373519897, + "logits/rejected": -1.3152225017547607, + "logps/chosen": -47.706058502197266, + "logps/rejected": -49.469703674316406, + "loss": 0.2421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.056947916746139526, + "rewards/margins": 2.0946450233459473, + "rewards/rejected": -2.0376970767974854, + "step": 375 + }, + { + "epoch": 2.228148148148148, + "grad_norm": 17.43370404766248, + "learning_rate": 4.525592839544202e-07, + "logits/chosen": -1.1432135105133057, + "logits/rejected": -1.265343427658081, + "logps/chosen": -40.92923355102539, + "logps/rejected": -68.56723022460938, + "loss": 0.2291, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3592386841773987, + "rewards/margins": 3.1753196716308594, + "rewards/rejected": -3.5345585346221924, + "step": 376 + }, + { + "epoch": 2.234074074074074, + "grad_norm": 18.051266700370928, + "learning_rate": 4.521778531353462e-07, + "logits/chosen": -1.3263722658157349, + "logits/rejected": -1.3930516242980957, + "logps/chosen": -47.67115020751953, + "logps/rejected": -63.065364837646484, + "loss": 0.1941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1850857436656952, + "rewards/margins": 3.161947250366211, + "rewards/rejected": -3.3470327854156494, + "step": 377 + }, + { + "epoch": 2.24, + "grad_norm": 16.584129376985537, + "learning_rate": 4.517950571659376e-07, + "logits/chosen": -1.0496330261230469, + "logits/rejected": -1.141974687576294, + "logps/chosen": -40.75746154785156, + "logps/rejected": -60.00761795043945, + "loss": 0.2137, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10689018666744232, + "rewards/margins": 2.207362651824951, + "rewards/rejected": -2.3142528533935547, + "step": 378 + }, + { + "epoch": 2.245925925925926, + "grad_norm": 21.83474303669417, + "learning_rate": 4.5141089863091876e-07, + "logits/chosen": -1.3251947164535522, + "logits/rejected": -1.4994127750396729, + "logps/chosen": -45.369197845458984, + "logps/rejected": -60.48664093017578, + "loss": 0.2603, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1240503191947937, + "rewards/margins": 2.388545513153076, + "rewards/rejected": -2.2644951343536377, + "step": 379 + }, + { + "epoch": 2.251851851851852, + "grad_norm": 16.78610677871395, + "learning_rate": 4.5102538012421463e-07, + "logits/chosen": -1.26201593875885, + "logits/rejected": -1.312106728553772, + "logps/chosen": -38.60343933105469, + "logps/rejected": -53.056270599365234, + "loss": 0.2297, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21554449200630188, + "rewards/margins": 2.4207348823547363, + "rewards/rejected": -2.2051901817321777, + "step": 380 + }, + { + "epoch": 2.2577777777777777, + "grad_norm": 23.152601443193166, + "learning_rate": 4.506385042489328e-07, + "logits/chosen": -1.4015882015228271, + "logits/rejected": -1.346733808517456, + "logps/chosen": -51.458953857421875, + "logps/rejected": -56.37825012207031, + "loss": 0.31, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21772369742393494, + "rewards/margins": 1.8389344215393066, + "rewards/rejected": -1.6212105751037598, + "step": 381 + }, + { + "epoch": 2.2637037037037038, + "grad_norm": 21.27171913918321, + "learning_rate": 4.5025027361734613e-07, + "logits/chosen": -1.1399121284484863, + "logits/rejected": -1.2481327056884766, + "logps/chosen": -39.44144058227539, + "logps/rejected": -61.8831901550293, + "loss": 0.3136, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05294826626777649, + "rewards/margins": 2.6817171573638916, + "rewards/rejected": -2.6287689208984375, + "step": 382 + }, + { + "epoch": 2.2696296296296294, + "grad_norm": 21.31274885508849, + "learning_rate": 4.498606908508753e-07, + "logits/chosen": -1.1971873044967651, + "logits/rejected": -1.1886755228042603, + "logps/chosen": -42.121524810791016, + "logps/rejected": -58.5207405090332, + "loss": 0.2814, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0361473523080349, + "rewards/margins": 2.876237630844116, + "rewards/rejected": -2.840089797973633, + "step": 383 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 19.525267138237183, + "learning_rate": 4.4946975858007064e-07, + "logits/chosen": -1.2907319068908691, + "logits/rejected": -1.3001677989959717, + "logps/chosen": -39.252166748046875, + "logps/rejected": -56.55052947998047, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35257211327552795, + "rewards/margins": 3.527677536010742, + "rewards/rejected": -3.175105571746826, + "step": 384 + }, + { + "epoch": 2.2814814814814817, + "grad_norm": 21.7988692040449, + "learning_rate": 4.4907747944459484e-07, + "logits/chosen": -1.2917722463607788, + "logits/rejected": -1.3734023571014404, + "logps/chosen": -52.56479263305664, + "logps/rejected": -63.253562927246094, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09430493414402008, + "rewards/margins": 2.2183127403259277, + "rewards/rejected": -2.1240079402923584, + "step": 385 + }, + { + "epoch": 2.2874074074074073, + "grad_norm": 15.89301818799247, + "learning_rate": 4.486838560932048e-07, + "logits/chosen": -1.228432297706604, + "logits/rejected": -1.2870917320251465, + "logps/chosen": -45.21752166748047, + "logps/rejected": -54.30406951904297, + "loss": 0.2494, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20035433769226074, + "rewards/margins": 1.99539053440094, + "rewards/rejected": -2.195744752883911, + "step": 386 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 14.328174093261362, + "learning_rate": 4.4828889118373394e-07, + "logits/chosen": -1.2832486629486084, + "logits/rejected": -1.2992146015167236, + "logps/chosen": -54.598121643066406, + "logps/rejected": -64.07887268066406, + "loss": 0.157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2271214723587036, + "rewards/margins": 2.856295108795166, + "rewards/rejected": -2.629173755645752, + "step": 387 + }, + { + "epoch": 2.299259259259259, + "grad_norm": 21.241967819688803, + "learning_rate": 4.4789258738307413e-07, + "logits/chosen": -1.3300862312316895, + "logits/rejected": -1.480217456817627, + "logps/chosen": -42.869178771972656, + "logps/rejected": -63.462066650390625, + "loss": 0.2883, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06202170252799988, + "rewards/margins": 1.9598344564437866, + "rewards/rejected": -1.8978126049041748, + "step": 388 + }, + { + "epoch": 2.3051851851851852, + "grad_norm": 19.822991467494322, + "learning_rate": 4.474949473671578e-07, + "logits/chosen": -1.0555452108383179, + "logits/rejected": -1.042801022529602, + "logps/chosen": -38.282432556152344, + "logps/rejected": -58.744590759277344, + "loss": 0.2191, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36787959933280945, + "rewards/margins": 3.253455400466919, + "rewards/rejected": -2.885575532913208, + "step": 389 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 21.08659415889429, + "learning_rate": 4.4709597382093976e-07, + "logits/chosen": -1.3296679258346558, + "logits/rejected": -1.3939673900604248, + "logps/chosen": -45.78785705566406, + "logps/rejected": -65.52828979492188, + "loss": 0.2543, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6854375004768372, + "rewards/margins": 1.8844356536865234, + "rewards/rejected": -1.198998212814331, + "step": 390 + }, + { + "epoch": 2.317037037037037, + "grad_norm": 16.82223616928778, + "learning_rate": 4.4669566943837916e-07, + "logits/chosen": -1.3169894218444824, + "logits/rejected": -1.3499053716659546, + "logps/chosen": -46.05174255371094, + "logps/rejected": -60.99545669555664, + "loss": 0.2095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08141281455755234, + "rewards/margins": 2.3928728103637695, + "rewards/rejected": -2.47428560256958, + "step": 391 + }, + { + "epoch": 2.322962962962963, + "grad_norm": 20.79237283373031, + "learning_rate": 4.462940369224212e-07, + "logits/chosen": -1.3965792655944824, + "logits/rejected": -1.3936798572540283, + "logps/chosen": -43.78404235839844, + "logps/rejected": -59.989688873291016, + "loss": 0.2344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13246512413024902, + "rewards/margins": 2.7492973804473877, + "rewards/rejected": -2.8817625045776367, + "step": 392 + }, + { + "epoch": 2.328888888888889, + "grad_norm": 16.04337015452463, + "learning_rate": 4.4589107898497885e-07, + "logits/chosen": -1.2513833045959473, + "logits/rejected": -1.3017098903656006, + "logps/chosen": -47.380184173583984, + "logps/rejected": -60.824520111083984, + "loss": 0.205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05711951479315758, + "rewards/margins": 2.9654927253723145, + "rewards/rejected": -3.0226120948791504, + "step": 393 + }, + { + "epoch": 2.334814814814815, + "grad_norm": 22.721297568840754, + "learning_rate": 4.454867983469148e-07, + "logits/chosen": -1.3320375680923462, + "logits/rejected": -1.4008305072784424, + "logps/chosen": -45.44051742553711, + "logps/rejected": -55.844852447509766, + "loss": 0.2664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1650034487247467, + "rewards/margins": 1.6259642839431763, + "rewards/rejected": -1.7909677028656006, + "step": 394 + }, + { + "epoch": 2.3407407407407406, + "grad_norm": 17.321615878576583, + "learning_rate": 4.4508119773802294e-07, + "logits/chosen": -1.246985912322998, + "logits/rejected": -1.2684519290924072, + "logps/chosen": -35.195648193359375, + "logps/rejected": -53.54356002807617, + "loss": 0.2295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10865418612957001, + "rewards/margins": 2.806920051574707, + "rewards/rejected": -2.69826602935791, + "step": 395 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 16.827821341688068, + "learning_rate": 4.4467427989700967e-07, + "logits/chosen": -1.0964595079421997, + "logits/rejected": -1.1211133003234863, + "logps/chosen": -60.25492858886719, + "logps/rejected": -70.0510025024414, + "loss": 0.2176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3291707932949066, + "rewards/margins": 3.0319859981536865, + "rewards/rejected": -3.361156940460205, + "step": 396 + }, + { + "epoch": 2.3525925925925923, + "grad_norm": 17.57499092020414, + "learning_rate": 4.442660475714758e-07, + "logits/chosen": -1.3368898630142212, + "logits/rejected": -1.3711633682250977, + "logps/chosen": -55.84284210205078, + "logps/rejected": -67.59239196777344, + "loss": 0.2135, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.33838677406311035, + "rewards/margins": 2.886188507080078, + "rewards/rejected": -3.2245752811431885, + "step": 397 + }, + { + "epoch": 2.3585185185185185, + "grad_norm": 15.652194606879792, + "learning_rate": 4.438565035178979e-07, + "logits/chosen": -1.3266401290893555, + "logits/rejected": -1.2795634269714355, + "logps/chosen": -39.78150177001953, + "logps/rejected": -52.82093048095703, + "loss": 0.2074, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13564905524253845, + "rewards/margins": 2.1605172157287598, + "rewards/rejected": -2.0248682498931885, + "step": 398 + }, + { + "epoch": 2.3644444444444446, + "grad_norm": 16.839558145858316, + "learning_rate": 4.434456505016094e-07, + "logits/chosen": -1.1723679304122925, + "logits/rejected": -1.2517800331115723, + "logps/chosen": -39.925350189208984, + "logps/rejected": -54.610374450683594, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07945774495601654, + "rewards/margins": 2.555344343185425, + "rewards/rejected": -2.475886344909668, + "step": 399 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 20.294868135471607, + "learning_rate": 4.430334912967823e-07, + "logits/chosen": -1.2500160932540894, + "logits/rejected": -1.350742220878601, + "logps/chosen": -47.22673034667969, + "logps/rejected": -63.268775939941406, + "loss": 0.217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0041030943393707275, + "rewards/margins": 2.526927947998047, + "rewards/rejected": -2.522824764251709, + "step": 400 + }, + { + "epoch": 2.3762962962962964, + "grad_norm": 14.309694923599055, + "learning_rate": 4.4262002868640826e-07, + "logits/chosen": -1.355212688446045, + "logits/rejected": -1.4428194761276245, + "logps/chosen": -62.766273498535156, + "logps/rejected": -70.3138198852539, + "loss": 0.1547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7476651668548584, + "rewards/margins": 2.508098602294922, + "rewards/rejected": -3.2557637691497803, + "step": 401 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 22.127072690196805, + "learning_rate": 4.422052654622799e-07, + "logits/chosen": -1.1594799757003784, + "logits/rejected": -1.1490657329559326, + "logps/chosen": -51.881160736083984, + "logps/rejected": -63.252323150634766, + "loss": 0.2435, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6946939826011658, + "rewards/margins": 3.3599495887756348, + "rewards/rejected": -4.054643630981445, + "step": 402 + }, + { + "epoch": 2.388148148148148, + "grad_norm": 14.586982600230868, + "learning_rate": 4.417892044249716e-07, + "logits/chosen": -1.1080595254898071, + "logits/rejected": -1.2213943004608154, + "logps/chosen": -48.669837951660156, + "logps/rejected": -62.76885986328125, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2799440622329712, + "rewards/margins": 2.3571085929870605, + "rewards/rejected": -2.6370527744293213, + "step": 403 + }, + { + "epoch": 2.3940740740740742, + "grad_norm": 16.407021152533144, + "learning_rate": 4.4137184838382125e-07, + "logits/chosen": -1.338707447052002, + "logits/rejected": -1.4166909456253052, + "logps/chosen": -49.7010498046875, + "logps/rejected": -63.55725860595703, + "loss": 0.2026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03665490448474884, + "rewards/margins": 3.0050880908966064, + "rewards/rejected": -2.968432903289795, + "step": 404 + }, + { + "epoch": 2.4, + "grad_norm": 15.47676852300944, + "learning_rate": 4.409532001569105e-07, + "logits/chosen": -1.0699396133422852, + "logits/rejected": -1.1151628494262695, + "logps/chosen": -44.824283599853516, + "logps/rejected": -58.522796630859375, + "loss": 0.1994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2893208861351013, + "rewards/margins": 3.242827892303467, + "rewards/rejected": -3.532148838043213, + "step": 405 + }, + { + "epoch": 2.405925925925926, + "grad_norm": 22.374176782743092, + "learning_rate": 4.405332625710465e-07, + "logits/chosen": -1.202767252922058, + "logits/rejected": -1.2067598104476929, + "logps/chosen": -50.498321533203125, + "logps/rejected": -56.5478401184082, + "loss": 0.2553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38981667160987854, + "rewards/margins": 2.1452839374542236, + "rewards/rejected": -2.5351004600524902, + "step": 406 + }, + { + "epoch": 2.4118518518518517, + "grad_norm": 16.687053025804047, + "learning_rate": 4.401120384617423e-07, + "logits/chosen": -1.4677153825759888, + "logits/rejected": -1.3985137939453125, + "logps/chosen": -61.11507034301758, + "logps/rejected": -70.34519958496094, + "loss": 0.2365, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.737166702747345, + "rewards/margins": 3.606313943862915, + "rewards/rejected": -4.343481063842773, + "step": 407 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 18.669321736650193, + "learning_rate": 4.396895306731977e-07, + "logits/chosen": -1.3816975355148315, + "logits/rejected": -1.339860200881958, + "logps/chosen": -51.137413024902344, + "logps/rejected": -51.43387985229492, + "loss": 0.2025, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10709303617477417, + "rewards/margins": 2.5654587745666504, + "rewards/rejected": -2.6725518703460693, + "step": 408 + }, + { + "epoch": 2.423703703703704, + "grad_norm": 15.749361927163159, + "learning_rate": 4.3926574205828037e-07, + "logits/chosen": -1.1337440013885498, + "logits/rejected": -1.2759745121002197, + "logps/chosen": -35.335487365722656, + "logps/rejected": -54.63352584838867, + "loss": 0.192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07870366424322128, + "rewards/margins": 3.1690025329589844, + "rewards/rejected": -3.090298652648926, + "step": 409 + }, + { + "epoch": 2.4296296296296296, + "grad_norm": 18.104317236901334, + "learning_rate": 4.388406754785063e-07, + "logits/chosen": -1.3435664176940918, + "logits/rejected": -1.3172290325164795, + "logps/chosen": -41.54008865356445, + "logps/rejected": -57.23595428466797, + "loss": 0.2439, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2125397026538849, + "rewards/margins": 2.206397771835327, + "rewards/rejected": -2.4189372062683105, + "step": 410 + }, + { + "epoch": 2.4355555555555557, + "grad_norm": 21.577238479012582, + "learning_rate": 4.3841433380402073e-07, + "logits/chosen": -1.3005995750427246, + "logits/rejected": -1.3782299757003784, + "logps/chosen": -48.72734069824219, + "logps/rejected": -74.93363952636719, + "loss": 0.2632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2786545753479004, + "rewards/margins": 3.2695655822753906, + "rewards/rejected": -2.9909110069274902, + "step": 411 + }, + { + "epoch": 2.4414814814814814, + "grad_norm": 14.034858000999158, + "learning_rate": 4.379867199135785e-07, + "logits/chosen": -1.4155633449554443, + "logits/rejected": -1.472240686416626, + "logps/chosen": -45.016868591308594, + "logps/rejected": -76.23003387451172, + "loss": 0.1571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6123547554016113, + "rewards/margins": 3.1689391136169434, + "rewards/rejected": -3.7812938690185547, + "step": 412 + }, + { + "epoch": 2.4474074074074075, + "grad_norm": 16.779130305262424, + "learning_rate": 4.375578366945246e-07, + "logits/chosen": -1.3278855085372925, + "logits/rejected": -1.3343737125396729, + "logps/chosen": -46.692420959472656, + "logps/rejected": -53.395774841308594, + "loss": 0.1859, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5343155264854431, + "rewards/margins": 2.1621780395507812, + "rewards/rejected": -2.696493625640869, + "step": 413 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 19.140589088785962, + "learning_rate": 4.3712768704277524e-07, + "logits/chosen": -1.3284047842025757, + "logits/rejected": -1.3112947940826416, + "logps/chosen": -43.248924255371094, + "logps/rejected": -55.926727294921875, + "loss": 0.2448, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09817357361316681, + "rewards/margins": 2.1646008491516113, + "rewards/rejected": -2.2627744674682617, + "step": 414 + }, + { + "epoch": 2.4592592592592593, + "grad_norm": 21.078378047432963, + "learning_rate": 4.366962738627975e-07, + "logits/chosen": -1.0980134010314941, + "logits/rejected": -1.2354460954666138, + "logps/chosen": -33.716976165771484, + "logps/rejected": -55.4927978515625, + "loss": 0.2456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5064057111740112, + "rewards/margins": 2.779224395751953, + "rewards/rejected": -3.285630226135254, + "step": 415 + }, + { + "epoch": 2.4651851851851854, + "grad_norm": 17.149088012986873, + "learning_rate": 4.3626360006759016e-07, + "logits/chosen": -1.3233143091201782, + "logits/rejected": -1.318281650543213, + "logps/chosen": -57.326744079589844, + "logps/rejected": -63.12444305419922, + "loss": 0.1736, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13537603616714478, + "rewards/margins": 2.3170013427734375, + "rewards/rejected": -2.4523775577545166, + "step": 416 + }, + { + "epoch": 2.471111111111111, + "grad_norm": 16.379890197970408, + "learning_rate": 4.3582966857866397e-07, + "logits/chosen": -1.3193371295928955, + "logits/rejected": -1.3573179244995117, + "logps/chosen": -41.1270637512207, + "logps/rejected": -57.641868591308594, + "loss": 0.2079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10911901295185089, + "rewards/margins": 2.449983835220337, + "rewards/rejected": -2.340864896774292, + "step": 417 + }, + { + "epoch": 2.477037037037037, + "grad_norm": 18.867606776179514, + "learning_rate": 4.353944823260221e-07, + "logits/chosen": -1.1927530765533447, + "logits/rejected": -1.2237083911895752, + "logps/chosen": -41.90693664550781, + "logps/rejected": -67.52421569824219, + "loss": 0.2608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.009037256240844727, + "rewards/margins": 2.025681495666504, + "rewards/rejected": -2.0347187519073486, + "step": 418 + }, + { + "epoch": 2.482962962962963, + "grad_norm": 19.73233833392236, + "learning_rate": 4.3495804424813986e-07, + "logits/chosen": -1.3428009748458862, + "logits/rejected": -1.4153268337249756, + "logps/chosen": -46.6826057434082, + "logps/rejected": -60.5880012512207, + "loss": 0.2624, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07019033282995224, + "rewards/margins": 2.512794256210327, + "rewards/rejected": -2.582984685897827, + "step": 419 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 16.547428466719083, + "learning_rate": 4.3452035729194534e-07, + "logits/chosen": -1.2679708003997803, + "logits/rejected": -1.3539072275161743, + "logps/chosen": -44.795372009277344, + "logps/rejected": -65.70006561279297, + "loss": 0.2128, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28597304224967957, + "rewards/margins": 3.0686075687408447, + "rewards/rejected": -2.7826344966888428, + "step": 420 + }, + { + "epoch": 2.4948148148148146, + "grad_norm": 19.61553554693557, + "learning_rate": 4.340814244127993e-07, + "logits/chosen": -1.2777574062347412, + "logits/rejected": -1.4196518659591675, + "logps/chosen": -45.838314056396484, + "logps/rejected": -57.400760650634766, + "loss": 0.2595, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10272904485464096, + "rewards/margins": 1.4262562990188599, + "rewards/rejected": -1.5289852619171143, + "step": 421 + }, + { + "epoch": 2.5007407407407407, + "grad_norm": 17.434275927746988, + "learning_rate": 4.3364124857447525e-07, + "logits/chosen": -1.1939697265625, + "logits/rejected": -1.2483940124511719, + "logps/chosen": -52.27749252319336, + "logps/rejected": -65.15104675292969, + "loss": 0.1998, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14658820629119873, + "rewards/margins": 2.7353010177612305, + "rewards/rejected": -2.588712692260742, + "step": 422 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 18.989696276439936, + "learning_rate": 4.331998327491395e-07, + "logits/chosen": -1.390942931175232, + "logits/rejected": -1.388514518737793, + "logps/chosen": -48.308021545410156, + "logps/rejected": -62.73554229736328, + "loss": 0.2025, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4263012409210205, + "rewards/margins": 2.9498748779296875, + "rewards/rejected": -3.376175880432129, + "step": 423 + }, + { + "epoch": 2.5125925925925925, + "grad_norm": 18.052245677061464, + "learning_rate": 4.3275717991733097e-07, + "logits/chosen": -1.1927298307418823, + "logits/rejected": -1.2486273050308228, + "logps/chosen": -40.60221862792969, + "logps/rejected": -54.09161376953125, + "loss": 0.2339, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24772867560386658, + "rewards/margins": 1.7447564601898193, + "rewards/rejected": -1.9924849271774292, + "step": 424 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 15.60989150821569, + "learning_rate": 4.3231329306794106e-07, + "logits/chosen": -1.206120252609253, + "logits/rejected": -1.2819563150405884, + "logps/chosen": -47.97679138183594, + "logps/rejected": -60.327110290527344, + "loss": 0.2114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0791320651769638, + "rewards/margins": 2.8907041549682617, + "rewards/rejected": -2.9698362350463867, + "step": 425 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 22.753332037000355, + "learning_rate": 4.3186817519819365e-07, + "logits/chosen": -1.3056433200836182, + "logits/rejected": -1.2404112815856934, + "logps/chosen": -48.254608154296875, + "logps/rejected": -63.55730056762695, + "loss": 0.2564, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5698607563972473, + "rewards/margins": 3.0423474311828613, + "rewards/rejected": -3.612208366394043, + "step": 426 + }, + { + "epoch": 2.5303703703703704, + "grad_norm": 19.819465925330856, + "learning_rate": 4.314218293136247e-07, + "logits/chosen": -1.2659276723861694, + "logits/rejected": -1.3557699918746948, + "logps/chosen": -40.644039154052734, + "logps/rejected": -51.36490249633789, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053419072180986404, + "rewards/margins": 2.2699971199035645, + "rewards/rejected": -2.323416233062744, + "step": 427 + }, + { + "epoch": 2.536296296296296, + "grad_norm": 17.374322567562608, + "learning_rate": 4.30974258428062e-07, + "logits/chosen": -1.2225278615951538, + "logits/rejected": -1.2667872905731201, + "logps/chosen": -57.24745178222656, + "logps/rejected": -51.737056732177734, + "loss": 0.2103, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41055721044540405, + "rewards/margins": 2.0105388164520264, + "rewards/rejected": -2.421095848083496, + "step": 428 + }, + { + "epoch": 2.542222222222222, + "grad_norm": 16.50173497546316, + "learning_rate": 4.3052546556360486e-07, + "logits/chosen": -1.2917060852050781, + "logits/rejected": -1.2804986238479614, + "logps/chosen": -39.234336853027344, + "logps/rejected": -51.28722381591797, + "loss": 0.1917, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2359772026538849, + "rewards/margins": 2.1480839252471924, + "rewards/rejected": -1.9121068716049194, + "step": 429 + }, + { + "epoch": 2.5481481481481483, + "grad_norm": 18.45028210343722, + "learning_rate": 4.300754537506036e-07, + "logits/chosen": -1.4156684875488281, + "logits/rejected": -1.3991972208023071, + "logps/chosen": -48.09461212158203, + "logps/rejected": -51.55734634399414, + "loss": 0.2199, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08063582330942154, + "rewards/margins": 2.727208137512207, + "rewards/rejected": -2.6465723514556885, + "step": 430 + }, + { + "epoch": 2.554074074074074, + "grad_norm": 18.538785529380714, + "learning_rate": 4.2962422602763925e-07, + "logits/chosen": -1.2535463571548462, + "logits/rejected": -1.2664145231246948, + "logps/chosen": -39.574153900146484, + "logps/rejected": -58.36887741088867, + "loss": 0.2514, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.023720331490039825, + "rewards/margins": 2.1531851291656494, + "rewards/rejected": -2.129464626312256, + "step": 431 + }, + { + "epoch": 2.56, + "grad_norm": 20.53028864801327, + "learning_rate": 4.2917178544150284e-07, + "logits/chosen": -1.2374775409698486, + "logits/rejected": -1.3365933895111084, + "logps/chosen": -40.8128547668457, + "logps/rejected": -62.45230484008789, + "loss": 0.2055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37315669655799866, + "rewards/margins": 2.7972960472106934, + "rewards/rejected": -3.170452833175659, + "step": 432 + }, + { + "epoch": 2.565925925925926, + "grad_norm": 23.60580081011262, + "learning_rate": 4.2871813504717497e-07, + "logits/chosen": -1.3173975944519043, + "logits/rejected": -1.3482545614242554, + "logps/chosen": -45.852691650390625, + "logps/rejected": -65.40410614013672, + "loss": 0.2228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3781399428844452, + "rewards/margins": 2.6333463191986084, + "rewards/rejected": -3.011486291885376, + "step": 433 + }, + { + "epoch": 2.571851851851852, + "grad_norm": 17.646349860883195, + "learning_rate": 4.2826327790780505e-07, + "logits/chosen": -1.362741470336914, + "logits/rejected": -1.388155460357666, + "logps/chosen": -48.00885772705078, + "logps/rejected": -59.41029739379883, + "loss": 0.2175, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.17543169856071472, + "rewards/margins": 2.6400036811828613, + "rewards/rejected": -2.8154356479644775, + "step": 434 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 19.358547711268727, + "learning_rate": 4.278072170946909e-07, + "logits/chosen": -1.3277589082717896, + "logits/rejected": -1.3741272687911987, + "logps/chosen": -53.56547927856445, + "logps/rejected": -63.62765121459961, + "loss": 0.2636, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37384939193725586, + "rewards/margins": 2.68127703666687, + "rewards/rejected": -3.055126190185547, + "step": 435 + }, + { + "epoch": 2.5837037037037036, + "grad_norm": 14.283524298642329, + "learning_rate": 4.273499556872576e-07, + "logits/chosen": -1.125074863433838, + "logits/rejected": -1.2602410316467285, + "logps/chosen": -43.2216796875, + "logps/rejected": -65.49564361572266, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21781915426254272, + "rewards/margins": 3.0550427436828613, + "rewards/rejected": -3.272862195968628, + "step": 436 + }, + { + "epoch": 2.5896296296296297, + "grad_norm": 14.547779436253572, + "learning_rate": 4.2689149677303716e-07, + "logits/chosen": -1.3099594116210938, + "logits/rejected": -1.4164810180664062, + "logps/chosen": -51.39214324951172, + "logps/rejected": -58.63938903808594, + "loss": 0.1663, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15673355758190155, + "rewards/margins": 2.696232557296753, + "rewards/rejected": -2.539498805999756, + "step": 437 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 18.874472104141944, + "learning_rate": 4.264318434476472e-07, + "logits/chosen": -1.340260624885559, + "logits/rejected": -1.4135973453521729, + "logps/chosen": -54.49589157104492, + "logps/rejected": -62.51300048828125, + "loss": 0.2392, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.20530621707439423, + "rewards/margins": 2.566615581512451, + "rewards/rejected": -2.7719218730926514, + "step": 438 + }, + { + "epoch": 2.6014814814814815, + "grad_norm": 24.652912032755857, + "learning_rate": 4.2597099881477017e-07, + "logits/chosen": -1.4756532907485962, + "logits/rejected": -1.5366077423095703, + "logps/chosen": -40.621578216552734, + "logps/rejected": -55.144046783447266, + "loss": 0.2691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39135220646858215, + "rewards/margins": 1.317076563835144, + "rewards/rejected": -1.7084288597106934, + "step": 439 + }, + { + "epoch": 2.6074074074074076, + "grad_norm": 16.69781220569815, + "learning_rate": 4.2550896598613297e-07, + "logits/chosen": -1.3262248039245605, + "logits/rejected": -1.401850938796997, + "logps/chosen": -32.84125518798828, + "logps/rejected": -61.752288818359375, + "loss": 0.2153, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14606288075447083, + "rewards/margins": 2.7544689178466797, + "rewards/rejected": -2.9005320072174072, + "step": 440 + }, + { + "epoch": 2.6133333333333333, + "grad_norm": 22.119565477895417, + "learning_rate": 4.25045748081485e-07, + "logits/chosen": -1.2764012813568115, + "logits/rejected": -1.4300332069396973, + "logps/chosen": -37.34137725830078, + "logps/rejected": -64.76432800292969, + "loss": 0.2216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03927718102931976, + "rewards/margins": 3.770249366760254, + "rewards/rejected": -3.7309722900390625, + "step": 441 + }, + { + "epoch": 2.6192592592592594, + "grad_norm": 14.928256593837876, + "learning_rate": 4.2458134822857774e-07, + "logits/chosen": -1.2378556728363037, + "logits/rejected": -1.359390377998352, + "logps/chosen": -46.040767669677734, + "logps/rejected": -67.20500183105469, + "loss": 0.1878, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04216981679201126, + "rewards/margins": 3.2073171138763428, + "rewards/rejected": -3.2494869232177734, + "step": 442 + }, + { + "epoch": 2.625185185185185, + "grad_norm": 14.96404630636834, + "learning_rate": 4.241157695631435e-07, + "logits/chosen": -1.2404184341430664, + "logits/rejected": -1.3973965644836426, + "logps/chosen": -43.609745025634766, + "logps/rejected": -65.57633209228516, + "loss": 0.1734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29699721932411194, + "rewards/margins": 3.3562171459198, + "rewards/rejected": -3.653214454650879, + "step": 443 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 20.94939040431925, + "learning_rate": 4.2364901522887416e-07, + "logits/chosen": -1.3479417562484741, + "logits/rejected": -1.3275196552276611, + "logps/chosen": -45.53289794921875, + "logps/rejected": -62.82244873046875, + "loss": 0.2071, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2697879672050476, + "rewards/margins": 3.2815232276916504, + "rewards/rejected": -3.5513112545013428, + "step": 444 + }, + { + "epoch": 2.637037037037037, + "grad_norm": 21.81972888844552, + "learning_rate": 4.2318108837739986e-07, + "logits/chosen": -1.0954722166061401, + "logits/rejected": -1.2724300622940063, + "logps/chosen": -37.40864562988281, + "logps/rejected": -53.508148193359375, + "loss": 0.2343, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5114908814430237, + "rewards/margins": 2.329716682434082, + "rewards/rejected": -2.841207504272461, + "step": 445 + }, + { + "epoch": 2.642962962962963, + "grad_norm": 17.622719951148344, + "learning_rate": 4.22711992168268e-07, + "logits/chosen": -1.292818546295166, + "logits/rejected": -1.4211820363998413, + "logps/chosen": -55.30113220214844, + "logps/rejected": -62.69170379638672, + "loss": 0.2219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12264469265937805, + "rewards/margins": 2.33625864982605, + "rewards/rejected": -2.4589033126831055, + "step": 446 + }, + { + "epoch": 2.648888888888889, + "grad_norm": 16.90251346927541, + "learning_rate": 4.2224172976892166e-07, + "logits/chosen": -1.198624610900879, + "logits/rejected": -1.3212577104568481, + "logps/chosen": -56.71440887451172, + "logps/rejected": -77.87358856201172, + "loss": 0.1828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1206379383802414, + "rewards/margins": 3.149707078933716, + "rewards/rejected": -3.2703449726104736, + "step": 447 + }, + { + "epoch": 2.6548148148148147, + "grad_norm": 19.93007489097777, + "learning_rate": 4.217703043546783e-07, + "logits/chosen": -1.3870353698730469, + "logits/rejected": -1.4539594650268555, + "logps/chosen": -50.45875930786133, + "logps/rejected": -58.48164367675781, + "loss": 0.194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5994179248809814, + "rewards/margins": 2.5853588581085205, + "rewards/rejected": -3.184777021408081, + "step": 448 + }, + { + "epoch": 2.660740740740741, + "grad_norm": 13.25713353861389, + "learning_rate": 4.2129771910870845e-07, + "logits/chosen": -1.2270911931991577, + "logits/rejected": -1.3550801277160645, + "logps/chosen": -44.222511291503906, + "logps/rejected": -73.00066375732422, + "loss": 0.1563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31280404329299927, + "rewards/margins": 3.3302433490753174, + "rewards/rejected": -3.643047332763672, + "step": 449 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 17.010657621012665, + "learning_rate": 4.2082397722201385e-07, + "logits/chosen": -1.1666088104248047, + "logits/rejected": -1.1798983812332153, + "logps/chosen": -34.95051193237305, + "logps/rejected": -64.77764129638672, + "loss": 0.1893, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.23295338451862335, + "rewards/margins": 3.2355153560638428, + "rewards/rejected": -3.46846866607666, + "step": 450 + }, + { + "epoch": 2.6725925925925926, + "grad_norm": 16.03990487758234, + "learning_rate": 4.2034908189340634e-07, + "logits/chosen": -1.2905778884887695, + "logits/rejected": -1.2771592140197754, + "logps/chosen": -43.270263671875, + "logps/rejected": -62.975738525390625, + "loss": 0.1909, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37080562114715576, + "rewards/margins": 3.1950912475585938, + "rewards/rejected": -3.565896987915039, + "step": 451 + }, + { + "epoch": 2.6785185185185183, + "grad_norm": 16.60951149016189, + "learning_rate": 4.19873036329486e-07, + "logits/chosen": -1.1631643772125244, + "logits/rejected": -1.3125519752502441, + "logps/chosen": -52.858795166015625, + "logps/rejected": -61.08552169799805, + "loss": 0.1714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2712424397468567, + "rewards/margins": 2.5944318771362305, + "rewards/rejected": -2.8656740188598633, + "step": 452 + }, + { + "epoch": 2.6844444444444444, + "grad_norm": 18.307768575042843, + "learning_rate": 4.1939584374461943e-07, + "logits/chosen": -1.3807969093322754, + "logits/rejected": -1.4712002277374268, + "logps/chosen": -40.716617584228516, + "logps/rejected": -50.97564697265625, + "loss": 0.2065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5352606773376465, + "rewards/margins": 2.2707905769348145, + "rewards/rejected": -1.7355297803878784, + "step": 453 + }, + { + "epoch": 2.6903703703703705, + "grad_norm": 12.19613990182221, + "learning_rate": 4.189175073609184e-07, + "logits/chosen": -1.210724949836731, + "logits/rejected": -1.2001690864562988, + "logps/chosen": -50.027584075927734, + "logps/rejected": -60.53565979003906, + "loss": 0.1483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36551186442375183, + "rewards/margins": 2.5059051513671875, + "rewards/rejected": -2.1403932571411133, + "step": 454 + }, + { + "epoch": 2.696296296296296, + "grad_norm": 20.05543126255426, + "learning_rate": 4.184380304082177e-07, + "logits/chosen": -1.2690619230270386, + "logits/rejected": -1.2047336101531982, + "logps/chosen": -44.2292366027832, + "logps/rejected": -52.65461730957031, + "loss": 0.2265, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32682713866233826, + "rewards/margins": 2.385038375854492, + "rewards/rejected": -2.7118656635284424, + "step": 455 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 20.386292279150016, + "learning_rate": 4.179574161240536e-07, + "logits/chosen": -1.1057473421096802, + "logits/rejected": -1.1048200130462646, + "logps/chosen": -37.20689392089844, + "logps/rejected": -51.49712371826172, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22639192640781403, + "rewards/margins": 2.2510790824890137, + "rewards/rejected": -2.0246872901916504, + "step": 456 + }, + { + "epoch": 2.7081481481481484, + "grad_norm": 19.386564533555276, + "learning_rate": 4.1747566775364175e-07, + "logits/chosen": -1.4120452404022217, + "logits/rejected": -1.5171071290969849, + "logps/chosen": -34.28341293334961, + "logps/rejected": -67.71885681152344, + "loss": 0.2456, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5173302292823792, + "rewards/margins": 2.899099349975586, + "rewards/rejected": -2.3817691802978516, + "step": 457 + }, + { + "epoch": 2.714074074074074, + "grad_norm": 17.25041625855171, + "learning_rate": 4.169927885498556e-07, + "logits/chosen": -1.4341471195220947, + "logits/rejected": -1.4753046035766602, + "logps/chosen": -52.228519439697266, + "logps/rejected": -61.93224334716797, + "loss": 0.2118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5279741883277893, + "rewards/margins": 3.191676139831543, + "rewards/rejected": -3.7196502685546875, + "step": 458 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 19.00368041939193, + "learning_rate": 4.16508781773204e-07, + "logits/chosen": -1.1663175821304321, + "logits/rejected": -1.2493047714233398, + "logps/chosen": -59.34048843383789, + "logps/rejected": -66.43865203857422, + "loss": 0.2262, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7447548508644104, + "rewards/margins": 3.5409374237060547, + "rewards/rejected": -4.28569221496582, + "step": 459 + }, + { + "epoch": 2.725925925925926, + "grad_norm": 17.264561754976015, + "learning_rate": 4.1602365069180976e-07, + "logits/chosen": -1.3075733184814453, + "logits/rejected": -1.3201042413711548, + "logps/chosen": -53.10240936279297, + "logps/rejected": -57.974273681640625, + "loss": 0.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26173388957977295, + "rewards/margins": 3.05476975440979, + "rewards/rejected": -3.3165035247802734, + "step": 460 + }, + { + "epoch": 2.731851851851852, + "grad_norm": 18.17486278562164, + "learning_rate": 4.155373985813868e-07, + "logits/chosen": -1.366817831993103, + "logits/rejected": -1.3806540966033936, + "logps/chosen": -39.87694549560547, + "logps/rejected": -48.488956451416016, + "loss": 0.2163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14938339591026306, + "rewards/margins": 2.15541410446167, + "rewards/rejected": -2.304797410964966, + "step": 461 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 19.361148379508926, + "learning_rate": 4.150500287252189e-07, + "logits/chosen": -1.2390245199203491, + "logits/rejected": -1.2137972116470337, + "logps/chosen": -52.104366302490234, + "logps/rejected": -63.186466217041016, + "loss": 0.2281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6157130002975464, + "rewards/margins": 2.714012861251831, + "rewards/rejected": -3.329725742340088, + "step": 462 + }, + { + "epoch": 2.7437037037037038, + "grad_norm": 13.808494148260458, + "learning_rate": 4.145615444141369e-07, + "logits/chosen": -1.2283602952957153, + "logits/rejected": -1.208832025527954, + "logps/chosen": -54.112083435058594, + "logps/rejected": -54.76646423339844, + "loss": 0.1701, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4094162881374359, + "rewards/margins": 2.894310474395752, + "rewards/rejected": -3.3037266731262207, + "step": 463 + }, + { + "epoch": 2.74962962962963, + "grad_norm": 19.11789416326404, + "learning_rate": 4.1407194894649677e-07, + "logits/chosen": -1.2563258409500122, + "logits/rejected": -1.2561264038085938, + "logps/chosen": -47.32088851928711, + "logps/rejected": -69.62883758544922, + "loss": 0.2193, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12513133883476257, + "rewards/margins": 3.2240326404571533, + "rewards/rejected": -3.349163770675659, + "step": 464 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 19.46290010104755, + "learning_rate": 4.135812456281571e-07, + "logits/chosen": -1.3946552276611328, + "logits/rejected": -1.474714994430542, + "logps/chosen": -50.58590316772461, + "logps/rejected": -89.3076171875, + "loss": 0.2179, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3854854106903076, + "rewards/margins": 3.04565691947937, + "rewards/rejected": -3.431142568588257, + "step": 465 + }, + { + "epoch": 2.7614814814814816, + "grad_norm": 21.08647372531421, + "learning_rate": 4.1308943777245717e-07, + "logits/chosen": -1.209939956665039, + "logits/rejected": -1.177263855934143, + "logps/chosen": -42.54806900024414, + "logps/rejected": -52.786014556884766, + "loss": 0.2676, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.288261353969574, + "rewards/margins": 2.845198631286621, + "rewards/rejected": -3.1334598064422607, + "step": 466 + }, + { + "epoch": 2.7674074074074073, + "grad_norm": 20.73103490558258, + "learning_rate": 4.1259652870019426e-07, + "logits/chosen": -1.169033169746399, + "logits/rejected": -1.1900144815444946, + "logps/chosen": -48.64216613769531, + "logps/rejected": -56.44643020629883, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49444669485092163, + "rewards/margins": 3.2292940616607666, + "rewards/rejected": -3.723741054534912, + "step": 467 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 16.731518786213478, + "learning_rate": 4.121025217396011e-07, + "logits/chosen": -1.3166173696517944, + "logits/rejected": -1.3515011072158813, + "logps/chosen": -41.45418167114258, + "logps/rejected": -50.53019714355469, + "loss": 0.1668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45292171835899353, + "rewards/margins": 2.9842123985290527, + "rewards/rejected": -3.437134265899658, + "step": 468 + }, + { + "epoch": 2.779259259259259, + "grad_norm": 21.284988595267457, + "learning_rate": 4.1160742022632395e-07, + "logits/chosen": -1.2805315256118774, + "logits/rejected": -1.2572264671325684, + "logps/chosen": -43.30008316040039, + "logps/rejected": -56.220314025878906, + "loss": 0.2376, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5084177255630493, + "rewards/margins": 1.7679126262664795, + "rewards/rejected": -2.2763302326202393, + "step": 469 + }, + { + "epoch": 2.785185185185185, + "grad_norm": 16.424236630335066, + "learning_rate": 4.1111122750339945e-07, + "logits/chosen": -1.1565334796905518, + "logits/rejected": -1.1801633834838867, + "logps/chosen": -56.21332931518555, + "logps/rejected": -68.18215942382812, + "loss": 0.1819, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.310497522354126, + "rewards/margins": 3.5792107582092285, + "rewards/rejected": -3.8897082805633545, + "step": 470 + }, + { + "epoch": 2.7911111111111113, + "grad_norm": 17.906723620074274, + "learning_rate": 4.106139469212326e-07, + "logits/chosen": -1.152573585510254, + "logits/rejected": -1.189849615097046, + "logps/chosen": -56.43069076538086, + "logps/rejected": -69.75574493408203, + "loss": 0.2075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.382152795791626, + "rewards/margins": 3.191413402557373, + "rewards/rejected": -3.573566198348999, + "step": 471 + }, + { + "epoch": 2.797037037037037, + "grad_norm": 18.98593991817937, + "learning_rate": 4.1011558183757374e-07, + "logits/chosen": -1.173750400543213, + "logits/rejected": -1.2400974035263062, + "logps/chosen": -39.005699157714844, + "logps/rejected": -63.15963363647461, + "loss": 0.2148, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7964209914207458, + "rewards/margins": 3.4397897720336914, + "rewards/rejected": -4.236210346221924, + "step": 472 + }, + { + "epoch": 2.802962962962963, + "grad_norm": 16.15306648500753, + "learning_rate": 4.0961613561749585e-07, + "logits/chosen": -1.4924187660217285, + "logits/rejected": -1.473052740097046, + "logps/chosen": -55.84531784057617, + "logps/rejected": -71.32598876953125, + "loss": 0.1963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7752455472946167, + "rewards/margins": 3.1186671257019043, + "rewards/rejected": -3.8939127922058105, + "step": 473 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 15.237259647333039, + "learning_rate": 4.091156116333723e-07, + "logits/chosen": -1.2219537496566772, + "logits/rejected": -1.2826833724975586, + "logps/chosen": -50.56085968017578, + "logps/rejected": -65.05696868896484, + "loss": 0.1791, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7408727407455444, + "rewards/margins": 2.5775582790374756, + "rewards/rejected": -3.3184311389923096, + "step": 474 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 14.632575495330288, + "learning_rate": 4.086140132648534e-07, + "logits/chosen": -1.3067800998687744, + "logits/rejected": -1.3324289321899414, + "logps/chosen": -49.524009704589844, + "logps/rejected": -74.86628723144531, + "loss": 0.1637, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.901452362537384, + "rewards/margins": 3.2804508209228516, + "rewards/rejected": -4.18190336227417, + "step": 475 + }, + { + "epoch": 2.8207407407407405, + "grad_norm": 21.595404469848596, + "learning_rate": 4.081113438988443e-07, + "logits/chosen": -1.3657824993133545, + "logits/rejected": -1.3470386266708374, + "logps/chosen": -52.59661102294922, + "logps/rejected": -66.06396484375, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33402401208877563, + "rewards/margins": 3.3505983352661133, + "rewards/rejected": -3.684622049331665, + "step": 476 + }, + { + "epoch": 2.8266666666666667, + "grad_norm": 19.687853433851274, + "learning_rate": 4.076076069294816e-07, + "logits/chosen": -1.3582686185836792, + "logits/rejected": -1.4166455268859863, + "logps/chosen": -51.02378845214844, + "logps/rejected": -74.09524536132812, + "loss": 0.1808, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.765562117099762, + "rewards/margins": 2.352506160736084, + "rewards/rejected": -3.1180684566497803, + "step": 477 + }, + { + "epoch": 2.8325925925925928, + "grad_norm": 17.745281021443276, + "learning_rate": 4.071028057581105e-07, + "logits/chosen": -1.248734474182129, + "logits/rejected": -1.292419672012329, + "logps/chosen": -71.39847564697266, + "logps/rejected": -67.45188903808594, + "loss": 0.2024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0719102621078491, + "rewards/margins": 2.9319186210632324, + "rewards/rejected": -4.003829002380371, + "step": 478 + }, + { + "epoch": 2.8385185185185184, + "grad_norm": 16.98519419081863, + "learning_rate": 4.065969437932622e-07, + "logits/chosen": -1.2369309663772583, + "logits/rejected": -1.2602821588516235, + "logps/chosen": -61.797019958496094, + "logps/rejected": -68.8377456665039, + "loss": 0.197, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.268001914024353, + "rewards/margins": 2.377183675765991, + "rewards/rejected": -3.6451854705810547, + "step": 479 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 17.804384178514177, + "learning_rate": 4.0609002445063036e-07, + "logits/chosen": -1.3432717323303223, + "logits/rejected": -1.2590982913970947, + "logps/chosen": -51.91122055053711, + "logps/rejected": -61.9157829284668, + "loss": 0.2035, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4239550232887268, + "rewards/margins": 2.9792933464050293, + "rewards/rejected": -3.4032483100891113, + "step": 480 + }, + { + "epoch": 2.85037037037037, + "grad_norm": 18.264245793315055, + "learning_rate": 4.0558205115304846e-07, + "logits/chosen": -1.3515952825546265, + "logits/rejected": -1.4534411430358887, + "logps/chosen": -58.752376556396484, + "logps/rejected": -73.56411743164062, + "loss": 0.2108, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.48288118839263916, + "rewards/margins": 2.813925266265869, + "rewards/rejected": -3.2968060970306396, + "step": 481 + }, + { + "epoch": 2.8562962962962963, + "grad_norm": 19.268397927205957, + "learning_rate": 4.050730273304663e-07, + "logits/chosen": -1.2159310579299927, + "logits/rejected": -1.2838042974472046, + "logps/chosen": -44.14533233642578, + "logps/rejected": -68.81448364257812, + "loss": 0.2378, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07248367369174957, + "rewards/margins": 3.768887996673584, + "rewards/rejected": -3.841372013092041, + "step": 482 + }, + { + "epoch": 2.862222222222222, + "grad_norm": 11.562310542554206, + "learning_rate": 4.045629564199273e-07, + "logits/chosen": -1.3160737752914429, + "logits/rejected": -1.359323501586914, + "logps/chosen": -59.05349349975586, + "logps/rejected": -77.77356719970703, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.020738810300827026, + "rewards/margins": 4.168304443359375, + "rewards/rejected": -4.189043045043945, + "step": 483 + }, + { + "epoch": 2.868148148148148, + "grad_norm": 20.72487276535475, + "learning_rate": 4.04051841865545e-07, + "logits/chosen": -1.2002463340759277, + "logits/rejected": -1.2717535495758057, + "logps/chosen": -54.947025299072266, + "logps/rejected": -49.54365539550781, + "loss": 0.2478, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12642936408519745, + "rewards/margins": 1.9815359115600586, + "rewards/rejected": -2.1079654693603516, + "step": 484 + }, + { + "epoch": 2.8740740740740742, + "grad_norm": 19.870823704086312, + "learning_rate": 4.0353968711847974e-07, + "logits/chosen": -1.1724351644515991, + "logits/rejected": -1.266343116760254, + "logps/chosen": -53.72722625732422, + "logps/rejected": -71.61346435546875, + "loss": 0.2174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19865109026432037, + "rewards/margins": 3.181943416595459, + "rewards/rejected": -3.380594491958618, + "step": 485 + }, + { + "epoch": 2.88, + "grad_norm": 23.156216167074366, + "learning_rate": 4.030264956369157e-07, + "logits/chosen": -1.1195604801177979, + "logits/rejected": -1.1153167486190796, + "logps/chosen": -59.811885833740234, + "logps/rejected": -68.43907165527344, + "loss": 0.1927, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1706525832414627, + "rewards/margins": 3.3727633953094482, + "rewards/rejected": -3.5434162616729736, + "step": 486 + }, + { + "epoch": 2.885925925925926, + "grad_norm": 21.673088334063607, + "learning_rate": 4.02512270886037e-07, + "logits/chosen": -1.2593806982040405, + "logits/rejected": -1.341259479522705, + "logps/chosen": -59.390708923339844, + "logps/rejected": -57.16114807128906, + "loss": 0.2189, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.01359531283378601, + "rewards/margins": 2.733617067337036, + "rewards/rejected": -2.7472124099731445, + "step": 487 + }, + { + "epoch": 2.891851851851852, + "grad_norm": 22.61643853619251, + "learning_rate": 4.01997016338005e-07, + "logits/chosen": -1.2761664390563965, + "logits/rejected": -1.3337078094482422, + "logps/chosen": -48.623069763183594, + "logps/rejected": -64.20046997070312, + "loss": 0.2249, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5526472926139832, + "rewards/margins": 3.5587265491485596, + "rewards/rejected": -4.1113739013671875, + "step": 488 + }, + { + "epoch": 2.897777777777778, + "grad_norm": 23.269686558492296, + "learning_rate": 4.014807354719342e-07, + "logits/chosen": -1.4048537015914917, + "logits/rejected": -1.4099326133728027, + "logps/chosen": -46.07366180419922, + "logps/rejected": -51.41790771484375, + "loss": 0.2332, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08646069467067719, + "rewards/margins": 3.1128947734832764, + "rewards/rejected": -3.1993556022644043, + "step": 489 + }, + { + "epoch": 2.9037037037037035, + "grad_norm": 22.472991628056597, + "learning_rate": 4.00963431773869e-07, + "logits/chosen": -1.27083420753479, + "logits/rejected": -1.3645908832550049, + "logps/chosen": -41.91563415527344, + "logps/rejected": -57.03092956542969, + "loss": 0.3048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1558661013841629, + "rewards/margins": 2.5972371101379395, + "rewards/rejected": -2.441370964050293, + "step": 490 + }, + { + "epoch": 2.9096296296296296, + "grad_norm": 18.32253442612832, + "learning_rate": 4.0044510873676043e-07, + "logits/chosen": -1.3212807178497314, + "logits/rejected": -1.336035966873169, + "logps/chosen": -58.66569519042969, + "logps/rejected": -66.00804138183594, + "loss": 0.2009, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2158641666173935, + "rewards/margins": 3.0606842041015625, + "rewards/rejected": -3.2765486240386963, + "step": 491 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 16.26649128609091, + "learning_rate": 3.9992576986044223e-07, + "logits/chosen": -1.209001064300537, + "logits/rejected": -1.2842363119125366, + "logps/chosen": -52.691688537597656, + "logps/rejected": -74.82657623291016, + "loss": 0.1554, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.620505690574646, + "rewards/margins": 3.7764885425567627, + "rewards/rejected": -4.396994113922119, + "step": 492 + }, + { + "epoch": 2.9214814814814813, + "grad_norm": 17.610253600287994, + "learning_rate": 3.9940541865160726e-07, + "logits/chosen": -1.2993587255477905, + "logits/rejected": -1.4684174060821533, + "logps/chosen": -50.94914627075195, + "logps/rejected": -60.80756378173828, + "loss": 0.2051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06892414391040802, + "rewards/margins": 2.141951560974121, + "rewards/rejected": -2.2108757495880127, + "step": 493 + }, + { + "epoch": 2.9274074074074075, + "grad_norm": 15.568340315134158, + "learning_rate": 3.9888405862378395e-07, + "logits/chosen": -1.303976058959961, + "logits/rejected": -1.3029195070266724, + "logps/chosen": -59.768001556396484, + "logps/rejected": -63.35868835449219, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16178204119205475, + "rewards/margins": 2.961404323577881, + "rewards/rejected": -3.1231861114501953, + "step": 494 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 21.221679180562266, + "learning_rate": 3.983616932973124e-07, + "logits/chosen": -1.273141622543335, + "logits/rejected": -1.334914207458496, + "logps/chosen": -44.11486053466797, + "logps/rejected": -54.946693420410156, + "loss": 0.2123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39918532967567444, + "rewards/margins": 1.510643482208252, + "rewards/rejected": -1.909828782081604, + "step": 495 + }, + { + "epoch": 2.9392592592592592, + "grad_norm": 18.3522337117436, + "learning_rate": 3.9783832619932076e-07, + "logits/chosen": -1.2553412914276123, + "logits/rejected": -1.291959524154663, + "logps/chosen": -46.14884567260742, + "logps/rejected": -56.9810676574707, + "loss": 0.219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5942877531051636, + "rewards/margins": 2.607400894165039, + "rewards/rejected": -3.201688766479492, + "step": 496 + }, + { + "epoch": 2.9451851851851854, + "grad_norm": 16.373090407512134, + "learning_rate": 3.973139608637015e-07, + "logits/chosen": -1.1282140016555786, + "logits/rejected": -1.0897331237792969, + "logps/chosen": -47.96284484863281, + "logps/rejected": -61.69994354248047, + "loss": 0.1839, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.056502558290958405, + "rewards/margins": 2.3940765857696533, + "rewards/rejected": -2.337573766708374, + "step": 497 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 17.62403657778311, + "learning_rate": 3.9678860083108713e-07, + "logits/chosen": -1.2530139684677124, + "logits/rejected": -1.2804282903671265, + "logps/chosen": -44.19329833984375, + "logps/rejected": -63.33118438720703, + "loss": 0.2327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03126450255513191, + "rewards/margins": 3.057286500930786, + "rewards/rejected": -3.026021957397461, + "step": 498 + }, + { + "epoch": 2.957037037037037, + "grad_norm": 18.150598366324193, + "learning_rate": 3.9626224964882685e-07, + "logits/chosen": -1.35397469997406, + "logits/rejected": -1.3743972778320312, + "logps/chosen": -46.631126403808594, + "logps/rejected": -56.481590270996094, + "loss": 0.1899, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.030939504504203796, + "rewards/margins": 2.582378625869751, + "rewards/rejected": -2.61331844329834, + "step": 499 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 17.40782024999203, + "learning_rate": 3.957349108709623e-07, + "logits/chosen": -1.050638198852539, + "logits/rejected": -1.0757251977920532, + "logps/chosen": -48.303932189941406, + "logps/rejected": -63.3568115234375, + "loss": 0.1925, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.251191109418869, + "rewards/margins": 2.1156184673309326, + "rewards/rejected": -2.366809368133545, + "step": 500 + }, + { + "epoch": 2.968888888888889, + "grad_norm": 12.57466221715435, + "learning_rate": 3.9520658805820335e-07, + "logits/chosen": -1.2836908102035522, + "logits/rejected": -1.3424688577651978, + "logps/chosen": -56.20665740966797, + "logps/rejected": -69.99337005615234, + "loss": 0.1532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.191994309425354, + "rewards/margins": 4.468802452087402, + "rewards/rejected": -4.660797119140625, + "step": 501 + }, + { + "epoch": 2.974814814814815, + "grad_norm": 17.441545006331303, + "learning_rate": 3.946772847779045e-07, + "logits/chosen": -1.265354037284851, + "logits/rejected": -1.2552366256713867, + "logps/chosen": -44.5283088684082, + "logps/rejected": -45.96354675292969, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26437434554100037, + "rewards/margins": 2.901440382003784, + "rewards/rejected": -2.637066125869751, + "step": 502 + }, + { + "epoch": 2.9807407407407407, + "grad_norm": 18.271861903291637, + "learning_rate": 3.941470046040406e-07, + "logits/chosen": -1.3075486421585083, + "logits/rejected": -1.3464792966842651, + "logps/chosen": -53.6762580871582, + "logps/rejected": -57.431610107421875, + "loss": 0.1841, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07351833581924438, + "rewards/margins": 2.5279555320739746, + "rewards/rejected": -2.454437017440796, + "step": 503 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 19.426242018024382, + "learning_rate": 3.936157511171826e-07, + "logits/chosen": -1.1142933368682861, + "logits/rejected": -1.1934661865234375, + "logps/chosen": -40.239479064941406, + "logps/rejected": -63.111671447753906, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24999532103538513, + "rewards/margins": 3.7723803520202637, + "rewards/rejected": -4.022375583648682, + "step": 504 + }, + { + "epoch": 2.9925925925925925, + "grad_norm": 23.27473360631721, + "learning_rate": 3.9308352790447354e-07, + "logits/chosen": -1.2489938735961914, + "logits/rejected": -1.3289551734924316, + "logps/chosen": -44.65389633178711, + "logps/rejected": -60.861324310302734, + "loss": 0.2433, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0023244433104991913, + "rewards/margins": 2.8981451988220215, + "rewards/rejected": -2.9004695415496826, + "step": 505 + }, + { + "epoch": 2.9985185185185186, + "grad_norm": 16.78764710351435, + "learning_rate": 3.9255033855960414e-07, + "logits/chosen": -1.3627394437789917, + "logits/rejected": -1.4329453706741333, + "logps/chosen": -35.57086181640625, + "logps/rejected": -71.59107971191406, + "loss": 0.222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22746825218200684, + "rewards/margins": 2.67952299118042, + "rewards/rejected": -2.906991481781006, + "step": 506 + }, + { + "epoch": 3.0044444444444443, + "grad_norm": 15.51252784783785, + "learning_rate": 3.920161866827889e-07, + "logits/chosen": -1.2271157503128052, + "logits/rejected": -1.2314724922180176, + "logps/chosen": -42.860347747802734, + "logps/rejected": -63.78620910644531, + "loss": 0.1722, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.39318573474884033, + "rewards/margins": 3.201770067214966, + "rewards/rejected": -3.5949559211730957, + "step": 507 + }, + { + "epoch": 3.0103703703703704, + "grad_norm": 15.348034664152413, + "learning_rate": 3.914810758807414e-07, + "logits/chosen": -1.266876459121704, + "logits/rejected": -1.201805591583252, + "logps/chosen": -39.3177375793457, + "logps/rejected": -63.72394561767578, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21385174989700317, + "rewards/margins": 2.7866299152374268, + "rewards/rejected": -2.5727782249450684, + "step": 508 + }, + { + "epoch": 3.0162962962962965, + "grad_norm": 13.694371117456312, + "learning_rate": 3.9094500976665025e-07, + "logits/chosen": -1.4511996507644653, + "logits/rejected": -1.5082242488861084, + "logps/chosen": -45.20631408691406, + "logps/rejected": -60.80164337158203, + "loss": 0.1663, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0012191124260425568, + "rewards/margins": 2.3622097969055176, + "rewards/rejected": -2.360990524291992, + "step": 509 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 9.608073686288138, + "learning_rate": 3.904079919601542e-07, + "logits/chosen": -1.1920677423477173, + "logits/rejected": -1.1335797309875488, + "logps/chosen": -48.81990432739258, + "logps/rejected": -70.8568115234375, + "loss": 0.1109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3464869260787964, + "rewards/margins": 4.490400791168213, + "rewards/rejected": -4.836887359619141, + "step": 510 + }, + { + "epoch": 3.0281481481481483, + "grad_norm": 10.659649188493658, + "learning_rate": 3.898700260873182e-07, + "logits/chosen": -1.3865935802459717, + "logits/rejected": -1.4555929899215698, + "logps/chosen": -46.135467529296875, + "logps/rejected": -54.744056701660156, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39444661140441895, + "rewards/margins": 2.6518468856811523, + "rewards/rejected": -2.2574002742767334, + "step": 511 + }, + { + "epoch": 3.034074074074074, + "grad_norm": 14.922676910008562, + "learning_rate": 3.893311157806091e-07, + "logits/chosen": -1.30184805393219, + "logits/rejected": -1.3149746656417847, + "logps/chosen": -54.556312561035156, + "logps/rejected": -60.76716995239258, + "loss": 0.1527, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7763004302978516, + "rewards/margins": 2.673186779022217, + "rewards/rejected": -3.4494872093200684, + "step": 512 + }, + { + "epoch": 3.04, + "grad_norm": 13.25601338827941, + "learning_rate": 3.887912646788703e-07, + "logits/chosen": -1.3052939176559448, + "logits/rejected": -1.2798081636428833, + "logps/chosen": -46.84373474121094, + "logps/rejected": -72.25546264648438, + "loss": 0.152, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09142529964447021, + "rewards/margins": 3.686022996902466, + "rewards/rejected": -3.7774481773376465, + "step": 513 + }, + { + "epoch": 3.0459259259259257, + "grad_norm": 11.896910317905705, + "learning_rate": 3.882504764272979e-07, + "logits/chosen": -1.1876428127288818, + "logits/rejected": -1.219206690788269, + "logps/chosen": -48.874366760253906, + "logps/rejected": -79.81904602050781, + "loss": 0.1492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36831003427505493, + "rewards/margins": 2.9127964973449707, + "rewards/rejected": -3.281106472015381, + "step": 514 + }, + { + "epoch": 3.051851851851852, + "grad_norm": 11.704556833201291, + "learning_rate": 3.8770875467741577e-07, + "logits/chosen": -1.3152544498443604, + "logits/rejected": -1.3558157682418823, + "logps/chosen": -47.62205505371094, + "logps/rejected": -78.95779418945312, + "loss": 0.1424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07077004760503769, + "rewards/margins": 3.3393805027008057, + "rewards/rejected": -3.2686104774475098, + "step": 515 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 11.04510617201905, + "learning_rate": 3.871661030870511e-07, + "logits/chosen": -1.2623419761657715, + "logits/rejected": -1.2585841417312622, + "logps/chosen": -53.519046783447266, + "logps/rejected": -77.24893188476562, + "loss": 0.1223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017071455717086792, + "rewards/margins": 4.162660598754883, + "rewards/rejected": -4.179732322692871, + "step": 516 + }, + { + "epoch": 3.0637037037037036, + "grad_norm": 13.147383461961375, + "learning_rate": 3.866225253203093e-07, + "logits/chosen": -1.2138491868972778, + "logits/rejected": -1.2417991161346436, + "logps/chosen": -52.79689025878906, + "logps/rejected": -65.50883483886719, + "loss": 0.1383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27549540996551514, + "rewards/margins": 3.84724497795105, + "rewards/rejected": -4.122740745544434, + "step": 517 + }, + { + "epoch": 3.0696296296296297, + "grad_norm": 12.707877291174364, + "learning_rate": 3.8607802504754984e-07, + "logits/chosen": -1.1168802976608276, + "logits/rejected": -1.300939679145813, + "logps/chosen": -55.39178466796875, + "logps/rejected": -69.42161560058594, + "loss": 0.1352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18456672132015228, + "rewards/margins": 3.3074283599853516, + "rewards/rejected": -3.491994857788086, + "step": 518 + }, + { + "epoch": 3.0755555555555554, + "grad_norm": 12.295407374509045, + "learning_rate": 3.85532605945361e-07, + "logits/chosen": -1.1297457218170166, + "logits/rejected": -1.1549437046051025, + "logps/chosen": -58.893798828125, + "logps/rejected": -63.923583984375, + "loss": 0.1443, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11424604058265686, + "rewards/margins": 2.7826895713806152, + "rewards/rejected": -2.668443441390991, + "step": 519 + }, + { + "epoch": 3.0814814814814815, + "grad_norm": 14.285195292921095, + "learning_rate": 3.849862716965352e-07, + "logits/chosen": -1.2393442392349243, + "logits/rejected": -1.2823017835617065, + "logps/chosen": -54.41102600097656, + "logps/rejected": -80.21492004394531, + "loss": 0.134, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5927256345748901, + "rewards/margins": 4.974529266357422, + "rewards/rejected": -5.567255020141602, + "step": 520 + }, + { + "epoch": 3.0874074074074076, + "grad_norm": 15.778972893153691, + "learning_rate": 3.8443902599004406e-07, + "logits/chosen": -1.4329743385314941, + "logits/rejected": -1.4719727039337158, + "logps/chosen": -40.67536163330078, + "logps/rejected": -61.614952087402344, + "loss": 0.1758, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.005242794752120972, + "rewards/margins": 2.597073554992676, + "rewards/rejected": -2.5918307304382324, + "step": 521 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 11.518639728252891, + "learning_rate": 3.8389087252101395e-07, + "logits/chosen": -1.2456588745117188, + "logits/rejected": -1.2938756942749023, + "logps/chosen": -44.22534942626953, + "logps/rejected": -56.13856506347656, + "loss": 0.1464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9335315227508545, + "rewards/margins": 2.4933667182922363, + "rewards/rejected": -3.426898241043091, + "step": 522 + }, + { + "epoch": 3.0992592592592594, + "grad_norm": 14.585493614282901, + "learning_rate": 3.833418149907001e-07, + "logits/chosen": -1.246716022491455, + "logits/rejected": -1.2304835319519043, + "logps/chosen": -63.897918701171875, + "logps/rejected": -69.18603515625, + "loss": 0.1847, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5471602082252502, + "rewards/margins": 3.732401132583618, + "rewards/rejected": -4.2795610427856445, + "step": 523 + }, + { + "epoch": 3.105185185185185, + "grad_norm": 12.956893274097098, + "learning_rate": 3.827918571064626e-07, + "logits/chosen": -1.2984254360198975, + "logits/rejected": -1.4002485275268555, + "logps/chosen": -48.713111877441406, + "logps/rejected": -60.62818145751953, + "loss": 0.14, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.053312137722969055, + "rewards/margins": 2.424069881439209, + "rewards/rejected": -2.370757818222046, + "step": 524 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 13.250412997131157, + "learning_rate": 3.822410025817406e-07, + "logits/chosen": -1.3015064001083374, + "logits/rejected": -1.3865606784820557, + "logps/chosen": -45.77219009399414, + "logps/rejected": -60.43882751464844, + "loss": 0.1451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16919779777526855, + "rewards/margins": 2.377150297164917, + "rewards/rejected": -2.5463480949401855, + "step": 525 + }, + { + "epoch": 3.117037037037037, + "grad_norm": 12.410375375745177, + "learning_rate": 3.816892551360279e-07, + "logits/chosen": -1.3042594194412231, + "logits/rejected": -1.4607019424438477, + "logps/chosen": -54.673851013183594, + "logps/rejected": -97.9752426147461, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16998320817947388, + "rewards/margins": 5.217333793640137, + "rewards/rejected": -5.387316703796387, + "step": 526 + }, + { + "epoch": 3.122962962962963, + "grad_norm": 13.928112395373121, + "learning_rate": 3.8113661849484723e-07, + "logits/chosen": -1.2873992919921875, + "logits/rejected": -1.3018040657043457, + "logps/chosen": -47.664215087890625, + "logps/rejected": -62.4625129699707, + "loss": 0.1334, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14091503620147705, + "rewards/margins": 2.199439764022827, + "rewards/rejected": -2.3403549194335938, + "step": 527 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 12.75367411003303, + "learning_rate": 3.805830963897256e-07, + "logits/chosen": -1.0977472066879272, + "logits/rejected": -1.1910130977630615, + "logps/chosen": -49.190704345703125, + "logps/rejected": -104.37145233154297, + "loss": 0.1236, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3834270238876343, + "rewards/margins": 5.602377414703369, + "rewards/rejected": -5.985805034637451, + "step": 528 + }, + { + "epoch": 3.1348148148148147, + "grad_norm": 14.210812786624423, + "learning_rate": 3.8002869255816873e-07, + "logits/chosen": -1.2921619415283203, + "logits/rejected": -1.3322010040283203, + "logps/chosen": -62.789669036865234, + "logps/rejected": -69.41814422607422, + "loss": 0.1635, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5743184685707092, + "rewards/margins": 3.2138500213623047, + "rewards/rejected": -3.788168430328369, + "step": 529 + }, + { + "epoch": 3.140740740740741, + "grad_norm": 10.800120698981736, + "learning_rate": 3.7947341074363593e-07, + "logits/chosen": -1.2333744764328003, + "logits/rejected": -1.1910183429718018, + "logps/chosen": -51.27862548828125, + "logps/rejected": -69.56024169921875, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18978382647037506, + "rewards/margins": 3.8493576049804688, + "rewards/rejected": -4.0391411781311035, + "step": 530 + }, + { + "epoch": 3.1466666666666665, + "grad_norm": 12.837286803004636, + "learning_rate": 3.7891725469551485e-07, + "logits/chosen": -1.2333027124404907, + "logits/rejected": -1.3354871273040771, + "logps/chosen": -39.167945861816406, + "logps/rejected": -56.154727935791016, + "loss": 0.1721, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12276704609394073, + "rewards/margins": 4.041677474975586, + "rewards/rejected": -4.164444446563721, + "step": 531 + }, + { + "epoch": 3.1525925925925926, + "grad_norm": 9.733353466294954, + "learning_rate": 3.783602281690963e-07, + "logits/chosen": -1.1893879175186157, + "logits/rejected": -1.185511589050293, + "logps/chosen": -41.93305969238281, + "logps/rejected": -70.7315444946289, + "loss": 0.1172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0085669606924057, + "rewards/margins": 3.666670799255371, + "rewards/rejected": -3.6581034660339355, + "step": 532 + }, + { + "epoch": 3.1585185185185187, + "grad_norm": 11.565308138418079, + "learning_rate": 3.7780233492554856e-07, + "logits/chosen": -1.1588068008422852, + "logits/rejected": -1.2201921939849854, + "logps/chosen": -45.38616180419922, + "logps/rejected": -61.572021484375, + "loss": 0.1239, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13040557503700256, + "rewards/margins": 4.217648029327393, + "rewards/rejected": -4.087242126464844, + "step": 533 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 12.889495915162684, + "learning_rate": 3.7724357873189244e-07, + "logits/chosen": -1.2265737056732178, + "logits/rejected": -1.232682466506958, + "logps/chosen": -46.15911102294922, + "logps/rejected": -54.5587043762207, + "loss": 0.1355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24472272396087646, + "rewards/margins": 3.2570977210998535, + "rewards/rejected": -3.5018205642700195, + "step": 534 + }, + { + "epoch": 3.1703703703703705, + "grad_norm": 12.45954730910451, + "learning_rate": 3.766839633609753e-07, + "logits/chosen": -1.2919063568115234, + "logits/rejected": -1.3215129375457764, + "logps/chosen": -50.997650146484375, + "logps/rejected": -54.30298614501953, + "loss": 0.1395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3067464828491211, + "rewards/margins": 3.446145534515381, + "rewards/rejected": -3.1393988132476807, + "step": 535 + }, + { + "epoch": 3.176296296296296, + "grad_norm": 11.473128142055426, + "learning_rate": 3.761234925914459e-07, + "logits/chosen": -1.332575798034668, + "logits/rejected": -1.3540210723876953, + "logps/chosen": -49.996734619140625, + "logps/rejected": -66.5724105834961, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3274403214454651, + "rewards/margins": 3.3128936290740967, + "rewards/rejected": -2.9854533672332764, + "step": 536 + }, + { + "epoch": 3.1822222222222223, + "grad_norm": 11.657192523402523, + "learning_rate": 3.755621702077292e-07, + "logits/chosen": -1.2775133848190308, + "logits/rejected": -1.3381080627441406, + "logps/chosen": -49.6759033203125, + "logps/rejected": -67.08674621582031, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.738972544670105, + "rewards/margins": 4.289883613586426, + "rewards/rejected": -5.02885627746582, + "step": 537 + }, + { + "epoch": 3.188148148148148, + "grad_norm": 11.299765875660642, + "learning_rate": 3.75e-07, + "logits/chosen": -1.3044968843460083, + "logits/rejected": -1.3229069709777832, + "logps/chosen": -44.54370880126953, + "logps/rejected": -59.271095275878906, + "loss": 0.1366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5054673552513123, + "rewards/margins": 3.5005180835723877, + "rewards/rejected": -4.005985260009766, + "step": 538 + }, + { + "epoch": 3.194074074074074, + "grad_norm": 14.278579624361138, + "learning_rate": 3.7443698576415795e-07, + "logits/chosen": -1.2247653007507324, + "logits/rejected": -1.2305885553359985, + "logps/chosen": -67.3138656616211, + "logps/rejected": -70.22055053710938, + "loss": 0.1701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1986965537071228, + "rewards/margins": 3.648164749145508, + "rewards/rejected": -3.4494681358337402, + "step": 539 + }, + { + "epoch": 3.2, + "grad_norm": 11.732507351761482, + "learning_rate": 3.738731313018019e-07, + "logits/chosen": -1.119322657585144, + "logits/rejected": -1.2076921463012695, + "logps/chosen": -50.44218063354492, + "logps/rejected": -62.570045471191406, + "loss": 0.1377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37010127305984497, + "rewards/margins": 3.5672857761383057, + "rewards/rejected": -3.1971845626831055, + "step": 540 + }, + { + "epoch": 3.205925925925926, + "grad_norm": 15.017170781434197, + "learning_rate": 3.7330844042020384e-07, + "logits/chosen": -1.229867696762085, + "logits/rejected": -1.2855830192565918, + "logps/chosen": -48.04343795776367, + "logps/rejected": -67.48896026611328, + "loss": 0.1687, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3234538435935974, + "rewards/margins": 2.7441210746765137, + "rewards/rejected": -2.4206674098968506, + "step": 541 + }, + { + "epoch": 3.211851851851852, + "grad_norm": 15.007372624369156, + "learning_rate": 3.727429169322837e-07, + "logits/chosen": -1.2071201801300049, + "logits/rejected": -1.2570191621780396, + "logps/chosen": -39.118656158447266, + "logps/rejected": -54.70764923095703, + "loss": 0.1644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36125683784484863, + "rewards/margins": 2.9558732509613037, + "rewards/rejected": -3.3171298503875732, + "step": 542 + }, + { + "epoch": 3.2177777777777776, + "grad_norm": 11.308523940892284, + "learning_rate": 3.721765646565833e-07, + "logits/chosen": -1.3717918395996094, + "logits/rejected": -1.4021085500717163, + "logps/chosen": -47.05071258544922, + "logps/rejected": -76.36489868164062, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06609348952770233, + "rewards/margins": 3.5705413818359375, + "rewards/rejected": -3.636634588241577, + "step": 543 + }, + { + "epoch": 3.2237037037037037, + "grad_norm": 11.281320443508063, + "learning_rate": 3.7160938741724057e-07, + "logits/chosen": -1.2979824542999268, + "logits/rejected": -1.3822671175003052, + "logps/chosen": -49.72755432128906, + "logps/rejected": -57.613616943359375, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24122706055641174, + "rewards/margins": 3.2609100341796875, + "rewards/rejected": -3.5021369457244873, + "step": 544 + }, + { + "epoch": 3.2296296296296294, + "grad_norm": 12.113133814789371, + "learning_rate": 3.7104138904396374e-07, + "logits/chosen": -1.153433918952942, + "logits/rejected": -1.2310869693756104, + "logps/chosen": -55.73785400390625, + "logps/rejected": -67.78173065185547, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09717914462089539, + "rewards/margins": 4.385678768157959, + "rewards/rejected": -4.288499355316162, + "step": 545 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 11.882489459481814, + "learning_rate": 3.704725733720055e-07, + "logits/chosen": -1.076989769935608, + "logits/rejected": -1.236989140510559, + "logps/chosen": -54.22359848022461, + "logps/rejected": -83.63075256347656, + "loss": 0.1262, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09978058934211731, + "rewards/margins": 3.196918249130249, + "rewards/rejected": -3.097137689590454, + "step": 546 + }, + { + "epoch": 3.2414814814814816, + "grad_norm": 13.358445845497405, + "learning_rate": 3.699029442421374e-07, + "logits/chosen": -1.0463993549346924, + "logits/rejected": -1.1902413368225098, + "logps/chosen": -48.361671447753906, + "logps/rejected": -65.42289733886719, + "loss": 0.1674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02705952525138855, + "rewards/margins": 3.5172808170318604, + "rewards/rejected": -3.4902215003967285, + "step": 547 + }, + { + "epoch": 3.2474074074074073, + "grad_norm": 15.253492858367455, + "learning_rate": 3.693325055006232e-07, + "logits/chosen": -1.225471019744873, + "logits/rejected": -1.3131675720214844, + "logps/chosen": -39.747589111328125, + "logps/rejected": -58.30244064331055, + "loss": 0.1664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2591007649898529, + "rewards/margins": 2.7843737602233887, + "rewards/rejected": -3.0434746742248535, + "step": 548 + }, + { + "epoch": 3.2533333333333334, + "grad_norm": 10.441244482245853, + "learning_rate": 3.6876126099919373e-07, + "logits/chosen": -1.1871273517608643, + "logits/rejected": -1.192123293876648, + "logps/chosen": -40.08260726928711, + "logps/rejected": -58.24555969238281, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.014481008052825928, + "rewards/margins": 3.1415109634399414, + "rewards/rejected": -3.155992031097412, + "step": 549 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 13.406182971514847, + "learning_rate": 3.681892145950203e-07, + "logits/chosen": -1.1578247547149658, + "logits/rejected": -1.1346076726913452, + "logps/chosen": -46.344505310058594, + "logps/rejected": -69.93978881835938, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10121719539165497, + "rewards/margins": 3.64451003074646, + "rewards/rejected": -3.745727062225342, + "step": 550 + }, + { + "epoch": 3.265185185185185, + "grad_norm": 8.97913278009079, + "learning_rate": 3.6761637015068893e-07, + "logits/chosen": -1.1909668445587158, + "logits/rejected": -1.2579350471496582, + "logps/chosen": -55.003353118896484, + "logps/rejected": -74.0880126953125, + "loss": 0.0896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34160882234573364, + "rewards/margins": 4.481330871582031, + "rewards/rejected": -4.822939872741699, + "step": 551 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 14.211347383837909, + "learning_rate": 3.67042731534174e-07, + "logits/chosen": -1.1668155193328857, + "logits/rejected": -1.2287236452102661, + "logps/chosen": -44.797523498535156, + "logps/rejected": -67.50494384765625, + "loss": 0.1632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12582820653915405, + "rewards/margins": 3.9675216674804688, + "rewards/rejected": -3.841693639755249, + "step": 552 + }, + { + "epoch": 3.277037037037037, + "grad_norm": 12.272261458182058, + "learning_rate": 3.6646830261881263e-07, + "logits/chosen": -1.38411545753479, + "logits/rejected": -1.3050222396850586, + "logps/chosen": -63.80104446411133, + "logps/rejected": -86.18976593017578, + "loss": 0.1362, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24508556723594666, + "rewards/margins": 4.132302761077881, + "rewards/rejected": -3.8872170448303223, + "step": 553 + }, + { + "epoch": 3.282962962962963, + "grad_norm": 12.169816292887981, + "learning_rate": 3.6589308728327797e-07, + "logits/chosen": -1.3022029399871826, + "logits/rejected": -1.2084033489227295, + "logps/chosen": -57.94591522216797, + "logps/rejected": -66.00154113769531, + "loss": 0.1431, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.10055610537528992, + "rewards/margins": 3.479001045227051, + "rewards/rejected": -3.579557418823242, + "step": 554 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 15.04773541364506, + "learning_rate": 3.653170894115533e-07, + "logits/chosen": -1.208168625831604, + "logits/rejected": -1.172544002532959, + "logps/chosen": -45.36444854736328, + "logps/rejected": -58.140079498291016, + "loss": 0.1705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28003713488578796, + "rewards/margins": 3.3673346042633057, + "rewards/rejected": -3.0872974395751953, + "step": 555 + }, + { + "epoch": 3.294814814814815, + "grad_norm": 12.022675358094785, + "learning_rate": 3.6474031289290586e-07, + "logits/chosen": -1.2167534828186035, + "logits/rejected": -1.2709438800811768, + "logps/chosen": -38.76374053955078, + "logps/rejected": -63.62835693359375, + "loss": 0.1215, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2464836686849594, + "rewards/margins": 3.099431276321411, + "rewards/rejected": -2.852947473526001, + "step": 556 + }, + { + "epoch": 3.300740740740741, + "grad_norm": 15.48226246819183, + "learning_rate": 3.641627616218603e-07, + "logits/chosen": -1.17167329788208, + "logits/rejected": -1.1303136348724365, + "logps/chosen": -51.64458465576172, + "logps/rejected": -52.03526306152344, + "loss": 0.1601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07310710102319717, + "rewards/margins": 2.5615978240966797, + "rewards/rejected": -2.634705066680908, + "step": 557 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 12.763142895021037, + "learning_rate": 3.6358443949817283e-07, + "logits/chosen": -1.2829394340515137, + "logits/rejected": -1.3682385683059692, + "logps/chosen": -68.91250610351562, + "logps/rejected": -64.79576110839844, + "loss": 0.1504, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4208144545555115, + "rewards/margins": 2.884341239929199, + "rewards/rejected": -3.3051557540893555, + "step": 558 + }, + { + "epoch": 3.3125925925925928, + "grad_norm": 10.699144281049568, + "learning_rate": 3.630053504268046e-07, + "logits/chosen": -1.2629507780075073, + "logits/rejected": -1.2774531841278076, + "logps/chosen": -56.757015228271484, + "logps/rejected": -52.96363067626953, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004754260182380676, + "rewards/margins": 2.7854652404785156, + "rewards/rejected": -2.7807109355926514, + "step": 559 + }, + { + "epoch": 3.3185185185185184, + "grad_norm": 12.32948309194096, + "learning_rate": 3.62425498317895e-07, + "logits/chosen": -1.3774765729904175, + "logits/rejected": -1.551738977432251, + "logps/chosen": -46.59605026245117, + "logps/rejected": -66.0199203491211, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2962900400161743, + "rewards/margins": 3.7435803413391113, + "rewards/rejected": -3.4472906589508057, + "step": 560 + }, + { + "epoch": 3.3244444444444445, + "grad_norm": 10.068740616602673, + "learning_rate": 3.6184488708673597e-07, + "logits/chosen": -1.3757305145263672, + "logits/rejected": -1.4594712257385254, + "logps/chosen": -47.44297409057617, + "logps/rejected": -66.62445068359375, + "loss": 0.1161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8080825805664062, + "rewards/margins": 2.998539686203003, + "rewards/rejected": -3.8066225051879883, + "step": 561 + }, + { + "epoch": 3.33037037037037, + "grad_norm": 11.189446984327448, + "learning_rate": 3.6126352065374517e-07, + "logits/chosen": -1.2296010255813599, + "logits/rejected": -1.3213616609573364, + "logps/chosen": -52.5794792175293, + "logps/rejected": -68.11521911621094, + "loss": 0.1061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08329074084758759, + "rewards/margins": 3.181485652923584, + "rewards/rejected": -3.2647767066955566, + "step": 562 + }, + { + "epoch": 3.3362962962962963, + "grad_norm": 10.767209581812672, + "learning_rate": 3.6068140294443943e-07, + "logits/chosen": -1.299060583114624, + "logits/rejected": -1.3229368925094604, + "logps/chosen": -45.46562194824219, + "logps/rejected": -58.185638427734375, + "loss": 0.1147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24200953543186188, + "rewards/margins": 2.517746925354004, + "rewards/rejected": -2.759756565093994, + "step": 563 + }, + { + "epoch": 3.3422222222222224, + "grad_norm": 13.067270633778744, + "learning_rate": 3.6009853788940856e-07, + "logits/chosen": -1.3421900272369385, + "logits/rejected": -1.3259530067443848, + "logps/chosen": -48.29127502441406, + "logps/rejected": -53.28279113769531, + "loss": 0.1491, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6294746398925781, + "rewards/margins": 2.588189125061035, + "rewards/rejected": -3.2176637649536133, + "step": 564 + }, + { + "epoch": 3.348148148148148, + "grad_norm": 15.895189856288384, + "learning_rate": 3.595149294242884e-07, + "logits/chosen": -1.0856770277023315, + "logits/rejected": -1.1430692672729492, + "logps/chosen": -46.576560974121094, + "logps/rejected": -59.8853759765625, + "loss": 0.1596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09673969447612762, + "rewards/margins": 3.5400326251983643, + "rewards/rejected": -3.4432926177978516, + "step": 565 + }, + { + "epoch": 3.354074074074074, + "grad_norm": 11.329858917681372, + "learning_rate": 3.589305814897346e-07, + "logits/chosen": -1.1583616733551025, + "logits/rejected": -1.3731303215026855, + "logps/chosen": -48.65069580078125, + "logps/rejected": -69.706787109375, + "loss": 0.1231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2624179422855377, + "rewards/margins": 4.3107147216796875, + "rewards/rejected": -4.5731329917907715, + "step": 566 + }, + { + "epoch": 3.36, + "grad_norm": 10.945781896079705, + "learning_rate": 3.5834549803139586e-07, + "logits/chosen": -1.179014801979065, + "logits/rejected": -1.225006103515625, + "logps/chosen": -47.45325469970703, + "logps/rejected": -54.36988067626953, + "loss": 0.1138, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8625340461730957, + "rewards/margins": 3.055980682373047, + "rewards/rejected": -3.9185147285461426, + "step": 567 + }, + { + "epoch": 3.365925925925926, + "grad_norm": 13.426424155571501, + "learning_rate": 3.5775968299988725e-07, + "logits/chosen": -1.2716865539550781, + "logits/rejected": -1.3909945487976074, + "logps/chosen": -47.59417724609375, + "logps/rejected": -87.17699432373047, + "loss": 0.1201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4128919243812561, + "rewards/margins": 4.454270362854004, + "rewards/rejected": -4.867162227630615, + "step": 568 + }, + { + "epoch": 3.3718518518518517, + "grad_norm": 12.610504058042062, + "learning_rate": 3.571731403507635e-07, + "logits/chosen": -1.1148130893707275, + "logits/rejected": -1.1845794916152954, + "logps/chosen": -41.87155532836914, + "logps/rejected": -59.9697380065918, + "loss": 0.1297, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17576763033866882, + "rewards/margins": 3.7137932777404785, + "rewards/rejected": -3.5380258560180664, + "step": 569 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 14.308781328176961, + "learning_rate": 3.565858740444927e-07, + "logits/chosen": -1.2425477504730225, + "logits/rejected": -1.251265525817871, + "logps/chosen": -37.937618255615234, + "logps/rejected": -47.25873565673828, + "loss": 0.1729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5201172828674316, + "rewards/margins": 1.7741285562515259, + "rewards/rejected": -2.294245719909668, + "step": 570 + }, + { + "epoch": 3.383703703703704, + "grad_norm": 9.85987353052267, + "learning_rate": 3.559978880464289e-07, + "logits/chosen": -1.3315365314483643, + "logits/rejected": -1.2916427850723267, + "logps/chosen": -52.93934631347656, + "logps/rejected": -55.17170333862305, + "loss": 0.1122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1882355660200119, + "rewards/margins": 3.6535511016845703, + "rewards/rejected": -3.8417863845825195, + "step": 571 + }, + { + "epoch": 3.3896296296296295, + "grad_norm": 13.676050112137077, + "learning_rate": 3.5540918632678583e-07, + "logits/chosen": -1.327182412147522, + "logits/rejected": -1.3504629135131836, + "logps/chosen": -55.77523422241211, + "logps/rejected": -66.50366973876953, + "loss": 0.1434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11458379030227661, + "rewards/margins": 3.909151554107666, + "rewards/rejected": -4.023735523223877, + "step": 572 + }, + { + "epoch": 3.3955555555555557, + "grad_norm": 9.016347803722182, + "learning_rate": 3.5481977286060995e-07, + "logits/chosen": -1.2306737899780273, + "logits/rejected": -1.2826849222183228, + "logps/chosen": -52.68648910522461, + "logps/rejected": -74.59066772460938, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9488479495048523, + "rewards/margins": 4.810406684875488, + "rewards/rejected": -5.7592549324035645, + "step": 573 + }, + { + "epoch": 3.4014814814814813, + "grad_norm": 16.730180888421796, + "learning_rate": 3.542296516277535e-07, + "logits/chosen": -1.16511869430542, + "logits/rejected": -1.2019627094268799, + "logps/chosen": -56.49937438964844, + "logps/rejected": -63.08637619018555, + "loss": 0.1583, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2525779604911804, + "rewards/margins": 2.9084134101867676, + "rewards/rejected": -3.1609911918640137, + "step": 574 + }, + { + "epoch": 3.4074074074074074, + "grad_norm": 12.148498309953794, + "learning_rate": 3.5363882661284767e-07, + "logits/chosen": -1.2179135084152222, + "logits/rejected": -1.2052103281021118, + "logps/chosen": -43.90802764892578, + "logps/rejected": -49.14393997192383, + "loss": 0.128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3902662992477417, + "rewards/margins": 2.4715981483459473, + "rewards/rejected": -2.8618645668029785, + "step": 575 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 12.367099531026179, + "learning_rate": 3.53047301805276e-07, + "logits/chosen": -1.27561354637146, + "logits/rejected": -1.3109219074249268, + "logps/chosen": -55.32654571533203, + "logps/rejected": -61.15959167480469, + "loss": 0.1151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42814308404922485, + "rewards/margins": 3.9681081771850586, + "rewards/rejected": -3.5399651527404785, + "step": 576 + }, + { + "epoch": 3.419259259259259, + "grad_norm": 10.977571285480575, + "learning_rate": 3.5245508119914683e-07, + "logits/chosen": -1.0619442462921143, + "logits/rejected": -1.0826367139816284, + "logps/chosen": -50.80113983154297, + "logps/rejected": -60.27879333496094, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31538814306259155, + "rewards/margins": 4.127216339111328, + "rewards/rejected": -3.811828136444092, + "step": 577 + }, + { + "epoch": 3.4251851851851853, + "grad_norm": 15.088444665057558, + "learning_rate": 3.518621687932671e-07, + "logits/chosen": -1.2352567911148071, + "logits/rejected": -1.2788387537002563, + "logps/chosen": -50.68989562988281, + "logps/rejected": -69.35728454589844, + "loss": 0.158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3294627368450165, + "rewards/margins": 3.7237389087677, + "rewards/rejected": -4.053201675415039, + "step": 578 + }, + { + "epoch": 3.431111111111111, + "grad_norm": 12.344161330524548, + "learning_rate": 3.5126856859111464e-07, + "logits/chosen": -1.357169270515442, + "logits/rejected": -1.3112940788269043, + "logps/chosen": -53.08173370361328, + "logps/rejected": -67.811279296875, + "loss": 0.1136, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38673120737075806, + "rewards/margins": 3.6063897609710693, + "rewards/rejected": -3.219658613204956, + "step": 579 + }, + { + "epoch": 3.437037037037037, + "grad_norm": 13.39088643319056, + "learning_rate": 3.5067428460081157e-07, + "logits/chosen": -1.0850348472595215, + "logits/rejected": -1.1661014556884766, + "logps/chosen": -39.36518859863281, + "logps/rejected": -50.459564208984375, + "loss": 0.1553, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7768265604972839, + "rewards/margins": 3.601526975631714, + "rewards/rejected": -2.824700355529785, + "step": 580 + }, + { + "epoch": 3.442962962962963, + "grad_norm": 11.46739020103962, + "learning_rate": 3.5007932083509687e-07, + "logits/chosen": -1.295800805091858, + "logits/rejected": -1.3909955024719238, + "logps/chosen": -52.019142150878906, + "logps/rejected": -76.2337875366211, + "loss": 0.1357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06341786682605743, + "rewards/margins": 3.9215664863586426, + "rewards/rejected": -3.9849839210510254, + "step": 581 + }, + { + "epoch": 3.448888888888889, + "grad_norm": 9.915174802293706, + "learning_rate": 3.494836813112998e-07, + "logits/chosen": -1.1923561096191406, + "logits/rejected": -1.2022353410720825, + "logps/chosen": -47.116416931152344, + "logps/rejected": -54.7679443359375, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3462223708629608, + "rewards/margins": 2.759658098220825, + "rewards/rejected": -3.1058802604675293, + "step": 582 + }, + { + "epoch": 3.454814814814815, + "grad_norm": 10.129137706661414, + "learning_rate": 3.488873700513124e-07, + "logits/chosen": -1.1396245956420898, + "logits/rejected": -1.1778151988983154, + "logps/chosen": -46.93259811401367, + "logps/rejected": -72.2052001953125, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08462900668382645, + "rewards/margins": 4.626044273376465, + "rewards/rejected": -4.710673809051514, + "step": 583 + }, + { + "epoch": 3.4607407407407407, + "grad_norm": 12.71575358091169, + "learning_rate": 3.482903910815625e-07, + "logits/chosen": -1.2865262031555176, + "logits/rejected": -1.4614137411117554, + "logps/chosen": -46.11248016357422, + "logps/rejected": -76.97091674804688, + "loss": 0.1249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3348883390426636, + "rewards/margins": 3.5430636405944824, + "rewards/rejected": -3.8779520988464355, + "step": 584 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 12.299766608320654, + "learning_rate": 3.476927484329862e-07, + "logits/chosen": -1.232077717781067, + "logits/rejected": -1.4356149435043335, + "logps/chosen": -51.55973434448242, + "logps/rejected": -55.99464416503906, + "loss": 0.1568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1431867480278015, + "rewards/margins": 2.7733068466186523, + "rewards/rejected": -2.9164934158325195, + "step": 585 + }, + { + "epoch": 3.4725925925925925, + "grad_norm": 10.95922060452019, + "learning_rate": 3.4709444614100113e-07, + "logits/chosen": -1.0688925981521606, + "logits/rejected": -1.2096281051635742, + "logps/chosen": -48.453758239746094, + "logps/rejected": -60.63860321044922, + "loss": 0.1153, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18309545516967773, + "rewards/margins": 4.147503852844238, + "rewards/rejected": -3.9644083976745605, + "step": 586 + }, + { + "epoch": 3.4785185185185186, + "grad_norm": 11.54515377570644, + "learning_rate": 3.46495488245479e-07, + "logits/chosen": -1.049604892730713, + "logits/rejected": -1.170586109161377, + "logps/chosen": -31.83830451965332, + "logps/rejected": -60.98114776611328, + "loss": 0.1006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018343007192015648, + "rewards/margins": 4.682507514953613, + "rewards/rejected": -4.700850963592529, + "step": 587 + }, + { + "epoch": 3.4844444444444447, + "grad_norm": 9.908784563066144, + "learning_rate": 3.4589587879071814e-07, + "logits/chosen": -1.207253098487854, + "logits/rejected": -1.2515345811843872, + "logps/chosen": -36.640602111816406, + "logps/rejected": -65.44203186035156, + "loss": 0.097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08374863117933273, + "rewards/margins": 5.360215187072754, + "rewards/rejected": -5.276466369628906, + "step": 588 + }, + { + "epoch": 3.4903703703703703, + "grad_norm": 13.154285168155294, + "learning_rate": 3.452956218254165e-07, + "logits/chosen": -1.159376859664917, + "logits/rejected": -1.3010073900222778, + "logps/chosen": -69.1679916381836, + "logps/rejected": -80.59864807128906, + "loss": 0.1031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6153389811515808, + "rewards/margins": 3.839124917984009, + "rewards/rejected": -4.454463958740234, + "step": 589 + }, + { + "epoch": 3.4962962962962965, + "grad_norm": 13.385750783978704, + "learning_rate": 3.44694721402644e-07, + "logits/chosen": -1.1509666442871094, + "logits/rejected": -1.2663698196411133, + "logps/chosen": -52.61823272705078, + "logps/rejected": -61.116004943847656, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3829859495162964, + "rewards/margins": 4.054986000061035, + "rewards/rejected": -4.437971591949463, + "step": 590 + }, + { + "epoch": 3.502222222222222, + "grad_norm": 12.780642607852641, + "learning_rate": 3.440931815798156e-07, + "logits/chosen": -1.2461998462677002, + "logits/rejected": -1.3316093683242798, + "logps/chosen": -43.889793395996094, + "logps/rejected": -48.703697204589844, + "loss": 0.1228, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2379981130361557, + "rewards/margins": 3.194875478744507, + "rewards/rejected": -3.4328739643096924, + "step": 591 + }, + { + "epoch": 3.5081481481481482, + "grad_norm": 11.675294006646098, + "learning_rate": 3.434910064186633e-07, + "logits/chosen": -1.3106170892715454, + "logits/rejected": -1.3728466033935547, + "logps/chosen": -62.90284729003906, + "logps/rejected": -79.64160919189453, + "loss": 0.1167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06035172939300537, + "rewards/margins": 4.127461910247803, + "rewards/rejected": -4.187813758850098, + "step": 592 + }, + { + "epoch": 3.514074074074074, + "grad_norm": 12.545333065112036, + "learning_rate": 3.428881999852093e-07, + "logits/chosen": -1.2505770921707153, + "logits/rejected": -1.1669187545776367, + "logps/chosen": -61.93037033081055, + "logps/rejected": -59.22869873046875, + "loss": 0.1287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6076765060424805, + "rewards/margins": 3.8657004833221436, + "rewards/rejected": -4.473376750946045, + "step": 593 + }, + { + "epoch": 3.52, + "grad_norm": 12.039562363809202, + "learning_rate": 3.4228476634973836e-07, + "logits/chosen": -1.4041945934295654, + "logits/rejected": -1.3803038597106934, + "logps/chosen": -37.459320068359375, + "logps/rejected": -47.55082321166992, + "loss": 0.1355, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13520795106887817, + "rewards/margins": 2.7191033363342285, + "rewards/rejected": -2.854311466217041, + "step": 594 + }, + { + "epoch": 3.525925925925926, + "grad_norm": 10.649784093665557, + "learning_rate": 3.4168070958676985e-07, + "logits/chosen": -1.319873571395874, + "logits/rejected": -1.3600068092346191, + "logps/chosen": -37.87757873535156, + "logps/rejected": -63.027076721191406, + "loss": 0.0997, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26689252257347107, + "rewards/margins": 2.804678440093994, + "rewards/rejected": -3.071570873260498, + "step": 595 + }, + { + "epoch": 3.531851851851852, + "grad_norm": 13.340100702360116, + "learning_rate": 3.41076033775031e-07, + "logits/chosen": -1.3849197626113892, + "logits/rejected": -1.2956639528274536, + "logps/chosen": -52.903194427490234, + "logps/rejected": -65.97325134277344, + "loss": 0.1414, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3340446949005127, + "rewards/margins": 3.176222562789917, + "rewards/rejected": -3.5102672576904297, + "step": 596 + }, + { + "epoch": 3.537777777777778, + "grad_norm": 11.56286685664957, + "learning_rate": 3.404707429974289e-07, + "logits/chosen": -1.1817491054534912, + "logits/rejected": -1.163534164428711, + "logps/chosen": -49.18204116821289, + "logps/rejected": -59.26782989501953, + "loss": 0.111, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37444692850112915, + "rewards/margins": 4.150087833404541, + "rewards/rejected": -4.524535179138184, + "step": 597 + }, + { + "epoch": 3.5437037037037036, + "grad_norm": 12.010067381486293, + "learning_rate": 3.3986484134102294e-07, + "logits/chosen": -1.245498776435852, + "logits/rejected": -1.3706715106964111, + "logps/chosen": -37.453086853027344, + "logps/rejected": -47.116939544677734, + "loss": 0.1253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.28796306252479553, + "rewards/margins": 3.1174545288085938, + "rewards/rejected": -3.4054174423217773, + "step": 598 + }, + { + "epoch": 3.5496296296296297, + "grad_norm": 8.683364146472007, + "learning_rate": 3.392583328969975e-07, + "logits/chosen": -1.2362431287765503, + "logits/rejected": -1.2151918411254883, + "logps/chosen": -50.436302185058594, + "logps/rejected": -57.43449020385742, + "loss": 0.0812, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2915821969509125, + "rewards/margins": 2.8281185626983643, + "rewards/rejected": -3.1197006702423096, + "step": 599 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 10.085808487260866, + "learning_rate": 3.3865122176063385e-07, + "logits/chosen": -1.21660578250885, + "logits/rejected": -1.2353140115737915, + "logps/chosen": -71.79434967041016, + "logps/rejected": -83.12904357910156, + "loss": 0.1073, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2442128658294678, + "rewards/margins": 4.630167484283447, + "rewards/rejected": -5.874380588531494, + "step": 600 + }, + { + "epoch": 3.5614814814814815, + "grad_norm": 11.573039808812974, + "learning_rate": 3.380435120312831e-07, + "logits/chosen": -1.3292632102966309, + "logits/rejected": -1.4217140674591064, + "logps/chosen": -38.365882873535156, + "logps/rejected": -79.41341400146484, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07226824760437012, + "rewards/margins": 4.579197883605957, + "rewards/rejected": -4.506929874420166, + "step": 601 + }, + { + "epoch": 3.5674074074074076, + "grad_norm": 9.394842443174891, + "learning_rate": 3.374352078123379e-07, + "logits/chosen": -1.1822998523712158, + "logits/rejected": -1.241405725479126, + "logps/chosen": -53.02170944213867, + "logps/rejected": -79.73939514160156, + "loss": 0.0917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29872894287109375, + "rewards/margins": 4.012335300445557, + "rewards/rejected": -4.31106424331665, + "step": 602 + }, + { + "epoch": 3.5733333333333333, + "grad_norm": 11.84291739497059, + "learning_rate": 3.36826313211205e-07, + "logits/chosen": -1.2073980569839478, + "logits/rejected": -1.280730962753296, + "logps/chosen": -43.82855987548828, + "logps/rejected": -68.55513000488281, + "loss": 0.112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4863467216491699, + "rewards/margins": 4.065818786621094, + "rewards/rejected": -4.552165508270264, + "step": 603 + }, + { + "epoch": 3.5792592592592594, + "grad_norm": 12.828020094911588, + "learning_rate": 3.36216832339278e-07, + "logits/chosen": -1.3106424808502197, + "logits/rejected": -1.392452597618103, + "logps/chosen": -59.96405792236328, + "logps/rejected": -78.44798278808594, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5146230459213257, + "rewards/margins": 5.168149948120117, + "rewards/rejected": -5.682773113250732, + "step": 604 + }, + { + "epoch": 3.585185185185185, + "grad_norm": 13.845119069512217, + "learning_rate": 3.3560676931190866e-07, + "logits/chosen": -1.3275146484375, + "logits/rejected": -1.4015703201293945, + "logps/chosen": -71.70769500732422, + "logps/rejected": -89.53176879882812, + "loss": 0.1426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06229984760284424, + "rewards/margins": 4.775994777679443, + "rewards/rejected": -4.838294982910156, + "step": 605 + }, + { + "epoch": 3.591111111111111, + "grad_norm": 17.106298908100317, + "learning_rate": 3.3499612824837976e-07, + "logits/chosen": -1.2026786804199219, + "logits/rejected": -1.1629083156585693, + "logps/chosen": -53.834800720214844, + "logps/rejected": -69.13336944580078, + "loss": 0.1756, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4738187789916992, + "rewards/margins": 3.8776614665985107, + "rewards/rejected": -4.351480484008789, + "step": 606 + }, + { + "epoch": 3.597037037037037, + "grad_norm": 10.52043087004134, + "learning_rate": 3.343849132718771e-07, + "logits/chosen": -1.3493577241897583, + "logits/rejected": -1.3847932815551758, + "logps/chosen": -43.97184753417969, + "logps/rejected": -59.462989807128906, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44303974509239197, + "rewards/margins": 3.0357778072357178, + "rewards/rejected": -2.592738151550293, + "step": 607 + }, + { + "epoch": 3.602962962962963, + "grad_norm": 10.291205342582417, + "learning_rate": 3.337731285094616e-07, + "logits/chosen": -1.2725692987442017, + "logits/rejected": -1.3912580013275146, + "logps/chosen": -44.75775146484375, + "logps/rejected": -63.437007904052734, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8086339235305786, + "rewards/margins": 3.172126293182373, + "rewards/rejected": -3.980760097503662, + "step": 608 + }, + { + "epoch": 3.608888888888889, + "grad_norm": 10.143519608754804, + "learning_rate": 3.3316077809204163e-07, + "logits/chosen": -1.17635178565979, + "logits/rejected": -1.2556065320968628, + "logps/chosen": -58.35077667236328, + "logps/rejected": -73.878173828125, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04152454063296318, + "rewards/margins": 4.9238481521606445, + "rewards/rejected": -4.882323265075684, + "step": 609 + }, + { + "epoch": 3.6148148148148147, + "grad_norm": 10.812544039186557, + "learning_rate": 3.3254786615434495e-07, + "logits/chosen": -1.4478641748428345, + "logits/rejected": -1.514793872833252, + "logps/chosen": -40.245880126953125, + "logps/rejected": -53.71836471557617, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2759896516799927, + "rewards/margins": 3.572756767272949, + "rewards/rejected": -3.296767234802246, + "step": 610 + }, + { + "epoch": 3.620740740740741, + "grad_norm": 11.075484357385585, + "learning_rate": 3.319343968348908e-07, + "logits/chosen": -1.2261667251586914, + "logits/rejected": -1.2476285696029663, + "logps/chosen": -43.76851272583008, + "logps/rejected": -69.57899475097656, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36359500885009766, + "rewards/margins": 4.579384803771973, + "rewards/rejected": -4.94297981262207, + "step": 611 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 10.278190082484747, + "learning_rate": 3.3132037427596186e-07, + "logits/chosen": -1.0067546367645264, + "logits/rejected": -1.0119884014129639, + "logps/chosen": -35.96860885620117, + "logps/rejected": -54.944618225097656, + "loss": 0.1127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2067289650440216, + "rewards/margins": 3.7989470958709717, + "rewards/rejected": -3.5922181606292725, + "step": 612 + }, + { + "epoch": 3.6325925925925926, + "grad_norm": 14.799071682203722, + "learning_rate": 3.3070580262357676e-07, + "logits/chosen": -1.248012900352478, + "logits/rejected": -1.3041200637817383, + "logps/chosen": -48.836002349853516, + "logps/rejected": -60.00675964355469, + "loss": 0.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29592806100845337, + "rewards/margins": 2.614812135696411, + "rewards/rejected": -2.3188841342926025, + "step": 613 + }, + { + "epoch": 3.6385185185185183, + "grad_norm": 9.398132314598168, + "learning_rate": 3.3009068602746135e-07, + "logits/chosen": -1.1975395679473877, + "logits/rejected": -1.2507511377334595, + "logps/chosen": -53.41824722290039, + "logps/rejected": -82.38655090332031, + "loss": 0.1, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4321832060813904, + "rewards/margins": 4.6858344078063965, + "rewards/rejected": -5.11801815032959, + "step": 614 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 16.544980874132325, + "learning_rate": 3.294750286410213e-07, + "logits/chosen": -1.1811959743499756, + "logits/rejected": -1.2203879356384277, + "logps/chosen": -45.3848876953125, + "logps/rejected": -59.8250732421875, + "loss": 0.1759, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.34930887818336487, + "rewards/margins": 3.638972282409668, + "rewards/rejected": -3.98828125, + "step": 615 + }, + { + "epoch": 3.6503703703703705, + "grad_norm": 11.09187739842382, + "learning_rate": 3.288588346213139e-07, + "logits/chosen": -1.1899372339248657, + "logits/rejected": -1.3012945652008057, + "logps/chosen": -51.77571487426758, + "logps/rejected": -59.12542724609375, + "loss": 0.1265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2844125032424927, + "rewards/margins": 3.2368106842041016, + "rewards/rejected": -3.521223306655884, + "step": 616 + }, + { + "epoch": 3.656296296296296, + "grad_norm": 12.486021087724694, + "learning_rate": 3.282421081290195e-07, + "logits/chosen": -1.3653810024261475, + "logits/rejected": -1.4098625183105469, + "logps/chosen": -55.62464904785156, + "logps/rejected": -74.34550476074219, + "loss": 0.1219, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2354530394077301, + "rewards/margins": 2.5498616695404053, + "rewards/rejected": -2.314408779144287, + "step": 617 + }, + { + "epoch": 3.6622222222222223, + "grad_norm": 14.642400078478088, + "learning_rate": 3.2762485332841404e-07, + "logits/chosen": -1.360192060470581, + "logits/rejected": -1.3391731977462769, + "logps/chosen": -41.53746032714844, + "logps/rejected": -57.24707794189453, + "loss": 0.1429, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11149264872074127, + "rewards/margins": 2.653454065322876, + "rewards/rejected": -2.541961431503296, + "step": 618 + }, + { + "epoch": 3.6681481481481484, + "grad_norm": 12.180997641032809, + "learning_rate": 3.27007074387341e-07, + "logits/chosen": -1.3639566898345947, + "logits/rejected": -1.4342676401138306, + "logps/chosen": -52.15316390991211, + "logps/rejected": -61.58177947998047, + "loss": 0.1335, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3126819133758545, + "rewards/margins": 3.2332041263580322, + "rewards/rejected": -2.9205222129821777, + "step": 619 + }, + { + "epoch": 3.674074074074074, + "grad_norm": 10.139745307420647, + "learning_rate": 3.2638877547718263e-07, + "logits/chosen": -1.2260799407958984, + "logits/rejected": -1.3334007263183594, + "logps/chosen": -47.63837814331055, + "logps/rejected": -63.36333465576172, + "loss": 0.1099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7048265933990479, + "rewards/margins": 3.433518409729004, + "rewards/rejected": -4.138344764709473, + "step": 620 + }, + { + "epoch": 3.68, + "grad_norm": 12.893287201159424, + "learning_rate": 3.2576996077283217e-07, + "logits/chosen": -1.0819010734558105, + "logits/rejected": -1.1939318180084229, + "logps/chosen": -46.48964309692383, + "logps/rejected": -62.650733947753906, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2993219196796417, + "rewards/margins": 4.32916259765625, + "rewards/rejected": -4.62848424911499, + "step": 621 + }, + { + "epoch": 3.685925925925926, + "grad_norm": 12.489361170148916, + "learning_rate": 3.251506344526658e-07, + "logits/chosen": -1.1828008890151978, + "logits/rejected": -1.2193061113357544, + "logps/chosen": -42.48404312133789, + "logps/rejected": -69.676513671875, + "loss": 0.1194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25544118881225586, + "rewards/margins": 3.554778575897217, + "rewards/rejected": -3.8102197647094727, + "step": 622 + }, + { + "epoch": 3.691851851851852, + "grad_norm": 11.325882270181273, + "learning_rate": 3.2453080069851403e-07, + "logits/chosen": -1.1413886547088623, + "logits/rejected": -1.2398507595062256, + "logps/chosen": -52.085227966308594, + "logps/rejected": -63.648101806640625, + "loss": 0.1143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4174678325653076, + "rewards/margins": 4.257204055786133, + "rewards/rejected": -3.839736223220825, + "step": 623 + }, + { + "epoch": 3.6977777777777776, + "grad_norm": 9.234591711774305, + "learning_rate": 3.239104636956337e-07, + "logits/chosen": -1.1018732786178589, + "logits/rejected": -1.2757089138031006, + "logps/chosen": -57.293800354003906, + "logps/rejected": -78.91786193847656, + "loss": 0.0852, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21278901398181915, + "rewards/margins": 3.945615291595459, + "rewards/rejected": -4.1584038734436035, + "step": 624 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 10.3979498655452, + "learning_rate": 3.2328962763267993e-07, + "logits/chosen": -1.3757495880126953, + "logits/rejected": -1.4409973621368408, + "logps/chosen": -47.846397399902344, + "logps/rejected": -62.478065490722656, + "loss": 0.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21286453306674957, + "rewards/margins": 3.6221718788146973, + "rewards/rejected": -3.8350367546081543, + "step": 625 + }, + { + "epoch": 3.70962962962963, + "grad_norm": 15.014189972537254, + "learning_rate": 3.2266829670167736e-07, + "logits/chosen": -1.10061514377594, + "logits/rejected": -1.373170018196106, + "logps/chosen": -51.52172088623047, + "logps/rejected": -87.5226058959961, + "loss": 0.1303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.577337920665741, + "rewards/margins": 4.87119197845459, + "rewards/rejected": -5.4485297203063965, + "step": 626 + }, + { + "epoch": 3.7155555555555555, + "grad_norm": 11.529836335453567, + "learning_rate": 3.2204647509799216e-07, + "logits/chosen": -1.3851773738861084, + "logits/rejected": -1.317387580871582, + "logps/chosen": -67.03350830078125, + "logps/rejected": -73.63191223144531, + "loss": 0.1434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7230323553085327, + "rewards/margins": 3.9368014335632324, + "rewards/rejected": -4.659833908081055, + "step": 627 + }, + { + "epoch": 3.7214814814814816, + "grad_norm": 11.381704318346536, + "learning_rate": 3.2142416702030365e-07, + "logits/chosen": -1.4134955406188965, + "logits/rejected": -1.4581289291381836, + "logps/chosen": -37.11763000488281, + "logps/rejected": -64.74070739746094, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05463176220655441, + "rewards/margins": 3.725959062576294, + "rewards/rejected": -3.6713271141052246, + "step": 628 + }, + { + "epoch": 3.7274074074074073, + "grad_norm": 10.461787154565728, + "learning_rate": 3.2080137667057595e-07, + "logits/chosen": -1.2077778577804565, + "logits/rejected": -1.2291464805603027, + "logps/chosen": -43.65045166015625, + "logps/rejected": -48.41703796386719, + "loss": 0.1067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25063371658325195, + "rewards/margins": 2.5582003593444824, + "rewards/rejected": -2.3075666427612305, + "step": 629 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 12.686465349682216, + "learning_rate": 3.201781082540297e-07, + "logits/chosen": -1.1312532424926758, + "logits/rejected": -1.148064374923706, + "logps/chosen": -39.60634231567383, + "logps/rejected": -57.44070816040039, + "loss": 0.1388, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3378147482872009, + "rewards/margins": 2.453248977661133, + "rewards/rejected": -2.7910635471343994, + "step": 630 + }, + { + "epoch": 3.739259259259259, + "grad_norm": 11.96368647028074, + "learning_rate": 3.1955436597911315e-07, + "logits/chosen": -1.215735912322998, + "logits/rejected": -1.3225897550582886, + "logps/chosen": -50.87785339355469, + "logps/rejected": -59.81559753417969, + "loss": 0.115, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0800708532333374, + "rewards/margins": 4.3443708419799805, + "rewards/rejected": -4.2642998695373535, + "step": 631 + }, + { + "epoch": 3.745185185185185, + "grad_norm": 16.889261383609924, + "learning_rate": 3.1893015405747467e-07, + "logits/chosen": -1.0199214220046997, + "logits/rejected": -1.0140265226364136, + "logps/chosen": -41.95586395263672, + "logps/rejected": -54.95637130737305, + "loss": 0.1802, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7874963283538818, + "rewards/margins": 3.881711959838867, + "rewards/rejected": -4.669208526611328, + "step": 632 + }, + { + "epoch": 3.7511111111111113, + "grad_norm": 13.116077952649336, + "learning_rate": 3.183054767039333e-07, + "logits/chosen": -1.2483025789260864, + "logits/rejected": -1.2860000133514404, + "logps/chosen": -66.91580963134766, + "logps/rejected": -68.25605010986328, + "loss": 0.1365, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4118862748146057, + "rewards/margins": 3.3932361602783203, + "rewards/rejected": -3.8051223754882812, + "step": 633 + }, + { + "epoch": 3.757037037037037, + "grad_norm": 12.677949496539178, + "learning_rate": 3.176803381364512e-07, + "logits/chosen": -1.021579384803772, + "logits/rejected": -1.080379843711853, + "logps/chosen": -46.37247848510742, + "logps/rejected": -74.96723175048828, + "loss": 0.1178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.666985273361206, + "rewards/margins": 4.058359146118164, + "rewards/rejected": -4.725344181060791, + "step": 634 + }, + { + "epoch": 3.762962962962963, + "grad_norm": 12.274361965505955, + "learning_rate": 3.170547425761046e-07, + "logits/chosen": -1.149162769317627, + "logits/rejected": -1.1602625846862793, + "logps/chosen": -44.212005615234375, + "logps/rejected": -62.73664855957031, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05275583267211914, + "rewards/margins": 4.03060245513916, + "rewards/rejected": -4.0833587646484375, + "step": 635 + }, + { + "epoch": 3.7688888888888887, + "grad_norm": 13.580028480959736, + "learning_rate": 3.164286942470553e-07, + "logits/chosen": -1.3504812717437744, + "logits/rejected": -1.347651481628418, + "logps/chosen": -46.518310546875, + "logps/rejected": -79.4151840209961, + "loss": 0.1202, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36658138036727905, + "rewards/margins": 3.574552536010742, + "rewards/rejected": -3.941133975982666, + "step": 636 + }, + { + "epoch": 3.774814814814815, + "grad_norm": 15.640271201233753, + "learning_rate": 3.1580219737652254e-07, + "logits/chosen": -1.3655234575271606, + "logits/rejected": -1.4588322639465332, + "logps/chosen": -47.00981521606445, + "logps/rejected": -64.0774917602539, + "loss": 0.1707, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13352152705192566, + "rewards/margins": 4.234663009643555, + "rewards/rejected": -4.368184566497803, + "step": 637 + }, + { + "epoch": 3.7807407407407405, + "grad_norm": 8.430239195551694, + "learning_rate": 3.1517525619475394e-07, + "logits/chosen": -1.3756463527679443, + "logits/rejected": -1.388979196548462, + "logps/chosen": -38.930641174316406, + "logps/rejected": -53.12352752685547, + "loss": 0.0854, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0509943813085556, + "rewards/margins": 3.156320571899414, + "rewards/rejected": -3.1053261756896973, + "step": 638 + }, + { + "epoch": 3.7866666666666666, + "grad_norm": 15.973788029035022, + "learning_rate": 3.145478749349974e-07, + "logits/chosen": -1.325491189956665, + "logits/rejected": -1.391890048980713, + "logps/chosen": -58.373809814453125, + "logps/rejected": -72.70347595214844, + "loss": 0.1228, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7778370380401611, + "rewards/margins": 3.0197646617889404, + "rewards/rejected": -3.7976019382476807, + "step": 639 + }, + { + "epoch": 3.7925925925925927, + "grad_norm": 9.758280669090905, + "learning_rate": 3.139200578334724e-07, + "logits/chosen": -1.0628029108047485, + "logits/rejected": -1.0790042877197266, + "logps/chosen": -53.540550231933594, + "logps/rejected": -68.91120147705078, + "loss": 0.1044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7394209504127502, + "rewards/margins": 3.1302340030670166, + "rewards/rejected": -3.869655132293701, + "step": 640 + }, + { + "epoch": 3.7985185185185184, + "grad_norm": 12.675841938430084, + "learning_rate": 3.132918091293411e-07, + "logits/chosen": -1.3551039695739746, + "logits/rejected": -1.3326640129089355, + "logps/chosen": -47.03355407714844, + "logps/rejected": -60.826377868652344, + "loss": 0.1353, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5289521813392639, + "rewards/margins": 2.3704404830932617, + "rewards/rejected": -2.899392604827881, + "step": 641 + }, + { + "epoch": 3.8044444444444445, + "grad_norm": 10.604427627983807, + "learning_rate": 3.126631330646801e-07, + "logits/chosen": -1.3293988704681396, + "logits/rejected": -1.3584859371185303, + "logps/chosen": -41.8519287109375, + "logps/rejected": -55.53318786621094, + "loss": 0.1014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26372450590133667, + "rewards/margins": 3.610297679901123, + "rewards/rejected": -3.3465731143951416, + "step": 642 + }, + { + "epoch": 3.8103703703703706, + "grad_norm": 11.285352917600749, + "learning_rate": 3.120340338844516e-07, + "logits/chosen": -1.3401509523391724, + "logits/rejected": -1.400542974472046, + "logps/chosen": -46.122562408447266, + "logps/rejected": -58.14826965332031, + "loss": 0.1172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2642550468444824, + "rewards/margins": 4.115180015563965, + "rewards/rejected": -4.379435062408447, + "step": 643 + }, + { + "epoch": 3.8162962962962963, + "grad_norm": 13.343593594001085, + "learning_rate": 3.1140451583647464e-07, + "logits/chosen": -1.2204161882400513, + "logits/rejected": -1.2906641960144043, + "logps/chosen": -45.432891845703125, + "logps/rejected": -67.70313262939453, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40972137451171875, + "rewards/margins": 5.800241470336914, + "rewards/rejected": -6.209963321685791, + "step": 644 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 12.873863711524187, + "learning_rate": 3.1077458317139677e-07, + "logits/chosen": -1.3262977600097656, + "logits/rejected": -1.30996835231781, + "logps/chosen": -43.31980895996094, + "logps/rejected": -52.978004455566406, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.00339663028717041, + "rewards/margins": 3.1779003143310547, + "rewards/rejected": -3.1745035648345947, + "step": 645 + }, + { + "epoch": 3.828148148148148, + "grad_norm": 12.300873984167348, + "learning_rate": 3.1014424014266494e-07, + "logits/chosen": -1.0281978845596313, + "logits/rejected": -1.1137367486953735, + "logps/chosen": -37.203250885009766, + "logps/rejected": -54.28007507324219, + "loss": 0.1485, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.59632408618927, + "rewards/margins": 4.16163969039917, + "rewards/rejected": -4.757964134216309, + "step": 646 + }, + { + "epoch": 3.834074074074074, + "grad_norm": 9.697607033127737, + "learning_rate": 3.095134910064971e-07, + "logits/chosen": -1.3984662294387817, + "logits/rejected": -1.4173474311828613, + "logps/chosen": -59.23346710205078, + "logps/rejected": -51.855438232421875, + "loss": 0.0892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7792799472808838, + "rewards/margins": 3.3985934257507324, + "rewards/rejected": -4.177873134613037, + "step": 647 + }, + { + "epoch": 3.84, + "grad_norm": 12.01232429084738, + "learning_rate": 3.0888234002185325e-07, + "logits/chosen": -1.2062727212905884, + "logits/rejected": -1.2513091564178467, + "logps/chosen": -36.36381530761719, + "logps/rejected": -55.708404541015625, + "loss": 0.1198, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.009294629096984863, + "rewards/margins": 3.4298362731933594, + "rewards/rejected": -3.439131021499634, + "step": 648 + }, + { + "epoch": 3.845925925925926, + "grad_norm": 15.137381506982647, + "learning_rate": 3.082507914504068e-07, + "logits/chosen": -1.3288854360580444, + "logits/rejected": -1.3593389987945557, + "logps/chosen": -56.4027099609375, + "logps/rejected": -69.04844665527344, + "loss": 0.1363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6180347204208374, + "rewards/margins": 3.7794382572174072, + "rewards/rejected": -4.397473335266113, + "step": 649 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 15.831659643387738, + "learning_rate": 3.0761884955651563e-07, + "logits/chosen": -1.3083950281143188, + "logits/rejected": -1.2522752285003662, + "logps/chosen": -56.29924774169922, + "logps/rejected": -51.787841796875, + "loss": 0.1859, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2072955071926117, + "rewards/margins": 2.5591542720794678, + "rewards/rejected": -2.7664496898651123, + "step": 650 + }, + { + "epoch": 3.8577777777777778, + "grad_norm": 11.29674521911131, + "learning_rate": 3.069865186071938e-07, + "logits/chosen": -1.3050025701522827, + "logits/rejected": -1.3897082805633545, + "logps/chosen": -46.678955078125, + "logps/rejected": -64.03926086425781, + "loss": 0.1305, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24213455617427826, + "rewards/margins": 4.125871658325195, + "rewards/rejected": -3.8837366104125977, + "step": 651 + }, + { + "epoch": 3.863703703703704, + "grad_norm": 10.624392101707619, + "learning_rate": 3.0635380287208184e-07, + "logits/chosen": -1.1970927715301514, + "logits/rejected": -1.2164897918701172, + "logps/chosen": -48.83156967163086, + "logps/rejected": -64.98463439941406, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.283173143863678, + "rewards/margins": 4.483406066894531, + "rewards/rejected": -4.7665791511535645, + "step": 652 + }, + { + "epoch": 3.8696296296296295, + "grad_norm": 11.033803801081728, + "learning_rate": 3.057207066234188e-07, + "logits/chosen": -1.3635876178741455, + "logits/rejected": -1.3974814414978027, + "logps/chosen": -46.53936004638672, + "logps/rejected": -53.70029830932617, + "loss": 0.1175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.036083512008190155, + "rewards/margins": 3.1924872398376465, + "rewards/rejected": -3.2285706996917725, + "step": 653 + }, + { + "epoch": 3.8755555555555556, + "grad_norm": 13.60287696207738, + "learning_rate": 3.0508723413601296e-07, + "logits/chosen": -1.3430231809616089, + "logits/rejected": -1.4458032846450806, + "logps/chosen": -53.1356201171875, + "logps/rejected": -65.16506958007812, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2436259686946869, + "rewards/margins": 3.3348610401153564, + "rewards/rejected": -3.578486919403076, + "step": 654 + }, + { + "epoch": 3.8814814814814813, + "grad_norm": 14.148324264131578, + "learning_rate": 3.0445338968721283e-07, + "logits/chosen": -1.2519584894180298, + "logits/rejected": -1.3373234272003174, + "logps/chosen": -57.95303726196289, + "logps/rejected": -75.46018981933594, + "loss": 0.1418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0708034485578537, + "rewards/margins": 3.977604389190674, + "rewards/rejected": -3.9068009853363037, + "step": 655 + }, + { + "epoch": 3.8874074074074074, + "grad_norm": 11.954874367510866, + "learning_rate": 3.0381917755687896e-07, + "logits/chosen": -1.1269997358322144, + "logits/rejected": -1.0959970951080322, + "logps/chosen": -49.12158966064453, + "logps/rejected": -68.00682067871094, + "loss": 0.1457, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9364160299301147, + "rewards/margins": 3.3399736881256104, + "rewards/rejected": -4.2763895988464355, + "step": 656 + }, + { + "epoch": 3.8933333333333335, + "grad_norm": 12.29898950820709, + "learning_rate": 3.0318460202735415e-07, + "logits/chosen": -1.283673882484436, + "logits/rejected": -1.347425937652588, + "logps/chosen": -45.70671081542969, + "logps/rejected": -56.07836151123047, + "loss": 0.1389, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2212280035018921, + "rewards/margins": 3.2367424964904785, + "rewards/rejected": -3.457970380783081, + "step": 657 + }, + { + "epoch": 3.899259259259259, + "grad_norm": 13.788350104255459, + "learning_rate": 3.025496673834351e-07, + "logits/chosen": -1.26125967502594, + "logits/rejected": -1.2926025390625, + "logps/chosen": -51.297176361083984, + "logps/rejected": -61.652164459228516, + "loss": 0.134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7250484228134155, + "rewards/margins": 3.813504457473755, + "rewards/rejected": -4.538553237915039, + "step": 658 + }, + { + "epoch": 3.9051851851851853, + "grad_norm": 12.893745092209024, + "learning_rate": 3.0191437791234335e-07, + "logits/chosen": -1.1480952501296997, + "logits/rejected": -1.1113182306289673, + "logps/chosen": -46.87205123901367, + "logps/rejected": -63.754608154296875, + "loss": 0.1155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06853310763835907, + "rewards/margins": 4.413917541503906, + "rewards/rejected": -4.34538459777832, + "step": 659 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 12.585185044373889, + "learning_rate": 3.0127873790369625e-07, + "logits/chosen": -1.0948622226715088, + "logits/rejected": -1.1531262397766113, + "logps/chosen": -38.73649215698242, + "logps/rejected": -46.320533752441406, + "loss": 0.1333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4145849943161011, + "rewards/margins": 2.6738719940185547, + "rewards/rejected": -3.0884571075439453, + "step": 660 + }, + { + "epoch": 3.917037037037037, + "grad_norm": 9.634291710166424, + "learning_rate": 3.006427516494781e-07, + "logits/chosen": -1.281577229499817, + "logits/rejected": -1.2957377433776855, + "logps/chosen": -38.95976257324219, + "logps/rejected": -58.57207107543945, + "loss": 0.1125, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2174490988254547, + "rewards/margins": 3.0695672035217285, + "rewards/rejected": -2.8521180152893066, + "step": 661 + }, + { + "epoch": 3.9229629629629628, + "grad_norm": 12.649837995937807, + "learning_rate": 3.000064234440111e-07, + "logits/chosen": -1.3475849628448486, + "logits/rejected": -1.374518871307373, + "logps/chosen": -50.88645935058594, + "logps/rejected": -61.86933898925781, + "loss": 0.1177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008303865790367126, + "rewards/margins": 3.0162546634674072, + "rewards/rejected": -3.007950782775879, + "step": 662 + }, + { + "epoch": 3.928888888888889, + "grad_norm": 9.838791281025717, + "learning_rate": 2.9936975758392644e-07, + "logits/chosen": -1.3352168798446655, + "logits/rejected": -1.3545887470245361, + "logps/chosen": -60.31529235839844, + "logps/rejected": -62.78803634643555, + "loss": 0.1063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2238210290670395, + "rewards/margins": 3.8718276023864746, + "rewards/rejected": -4.095648765563965, + "step": 663 + }, + { + "epoch": 3.934814814814815, + "grad_norm": 9.9056351234486, + "learning_rate": 2.9873275836813526e-07, + "logits/chosen": -1.2591410875320435, + "logits/rejected": -1.2903554439544678, + "logps/chosen": -53.714942932128906, + "logps/rejected": -59.55838394165039, + "loss": 0.0887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37991151213645935, + "rewards/margins": 3.501067638397217, + "rewards/rejected": -3.880979061126709, + "step": 664 + }, + { + "epoch": 3.9407407407407407, + "grad_norm": 13.479218611455286, + "learning_rate": 2.980954300977995e-07, + "logits/chosen": -1.2214405536651611, + "logits/rejected": -1.2860974073410034, + "logps/chosen": -52.64996337890625, + "logps/rejected": -71.29141235351562, + "loss": 0.1261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.941660463809967, + "rewards/margins": 4.051599025726318, + "rewards/rejected": -4.993259429931641, + "step": 665 + }, + { + "epoch": 3.9466666666666668, + "grad_norm": 12.022458671134432, + "learning_rate": 2.974577770763028e-07, + "logits/chosen": -1.1799646615982056, + "logits/rejected": -1.3523240089416504, + "logps/chosen": -55.82897186279297, + "logps/rejected": -89.32501220703125, + "loss": 0.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29032182693481445, + "rewards/margins": 5.433279514312744, + "rewards/rejected": -5.7236008644104, + "step": 666 + }, + { + "epoch": 3.9525925925925924, + "grad_norm": 12.03264049240042, + "learning_rate": 2.96819803609222e-07, + "logits/chosen": -1.172480821609497, + "logits/rejected": -1.2781836986541748, + "logps/chosen": -41.319862365722656, + "logps/rejected": -52.289878845214844, + "loss": 0.1168, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18020282685756683, + "rewards/margins": 3.568021297454834, + "rewards/rejected": -3.3878188133239746, + "step": 667 + }, + { + "epoch": 3.9585185185185185, + "grad_norm": 14.764711496945134, + "learning_rate": 2.9618151400429735e-07, + "logits/chosen": -1.314758062362671, + "logits/rejected": -1.427316665649414, + "logps/chosen": -48.17366409301758, + "logps/rejected": -61.67540740966797, + "loss": 0.1463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2936151623725891, + "rewards/margins": 3.672703504562378, + "rewards/rejected": -3.9663188457489014, + "step": 668 + }, + { + "epoch": 3.964444444444444, + "grad_norm": 14.654996509659126, + "learning_rate": 2.955429125714038e-07, + "logits/chosen": -1.1277893781661987, + "logits/rejected": -1.2207601070404053, + "logps/chosen": -42.69140625, + "logps/rejected": -68.8827896118164, + "loss": 0.1258, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.17935360968112946, + "rewards/margins": 3.781348705291748, + "rewards/rejected": -3.960702419281006, + "step": 669 + }, + { + "epoch": 3.9703703703703703, + "grad_norm": 10.996670224142063, + "learning_rate": 2.949040036225218e-07, + "logits/chosen": -1.1256000995635986, + "logits/rejected": -1.1515730619430542, + "logps/chosen": -58.76976776123047, + "logps/rejected": -68.46432495117188, + "loss": 0.093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.126528024673462, + "rewards/margins": 4.9552130699157715, + "rewards/rejected": -6.0817413330078125, + "step": 670 + }, + { + "epoch": 3.9762962962962964, + "grad_norm": 12.15346985049749, + "learning_rate": 2.9426479147170836e-07, + "logits/chosen": -1.2197206020355225, + "logits/rejected": -1.3296701908111572, + "logps/chosen": -37.83189392089844, + "logps/rejected": -61.467247009277344, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36297494173049927, + "rewards/margins": 4.596902370452881, + "rewards/rejected": -4.959877014160156, + "step": 671 + }, + { + "epoch": 3.982222222222222, + "grad_norm": 12.097195045066714, + "learning_rate": 2.9362528043506767e-07, + "logits/chosen": -1.2236287593841553, + "logits/rejected": -1.334072232246399, + "logps/chosen": -64.24101257324219, + "logps/rejected": -72.81991577148438, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9826846718788147, + "rewards/margins": 4.875625133514404, + "rewards/rejected": -5.858309745788574, + "step": 672 + }, + { + "epoch": 3.988148148148148, + "grad_norm": 10.99016588436982, + "learning_rate": 2.929854748307221e-07, + "logits/chosen": -1.1742664575576782, + "logits/rejected": -1.1747915744781494, + "logps/chosen": -50.49982452392578, + "logps/rejected": -60.32571029663086, + "loss": 0.1164, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.032082848250865936, + "rewards/margins": 4.788650989532471, + "rewards/rejected": -4.756568431854248, + "step": 673 + }, + { + "epoch": 3.9940740740740743, + "grad_norm": 10.85735796088607, + "learning_rate": 2.923453789787828e-07, + "logits/chosen": -1.24151611328125, + "logits/rejected": -1.2867331504821777, + "logps/chosen": -50.32501220703125, + "logps/rejected": -67.8655776977539, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40465599298477173, + "rewards/margins": 4.1814141273498535, + "rewards/rejected": -4.5860700607299805, + "step": 674 + }, + { + "epoch": 4.0, + "grad_norm": 13.13554967783866, + "learning_rate": 2.9170499720132106e-07, + "logits/chosen": -1.1751290559768677, + "logits/rejected": -1.1486109495162964, + "logps/chosen": -58.797637939453125, + "logps/rejected": -80.00987243652344, + "loss": 0.1007, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1894952356815338, + "rewards/margins": 3.617033004760742, + "rewards/rejected": -3.8065285682678223, + "step": 675 + }, + { + "epoch": 4.005925925925926, + "grad_norm": 8.09939021223856, + "learning_rate": 2.9106433382233877e-07, + "logits/chosen": -1.2218327522277832, + "logits/rejected": -1.330657720565796, + "logps/chosen": -39.75477981567383, + "logps/rejected": -58.90275573730469, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026934266090393066, + "rewards/margins": 3.75627064704895, + "rewards/rejected": -3.7293362617492676, + "step": 676 + }, + { + "epoch": 4.011851851851852, + "grad_norm": 8.504218288919116, + "learning_rate": 2.90423393167739e-07, + "logits/chosen": -0.9554922580718994, + "logits/rejected": -1.1671780347824097, + "logps/chosen": -63.793670654296875, + "logps/rejected": -89.34516906738281, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.525771975517273, + "rewards/margins": 6.577359199523926, + "rewards/rejected": -7.103131294250488, + "step": 677 + }, + { + "epoch": 4.017777777777778, + "grad_norm": 5.984815722074786, + "learning_rate": 2.897821795652972e-07, + "logits/chosen": -1.021341323852539, + "logits/rejected": -1.1111960411071777, + "logps/chosen": -44.92298889160156, + "logps/rejected": -78.0810775756836, + "loss": 0.0705, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19972413778305054, + "rewards/margins": 5.106929779052734, + "rewards/rejected": -4.907205581665039, + "step": 678 + }, + { + "epoch": 4.023703703703704, + "grad_norm": 8.449296561459144, + "learning_rate": 2.891406973446319e-07, + "logits/chosen": -1.2100530862808228, + "logits/rejected": -1.263127088546753, + "logps/chosen": -64.97561645507812, + "logps/rejected": -73.60565185546875, + "loss": 0.0919, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5560375452041626, + "rewards/margins": 3.675342321395874, + "rewards/rejected": -4.231379985809326, + "step": 679 + }, + { + "epoch": 4.029629629629629, + "grad_norm": 7.8329834369019355, + "learning_rate": 2.8849895083717536e-07, + "logits/chosen": -1.300388216972351, + "logits/rejected": -1.2571924924850464, + "logps/chosen": -52.756065368652344, + "logps/rejected": -63.69276428222656, + "loss": 0.0679, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5070255994796753, + "rewards/margins": 4.4201836585998535, + "rewards/rejected": -5.927209377288818, + "step": 680 + }, + { + "epoch": 4.035555555555556, + "grad_norm": 8.049817495886364, + "learning_rate": 2.8785694437614416e-07, + "logits/chosen": -1.0563764572143555, + "logits/rejected": -1.1485258340835571, + "logps/chosen": -44.871803283691406, + "logps/rejected": -60.164154052734375, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7167099714279175, + "rewards/margins": 3.6469597816467285, + "rewards/rejected": -4.363670349121094, + "step": 681 + }, + { + "epoch": 4.0414814814814815, + "grad_norm": 9.430046973217774, + "learning_rate": 2.872146822965105e-07, + "logits/chosen": -1.23280930519104, + "logits/rejected": -1.1943401098251343, + "logps/chosen": -43.12926483154297, + "logps/rejected": -64.86984252929688, + "loss": 0.093, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12739908695220947, + "rewards/margins": 4.2020697593688965, + "rewards/rejected": -4.329468727111816, + "step": 682 + }, + { + "epoch": 4.047407407407407, + "grad_norm": 7.097112610215551, + "learning_rate": 2.865721689349722e-07, + "logits/chosen": -1.2696402072906494, + "logits/rejected": -1.2817845344543457, + "logps/chosen": -51.86808776855469, + "logps/rejected": -79.98898315429688, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.117104172706604, + "rewards/margins": 5.00604248046875, + "rewards/rejected": -6.123147010803223, + "step": 683 + }, + { + "epoch": 4.053333333333334, + "grad_norm": 9.052336019738735, + "learning_rate": 2.8592940862992415e-07, + "logits/chosen": -1.1059024333953857, + "logits/rejected": -1.173060655593872, + "logps/chosen": -45.80156326293945, + "logps/rejected": -64.51785278320312, + "loss": 0.0814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24664664268493652, + "rewards/margins": 3.944058418273926, + "rewards/rejected": -4.190705299377441, + "step": 684 + }, + { + "epoch": 4.059259259259259, + "grad_norm": 10.05155868088065, + "learning_rate": 2.8528640572142835e-07, + "logits/chosen": -1.23401939868927, + "logits/rejected": -1.380190134048462, + "logps/chosen": -39.44499969482422, + "logps/rejected": -53.958587646484375, + "loss": 0.0975, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8200135231018066, + "rewards/margins": 3.4660472869873047, + "rewards/rejected": -4.286060333251953, + "step": 685 + }, + { + "epoch": 4.065185185185185, + "grad_norm": 8.266099363803365, + "learning_rate": 2.846431645511851e-07, + "logits/chosen": -1.3201549053192139, + "logits/rejected": -1.3402879238128662, + "logps/chosen": -40.695865631103516, + "logps/rejected": -61.442386627197266, + "loss": 0.0861, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18501725792884827, + "rewards/margins": 4.403324604034424, + "rewards/rejected": -4.588342189788818, + "step": 686 + }, + { + "epoch": 4.071111111111111, + "grad_norm": 10.616884607002742, + "learning_rate": 2.839996894625037e-07, + "logits/chosen": -1.1398805379867554, + "logits/rejected": -1.3630214929580688, + "logps/chosen": -42.72355270385742, + "logps/rejected": -79.57286071777344, + "loss": 0.1002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5211660861968994, + "rewards/margins": 5.314165115356445, + "rewards/rejected": -5.835331439971924, + "step": 687 + }, + { + "epoch": 4.077037037037037, + "grad_norm": 7.9070372912181375, + "learning_rate": 2.8335598480027224e-07, + "logits/chosen": -1.2060840129852295, + "logits/rejected": -1.2967393398284912, + "logps/chosen": -57.877464294433594, + "logps/rejected": -65.47186279296875, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24055880308151245, + "rewards/margins": 3.2056002616882324, + "rewards/rejected": -3.4461588859558105, + "step": 688 + }, + { + "epoch": 4.082962962962963, + "grad_norm": 6.337597848363227, + "learning_rate": 2.8271205491092963e-07, + "logits/chosen": -1.240246295928955, + "logits/rejected": -1.3377811908721924, + "logps/chosen": -43.28995895385742, + "logps/rejected": -72.51390838623047, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.404224157333374, + "rewards/margins": 5.710890293121338, + "rewards/rejected": -6.115114688873291, + "step": 689 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 10.078954126450014, + "learning_rate": 2.820679041424352e-07, + "logits/chosen": -1.2308160066604614, + "logits/rejected": -1.246185541152954, + "logps/chosen": -31.00168228149414, + "logps/rejected": -46.25737380981445, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44722241163253784, + "rewards/margins": 2.568000078201294, + "rewards/rejected": -3.0152225494384766, + "step": 690 + }, + { + "epoch": 4.094814814814815, + "grad_norm": 10.105653150868449, + "learning_rate": 2.814235368442398e-07, + "logits/chosen": -1.229535698890686, + "logits/rejected": -1.20412015914917, + "logps/chosen": -56.288055419921875, + "logps/rejected": -72.79161071777344, + "loss": 0.1171, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5891921520233154, + "rewards/margins": 3.7033348083496094, + "rewards/rejected": -4.292527198791504, + "step": 691 + }, + { + "epoch": 4.100740740740741, + "grad_norm": 6.316862928402733, + "learning_rate": 2.8077895736725647e-07, + "logits/chosen": -1.2391550540924072, + "logits/rejected": -1.383090853691101, + "logps/chosen": -48.66514587402344, + "logps/rejected": -71.63360595703125, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3529631793498993, + "rewards/margins": 4.131495952606201, + "rewards/rejected": -4.484459400177002, + "step": 692 + }, + { + "epoch": 4.1066666666666665, + "grad_norm": 8.936104075687247, + "learning_rate": 2.801341700638307e-07, + "logits/chosen": -1.1700717210769653, + "logits/rejected": -1.2463778257369995, + "logps/chosen": -58.735984802246094, + "logps/rejected": -69.45088195800781, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5931496620178223, + "rewards/margins": 4.663151741027832, + "rewards/rejected": -5.256301403045654, + "step": 693 + }, + { + "epoch": 4.112592592592593, + "grad_norm": 7.921423465017035, + "learning_rate": 2.7948917928771153e-07, + "logits/chosen": -1.1677509546279907, + "logits/rejected": -1.2224948406219482, + "logps/chosen": -49.49176025390625, + "logps/rejected": -62.047096252441406, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2260219305753708, + "rewards/margins": 4.291070461273193, + "rewards/rejected": -4.517092704772949, + "step": 694 + }, + { + "epoch": 4.118518518518519, + "grad_norm": 9.635718560557772, + "learning_rate": 2.7884398939402156e-07, + "logits/chosen": -1.1603374481201172, + "logits/rejected": -1.206560730934143, + "logps/chosen": -44.97281265258789, + "logps/rejected": -54.71010208129883, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5538350343704224, + "rewards/margins": 3.9752297401428223, + "rewards/rejected": -4.529065132141113, + "step": 695 + }, + { + "epoch": 4.124444444444444, + "grad_norm": 9.354746925229785, + "learning_rate": 2.78198604739228e-07, + "logits/chosen": -1.0650126934051514, + "logits/rejected": -1.0606831312179565, + "logps/chosen": -50.06618881225586, + "logps/rejected": -48.473289489746094, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2048884630203247, + "rewards/margins": 2.942831516265869, + "rewards/rejected": -2.737943172454834, + "step": 696 + }, + { + "epoch": 4.13037037037037, + "grad_norm": 9.653581475922081, + "learning_rate": 2.7755302968111346e-07, + "logits/chosen": -1.244917392730713, + "logits/rejected": -1.2490586042404175, + "logps/chosen": -59.654884338378906, + "logps/rejected": -84.32585906982422, + "loss": 0.0789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5680751800537109, + "rewards/margins": 5.05828857421875, + "rewards/rejected": -5.626363754272461, + "step": 697 + }, + { + "epoch": 4.136296296296297, + "grad_norm": 9.124932825116472, + "learning_rate": 2.7690726857874564e-07, + "logits/chosen": -1.2023653984069824, + "logits/rejected": -1.243130087852478, + "logps/chosen": -46.383155822753906, + "logps/rejected": -58.44104766845703, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030015796422958374, + "rewards/margins": 3.910349130630493, + "rewards/rejected": -3.9403653144836426, + "step": 698 + }, + { + "epoch": 4.142222222222222, + "grad_norm": 11.768103625307159, + "learning_rate": 2.7626132579244893e-07, + "logits/chosen": -1.333091139793396, + "logits/rejected": -1.287567377090454, + "logps/chosen": -45.71503829956055, + "logps/rejected": -62.3424072265625, + "loss": 0.1023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7412693500518799, + "rewards/margins": 3.5215086936950684, + "rewards/rejected": -4.262778282165527, + "step": 699 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 7.69430929680938, + "learning_rate": 2.756152056837743e-07, + "logits/chosen": -1.271039605140686, + "logits/rejected": -1.2991083860397339, + "logps/chosen": -48.57284927368164, + "logps/rejected": -51.01499938964844, + "loss": 0.0886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21987931430339813, + "rewards/margins": 3.371474266052246, + "rewards/rejected": -3.59135365486145, + "step": 700 + }, + { + "epoch": 4.1540740740740745, + "grad_norm": 6.168894895964799, + "learning_rate": 2.749689126154698e-07, + "logits/chosen": -1.156360149383545, + "logits/rejected": -1.262413740158081, + "logps/chosen": -37.52547073364258, + "logps/rejected": -55.548492431640625, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12576079368591309, + "rewards/margins": 4.342263698577881, + "rewards/rejected": -4.468024253845215, + "step": 701 + }, + { + "epoch": 4.16, + "grad_norm": 7.518876365674256, + "learning_rate": 2.743224509514519e-07, + "logits/chosen": -1.1882476806640625, + "logits/rejected": -1.2636778354644775, + "logps/chosen": -46.34234619140625, + "logps/rejected": -67.68272399902344, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6115562915802002, + "rewards/margins": 4.2533135414123535, + "rewards/rejected": -4.864870071411133, + "step": 702 + }, + { + "epoch": 4.165925925925926, + "grad_norm": 7.326989288396403, + "learning_rate": 2.73675825056775e-07, + "logits/chosen": -1.2471364736557007, + "logits/rejected": -1.2816392183303833, + "logps/chosen": -55.45796203613281, + "logps/rejected": -60.71422576904297, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013647403568029404, + "rewards/margins": 4.265345573425293, + "rewards/rejected": -4.251698017120361, + "step": 703 + }, + { + "epoch": 4.1718518518518515, + "grad_norm": 11.237707075205611, + "learning_rate": 2.730290392976025e-07, + "logits/chosen": -1.2631371021270752, + "logits/rejected": -1.3731496334075928, + "logps/chosen": -58.124061584472656, + "logps/rejected": -62.4581413269043, + "loss": 0.1116, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3890116512775421, + "rewards/margins": 3.304945230484009, + "rewards/rejected": -3.6939570903778076, + "step": 704 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 7.057187300346005, + "learning_rate": 2.723820980411774e-07, + "logits/chosen": -1.063293695449829, + "logits/rejected": -1.1065738201141357, + "logps/chosen": -42.29499053955078, + "logps/rejected": -54.36339569091797, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11712778359651566, + "rewards/margins": 3.8328468799591064, + "rewards/rejected": -3.94997501373291, + "step": 705 + }, + { + "epoch": 4.183703703703704, + "grad_norm": 6.548904651880563, + "learning_rate": 2.7173500565579256e-07, + "logits/chosen": -1.1943180561065674, + "logits/rejected": -1.3050909042358398, + "logps/chosen": -60.831878662109375, + "logps/rejected": -85.12651062011719, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.325363278388977, + "rewards/margins": 5.975900173187256, + "rewards/rejected": -7.301263332366943, + "step": 706 + }, + { + "epoch": 4.189629629629629, + "grad_norm": 5.710679075771105, + "learning_rate": 2.7108776651076116e-07, + "logits/chosen": -1.161668062210083, + "logits/rejected": -1.3700096607208252, + "logps/chosen": -39.77562713623047, + "logps/rejected": -61.1549072265625, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005037426948547363, + "rewards/margins": 5.042215347290039, + "rewards/rejected": -5.047252655029297, + "step": 707 + }, + { + "epoch": 4.195555555555556, + "grad_norm": 7.889770903073412, + "learning_rate": 2.704403849763878e-07, + "logits/chosen": -1.053623080253601, + "logits/rejected": -1.0846974849700928, + "logps/chosen": -52.68038558959961, + "logps/rejected": -68.91546630859375, + "loss": 0.0896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3577325940132141, + "rewards/margins": 3.664928436279297, + "rewards/rejected": -4.022661209106445, + "step": 708 + }, + { + "epoch": 4.201481481481482, + "grad_norm": 7.382296529185899, + "learning_rate": 2.697928654239378e-07, + "logits/chosen": -1.067875623703003, + "logits/rejected": -1.1225016117095947, + "logps/chosen": -41.09888458251953, + "logps/rejected": -51.2681770324707, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025173917412757874, + "rewards/margins": 3.3625717163085938, + "rewards/rejected": -3.3877458572387695, + "step": 709 + }, + { + "epoch": 4.207407407407407, + "grad_norm": 9.009216392494261, + "learning_rate": 2.6914521222560907e-07, + "logits/chosen": -1.2619165182113647, + "logits/rejected": -1.2694003582000732, + "logps/chosen": -55.56476593017578, + "logps/rejected": -77.80066680908203, + "loss": 0.0828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1241753101348877, + "rewards/margins": 5.0723724365234375, + "rewards/rejected": -5.196547508239746, + "step": 710 + }, + { + "epoch": 4.213333333333333, + "grad_norm": 9.523713560320529, + "learning_rate": 2.6849742975450163e-07, + "logits/chosen": -1.2838767766952515, + "logits/rejected": -1.2139889001846313, + "logps/chosen": -53.205867767333984, + "logps/rejected": -66.61186218261719, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016508355736732483, + "rewards/margins": 3.4612367153167725, + "rewards/rejected": -3.4777450561523438, + "step": 711 + }, + { + "epoch": 4.2192592592592595, + "grad_norm": 9.4854359957746, + "learning_rate": 2.6784952238458824e-07, + "logits/chosen": -1.312174677848816, + "logits/rejected": -1.3309844732284546, + "logps/chosen": -50.24847412109375, + "logps/rejected": -67.9576187133789, + "loss": 0.1068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5955944061279297, + "rewards/margins": 4.73298978805542, + "rewards/rejected": -5.32858419418335, + "step": 712 + }, + { + "epoch": 4.225185185185185, + "grad_norm": 9.597811135040962, + "learning_rate": 2.672014944906854e-07, + "logits/chosen": -1.288494348526001, + "logits/rejected": -1.4439034461975098, + "logps/chosen": -52.817901611328125, + "logps/rejected": -83.7956314086914, + "loss": 0.0988, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11418376863002777, + "rewards/margins": 5.282074928283691, + "rewards/rejected": -5.39625883102417, + "step": 713 + }, + { + "epoch": 4.231111111111111, + "grad_norm": 7.492613961615437, + "learning_rate": 2.665533504484231e-07, + "logits/chosen": -1.1451681852340698, + "logits/rejected": -1.2639843225479126, + "logps/chosen": -46.51567077636719, + "logps/rejected": -61.77153396606445, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.22086501121521, + "rewards/margins": 4.6231303215026855, + "rewards/rejected": -5.843995571136475, + "step": 714 + }, + { + "epoch": 4.237037037037037, + "grad_norm": 7.256767586610158, + "learning_rate": 2.6590509463421573e-07, + "logits/chosen": -1.2956974506378174, + "logits/rejected": -1.3932404518127441, + "logps/chosen": -41.55891418457031, + "logps/rejected": -68.27203369140625, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3283975124359131, + "rewards/margins": 5.13521146774292, + "rewards/rejected": -5.463608741760254, + "step": 715 + }, + { + "epoch": 4.242962962962963, + "grad_norm": 5.716493770660752, + "learning_rate": 2.6525673142523217e-07, + "logits/chosen": -1.1941766738891602, + "logits/rejected": -1.291650652885437, + "logps/chosen": -59.82276153564453, + "logps/rejected": -88.97959899902344, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6497074365615845, + "rewards/margins": 6.625558853149414, + "rewards/rejected": -7.275265693664551, + "step": 716 + }, + { + "epoch": 4.248888888888889, + "grad_norm": 7.11703069263412, + "learning_rate": 2.646082651993668e-07, + "logits/chosen": -1.1893032789230347, + "logits/rejected": -1.1735705137252808, + "logps/chosen": -50.201759338378906, + "logps/rejected": -59.90876007080078, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21657828986644745, + "rewards/margins": 4.5485429763793945, + "rewards/rejected": -4.33196496963501, + "step": 717 + }, + { + "epoch": 4.254814814814814, + "grad_norm": 7.311353948110829, + "learning_rate": 2.6395970033520944e-07, + "logits/chosen": -1.3677504062652588, + "logits/rejected": -1.3873264789581299, + "logps/chosen": -53.338356018066406, + "logps/rejected": -62.122920989990234, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33095160126686096, + "rewards/margins": 3.561379909515381, + "rewards/rejected": -3.2304282188415527, + "step": 718 + }, + { + "epoch": 4.260740740740741, + "grad_norm": 6.514828833392831, + "learning_rate": 2.6331104121201575e-07, + "logits/chosen": -1.1285260915756226, + "logits/rejected": -1.2510693073272705, + "logps/chosen": -56.24557876586914, + "logps/rejected": -88.39117431640625, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.019626259803772, + "rewards/margins": 4.762977123260498, + "rewards/rejected": -5.7826032638549805, + "step": 719 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 8.469413160834886, + "learning_rate": 2.626622922096782e-07, + "logits/chosen": -1.225628137588501, + "logits/rejected": -1.2865465879440308, + "logps/chosen": -51.26776885986328, + "logps/rejected": -72.69799041748047, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32359927892684937, + "rewards/margins": 4.169054985046387, + "rewards/rejected": -4.492654323577881, + "step": 720 + }, + { + "epoch": 4.272592592592592, + "grad_norm": 8.233090300808394, + "learning_rate": 2.6201345770869584e-07, + "logits/chosen": -1.178969383239746, + "logits/rejected": -1.201488971710205, + "logps/chosen": -40.95018005371094, + "logps/rejected": -62.00858688354492, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04447879642248154, + "rewards/margins": 4.163085460662842, + "rewards/rejected": -4.1186065673828125, + "step": 721 + }, + { + "epoch": 4.278518518518519, + "grad_norm": 8.262601902648214, + "learning_rate": 2.6136454209014513e-07, + "logits/chosen": -1.2628636360168457, + "logits/rejected": -1.339663028717041, + "logps/chosen": -53.15192794799805, + "logps/rejected": -67.0715560913086, + "loss": 0.0979, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6662226319313049, + "rewards/margins": 3.948902130126953, + "rewards/rejected": -4.615124702453613, + "step": 722 + }, + { + "epoch": 4.2844444444444445, + "grad_norm": 7.476616976809312, + "learning_rate": 2.6071554973565036e-07, + "logits/chosen": -1.1416479349136353, + "logits/rejected": -1.1721899509429932, + "logps/chosen": -39.949527740478516, + "logps/rejected": -50.61048126220703, + "loss": 0.0763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34017127752304077, + "rewards/margins": 3.2426137924194336, + "rewards/rejected": -3.582785129547119, + "step": 723 + }, + { + "epoch": 4.29037037037037, + "grad_norm": 9.478491591850005, + "learning_rate": 2.600664850273538e-07, + "logits/chosen": -1.3482637405395508, + "logits/rejected": -1.398117184638977, + "logps/chosen": -58.73908233642578, + "logps/rejected": -65.84282684326172, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41919857263565063, + "rewards/margins": 3.3531155586242676, + "rewards/rejected": -3.7723135948181152, + "step": 724 + }, + { + "epoch": 4.296296296296296, + "grad_norm": 10.540861291398153, + "learning_rate": 2.594173523478864e-07, + "logits/chosen": -1.4997618198394775, + "logits/rejected": -1.5314929485321045, + "logps/chosen": -41.16127014160156, + "logps/rejected": -64.81993865966797, + "loss": 0.1001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14836309850215912, + "rewards/margins": 4.376749038696289, + "rewards/rejected": -4.525112152099609, + "step": 725 + }, + { + "epoch": 4.302222222222222, + "grad_norm": 11.07779744891079, + "learning_rate": 2.587681560803379e-07, + "logits/chosen": -1.1978309154510498, + "logits/rejected": -1.2351629734039307, + "logps/chosen": -50.8704948425293, + "logps/rejected": -63.83390808105469, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03955966234207153, + "rewards/margins": 3.869476318359375, + "rewards/rejected": -3.829916477203369, + "step": 726 + }, + { + "epoch": 4.308148148148148, + "grad_norm": 8.191167235944452, + "learning_rate": 2.5811890060822754e-07, + "logits/chosen": -1.0349016189575195, + "logits/rejected": -1.0574527978897095, + "logps/chosen": -63.94342803955078, + "logps/rejected": -69.69291687011719, + "loss": 0.0729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6818037629127502, + "rewards/margins": 4.685443878173828, + "rewards/rejected": -5.367248058319092, + "step": 727 + }, + { + "epoch": 4.314074074074074, + "grad_norm": 6.934328643488799, + "learning_rate": 2.574695903154744e-07, + "logits/chosen": -1.2235246896743774, + "logits/rejected": -1.323827862739563, + "logps/chosen": -56.161659240722656, + "logps/rejected": -62.984310150146484, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3723098635673523, + "rewards/margins": 4.63085412979126, + "rewards/rejected": -5.003163814544678, + "step": 728 + }, + { + "epoch": 4.32, + "grad_norm": 8.367680585727957, + "learning_rate": 2.5682022958636753e-07, + "logits/chosen": -1.2532156705856323, + "logits/rejected": -1.2759723663330078, + "logps/chosen": -41.34139633178711, + "logps/rejected": -65.8352279663086, + "loss": 0.0764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27511006593704224, + "rewards/margins": 4.3679704666137695, + "rewards/rejected": -4.643080711364746, + "step": 729 + }, + { + "epoch": 4.325925925925926, + "grad_norm": 7.841973350120815, + "learning_rate": 2.5617082280553655e-07, + "logits/chosen": -1.1496574878692627, + "logits/rejected": -1.1879431009292603, + "logps/chosen": -46.50282287597656, + "logps/rejected": -60.425148010253906, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7109239101409912, + "rewards/margins": 2.608046770095825, + "rewards/rejected": -3.3189704418182373, + "step": 730 + }, + { + "epoch": 4.331851851851852, + "grad_norm": 7.2321529113116165, + "learning_rate": 2.5552137435792215e-07, + "logits/chosen": -1.3128197193145752, + "logits/rejected": -1.427548885345459, + "logps/chosen": -57.724605560302734, + "logps/rejected": -60.89788818359375, + "loss": 0.0726, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1999157816171646, + "rewards/margins": 4.722375869750977, + "rewards/rejected": -4.522459983825684, + "step": 731 + }, + { + "epoch": 4.337777777777778, + "grad_norm": 9.163982491493682, + "learning_rate": 2.5487188862874633e-07, + "logits/chosen": -1.171358346939087, + "logits/rejected": -1.3380895853042603, + "logps/chosen": -39.11927795410156, + "logps/rejected": -61.229248046875, + "loss": 0.0775, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14069584012031555, + "rewards/margins": 4.519740104675293, + "rewards/rejected": -4.379044055938721, + "step": 732 + }, + { + "epoch": 4.343703703703704, + "grad_norm": 7.364719899880132, + "learning_rate": 2.542223700034827e-07, + "logits/chosen": -1.0924508571624756, + "logits/rejected": -1.2838900089263916, + "logps/chosen": -38.81422424316406, + "logps/rejected": -69.35069274902344, + "loss": 0.0559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031546324491500854, + "rewards/margins": 5.743241786956787, + "rewards/rejected": -5.774788856506348, + "step": 733 + }, + { + "epoch": 4.3496296296296295, + "grad_norm": 6.448101416437852, + "learning_rate": 2.535728228678273e-07, + "logits/chosen": -0.9950582385063171, + "logits/rejected": -1.1089781522750854, + "logps/chosen": -42.76914978027344, + "logps/rejected": -63.94660568237305, + "loss": 0.0705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29362452030181885, + "rewards/margins": 4.061101913452148, + "rewards/rejected": -4.354726314544678, + "step": 734 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 7.151314050055646, + "learning_rate": 2.529232516076684e-07, + "logits/chosen": -1.181514024734497, + "logits/rejected": -1.3054252862930298, + "logps/chosen": -40.351993560791016, + "logps/rejected": -58.15304183959961, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06079056113958359, + "rewards/margins": 3.7480669021606445, + "rewards/rejected": -3.8088576793670654, + "step": 735 + }, + { + "epoch": 4.361481481481482, + "grad_norm": 7.714066130134251, + "learning_rate": 2.522736606090572e-07, + "logits/chosen": -1.249624252319336, + "logits/rejected": -1.2443642616271973, + "logps/chosen": -51.097660064697266, + "logps/rejected": -64.75337219238281, + "loss": 0.082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07368945330381393, + "rewards/margins": 3.9429399967193604, + "rewards/rejected": -4.016629695892334, + "step": 736 + }, + { + "epoch": 4.367407407407407, + "grad_norm": 5.457407893062819, + "learning_rate": 2.5162405425817804e-07, + "logits/chosen": -1.201377511024475, + "logits/rejected": -1.2646445035934448, + "logps/chosen": -44.47233963012695, + "logps/rejected": -71.8295669555664, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.576902449131012, + "rewards/margins": 5.617945671081543, + "rewards/rejected": -6.194847106933594, + "step": 737 + }, + { + "epoch": 4.373333333333333, + "grad_norm": 5.615875455951263, + "learning_rate": 2.5097443694131944e-07, + "logits/chosen": -1.2609564065933228, + "logits/rejected": -1.3528251647949219, + "logps/chosen": -50.52897262573242, + "logps/rejected": -85.29624938964844, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0017972737550735474, + "rewards/margins": 5.539052486419678, + "rewards/rejected": -5.540849685668945, + "step": 738 + }, + { + "epoch": 4.37925925925926, + "grad_norm": 7.915562673196729, + "learning_rate": 2.503248130448434e-07, + "logits/chosen": -1.066528081893921, + "logits/rejected": -1.1395859718322754, + "logps/chosen": -48.640525817871094, + "logps/rejected": -67.96998596191406, + "loss": 0.0764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.943220317363739, + "rewards/margins": 5.604055881500244, + "rewards/rejected": -6.547276020050049, + "step": 739 + }, + { + "epoch": 4.385185185185185, + "grad_norm": 8.204759003681545, + "learning_rate": 2.496751869551567e-07, + "logits/chosen": -1.2988148927688599, + "logits/rejected": -1.2978628873825073, + "logps/chosen": -60.13380813598633, + "logps/rejected": -73.75167846679688, + "loss": 0.0846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7862513065338135, + "rewards/margins": 3.2615106105804443, + "rewards/rejected": -4.047761917114258, + "step": 740 + }, + { + "epoch": 4.391111111111111, + "grad_norm": 7.2931351209323765, + "learning_rate": 2.4902556305868064e-07, + "logits/chosen": -1.398193120956421, + "logits/rejected": -1.4105606079101562, + "logps/chosen": -47.56850814819336, + "logps/rejected": -72.68304443359375, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7310683131217957, + "rewards/margins": 5.0538201332092285, + "rewards/rejected": -5.784888744354248, + "step": 741 + }, + { + "epoch": 4.397037037037037, + "grad_norm": 8.125782342021866, + "learning_rate": 2.4837594574182194e-07, + "logits/chosen": -1.4070849418640137, + "logits/rejected": -1.404382348060608, + "logps/chosen": -49.6085319519043, + "logps/rejected": -60.756046295166016, + "loss": 0.0948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6858243346214294, + "rewards/margins": 3.5082602500915527, + "rewards/rejected": -4.194085121154785, + "step": 742 + }, + { + "epoch": 4.402962962962963, + "grad_norm": 8.999809351748363, + "learning_rate": 2.477263393909429e-07, + "logits/chosen": -1.2002407312393188, + "logits/rejected": -1.31519615650177, + "logps/chosen": -48.29846954345703, + "logps/rejected": -67.71229553222656, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3317267596721649, + "rewards/margins": 4.04105806350708, + "rewards/rejected": -4.3727850914001465, + "step": 743 + }, + { + "epoch": 4.408888888888889, + "grad_norm": 9.416772169128135, + "learning_rate": 2.4707674839233165e-07, + "logits/chosen": -1.0937350988388062, + "logits/rejected": -1.089403510093689, + "logps/chosen": -45.1909065246582, + "logps/rejected": -65.4791488647461, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8421599268913269, + "rewards/margins": 5.240363121032715, + "rewards/rejected": -6.082523345947266, + "step": 744 + }, + { + "epoch": 4.4148148148148145, + "grad_norm": 7.888974500578905, + "learning_rate": 2.4642717713217266e-07, + "logits/chosen": -1.3409512042999268, + "logits/rejected": -1.400418758392334, + "logps/chosen": -55.626731872558594, + "logps/rejected": -68.84590911865234, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1184844970703125, + "rewards/margins": 5.060961723327637, + "rewards/rejected": -6.179446697235107, + "step": 745 + }, + { + "epoch": 4.420740740740741, + "grad_norm": 7.559761044306629, + "learning_rate": 2.4577762999651727e-07, + "logits/chosen": -1.343059778213501, + "logits/rejected": -1.4584920406341553, + "logps/chosen": -43.617576599121094, + "logps/rejected": -74.24542236328125, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6335965991020203, + "rewards/margins": 5.247225284576416, + "rewards/rejected": -5.88082218170166, + "step": 746 + }, + { + "epoch": 4.426666666666667, + "grad_norm": 6.632838110103838, + "learning_rate": 2.451281113712537e-07, + "logits/chosen": -1.1152293682098389, + "logits/rejected": -1.1278988122940063, + "logps/chosen": -45.689693450927734, + "logps/rejected": -64.86377716064453, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05360722541809082, + "rewards/margins": 4.151103496551514, + "rewards/rejected": -4.204710960388184, + "step": 747 + }, + { + "epoch": 4.432592592592592, + "grad_norm": 11.658937284533938, + "learning_rate": 2.4447862564207783e-07, + "logits/chosen": -1.2127296924591064, + "logits/rejected": -1.1536256074905396, + "logps/chosen": -44.75196838378906, + "logps/rejected": -76.20709991455078, + "loss": 0.1046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8044648766517639, + "rewards/margins": 4.883936882019043, + "rewards/rejected": -5.688401222229004, + "step": 748 + }, + { + "epoch": 4.438518518518519, + "grad_norm": 8.256649604861265, + "learning_rate": 2.438291771944635e-07, + "logits/chosen": -1.3526889085769653, + "logits/rejected": -1.3918815851211548, + "logps/chosen": -35.526405334472656, + "logps/rejected": -54.13592529296875, + "loss": 0.0901, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2358112633228302, + "rewards/margins": 3.4109444618225098, + "rewards/rejected": -3.175133228302002, + "step": 749 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 7.328075255957023, + "learning_rate": 2.431797704136325e-07, + "logits/chosen": -1.359628677368164, + "logits/rejected": -1.4804301261901855, + "logps/chosen": -39.079750061035156, + "logps/rejected": -73.90196228027344, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36313414573669434, + "rewards/margins": 4.543010711669922, + "rewards/rejected": -4.906144618988037, + "step": 750 + }, + { + "epoch": 4.45037037037037, + "grad_norm": 7.689492001307305, + "learning_rate": 2.425304096845256e-07, + "logits/chosen": -1.3346648216247559, + "logits/rejected": -1.3804266452789307, + "logps/chosen": -79.88153839111328, + "logps/rejected": -85.34058380126953, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5813403129577637, + "rewards/margins": 5.533379554748535, + "rewards/rejected": -7.114720344543457, + "step": 751 + }, + { + "epoch": 4.456296296296296, + "grad_norm": 6.295627131379865, + "learning_rate": 2.4188109939177244e-07, + "logits/chosen": -1.2872998714447021, + "logits/rejected": -1.3642802238464355, + "logps/chosen": -50.76905059814453, + "logps/rejected": -71.48062896728516, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1857454478740692, + "rewards/margins": 4.2039079666137695, + "rewards/rejected": -4.018162250518799, + "step": 752 + }, + { + "epoch": 4.4622222222222225, + "grad_norm": 7.7337867144161505, + "learning_rate": 2.412318439196621e-07, + "logits/chosen": -1.2910356521606445, + "logits/rejected": -1.280400276184082, + "logps/chosen": -53.59968566894531, + "logps/rejected": -50.8829231262207, + "loss": 0.0733, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3582778573036194, + "rewards/margins": 3.956660270690918, + "rewards/rejected": -4.314938068389893, + "step": 753 + }, + { + "epoch": 4.468148148148148, + "grad_norm": 5.6372956740874836, + "learning_rate": 2.405826476521137e-07, + "logits/chosen": -1.0210390090942383, + "logits/rejected": -1.04572331905365, + "logps/chosen": -43.65760040283203, + "logps/rejected": -73.2513656616211, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07161092013120651, + "rewards/margins": 6.007022380828857, + "rewards/rejected": -6.0786333084106445, + "step": 754 + }, + { + "epoch": 4.474074074074074, + "grad_norm": 6.989037730437913, + "learning_rate": 2.399335149726463e-07, + "logits/chosen": -1.4408361911773682, + "logits/rejected": -1.4506512880325317, + "logps/chosen": -42.89796447753906, + "logps/rejected": -52.081092834472656, + "loss": 0.0699, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2379654198884964, + "rewards/margins": 3.7549290657043457, + "rewards/rejected": -3.516963481903076, + "step": 755 + }, + { + "epoch": 4.48, + "grad_norm": 10.371093389622446, + "learning_rate": 2.392844502643497e-07, + "logits/chosen": -1.347337007522583, + "logits/rejected": -1.3378440141677856, + "logps/chosen": -47.60035705566406, + "logps/rejected": -67.4775619506836, + "loss": 0.0791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2960338294506073, + "rewards/margins": 4.631132125854492, + "rewards/rejected": -4.927165508270264, + "step": 756 + }, + { + "epoch": 4.485925925925926, + "grad_norm": 6.9101499487233236, + "learning_rate": 2.3863545790985485e-07, + "logits/chosen": -1.2323015928268433, + "logits/rejected": -1.2121610641479492, + "logps/chosen": -53.849029541015625, + "logps/rejected": -69.28178405761719, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5624325275421143, + "rewards/margins": 5.044997215270996, + "rewards/rejected": -5.607429504394531, + "step": 757 + }, + { + "epoch": 4.491851851851852, + "grad_norm": 7.08379891264837, + "learning_rate": 2.379865422913042e-07, + "logits/chosen": -1.387258768081665, + "logits/rejected": -1.3943510055541992, + "logps/chosen": -40.565216064453125, + "logps/rejected": -62.537567138671875, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.456316202878952, + "rewards/margins": 3.579047918319702, + "rewards/rejected": -4.035364151000977, + "step": 758 + }, + { + "epoch": 4.497777777777777, + "grad_norm": 7.22602339985227, + "learning_rate": 2.3733770779032184e-07, + "logits/chosen": -1.1654456853866577, + "logits/rejected": -1.251575231552124, + "logps/chosen": -50.35562515258789, + "logps/rejected": -58.588584899902344, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7475253939628601, + "rewards/margins": 4.376987934112549, + "rewards/rejected": -5.124513626098633, + "step": 759 + }, + { + "epoch": 4.503703703703704, + "grad_norm": 7.976245827333884, + "learning_rate": 2.3668895878798423e-07, + "logits/chosen": -1.2458069324493408, + "logits/rejected": -1.286975383758545, + "logps/chosen": -39.373497009277344, + "logps/rejected": -55.45878219604492, + "loss": 0.0785, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21001428365707397, + "rewards/margins": 2.6100215911865234, + "rewards/rejected": -2.4000072479248047, + "step": 760 + }, + { + "epoch": 4.50962962962963, + "grad_norm": 5.6749553128019885, + "learning_rate": 2.360402996647906e-07, + "logits/chosen": -1.0600789785385132, + "logits/rejected": -1.1194578409194946, + "logps/chosen": -54.22690200805664, + "logps/rejected": -84.67164611816406, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4948400259017944, + "rewards/margins": 6.4912896156311035, + "rewards/rejected": -7.9861297607421875, + "step": 761 + }, + { + "epoch": 4.515555555555555, + "grad_norm": 8.132240227652444, + "learning_rate": 2.3539173480063318e-07, + "logits/chosen": -1.2046082019805908, + "logits/rejected": -1.321290135383606, + "logps/chosen": -46.84300231933594, + "logps/rejected": -61.15909194946289, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3092740774154663, + "rewards/margins": 3.509507179260254, + "rewards/rejected": -3.8187813758850098, + "step": 762 + }, + { + "epoch": 4.521481481481482, + "grad_norm": 9.233829676880644, + "learning_rate": 2.3474326857476783e-07, + "logits/chosen": -1.122426986694336, + "logits/rejected": -1.3011819124221802, + "logps/chosen": -41.219139099121094, + "logps/rejected": -60.89826965332031, + "loss": 0.1022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3601323366165161, + "rewards/margins": 4.265294075012207, + "rewards/rejected": -4.625426292419434, + "step": 763 + }, + { + "epoch": 4.5274074074074075, + "grad_norm": 7.528293156397691, + "learning_rate": 2.340949053657843e-07, + "logits/chosen": -1.3578842878341675, + "logits/rejected": -1.2270824909210205, + "logps/chosen": -53.93731689453125, + "logps/rejected": -68.09098815917969, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41589680314064026, + "rewards/margins": 5.329880714416504, + "rewards/rejected": -4.913984298706055, + "step": 764 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 8.611181849954223, + "learning_rate": 2.3344664955157685e-07, + "logits/chosen": -0.9866389036178589, + "logits/rejected": -1.0442872047424316, + "logps/chosen": -34.870948791503906, + "logps/rejected": -60.437259674072266, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3536360263824463, + "rewards/margins": 4.980638027191162, + "rewards/rejected": -5.3342742919921875, + "step": 765 + }, + { + "epoch": 4.539259259259259, + "grad_norm": 8.391925166272058, + "learning_rate": 2.3279850550931458e-07, + "logits/chosen": -1.1695847511291504, + "logits/rejected": -1.2806165218353271, + "logps/chosen": -49.50543212890625, + "logps/rejected": -69.00588989257812, + "loss": 0.0673, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20481249690055847, + "rewards/margins": 5.360518455505371, + "rewards/rejected": -5.56533145904541, + "step": 766 + }, + { + "epoch": 4.545185185185185, + "grad_norm": 10.596638977003911, + "learning_rate": 2.3215047761541172e-07, + "logits/chosen": -1.15887451171875, + "logits/rejected": -1.2061165571212769, + "logps/chosen": -37.335731506347656, + "logps/rejected": -65.72675323486328, + "loss": 0.0853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2207244336605072, + "rewards/margins": 4.417181968688965, + "rewards/rejected": -4.637906074523926, + "step": 767 + }, + { + "epoch": 4.551111111111111, + "grad_norm": 8.380077519111088, + "learning_rate": 2.3150257024549845e-07, + "logits/chosen": -1.244296669960022, + "logits/rejected": -1.2550981044769287, + "logps/chosen": -37.77351379394531, + "logps/rejected": -57.30008316040039, + "loss": 0.0756, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8602774739265442, + "rewards/margins": 4.512520790100098, + "rewards/rejected": -5.372798442840576, + "step": 768 + }, + { + "epoch": 4.557037037037037, + "grad_norm": 6.4514979238938475, + "learning_rate": 2.3085478777439096e-07, + "logits/chosen": -1.166955590248108, + "logits/rejected": -1.2839741706848145, + "logps/chosen": -50.53374481201172, + "logps/rejected": -60.122703552246094, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5598335266113281, + "rewards/margins": 4.1827216148376465, + "rewards/rejected": -4.742555141448975, + "step": 769 + }, + { + "epoch": 4.562962962962963, + "grad_norm": 8.023421744646246, + "learning_rate": 2.302071345760622e-07, + "logits/chosen": -1.3408299684524536, + "logits/rejected": -1.3236316442489624, + "logps/chosen": -65.58592224121094, + "logps/rejected": -64.16624450683594, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36367684602737427, + "rewards/margins": 4.246555328369141, + "rewards/rejected": -4.610232353210449, + "step": 770 + }, + { + "epoch": 4.568888888888889, + "grad_norm": 7.446315879947793, + "learning_rate": 2.2955961502361232e-07, + "logits/chosen": -1.3255586624145508, + "logits/rejected": -1.3266708850860596, + "logps/chosen": -47.81196975708008, + "logps/rejected": -57.09688186645508, + "loss": 0.0806, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.008279271423816681, + "rewards/margins": 3.692826747894287, + "rewards/rejected": -3.701106071472168, + "step": 771 + }, + { + "epoch": 4.574814814814815, + "grad_norm": 7.029370466443028, + "learning_rate": 2.2891223348923882e-07, + "logits/chosen": -1.1787618398666382, + "logits/rejected": -1.282931923866272, + "logps/chosen": -58.13561248779297, + "logps/rejected": -81.64736938476562, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6242808103561401, + "rewards/margins": 5.2768659591674805, + "rewards/rejected": -5.901146411895752, + "step": 772 + }, + { + "epoch": 4.58074074074074, + "grad_norm": 7.712344599626733, + "learning_rate": 2.2826499434420745e-07, + "logits/chosen": -0.9917160272598267, + "logits/rejected": -1.097461462020874, + "logps/chosen": -45.970176696777344, + "logps/rejected": -62.3052978515625, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7307544350624084, + "rewards/margins": 4.317347049713135, + "rewards/rejected": -5.048101425170898, + "step": 773 + }, + { + "epoch": 4.586666666666667, + "grad_norm": 9.525692855012032, + "learning_rate": 2.2761790195882261e-07, + "logits/chosen": -1.1223633289337158, + "logits/rejected": -1.1890718936920166, + "logps/chosen": -41.621124267578125, + "logps/rejected": -69.78456115722656, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1493055820465088, + "rewards/margins": 4.050914764404297, + "rewards/rejected": -4.200220108032227, + "step": 774 + }, + { + "epoch": 4.592592592592593, + "grad_norm": 8.524809696419899, + "learning_rate": 2.2697096070239748e-07, + "logits/chosen": -1.2589701414108276, + "logits/rejected": -1.205632209777832, + "logps/chosen": -66.525146484375, + "logps/rejected": -65.48693084716797, + "loss": 0.0821, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6359963417053223, + "rewards/margins": 3.971651554107666, + "rewards/rejected": -4.607647895812988, + "step": 775 + }, + { + "epoch": 4.598518518518518, + "grad_norm": 10.195817407678827, + "learning_rate": 2.2632417494322503e-07, + "logits/chosen": -1.2319070100784302, + "logits/rejected": -1.2836008071899414, + "logps/chosen": -52.31074523925781, + "logps/rejected": -67.07086181640625, + "loss": 0.082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6152889132499695, + "rewards/margins": 4.884402275085449, + "rewards/rejected": -4.269113063812256, + "step": 776 + }, + { + "epoch": 4.604444444444445, + "grad_norm": 6.88306364808887, + "learning_rate": 2.2567754904854809e-07, + "logits/chosen": -1.2738016843795776, + "logits/rejected": -1.3152220249176025, + "logps/chosen": -51.25276184082031, + "logps/rejected": -66.99971008300781, + "loss": 0.0651, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15602001547813416, + "rewards/margins": 4.498040199279785, + "rewards/rejected": -4.342020511627197, + "step": 777 + }, + { + "epoch": 4.6103703703703705, + "grad_norm": 9.223754328006974, + "learning_rate": 2.2503108738453014e-07, + "logits/chosen": -1.1845307350158691, + "logits/rejected": -1.1550822257995605, + "logps/chosen": -40.183563232421875, + "logps/rejected": -61.688209533691406, + "loss": 0.0811, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.43673551082611084, + "rewards/margins": 3.9823086261749268, + "rewards/rejected": -3.5455729961395264, + "step": 778 + }, + { + "epoch": 4.616296296296296, + "grad_norm": 6.540828709467761, + "learning_rate": 2.243847943162257e-07, + "logits/chosen": -1.2384378910064697, + "logits/rejected": -1.1350700855255127, + "logps/chosen": -60.619468688964844, + "logps/rejected": -60.44232177734375, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8905364274978638, + "rewards/margins": 4.085331916809082, + "rewards/rejected": -4.975867748260498, + "step": 779 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 8.735745587389715, + "learning_rate": 2.23738674207551e-07, + "logits/chosen": -1.2693932056427002, + "logits/rejected": -1.3459255695343018, + "logps/chosen": -43.43098831176758, + "logps/rejected": -78.14689636230469, + "loss": 0.0891, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10692919790744781, + "rewards/margins": 6.120091438293457, + "rewards/rejected": -6.013162612915039, + "step": 780 + }, + { + "epoch": 4.628148148148148, + "grad_norm": 8.400037471171428, + "learning_rate": 2.230927314212543e-07, + "logits/chosen": -1.2819609642028809, + "logits/rejected": -1.2576347589492798, + "logps/chosen": -48.470706939697266, + "logps/rejected": -60.44685363769531, + "loss": 0.063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6669830083847046, + "rewards/margins": 4.601341724395752, + "rewards/rejected": -5.268324851989746, + "step": 781 + }, + { + "epoch": 4.634074074074074, + "grad_norm": 7.613491024351818, + "learning_rate": 2.2244697031888655e-07, + "logits/chosen": -1.2104326486587524, + "logits/rejected": -1.2761129140853882, + "logps/chosen": -49.90730285644531, + "logps/rejected": -66.0811767578125, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10209351778030396, + "rewards/margins": 3.9792842864990234, + "rewards/rejected": -4.081377983093262, + "step": 782 + }, + { + "epoch": 4.64, + "grad_norm": 5.297293025171356, + "learning_rate": 2.21801395260772e-07, + "logits/chosen": -1.0075641870498657, + "logits/rejected": -1.131706953048706, + "logps/chosen": -47.31157684326172, + "logps/rejected": -73.90620422363281, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.848071813583374, + "rewards/margins": 6.792016983032227, + "rewards/rejected": -7.64008903503418, + "step": 783 + }, + { + "epoch": 4.645925925925926, + "grad_norm": 10.09311813131246, + "learning_rate": 2.2115601060597852e-07, + "logits/chosen": -1.0886931419372559, + "logits/rejected": -1.167771577835083, + "logps/chosen": -55.40933609008789, + "logps/rejected": -62.43598937988281, + "loss": 0.0892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5595874786376953, + "rewards/margins": 4.344570159912109, + "rewards/rejected": -4.904156684875488, + "step": 784 + }, + { + "epoch": 4.651851851851852, + "grad_norm": 7.293624381026964, + "learning_rate": 2.2051082071228852e-07, + "logits/chosen": -1.3670647144317627, + "logits/rejected": -1.4086506366729736, + "logps/chosen": -44.550655364990234, + "logps/rejected": -54.24536895751953, + "loss": 0.0706, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.411616712808609, + "rewards/margins": 3.4762425422668457, + "rewards/rejected": -3.0646257400512695, + "step": 785 + }, + { + "epoch": 4.657777777777778, + "grad_norm": 8.356926519592564, + "learning_rate": 2.1986582993616925e-07, + "logits/chosen": -1.0711511373519897, + "logits/rejected": -1.1973775625228882, + "logps/chosen": -44.04728317260742, + "logps/rejected": -72.26802062988281, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4325738251209259, + "rewards/margins": 5.791550636291504, + "rewards/rejected": -6.224124908447266, + "step": 786 + }, + { + "epoch": 4.663703703703703, + "grad_norm": 7.361972820362379, + "learning_rate": 2.192210426327435e-07, + "logits/chosen": -1.2996392250061035, + "logits/rejected": -1.3320250511169434, + "logps/chosen": -49.28330993652344, + "logps/rejected": -63.363739013671875, + "loss": 0.0702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45627862215042114, + "rewards/margins": 5.045790195465088, + "rewards/rejected": -4.589511394500732, + "step": 787 + }, + { + "epoch": 4.66962962962963, + "grad_norm": 9.713435992349082, + "learning_rate": 2.185764631557602e-07, + "logits/chosen": -1.1986274719238281, + "logits/rejected": -1.2022976875305176, + "logps/chosen": -40.9835205078125, + "logps/rejected": -68.665283203125, + "loss": 0.1043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37996906042099, + "rewards/margins": 3.0767459869384766, + "rewards/rejected": -3.4567153453826904, + "step": 788 + }, + { + "epoch": 4.6755555555555555, + "grad_norm": 6.6923161913800255, + "learning_rate": 2.1793209585756482e-07, + "logits/chosen": -1.2109483480453491, + "logits/rejected": -1.205720067024231, + "logps/chosen": -78.32020568847656, + "logps/rejected": -94.1859130859375, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6772147417068481, + "rewards/margins": 6.811749458312988, + "rewards/rejected": -7.488964557647705, + "step": 789 + }, + { + "epoch": 4.681481481481481, + "grad_norm": 6.808448924207019, + "learning_rate": 2.1728794508907038e-07, + "logits/chosen": -1.247983455657959, + "logits/rejected": -1.3049252033233643, + "logps/chosen": -46.73481750488281, + "logps/rejected": -105.78466796875, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5628355741500854, + "rewards/margins": 5.637338161468506, + "rewards/rejected": -6.200174331665039, + "step": 790 + }, + { + "epoch": 4.687407407407408, + "grad_norm": 6.957507315180945, + "learning_rate": 2.1664401519972774e-07, + "logits/chosen": -1.09550940990448, + "logits/rejected": -1.099479079246521, + "logps/chosen": -62.40776443481445, + "logps/rejected": -72.61018371582031, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5545617938041687, + "rewards/margins": 5.671229362487793, + "rewards/rejected": -6.225790977478027, + "step": 791 + }, + { + "epoch": 4.693333333333333, + "grad_norm": 7.833525102618103, + "learning_rate": 2.1600031053749634e-07, + "logits/chosen": -1.2867189645767212, + "logits/rejected": -1.2888667583465576, + "logps/chosen": -55.19492721557617, + "logps/rejected": -66.07937622070312, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6458514332771301, + "rewards/margins": 3.38252854347229, + "rewards/rejected": -4.028380393981934, + "step": 792 + }, + { + "epoch": 4.699259259259259, + "grad_norm": 9.046276570843531, + "learning_rate": 2.1535683544881478e-07, + "logits/chosen": -1.3059468269348145, + "logits/rejected": -1.4791343212127686, + "logps/chosen": -42.17692947387695, + "logps/rejected": -58.22074508666992, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08928295224905014, + "rewards/margins": 4.443890571594238, + "rewards/rejected": -4.354607582092285, + "step": 793 + }, + { + "epoch": 4.705185185185185, + "grad_norm": 6.830679137241234, + "learning_rate": 2.147135942785716e-07, + "logits/chosen": -1.247341513633728, + "logits/rejected": -1.2370492219924927, + "logps/chosen": -43.17301559448242, + "logps/rejected": -57.20549011230469, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9720429182052612, + "rewards/margins": 3.6760683059692383, + "rewards/rejected": -4.648111343383789, + "step": 794 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 7.0136622973775715, + "learning_rate": 2.1407059137007583e-07, + "logits/chosen": -1.170547366142273, + "logits/rejected": -1.2341359853744507, + "logps/chosen": -53.60289764404297, + "logps/rejected": -57.69877624511719, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0990561991930008, + "rewards/margins": 4.3058180809021, + "rewards/rejected": -4.206761837005615, + "step": 795 + }, + { + "epoch": 4.717037037037037, + "grad_norm": 8.70239443133945, + "learning_rate": 2.1342783106502777e-07, + "logits/chosen": -1.1409107446670532, + "logits/rejected": -1.1933441162109375, + "logps/chosen": -53.885986328125, + "logps/rejected": -81.279052734375, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.367448627948761, + "rewards/margins": 5.0045485496521, + "rewards/rejected": -5.371996879577637, + "step": 796 + }, + { + "epoch": 4.722962962962963, + "grad_norm": 9.48634525371379, + "learning_rate": 2.1278531770348963e-07, + "logits/chosen": -1.1792958974838257, + "logits/rejected": -1.2433199882507324, + "logps/chosen": -53.308433532714844, + "logps/rejected": -67.98117065429688, + "loss": 0.0702, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10387498140335083, + "rewards/margins": 4.540947914123535, + "rewards/rejected": -4.43707275390625, + "step": 797 + }, + { + "epoch": 4.728888888888889, + "grad_norm": 7.014370618377176, + "learning_rate": 2.121430556238559e-07, + "logits/chosen": -1.0922884941101074, + "logits/rejected": -1.1718698740005493, + "logps/chosen": -42.14895248413086, + "logps/rejected": -60.662933349609375, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05287043750286102, + "rewards/margins": 6.049391746520996, + "rewards/rejected": -5.996521472930908, + "step": 798 + }, + { + "epoch": 4.734814814814815, + "grad_norm": 6.411956467587046, + "learning_rate": 2.115010491628247e-07, + "logits/chosen": -1.1495752334594727, + "logits/rejected": -1.2174811363220215, + "logps/chosen": -39.45281982421875, + "logps/rejected": -59.280311584472656, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009924978017807007, + "rewards/margins": 4.528892993927002, + "rewards/rejected": -4.518968105316162, + "step": 799 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 6.922372990111659, + "learning_rate": 2.1085930265536808e-07, + "logits/chosen": -1.2290149927139282, + "logits/rejected": -1.2082287073135376, + "logps/chosen": -35.51820373535156, + "logps/rejected": -54.339359283447266, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14226499199867249, + "rewards/margins": 3.9152727127075195, + "rewards/rejected": -4.057538032531738, + "step": 800 + }, + { + "epoch": 4.746666666666667, + "grad_norm": 8.936768807671625, + "learning_rate": 2.1021782043470278e-07, + "logits/chosen": -1.1941179037094116, + "logits/rejected": -1.2901239395141602, + "logps/chosen": -60.116851806640625, + "logps/rejected": -72.54811096191406, + "loss": 0.0854, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2554321885108948, + "rewards/margins": 4.877737998962402, + "rewards/rejected": -5.133170127868652, + "step": 801 + }, + { + "epoch": 4.752592592592593, + "grad_norm": 7.721368969119234, + "learning_rate": 2.0957660683226103e-07, + "logits/chosen": -1.2028778791427612, + "logits/rejected": -1.2239680290222168, + "logps/chosen": -44.84784698486328, + "logps/rejected": -61.98634338378906, + "loss": 0.0817, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.28557777404785156, + "rewards/margins": 4.31488037109375, + "rewards/rejected": -4.600458145141602, + "step": 802 + }, + { + "epoch": 4.758518518518518, + "grad_norm": 8.44095587390649, + "learning_rate": 2.0893566617766126e-07, + "logits/chosen": -1.3699320554733276, + "logits/rejected": -1.3465888500213623, + "logps/chosen": -54.631710052490234, + "logps/rejected": -57.47709274291992, + "loss": 0.0724, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.38626307249069214, + "rewards/margins": 3.516538619995117, + "rewards/rejected": -3.1302754878997803, + "step": 803 + }, + { + "epoch": 4.764444444444445, + "grad_norm": 8.026629554127119, + "learning_rate": 2.0829500279867891e-07, + "logits/chosen": -1.2593369483947754, + "logits/rejected": -1.3323007822036743, + "logps/chosen": -33.00196075439453, + "logps/rejected": -60.59766387939453, + "loss": 0.0741, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5274396538734436, + "rewards/margins": 4.80533504486084, + "rewards/rejected": -4.277894973754883, + "step": 804 + }, + { + "epoch": 4.770370370370371, + "grad_norm": 5.545357033235097, + "learning_rate": 2.0765462102121719e-07, + "logits/chosen": -1.2611300945281982, + "logits/rejected": -1.2821089029312134, + "logps/chosen": -37.6617431640625, + "logps/rejected": -56.39625930786133, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23814518749713898, + "rewards/margins": 3.720951795578003, + "rewards/rejected": -3.959096908569336, + "step": 805 + }, + { + "epoch": 4.776296296296296, + "grad_norm": 9.33283152045016, + "learning_rate": 2.0701452516927797e-07, + "logits/chosen": -1.1440215110778809, + "logits/rejected": -1.1519482135772705, + "logps/chosen": -54.63264846801758, + "logps/rejected": -75.48451232910156, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.687347948551178, + "rewards/margins": 5.244063377380371, + "rewards/rejected": -5.931410789489746, + "step": 806 + }, + { + "epoch": 4.782222222222222, + "grad_norm": 6.837505033247797, + "learning_rate": 2.0637471956493234e-07, + "logits/chosen": -1.146953821182251, + "logits/rejected": -1.2953850030899048, + "logps/chosen": -36.457977294921875, + "logps/rejected": -64.86138916015625, + "loss": 0.0726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5366591215133667, + "rewards/margins": 4.506161689758301, + "rewards/rejected": -5.042820453643799, + "step": 807 + }, + { + "epoch": 4.7881481481481485, + "grad_norm": 8.388100188306844, + "learning_rate": 2.0573520852829164e-07, + "logits/chosen": -1.0507543087005615, + "logits/rejected": -1.0924817323684692, + "logps/chosen": -39.18290710449219, + "logps/rejected": -56.02238082885742, + "loss": 0.0692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6681569814682007, + "rewards/margins": 4.3059844970703125, + "rewards/rejected": -4.974141597747803, + "step": 808 + }, + { + "epoch": 4.794074074074074, + "grad_norm": 8.473188918963196, + "learning_rate": 2.0509599637747818e-07, + "logits/chosen": -1.3122702836990356, + "logits/rejected": -1.2597553730010986, + "logps/chosen": -44.254432678222656, + "logps/rejected": -61.42188262939453, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7810443639755249, + "rewards/margins": 4.892861843109131, + "rewards/rejected": -5.673906326293945, + "step": 809 + }, + { + "epoch": 4.8, + "grad_norm": 7.661013362311576, + "learning_rate": 2.0445708742859625e-07, + "logits/chosen": -1.1926181316375732, + "logits/rejected": -1.1875218152999878, + "logps/chosen": -56.58489990234375, + "logps/rejected": -69.30337524414062, + "loss": 0.0663, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6484168767929077, + "rewards/margins": 4.6451520919799805, + "rewards/rejected": -5.2935686111450195, + "step": 810 + }, + { + "epoch": 4.805925925925926, + "grad_norm": 8.740225258601434, + "learning_rate": 2.0381848599570273e-07, + "logits/chosen": -1.1209958791732788, + "logits/rejected": -1.1884212493896484, + "logps/chosen": -35.961971282958984, + "logps/rejected": -50.096317291259766, + "loss": 0.0905, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1351906806230545, + "rewards/margins": 4.218876361846924, + "rewards/rejected": -4.083685398101807, + "step": 811 + }, + { + "epoch": 4.811851851851852, + "grad_norm": 8.158970785472667, + "learning_rate": 2.0318019639077803e-07, + "logits/chosen": -1.2397578954696655, + "logits/rejected": -1.3928254842758179, + "logps/chosen": -42.682044982910156, + "logps/rejected": -72.9716567993164, + "loss": 0.0751, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0666639506816864, + "rewards/margins": 4.914846420288086, + "rewards/rejected": -4.981510162353516, + "step": 812 + }, + { + "epoch": 4.817777777777778, + "grad_norm": 9.478210960221906, + "learning_rate": 2.0254222292369724e-07, + "logits/chosen": -1.176468849182129, + "logits/rejected": -1.2728384733200073, + "logps/chosen": -36.69878387451172, + "logps/rejected": -62.156524658203125, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6981914043426514, + "rewards/margins": 3.4020438194274902, + "rewards/rejected": -4.1002349853515625, + "step": 813 + }, + { + "epoch": 4.823703703703703, + "grad_norm": 7.439639636736858, + "learning_rate": 2.0190456990220055e-07, + "logits/chosen": -1.2218072414398193, + "logits/rejected": -1.2740261554718018, + "logps/chosen": -43.83546447753906, + "logps/rejected": -67.54747009277344, + "loss": 0.0646, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2251005917787552, + "rewards/margins": 5.161179542541504, + "rewards/rejected": -4.936079025268555, + "step": 814 + }, + { + "epoch": 4.82962962962963, + "grad_norm": 7.5097409856257595, + "learning_rate": 2.0126724163186474e-07, + "logits/chosen": -1.116912603378296, + "logits/rejected": -1.166030764579773, + "logps/chosen": -49.36180877685547, + "logps/rejected": -60.51682662963867, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.238020122051239, + "rewards/margins": 4.75913143157959, + "rewards/rejected": -4.9971513748168945, + "step": 815 + }, + { + "epoch": 4.835555555555556, + "grad_norm": 7.191731456701853, + "learning_rate": 2.006302424160735e-07, + "logits/chosen": -1.2778334617614746, + "logits/rejected": -1.3720132112503052, + "logps/chosen": -44.0409049987793, + "logps/rejected": -60.05315399169922, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5977808237075806, + "rewards/margins": 4.33931303024292, + "rewards/rejected": -4.937093734741211, + "step": 816 + }, + { + "epoch": 4.841481481481481, + "grad_norm": 8.418630189436973, + "learning_rate": 1.9999357655598891e-07, + "logits/chosen": -1.2416424751281738, + "logits/rejected": -1.2487435340881348, + "logps/chosen": -53.07795715332031, + "logps/rejected": -70.923095703125, + "loss": 0.0645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4244321286678314, + "rewards/margins": 4.733457565307617, + "rewards/rejected": -5.1578898429870605, + "step": 817 + }, + { + "epoch": 4.847407407407408, + "grad_norm": 9.81718746931335, + "learning_rate": 1.9935724835052196e-07, + "logits/chosen": -1.1533488035202026, + "logits/rejected": -1.120931625366211, + "logps/chosen": -58.67433166503906, + "logps/rejected": -74.95533752441406, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7449749112129211, + "rewards/margins": 6.310980796813965, + "rewards/rejected": -7.055954933166504, + "step": 818 + }, + { + "epoch": 4.8533333333333335, + "grad_norm": 8.115955559787736, + "learning_rate": 1.987212620963038e-07, + "logits/chosen": -1.2583997249603271, + "logits/rejected": -1.3349251747131348, + "logps/chosen": -53.448116302490234, + "logps/rejected": -75.52993774414062, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15489783883094788, + "rewards/margins": 5.193795680999756, + "rewards/rejected": -5.34869384765625, + "step": 819 + }, + { + "epoch": 4.859259259259259, + "grad_norm": 6.723480724558627, + "learning_rate": 1.9808562208765663e-07, + "logits/chosen": -1.2606794834136963, + "logits/rejected": -1.2723817825317383, + "logps/chosen": -42.972747802734375, + "logps/rejected": -66.92098999023438, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.50331050157547, + "rewards/margins": 4.977799415588379, + "rewards/rejected": -5.481109619140625, + "step": 820 + }, + { + "epoch": 4.865185185185185, + "grad_norm": 6.72884777182804, + "learning_rate": 1.9745033261656486e-07, + "logits/chosen": -1.3054922819137573, + "logits/rejected": -1.2133138179779053, + "logps/chosen": -53.268924713134766, + "logps/rejected": -73.72822570800781, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7263810634613037, + "rewards/margins": 4.860612392425537, + "rewards/rejected": -5.586993217468262, + "step": 821 + }, + { + "epoch": 4.871111111111111, + "grad_norm": 6.7682757332113495, + "learning_rate": 1.9681539797264578e-07, + "logits/chosen": -1.1518858671188354, + "logits/rejected": -1.1976454257965088, + "logps/chosen": -61.2183837890625, + "logps/rejected": -79.68696594238281, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49593448638916016, + "rewards/margins": 5.0298919677734375, + "rewards/rejected": -5.525826454162598, + "step": 822 + }, + { + "epoch": 4.877037037037037, + "grad_norm": 10.44828841212963, + "learning_rate": 1.96180822443121e-07, + "logits/chosen": -1.2402451038360596, + "logits/rejected": -1.3201285600662231, + "logps/chosen": -52.995582580566406, + "logps/rejected": -63.573150634765625, + "loss": 0.101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16973279416561127, + "rewards/margins": 4.808659553527832, + "rewards/rejected": -4.978392601013184, + "step": 823 + }, + { + "epoch": 4.882962962962963, + "grad_norm": 11.841416570152811, + "learning_rate": 1.955466103127871e-07, + "logits/chosen": -1.0633020401000977, + "logits/rejected": -1.1554279327392578, + "logps/chosen": -41.22328186035156, + "logps/rejected": -64.04997253417969, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6372347474098206, + "rewards/margins": 4.569708347320557, + "rewards/rejected": -5.206943511962891, + "step": 824 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 9.119245176045158, + "learning_rate": 1.9491276586398715e-07, + "logits/chosen": -0.972596287727356, + "logits/rejected": -1.0817652940750122, + "logps/chosen": -40.38216018676758, + "logps/rejected": -71.95773315429688, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8263460397720337, + "rewards/margins": 5.990384578704834, + "rewards/rejected": -6.816730499267578, + "step": 825 + }, + { + "epoch": 4.894814814814815, + "grad_norm": 7.084413600553766, + "learning_rate": 1.9427929337658126e-07, + "logits/chosen": -1.2040290832519531, + "logits/rejected": -1.383630633354187, + "logps/chosen": -40.241455078125, + "logps/rejected": -61.860687255859375, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3590248227119446, + "rewards/margins": 4.8489580154418945, + "rewards/rejected": -5.207983493804932, + "step": 826 + }, + { + "epoch": 4.900740740740741, + "grad_norm": 7.182863168982642, + "learning_rate": 1.9364619712791819e-07, + "logits/chosen": -1.2070741653442383, + "logits/rejected": -1.2761895656585693, + "logps/chosen": -45.865875244140625, + "logps/rejected": -65.07974243164062, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03775513172149658, + "rewards/margins": 4.42209529876709, + "rewards/rejected": -4.459850788116455, + "step": 827 + }, + { + "epoch": 4.906666666666666, + "grad_norm": 7.946011876870162, + "learning_rate": 1.9301348139280627e-07, + "logits/chosen": -1.175967812538147, + "logits/rejected": -1.2586506605148315, + "logps/chosen": -56.91388702392578, + "logps/rejected": -59.17463684082031, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16037920117378235, + "rewards/margins": 3.1888394355773926, + "rewards/rejected": -3.0284600257873535, + "step": 828 + }, + { + "epoch": 4.912592592592593, + "grad_norm": 9.179383834460381, + "learning_rate": 1.9238115044348434e-07, + "logits/chosen": -1.1928423643112183, + "logits/rejected": -1.2084673643112183, + "logps/chosen": -67.9603500366211, + "logps/rejected": -81.65391540527344, + "loss": 0.0577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1263477802276611, + "rewards/margins": 5.509244918823242, + "rewards/rejected": -6.635592937469482, + "step": 829 + }, + { + "epoch": 4.9185185185185185, + "grad_norm": 9.264958893547135, + "learning_rate": 1.9174920854959322e-07, + "logits/chosen": -1.1068739891052246, + "logits/rejected": -1.2803871631622314, + "logps/chosen": -32.360023498535156, + "logps/rejected": -56.196678161621094, + "loss": 0.0756, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40388235449790955, + "rewards/margins": 5.12398624420166, + "rewards/rejected": -5.527868270874023, + "step": 830 + }, + { + "epoch": 4.924444444444444, + "grad_norm": 10.908563975596744, + "learning_rate": 1.9111765997814678e-07, + "logits/chosen": -1.29888117313385, + "logits/rejected": -1.3771575689315796, + "logps/chosen": -44.184505462646484, + "logps/rejected": -59.10567092895508, + "loss": 0.0962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19967499375343323, + "rewards/margins": 4.624224662780762, + "rewards/rejected": -4.424549579620361, + "step": 831 + }, + { + "epoch": 4.930370370370371, + "grad_norm": 8.456676479609067, + "learning_rate": 1.904865089935029e-07, + "logits/chosen": -1.2636172771453857, + "logits/rejected": -1.2544087171554565, + "logps/chosen": -41.338768005371094, + "logps/rejected": -61.473453521728516, + "loss": 0.0762, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11037924885749817, + "rewards/margins": 4.756615161895752, + "rewards/rejected": -4.866994857788086, + "step": 832 + }, + { + "epoch": 4.936296296296296, + "grad_norm": 6.202900054839411, + "learning_rate": 1.8985575985733507e-07, + "logits/chosen": -1.2785180807113647, + "logits/rejected": -1.217121958732605, + "logps/chosen": -48.604339599609375, + "logps/rejected": -70.71717834472656, + "loss": 0.0703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3400578498840332, + "rewards/margins": 4.592471599578857, + "rewards/rejected": -4.932529449462891, + "step": 833 + }, + { + "epoch": 4.942222222222222, + "grad_norm": 6.624373897036038, + "learning_rate": 1.8922541682860326e-07, + "logits/chosen": -1.138252854347229, + "logits/rejected": -1.2604851722717285, + "logps/chosen": -36.05076217651367, + "logps/rejected": -55.33295440673828, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23800107836723328, + "rewards/margins": 4.528337478637695, + "rewards/rejected": -4.290336608886719, + "step": 834 + }, + { + "epoch": 4.948148148148148, + "grad_norm": 5.620900100070084, + "learning_rate": 1.8859548416352536e-07, + "logits/chosen": -1.324847936630249, + "logits/rejected": -1.431929111480713, + "logps/chosen": -44.402740478515625, + "logps/rejected": -68.83207702636719, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13397502899169922, + "rewards/margins": 5.259936332702637, + "rewards/rejected": -5.1259613037109375, + "step": 835 + }, + { + "epoch": 4.954074074074074, + "grad_norm": 6.985083953063934, + "learning_rate": 1.8796596611554838e-07, + "logits/chosen": -1.3173155784606934, + "logits/rejected": -1.1883106231689453, + "logps/chosen": -49.2961311340332, + "logps/rejected": -58.56343078613281, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21595799922943115, + "rewards/margins": 4.060945510864258, + "rewards/rejected": -4.27690315246582, + "step": 836 + }, + { + "epoch": 4.96, + "grad_norm": 8.705680214772553, + "learning_rate": 1.8733686693531982e-07, + "logits/chosen": -1.1338430643081665, + "logits/rejected": -1.2737352848052979, + "logps/chosen": -44.42845153808594, + "logps/rejected": -79.81404876708984, + "loss": 0.0821, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1882634162902832, + "rewards/margins": 5.241169452667236, + "rewards/rejected": -6.429432392120361, + "step": 837 + }, + { + "epoch": 4.965925925925926, + "grad_norm": 8.370685133521045, + "learning_rate": 1.8670819087065882e-07, + "logits/chosen": -1.1332958936691284, + "logits/rejected": -1.1995506286621094, + "logps/chosen": -49.437835693359375, + "logps/rejected": -59.27742004394531, + "loss": 0.0846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6721725463867188, + "rewards/margins": 4.094503879547119, + "rewards/rejected": -4.76667594909668, + "step": 838 + }, + { + "epoch": 4.971851851851852, + "grad_norm": 4.897226742571563, + "learning_rate": 1.8607994216652756e-07, + "logits/chosen": -1.0724759101867676, + "logits/rejected": -1.1880738735198975, + "logps/chosen": -40.61245346069336, + "logps/rejected": -72.87381744384766, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5802782773971558, + "rewards/margins": 6.521996021270752, + "rewards/rejected": -7.102274417877197, + "step": 839 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 6.480414219345178, + "learning_rate": 1.8545212506500257e-07, + "logits/chosen": -1.1149649620056152, + "logits/rejected": -1.2812896966934204, + "logps/chosen": -49.6659049987793, + "logps/rejected": -59.63869094848633, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08736288547515869, + "rewards/margins": 5.593204498291016, + "rewards/rejected": -5.680567741394043, + "step": 840 + }, + { + "epoch": 4.9837037037037035, + "grad_norm": 6.1186920179539035, + "learning_rate": 1.848247438052461e-07, + "logits/chosen": -1.209456443786621, + "logits/rejected": -1.149521827697754, + "logps/chosen": -60.91209411621094, + "logps/rejected": -81.8936767578125, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6175415515899658, + "rewards/margins": 6.043704986572266, + "rewards/rejected": -6.661246299743652, + "step": 841 + }, + { + "epoch": 4.989629629629629, + "grad_norm": 9.310621643988044, + "learning_rate": 1.8419780262347754e-07, + "logits/chosen": -1.2068432569503784, + "logits/rejected": -1.233940839767456, + "logps/chosen": -50.332767486572266, + "logps/rejected": -68.87258911132812, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5439462661743164, + "rewards/margins": 5.3185715675354, + "rewards/rejected": -6.862517833709717, + "step": 842 + }, + { + "epoch": 4.995555555555556, + "grad_norm": 8.427374147972621, + "learning_rate": 1.835713057529447e-07, + "logits/chosen": -1.1492252349853516, + "logits/rejected": -1.2668813467025757, + "logps/chosen": -40.98643112182617, + "logps/rejected": -84.24874877929688, + "loss": 0.0755, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22349956631660461, + "rewards/margins": 5.515828609466553, + "rewards/rejected": -5.292328834533691, + "step": 843 + }, + { + "epoch": 5.001481481481481, + "grad_norm": 7.137776049040377, + "learning_rate": 1.8294525742389545e-07, + "logits/chosen": -1.3387656211853027, + "logits/rejected": -1.2699817419052124, + "logps/chosen": -51.69484329223633, + "logps/rejected": -54.29808807373047, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3511824607849121, + "rewards/margins": 3.246230363845825, + "rewards/rejected": -3.5974128246307373, + "step": 844 + }, + { + "epoch": 5.007407407407407, + "grad_norm": 5.743840005240809, + "learning_rate": 1.8231966186354881e-07, + "logits/chosen": -1.1772220134735107, + "logits/rejected": -1.1998919248580933, + "logps/chosen": -50.18497848510742, + "logps/rejected": -67.80682373046875, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5682530403137207, + "rewards/margins": 4.094499111175537, + "rewards/rejected": -4.662752151489258, + "step": 845 + }, + { + "epoch": 5.013333333333334, + "grad_norm": 6.795618286709746, + "learning_rate": 1.8169452329606666e-07, + "logits/chosen": -1.2579567432403564, + "logits/rejected": -1.2464933395385742, + "logps/chosen": -45.38380813598633, + "logps/rejected": -79.08517456054688, + "loss": 0.0646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8201035261154175, + "rewards/margins": 4.0954766273498535, + "rewards/rejected": -4.915580749511719, + "step": 846 + }, + { + "epoch": 5.019259259259259, + "grad_norm": 6.068510959671392, + "learning_rate": 1.810698459425254e-07, + "logits/chosen": -1.1743278503417969, + "logits/rejected": -1.2346994876861572, + "logps/chosen": -41.45006561279297, + "logps/rejected": -52.600372314453125, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1332920342683792, + "rewards/margins": 4.325839042663574, + "rewards/rejected": -4.192546844482422, + "step": 847 + }, + { + "epoch": 5.025185185185185, + "grad_norm": 6.070290501886953, + "learning_rate": 1.8044563402088682e-07, + "logits/chosen": -1.0933340787887573, + "logits/rejected": -1.0968542098999023, + "logps/chosen": -43.97517013549805, + "logps/rejected": -68.6432113647461, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2295885682106018, + "rewards/margins": 5.160776615142822, + "rewards/rejected": -5.3903656005859375, + "step": 848 + }, + { + "epoch": 5.0311111111111115, + "grad_norm": 6.383699093215017, + "learning_rate": 1.7982189174597033e-07, + "logits/chosen": -1.219502568244934, + "logits/rejected": -1.1766386032104492, + "logps/chosen": -55.12873077392578, + "logps/rejected": -68.77888488769531, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4659353494644165, + "rewards/margins": 5.554776191711426, + "rewards/rejected": -6.020711898803711, + "step": 849 + }, + { + "epoch": 5.037037037037037, + "grad_norm": 6.024482599289236, + "learning_rate": 1.7919862332942398e-07, + "logits/chosen": -1.3594261407852173, + "logits/rejected": -1.3427817821502686, + "logps/chosen": -49.3066291809082, + "logps/rejected": -58.31121826171875, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20746520161628723, + "rewards/margins": 4.219393253326416, + "rewards/rejected": -4.426857948303223, + "step": 850 + }, + { + "epoch": 5.042962962962963, + "grad_norm": 6.112060590166798, + "learning_rate": 1.785758329796963e-07, + "logits/chosen": -1.1953470706939697, + "logits/rejected": -1.2780898809432983, + "logps/chosen": -42.03157424926758, + "logps/rejected": -65.34711456298828, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37147438526153564, + "rewards/margins": 4.750897407531738, + "rewards/rejected": -5.122371673583984, + "step": 851 + }, + { + "epoch": 5.0488888888888885, + "grad_norm": 5.568282892153548, + "learning_rate": 1.779535249020078e-07, + "logits/chosen": -1.2823762893676758, + "logits/rejected": -1.4365592002868652, + "logps/chosen": -41.219505310058594, + "logps/rejected": -55.70688247680664, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6002290844917297, + "rewards/margins": 4.6371564865112305, + "rewards/rejected": -4.036927223205566, + "step": 852 + }, + { + "epoch": 5.054814814814815, + "grad_norm": 4.2104157519083625, + "learning_rate": 1.7733170329832262e-07, + "logits/chosen": -1.0824943780899048, + "logits/rejected": -1.1701000928878784, + "logps/chosen": -37.631195068359375, + "logps/rejected": -56.78057861328125, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6821134090423584, + "rewards/margins": 4.264049053192139, + "rewards/rejected": -4.946162223815918, + "step": 853 + }, + { + "epoch": 5.060740740740741, + "grad_norm": 6.659130886986735, + "learning_rate": 1.7671037236732012e-07, + "logits/chosen": -1.1604373455047607, + "logits/rejected": -1.2753492593765259, + "logps/chosen": -55.01195526123047, + "logps/rejected": -76.13126373291016, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37604719400405884, + "rewards/margins": 6.232474327087402, + "rewards/rejected": -6.608521461486816, + "step": 854 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 5.3537705230979125, + "learning_rate": 1.760895363043663e-07, + "logits/chosen": -1.1752749681472778, + "logits/rejected": -1.262036681175232, + "logps/chosen": -53.43190002441406, + "logps/rejected": -74.62548065185547, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6357789635658264, + "rewards/margins": 4.767390727996826, + "rewards/rejected": -5.403169631958008, + "step": 855 + }, + { + "epoch": 5.072592592592593, + "grad_norm": 3.8068601009017713, + "learning_rate": 1.7546919930148603e-07, + "logits/chosen": -1.0931473970413208, + "logits/rejected": -1.1400827169418335, + "logps/chosen": -75.83396911621094, + "logps/rejected": -78.3897705078125, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5985426306724548, + "rewards/margins": 4.9465861320495605, + "rewards/rejected": -5.545129299163818, + "step": 856 + }, + { + "epoch": 5.078518518518519, + "grad_norm": 6.252936627705325, + "learning_rate": 1.748493655473342e-07, + "logits/chosen": -1.373226523399353, + "logits/rejected": -1.4498156309127808, + "logps/chosen": -49.392845153808594, + "logps/rejected": -58.926536560058594, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.544809103012085, + "rewards/margins": 4.060660362243652, + "rewards/rejected": -4.605469703674316, + "step": 857 + }, + { + "epoch": 5.084444444444444, + "grad_norm": 7.446095832569485, + "learning_rate": 1.742300392271678e-07, + "logits/chosen": -1.1426042318344116, + "logits/rejected": -1.2104812860488892, + "logps/chosen": -38.60154724121094, + "logps/rejected": -66.51348876953125, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08904124796390533, + "rewards/margins": 4.445226669311523, + "rewards/rejected": -4.356184959411621, + "step": 858 + }, + { + "epoch": 5.09037037037037, + "grad_norm": 4.401887258420257, + "learning_rate": 1.7361122452281737e-07, + "logits/chosen": -1.3515548706054688, + "logits/rejected": -1.4137619733810425, + "logps/chosen": -46.939727783203125, + "logps/rejected": -59.44325256347656, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.899919867515564, + "rewards/margins": 3.3372130393981934, + "rewards/rejected": -4.237132549285889, + "step": 859 + }, + { + "epoch": 5.0962962962962965, + "grad_norm": 5.7853569559732625, + "learning_rate": 1.72992925612659e-07, + "logits/chosen": -1.2066540718078613, + "logits/rejected": -1.2842992544174194, + "logps/chosen": -47.825103759765625, + "logps/rejected": -70.65608215332031, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37702682614326477, + "rewards/margins": 5.337638854980469, + "rewards/rejected": -5.71466588973999, + "step": 860 + }, + { + "epoch": 5.102222222222222, + "grad_norm": 5.4362349224675315, + "learning_rate": 1.7237514667158596e-07, + "logits/chosen": -1.3022295236587524, + "logits/rejected": -1.3546056747436523, + "logps/chosen": -50.296424865722656, + "logps/rejected": -66.29060363769531, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1895902454853058, + "rewards/margins": 5.409600257873535, + "rewards/rejected": -5.5991902351379395, + "step": 861 + }, + { + "epoch": 5.108148148148148, + "grad_norm": 5.875751684828407, + "learning_rate": 1.7175789187098055e-07, + "logits/chosen": -1.044024109840393, + "logits/rejected": -1.1330406665802002, + "logps/chosen": -36.35478973388672, + "logps/rejected": -62.587158203125, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1854926198720932, + "rewards/margins": 5.976263046264648, + "rewards/rejected": -5.790770530700684, + "step": 862 + }, + { + "epoch": 5.114074074074074, + "grad_norm": 5.519639345673725, + "learning_rate": 1.7114116537868612e-07, + "logits/chosen": -1.2666987180709839, + "logits/rejected": -1.2495226860046387, + "logps/chosen": -46.67084503173828, + "logps/rejected": -58.346153259277344, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046361494809389114, + "rewards/margins": 4.0351457595825195, + "rewards/rejected": -3.9887845516204834, + "step": 863 + }, + { + "epoch": 5.12, + "grad_norm": 5.611856872848983, + "learning_rate": 1.705249713589786e-07, + "logits/chosen": -1.2212047576904297, + "logits/rejected": -1.3220913410186768, + "logps/chosen": -53.67855453491211, + "logps/rejected": -86.76349639892578, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5181419849395752, + "rewards/margins": 5.356942653656006, + "rewards/rejected": -5.875084400177002, + "step": 864 + }, + { + "epoch": 5.125925925925926, + "grad_norm": 5.1599714002027826, + "learning_rate": 1.699093139725386e-07, + "logits/chosen": -1.1640803813934326, + "logits/rejected": -1.254544973373413, + "logps/chosen": -58.018028259277344, + "logps/rejected": -67.485595703125, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.989666223526001, + "rewards/margins": 4.372481346130371, + "rewards/rejected": -5.362147331237793, + "step": 865 + }, + { + "epoch": 5.131851851851851, + "grad_norm": 6.291834529842612, + "learning_rate": 1.6929419737642322e-07, + "logits/chosen": -1.209142804145813, + "logits/rejected": -1.2099839448928833, + "logps/chosen": -47.084754943847656, + "logps/rejected": -69.58710479736328, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44162529706954956, + "rewards/margins": 5.258059501647949, + "rewards/rejected": -5.699685096740723, + "step": 866 + }, + { + "epoch": 5.137777777777778, + "grad_norm": 4.29389402322372, + "learning_rate": 1.686796257240381e-07, + "logits/chosen": -1.2211939096450806, + "logits/rejected": -1.2630951404571533, + "logps/chosen": -43.643333435058594, + "logps/rejected": -70.00041961669922, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21772587299346924, + "rewards/margins": 4.932277679443359, + "rewards/rejected": -4.7145514488220215, + "step": 867 + }, + { + "epoch": 5.143703703703704, + "grad_norm": 6.70678025622613, + "learning_rate": 1.680656031651093e-07, + "logits/chosen": -1.1878728866577148, + "logits/rejected": -1.1749542951583862, + "logps/chosen": -51.935516357421875, + "logps/rejected": -71.19942474365234, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9229670166969299, + "rewards/margins": 4.8038554191589355, + "rewards/rejected": -5.726822376251221, + "step": 868 + }, + { + "epoch": 5.149629629629629, + "grad_norm": 4.358134041783962, + "learning_rate": 1.6745213384565516e-07, + "logits/chosen": -1.379859447479248, + "logits/rejected": -1.3122217655181885, + "logps/chosen": -52.930946350097656, + "logps/rejected": -81.54460144042969, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.141561508178711, + "rewards/margins": 5.625734806060791, + "rewards/rejected": -6.767295837402344, + "step": 869 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 3.763065225153427, + "learning_rate": 1.6683922190795845e-07, + "logits/chosen": -1.0610485076904297, + "logits/rejected": -1.186012864112854, + "logps/chosen": -41.94168472290039, + "logps/rejected": -75.10829162597656, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8855224847793579, + "rewards/margins": 6.992925643920898, + "rewards/rejected": -7.878448009490967, + "step": 870 + }, + { + "epoch": 5.161481481481482, + "grad_norm": 6.137238349957374, + "learning_rate": 1.6622687149053844e-07, + "logits/chosen": -1.1331788301467896, + "logits/rejected": -1.2375328540802002, + "logps/chosen": -51.34503173828125, + "logps/rejected": -75.72793579101562, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1209805011749268, + "rewards/margins": 4.683239936828613, + "rewards/rejected": -5.804220676422119, + "step": 871 + }, + { + "epoch": 5.167407407407407, + "grad_norm": 5.621964866813937, + "learning_rate": 1.6561508672812295e-07, + "logits/chosen": -1.2639954090118408, + "logits/rejected": -1.3647345304489136, + "logps/chosen": -56.51701354980469, + "logps/rejected": -70.59258270263672, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37653955817222595, + "rewards/margins": 3.593329906463623, + "rewards/rejected": -3.9698691368103027, + "step": 872 + }, + { + "epoch": 5.173333333333334, + "grad_norm": 9.958380211596328, + "learning_rate": 1.650038717516203e-07, + "logits/chosen": -1.298003077507019, + "logits/rejected": -1.276810646057129, + "logps/chosen": -60.997528076171875, + "logps/rejected": -57.12252426147461, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4214093685150146, + "rewards/margins": 3.4771430492401123, + "rewards/rejected": -4.898552417755127, + "step": 873 + }, + { + "epoch": 5.1792592592592595, + "grad_norm": 6.568096407056915, + "learning_rate": 1.6439323068809137e-07, + "logits/chosen": -1.2642948627471924, + "logits/rejected": -1.278181791305542, + "logps/chosen": -49.35980987548828, + "logps/rejected": -68.7181625366211, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15632173418998718, + "rewards/margins": 4.260412216186523, + "rewards/rejected": -4.104090690612793, + "step": 874 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 5.324980729784364, + "learning_rate": 1.6378316766072196e-07, + "logits/chosen": -1.0710474252700806, + "logits/rejected": -1.2427352666854858, + "logps/chosen": -52.37470626831055, + "logps/rejected": -72.27552795410156, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8807699680328369, + "rewards/margins": 5.8853373527526855, + "rewards/rejected": -6.766107082366943, + "step": 875 + }, + { + "epoch": 5.191111111111111, + "grad_norm": 7.891108260191516, + "learning_rate": 1.6317368678879496e-07, + "logits/chosen": -1.2832369804382324, + "logits/rejected": -1.3103740215301514, + "logps/chosen": -45.93217468261719, + "logps/rejected": -55.650115966796875, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7958899736404419, + "rewards/margins": 4.188290596008301, + "rewards/rejected": -4.984180450439453, + "step": 876 + }, + { + "epoch": 5.197037037037037, + "grad_norm": 6.184649944460298, + "learning_rate": 1.6256479218766212e-07, + "logits/chosen": -1.2717363834381104, + "logits/rejected": -1.3072198629379272, + "logps/chosen": -55.998291015625, + "logps/rejected": -85.4422607421875, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28671860694885254, + "rewards/margins": 5.818790912628174, + "rewards/rejected": -6.105508804321289, + "step": 877 + }, + { + "epoch": 5.202962962962963, + "grad_norm": 5.348342562947095, + "learning_rate": 1.6195648796871687e-07, + "logits/chosen": -1.004050374031067, + "logits/rejected": -1.1158876419067383, + "logps/chosen": -36.67620849609375, + "logps/rejected": -66.01150512695312, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5501542091369629, + "rewards/margins": 6.020908355712891, + "rewards/rejected": -6.571062088012695, + "step": 878 + }, + { + "epoch": 5.208888888888889, + "grad_norm": 5.533806185221065, + "learning_rate": 1.6134877823936607e-07, + "logits/chosen": -1.1981170177459717, + "logits/rejected": -1.2208610773086548, + "logps/chosen": -58.91746520996094, + "logps/rejected": -80.06623840332031, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9180342555046082, + "rewards/margins": 4.734535217285156, + "rewards/rejected": -5.652569770812988, + "step": 879 + }, + { + "epoch": 5.214814814814815, + "grad_norm": 4.919679414691683, + "learning_rate": 1.6074166710300247e-07, + "logits/chosen": -0.9741761684417725, + "logits/rejected": -1.0934596061706543, + "logps/chosen": -53.64191818237305, + "logps/rejected": -64.29776000976562, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5024899840354919, + "rewards/margins": 5.4552178382873535, + "rewards/rejected": -5.957707405090332, + "step": 880 + }, + { + "epoch": 5.220740740740741, + "grad_norm": 5.883827857125823, + "learning_rate": 1.60135158658977e-07, + "logits/chosen": -1.2886649370193481, + "logits/rejected": -1.3470699787139893, + "logps/chosen": -64.49275207519531, + "logps/rejected": -99.09591674804688, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.056515693664551, + "rewards/margins": 5.756255626678467, + "rewards/rejected": -7.812771320343018, + "step": 881 + }, + { + "epoch": 5.226666666666667, + "grad_norm": 5.4563657240893715, + "learning_rate": 1.5952925700257115e-07, + "logits/chosen": -1.195138931274414, + "logits/rejected": -1.2933731079101562, + "logps/chosen": -47.062171936035156, + "logps/rejected": -64.63616943359375, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8991705179214478, + "rewards/margins": 4.7366509437561035, + "rewards/rejected": -5.635821342468262, + "step": 882 + }, + { + "epoch": 5.232592592592592, + "grad_norm": 7.636546778330124, + "learning_rate": 1.5892396622496905e-07, + "logits/chosen": -1.0840702056884766, + "logits/rejected": -1.0799250602722168, + "logps/chosen": -64.66129302978516, + "logps/rejected": -89.11322021484375, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2371934652328491, + "rewards/margins": 6.809388637542725, + "rewards/rejected": -8.046582221984863, + "step": 883 + }, + { + "epoch": 5.238518518518519, + "grad_norm": 4.897765269920654, + "learning_rate": 1.5831929041323023e-07, + "logits/chosen": -1.311830997467041, + "logits/rejected": -1.3064095973968506, + "logps/chosen": -60.29758834838867, + "logps/rejected": -73.54698181152344, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9659146070480347, + "rewards/margins": 5.142203330993652, + "rewards/rejected": -6.108118057250977, + "step": 884 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 5.19071852044687, + "learning_rate": 1.5771523365026175e-07, + "logits/chosen": -0.986069917678833, + "logits/rejected": -1.048060655593872, + "logps/chosen": -36.13816452026367, + "logps/rejected": -66.58907318115234, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23351190984249115, + "rewards/margins": 4.650994777679443, + "rewards/rejected": -4.8845062255859375, + "step": 885 + }, + { + "epoch": 5.25037037037037, + "grad_norm": 5.460247088889271, + "learning_rate": 1.5711180001479068e-07, + "logits/chosen": -1.0913227796554565, + "logits/rejected": -1.1225485801696777, + "logps/chosen": -36.94963073730469, + "logps/rejected": -59.24100112915039, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1481831967830658, + "rewards/margins": 3.815484046936035, + "rewards/rejected": -3.963667392730713, + "step": 886 + }, + { + "epoch": 5.256296296296297, + "grad_norm": 4.980602595420538, + "learning_rate": 1.5650899358133667e-07, + "logits/chosen": -1.2710387706756592, + "logits/rejected": -1.3214951753616333, + "logps/chosen": -56.805091857910156, + "logps/rejected": -69.06848907470703, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6180689930915833, + "rewards/margins": 4.681292533874512, + "rewards/rejected": -5.2993621826171875, + "step": 887 + }, + { + "epoch": 5.262222222222222, + "grad_norm": 5.992859843422505, + "learning_rate": 1.5590681842018443e-07, + "logits/chosen": -1.303534984588623, + "logits/rejected": -1.3712739944458008, + "logps/chosen": -66.29510498046875, + "logps/rejected": -75.78311157226562, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9916107654571533, + "rewards/margins": 5.067993640899658, + "rewards/rejected": -6.059604644775391, + "step": 888 + }, + { + "epoch": 5.268148148148148, + "grad_norm": 4.327858487400165, + "learning_rate": 1.5530527859735599e-07, + "logits/chosen": -1.2532622814178467, + "logits/rejected": -1.4161229133605957, + "logps/chosen": -47.307518005371094, + "logps/rejected": -74.99484252929688, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8368775844573975, + "rewards/margins": 6.0908203125, + "rewards/rejected": -6.92769718170166, + "step": 889 + }, + { + "epoch": 5.274074074074074, + "grad_norm": 5.2410704858507104, + "learning_rate": 1.5470437817458355e-07, + "logits/chosen": -1.3019599914550781, + "logits/rejected": -1.407655954360962, + "logps/chosen": -48.766719818115234, + "logps/rejected": -78.3816909790039, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5846667289733887, + "rewards/margins": 4.5074005126953125, + "rewards/rejected": -5.092066764831543, + "step": 890 + }, + { + "epoch": 5.28, + "grad_norm": 6.530618142623513, + "learning_rate": 1.5410412120928186e-07, + "logits/chosen": -1.243187665939331, + "logits/rejected": -1.2798230648040771, + "logps/chosen": -62.167999267578125, + "logps/rejected": -79.63477325439453, + "loss": 0.0622, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1904207468032837, + "rewards/margins": 5.07986307144165, + "rewards/rejected": -6.2702836990356445, + "step": 891 + }, + { + "epoch": 5.285925925925926, + "grad_norm": 4.48460068251217, + "learning_rate": 1.53504511754521e-07, + "logits/chosen": -1.139565110206604, + "logits/rejected": -1.1655480861663818, + "logps/chosen": -53.463279724121094, + "logps/rejected": -77.63116455078125, + "loss": 0.0299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6475785970687866, + "rewards/margins": 5.736711025238037, + "rewards/rejected": -6.384289741516113, + "step": 892 + }, + { + "epoch": 5.291851851851852, + "grad_norm": 6.637625995278517, + "learning_rate": 1.5290555385899877e-07, + "logits/chosen": -1.0791090726852417, + "logits/rejected": -1.119141936302185, + "logps/chosen": -52.37824249267578, + "logps/rejected": -72.08952331542969, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1510322093963623, + "rewards/margins": 4.51447057723999, + "rewards/rejected": -5.66550350189209, + "step": 893 + }, + { + "epoch": 5.297777777777778, + "grad_norm": 4.306211675643785, + "learning_rate": 1.5230725156701373e-07, + "logits/chosen": -1.258397102355957, + "logits/rejected": -1.2849924564361572, + "logps/chosen": -54.06769561767578, + "logps/rejected": -77.40863037109375, + "loss": 0.0298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4762367904186249, + "rewards/margins": 5.966217041015625, + "rewards/rejected": -6.4424543380737305, + "step": 894 + }, + { + "epoch": 5.303703703703704, + "grad_norm": 4.847603431390897, + "learning_rate": 1.517096089184375e-07, + "logits/chosen": -1.2337533235549927, + "logits/rejected": -1.267514705657959, + "logps/chosen": -65.87496948242188, + "logps/rejected": -65.5901870727539, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30669963359832764, + "rewards/margins": 4.703336715698242, + "rewards/rejected": -5.010036468505859, + "step": 895 + }, + { + "epoch": 5.3096296296296295, + "grad_norm": 5.370548257891294, + "learning_rate": 1.5111262994868756e-07, + "logits/chosen": -1.127760648727417, + "logits/rejected": -1.247991919517517, + "logps/chosen": -47.134883880615234, + "logps/rejected": -64.86996459960938, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2545168399810791, + "rewards/margins": 4.356789588928223, + "rewards/rejected": -4.1022725105285645, + "step": 896 + }, + { + "epoch": 5.315555555555555, + "grad_norm": 5.683501867833103, + "learning_rate": 1.5051631868870019e-07, + "logits/chosen": -1.057781457901001, + "logits/rejected": -1.1627904176712036, + "logps/chosen": -47.290008544921875, + "logps/rejected": -76.31295776367188, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1723828315734863, + "rewards/margins": 5.7342023849487305, + "rewards/rejected": -6.906585693359375, + "step": 897 + }, + { + "epoch": 5.321481481481482, + "grad_norm": 4.515876065020696, + "learning_rate": 1.499206791649032e-07, + "logits/chosen": -1.2790859937667847, + "logits/rejected": -1.1949933767318726, + "logps/chosen": -51.798484802246094, + "logps/rejected": -69.213134765625, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10272088646888733, + "rewards/margins": 6.101094722747803, + "rewards/rejected": -6.203815937042236, + "step": 898 + }, + { + "epoch": 5.327407407407407, + "grad_norm": 7.152221030044184, + "learning_rate": 1.4932571539918854e-07, + "logits/chosen": -1.2066757678985596, + "logits/rejected": -1.2475630044937134, + "logps/chosen": -59.380523681640625, + "logps/rejected": -74.11068725585938, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9633429050445557, + "rewards/margins": 4.7660136222839355, + "rewards/rejected": -5.72935676574707, + "step": 899 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 3.7672921680241673, + "learning_rate": 1.4873143140888537e-07, + "logits/chosen": -1.1228379011154175, + "logits/rejected": -1.1561514139175415, + "logps/chosen": -54.204586029052734, + "logps/rejected": -87.96161651611328, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6686967611312866, + "rewards/margins": 6.6950249671936035, + "rewards/rejected": -8.36372184753418, + "step": 900 + }, + { + "epoch": 5.33925925925926, + "grad_norm": 6.914564087559101, + "learning_rate": 1.481378312067329e-07, + "logits/chosen": -1.0900871753692627, + "logits/rejected": -1.0513169765472412, + "logps/chosen": -55.293128967285156, + "logps/rejected": -67.97708892822266, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9693081974983215, + "rewards/margins": 5.633920669555664, + "rewards/rejected": -6.60322904586792, + "step": 901 + }, + { + "epoch": 5.345185185185185, + "grad_norm": 4.180855150429028, + "learning_rate": 1.4754491880085317e-07, + "logits/chosen": -1.1942942142486572, + "logits/rejected": -1.267154574394226, + "logps/chosen": -49.460086822509766, + "logps/rejected": -82.92127990722656, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0170201063156128, + "rewards/margins": 5.091777324676514, + "rewards/rejected": -6.108797073364258, + "step": 902 + }, + { + "epoch": 5.351111111111111, + "grad_norm": 5.9840055586095415, + "learning_rate": 1.4695269819472403e-07, + "logits/chosen": -1.1321901082992554, + "logits/rejected": -1.2262929677963257, + "logps/chosen": -64.43628692626953, + "logps/rejected": -62.02772521972656, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8415505290031433, + "rewards/margins": 4.008552551269531, + "rewards/rejected": -4.850103378295898, + "step": 903 + }, + { + "epoch": 5.357037037037037, + "grad_norm": 5.661485291376627, + "learning_rate": 1.463611733871523e-07, + "logits/chosen": -1.2824634313583374, + "logits/rejected": -1.416248083114624, + "logps/chosen": -53.39885711669922, + "logps/rejected": -84.84522247314453, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37042808532714844, + "rewards/margins": 4.654745101928711, + "rewards/rejected": -5.025173664093018, + "step": 904 + }, + { + "epoch": 5.362962962962963, + "grad_norm": 5.2077909388446875, + "learning_rate": 1.457703483722466e-07, + "logits/chosen": -1.1724047660827637, + "logits/rejected": -1.262000560760498, + "logps/chosen": -46.26305389404297, + "logps/rejected": -69.90074920654297, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14402145147323608, + "rewards/margins": 5.816695213317871, + "rewards/rejected": -5.672673225402832, + "step": 905 + }, + { + "epoch": 5.368888888888889, + "grad_norm": 4.801632413673207, + "learning_rate": 1.4518022713938998e-07, + "logits/chosen": -1.118366003036499, + "logits/rejected": -1.2085130214691162, + "logps/chosen": -48.222755432128906, + "logps/rejected": -72.53038024902344, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36291417479515076, + "rewards/margins": 4.7783098220825195, + "rewards/rejected": -5.141223430633545, + "step": 906 + }, + { + "epoch": 5.3748148148148145, + "grad_norm": 5.6157264725803975, + "learning_rate": 1.4459081367321407e-07, + "logits/chosen": -1.1638175249099731, + "logits/rejected": -1.247123122215271, + "logps/chosen": -44.200462341308594, + "logps/rejected": -58.13221740722656, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0433942079544067, + "rewards/margins": 4.555025100708008, + "rewards/rejected": -5.598419189453125, + "step": 907 + }, + { + "epoch": 5.380740740740741, + "grad_norm": 6.531962348628216, + "learning_rate": 1.4400211195357103e-07, + "logits/chosen": -1.04237961769104, + "logits/rejected": -1.0963478088378906, + "logps/chosen": -61.242576599121094, + "logps/rejected": -68.6133804321289, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6439560651779175, + "rewards/margins": 5.466358184814453, + "rewards/rejected": -7.11031436920166, + "step": 908 + }, + { + "epoch": 5.386666666666667, + "grad_norm": 5.540692876499104, + "learning_rate": 1.4341412595550724e-07, + "logits/chosen": -1.211525321006775, + "logits/rejected": -1.202690839767456, + "logps/chosen": -40.24979782104492, + "logps/rejected": -71.52658081054688, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8724290728569031, + "rewards/margins": 6.074644088745117, + "rewards/rejected": -6.947073459625244, + "step": 909 + }, + { + "epoch": 5.392592592592592, + "grad_norm": 5.924738983593196, + "learning_rate": 1.428268596492364e-07, + "logits/chosen": -1.0326358079910278, + "logits/rejected": -1.1596226692199707, + "logps/chosen": -40.392730712890625, + "logps/rejected": -66.74359130859375, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2823147177696228, + "rewards/margins": 5.335864543914795, + "rewards/rejected": -5.6181793212890625, + "step": 910 + }, + { + "epoch": 5.398518518518518, + "grad_norm": 5.614965106603697, + "learning_rate": 1.4224031700011286e-07, + "logits/chosen": -1.1185742616653442, + "logits/rejected": -1.1819273233413696, + "logps/chosen": -45.5130500793457, + "logps/rejected": -71.71981048583984, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.864221453666687, + "rewards/margins": 6.3592610359191895, + "rewards/rejected": -7.223482131958008, + "step": 911 + }, + { + "epoch": 5.404444444444445, + "grad_norm": 6.016423389484806, + "learning_rate": 1.416545019686042e-07, + "logits/chosen": -1.144727110862732, + "logits/rejected": -1.3016068935394287, + "logps/chosen": -50.52497482299805, + "logps/rejected": -76.85518646240234, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2942097187042236, + "rewards/margins": 5.873856544494629, + "rewards/rejected": -7.168066024780273, + "step": 912 + }, + { + "epoch": 5.41037037037037, + "grad_norm": 4.326351374076242, + "learning_rate": 1.4106941851026544e-07, + "logits/chosen": -1.0008817911148071, + "logits/rejected": -1.0063378810882568, + "logps/chosen": -52.42710876464844, + "logps/rejected": -80.53175354003906, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7073206901550293, + "rewards/margins": 6.483597278594971, + "rewards/rejected": -8.19091796875, + "step": 913 + }, + { + "epoch": 5.416296296296296, + "grad_norm": 6.052332153532867, + "learning_rate": 1.4048507057571164e-07, + "logits/chosen": -1.036699891090393, + "logits/rejected": -0.999764084815979, + "logps/chosen": -53.56409454345703, + "logps/rejected": -65.7117919921875, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8550149202346802, + "rewards/margins": 5.08674430847168, + "rewards/rejected": -5.9417595863342285, + "step": 914 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 5.180494758421336, + "learning_rate": 1.3990146211059139e-07, + "logits/chosen": -1.1877224445343018, + "logits/rejected": -1.303817868232727, + "logps/chosen": -53.484046936035156, + "logps/rejected": -61.8600959777832, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9356443881988525, + "rewards/margins": 4.698094367980957, + "rewards/rejected": -5.633738994598389, + "step": 915 + }, + { + "epoch": 5.428148148148148, + "grad_norm": 5.380172307220296, + "learning_rate": 1.3931859705556052e-07, + "logits/chosen": -1.2037510871887207, + "logits/rejected": -1.2069494724273682, + "logps/chosen": -43.905364990234375, + "logps/rejected": -61.135196685791016, + "loss": 0.0552, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2278020977973938, + "rewards/margins": 4.547643661499023, + "rewards/rejected": -4.3198418617248535, + "step": 916 + }, + { + "epoch": 5.434074074074074, + "grad_norm": 5.229207365948886, + "learning_rate": 1.387364793462548e-07, + "logits/chosen": -1.1884421110153198, + "logits/rejected": -1.128300428390503, + "logps/chosen": -53.02272415161133, + "logps/rejected": -80.40774536132812, + "loss": 0.044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2592935562133789, + "rewards/margins": 4.70393705368042, + "rewards/rejected": -4.963230609893799, + "step": 917 + }, + { + "epoch": 5.44, + "grad_norm": 5.447576108491131, + "learning_rate": 1.38155112913264e-07, + "logits/chosen": -1.1345170736312866, + "logits/rejected": -1.139125108718872, + "logps/chosen": -46.506019592285156, + "logps/rejected": -63.89093017578125, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2819477915763855, + "rewards/margins": 4.921759128570557, + "rewards/rejected": -5.203706741333008, + "step": 918 + }, + { + "epoch": 5.445925925925926, + "grad_norm": 6.124448141336304, + "learning_rate": 1.37574501682105e-07, + "logits/chosen": -1.2564316987991333, + "logits/rejected": -1.2532209157943726, + "logps/chosen": -52.18721008300781, + "logps/rejected": -81.69747924804688, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3770983219146729, + "rewards/margins": 6.158283233642578, + "rewards/rejected": -7.535381317138672, + "step": 919 + }, + { + "epoch": 5.451851851851852, + "grad_norm": 7.410616675601531, + "learning_rate": 1.369946495731954e-07, + "logits/chosen": -1.2974275350570679, + "logits/rejected": -1.3738926649093628, + "logps/chosen": -54.73295974731445, + "logps/rejected": -65.42332458496094, + "loss": 0.0601, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8449416756629944, + "rewards/margins": 4.6002397537231445, + "rewards/rejected": -5.445181369781494, + "step": 920 + }, + { + "epoch": 5.457777777777777, + "grad_norm": 6.234144569472265, + "learning_rate": 1.3641556050182707e-07, + "logits/chosen": -1.0465418100357056, + "logits/rejected": -1.1629109382629395, + "logps/chosen": -39.9510498046875, + "logps/rejected": -63.465293884277344, + "loss": 0.0668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08770129829645157, + "rewards/margins": 4.9500956535339355, + "rewards/rejected": -5.037796974182129, + "step": 921 + }, + { + "epoch": 5.463703703703704, + "grad_norm": 6.996076426450947, + "learning_rate": 1.3583723837813964e-07, + "logits/chosen": -1.2552739381790161, + "logits/rejected": -1.2882190942764282, + "logps/chosen": -56.15148162841797, + "logps/rejected": -80.41363525390625, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3065168857574463, + "rewards/margins": 6.144155502319336, + "rewards/rejected": -7.450672626495361, + "step": 922 + }, + { + "epoch": 5.46962962962963, + "grad_norm": 4.924640654272727, + "learning_rate": 1.3525968710709415e-07, + "logits/chosen": -1.302824854850769, + "logits/rejected": -1.3598288297653198, + "logps/chosen": -48.679534912109375, + "logps/rejected": -71.34368896484375, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7547460794448853, + "rewards/margins": 5.851616859436035, + "rewards/rejected": -6.606363296508789, + "step": 923 + }, + { + "epoch": 5.475555555555555, + "grad_norm": 5.2437105146313385, + "learning_rate": 1.346829105884467e-07, + "logits/chosen": -1.0821112394332886, + "logits/rejected": -1.092231273651123, + "logps/chosen": -45.19500732421875, + "logps/rejected": -68.84188079833984, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9143983125686646, + "rewards/margins": 5.41214656829834, + "rewards/rejected": -6.326544761657715, + "step": 924 + }, + { + "epoch": 5.481481481481482, + "grad_norm": 5.688009304833448, + "learning_rate": 1.3410691271672206e-07, + "logits/chosen": -1.2743799686431885, + "logits/rejected": -1.279219627380371, + "logps/chosen": -46.0670051574707, + "logps/rejected": -64.08885192871094, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5631405115127563, + "rewards/margins": 5.1333723068237305, + "rewards/rejected": -5.6965131759643555, + "step": 925 + }, + { + "epoch": 5.4874074074074075, + "grad_norm": 8.374452505500786, + "learning_rate": 1.335316973811874e-07, + "logits/chosen": -1.0590918064117432, + "logits/rejected": -1.244085431098938, + "logps/chosen": -42.937896728515625, + "logps/rejected": -62.34235763549805, + "loss": 0.0643, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.014899902045726776, + "rewards/margins": 4.596308708190918, + "rewards/rejected": -4.611208438873291, + "step": 926 + }, + { + "epoch": 5.493333333333333, + "grad_norm": 4.931540791985558, + "learning_rate": 1.32957268465826e-07, + "logits/chosen": -1.1672147512435913, + "logits/rejected": -1.2030147314071655, + "logps/chosen": -51.3866081237793, + "logps/rejected": -70.0325927734375, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1257150173187256, + "rewards/margins": 5.762474060058594, + "rewards/rejected": -6.88818883895874, + "step": 927 + }, + { + "epoch": 5.499259259259259, + "grad_norm": 3.7190108460368387, + "learning_rate": 1.3238362984931113e-07, + "logits/chosen": -1.1120619773864746, + "logits/rejected": -1.2704691886901855, + "logps/chosen": -40.81899642944336, + "logps/rejected": -76.57392883300781, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1256623268127441, + "rewards/margins": 6.532195568084717, + "rewards/rejected": -7.657857894897461, + "step": 928 + }, + { + "epoch": 5.505185185185185, + "grad_norm": 4.451300821349891, + "learning_rate": 1.318107854049797e-07, + "logits/chosen": -1.044045329093933, + "logits/rejected": -1.211983323097229, + "logps/chosen": -43.968040466308594, + "logps/rejected": -58.34918212890625, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7015755772590637, + "rewards/margins": 5.835867881774902, + "rewards/rejected": -6.537443161010742, + "step": 929 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 6.226765209722212, + "learning_rate": 1.3123873900080628e-07, + "logits/chosen": -1.0990855693817139, + "logits/rejected": -1.1984244585037231, + "logps/chosen": -44.29381561279297, + "logps/rejected": -56.79447555541992, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9139587879180908, + "rewards/margins": 4.626317977905273, + "rewards/rejected": -5.540277004241943, + "step": 930 + }, + { + "epoch": 5.517037037037037, + "grad_norm": 5.35157163971713, + "learning_rate": 1.306674944993768e-07, + "logits/chosen": -1.1949620246887207, + "logits/rejected": -1.2123682498931885, + "logps/chosen": -47.544219970703125, + "logps/rejected": -73.42807006835938, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5051659941673279, + "rewards/margins": 6.832710266113281, + "rewards/rejected": -7.337876796722412, + "step": 931 + }, + { + "epoch": 5.522962962962963, + "grad_norm": 4.7362735283631165, + "learning_rate": 1.3009705575786268e-07, + "logits/chosen": -1.134218692779541, + "logits/rejected": -1.2343950271606445, + "logps/chosen": -43.53325653076172, + "logps/rejected": -59.55531311035156, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.904568612575531, + "rewards/margins": 4.297881603240967, + "rewards/rejected": -5.202449798583984, + "step": 932 + }, + { + "epoch": 5.528888888888889, + "grad_norm": 4.89046550004791, + "learning_rate": 1.295274266279945e-07, + "logits/chosen": -1.239561676979065, + "logits/rejected": -1.3123736381530762, + "logps/chosen": -54.36166000366211, + "logps/rejected": -72.64697265625, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6119088530540466, + "rewards/margins": 6.145787715911865, + "rewards/rejected": -6.757696628570557, + "step": 933 + }, + { + "epoch": 5.534814814814815, + "grad_norm": 4.619990490602159, + "learning_rate": 1.2895861095603632e-07, + "logits/chosen": -1.1459276676177979, + "logits/rejected": -1.1653519868850708, + "logps/chosen": -35.19683074951172, + "logps/rejected": -53.685089111328125, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3016229569911957, + "rewards/margins": 4.380598545074463, + "rewards/rejected": -4.078975677490234, + "step": 934 + }, + { + "epoch": 5.540740740740741, + "grad_norm": 4.993312240195513, + "learning_rate": 1.2839061258275946e-07, + "logits/chosen": -1.1333314180374146, + "logits/rejected": -1.2495033740997314, + "logps/chosen": -41.614891052246094, + "logps/rejected": -62.70551300048828, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1989545226097107, + "rewards/margins": 5.417413711547852, + "rewards/rejected": -5.616368293762207, + "step": 935 + }, + { + "epoch": 5.546666666666667, + "grad_norm": 4.271588493947499, + "learning_rate": 1.2782343534341665e-07, + "logits/chosen": -1.2903553247451782, + "logits/rejected": -1.2682414054870605, + "logps/chosen": -50.89227294921875, + "logps/rejected": -61.375526428222656, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.728342592716217, + "rewards/margins": 4.288450241088867, + "rewards/rejected": -5.0167927742004395, + "step": 936 + }, + { + "epoch": 5.5525925925925925, + "grad_norm": 7.292093111520975, + "learning_rate": 1.2725708306771618e-07, + "logits/chosen": -1.1426489353179932, + "logits/rejected": -1.0802648067474365, + "logps/chosen": -47.69855880737305, + "logps/rejected": -62.599212646484375, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5614185333251953, + "rewards/margins": 4.699395179748535, + "rewards/rejected": -5.2608137130737305, + "step": 937 + }, + { + "epoch": 5.558518518518518, + "grad_norm": 6.908638237299102, + "learning_rate": 1.266915595797961e-07, + "logits/chosen": -1.126725196838379, + "logits/rejected": -1.1588492393493652, + "logps/chosen": -43.83253479003906, + "logps/rejected": -71.14010620117188, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5733367800712585, + "rewards/margins": 5.2426252365112305, + "rewards/rejected": -5.815961837768555, + "step": 938 + }, + { + "epoch": 5.564444444444445, + "grad_norm": 4.629799808254051, + "learning_rate": 1.2612686869819817e-07, + "logits/chosen": -1.2065681219100952, + "logits/rejected": -1.3164688348770142, + "logps/chosen": -38.721435546875, + "logps/rejected": -68.58113098144531, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1604537069797516, + "rewards/margins": 6.0843706130981445, + "rewards/rejected": -6.244824409484863, + "step": 939 + }, + { + "epoch": 5.57037037037037, + "grad_norm": 4.522577258438449, + "learning_rate": 1.2556301423584208e-07, + "logits/chosen": -1.2485311031341553, + "logits/rejected": -1.2570372819900513, + "logps/chosen": -54.931739807128906, + "logps/rejected": -79.12727355957031, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7012864351272583, + "rewards/margins": 4.642655849456787, + "rewards/rejected": -5.343942642211914, + "step": 940 + }, + { + "epoch": 5.576296296296296, + "grad_norm": 4.813395455443031, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -1.2301363945007324, + "logits/rejected": -1.2636702060699463, + "logps/chosen": -44.12531280517578, + "logps/rejected": -66.81246948242188, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8484854698181152, + "rewards/margins": 4.948280334472656, + "rewards/rejected": -5.7967658042907715, + "step": 941 + }, + { + "epoch": 5.582222222222223, + "grad_norm": 5.798154822473784, + "learning_rate": 1.2443782979227082e-07, + "logits/chosen": -1.2446839809417725, + "logits/rejected": -1.344369888305664, + "logps/chosen": -39.720298767089844, + "logps/rejected": -61.465423583984375, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.929324746131897, + "rewards/margins": 5.307143211364746, + "rewards/rejected": -6.2364678382873535, + "step": 942 + }, + { + "epoch": 5.588148148148148, + "grad_norm": 3.2413652931315067, + "learning_rate": 1.2387650740855406e-07, + "logits/chosen": -1.2844980955123901, + "logits/rejected": -1.3366330862045288, + "logps/chosen": -47.887725830078125, + "logps/rejected": -60.49043273925781, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8668982982635498, + "rewards/margins": 4.607653617858887, + "rewards/rejected": -5.474552154541016, + "step": 943 + }, + { + "epoch": 5.594074074074074, + "grad_norm": 4.6399365768250425, + "learning_rate": 1.2331603663902475e-07, + "logits/chosen": -1.1947689056396484, + "logits/rejected": -1.1636683940887451, + "logps/chosen": -58.632686614990234, + "logps/rejected": -75.03528594970703, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8372504711151123, + "rewards/margins": 6.475705146789551, + "rewards/rejected": -7.312955379486084, + "step": 944 + }, + { + "epoch": 5.6, + "grad_norm": 5.71549500582046, + "learning_rate": 1.2275642126810762e-07, + "logits/chosen": -1.1459275484085083, + "logits/rejected": -1.1692252159118652, + "logps/chosen": -46.13181686401367, + "logps/rejected": -55.8668212890625, + "loss": 0.0577, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7316948771476746, + "rewards/margins": 3.8514623641967773, + "rewards/rejected": -4.583157539367676, + "step": 945 + }, + { + "epoch": 5.605925925925926, + "grad_norm": 8.27586438727282, + "learning_rate": 1.2219766507445144e-07, + "logits/chosen": -1.1214922666549683, + "logits/rejected": -1.1562455892562866, + "logps/chosen": -47.02653884887695, + "logps/rejected": -87.65753173828125, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2972184419631958, + "rewards/margins": 7.0828657150268555, + "rewards/rejected": -7.38008451461792, + "step": 946 + }, + { + "epoch": 5.611851851851852, + "grad_norm": 6.271959445561895, + "learning_rate": 1.2163977183090368e-07, + "logits/chosen": -1.10219144821167, + "logits/rejected": -1.108323097229004, + "logps/chosen": -49.387237548828125, + "logps/rejected": -77.21356201171875, + "loss": 0.0674, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7970916628837585, + "rewards/margins": 6.1644792556762695, + "rewards/rejected": -6.961570739746094, + "step": 947 + }, + { + "epoch": 5.6177777777777775, + "grad_norm": 7.1120046006269035, + "learning_rate": 1.210827453044851e-07, + "logits/chosen": -1.2681684494018555, + "logits/rejected": -1.2564421892166138, + "logps/chosen": -50.25490951538086, + "logps/rejected": -75.11100769042969, + "loss": 0.0784, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9637770056724548, + "rewards/margins": 4.389342308044434, + "rewards/rejected": -5.353118896484375, + "step": 948 + }, + { + "epoch": 5.623703703703704, + "grad_norm": 5.399880404079545, + "learning_rate": 1.2052658925636405e-07, + "logits/chosen": -1.181290864944458, + "logits/rejected": -1.2599127292633057, + "logps/chosen": -40.77113342285156, + "logps/rejected": -52.629150390625, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0298051834106445, + "rewards/margins": 3.4283018112182617, + "rewards/rejected": -4.458106994628906, + "step": 949 + }, + { + "epoch": 5.62962962962963, + "grad_norm": 6.411458192010983, + "learning_rate": 1.1997130744183124e-07, + "logits/chosen": -1.130419373512268, + "logits/rejected": -1.2022407054901123, + "logps/chosen": -71.73152160644531, + "logps/rejected": -93.64906311035156, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6727749109268188, + "rewards/margins": 6.842751502990723, + "rewards/rejected": -8.515525817871094, + "step": 950 + }, + { + "epoch": 5.635555555555555, + "grad_norm": 5.677902192188492, + "learning_rate": 1.194169036102743e-07, + "logits/chosen": -1.116295576095581, + "logits/rejected": -1.1786460876464844, + "logps/chosen": -44.79496765136719, + "logps/rejected": -78.12545776367188, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8381794691085815, + "rewards/margins": 6.8596415519714355, + "rewards/rejected": -7.697821140289307, + "step": 951 + }, + { + "epoch": 5.641481481481481, + "grad_norm": 5.757208376877495, + "learning_rate": 1.1886338150515268e-07, + "logits/chosen": -1.2456589937210083, + "logits/rejected": -1.339970350265503, + "logps/chosen": -54.900333404541016, + "logps/rejected": -90.55741882324219, + "loss": 0.0513, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.918721318244934, + "rewards/margins": 8.1624116897583, + "rewards/rejected": -10.081131935119629, + "step": 952 + }, + { + "epoch": 5.647407407407408, + "grad_norm": 4.89301134351496, + "learning_rate": 1.1831074486397217e-07, + "logits/chosen": -1.0892047882080078, + "logits/rejected": -1.1265947818756104, + "logps/chosen": -50.045753479003906, + "logps/rejected": -75.69743347167969, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5692238211631775, + "rewards/margins": 6.327653884887695, + "rewards/rejected": -6.896877765655518, + "step": 953 + }, + { + "epoch": 5.653333333333333, + "grad_norm": 6.301689678260451, + "learning_rate": 1.1775899741825945e-07, + "logits/chosen": -1.0384539365768433, + "logits/rejected": -1.1424777507781982, + "logps/chosen": -52.425750732421875, + "logps/rejected": -87.0918960571289, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1157212257385254, + "rewards/margins": 6.278460502624512, + "rewards/rejected": -7.394181251525879, + "step": 954 + }, + { + "epoch": 5.659259259259259, + "grad_norm": 7.4706943420437115, + "learning_rate": 1.172081428935375e-07, + "logits/chosen": -0.9616471529006958, + "logits/rejected": -1.08530592918396, + "logps/chosen": -49.213619232177734, + "logps/rejected": -63.890655517578125, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0542219877243042, + "rewards/margins": 5.608872413635254, + "rewards/rejected": -6.663094520568848, + "step": 955 + }, + { + "epoch": 5.6651851851851855, + "grad_norm": 7.815420541527748, + "learning_rate": 1.1665818500929986e-07, + "logits/chosen": -1.186784029006958, + "logits/rejected": -1.2369247674942017, + "logps/chosen": -62.02526092529297, + "logps/rejected": -81.75930786132812, + "loss": 0.0518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15900936722755432, + "rewards/margins": 4.918282508850098, + "rewards/rejected": -4.759273529052734, + "step": 956 + }, + { + "epoch": 5.671111111111111, + "grad_norm": 7.116948617235036, + "learning_rate": 1.1610912747898605e-07, + "logits/chosen": -1.1347780227661133, + "logits/rejected": -1.2619976997375488, + "logps/chosen": -49.45901870727539, + "logps/rejected": -68.4981689453125, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9228365421295166, + "rewards/margins": 4.178586006164551, + "rewards/rejected": -5.101422309875488, + "step": 957 + }, + { + "epoch": 5.677037037037037, + "grad_norm": 6.492425589496003, + "learning_rate": 1.1556097400995585e-07, + "logits/chosen": -1.2856743335723877, + "logits/rejected": -1.3788843154907227, + "logps/chosen": -68.00703430175781, + "logps/rejected": -79.57150268554688, + "loss": 0.0558, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8615601062774658, + "rewards/margins": 5.724004745483398, + "rewards/rejected": -6.585564613342285, + "step": 958 + }, + { + "epoch": 5.6829629629629625, + "grad_norm": 5.135996060670731, + "learning_rate": 1.1501372830346482e-07, + "logits/chosen": -1.0250344276428223, + "logits/rejected": -1.0303955078125, + "logps/chosen": -40.95635986328125, + "logps/rejected": -55.43898010253906, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6365801095962524, + "rewards/margins": 4.479146957397461, + "rewards/rejected": -3.842566967010498, + "step": 959 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 5.129604924243458, + "learning_rate": 1.1446739405463899e-07, + "logits/chosen": -1.0779722929000854, + "logits/rejected": -1.1054648160934448, + "logps/chosen": -35.89310073852539, + "logps/rejected": -54.29434585571289, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5512725114822388, + "rewards/margins": 4.163705348968506, + "rewards/rejected": -4.714978218078613, + "step": 960 + }, + { + "epoch": 5.694814814814815, + "grad_norm": 4.8560437167278625, + "learning_rate": 1.1392197495245015e-07, + "logits/chosen": -1.155285120010376, + "logits/rejected": -1.2099722623825073, + "logps/chosen": -42.74791717529297, + "logps/rejected": -57.803279876708984, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29253920912742615, + "rewards/margins": 3.9843411445617676, + "rewards/rejected": -4.276880264282227, + "step": 961 + }, + { + "epoch": 5.70074074074074, + "grad_norm": 5.7930938450559415, + "learning_rate": 1.1337747467969069e-07, + "logits/chosen": -1.085658311843872, + "logits/rejected": -1.0948498249053955, + "logps/chosen": -49.48651123046875, + "logps/rejected": -76.69400024414062, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2173595428466797, + "rewards/margins": 6.082059860229492, + "rewards/rejected": -7.299419403076172, + "step": 962 + }, + { + "epoch": 5.706666666666667, + "grad_norm": 5.30493601375962, + "learning_rate": 1.1283389691294893e-07, + "logits/chosen": -1.1731464862823486, + "logits/rejected": -1.141379475593567, + "logps/chosen": -64.22185516357422, + "logps/rejected": -83.61888122558594, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7661117315292358, + "rewards/margins": 5.380878448486328, + "rewards/rejected": -6.1469902992248535, + "step": 963 + }, + { + "epoch": 5.712592592592593, + "grad_norm": 6.403497269781013, + "learning_rate": 1.1229124532258421e-07, + "logits/chosen": -1.0684443712234497, + "logits/rejected": -1.0762279033660889, + "logps/chosen": -58.415157318115234, + "logps/rejected": -75.17643737792969, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5389410257339478, + "rewards/margins": 5.100183486938477, + "rewards/rejected": -6.639124870300293, + "step": 964 + }, + { + "epoch": 5.718518518518518, + "grad_norm": 4.039319682323648, + "learning_rate": 1.1174952357270212e-07, + "logits/chosen": -1.3112658262252808, + "logits/rejected": -1.2177374362945557, + "logps/chosen": -52.77716827392578, + "logps/rejected": -66.23959350585938, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6938422918319702, + "rewards/margins": 4.566688537597656, + "rewards/rejected": -5.260530471801758, + "step": 965 + }, + { + "epoch": 5.724444444444444, + "grad_norm": 6.537363002137338, + "learning_rate": 1.112087353211297e-07, + "logits/chosen": -1.2479323148727417, + "logits/rejected": -1.34829843044281, + "logps/chosen": -44.44136047363281, + "logps/rejected": -64.08821105957031, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6463432312011719, + "rewards/margins": 4.6674041748046875, + "rewards/rejected": -5.313746929168701, + "step": 966 + }, + { + "epoch": 5.730370370370371, + "grad_norm": 7.6812486979953984, + "learning_rate": 1.1066888421939092e-07, + "logits/chosen": -1.1662806272506714, + "logits/rejected": -1.200300931930542, + "logps/chosen": -54.65203857421875, + "logps/rejected": -85.63590240478516, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2911415100097656, + "rewards/margins": 6.0351667404174805, + "rewards/rejected": -5.744025230407715, + "step": 967 + }, + { + "epoch": 5.736296296296296, + "grad_norm": 6.1369264649099415, + "learning_rate": 1.1012997391268177e-07, + "logits/chosen": -1.1206430196762085, + "logits/rejected": -1.1232975721359253, + "logps/chosen": -44.27980041503906, + "logps/rejected": -59.34086608886719, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4602680504322052, + "rewards/margins": 4.129044055938721, + "rewards/rejected": -4.5893120765686035, + "step": 968 + }, + { + "epoch": 5.742222222222222, + "grad_norm": 6.212271022570864, + "learning_rate": 1.095920080398459e-07, + "logits/chosen": -1.1898860931396484, + "logits/rejected": -1.193435549736023, + "logps/chosen": -50.994625091552734, + "logps/rejected": -70.28504943847656, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0652556419372559, + "rewards/margins": 5.318528175354004, + "rewards/rejected": -6.383784294128418, + "step": 969 + }, + { + "epoch": 5.7481481481481485, + "grad_norm": 5.897763958041394, + "learning_rate": 1.0905499023334979e-07, + "logits/chosen": -1.2726876735687256, + "logits/rejected": -1.3368113040924072, + "logps/chosen": -55.83734130859375, + "logps/rejected": -68.83439636230469, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4541219472885132, + "rewards/margins": 6.1417083740234375, + "rewards/rejected": -6.595829963684082, + "step": 970 + }, + { + "epoch": 5.754074074074074, + "grad_norm": 5.570072178828081, + "learning_rate": 1.0851892411925856e-07, + "logits/chosen": -1.1539332866668701, + "logits/rejected": -1.237091064453125, + "logps/chosen": -55.219852447509766, + "logps/rejected": -73.31748962402344, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5192880630493164, + "rewards/margins": 4.689527988433838, + "rewards/rejected": -5.208816051483154, + "step": 971 + }, + { + "epoch": 5.76, + "grad_norm": 5.439743506962176, + "learning_rate": 1.0798381331721107e-07, + "logits/chosen": -1.239844799041748, + "logits/rejected": -1.2221330404281616, + "logps/chosen": -47.03329849243164, + "logps/rejected": -74.15437316894531, + "loss": 0.0454, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49033457040786743, + "rewards/margins": 5.557613849639893, + "rewards/rejected": -6.047948837280273, + "step": 972 + }, + { + "epoch": 5.7659259259259255, + "grad_norm": 7.484832245837122, + "learning_rate": 1.0744966144039588e-07, + "logits/chosen": -1.0188753604888916, + "logits/rejected": -1.1094659566879272, + "logps/chosen": -50.33880615234375, + "logps/rejected": -78.59779357910156, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8958476781845093, + "rewards/margins": 6.123520851135254, + "rewards/rejected": -7.019368648529053, + "step": 973 + }, + { + "epoch": 5.771851851851852, + "grad_norm": 8.200396955576231, + "learning_rate": 1.0691647209552654e-07, + "logits/chosen": -1.0882221460342407, + "logits/rejected": -1.1707772016525269, + "logps/chosen": -41.45745849609375, + "logps/rejected": -60.8970947265625, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44809460639953613, + "rewards/margins": 3.9365527629852295, + "rewards/rejected": -4.384647369384766, + "step": 974 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 4.498048165253478, + "learning_rate": 1.0638424888281744e-07, + "logits/chosen": -1.144453763961792, + "logits/rejected": -1.2178853750228882, + "logps/chosen": -55.933860778808594, + "logps/rejected": -81.9764175415039, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8853988647460938, + "rewards/margins": 6.145270824432373, + "rewards/rejected": -7.030670166015625, + "step": 975 + }, + { + "epoch": 5.783703703703703, + "grad_norm": 4.5687275866561174, + "learning_rate": 1.0585299539595943e-07, + "logits/chosen": -1.2651833295822144, + "logits/rejected": -1.1797913312911987, + "logps/chosen": -62.95174026489258, + "logps/rejected": -73.2163314819336, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0579339265823364, + "rewards/margins": 5.187211036682129, + "rewards/rejected": -6.245144367218018, + "step": 976 + }, + { + "epoch": 5.78962962962963, + "grad_norm": 4.757815127421331, + "learning_rate": 1.0532271522209551e-07, + "logits/chosen": -0.972952663898468, + "logits/rejected": -1.0780919790267944, + "logps/chosen": -43.577178955078125, + "logps/rejected": -72.53720092773438, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3548003435134888, + "rewards/margins": 5.889094829559326, + "rewards/rejected": -7.243895530700684, + "step": 977 + }, + { + "epoch": 5.795555555555556, + "grad_norm": 4.532083769578922, + "learning_rate": 1.0479341194179658e-07, + "logits/chosen": -1.2063857316970825, + "logits/rejected": -1.2588369846343994, + "logps/chosen": -44.20055389404297, + "logps/rejected": -79.2449722290039, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3380606472492218, + "rewards/margins": 5.87565803527832, + "rewards/rejected": -6.213718414306641, + "step": 978 + }, + { + "epoch": 5.801481481481481, + "grad_norm": 5.179109139924423, + "learning_rate": 1.0426508912903764e-07, + "logits/chosen": -1.071878433227539, + "logits/rejected": -1.1827893257141113, + "logps/chosen": -47.58728790283203, + "logps/rejected": -66.75723266601562, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.366632342338562, + "rewards/margins": 5.255701065063477, + "rewards/rejected": -6.622333526611328, + "step": 979 + }, + { + "epoch": 5.807407407407408, + "grad_norm": 6.028226327831648, + "learning_rate": 1.0373775035117305e-07, + "logits/chosen": -1.0318422317504883, + "logits/rejected": -1.0840988159179688, + "logps/chosen": -38.939598083496094, + "logps/rejected": -65.85391998291016, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26034343242645264, + "rewards/margins": 5.531552314758301, + "rewards/rejected": -5.791895389556885, + "step": 980 + }, + { + "epoch": 5.8133333333333335, + "grad_norm": 5.199707849563059, + "learning_rate": 1.0321139916891281e-07, + "logits/chosen": -1.1908127069473267, + "logits/rejected": -1.2790547609329224, + "logps/chosen": -53.803672790527344, + "logps/rejected": -98.4791259765625, + "loss": 0.0391, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1615757942199707, + "rewards/margins": 6.412900924682617, + "rewards/rejected": -7.574476718902588, + "step": 981 + }, + { + "epoch": 5.819259259259259, + "grad_norm": 4.056908323841326, + "learning_rate": 1.0268603913629858e-07, + "logits/chosen": -1.0693111419677734, + "logits/rejected": -1.2135943174362183, + "logps/chosen": -43.74428939819336, + "logps/rejected": -59.94146728515625, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10237783193588257, + "rewards/margins": 5.640899658203125, + "rewards/rejected": -5.743277549743652, + "step": 982 + }, + { + "epoch": 5.825185185185185, + "grad_norm": 6.342627728372702, + "learning_rate": 1.0216167380067927e-07, + "logits/chosen": -1.144514560699463, + "logits/rejected": -1.2700378894805908, + "logps/chosen": -38.817726135253906, + "logps/rejected": -63.40654754638672, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0137901306152344, + "rewards/margins": 5.49804162979126, + "rewards/rejected": -6.511831283569336, + "step": 983 + }, + { + "epoch": 5.831111111111111, + "grad_norm": 5.951676057587321, + "learning_rate": 1.0163830670268767e-07, + "logits/chosen": -1.1670303344726562, + "logits/rejected": -1.167630910873413, + "logps/chosen": -52.73843765258789, + "logps/rejected": -75.77318572998047, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005964726209640503, + "rewards/margins": 5.647053241729736, + "rewards/rejected": -5.653018474578857, + "step": 984 + }, + { + "epoch": 5.837037037037037, + "grad_norm": 5.711969478515697, + "learning_rate": 1.0111594137621613e-07, + "logits/chosen": -1.143544316291809, + "logits/rejected": -1.0991742610931396, + "logps/chosen": -59.350929260253906, + "logps/rejected": -78.52423095703125, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3191605806350708, + "rewards/margins": 6.052453994750977, + "rewards/rejected": -7.371614456176758, + "step": 985 + }, + { + "epoch": 5.842962962962963, + "grad_norm": 9.323420493737709, + "learning_rate": 1.0059458134839277e-07, + "logits/chosen": -1.0608803033828735, + "logits/rejected": -1.148100733757019, + "logps/chosen": -49.37598419189453, + "logps/rejected": -72.11686706542969, + "loss": 0.0605, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.662987470626831, + "rewards/margins": 5.027313232421875, + "rewards/rejected": -5.690300464630127, + "step": 986 + }, + { + "epoch": 5.848888888888889, + "grad_norm": 6.136186486936588, + "learning_rate": 1.0007423013955782e-07, + "logits/chosen": -1.3598471879959106, + "logits/rejected": -1.4275630712509155, + "logps/chosen": -47.69789123535156, + "logps/rejected": -68.75079345703125, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5667755007743835, + "rewards/margins": 4.388944625854492, + "rewards/rejected": -4.9557204246521, + "step": 987 + }, + { + "epoch": 5.854814814814815, + "grad_norm": 3.230703616513641, + "learning_rate": 9.955489126323954e-08, + "logits/chosen": -1.2398353815078735, + "logits/rejected": -1.2649370431900024, + "logps/chosen": -41.775108337402344, + "logps/rejected": -64.36259460449219, + "loss": 0.0272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4918060302734375, + "rewards/margins": 6.183435440063477, + "rewards/rejected": -6.6752424240112305, + "step": 988 + }, + { + "epoch": 5.860740740740741, + "grad_norm": 4.91314701899232, + "learning_rate": 9.903656822613099e-08, + "logits/chosen": -1.2056286334991455, + "logits/rejected": -1.2726476192474365, + "logps/chosen": -48.42527389526367, + "logps/rejected": -73.99102783203125, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07021550834178925, + "rewards/margins": 6.695013999938965, + "rewards/rejected": -6.765229225158691, + "step": 989 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 4.380834180173861, + "learning_rate": 9.851926452806583e-08, + "logits/chosen": -1.1519975662231445, + "logits/rejected": -1.2040634155273438, + "logps/chosen": -53.68801498413086, + "logps/rejected": -70.1108627319336, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.292681336402893, + "rewards/margins": 4.510378360748291, + "rewards/rejected": -5.803060054779053, + "step": 990 + }, + { + "epoch": 5.872592592592593, + "grad_norm": 3.8066981796535857, + "learning_rate": 9.800298366199497e-08, + "logits/chosen": -1.1126823425292969, + "logits/rejected": -1.0786277055740356, + "logps/chosen": -57.135101318359375, + "logps/rejected": -78.73320770263672, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2120319604873657, + "rewards/margins": 5.7933454513549805, + "rewards/rejected": -7.005376815795898, + "step": 991 + }, + { + "epoch": 5.8785185185185185, + "grad_norm": 5.380083654103537, + "learning_rate": 9.748772911396291e-08, + "logits/chosen": -1.1817781925201416, + "logits/rejected": -1.1358152627944946, + "logps/chosen": -40.186588287353516, + "logps/rejected": -59.40968322753906, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05524274706840515, + "rewards/margins": 5.243034362792969, + "rewards/rejected": -5.298277378082275, + "step": 992 + }, + { + "epoch": 5.884444444444444, + "grad_norm": 3.9806748355024304, + "learning_rate": 9.697350436308427e-08, + "logits/chosen": -1.411184310913086, + "logits/rejected": -1.4651869535446167, + "logps/chosen": -42.275794982910156, + "logps/rejected": -65.55119323730469, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8196922540664673, + "rewards/margins": 5.605785369873047, + "rewards/rejected": -6.425477504730225, + "step": 993 + }, + { + "epoch": 5.890370370370371, + "grad_norm": 5.35201952318648, + "learning_rate": 9.646031288152021e-08, + "logits/chosen": -1.310591459274292, + "logits/rejected": -1.4033873081207275, + "logps/chosen": -44.234039306640625, + "logps/rejected": -80.975341796875, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5743755102157593, + "rewards/margins": 6.1121320724487305, + "rewards/rejected": -6.686507225036621, + "step": 994 + }, + { + "epoch": 5.896296296296296, + "grad_norm": 4.207629470079585, + "learning_rate": 9.5948158134455e-08, + "logits/chosen": -1.4490768909454346, + "logits/rejected": -1.4172837734222412, + "logps/chosen": -62.98780059814453, + "logps/rejected": -65.28123474121094, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4620938301086426, + "rewards/margins": 5.255311489105225, + "rewards/rejected": -5.717405319213867, + "step": 995 + }, + { + "epoch": 5.902222222222222, + "grad_norm": 4.720779096307494, + "learning_rate": 9.543704358007279e-08, + "logits/chosen": -0.9672622084617615, + "logits/rejected": -1.0174751281738281, + "logps/chosen": -41.439727783203125, + "logps/rejected": -62.13349151611328, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.340609073638916, + "rewards/margins": 4.7247114181518555, + "rewards/rejected": -5.06532096862793, + "step": 996 + }, + { + "epoch": 5.908148148148149, + "grad_norm": 3.674799578091533, + "learning_rate": 9.492697266953373e-08, + "logits/chosen": -1.3102741241455078, + "logits/rejected": -1.235027551651001, + "logps/chosen": -46.665985107421875, + "logps/rejected": -61.3996696472168, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2069341540336609, + "rewards/margins": 3.999211072921753, + "rewards/rejected": -3.7922770977020264, + "step": 997 + }, + { + "epoch": 5.914074074074074, + "grad_norm": 4.835303274377117, + "learning_rate": 9.44179488469516e-08, + "logits/chosen": -1.2475138902664185, + "logits/rejected": -1.2858538627624512, + "logps/chosen": -43.746517181396484, + "logps/rejected": -91.54032897949219, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.967775285243988, + "rewards/margins": 6.45581579208374, + "rewards/rejected": -7.423590660095215, + "step": 998 + }, + { + "epoch": 5.92, + "grad_norm": 6.6521399100097245, + "learning_rate": 9.390997554936964e-08, + "logits/chosen": -1.1728459596633911, + "logits/rejected": -1.151499629020691, + "logps/chosen": -53.31846618652344, + "logps/rejected": -80.39374542236328, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1168384552001953, + "rewards/margins": 6.801609992980957, + "rewards/rejected": -7.918448448181152, + "step": 999 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 4.697878592355873, + "learning_rate": 9.340305620673778e-08, + "logits/chosen": -1.1270843744277954, + "logits/rejected": -1.12493896484375, + "logps/chosen": -60.640384674072266, + "logps/rejected": -71.61137390136719, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3440361022949219, + "rewards/margins": 5.394672870635986, + "rewards/rejected": -6.738708972930908, + "step": 1000 + }, + { + "epoch": 5.931851851851852, + "grad_norm": 7.0928652351773085, + "learning_rate": 9.289719424188947e-08, + "logits/chosen": -1.0911977291107178, + "logits/rejected": -1.1739208698272705, + "logps/chosen": -58.13080978393555, + "logps/rejected": -69.92738342285156, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24996742606163025, + "rewards/margins": 5.095523834228516, + "rewards/rejected": -5.345491409301758, + "step": 1001 + }, + { + "epoch": 5.937777777777778, + "grad_norm": 7.277382234508979, + "learning_rate": 9.239239307051841e-08, + "logits/chosen": -1.3850924968719482, + "logits/rejected": -1.2906146049499512, + "logps/chosen": -43.749080657958984, + "logps/rejected": -54.221092224121094, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44107675552368164, + "rewards/margins": 4.355238437652588, + "rewards/rejected": -3.914161443710327, + "step": 1002 + }, + { + "epoch": 5.9437037037037035, + "grad_norm": 5.0779449589504715, + "learning_rate": 9.18886561011557e-08, + "logits/chosen": -1.0621355772018433, + "logits/rejected": -0.958572506904602, + "logps/chosen": -48.210506439208984, + "logps/rejected": -69.44853210449219, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.863682210445404, + "rewards/margins": 5.017454624176025, + "rewards/rejected": -5.881137371063232, + "step": 1003 + }, + { + "epoch": 5.94962962962963, + "grad_norm": 6.195778586468951, + "learning_rate": 9.13859867351466e-08, + "logits/chosen": -1.1037704944610596, + "logits/rejected": -1.104474425315857, + "logps/chosen": -63.77961730957031, + "logps/rejected": -65.49559020996094, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5683590769767761, + "rewards/margins": 5.2069854736328125, + "rewards/rejected": -5.7753448486328125, + "step": 1004 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 3.8602707820687625, + "learning_rate": 9.088438836662777e-08, + "logits/chosen": -1.1881828308105469, + "logits/rejected": -1.3121764659881592, + "logps/chosen": -51.87583541870117, + "logps/rejected": -79.06917572021484, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2786531448364258, + "rewards/margins": 5.352337837219238, + "rewards/rejected": -6.630990982055664, + "step": 1005 + }, + { + "epoch": 5.961481481481481, + "grad_norm": 4.5867586019012165, + "learning_rate": 9.038386438250414e-08, + "logits/chosen": -1.241044044494629, + "logits/rejected": -1.30616295337677, + "logps/chosen": -40.884727478027344, + "logps/rejected": -58.961097717285156, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003995433449745178, + "rewards/margins": 4.632888317108154, + "rewards/rejected": -4.62889289855957, + "step": 1006 + }, + { + "epoch": 5.967407407407407, + "grad_norm": 5.555139669607316, + "learning_rate": 8.988441816242629e-08, + "logits/chosen": -1.358335018157959, + "logits/rejected": -1.4460186958312988, + "logps/chosen": -49.65517807006836, + "logps/rejected": -72.40342712402344, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2420268058776855, + "rewards/margins": 5.1783857345581055, + "rewards/rejected": -6.420413017272949, + "step": 1007 + }, + { + "epoch": 5.973333333333334, + "grad_norm": 4.927040690223158, + "learning_rate": 8.938605307876736e-08, + "logits/chosen": -1.092087745666504, + "logits/rejected": -1.1519795656204224, + "logps/chosen": -37.996971130371094, + "logps/rejected": -58.567012786865234, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3948993682861328, + "rewards/margins": 4.651831150054932, + "rewards/rejected": -5.0467305183410645, + "step": 1008 + }, + { + "epoch": 5.979259259259259, + "grad_norm": 4.489805022758967, + "learning_rate": 8.888877249660052e-08, + "logits/chosen": -1.2722234725952148, + "logits/rejected": -1.286690354347229, + "logps/chosen": -54.14191436767578, + "logps/rejected": -73.30915832519531, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.989962100982666, + "rewards/margins": 5.260371685028076, + "rewards/rejected": -6.250333786010742, + "step": 1009 + }, + { + "epoch": 5.985185185185185, + "grad_norm": 5.465649432608913, + "learning_rate": 8.839257977367609e-08, + "logits/chosen": -1.0653859376907349, + "logits/rejected": -1.1169452667236328, + "logps/chosen": -45.828392028808594, + "logps/rejected": -79.25332641601562, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8341926336288452, + "rewards/margins": 6.369889259338379, + "rewards/rejected": -7.2040815353393555, + "step": 1010 + }, + { + "epoch": 5.9911111111111115, + "grad_norm": 6.569969651329867, + "learning_rate": 8.789747826039893e-08, + "logits/chosen": -0.9660034775733948, + "logits/rejected": -1.1117826700210571, + "logps/chosen": -47.17693328857422, + "logps/rejected": -61.68070983886719, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6077823638916016, + "rewards/margins": 4.161911487579346, + "rewards/rejected": -4.769693851470947, + "step": 1011 + }, + { + "epoch": 5.997037037037037, + "grad_norm": 6.425246591344191, + "learning_rate": 8.74034712998058e-08, + "logits/chosen": -1.1601378917694092, + "logits/rejected": -1.3277003765106201, + "logps/chosen": -45.462860107421875, + "logps/rejected": -79.97228240966797, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8078010678291321, + "rewards/margins": 6.313694000244141, + "rewards/rejected": -7.121495246887207, + "step": 1012 + }, + { + "epoch": 6.002962962962963, + "grad_norm": 4.502315709571971, + "learning_rate": 8.69105622275428e-08, + "logits/chosen": -1.1005038022994995, + "logits/rejected": -1.072653889656067, + "logps/chosen": -41.478179931640625, + "logps/rejected": -64.40869140625, + "loss": 0.0391, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2710479497909546, + "rewards/margins": 6.673520088195801, + "rewards/rejected": -6.944567680358887, + "step": 1013 + }, + { + "epoch": 6.0088888888888885, + "grad_norm": 4.379560632717497, + "learning_rate": 8.641875437184287e-08, + "logits/chosen": -1.1290507316589355, + "logits/rejected": -1.2435288429260254, + "logps/chosen": -38.80870056152344, + "logps/rejected": -75.89081573486328, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28928327560424805, + "rewards/margins": 7.293491363525391, + "rewards/rejected": -7.582775115966797, + "step": 1014 + }, + { + "epoch": 6.014814814814815, + "grad_norm": 3.2767704731233396, + "learning_rate": 8.592805105350326e-08, + "logits/chosen": -1.3261024951934814, + "logits/rejected": -1.39167058467865, + "logps/chosen": -49.11321258544922, + "logps/rejected": -67.26222229003906, + "loss": 0.0251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6615856289863586, + "rewards/margins": 5.151736259460449, + "rewards/rejected": -5.813322067260742, + "step": 1015 + }, + { + "epoch": 6.020740740740741, + "grad_norm": 4.906372024322152, + "learning_rate": 8.543845558586307e-08, + "logits/chosen": -1.265272617340088, + "logits/rejected": -1.2793196439743042, + "logps/chosen": -44.430320739746094, + "logps/rejected": -63.15349578857422, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17215143144130707, + "rewards/margins": 4.199959754943848, + "rewards/rejected": -4.3721113204956055, + "step": 1016 + }, + { + "epoch": 6.026666666666666, + "grad_norm": 3.538919277162638, + "learning_rate": 8.494997127478109e-08, + "logits/chosen": -0.9269475340843201, + "logits/rejected": -0.994177520275116, + "logps/chosen": -46.4836540222168, + "logps/rejected": -71.3299789428711, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8119341135025024, + "rewards/margins": 6.298572540283203, + "rewards/rejected": -7.110507965087891, + "step": 1017 + }, + { + "epoch": 6.032592592592593, + "grad_norm": 3.4963380275486187, + "learning_rate": 8.44626014186132e-08, + "logits/chosen": -1.102508544921875, + "logits/rejected": -1.2094018459320068, + "logps/chosen": -43.70540237426758, + "logps/rejected": -62.176700592041016, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.613059401512146, + "rewards/margins": 5.6967453956604, + "rewards/rejected": -6.309804916381836, + "step": 1018 + }, + { + "epoch": 6.038518518518519, + "grad_norm": 4.029066081775569, + "learning_rate": 8.39763493081902e-08, + "logits/chosen": -1.01422917842865, + "logits/rejected": -1.1143798828125, + "logps/chosen": -44.69683837890625, + "logps/rejected": -65.62399291992188, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6260117292404175, + "rewards/margins": 5.6429572105407715, + "rewards/rejected": -6.2689690589904785, + "step": 1019 + }, + { + "epoch": 6.044444444444444, + "grad_norm": 6.291140101689275, + "learning_rate": 8.349121822679589e-08, + "logits/chosen": -1.3785192966461182, + "logits/rejected": -1.3688278198242188, + "logps/chosen": -51.58811950683594, + "logps/rejected": -67.65833282470703, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22203266620635986, + "rewards/margins": 5.307156562805176, + "rewards/rejected": -5.529188632965088, + "step": 1020 + }, + { + "epoch": 6.05037037037037, + "grad_norm": 3.7693925741310186, + "learning_rate": 8.300721145014434e-08, + "logits/chosen": -1.0719438791275024, + "logits/rejected": -1.1484143733978271, + "logps/chosen": -43.041751861572266, + "logps/rejected": -55.159175872802734, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5907548666000366, + "rewards/margins": 4.366609573364258, + "rewards/rejected": -5.957364082336426, + "step": 1021 + }, + { + "epoch": 6.0562962962962965, + "grad_norm": 4.2294470997905265, + "learning_rate": 8.252433224635816e-08, + "logits/chosen": -1.1875334978103638, + "logits/rejected": -1.225481629371643, + "logps/chosen": -49.98930740356445, + "logps/rejected": -76.53312683105469, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.84998619556427, + "rewards/margins": 5.482215881347656, + "rewards/rejected": -6.332201957702637, + "step": 1022 + }, + { + "epoch": 6.062222222222222, + "grad_norm": 6.556056582379274, + "learning_rate": 8.204258387594634e-08, + "logits/chosen": -1.1337170600891113, + "logits/rejected": -1.1994110345840454, + "logps/chosen": -56.70390319824219, + "logps/rejected": -67.9212646484375, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0490467548370361, + "rewards/margins": 7.023098945617676, + "rewards/rejected": -8.072145462036133, + "step": 1023 + }, + { + "epoch": 6.068148148148148, + "grad_norm": 5.0733621236540225, + "learning_rate": 8.15619695917823e-08, + "logits/chosen": -1.1547056436538696, + "logits/rejected": -1.1759554147720337, + "logps/chosen": -60.92186737060547, + "logps/rejected": -63.660179138183594, + "loss": 0.0514, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3117988109588623, + "rewards/margins": 5.863153457641602, + "rewards/rejected": -7.174952507019043, + "step": 1024 + }, + { + "epoch": 6.074074074074074, + "grad_norm": 4.256177047999337, + "learning_rate": 8.108249263908163e-08, + "logits/chosen": -1.3581020832061768, + "logits/rejected": -1.4121311902999878, + "logps/chosen": -57.44062042236328, + "logps/rejected": -81.3388671875, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.617480993270874, + "rewards/margins": 5.197127342224121, + "rewards/rejected": -6.814608573913574, + "step": 1025 + }, + { + "epoch": 6.08, + "grad_norm": 3.773024165408493, + "learning_rate": 8.060415625538059e-08, + "logits/chosen": -1.1411099433898926, + "logits/rejected": -1.2848527431488037, + "logps/chosen": -44.507415771484375, + "logps/rejected": -66.17939758300781, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23141825199127197, + "rewards/margins": 4.467518329620361, + "rewards/rejected": -4.698936462402344, + "step": 1026 + }, + { + "epoch": 6.085925925925926, + "grad_norm": 5.309682508460407, + "learning_rate": 8.012696367051409e-08, + "logits/chosen": -1.2690435647964478, + "logits/rejected": -1.2943583726882935, + "logps/chosen": -48.598609924316406, + "logps/rejected": -69.47331237792969, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09322678297758102, + "rewards/margins": 5.5550360679626465, + "rewards/rejected": -5.648262977600098, + "step": 1027 + }, + { + "epoch": 6.091851851851851, + "grad_norm": 4.904045381796863, + "learning_rate": 7.965091810659369e-08, + "logits/chosen": -1.02296781539917, + "logits/rejected": -1.1315923929214478, + "logps/chosen": -45.45520782470703, + "logps/rejected": -67.32499694824219, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.799096941947937, + "rewards/margins": 4.296487808227539, + "rewards/rejected": -5.095584869384766, + "step": 1028 + }, + { + "epoch": 6.097777777777778, + "grad_norm": 3.1264893216428313, + "learning_rate": 7.917602277798612e-08, + "logits/chosen": -1.0046404600143433, + "logits/rejected": -1.1222795248031616, + "logps/chosen": -49.771968841552734, + "logps/rejected": -76.62921142578125, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.469895839691162, + "rewards/margins": 6.9465131759643555, + "rewards/rejected": -8.41640853881836, + "step": 1029 + }, + { + "epoch": 6.103703703703704, + "grad_norm": 4.517176589514016, + "learning_rate": 7.870228089129155e-08, + "logits/chosen": -0.9964554905891418, + "logits/rejected": -1.0924674272537231, + "logps/chosen": -41.06652069091797, + "logps/rejected": -58.19047164916992, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026089489459991455, + "rewards/margins": 5.98679256439209, + "rewards/rejected": -5.960702896118164, + "step": 1030 + }, + { + "epoch": 6.109629629629629, + "grad_norm": 3.8344412940587396, + "learning_rate": 7.822969564532167e-08, + "logits/chosen": -1.2939468622207642, + "logits/rejected": -1.306887149810791, + "logps/chosen": -47.0091667175293, + "logps/rejected": -73.51778411865234, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0579248666763306, + "rewards/margins": 6.318962574005127, + "rewards/rejected": -7.37688684463501, + "step": 1031 + }, + { + "epoch": 6.115555555555556, + "grad_norm": 9.246762363812655, + "learning_rate": 7.775827023107834e-08, + "logits/chosen": -0.9924752712249756, + "logits/rejected": -1.117583155632019, + "logps/chosen": -46.18872833251953, + "logps/rejected": -73.19407653808594, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8105946779251099, + "rewards/margins": 5.830495834350586, + "rewards/rejected": -6.641090393066406, + "step": 1032 + }, + { + "epoch": 6.1214814814814815, + "grad_norm": 3.882709661161108, + "learning_rate": 7.728800783173201e-08, + "logits/chosen": -1.0195491313934326, + "logits/rejected": -1.1015743017196655, + "logps/chosen": -43.58595275878906, + "logps/rejected": -79.01959228515625, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06294581294059753, + "rewards/margins": 6.317727565765381, + "rewards/rejected": -6.380673408508301, + "step": 1033 + }, + { + "epoch": 6.127407407407407, + "grad_norm": 3.9425021805483746, + "learning_rate": 7.681891162260015e-08, + "logits/chosen": -0.8941872715950012, + "logits/rejected": -0.953436553478241, + "logps/chosen": -40.192047119140625, + "logps/rejected": -60.56889343261719, + "loss": 0.0337, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19415129721164703, + "rewards/margins": 5.351212501525879, + "rewards/rejected": -5.157060623168945, + "step": 1034 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 4.311648687445995, + "learning_rate": 7.635098477112587e-08, + "logits/chosen": -1.4252240657806396, + "logits/rejected": -1.3934770822525024, + "logps/chosen": -45.87331771850586, + "logps/rejected": -68.5013427734375, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39978134632110596, + "rewards/margins": 5.793413162231445, + "rewards/rejected": -6.193194389343262, + "step": 1035 + }, + { + "epoch": 6.139259259259259, + "grad_norm": 3.759043416715931, + "learning_rate": 7.588423043685646e-08, + "logits/chosen": -1.2668523788452148, + "logits/rejected": -1.2626163959503174, + "logps/chosen": -48.48180389404297, + "logps/rejected": -65.43073272705078, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12602674961090088, + "rewards/margins": 5.682659149169922, + "rewards/rejected": -5.556632995605469, + "step": 1036 + }, + { + "epoch": 6.145185185185185, + "grad_norm": 5.002265911936616, + "learning_rate": 7.541865177142223e-08, + "logits/chosen": -1.119709849357605, + "logits/rejected": -1.2077200412750244, + "logps/chosen": -48.262115478515625, + "logps/rejected": -76.51467895507812, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7838220596313477, + "rewards/margins": 7.046172142028809, + "rewards/rejected": -7.829994201660156, + "step": 1037 + }, + { + "epoch": 6.151111111111111, + "grad_norm": 2.6511130353379184, + "learning_rate": 7.4954251918515e-08, + "logits/chosen": -1.2835824489593506, + "logits/rejected": -1.228615403175354, + "logps/chosen": -47.74272918701172, + "logps/rejected": -67.5928955078125, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7985275983810425, + "rewards/margins": 5.398589134216309, + "rewards/rejected": -6.197115898132324, + "step": 1038 + }, + { + "epoch": 6.157037037037037, + "grad_norm": 4.420164019917996, + "learning_rate": 7.449103401386702e-08, + "logits/chosen": -1.1633448600769043, + "logits/rejected": -1.3595736026763916, + "logps/chosen": -40.677024841308594, + "logps/rejected": -75.61985778808594, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0207406282424927, + "rewards/margins": 6.18635368347168, + "rewards/rejected": -7.207094669342041, + "step": 1039 + }, + { + "epoch": 6.162962962962963, + "grad_norm": 4.476071391068969, + "learning_rate": 7.402900118522978e-08, + "logits/chosen": -1.0822563171386719, + "logits/rejected": -1.2434269189834595, + "logps/chosen": -41.48609924316406, + "logps/rejected": -77.5089111328125, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.56549072265625, + "rewards/margins": 7.847236633300781, + "rewards/rejected": -9.412727355957031, + "step": 1040 + }, + { + "epoch": 6.168888888888889, + "grad_norm": 5.506742522245201, + "learning_rate": 7.356815655235286e-08, + "logits/chosen": -1.2080464363098145, + "logits/rejected": -1.267046570777893, + "logps/chosen": -50.45636749267578, + "logps/rejected": -74.14933013916016, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.737205982208252, + "rewards/margins": 5.574244022369385, + "rewards/rejected": -7.311450004577637, + "step": 1041 + }, + { + "epoch": 6.174814814814815, + "grad_norm": 5.666447335256339, + "learning_rate": 7.310850322696283e-08, + "logits/chosen": -1.1826661825180054, + "logits/rejected": -1.2252609729766846, + "logps/chosen": -41.64863586425781, + "logps/rejected": -55.108455657958984, + "loss": 0.057, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15391169488430023, + "rewards/margins": 5.0056657791137695, + "rewards/rejected": -5.159577369689941, + "step": 1042 + }, + { + "epoch": 6.180740740740741, + "grad_norm": 4.852046866334851, + "learning_rate": 7.265004431274236e-08, + "logits/chosen": -1.0628266334533691, + "logits/rejected": -1.0478066205978394, + "logps/chosen": -46.114532470703125, + "logps/rejected": -61.23189163208008, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26190853118896484, + "rewards/margins": 6.462977409362793, + "rewards/rejected": -6.724886417388916, + "step": 1043 + }, + { + "epoch": 6.1866666666666665, + "grad_norm": 4.455385489543663, + "learning_rate": 7.219278290530909e-08, + "logits/chosen": -1.1407748460769653, + "logits/rejected": -1.1786659955978394, + "logps/chosen": -54.615352630615234, + "logps/rejected": -64.38876342773438, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6010631322860718, + "rewards/margins": 4.484706878662109, + "rewards/rejected": -5.085770606994629, + "step": 1044 + }, + { + "epoch": 6.192592592592592, + "grad_norm": 4.3493524620098425, + "learning_rate": 7.173672209219494e-08, + "logits/chosen": -1.0870596170425415, + "logits/rejected": -1.1793104410171509, + "logps/chosen": -49.675926208496094, + "logps/rejected": -77.1938705444336, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.244964599609375, + "rewards/margins": 5.766213417053223, + "rewards/rejected": -6.011178016662598, + "step": 1045 + }, + { + "epoch": 6.198518518518519, + "grad_norm": 4.241173092953691, + "learning_rate": 7.128186495282507e-08, + "logits/chosen": -0.9993229508399963, + "logits/rejected": -1.1206510066986084, + "logps/chosen": -42.55421447753906, + "logps/rejected": -68.5854721069336, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45122551918029785, + "rewards/margins": 4.571963787078857, + "rewards/rejected": -5.023189544677734, + "step": 1046 + }, + { + "epoch": 6.204444444444444, + "grad_norm": 4.926385819576706, + "learning_rate": 7.082821455849717e-08, + "logits/chosen": -0.9856117963790894, + "logits/rejected": -1.0857964754104614, + "logps/chosen": -51.458953857421875, + "logps/rejected": -74.82815551757812, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9152073264122009, + "rewards/margins": 5.840726852416992, + "rewards/rejected": -6.755934238433838, + "step": 1047 + }, + { + "epoch": 6.21037037037037, + "grad_norm": 4.207958288778069, + "learning_rate": 7.037577397236074e-08, + "logits/chosen": -1.2903097867965698, + "logits/rejected": -1.193856120109558, + "logps/chosen": -55.63256072998047, + "logps/rejected": -82.83590698242188, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2452365159988403, + "rewards/margins": 6.734645366668701, + "rewards/rejected": -7.979881763458252, + "step": 1048 + }, + { + "epoch": 6.216296296296297, + "grad_norm": 3.6399032890233083, + "learning_rate": 6.992454624939636e-08, + "logits/chosen": -1.0144593715667725, + "logits/rejected": -1.1578352451324463, + "logps/chosen": -47.56209182739258, + "logps/rejected": -81.01909637451172, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9345325827598572, + "rewards/margins": 5.261068344116211, + "rewards/rejected": -6.195600509643555, + "step": 1049 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 4.564684222290065, + "learning_rate": 6.947453443639514e-08, + "logits/chosen": -1.2124285697937012, + "logits/rejected": -1.231855869293213, + "logps/chosen": -48.263282775878906, + "logps/rejected": -74.08621978759766, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6651404500007629, + "rewards/margins": 5.952767372131348, + "rewards/rejected": -6.617908477783203, + "step": 1050 + }, + { + "epoch": 6.228148148148148, + "grad_norm": 5.891948862633519, + "learning_rate": 6.902574157193794e-08, + "logits/chosen": -1.2942461967468262, + "logits/rejected": -1.3647191524505615, + "logps/chosen": -57.16059494018555, + "logps/rejected": -66.65109252929688, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2253351211547852, + "rewards/margins": 4.405270576477051, + "rewards/rejected": -5.630605697631836, + "step": 1051 + }, + { + "epoch": 6.234074074074074, + "grad_norm": 4.906829136136163, + "learning_rate": 6.857817068637526e-08, + "logits/chosen": -1.347945213317871, + "logits/rejected": -1.3540549278259277, + "logps/chosen": -62.06746292114258, + "logps/rejected": -66.86209106445312, + "loss": 0.0364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7687455415725708, + "rewards/margins": 6.335521697998047, + "rewards/rejected": -7.10426664352417, + "step": 1052 + }, + { + "epoch": 6.24, + "grad_norm": 5.057844402832865, + "learning_rate": 6.81318248018064e-08, + "logits/chosen": -1.0574407577514648, + "logits/rejected": -1.1014264822006226, + "logps/chosen": -52.07094955444336, + "logps/rejected": -88.19418334960938, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8869056701660156, + "rewards/margins": 6.524155616760254, + "rewards/rejected": -7.4110612869262695, + "step": 1053 + }, + { + "epoch": 6.245925925925926, + "grad_norm": 4.673828872705199, + "learning_rate": 6.7686706932059e-08, + "logits/chosen": -1.191150188446045, + "logits/rejected": -1.2414634227752686, + "logps/chosen": -47.04499816894531, + "logps/rejected": -66.87259674072266, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42619264125823975, + "rewards/margins": 4.806652069091797, + "rewards/rejected": -5.232844352722168, + "step": 1054 + }, + { + "epoch": 6.2518518518518515, + "grad_norm": 5.211512132893841, + "learning_rate": 6.72428200826691e-08, + "logits/chosen": -1.1251758337020874, + "logits/rejected": -1.1461089849472046, + "logps/chosen": -59.33203125, + "logps/rejected": -76.55050659179688, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4580373764038086, + "rewards/margins": 5.884695529937744, + "rewards/rejected": -6.342732906341553, + "step": 1055 + }, + { + "epoch": 6.257777777777778, + "grad_norm": 7.15758851283485, + "learning_rate": 6.680016725086052e-08, + "logits/chosen": -1.162949800491333, + "logits/rejected": -1.1469281911849976, + "logps/chosen": -48.195289611816406, + "logps/rejected": -74.478271484375, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9580037593841553, + "rewards/margins": 6.0397186279296875, + "rewards/rejected": -6.997722148895264, + "step": 1056 + }, + { + "epoch": 6.263703703703704, + "grad_norm": 3.9471224321859206, + "learning_rate": 6.635875142552475e-08, + "logits/chosen": -1.0656099319458008, + "logits/rejected": -1.1877268552780151, + "logps/chosen": -50.106781005859375, + "logps/rejected": -83.21324920654297, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9684824347496033, + "rewards/margins": 6.485328674316406, + "rewards/rejected": -7.453810691833496, + "step": 1057 + }, + { + "epoch": 6.269629629629629, + "grad_norm": 5.0963421834197575, + "learning_rate": 6.591857558720071e-08, + "logits/chosen": -1.1995526552200317, + "logits/rejected": -1.2433815002441406, + "logps/chosen": -43.03889083862305, + "logps/rejected": -59.46638488769531, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2931675314903259, + "rewards/margins": 5.359686851501465, + "rewards/rejected": -5.6528544425964355, + "step": 1058 + }, + { + "epoch": 6.275555555555556, + "grad_norm": 6.1786398121643495, + "learning_rate": 6.547964270805467e-08, + "logits/chosen": -1.2752354145050049, + "logits/rejected": -1.2794020175933838, + "logps/chosen": -40.54266357421875, + "logps/rejected": -68.0904541015625, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13839620351791382, + "rewards/margins": 5.956443786621094, + "rewards/rejected": -6.094839096069336, + "step": 1059 + }, + { + "epoch": 6.281481481481482, + "grad_norm": 6.112280975588385, + "learning_rate": 6.504195575186008e-08, + "logits/chosen": -1.1097819805145264, + "logits/rejected": -1.1845353841781616, + "logps/chosen": -48.27924346923828, + "logps/rejected": -76.77072143554688, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9963631629943848, + "rewards/margins": 5.359070301055908, + "rewards/rejected": -6.355433464050293, + "step": 1060 + }, + { + "epoch": 6.287407407407407, + "grad_norm": 4.518898554024413, + "learning_rate": 6.460551767397784e-08, + "logits/chosen": -1.1713427305221558, + "logits/rejected": -1.3381168842315674, + "logps/chosen": -47.503623962402344, + "logps/rejected": -72.43147277832031, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6597295999526978, + "rewards/margins": 5.930143356323242, + "rewards/rejected": -6.58987283706665, + "step": 1061 + }, + { + "epoch": 6.293333333333333, + "grad_norm": 4.096124285194584, + "learning_rate": 6.417033142133593e-08, + "logits/chosen": -1.155659794807434, + "logits/rejected": -1.1853158473968506, + "logps/chosen": -39.073097229003906, + "logps/rejected": -67.7274398803711, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7619829177856445, + "rewards/margins": 5.485907554626465, + "rewards/rejected": -6.247889995574951, + "step": 1062 + }, + { + "epoch": 6.29925925925926, + "grad_norm": 3.250336129808355, + "learning_rate": 6.37363999324098e-08, + "logits/chosen": -0.7996464371681213, + "logits/rejected": -0.8508714437484741, + "logps/chosen": -40.68121337890625, + "logps/rejected": -66.40437316894531, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.070540428161621, + "rewards/margins": 6.3079142570495605, + "rewards/rejected": -7.378454685211182, + "step": 1063 + }, + { + "epoch": 6.305185185185185, + "grad_norm": 5.052049782134948, + "learning_rate": 6.330372613720247e-08, + "logits/chosen": -1.282618761062622, + "logits/rejected": -1.2823154926300049, + "logps/chosen": -46.28981399536133, + "logps/rejected": -63.936187744140625, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0241888761520386, + "rewards/margins": 4.308932781219482, + "rewards/rejected": -5.3331217765808105, + "step": 1064 + }, + { + "epoch": 6.311111111111111, + "grad_norm": 3.259889226299206, + "learning_rate": 6.28723129572247e-08, + "logits/chosen": -1.1003174781799316, + "logits/rejected": -1.2158787250518799, + "logps/chosen": -58.42970275878906, + "logps/rejected": -75.01763153076172, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6939321756362915, + "rewards/margins": 5.769342422485352, + "rewards/rejected": -6.4632744789123535, + "step": 1065 + }, + { + "epoch": 6.3170370370370375, + "grad_norm": 5.479924725053419, + "learning_rate": 6.244216330547533e-08, + "logits/chosen": -1.1541452407836914, + "logits/rejected": -1.146990180015564, + "logps/chosen": -42.12281036376953, + "logps/rejected": -54.60337829589844, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3140569925308228, + "rewards/margins": 5.23880672454834, + "rewards/rejected": -6.552864074707031, + "step": 1066 + }, + { + "epoch": 6.322962962962963, + "grad_norm": 5.728921555274712, + "learning_rate": 6.201328008642159e-08, + "logits/chosen": -1.1909137964248657, + "logits/rejected": -1.1429741382598877, + "logps/chosen": -47.83819580078125, + "logps/rejected": -60.23460388183594, + "loss": 0.0491, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9541853666305542, + "rewards/margins": 5.615292549133301, + "rewards/rejected": -6.5694780349731445, + "step": 1067 + }, + { + "epoch": 6.328888888888889, + "grad_norm": 6.125870321351669, + "learning_rate": 6.158566619597932e-08, + "logits/chosen": -1.114587426185608, + "logits/rejected": -1.1817578077316284, + "logps/chosen": -40.60179901123047, + "logps/rejected": -65.0115966796875, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0992913246154785, + "rewards/margins": 5.84259033203125, + "rewards/rejected": -6.9418816566467285, + "step": 1068 + }, + { + "epoch": 6.3348148148148145, + "grad_norm": 4.240998896145259, + "learning_rate": 6.115932452149372e-08, + "logits/chosen": -1.162363886833191, + "logits/rejected": -1.239659309387207, + "logps/chosen": -42.08208465576172, + "logps/rejected": -65.01765441894531, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07813447713851929, + "rewards/margins": 6.08473014831543, + "rewards/rejected": -6.162865161895752, + "step": 1069 + }, + { + "epoch": 6.340740740740741, + "grad_norm": 5.299621800363426, + "learning_rate": 6.07342579417196e-08, + "logits/chosen": -1.1256684064865112, + "logits/rejected": -1.1961945295333862, + "logps/chosen": -44.97161102294922, + "logps/rejected": -65.80847930908203, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6299454569816589, + "rewards/margins": 5.447868347167969, + "rewards/rejected": -6.077814102172852, + "step": 1070 + }, + { + "epoch": 6.346666666666667, + "grad_norm": 3.9266127213154496, + "learning_rate": 6.031046932680229e-08, + "logits/chosen": -1.1711387634277344, + "logits/rejected": -1.1314812898635864, + "logps/chosen": -51.13462829589844, + "logps/rejected": -77.2643814086914, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.340965211391449, + "rewards/margins": 6.144353866577148, + "rewards/rejected": -6.485319137573242, + "step": 1071 + }, + { + "epoch": 6.352592592592592, + "grad_norm": 3.437726901220671, + "learning_rate": 5.988796153825768e-08, + "logits/chosen": -1.3358840942382812, + "logits/rejected": -1.359967589378357, + "logps/chosen": -62.960697174072266, + "logps/rejected": -75.00189208984375, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8468966484069824, + "rewards/margins": 5.507348537445068, + "rewards/rejected": -7.354245185852051, + "step": 1072 + }, + { + "epoch": 6.358518518518519, + "grad_norm": 6.198221610149886, + "learning_rate": 5.9466737428953444e-08, + "logits/chosen": -1.0683010816574097, + "logits/rejected": -1.150133490562439, + "logps/chosen": -50.889564514160156, + "logps/rejected": -95.1012954711914, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7292721271514893, + "rewards/margins": 7.153807640075684, + "rewards/rejected": -8.883079528808594, + "step": 1073 + }, + { + "epoch": 6.364444444444445, + "grad_norm": 4.926481221358003, + "learning_rate": 5.9046799843089464e-08, + "logits/chosen": -1.240066409111023, + "logits/rejected": -1.2892934083938599, + "logps/chosen": -43.26491928100586, + "logps/rejected": -57.51011276245117, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2330174446105957, + "rewards/margins": 4.55935001373291, + "rewards/rejected": -4.792367458343506, + "step": 1074 + }, + { + "epoch": 6.37037037037037, + "grad_norm": 4.627848653758448, + "learning_rate": 5.862815161617879e-08, + "logits/chosen": -1.0937652587890625, + "logits/rejected": -1.1810989379882812, + "logps/chosen": -55.17090606689453, + "logps/rejected": -80.78233337402344, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8725177645683289, + "rewards/margins": 5.987633228302002, + "rewards/rejected": -6.860151290893555, + "step": 1075 + }, + { + "epoch": 6.376296296296296, + "grad_norm": 4.425158565324127, + "learning_rate": 5.8210795575028395e-08, + "logits/chosen": -1.127935767173767, + "logits/rejected": -1.186619758605957, + "logps/chosen": -50.21623611450195, + "logps/rejected": -81.5506820678711, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0250823497772217, + "rewards/margins": 5.458142280578613, + "rewards/rejected": -6.483224868774414, + "step": 1076 + }, + { + "epoch": 6.3822222222222225, + "grad_norm": 3.213290153468062, + "learning_rate": 5.7794734537720156e-08, + "logits/chosen": -1.2193506956100464, + "logits/rejected": -1.257124423980713, + "logps/chosen": -63.197174072265625, + "logps/rejected": -71.59033966064453, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2621850967407227, + "rewards/margins": 4.87045955657959, + "rewards/rejected": -6.1326446533203125, + "step": 1077 + }, + { + "epoch": 6.388148148148148, + "grad_norm": 2.972357542462106, + "learning_rate": 5.7379971313591736e-08, + "logits/chosen": -1.2130753993988037, + "logits/rejected": -1.1798851490020752, + "logps/chosen": -63.9143180847168, + "logps/rejected": -88.82057189941406, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3516303300857544, + "rewards/margins": 7.052016735076904, + "rewards/rejected": -8.403646469116211, + "step": 1078 + }, + { + "epoch": 6.394074074074074, + "grad_norm": 5.719397621364952, + "learning_rate": 5.69665087032177e-08, + "logits/chosen": -1.0475133657455444, + "logits/rejected": -1.0087506771087646, + "logps/chosen": -48.91676330566406, + "logps/rejected": -60.33573913574219, + "loss": 0.0531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9253264665603638, + "rewards/margins": 3.923841714859009, + "rewards/rejected": -4.84916877746582, + "step": 1079 + }, + { + "epoch": 6.4, + "grad_norm": 5.636018568126882, + "learning_rate": 5.6554349498390606e-08, + "logits/chosen": -1.2317850589752197, + "logits/rejected": -1.3244078159332275, + "logps/chosen": -46.108726501464844, + "logps/rejected": -68.44413757324219, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5189595222473145, + "rewards/margins": 5.146785736083984, + "rewards/rejected": -5.665745258331299, + "step": 1080 + }, + { + "epoch": 6.405925925925926, + "grad_norm": 4.154215326167201, + "learning_rate": 5.614349648210212e-08, + "logits/chosen": -1.2623381614685059, + "logits/rejected": -1.3493742942810059, + "logps/chosen": -55.09934616088867, + "logps/rejected": -68.43744659423828, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1642762422561646, + "rewards/margins": 5.039634704589844, + "rewards/rejected": -6.203911304473877, + "step": 1081 + }, + { + "epoch": 6.411851851851852, + "grad_norm": 4.482983957523619, + "learning_rate": 5.573395242852416e-08, + "logits/chosen": -1.0360773801803589, + "logits/rejected": -1.1870149374008179, + "logps/chosen": -47.35795974731445, + "logps/rejected": -87.30964660644531, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9789612293243408, + "rewards/margins": 6.068577766418457, + "rewards/rejected": -7.047539234161377, + "step": 1082 + }, + { + "epoch": 6.417777777777777, + "grad_norm": 4.094917827882695, + "learning_rate": 5.532572010299033e-08, + "logits/chosen": -1.233745813369751, + "logits/rejected": -1.318791389465332, + "logps/chosen": -44.077110290527344, + "logps/rejected": -67.19015502929688, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031197786331176758, + "rewards/margins": 5.726001262664795, + "rewards/rejected": -5.757198810577393, + "step": 1083 + }, + { + "epoch": 6.423703703703704, + "grad_norm": 5.302432797807315, + "learning_rate": 5.4918802261977067e-08, + "logits/chosen": -1.0582901239395142, + "logits/rejected": -1.1042211055755615, + "logps/chosen": -48.33747100830078, + "logps/rejected": -63.80529022216797, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7692410349845886, + "rewards/margins": 5.0125932693481445, + "rewards/rejected": -5.781834125518799, + "step": 1084 + }, + { + "epoch": 6.42962962962963, + "grad_norm": 4.723133012663554, + "learning_rate": 5.451320165308518e-08, + "logits/chosen": -1.0730984210968018, + "logits/rejected": -1.1216435432434082, + "logps/chosen": -50.3352165222168, + "logps/rejected": -73.663818359375, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.752791404724121, + "rewards/margins": 6.158186435699463, + "rewards/rejected": -7.910977840423584, + "step": 1085 + }, + { + "epoch": 6.435555555555555, + "grad_norm": 3.9126904954362978, + "learning_rate": 5.410892101502118e-08, + "logits/chosen": -1.0621589422225952, + "logits/rejected": -1.1279480457305908, + "logps/chosen": -49.674407958984375, + "logps/rejected": -69.31290435791016, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3062057495117188, + "rewards/margins": 6.739208221435547, + "rewards/rejected": -8.045413970947266, + "step": 1086 + }, + { + "epoch": 6.441481481481482, + "grad_norm": 4.040968337186127, + "learning_rate": 5.370596307757885e-08, + "logits/chosen": -1.2488672733306885, + "logits/rejected": -1.2570903301239014, + "logps/chosen": -44.28248596191406, + "logps/rejected": -78.1733627319336, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5499465465545654, + "rewards/margins": 6.733985424041748, + "rewards/rejected": -7.283932209014893, + "step": 1087 + }, + { + "epoch": 6.4474074074074075, + "grad_norm": 5.338352284785468, + "learning_rate": 5.330433056162084e-08, + "logits/chosen": -1.4356791973114014, + "logits/rejected": -1.409859538078308, + "logps/chosen": -47.546478271484375, + "logps/rejected": -56.075843811035156, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6689544916152954, + "rewards/margins": 4.289015769958496, + "rewards/rejected": -4.95797061920166, + "step": 1088 + }, + { + "epoch": 6.453333333333333, + "grad_norm": 6.237433249524388, + "learning_rate": 5.29040261790602e-08, + "logits/chosen": -1.2191461324691772, + "logits/rejected": -1.2319748401641846, + "logps/chosen": -64.38487243652344, + "logps/rejected": -80.97091674804688, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5654529333114624, + "rewards/margins": 6.002832412719727, + "rewards/rejected": -6.568285942077637, + "step": 1089 + }, + { + "epoch": 6.459259259259259, + "grad_norm": 5.076848534654584, + "learning_rate": 5.2505052632842187e-08, + "logits/chosen": -1.0852857828140259, + "logits/rejected": -1.1707793474197388, + "logps/chosen": -49.97313690185547, + "logps/rejected": -56.366294860839844, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5606622695922852, + "rewards/margins": 4.890949726104736, + "rewards/rejected": -5.451611518859863, + "step": 1090 + }, + { + "epoch": 6.465185185185185, + "grad_norm": 3.027156656004254, + "learning_rate": 5.210741261692586e-08, + "logits/chosen": -1.139082670211792, + "logits/rejected": -1.4209345579147339, + "logps/chosen": -41.075599670410156, + "logps/rejected": -70.40779113769531, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8302870988845825, + "rewards/margins": 6.642550468444824, + "rewards/rejected": -7.472837924957275, + "step": 1091 + }, + { + "epoch": 6.471111111111111, + "grad_norm": 4.322364950918871, + "learning_rate": 5.171110881626603e-08, + "logits/chosen": -1.0543513298034668, + "logits/rejected": -1.0842859745025635, + "logps/chosen": -45.20751190185547, + "logps/rejected": -62.549015045166016, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9526963829994202, + "rewards/margins": 5.437678337097168, + "rewards/rejected": -6.390375137329102, + "step": 1092 + }, + { + "epoch": 6.477037037037037, + "grad_norm": 4.851644975850725, + "learning_rate": 5.1316143906795175e-08, + "logits/chosen": -1.1379737854003906, + "logits/rejected": -1.185952067375183, + "logps/chosen": -56.87077331542969, + "logps/rejected": -76.13067626953125, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.251248598098755, + "rewards/margins": 6.4593071937561035, + "rewards/rejected": -8.710555076599121, + "step": 1093 + }, + { + "epoch": 6.482962962962963, + "grad_norm": 4.68168860240618, + "learning_rate": 5.092252055540513e-08, + "logits/chosen": -1.1433980464935303, + "logits/rejected": -1.233651876449585, + "logps/chosen": -51.71131134033203, + "logps/rejected": -67.805908203125, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38715383410453796, + "rewards/margins": 4.533756732940674, + "rewards/rejected": -4.920910835266113, + "step": 1094 + }, + { + "epoch": 6.488888888888889, + "grad_norm": 5.654048431405952, + "learning_rate": 5.053024141992934e-08, + "logits/chosen": -1.207375407218933, + "logits/rejected": -1.3669353723526, + "logps/chosen": -39.886138916015625, + "logps/rejected": -50.32880783081055, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5060518980026245, + "rewards/margins": 3.433199882507324, + "rewards/rejected": -3.9392518997192383, + "step": 1095 + }, + { + "epoch": 6.494814814814815, + "grad_norm": 5.999658315264965, + "learning_rate": 5.013930914912476e-08, + "logits/chosen": -1.114271879196167, + "logits/rejected": -1.1348860263824463, + "logps/chosen": -34.48630142211914, + "logps/rejected": -63.001434326171875, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.942639946937561, + "rewards/margins": 5.111496925354004, + "rewards/rejected": -6.054136276245117, + "step": 1096 + }, + { + "epoch": 6.50074074074074, + "grad_norm": 4.615994178463875, + "learning_rate": 4.97497263826539e-08, + "logits/chosen": -1.0940725803375244, + "logits/rejected": -1.2616147994995117, + "logps/chosen": -40.44831085205078, + "logps/rejected": -75.76023864746094, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6569719910621643, + "rewards/margins": 6.785313606262207, + "rewards/rejected": -7.442286014556885, + "step": 1097 + }, + { + "epoch": 6.506666666666667, + "grad_norm": 3.928446346908786, + "learning_rate": 4.936149575106727e-08, + "logits/chosen": -1.2776455879211426, + "logits/rejected": -1.3003250360488892, + "logps/chosen": -58.20839309692383, + "logps/rejected": -66.91224670410156, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9391154050827026, + "rewards/margins": 4.6472015380859375, + "rewards/rejected": -5.58631706237793, + "step": 1098 + }, + { + "epoch": 6.5125925925925925, + "grad_norm": 4.992697059243111, + "learning_rate": 4.897461987578541e-08, + "logits/chosen": -1.1780474185943604, + "logits/rejected": -1.1831226348876953, + "logps/chosen": -36.937774658203125, + "logps/rejected": -58.36042404174805, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03631478548049927, + "rewards/margins": 5.19421911239624, + "rewards/rejected": -5.157904624938965, + "step": 1099 + }, + { + "epoch": 6.518518518518518, + "grad_norm": 4.0438360078549875, + "learning_rate": 4.8589101369081235e-08, + "logits/chosen": -1.0455366373062134, + "logits/rejected": -1.1880236864089966, + "logps/chosen": -48.7999267578125, + "logps/rejected": -72.28286743164062, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7477136850357056, + "rewards/margins": 4.7201337814331055, + "rewards/rejected": -5.4678473472595215, + "step": 1100 + }, + { + "epoch": 6.524444444444445, + "grad_norm": 5.630636162552549, + "learning_rate": 4.8204942834062373e-08, + "logits/chosen": -1.1372640132904053, + "logits/rejected": -1.1603807210922241, + "logps/chosen": -37.56422805786133, + "logps/rejected": -57.50593948364258, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9024946093559265, + "rewards/margins": 4.253570556640625, + "rewards/rejected": -5.156064987182617, + "step": 1101 + }, + { + "epoch": 6.53037037037037, + "grad_norm": 4.3142630964680135, + "learning_rate": 4.7822146864653744e-08, + "logits/chosen": -1.2611795663833618, + "logits/rejected": -1.3063312768936157, + "logps/chosen": -57.0969123840332, + "logps/rejected": -77.35307312011719, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8758964538574219, + "rewards/margins": 6.118692398071289, + "rewards/rejected": -6.994588851928711, + "step": 1102 + }, + { + "epoch": 6.536296296296296, + "grad_norm": 3.921713940170822, + "learning_rate": 4.744071604557978e-08, + "logits/chosen": -1.0361847877502441, + "logits/rejected": -1.1400340795516968, + "logps/chosen": -43.746986389160156, + "logps/rejected": -58.988487243652344, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35746294260025024, + "rewards/margins": 4.024482727050781, + "rewards/rejected": -4.381945610046387, + "step": 1103 + }, + { + "epoch": 6.542222222222223, + "grad_norm": 3.209360528145483, + "learning_rate": 4.706065295234718e-08, + "logits/chosen": -1.0801401138305664, + "logits/rejected": -1.191404938697815, + "logps/chosen": -54.80885696411133, + "logps/rejected": -87.4782485961914, + "loss": 0.0236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2579131126403809, + "rewards/margins": 6.139656066894531, + "rewards/rejected": -7.397568702697754, + "step": 1104 + }, + { + "epoch": 6.548148148148148, + "grad_norm": 3.613238422110268, + "learning_rate": 4.668196015122736e-08, + "logits/chosen": -1.3278753757476807, + "logits/rejected": -1.2881349325180054, + "logps/chosen": -46.13524627685547, + "logps/rejected": -61.586280822753906, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36166372895240784, + "rewards/margins": 4.425073146820068, + "rewards/rejected": -4.786736488342285, + "step": 1105 + }, + { + "epoch": 6.554074074074074, + "grad_norm": 5.514027809117569, + "learning_rate": 4.630464019923932e-08, + "logits/chosen": -1.19566810131073, + "logits/rejected": -1.2912870645523071, + "logps/chosen": -51.66880798339844, + "logps/rejected": -61.59565734863281, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7243253588676453, + "rewards/margins": 4.674445152282715, + "rewards/rejected": -5.398770809173584, + "step": 1106 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 2.6348077760574973, + "learning_rate": 4.5928695644132266e-08, + "logits/chosen": -1.0767490863800049, + "logits/rejected": -1.175428032875061, + "logps/chosen": -44.48492431640625, + "logps/rejected": -67.7999496459961, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3176645040512085, + "rewards/margins": 6.277517318725586, + "rewards/rejected": -7.595181465148926, + "step": 1107 + }, + { + "epoch": 6.565925925925926, + "grad_norm": 3.981264192762706, + "learning_rate": 4.5554129024368334e-08, + "logits/chosen": -1.176161289215088, + "logits/rejected": -1.1765999794006348, + "logps/chosen": -47.69729232788086, + "logps/rejected": -73.8935546875, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.450006365776062, + "rewards/margins": 6.2602643966674805, + "rewards/rejected": -7.710270881652832, + "step": 1108 + }, + { + "epoch": 6.571851851851852, + "grad_norm": 4.583614175137326, + "learning_rate": 4.5180942869105594e-08, + "logits/chosen": -0.8451076745986938, + "logits/rejected": -0.958713948726654, + "logps/chosen": -52.66331100463867, + "logps/rejected": -64.74360656738281, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3362882137298584, + "rewards/margins": 5.263598442077637, + "rewards/rejected": -6.599886894226074, + "step": 1109 + }, + { + "epoch": 6.5777777777777775, + "grad_norm": 3.409875007519284, + "learning_rate": 4.480913969818098e-08, + "logits/chosen": -1.1373850107192993, + "logits/rejected": -1.2118630409240723, + "logps/chosen": -48.01524353027344, + "logps/rejected": -75.55987548828125, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3638169765472412, + "rewards/margins": 6.17181921005249, + "rewards/rejected": -7.535635948181152, + "step": 1110 + }, + { + "epoch": 6.583703703703704, + "grad_norm": 4.606324800785139, + "learning_rate": 4.4438722022092925e-08, + "logits/chosen": -0.9775791764259338, + "logits/rejected": -1.181314468383789, + "logps/chosen": -47.213096618652344, + "logps/rejected": -72.06116485595703, + "loss": 0.0308, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4981868267059326, + "rewards/margins": 6.857434272766113, + "rewards/rejected": -8.355621337890625, + "step": 1111 + }, + { + "epoch": 6.58962962962963, + "grad_norm": 3.7424781322340253, + "learning_rate": 4.406969234198507e-08, + "logits/chosen": -1.1918004751205444, + "logits/rejected": -1.1910879611968994, + "logps/chosen": -52.43225860595703, + "logps/rejected": -81.70747375488281, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.601660132408142, + "rewards/margins": 7.3738813400268555, + "rewards/rejected": -8.975541114807129, + "step": 1112 + }, + { + "epoch": 6.595555555555555, + "grad_norm": 6.664166725738044, + "learning_rate": 4.370205314962872e-08, + "logits/chosen": -1.107804298400879, + "logits/rejected": -1.1111526489257812, + "logps/chosen": -51.81007385253906, + "logps/rejected": -62.18156051635742, + "loss": 0.0454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7900987267494202, + "rewards/margins": 4.66523551940918, + "rewards/rejected": -5.455334186553955, + "step": 1113 + }, + { + "epoch": 6.601481481481482, + "grad_norm": 3.960910024090369, + "learning_rate": 4.333580692740643e-08, + "logits/chosen": -0.9758756160736084, + "logits/rejected": -1.160149335861206, + "logps/chosen": -32.102787017822266, + "logps/rejected": -63.1169548034668, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2916122078895569, + "rewards/margins": 5.569136619567871, + "rewards/rejected": -5.860749244689941, + "step": 1114 + }, + { + "epoch": 6.607407407407408, + "grad_norm": 5.435914920357414, + "learning_rate": 4.2970956148295075e-08, + "logits/chosen": -1.217504620552063, + "logits/rejected": -1.211649775505066, + "logps/chosen": -38.41328430175781, + "logps/rejected": -60.064430236816406, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10607568919658661, + "rewards/margins": 5.275820255279541, + "rewards/rejected": -5.381896018981934, + "step": 1115 + }, + { + "epoch": 6.613333333333333, + "grad_norm": 3.2372352578704504, + "learning_rate": 4.260750327584911e-08, + "logits/chosen": -1.2909510135650635, + "logits/rejected": -1.3777996301651, + "logps/chosen": -48.34864807128906, + "logps/rejected": -63.35348129272461, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2591802179813385, + "rewards/margins": 5.086357116699219, + "rewards/rejected": -5.3455376625061035, + "step": 1116 + }, + { + "epoch": 6.619259259259259, + "grad_norm": 4.5060587727117865, + "learning_rate": 4.2245450764184095e-08, + "logits/chosen": -0.9973583221435547, + "logits/rejected": -1.065267562866211, + "logps/chosen": -54.38595962524414, + "logps/rejected": -79.28496551513672, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.91830974817276, + "rewards/margins": 6.978662490844727, + "rewards/rejected": -7.896972179412842, + "step": 1117 + }, + { + "epoch": 6.6251851851851855, + "grad_norm": 4.97344196566781, + "learning_rate": 4.188480105796005e-08, + "logits/chosen": -1.1831059455871582, + "logits/rejected": -1.311706304550171, + "logps/chosen": -43.16436004638672, + "logps/rejected": -64.7727279663086, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9032515287399292, + "rewards/margins": 5.030428409576416, + "rewards/rejected": -5.933679580688477, + "step": 1118 + }, + { + "epoch": 6.631111111111111, + "grad_norm": 5.224006568746007, + "learning_rate": 4.1525556592364843e-08, + "logits/chosen": -1.2230312824249268, + "logits/rejected": -1.332690715789795, + "logps/chosen": -62.0909423828125, + "logps/rejected": -71.77101135253906, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2376177310943604, + "rewards/margins": 4.808589935302734, + "rewards/rejected": -6.046207904815674, + "step": 1119 + }, + { + "epoch": 6.637037037037037, + "grad_norm": 2.4418988617214876, + "learning_rate": 4.116771979309797e-08, + "logits/chosen": -1.0453990697860718, + "logits/rejected": -1.198245882987976, + "logps/chosen": -37.76801300048828, + "logps/rejected": -78.61469268798828, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9890796542167664, + "rewards/margins": 6.903560638427734, + "rewards/rejected": -7.892640113830566, + "step": 1120 + }, + { + "epoch": 6.642962962962963, + "grad_norm": 4.550581764599643, + "learning_rate": 4.081129307635389e-08, + "logits/chosen": -1.2914384603500366, + "logits/rejected": -1.3126540184020996, + "logps/chosen": -41.57066345214844, + "logps/rejected": -60.63642883300781, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47267046570777893, + "rewards/margins": 4.954557418823242, + "rewards/rejected": -5.427227973937988, + "step": 1121 + }, + { + "epoch": 6.648888888888889, + "grad_norm": 3.4786713866790477, + "learning_rate": 4.045627884880606e-08, + "logits/chosen": -1.0296213626861572, + "logits/rejected": -1.0632151365280151, + "logps/chosen": -51.90285873413086, + "logps/rejected": -85.03942108154297, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3325456380844116, + "rewards/margins": 6.909428596496582, + "rewards/rejected": -7.241974830627441, + "step": 1122 + }, + { + "epoch": 6.654814814814815, + "grad_norm": 4.9375617411503505, + "learning_rate": 4.010267950759025e-08, + "logits/chosen": -1.0786218643188477, + "logits/rejected": -1.2250795364379883, + "logps/chosen": -50.944637298583984, + "logps/rejected": -80.50453186035156, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01511731743812561, + "rewards/margins": 6.7941389083862305, + "rewards/rejected": -6.809256553649902, + "step": 1123 + }, + { + "epoch": 6.66074074074074, + "grad_norm": 3.1662482794291806, + "learning_rate": 3.9750497440288935e-08, + "logits/chosen": -1.2261343002319336, + "logits/rejected": -1.1729711294174194, + "logps/chosen": -60.549949645996094, + "logps/rejected": -69.66276550292969, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.525669813156128, + "rewards/margins": 5.298177719116211, + "rewards/rejected": -6.823847770690918, + "step": 1124 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 5.42672364034126, + "learning_rate": 3.9399735024914475e-08, + "logits/chosen": -1.334367036819458, + "logits/rejected": -1.282650113105774, + "logps/chosen": -44.250877380371094, + "logps/rejected": -54.64846420288086, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5066497921943665, + "rewards/margins": 4.384808540344238, + "rewards/rejected": -4.891458511352539, + "step": 1125 + }, + { + "epoch": 6.672592592592593, + "grad_norm": 3.901638006008318, + "learning_rate": 3.905039462989365e-08, + "logits/chosen": -1.0305792093276978, + "logits/rejected": -1.0763057470321655, + "logps/chosen": -56.9117431640625, + "logps/rejected": -71.73992919921875, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.381123661994934, + "rewards/margins": 6.097377300262451, + "rewards/rejected": -7.478500843048096, + "step": 1126 + }, + { + "epoch": 6.678518518518518, + "grad_norm": 4.314515236806422, + "learning_rate": 3.8702478614051345e-08, + "logits/chosen": -0.970765233039856, + "logits/rejected": -0.9876160621643066, + "logps/chosen": -38.278194427490234, + "logps/rejected": -62.56399917602539, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34831175208091736, + "rewards/margins": 4.976291656494141, + "rewards/rejected": -4.6279802322387695, + "step": 1127 + }, + { + "epoch": 6.684444444444445, + "grad_norm": 4.337826544829072, + "learning_rate": 3.835598932659476e-08, + "logits/chosen": -1.2097077369689941, + "logits/rejected": -1.2699313163757324, + "logps/chosen": -52.94355773925781, + "logps/rejected": -80.71701049804688, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8386217355728149, + "rewards/margins": 6.98582649230957, + "rewards/rejected": -7.824448585510254, + "step": 1128 + }, + { + "epoch": 6.6903703703703705, + "grad_norm": 4.042784792768212, + "learning_rate": 3.801092910709749e-08, + "logits/chosen": -1.3506526947021484, + "logits/rejected": -1.4134248495101929, + "logps/chosen": -48.646488189697266, + "logps/rejected": -74.74803924560547, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7620642185211182, + "rewards/margins": 5.172915935516357, + "rewards/rejected": -5.934980392456055, + "step": 1129 + }, + { + "epoch": 6.696296296296296, + "grad_norm": 3.4781757684910475, + "learning_rate": 3.766730028548376e-08, + "logits/chosen": -1.2849725484848022, + "logits/rejected": -1.3207203149795532, + "logps/chosen": -52.43779754638672, + "logps/rejected": -69.90351867675781, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.533517599105835, + "rewards/margins": 6.085549354553223, + "rewards/rejected": -7.619067192077637, + "step": 1130 + }, + { + "epoch": 6.702222222222222, + "grad_norm": 3.4010729959792934, + "learning_rate": 3.732510518201265e-08, + "logits/chosen": -0.9972199201583862, + "logits/rejected": -0.9870040416717529, + "logps/chosen": -60.04631805419922, + "logps/rejected": -71.98896026611328, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5988143682479858, + "rewards/margins": 5.482958793640137, + "rewards/rejected": -7.081772804260254, + "step": 1131 + }, + { + "epoch": 6.708148148148148, + "grad_norm": 4.71128938913975, + "learning_rate": 3.698434610726245e-08, + "logits/chosen": -1.2611842155456543, + "logits/rejected": -1.335821509361267, + "logps/chosen": -46.10606384277344, + "logps/rejected": -79.79691314697266, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6949688792228699, + "rewards/margins": 6.835414886474609, + "rewards/rejected": -7.530384063720703, + "step": 1132 + }, + { + "epoch": 6.714074074074074, + "grad_norm": 5.831314985346632, + "learning_rate": 3.6645025362115e-08, + "logits/chosen": -1.0676662921905518, + "logits/rejected": -1.211071491241455, + "logps/chosen": -57.91938400268555, + "logps/rejected": -74.5068359375, + "loss": 0.0415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2757381796836853, + "rewards/margins": 6.2339253425598145, + "rewards/rejected": -6.5096635818481445, + "step": 1133 + }, + { + "epoch": 6.72, + "grad_norm": 4.274701107831341, + "learning_rate": 3.630714523774042e-08, + "logits/chosen": -1.0759488344192505, + "logits/rejected": -1.2164721488952637, + "logps/chosen": -53.31672286987305, + "logps/rejected": -85.29515838623047, + "loss": 0.0294, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1152231693267822, + "rewards/margins": 6.755128860473633, + "rewards/rejected": -7.870352268218994, + "step": 1134 + }, + { + "epoch": 6.725925925925926, + "grad_norm": 3.4258860662014246, + "learning_rate": 3.597070801558122e-08, + "logits/chosen": -1.2027193307876587, + "logits/rejected": -1.2718777656555176, + "logps/chosen": -43.425384521484375, + "logps/rejected": -81.82745361328125, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1736352443695068, + "rewards/margins": 7.256280899047852, + "rewards/rejected": -8.429915428161621, + "step": 1135 + }, + { + "epoch": 6.731851851851852, + "grad_norm": 5.044131099305807, + "learning_rate": 3.563571596733722e-08, + "logits/chosen": -1.3531447649002075, + "logits/rejected": -1.4350122213363647, + "logps/chosen": -47.775699615478516, + "logps/rejected": -75.65559387207031, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.998325765132904, + "rewards/margins": 4.952122688293457, + "rewards/rejected": -5.950448036193848, + "step": 1136 + }, + { + "epoch": 6.737777777777778, + "grad_norm": 3.879818573961412, + "learning_rate": 3.530217135495006e-08, + "logits/chosen": -1.0087562799453735, + "logits/rejected": -1.107908844947815, + "logps/chosen": -40.937007904052734, + "logps/rejected": -78.79791259765625, + "loss": 0.0317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9164285659790039, + "rewards/margins": 6.570706367492676, + "rewards/rejected": -7.48713493347168, + "step": 1137 + }, + { + "epoch": 6.743703703703703, + "grad_norm": 3.8451358515328966, + "learning_rate": 3.4970076430588027e-08, + "logits/chosen": -0.9816167950630188, + "logits/rejected": -1.1667194366455078, + "logps/chosen": -36.36294937133789, + "logps/rejected": -83.9415054321289, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2798038423061371, + "rewards/margins": 6.795148849487305, + "rewards/rejected": -7.074953079223633, + "step": 1138 + }, + { + "epoch": 6.74962962962963, + "grad_norm": 4.665882979218104, + "learning_rate": 3.463943343663065e-08, + "logits/chosen": -1.1199207305908203, + "logits/rejected": -1.1523513793945312, + "logps/chosen": -57.47267150878906, + "logps/rejected": -87.13018798828125, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.884210467338562, + "rewards/margins": 6.462403297424316, + "rewards/rejected": -8.346613883972168, + "step": 1139 + }, + { + "epoch": 6.7555555555555555, + "grad_norm": 4.597748052381748, + "learning_rate": 3.4310244605653795e-08, + "logits/chosen": -1.1553621292114258, + "logits/rejected": -1.3642417192459106, + "logps/chosen": -57.56583023071289, + "logps/rejected": -76.71488952636719, + "loss": 0.0353, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8032940626144409, + "rewards/margins": 6.201355934143066, + "rewards/rejected": -7.004650115966797, + "step": 1140 + }, + { + "epoch": 6.761481481481481, + "grad_norm": 3.8361199615022836, + "learning_rate": 3.3982512160414505e-08, + "logits/chosen": -1.1486549377441406, + "logits/rejected": -1.2162824869155884, + "logps/chosen": -52.28083801269531, + "logps/rejected": -80.21273803710938, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.817528247833252, + "rewards/margins": 6.002938747406006, + "rewards/rejected": -6.820467948913574, + "step": 1141 + }, + { + "epoch": 6.767407407407408, + "grad_norm": 3.1360553065274446, + "learning_rate": 3.365623831383599e-08, + "logits/chosen": -1.184605598449707, + "logits/rejected": -1.286274790763855, + "logps/chosen": -51.320716857910156, + "logps/rejected": -74.52813720703125, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1115386486053467, + "rewards/margins": 6.255901336669922, + "rewards/rejected": -7.367440223693848, + "step": 1142 + }, + { + "epoch": 6.773333333333333, + "grad_norm": 4.915326465375198, + "learning_rate": 3.3331425268992547e-08, + "logits/chosen": -1.0797728300094604, + "logits/rejected": -1.0414516925811768, + "logps/chosen": -40.94459915161133, + "logps/rejected": -57.521244049072266, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39861059188842773, + "rewards/margins": 5.553493499755859, + "rewards/rejected": -5.952103614807129, + "step": 1143 + }, + { + "epoch": 6.779259259259259, + "grad_norm": 4.6607098966794895, + "learning_rate": 3.3008075219095045e-08, + "logits/chosen": -1.36018705368042, + "logits/rejected": -1.3997021913528442, + "logps/chosen": -56.11627197265625, + "logps/rejected": -75.92520141601562, + "loss": 0.0308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5371124148368835, + "rewards/margins": 5.539422512054443, + "rewards/rejected": -6.076535224914551, + "step": 1144 + }, + { + "epoch": 6.785185185185185, + "grad_norm": 4.308322609065028, + "learning_rate": 3.268619034747566e-08, + "logits/chosen": -1.0334553718566895, + "logits/rejected": -1.0797995328903198, + "logps/chosen": -44.05830764770508, + "logps/rejected": -73.79558563232422, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4985544681549072, + "rewards/margins": 6.261322498321533, + "rewards/rejected": -7.7598772048950195, + "step": 1145 + }, + { + "epoch": 6.791111111111111, + "grad_norm": 2.806649138401611, + "learning_rate": 3.236577282757347e-08, + "logits/chosen": -1.1216260194778442, + "logits/rejected": -1.196481704711914, + "logps/chosen": -52.37408447265625, + "logps/rejected": -60.706138610839844, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9106160402297974, + "rewards/margins": 4.475264072418213, + "rewards/rejected": -5.385880470275879, + "step": 1146 + }, + { + "epoch": 6.797037037037037, + "grad_norm": 4.0319117047594295, + "learning_rate": 3.204682482291959e-08, + "logits/chosen": -1.2100169658660889, + "logits/rejected": -1.2417694330215454, + "logps/chosen": -48.793582916259766, + "logps/rejected": -58.74952697753906, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7654832601547241, + "rewards/margins": 4.446800231933594, + "rewards/rejected": -5.212283611297607, + "step": 1147 + }, + { + "epoch": 6.802962962962963, + "grad_norm": 4.9909518633601255, + "learning_rate": 3.172934848712272e-08, + "logits/chosen": -1.1236099004745483, + "logits/rejected": -1.1841087341308594, + "logps/chosen": -38.755008697509766, + "logps/rejected": -70.70816040039062, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.653393030166626, + "rewards/margins": 5.56360387802124, + "rewards/rejected": -6.216997146606445, + "step": 1148 + }, + { + "epoch": 6.808888888888889, + "grad_norm": 3.4532209121659263, + "learning_rate": 3.141334596385447e-08, + "logits/chosen": -1.0277100801467896, + "logits/rejected": -1.0570564270019531, + "logps/chosen": -46.61890411376953, + "logps/rejected": -69.73439025878906, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9495557546615601, + "rewards/margins": 6.949012756347656, + "rewards/rejected": -7.898568153381348, + "step": 1149 + }, + { + "epoch": 6.814814814814815, + "grad_norm": 5.871538897220252, + "learning_rate": 3.109881938683492e-08, + "logits/chosen": -1.1937267780303955, + "logits/rejected": -1.285079002380371, + "logps/chosen": -36.37516784667969, + "logps/rejected": -74.77618408203125, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38637667894363403, + "rewards/margins": 7.374358177185059, + "rewards/rejected": -7.760734558105469, + "step": 1150 + }, + { + "epoch": 6.8207407407407405, + "grad_norm": 7.088095822908057, + "learning_rate": 3.078577087981832e-08, + "logits/chosen": -1.113420009613037, + "logits/rejected": -1.0938458442687988, + "logps/chosen": -50.963314056396484, + "logps/rejected": -84.46855163574219, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.373986840248108, + "rewards/margins": 6.448185920715332, + "rewards/rejected": -7.822172164916992, + "step": 1151 + }, + { + "epoch": 6.826666666666666, + "grad_norm": 4.222520378565838, + "learning_rate": 3.047420255657851e-08, + "logits/chosen": -1.3878954648971558, + "logits/rejected": -1.4309817552566528, + "logps/chosen": -47.81982421875, + "logps/rejected": -63.69879913330078, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8947990536689758, + "rewards/margins": 5.389825344085693, + "rewards/rejected": -6.2846245765686035, + "step": 1152 + }, + { + "epoch": 6.832592592592593, + "grad_norm": 4.126842704006626, + "learning_rate": 3.016411652089493e-08, + "logits/chosen": -1.2584927082061768, + "logits/rejected": -1.3139593601226807, + "logps/chosen": -45.37017822265625, + "logps/rejected": -72.07698059082031, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2568224668502808, + "rewards/margins": 5.785853385925293, + "rewards/rejected": -7.042675971984863, + "step": 1153 + }, + { + "epoch": 6.838518518518518, + "grad_norm": 3.3946248488291246, + "learning_rate": 2.985551486653823e-08, + "logits/chosen": -1.1920702457427979, + "logits/rejected": -1.3103349208831787, + "logps/chosen": -46.732398986816406, + "logps/rejected": -82.87397766113281, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6517075300216675, + "rewards/margins": 7.334443092346191, + "rewards/rejected": -7.986151218414307, + "step": 1154 + }, + { + "epoch": 6.844444444444444, + "grad_norm": 3.4037349577235188, + "learning_rate": 2.954839967725617e-08, + "logits/chosen": -1.303606390953064, + "logits/rejected": -1.3001573085784912, + "logps/chosen": -58.52591323852539, + "logps/rejected": -64.23796081542969, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1647790670394897, + "rewards/margins": 5.34108829498291, + "rewards/rejected": -6.5058674812316895, + "step": 1155 + }, + { + "epoch": 6.850370370370371, + "grad_norm": 5.007534848746155, + "learning_rate": 2.924277302675962e-08, + "logits/chosen": -1.1423561573028564, + "logits/rejected": -1.1317256689071655, + "logps/chosen": -52.439727783203125, + "logps/rejected": -66.79461669921875, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7898544073104858, + "rewards/margins": 5.347036361694336, + "rewards/rejected": -6.136890888214111, + "step": 1156 + }, + { + "epoch": 6.856296296296296, + "grad_norm": 4.863091162067616, + "learning_rate": 2.893863697870841e-08, + "logits/chosen": -1.1290535926818848, + "logits/rejected": -1.1492767333984375, + "logps/chosen": -51.96717071533203, + "logps/rejected": -61.89137268066406, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8053914308547974, + "rewards/margins": 4.911278247833252, + "rewards/rejected": -5.71666955947876, + "step": 1157 + }, + { + "epoch": 6.862222222222222, + "grad_norm": 4.046600505355742, + "learning_rate": 2.863599358669755e-08, + "logits/chosen": -1.3739553689956665, + "logits/rejected": -1.424708366394043, + "logps/chosen": -55.446754455566406, + "logps/rejected": -78.931884765625, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08441561460494995, + "rewards/margins": 6.4412431716918945, + "rewards/rejected": -6.52565860748291, + "step": 1158 + }, + { + "epoch": 6.868148148148148, + "grad_norm": 4.797543953031611, + "learning_rate": 2.8334844894243287e-08, + "logits/chosen": -1.2137575149536133, + "logits/rejected": -1.211321234703064, + "logps/chosen": -74.98422241210938, + "logps/rejected": -88.64210510253906, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.568566918373108, + "rewards/margins": 5.5392656326293945, + "rewards/rejected": -7.107832431793213, + "step": 1159 + }, + { + "epoch": 6.874074074074074, + "grad_norm": 4.909249345613524, + "learning_rate": 2.803519293476936e-08, + "logits/chosen": -1.3701822757720947, + "logits/rejected": -1.3919110298156738, + "logps/chosen": -51.087623596191406, + "logps/rejected": -68.07141876220703, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7531724572181702, + "rewards/margins": 6.090072154998779, + "rewards/rejected": -6.843244552612305, + "step": 1160 + }, + { + "epoch": 6.88, + "grad_norm": 4.722536988352096, + "learning_rate": 2.7737039731593138e-08, + "logits/chosen": -1.246882677078247, + "logits/rejected": -1.3100802898406982, + "logps/chosen": -63.127540588378906, + "logps/rejected": -75.79826354980469, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8006022572517395, + "rewards/margins": 4.3295793533325195, + "rewards/rejected": -5.130181789398193, + "step": 1161 + }, + { + "epoch": 6.885925925925926, + "grad_norm": 3.4791404660178036, + "learning_rate": 2.7440387297912122e-08, + "logits/chosen": -1.0389580726623535, + "logits/rejected": -1.0971720218658447, + "logps/chosen": -41.66819763183594, + "logps/rejected": -66.16861724853516, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5657974481582642, + "rewards/margins": 6.39125394821167, + "rewards/rejected": -5.825456619262695, + "step": 1162 + }, + { + "epoch": 6.891851851851852, + "grad_norm": 3.411294212428707, + "learning_rate": 2.7145237636790276e-08, + "logits/chosen": -1.1562262773513794, + "logits/rejected": -1.1969718933105469, + "logps/chosen": -48.71192932128906, + "logps/rejected": -68.86924743652344, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32160070538520813, + "rewards/margins": 5.062057018280029, + "rewards/rejected": -5.383657455444336, + "step": 1163 + }, + { + "epoch": 6.897777777777778, + "grad_norm": 4.292288857190118, + "learning_rate": 2.685159274114443e-08, + "logits/chosen": -1.2545537948608398, + "logits/rejected": -1.2724109888076782, + "logps/chosen": -44.735939025878906, + "logps/rejected": -63.776039123535156, + "loss": 0.0404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2329655885696411, + "rewards/margins": 5.270541191101074, + "rewards/rejected": -5.503507137298584, + "step": 1164 + }, + { + "epoch": 6.9037037037037035, + "grad_norm": 4.226084967042107, + "learning_rate": 2.6559454593731072e-08, + "logits/chosen": -1.0003001689910889, + "logits/rejected": -1.0545388460159302, + "logps/chosen": -50.38365936279297, + "logps/rejected": -84.3214111328125, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7823373079299927, + "rewards/margins": 6.4346923828125, + "rewards/rejected": -8.217029571533203, + "step": 1165 + }, + { + "epoch": 6.90962962962963, + "grad_norm": 2.3928241797464476, + "learning_rate": 2.6268825167132636e-08, + "logits/chosen": -1.0937495231628418, + "logits/rejected": -1.0489561557769775, + "logps/chosen": -47.507728576660156, + "logps/rejected": -68.5752944946289, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18306052684783936, + "rewards/margins": 5.563224792480469, + "rewards/rejected": -5.746285915374756, + "step": 1166 + }, + { + "epoch": 6.915555555555556, + "grad_norm": 3.6898784855448032, + "learning_rate": 2.5979706423744392e-08, + "logits/chosen": -1.246962547302246, + "logits/rejected": -1.2974610328674316, + "logps/chosen": -48.59234619140625, + "logps/rejected": -62.70619201660156, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.167926549911499, + "rewards/margins": 5.319080352783203, + "rewards/rejected": -6.487007141113281, + "step": 1167 + }, + { + "epoch": 6.921481481481481, + "grad_norm": 5.41093492527125, + "learning_rate": 2.5692100315761023e-08, + "logits/chosen": -1.123171329498291, + "logits/rejected": -1.0399839878082275, + "logps/chosen": -74.13027954101562, + "logps/rejected": -95.59664916992188, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8181382417678833, + "rewards/margins": 5.603046417236328, + "rewards/rejected": -7.421184539794922, + "step": 1168 + }, + { + "epoch": 6.927407407407408, + "grad_norm": 2.4175779715247523, + "learning_rate": 2.5406008785163717e-08, + "logits/chosen": -1.2619564533233643, + "logits/rejected": -1.2582941055297852, + "logps/chosen": -58.20111846923828, + "logps/rejected": -74.21757507324219, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2906043529510498, + "rewards/margins": 5.086883544921875, + "rewards/rejected": -6.377488136291504, + "step": 1169 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 3.144038000067894, + "learning_rate": 2.512143376370682e-08, + "logits/chosen": -1.0144656896591187, + "logits/rejected": -1.139504075050354, + "logps/chosen": -40.94801330566406, + "logps/rejected": -58.50607681274414, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32384026050567627, + "rewards/margins": 4.669650554656982, + "rewards/rejected": -4.345810413360596, + "step": 1170 + }, + { + "epoch": 6.939259259259259, + "grad_norm": 5.174012697298315, + "learning_rate": 2.4838377172904907e-08, + "logits/chosen": -1.117937684059143, + "logits/rejected": -1.0629061460494995, + "logps/chosen": -57.18743896484375, + "logps/rejected": -70.98674011230469, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9007591009140015, + "rewards/margins": 5.176748275756836, + "rewards/rejected": -6.077507495880127, + "step": 1171 + }, + { + "epoch": 6.945185185185185, + "grad_norm": 3.0198485141881424, + "learning_rate": 2.455684092401969e-08, + "logits/chosen": -1.0965194702148438, + "logits/rejected": -1.2074683904647827, + "logps/chosen": -33.904380798339844, + "logps/rejected": -64.75767517089844, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016904570162296295, + "rewards/margins": 6.192440986633301, + "rewards/rejected": -6.175536632537842, + "step": 1172 + }, + { + "epoch": 6.9511111111111115, + "grad_norm": 2.978789862112312, + "learning_rate": 2.4276826918047277e-08, + "logits/chosen": -1.2849714756011963, + "logits/rejected": -1.372328519821167, + "logps/chosen": -62.24251937866211, + "logps/rejected": -86.72196960449219, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.981757402420044, + "rewards/margins": 6.014976978302002, + "rewards/rejected": -6.996734619140625, + "step": 1173 + }, + { + "epoch": 6.957037037037037, + "grad_norm": 4.507694763531846, + "learning_rate": 2.399833704570517e-08, + "logits/chosen": -1.2926836013793945, + "logits/rejected": -1.305148959159851, + "logps/chosen": -41.6738395690918, + "logps/rejected": -58.01539611816406, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.233668252825737, + "rewards/margins": 4.863929748535156, + "rewards/rejected": -5.097597599029541, + "step": 1174 + }, + { + "epoch": 6.962962962962963, + "grad_norm": 3.542058417329551, + "learning_rate": 2.372137318741968e-08, + "logits/chosen": -1.116047739982605, + "logits/rejected": -1.0878998041152954, + "logps/chosen": -61.21372985839844, + "logps/rejected": -87.04780578613281, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5796356201171875, + "rewards/margins": 5.676990509033203, + "rewards/rejected": -6.256626129150391, + "step": 1175 + }, + { + "epoch": 6.968888888888889, + "grad_norm": 3.7533160256479423, + "learning_rate": 2.3445937213313062e-08, + "logits/chosen": -1.0332417488098145, + "logits/rejected": -1.062882900238037, + "logps/chosen": -67.98162841796875, + "logps/rejected": -80.46829986572266, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.066481113433838, + "rewards/margins": 6.786133289337158, + "rewards/rejected": -7.852613925933838, + "step": 1176 + }, + { + "epoch": 6.974814814814815, + "grad_norm": 3.7587867716967054, + "learning_rate": 2.3172030983190926e-08, + "logits/chosen": -1.0796126127243042, + "logits/rejected": -1.1577988862991333, + "logps/chosen": -44.99909210205078, + "logps/rejected": -59.904823303222656, + "loss": 0.0265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5940638780593872, + "rewards/margins": 5.257760047912598, + "rewards/rejected": -5.851823806762695, + "step": 1177 + }, + { + "epoch": 6.980740740740741, + "grad_norm": 4.682551019970592, + "learning_rate": 2.2899656346529768e-08, + "logits/chosen": -1.3066891431808472, + "logits/rejected": -1.3407034873962402, + "logps/chosen": -51.310062408447266, + "logps/rejected": -53.79896926879883, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3059674501419067, + "rewards/margins": 3.983776330947876, + "rewards/rejected": -5.289743423461914, + "step": 1178 + }, + { + "epoch": 6.986666666666666, + "grad_norm": 4.375765145580466, + "learning_rate": 2.2628815142464342e-08, + "logits/chosen": -1.1559022665023804, + "logits/rejected": -1.1550366878509521, + "logps/chosen": -47.062721252441406, + "logps/rejected": -75.09408569335938, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0067358016967773, + "rewards/margins": 6.008578777313232, + "rewards/rejected": -7.015315055847168, + "step": 1179 + }, + { + "epoch": 6.992592592592593, + "grad_norm": 4.4626280624260435, + "learning_rate": 2.2359509199775446e-08, + "logits/chosen": -0.8305215835571289, + "logits/rejected": -0.870010495185852, + "logps/chosen": -58.898162841796875, + "logps/rejected": -82.0947494506836, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5731546878814697, + "rewards/margins": 7.211018085479736, + "rewards/rejected": -8.784172058105469, + "step": 1180 + }, + { + "epoch": 6.998518518518519, + "grad_norm": 5.073247622191449, + "learning_rate": 2.2091740336877358e-08, + "logits/chosen": -1.0570170879364014, + "logits/rejected": -1.1461822986602783, + "logps/chosen": -52.41472625732422, + "logps/rejected": -97.20932006835938, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4183259010314941, + "rewards/margins": 6.313093185424805, + "rewards/rejected": -7.731419563293457, + "step": 1181 + }, + { + "epoch": 7.004444444444444, + "grad_norm": 3.8474555895231273, + "learning_rate": 2.1825510361805576e-08, + "logits/chosen": -1.2037731409072876, + "logits/rejected": -1.2693171501159668, + "logps/chosen": -41.03017807006836, + "logps/rejected": -64.76742553710938, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.069698765873909, + "rewards/margins": 6.444377899169922, + "rewards/rejected": -6.5140767097473145, + "step": 1182 + }, + { + "epoch": 7.010370370370371, + "grad_norm": 2.9881585778020154, + "learning_rate": 2.156082107220486e-08, + "logits/chosen": -1.073196530342102, + "logits/rejected": -1.1584742069244385, + "logps/chosen": -45.10144805908203, + "logps/rejected": -64.95097351074219, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0941698551177979, + "rewards/margins": 5.135311126708984, + "rewards/rejected": -6.2294816970825195, + "step": 1183 + }, + { + "epoch": 7.0162962962962965, + "grad_norm": 5.276512414256135, + "learning_rate": 2.129767425531673e-08, + "logits/chosen": -1.321061611175537, + "logits/rejected": -1.3409541845321655, + "logps/chosen": -58.39825439453125, + "logps/rejected": -74.25123596191406, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1586742401123047, + "rewards/margins": 5.59089469909668, + "rewards/rejected": -6.749568939208984, + "step": 1184 + }, + { + "epoch": 7.022222222222222, + "grad_norm": 4.469226836673804, + "learning_rate": 2.1036071687967783e-08, + "logits/chosen": -1.1318987607955933, + "logits/rejected": -1.1667543649673462, + "logps/chosen": -56.4000244140625, + "logps/rejected": -64.8052978515625, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6219148635864258, + "rewards/margins": 4.586970806121826, + "rewards/rejected": -6.208885669708252, + "step": 1185 + }, + { + "epoch": 7.028148148148148, + "grad_norm": 4.6713640086194586, + "learning_rate": 2.077601513655733e-08, + "logits/chosen": -1.2517354488372803, + "logits/rejected": -1.342995047569275, + "logps/chosen": -41.56428146362305, + "logps/rejected": -54.71381378173828, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5574272871017456, + "rewards/margins": 4.39409875869751, + "rewards/rejected": -4.951525688171387, + "step": 1186 + }, + { + "epoch": 7.034074074074074, + "grad_norm": 3.914809073017115, + "learning_rate": 2.0517506357045715e-08, + "logits/chosen": -1.1398996114730835, + "logits/rejected": -1.226372241973877, + "logps/chosen": -49.662784576416016, + "logps/rejected": -89.25350952148438, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5754448175430298, + "rewards/margins": 6.921874046325684, + "rewards/rejected": -7.497318267822266, + "step": 1187 + }, + { + "epoch": 7.04, + "grad_norm": 4.107490647117851, + "learning_rate": 2.0260547094942348e-08, + "logits/chosen": -1.140151858329773, + "logits/rejected": -1.2162840366363525, + "logps/chosen": -44.47743225097656, + "logps/rejected": -61.597591400146484, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7503057718276978, + "rewards/margins": 5.507844924926758, + "rewards/rejected": -6.258151054382324, + "step": 1188 + }, + { + "epoch": 7.045925925925926, + "grad_norm": 6.6860011316021275, + "learning_rate": 2.0005139085293942e-08, + "logits/chosen": -1.1185882091522217, + "logits/rejected": -1.1940538883209229, + "logps/chosen": -61.718780517578125, + "logps/rejected": -80.43623352050781, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8451882004737854, + "rewards/margins": 6.592974662780762, + "rewards/rejected": -7.4381632804870605, + "step": 1189 + }, + { + "epoch": 7.051851851851852, + "grad_norm": 4.366270256335024, + "learning_rate": 1.9751284052672873e-08, + "logits/chosen": -1.0844464302062988, + "logits/rejected": -1.151923418045044, + "logps/chosen": -45.92787170410156, + "logps/rejected": -66.75313568115234, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3969569206237793, + "rewards/margins": 5.534310817718506, + "rewards/rejected": -5.931267738342285, + "step": 1190 + }, + { + "epoch": 7.057777777777778, + "grad_norm": 4.510310947071034, + "learning_rate": 1.9498983711165345e-08, + "logits/chosen": -0.9310641884803772, + "logits/rejected": -0.9676129221916199, + "logps/chosen": -44.20957565307617, + "logps/rejected": -80.73597717285156, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5753066539764404, + "rewards/margins": 7.630600929260254, + "rewards/rejected": -9.205906867980957, + "step": 1191 + }, + { + "epoch": 7.063703703703704, + "grad_norm": 3.702033296356849, + "learning_rate": 1.9248239764360048e-08, + "logits/chosen": -1.0506380796432495, + "logits/rejected": -1.1215331554412842, + "logps/chosen": -60.218170166015625, + "logps/rejected": -61.922786712646484, + "loss": 0.0346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5812512040138245, + "rewards/margins": 4.8047709465026855, + "rewards/rejected": -5.386022567749023, + "step": 1192 + }, + { + "epoch": 7.069629629629629, + "grad_norm": 3.9221091135639763, + "learning_rate": 1.899905390533649e-08, + "logits/chosen": -1.0444316864013672, + "logits/rejected": -1.1364364624023438, + "logps/chosen": -49.269317626953125, + "logps/rejected": -74.48098754882812, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8035168647766113, + "rewards/margins": 5.918756008148193, + "rewards/rejected": -6.722272872924805, + "step": 1193 + }, + { + "epoch": 7.075555555555556, + "grad_norm": 3.475733014246764, + "learning_rate": 1.8751427816653618e-08, + "logits/chosen": -1.145141839981079, + "logits/rejected": -1.1588716506958008, + "logps/chosen": -39.95106506347656, + "logps/rejected": -55.05951690673828, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6812821626663208, + "rewards/margins": 5.355863571166992, + "rewards/rejected": -6.037146091461182, + "step": 1194 + }, + { + "epoch": 7.0814814814814815, + "grad_norm": 5.461483104500958, + "learning_rate": 1.8505363170338517e-08, + "logits/chosen": -1.1386044025421143, + "logits/rejected": -1.211320161819458, + "logps/chosen": -54.65919494628906, + "logps/rejected": -76.21634674072266, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6187160015106201, + "rewards/margins": 6.0350341796875, + "rewards/rejected": -6.653749942779541, + "step": 1195 + }, + { + "epoch": 7.087407407407407, + "grad_norm": 3.537228852616376, + "learning_rate": 1.826086162787499e-08, + "logits/chosen": -1.234696388244629, + "logits/rejected": -1.2426447868347168, + "logps/chosen": -49.69206237792969, + "logps/rejected": -59.662872314453125, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.455064058303833, + "rewards/margins": 4.351738452911377, + "rewards/rejected": -4.806802272796631, + "step": 1196 + }, + { + "epoch": 7.093333333333334, + "grad_norm": 4.822202350694429, + "learning_rate": 1.8017924840192433e-08, + "logits/chosen": -1.178159475326538, + "logits/rejected": -1.178950548171997, + "logps/chosen": -49.88624572753906, + "logps/rejected": -64.72603607177734, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6928927898406982, + "rewards/margins": 5.600357532501221, + "rewards/rejected": -6.29325008392334, + "step": 1197 + }, + { + "epoch": 7.099259259259259, + "grad_norm": 2.6458497013464735, + "learning_rate": 1.7776554447654717e-08, + "logits/chosen": -1.0108141899108887, + "logits/rejected": -1.0034370422363281, + "logps/chosen": -46.219276428222656, + "logps/rejected": -61.129150390625, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47822344303131104, + "rewards/margins": 5.80910062789917, + "rewards/rejected": -6.287323951721191, + "step": 1198 + }, + { + "epoch": 7.105185185185185, + "grad_norm": 4.366914015353461, + "learning_rate": 1.7536752080048955e-08, + "logits/chosen": -1.1089617013931274, + "logits/rejected": -1.0810322761535645, + "logps/chosen": -54.921653747558594, + "logps/rejected": -78.62164306640625, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7407786846160889, + "rewards/margins": 5.562222480773926, + "rewards/rejected": -7.303001403808594, + "step": 1199 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 4.761671953709213, + "learning_rate": 1.7298519356574726e-08, + "logits/chosen": -1.31987726688385, + "logits/rejected": -1.3181467056274414, + "logps/chosen": -50.47599792480469, + "logps/rejected": -63.006614685058594, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9234836101531982, + "rewards/margins": 4.418302536010742, + "rewards/rejected": -5.3417863845825195, + "step": 1200 + }, + { + "epoch": 7.117037037037037, + "grad_norm": 4.101005539222599, + "learning_rate": 1.706185788583289e-08, + "logits/chosen": -1.2375043630599976, + "logits/rejected": -1.2391877174377441, + "logps/chosen": -51.08970260620117, + "logps/rejected": -73.64395141601562, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4969186782836914, + "rewards/margins": 6.408106803894043, + "rewards/rejected": -6.905025959014893, + "step": 1201 + }, + { + "epoch": 7.122962962962963, + "grad_norm": 5.073736905951011, + "learning_rate": 1.6826769265815e-08, + "logits/chosen": -1.2112133502960205, + "logits/rejected": -1.2583060264587402, + "logps/chosen": -43.31800842285156, + "logps/rejected": -69.34353637695312, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5988361835479736, + "rewards/margins": 5.34016227722168, + "rewards/rejected": -5.938999176025391, + "step": 1202 + }, + { + "epoch": 7.128888888888889, + "grad_norm": 4.8290789301264, + "learning_rate": 1.6593255083892228e-08, + "logits/chosen": -1.1223499774932861, + "logits/rejected": -1.1824976205825806, + "logps/chosen": -53.52474594116211, + "logps/rejected": -82.81754302978516, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.459550142288208, + "rewards/margins": 6.309607028961182, + "rewards/rejected": -7.7691569328308105, + "step": 1203 + }, + { + "epoch": 7.134814814814815, + "grad_norm": 3.433072476835311, + "learning_rate": 1.6361316916804896e-08, + "logits/chosen": -1.1100119352340698, + "logits/rejected": -1.1950923204421997, + "logps/chosen": -48.51715087890625, + "logps/rejected": -69.97017669677734, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13577726483345032, + "rewards/margins": 5.7283935546875, + "rewards/rejected": -5.592616081237793, + "step": 1204 + }, + { + "epoch": 7.140740740740741, + "grad_norm": 2.8445682548846714, + "learning_rate": 1.6130956330651646e-08, + "logits/chosen": -1.1339422464370728, + "logits/rejected": -1.1973767280578613, + "logps/chosen": -38.242767333984375, + "logps/rejected": -55.585391998291016, + "loss": 0.0234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7097413539886475, + "rewards/margins": 5.838712692260742, + "rewards/rejected": -6.548454284667969, + "step": 1205 + }, + { + "epoch": 7.1466666666666665, + "grad_norm": 3.6214945778065273, + "learning_rate": 1.5902174880878916e-08, + "logits/chosen": -1.0982111692428589, + "logits/rejected": -1.2695142030715942, + "logps/chosen": -42.53776550292969, + "logps/rejected": -65.62220001220703, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09825128316879272, + "rewards/margins": 5.4457688331604, + "rewards/rejected": -5.54401969909668, + "step": 1206 + }, + { + "epoch": 7.152592592592592, + "grad_norm": 3.5602534628119087, + "learning_rate": 1.567497411227059e-08, + "logits/chosen": -1.3598968982696533, + "logits/rejected": -1.3619459867477417, + "logps/chosen": -59.65229797363281, + "logps/rejected": -74.8875961303711, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.338700771331787, + "rewards/margins": 6.547931671142578, + "rewards/rejected": -7.886631965637207, + "step": 1207 + }, + { + "epoch": 7.158518518518519, + "grad_norm": 5.162761564983183, + "learning_rate": 1.5449355558937337e-08, + "logits/chosen": -1.2608649730682373, + "logits/rejected": -1.166750192642212, + "logps/chosen": -61.810123443603516, + "logps/rejected": -73.02722930908203, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4834518432617188, + "rewards/margins": 5.531788349151611, + "rewards/rejected": -7.01524019241333, + "step": 1208 + }, + { + "epoch": 7.164444444444444, + "grad_norm": 6.5270072472063525, + "learning_rate": 1.5225320744306408e-08, + "logits/chosen": -1.0810682773590088, + "logits/rejected": -1.2014857530593872, + "logps/chosen": -42.50169372558594, + "logps/rejected": -72.0819320678711, + "loss": 0.0589, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6136506795883179, + "rewards/margins": 6.531806468963623, + "rewards/rejected": -7.145456790924072, + "step": 1209 + }, + { + "epoch": 7.17037037037037, + "grad_norm": 3.940199460474413, + "learning_rate": 1.5002871181111153e-08, + "logits/chosen": -1.1244795322418213, + "logits/rejected": -1.2221897840499878, + "logps/chosen": -46.386444091796875, + "logps/rejected": -64.42037963867188, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1717915534973145, + "rewards/margins": 5.057167053222656, + "rewards/rejected": -6.228959083557129, + "step": 1210 + }, + { + "epoch": 7.176296296296297, + "grad_norm": 4.069926138117332, + "learning_rate": 1.4782008371381105e-08, + "logits/chosen": -1.1110057830810547, + "logits/rejected": -1.1199915409088135, + "logps/chosen": -50.08976364135742, + "logps/rejected": -73.6676254272461, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0431034564971924, + "rewards/margins": 5.300725936889648, + "rewards/rejected": -7.34382963180542, + "step": 1211 + }, + { + "epoch": 7.182222222222222, + "grad_norm": 4.768911394055649, + "learning_rate": 1.4562733806431666e-08, + "logits/chosen": -1.1371710300445557, + "logits/rejected": -1.261394739151001, + "logps/chosen": -40.512229919433594, + "logps/rejected": -63.53961181640625, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15543416142463684, + "rewards/margins": 5.225121974945068, + "rewards/rejected": -5.069687366485596, + "step": 1212 + }, + { + "epoch": 7.188148148148148, + "grad_norm": 3.9377568820612456, + "learning_rate": 1.434504896685393e-08, + "logits/chosen": -1.144298791885376, + "logits/rejected": -1.2841874361038208, + "logps/chosen": -48.18315505981445, + "logps/rejected": -60.86648178100586, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1481950581073761, + "rewards/margins": 4.970972537994385, + "rewards/rejected": -5.119167327880859, + "step": 1213 + }, + { + "epoch": 7.194074074074074, + "grad_norm": 3.9266549802613646, + "learning_rate": 1.4128955322504965e-08, + "logits/chosen": -1.174392580986023, + "logits/rejected": -1.1449965238571167, + "logps/chosen": -54.98040771484375, + "logps/rejected": -75.45690155029297, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3021417260169983, + "rewards/margins": 4.63749885559082, + "rewards/rejected": -4.939640998840332, + "step": 1214 + }, + { + "epoch": 7.2, + "grad_norm": 3.190383440781884, + "learning_rate": 1.3914454332497604e-08, + "logits/chosen": -0.9599422216415405, + "logits/rejected": -1.0563316345214844, + "logps/chosen": -47.49116897583008, + "logps/rejected": -60.74289321899414, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8696696758270264, + "rewards/margins": 6.098237037658691, + "rewards/rejected": -6.967906951904297, + "step": 1215 + }, + { + "epoch": 7.205925925925926, + "grad_norm": 4.429967683063817, + "learning_rate": 1.3701547445190836e-08, + "logits/chosen": -1.3367496728897095, + "logits/rejected": -1.3097389936447144, + "logps/chosen": -61.743629455566406, + "logps/rejected": -101.25115966796875, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5083428621292114, + "rewards/margins": 6.84088134765625, + "rewards/rejected": -8.349224090576172, + "step": 1216 + }, + { + "epoch": 7.2118518518518515, + "grad_norm": 4.134451946730424, + "learning_rate": 1.3490236098179813e-08, + "logits/chosen": -1.0464622974395752, + "logits/rejected": -1.1230379343032837, + "logps/chosen": -56.35862731933594, + "logps/rejected": -86.72453308105469, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0451548099517822, + "rewards/margins": 6.443534851074219, + "rewards/rejected": -7.488689422607422, + "step": 1217 + }, + { + "epoch": 7.217777777777778, + "grad_norm": 4.322416018243353, + "learning_rate": 1.3280521718286253e-08, + "logits/chosen": -1.0510904788970947, + "logits/rejected": -1.151281714439392, + "logps/chosen": -46.579933166503906, + "logps/rejected": -64.40127563476562, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2190048694610596, + "rewards/margins": 4.525186538696289, + "rewards/rejected": -5.744192123413086, + "step": 1218 + }, + { + "epoch": 7.223703703703704, + "grad_norm": 4.623060793873189, + "learning_rate": 1.3072405721548857e-08, + "logits/chosen": -1.1600358486175537, + "logits/rejected": -1.2311742305755615, + "logps/chosen": -61.91407775878906, + "logps/rejected": -62.87279510498047, + "loss": 0.0493, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4685288965702057, + "rewards/margins": 4.3588385581970215, + "rewards/rejected": -4.827367305755615, + "step": 1219 + }, + { + "epoch": 7.229629629629629, + "grad_norm": 3.015880673644393, + "learning_rate": 1.2865889513213628e-08, + "logits/chosen": -1.3033242225646973, + "logits/rejected": -1.2112292051315308, + "logps/chosen": -47.12859344482422, + "logps/rejected": -62.87380599975586, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9500541687011719, + "rewards/margins": 5.1967973709106445, + "rewards/rejected": -6.146851539611816, + "step": 1220 + }, + { + "epoch": 7.235555555555556, + "grad_norm": 8.743988585405903, + "learning_rate": 1.2660974487724407e-08, + "logits/chosen": -1.0260766744613647, + "logits/rejected": -1.1828255653381348, + "logps/chosen": -47.46949768066406, + "logps/rejected": -71.39844512939453, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4114501476287842, + "rewards/margins": 6.981899738311768, + "rewards/rejected": -8.393349647521973, + "step": 1221 + }, + { + "epoch": 7.241481481481482, + "grad_norm": 3.606950533677255, + "learning_rate": 1.2457662028713594e-08, + "logits/chosen": -1.1686749458312988, + "logits/rejected": -1.2729439735412598, + "logps/chosen": -38.59318923950195, + "logps/rejected": -68.04505920410156, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5534713268280029, + "rewards/margins": 5.194384574890137, + "rewards/rejected": -5.747856140136719, + "step": 1222 + }, + { + "epoch": 7.247407407407407, + "grad_norm": 4.021604241001502, + "learning_rate": 1.2255953508992612e-08, + "logits/chosen": -1.107062816619873, + "logits/rejected": -1.186793565750122, + "logps/chosen": -50.838260650634766, + "logps/rejected": -82.80189514160156, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.042785167694092, + "rewards/margins": 6.423619270324707, + "rewards/rejected": -8.466404914855957, + "step": 1223 + }, + { + "epoch": 7.253333333333333, + "grad_norm": 3.446655110015099, + "learning_rate": 1.205585029054279e-08, + "logits/chosen": -1.2475371360778809, + "logits/rejected": -1.3144434690475464, + "logps/chosen": -52.76904296875, + "logps/rejected": -75.10376739501953, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8716415166854858, + "rewards/margins": 6.061808109283447, + "rewards/rejected": -7.9334492683410645, + "step": 1224 + }, + { + "epoch": 7.2592592592592595, + "grad_norm": 5.250885428095321, + "learning_rate": 1.1857353724505942e-08, + "logits/chosen": -1.0257887840270996, + "logits/rejected": -1.0964391231536865, + "logps/chosen": -60.26069641113281, + "logps/rejected": -83.70278930664062, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4667550325393677, + "rewards/margins": 6.692720413208008, + "rewards/rejected": -8.159475326538086, + "step": 1225 + }, + { + "epoch": 7.265185185185185, + "grad_norm": 3.610830532984277, + "learning_rate": 1.1660465151175664e-08, + "logits/chosen": -1.0537807941436768, + "logits/rejected": -1.062589406967163, + "logps/chosen": -47.4190559387207, + "logps/rejected": -73.4510498046875, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.360797643661499, + "rewards/margins": 6.665514945983887, + "rewards/rejected": -8.026312828063965, + "step": 1226 + }, + { + "epoch": 7.271111111111111, + "grad_norm": 3.880057572525301, + "learning_rate": 1.1465185899987794e-08, + "logits/chosen": -1.1795921325683594, + "logits/rejected": -1.1620832681655884, + "logps/chosen": -46.04042053222656, + "logps/rejected": -67.0638198852539, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7089900970458984, + "rewards/margins": 5.227532863616943, + "rewards/rejected": -5.936522960662842, + "step": 1227 + }, + { + "epoch": 7.277037037037037, + "grad_norm": 2.614232435052489, + "learning_rate": 1.1271517289511783e-08, + "logits/chosen": -1.2677268981933594, + "logits/rejected": -1.2671798467636108, + "logps/chosen": -45.25715255737305, + "logps/rejected": -61.537899017333984, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3847803771495819, + "rewards/margins": 4.929067134857178, + "rewards/rejected": -5.313847541809082, + "step": 1228 + }, + { + "epoch": 7.282962962962963, + "grad_norm": 3.9669190076578276, + "learning_rate": 1.1079460627441666e-08, + "logits/chosen": -1.1752060651779175, + "logits/rejected": -1.3029592037200928, + "logps/chosen": -32.447357177734375, + "logps/rejected": -65.22743225097656, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12089192867279053, + "rewards/margins": 5.483150482177734, + "rewards/rejected": -5.3622589111328125, + "step": 1229 + }, + { + "epoch": 7.288888888888889, + "grad_norm": 2.4583403700175124, + "learning_rate": 1.0889017210587215e-08, + "logits/chosen": -1.0788657665252686, + "logits/rejected": -1.1975395679473877, + "logps/chosen": -48.47402572631836, + "logps/rejected": -100.1389389038086, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9301841259002686, + "rewards/margins": 8.300827026367188, + "rewards/rejected": -9.231012344360352, + "step": 1230 + }, + { + "epoch": 7.294814814814814, + "grad_norm": 6.2132480824108, + "learning_rate": 1.0700188324865189e-08, + "logits/chosen": -1.0446033477783203, + "logits/rejected": -1.076178789138794, + "logps/chosen": -57.706825256347656, + "logps/rejected": -82.1580810546875, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013994604349136353, + "rewards/margins": 6.870332717895508, + "rewards/rejected": -6.856338024139404, + "step": 1231 + }, + { + "epoch": 7.300740740740741, + "grad_norm": 3.456394568063387, + "learning_rate": 1.0512975245290685e-08, + "logits/chosen": -1.0236254930496216, + "logits/rejected": -1.1054068803787231, + "logps/chosen": -32.16044998168945, + "logps/rejected": -55.15676498413086, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17881041765213013, + "rewards/margins": 4.75008487701416, + "rewards/rejected": -4.571274280548096, + "step": 1232 + }, + { + "epoch": 7.306666666666667, + "grad_norm": 3.5615300664844183, + "learning_rate": 1.0327379235968548e-08, + "logits/chosen": -1.2441649436950684, + "logits/rejected": -1.2749804258346558, + "logps/chosen": -39.167240142822266, + "logps/rejected": -60.43924331665039, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.48208075761795044, + "rewards/margins": 5.207080841064453, + "rewards/rejected": -5.689162254333496, + "step": 1233 + }, + { + "epoch": 7.312592592592592, + "grad_norm": 5.390607432659595, + "learning_rate": 1.0143401550084751e-08, + "logits/chosen": -1.1963176727294922, + "logits/rejected": -1.3843016624450684, + "logps/chosen": -40.927879333496094, + "logps/rejected": -83.95448303222656, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12738998234272003, + "rewards/margins": 5.321667671203613, + "rewards/rejected": -5.449057579040527, + "step": 1234 + }, + { + "epoch": 7.318518518518519, + "grad_norm": 3.142358405413608, + "learning_rate": 9.961043429898036e-09, + "logits/chosen": -1.145086407661438, + "logits/rejected": -1.2318087816238403, + "logps/chosen": -68.139892578125, + "logps/rejected": -79.01605987548828, + "loss": 0.0226, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.054990291595459, + "rewards/margins": 6.7228546142578125, + "rewards/rejected": -7.7778449058532715, + "step": 1235 + }, + { + "epoch": 7.3244444444444445, + "grad_norm": 3.789600903060908, + "learning_rate": 9.780306106731418e-09, + "logits/chosen": -0.9109461903572083, + "logits/rejected": -1.087510585784912, + "logps/chosen": -46.758087158203125, + "logps/rejected": -79.30746459960938, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5821919441223145, + "rewards/margins": 6.765468120574951, + "rewards/rejected": -7.347660064697266, + "step": 1236 + }, + { + "epoch": 7.33037037037037, + "grad_norm": 6.659386685161879, + "learning_rate": 9.601190800963942e-09, + "logits/chosen": -1.1415681838989258, + "logits/rejected": -1.2213099002838135, + "logps/chosen": -37.742740631103516, + "logps/rejected": -59.03749084472656, + "loss": 0.068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3648865222930908, + "rewards/margins": 4.198371887207031, + "rewards/rejected": -4.563258171081543, + "step": 1237 + }, + { + "epoch": 7.336296296296297, + "grad_norm": 4.090989653775852, + "learning_rate": 9.423698722022505e-09, + "logits/chosen": -1.0874037742614746, + "logits/rejected": -1.2385517358779907, + "logps/chosen": -55.656105041503906, + "logps/rejected": -89.57843017578125, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0636582374572754, + "rewards/margins": 7.275221824645996, + "rewards/rejected": -9.33888053894043, + "step": 1238 + }, + { + "epoch": 7.342222222222222, + "grad_norm": 4.360164582892418, + "learning_rate": 9.247831068373458e-09, + "logits/chosen": -1.3357502222061157, + "logits/rejected": -1.3364484310150146, + "logps/chosen": -51.02233123779297, + "logps/rejected": -69.96102905273438, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8885281682014465, + "rewards/margins": 5.597801685333252, + "rewards/rejected": -6.486330032348633, + "step": 1239 + }, + { + "epoch": 7.348148148148148, + "grad_norm": 4.002958863084062, + "learning_rate": 9.073589027514789e-09, + "logits/chosen": -1.2340748310089111, + "logits/rejected": -1.258064866065979, + "logps/chosen": -50.71947479248047, + "logps/rejected": -84.90157318115234, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6332271695137024, + "rewards/margins": 7.2277679443359375, + "rewards/rejected": -7.860995769500732, + "step": 1240 + }, + { + "epoch": 7.354074074074074, + "grad_norm": 4.157943643516201, + "learning_rate": 8.900973775967963e-09, + "logits/chosen": -1.167553186416626, + "logits/rejected": -1.210451364517212, + "logps/chosen": -41.46842575073242, + "logps/rejected": -53.83827209472656, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1300402283668518, + "rewards/margins": 4.080387592315674, + "rewards/rejected": -3.950347423553467, + "step": 1241 + }, + { + "epoch": 7.36, + "grad_norm": 4.4053174745313495, + "learning_rate": 8.729986479269924e-09, + "logits/chosen": -1.1766040325164795, + "logits/rejected": -1.243590235710144, + "logps/chosen": -52.42701721191406, + "logps/rejected": -71.56402587890625, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7700093984603882, + "rewards/margins": 5.737342357635498, + "rewards/rejected": -6.507350921630859, + "step": 1242 + }, + { + "epoch": 7.365925925925926, + "grad_norm": 3.8587503094386824, + "learning_rate": 8.56062829196541e-09, + "logits/chosen": -1.2487757205963135, + "logits/rejected": -1.2560317516326904, + "logps/chosen": -56.89815902709961, + "logps/rejected": -73.01307678222656, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6755393147468567, + "rewards/margins": 5.386079788208008, + "rewards/rejected": -6.061618804931641, + "step": 1243 + }, + { + "epoch": 7.371851851851852, + "grad_norm": 3.570997078057821, + "learning_rate": 8.392900357598959e-09, + "logits/chosen": -1.1035842895507812, + "logits/rejected": -1.1954938173294067, + "logps/chosen": -60.9942741394043, + "logps/rejected": -75.77302551269531, + "loss": 0.0315, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.047473669052124, + "rewards/margins": 5.5704545974731445, + "rewards/rejected": -6.617927551269531, + "step": 1244 + }, + { + "epoch": 7.377777777777778, + "grad_norm": 5.297807507787032, + "learning_rate": 8.2268038087073e-09, + "logits/chosen": -1.2557616233825684, + "logits/rejected": -1.2342597246170044, + "logps/chosen": -60.55089569091797, + "logps/rejected": -61.7495002746582, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8329823017120361, + "rewards/margins": 5.138329029083252, + "rewards/rejected": -5.971311569213867, + "step": 1245 + }, + { + "epoch": 7.383703703703704, + "grad_norm": 2.929655935601638, + "learning_rate": 8.062339766811726e-09, + "logits/chosen": -1.1982896327972412, + "logits/rejected": -1.205052375793457, + "logps/chosen": -64.48491668701172, + "logps/rejected": -83.77357482910156, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9903809428215027, + "rewards/margins": 6.561237335205078, + "rewards/rejected": -7.551617622375488, + "step": 1246 + }, + { + "epoch": 7.3896296296296295, + "grad_norm": 4.054956069159028, + "learning_rate": 7.899509342410376e-09, + "logits/chosen": -1.23246431350708, + "logits/rejected": -1.4280232191085815, + "logps/chosen": -41.71550369262695, + "logps/rejected": -79.24267578125, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.337856262922287, + "rewards/margins": 6.026522636413574, + "rewards/rejected": -6.364378929138184, + "step": 1247 + }, + { + "epoch": 7.395555555555555, + "grad_norm": 4.307744014945286, + "learning_rate": 7.738313634970962e-09, + "logits/chosen": -1.00555419921875, + "logits/rejected": -1.0613325834274292, + "logps/chosen": -48.21980285644531, + "logps/rejected": -65.70577239990234, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42074263095855713, + "rewards/margins": 5.744640350341797, + "rewards/rejected": -6.165383338928223, + "step": 1248 + }, + { + "epoch": 7.401481481481482, + "grad_norm": 3.475525226093238, + "learning_rate": 7.578753732923132e-09, + "logits/chosen": -1.1451385021209717, + "logits/rejected": -1.202643871307373, + "logps/chosen": -56.249755859375, + "logps/rejected": -86.0700454711914, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8659681677818298, + "rewards/margins": 7.697194576263428, + "rewards/rejected": -8.563162803649902, + "step": 1249 + }, + { + "epoch": 7.407407407407407, + "grad_norm": 3.536162845294115, + "learning_rate": 7.4208307136512385e-09, + "logits/chosen": -1.149498701095581, + "logits/rejected": -1.264514446258545, + "logps/chosen": -42.952056884765625, + "logps/rejected": -64.82267761230469, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1783785820007324, + "rewards/margins": 6.570066928863525, + "rewards/rejected": -7.748445510864258, + "step": 1250 + }, + { + "epoch": 7.413333333333333, + "grad_norm": 3.945679434147782, + "learning_rate": 7.2645456434869965e-09, + "logits/chosen": -1.0817800760269165, + "logits/rejected": -1.0875228643417358, + "logps/chosen": -58.667335510253906, + "logps/rejected": -83.24259948730469, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.176339626312256, + "rewards/margins": 5.792956829071045, + "rewards/rejected": -7.969296455383301, + "step": 1251 + }, + { + "epoch": 7.41925925925926, + "grad_norm": 3.7678107207474065, + "learning_rate": 7.109899577702389e-09, + "logits/chosen": -1.4233471155166626, + "logits/rejected": -1.5132304430007935, + "logps/chosen": -47.02076721191406, + "logps/rejected": -69.56475830078125, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005723677575588226, + "rewards/margins": 5.980765342712402, + "rewards/rejected": -5.975041389465332, + "step": 1252 + }, + { + "epoch": 7.425185185185185, + "grad_norm": 3.9150516093462664, + "learning_rate": 6.956893560502358e-09, + "logits/chosen": -1.2314457893371582, + "logits/rejected": -1.4367800951004028, + "logps/chosen": -42.025428771972656, + "logps/rejected": -72.81390380859375, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3668340742588043, + "rewards/margins": 5.217932224273682, + "rewards/rejected": -5.584766387939453, + "step": 1253 + }, + { + "epoch": 7.431111111111111, + "grad_norm": 3.124760931056755, + "learning_rate": 6.805528625018014e-09, + "logits/chosen": -1.147788405418396, + "logits/rejected": -1.1536781787872314, + "logps/chosen": -55.166847229003906, + "logps/rejected": -79.21180725097656, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9568672180175781, + "rewards/margins": 6.495437145233154, + "rewards/rejected": -7.452304840087891, + "step": 1254 + }, + { + "epoch": 7.437037037037037, + "grad_norm": 3.5413800666688418, + "learning_rate": 6.655805793299413e-09, + "logits/chosen": -1.1663146018981934, + "logits/rejected": -1.2886245250701904, + "logps/chosen": -48.67404556274414, + "logps/rejected": -72.31990814208984, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2224462032318115, + "rewards/margins": 5.77690315246582, + "rewards/rejected": -6.999349117279053, + "step": 1255 + }, + { + "epoch": 7.442962962962963, + "grad_norm": 3.274524380079391, + "learning_rate": 6.5077260763087836e-09, + "logits/chosen": -1.1297998428344727, + "logits/rejected": -1.202750325202942, + "logps/chosen": -44.121734619140625, + "logps/rejected": -64.97763061523438, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33169564604759216, + "rewards/margins": 4.686792373657227, + "rewards/rejected": -5.018487930297852, + "step": 1256 + }, + { + "epoch": 7.448888888888889, + "grad_norm": 4.78799236470072, + "learning_rate": 6.361290473913705e-09, + "logits/chosen": -1.006934404373169, + "logits/rejected": -1.006296992301941, + "logps/chosen": -52.901729583740234, + "logps/rejected": -78.00100708007812, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6054511070251465, + "rewards/margins": 6.756753444671631, + "rewards/rejected": -7.362204551696777, + "step": 1257 + }, + { + "epoch": 7.454814814814815, + "grad_norm": 5.170167006660229, + "learning_rate": 6.216499974880274e-09, + "logits/chosen": -1.2134301662445068, + "logits/rejected": -1.2395720481872559, + "logps/chosen": -53.94024658203125, + "logps/rejected": -62.0114860534668, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7245234251022339, + "rewards/margins": 6.406597137451172, + "rewards/rejected": -7.131120681762695, + "step": 1258 + }, + { + "epoch": 7.460740740740741, + "grad_norm": 4.535250486934256, + "learning_rate": 6.073355556866527e-09, + "logits/chosen": -1.114149570465088, + "logits/rejected": -1.1737488508224487, + "logps/chosen": -68.56315612792969, + "logps/rejected": -67.39340209960938, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6941238641738892, + "rewards/margins": 6.043018341064453, + "rewards/rejected": -6.737142562866211, + "step": 1259 + }, + { + "epoch": 7.466666666666667, + "grad_norm": 4.505965598131084, + "learning_rate": 5.9318581864157555e-09, + "logits/chosen": -0.9749269485473633, + "logits/rejected": -1.096606731414795, + "logps/chosen": -50.809974670410156, + "logps/rejected": -66.96884155273438, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3738820552825928, + "rewards/margins": 5.837920188903809, + "rewards/rejected": -7.2118024826049805, + "step": 1260 + }, + { + "epoch": 7.4725925925925925, + "grad_norm": 3.0329171414952283, + "learning_rate": 5.792008818950034e-09, + "logits/chosen": -1.2703770399093628, + "logits/rejected": -1.2322120666503906, + "logps/chosen": -45.22857666015625, + "logps/rejected": -74.27377319335938, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1143908500671387, + "rewards/margins": 6.21928071975708, + "rewards/rejected": -7.3336710929870605, + "step": 1261 + }, + { + "epoch": 7.478518518518518, + "grad_norm": 4.4357521903959425, + "learning_rate": 5.653808398763726e-09, + "logits/chosen": -1.2081799507141113, + "logits/rejected": -1.046402931213379, + "logps/chosen": -40.83061218261719, + "logps/rejected": -50.72254943847656, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3177671432495117, + "rewards/margins": 4.231396675109863, + "rewards/rejected": -3.9136292934417725, + "step": 1262 + }, + { + "epoch": 7.484444444444445, + "grad_norm": 3.257895575515831, + "learning_rate": 5.5172578590171606e-09, + "logits/chosen": -1.1244299411773682, + "logits/rejected": -1.2354055643081665, + "logps/chosen": -36.96542739868164, + "logps/rejected": -56.44902801513672, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17362168431282043, + "rewards/margins": 4.583876609802246, + "rewards/rejected": -4.757498264312744, + "step": 1263 + }, + { + "epoch": 7.49037037037037, + "grad_norm": 6.81659423219458, + "learning_rate": 5.382358121730296e-09, + "logits/chosen": -1.1702152490615845, + "logits/rejected": -1.188597559928894, + "logps/chosen": -42.586761474609375, + "logps/rejected": -65.45720672607422, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10263597965240479, + "rewards/margins": 5.135069370269775, + "rewards/rejected": -5.237705230712891, + "step": 1264 + }, + { + "epoch": 7.496296296296296, + "grad_norm": 4.129282172875423, + "learning_rate": 5.249110097776482e-09, + "logits/chosen": -1.1987591981887817, + "logits/rejected": -1.331768274307251, + "logps/chosen": -53.300804138183594, + "logps/rejected": -73.4027099609375, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9778902530670166, + "rewards/margins": 6.012190818786621, + "rewards/rejected": -6.990081310272217, + "step": 1265 + }, + { + "epoch": 7.502222222222223, + "grad_norm": 3.830093081169558, + "learning_rate": 5.117514686876378e-09, + "logits/chosen": -0.9744676351547241, + "logits/rejected": -1.0355535745620728, + "logps/chosen": -41.61549758911133, + "logps/rejected": -81.38633728027344, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6650229692459106, + "rewards/margins": 7.193120956420898, + "rewards/rejected": -7.858143329620361, + "step": 1266 + }, + { + "epoch": 7.508148148148148, + "grad_norm": 3.0439991616018536, + "learning_rate": 4.987572777591764e-09, + "logits/chosen": -1.0361356735229492, + "logits/rejected": -1.044396162033081, + "logps/chosen": -53.80352783203125, + "logps/rejected": -73.08494567871094, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4982792139053345, + "rewards/margins": 6.756043910980225, + "rewards/rejected": -8.25432300567627, + "step": 1267 + }, + { + "epoch": 7.514074074074074, + "grad_norm": 4.075110447550201, + "learning_rate": 4.859285247319656e-09, + "logits/chosen": -0.9112200736999512, + "logits/rejected": -1.039853572845459, + "logps/chosen": -42.56827926635742, + "logps/rejected": -66.28154754638672, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0129791498184204, + "rewards/margins": 5.994237422943115, + "rewards/rejected": -7.007215976715088, + "step": 1268 + }, + { + "epoch": 7.52, + "grad_norm": 3.4429810335067987, + "learning_rate": 4.732652962286282e-09, + "logits/chosen": -1.154706358909607, + "logits/rejected": -1.2327191829681396, + "logps/chosen": -49.44975280761719, + "logps/rejected": -87.4102783203125, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3737388849258423, + "rewards/margins": 5.274635314941406, + "rewards/rejected": -6.648374080657959, + "step": 1269 + }, + { + "epoch": 7.525925925925926, + "grad_norm": 3.4529101663641466, + "learning_rate": 4.607676777541342e-09, + "logits/chosen": -1.2047605514526367, + "logits/rejected": -1.2406158447265625, + "logps/chosen": -49.28171157836914, + "logps/rejected": -71.44245910644531, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.400432586669922, + "rewards/margins": 5.764103889465332, + "rewards/rejected": -8.164535522460938, + "step": 1270 + }, + { + "epoch": 7.531851851851852, + "grad_norm": 5.798561624464363, + "learning_rate": 4.4843575369521155e-09, + "logits/chosen": -1.054225206375122, + "logits/rejected": -1.1656428575515747, + "logps/chosen": -76.8630142211914, + "logps/rejected": -89.7891845703125, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.780390739440918, + "rewards/margins": 6.693795680999756, + "rewards/rejected": -9.474185943603516, + "step": 1271 + }, + { + "epoch": 7.5377777777777775, + "grad_norm": 4.78754195364835, + "learning_rate": 4.362696073197863e-09, + "logits/chosen": -1.1592118740081787, + "logits/rejected": -1.1719485521316528, + "logps/chosen": -48.26700973510742, + "logps/rejected": -58.30637741088867, + "loss": 0.0405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4617045223712921, + "rewards/margins": 4.804683685302734, + "rewards/rejected": -5.266387939453125, + "step": 1272 + }, + { + "epoch": 7.543703703703704, + "grad_norm": 4.792872315009618, + "learning_rate": 4.242693207764159e-09, + "logits/chosen": -1.2573219537734985, + "logits/rejected": -1.3435938358306885, + "logps/chosen": -53.14029312133789, + "logps/rejected": -70.92604064941406, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6087773442268372, + "rewards/margins": 5.773610591888428, + "rewards/rejected": -6.382387638092041, + "step": 1273 + }, + { + "epoch": 7.54962962962963, + "grad_norm": 5.643117139650082, + "learning_rate": 4.12434975093734e-09, + "logits/chosen": -1.1559739112854004, + "logits/rejected": -1.184909701347351, + "logps/chosen": -47.83567810058594, + "logps/rejected": -61.247657775878906, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013514861464500427, + "rewards/margins": 5.217415809631348, + "rewards/rejected": -5.203901290893555, + "step": 1274 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 2.517671598871281, + "learning_rate": 4.007666501799012e-09, + "logits/chosen": -1.1557645797729492, + "logits/rejected": -1.2193787097930908, + "logps/chosen": -48.145084381103516, + "logps/rejected": -79.78754425048828, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46319612860679626, + "rewards/margins": 5.226775169372559, + "rewards/rejected": -5.689971446990967, + "step": 1275 + }, + { + "epoch": 7.561481481481481, + "grad_norm": 5.050595644131196, + "learning_rate": 3.89264424822075e-09, + "logits/chosen": -1.086578607559204, + "logits/rejected": -1.21217942237854, + "logps/chosen": -50.246849060058594, + "logps/rejected": -81.24800872802734, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6358373165130615, + "rewards/margins": 5.405806064605713, + "rewards/rejected": -7.041643142700195, + "step": 1276 + }, + { + "epoch": 7.567407407407408, + "grad_norm": 2.7619258267022118, + "learning_rate": 3.779283766858682e-09, + "logits/chosen": -1.2855738401412964, + "logits/rejected": -1.2698637247085571, + "logps/chosen": -36.27191925048828, + "logps/rejected": -65.68701171875, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6610238552093506, + "rewards/margins": 4.722266674041748, + "rewards/rejected": -5.3832902908325195, + "step": 1277 + }, + { + "epoch": 7.573333333333333, + "grad_norm": 3.396277904250881, + "learning_rate": 3.667585823148217e-09, + "logits/chosen": -1.1778568029403687, + "logits/rejected": -1.2962253093719482, + "logps/chosen": -55.17719268798828, + "logps/rejected": -69.75767517089844, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.281670093536377, + "rewards/margins": 5.099809646606445, + "rewards/rejected": -6.381479740142822, + "step": 1278 + }, + { + "epoch": 7.579259259259259, + "grad_norm": 4.09050137430788, + "learning_rate": 3.5575511712990504e-09, + "logits/chosen": -1.24432373046875, + "logits/rejected": -1.2919775247573853, + "logps/chosen": -57.73225402832031, + "logps/rejected": -78.25653076171875, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2200543880462646, + "rewards/margins": 5.297441482543945, + "rewards/rejected": -6.517495155334473, + "step": 1279 + }, + { + "epoch": 7.5851851851851855, + "grad_norm": 4.522465031479605, + "learning_rate": 3.4491805542899155e-09, + "logits/chosen": -1.247235894203186, + "logits/rejected": -1.3041255474090576, + "logps/chosen": -44.53873062133789, + "logps/rejected": -68.25907897949219, + "loss": 0.0374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.625839114189148, + "rewards/margins": 4.951447010040283, + "rewards/rejected": -5.5772857666015625, + "step": 1280 + }, + { + "epoch": 7.591111111111111, + "grad_norm": 4.526740048559791, + "learning_rate": 3.342474703863507e-09, + "logits/chosen": -1.3184285163879395, + "logits/rejected": -1.3301334381103516, + "logps/chosen": -52.55039978027344, + "logps/rejected": -82.96316528320312, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1241881847381592, + "rewards/margins": 6.9125471115112305, + "rewards/rejected": -8.036735534667969, + "step": 1281 + }, + { + "epoch": 7.597037037037037, + "grad_norm": 4.89773536148764, + "learning_rate": 3.2374343405217884e-09, + "logits/chosen": -1.1311254501342773, + "logits/rejected": -1.1638489961624146, + "logps/chosen": -44.68597412109375, + "logps/rejected": -67.75846099853516, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7200304865837097, + "rewards/margins": 5.980922222137451, + "rewards/rejected": -6.700952529907227, + "step": 1282 + }, + { + "epoch": 7.6029629629629625, + "grad_norm": 2.5581980505049606, + "learning_rate": 3.1340601735209137e-09, + "logits/chosen": -1.3053855895996094, + "logits/rejected": -1.351278305053711, + "logps/chosen": -47.90487289428711, + "logps/rejected": -71.97268676757812, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7411696910858154, + "rewards/margins": 5.495450973510742, + "rewards/rejected": -6.2366204261779785, + "step": 1283 + }, + { + "epoch": 7.608888888888889, + "grad_norm": 3.7529698234479163, + "learning_rate": 3.0323529008664807e-09, + "logits/chosen": -1.1116161346435547, + "logits/rejected": -1.2262192964553833, + "logps/chosen": -51.4724006652832, + "logps/rejected": -72.905029296875, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7296598553657532, + "rewards/margins": 6.045794486999512, + "rewards/rejected": -6.775454521179199, + "step": 1284 + }, + { + "epoch": 7.614814814814815, + "grad_norm": 4.194004453718564, + "learning_rate": 2.9323132093088954e-09, + "logits/chosen": -0.99857497215271, + "logits/rejected": -1.1457947492599487, + "logps/chosen": -40.48921203613281, + "logps/rejected": -64.35589599609375, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7001577019691467, + "rewards/margins": 6.241533279418945, + "rewards/rejected": -6.941690921783447, + "step": 1285 + }, + { + "epoch": 7.62074074074074, + "grad_norm": 4.078023392295576, + "learning_rate": 2.833941774338655e-09, + "logits/chosen": -1.1729825735092163, + "logits/rejected": -1.300710916519165, + "logps/chosen": -44.66774368286133, + "logps/rejected": -74.90933227539062, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47788554430007935, + "rewards/margins": 7.087083339691162, + "rewards/rejected": -7.564969062805176, + "step": 1286 + }, + { + "epoch": 7.626666666666667, + "grad_norm": 3.8564092032512263, + "learning_rate": 2.7372392601817675e-09, + "logits/chosen": -1.2528834342956543, + "logits/rejected": -1.1977287530899048, + "logps/chosen": -51.391719818115234, + "logps/rejected": -76.97032928466797, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8206769227981567, + "rewards/margins": 6.537014961242676, + "rewards/rejected": -7.357692718505859, + "step": 1287 + }, + { + "epoch": 7.632592592592593, + "grad_norm": 4.116461804075974, + "learning_rate": 2.6422063197953926e-09, + "logits/chosen": -1.3762859106063843, + "logits/rejected": -1.4735721349716187, + "logps/chosen": -54.44892883300781, + "logps/rejected": -75.15809631347656, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.953680157661438, + "rewards/margins": 5.054767608642578, + "rewards/rejected": -6.008447170257568, + "step": 1288 + }, + { + "epoch": 7.638518518518518, + "grad_norm": 3.935374520816731, + "learning_rate": 2.548843594863348e-09, + "logits/chosen": -1.0619758367538452, + "logits/rejected": -1.0472029447555542, + "logps/chosen": -56.92826461791992, + "logps/rejected": -77.54266357421875, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1233818531036377, + "rewards/margins": 6.907271385192871, + "rewards/rejected": -8.03065299987793, + "step": 1289 + }, + { + "epoch": 7.644444444444445, + "grad_norm": 5.08850792852047, + "learning_rate": 2.4571517157916944e-09, + "logits/chosen": -1.102142095565796, + "logits/rejected": -1.2241641283035278, + "logps/chosen": -38.644996643066406, + "logps/rejected": -68.89688110351562, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49043506383895874, + "rewards/margins": 5.104089736938477, + "rewards/rejected": -5.594525337219238, + "step": 1290 + }, + { + "epoch": 7.6503703703703705, + "grad_norm": 4.098051438580383, + "learning_rate": 2.3671313017046557e-09, + "logits/chosen": -1.258387804031372, + "logits/rejected": -1.3357480764389038, + "logps/chosen": -55.586822509765625, + "logps/rejected": -69.7269287109375, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8577889204025269, + "rewards/margins": 6.0642290115356445, + "rewards/rejected": -7.922017574310303, + "step": 1291 + }, + { + "epoch": 7.656296296296296, + "grad_norm": 3.4376186407034077, + "learning_rate": 2.27878296044029e-09, + "logits/chosen": -1.2755929231643677, + "logits/rejected": -1.3270050287246704, + "logps/chosen": -50.85591125488281, + "logps/rejected": -72.466796875, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7081777453422546, + "rewards/margins": 5.251181602478027, + "rewards/rejected": -5.959359169006348, + "step": 1292 + }, + { + "epoch": 7.662222222222223, + "grad_norm": 3.023785417947227, + "learning_rate": 2.1921072885464633e-09, + "logits/chosen": -1.2608731985092163, + "logits/rejected": -1.3410117626190186, + "logps/chosen": -44.24419021606445, + "logps/rejected": -62.62995147705078, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6164356470108032, + "rewards/margins": 4.980905532836914, + "rewards/rejected": -5.597341537475586, + "step": 1293 + }, + { + "epoch": 7.668148148148148, + "grad_norm": 3.9892458828568556, + "learning_rate": 2.1071048712768545e-09, + "logits/chosen": -1.1189539432525635, + "logits/rejected": -1.13387131690979, + "logps/chosen": -44.46137619018555, + "logps/rejected": -64.4898452758789, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.322096586227417, + "rewards/margins": 5.642068862915039, + "rewards/rejected": -5.964165687561035, + "step": 1294 + }, + { + "epoch": 7.674074074074074, + "grad_norm": 3.060773123486522, + "learning_rate": 2.0237762825868752e-09, + "logits/chosen": -1.179800271987915, + "logits/rejected": -1.2812291383743286, + "logps/chosen": -57.156944274902344, + "logps/rejected": -73.54817199707031, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3885247707366943, + "rewards/margins": 6.368008613586426, + "rewards/rejected": -7.756533145904541, + "step": 1295 + }, + { + "epoch": 7.68, + "grad_norm": 3.6760645913887284, + "learning_rate": 1.9421220851298657e-09, + "logits/chosen": -1.1212431192398071, + "logits/rejected": -1.2247122526168823, + "logps/chosen": -49.63182830810547, + "logps/rejected": -77.47969055175781, + "loss": 0.0266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0002086162567139, + "rewards/margins": 5.092729568481445, + "rewards/rejected": -6.092937469482422, + "step": 1296 + }, + { + "epoch": 7.685925925925926, + "grad_norm": 3.9251535192169014, + "learning_rate": 1.8621428302533492e-09, + "logits/chosen": -1.1562796831130981, + "logits/rejected": -1.1696228981018066, + "logps/chosen": -50.657447814941406, + "logps/rejected": -73.50979614257812, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1022233963012695, + "rewards/margins": 5.4585466384887695, + "rewards/rejected": -6.560770034790039, + "step": 1297 + }, + { + "epoch": 7.691851851851852, + "grad_norm": 2.8265644326242656, + "learning_rate": 1.7838390579952567e-09, + "logits/chosen": -1.055465579032898, + "logits/rejected": -1.1630399227142334, + "logps/chosen": -48.33147048950195, + "logps/rejected": -65.17540740966797, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8744292855262756, + "rewards/margins": 6.498607635498047, + "rewards/rejected": -7.373037338256836, + "step": 1298 + }, + { + "epoch": 7.697777777777778, + "grad_norm": 4.040001168249482, + "learning_rate": 1.7072112970802633e-09, + "logits/chosen": -1.2334121465682983, + "logits/rejected": -1.3498921394348145, + "logps/chosen": -43.995628356933594, + "logps/rejected": -73.16790008544922, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6232740879058838, + "rewards/margins": 5.393784523010254, + "rewards/rejected": -6.017059326171875, + "step": 1299 + }, + { + "epoch": 7.703703703703704, + "grad_norm": 4.734494032620544, + "learning_rate": 1.6322600649162354e-09, + "logits/chosen": -1.0275055170059204, + "logits/rejected": -1.0896284580230713, + "logps/chosen": -49.858680725097656, + "logps/rejected": -62.68039321899414, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7178724408149719, + "rewards/margins": 4.1225457191467285, + "rewards/rejected": -4.840417861938477, + "step": 1300 + }, + { + "epoch": 7.70962962962963, + "grad_norm": 6.242524488086293, + "learning_rate": 1.5589858675907618e-09, + "logits/chosen": -1.3054200410842896, + "logits/rejected": -1.3421804904937744, + "logps/chosen": -52.420509338378906, + "logps/rejected": -70.79682159423828, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030725568532943726, + "rewards/margins": 4.999563217163086, + "rewards/rejected": -5.0302886962890625, + "step": 1301 + }, + { + "epoch": 7.7155555555555555, + "grad_norm": 4.116512529426615, + "learning_rate": 1.4873891998677112e-09, + "logits/chosen": -1.1158173084259033, + "logits/rejected": -1.3114423751831055, + "logps/chosen": -43.815330505371094, + "logps/rejected": -63.80662155151367, + "loss": 0.0338, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29764774441719055, + "rewards/margins": 4.472417831420898, + "rewards/rejected": -4.770065784454346, + "step": 1302 + }, + { + "epoch": 7.721481481481481, + "grad_norm": 5.115315312723463, + "learning_rate": 1.4174705451838743e-09, + "logits/chosen": -1.3780591487884521, + "logits/rejected": -1.323752999305725, + "logps/chosen": -50.43423843383789, + "logps/rejected": -63.291046142578125, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10267294943332672, + "rewards/margins": 4.516208648681641, + "rewards/rejected": -4.618881702423096, + "step": 1303 + }, + { + "epoch": 7.727407407407408, + "grad_norm": 3.1452341302331384, + "learning_rate": 1.3492303756457158e-09, + "logits/chosen": -1.3025587797164917, + "logits/rejected": -1.4482269287109375, + "logps/chosen": -52.70558166503906, + "logps/rejected": -90.87263488769531, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.543006181716919, + "rewards/margins": 7.467383861541748, + "rewards/rejected": -10.010390281677246, + "step": 1304 + }, + { + "epoch": 7.733333333333333, + "grad_norm": 3.6249017461365907, + "learning_rate": 1.2826691520262112e-09, + "logits/chosen": -1.281097650527954, + "logits/rejected": -1.2951496839523315, + "logps/chosen": -43.05864334106445, + "logps/rejected": -68.95799255371094, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8746381998062134, + "rewards/margins": 5.1224164962768555, + "rewards/rejected": -5.9970550537109375, + "step": 1305 + }, + { + "epoch": 7.739259259259259, + "grad_norm": 4.401779096214971, + "learning_rate": 1.2177873237617375e-09, + "logits/chosen": -1.2022982835769653, + "logits/rejected": -1.216440200805664, + "logps/chosen": -62.892704010009766, + "logps/rejected": -64.79084777832031, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.37397301197052, + "rewards/margins": 5.186420440673828, + "rewards/rejected": -6.560393810272217, + "step": 1306 + }, + { + "epoch": 7.745185185185186, + "grad_norm": 4.374384420762003, + "learning_rate": 1.1545853289489927e-09, + "logits/chosen": -1.10579252243042, + "logits/rejected": -1.1660892963409424, + "logps/chosen": -39.71818542480469, + "logps/rejected": -54.736663818359375, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8321702480316162, + "rewards/margins": 4.650611877441406, + "rewards/rejected": -5.482782363891602, + "step": 1307 + }, + { + "epoch": 7.751111111111111, + "grad_norm": 4.6211551819830445, + "learning_rate": 1.0930635943420253e-09, + "logits/chosen": -1.1341114044189453, + "logits/rejected": -1.2060933113098145, + "logps/chosen": -41.124813079833984, + "logps/rejected": -74.39026641845703, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.612218976020813, + "rewards/margins": 6.23983097076416, + "rewards/rejected": -6.852049827575684, + "step": 1308 + }, + { + "epoch": 7.757037037037037, + "grad_norm": 3.088853964662203, + "learning_rate": 1.0332225353494318e-09, + "logits/chosen": -1.0827617645263672, + "logits/rejected": -1.1487969160079956, + "logps/chosen": -60.728294372558594, + "logps/rejected": -70.6673355102539, + "loss": 0.025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9318689703941345, + "rewards/margins": 5.681600570678711, + "rewards/rejected": -6.61346960067749, + "step": 1309 + }, + { + "epoch": 7.762962962962963, + "grad_norm": 3.895796986682556, + "learning_rate": 9.750625560315528e-10, + "logits/chosen": -1.2412290573120117, + "logits/rejected": -1.1642299890518188, + "logps/chosen": -56.81752014160156, + "logps/rejected": -69.71331787109375, + "loss": 0.0306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.648478627204895, + "rewards/margins": 5.765992164611816, + "rewards/rejected": -6.414470672607422, + "step": 1310 + }, + { + "epoch": 7.768888888888889, + "grad_norm": 4.639857829007214, + "learning_rate": 9.185840490975594e-10, + "logits/chosen": -0.9799739122390747, + "logits/rejected": -0.985424816608429, + "logps/chosen": -50.25916290283203, + "logps/rejected": -67.01441192626953, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46648460626602173, + "rewards/margins": 6.257501602172852, + "rewards/rejected": -6.723986625671387, + "step": 1311 + }, + { + "epoch": 7.774814814814815, + "grad_norm": 4.5476728089506535, + "learning_rate": 8.637873959031206e-10, + "logits/chosen": -1.2363190650939941, + "logits/rejected": -1.3365875482559204, + "logps/chosen": -44.06676483154297, + "logps/rejected": -64.12844848632812, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49657464027404785, + "rewards/margins": 4.909327507019043, + "rewards/rejected": -5.405901908874512, + "step": 1312 + }, + { + "epoch": 7.7807407407407405, + "grad_norm": 2.8518153992439164, + "learning_rate": 8.106729664475176e-10, + "logits/chosen": -1.3071974515914917, + "logits/rejected": -1.4458099603652954, + "logps/chosen": -40.395233154296875, + "logps/rejected": -70.80223083496094, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7430353760719299, + "rewards/margins": 5.836602210998535, + "rewards/rejected": -6.579637050628662, + "step": 1313 + }, + { + "epoch": 7.786666666666667, + "grad_norm": 4.626148207060677, + "learning_rate": 7.592411193713122e-10, + "logits/chosen": -1.2434099912643433, + "logits/rejected": -1.327384352684021, + "logps/chosen": -55.95722198486328, + "logps/rejected": -87.88571166992188, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.251700520515442, + "rewards/margins": 5.727961540222168, + "rewards/rejected": -6.97966194152832, + "step": 1314 + }, + { + "epoch": 7.792592592592593, + "grad_norm": 3.873445572325953, + "learning_rate": 7.094922019539318e-10, + "logits/chosen": -1.0838680267333984, + "logits/rejected": -1.211837649345398, + "logps/chosen": -36.55458450317383, + "logps/rejected": -60.68565368652344, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6588367223739624, + "rewards/margins": 5.787833213806152, + "rewards/rejected": -6.446670055389404, + "step": 1315 + }, + { + "epoch": 7.798518518518518, + "grad_norm": 3.5831381543129233, + "learning_rate": 6.61426550111227e-10, + "logits/chosen": -1.0004241466522217, + "logits/rejected": -1.0982451438903809, + "logps/chosen": -43.107383728027344, + "logps/rejected": -79.90531921386719, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6238442063331604, + "rewards/margins": 6.600798606872559, + "rewards/rejected": -7.224643707275391, + "step": 1316 + }, + { + "epoch": 7.804444444444444, + "grad_norm": 4.434687124407557, + "learning_rate": 6.150444883933348e-10, + "logits/chosen": -1.3124284744262695, + "logits/rejected": -1.3378134965896606, + "logps/chosen": -51.23555374145508, + "logps/rejected": -83.02464294433594, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7402288317680359, + "rewards/margins": 6.272833824157715, + "rewards/rejected": -7.013063430786133, + "step": 1317 + }, + { + "epoch": 7.810370370370371, + "grad_norm": 3.7031935136455956, + "learning_rate": 5.703463299823186e-10, + "logits/chosen": -1.2910884618759155, + "logits/rejected": -1.3359012603759766, + "logps/chosen": -39.01679992675781, + "logps/rejected": -87.91451263427734, + "loss": 0.0337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3888285160064697, + "rewards/margins": 8.184977531433105, + "rewards/rejected": -8.573806762695312, + "step": 1318 + }, + { + "epoch": 7.816296296296296, + "grad_norm": 3.280126945946635, + "learning_rate": 5.27332376690226e-10, + "logits/chosen": -1.2180700302124023, + "logits/rejected": -1.213331937789917, + "logps/chosen": -47.62986755371094, + "logps/rejected": -80.37245178222656, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8481385707855225, + "rewards/margins": 5.588566303253174, + "rewards/rejected": -6.436704635620117, + "step": 1319 + }, + { + "epoch": 7.822222222222222, + "grad_norm": 4.285051151960186, + "learning_rate": 4.860029189569237e-10, + "logits/chosen": -1.158015251159668, + "logits/rejected": -1.0862349271774292, + "logps/chosen": -57.20246124267578, + "logps/rejected": -61.53093719482422, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9121618270874023, + "rewards/margins": 5.656917572021484, + "rewards/rejected": -6.569079399108887, + "step": 1320 + }, + { + "epoch": 7.8281481481481485, + "grad_norm": 3.161324387187779, + "learning_rate": 4.463582358482376e-10, + "logits/chosen": -0.9615970849990845, + "logits/rejected": -1.0251832008361816, + "logps/chosen": -48.84516525268555, + "logps/rejected": -81.93156433105469, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4886060655117035, + "rewards/margins": 6.42310905456543, + "rewards/rejected": -6.911715507507324, + "step": 1321 + }, + { + "epoch": 7.834074074074074, + "grad_norm": 3.873331978451794, + "learning_rate": 4.083985950539548e-10, + "logits/chosen": -1.2927745580673218, + "logits/rejected": -1.2420415878295898, + "logps/chosen": -56.42285919189453, + "logps/rejected": -77.27031707763672, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0932049751281738, + "rewards/margins": 6.1244635581970215, + "rewards/rejected": -7.217668533325195, + "step": 1322 + }, + { + "epoch": 7.84, + "grad_norm": 2.9511525570711648, + "learning_rate": 3.721242528861024e-10, + "logits/chosen": -1.0464425086975098, + "logits/rejected": -1.0884580612182617, + "logps/chosen": -46.888404846191406, + "logps/rejected": -66.87775421142578, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9894177913665771, + "rewards/margins": 5.856396675109863, + "rewards/rejected": -6.845814228057861, + "step": 1323 + }, + { + "epoch": 7.8459259259259255, + "grad_norm": 2.9263330192188524, + "learning_rate": 3.3753545427722687e-10, + "logits/chosen": -1.2160556316375732, + "logits/rejected": -1.2635902166366577, + "logps/chosen": -50.17317199707031, + "logps/rejected": -78.7654037475586, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6943256258964539, + "rewards/margins": 5.114144802093506, + "rewards/rejected": -5.808470249176025, + "step": 1324 + }, + { + "epoch": 7.851851851851852, + "grad_norm": 4.191017644309632, + "learning_rate": 3.0463243277864534e-10, + "logits/chosen": -1.2540899515151978, + "logits/rejected": -1.260279893875122, + "logps/chosen": -50.96095657348633, + "logps/rejected": -63.11212158203125, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22250595688819885, + "rewards/margins": 5.157078266143799, + "rewards/rejected": -5.379584312438965, + "step": 1325 + }, + { + "epoch": 7.857777777777778, + "grad_norm": 3.78918431544521, + "learning_rate": 2.734154105589748e-10, + "logits/chosen": -1.2464838027954102, + "logits/rejected": -1.232421875, + "logps/chosen": -39.673423767089844, + "logps/rejected": -56.34529113769531, + "loss": 0.0322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18317142128944397, + "rewards/margins": 5.008199691772461, + "rewards/rejected": -5.191371440887451, + "step": 1326 + }, + { + "epoch": 7.863703703703703, + "grad_norm": 3.794118871416381, + "learning_rate": 2.4388459840257724e-10, + "logits/chosen": -1.1672354936599731, + "logits/rejected": -1.2810213565826416, + "logps/chosen": -44.134117126464844, + "logps/rejected": -68.70030975341797, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8252700567245483, + "rewards/margins": 6.5332794189453125, + "rewards/rejected": -7.35854959487915, + "step": 1327 + }, + { + "epoch": 7.86962962962963, + "grad_norm": 5.5075916247068655, + "learning_rate": 2.1604019570811704e-10, + "logits/chosen": -1.1713049411773682, + "logits/rejected": -1.2617361545562744, + "logps/chosen": -54.49160385131836, + "logps/rejected": -71.02827453613281, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0777705907821655, + "rewards/margins": 6.524770736694336, + "rewards/rejected": -7.602540969848633, + "step": 1328 + }, + { + "epoch": 7.875555555555556, + "grad_norm": 4.0258473530695, + "learning_rate": 1.8988239048725595e-10, + "logits/chosen": -1.265380859375, + "logits/rejected": -1.3449554443359375, + "logps/chosen": -49.377262115478516, + "logps/rejected": -76.43204498291016, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4428175687789917, + "rewards/margins": 6.493406295776367, + "rewards/rejected": -7.936223983764648, + "step": 1329 + }, + { + "epoch": 7.881481481481481, + "grad_norm": 3.4526817942675954, + "learning_rate": 1.6541135936343208e-10, + "logits/chosen": -1.0449414253234863, + "logits/rejected": -1.052280068397522, + "logps/chosen": -59.01203536987305, + "logps/rejected": -109.54611206054688, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.199272632598877, + "rewards/margins": 7.87403678894043, + "rewards/rejected": -9.073308944702148, + "step": 1330 + }, + { + "epoch": 7.887407407407407, + "grad_norm": 3.4515341264688244, + "learning_rate": 1.426272675704998e-10, + "logits/chosen": -1.0453736782073975, + "logits/rejected": -1.174626350402832, + "logps/chosen": -51.22376251220703, + "logps/rejected": -77.333740234375, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14971916377544403, + "rewards/margins": 7.365710735321045, + "rewards/rejected": -7.515429496765137, + "step": 1331 + }, + { + "epoch": 7.8933333333333335, + "grad_norm": 4.2431530547055685, + "learning_rate": 1.2153026895178608e-10, + "logits/chosen": -1.2558284997940063, + "logits/rejected": -1.230025053024292, + "logps/chosen": -61.31620788574219, + "logps/rejected": -71.90235137939453, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9064638614654541, + "rewards/margins": 6.033405303955078, + "rewards/rejected": -6.939869403839111, + "step": 1332 + }, + { + "epoch": 7.899259259259259, + "grad_norm": 4.070328981967792, + "learning_rate": 1.0212050595895249e-10, + "logits/chosen": -1.1583914756774902, + "logits/rejected": -1.1680545806884766, + "logps/chosen": -50.851749420166016, + "logps/rejected": -60.996482849121094, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11362215876579285, + "rewards/margins": 5.627436637878418, + "rewards/rejected": -5.513814926147461, + "step": 1333 + }, + { + "epoch": 7.905185185185185, + "grad_norm": 3.568567101282868, + "learning_rate": 8.439810965113481e-11, + "logits/chosen": -1.3088608980178833, + "logits/rejected": -1.3825346231460571, + "logps/chosen": -40.22638702392578, + "logps/rejected": -64.48670959472656, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6336486339569092, + "rewards/margins": 4.98115873336792, + "rewards/rejected": -5.61480712890625, + "step": 1334 + }, + { + "epoch": 7.911111111111111, + "grad_norm": 3.0558534132855786, + "learning_rate": 6.836319969388827e-11, + "logits/chosen": -1.2456482648849487, + "logits/rejected": -1.3389393091201782, + "logps/chosen": -49.01996612548828, + "logps/rejected": -72.98611450195312, + "loss": 0.0312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6034674644470215, + "rewards/margins": 5.329452037811279, + "rewards/rejected": -5.932919502258301, + "step": 1335 + }, + { + "epoch": 7.917037037037037, + "grad_norm": 3.3169742529023445, + "learning_rate": 5.4015884358549204e-11, + "logits/chosen": -1.1440975666046143, + "logits/rejected": -1.1358253955841064, + "logps/chosen": -50.89934539794922, + "logps/rejected": -66.2007064819336, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8520979881286621, + "rewards/margins": 6.465874195098877, + "rewards/rejected": -7.317971229553223, + "step": 1336 + }, + { + "epoch": 7.922962962962963, + "grad_norm": 4.224810833586431, + "learning_rate": 4.135626052143015e-11, + "logits/chosen": -1.095085620880127, + "logits/rejected": -1.1812586784362793, + "logps/chosen": -46.71308898925781, + "logps/rejected": -73.55007934570312, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1982324123382568, + "rewards/margins": 6.405635356903076, + "rewards/rejected": -7.603867530822754, + "step": 1337 + }, + { + "epoch": 7.928888888888888, + "grad_norm": 5.206045992657888, + "learning_rate": 3.0384413663125944e-11, + "logits/chosen": -1.204892635345459, + "logits/rejected": -1.2737915515899658, + "logps/chosen": -50.526206970214844, + "logps/rejected": -62.54721450805664, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9793308973312378, + "rewards/margins": 3.79130482673645, + "rewards/rejected": -4.770635604858398, + "step": 1338 + }, + { + "epoch": 7.934814814814815, + "grad_norm": 4.090212912205461, + "learning_rate": 2.110041786804184e-11, + "logits/chosen": -1.1907331943511963, + "logits/rejected": -1.2384510040283203, + "logps/chosen": -58.870052337646484, + "logps/rejected": -83.39106750488281, + "loss": 0.029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.765225887298584, + "rewards/margins": 6.5894670486450195, + "rewards/rejected": -8.354693412780762, + "step": 1339 + }, + { + "epoch": 7.940740740740741, + "grad_norm": 3.288258776425581, + "learning_rate": 1.350433582381072e-11, + "logits/chosen": -1.0594743490219116, + "logits/rejected": -1.071734070777893, + "logps/chosen": -42.901512145996094, + "logps/rejected": -69.27069091796875, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2702757716178894, + "rewards/margins": 5.44796895980835, + "rewards/rejected": -5.718244552612305, + "step": 1340 + }, + { + "epoch": 7.946666666666666, + "grad_norm": 4.191786397560346, + "learning_rate": 7.596218820876688e-12, + "logits/chosen": -1.2260366678237915, + "logits/rejected": -1.3320492506027222, + "logps/chosen": -66.5378646850586, + "logps/rejected": -68.939208984375, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2590943574905396, + "rewards/margins": 5.165265083312988, + "rewards/rejected": -6.424359321594238, + "step": 1341 + }, + { + "epoch": 7.952592592592593, + "grad_norm": 3.1073122257922434, + "learning_rate": 3.376106752134289e-12, + "logits/chosen": -1.0636171102523804, + "logits/rejected": -1.1753029823303223, + "logps/chosen": -33.64069747924805, + "logps/rejected": -56.730262756347656, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11559783667325974, + "rewards/margins": 5.496131420135498, + "rewards/rejected": -5.380533695220947, + "step": 1342 + }, + { + "epoch": 7.9585185185185185, + "grad_norm": 3.4153108889579182, + "learning_rate": 8.440281127897186e-13, + "logits/chosen": -1.0520870685577393, + "logits/rejected": -1.1678812503814697, + "logps/chosen": -58.156349182128906, + "logps/rejected": -92.31648254394531, + "loss": 0.0247, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6064121127128601, + "rewards/margins": 8.176019668579102, + "rewards/rejected": -8.782432556152344, + "step": 1343 + }, + { + "epoch": 7.964444444444444, + "grad_norm": 3.2275881385738243, + "learning_rate": 0.0, + "logits/chosen": -0.9758695363998413, + "logits/rejected": -0.9926738142967224, + "logps/chosen": -50.735172271728516, + "logps/rejected": -66.82237243652344, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9227455258369446, + "rewards/margins": 5.92061710357666, + "rewards/rejected": -6.843362331390381, + "step": 1344 + }, + { + "epoch": 7.964444444444444, + "step": 1344, + "total_flos": 0.0, + "train_loss": 0.19260043763954723, + "train_runtime": 3828.823, + "train_samples_per_second": 22.555, + "train_steps_per_second": 0.351 + } + ], + "logging_steps": 1, + "max_steps": 1344, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}