{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 0.1786131818333053, "learning_rate": 6.25e-08, "logits/chosen": -1.4481778144836426, "logits/rejected": -1.4499433040618896, "logps/chosen": -7.982362270355225, "logps/rejected": -8.15577507019043, "loss": -0.0009, "rewards/accuracies": 0.5, "rewards/chosen": -7.982362270355225, "rewards/margins": 0.17341338098049164, "rewards/rejected": -8.15577507019043, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 0.07282553733728953, "learning_rate": 1.25e-07, "logits/chosen": -1.4464797973632812, "logits/rejected": -1.4372261762619019, "logps/chosen": -8.047597885131836, "logps/rejected": -7.961185455322266, "loss": 0.0001, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.047597885131836, "rewards/margins": -0.08641364425420761, "rewards/rejected": -7.961185455322266, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 0.1915776567981181, "learning_rate": 1.875e-07, "logits/chosen": -1.4458509683609009, "logits/rejected": -1.4425008296966553, "logps/chosen": -7.852419853210449, "logps/rejected": -7.86798095703125, "loss": -0.0011, "rewards/accuracies": 0.5, "rewards/chosen": -7.852419853210449, "rewards/margins": 0.01556050218641758, "rewards/rejected": -7.86798095703125, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 0.18043036746469734, "learning_rate": 2.5e-07, "logits/chosen": -1.433124303817749, "logits/rejected": -1.4371713399887085, "logps/chosen": -8.189096450805664, "logps/rejected": -8.211885452270508, "loss": 0.0005, "rewards/accuracies": 0.5, "rewards/chosen": -8.189096450805664, "rewards/margins": 0.02278941310942173, "rewards/rejected": -8.211885452270508, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 0.0768554595715667, "learning_rate": 3.125e-07, "logits/chosen": -1.4702521562576294, "logits/rejected": -1.46065354347229, "logps/chosen": -8.118414878845215, "logps/rejected": -8.017342567443848, "loss": 0.0008, "rewards/accuracies": 0.4375, "rewards/chosen": -8.118414878845215, "rewards/margins": -0.10107225179672241, "rewards/rejected": -8.017342567443848, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 0.15370408069863134, "learning_rate": 3.75e-07, "logits/chosen": -1.4349619150161743, "logits/rejected": -1.4234455823898315, "logps/chosen": -7.866227626800537, "logps/rejected": -7.7843732833862305, "loss": 0.0009, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -7.866227626800537, "rewards/margins": -0.08185449987649918, "rewards/rejected": -7.7843732833862305, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 0.08486142616829283, "learning_rate": 4.3749999999999994e-07, "logits/chosen": -1.4395850896835327, "logits/rejected": -1.4178800582885742, "logps/chosen": -8.148027420043945, "logps/rejected": -8.014989852905273, "loss": -0.0007, "rewards/accuracies": 0.5, "rewards/chosen": -8.148027420043945, "rewards/margins": -0.13303671777248383, "rewards/rejected": -8.014989852905273, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 0.1990381181235585, "learning_rate": 5e-07, "logits/chosen": -1.4183995723724365, "logits/rejected": -1.4341777563095093, "logps/chosen": -8.116990089416504, "logps/rejected": -8.271265983581543, "loss": -0.0007, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.116990089416504, "rewards/margins": 0.15427525341510773, "rewards/rejected": -8.271265983581543, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 0.05622179554254164, "learning_rate": 5.625e-07, "logits/chosen": -1.4420310258865356, "logits/rejected": -1.442714810371399, "logps/chosen": -8.133280754089355, "logps/rejected": -7.922200679779053, "loss": -0.0001, "rewards/accuracies": 0.4375, "rewards/chosen": -8.133280754089355, "rewards/margins": -0.21108034253120422, "rewards/rejected": -7.922200679779053, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 0.09631702855440245, "learning_rate": 5.999678242522831e-07, "logits/chosen": -1.4332040548324585, "logits/rejected": -1.4520256519317627, "logps/chosen": -8.272871017456055, "logps/rejected": -8.261492729187012, "loss": 0.0006, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -8.272871017456055, "rewards/margins": -0.011378437280654907, "rewards/rejected": -8.261492729187012, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 0.06712571499202777, "learning_rate": 5.996059263493219e-07, "logits/chosen": -1.4460947513580322, "logits/rejected": -1.4431495666503906, "logps/chosen": -8.187493324279785, "logps/rejected": -8.160319328308105, "loss": 0.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -8.187493324279785, "rewards/margins": -0.02717405930161476, "rewards/rejected": -8.160319328308105, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 0.20114381829735603, "learning_rate": 5.988423976115163e-07, "logits/chosen": -1.4432101249694824, "logits/rejected": -1.455540657043457, "logps/chosen": -8.201032638549805, "logps/rejected": -8.437708854675293, "loss": -0.0006, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.201032638549805, "rewards/margins": 0.23667626082897186, "rewards/rejected": -8.437708854675293, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 0.17507947570299737, "learning_rate": 5.976782615723061e-07, "logits/chosen": -1.3914668560028076, "logits/rejected": -1.4124656915664673, "logps/chosen": -8.03328800201416, "logps/rejected": -8.427019119262695, "loss": 0.0001, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.03328800201416, "rewards/margins": 0.393731027841568, "rewards/rejected": -8.427019119262695, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 0.28383551734545676, "learning_rate": 5.961150787913738e-07, "logits/chosen": -1.399139404296875, "logits/rejected": -1.3929238319396973, "logps/chosen": -8.119722366333008, "logps/rejected": -8.134946823120117, "loss": 0.0014, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -8.119722366333008, "rewards/margins": 0.01522480882704258, "rewards/rejected": -8.134946823120117, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 0.05943092529446436, "learning_rate": 5.941549447626671e-07, "logits/chosen": -1.4151188135147095, "logits/rejected": -1.4230639934539795, "logps/chosen": -8.229839324951172, "logps/rejected": -8.183255195617676, "loss": -0.0001, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.229839324951172, "rewards/margins": -0.04658409580588341, "rewards/rejected": -8.183255195617676, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 0.11166728026103465, "learning_rate": 5.918004871053251e-07, "logits/chosen": -1.431121587753296, "logits/rejected": -1.4449329376220703, "logps/chosen": -8.453977584838867, "logps/rejected": -8.471723556518555, "loss": -0.0007, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -8.453977584838867, "rewards/margins": 0.017745357006788254, "rewards/rejected": -8.471723556518555, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 0.11291027423617692, "learning_rate": 5.890548620412763e-07, "logits/chosen": -1.4427986145019531, "logits/rejected": -1.4420478343963623, "logps/chosen": -8.645545959472656, "logps/rejected": -8.712440490722656, "loss": 0.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.645545959472656, "rewards/margins": 0.06689504534006119, "rewards/rejected": -8.712440490722656, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 0.2075325531424713, "learning_rate": 5.859217501642258e-07, "logits/chosen": -1.431888222694397, "logits/rejected": -1.4438416957855225, "logps/chosen": -8.67556381225586, "logps/rejected": -8.743408203125, "loss": -0.0001, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.67556381225586, "rewards/margins": 0.0678454041481018, "rewards/rejected": -8.743408203125, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 0.1350380165212238, "learning_rate": 5.824053515057091e-07, "logits/chosen": -1.4262675046920776, "logits/rejected": -1.4206186532974243, "logps/chosen": -8.711984634399414, "logps/rejected": -8.466314315795898, "loss": 0.0001, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -8.711984634399414, "rewards/margins": -0.24566936492919922, "rewards/rejected": -8.466314315795898, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 0.056801258116621844, "learning_rate": 5.785103799048218e-07, "logits/chosen": -1.4670069217681885, "logits/rejected": -1.4761860370635986, "logps/chosen": -8.660918235778809, "logps/rejected": -8.600263595581055, "loss": 0.0004, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -8.660918235778809, "rewards/margins": -0.060654301196336746, "rewards/rejected": -8.600263595581055, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 0.02647029175400585, "learning_rate": 5.742420566891749e-07, "logits/chosen": -1.4849385023117065, "logits/rejected": -1.4768640995025635, "logps/chosen": -8.584749221801758, "logps/rejected": -8.679868698120117, "loss": 0.0001, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.584749221801758, "rewards/margins": 0.09512145817279816, "rewards/rejected": -8.679868698120117, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 0.14957708248219087, "learning_rate": 5.696061036755478e-07, "logits/chosen": -1.537284255027771, "logits/rejected": -1.524402141571045, "logps/chosen": -9.06226921081543, "logps/rejected": -9.15291976928711, "loss": -0.0, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -9.06226921081543, "rewards/margins": 0.09065041691064835, "rewards/rejected": -9.15291976928711, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 0.04538978337584183, "learning_rate": 5.64608735499618e-07, "logits/chosen": -1.500104546546936, "logits/rejected": -1.4960343837738037, "logps/chosen": -9.003273010253906, "logps/rejected": -9.141524314880371, "loss": 0.0001, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -9.003273010253906, "rewards/margins": 0.1382521390914917, "rewards/rejected": -9.141524314880371, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 0.08148752548258974, "learning_rate": 5.592566512850545e-07, "logits/chosen": -1.5267969369888306, "logits/rejected": -1.522019624710083, "logps/chosen": -9.461637496948242, "logps/rejected": -9.392511367797852, "loss": 0.0003, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -9.461637496948242, "rewards/margins": -0.06912745535373688, "rewards/rejected": -9.392511367797852, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 0.011705138951069663, "learning_rate": 5.535570256631384e-07, "logits/chosen": -1.6196930408477783, "logits/rejected": -1.6130282878875732, "logps/chosen": -10.071660041809082, "logps/rejected": -9.955024719238281, "loss": 0.0003, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -10.071660041809082, "rewards/margins": -0.11663500964641571, "rewards/rejected": -9.955024719238281, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 0.03311359707714934, "learning_rate": 5.475174991549528e-07, "logits/chosen": -1.6836683750152588, "logits/rejected": -1.6870734691619873, "logps/chosen": -10.576017379760742, "logps/rejected": -10.587455749511719, "loss": 0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.576017379760742, "rewards/margins": 0.011437964625656605, "rewards/rejected": -10.587455749511719, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 0.0028807868975943276, "learning_rate": 5.411461679290317e-07, "logits/chosen": -1.716474175453186, "logits/rejected": -1.7054128646850586, "logps/chosen": -10.966715812683105, "logps/rejected": -11.17033576965332, "loss": 0.0, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -10.966715812683105, "rewards/margins": 0.2036197930574417, "rewards/rejected": -11.17033576965332, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 0.0071240218588725325, "learning_rate": 5.34451572948201e-07, "logits/chosen": -1.7624616622924805, "logits/rejected": -1.750759482383728, "logps/chosen": -11.156391143798828, "logps/rejected": -11.09487533569336, "loss": -0.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -11.156391143798828, "rewards/margins": -0.061515532433986664, "rewards/rejected": -11.09487533569336, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 0.007104778936656334, "learning_rate": 5.274426885201582e-07, "logits/chosen": -1.7734272480010986, "logits/rejected": -1.7652689218521118, "logps/chosen": -11.338696479797363, "logps/rejected": -11.272669792175293, "loss": 0.0001, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -11.338696479797363, "rewards/margins": -0.06602667272090912, "rewards/rejected": -11.272669792175293, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 0.0026785877800151164, "learning_rate": 5.201289102671411e-07, "logits/chosen": -1.7958892583847046, "logits/rejected": -1.7981617450714111, "logps/chosen": -11.584068298339844, "logps/rejected": -11.576414108276367, "loss": -0.0001, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -11.584068298339844, "rewards/margins": -0.007653522305190563, "rewards/rejected": -11.576414108276367, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 0.032494261377218156, "learning_rate": 5.12520042530811e-07, "logits/chosen": -1.7825043201446533, "logits/rejected": -1.7530311346054077, "logps/chosen": -11.603002548217773, "logps/rejected": -11.390130043029785, "loss": 0.0, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -11.603002548217773, "rewards/margins": -0.21287231147289276, "rewards/rejected": -11.390130043029785, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 0.011685158472598062, "learning_rate": 5.046262852292346e-07, "logits/chosen": -1.770146369934082, "logits/rejected": -1.7625595331192017, "logps/chosen": -11.565991401672363, "logps/rejected": -11.481651306152344, "loss": 0.0, "rewards/accuracies": 0.5, "rewards/chosen": -11.565991401672363, "rewards/margins": -0.08434131741523743, "rewards/rejected": -11.481651306152344, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 0.01649802186957788, "learning_rate": 4.964582201835856e-07, "logits/chosen": -1.800264596939087, "logits/rejected": -1.78522527217865, "logps/chosen": -11.585084915161133, "logps/rejected": -11.45960521697998, "loss": 0.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -11.585084915161133, "rewards/margins": -0.125479057431221, "rewards/rejected": -11.45960521697998, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 0.007019029699907713, "learning_rate": 4.880267969328908e-07, "logits/chosen": -1.7802222967147827, "logits/rejected": -1.7600643634796143, "logps/chosen": -11.943793296813965, "logps/rejected": -11.659793853759766, "loss": 0.0, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -11.943793296813965, "rewards/margins": -0.2839995324611664, "rewards/rejected": -11.659793853759766, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 0.05226386515779408, "learning_rate": 4.793433180558423e-07, "logits/chosen": -1.803500771522522, "logits/rejected": -1.780199646949768, "logps/chosen": -11.922220230102539, "logps/rejected": -11.642582893371582, "loss": -0.0001, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -11.922220230102539, "rewards/margins": -0.2796374261379242, "rewards/rejected": -11.642582893371582, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 0.028525783518634065, "learning_rate": 4.704194240193467e-07, "logits/chosen": -1.8105300664901733, "logits/rejected": -1.8023895025253296, "logps/chosen": -11.983869552612305, "logps/rejected": -11.885191917419434, "loss": -0.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -11.983869552612305, "rewards/margins": -0.09867729246616364, "rewards/rejected": -11.885191917419434, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 0.001398073561465244, "learning_rate": 4.6126707757412686e-07, "logits/chosen": -1.793454885482788, "logits/rejected": -1.7760779857635498, "logps/chosen": -11.986612319946289, "logps/rejected": -11.752717971801758, "loss": 0.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -11.986612319946289, "rewards/margins": -0.2338949739933014, "rewards/rejected": -11.752717971801758, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 0.0014862685239923532, "learning_rate": 4.5189854771829086e-07, "logits/chosen": -1.7965761423110962, "logits/rejected": -1.7916818857192993, "logps/chosen": -11.609036445617676, "logps/rejected": -11.723310470581055, "loss": -0.0, "rewards/accuracies": 0.46875, "rewards/chosen": -11.609036445617676, "rewards/margins": 0.11427430808544159, "rewards/rejected": -11.723310470581055, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 0.005759001469221081, "learning_rate": 4.4232639325036807e-07, "logits/chosen": -1.7666387557983398, "logits/rejected": -1.7522531747817993, "logps/chosen": -11.82097339630127, "logps/rejected": -11.537239074707031, "loss": 0.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -11.82097339630127, "rewards/margins": -0.28373217582702637, "rewards/rejected": -11.537239074707031, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 0.008163598439494825, "learning_rate": 4.32563445933859e-07, "logits/chosen": -1.7980865240097046, "logits/rejected": -1.799093246459961, "logps/chosen": -11.458420753479004, "logps/rejected": -11.457086563110352, "loss": 0.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -11.458420753479004, "rewards/margins": -0.001333725405856967, "rewards/rejected": -11.457086563110352, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 0.005297760091128626, "learning_rate": 4.226227932958664e-07, "logits/chosen": -1.7564678192138672, "logits/rejected": -1.7390811443328857, "logps/chosen": -11.681690216064453, "logps/rejected": -11.680364608764648, "loss": -0.0, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -11.681690216064453, "rewards/margins": -0.0013252407079562545, "rewards/rejected": -11.680364608764648, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 0.006991157705444288, "learning_rate": 4.1251776108286854e-07, "logits/chosen": -1.7594162225723267, "logits/rejected": -1.7496426105499268, "logps/chosen": -11.553954124450684, "logps/rejected": -11.592232704162598, "loss": -0.0, "rewards/accuracies": 0.4375, "rewards/chosen": -11.553954124450684, "rewards/margins": 0.03827826306223869, "rewards/rejected": -11.592232704162598, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 0.17801591448497012, "learning_rate": 4.022618953971514e-07, "logits/chosen": -1.7519290447235107, "logits/rejected": -1.7526050806045532, "logps/chosen": -11.367416381835938, "logps/rejected": -11.493242263793945, "loss": -0.0001, "rewards/accuracies": 0.5, "rewards/chosen": -11.367416381835938, "rewards/margins": 0.12582536041736603, "rewards/rejected": -11.493242263793945, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 0.023941549745350215, "learning_rate": 3.918689445378477e-07, "logits/chosen": -1.7803691625595093, "logits/rejected": -1.7690092325210571, "logps/chosen": -11.14652156829834, "logps/rejected": -11.06640625, "loss": -0.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -11.14652156829834, "rewards/margins": -0.08011455833911896, "rewards/rejected": -11.06640625, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 0.012228246875019251, "learning_rate": 3.813528405709251e-07, "logits/chosen": -1.763047456741333, "logits/rejected": -1.748477578163147, "logps/chosen": -10.784479141235352, "logps/rejected": -10.884133338928223, "loss": -0.0, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -10.784479141235352, "rewards/margins": 0.09965618699789047, "rewards/rejected": -10.884133338928223, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 0.025471289629283092, "learning_rate": 3.707276806528282e-07, "logits/chosen": -1.7505123615264893, "logits/rejected": -1.726243257522583, "logps/chosen": -10.95793342590332, "logps/rejected": -10.815945625305176, "loss": -0.0004, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -10.95793342590332, "rewards/margins": -0.1419888436794281, "rewards/rejected": -10.815945625305176, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 0.041173650635296204, "learning_rate": 3.6000770813281334e-07, "logits/chosen": -1.7271692752838135, "logits/rejected": -1.7334954738616943, "logps/chosen": -10.472602844238281, "logps/rejected": -10.564123153686523, "loss": 0.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -10.472602844238281, "rewards/margins": 0.09151904284954071, "rewards/rejected": -10.564123153686523, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 0.08997133666429326, "learning_rate": 3.4920729345930654e-07, "logits/chosen": -1.7182731628417969, "logits/rejected": -1.7184534072875977, "logps/chosen": -10.474161148071289, "logps/rejected": -10.386842727661133, "loss": -0.0002, "rewards/accuracies": 0.4375, "rewards/chosen": -10.474161148071289, "rewards/margins": -0.08731891214847565, "rewards/rejected": -10.386842727661133, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 0.1394130468386245, "learning_rate": 3.383409149158814e-07, "logits/chosen": -1.7007522583007812, "logits/rejected": -1.7036033868789673, "logps/chosen": -10.160537719726562, "logps/rejected": -10.112937927246094, "loss": 0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.160537719726562, "rewards/margins": -0.047600626945495605, "rewards/rejected": -10.112937927246094, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 0.020206583655585347, "learning_rate": 3.2742313921268035e-07, "logits/chosen": -1.6959460973739624, "logits/rejected": -1.6945714950561523, "logps/chosen": -10.04688549041748, "logps/rejected": -10.157293319702148, "loss": 0.0004, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -10.04688549041748, "rewards/margins": 0.11040810495615005, "rewards/rejected": -10.157293319702148, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 0.02211534123468387, "learning_rate": 3.1646860195929825e-07, "logits/chosen": -1.7011514902114868, "logits/rejected": -1.6924482583999634, "logps/chosen": -10.177471160888672, "logps/rejected": -10.2760648727417, "loss": -0.0, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -10.177471160888672, "rewards/margins": 0.09859313070774078, "rewards/rejected": -10.2760648727417, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 0.01462304817094237, "learning_rate": 3.054919880453032e-07, "logits/chosen": -1.635061264038086, "logits/rejected": -1.6313447952270508, "logps/chosen": -10.064990043640137, "logps/rejected": -10.172506332397461, "loss": 0.0002, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.064990043640137, "rewards/margins": 0.10751698166131973, "rewards/rejected": -10.172506332397461, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 0.025471797170515387, "learning_rate": 2.9450801195469686e-07, "logits/chosen": -1.705881118774414, "logits/rejected": -1.7156155109405518, "logps/chosen": -10.338391304016113, "logps/rejected": -10.457733154296875, "loss": -0.0001, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -10.338391304016113, "rewards/margins": 0.11934256553649902, "rewards/rejected": -10.457733154296875, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 0.12522976970659572, "learning_rate": 2.835313980407017e-07, "logits/chosen": -1.7283920049667358, "logits/rejected": -1.693943977355957, "logps/chosen": -10.727476119995117, "logps/rejected": -10.667070388793945, "loss": 0.0002, "rewards/accuracies": 0.5, "rewards/chosen": -10.727476119995117, "rewards/margins": -0.06040619686245918, "rewards/rejected": -10.667070388793945, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 0.054630239668369114, "learning_rate": 2.7257686078731973e-07, "logits/chosen": -1.7700306177139282, "logits/rejected": -1.7575089931488037, "logps/chosen": -10.57455825805664, "logps/rejected": -10.53483772277832, "loss": 0.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -10.57455825805664, "rewards/margins": -0.039720237255096436, "rewards/rejected": -10.53483772277832, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 0.01001763169581052, "learning_rate": 2.6165908508411857e-07, "logits/chosen": -1.7749627828598022, "logits/rejected": -1.7755094766616821, "logps/chosen": -10.61900806427002, "logps/rejected": -10.749244689941406, "loss": -0.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -10.61900806427002, "rewards/margins": 0.13023611903190613, "rewards/rejected": -10.749244689941406, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 0.005742953865366501, "learning_rate": 2.5079270654069354e-07, "logits/chosen": -1.7149406671524048, "logits/rejected": -1.7182254791259766, "logps/chosen": -10.701220512390137, "logps/rejected": -10.742330551147461, "loss": -0.0, "rewards/accuracies": 0.5, "rewards/chosen": -10.701220512390137, "rewards/margins": 0.04111091420054436, "rewards/rejected": -10.742330551147461, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 0.07672060396160495, "learning_rate": 2.399922918671867e-07, "logits/chosen": -1.7587283849716187, "logits/rejected": -1.765321969985962, "logps/chosen": -10.592453956604004, "logps/rejected": -10.729036331176758, "loss": -0.0002, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -10.592453956604004, "rewards/margins": 0.13658304512500763, "rewards/rejected": -10.729036331176758, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 0.02428213775826746, "learning_rate": 2.2927231934717176e-07, "logits/chosen": -1.7775872945785522, "logits/rejected": -1.7704694271087646, "logps/chosen": -10.622035026550293, "logps/rejected": -10.568597793579102, "loss": -0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.622035026550293, "rewards/margins": -0.053437769412994385, "rewards/rejected": -10.568597793579102, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 0.02860916331635737, "learning_rate": 2.1864715942907487e-07, "logits/chosen": -1.7398102283477783, "logits/rejected": -1.73589289188385, "logps/chosen": -10.513009071350098, "logps/rejected": -10.603796005249023, "loss": 0.0001, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -10.513009071350098, "rewards/margins": 0.09078951179981232, "rewards/rejected": -10.603796005249023, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 0.006648402581775306, "learning_rate": 2.081310554621522e-07, "logits/chosen": -1.7437083721160889, "logits/rejected": -1.7521770000457764, "logps/chosen": -10.70260238647461, "logps/rejected": -10.796627044677734, "loss": 0.0, "rewards/accuracies": 0.46875, "rewards/chosen": -10.70260238647461, "rewards/margins": 0.09402483701705933, "rewards/rejected": -10.796627044677734, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 0.019271183476255463, "learning_rate": 1.9773810460284862e-07, "logits/chosen": -1.7898304462432861, "logits/rejected": -1.788010835647583, "logps/chosen": -10.732610702514648, "logps/rejected": -10.593297958374023, "loss": 0.0, "rewards/accuracies": 0.46875, "rewards/chosen": -10.732610702514648, "rewards/margins": -0.1393119990825653, "rewards/rejected": -10.593297958374023, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 0.02906299326320322, "learning_rate": 1.874822389171314e-07, "logits/chosen": -1.7548414468765259, "logits/rejected": -1.7560005187988281, "logps/chosen": -10.635225296020508, "logps/rejected": -10.656919479370117, "loss": -0.0007, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -10.635225296020508, "rewards/margins": 0.02169397473335266, "rewards/rejected": -10.656919479370117, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 0.014440585705017344, "learning_rate": 1.7737720670413356e-07, "logits/chosen": -1.796992540359497, "logits/rejected": -1.7862850427627563, "logps/chosen": -10.90754508972168, "logps/rejected": -10.712991714477539, "loss": 0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.90754508972168, "rewards/margins": -0.19455406069755554, "rewards/rejected": -10.712991714477539, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 0.04236411566791533, "learning_rate": 1.6743655406614095e-07, "logits/chosen": -1.7757809162139893, "logits/rejected": -1.762711524963379, "logps/chosen": -10.787301063537598, "logps/rejected": -10.89518928527832, "loss": -0.0002, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -10.787301063537598, "rewards/margins": 0.10788729041814804, "rewards/rejected": -10.89518928527832, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 0.027523703126398503, "learning_rate": 1.5767360674963198e-07, "logits/chosen": -1.749119520187378, "logits/rejected": -1.7414264678955078, "logps/chosen": -10.739618301391602, "logps/rejected": -10.675132751464844, "loss": -0.0, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -10.739618301391602, "rewards/margins": -0.06448470056056976, "rewards/rejected": -10.675132751464844, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 0.014789739092952364, "learning_rate": 1.4810145228170922e-07, "logits/chosen": -1.7625138759613037, "logits/rejected": -1.7610912322998047, "logps/chosen": -10.482194900512695, "logps/rejected": -10.562524795532227, "loss": 0.0, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -10.482194900512695, "rewards/margins": 0.0803314819931984, "rewards/rejected": -10.562524795532227, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 0.01922207578951387, "learning_rate": 1.3873292242587306e-07, "logits/chosen": -1.7635924816131592, "logits/rejected": -1.7583682537078857, "logps/chosen": -10.654876708984375, "logps/rejected": -10.662198066711426, "loss": 0.0, "rewards/accuracies": 0.46875, "rewards/chosen": -10.654876708984375, "rewards/margins": 0.0073205530643463135, "rewards/rejected": -10.662198066711426, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 0.03539872090887725, "learning_rate": 1.295805759806533e-07, "logits/chosen": -1.7667583227157593, "logits/rejected": -1.7786306142807007, "logps/chosen": -10.630470275878906, "logps/rejected": -10.790535926818848, "loss": -0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.630470275878906, "rewards/margins": 0.16006465256214142, "rewards/rejected": -10.790535926818848, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 0.022967566874532427, "learning_rate": 1.2065668194415777e-07, "logits/chosen": -1.7326412200927734, "logits/rejected": -1.729612946510315, "logps/chosen": -10.603561401367188, "logps/rejected": -10.510968208312988, "loss": -0.0001, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -10.603561401367188, "rewards/margins": -0.09259293973445892, "rewards/rejected": -10.510968208312988, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 0.012752562963674085, "learning_rate": 1.1197320306710923e-07, "logits/chosen": -1.762995958328247, "logits/rejected": -1.743704080581665, "logps/chosen": -10.666765213012695, "logps/rejected": -10.419971466064453, "loss": 0.0001, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -10.666765213012695, "rewards/margins": -0.24679343402385712, "rewards/rejected": -10.419971466064453, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 0.009738702402905076, "learning_rate": 1.035417798164145e-07, "logits/chosen": -1.747469186782837, "logits/rejected": -1.739599585533142, "logps/chosen": -10.463155746459961, "logps/rejected": -10.498678207397461, "loss": 0.0, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -10.463155746459961, "rewards/margins": 0.03552195057272911, "rewards/rejected": -10.498678207397461, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 0.06431835444147056, "learning_rate": 9.537371477076535e-08, "logits/chosen": -1.7092878818511963, "logits/rejected": -1.6988884210586548, "logps/chosen": -10.540400505065918, "logps/rejected": -10.477300643920898, "loss": 0.0, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -10.540400505065918, "rewards/margins": -0.06309934705495834, "rewards/rejected": -10.477300643920898, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 0.052909632347048964, "learning_rate": 8.747995746918898e-08, "logits/chosen": -1.7701542377471924, "logits/rejected": -1.758774757385254, "logps/chosen": -10.620153427124023, "logps/rejected": -10.608236312866211, "loss": 0.0, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -10.620153427124023, "rewards/margins": -0.011916184797883034, "rewards/rejected": -10.608236312866211, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 0.012450945922480083, "learning_rate": 7.987108973285888e-08, "logits/chosen": -1.7810709476470947, "logits/rejected": -1.7817420959472656, "logps/chosen": -10.626587867736816, "logps/rejected": -10.649564743041992, "loss": 0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.626587867736816, "rewards/margins": 0.022976160049438477, "rewards/rejected": -10.649564743041992, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 0.004460244436581798, "learning_rate": 7.255731147984174e-08, "logits/chosen": -1.7550201416015625, "logits/rejected": -1.7136766910552979, "logps/chosen": -10.767892837524414, "logps/rejected": -10.837907791137695, "loss": 0.0001, "rewards/accuracies": 0.53125, "rewards/chosen": -10.767892837524414, "rewards/margins": 0.0700153112411499, "rewards/rejected": -10.837907791137695, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 0.019593651768819848, "learning_rate": 6.554842705179898e-08, "logits/chosen": -1.778969407081604, "logits/rejected": -1.75950026512146, "logps/chosen": -10.800111770629883, "logps/rejected": -10.750751495361328, "loss": -0.0, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -10.800111770629883, "rewards/margins": -0.04935937374830246, "rewards/rejected": -10.750751495361328, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 0.03689590756531894, "learning_rate": 5.885383207096832e-08, "logits/chosen": -1.78683602809906, "logits/rejected": -1.7717739343643188, "logps/chosen": -10.598950386047363, "logps/rejected": -10.57533073425293, "loss": 0.0, "rewards/accuracies": 0.4375, "rewards/chosen": -10.598950386047363, "rewards/margins": -0.02361871860921383, "rewards/rejected": -10.57533073425293, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 0.012548371625903876, "learning_rate": 5.2482500845047165e-08, "logits/chosen": -1.743941307067871, "logits/rejected": -1.7405914068222046, "logps/chosen": -10.441683769226074, "logps/rejected": -10.442001342773438, "loss": -0.0005, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -10.441683769226074, "rewards/margins": 0.00031833647517487407, "rewards/rejected": -10.442001342773438, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 0.013565529452432605, "learning_rate": 4.644297433686162e-08, "logits/chosen": -1.741358757019043, "logits/rejected": -1.7224204540252686, "logps/chosen": -10.534662246704102, "logps/rejected": -10.434551239013672, "loss": -0.0002, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.534662246704102, "rewards/margins": -0.10010989010334015, "rewards/rejected": -10.434551239013672, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 0.10515870132328753, "learning_rate": 4.074334871494558e-08, "logits/chosen": -1.7736488580703735, "logits/rejected": -1.7777044773101807, "logps/chosen": -10.679037094116211, "logps/rejected": -10.682011604309082, "loss": -0.0002, "rewards/accuracies": 0.46875, "rewards/chosen": -10.679037094116211, "rewards/margins": 0.002974617527797818, "rewards/rejected": -10.682011604309082, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 0.009978883732705937, "learning_rate": 3.5391264500382e-08, "logits/chosen": -1.774903655052185, "logits/rejected": -1.7690646648406982, "logps/chosen": -10.4728364944458, "logps/rejected": -10.374021530151367, "loss": 0.0001, "rewards/accuracies": 0.5, "rewards/chosen": -10.4728364944458, "rewards/margins": -0.09881408512592316, "rewards/rejected": -10.374021530151367, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 0.006390665968550676, "learning_rate": 3.0393896324452226e-08, "logits/chosen": -1.7874574661254883, "logits/rejected": -1.7859035730361938, "logps/chosen": -10.748700141906738, "logps/rejected": -10.698121070861816, "loss": 0.0, "rewards/accuracies": 0.4375, "rewards/chosen": -10.748700141906738, "rewards/margins": -0.050578661262989044, "rewards/rejected": -10.698121070861816, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 0.0968516068570453, "learning_rate": 2.5757943310825026e-08, "logits/chosen": -1.7406418323516846, "logits/rejected": -1.7307815551757812, "logps/chosen": -10.560035705566406, "logps/rejected": -10.44668960571289, "loss": -0.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -10.560035705566406, "rewards/margins": -0.11334645748138428, "rewards/rejected": -10.44668960571289, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 0.00590008646483764, "learning_rate": 2.148962009517823e-08, "logits/chosen": -1.7577073574066162, "logits/rejected": -1.7527620792388916, "logps/chosen": -10.674304008483887, "logps/rejected": -10.642423629760742, "loss": 0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.674304008483887, "rewards/margins": -0.03188156336545944, "rewards/rejected": -10.642423629760742, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 0.013120062560596776, "learning_rate": 1.759464849429082e-08, "logits/chosen": -1.7664194107055664, "logits/rejected": -1.760266900062561, "logps/chosen": -10.494427680969238, "logps/rejected": -10.483784675598145, "loss": -0.0002, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -10.494427680969238, "rewards/margins": -0.010644030757248402, "rewards/rejected": -10.483784675598145, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 0.1742770180554167, "learning_rate": 1.4078249835774169e-08, "logits/chosen": -1.778710961341858, "logits/rejected": -1.7815498113632202, "logps/chosen": -10.63943862915039, "logps/rejected": -10.601329803466797, "loss": 0.0007, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -10.63943862915039, "rewards/margins": -0.03810780122876167, "rewards/rejected": -10.601329803466797, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 0.0404689579696443, "learning_rate": 1.0945137958723705e-08, "logits/chosen": -1.7349185943603516, "logits/rejected": -1.730891227722168, "logps/chosen": -10.740056991577148, "logps/rejected": -10.685806274414062, "loss": 0.0004, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -10.740056991577148, "rewards/margins": -0.054250918328762054, "rewards/rejected": -10.685806274414062, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 0.09262486522903134, "learning_rate": 8.19951289467482e-09, "logits/chosen": -1.7872415781021118, "logits/rejected": -1.776834487915039, "logps/chosen": -10.619129180908203, "logps/rejected": -10.654691696166992, "loss": 0.0001, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -10.619129180908203, "rewards/margins": 0.03556279465556145, "rewards/rejected": -10.654691696166992, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 0.02488581461219639, "learning_rate": 5.84505523733293e-09, "logits/chosen": -1.7146650552749634, "logits/rejected": -1.6959879398345947, "logps/chosen": -10.7011137008667, "logps/rejected": -10.602450370788574, "loss": 0.0001, "rewards/accuracies": 0.4375, "rewards/chosen": -10.7011137008667, "rewards/margins": -0.09866499900817871, "rewards/rejected": -10.602450370788574, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 0.09420372269289778, "learning_rate": 3.8849212086261466e-09, "logits/chosen": -1.7629196643829346, "logits/rejected": -1.7624677419662476, "logps/chosen": -10.534662246704102, "logps/rejected": -10.774109840393066, "loss": 0.0001, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -10.534662246704102, "rewards/margins": 0.23944692313671112, "rewards/rejected": -10.774109840393066, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 0.012886454137460854, "learning_rate": 2.3217384276938756e-09, "logits/chosen": -1.7755534648895264, "logits/rejected": -1.7767679691314697, "logps/chosen": -10.819222450256348, "logps/rejected": -10.87352180480957, "loss": -0.0, "rewards/accuracies": 0.46875, "rewards/chosen": -10.819222450256348, "rewards/margins": 0.05429871007800102, "rewards/rejected": -10.87352180480957, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 0.12756283914628896, "learning_rate": 1.1576023884836472e-09, "logits/chosen": -1.783143401145935, "logits/rejected": -1.7813498973846436, "logps/chosen": -10.936141967773438, "logps/rejected": -10.884490013122559, "loss": -0.0003, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -10.936141967773438, "rewards/margins": -0.051651883870363235, "rewards/rejected": -10.884490013122559, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 0.03220532320786913, "learning_rate": 3.940736506780395e-10, "logits/chosen": -1.7732326984405518, "logits/rejected": -1.7703964710235596, "logps/chosen": -10.523382186889648, "logps/rejected": -10.703128814697266, "loss": 0.0001, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -10.523382186889648, "rewards/margins": 0.17974743247032166, "rewards/rejected": -10.703128814697266, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 0.015956952957413534, "learning_rate": 3.2175747716822744e-11, "logits/chosen": -1.7491505146026611, "logits/rejected": -1.731268286705017, "logps/chosen": -10.880552291870117, "logps/rejected": -10.799489974975586, "loss": -0.0, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -10.880552291870117, "rewards/margins": -0.08106319606304169, "rewards/rejected": -10.799489974975586, "step": 475 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 4.822632607948269e-06, "train_runtime": 8193.4913, "train_samples_per_second": 7.461, "train_steps_per_second": 0.058 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }