{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1207, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.7548209366391183e-09, "logits/chosen": 26.39580726623535, "logits/rejected": 11.82745361328125, "logps/chosen": -16.1988468170166, "logps/rejected": -13.144353866577148, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.7548209366391185e-08, "logits/chosen": 29.872276306152344, "logits/rejected": 14.677568435668945, "logps/chosen": -20.20920181274414, "logps/rejected": -13.73128604888916, "loss": 0.691, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0067731961607933044, "rewards/margins": 0.006475849077105522, "rewards/rejected": 0.00029734655981883407, "step": 10 }, { "epoch": 0.02, "learning_rate": 5.509641873278237e-08, "logits/chosen": 30.209009170532227, "logits/rejected": 15.056164741516113, "logps/chosen": -18.60057258605957, "logps/rejected": -11.032133102416992, "loss": 0.6935, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0034766769967973232, "rewards/margins": -0.0025064717046916485, "rewards/rejected": 0.005983148701488972, "step": 20 }, { "epoch": 0.02, "learning_rate": 8.264462809917355e-08, "logits/chosen": 28.76483154296875, "logits/rejected": 15.631364822387695, "logps/chosen": -15.23334789276123, "logps/rejected": -11.55754280090332, "loss": 0.6896, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.006374981254339218, "rewards/margins": 0.012253050692379475, "rewards/rejected": -0.005878068506717682, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.1019283746556474e-07, "logits/chosen": 28.015995025634766, "logits/rejected": 15.524843215942383, "logps/chosen": -14.355989456176758, "logps/rejected": -13.594694137573242, "loss": 0.6963, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0007919473573565483, "rewards/margins": -0.001215279451571405, "rewards/rejected": 0.0020072259940207005, "step": 40 }, { "epoch": 0.04, "learning_rate": 1.3774104683195592e-07, "logits/chosen": 26.29638671875, "logits/rejected": 14.062334060668945, "logps/chosen": -16.20824432373047, "logps/rejected": -12.649689674377441, "loss": 0.6942, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.010644015856087208, "rewards/margins": -0.0052523259073495865, "rewards/rejected": -0.005391689948737621, "step": 50 }, { "epoch": 0.05, "learning_rate": 1.652892561983471e-07, "logits/chosen": 29.316747665405273, "logits/rejected": 16.020191192626953, "logps/chosen": -12.744729995727539, "logps/rejected": -11.136804580688477, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002563739661127329, "rewards/margins": 0.005123462527990341, "rewards/rejected": -0.00768720218911767, "step": 60 }, { "epoch": 0.06, "learning_rate": 1.928374655647383e-07, "logits/chosen": 26.675777435302734, "logits/rejected": 14.060956001281738, "logps/chosen": -16.239505767822266, "logps/rejected": -12.4938325881958, "loss": 0.6939, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0003385665186215192, "rewards/margins": -0.002058954443782568, "rewards/rejected": 0.0017203886527568102, "step": 70 }, { "epoch": 0.07, "learning_rate": 2.2038567493112948e-07, "logits/chosen": 28.196887969970703, "logits/rejected": 12.054227828979492, "logps/chosen": -20.904470443725586, "logps/rejected": -13.934288024902344, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": 0.003977104090154171, "rewards/margins": 0.015087218955159187, "rewards/rejected": -0.011110116727650166, "step": 80 }, { "epoch": 0.07, "learning_rate": 2.4793388429752067e-07, "logits/chosen": 29.7618408203125, "logits/rejected": 18.205202102661133, "logps/chosen": -15.01855754852295, "logps/rejected": -11.685895919799805, "loss": 0.6915, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0006884575122967362, "rewards/margins": 0.005596242845058441, "rewards/rejected": -0.006284697912633419, "step": 90 }, { "epoch": 0.08, "learning_rate": 2.7548209366391183e-07, "logits/chosen": 30.90869140625, "logits/rejected": 15.959096908569336, "logps/chosen": -17.325950622558594, "logps/rejected": -12.195378303527832, "loss": 0.6876, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.010991076938807964, "rewards/margins": 0.013710225000977516, "rewards/rejected": -0.0027191475965082645, "step": 100 }, { "epoch": 0.09, "learning_rate": 3.0303030303030305e-07, "logits/chosen": 28.79201316833496, "logits/rejected": 15.219340324401855, "logps/chosen": -18.680936813354492, "logps/rejected": -13.7983980178833, "loss": 0.6877, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.004993617534637451, "rewards/margins": -0.00810275413095951, "rewards/rejected": 0.0031091361306607723, "step": 110 }, { "epoch": 0.1, "learning_rate": 3.305785123966942e-07, "logits/chosen": 27.478748321533203, "logits/rejected": 15.220988273620605, "logps/chosen": -16.01573944091797, "logps/rejected": -12.17924690246582, "loss": 0.6792, "rewards/accuracies": 0.6875, "rewards/chosen": 0.014243041165173054, "rewards/margins": 0.034918706864118576, "rewards/rejected": -0.020675668492913246, "step": 120 }, { "epoch": 0.11, "learning_rate": 3.5812672176308537e-07, "logits/chosen": 26.560028076171875, "logits/rejected": 15.939570426940918, "logps/chosen": -13.826791763305664, "logps/rejected": -13.941461563110352, "loss": 0.6773, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.019950710237026215, "rewards/margins": 0.04326733201742172, "rewards/rejected": -0.02331661805510521, "step": 130 }, { "epoch": 0.12, "learning_rate": 3.856749311294766e-07, "logits/chosen": 26.08416748046875, "logits/rejected": 14.266347885131836, "logps/chosen": -15.164468765258789, "logps/rejected": -11.680855751037598, "loss": 0.6802, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.011501764878630638, "rewards/margins": 0.026589352637529373, "rewards/rejected": -0.015087584033608437, "step": 140 }, { "epoch": 0.12, "learning_rate": 4.1322314049586775e-07, "logits/chosen": 25.376188278198242, "logits/rejected": 14.326393127441406, "logps/chosen": -13.939798355102539, "logps/rejected": -11.30695915222168, "loss": 0.6736, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02343142405152321, "rewards/margins": 0.04499483481049538, "rewards/rejected": -0.021563410758972168, "step": 150 }, { "epoch": 0.13, "learning_rate": 4.4077134986225897e-07, "logits/chosen": 26.731481552124023, "logits/rejected": 15.117622375488281, "logps/chosen": -16.316083908081055, "logps/rejected": -12.077122688293457, "loss": 0.6702, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.016491735354065895, "rewards/margins": 0.03659318760037422, "rewards/rejected": -0.020101450383663177, "step": 160 }, { "epoch": 0.14, "learning_rate": 4.6831955922865013e-07, "logits/chosen": 24.21108055114746, "logits/rejected": 13.150289535522461, "logps/chosen": -15.752470016479492, "logps/rejected": -12.763819694519043, "loss": 0.6669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015269671566784382, "rewards/margins": 0.05473564937710762, "rewards/rejected": -0.03946598246693611, "step": 170 }, { "epoch": 0.15, "learning_rate": 4.958677685950413e-07, "logits/chosen": 25.526866912841797, "logits/rejected": 13.504974365234375, "logps/chosen": -15.648371696472168, "logps/rejected": -14.616043090820312, "loss": 0.6593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03144051134586334, "rewards/margins": 0.08548189699649811, "rewards/rejected": -0.05404139310121536, "step": 180 }, { "epoch": 0.16, "learning_rate": 5.234159779614325e-07, "logits/chosen": 27.430185317993164, "logits/rejected": 14.744453430175781, "logps/chosen": -15.322090148925781, "logps/rejected": -12.112442970275879, "loss": 0.6537, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.049348168075084686, "rewards/margins": 0.10144704580307007, "rewards/rejected": -0.05209888145327568, "step": 190 }, { "epoch": 0.17, "learning_rate": 5.509641873278237e-07, "logits/chosen": 25.273468017578125, "logits/rejected": 13.581573486328125, "logps/chosen": -15.621078491210938, "logps/rejected": -12.692342758178711, "loss": 0.6464, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03991089388728142, "rewards/margins": 0.09212763607501984, "rewards/rejected": -0.05221674591302872, "step": 200 }, { "epoch": 0.17, "learning_rate": 5.785123966942148e-07, "logits/chosen": 25.644643783569336, "logits/rejected": 13.767237663269043, "logps/chosen": -15.617544174194336, "logps/rejected": -12.023015975952148, "loss": 0.644, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.04201408103108406, "rewards/margins": 0.09177117049694061, "rewards/rejected": -0.04975708946585655, "step": 210 }, { "epoch": 0.18, "learning_rate": 6.060606060606061e-07, "logits/chosen": 24.790884017944336, "logits/rejected": 12.790756225585938, "logps/chosen": -15.314257621765137, "logps/rejected": -14.318506240844727, "loss": 0.6307, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.038176413625478745, "rewards/margins": 0.15190339088439941, "rewards/rejected": -0.11372697353363037, "step": 220 }, { "epoch": 0.19, "learning_rate": 6.336088154269972e-07, "logits/chosen": 22.88454818725586, "logits/rejected": 13.714921951293945, "logps/chosen": -14.685772895812988, "logps/rejected": -13.587165832519531, "loss": 0.6179, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.043118949979543686, "rewards/margins": 0.15061327815055847, "rewards/rejected": -0.10749433189630508, "step": 230 }, { "epoch": 0.2, "learning_rate": 6.611570247933884e-07, "logits/chosen": 21.898698806762695, "logits/rejected": 11.047361373901367, "logps/chosen": -15.72697925567627, "logps/rejected": -14.030769348144531, "loss": 0.6285, "rewards/accuracies": 0.875, "rewards/chosen": 0.03969469666481018, "rewards/margins": 0.1823449730873108, "rewards/rejected": -0.1426502764225006, "step": 240 }, { "epoch": 0.21, "learning_rate": 6.887052341597795e-07, "logits/chosen": 22.43340301513672, "logits/rejected": 12.1521577835083, "logps/chosen": -13.921714782714844, "logps/rejected": -12.171701431274414, "loss": 0.5992, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.016784045845270157, "rewards/margins": 0.17469708621501923, "rewards/rejected": -0.15791305899620056, "step": 250 }, { "epoch": 0.22, "learning_rate": 7.162534435261707e-07, "logits/chosen": 18.996856689453125, "logits/rejected": 11.13405990600586, "logps/chosen": -12.716665267944336, "logps/rejected": -13.552793502807617, "loss": 0.595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.018164673820137978, "rewards/margins": 0.222183495759964, "rewards/rejected": -0.20401883125305176, "step": 260 }, { "epoch": 0.22, "learning_rate": 7.43801652892562e-07, "logits/chosen": 19.36636734008789, "logits/rejected": 11.057047843933105, "logps/chosen": -14.496172904968262, "logps/rejected": -14.488624572753906, "loss": 0.5816, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.001324807875789702, "rewards/margins": 0.22037926316261292, "rewards/rejected": -0.22170408070087433, "step": 270 }, { "epoch": 0.23, "learning_rate": 7.713498622589532e-07, "logits/chosen": 19.02053451538086, "logits/rejected": 9.96888542175293, "logps/chosen": -16.628681182861328, "logps/rejected": -15.949708938598633, "loss": 0.5839, "rewards/accuracies": 0.8125, "rewards/chosen": -0.012945109978318214, "rewards/margins": 0.256003201007843, "rewards/rejected": -0.2689483165740967, "step": 280 }, { "epoch": 0.24, "learning_rate": 7.988980716253443e-07, "logits/chosen": 18.616968154907227, "logits/rejected": 8.835405349731445, "logps/chosen": -19.750667572021484, "logps/rejected": -16.19721031188965, "loss": 0.5653, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.06239085644483566, "rewards/margins": 0.2769063115119934, "rewards/rejected": -0.33929720520973206, "step": 290 }, { "epoch": 0.25, "learning_rate": 8.264462809917355e-07, "logits/chosen": 15.796917915344238, "logits/rejected": 8.689828872680664, "logps/chosen": -17.465213775634766, "logps/rejected": -15.414024353027344, "loss": 0.5615, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.08662676811218262, "rewards/margins": 0.2607734799385071, "rewards/rejected": -0.3474002778530121, "step": 300 }, { "epoch": 0.26, "learning_rate": 8.539944903581266e-07, "logits/chosen": 18.2047176361084, "logits/rejected": 9.365196228027344, "logps/chosen": -19.556568145751953, "logps/rejected": -17.901718139648438, "loss": 0.5593, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.08784887939691544, "rewards/margins": 0.3691648840904236, "rewards/rejected": -0.4570137560367584, "step": 310 }, { "epoch": 0.27, "learning_rate": 8.815426997245179e-07, "logits/chosen": 15.353642463684082, "logits/rejected": 6.739576816558838, "logps/chosen": -20.512248992919922, "logps/rejected": -19.218114852905273, "loss": 0.5561, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.17307022213935852, "rewards/margins": 0.36532771587371826, "rewards/rejected": -0.5383979678153992, "step": 320 }, { "epoch": 0.27, "learning_rate": 9.09090909090909e-07, "logits/chosen": 15.02954387664795, "logits/rejected": 7.253678798675537, "logps/chosen": -17.31052017211914, "logps/rejected": -19.950084686279297, "loss": 0.513, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.12172117084264755, "rewards/margins": 0.5619670152664185, "rewards/rejected": -0.6836881041526794, "step": 330 }, { "epoch": 0.28, "learning_rate": 9.366391184573003e-07, "logits/chosen": 13.335912704467773, "logits/rejected": 6.360755443572998, "logps/chosen": -15.61326789855957, "logps/rejected": -18.34040069580078, "loss": 0.514, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11029072105884552, "rewards/margins": 0.49073973298072815, "rewards/rejected": -0.6010304689407349, "step": 340 }, { "epoch": 0.29, "learning_rate": 9.641873278236914e-07, "logits/chosen": 13.87597370147705, "logits/rejected": 6.252366065979004, "logps/chosen": -18.98870277404785, "logps/rejected": -21.158178329467773, "loss": 0.5214, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.24019315838813782, "rewards/margins": 0.5031756162643433, "rewards/rejected": -0.7433687448501587, "step": 350 }, { "epoch": 0.3, "learning_rate": 9.917355371900827e-07, "logits/chosen": 11.825353622436523, "logits/rejected": 5.63161039352417, "logps/chosen": -19.554107666015625, "logps/rejected": -23.050094604492188, "loss": 0.4963, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.21284595131874084, "rewards/margins": 0.669977068901062, "rewards/rejected": -0.88282310962677, "step": 360 }, { "epoch": 0.31, "learning_rate": 9.978514426028237e-07, "logits/chosen": 12.24281120300293, "logits/rejected": 5.460196495056152, "logps/chosen": -17.32269287109375, "logps/rejected": -20.322254180908203, "loss": 0.5042, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.14736561477184296, "rewards/margins": 0.6818751692771912, "rewards/rejected": -0.8292407989501953, "step": 370 }, { "epoch": 0.31, "learning_rate": 9.94782074892572e-07, "logits/chosen": 12.279787063598633, "logits/rejected": 4.998232841491699, "logps/chosen": -20.298416137695312, "logps/rejected": -18.53396224975586, "loss": 0.5261, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2723187804222107, "rewards/margins": 0.4115908145904541, "rewards/rejected": -0.6839095950126648, "step": 380 }, { "epoch": 0.32, "learning_rate": 9.917127071823205e-07, "logits/chosen": 10.357662200927734, "logits/rejected": 4.154456615447998, "logps/chosen": -18.979114532470703, "logps/rejected": -21.90044593811035, "loss": 0.4965, "rewards/accuracies": 0.875, "rewards/chosen": -0.21007630228996277, "rewards/margins": 0.7006437182426453, "rewards/rejected": -0.91072016954422, "step": 390 }, { "epoch": 0.33, "learning_rate": 9.886433394720688e-07, "logits/chosen": 11.203413009643555, "logits/rejected": 4.303916931152344, "logps/chosen": -21.126995086669922, "logps/rejected": -22.852222442626953, "loss": 0.4797, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.24745836853981018, "rewards/margins": 0.7489529848098755, "rewards/rejected": -0.9964112043380737, "step": 400 }, { "epoch": 0.34, "learning_rate": 9.85573971761817e-07, "logits/chosen": 10.214349746704102, "logits/rejected": 4.580574989318848, "logps/chosen": -18.805049896240234, "logps/rejected": -23.103439331054688, "loss": 0.4577, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.263306587934494, "rewards/margins": 0.6945715546607971, "rewards/rejected": -0.9578781127929688, "step": 410 }, { "epoch": 0.35, "learning_rate": 9.825046040515653e-07, "logits/chosen": 8.766363143920898, "logits/rejected": 3.4318103790283203, "logps/chosen": -19.84041404724121, "logps/rejected": -24.33705711364746, "loss": 0.4796, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.3555639386177063, "rewards/margins": 0.8206856846809387, "rewards/rejected": -1.176249623298645, "step": 420 }, { "epoch": 0.36, "learning_rate": 9.794352363413136e-07, "logits/chosen": 9.122425079345703, "logits/rejected": 3.945725917816162, "logps/chosen": -17.52169418334961, "logps/rejected": -21.919692993164062, "loss": 0.4498, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.30032050609588623, "rewards/margins": 0.7815459370613098, "rewards/rejected": -1.0818665027618408, "step": 430 }, { "epoch": 0.36, "learning_rate": 9.76365868631062e-07, "logits/chosen": 8.77065372467041, "logits/rejected": 3.7122809886932373, "logps/chosen": -19.154537200927734, "logps/rejected": -24.57700538635254, "loss": 0.4632, "rewards/accuracies": 0.75, "rewards/chosen": -0.34669268131256104, "rewards/margins": 0.7837158441543579, "rewards/rejected": -1.130408525466919, "step": 440 }, { "epoch": 0.37, "learning_rate": 9.732965009208102e-07, "logits/chosen": 8.956605911254883, "logits/rejected": 3.74959135055542, "logps/chosen": -20.082813262939453, "logps/rejected": -26.573284149169922, "loss": 0.4309, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.2830074429512024, "rewards/margins": 0.9917829632759094, "rewards/rejected": -1.2747904062271118, "step": 450 }, { "epoch": 0.38, "learning_rate": 9.702271332105585e-07, "logits/chosen": 8.408400535583496, "logits/rejected": 3.8332247734069824, "logps/chosen": -20.55516815185547, "logps/rejected": -28.63671875, "loss": 0.4418, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.41807669401168823, "rewards/margins": 0.9992788434028625, "rewards/rejected": -1.4173555374145508, "step": 460 }, { "epoch": 0.39, "learning_rate": 9.671577655003068e-07, "logits/chosen": 9.856401443481445, "logits/rejected": 3.718613386154175, "logps/chosen": -23.1437931060791, "logps/rejected": -23.95172882080078, "loss": 0.476, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4323394298553467, "rewards/margins": 0.7092487812042236, "rewards/rejected": -1.1415880918502808, "step": 470 }, { "epoch": 0.4, "learning_rate": 9.640883977900553e-07, "logits/chosen": 8.667339324951172, "logits/rejected": 2.9023003578186035, "logps/chosen": -21.404876708984375, "logps/rejected": -26.862573623657227, "loss": 0.4335, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.39861372113227844, "rewards/margins": 0.9942604899406433, "rewards/rejected": -1.3928741216659546, "step": 480 }, { "epoch": 0.41, "learning_rate": 9.610190300798036e-07, "logits/chosen": 10.391927719116211, "logits/rejected": 4.0250349044799805, "logps/chosen": -21.36394500732422, "logps/rejected": -23.884748458862305, "loss": 0.4719, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4040401577949524, "rewards/margins": 0.7485278248786926, "rewards/rejected": -1.152567982673645, "step": 490 }, { "epoch": 0.41, "learning_rate": 9.57949662369552e-07, "logits/chosen": 9.353527069091797, "logits/rejected": 2.8288679122924805, "logps/chosen": -25.12404441833496, "logps/rejected": -29.256061553955078, "loss": 0.4368, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.4457229971885681, "rewards/margins": 1.1012388467788696, "rewards/rejected": -1.546961784362793, "step": 500 }, { "epoch": 0.42, "learning_rate": 9.548802946593e-07, "logits/chosen": 9.08042049407959, "logits/rejected": 3.0393152236938477, "logps/chosen": -19.980348587036133, "logps/rejected": -27.6535587310791, "loss": 0.4654, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.43742698431015015, "rewards/margins": 1.0599616765975952, "rewards/rejected": -1.4973886013031006, "step": 510 }, { "epoch": 0.43, "learning_rate": 9.518109269490484e-07, "logits/chosen": 9.434534072875977, "logits/rejected": 3.911717653274536, "logps/chosen": -19.236385345458984, "logps/rejected": -25.492351531982422, "loss": 0.4462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.47872424125671387, "rewards/margins": 0.9361263513565063, "rewards/rejected": -1.4148504734039307, "step": 520 }, { "epoch": 0.44, "learning_rate": 9.487415592387967e-07, "logits/chosen": 8.490682601928711, "logits/rejected": 2.998997449874878, "logps/chosen": -22.683155059814453, "logps/rejected": -27.904708862304688, "loss": 0.4349, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.45238861441612244, "rewards/margins": 1.0105421543121338, "rewards/rejected": -1.462930679321289, "step": 530 }, { "epoch": 0.45, "learning_rate": 9.456721915285451e-07, "logits/chosen": 8.005501747131348, "logits/rejected": 2.737757682800293, "logps/chosen": -20.333431243896484, "logps/rejected": -25.07863998413086, "loss": 0.4708, "rewards/accuracies": 0.75, "rewards/chosen": -0.5002648830413818, "rewards/margins": 0.7843891382217407, "rewards/rejected": -1.284654140472412, "step": 540 }, { "epoch": 0.46, "learning_rate": 9.426028238182934e-07, "logits/chosen": 9.101912498474121, "logits/rejected": 3.375775098800659, "logps/chosen": -21.192792892456055, "logps/rejected": -25.945816040039062, "loss": 0.4237, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3506927788257599, "rewards/margins": 1.0248024463653564, "rewards/rejected": -1.375495195388794, "step": 550 }, { "epoch": 0.46, "learning_rate": 9.395334561080417e-07, "logits/chosen": 7.2458319664001465, "logits/rejected": 2.7766900062561035, "logps/chosen": -17.64424705505371, "logps/rejected": -26.000625610351562, "loss": 0.4295, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.3371118903160095, "rewards/margins": 1.1054513454437256, "rewards/rejected": -1.4425632953643799, "step": 560 }, { "epoch": 0.47, "learning_rate": 9.3646408839779e-07, "logits/chosen": 9.442346572875977, "logits/rejected": 3.140744686126709, "logps/chosen": -22.103801727294922, "logps/rejected": -24.909873962402344, "loss": 0.4487, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.49047422409057617, "rewards/margins": 0.8361800909042358, "rewards/rejected": -1.3266541957855225, "step": 570 }, { "epoch": 0.48, "learning_rate": 9.333947206875383e-07, "logits/chosen": 8.154924392700195, "logits/rejected": 2.262667655944824, "logps/chosen": -21.817089080810547, "logps/rejected": -27.921960830688477, "loss": 0.4416, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.43799060583114624, "rewards/margins": 1.0769658088684082, "rewards/rejected": -1.5149563550949097, "step": 580 }, { "epoch": 0.49, "learning_rate": 9.303253529772867e-07, "logits/chosen": 8.049860000610352, "logits/rejected": 2.4107654094696045, "logps/chosen": -19.665203094482422, "logps/rejected": -29.684701919555664, "loss": 0.3801, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4063892960548401, "rewards/margins": 1.3044670820236206, "rewards/rejected": -1.7108564376831055, "step": 590 }, { "epoch": 0.5, "learning_rate": 9.27255985267035e-07, "logits/chosen": 6.481775760650635, "logits/rejected": 1.5172659158706665, "logps/chosen": -19.953556060791016, "logps/rejected": -28.245019912719727, "loss": 0.4764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4115385413169861, "rewards/margins": 1.1210072040557861, "rewards/rejected": -1.532545804977417, "step": 600 }, { "epoch": 0.51, "learning_rate": 9.241866175567833e-07, "logits/chosen": 6.667191505432129, "logits/rejected": 2.198336601257324, "logps/chosen": -20.37515640258789, "logps/rejected": -26.758914947509766, "loss": 0.4404, "rewards/accuracies": 0.8125, "rewards/chosen": -0.44527798891067505, "rewards/margins": 0.9959529042243958, "rewards/rejected": -1.4412308931350708, "step": 610 }, { "epoch": 0.51, "learning_rate": 9.211172498465316e-07, "logits/chosen": 7.916673183441162, "logits/rejected": 2.2087154388427734, "logps/chosen": -22.382949829101562, "logps/rejected": -31.7215576171875, "loss": 0.4229, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.44467464089393616, "rewards/margins": 1.4343655109405518, "rewards/rejected": -1.8790401220321655, "step": 620 }, { "epoch": 0.52, "learning_rate": 9.1804788213628e-07, "logits/chosen": 6.640467643737793, "logits/rejected": 1.0475283861160278, "logps/chosen": -20.7355899810791, "logps/rejected": -27.703689575195312, "loss": 0.4368, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5207439661026001, "rewards/margins": 1.0785092115402222, "rewards/rejected": -1.5992531776428223, "step": 630 }, { "epoch": 0.53, "learning_rate": 9.149785144260283e-07, "logits/chosen": 7.666356086730957, "logits/rejected": 2.7732627391815186, "logps/chosen": -21.1815242767334, "logps/rejected": -26.17404556274414, "loss": 0.4759, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.546532928943634, "rewards/margins": 0.8885123133659363, "rewards/rejected": -1.4350452423095703, "step": 640 }, { "epoch": 0.54, "learning_rate": 9.119091467157764e-07, "logits/chosen": 7.963301658630371, "logits/rejected": 3.2961201667785645, "logps/chosen": -25.18842315673828, "logps/rejected": -34.91250228881836, "loss": 0.4303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6346438527107239, "rewards/margins": 1.2946127653121948, "rewards/rejected": -1.9292566776275635, "step": 650 }, { "epoch": 0.55, "learning_rate": 9.088397790055247e-07, "logits/chosen": 8.768635749816895, "logits/rejected": 2.8241097927093506, "logps/chosen": -19.90713119506836, "logps/rejected": -25.65402603149414, "loss": 0.4516, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.29502856731414795, "rewards/margins": 1.0819497108459473, "rewards/rejected": -1.3769781589508057, "step": 660 }, { "epoch": 0.56, "learning_rate": 9.057704112952731e-07, "logits/chosen": 8.142461776733398, "logits/rejected": 3.9477691650390625, "logps/chosen": -15.86639404296875, "logps/rejected": -28.901317596435547, "loss": 0.4027, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.2652890980243683, "rewards/margins": 1.242172360420227, "rewards/rejected": -1.507461428642273, "step": 670 }, { "epoch": 0.56, "learning_rate": 9.027010435850214e-07, "logits/chosen": 6.994630336761475, "logits/rejected": 2.5457005500793457, "logps/chosen": -20.582218170166016, "logps/rejected": -32.019065856933594, "loss": 0.3887, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.46679186820983887, "rewards/margins": 1.4011623859405518, "rewards/rejected": -1.8679542541503906, "step": 680 }, { "epoch": 0.57, "learning_rate": 8.996316758747697e-07, "logits/chosen": 8.759129524230957, "logits/rejected": 2.230740785598755, "logps/chosen": -22.778507232666016, "logps/rejected": -30.9553279876709, "loss": 0.411, "rewards/accuracies": 0.875, "rewards/chosen": -0.44361358880996704, "rewards/margins": 1.3607478141784668, "rewards/rejected": -1.804361343383789, "step": 690 }, { "epoch": 0.58, "learning_rate": 8.96562308164518e-07, "logits/chosen": 8.269109725952148, "logits/rejected": 2.9809257984161377, "logps/chosen": -20.890945434570312, "logps/rejected": -29.63645362854004, "loss": 0.4285, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4037216603755951, "rewards/margins": 1.249784231185913, "rewards/rejected": -1.6535059213638306, "step": 700 }, { "epoch": 0.59, "learning_rate": 8.934929404542663e-07, "logits/chosen": 8.710639953613281, "logits/rejected": 2.2442469596862793, "logps/chosen": -23.648401260375977, "logps/rejected": -32.714195251464844, "loss": 0.4078, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5406786799430847, "rewards/margins": 1.3301970958709717, "rewards/rejected": -1.8708757162094116, "step": 710 }, { "epoch": 0.6, "learning_rate": 8.904235727440147e-07, "logits/chosen": 7.416018486022949, "logits/rejected": 2.754124402999878, "logps/chosen": -19.017892837524414, "logps/rejected": -29.035497665405273, "loss": 0.4203, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47058525681495667, "rewards/margins": 1.207162618637085, "rewards/rejected": -1.6777477264404297, "step": 720 }, { "epoch": 0.6, "learning_rate": 8.87354205033763e-07, "logits/chosen": 8.478178977966309, "logits/rejected": 2.1382641792297363, "logps/chosen": -20.007627487182617, "logps/rejected": -28.002798080444336, "loss": 0.4004, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3123311996459961, "rewards/margins": 1.304101824760437, "rewards/rejected": -1.6164331436157227, "step": 730 }, { "epoch": 0.61, "learning_rate": 8.842848373235113e-07, "logits/chosen": 7.798401832580566, "logits/rejected": 3.254276752471924, "logps/chosen": -19.55718994140625, "logps/rejected": -30.556888580322266, "loss": 0.3779, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4111674726009369, "rewards/margins": 1.4537394046783447, "rewards/rejected": -1.864906668663025, "step": 740 }, { "epoch": 0.62, "learning_rate": 8.812154696132596e-07, "logits/chosen": 7.0984344482421875, "logits/rejected": 2.0487966537475586, "logps/chosen": -22.84370994567871, "logps/rejected": -31.912372589111328, "loss": 0.3788, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.628212034702301, "rewards/margins": 1.291364312171936, "rewards/rejected": -1.9195764064788818, "step": 750 }, { "epoch": 0.63, "learning_rate": 8.78146101903008e-07, "logits/chosen": 6.816826820373535, "logits/rejected": 2.289916515350342, "logps/chosen": -17.96799087524414, "logps/rejected": -30.52446937561035, "loss": 0.4104, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4248894155025482, "rewards/margins": 1.4009729623794556, "rewards/rejected": -1.8258622884750366, "step": 760 }, { "epoch": 0.64, "learning_rate": 8.750767341927563e-07, "logits/chosen": 7.032599449157715, "logits/rejected": 1.5030428171157837, "logps/chosen": -20.34360122680664, "logps/rejected": -29.46416664123535, "loss": 0.4124, "rewards/accuracies": 0.75, "rewards/chosen": -0.44062167406082153, "rewards/margins": 1.3179067373275757, "rewards/rejected": -1.758528470993042, "step": 770 }, { "epoch": 0.65, "learning_rate": 8.720073664825046e-07, "logits/chosen": 7.1342644691467285, "logits/rejected": 2.248922348022461, "logps/chosen": -21.11859130859375, "logps/rejected": -32.53595733642578, "loss": 0.4279, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.48495250940322876, "rewards/margins": 1.4530541896820068, "rewards/rejected": -1.9380067586898804, "step": 780 }, { "epoch": 0.65, "learning_rate": 8.689379987722528e-07, "logits/chosen": 8.225824356079102, "logits/rejected": 2.9487974643707275, "logps/chosen": -22.600482940673828, "logps/rejected": -26.93109130859375, "loss": 0.4919, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6789675951004028, "rewards/margins": 0.8013946413993835, "rewards/rejected": -1.4803621768951416, "step": 790 }, { "epoch": 0.66, "learning_rate": 8.658686310620012e-07, "logits/chosen": 7.499259948730469, "logits/rejected": 2.63696551322937, "logps/chosen": -21.861347198486328, "logps/rejected": -32.604400634765625, "loss": 0.3619, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5475003123283386, "rewards/margins": 1.4189766645431519, "rewards/rejected": -1.9664767980575562, "step": 800 }, { "epoch": 0.67, "learning_rate": 8.627992633517495e-07, "logits/chosen": 7.751049041748047, "logits/rejected": 2.5999271869659424, "logps/chosen": -19.283123016357422, "logps/rejected": -28.197988510131836, "loss": 0.405, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4166495203971863, "rewards/margins": 1.1838258504867554, "rewards/rejected": -1.600475549697876, "step": 810 }, { "epoch": 0.68, "learning_rate": 8.597298956414978e-07, "logits/chosen": 7.180790901184082, "logits/rejected": 2.6444411277770996, "logps/chosen": -18.11197280883789, "logps/rejected": -31.1019229888916, "loss": 0.3864, "rewards/accuracies": 0.875, "rewards/chosen": -0.38645365834236145, "rewards/margins": 1.5587714910507202, "rewards/rejected": -1.9452250003814697, "step": 820 }, { "epoch": 0.69, "learning_rate": 8.566605279312461e-07, "logits/chosen": 7.82586669921875, "logits/rejected": 2.0177793502807617, "logps/chosen": -21.55264663696289, "logps/rejected": -31.95199966430664, "loss": 0.4222, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.509517252445221, "rewards/margins": 1.4583721160888672, "rewards/rejected": -1.967889428138733, "step": 830 }, { "epoch": 0.7, "learning_rate": 8.535911602209944e-07, "logits/chosen": 8.363626480102539, "logits/rejected": 2.3595283031463623, "logps/chosen": -24.080026626586914, "logps/rejected": -31.28499412536621, "loss": 0.4039, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5127073526382446, "rewards/margins": 1.3342761993408203, "rewards/rejected": -1.8469836711883545, "step": 840 }, { "epoch": 0.7, "learning_rate": 8.505217925107428e-07, "logits/chosen": 7.312800407409668, "logits/rejected": 1.5801849365234375, "logps/chosen": -24.2260684967041, "logps/rejected": -31.582530975341797, "loss": 0.3933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.618637204170227, "rewards/margins": 1.3562564849853516, "rewards/rejected": -1.974893569946289, "step": 850 }, { "epoch": 0.71, "learning_rate": 8.474524248004911e-07, "logits/chosen": 7.401895999908447, "logits/rejected": 2.4258737564086914, "logps/chosen": -23.088558197021484, "logps/rejected": -34.858551025390625, "loss": 0.4219, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5713120102882385, "rewards/margins": 1.4319599866867065, "rewards/rejected": -2.0032718181610107, "step": 860 }, { "epoch": 0.72, "learning_rate": 8.443830570902394e-07, "logits/chosen": 7.6809186935424805, "logits/rejected": 1.9851977825164795, "logps/chosen": -22.698991775512695, "logps/rejected": -28.081676483154297, "loss": 0.4202, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4443737864494324, "rewards/margins": 1.1558778285980225, "rewards/rejected": -1.6002517938613892, "step": 870 }, { "epoch": 0.73, "learning_rate": 8.413136893799877e-07, "logits/chosen": 7.851733207702637, "logits/rejected": 2.435418128967285, "logps/chosen": -20.593048095703125, "logps/rejected": -34.249061584472656, "loss": 0.3602, "rewards/accuracies": 0.875, "rewards/chosen": -0.3790115714073181, "rewards/margins": 1.632420301437378, "rewards/rejected": -2.0114316940307617, "step": 880 }, { "epoch": 0.74, "learning_rate": 8.382443216697361e-07, "logits/chosen": 8.80190372467041, "logits/rejected": 1.6230602264404297, "logps/chosen": -22.132980346679688, "logps/rejected": -34.392417907714844, "loss": 0.3686, "rewards/accuracies": 0.8125, "rewards/chosen": -0.42830777168273926, "rewards/margins": 1.6717679500579834, "rewards/rejected": -2.1000757217407227, "step": 890 }, { "epoch": 0.75, "learning_rate": 8.351749539594844e-07, "logits/chosen": 5.885406970977783, "logits/rejected": 1.2718799114227295, "logps/chosen": -20.493120193481445, "logps/rejected": -37.505531311035156, "loss": 0.3625, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5876837968826294, "rewards/margins": 1.8779884576797485, "rewards/rejected": -2.465672731399536, "step": 900 }, { "epoch": 0.75, "learning_rate": 8.321055862492326e-07, "logits/chosen": 7.028090000152588, "logits/rejected": 2.0724620819091797, "logps/chosen": -16.073829650878906, "logps/rejected": -31.5205078125, "loss": 0.3969, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.28281980752944946, "rewards/margins": 1.7461847066879272, "rewards/rejected": -2.0290045738220215, "step": 910 }, { "epoch": 0.76, "learning_rate": 8.290362185389809e-07, "logits/chosen": 6.3124566078186035, "logits/rejected": 1.5514699220657349, "logps/chosen": -23.613628387451172, "logps/rejected": -37.15446090698242, "loss": 0.3778, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.6669880747795105, "rewards/margins": 1.6280193328857422, "rewards/rejected": -2.2950074672698975, "step": 920 }, { "epoch": 0.77, "learning_rate": 8.259668508287292e-07, "logits/chosen": 6.556632041931152, "logits/rejected": 2.245039463043213, "logps/chosen": -18.484737396240234, "logps/rejected": -27.77170753479004, "loss": 0.403, "rewards/accuracies": 0.875, "rewards/chosen": -0.4349077343940735, "rewards/margins": 1.382024884223938, "rewards/rejected": -1.8169326782226562, "step": 930 }, { "epoch": 0.78, "learning_rate": 8.228974831184775e-07, "logits/chosen": 7.8744659423828125, "logits/rejected": 1.9694064855575562, "logps/chosen": -22.25751304626465, "logps/rejected": -31.402053833007812, "loss": 0.3876, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.4930182099342346, "rewards/margins": 1.3855167627334595, "rewards/rejected": -1.8785350322723389, "step": 940 }, { "epoch": 0.79, "learning_rate": 8.198281154082258e-07, "logits/chosen": 6.765068054199219, "logits/rejected": 0.8865860104560852, "logps/chosen": -25.58555030822754, "logps/rejected": -34.25336456298828, "loss": 0.3733, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7215810418128967, "rewards/margins": 1.4795466661453247, "rewards/rejected": -2.201127529144287, "step": 950 }, { "epoch": 0.8, "learning_rate": 8.167587476979741e-07, "logits/chosen": 7.8806304931640625, "logits/rejected": 1.8827158212661743, "logps/chosen": -25.51276206970215, "logps/rejected": -36.2748908996582, "loss": 0.4153, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.586471438407898, "rewards/margins": 1.6772867441177368, "rewards/rejected": -2.263758420944214, "step": 960 }, { "epoch": 0.8, "learning_rate": 8.136893799877224e-07, "logits/chosen": 7.661480903625488, "logits/rejected": 2.064643383026123, "logps/chosen": -18.307971954345703, "logps/rejected": -27.596664428710938, "loss": 0.4239, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4390641748905182, "rewards/margins": 1.1807688474655151, "rewards/rejected": -1.619832992553711, "step": 970 }, { "epoch": 0.81, "learning_rate": 8.106200122774708e-07, "logits/chosen": 6.639142036437988, "logits/rejected": 1.3553249835968018, "logps/chosen": -20.3842716217041, "logps/rejected": -30.23708152770996, "loss": 0.3863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5303052663803101, "rewards/margins": 1.1942421197891235, "rewards/rejected": -1.7245473861694336, "step": 980 }, { "epoch": 0.82, "learning_rate": 8.075506445672191e-07, "logits/chosen": 7.895041465759277, "logits/rejected": 1.9215726852416992, "logps/chosen": -20.276226043701172, "logps/rejected": -32.28209686279297, "loss": 0.4116, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.3237341046333313, "rewards/margins": 1.7264082431793213, "rewards/rejected": -2.050142288208008, "step": 990 }, { "epoch": 0.83, "learning_rate": 8.044812768569674e-07, "logits/chosen": 5.989454746246338, "logits/rejected": 1.490361213684082, "logps/chosen": -17.40456199645996, "logps/rejected": -29.303430557250977, "loss": 0.3933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3834887146949768, "rewards/margins": 1.3754570484161377, "rewards/rejected": -1.7589458227157593, "step": 1000 }, { "epoch": 0.84, "learning_rate": 8.014119091467157e-07, "logits/chosen": 6.841021537780762, "logits/rejected": 2.0202012062072754, "logps/chosen": -20.09317970275879, "logps/rejected": -32.73921203613281, "loss": 0.3925, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5454772114753723, "rewards/margins": 1.5412445068359375, "rewards/rejected": -2.086721658706665, "step": 1010 }, { "epoch": 0.85, "learning_rate": 7.983425414364641e-07, "logits/chosen": 7.673565864562988, "logits/rejected": 2.136615037918091, "logps/chosen": -23.69163703918457, "logps/rejected": -38.273521423339844, "loss": 0.358, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5863059759140015, "rewards/margins": 1.8284555673599243, "rewards/rejected": -2.414761781692505, "step": 1020 }, { "epoch": 0.85, "learning_rate": 7.952731737262124e-07, "logits/chosen": 7.155651092529297, "logits/rejected": 1.925082802772522, "logps/chosen": -21.324934005737305, "logps/rejected": -30.300312042236328, "loss": 0.4033, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5829371809959412, "rewards/margins": 1.2596461772918701, "rewards/rejected": -1.8425836563110352, "step": 1030 }, { "epoch": 0.86, "learning_rate": 7.922038060159607e-07, "logits/chosen": 6.084378242492676, "logits/rejected": 1.0743868350982666, "logps/chosen": -18.61046028137207, "logps/rejected": -34.52151107788086, "loss": 0.4137, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.39434099197387695, "rewards/margins": 1.7767207622528076, "rewards/rejected": -2.1710617542266846, "step": 1040 }, { "epoch": 0.87, "learning_rate": 7.89134438305709e-07, "logits/chosen": 8.538483619689941, "logits/rejected": 2.3657491207122803, "logps/chosen": -23.018388748168945, "logps/rejected": -30.576452255249023, "loss": 0.4068, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.47074708342552185, "rewards/margins": 1.3823906183242798, "rewards/rejected": -1.8531373739242554, "step": 1050 }, { "epoch": 0.88, "learning_rate": 7.860650705954574e-07, "logits/chosen": 7.347692966461182, "logits/rejected": 2.716426372528076, "logps/chosen": -19.426044464111328, "logps/rejected": -37.71467590332031, "loss": 0.3489, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.47837352752685547, "rewards/margins": 1.9600915908813477, "rewards/rejected": -2.438465118408203, "step": 1060 }, { "epoch": 0.89, "learning_rate": 7.829957028852057e-07, "logits/chosen": 6.892212867736816, "logits/rejected": 1.0572636127471924, "logps/chosen": -23.00826644897461, "logps/rejected": -32.86492156982422, "loss": 0.3699, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.5571302175521851, "rewards/margins": 1.5158917903900146, "rewards/rejected": -2.07302188873291, "step": 1070 }, { "epoch": 0.89, "learning_rate": 7.799263351749539e-07, "logits/chosen": 6.221480369567871, "logits/rejected": 1.0836519002914429, "logps/chosen": -23.462066650390625, "logps/rejected": -30.0970516204834, "loss": 0.3992, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7195588946342468, "rewards/margins": 1.1414775848388672, "rewards/rejected": -1.8610365390777588, "step": 1080 }, { "epoch": 0.9, "learning_rate": 7.768569674647022e-07, "logits/chosen": 6.816685676574707, "logits/rejected": 1.0394450426101685, "logps/chosen": -26.166027069091797, "logps/rejected": -37.584617614746094, "loss": 0.3405, "rewards/accuracies": 0.875, "rewards/chosen": -0.7692574262619019, "rewards/margins": 1.640032410621643, "rewards/rejected": -2.409290075302124, "step": 1090 }, { "epoch": 0.91, "learning_rate": 7.737875997544505e-07, "logits/chosen": 6.812434196472168, "logits/rejected": 1.2860956192016602, "logps/chosen": -23.65300178527832, "logps/rejected": -30.067922592163086, "loss": 0.3969, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5657695531845093, "rewards/margins": 1.3029512166976929, "rewards/rejected": -1.8687207698822021, "step": 1100 }, { "epoch": 0.92, "learning_rate": 7.707182320441989e-07, "logits/chosen": 7.6257643699646, "logits/rejected": 2.1522669792175293, "logps/chosen": -21.895709991455078, "logps/rejected": -33.245784759521484, "loss": 0.3956, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5534289479255676, "rewards/margins": 1.4592502117156982, "rewards/rejected": -2.012679100036621, "step": 1110 }, { "epoch": 0.93, "learning_rate": 7.676488643339472e-07, "logits/chosen": 5.683910369873047, "logits/rejected": 1.4482964277267456, "logps/chosen": -16.731014251708984, "logps/rejected": -30.28756332397461, "loss": 0.3422, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3482304513454437, "rewards/margins": 1.5359057188034058, "rewards/rejected": -1.8841359615325928, "step": 1120 }, { "epoch": 0.94, "learning_rate": 7.645794966236955e-07, "logits/chosen": 5.315042972564697, "logits/rejected": 0.5469619631767273, "logps/chosen": -22.090145111083984, "logps/rejected": -34.24272918701172, "loss": 0.3736, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.6412897109985352, "rewards/margins": 1.5726335048675537, "rewards/rejected": -2.2139229774475098, "step": 1130 }, { "epoch": 0.94, "learning_rate": 7.615101289134438e-07, "logits/chosen": 6.428001403808594, "logits/rejected": 0.9593698382377625, "logps/chosen": -23.828655242919922, "logps/rejected": -40.11705780029297, "loss": 0.3795, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.5897135734558105, "rewards/margins": 2.1194376945495605, "rewards/rejected": -2.709150791168213, "step": 1140 }, { "epoch": 0.95, "learning_rate": 7.584407612031922e-07, "logits/chosen": 5.8998918533325195, "logits/rejected": 1.3522698879241943, "logps/chosen": -21.022151947021484, "logps/rejected": -35.53426742553711, "loss": 0.4113, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5483814477920532, "rewards/margins": 1.7454071044921875, "rewards/rejected": -2.293788433074951, "step": 1150 }, { "epoch": 0.96, "learning_rate": 7.553713934929404e-07, "logits/chosen": 6.54571008682251, "logits/rejected": 1.3163864612579346, "logps/chosen": -22.404720306396484, "logps/rejected": -34.17644500732422, "loss": 0.401, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.48589468002319336, "rewards/margins": 1.6578638553619385, "rewards/rejected": -2.143758773803711, "step": 1160 }, { "epoch": 0.97, "learning_rate": 7.523020257826887e-07, "logits/chosen": 6.862661838531494, "logits/rejected": 0.9924262762069702, "logps/chosen": -20.455472946166992, "logps/rejected": -31.53582763671875, "loss": 0.3799, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5109466910362244, "rewards/margins": 1.570505142211914, "rewards/rejected": -2.081451892852783, "step": 1170 }, { "epoch": 0.98, "learning_rate": 7.49232658072437e-07, "logits/chosen": 6.391946315765381, "logits/rejected": 1.5113942623138428, "logps/chosen": -19.47252655029297, "logps/rejected": -31.717498779296875, "loss": 0.3975, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.45694679021835327, "rewards/margins": 1.5134795904159546, "rewards/rejected": -1.9704265594482422, "step": 1180 }, { "epoch": 0.99, "learning_rate": 7.461632903621854e-07, "logits/chosen": 7.326245307922363, "logits/rejected": 1.6284013986587524, "logps/chosen": -16.586496353149414, "logps/rejected": -28.21954345703125, "loss": 0.3592, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.40092840790748596, "rewards/margins": 1.4025030136108398, "rewards/rejected": -1.8034312725067139, "step": 1190 }, { "epoch": 0.99, "learning_rate": 7.430939226519337e-07, "logits/chosen": 6.372334003448486, "logits/rejected": 1.4239909648895264, "logps/chosen": -25.205049514770508, "logps/rejected": -38.102027893066406, "loss": 0.3902, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8760989904403687, "rewards/margins": 1.513891577720642, "rewards/rejected": -2.38999080657959, "step": 1200 }, { "epoch": 1.0, "eval_logits/chosen": 7.443781852722168, "eval_logits/rejected": -0.24055449664592743, "eval_logps/chosen": -23.05025291442871, "eval_logps/rejected": -42.57103729248047, "eval_loss": 0.2749061584472656, "eval_rewards/accuracies": 0.9285714030265808, "eval_rewards/chosen": -0.303264319896698, "eval_rewards/margins": 2.6336822509765625, "eval_rewards/rejected": -2.9369466304779053, "eval_runtime": 13.4587, "eval_samples_per_second": 7.43, "eval_steps_per_second": 0.52, "step": 1207 } ], "logging_steps": 10, "max_steps": 3621, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }