{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5095, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019627085377821394, "grad_norm": 7695.078157716598, "learning_rate": 9.803921568627451e-10, "logits/chosen": -2.9195547103881836, "logits/rejected": -2.4565553665161133, "logps/chosen": -421.782470703125, "logps/rejected": -89.33955383300781, "loss": 499.7888, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001962708537782139, "grad_norm": 8523.861999496248, "learning_rate": 9.803921568627451e-09, "logits/chosen": -2.5579118728637695, "logits/rejected": -2.5539872646331787, "logps/chosen": -328.5361633300781, "logps/rejected": -224.728515625, "loss": 500.0604, "rewards/accuracies": 0.37037035822868347, "rewards/chosen": -0.02532227709889412, "rewards/margins": -0.12882067263126373, "rewards/rejected": 0.10349839180707932, "step": 10 }, { "epoch": 0.003925417075564278, "grad_norm": 8141.883083281547, "learning_rate": 1.9607843137254902e-08, "logits/chosen": -2.748523473739624, "logits/rejected": -2.6494884490966797, "logps/chosen": -241.36862182617188, "logps/rejected": -228.7290802001953, "loss": 466.6132, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.20755431056022644, "rewards/margins": 0.09448665380477905, "rewards/rejected": 0.11306764930486679, "step": 20 }, { "epoch": 0.005888125613346418, "grad_norm": 9089.169358641317, "learning_rate": 2.941176470588235e-08, "logits/chosen": -2.805922031402588, "logits/rejected": -2.7502973079681396, "logps/chosen": -271.3504333496094, "logps/rejected": -276.63763427734375, "loss": 573.6512, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.38350290060043335, "rewards/margins": -0.16553720831871033, "rewards/rejected": 0.5490401387214661, "step": 30 }, { "epoch": 0.007850834151128557, "grad_norm": 8417.60796163703, "learning_rate": 3.9215686274509804e-08, "logits/chosen": -2.5296969413757324, "logits/rejected": -2.614142894744873, "logps/chosen": -234.57723999023438, "logps/rejected": -197.72872924804688, "loss": 566.4437, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5821893215179443, "rewards/margins": 0.21359722316265106, "rewards/rejected": 1.3685922622680664, "step": 40 }, { "epoch": 0.009813542688910697, "grad_norm": 6327.552834890516, "learning_rate": 4.901960784313725e-08, "logits/chosen": -2.7662551403045654, "logits/rejected": -2.7321105003356934, "logps/chosen": -261.9884948730469, "logps/rejected": -280.5721435546875, "loss": 567.1037, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 3.4329071044921875, "rewards/margins": 0.40464481711387634, "rewards/rejected": 3.0282623767852783, "step": 50 }, { "epoch": 0.011776251226692836, "grad_norm": 5972.253584813363, "learning_rate": 5.88235294117647e-08, "logits/chosen": -2.759847640991211, "logits/rejected": -2.6784212589263916, "logps/chosen": -249.282470703125, "logps/rejected": -230.25588989257812, "loss": 525.8602, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 5.610996246337891, "rewards/margins": -0.2631208300590515, "rewards/rejected": 5.874117374420166, "step": 60 }, { "epoch": 0.013738959764474975, "grad_norm": 5990.262385424046, "learning_rate": 6.862745098039216e-08, "logits/chosen": -2.846557140350342, "logits/rejected": -2.780174493789673, "logps/chosen": -294.8406677246094, "logps/rejected": -229.6591033935547, "loss": 501.9, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 7.265566825866699, "rewards/margins": -0.3561328053474426, "rewards/rejected": 7.621700286865234, "step": 70 }, { "epoch": 0.015701668302257114, "grad_norm": 5796.982235405843, "learning_rate": 7.843137254901961e-08, "logits/chosen": -2.7839438915252686, "logits/rejected": -2.595407009124756, "logps/chosen": -315.75201416015625, "logps/rejected": -205.21286010742188, "loss": 535.338, "rewards/accuracies": 0.5, "rewards/chosen": 8.52326488494873, "rewards/margins": 0.2876408100128174, "rewards/rejected": 8.235624313354492, "step": 80 }, { "epoch": 0.017664376840039256, "grad_norm": 5927.090334614455, "learning_rate": 8.823529411764706e-08, "logits/chosen": -2.844291925430298, "logits/rejected": -2.818943738937378, "logps/chosen": -259.13348388671875, "logps/rejected": -261.32122802734375, "loss": 555.3117, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 11.018460273742676, "rewards/margins": -0.5622838139533997, "rewards/rejected": 11.580743789672852, "step": 90 }, { "epoch": 0.019627085377821395, "grad_norm": 4923.734511273913, "learning_rate": 9.80392156862745e-08, "logits/chosen": -2.8124048709869385, "logits/rejected": -2.730870246887207, "logps/chosen": -284.17413330078125, "logps/rejected": -248.091064453125, "loss": 487.075, "rewards/accuracies": 0.29999998211860657, "rewards/chosen": 11.848287582397461, "rewards/margins": 0.02657313272356987, "rewards/rejected": 11.821714401245117, "step": 100 }, { "epoch": 0.021589793915603533, "grad_norm": 5710.345549001755, "learning_rate": 1.0784313725490195e-07, "logits/chosen": -2.841452121734619, "logits/rejected": -2.760373592376709, "logps/chosen": -322.677978515625, "logps/rejected": -283.0740661621094, "loss": 496.9658, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 10.059597969055176, "rewards/margins": 0.1907721310853958, "rewards/rejected": 9.868825912475586, "step": 110 }, { "epoch": 0.023552502453385672, "grad_norm": 4267.1269607205095, "learning_rate": 1.176470588235294e-07, "logits/chosen": -2.6766438484191895, "logits/rejected": -2.60054349899292, "logps/chosen": -189.0832977294922, "logps/rejected": -173.5619659423828, "loss": 500.976, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 13.830352783203125, "rewards/margins": -1.0408800840377808, "rewards/rejected": 14.871232986450195, "step": 120 }, { "epoch": 0.02551521099116781, "grad_norm": 5055.7057004203425, "learning_rate": 1.2745098039215685e-07, "logits/chosen": -2.6325724124908447, "logits/rejected": -2.6757912635803223, "logps/chosen": -345.6778259277344, "logps/rejected": -288.1807556152344, "loss": 516.968, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 15.467287063598633, "rewards/margins": 2.80692720413208, "rewards/rejected": 12.660360336303711, "step": 130 }, { "epoch": 0.02747791952894995, "grad_norm": 6694.84371329099, "learning_rate": 1.3725490196078432e-07, "logits/chosen": -2.712160587310791, "logits/rejected": -2.756772518157959, "logps/chosen": -179.55929565429688, "logps/rejected": -178.046142578125, "loss": 494.6539, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": 13.633160591125488, "rewards/margins": -1.888087272644043, "rewards/rejected": 15.521249771118164, "step": 140 }, { "epoch": 0.029440628066732092, "grad_norm": 5094.761864381693, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -2.8623595237731934, "logits/rejected": -2.7546021938323975, "logps/chosen": -219.88882446289062, "logps/rejected": -209.93106079101562, "loss": 487.1993, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 14.253957748413086, "rewards/margins": -1.1866604089736938, "rewards/rejected": 15.440618515014648, "step": 150 }, { "epoch": 0.03140333660451423, "grad_norm": 5722.953728892541, "learning_rate": 1.5686274509803921e-07, "logits/chosen": -2.8009588718414307, "logits/rejected": -2.7127015590667725, "logps/chosen": -263.4642028808594, "logps/rejected": -204.67913818359375, "loss": 488.6207, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 15.075262069702148, "rewards/margins": 1.5416971445083618, "rewards/rejected": 13.533564567565918, "step": 160 }, { "epoch": 0.033366045142296366, "grad_norm": 5406.103665715419, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.688260555267334, "logits/rejected": -2.66151762008667, "logps/chosen": -230.9227752685547, "logps/rejected": -188.932861328125, "loss": 486.579, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 17.17111587524414, "rewards/margins": -0.39257413148880005, "rewards/rejected": 17.563688278198242, "step": 170 }, { "epoch": 0.03532875368007851, "grad_norm": 3607.492970015055, "learning_rate": 1.764705882352941e-07, "logits/chosen": -2.7789673805236816, "logits/rejected": -2.679816722869873, "logps/chosen": -242.88671875, "logps/rejected": -227.7840118408203, "loss": 481.6462, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 15.954030990600586, "rewards/margins": -3.311750888824463, "rewards/rejected": 19.265785217285156, "step": 180 }, { "epoch": 0.03729146221786065, "grad_norm": 4808.623970063617, "learning_rate": 1.8627450980392158e-07, "logits/chosen": -2.6783928871154785, "logits/rejected": -2.6424717903137207, "logps/chosen": -300.4185791015625, "logps/rejected": -239.74569702148438, "loss": 496.5815, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 17.760433197021484, "rewards/margins": 0.7568072080612183, "rewards/rejected": 17.003625869750977, "step": 190 }, { "epoch": 0.03925417075564279, "grad_norm": 8590.339480711116, "learning_rate": 1.96078431372549e-07, "logits/chosen": -2.8148932456970215, "logits/rejected": -2.617837429046631, "logps/chosen": -262.127197265625, "logps/rejected": -177.19729614257812, "loss": 528.763, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 24.703397750854492, "rewards/margins": 5.610960960388184, "rewards/rejected": 19.092435836791992, "step": 200 }, { "epoch": 0.04121687929342493, "grad_norm": 4772.234179976811, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -2.571575164794922, "logits/rejected": -2.488711357116699, "logps/chosen": -224.4163055419922, "logps/rejected": -219.7928466796875, "loss": 491.4365, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 17.970706939697266, "rewards/margins": 2.292705535888672, "rewards/rejected": 15.678001403808594, "step": 210 }, { "epoch": 0.04317958783120707, "grad_norm": 4457.756320569655, "learning_rate": 2.156862745098039e-07, "logits/chosen": -2.77001953125, "logits/rejected": -2.7243027687072754, "logps/chosen": -270.52069091796875, "logps/rejected": -267.315185546875, "loss": 548.6979, "rewards/accuracies": 0.5, "rewards/chosen": 17.678638458251953, "rewards/margins": -0.4655752182006836, "rewards/rejected": 18.14421272277832, "step": 220 }, { "epoch": 0.045142296368989206, "grad_norm": 4436.791958749799, "learning_rate": 2.2549019607843137e-07, "logits/chosen": -2.8024957180023193, "logits/rejected": -2.660709857940674, "logps/chosen": -246.68179321289062, "logps/rejected": -183.47801208496094, "loss": 458.2289, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 20.225656509399414, "rewards/margins": 1.1780885457992554, "rewards/rejected": 19.04756736755371, "step": 230 }, { "epoch": 0.047105004906771344, "grad_norm": 4851.214394709931, "learning_rate": 2.352941176470588e-07, "logits/chosen": -2.688019037246704, "logits/rejected": -2.6779627799987793, "logps/chosen": -230.35806274414062, "logps/rejected": -223.46469116210938, "loss": 490.6466, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 19.589611053466797, "rewards/margins": -4.361724853515625, "rewards/rejected": 23.95133399963379, "step": 240 }, { "epoch": 0.04906771344455348, "grad_norm": 4924.226168440959, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -2.6655263900756836, "logits/rejected": -2.59631609916687, "logps/chosen": -213.60543823242188, "logps/rejected": -182.390625, "loss": 458.9436, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 25.82254409790039, "rewards/margins": 5.193560600280762, "rewards/rejected": 20.628982543945312, "step": 250 }, { "epoch": 0.05103042198233562, "grad_norm": 5042.917008262602, "learning_rate": 2.549019607843137e-07, "logits/chosen": -2.750251293182373, "logits/rejected": -2.7108583450317383, "logps/chosen": -296.44635009765625, "logps/rejected": -226.4813690185547, "loss": 488.1532, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 21.304428100585938, "rewards/margins": -1.4539859294891357, "rewards/rejected": 22.7584171295166, "step": 260 }, { "epoch": 0.05299313052011776, "grad_norm": 4890.209437898859, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -2.6184678077697754, "logits/rejected": -2.6272358894348145, "logps/chosen": -205.027587890625, "logps/rejected": -197.83663940429688, "loss": 401.3296, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 22.567365646362305, "rewards/margins": -2.5471272468566895, "rewards/rejected": 25.114490509033203, "step": 270 }, { "epoch": 0.0549558390578999, "grad_norm": 4974.045072015982, "learning_rate": 2.7450980392156863e-07, "logits/chosen": -2.6787219047546387, "logits/rejected": -2.6040050983428955, "logps/chosen": -225.64871215820312, "logps/rejected": -180.16539001464844, "loss": 471.3332, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 19.779897689819336, "rewards/margins": -1.9217135906219482, "rewards/rejected": 21.70161247253418, "step": 280 }, { "epoch": 0.05691854759568204, "grad_norm": 4075.0039747753326, "learning_rate": 2.8431372549019607e-07, "logits/chosen": -2.771533966064453, "logits/rejected": -2.6332433223724365, "logps/chosen": -279.80279541015625, "logps/rejected": -208.13052368164062, "loss": 477.8637, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 24.917438507080078, "rewards/margins": 1.8436920642852783, "rewards/rejected": 23.073745727539062, "step": 290 }, { "epoch": 0.058881256133464184, "grad_norm": 4301.819357666736, "learning_rate": 2.941176470588235e-07, "logits/chosen": -2.626574754714966, "logits/rejected": -2.628239154815674, "logps/chosen": -259.4626770019531, "logps/rejected": -275.0853576660156, "loss": 458.7708, "rewards/accuracies": 0.5, "rewards/chosen": 23.829364776611328, "rewards/margins": -1.2697381973266602, "rewards/rejected": 25.099105834960938, "step": 300 }, { "epoch": 0.06084396467124632, "grad_norm": 4704.80103510706, "learning_rate": 3.0392156862745094e-07, "logits/chosen": -2.675342082977295, "logits/rejected": -2.5852386951446533, "logps/chosen": -270.854248046875, "logps/rejected": -212.65805053710938, "loss": 508.6296, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 31.20000648498535, "rewards/margins": 7.389936923980713, "rewards/rejected": 23.81006622314453, "step": 310 }, { "epoch": 0.06280667320902845, "grad_norm": 4797.39266587772, "learning_rate": 3.1372549019607843e-07, "logits/chosen": -2.653855085372925, "logits/rejected": -2.6646554470062256, "logps/chosen": -182.79727172851562, "logps/rejected": -204.63052368164062, "loss": 443.3549, "rewards/accuracies": 0.533333420753479, "rewards/chosen": 20.889680862426758, "rewards/margins": 0.9416363835334778, "rewards/rejected": 19.948043823242188, "step": 320 }, { "epoch": 0.0647693817468106, "grad_norm": 3846.90384024348, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -2.6689724922180176, "logits/rejected": -2.618972063064575, "logps/chosen": -251.56640625, "logps/rejected": -183.63084411621094, "loss": 397.1196, "rewards/accuracies": 0.533333420753479, "rewards/chosen": 24.467573165893555, "rewards/margins": -5.52662467956543, "rewards/rejected": 29.994197845458984, "step": 330 }, { "epoch": 0.06673209028459273, "grad_norm": 5086.669533999424, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.7471423149108887, "logits/rejected": -2.619368076324463, "logps/chosen": -320.1322326660156, "logps/rejected": -214.53182983398438, "loss": 511.8433, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 30.095443725585938, "rewards/margins": 3.8112258911132812, "rewards/rejected": 26.284221649169922, "step": 340 }, { "epoch": 0.06869479882237488, "grad_norm": 4135.450239206523, "learning_rate": 3.431372549019608e-07, "logits/chosen": -2.6878678798675537, "logits/rejected": -2.6312239170074463, "logps/chosen": -170.13888549804688, "logps/rejected": -156.8441619873047, "loss": 457.6831, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 20.482637405395508, "rewards/margins": 0.018645573407411575, "rewards/rejected": 20.463991165161133, "step": 350 }, { "epoch": 0.07065750736015702, "grad_norm": 4955.612877440178, "learning_rate": 3.529411764705882e-07, "logits/chosen": -2.7627711296081543, "logits/rejected": -2.596731662750244, "logps/chosen": -323.8910217285156, "logps/rejected": -228.11819458007812, "loss": 513.3576, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 26.47014808654785, "rewards/margins": -1.49628746509552, "rewards/rejected": 27.966434478759766, "step": 360 }, { "epoch": 0.07262021589793916, "grad_norm": 5191.260127136411, "learning_rate": 3.6274509803921566e-07, "logits/chosen": -2.642749547958374, "logits/rejected": -2.540039539337158, "logps/chosen": -246.0485076904297, "logps/rejected": -229.6261749267578, "loss": 503.3668, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 27.6164608001709, "rewards/margins": -5.55600643157959, "rewards/rejected": 33.172462463378906, "step": 370 }, { "epoch": 0.0745829244357213, "grad_norm": 4248.3126174245535, "learning_rate": 3.7254901960784315e-07, "logits/chosen": -2.4983344078063965, "logits/rejected": -2.6608455181121826, "logps/chosen": -216.2776641845703, "logps/rejected": -261.11383056640625, "loss": 463.8114, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 23.812610626220703, "rewards/margins": -7.91140079498291, "rewards/rejected": 31.724010467529297, "step": 380 }, { "epoch": 0.07654563297350343, "grad_norm": 4560.638704744429, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -2.541658401489258, "logits/rejected": -2.384592056274414, "logps/chosen": -243.137451171875, "logps/rejected": -242.7369842529297, "loss": 505.7781, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 23.56881332397461, "rewards/margins": -12.386117935180664, "rewards/rejected": 35.95492935180664, "step": 390 }, { "epoch": 0.07850834151128558, "grad_norm": 5066.806916895327, "learning_rate": 3.92156862745098e-07, "logits/chosen": -2.685091495513916, "logits/rejected": -2.5697619915008545, "logps/chosen": -234.70156860351562, "logps/rejected": -246.5504913330078, "loss": 503.3179, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 29.678075790405273, "rewards/margins": 0.47646698355674744, "rewards/rejected": 29.201608657836914, "step": 400 }, { "epoch": 0.08047105004906771, "grad_norm": 5227.403614179651, "learning_rate": 4.019607843137255e-07, "logits/chosen": -2.6153388023376465, "logits/rejected": -2.6428751945495605, "logps/chosen": -271.0966491699219, "logps/rejected": -225.4984588623047, "loss": 464.8479, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 30.264745712280273, "rewards/margins": 2.1848247051239014, "rewards/rejected": 28.079919815063477, "step": 410 }, { "epoch": 0.08243375858684986, "grad_norm": 4868.874765478459, "learning_rate": 4.117647058823529e-07, "logits/chosen": -2.6614885330200195, "logits/rejected": -2.6459619998931885, "logps/chosen": -231.1891632080078, "logps/rejected": -233.40103149414062, "loss": 489.6508, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 29.440465927124023, "rewards/margins": -13.364405632019043, "rewards/rejected": 42.80486297607422, "step": 420 }, { "epoch": 0.08439646712463199, "grad_norm": 6272.321633430104, "learning_rate": 4.215686274509804e-07, "logits/chosen": -2.8335399627685547, "logits/rejected": -2.644537925720215, "logps/chosen": -331.88970947265625, "logps/rejected": -218.96865844726562, "loss": 438.3674, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 32.342071533203125, "rewards/margins": 11.023529052734375, "rewards/rejected": 21.31854248046875, "step": 430 }, { "epoch": 0.08635917566241413, "grad_norm": 3468.7372394369486, "learning_rate": 4.313725490196078e-07, "logits/chosen": -2.6140809059143066, "logits/rejected": -2.529378890991211, "logps/chosen": -263.18231201171875, "logps/rejected": -210.88919067382812, "loss": 469.5156, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 36.2723274230957, "rewards/margins": 4.149384498596191, "rewards/rejected": 32.12294006347656, "step": 440 }, { "epoch": 0.08832188420019627, "grad_norm": 4602.551772130593, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -2.7470650672912598, "logits/rejected": -2.8349151611328125, "logps/chosen": -255.5126190185547, "logps/rejected": -249.41067504882812, "loss": 470.7945, "rewards/accuracies": 0.3333333134651184, "rewards/chosen": 25.42453384399414, "rewards/margins": -14.558160781860352, "rewards/rejected": 39.982696533203125, "step": 450 }, { "epoch": 0.09028459273797841, "grad_norm": 4894.576240603985, "learning_rate": 4.5098039215686274e-07, "logits/chosen": -2.554935932159424, "logits/rejected": -2.4666876792907715, "logps/chosen": -220.52279663085938, "logps/rejected": -231.3734130859375, "loss": 472.8678, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 28.813745498657227, "rewards/margins": 2.0909621715545654, "rewards/rejected": 26.7227840423584, "step": 460 }, { "epoch": 0.09224730127576054, "grad_norm": 4738.262801513381, "learning_rate": 4.6078431372549013e-07, "logits/chosen": -2.427926540374756, "logits/rejected": -2.3819785118103027, "logps/chosen": -237.2082061767578, "logps/rejected": -274.40142822265625, "loss": 489.6224, "rewards/accuracies": 0.36666664481163025, "rewards/chosen": 26.397693634033203, "rewards/margins": -35.425819396972656, "rewards/rejected": 61.823509216308594, "step": 470 }, { "epoch": 0.09421000981354269, "grad_norm": 4975.613495131456, "learning_rate": 4.705882352941176e-07, "logits/chosen": -2.7603957653045654, "logits/rejected": -2.664858341217041, "logps/chosen": -287.56280517578125, "logps/rejected": -292.58740234375, "loss": 515.9132, "rewards/accuracies": 0.5, "rewards/chosen": 32.57997512817383, "rewards/margins": 2.327850580215454, "rewards/rejected": 30.252126693725586, "step": 480 }, { "epoch": 0.09617271835132483, "grad_norm": 2980.3710795452093, "learning_rate": 4.803921568627451e-07, "logits/chosen": -2.433875560760498, "logits/rejected": -2.4270520210266113, "logps/chosen": -269.6871337890625, "logps/rejected": -286.18511962890625, "loss": 490.2408, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 24.98427963256836, "rewards/margins": -10.737724304199219, "rewards/rejected": 35.72200393676758, "step": 490 }, { "epoch": 0.09813542688910697, "grad_norm": 4577.218197831634, "learning_rate": 4.901960784313725e-07, "logits/chosen": -2.6620137691497803, "logits/rejected": -2.624298334121704, "logps/chosen": -268.84979248046875, "logps/rejected": -228.1798553466797, "loss": 428.5641, "rewards/accuracies": 0.5, "rewards/chosen": 30.748973846435547, "rewards/margins": 8.2810697555542, "rewards/rejected": 22.46790313720703, "step": 500 }, { "epoch": 0.10009813542688911, "grad_norm": 2974.4950215344847, "learning_rate": 5e-07, "logits/chosen": -2.559434175491333, "logits/rejected": -2.4876298904418945, "logps/chosen": -249.40090942382812, "logps/rejected": -241.1505126953125, "loss": 448.1437, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 38.660858154296875, "rewards/margins": 7.0601806640625, "rewards/rejected": 31.60067367553711, "step": 510 }, { "epoch": 0.10206084396467124, "grad_norm": 4350.403609600577, "learning_rate": 4.999941314693213e-07, "logits/chosen": -2.574789524078369, "logits/rejected": -2.550934314727783, "logps/chosen": -218.6322784423828, "logps/rejected": -173.77525329589844, "loss": 460.1616, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 31.011077880859375, "rewards/margins": -1.9973556995391846, "rewards/rejected": 33.00843811035156, "step": 520 }, { "epoch": 0.10402355250245339, "grad_norm": 5025.346211421526, "learning_rate": 4.999765261528027e-07, "logits/chosen": -2.609151601791382, "logits/rejected": -2.6930503845214844, "logps/chosen": -262.9989013671875, "logps/rejected": -280.50274658203125, "loss": 485.6344, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 26.35666847229004, "rewards/margins": -9.358552932739258, "rewards/rejected": 35.71521759033203, "step": 530 }, { "epoch": 0.10598626104023552, "grad_norm": 4795.300896477626, "learning_rate": 4.999471848769828e-07, "logits/chosen": -2.504603385925293, "logits/rejected": -2.509408950805664, "logps/chosen": -260.916259765625, "logps/rejected": -287.2959899902344, "loss": 465.1678, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 29.1888427734375, "rewards/margins": 2.88441801071167, "rewards/rejected": 26.304424285888672, "step": 540 }, { "epoch": 0.10794896957801767, "grad_norm": 5291.830653761251, "learning_rate": 4.999061090193831e-07, "logits/chosen": -2.7169575691223145, "logits/rejected": -2.574615001678467, "logps/chosen": -286.2722473144531, "logps/rejected": -274.84759521484375, "loss": 531.3509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 36.53556442260742, "rewards/margins": 9.655365943908691, "rewards/rejected": 26.880207061767578, "step": 550 }, { "epoch": 0.1099116781157998, "grad_norm": 4294.277561666831, "learning_rate": 4.998533005084428e-07, "logits/chosen": -2.6560511589050293, "logits/rejected": -2.673675298690796, "logps/chosen": -254.6729278564453, "logps/rejected": -222.37979125976562, "loss": 424.8929, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 26.852008819580078, "rewards/margins": -2.5513346195220947, "rewards/rejected": 29.40334129333496, "step": 560 }, { "epoch": 0.11187438665358194, "grad_norm": 4013.8424370724483, "learning_rate": 4.997887618234292e-07, "logits/chosen": -2.606541872024536, "logits/rejected": -2.682185649871826, "logps/chosen": -250.31546020507812, "logps/rejected": -274.77874755859375, "loss": 506.972, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 31.33298683166504, "rewards/margins": -2.6476120948791504, "rewards/rejected": 33.9806022644043, "step": 570 }, { "epoch": 0.11383709519136408, "grad_norm": 3840.5056559095287, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.7121341228485107, "logits/rejected": -2.5332818031311035, "logps/chosen": -215.18496704101562, "logps/rejected": -189.94198608398438, "loss": 431.4372, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 28.723220825195312, "rewards/margins": 5.397341251373291, "rewards/rejected": 23.325878143310547, "step": 580 }, { "epoch": 0.11579980372914622, "grad_norm": 3941.421650843535, "learning_rate": 4.996245066016623e-07, "logits/chosen": -2.670100688934326, "logits/rejected": -2.584319591522217, "logps/chosen": -223.4555206298828, "logps/rejected": -204.05929565429688, "loss": 347.5933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 31.671985626220703, "rewards/margins": 4.035216331481934, "rewards/rejected": 27.636768341064453, "step": 590 }, { "epoch": 0.11776251226692837, "grad_norm": 5609.191364022628, "learning_rate": 4.995247977764035e-07, "logits/chosen": -2.506758451461792, "logits/rejected": -2.5981059074401855, "logps/chosen": -204.51402282714844, "logps/rejected": -173.1730194091797, "loss": 510.3478, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 30.046728134155273, "rewards/margins": -6.598879337310791, "rewards/rejected": 36.645606994628906, "step": 600 }, { "epoch": 0.1197252208047105, "grad_norm": 5009.758427999647, "learning_rate": 4.994133741996982e-07, "logits/chosen": -2.65734601020813, "logits/rejected": -2.6545443534851074, "logps/chosen": -258.2228088378906, "logps/rejected": -210.0067901611328, "loss": 484.3109, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 23.592992782592773, "rewards/margins": -5.538031578063965, "rewards/rejected": 29.131023406982422, "step": 610 }, { "epoch": 0.12168792934249265, "grad_norm": 5363.664640832944, "learning_rate": 4.992902411026877e-07, "logits/chosen": -2.609527111053467, "logits/rejected": -2.605999708175659, "logps/chosen": -238.1632843017578, "logps/rejected": -315.69329833984375, "loss": 463.5376, "rewards/accuracies": 0.5, "rewards/chosen": 27.6790828704834, "rewards/margins": 0.5688053369522095, "rewards/rejected": 27.110280990600586, "step": 620 }, { "epoch": 0.12365063788027478, "grad_norm": 4311.589890986141, "learning_rate": 4.991554042662548e-07, "logits/chosen": -2.5265004634857178, "logits/rejected": -2.5692458152770996, "logps/chosen": -211.8911590576172, "logps/rejected": -210.7008056640625, "loss": 440.8429, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 37.06153106689453, "rewards/margins": 7.217554569244385, "rewards/rejected": 29.84397315979004, "step": 630 }, { "epoch": 0.1256133464180569, "grad_norm": 4609.647394782643, "learning_rate": 4.990088700207525e-07, "logits/chosen": -2.5927557945251465, "logits/rejected": -2.6587564945220947, "logps/chosen": -180.91180419921875, "logps/rejected": -201.13131713867188, "loss": 410.4563, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 28.912639617919922, "rewards/margins": -10.199140548706055, "rewards/rejected": 39.111778259277344, "step": 640 }, { "epoch": 0.12757605495583907, "grad_norm": 4119.162413223579, "learning_rate": 4.988506452457066e-07, "logits/chosen": -2.570932388305664, "logits/rejected": -2.645514488220215, "logps/chosen": -258.30291748046875, "logps/rejected": -267.41119384765625, "loss": 445.8911, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 38.332210540771484, "rewards/margins": 9.490628242492676, "rewards/rejected": 28.841583251953125, "step": 650 }, { "epoch": 0.1295387634936212, "grad_norm": 4789.877275203537, "learning_rate": 4.986807373694925e-07, "logits/chosen": -2.6149802207946777, "logits/rejected": -2.5973472595214844, "logps/chosen": -228.85330200195312, "logps/rejected": -235.3708038330078, "loss": 459.3869, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 35.13557815551758, "rewards/margins": 2.73466157913208, "rewards/rejected": 32.40091323852539, "step": 660 }, { "epoch": 0.13150147203140333, "grad_norm": 4459.812327482822, "learning_rate": 4.984991543689869e-07, "logits/chosen": -2.660524368286133, "logits/rejected": -2.5925040245056152, "logps/chosen": -240.50296020507812, "logps/rejected": -257.6099548339844, "loss": 495.4315, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 42.89487838745117, "rewards/margins": 19.89583969116211, "rewards/rejected": 22.999040603637695, "step": 670 }, { "epoch": 0.13346418056918546, "grad_norm": 4273.897641113578, "learning_rate": 4.983059047691931e-07, "logits/chosen": -2.6355478763580322, "logits/rejected": -2.5462124347686768, "logps/chosen": -227.7132568359375, "logps/rejected": -188.1220703125, "loss": 448.9462, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 30.760122299194336, "rewards/margins": 4.663891792297363, "rewards/rejected": 26.096227645874023, "step": 680 }, { "epoch": 0.13542688910696762, "grad_norm": 4628.194176183738, "learning_rate": 4.981009976428408e-07, "logits/chosen": -2.48315691947937, "logits/rejected": -2.4086811542510986, "logps/chosen": -286.48016357421875, "logps/rejected": -238.0033721923828, "loss": 459.0327, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 24.821685791015625, "rewards/margins": -3.5525615215301514, "rewards/rejected": 28.374248504638672, "step": 690 }, { "epoch": 0.13738959764474976, "grad_norm": 5417.6256406016655, "learning_rate": 4.9788444260996e-07, "logits/chosen": -2.6074776649475098, "logits/rejected": -2.6096506118774414, "logps/chosen": -235.46939086914062, "logps/rejected": -211.37625122070312, "loss": 433.4594, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 28.747339248657227, "rewards/margins": -17.744359970092773, "rewards/rejected": 46.49169921875, "step": 700 }, { "epoch": 0.1393523061825319, "grad_norm": 5082.956665278541, "learning_rate": 4.976562498374295e-07, "logits/chosen": -2.6656737327575684, "logits/rejected": -2.6231331825256348, "logps/chosen": -257.89898681640625, "logps/rejected": -230.61776733398438, "loss": 477.1858, "rewards/accuracies": 0.5, "rewards/chosen": 31.849462509155273, "rewards/margins": 3.5808677673339844, "rewards/rejected": 28.268596649169922, "step": 710 }, { "epoch": 0.14131501472031405, "grad_norm": 4683.782278834829, "learning_rate": 4.974164300384997e-07, "logits/chosen": -2.613971471786499, "logits/rejected": -2.663684368133545, "logps/chosen": -202.5010986328125, "logps/rejected": -259.63616943359375, "loss": 448.8101, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 28.70241355895996, "rewards/margins": -5.085217475891113, "rewards/rejected": 33.787635803222656, "step": 720 }, { "epoch": 0.14327772325809618, "grad_norm": 4107.436513649201, "learning_rate": 4.971649944722893e-07, "logits/chosen": -2.601701498031616, "logits/rejected": -2.654048204421997, "logps/chosen": -228.1996612548828, "logps/rejected": -249.9207763671875, "loss": 463.9367, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 24.239540100097656, "rewards/margins": -9.543767929077148, "rewards/rejected": 33.78330993652344, "step": 730 }, { "epoch": 0.1452404317958783, "grad_norm": 3839.3323324164153, "learning_rate": 4.96901954943257e-07, "logits/chosen": -2.5906546115875244, "logits/rejected": -2.418152332305908, "logps/chosen": -228.8038330078125, "logps/rejected": -142.23724365234375, "loss": 447.7127, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 36.81355285644531, "rewards/margins": 12.445768356323242, "rewards/rejected": 24.367786407470703, "step": 740 }, { "epoch": 0.14720314033366044, "grad_norm": 4176.412649636955, "learning_rate": 4.96627323800647e-07, "logits/chosen": -2.565721035003662, "logits/rejected": -2.5852303504943848, "logps/chosen": -196.6484375, "logps/rejected": -216.9084930419922, "loss": 453.8092, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 41.040504455566406, "rewards/margins": 13.594090461730957, "rewards/rejected": 27.446414947509766, "step": 750 }, { "epoch": 0.1491658488714426, "grad_norm": 3941.2012801097057, "learning_rate": 4.963411139379099e-07, "logits/chosen": -2.6596925258636475, "logits/rejected": -2.585742950439453, "logps/chosen": -259.2139587402344, "logps/rejected": -231.1278839111328, "loss": 470.8556, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 31.395431518554688, "rewards/margins": 1.9789135456085205, "rewards/rejected": 29.416519165039062, "step": 760 }, { "epoch": 0.15112855740922473, "grad_norm": 4955.044390669766, "learning_rate": 4.960433387920964e-07, "logits/chosen": -2.5374937057495117, "logits/rejected": -2.5497844219207764, "logps/chosen": -145.56930541992188, "logps/rejected": -266.82159423828125, "loss": 446.4508, "rewards/accuracies": 0.43333330750465393, "rewards/chosen": 24.17589569091797, "rewards/margins": -1.778869390487671, "rewards/rejected": 25.95476722717285, "step": 770 }, { "epoch": 0.15309126594700687, "grad_norm": 7089.744405843409, "learning_rate": 4.957340123432271e-07, "logits/chosen": -2.5326530933380127, "logits/rejected": -2.404533863067627, "logps/chosen": -292.24658203125, "logps/rejected": -204.31930541992188, "loss": 478.8755, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 40.32138442993164, "rewards/margins": 12.539506912231445, "rewards/rejected": 27.781875610351562, "step": 780 }, { "epoch": 0.155053974484789, "grad_norm": 4122.114584173073, "learning_rate": 4.954131491136361e-07, "logits/chosen": -2.564535617828369, "logits/rejected": -2.4996018409729004, "logps/chosen": -287.51141357421875, "logps/rejected": -249.18173217773438, "loss": 503.4248, "rewards/accuracies": 0.5, "rewards/chosen": 37.882667541503906, "rewards/margins": -1.5718473196029663, "rewards/rejected": 39.45451736450195, "step": 790 }, { "epoch": 0.15701668302257116, "grad_norm": 4788.833682169245, "learning_rate": 4.95080764167289e-07, "logits/chosen": -2.548109531402588, "logits/rejected": -2.588792562484741, "logps/chosen": -217.90963745117188, "logps/rejected": -243.854248046875, "loss": 491.3777, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 32.40715789794922, "rewards/margins": 2.9248015880584717, "rewards/rejected": 29.482357025146484, "step": 800 }, { "epoch": 0.1589793915603533, "grad_norm": 4268.850779393387, "learning_rate": 4.94736873109076e-07, "logits/chosen": -2.6233417987823486, "logits/rejected": -2.61574125289917, "logps/chosen": -220.2720184326172, "logps/rejected": -201.5345001220703, "loss": 470.0953, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 32.03792953491211, "rewards/margins": 1.7439085245132446, "rewards/rejected": 30.294021606445312, "step": 810 }, { "epoch": 0.16094210009813542, "grad_norm": 5328.822800569311, "learning_rate": 4.943814920840787e-07, "logits/chosen": -2.4270339012145996, "logits/rejected": -2.389511823654175, "logps/chosen": -241.12448120117188, "logps/rejected": -219.40316772460938, "loss": 431.8633, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 37.06961441040039, "rewards/margins": -9.050786972045898, "rewards/rejected": 46.12040328979492, "step": 820 }, { "epoch": 0.16290480863591755, "grad_norm": 4376.334525428355, "learning_rate": 4.940146377768126e-07, "logits/chosen": -2.5405285358428955, "logits/rejected": -2.48429536819458, "logps/chosen": -232.83358764648438, "logps/rejected": -197.3223419189453, "loss": 413.9131, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 54.56694412231445, "rewards/margins": 22.161144256591797, "rewards/rejected": 32.40580368041992, "step": 830 }, { "epoch": 0.1648675171736997, "grad_norm": 4537.304696118614, "learning_rate": 4.936363274104441e-07, "logits/chosen": -2.5849764347076416, "logits/rejected": -2.5415334701538086, "logps/chosen": -254.42129516601562, "logps/rejected": -192.87167358398438, "loss": 405.1104, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 31.997446060180664, "rewards/margins": -1.940429925918579, "rewards/rejected": 33.9378776550293, "step": 840 }, { "epoch": 0.16683022571148184, "grad_norm": 3909.8911410130995, "learning_rate": 4.932465787459808e-07, "logits/chosen": -2.677730083465576, "logits/rejected": -2.5695979595184326, "logps/chosen": -232.7654266357422, "logps/rejected": -210.0175323486328, "loss": 442.1903, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 33.31116485595703, "rewards/margins": -9.833333969116211, "rewards/rejected": 43.14450454711914, "step": 850 }, { "epoch": 0.16879293424926398, "grad_norm": 4668.004498413699, "learning_rate": 4.92845410081439e-07, "logits/chosen": -2.4772255420684814, "logits/rejected": -2.484923839569092, "logps/chosen": -227.3833465576172, "logps/rejected": -249.98782348632812, "loss": 430.9701, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 31.499420166015625, "rewards/margins": -2.1714835166931152, "rewards/rejected": 33.67090606689453, "step": 860 }, { "epoch": 0.17075564278704614, "grad_norm": 4657.11755860737, "learning_rate": 4.924328402509833e-07, "logits/chosen": -2.5510215759277344, "logits/rejected": -2.518038272857666, "logps/chosen": -243.21200561523438, "logps/rejected": -201.63858032226562, "loss": 473.4434, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 30.57634925842285, "rewards/margins": -4.8953399658203125, "rewards/rejected": 35.4716911315918, "step": 870 }, { "epoch": 0.17271835132482827, "grad_norm": 4840.204230588623, "learning_rate": 4.920088886240434e-07, "logits/chosen": -2.4730472564697266, "logits/rejected": -2.36600923538208, "logps/chosen": -251.2110595703125, "logps/rejected": -236.5413055419922, "loss": 422.1851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 33.34375, "rewards/margins": 10.204463005065918, "rewards/rejected": 23.139286041259766, "step": 880 }, { "epoch": 0.1746810598626104, "grad_norm": 4157.926315887367, "learning_rate": 4.915735751044045e-07, "logits/chosen": -2.721060276031494, "logits/rejected": -2.609971284866333, "logps/chosen": -245.4339141845703, "logps/rejected": -206.713134765625, "loss": 466.6165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 42.93861389160156, "rewards/margins": 3.1537113189697266, "rewards/rejected": 39.7849006652832, "step": 890 }, { "epoch": 0.17664376840039253, "grad_norm": 5288.463900214572, "learning_rate": 4.911269201292724e-07, "logits/chosen": -2.6828205585479736, "logits/rejected": -2.6067633628845215, "logps/chosen": -258.6050720214844, "logps/rejected": -215.49777221679688, "loss": 475.2619, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 35.75965881347656, "rewards/margins": -6.9160919189453125, "rewards/rejected": 42.675758361816406, "step": 900 }, { "epoch": 0.1786064769381747, "grad_norm": 4588.2274681178, "learning_rate": 4.906689446683146e-07, "logits/chosen": -2.6247589588165283, "logits/rejected": -2.706444501876831, "logps/chosen": -205.7058563232422, "logps/rejected": -279.2490234375, "loss": 483.4424, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 31.428142547607422, "rewards/margins": -6.55536413192749, "rewards/rejected": 37.9835090637207, "step": 910 }, { "epoch": 0.18056918547595682, "grad_norm": 4709.655035818351, "learning_rate": 4.901996702226755e-07, "logits/chosen": -2.504795789718628, "logits/rejected": -2.537221908569336, "logps/chosen": -267.9532165527344, "logps/rejected": -299.85638427734375, "loss": 460.9857, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 32.67220687866211, "rewards/margins": -5.247345924377441, "rewards/rejected": 37.91954803466797, "step": 920 }, { "epoch": 0.18253189401373895, "grad_norm": 4431.199882462497, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.809670925140381, "logits/rejected": -2.5074360370635986, "logps/chosen": -285.9214172363281, "logps/rejected": -185.4030303955078, "loss": 437.1225, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 38.11512756347656, "rewards/margins": 12.34187126159668, "rewards/rejected": 25.773258209228516, "step": 930 }, { "epoch": 0.1844946025515211, "grad_norm": 5356.0935582822785, "learning_rate": 4.892273130332334e-07, "logits/chosen": -2.711378574371338, "logits/rejected": -2.675032138824463, "logps/chosen": -285.53460693359375, "logps/rejected": -320.78167724609375, "loss": 473.1405, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 42.49833297729492, "rewards/margins": 3.7150447368621826, "rewards/rejected": 38.783287048339844, "step": 940 }, { "epoch": 0.18645731108930325, "grad_norm": 4591.265894345519, "learning_rate": 4.887242759398945e-07, "logits/chosen": -2.5174307823181152, "logits/rejected": -2.3968613147735596, "logps/chosen": -164.76437377929688, "logps/rejected": -159.45059204101562, "loss": 422.0927, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 32.820526123046875, "rewards/margins": -1.7133598327636719, "rewards/rejected": 34.53388977050781, "step": 950 }, { "epoch": 0.18842001962708538, "grad_norm": 4501.703024071969, "learning_rate": 4.88210031160659e-07, "logits/chosen": -2.5425117015838623, "logits/rejected": -2.5814175605773926, "logps/chosen": -235.32461547851562, "logps/rejected": -220.5, "loss": 481.1401, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 31.44137954711914, "rewards/margins": -0.7482231855392456, "rewards/rejected": 32.189605712890625, "step": 960 }, { "epoch": 0.1903827281648675, "grad_norm": 3761.653691869692, "learning_rate": 4.876846028384169e-07, "logits/chosen": -2.6352555751800537, "logits/rejected": -2.5650863647460938, "logps/chosen": -176.44219970703125, "logps/rejected": -205.5272674560547, "loss": 379.0157, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 27.441858291625977, "rewards/margins": -2.6123874187469482, "rewards/rejected": 30.054244995117188, "step": 970 }, { "epoch": 0.19234543670264967, "grad_norm": 3688.16826691894, "learning_rate": 4.87148015641106e-07, "logits/chosen": -2.642768144607544, "logits/rejected": -2.6509671211242676, "logps/chosen": -214.3375244140625, "logps/rejected": -244.3864288330078, "loss": 477.9643, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 36.71364974975586, "rewards/margins": 6.217877388000488, "rewards/rejected": 30.495773315429688, "step": 980 }, { "epoch": 0.1943081452404318, "grad_norm": 3668.1326136631747, "learning_rate": 4.866002947605539e-07, "logits/chosen": -2.5380349159240723, "logits/rejected": -2.4647417068481445, "logps/chosen": -208.49099731445312, "logps/rejected": -204.13284301757812, "loss": 395.11, "rewards/accuracies": 0.3333333134651184, "rewards/chosen": 28.401355743408203, "rewards/margins": -10.098089218139648, "rewards/rejected": 38.49944305419922, "step": 990 }, { "epoch": 0.19627085377821393, "grad_norm": 4429.484309033255, "learning_rate": 4.860414659112948e-07, "logits/chosen": -2.6225905418395996, "logits/rejected": -2.456757068634033, "logps/chosen": -207.32296752929688, "logps/rejected": -165.3435516357422, "loss": 445.1423, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 30.326162338256836, "rewards/margins": 2.9115099906921387, "rewards/rejected": 27.41465187072754, "step": 1000 }, { "epoch": 0.19823356231599606, "grad_norm": 4709.9204917731795, "learning_rate": 4.854715553293627e-07, "logits/chosen": -2.6830430030822754, "logits/rejected": -2.5512986183166504, "logps/chosen": -270.9964904785156, "logps/rejected": -176.79745483398438, "loss": 496.4829, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 43.204612731933594, "rewards/margins": 17.4560546875, "rewards/rejected": 25.74855613708496, "step": 1010 }, { "epoch": 0.20019627085377822, "grad_norm": 3895.548818863484, "learning_rate": 4.848905897710595e-07, "logits/chosen": -2.3969950675964355, "logits/rejected": -2.3870437145233154, "logps/chosen": -277.4095764160156, "logps/rejected": -171.2035369873047, "loss": 464.7826, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 33.65400314331055, "rewards/margins": -12.002626419067383, "rewards/rejected": 45.65662384033203, "step": 1020 }, { "epoch": 0.20215897939156036, "grad_norm": 5888.578415290401, "learning_rate": 4.842985965116987e-07, "logits/chosen": -2.638823986053467, "logits/rejected": -2.5890250205993652, "logps/chosen": -303.5338439941406, "logps/rejected": -239.8367919921875, "loss": 473.8742, "rewards/accuracies": 0.5, "rewards/chosen": 37.500709533691406, "rewards/margins": -4.421719551086426, "rewards/rejected": 41.92242431640625, "step": 1030 }, { "epoch": 0.2041216879293425, "grad_norm": 13266.38798683862, "learning_rate": 4.836956033443253e-07, "logits/chosen": -2.5955393314361572, "logits/rejected": -2.5155348777770996, "logps/chosen": -275.0190124511719, "logps/rejected": -253.9836883544922, "loss": 465.175, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 92.16295623779297, "rewards/margins": -35.01982879638672, "rewards/rejected": 127.18278503417969, "step": 1040 }, { "epoch": 0.20608439646712462, "grad_norm": 4932.727575414187, "learning_rate": 4.830816385784104e-07, "logits/chosen": -2.684427499771118, "logits/rejected": -2.6435837745666504, "logps/chosen": -273.91754150390625, "logps/rejected": -245.6715850830078, "loss": 466.2123, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 33.9106330871582, "rewards/margins": 0.9209011197090149, "rewards/rejected": 32.9897346496582, "step": 1050 }, { "epoch": 0.20804710500490678, "grad_norm": 4776.035725293843, "learning_rate": 4.824567310385226e-07, "logits/chosen": -2.6558475494384766, "logits/rejected": -2.6177608966827393, "logps/chosen": -286.10113525390625, "logps/rejected": -227.9519805908203, "loss": 482.4986, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 31.268722534179688, "rewards/margins": -2.8335373401641846, "rewards/rejected": 34.102256774902344, "step": 1060 }, { "epoch": 0.2100098135426889, "grad_norm": 5901.273056179599, "learning_rate": 4.818209100629744e-07, "logits/chosen": -2.5953338146209717, "logits/rejected": -2.552004814147949, "logps/chosen": -203.58914184570312, "logps/rejected": -225.4922332763672, "loss": 445.1609, "rewards/accuracies": 0.5, "rewards/chosen": 45.0809440612793, "rewards/margins": -0.3565046191215515, "rewards/rejected": 45.43744659423828, "step": 1070 }, { "epoch": 0.21197252208047104, "grad_norm": 3305.566013387382, "learning_rate": 4.81174205502445e-07, "logits/chosen": -2.5669751167297363, "logits/rejected": -2.6002602577209473, "logps/chosen": -190.98147583007812, "logps/rejected": -173.95994567871094, "loss": 380.1803, "rewards/accuracies": 0.5, "rewards/chosen": 30.217853546142578, "rewards/margins": -7.351622581481934, "rewards/rejected": 37.56947326660156, "step": 1080 }, { "epoch": 0.2139352306182532, "grad_norm": 3711.398480605608, "learning_rate": 4.80516647718579e-07, "logits/chosen": -2.5809082984924316, "logits/rejected": -2.51855206489563, "logps/chosen": -204.2241973876953, "logps/rejected": -218.84872436523438, "loss": 390.2722, "rewards/accuracies": 0.5666666030883789, "rewards/chosen": 35.5487174987793, "rewards/margins": 0.2038247287273407, "rewards/rejected": 35.34489059448242, "step": 1090 }, { "epoch": 0.21589793915603533, "grad_norm": 3952.008170028738, "learning_rate": 4.798482675825602e-07, "logits/chosen": -2.66428279876709, "logits/rejected": -2.694563388824463, "logps/chosen": -179.92654418945312, "logps/rejected": -216.7838897705078, "loss": 455.5333, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 28.27036476135254, "rewards/margins": -3.186108350753784, "rewards/rejected": 31.456470489501953, "step": 1100 }, { "epoch": 0.21786064769381747, "grad_norm": 5288.6150146613545, "learning_rate": 4.791690964736636e-07, "logits/chosen": -2.5695927143096924, "logits/rejected": -2.5862300395965576, "logps/chosen": -244.2303009033203, "logps/rejected": -182.8033447265625, "loss": 417.4308, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 67.5400619506836, "rewards/margins": 35.33257293701172, "rewards/rejected": 32.207496643066406, "step": 1110 }, { "epoch": 0.2198233562315996, "grad_norm": 5347.641800993126, "learning_rate": 4.78479166277781e-07, "logits/chosen": -2.6153831481933594, "logits/rejected": -2.4904675483703613, "logps/chosen": -298.97479248046875, "logps/rejected": -240.39431762695312, "loss": 515.0782, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 32.2321891784668, "rewards/margins": -12.380395889282227, "rewards/rejected": 44.612586975097656, "step": 1120 }, { "epoch": 0.22178606476938176, "grad_norm": 5258.094474663755, "learning_rate": 4.777785093859247e-07, "logits/chosen": -2.770720958709717, "logits/rejected": -2.547207832336426, "logps/chosen": -241.500244140625, "logps/rejected": -252.56668090820312, "loss": 480.3711, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 43.165924072265625, "rewards/margins": 3.580148220062256, "rewards/rejected": 39.585777282714844, "step": 1130 }, { "epoch": 0.2237487733071639, "grad_norm": 5553.182619936151, "learning_rate": 4.770671586927063e-07, "logits/chosen": -2.7911696434020996, "logits/rejected": -2.684234142303467, "logps/chosen": -317.8401794433594, "logps/rejected": -291.2255859375, "loss": 496.6098, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 39.029273986816406, "rewards/margins": 1.6814048290252686, "rewards/rejected": 37.347869873046875, "step": 1140 }, { "epoch": 0.22571148184494602, "grad_norm": 4408.1202191004995, "learning_rate": 4.7634514759479275e-07, "logits/chosen": -2.7609219551086426, "logits/rejected": -2.6142373085021973, "logps/chosen": -247.8801727294922, "logps/rejected": -194.33895874023438, "loss": 461.1519, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 37.78066635131836, "rewards/margins": 11.475553512573242, "rewards/rejected": 26.305110931396484, "step": 1150 }, { "epoch": 0.22767419038272815, "grad_norm": 5480.72496037876, "learning_rate": 4.7561250998933835e-07, "logits/chosen": -2.5984532833099365, "logits/rejected": -2.4840378761291504, "logps/chosen": -309.9330139160156, "logps/rejected": -181.5688018798828, "loss": 490.8118, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 34.388038635253906, "rewards/margins": 1.5576369762420654, "rewards/rejected": 32.830406188964844, "step": 1160 }, { "epoch": 0.2296368989205103, "grad_norm": 4453.143374911204, "learning_rate": 4.7486928027239304e-07, "logits/chosen": -2.5082366466522217, "logits/rejected": -2.4943606853485107, "logps/chosen": -179.74754333496094, "logps/rejected": -184.5972137451172, "loss": 471.2, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 25.90127944946289, "rewards/margins": -1.3986310958862305, "rewards/rejected": 27.299907684326172, "step": 1170 }, { "epoch": 0.23159960745829244, "grad_norm": 5802.440683508613, "learning_rate": 4.7411549333728807e-07, "logits/chosen": -2.5889554023742676, "logits/rejected": -2.634286403656006, "logps/chosen": -247.23440551757812, "logps/rejected": -242.90347290039062, "loss": 495.7646, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": 31.239521026611328, "rewards/margins": -10.11313247680664, "rewards/rejected": 41.352657318115234, "step": 1180 }, { "epoch": 0.23356231599607458, "grad_norm": 4881.3901064765905, "learning_rate": 4.7335118457299756e-07, "logits/chosen": -2.605823040008545, "logits/rejected": -2.655714750289917, "logps/chosen": -257.37359619140625, "logps/rejected": -226.6381378173828, "loss": 467.4191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 30.597707748413086, "rewards/margins": -4.192519187927246, "rewards/rejected": 34.790225982666016, "step": 1190 }, { "epoch": 0.23552502453385674, "grad_norm": 4259.232291896253, "learning_rate": 4.7257638986247684e-07, "logits/chosen": -2.685166120529175, "logits/rejected": -2.6258645057678223, "logps/chosen": -263.2113342285156, "logps/rejected": -298.49688720703125, "loss": 446.8362, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 33.98692321777344, "rewards/margins": -12.568541526794434, "rewards/rejected": 46.55546188354492, "step": 1200 }, { "epoch": 0.23748773307163887, "grad_norm": 4436.6755123445055, "learning_rate": 4.7179114558097814e-07, "logits/chosen": -2.593357801437378, "logits/rejected": -2.5329232215881348, "logps/chosen": -226.9739227294922, "logps/rejected": -177.63357543945312, "loss": 467.4614, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 32.3291015625, "rewards/margins": 5.060539722442627, "rewards/rejected": 27.268564224243164, "step": 1210 }, { "epoch": 0.239450441609421, "grad_norm": 4013.2887284238113, "learning_rate": 4.709954885943428e-07, "logits/chosen": -2.6918628215789795, "logits/rejected": -2.6806817054748535, "logps/chosen": -240.38052368164062, "logps/rejected": -189.43763732910156, "loss": 403.3918, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 32.36589431762695, "rewards/margins": 4.384322166442871, "rewards/rejected": 27.9815731048584, "step": 1220 }, { "epoch": 0.24141315014720313, "grad_norm": 4930.575310666661, "learning_rate": 4.7018945625727026e-07, "logits/chosen": -2.7886033058166504, "logits/rejected": -2.706328868865967, "logps/chosen": -245.9519500732422, "logps/rejected": -245.0874481201172, "loss": 488.3562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 31.739898681640625, "rewards/margins": 7.568617343902588, "rewards/rejected": 24.171281814575195, "step": 1230 }, { "epoch": 0.2433758586849853, "grad_norm": 3942.7295284984402, "learning_rate": 4.6937308641156447e-07, "logits/chosen": -2.605045795440674, "logits/rejected": -2.567878007888794, "logps/chosen": -178.3212432861328, "logps/rejected": -180.12820434570312, "loss": 406.4432, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 31.777252197265625, "rewards/margins": -0.4026016294956207, "rewards/rejected": 32.17985916137695, "step": 1240 }, { "epoch": 0.24533856722276742, "grad_norm": 4262.859818723618, "learning_rate": 4.685464173843574e-07, "logits/chosen": -2.6398653984069824, "logits/rejected": -2.648780345916748, "logps/chosen": -197.7738037109375, "logps/rejected": -225.6578826904297, "loss": 441.8806, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 36.21776580810547, "rewards/margins": 6.563495635986328, "rewards/rejected": 29.654272079467773, "step": 1250 }, { "epoch": 0.24730127576054955, "grad_norm": 5397.937316901458, "learning_rate": 4.677094879863093e-07, "logits/chosen": -2.6843018531799316, "logits/rejected": -2.5696897506713867, "logps/chosen": -220.08462524414062, "logps/rejected": -189.0897216796875, "loss": 482.1675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 28.3251895904541, "rewards/margins": 0.6852840185165405, "rewards/rejected": 27.639904022216797, "step": 1260 }, { "epoch": 0.2492639842983317, "grad_norm": 5100.629009033718, "learning_rate": 4.66862337509787e-07, "logits/chosen": -2.6803290843963623, "logits/rejected": -2.6703336238861084, "logps/chosen": -281.07177734375, "logps/rejected": -186.40174865722656, "loss": 521.8964, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 39.752471923828125, "rewards/margins": 5.96087646484375, "rewards/rejected": 33.79159164428711, "step": 1270 }, { "epoch": 0.2512266928361138, "grad_norm": 4595.979870084018, "learning_rate": 4.660050057270191e-07, "logits/chosen": -2.6391656398773193, "logits/rejected": -2.45827054977417, "logps/chosen": -219.8842010498047, "logps/rejected": -184.69334411621094, "loss": 478.3666, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 36.203399658203125, "rewards/margins": 6.85970401763916, "rewards/rejected": 29.34369468688965, "step": 1280 }, { "epoch": 0.25318940137389595, "grad_norm": 4158.646585214779, "learning_rate": 4.6513753288822833e-07, "logits/chosen": -2.6670408248901367, "logits/rejected": -2.617326259613037, "logps/chosen": -126.4794692993164, "logps/rejected": -147.56619262695312, "loss": 373.9057, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 32.822105407714844, "rewards/margins": 7.652844429016113, "rewards/rejected": 25.169261932373047, "step": 1290 }, { "epoch": 0.25515210991167814, "grad_norm": 4963.405587351429, "learning_rate": 4.6425995971974265e-07, "logits/chosen": -2.599966287612915, "logits/rejected": -2.575037956237793, "logps/chosen": -255.7868194580078, "logps/rejected": -174.5104522705078, "loss": 466.948, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 27.49749755859375, "rewards/margins": -15.012187004089355, "rewards/rejected": 42.50968933105469, "step": 1300 }, { "epoch": 0.25711481844946027, "grad_norm": 5415.187880414271, "learning_rate": 4.633723274220824e-07, "logits/chosen": -2.679082155227661, "logits/rejected": -2.613781690597534, "logps/chosen": -257.79620361328125, "logps/rejected": -279.03985595703125, "loss": 522.431, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 39.13835906982422, "rewards/margins": -5.3103437423706055, "rewards/rejected": 44.448707580566406, "step": 1310 }, { "epoch": 0.2590775269872424, "grad_norm": 2682.375646417619, "learning_rate": 4.624746776680267e-07, "logits/chosen": -2.4919180870056152, "logits/rejected": -2.525841474533081, "logps/chosen": -224.8561553955078, "logps/rejected": -211.89932250976562, "loss": 418.4607, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 23.724262237548828, "rewards/margins": -11.103318214416504, "rewards/rejected": 34.827579498291016, "step": 1320 }, { "epoch": 0.26104023552502453, "grad_norm": 4531.628992068344, "learning_rate": 4.6156705260065634e-07, "logits/chosen": -2.464676856994629, "logits/rejected": -2.4597787857055664, "logps/chosen": -178.28427124023438, "logps/rejected": -178.5281219482422, "loss": 423.1229, "rewards/accuracies": 0.2666666507720947, "rewards/chosen": 30.889575958251953, "rewards/margins": -15.716069221496582, "rewards/rejected": 46.60564041137695, "step": 1330 }, { "epoch": 0.26300294406280667, "grad_norm": 4075.9441228112637, "learning_rate": 4.606494948313758e-07, "logits/chosen": -2.5901541709899902, "logits/rejected": -2.497117042541504, "logps/chosen": -204.6013946533203, "logps/rejected": -216.5972442626953, "loss": 445.6849, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 37.51020050048828, "rewards/margins": -1.5504966974258423, "rewards/rejected": 39.060699462890625, "step": 1340 }, { "epoch": 0.2649656526005888, "grad_norm": 5057.830525011509, "learning_rate": 4.597220474379125e-07, "logits/chosen": -2.656686782836914, "logits/rejected": -2.7043204307556152, "logps/chosen": -296.2122802734375, "logps/rejected": -289.3331604003906, "loss": 464.1113, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 31.062763214111328, "rewards/margins": -2.759782552719116, "rewards/rejected": 33.822547912597656, "step": 1350 }, { "epoch": 0.26692836113837093, "grad_norm": 3919.94236710167, "learning_rate": 4.587847539622942e-07, "logits/chosen": -2.622345447540283, "logits/rejected": -2.6256356239318848, "logps/chosen": -341.5245361328125, "logps/rejected": -288.871337890625, "loss": 485.2092, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 36.633548736572266, "rewards/margins": 2.233851671218872, "rewards/rejected": 34.39970016479492, "step": 1360 }, { "epoch": 0.2688910696761531, "grad_norm": 4920.1016303208735, "learning_rate": 4.5783765840880505e-07, "logits/chosen": -2.7265937328338623, "logits/rejected": -2.666107416152954, "logps/chosen": -282.02496337890625, "logps/rejected": -271.85833740234375, "loss": 492.7476, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 46.0014533996582, "rewards/margins": 11.102203369140625, "rewards/rejected": 34.89925003051758, "step": 1370 }, { "epoch": 0.27085377821393525, "grad_norm": 6729.628673965608, "learning_rate": 4.568808052419196e-07, "logits/chosen": -2.6116538047790527, "logits/rejected": -2.5717697143554688, "logps/chosen": -194.3059844970703, "logps/rejected": -184.2753143310547, "loss": 464.076, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 30.298664093017578, "rewards/margins": -3.016819477081299, "rewards/rejected": 33.31548309326172, "step": 1380 }, { "epoch": 0.2728164867517174, "grad_norm": 4537.998136989965, "learning_rate": 4.5591423938421513e-07, "logits/chosen": -2.5770394802093506, "logits/rejected": -2.492131471633911, "logps/chosen": -244.56942749023438, "logps/rejected": -211.59860229492188, "loss": 472.8399, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 39.29642105102539, "rewards/margins": -7.591795921325684, "rewards/rejected": 46.888221740722656, "step": 1390 }, { "epoch": 0.2747791952894995, "grad_norm": 4061.718067103401, "learning_rate": 4.549380062142627e-07, "logits/chosen": -2.6460325717926025, "logits/rejected": -2.594343900680542, "logps/chosen": -225.2256317138672, "logps/rejected": -253.9397735595703, "loss": 511.3023, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 35.28333282470703, "rewards/margins": -4.53452205657959, "rewards/rejected": 39.81785583496094, "step": 1400 }, { "epoch": 0.27674190382728164, "grad_norm": 6543.838463953099, "learning_rate": 4.5395215156449683e-07, "logits/chosen": -2.5624992847442627, "logits/rejected": -2.5937201976776123, "logps/chosen": -255.32485961914062, "logps/rejected": -288.13134765625, "loss": 467.8676, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 47.551822662353516, "rewards/margins": -13.412294387817383, "rewards/rejected": 60.9641227722168, "step": 1410 }, { "epoch": 0.2787046123650638, "grad_norm": 4174.873530739881, "learning_rate": 4.5295672171906365e-07, "logits/chosen": -2.61214542388916, "logits/rejected": -2.493485450744629, "logps/chosen": -234.7046661376953, "logps/rejected": -192.59169006347656, "loss": 454.9075, "rewards/accuracies": 0.5, "rewards/chosen": 30.760204315185547, "rewards/margins": -4.985699653625488, "rewards/rejected": 35.74591064453125, "step": 1420 }, { "epoch": 0.2806673209028459, "grad_norm": 3618.032412740325, "learning_rate": 4.5195176341164765e-07, "logits/chosen": -2.6637091636657715, "logits/rejected": -2.6291720867156982, "logps/chosen": -235.5375518798828, "logps/rejected": -263.6867370605469, "loss": 512.698, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 32.95092010498047, "rewards/margins": 1.7986339330673218, "rewards/rejected": 31.152286529541016, "step": 1430 }, { "epoch": 0.2826300294406281, "grad_norm": 4437.597451759081, "learning_rate": 4.509373238232782e-07, "logits/chosen": -2.4831185340881348, "logits/rejected": -2.482316732406616, "logps/chosen": -234.2296142578125, "logps/rejected": -185.9305419921875, "loss": 484.7292, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 64.58255004882812, "rewards/margins": 16.714292526245117, "rewards/rejected": 47.868255615234375, "step": 1440 }, { "epoch": 0.2845927379784102, "grad_norm": 4372.421620305732, "learning_rate": 4.499134505801141e-07, "logits/chosen": -2.519425868988037, "logits/rejected": -2.5516860485076904, "logps/chosen": -203.31507873535156, "logps/rejected": -210.93594360351562, "loss": 418.6516, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 27.685678482055664, "rewards/margins": -13.57408618927002, "rewards/rejected": 41.259761810302734, "step": 1450 }, { "epoch": 0.28655544651619236, "grad_norm": 4621.812160949887, "learning_rate": 4.488801917512076e-07, "logits/chosen": -2.62172269821167, "logits/rejected": -2.729160785675049, "logps/chosen": -257.2018127441406, "logps/rejected": -292.1290588378906, "loss": 477.9748, "rewards/accuracies": 0.5, "rewards/chosen": 32.220680236816406, "rewards/margins": -12.17983627319336, "rewards/rejected": 44.4005126953125, "step": 1460 }, { "epoch": 0.2885181550539745, "grad_norm": 5196.492652500245, "learning_rate": 4.478375958462479e-07, "logits/chosen": -2.566138744354248, "logits/rejected": -2.452779531478882, "logps/chosen": -257.0789794921875, "logps/rejected": -185.4340362548828, "loss": 460.3571, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 41.83165740966797, "rewards/margins": 1.7921245098114014, "rewards/rejected": 40.039527893066406, "step": 1470 }, { "epoch": 0.2904808635917566, "grad_norm": 4148.920158831787, "learning_rate": 4.467857118132833e-07, "logits/chosen": -2.556814670562744, "logits/rejected": -2.593320846557617, "logps/chosen": -215.48593139648438, "logps/rejected": -211.2922821044922, "loss": 457.0326, "rewards/accuracies": 0.5, "rewards/chosen": 31.426509857177734, "rewards/margins": -1.3721458911895752, "rewards/rejected": 32.79865646362305, "step": 1480 }, { "epoch": 0.29244357212953875, "grad_norm": 4273.569689206981, "learning_rate": 4.457245890364235e-07, "logits/chosen": -2.646486759185791, "logits/rejected": -2.502842426300049, "logps/chosen": -308.3550109863281, "logps/rejected": -232.5363006591797, "loss": 479.2763, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 41.21160888671875, "rewards/margins": 9.703352928161621, "rewards/rejected": 31.508255004882812, "step": 1490 }, { "epoch": 0.2944062806673209, "grad_norm": 4652.619016984834, "learning_rate": 4.4465427733352124e-07, "logits/chosen": -2.576737642288208, "logits/rejected": -2.547116279602051, "logps/chosen": -243.11770629882812, "logps/rejected": -234.5743865966797, "loss": 433.5632, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 39.526947021484375, "rewards/margins": 1.5665864944458008, "rewards/rejected": 37.960357666015625, "step": 1500 }, { "epoch": 0.296368989205103, "grad_norm": 5026.282262955393, "learning_rate": 4.43574826953833e-07, "logits/chosen": -2.5200655460357666, "logits/rejected": -2.495128631591797, "logps/chosen": -278.53692626953125, "logps/rejected": -280.415283203125, "loss": 453.0888, "rewards/accuracies": 0.29999998211860657, "rewards/chosen": 32.80377960205078, "rewards/margins": -12.697413444519043, "rewards/rejected": 45.501197814941406, "step": 1510 }, { "epoch": 0.2983316977428852, "grad_norm": 3558.0012849900972, "learning_rate": 4.4248628857565997e-07, "logits/chosen": -2.4504408836364746, "logits/rejected": -2.351283550262451, "logps/chosen": -299.6264953613281, "logps/rejected": -180.04957580566406, "loss": 428.5945, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 36.6990966796875, "rewards/margins": 4.336518287658691, "rewards/rejected": 32.36257553100586, "step": 1520 }, { "epoch": 0.30029440628066734, "grad_norm": 4483.806390292095, "learning_rate": 4.413887133039692e-07, "logits/chosen": -2.776367664337158, "logits/rejected": -2.4363582134246826, "logps/chosen": -366.796630859375, "logps/rejected": -245.8273162841797, "loss": 497.7816, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 42.83641815185547, "rewards/margins": 14.016261100769043, "rewards/rejected": 28.82015609741211, "step": 1530 }, { "epoch": 0.30225711481844947, "grad_norm": 4316.725031558483, "learning_rate": 4.4028215266799395e-07, "logits/chosen": -2.6478710174560547, "logits/rejected": -2.518744945526123, "logps/chosen": -237.1392822265625, "logps/rejected": -193.0914764404297, "loss": 463.9993, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 35.98551559448242, "rewards/margins": 7.529461860656738, "rewards/rejected": 28.456050872802734, "step": 1540 }, { "epoch": 0.3042198233562316, "grad_norm": 4976.322296464837, "learning_rate": 4.391666586188145e-07, "logits/chosen": -2.482912302017212, "logits/rejected": -2.411414623260498, "logps/chosen": -181.9075469970703, "logps/rejected": -195.3955535888672, "loss": 406.0816, "rewards/accuracies": 0.43333330750465393, "rewards/chosen": 30.790664672851562, "rewards/margins": -0.060872841626405716, "rewards/rejected": 30.85154151916504, "step": 1550 }, { "epoch": 0.30618253189401373, "grad_norm": 4631.128903009067, "learning_rate": 4.380422835269193e-07, "logits/chosen": -2.6279826164245605, "logits/rejected": -2.5860679149627686, "logps/chosen": -221.2560577392578, "logps/rejected": -227.3316192626953, "loss": 449.2048, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 32.71919631958008, "rewards/margins": -6.1832380294799805, "rewards/rejected": 38.90243148803711, "step": 1560 }, { "epoch": 0.30814524043179586, "grad_norm": 3520.0287409004613, "learning_rate": 4.3690908017974596e-07, "logits/chosen": -2.519622802734375, "logits/rejected": -2.4551331996917725, "logps/chosen": -193.1994171142578, "logps/rejected": -201.52053833007812, "loss": 396.6672, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 32.537864685058594, "rewards/margins": -39.2354736328125, "rewards/rejected": 71.7733383178711, "step": 1570 }, { "epoch": 0.310107948969578, "grad_norm": 5125.863786211232, "learning_rate": 4.3576710177920356e-07, "logits/chosen": -2.6030383110046387, "logits/rejected": -2.582373857498169, "logps/chosen": -198.5616912841797, "logps/rejected": -195.01638793945312, "loss": 405.4208, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 37.846439361572266, "rewards/margins": 2.210106372833252, "rewards/rejected": 35.636329650878906, "step": 1580 }, { "epoch": 0.3120706575073602, "grad_norm": 3936.5961043796497, "learning_rate": 4.346164019391742e-07, "logits/chosen": -2.6877033710479736, "logits/rejected": -2.572878360748291, "logps/chosen": -327.56475830078125, "logps/rejected": -299.89569091796875, "loss": 500.4787, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 33.7691650390625, "rewards/margins": 0.5889126062393188, "rewards/rejected": 33.18025588989258, "step": 1590 }, { "epoch": 0.3140333660451423, "grad_norm": 3855.976180413627, "learning_rate": 4.3345703468299634e-07, "logits/chosen": -2.392334461212158, "logits/rejected": -2.389955759048462, "logps/chosen": -237.44192504882812, "logps/rejected": -223.8633270263672, "loss": 473.9611, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 40.286521911621094, "rewards/margins": -6.727256774902344, "rewards/rejected": 47.01377868652344, "step": 1600 }, { "epoch": 0.31599607458292445, "grad_norm": 5009.958399563597, "learning_rate": 4.322890544409286e-07, "logits/chosen": -2.5345873832702637, "logits/rejected": -2.32863187789917, "logps/chosen": -261.5216369628906, "logps/rejected": -230.5297088623047, "loss": 489.7756, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 52.92722702026367, "rewards/margins": 24.752338409423828, "rewards/rejected": 28.174877166748047, "step": 1610 }, { "epoch": 0.3179587831207066, "grad_norm": 3977.265195051046, "learning_rate": 4.311125160475938e-07, "logits/chosen": -2.5485174655914307, "logits/rejected": -2.572300910949707, "logps/chosen": -235.796630859375, "logps/rejected": -310.8801574707031, "loss": 473.5826, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 36.304649353027344, "rewards/margins": -5.325404167175293, "rewards/rejected": 41.63005828857422, "step": 1620 }, { "epoch": 0.3199214916584887, "grad_norm": 4937.364126383082, "learning_rate": 4.299274747394055e-07, "logits/chosen": -2.4575133323669434, "logits/rejected": -2.456610679626465, "logps/chosen": -230.38827514648438, "logps/rejected": -210.62454223632812, "loss": 445.8371, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 35.976993560791016, "rewards/margins": 4.183018684387207, "rewards/rejected": 31.793975830078125, "step": 1630 }, { "epoch": 0.32188420019627084, "grad_norm": 5426.95587112137, "learning_rate": 4.287339861519737e-07, "logits/chosen": -2.54447340965271, "logits/rejected": -2.5630240440368652, "logps/chosen": -260.1687927246094, "logps/rejected": -245.56338500976562, "loss": 456.7415, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 39.52373504638672, "rewards/margins": 2.219897747039795, "rewards/rejected": 37.303836822509766, "step": 1640 }, { "epoch": 0.323846908734053, "grad_norm": 4842.585858336256, "learning_rate": 4.275321063174936e-07, "logits/chosen": -2.6207590103149414, "logits/rejected": -2.650418281555176, "logps/chosen": -298.9424133300781, "logps/rejected": -225.19558715820312, "loss": 488.1541, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 37.705909729003906, "rewards/margins": 7.440756320953369, "rewards/rejected": 30.265148162841797, "step": 1650 }, { "epoch": 0.3258096172718351, "grad_norm": 4378.29508595839, "learning_rate": 4.2632189166211454e-07, "logits/chosen": -2.476266622543335, "logits/rejected": -2.595913887023926, "logps/chosen": -204.14492797851562, "logps/rejected": -215.26126098632812, "loss": 449.5967, "rewards/accuracies": 0.40000003576278687, "rewards/chosen": 33.3272705078125, "rewards/margins": -15.386652946472168, "rewards/rejected": 48.71392059326172, "step": 1660 }, { "epoch": 0.3277723258096173, "grad_norm": 4410.765655477834, "learning_rate": 4.251033990032912e-07, "logits/chosen": -2.563709259033203, "logits/rejected": -2.550846576690674, "logps/chosen": -253.3117218017578, "logps/rejected": -275.704833984375, "loss": 464.9021, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 43.785118103027344, "rewards/margins": 15.132052421569824, "rewards/rejected": 28.653064727783203, "step": 1670 }, { "epoch": 0.3297350343473994, "grad_norm": 4903.83578654043, "learning_rate": 4.238766855471161e-07, "logits/chosen": -2.5991299152374268, "logits/rejected": -2.599780321121216, "logps/chosen": -297.52630615234375, "logps/rejected": -195.06985473632812, "loss": 473.8348, "rewards/accuracies": 0.5, "rewards/chosen": 49.35599899291992, "rewards/margins": 9.37015438079834, "rewards/rejected": 39.98583984375, "step": 1680 }, { "epoch": 0.33169774288518156, "grad_norm": 5158.016326959571, "learning_rate": 4.226418088856335e-07, "logits/chosen": -2.48799467086792, "logits/rejected": -2.589542865753174, "logps/chosen": -228.2854766845703, "logps/rejected": -296.7008972167969, "loss": 479.6581, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 33.118438720703125, "rewards/margins": -2.1525566577911377, "rewards/rejected": 35.27099609375, "step": 1690 }, { "epoch": 0.3336604514229637, "grad_norm": 4286.561139019483, "learning_rate": 4.2139882699413613e-07, "logits/chosen": -2.692596673965454, "logits/rejected": -2.530569314956665, "logps/chosen": -234.590087890625, "logps/rejected": -154.15579223632812, "loss": 410.8604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 37.41648864746094, "rewards/margins": 0.9406414031982422, "rewards/rejected": 36.47584915161133, "step": 1700 }, { "epoch": 0.3356231599607458, "grad_norm": 4907.107578923295, "learning_rate": 4.2014779822844274e-07, "logits/chosen": -2.5812151432037354, "logits/rejected": -2.5249361991882324, "logps/chosen": -194.2213897705078, "logps/rejected": -230.5895538330078, "loss": 483.7218, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 35.815399169921875, "rewards/margins": 10.298858642578125, "rewards/rejected": 25.516538619995117, "step": 1710 }, { "epoch": 0.33758586849852795, "grad_norm": 5150.776933193368, "learning_rate": 4.18888781322159e-07, "logits/chosen": -2.5948588848114014, "logits/rejected": -2.361288547515869, "logps/chosen": -212.5074462890625, "logps/rejected": -220.21035766601562, "loss": 468.4072, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 34.529293060302734, "rewards/margins": -2.4311232566833496, "rewards/rejected": 36.96042251586914, "step": 1720 }, { "epoch": 0.3395485770363101, "grad_norm": 4734.509257051262, "learning_rate": 4.176218353839195e-07, "logits/chosen": -2.6863338947296143, "logits/rejected": -2.690185308456421, "logps/chosen": -226.93643188476562, "logps/rejected": -184.9244842529297, "loss": 455.7779, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 35.38648223876953, "rewards/margins": 10.249422073364258, "rewards/rejected": 25.137056350708008, "step": 1730 }, { "epoch": 0.34151128557409227, "grad_norm": 4277.488404749702, "learning_rate": 4.1634701989461325e-07, "logits/chosen": -2.5563833713531494, "logits/rejected": -2.575597047805786, "logps/chosen": -227.07296752929688, "logps/rejected": -235.7079315185547, "loss": 463.36, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 41.53756332397461, "rewards/margins": -0.9080625772476196, "rewards/rejected": 42.44562911987305, "step": 1740 }, { "epoch": 0.3434739941118744, "grad_norm": 5121.3329084397665, "learning_rate": 4.1506439470459056e-07, "logits/chosen": -2.5935757160186768, "logits/rejected": -2.6307332515716553, "logps/chosen": -204.5602569580078, "logps/rejected": -185.46250915527344, "loss": 415.0569, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 29.67550277709961, "rewards/margins": -6.236639976501465, "rewards/rejected": 35.91214370727539, "step": 1750 }, { "epoch": 0.34543670264965654, "grad_norm": 4288.60724056639, "learning_rate": 4.137740200308537e-07, "logits/chosen": -2.769371509552002, "logits/rejected": -2.630138874053955, "logps/chosen": -255.390625, "logps/rejected": -238.37637329101562, "loss": 447.6325, "rewards/accuracies": 0.76666659116745, "rewards/chosen": 35.000877380371094, "rewards/margins": 6.092411994934082, "rewards/rejected": 28.90846824645996, "step": 1760 }, { "epoch": 0.34739941118743867, "grad_norm": 4066.385997799853, "learning_rate": 4.124759564542295e-07, "logits/chosen": -2.6266300678253174, "logits/rejected": -2.5432302951812744, "logps/chosen": -257.3196716308594, "logps/rejected": -187.99392700195312, "loss": 444.7196, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 38.820655822753906, "rewards/margins": 14.107760429382324, "rewards/rejected": 24.712894439697266, "step": 1770 }, { "epoch": 0.3493621197252208, "grad_norm": 4219.533318286452, "learning_rate": 4.111702649165255e-07, "logits/chosen": -2.7003438472747803, "logits/rejected": -2.574720859527588, "logps/chosen": -214.9884490966797, "logps/rejected": -178.40650939941406, "loss": 396.0387, "rewards/accuracies": 0.533333420753479, "rewards/chosen": 36.658668518066406, "rewards/margins": -0.34727534651756287, "rewards/rejected": 37.005943298339844, "step": 1780 }, { "epoch": 0.35132482826300293, "grad_norm": 4522.624284940719, "learning_rate": 4.0985700671766834e-07, "logits/chosen": -2.5886049270629883, "logits/rejected": -2.4019060134887695, "logps/chosen": -305.7603454589844, "logps/rejected": -250.5217742919922, "loss": 504.2884, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 49.30162811279297, "rewards/margins": 18.17063331604004, "rewards/rejected": 31.130992889404297, "step": 1790 }, { "epoch": 0.35328753680078506, "grad_norm": 4148.605801079037, "learning_rate": 4.085362435128262e-07, "logits/chosen": -2.5969197750091553, "logits/rejected": -2.430412530899048, "logps/chosen": -253.0435028076172, "logps/rejected": -213.9705047607422, "loss": 450.6715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 34.25432586669922, "rewards/margins": -2.4074110984802246, "rewards/rejected": 36.66173553466797, "step": 1800 }, { "epoch": 0.35525024533856725, "grad_norm": 4383.445324888864, "learning_rate": 4.0720803730951423e-07, "logits/chosen": -2.7193925380706787, "logits/rejected": -2.553121566772461, "logps/chosen": -273.88330078125, "logps/rejected": -160.2400665283203, "loss": 465.7694, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 36.27952194213867, "rewards/margins": 3.4309074878692627, "rewards/rejected": 32.84861373901367, "step": 1810 }, { "epoch": 0.3572129538763494, "grad_norm": 4650.197621021058, "learning_rate": 4.058724504646834e-07, "logits/chosen": -2.537329912185669, "logits/rejected": -2.4770290851593018, "logps/chosen": -188.79598999023438, "logps/rejected": -176.62588500976562, "loss": 420.7817, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 39.98280715942383, "rewards/margins": -10.657835006713867, "rewards/rejected": 50.64064407348633, "step": 1820 }, { "epoch": 0.3591756624141315, "grad_norm": 5154.40174453461, "learning_rate": 4.045295456817924e-07, "logits/chosen": -2.560668706893921, "logits/rejected": -2.5223541259765625, "logps/chosen": -246.7219696044922, "logps/rejected": -244.5557098388672, "loss": 492.5268, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 30.156757354736328, "rewards/margins": -3.125023126602173, "rewards/rejected": 33.281776428222656, "step": 1830 }, { "epoch": 0.36113837095191365, "grad_norm": 4277.633576382915, "learning_rate": 4.0317938600786484e-07, "logits/chosen": -2.63722562789917, "logits/rejected": -2.671647071838379, "logps/chosen": -261.1764831542969, "logps/rejected": -239.5066680908203, "loss": 515.1007, "rewards/accuracies": 0.43333330750465393, "rewards/chosen": 33.79315948486328, "rewards/margins": -2.2674169540405273, "rewards/rejected": 36.06057357788086, "step": 1840 }, { "epoch": 0.3631010794896958, "grad_norm": 4849.989172933969, "learning_rate": 4.0182203483052825e-07, "logits/chosen": -2.4976391792297363, "logits/rejected": -2.4753127098083496, "logps/chosen": -247.03573608398438, "logps/rejected": -167.9617919921875, "loss": 464.8464, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 33.916893005371094, "rewards/margins": -3.894451856613159, "rewards/rejected": 37.81134796142578, "step": 1850 }, { "epoch": 0.3650637880274779, "grad_norm": 5305.396800713257, "learning_rate": 4.004575558750389e-07, "logits/chosen": -2.7480387687683105, "logits/rejected": -2.552293300628662, "logps/chosen": -307.5226135253906, "logps/rejected": -270.4818420410156, "loss": 515.8467, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 43.089561462402344, "rewards/margins": 12.114897727966309, "rewards/rejected": 30.97466468811035, "step": 1860 }, { "epoch": 0.36702649656526004, "grad_norm": 3766.794463439121, "learning_rate": 3.9908601320128976e-07, "logits/chosen": -2.347365140914917, "logits/rejected": -2.384580135345459, "logps/chosen": -201.86337280273438, "logps/rejected": -206.92623901367188, "loss": 455.4958, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 35.576011657714844, "rewards/margins": 0.6420882940292358, "rewards/rejected": 34.93392562866211, "step": 1870 }, { "epoch": 0.3689892051030422, "grad_norm": 5137.166654216243, "learning_rate": 3.9770747120080284e-07, "logits/chosen": -2.542323112487793, "logits/rejected": -2.541592836380005, "logps/chosen": -189.4404754638672, "logps/rejected": -167.32725524902344, "loss": 418.2102, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 36.097442626953125, "rewards/margins": -5.9926018714904785, "rewards/rejected": 42.09004592895508, "step": 1880 }, { "epoch": 0.37095191364082436, "grad_norm": 4046.4653595977607, "learning_rate": 3.963219945937063e-07, "logits/chosen": -2.5712907314300537, "logits/rejected": -2.4558281898498535, "logps/chosen": -199.93008422851562, "logps/rejected": -189.0198974609375, "loss": 478.3735, "rewards/accuracies": 0.5, "rewards/chosen": 33.24176788330078, "rewards/margins": -3.3829407691955566, "rewards/rejected": 36.62471389770508, "step": 1890 }, { "epoch": 0.3729146221786065, "grad_norm": 5011.92916099243, "learning_rate": 3.949296484256959e-07, "logits/chosen": -2.6864089965820312, "logits/rejected": -2.579397201538086, "logps/chosen": -211.36483764648438, "logps/rejected": -207.35995483398438, "loss": 417.3505, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 43.83955383300781, "rewards/margins": 7.592188358306885, "rewards/rejected": 36.24736785888672, "step": 1900 }, { "epoch": 0.3748773307163886, "grad_norm": 4019.496297350411, "learning_rate": 3.935304980649813e-07, "logits/chosen": -2.607959032058716, "logits/rejected": -2.6045727729797363, "logps/chosen": -265.7958068847656, "logps/rejected": -236.3028564453125, "loss": 426.1907, "rewards/accuracies": 0.36666664481163025, "rewards/chosen": 34.28295135498047, "rewards/margins": -8.695396423339844, "rewards/rejected": 42.97834396362305, "step": 1910 }, { "epoch": 0.37684003925417076, "grad_norm": 4356.423244834971, "learning_rate": 3.92124609199217e-07, "logits/chosen": -2.481682538986206, "logits/rejected": -2.5561962127685547, "logps/chosen": -168.49618530273438, "logps/rejected": -182.44041442871094, "loss": 407.5496, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 35.77836990356445, "rewards/margins": 0.9616115689277649, "rewards/rejected": 34.81675720214844, "step": 1920 }, { "epoch": 0.3788027477919529, "grad_norm": 4486.626433944729, "learning_rate": 3.907120478324185e-07, "logits/chosen": -2.5384328365325928, "logits/rejected": -2.544644832611084, "logps/chosen": -250.89175415039062, "logps/rejected": -236.45236206054688, "loss": 485.9587, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 38.95748519897461, "rewards/margins": 4.464973449707031, "rewards/rejected": 34.492515563964844, "step": 1930 }, { "epoch": 0.380765456329735, "grad_norm": 5957.488621544725, "learning_rate": 3.8929288028186364e-07, "logits/chosen": -2.544926166534424, "logits/rejected": -2.453709363937378, "logps/chosen": -185.8582763671875, "logps/rejected": -159.22975158691406, "loss": 432.8231, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 33.13045120239258, "rewards/margins": -4.596245765686035, "rewards/rejected": 37.7266960144043, "step": 1940 }, { "epoch": 0.38272816486751715, "grad_norm": 4577.262664079208, "learning_rate": 3.8786717317497875e-07, "logits/chosen": -2.484395980834961, "logits/rejected": -2.5102570056915283, "logps/chosen": -275.0531311035156, "logps/rejected": -228.39883422851562, "loss": 430.9164, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 38.611968994140625, "rewards/margins": -3.3621764183044434, "rewards/rejected": 41.974151611328125, "step": 1950 }, { "epoch": 0.38469087340529934, "grad_norm": 4253.196394738559, "learning_rate": 3.864349934462111e-07, "logits/chosen": -2.5822837352752686, "logits/rejected": -2.562549114227295, "logps/chosen": -227.0452423095703, "logps/rejected": -201.46873474121094, "loss": 461.1355, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 35.39836120605469, "rewards/margins": -8.067391395568848, "rewards/rejected": 43.46575164794922, "step": 1960 }, { "epoch": 0.38665358194308147, "grad_norm": 4509.157666462303, "learning_rate": 3.84996408333886e-07, "logits/chosen": -2.7467126846313477, "logits/rejected": -2.5748817920684814, "logps/chosen": -269.403076171875, "logps/rejected": -185.69235229492188, "loss": 484.4049, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 36.83091735839844, "rewards/margins": 1.1465529203414917, "rewards/rejected": 35.684364318847656, "step": 1970 }, { "epoch": 0.3886162904808636, "grad_norm": 4336.3496772389035, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -2.700141191482544, "logits/rejected": -2.7247486114501953, "logps/chosen": -200.31036376953125, "logps/rejected": -166.716064453125, "loss": 438.6323, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 30.3387393951416, "rewards/margins": -7.581428527832031, "rewards/rejected": 37.920169830322266, "step": 1980 }, { "epoch": 0.39057899901864573, "grad_norm": 5348.9980937018945, "learning_rate": 3.8210029241230204e-07, "logits/chosen": -2.7195792198181152, "logits/rejected": -2.5863280296325684, "logps/chosen": -295.79119873046875, "logps/rejected": -250.6574249267578, "loss": 559.4731, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 52.1451416015625, "rewards/margins": 19.041133880615234, "rewards/rejected": 33.10401153564453, "step": 1990 }, { "epoch": 0.39254170755642787, "grad_norm": 4816.794446517075, "learning_rate": 3.806428975706042e-07, "logits/chosen": -2.3942437171936035, "logits/rejected": -2.3860244750976562, "logps/chosen": -181.06466674804688, "logps/rejected": -195.17808532714844, "loss": 387.9617, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 53.385047912597656, "rewards/margins": 18.948444366455078, "rewards/rejected": 34.43659973144531, "step": 2000 }, { "epoch": 0.39450441609421, "grad_norm": 6070.466031055457, "learning_rate": 3.791793692740876e-07, "logits/chosen": -2.4864609241485596, "logits/rejected": -2.498897075653076, "logps/chosen": -194.35671997070312, "logps/rejected": -140.57203674316406, "loss": 423.7441, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 38.10416793823242, "rewards/margins": 10.152827262878418, "rewards/rejected": 27.951335906982422, "step": 2010 }, { "epoch": 0.39646712463199213, "grad_norm": 4731.301435131496, "learning_rate": 3.777097762328381e-07, "logits/chosen": -2.6302943229675293, "logits/rejected": -2.582368850708008, "logps/chosen": -262.92559814453125, "logps/rejected": -223.01577758789062, "loss": 458.122, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 33.463165283203125, "rewards/margins": -4.165156364440918, "rewards/rejected": 37.62831497192383, "step": 2020 }, { "epoch": 0.39842983316977426, "grad_norm": 3182.878603855419, "learning_rate": 3.762341874416702e-07, "logits/chosen": -2.4996063709259033, "logits/rejected": -2.3436052799224854, "logps/chosen": -191.55349731445312, "logps/rejected": -135.09239196777344, "loss": 406.8957, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 35.906471252441406, "rewards/margins": 7.290860652923584, "rewards/rejected": 28.615610122680664, "step": 2030 }, { "epoch": 0.40039254170755645, "grad_norm": 4112.393036976447, "learning_rate": 3.7475267217688896e-07, "logits/chosen": -2.5368053913116455, "logits/rejected": -2.6471173763275146, "logps/chosen": -165.9794158935547, "logps/rejected": -209.2161407470703, "loss": 420.6569, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 34.43918991088867, "rewards/margins": -9.173788070678711, "rewards/rejected": 43.61297607421875, "step": 2040 }, { "epoch": 0.4023552502453386, "grad_norm": 4158.868734597658, "learning_rate": 3.7326529999303633e-07, "logits/chosen": -2.4712069034576416, "logits/rejected": -2.466320753097534, "logps/chosen": -183.16184997558594, "logps/rejected": -216.8333740234375, "loss": 419.9413, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 32.53175354003906, "rewards/margins": -8.037252426147461, "rewards/rejected": 40.569000244140625, "step": 2050 }, { "epoch": 0.4043179587831207, "grad_norm": 4584.7452639371095, "learning_rate": 3.7177214071962684e-07, "logits/chosen": -2.5812697410583496, "logits/rejected": -2.6067354679107666, "logps/chosen": -212.4852752685547, "logps/rejected": -277.4024353027344, "loss": 485.5656, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 39.76322555541992, "rewards/margins": 2.1475565433502197, "rewards/rejected": 37.615665435791016, "step": 2060 }, { "epoch": 0.40628066732090284, "grad_norm": 4168.50229175131, "learning_rate": 3.7027326445786835e-07, "logits/chosen": -2.6407763957977295, "logits/rejected": -2.5833470821380615, "logps/chosen": -214.7595977783203, "logps/rejected": -206.1675262451172, "loss": 450.9582, "rewards/accuracies": 0.5, "rewards/chosen": 41.8992805480957, "rewards/margins": 10.203690528869629, "rewards/rejected": 31.69559097290039, "step": 2070 }, { "epoch": 0.408243375858685, "grad_norm": 4744.500799794224, "learning_rate": 3.6876874157737167e-07, "logits/chosen": -2.6174182891845703, "logits/rejected": -2.624634265899658, "logps/chosen": -227.5395050048828, "logps/rejected": -241.0737762451172, "loss": 471.8342, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 33.56273651123047, "rewards/margins": -9.664677619934082, "rewards/rejected": 43.227413177490234, "step": 2080 }, { "epoch": 0.4102060843964671, "grad_norm": 4440.846443142311, "learning_rate": 3.67258642712846e-07, "logits/chosen": -2.6877191066741943, "logits/rejected": -2.5582659244537354, "logps/chosen": -216.27066040039062, "logps/rejected": -167.98318481445312, "loss": 436.4445, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 37.20050048828125, "rewards/margins": 7.238005638122559, "rewards/rejected": 29.96249771118164, "step": 2090 }, { "epoch": 0.41216879293424924, "grad_norm": 4600.09562625597, "learning_rate": 3.6574303876078366e-07, "logits/chosen": -2.7248592376708984, "logits/rejected": -2.645047664642334, "logps/chosen": -243.5182342529297, "logps/rejected": -232.8569793701172, "loss": 449.7953, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 39.2998046875, "rewards/margins": -1.4524590969085693, "rewards/rejected": 40.75226593017578, "step": 2100 }, { "epoch": 0.4141315014720314, "grad_norm": 5993.644361974152, "learning_rate": 3.642220008761309e-07, "logits/chosen": -2.7186810970306396, "logits/rejected": -2.6466875076293945, "logps/chosen": -287.5413513183594, "logps/rejected": -254.6502227783203, "loss": 533.4502, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 40.02311706542969, "rewards/margins": 7.727900505065918, "rewards/rejected": 32.29521942138672, "step": 2110 }, { "epoch": 0.41609421000981356, "grad_norm": 3941.922949118662, "learning_rate": 3.626956004689476e-07, "logits/chosen": -2.73811411857605, "logits/rejected": -2.5903234481811523, "logps/chosen": -314.7489929199219, "logps/rejected": -184.81614685058594, "loss": 418.1722, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 46.809268951416016, "rewards/margins": 16.42887306213379, "rewards/rejected": 30.380395889282227, "step": 2120 }, { "epoch": 0.4180569185475957, "grad_norm": 4994.62562305448, "learning_rate": 3.6116390920105474e-07, "logits/chosen": -2.7437849044799805, "logits/rejected": -2.695708990097046, "logps/chosen": -231.01681518554688, "logps/rejected": -200.68460083007812, "loss": 488.287, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 36.310768127441406, "rewards/margins": 2.005819320678711, "rewards/rejected": 34.30495071411133, "step": 2130 }, { "epoch": 0.4200196270853778, "grad_norm": 4375.304026236624, "learning_rate": 3.5962699898266983e-07, "logits/chosen": -2.6645920276641846, "logits/rejected": -2.5936334133148193, "logps/chosen": -211.1452178955078, "logps/rejected": -186.7575225830078, "loss": 419.7613, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 32.87139129638672, "rewards/margins": 3.3978514671325684, "rewards/rejected": 29.473541259765625, "step": 2140 }, { "epoch": 0.42198233562315995, "grad_norm": 3665.1490476903364, "learning_rate": 3.5808494196903117e-07, "logits/chosen": -2.5833377838134766, "logits/rejected": -2.525230884552002, "logps/chosen": -293.2630615234375, "logps/rejected": -173.25833129882812, "loss": 479.6071, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 32.98417282104492, "rewards/margins": -8.318937301635742, "rewards/rejected": 41.3031120300293, "step": 2150 }, { "epoch": 0.4239450441609421, "grad_norm": 4315.555600452838, "learning_rate": 3.565378105570097e-07, "logits/chosen": -2.62446665763855, "logits/rejected": -2.5740838050842285, "logps/chosen": -244.9754638671875, "logps/rejected": -180.99819946289062, "loss": 438.9187, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 39.099754333496094, "rewards/margins": 12.707671165466309, "rewards/rejected": 26.392078399658203, "step": 2160 }, { "epoch": 0.4259077526987242, "grad_norm": 5510.7039652280155, "learning_rate": 3.549856773817107e-07, "logits/chosen": -2.575657367706299, "logits/rejected": -2.4953224658966064, "logps/chosen": -205.1666717529297, "logps/rejected": -187.93984985351562, "loss": 485.9353, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 33.390872955322266, "rewards/margins": 5.281000137329102, "rewards/rejected": 28.109872817993164, "step": 2170 }, { "epoch": 0.4278704612365064, "grad_norm": 5120.327309285725, "learning_rate": 3.5342861531306344e-07, "logits/chosen": -2.593902111053467, "logits/rejected": -2.543280601501465, "logps/chosen": -217.9532928466797, "logps/rejected": -180.51889038085938, "loss": 467.9969, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 36.239341735839844, "rewards/margins": 0.7464396357536316, "rewards/rejected": 35.492897033691406, "step": 2180 }, { "epoch": 0.42983316977428854, "grad_norm": 4710.801534208424, "learning_rate": 3.518666974524002e-07, "logits/chosen": -2.6590843200683594, "logits/rejected": -2.586304187774658, "logps/chosen": -280.81085205078125, "logps/rejected": -236.250732421875, "loss": 466.6427, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 35.222198486328125, "rewards/margins": -1.8785030841827393, "rewards/rejected": 37.10070037841797, "step": 2190 }, { "epoch": 0.43179587831207067, "grad_norm": 4639.368213133138, "learning_rate": 3.5029999712902387e-07, "logits/chosen": -2.7458598613739014, "logits/rejected": -2.742999792098999, "logps/chosen": -292.08331298828125, "logps/rejected": -315.49456787109375, "loss": 494.2247, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 37.22055435180664, "rewards/margins": -1.9078445434570312, "rewards/rejected": 39.12839889526367, "step": 2200 }, { "epoch": 0.4337585868498528, "grad_norm": 4075.814076061963, "learning_rate": 3.4872858789676583e-07, "logits/chosen": -2.4446651935577393, "logits/rejected": -2.519179582595825, "logps/chosen": -198.51075744628906, "logps/rejected": -195.6839599609375, "loss": 423.1005, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 34.416419982910156, "rewards/margins": -4.6278910636901855, "rewards/rejected": 39.044315338134766, "step": 2210 }, { "epoch": 0.43572129538763493, "grad_norm": 5623.558616130809, "learning_rate": 3.4715254353053236e-07, "logits/chosen": -2.5616421699523926, "logits/rejected": -2.620027780532837, "logps/chosen": -235.35067749023438, "logps/rejected": -243.73171997070312, "loss": 478.7932, "rewards/accuracies": 0.3333333134651184, "rewards/chosen": 36.581520080566406, "rewards/margins": -0.6448574066162109, "rewards/rejected": 37.226375579833984, "step": 2220 }, { "epoch": 0.43768400392541706, "grad_norm": 4862.19129057131, "learning_rate": 3.4557193802284123e-07, "logits/chosen": -2.5887420177459717, "logits/rejected": -2.5235865116119385, "logps/chosen": -227.75057983398438, "logps/rejected": -214.9437255859375, "loss": 468.0142, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 36.97982406616211, "rewards/margins": -6.219507694244385, "rewards/rejected": 43.19933319091797, "step": 2230 }, { "epoch": 0.4396467124631992, "grad_norm": 4734.63707338868, "learning_rate": 3.4398684558034763e-07, "logits/chosen": -2.451140880584717, "logits/rejected": -2.4948220252990723, "logps/chosen": -226.34683227539062, "logps/rejected": -202.0099639892578, "loss": 465.8895, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 42.129207611083984, "rewards/margins": -5.78919792175293, "rewards/rejected": 47.91840362548828, "step": 2240 }, { "epoch": 0.44160942100098133, "grad_norm": 6076.960687119696, "learning_rate": 3.4239734062036067e-07, "logits/chosen": -2.6000795364379883, "logits/rejected": -2.6038219928741455, "logps/chosen": -245.7303466796875, "logps/rejected": -232.3347930908203, "loss": 441.4258, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 38.31798553466797, "rewards/margins": -9.228495597839355, "rewards/rejected": 47.546485900878906, "step": 2250 }, { "epoch": 0.4435721295387635, "grad_norm": 4559.100462974265, "learning_rate": 3.4080349776734924e-07, "logits/chosen": -2.5710062980651855, "logits/rejected": -2.5078659057617188, "logps/chosen": -257.9673767089844, "logps/rejected": -238.927734375, "loss": 445.9494, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 40.69451141357422, "rewards/margins": 1.9841305017471313, "rewards/rejected": 38.71038818359375, "step": 2260 }, { "epoch": 0.44553483807654565, "grad_norm": 4245.220565699372, "learning_rate": 3.392053918494389e-07, "logits/chosen": -2.533465623855591, "logits/rejected": -2.56412672996521, "logps/chosen": -275.1715087890625, "logps/rejected": -240.7425994873047, "loss": 470.4912, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 31.101980209350586, "rewards/margins": 1.7008116245269775, "rewards/rejected": 29.401172637939453, "step": 2270 }, { "epoch": 0.4474975466143278, "grad_norm": 4162.537615965158, "learning_rate": 3.376030978948983e-07, "logits/chosen": -2.4892563819885254, "logits/rejected": -2.392144203186035, "logps/chosen": -279.1177062988281, "logps/rejected": -255.7156524658203, "loss": 457.0973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 43.22492218017578, "rewards/margins": 11.627494812011719, "rewards/rejected": 31.597427368164062, "step": 2280 }, { "epoch": 0.4494602551521099, "grad_norm": 5230.945935986782, "learning_rate": 3.3599669112861756e-07, "logits/chosen": -2.6333365440368652, "logits/rejected": -2.6040003299713135, "logps/chosen": -226.58486938476562, "logps/rejected": -245.96005249023438, "loss": 477.1376, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 42.69200897216797, "rewards/margins": -3.276989698410034, "rewards/rejected": 45.968997955322266, "step": 2290 }, { "epoch": 0.45142296368989204, "grad_norm": 3929.202252727979, "learning_rate": 3.343862469685755e-07, "logits/chosen": -2.5004894733428955, "logits/rejected": -2.4886584281921387, "logps/chosen": -208.4034881591797, "logps/rejected": -188.3735809326172, "loss": 456.2935, "rewards/accuracies": 0.5, "rewards/chosen": 31.812917709350586, "rewards/margins": -6.975862979888916, "rewards/rejected": 38.788780212402344, "step": 2300 }, { "epoch": 0.4533856722276742, "grad_norm": 5214.156588475492, "learning_rate": 3.3277184102230004e-07, "logits/chosen": -2.7178711891174316, "logits/rejected": -2.724022388458252, "logps/chosen": -234.6586456298828, "logps/rejected": -246.9966583251953, "loss": 499.9511, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 34.978572845458984, "rewards/margins": 7.584236145019531, "rewards/rejected": 27.394336700439453, "step": 2310 }, { "epoch": 0.4553483807654563, "grad_norm": 4575.858557653078, "learning_rate": 3.311535490833176e-07, "logits/chosen": -2.4982151985168457, "logits/rejected": -2.4811289310455322, "logps/chosen": -194.088134765625, "logps/rejected": -242.5849151611328, "loss": 499.5238, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 54.86347198486328, "rewards/margins": 10.496244430541992, "rewards/rejected": 44.36722946166992, "step": 2320 }, { "epoch": 0.4573110893032385, "grad_norm": 4641.516621248123, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -2.662986993789673, "logits/rejected": -2.5589983463287354, "logps/chosen": -304.39935302734375, "logps/rejected": -216.46932983398438, "loss": 479.5048, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 38.267696380615234, "rewards/margins": 5.254390716552734, "rewards/rejected": 33.013301849365234, "step": 2330 }, { "epoch": 0.4592737978410206, "grad_norm": 4408.044494580562, "learning_rate": 3.279056113099742e-07, "logits/chosen": -2.619048833847046, "logits/rejected": -2.541274309158325, "logps/chosen": -247.12771606445312, "logps/rejected": -290.5010681152344, "loss": 460.61, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 40.39784240722656, "rewards/margins": 1.5373001098632812, "rewards/rejected": 38.86054229736328, "step": 2340 }, { "epoch": 0.46123650637880276, "grad_norm": 4233.056941895596, "learning_rate": 3.2627611796059283e-07, "logits/chosen": -2.5531439781188965, "logits/rejected": -2.4957103729248047, "logps/chosen": -233.532958984375, "logps/rejected": -212.55191040039062, "loss": 494.6469, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 39.436241149902344, "rewards/margins": 13.565861701965332, "rewards/rejected": 25.870376586914062, "step": 2350 }, { "epoch": 0.4631992149165849, "grad_norm": 5123.462575919028, "learning_rate": 3.246430435813051e-07, "logits/chosen": -2.6400294303894043, "logits/rejected": -2.5540122985839844, "logps/chosen": -237.27371215820312, "logps/rejected": -184.3572235107422, "loss": 444.0543, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 36.393699645996094, "rewards/margins": 2.897949695587158, "rewards/rejected": 33.495750427246094, "step": 2360 }, { "epoch": 0.465161923454367, "grad_norm": 3504.0236788538973, "learning_rate": 3.230064648420878e-07, "logits/chosen": -2.6284031867980957, "logits/rejected": -2.446946382522583, "logps/chosen": -233.88278198242188, "logps/rejected": -162.52059936523438, "loss": 412.3497, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 42.815452575683594, "rewards/margins": 12.520002365112305, "rewards/rejected": 30.29545021057129, "step": 2370 }, { "epoch": 0.46712463199214915, "grad_norm": 4766.94598434386, "learning_rate": 3.2136645857744114e-07, "logits/chosen": -2.397789239883423, "logits/rejected": -2.3977391719818115, "logps/chosen": -189.5882568359375, "logps/rejected": -226.1632843017578, "loss": 447.2617, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 42.929969787597656, "rewards/margins": -7.894274711608887, "rewards/rejected": 50.824241638183594, "step": 2380 }, { "epoch": 0.4690873405299313, "grad_norm": 4702.600347462076, "learning_rate": 3.197231017827818e-07, "logits/chosen": -2.5886404514312744, "logits/rejected": -2.542672872543335, "logps/chosen": -241.9503173828125, "logps/rejected": -225.9481658935547, "loss": 400.5882, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 39.55445098876953, "rewards/margins": 4.344472408294678, "rewards/rejected": 35.20997619628906, "step": 2390 }, { "epoch": 0.47105004906771347, "grad_norm": 5874.7133346146, "learning_rate": 3.1807647161082797e-07, "logits/chosen": -2.6827895641326904, "logits/rejected": -2.5796356201171875, "logps/chosen": -223.42575073242188, "logps/rejected": -222.6304168701172, "loss": 508.7384, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 51.37737274169922, "rewards/margins": 16.392112731933594, "rewards/rejected": 34.98526382446289, "step": 2400 }, { "epoch": 0.4730127576054956, "grad_norm": 4080.6756176221616, "learning_rate": 3.1642664536797693e-07, "logits/chosen": -2.5640296936035156, "logits/rejected": -2.45853328704834, "logps/chosen": -232.3906707763672, "logps/rejected": -225.85330200195312, "loss": 418.0434, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 40.605072021484375, "rewards/margins": -7.888661861419678, "rewards/rejected": 48.49373245239258, "step": 2410 }, { "epoch": 0.47497546614327774, "grad_norm": 4602.357148985481, "learning_rate": 3.147737005106762e-07, "logits/chosen": -2.554788112640381, "logits/rejected": -2.701284646987915, "logps/chosen": -270.83477783203125, "logps/rejected": -232.8144073486328, "loss": 454.253, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 28.20511817932129, "rewards/margins": -13.63386058807373, "rewards/rejected": 41.8389778137207, "step": 2420 }, { "epoch": 0.47693817468105987, "grad_norm": 5186.892444597569, "learning_rate": 3.1311771464178655e-07, "logits/chosen": -2.549485683441162, "logits/rejected": -2.512749195098877, "logps/chosen": -263.4794616699219, "logps/rejected": -182.04495239257812, "loss": 462.0962, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 31.768932342529297, "rewards/margins": -2.404651165008545, "rewards/rejected": 34.173583984375, "step": 2430 }, { "epoch": 0.478900883218842, "grad_norm": 5048.1658714524065, "learning_rate": 3.1145876550693893e-07, "logits/chosen": -2.690701961517334, "logits/rejected": -2.6157898902893066, "logps/chosen": -246.66543579101562, "logps/rejected": -191.4427032470703, "loss": 429.1527, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 39.08533477783203, "rewards/margins": -3.444082260131836, "rewards/rejected": 42.5294189453125, "step": 2440 }, { "epoch": 0.48086359175662413, "grad_norm": 4525.940898564814, "learning_rate": 3.097969309908847e-07, "logits/chosen": -2.383237838745117, "logits/rejected": -2.433600902557373, "logps/chosen": -188.96987915039062, "logps/rejected": -172.08071899414062, "loss": 408.2894, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 50.24271774291992, "rewards/margins": 15.582735061645508, "rewards/rejected": 34.65998077392578, "step": 2450 }, { "epoch": 0.48282630029440626, "grad_norm": 5357.565333054472, "learning_rate": 3.081322891138382e-07, "logits/chosen": -2.651235818862915, "logits/rejected": -2.6922688484191895, "logps/chosen": -265.7408142089844, "logps/rejected": -224.93313598632812, "loss": 480.8474, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 39.43803787231445, "rewards/margins": 2.3424158096313477, "rewards/rejected": 37.095619201660156, "step": 2460 }, { "epoch": 0.4847890088321884, "grad_norm": 4624.913071085645, "learning_rate": 3.0646491802781514e-07, "logits/chosen": -2.5180201530456543, "logits/rejected": -2.3734679222106934, "logps/chosen": -233.11752319335938, "logps/rejected": -151.3263702392578, "loss": 458.9539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 33.529136657714844, "rewards/margins": 4.377078056335449, "rewards/rejected": 29.152053833007812, "step": 2470 }, { "epoch": 0.4867517173699706, "grad_norm": 3849.641695370538, "learning_rate": 3.047948960129624e-07, "logits/chosen": -2.4560704231262207, "logits/rejected": -2.467038631439209, "logps/chosen": -174.33287048339844, "logps/rejected": -179.1482696533203, "loss": 425.778, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 35.16417694091797, "rewards/margins": -3.221940517425537, "rewards/rejected": 38.38611602783203, "step": 2480 }, { "epoch": 0.4887144259077527, "grad_norm": 4474.0585982753055, "learning_rate": 3.0312230147388334e-07, "logits/chosen": -2.7361364364624023, "logits/rejected": -2.6660802364349365, "logps/chosen": -265.2712097167969, "logps/rejected": -250.2879638671875, "loss": 499.5539, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 43.949073791503906, "rewards/margins": -13.000185012817383, "rewards/rejected": 56.94926071166992, "step": 2490 }, { "epoch": 0.49067713444553485, "grad_norm": 4876.035584744987, "learning_rate": 3.01447212935957e-07, "logits/chosen": -2.598324775695801, "logits/rejected": -2.6883630752563477, "logps/chosen": -192.37045288085938, "logps/rejected": -199.43600463867188, "loss": 446.1502, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 34.07781219482422, "rewards/margins": -17.534502029418945, "rewards/rejected": 51.61231231689453, "step": 2500 }, { "epoch": 0.492639842983317, "grad_norm": 4593.399795935327, "learning_rate": 2.9976970904165104e-07, "logits/chosen": -2.8025119304656982, "logits/rejected": -2.6266629695892334, "logps/chosen": -341.9221496582031, "logps/rejected": -253.7430419921875, "loss": 466.9557, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 43.02267837524414, "rewards/margins": 2.0816879272460938, "rewards/rejected": 40.94099426269531, "step": 2510 }, { "epoch": 0.4946025515210991, "grad_norm": 5402.563955263948, "learning_rate": 2.980898685468301e-07, "logits/chosen": -2.6228396892547607, "logits/rejected": -2.5475149154663086, "logps/chosen": -258.35101318359375, "logps/rejected": -198.5154266357422, "loss": 441.9007, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 41.29050064086914, "rewards/margins": 7.081448554992676, "rewards/rejected": 34.20905303955078, "step": 2520 }, { "epoch": 0.49656526005888124, "grad_norm": 4280.269687459939, "learning_rate": 2.96407770317058e-07, "logits/chosen": -2.4809489250183105, "logits/rejected": -2.4302022457122803, "logps/chosen": -176.412353515625, "logps/rejected": -169.72401428222656, "loss": 423.0802, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 31.984704971313477, "rewards/margins": -6.700386047363281, "rewards/rejected": 38.685096740722656, "step": 2530 }, { "epoch": 0.4985279685966634, "grad_norm": 4237.256524133457, "learning_rate": 2.9472349332389523e-07, "logits/chosen": -2.536940574645996, "logits/rejected": -2.4029650688171387, "logps/chosen": -257.356689453125, "logps/rejected": -158.67816162109375, "loss": 460.7807, "rewards/accuracies": 0.6333332657814026, "rewards/chosen": 36.0179443359375, "rewards/margins": -4.86074686050415, "rewards/rejected": 40.87868881225586, "step": 2540 }, { "epoch": 0.5004906771344455, "grad_norm": 4282.243026261946, "learning_rate": 2.930371166411915e-07, "logits/chosen": -2.752068042755127, "logits/rejected": -2.6536898612976074, "logps/chosen": -276.26544189453125, "logps/rejected": -274.9800720214844, "loss": 461.675, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 38.50434494018555, "rewards/margins": 1.045363426208496, "rewards/rejected": 37.458984375, "step": 2550 }, { "epoch": 0.5024533856722276, "grad_norm": 4716.421144282486, "learning_rate": 2.913487194413731e-07, "logits/chosen": -2.5679516792297363, "logits/rejected": -2.6082217693328857, "logps/chosen": -239.5904541015625, "logps/rejected": -245.8770294189453, "loss": 461.8308, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 43.07164001464844, "rewards/margins": 9.934675216674805, "rewards/rejected": 33.136966705322266, "step": 2560 }, { "epoch": 0.5044160942100098, "grad_norm": 4660.636941487216, "learning_rate": 2.896583809917262e-07, "logits/chosen": -2.5704874992370605, "logits/rejected": -2.527533769607544, "logps/chosen": -186.18045043945312, "logps/rejected": -185.8052215576172, "loss": 423.5537, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 38.3198356628418, "rewards/margins": -0.8927882313728333, "rewards/rejected": 39.212623596191406, "step": 2570 }, { "epoch": 0.5063788027477919, "grad_norm": 5162.821735056709, "learning_rate": 2.879661806506751e-07, "logits/chosen": -2.5468087196350098, "logits/rejected": -2.4348654747009277, "logps/chosen": -238.07852172851562, "logps/rejected": -274.9657287597656, "loss": 453.0871, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 34.86083221435547, "rewards/margins": 0.16193465888500214, "rewards/rejected": 34.698890686035156, "step": 2580 }, { "epoch": 0.5083415112855741, "grad_norm": 4605.722636207217, "learning_rate": 2.86272197864057e-07, "logits/chosen": -2.7966628074645996, "logits/rejected": -2.7884743213653564, "logps/chosen": -282.0419921875, "logps/rejected": -219.54385375976562, "loss": 516.6672, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 38.49335479736328, "rewards/margins": 3.876373767852783, "rewards/rejected": 34.61698532104492, "step": 2590 }, { "epoch": 0.5103042198233563, "grad_norm": 4389.535360502405, "learning_rate": 2.845765121613912e-07, "logits/chosen": -2.5735490322113037, "logits/rejected": -2.522249937057495, "logps/chosen": -251.6597900390625, "logps/rejected": -150.88699340820312, "loss": 437.7409, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 43.90034484863281, "rewards/margins": -4.054275989532471, "rewards/rejected": 47.954627990722656, "step": 2600 }, { "epoch": 0.5122669283611384, "grad_norm": 5505.17046958691, "learning_rate": 2.828792031521464e-07, "logits/chosen": -2.60178542137146, "logits/rejected": -2.643982172012329, "logps/chosen": -269.0943298339844, "logps/rejected": -254.6349639892578, "loss": 477.3345, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 45.671730041503906, "rewards/margins": 5.145081043243408, "rewards/rejected": 40.52665328979492, "step": 2610 }, { "epoch": 0.5142296368989205, "grad_norm": 5794.100559420684, "learning_rate": 2.811803505220025e-07, "logits/chosen": -2.621931552886963, "logits/rejected": -2.460784435272217, "logps/chosen": -207.79434204101562, "logps/rejected": -173.03713989257812, "loss": 446.7151, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 55.721534729003906, "rewards/margins": 21.065649032592773, "rewards/rejected": 34.6558837890625, "step": 2620 }, { "epoch": 0.5161923454367027, "grad_norm": 4998.841167117095, "learning_rate": 2.7948003402910975e-07, "logits/chosen": -2.6204910278320312, "logits/rejected": -2.52304744720459, "logps/chosen": -263.1111755371094, "logps/rejected": -216.44528198242188, "loss": 435.7576, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 34.93376922607422, "rewards/margins": -2.444157123565674, "rewards/rejected": 37.37792205810547, "step": 2630 }, { "epoch": 0.5181550539744848, "grad_norm": 4647.687919322091, "learning_rate": 2.777783335003442e-07, "logits/chosen": -2.744004249572754, "logits/rejected": -2.6306076049804688, "logps/chosen": -273.3846130371094, "logps/rejected": -217.7678985595703, "loss": 451.462, "rewards/accuracies": 0.36666664481163025, "rewards/chosen": 38.19150161743164, "rewards/margins": -4.022319793701172, "rewards/rejected": 42.21382141113281, "step": 2640 }, { "epoch": 0.5201177625122669, "grad_norm": 4640.001302888592, "learning_rate": 2.760753288275598e-07, "logits/chosen": -2.6173348426818848, "logits/rejected": -2.473811149597168, "logps/chosen": -217.0681610107422, "logps/rejected": -221.2267303466797, "loss": 402.598, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 39.30550003051758, "rewards/margins": 7.629674434661865, "rewards/rejected": 31.675817489624023, "step": 2650 }, { "epoch": 0.5220804710500491, "grad_norm": 4162.675353293604, "learning_rate": 2.7437109996383795e-07, "logits/chosen": -2.538184642791748, "logits/rejected": -2.4441897869110107, "logps/chosen": -217.38949584960938, "logps/rejected": -181.7127227783203, "loss": 473.9091, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 38.0445556640625, "rewards/margins": 9.555212020874023, "rewards/rejected": 28.48933982849121, "step": 2660 }, { "epoch": 0.5240431795878312, "grad_norm": 4733.653275267734, "learning_rate": 2.7266572691973365e-07, "logits/chosen": -2.7847039699554443, "logits/rejected": -2.7330451011657715, "logps/chosen": -292.99713134765625, "logps/rejected": -241.0466766357422, "loss": 466.7176, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 40.643089294433594, "rewards/margins": 4.975247859954834, "rewards/rejected": 35.6678466796875, "step": 2670 }, { "epoch": 0.5260058881256133, "grad_norm": 4359.262818451264, "learning_rate": 2.709592897595191e-07, "logits/chosen": -2.5485806465148926, "logits/rejected": -2.4584767818450928, "logps/chosen": -237.18295288085938, "logps/rejected": -147.65628051757812, "loss": 427.6604, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 38.61459732055664, "rewards/margins": 0.6483393907546997, "rewards/rejected": 37.96625518798828, "step": 2680 }, { "epoch": 0.5279685966633955, "grad_norm": 5233.246097927523, "learning_rate": 2.6925186859742494e-07, "logits/chosen": -2.6418886184692383, "logits/rejected": -2.6851634979248047, "logps/chosen": -221.16012573242188, "logps/rejected": -182.71084594726562, "loss": 468.212, "rewards/accuracies": 0.36666664481163025, "rewards/chosen": 36.26055145263672, "rewards/margins": -8.975361824035645, "rewards/rejected": 45.23591232299805, "step": 2690 }, { "epoch": 0.5299313052011776, "grad_norm": 4190.663619639473, "learning_rate": 2.675435435938788e-07, "logits/chosen": -2.690380811691284, "logits/rejected": -2.6410889625549316, "logps/chosen": -280.4224548339844, "logps/rejected": -229.3972625732422, "loss": 422.0635, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 35.18825149536133, "rewards/margins": 8.508127212524414, "rewards/rejected": 26.680124282836914, "step": 2700 }, { "epoch": 0.5318940137389597, "grad_norm": 4756.840304078283, "learning_rate": 2.6583439495174247e-07, "logits/chosen": -2.7346031665802, "logits/rejected": -2.6536295413970947, "logps/chosen": -249.3466339111328, "logps/rejected": -198.13674926757812, "loss": 464.5689, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 41.55207443237305, "rewards/margins": -0.3666302561759949, "rewards/rejected": 41.918704986572266, "step": 2710 }, { "epoch": 0.5338567222767419, "grad_norm": 4667.309132532129, "learning_rate": 2.6412450291254564e-07, "logits/chosen": -2.5879178047180176, "logits/rejected": -2.5420918464660645, "logps/chosen": -259.8382263183594, "logps/rejected": -201.00717163085938, "loss": 459.5845, "rewards/accuracies": 0.5, "rewards/chosen": 31.532434463500977, "rewards/margins": -8.552940368652344, "rewards/rejected": 40.08536911010742, "step": 2720 }, { "epoch": 0.535819430814524, "grad_norm": 4748.499403267251, "learning_rate": 2.6241394775271954e-07, "logits/chosen": -2.5561461448669434, "logits/rejected": -2.4587976932525635, "logps/chosen": -221.2384490966797, "logps/rejected": -198.52723693847656, "loss": 479.5985, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 39.287330627441406, "rewards/margins": 8.734041213989258, "rewards/rejected": 30.553293228149414, "step": 2730 }, { "epoch": 0.5377821393523062, "grad_norm": 4533.5320344006905, "learning_rate": 2.607028097798276e-07, "logits/chosen": -2.515784740447998, "logits/rejected": -2.4613964557647705, "logps/chosen": -244.35098266601562, "logps/rejected": -255.76876831054688, "loss": 449.5435, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 41.61980438232422, "rewards/margins": 4.5574846267700195, "rewards/rejected": 37.06231689453125, "step": 2740 }, { "epoch": 0.5397448478900884, "grad_norm": 4306.144339979795, "learning_rate": 2.5899116932879534e-07, "logits/chosen": -2.5324409008026123, "logits/rejected": -2.4598164558410645, "logps/chosen": -163.7893524169922, "logps/rejected": -178.0715789794922, "loss": 432.9026, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 36.10814666748047, "rewards/margins": 5.678139686584473, "rewards/rejected": 30.430007934570312, "step": 2750 }, { "epoch": 0.5417075564278705, "grad_norm": 4113.54374124532, "learning_rate": 2.5727910675813866e-07, "logits/chosen": -2.435154438018799, "logits/rejected": -2.542113780975342, "logps/chosen": -212.41983032226562, "logps/rejected": -228.49526977539062, "loss": 470.3569, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 40.49172592163086, "rewards/margins": -3.3083176612854004, "rewards/rejected": 43.80004119873047, "step": 2760 }, { "epoch": 0.5436702649656526, "grad_norm": 5349.584082641944, "learning_rate": 2.555667024461915e-07, "logits/chosen": -2.6058411598205566, "logits/rejected": -2.6284537315368652, "logps/chosen": -194.70742797851562, "logps/rejected": -217.4590301513672, "loss": 459.6099, "rewards/accuracies": 0.5, "rewards/chosen": 37.93566131591797, "rewards/margins": -10.541536331176758, "rewards/rejected": 48.477195739746094, "step": 2770 }, { "epoch": 0.5456329735034348, "grad_norm": 4703.226204053684, "learning_rate": 2.5385403678733157e-07, "logits/chosen": -2.5422253608703613, "logits/rejected": -2.514312505722046, "logps/chosen": -158.90664672851562, "logps/rejected": -199.33143615722656, "loss": 419.9885, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 40.289405822753906, "rewards/margins": 8.071573257446289, "rewards/rejected": 32.21782684326172, "step": 2780 }, { "epoch": 0.5475956820412169, "grad_norm": 3944.7102508190455, "learning_rate": 2.521411901882067e-07, "logits/chosen": -2.628328800201416, "logits/rejected": -2.554342746734619, "logps/chosen": -226.6717529296875, "logps/rejected": -170.31246948242188, "loss": 441.2466, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 36.1329231262207, "rewards/margins": 0.4193221926689148, "rewards/rejected": 35.713600158691406, "step": 2790 }, { "epoch": 0.549558390578999, "grad_norm": 3885.3308770567787, "learning_rate": 2.504282430639594e-07, "logits/chosen": -2.6190319061279297, "logits/rejected": -2.5678019523620605, "logps/chosen": -174.28733825683594, "logps/rejected": -176.26931762695312, "loss": 442.4453, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 33.87175369262695, "rewards/margins": 5.558090686798096, "rewards/rejected": 28.31365966796875, "step": 2800 }, { "epoch": 0.5515210991167812, "grad_norm": 5218.016929324244, "learning_rate": 2.4871527583445163e-07, "logits/chosen": -2.665811538696289, "logits/rejected": -2.6195101737976074, "logps/chosen": -276.04541015625, "logps/rejected": -239.3740234375, "loss": 430.998, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 33.38459014892578, "rewards/margins": -0.7618095278739929, "rewards/rejected": 34.14639663696289, "step": 2810 }, { "epoch": 0.5534838076545633, "grad_norm": 4109.24398892843, "learning_rate": 2.470023689204893e-07, "logits/chosen": -2.6391077041625977, "logits/rejected": -2.6396422386169434, "logps/chosen": -252.8202362060547, "logps/rejected": -239.5262451171875, "loss": 420.4567, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 41.7357177734375, "rewards/margins": 1.3734490871429443, "rewards/rejected": 40.362266540527344, "step": 2820 }, { "epoch": 0.5554465161923454, "grad_norm": 4059.341902081399, "learning_rate": 2.452896027400465e-07, "logits/chosen": -2.72487211227417, "logits/rejected": -2.6438424587249756, "logps/chosen": -234.49044799804688, "logps/rejected": -237.9053192138672, "loss": 470.4413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 46.0778923034668, "rewards/margins": 1.6294276714324951, "rewards/rejected": 44.448463439941406, "step": 2830 }, { "epoch": 0.5574092247301276, "grad_norm": 5062.330068814192, "learning_rate": 2.4357705770449046e-07, "logits/chosen": -2.505913257598877, "logits/rejected": -2.5338852405548096, "logps/chosen": -215.6706085205078, "logps/rejected": -204.14747619628906, "loss": 421.4331, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 34.1905632019043, "rewards/margins": 0.7007572054862976, "rewards/rejected": 33.48980712890625, "step": 2840 }, { "epoch": 0.5593719332679097, "grad_norm": 4383.801364110385, "learning_rate": 2.418648142148056e-07, "logits/chosen": -2.4966864585876465, "logits/rejected": -2.526427984237671, "logps/chosen": -246.0478515625, "logps/rejected": -179.22616577148438, "loss": 456.8376, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 37.7618293762207, "rewards/margins": -0.15580138564109802, "rewards/rejected": 37.91763687133789, "step": 2850 }, { "epoch": 0.5613346418056918, "grad_norm": 3793.43585256964, "learning_rate": 2.4015295265781966e-07, "logits/chosen": -2.387354850769043, "logits/rejected": -2.3449103832244873, "logps/chosen": -254.6068878173828, "logps/rejected": -263.32269287109375, "loss": 454.3401, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 43.64141845703125, "rewards/margins": -1.6728923320770264, "rewards/rejected": 45.31431198120117, "step": 2860 }, { "epoch": 0.563297350343474, "grad_norm": 4384.572825733482, "learning_rate": 2.3844155340242893e-07, "logits/chosen": -2.5381031036376953, "logits/rejected": -2.506338596343994, "logps/chosen": -176.04165649414062, "logps/rejected": -180.90724182128906, "loss": 395.7169, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 40.10258102416992, "rewards/margins": 2.170680522918701, "rewards/rejected": 37.931907653808594, "step": 2870 }, { "epoch": 0.5652600588812562, "grad_norm": 6019.272847984498, "learning_rate": 2.36730696795826e-07, "logits/chosen": -2.772923469543457, "logits/rejected": -2.754718065261841, "logps/chosen": -205.79281616210938, "logps/rejected": -298.2911071777344, "loss": 514.6727, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 34.41475296020508, "rewards/margins": -3.1085028648376465, "rewards/rejected": 37.52325439453125, "step": 2880 }, { "epoch": 0.5672227674190383, "grad_norm": 4282.95508081811, "learning_rate": 2.3502046315972655e-07, "logits/chosen": -2.614781141281128, "logits/rejected": -2.548280715942383, "logps/chosen": -272.5713195800781, "logps/rejected": -253.4302520751953, "loss": 470.3598, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 46.17670440673828, "rewards/margins": 5.820972442626953, "rewards/rejected": 40.355735778808594, "step": 2890 }, { "epoch": 0.5691854759568205, "grad_norm": 4430.8877672860535, "learning_rate": 2.3331093278659906e-07, "logits/chosen": -2.634147882461548, "logits/rejected": -2.641078472137451, "logps/chosen": -258.8018798828125, "logps/rejected": -241.8573760986328, "loss": 461.3288, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 49.79001235961914, "rewards/margins": 7.490835666656494, "rewards/rejected": 42.29917526245117, "step": 2900 }, { "epoch": 0.5711481844946026, "grad_norm": 4962.723571464093, "learning_rate": 2.31602185935895e-07, "logits/chosen": -2.749091386795044, "logits/rejected": -2.6512725353240967, "logps/chosen": -248.509033203125, "logps/rejected": -194.28793334960938, "loss": 441.0518, "rewards/accuracies": 0.5, "rewards/chosen": 35.29741668701172, "rewards/margins": 6.335047721862793, "rewards/rejected": 28.96236801147461, "step": 2910 }, { "epoch": 0.5731108930323847, "grad_norm": 4110.409170118379, "learning_rate": 2.298943028302811e-07, "logits/chosen": -2.7444474697113037, "logits/rejected": -2.6776154041290283, "logps/chosen": -255.79269409179688, "logps/rejected": -257.0079650878906, "loss": 463.8671, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 33.684104919433594, "rewards/margins": -20.564167022705078, "rewards/rejected": 54.24827194213867, "step": 2920 }, { "epoch": 0.5750736015701668, "grad_norm": 3637.573540955409, "learning_rate": 2.2818736365187242e-07, "logits/chosen": -2.6612980365753174, "logits/rejected": -2.625981330871582, "logps/chosen": -194.0688934326172, "logps/rejected": -154.44122314453125, "loss": 375.4742, "rewards/accuracies": 0.5, "rewards/chosen": 33.018577575683594, "rewards/margins": -4.529515266418457, "rewards/rejected": 37.548099517822266, "step": 2930 }, { "epoch": 0.577036310107949, "grad_norm": 4580.753048080811, "learning_rate": 2.2648144853846847e-07, "logits/chosen": -2.6012885570526123, "logits/rejected": -2.56108021736145, "logps/chosen": -231.09231567382812, "logps/rejected": -221.6013641357422, "loss": 468.9104, "rewards/accuracies": 0.5, "rewards/chosen": 28.090499877929688, "rewards/margins": -15.818713188171387, "rewards/rejected": 43.909210205078125, "step": 2940 }, { "epoch": 0.5789990186457311, "grad_norm": 4972.307690153018, "learning_rate": 2.247766375797906e-07, "logits/chosen": -2.586660861968994, "logits/rejected": -2.606600522994995, "logps/chosen": -156.38302612304688, "logps/rejected": -165.44100952148438, "loss": 493.1104, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 35.012245178222656, "rewards/margins": -4.334465980529785, "rewards/rejected": 39.34671401977539, "step": 2950 }, { "epoch": 0.5809617271835132, "grad_norm": 3767.4283177101634, "learning_rate": 2.2307301081372222e-07, "logits/chosen": -2.544673204421997, "logits/rejected": -2.5878868103027344, "logps/chosen": -215.91079711914062, "logps/rejected": -239.40243530273438, "loss": 379.2808, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 41.98701858520508, "rewards/margins": 5.5673136711120605, "rewards/rejected": 36.41970443725586, "step": 2960 }, { "epoch": 0.5829244357212954, "grad_norm": 4987.044244288877, "learning_rate": 2.2137064822255086e-07, "logits/chosen": -2.6021924018859863, "logits/rejected": -2.5253403186798096, "logps/chosen": -181.9603729248047, "logps/rejected": -176.57830810546875, "loss": 393.5017, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 38.797855377197266, "rewards/margins": -0.5924438238143921, "rewards/rejected": 39.39030075073242, "step": 2970 }, { "epoch": 0.5848871442590775, "grad_norm": 4962.65706927889, "learning_rate": 2.1966962972921322e-07, "logits/chosen": -2.6079602241516113, "logits/rejected": -2.55991792678833, "logps/chosen": -200.67337036132812, "logps/rejected": -234.6151885986328, "loss": 492.385, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 36.16741180419922, "rewards/margins": 1.037660837173462, "rewards/rejected": 35.12975311279297, "step": 2980 }, { "epoch": 0.5868498527968596, "grad_norm": 5511.373165062106, "learning_rate": 2.1797003519354285e-07, "logits/chosen": -2.632431745529175, "logits/rejected": -2.6114015579223633, "logps/chosen": -224.2974090576172, "logps/rejected": -224.2973175048828, "loss": 458.8507, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 37.5731201171875, "rewards/margins": -1.473892331123352, "rewards/rejected": 39.04701614379883, "step": 2990 }, { "epoch": 0.5888125613346418, "grad_norm": 5348.887293650911, "learning_rate": 2.1627194440852142e-07, "logits/chosen": -2.4771065711975098, "logits/rejected": -2.5486245155334473, "logps/chosen": -254.0469207763672, "logps/rejected": -229.03182983398438, "loss": 479.329, "rewards/accuracies": 0.533333420753479, "rewards/chosen": 39.728477478027344, "rewards/margins": 10.570646286010742, "rewards/rejected": 29.157833099365234, "step": 3000 }, { "epoch": 0.5907752698724239, "grad_norm": 4409.821263587961, "learning_rate": 2.1457543709653176e-07, "logits/chosen": -2.642092227935791, "logits/rejected": -2.6010630130767822, "logps/chosen": -236.6316680908203, "logps/rejected": -216.1038055419922, "loss": 456.4789, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 37.16182327270508, "rewards/margins": 1.520094633102417, "rewards/rejected": 35.641727447509766, "step": 3010 }, { "epoch": 0.592737978410206, "grad_norm": 4895.273800471361, "learning_rate": 2.128805929056154e-07, "logits/chosen": -2.605106830596924, "logits/rejected": -2.6045784950256348, "logps/chosen": -153.69691467285156, "logps/rejected": -166.0969696044922, "loss": 438.2885, "rewards/accuracies": 0.5, "rewards/chosen": 27.13759994506836, "rewards/margins": -1.3463178873062134, "rewards/rejected": 28.48392105102539, "step": 3020 }, { "epoch": 0.5947006869479883, "grad_norm": 4532.808190246022, "learning_rate": 2.1118749140573358e-07, "logits/chosen": -2.6185317039489746, "logits/rejected": -2.553497552871704, "logps/chosen": -218.6675262451172, "logps/rejected": -216.93447875976562, "loss": 430.802, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 35.49135208129883, "rewards/margins": 1.2050817012786865, "rewards/rejected": 34.28627014160156, "step": 3030 }, { "epoch": 0.5966633954857704, "grad_norm": 4602.001983182268, "learning_rate": 2.0949621208503092e-07, "logits/chosen": -2.5064640045166016, "logits/rejected": -2.5191729068756104, "logps/chosen": -250.8184356689453, "logps/rejected": -188.92161560058594, "loss": 459.5887, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 49.90302276611328, "rewards/margins": 17.360546112060547, "rewards/rejected": 32.54247283935547, "step": 3040 }, { "epoch": 0.5986261040235525, "grad_norm": 3721.707073522762, "learning_rate": 2.0780683434610413e-07, "logits/chosen": -2.5521321296691895, "logits/rejected": -2.5300755500793457, "logps/chosen": -205.0784912109375, "logps/rejected": -224.0399627685547, "loss": 426.4348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 35.36996841430664, "rewards/margins": 4.673448085784912, "rewards/rejected": 30.696521759033203, "step": 3050 }, { "epoch": 0.6005888125613347, "grad_norm": 5192.749971428991, "learning_rate": 2.0611943750227375e-07, "logits/chosen": -2.5217807292938232, "logits/rejected": -2.4791207313537598, "logps/chosen": -219.89437866210938, "logps/rejected": -198.4583282470703, "loss": 433.7738, "rewards/accuracies": 0.43333330750465393, "rewards/chosen": 42.70241165161133, "rewards/margins": -2.064373016357422, "rewards/rejected": 44.76677703857422, "step": 3060 }, { "epoch": 0.6025515210991168, "grad_norm": 5461.744696463223, "learning_rate": 2.044341007738612e-07, "logits/chosen": -2.6488800048828125, "logits/rejected": -2.5429649353027344, "logps/chosen": -296.5935363769531, "logps/rejected": -253.7562255859375, "loss": 420.5087, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 54.2684326171875, "rewards/margins": 10.449033737182617, "rewards/rejected": 43.819400787353516, "step": 3070 }, { "epoch": 0.6045142296368989, "grad_norm": 4070.9126786852544, "learning_rate": 2.027509032844687e-07, "logits/chosen": -2.783871650695801, "logits/rejected": -2.8232202529907227, "logps/chosen": -271.37237548828125, "logps/rejected": -303.91717529296875, "loss": 513.612, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 48.6921501159668, "rewards/margins": 0.8832836151123047, "rewards/rejected": 47.808868408203125, "step": 3080 }, { "epoch": 0.6064769381746811, "grad_norm": 4759.790843261946, "learning_rate": 2.010699240572651e-07, "logits/chosen": -2.656386613845825, "logits/rejected": -2.7077832221984863, "logps/chosen": -314.8296203613281, "logps/rejected": -276.51123046875, "loss": 542.8778, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 44.121734619140625, "rewards/margins": -4.4381608963012695, "rewards/rejected": 48.559898376464844, "step": 3090 }, { "epoch": 0.6084396467124632, "grad_norm": 4163.665532803295, "learning_rate": 1.993912420112756e-07, "logits/chosen": -2.493317127227783, "logits/rejected": -2.449044704437256, "logps/chosen": -256.8260192871094, "logps/rejected": -284.9006042480469, "loss": 460.4679, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 66.61991882324219, "rewards/margins": -32.62569808959961, "rewards/rejected": 99.24561309814453, "step": 3100 }, { "epoch": 0.6104023552502453, "grad_norm": 11937.020377702016, "learning_rate": 1.9771493595767707e-07, "logits/chosen": -2.5441012382507324, "logits/rejected": -2.5751733779907227, "logps/chosen": -247.4366912841797, "logps/rejected": -298.6747741699219, "loss": 441.9069, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 38.246559143066406, "rewards/margins": 10.147356986999512, "rewards/rejected": 28.09920310974121, "step": 3110 }, { "epoch": 0.6123650637880275, "grad_norm": 4595.713922683333, "learning_rate": 1.9604108459609752e-07, "logits/chosen": -2.616608142852783, "logits/rejected": -2.595402240753174, "logps/chosen": -271.9317932128906, "logps/rejected": -279.56298828125, "loss": 430.3151, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 47.826576232910156, "rewards/margins": 9.75730037689209, "rewards/rejected": 38.069278717041016, "step": 3120 }, { "epoch": 0.6143277723258096, "grad_norm": 4013.64311777735, "learning_rate": 1.9436976651092142e-07, "logits/chosen": -2.5802206993103027, "logits/rejected": -2.518826961517334, "logps/chosen": -244.60006713867188, "logps/rejected": -230.1786346435547, "loss": 482.6275, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 64.09306335449219, "rewards/margins": -5.149061679840088, "rewards/rejected": 69.24212646484375, "step": 3130 }, { "epoch": 0.6162904808635917, "grad_norm": 4316.587871257736, "learning_rate": 1.9270106016760035e-07, "logits/chosen": -2.676774263381958, "logits/rejected": -2.6333038806915283, "logps/chosen": -221.6033172607422, "logps/rejected": -231.56314086914062, "loss": 388.8536, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 42.179222106933594, "rewards/margins": 0.9718021154403687, "rewards/rejected": 41.207420349121094, "step": 3140 }, { "epoch": 0.6182531894013739, "grad_norm": 4599.885600859548, "learning_rate": 1.9103504390896944e-07, "logits/chosen": -2.6174519062042236, "logits/rejected": -2.568662643432617, "logps/chosen": -190.2574462890625, "logps/rejected": -243.7998046875, "loss": 433.49, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 33.63319778442383, "rewards/margins": -8.475696563720703, "rewards/rejected": 42.10889434814453, "step": 3150 }, { "epoch": 0.620215897939156, "grad_norm": 4708.710117073037, "learning_rate": 1.8937179595156876e-07, "logits/chosen": -2.6824817657470703, "logits/rejected": -2.5376830101013184, "logps/chosen": -236.5217742919922, "logps/rejected": -173.81320190429688, "loss": 472.0527, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 42.436920166015625, "rewards/margins": 7.270151615142822, "rewards/rejected": 35.166770935058594, "step": 3160 }, { "epoch": 0.6221786064769381, "grad_norm": 5239.948622278977, "learning_rate": 1.8771139438197168e-07, "logits/chosen": -2.6610610485076904, "logits/rejected": -2.5386433601379395, "logps/chosen": -254.28652954101562, "logps/rejected": -267.49725341796875, "loss": 515.909, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 37.86225509643555, "rewards/margins": 5.88637638092041, "rewards/rejected": 31.975879669189453, "step": 3170 }, { "epoch": 0.6241413150147204, "grad_norm": 3644.418786384365, "learning_rate": 1.8605391715311846e-07, "logits/chosen": -2.4733784198760986, "logits/rejected": -2.344364881515503, "logps/chosen": -246.8455352783203, "logps/rejected": -174.06283569335938, "loss": 437.8695, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 38.20279312133789, "rewards/margins": 5.049153804779053, "rewards/rejected": 33.15364074707031, "step": 3180 }, { "epoch": 0.6261040235525025, "grad_norm": 5441.213028983494, "learning_rate": 1.8439944208065704e-07, "logits/chosen": -2.672022819519043, "logits/rejected": -2.6368296146392822, "logps/chosen": -299.55938720703125, "logps/rejected": -291.67816162109375, "loss": 476.3895, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 50.05424880981445, "rewards/margins": 15.180732727050781, "rewards/rejected": 34.8735237121582, "step": 3190 }, { "epoch": 0.6280667320902846, "grad_norm": 4261.357128597478, "learning_rate": 1.8274804683928913e-07, "logits/chosen": -2.6394951343536377, "logits/rejected": -2.5230634212493896, "logps/chosen": -291.575439453125, "logps/rejected": -241.4493865966797, "loss": 480.2888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 41.28780746459961, "rewards/margins": -0.9671966433525085, "rewards/rejected": 42.2550048828125, "step": 3200 }, { "epoch": 0.6300294406280668, "grad_norm": 4478.453528554573, "learning_rate": 1.810998089591238e-07, "logits/chosen": -2.676175117492676, "logits/rejected": -2.6173923015594482, "logps/chosen": -212.4101104736328, "logps/rejected": -218.7488250732422, "loss": 445.4913, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 40.66368865966797, "rewards/margins": 0.6559812426567078, "rewards/rejected": 40.00770950317383, "step": 3210 }, { "epoch": 0.6319921491658489, "grad_norm": 4539.001330624233, "learning_rate": 1.7945480582203745e-07, "logits/chosen": -2.58876371383667, "logits/rejected": -2.568467378616333, "logps/chosen": -198.20535278320312, "logps/rejected": -231.5351104736328, "loss": 403.5555, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 50.637123107910156, "rewards/margins": 14.340398788452148, "rewards/rejected": 36.296722412109375, "step": 3220 }, { "epoch": 0.633954857703631, "grad_norm": 4529.697661512652, "learning_rate": 1.7781311465804128e-07, "logits/chosen": -2.485405445098877, "logits/rejected": -2.449179172515869, "logps/chosen": -214.7068634033203, "logps/rejected": -192.54421997070312, "loss": 431.5271, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 64.39295196533203, "rewards/margins": 23.37224006652832, "rewards/rejected": 41.020721435546875, "step": 3230 }, { "epoch": 0.6359175662414132, "grad_norm": 5177.31263434082, "learning_rate": 1.7617481254165487e-07, "logits/chosen": -2.5449860095977783, "logits/rejected": -2.557673931121826, "logps/chosen": -207.82638549804688, "logps/rejected": -189.16799926757812, "loss": 477.9846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 63.777488708496094, "rewards/margins": 30.085861206054688, "rewards/rejected": 33.69163131713867, "step": 3240 }, { "epoch": 0.6378802747791953, "grad_norm": 4353.921471179031, "learning_rate": 1.745399763882881e-07, "logits/chosen": -2.573568344116211, "logits/rejected": -2.520761013031006, "logps/chosen": -253.19186401367188, "logps/rejected": -242.68246459960938, "loss": 397.9687, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 36.56116485595703, "rewards/margins": 2.457200527191162, "rewards/rejected": 34.10395812988281, "step": 3250 }, { "epoch": 0.6398429833169774, "grad_norm": 4828.868713875561, "learning_rate": 1.7290868295062983e-07, "logits/chosen": -2.4604735374450684, "logits/rejected": -2.5235681533813477, "logps/chosen": -242.060302734375, "logps/rejected": -246.6624298095703, "loss": 452.2631, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 45.33163070678711, "rewards/margins": 7.067657470703125, "rewards/rejected": 38.263973236083984, "step": 3260 }, { "epoch": 0.6418056918547596, "grad_norm": 5303.634850928562, "learning_rate": 1.7128100881504492e-07, "logits/chosen": -2.565659999847412, "logits/rejected": -2.437257766723633, "logps/chosen": -241.627685546875, "logps/rejected": -187.8494415283203, "loss": 508.0383, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 44.94586181640625, "rewards/margins": 15.400263786315918, "rewards/rejected": 29.54559898376465, "step": 3270 }, { "epoch": 0.6437684003925417, "grad_norm": 4748.199539584447, "learning_rate": 1.6965703039797808e-07, "logits/chosen": -2.548427104949951, "logits/rejected": -2.4462082386016846, "logps/chosen": -266.9375, "logps/rejected": -184.31228637695312, "loss": 455.7013, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 48.49503707885742, "rewards/margins": 5.851919651031494, "rewards/rejected": 42.64311981201172, "step": 3280 }, { "epoch": 0.6457311089303238, "grad_norm": 5059.711030740259, "learning_rate": 1.6803682394236656e-07, "logits/chosen": -2.7422773838043213, "logits/rejected": -2.6419894695281982, "logps/chosen": -285.46710205078125, "logps/rejected": -216.86337280273438, "loss": 411.9077, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": 46.86991500854492, "rewards/margins": -5.114217281341553, "rewards/rejected": 51.984130859375, "step": 3290 }, { "epoch": 0.647693817468106, "grad_norm": 4717.050316429819, "learning_rate": 1.664204655140607e-07, "logits/chosen": -2.5475409030914307, "logits/rejected": -2.5353424549102783, "logps/chosen": -196.85162353515625, "logps/rejected": -225.6861114501953, "loss": 469.1195, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 40.43767547607422, "rewards/margins": -12.499992370605469, "rewards/rejected": 52.93767166137695, "step": 3300 }, { "epoch": 0.6496565260058881, "grad_norm": 4539.935608470393, "learning_rate": 1.6480803099825277e-07, "logits/chosen": -2.6134755611419678, "logits/rejected": -2.5636157989501953, "logps/chosen": -215.1249237060547, "logps/rejected": -166.5946807861328, "loss": 435.5142, "rewards/accuracies": 0.5, "rewards/chosen": 46.504249572753906, "rewards/margins": -8.098384857177734, "rewards/rejected": 54.602638244628906, "step": 3310 }, { "epoch": 0.6516192345436702, "grad_norm": 4318.11502712315, "learning_rate": 1.6319959609591412e-07, "logits/chosen": -2.493683338165283, "logits/rejected": -2.4112517833709717, "logps/chosen": -193.81930541992188, "logps/rejected": -158.67062377929688, "loss": 411.9584, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 48.755897521972656, "rewards/margins": 11.440227508544922, "rewards/rejected": 37.31566619873047, "step": 3320 }, { "epoch": 0.6535819430814525, "grad_norm": 4180.583847225108, "learning_rate": 1.6159523632024126e-07, "logits/chosen": -2.603215456008911, "logits/rejected": -2.5020575523376465, "logps/chosen": -246.5648956298828, "logps/rejected": -274.041259765625, "loss": 465.5974, "rewards/accuracies": 0.5, "rewards/chosen": 42.838783264160156, "rewards/margins": 1.3022468090057373, "rewards/rejected": 41.53654098510742, "step": 3330 }, { "epoch": 0.6555446516192346, "grad_norm": 5364.492058913141, "learning_rate": 1.599950269931107e-07, "logits/chosen": -2.4481101036071777, "logits/rejected": -2.4633541107177734, "logps/chosen": -259.56243896484375, "logps/rejected": -212.51803588867188, "loss": 471.3864, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 36.67453384399414, "rewards/margins": -3.0609450340270996, "rewards/rejected": 39.73548126220703, "step": 3340 }, { "epoch": 0.6575073601570167, "grad_norm": 3839.3877086108932, "learning_rate": 1.5839904324154273e-07, "logits/chosen": -2.592939853668213, "logits/rejected": -2.455808401107788, "logps/chosen": -243.414306640625, "logps/rejected": -233.156982421875, "loss": 460.8823, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 41.609745025634766, "rewards/margins": -20.40592384338379, "rewards/rejected": 62.01567459106445, "step": 3350 }, { "epoch": 0.6594700686947988, "grad_norm": 4412.835972775599, "learning_rate": 1.568073599941742e-07, "logits/chosen": -2.6051926612854004, "logits/rejected": -2.6242880821228027, "logps/chosen": -260.4896545410156, "logps/rejected": -234.35903930664062, "loss": 470.7374, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 38.179622650146484, "rewards/margins": -1.027712345123291, "rewards/rejected": 39.20732879638672, "step": 3360 }, { "epoch": 0.661432777232581, "grad_norm": 3617.8577010382865, "learning_rate": 1.552200519777408e-07, "logits/chosen": -2.6957767009735107, "logits/rejected": -2.5345678329467773, "logps/chosen": -273.77520751953125, "logps/rejected": -219.6286163330078, "loss": 490.3991, "rewards/accuracies": 0.5, "rewards/chosen": 44.90713882446289, "rewards/margins": 16.124292373657227, "rewards/rejected": 28.7828426361084, "step": 3370 }, { "epoch": 0.6633954857703631, "grad_norm": 4019.619244739591, "learning_rate": 1.5363719371356882e-07, "logits/chosen": -2.7795815467834473, "logits/rejected": -2.727719306945801, "logps/chosen": -290.29443359375, "logps/rejected": -190.9752960205078, "loss": 368.1448, "rewards/accuracies": 0.5, "rewards/chosen": 50.465457916259766, "rewards/margins": 13.8067626953125, "rewards/rejected": 36.658695220947266, "step": 3380 }, { "epoch": 0.6653581943081452, "grad_norm": 4725.070966876188, "learning_rate": 1.5205885951407665e-07, "logits/chosen": -2.6008598804473877, "logits/rejected": -2.5928776264190674, "logps/chosen": -214.3909454345703, "logps/rejected": -253.8872833251953, "loss": 430.3905, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 39.41809844970703, "rewards/margins": -9.465060234069824, "rewards/rejected": 48.883155822753906, "step": 3390 }, { "epoch": 0.6673209028459274, "grad_norm": 5097.329526473776, "learning_rate": 1.5048512347928564e-07, "logits/chosen": -2.5902061462402344, "logits/rejected": -2.38393497467041, "logps/chosen": -249.7587432861328, "logps/rejected": -171.68746948242188, "loss": 440.4719, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 38.5333366394043, "rewards/margins": 7.863332271575928, "rewards/rejected": 30.67000389099121, "step": 3400 }, { "epoch": 0.6692836113837095, "grad_norm": 4547.883065698954, "learning_rate": 1.4891605949334133e-07, "logits/chosen": -2.750488042831421, "logits/rejected": -2.6084415912628174, "logps/chosen": -425.6971130371094, "logps/rejected": -345.8495788574219, "loss": 519.4268, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 47.112815856933594, "rewards/margins": 13.372627258300781, "rewards/rejected": 33.74019241333008, "step": 3410 }, { "epoch": 0.6712463199214916, "grad_norm": 5017.880275091979, "learning_rate": 1.4735174122104476e-07, "logits/chosen": -2.5117251873016357, "logits/rejected": -2.4606595039367676, "logps/chosen": -201.7481231689453, "logps/rejected": -164.49876403808594, "loss": 430.2011, "rewards/accuracies": 0.6333332657814026, "rewards/chosen": 37.651527404785156, "rewards/margins": 3.23347544670105, "rewards/rejected": 34.41805648803711, "step": 3420 }, { "epoch": 0.6732090284592738, "grad_norm": 4525.0237736900335, "learning_rate": 1.457922421043943e-07, "logits/chosen": -2.6487231254577637, "logits/rejected": -2.4554855823516846, "logps/chosen": -283.5890197753906, "logps/rejected": -169.73692321777344, "loss": 429.0944, "rewards/accuracies": 0.5, "rewards/chosen": 36.453189849853516, "rewards/margins": 0.8102760314941406, "rewards/rejected": 35.642913818359375, "step": 3430 }, { "epoch": 0.6751717369970559, "grad_norm": 4836.351887681175, "learning_rate": 1.4423763535913704e-07, "logits/chosen": -2.7184793949127197, "logits/rejected": -2.66823148727417, "logps/chosen": -220.481201171875, "logps/rejected": -233.3111572265625, "loss": 444.5051, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 46.962947845458984, "rewards/margins": 12.238215446472168, "rewards/rejected": 34.724735260009766, "step": 3440 }, { "epoch": 0.677134445534838, "grad_norm": 5097.525411981749, "learning_rate": 1.426879939713322e-07, "logits/chosen": -2.6069436073303223, "logits/rejected": -2.58504581451416, "logps/chosen": -262.5929870605469, "logps/rejected": -197.80320739746094, "loss": 414.4149, "rewards/accuracies": 0.5, "rewards/chosen": 41.25318908691406, "rewards/margins": 3.419710874557495, "rewards/rejected": 37.83348083496094, "step": 3450 }, { "epoch": 0.6790971540726202, "grad_norm": 3986.6147298360115, "learning_rate": 1.4114339069392374e-07, "logits/chosen": -2.6958091259002686, "logits/rejected": -2.527467727661133, "logps/chosen": -251.56552124023438, "logps/rejected": -173.15176391601562, "loss": 406.3061, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 45.1236686706543, "rewards/margins": 6.652157783508301, "rewards/rejected": 38.47150802612305, "step": 3460 }, { "epoch": 0.6810598626104023, "grad_norm": 4532.603335528347, "learning_rate": 1.3960389804332556e-07, "logits/chosen": -2.560441493988037, "logits/rejected": -2.5246596336364746, "logps/chosen": -216.03915405273438, "logps/rejected": -249.4767303466797, "loss": 432.2447, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 41.10346221923828, "rewards/margins": -1.274405837059021, "rewards/rejected": 42.377864837646484, "step": 3470 }, { "epoch": 0.6830225711481845, "grad_norm": 4453.197624373022, "learning_rate": 1.380695882960165e-07, "logits/chosen": -2.591632843017578, "logits/rejected": -2.5592544078826904, "logps/chosen": -243.0457763671875, "logps/rejected": -180.34759521484375, "loss": 409.3893, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 29.93951988220215, "rewards/margins": -16.706111907958984, "rewards/rejected": 46.6456298828125, "step": 3480 }, { "epoch": 0.6849852796859667, "grad_norm": 4587.090606395604, "learning_rate": 1.3654053348514702e-07, "logits/chosen": -2.3719887733459473, "logits/rejected": -2.284217357635498, "logps/chosen": -130.6105499267578, "logps/rejected": -172.13636779785156, "loss": 469.2424, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 33.91435623168945, "rewards/margins": 4.379640102386475, "rewards/rejected": 29.534717559814453, "step": 3490 }, { "epoch": 0.6869479882237488, "grad_norm": 4560.865072217876, "learning_rate": 1.350168053971577e-07, "logits/chosen": -2.571834087371826, "logits/rejected": -2.5446648597717285, "logps/chosen": -318.7664794921875, "logps/rejected": -205.4440460205078, "loss": 483.0706, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 43.74156951904297, "rewards/margins": 9.468842506408691, "rewards/rejected": 34.272727966308594, "step": 3500 }, { "epoch": 0.6889106967615309, "grad_norm": 4127.48376874728, "learning_rate": 1.3349847556840876e-07, "logits/chosen": -2.6035072803497314, "logits/rejected": -2.621722936630249, "logps/chosen": -200.86965942382812, "logps/rejected": -229.7669219970703, "loss": 503.4723, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 44.48680877685547, "rewards/margins": 7.581006050109863, "rewards/rejected": 36.905799865722656, "step": 3510 }, { "epoch": 0.6908734052993131, "grad_norm": 4659.940196560349, "learning_rate": 1.3198561528182182e-07, "logits/chosen": -2.5462212562561035, "logits/rejected": -2.5860719680786133, "logps/chosen": -170.47500610351562, "logps/rejected": -178.5261688232422, "loss": 484.3155, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": 28.57163429260254, "rewards/margins": -16.579164505004883, "rewards/rejected": 45.15079879760742, "step": 3520 }, { "epoch": 0.6928361138370952, "grad_norm": 5071.0735487629045, "learning_rate": 1.3047829556353263e-07, "logits/chosen": -2.592815399169922, "logits/rejected": -2.55568265914917, "logps/chosen": -216.5419921875, "logps/rejected": -212.7433624267578, "loss": 467.819, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 36.28511428833008, "rewards/margins": 9.000925064086914, "rewards/rejected": 27.2841854095459, "step": 3530 }, { "epoch": 0.6947988223748773, "grad_norm": 4320.72890254297, "learning_rate": 1.2897658717955742e-07, "logits/chosen": -2.4630746841430664, "logits/rejected": -2.3922531604766846, "logps/chosen": -222.0006866455078, "logps/rejected": -181.97515869140625, "loss": 387.9535, "rewards/accuracies": 0.43333330750465393, "rewards/chosen": 40.32611083984375, "rewards/margins": 8.769170761108398, "rewards/rejected": 31.556941986083984, "step": 3540 }, { "epoch": 0.6967615309126595, "grad_norm": 4726.8806674800735, "learning_rate": 1.2748056063246994e-07, "logits/chosen": -2.6976161003112793, "logits/rejected": -2.6608502864837646, "logps/chosen": -262.3450622558594, "logps/rejected": -241.70654296875, "loss": 478.4425, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 39.3624153137207, "rewards/margins": -8.320429801940918, "rewards/rejected": 47.68284225463867, "step": 3550 }, { "epoch": 0.6987242394504416, "grad_norm": 4601.9229330035905, "learning_rate": 1.2599028615809183e-07, "logits/chosen": -2.582568645477295, "logits/rejected": -2.5583627223968506, "logps/chosen": -257.66522216796875, "logps/rejected": -201.1621856689453, "loss": 453.1743, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 39.114158630371094, "rewards/margins": 1.6954081058502197, "rewards/rejected": 37.41875457763672, "step": 3560 }, { "epoch": 0.7006869479882237, "grad_norm": 3547.0105030627583, "learning_rate": 1.2450583372219458e-07, "logits/chosen": -2.5113353729248047, "logits/rejected": -2.5333168506622314, "logps/chosen": -252.20834350585938, "logps/rejected": -246.343017578125, "loss": 418.4648, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 50.306793212890625, "rewards/margins": 1.4378515481948853, "rewards/rejected": 48.86893844604492, "step": 3570 }, { "epoch": 0.7026496565260059, "grad_norm": 4122.336208978104, "learning_rate": 1.230272730172157e-07, "logits/chosen": -2.4826693534851074, "logits/rejected": -2.4991822242736816, "logps/chosen": -253.10952758789062, "logps/rejected": -263.3420104980469, "loss": 407.0652, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 43.307411193847656, "rewards/margins": -1.0030360221862793, "rewards/rejected": 44.310447692871094, "step": 3580 }, { "epoch": 0.704612365063788, "grad_norm": 4848.095885022227, "learning_rate": 1.2155467345898602e-07, "logits/chosen": -2.607501745223999, "logits/rejected": -2.520416736602783, "logps/chosen": -218.1444854736328, "logps/rejected": -248.6970977783203, "loss": 443.058, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 49.31399917602539, "rewards/margins": -15.503260612487793, "rewards/rejected": 64.81726837158203, "step": 3590 }, { "epoch": 0.7065750736015701, "grad_norm": 3891.8257985300042, "learning_rate": 1.2008810418347093e-07, "logits/chosen": -2.5468451976776123, "logits/rejected": -2.646713972091675, "logps/chosen": -171.98916625976562, "logps/rejected": -169.75364685058594, "loss": 354.3096, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 29.069055557250977, "rewards/margins": -12.974533081054688, "rewards/rejected": 42.04358673095703, "step": 3600 }, { "epoch": 0.7085377821393523, "grad_norm": 4447.461803676008, "learning_rate": 1.1862763404352483e-07, "logits/chosen": -2.733118772506714, "logits/rejected": -2.6008687019348145, "logps/chosen": -274.53125, "logps/rejected": -239.507080078125, "loss": 506.1706, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 39.546661376953125, "rewards/margins": -0.24837017059326172, "rewards/rejected": 39.7950325012207, "step": 3610 }, { "epoch": 0.7105004906771345, "grad_norm": 4505.5930871988285, "learning_rate": 1.1717333160565807e-07, "logits/chosen": -2.662598133087158, "logits/rejected": -2.610295057296753, "logps/chosen": -303.64556884765625, "logps/rejected": -236.7455596923828, "loss": 454.2638, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 54.06207275390625, "rewards/margins": 8.460227966308594, "rewards/rejected": 45.601844787597656, "step": 3620 }, { "epoch": 0.7124631992149166, "grad_norm": 5463.823656137664, "learning_rate": 1.1572526514681874e-07, "logits/chosen": -2.6133432388305664, "logits/rejected": -2.5437867641448975, "logps/chosen": -260.36273193359375, "logps/rejected": -290.7547912597656, "loss": 455.4876, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 37.65696716308594, "rewards/margins": 0.33991679549217224, "rewards/rejected": 37.31705093383789, "step": 3630 }, { "epoch": 0.7144259077526988, "grad_norm": 4995.263176189, "learning_rate": 1.1428350265118613e-07, "logits/chosen": -2.666706085205078, "logits/rejected": -2.551975727081299, "logps/chosen": -272.3623352050781, "logps/rejected": -243.81796264648438, "loss": 491.338, "rewards/accuracies": 0.533333420753479, "rewards/chosen": 44.81690979003906, "rewards/margins": 11.616350173950195, "rewards/rejected": 33.20055389404297, "step": 3640 }, { "epoch": 0.7163886162904809, "grad_norm": 6107.795181914702, "learning_rate": 1.128481118069799e-07, "logits/chosen": -2.6353912353515625, "logits/rejected": -2.429758071899414, "logps/chosen": -231.792724609375, "logps/rejected": -225.7418212890625, "loss": 478.819, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 39.23735427856445, "rewards/margins": -10.274882316589355, "rewards/rejected": 49.51223373413086, "step": 3650 }, { "epoch": 0.718351324828263, "grad_norm": 4559.000157082686, "learning_rate": 1.114191600032815e-07, "logits/chosen": -2.705068588256836, "logits/rejected": -2.5441765785217285, "logps/chosen": -266.17327880859375, "logps/rejected": -232.7379913330078, "loss": 478.844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 45.4412727355957, "rewards/margins": 4.956995964050293, "rewards/rejected": 40.484275817871094, "step": 3660 }, { "epoch": 0.7203140333660452, "grad_norm": 5142.283096753777, "learning_rate": 1.0999671432687099e-07, "logits/chosen": -2.5555663108825684, "logits/rejected": -2.3798253536224365, "logps/chosen": -250.02890014648438, "logps/rejected": -199.82859802246094, "loss": 469.3981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 48.90443801879883, "rewards/margins": 12.09408187866211, "rewards/rejected": 36.810359954833984, "step": 3670 }, { "epoch": 0.7222767419038273, "grad_norm": 4831.377411388173, "learning_rate": 1.085808415590772e-07, "logits/chosen": -2.70011568069458, "logits/rejected": -2.653996706008911, "logps/chosen": -252.80184936523438, "logps/rejected": -220.48556518554688, "loss": 441.9232, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 32.957645416259766, "rewards/margins": 5.683412551879883, "rewards/rejected": 27.27423667907715, "step": 3680 }, { "epoch": 0.7242394504416094, "grad_norm": 4974.359444411199, "learning_rate": 1.0717160817264217e-07, "logits/chosen": -2.6733148097991943, "logits/rejected": -2.5146970748901367, "logps/chosen": -226.50149536132812, "logps/rejected": -233.2989501953125, "loss": 423.3871, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 45.977073669433594, "rewards/margins": 10.666391372680664, "rewards/rejected": 35.31068801879883, "step": 3690 }, { "epoch": 0.7262021589793916, "grad_norm": 4246.788329339762, "learning_rate": 1.0576908032860088e-07, "logits/chosen": -2.3461179733276367, "logits/rejected": -2.367692232131958, "logps/chosen": -205.265625, "logps/rejected": -166.40811157226562, "loss": 433.289, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 51.241355895996094, "rewards/margins": 10.636072158813477, "rewards/rejected": 40.605281829833984, "step": 3700 }, { "epoch": 0.7281648675171737, "grad_norm": 4238.6510414008535, "learning_rate": 1.0437332387317474e-07, "logits/chosen": -2.6798744201660156, "logits/rejected": -2.5828280448913574, "logps/chosen": -207.62948608398438, "logps/rejected": -175.34597778320312, "loss": 449.38, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 37.80424118041992, "rewards/margins": 1.2310230731964111, "rewards/rejected": 36.573219299316406, "step": 3710 }, { "epoch": 0.7301275760549558, "grad_norm": 4840.256252230794, "learning_rate": 1.0298440433468048e-07, "logits/chosen": -2.762472152709961, "logits/rejected": -2.674773693084717, "logps/chosen": -292.7893981933594, "logps/rejected": -214.03201293945312, "loss": 504.2667, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 44.73220443725586, "rewards/margins": 6.6243391036987305, "rewards/rejected": 38.107872009277344, "step": 3720 }, { "epoch": 0.732090284592738, "grad_norm": 5142.153148669025, "learning_rate": 1.0160238692045331e-07, "logits/chosen": -2.6255710124969482, "logits/rejected": -2.4991328716278076, "logps/chosen": -219.4676513671875, "logps/rejected": -178.04421997070312, "loss": 418.81, "rewards/accuracies": 0.5, "rewards/chosen": 31.921016693115234, "rewards/margins": 1.1468164920806885, "rewards/rejected": 30.774200439453125, "step": 3730 }, { "epoch": 0.7340529931305201, "grad_norm": 5349.043883670735, "learning_rate": 1.0022733651378606e-07, "logits/chosen": -2.691039562225342, "logits/rejected": -2.532325506210327, "logps/chosen": -333.5316467285156, "logps/rejected": -235.3372344970703, "loss": 483.2089, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 39.4155387878418, "rewards/margins": 4.746474266052246, "rewards/rejected": 34.6690673828125, "step": 3740 }, { "epoch": 0.7360157016683022, "grad_norm": 5552.097807426382, "learning_rate": 9.88593176708827e-08, "logits/chosen": -2.5714547634124756, "logits/rejected": -2.592780351638794, "logps/chosen": -232.8053741455078, "logps/rejected": -243.07565307617188, "loss": 453.0571, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 42.47258758544922, "rewards/margins": -0.20083312690258026, "rewards/rejected": 42.673423767089844, "step": 3750 }, { "epoch": 0.7379784102060843, "grad_norm": 3840.3442356230516, "learning_rate": 9.749839461782769e-08, "logits/chosen": -2.7019336223602295, "logits/rejected": -2.7575888633728027, "logps/chosen": -218.07241821289062, "logps/rejected": -260.0151062011719, "loss": 420.3206, "rewards/accuracies": 0.5, "rewards/chosen": 31.354604721069336, "rewards/margins": -0.9220180511474609, "rewards/rejected": 32.2766227722168, "step": 3760 }, { "epoch": 0.7399411187438666, "grad_norm": 4322.842993427434, "learning_rate": 9.614463124757041e-08, "logits/chosen": -2.4091198444366455, "logits/rejected": -2.4457297325134277, "logps/chosen": -200.60665893554688, "logps/rejected": -193.24002075195312, "loss": 400.7776, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 35.54182434082031, "rewards/margins": -4.2541184425354, "rewards/rejected": 39.795936584472656, "step": 3770 }, { "epoch": 0.7419038272816487, "grad_norm": 4109.254624443096, "learning_rate": 9.479809111692586e-08, "logits/chosen": -2.6434237957000732, "logits/rejected": -2.640014171600342, "logps/chosen": -197.17636108398438, "logps/rejected": -240.40560913085938, "loss": 468.0661, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 37.981597900390625, "rewards/margins": -3.1270596981048584, "rewards/rejected": 41.10865783691406, "step": 3780 }, { "epoch": 0.7438665358194309, "grad_norm": 2855.689201059151, "learning_rate": 9.345883744359065e-08, "logits/chosen": -2.5281224250793457, "logits/rejected": -2.5895755290985107, "logps/chosen": -235.78555297851562, "logps/rejected": -295.47418212890625, "loss": 443.861, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 34.26362991333008, "rewards/margins": -6.375296115875244, "rewards/rejected": 40.63892364501953, "step": 3790 }, { "epoch": 0.745829244357213, "grad_norm": 4095.0063036058204, "learning_rate": 9.212693310317479e-08, "logits/chosen": -2.6409642696380615, "logits/rejected": -2.624389171600342, "logps/chosen": -219.658935546875, "logps/rejected": -201.77801513671875, "loss": 351.3882, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 34.03952407836914, "rewards/margins": -6.243267059326172, "rewards/rejected": 40.28278732299805, "step": 3800 }, { "epoch": 0.7477919528949951, "grad_norm": 5539.3807407061995, "learning_rate": 9.08024406262503e-08, "logits/chosen": -2.657045841217041, "logits/rejected": -2.575908660888672, "logps/chosen": -203.8868408203125, "logps/rejected": -216.73587036132812, "loss": 441.1558, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 40.998573303222656, "rewards/margins": 7.906311988830566, "rewards/rejected": 33.092262268066406, "step": 3810 }, { "epoch": 0.7497546614327772, "grad_norm": 4983.020657353442, "learning_rate": 8.94854221954148e-08, "logits/chosen": -2.6159934997558594, "logits/rejected": -2.5689492225646973, "logps/chosen": -196.431884765625, "logps/rejected": -162.80032348632812, "loss": 436.0953, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 37.00093078613281, "rewards/margins": -5.000515937805176, "rewards/rejected": 42.00144577026367, "step": 3820 }, { "epoch": 0.7517173699705594, "grad_norm": 4490.380890972323, "learning_rate": 8.817593964237316e-08, "logits/chosen": -2.6067864894866943, "logits/rejected": -2.5778086185455322, "logps/chosen": -240.83688354492188, "logps/rejected": -200.7977294921875, "loss": 475.9042, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 38.47137451171875, "rewards/margins": -4.470655918121338, "rewards/rejected": 42.9420280456543, "step": 3830 }, { "epoch": 0.7536800785083415, "grad_norm": 4077.9532632758896, "learning_rate": 8.68740544450334e-08, "logits/chosen": -2.733462333679199, "logits/rejected": -2.5932741165161133, "logps/chosen": -307.20343017578125, "logps/rejected": -230.4998016357422, "loss": 459.9514, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 53.13201904296875, "rewards/margins": 15.279828071594238, "rewards/rejected": 37.8521842956543, "step": 3840 }, { "epoch": 0.7556427870461236, "grad_norm": 4075.7974134105666, "learning_rate": 8.557982772462138e-08, "logits/chosen": -2.468815803527832, "logits/rejected": -2.475836992263794, "logps/chosen": -220.2480926513672, "logps/rejected": -208.54409790039062, "loss": 431.1687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 42.63874816894531, "rewards/margins": -0.08769845962524414, "rewards/rejected": 42.7264518737793, "step": 3850 }, { "epoch": 0.7576054955839058, "grad_norm": 4183.992966358521, "learning_rate": 8.429332024281088e-08, "logits/chosen": -2.636207103729248, "logits/rejected": -2.5418641567230225, "logps/chosen": -255.40493774414062, "logps/rejected": -189.78768920898438, "loss": 432.5977, "rewards/accuracies": 0.5, "rewards/chosen": 30.501134872436523, "rewards/margins": 3.4080607891082764, "rewards/rejected": 27.09307289123535, "step": 3860 }, { "epoch": 0.7595682041216879, "grad_norm": 4995.9974039380695, "learning_rate": 8.301459239887073e-08, "logits/chosen": -2.7311673164367676, "logits/rejected": -2.5933837890625, "logps/chosen": -290.5824279785156, "logps/rejected": -235.07888793945312, "loss": 500.8695, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 49.280540466308594, "rewards/margins": 18.33820915222168, "rewards/rejected": 30.942337036132812, "step": 3870 }, { "epoch": 0.76153091265947, "grad_norm": 3567.3108992513685, "learning_rate": 8.17437042268298e-08, "logits/chosen": -2.6642038822174072, "logits/rejected": -2.669226884841919, "logps/chosen": -245.01382446289062, "logps/rejected": -263.2218933105469, "loss": 428.3638, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 43.7030029296875, "rewards/margins": 0.9183242917060852, "rewards/rejected": 42.7846794128418, "step": 3880 }, { "epoch": 0.7634936211972522, "grad_norm": 4561.994494662221, "learning_rate": 8.048071539265761e-08, "logits/chosen": -2.6854119300842285, "logits/rejected": -2.4938607215881348, "logps/chosen": -265.8202819824219, "logps/rejected": -190.0291290283203, "loss": 484.5575, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 48.63257598876953, "rewards/margins": 7.2778825759887695, "rewards/rejected": 41.354698181152344, "step": 3890 }, { "epoch": 0.7654563297350343, "grad_norm": 4900.414267620809, "learning_rate": 7.922568519146425e-08, "logits/chosen": -2.370140552520752, "logits/rejected": -2.4470481872558594, "logps/chosen": -196.44146728515625, "logps/rejected": -162.85464477539062, "loss": 392.1866, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 30.315418243408203, "rewards/margins": -11.973292350769043, "rewards/rejected": 42.2887077331543, "step": 3900 }, { "epoch": 0.7674190382728164, "grad_norm": 4856.0820375206495, "learning_rate": 7.79786725447154e-08, "logits/chosen": -2.5297694206237793, "logits/rejected": -2.509547710418701, "logps/chosen": -224.86044311523438, "logps/rejected": -189.3321990966797, "loss": 462.1615, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 35.0362663269043, "rewards/margins": 8.711492538452148, "rewards/rejected": 26.32477378845215, "step": 3910 }, { "epoch": 0.7693817468105987, "grad_norm": 5365.9583135093335, "learning_rate": 7.6739735997467e-08, "logits/chosen": -2.6649789810180664, "logits/rejected": -2.6618189811706543, "logps/chosen": -259.0755920410156, "logps/rejected": -222.2447967529297, "loss": 490.3834, "rewards/accuracies": 0.533333420753479, "rewards/chosen": 38.72358322143555, "rewards/margins": 0.7858904600143433, "rewards/rejected": 37.93769073486328, "step": 3920 }, { "epoch": 0.7713444553483808, "grad_norm": 4335.707899099495, "learning_rate": 7.550893371561593e-08, "logits/chosen": -2.37510085105896, "logits/rejected": -2.476245641708374, "logps/chosen": -218.3855438232422, "logps/rejected": -196.65638732910156, "loss": 448.4311, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 49.959930419921875, "rewards/margins": 1.878343939781189, "rewards/rejected": 48.08158493041992, "step": 3930 }, { "epoch": 0.7733071638861629, "grad_norm": 4130.993241850778, "learning_rate": 7.428632348317004e-08, "logits/chosen": -2.62736439704895, "logits/rejected": -2.5467922687530518, "logps/chosen": -207.3795928955078, "logps/rejected": -221.605224609375, "loss": 447.8396, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 41.028438568115234, "rewards/margins": 4.151480674743652, "rewards/rejected": 36.876956939697266, "step": 3940 }, { "epoch": 0.7752698724239451, "grad_norm": 3780.1009018502455, "learning_rate": 7.307196269953444e-08, "logits/chosen": -2.7356576919555664, "logits/rejected": -2.649294376373291, "logps/chosen": -247.86917114257812, "logps/rejected": -226.4387664794922, "loss": 399.9371, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 35.226959228515625, "rewards/margins": -1.1159265041351318, "rewards/rejected": 36.34288787841797, "step": 3950 }, { "epoch": 0.7772325809617272, "grad_norm": 4994.717832707031, "learning_rate": 7.186590837681732e-08, "logits/chosen": -2.6508007049560547, "logits/rejected": -2.543595790863037, "logps/chosen": -225.8889617919922, "logps/rejected": -170.00404357910156, "loss": 446.4747, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 36.95011901855469, "rewards/margins": -0.2716609835624695, "rewards/rejected": 37.221778869628906, "step": 3960 }, { "epoch": 0.7791952894995093, "grad_norm": 5242.481124911569, "learning_rate": 7.066821713715293e-08, "logits/chosen": -2.682112455368042, "logits/rejected": -2.6149463653564453, "logps/chosen": -276.6802673339844, "logps/rejected": -244.07644653320312, "loss": 500.1734, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 38.208160400390625, "rewards/margins": -3.544785976409912, "rewards/rejected": 41.75294876098633, "step": 3970 }, { "epoch": 0.7811579980372915, "grad_norm": 5223.772181995805, "learning_rate": 6.947894521004357e-08, "logits/chosen": -2.6946938037872314, "logits/rejected": -2.719026565551758, "logps/chosen": -241.08273315429688, "logps/rejected": -248.2490692138672, "loss": 500.5159, "rewards/accuracies": 0.4999999403953552, "rewards/chosen": 40.993896484375, "rewards/margins": -2.9638912677764893, "rewards/rejected": 43.95779037475586, "step": 3980 }, { "epoch": 0.7831207065750736, "grad_norm": 4163.850774595611, "learning_rate": 6.829814842971965e-08, "logits/chosen": -2.660637617111206, "logits/rejected": -2.658723831176758, "logps/chosen": -191.04733276367188, "logps/rejected": -218.947998046875, "loss": 470.5708, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 35.07811737060547, "rewards/margins": 6.308375358581543, "rewards/rejected": 28.769744873046875, "step": 3990 }, { "epoch": 0.7850834151128557, "grad_norm": 5725.875365218493, "learning_rate": 6.712588223251809e-08, "logits/chosen": -2.7059569358825684, "logits/rejected": -2.6640262603759766, "logps/chosen": -298.4080505371094, "logps/rejected": -242.0406951904297, "loss": 471.646, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 42.22529220581055, "rewards/margins": 4.421542167663574, "rewards/rejected": 37.80375289916992, "step": 4000 }, { "epoch": 0.7870461236506379, "grad_norm": 5064.624511268707, "learning_rate": 6.596220165428002e-08, "logits/chosen": -2.5640180110931396, "logits/rejected": -2.560215711593628, "logps/chosen": -211.7909393310547, "logps/rejected": -205.9022674560547, "loss": 443.7035, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 37.227325439453125, "rewards/margins": -4.98318338394165, "rewards/rejected": 42.210506439208984, "step": 4010 }, { "epoch": 0.78900883218842, "grad_norm": 3850.4450567632475, "learning_rate": 6.48071613277669e-08, "logits/chosen": -2.599118709564209, "logits/rejected": -2.493950366973877, "logps/chosen": -205.70864868164062, "logps/rejected": -222.24453735351562, "loss": 469.0605, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": 34.287513732910156, "rewards/margins": -11.613555908203125, "rewards/rejected": 45.90106964111328, "step": 4020 }, { "epoch": 0.7909715407262021, "grad_norm": 4806.151283034705, "learning_rate": 6.366081548009553e-08, "logits/chosen": -2.5998551845550537, "logits/rejected": -2.6141610145568848, "logps/chosen": -228.11703491210938, "logps/rejected": -225.1660614013672, "loss": 489.8256, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 38.82665252685547, "rewards/margins": 2.988865852355957, "rewards/rejected": 35.83778762817383, "step": 4030 }, { "epoch": 0.7929342492639843, "grad_norm": 4670.543637279515, "learning_rate": 6.252321793019192e-08, "logits/chosen": -2.5964105129241943, "logits/rejected": -2.6201224327087402, "logps/chosen": -196.1589813232422, "logps/rejected": -220.24972534179688, "loss": 429.3503, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 35.577186584472656, "rewards/margins": 8.084705352783203, "rewards/rejected": 27.492481231689453, "step": 4040 }, { "epoch": 0.7948969578017664, "grad_norm": 3538.7537563169826, "learning_rate": 6.139442208626517e-08, "logits/chosen": -2.5182323455810547, "logits/rejected": -2.5223388671875, "logps/chosen": -161.74742126464844, "logps/rejected": -129.5215301513672, "loss": 416.8181, "rewards/accuracies": 0.5, "rewards/chosen": 30.123971939086914, "rewards/margins": -0.24544867873191833, "rewards/rejected": 30.369421005249023, "step": 4050 }, { "epoch": 0.7968596663395485, "grad_norm": 4220.376873142191, "learning_rate": 6.027448094329963e-08, "logits/chosen": -2.7116899490356445, "logits/rejected": -2.700085163116455, "logps/chosen": -206.6305694580078, "logps/rejected": -232.31869506835938, "loss": 439.5108, "rewards/accuracies": 0.36666664481163025, "rewards/chosen": 36.45801544189453, "rewards/margins": -3.6942970752716064, "rewards/rejected": 40.15230941772461, "step": 4060 }, { "epoch": 0.7988223748773308, "grad_norm": 3914.973610657506, "learning_rate": 5.916344708056681e-08, "logits/chosen": -2.5923945903778076, "logits/rejected": -2.6218533515930176, "logps/chosen": -227.88473510742188, "logps/rejected": -195.7071075439453, "loss": 429.5933, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 41.29386520385742, "rewards/margins": 4.6230788230896, "rewards/rejected": 36.6707878112793, "step": 4070 }, { "epoch": 0.8007850834151129, "grad_norm": 3745.173886850546, "learning_rate": 5.8061372659157306e-08, "logits/chosen": -2.581653118133545, "logits/rejected": -2.5396599769592285, "logps/chosen": -282.02325439453125, "logps/rejected": -270.64324951171875, "loss": 466.2438, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 68.39329528808594, "rewards/margins": 10.069668769836426, "rewards/rejected": 58.3236198425293, "step": 4080 }, { "epoch": 0.802747791952895, "grad_norm": 4574.038017807836, "learning_rate": 5.6968309419531376e-08, "logits/chosen": -2.616150379180908, "logits/rejected": -2.662601947784424, "logps/chosen": -250.6794891357422, "logps/rejected": -214.62893676757812, "loss": 481.5961, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 38.4940071105957, "rewards/margins": 1.1990143060684204, "rewards/rejected": 37.29499053955078, "step": 4090 }, { "epoch": 0.8047105004906772, "grad_norm": 4328.731519222315, "learning_rate": 5.5884308679090525e-08, "logits/chosen": -2.608259439468384, "logits/rejected": -2.3745930194854736, "logps/chosen": -205.98544311523438, "logps/rejected": -161.15406799316406, "loss": 461.3298, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 41.09679412841797, "rewards/margins": 18.13766860961914, "rewards/rejected": 22.95912742614746, "step": 4100 }, { "epoch": 0.8066732090284593, "grad_norm": 4289.949289633817, "learning_rate": 5.480942132976732e-08, "logits/chosen": -2.6755638122558594, "logits/rejected": -2.5002448558807373, "logps/chosen": -275.1580505371094, "logps/rejected": -160.10308837890625, "loss": 446.8364, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 46.448463439941406, "rewards/margins": 12.812484741210938, "rewards/rejected": 33.635982513427734, "step": 4110 }, { "epoch": 0.8086359175662414, "grad_norm": 4999.739123013394, "learning_rate": 5.374369783563698e-08, "logits/chosen": -2.601970911026001, "logits/rejected": -2.5141968727111816, "logps/chosen": -239.7788848876953, "logps/rejected": -238.88851928710938, "loss": 508.9154, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 39.377445220947266, "rewards/margins": -1.0681085586547852, "rewards/rejected": 40.44554901123047, "step": 4120 }, { "epoch": 0.8105986261040236, "grad_norm": 4353.603961100224, "learning_rate": 5.268718823054752e-08, "logits/chosen": -2.607865810394287, "logits/rejected": -2.518691062927246, "logps/chosen": -210.3962860107422, "logps/rejected": -214.9388427734375, "loss": 448.6874, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 35.81767654418945, "rewards/margins": 3.1919147968292236, "rewards/rejected": 32.625755310058594, "step": 4130 }, { "epoch": 0.8125613346418057, "grad_norm": 4615.118197694769, "learning_rate": 5.1639942115771384e-08, "logits/chosen": -2.5191102027893066, "logits/rejected": -2.606433629989624, "logps/chosen": -189.73458862304688, "logps/rejected": -178.82321166992188, "loss": 403.7612, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 43.21727752685547, "rewards/margins": 4.633804798126221, "rewards/rejected": 38.58347702026367, "step": 4140 }, { "epoch": 0.8145240431795878, "grad_norm": 4841.387032958852, "learning_rate": 5.060200865767605e-08, "logits/chosen": -2.660099506378174, "logits/rejected": -2.5387344360351562, "logps/chosen": -327.4266357421875, "logps/rejected": -261.6529541015625, "loss": 442.6245, "rewards/accuracies": 0.3333333134651184, "rewards/chosen": 46.238677978515625, "rewards/margins": 4.456854343414307, "rewards/rejected": 41.781822204589844, "step": 4150 }, { "epoch": 0.81648675171737, "grad_norm": 4311.46735416585, "learning_rate": 4.957343658541632e-08, "logits/chosen": -2.596989154815674, "logits/rejected": -2.6006436347961426, "logps/chosen": -197.25576782226562, "logps/rejected": -235.4192657470703, "loss": 460.46, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 34.012142181396484, "rewards/margins": -0.21886949241161346, "rewards/rejected": 34.23101043701172, "step": 4160 }, { "epoch": 0.8184494602551521, "grad_norm": 4273.784732786409, "learning_rate": 4.8554274188646215e-08, "logits/chosen": -2.6031975746154785, "logits/rejected": -2.52292537689209, "logps/chosen": -231.68310546875, "logps/rejected": -182.66123962402344, "loss": 435.8417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 28.487682342529297, "rewards/margins": 0.8157535791397095, "rewards/rejected": 27.67193031311035, "step": 4170 }, { "epoch": 0.8204121687929342, "grad_norm": 4626.558302395106, "learning_rate": 4.754456931525208e-08, "logits/chosen": -2.4478862285614014, "logits/rejected": -2.496283769607544, "logps/chosen": -223.149658203125, "logps/rejected": -206.23141479492188, "loss": 458.4399, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 44.41162109375, "rewards/margins": -19.236581802368164, "rewards/rejected": 63.64820098876953, "step": 4180 }, { "epoch": 0.8223748773307163, "grad_norm": 3431.928908752168, "learning_rate": 4.654436936910622e-08, "logits/chosen": -2.6450271606445312, "logits/rejected": -2.577511787414551, "logps/chosen": -263.02874755859375, "logps/rejected": -201.28765869140625, "loss": 424.2093, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 41.35602951049805, "rewards/margins": -2.2303383350372314, "rewards/rejected": 43.58637237548828, "step": 4190 }, { "epoch": 0.8243375858684985, "grad_norm": 5915.329560813934, "learning_rate": 4.555372130784102e-08, "logits/chosen": -2.689040184020996, "logits/rejected": -2.6475412845611572, "logps/chosen": -340.04925537109375, "logps/rejected": -252.8019561767578, "loss": 533.694, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 40.53813552856445, "rewards/margins": -6.165125846862793, "rewards/rejected": 46.7032585144043, "step": 4200 }, { "epoch": 0.8263002944062807, "grad_norm": 4737.886824886873, "learning_rate": 4.45726716406449e-08, "logits/chosen": -2.697927474975586, "logits/rejected": -2.719137668609619, "logps/chosen": -273.2940368652344, "logps/rejected": -216.5937957763672, "loss": 451.9255, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 35.69445037841797, "rewards/margins": -6.094300270080566, "rewards/rejected": 41.78874588012695, "step": 4210 }, { "epoch": 0.8282630029440629, "grad_norm": 4647.421613466152, "learning_rate": 4.360126642607842e-08, "logits/chosen": -2.5642268657684326, "logits/rejected": -2.4418981075286865, "logps/chosen": -289.02105712890625, "logps/rejected": -204.9908905029297, "loss": 440.6149, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 41.383323669433594, "rewards/margins": 8.476794242858887, "rewards/rejected": 32.906532287597656, "step": 4220 }, { "epoch": 0.830225711481845, "grad_norm": 4716.893754956261, "learning_rate": 4.2639551269912034e-08, "logits/chosen": -2.5265350341796875, "logits/rejected": -2.473881721496582, "logps/chosen": -175.4093780517578, "logps/rejected": -160.4100341796875, "loss": 429.2075, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 33.56465530395508, "rewards/margins": 2.8609957695007324, "rewards/rejected": 30.703664779663086, "step": 4230 }, { "epoch": 0.8321884200196271, "grad_norm": 4521.242810672261, "learning_rate": 4.168757132298478e-08, "logits/chosen": -2.666539192199707, "logits/rejected": -2.5834553241729736, "logps/chosen": -215.19424438476562, "logps/rejected": -227.21981811523438, "loss": 514.1854, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 43.63666915893555, "rewards/margins": -5.732529640197754, "rewards/rejected": 49.36920166015625, "step": 4240 }, { "epoch": 0.8341511285574092, "grad_norm": 4505.593260548116, "learning_rate": 4.0745371279084976e-08, "logits/chosen": -2.647498607635498, "logits/rejected": -2.5911407470703125, "logps/chosen": -212.0909423828125, "logps/rejected": -184.5307159423828, "loss": 444.7489, "rewards/accuracies": 0.5, "rewards/chosen": 36.252803802490234, "rewards/margins": -3.5416641235351562, "rewards/rejected": 39.794471740722656, "step": 4250 }, { "epoch": 0.8361138370951914, "grad_norm": 3860.498177175377, "learning_rate": 3.9812995372851544e-08, "logits/chosen": -2.5739986896514893, "logits/rejected": -2.562840700149536, "logps/chosen": -208.7525177001953, "logps/rejected": -183.97872924804688, "loss": 448.1645, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 38.437034606933594, "rewards/margins": -2.179542303085327, "rewards/rejected": 40.616580963134766, "step": 4260 }, { "epoch": 0.8380765456329735, "grad_norm": 4504.214351649246, "learning_rate": 3.8890487377697265e-08, "logits/chosen": -2.665849208831787, "logits/rejected": -2.6961982250213623, "logps/chosen": -215.90518188476562, "logps/rejected": -201.3123321533203, "loss": 442.2289, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 38.20843505859375, "rewards/margins": 1.6408487558364868, "rewards/rejected": 36.567588806152344, "step": 4270 }, { "epoch": 0.8400392541707556, "grad_norm": 5880.925064454791, "learning_rate": 3.7977890603754e-08, "logits/chosen": -2.633975028991699, "logits/rejected": -2.5609302520751953, "logps/chosen": -299.1169128417969, "logps/rejected": -262.66351318359375, "loss": 456.942, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 49.8904914855957, "rewards/margins": -0.6626808047294617, "rewards/rejected": 50.55316925048828, "step": 4280 }, { "epoch": 0.8420019627085378, "grad_norm": 4398.542164694948, "learning_rate": 3.707524789583891e-08, "logits/chosen": -2.6659607887268066, "logits/rejected": -2.5334270000457764, "logps/chosen": -270.0296325683594, "logps/rejected": -269.8255310058594, "loss": 478.531, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 43.77894973754883, "rewards/margins": 4.812531471252441, "rewards/rejected": 38.96641540527344, "step": 4290 }, { "epoch": 0.8439646712463199, "grad_norm": 4267.277513897122, "learning_rate": 3.6182601631443596e-08, "logits/chosen": -2.6800174713134766, "logits/rejected": -2.6633589267730713, "logps/chosen": -287.9276123046875, "logps/rejected": -243.28076171875, "loss": 483.4367, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 35.420494079589844, "rewards/margins": 11.147326469421387, "rewards/rejected": 24.27316665649414, "step": 4300 }, { "epoch": 0.845927379784102, "grad_norm": 4431.88895323535, "learning_rate": 3.529999371874381e-08, "logits/chosen": -2.5969769954681396, "logits/rejected": -2.5301458835601807, "logps/chosen": -252.8112030029297, "logps/rejected": -220.48355102539062, "loss": 455.2943, "rewards/accuracies": 0.40000003576278687, "rewards/chosen": 28.724105834960938, "rewards/margins": -13.333305358886719, "rewards/rejected": 42.057411193847656, "step": 4310 }, { "epoch": 0.8478900883218842, "grad_norm": 4783.508213649116, "learning_rate": 3.4427465594632555e-08, "logits/chosen": -2.436122179031372, "logits/rejected": -2.3911919593811035, "logps/chosen": -155.9526824951172, "logps/rejected": -138.89137268066406, "loss": 431.1864, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 44.76438903808594, "rewards/margins": 5.638034820556641, "rewards/rejected": 39.12635040283203, "step": 4320 }, { "epoch": 0.8498527968596663, "grad_norm": 4884.956243665607, "learning_rate": 3.356505822277417e-08, "logits/chosen": -2.6550347805023193, "logits/rejected": -2.5887913703918457, "logps/chosen": -234.34707641601562, "logps/rejected": -222.6831512451172, "loss": 460.6974, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 35.740257263183594, "rewards/margins": -4.165590763092041, "rewards/rejected": 39.905845642089844, "step": 4330 }, { "epoch": 0.8518155053974484, "grad_norm": 4860.935405417142, "learning_rate": 3.271281209168186e-08, "logits/chosen": -2.7008445262908936, "logits/rejected": -2.528564929962158, "logps/chosen": -237.4414825439453, "logps/rejected": -188.55038452148438, "loss": 401.3473, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 38.89970779418945, "rewards/margins": 11.053295135498047, "rewards/rejected": 27.846410751342773, "step": 4340 }, { "epoch": 0.8537782139352306, "grad_norm": 6055.100620430209, "learning_rate": 3.187076721281595e-08, "logits/chosen": -2.66092848777771, "logits/rejected": -2.5830471515655518, "logps/chosen": -210.565673828125, "logps/rejected": -207.7209930419922, "loss": 435.1621, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 35.088157653808594, "rewards/margins": 3.6022579669952393, "rewards/rejected": 31.485897064208984, "step": 4350 }, { "epoch": 0.8557409224730128, "grad_norm": 5425.634496450765, "learning_rate": 3.1038963118706244e-08, "logits/chosen": -2.4575374126434326, "logits/rejected": -2.410790205001831, "logps/chosen": -230.1165313720703, "logps/rejected": -192.03001403808594, "loss": 463.0259, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 47.27201461791992, "rewards/margins": 6.697729587554932, "rewards/rejected": 40.57428741455078, "step": 4360 }, { "epoch": 0.8577036310107949, "grad_norm": 4776.448875246613, "learning_rate": 3.0217438861095315e-08, "logits/chosen": -2.524313449859619, "logits/rejected": -2.5867514610290527, "logps/chosen": -174.5723876953125, "logps/rejected": -189.49002075195312, "loss": 437.6378, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 31.9951114654541, "rewards/margins": -0.5893153548240662, "rewards/rejected": 32.58442687988281, "step": 4370 }, { "epoch": 0.8596663395485771, "grad_norm": 4275.684848556367, "learning_rate": 2.940623300910572e-08, "logits/chosen": -2.7988975048065186, "logits/rejected": -2.5053915977478027, "logps/chosen": -271.8748474121094, "logps/rejected": -174.35372924804688, "loss": 423.6515, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 47.99496841430664, "rewards/margins": 20.64928436279297, "rewards/rejected": 27.34568214416504, "step": 4380 }, { "epoch": 0.8616290480863592, "grad_norm": 5322.318809597932, "learning_rate": 2.860538364742898e-08, "logits/chosen": -2.5552399158477783, "logits/rejected": -2.506809711456299, "logps/chosen": -298.73236083984375, "logps/rejected": -188.0784454345703, "loss": 482.5715, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 71.10126495361328, "rewards/margins": 38.96298599243164, "rewards/rejected": 32.138275146484375, "step": 4390 }, { "epoch": 0.8635917566241413, "grad_norm": 4149.509052971397, "learning_rate": 2.7814928374537334e-08, "logits/chosen": -2.7223961353302, "logits/rejected": -2.724046230316162, "logps/chosen": -195.20144653320312, "logps/rejected": -162.70327758789062, "loss": 380.9954, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 30.957189559936523, "rewards/margins": -9.33814811706543, "rewards/rejected": 40.295345306396484, "step": 4400 }, { "epoch": 0.8655544651619235, "grad_norm": 3713.7789351124247, "learning_rate": 2.7034904300918982e-08, "logits/chosen": -2.5335371494293213, "logits/rejected": -2.6188912391662598, "logps/chosen": -188.16238403320312, "logps/rejected": -225.91171264648438, "loss": 480.6724, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 33.5013313293457, "rewards/margins": -3.412436008453369, "rewards/rejected": 36.91376495361328, "step": 4410 }, { "epoch": 0.8675171736997056, "grad_norm": 4032.6993198961527, "learning_rate": 2.62653480473356e-08, "logits/chosen": -2.7851080894470215, "logits/rejected": -2.6982483863830566, "logps/chosen": -227.9115753173828, "logps/rejected": -208.1966094970703, "loss": 455.1615, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 36.069908142089844, "rewards/margins": -3.239365339279175, "rewards/rejected": 39.30927276611328, "step": 4420 }, { "epoch": 0.8694798822374877, "grad_norm": 4746.624165191648, "learning_rate": 2.550629574310309e-08, "logits/chosen": -2.5399880409240723, "logits/rejected": -2.567885160446167, "logps/chosen": -208.8949432373047, "logps/rejected": -237.2500762939453, "loss": 469.7333, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 30.57306480407715, "rewards/margins": -15.53956413269043, "rewards/rejected": 46.11262893676758, "step": 4430 }, { "epoch": 0.8714425907752699, "grad_norm": 5491.387596164183, "learning_rate": 2.475778302439524e-08, "logits/chosen": -2.7679731845855713, "logits/rejected": -2.642608404159546, "logps/chosen": -276.5832824707031, "logps/rejected": -206.662353515625, "loss": 488.9821, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 40.070716857910156, "rewards/margins": -2.739978790283203, "rewards/rejected": 42.81069564819336, "step": 4440 }, { "epoch": 0.873405299313052, "grad_norm": 4944.209462929502, "learning_rate": 2.4019845032570875e-08, "logits/chosen": -2.6773393154144287, "logits/rejected": -2.6586387157440186, "logps/chosen": -229.5032196044922, "logps/rejected": -239.168701171875, "loss": 458.2089, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 42.63987350463867, "rewards/margins": 4.401918411254883, "rewards/rejected": 38.237953186035156, "step": 4450 }, { "epoch": 0.8753680078508341, "grad_norm": 3572.7898569854297, "learning_rate": 2.3292516412524054e-08, "logits/chosen": -2.7301645278930664, "logits/rejected": -2.576932907104492, "logps/chosen": -254.9808807373047, "logps/rejected": -182.1911163330078, "loss": 467.2688, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 39.77206802368164, "rewards/margins": 2.0470340251922607, "rewards/rejected": 37.725032806396484, "step": 4460 }, { "epoch": 0.8773307163886163, "grad_norm": 4185.799065349416, "learning_rate": 2.2575831311057225e-08, "logits/chosen": -2.636169910430908, "logits/rejected": -2.468846082687378, "logps/chosen": -201.06875610351562, "logps/rejected": -196.6493377685547, "loss": 425.2623, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 32.289947509765625, "rewards/margins": -4.733141899108887, "rewards/rejected": 37.02309036254883, "step": 4470 }, { "epoch": 0.8792934249263984, "grad_norm": 4820.737672747037, "learning_rate": 2.1869823375278483e-08, "logits/chosen": -2.442539691925049, "logits/rejected": -2.250866651535034, "logps/chosen": -166.69845581054688, "logps/rejected": -153.46478271484375, "loss": 423.7283, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 46.657470703125, "rewards/margins": -1.7938625812530518, "rewards/rejected": 48.451332092285156, "step": 4480 }, { "epoch": 0.8812561334641805, "grad_norm": 4332.453219389595, "learning_rate": 2.1174525751021578e-08, "logits/chosen": -2.590590000152588, "logits/rejected": -2.5887067317962646, "logps/chosen": -225.1390838623047, "logps/rejected": -217.708740234375, "loss": 427.2968, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 46.83483123779297, "rewards/margins": -6.553465366363525, "rewards/rejected": 53.38829803466797, "step": 4490 }, { "epoch": 0.8832188420019627, "grad_norm": 3521.570327513789, "learning_rate": 2.0489971081290193e-08, "logits/chosen": -2.6378543376922607, "logits/rejected": -2.61759352684021, "logps/chosen": -259.528564453125, "logps/rejected": -198.2197723388672, "loss": 462.5908, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 38.549556732177734, "rewards/margins": -3.70062255859375, "rewards/rejected": 42.25017547607422, "step": 4500 }, { "epoch": 0.8851815505397449, "grad_norm": 4182.514142645229, "learning_rate": 1.9816191504724826e-08, "logits/chosen": -2.569251298904419, "logits/rejected": -2.466890811920166, "logps/chosen": -183.66378784179688, "logps/rejected": -176.77728271484375, "loss": 424.732, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 42.58602523803711, "rewards/margins": 4.075484275817871, "rewards/rejected": 38.51054382324219, "step": 4510 }, { "epoch": 0.887144259077527, "grad_norm": 4074.2996752655677, "learning_rate": 1.9153218654094498e-08, "logits/chosen": -2.6386618614196777, "logits/rejected": -2.592601776123047, "logps/chosen": -229.7263946533203, "logps/rejected": -193.13670349121094, "loss": 447.1824, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 49.375267028808594, "rewards/margins": 0.2786618173122406, "rewards/rejected": 49.09660339355469, "step": 4520 }, { "epoch": 0.8891069676153092, "grad_norm": 4530.355997455761, "learning_rate": 1.8501083654811206e-08, "logits/chosen": -2.5692012310028076, "logits/rejected": -2.621366262435913, "logps/chosen": -255.8880615234375, "logps/rejected": -219.7906494140625, "loss": 442.176, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 40.39422607421875, "rewards/margins": -0.9515798687934875, "rewards/rejected": 41.345802307128906, "step": 4530 }, { "epoch": 0.8910696761530913, "grad_norm": 4684.779486705413, "learning_rate": 1.7859817123469068e-08, "logits/chosen": -2.453920602798462, "logits/rejected": -2.5157649517059326, "logps/chosen": -173.42568969726562, "logps/rejected": -196.0359649658203, "loss": 420.6044, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 27.611618041992188, "rewards/margins": -5.926667213439941, "rewards/rejected": 33.53828430175781, "step": 4540 }, { "epoch": 0.8930323846908734, "grad_norm": 4064.0291223800077, "learning_rate": 1.7229449166406477e-08, "logits/chosen": -2.6352016925811768, "logits/rejected": -2.5559489727020264, "logps/chosen": -285.65545654296875, "logps/rejected": -221.51974487304688, "loss": 402.8019, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 44.281219482421875, "rewards/margins": 3.11841082572937, "rewards/rejected": 41.162811279296875, "step": 4550 }, { "epoch": 0.8949950932286556, "grad_norm": 3899.6042633621073, "learning_rate": 1.66100093782931e-08, "logits/chosen": -2.4818520545959473, "logits/rejected": -2.5240206718444824, "logps/chosen": -224.8530731201172, "logps/rejected": -245.0667724609375, "loss": 452.7596, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 36.947601318359375, "rewards/margins": -9.113560676574707, "rewards/rejected": 46.061161041259766, "step": 4560 }, { "epoch": 0.8969578017664377, "grad_norm": 4679.961925387276, "learning_rate": 1.600152684074005e-08, "logits/chosen": -2.539313793182373, "logits/rejected": -2.57034969329834, "logps/chosen": -271.5174560546875, "logps/rejected": -281.6427307128906, "loss": 476.7667, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 51.80295944213867, "rewards/margins": 16.400400161743164, "rewards/rejected": 35.402565002441406, "step": 4570 }, { "epoch": 0.8989205103042198, "grad_norm": 4429.10974153184, "learning_rate": 1.540403012093483e-08, "logits/chosen": -2.6423587799072266, "logits/rejected": -2.542809009552002, "logps/chosen": -274.2030029296875, "logps/rejected": -211.15005493164062, "loss": 437.5022, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 33.952781677246094, "rewards/margins": 5.86837911605835, "rewards/rejected": 28.08440589904785, "step": 4580 }, { "epoch": 0.900883218842002, "grad_norm": 4900.034915916676, "learning_rate": 1.4817547270300185e-08, "logits/chosen": -2.635451316833496, "logits/rejected": -2.694579601287842, "logps/chosen": -224.94287109375, "logps/rejected": -294.5521545410156, "loss": 489.1139, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 42.881195068359375, "rewards/margins": -13.004231452941895, "rewards/rejected": 55.88542556762695, "step": 4590 }, { "epoch": 0.9028459273797841, "grad_norm": 4415.368652937512, "learning_rate": 1.4242105823176837e-08, "logits/chosen": -2.620007038116455, "logits/rejected": -2.5084280967712402, "logps/chosen": -281.3882141113281, "logps/rejected": -214.0166778564453, "loss": 431.6098, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 38.51653289794922, "rewards/margins": -2.2388863563537598, "rewards/rejected": 40.75541687011719, "step": 4600 }, { "epoch": 0.9048086359175662, "grad_norm": 4673.780083482767, "learning_rate": 1.3677732795531083e-08, "logits/chosen": -2.5132038593292236, "logits/rejected": -2.558148145675659, "logps/chosen": -225.7850341796875, "logps/rejected": -277.06292724609375, "loss": 447.9788, "rewards/accuracies": 0.5, "rewards/chosen": 38.36613082885742, "rewards/margins": 1.1070663928985596, "rewards/rejected": 37.259063720703125, "step": 4610 }, { "epoch": 0.9067713444553483, "grad_norm": 4751.787329471743, "learning_rate": 1.3124454683686364e-08, "logits/chosen": -2.538525104522705, "logits/rejected": -2.559837818145752, "logps/chosen": -207.3318634033203, "logps/rejected": -222.19656372070312, "loss": 390.7619, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 42.52751922607422, "rewards/margins": 1.1009085178375244, "rewards/rejected": 41.426612854003906, "step": 4620 }, { "epoch": 0.9087340529931305, "grad_norm": 3330.4433365907444, "learning_rate": 1.2582297463079288e-08, "logits/chosen": -2.664224147796631, "logits/rejected": -2.48795223236084, "logps/chosen": -185.2596435546875, "logps/rejected": -117.39766693115234, "loss": 416.2551, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 43.808128356933594, "rewards/margins": 6.2645745277404785, "rewards/rejected": 37.54355239868164, "step": 4630 }, { "epoch": 0.9106967615309126, "grad_norm": 4153.6041400108625, "learning_rate": 1.2051286587040049e-08, "logits/chosen": -2.583369731903076, "logits/rejected": -2.554609537124634, "logps/chosen": -235.30203247070312, "logps/rejected": -237.036865234375, "loss": 435.5568, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 45.061485290527344, "rewards/margins": 5.400920867919922, "rewards/rejected": 39.660560607910156, "step": 4640 }, { "epoch": 0.9126594700686947, "grad_norm": 4977.865699530452, "learning_rate": 1.1531446985597604e-08, "logits/chosen": -2.6574277877807617, "logits/rejected": -2.655367851257324, "logps/chosen": -331.62701416015625, "logps/rejected": -245.23562622070312, "loss": 503.0266, "rewards/accuracies": 0.5, "rewards/chosen": 43.575950622558594, "rewards/margins": 2.5033040046691895, "rewards/rejected": 41.07265090942383, "step": 4650 }, { "epoch": 0.914622178606477, "grad_norm": 4767.861821528587, "learning_rate": 1.1022803064309194e-08, "logits/chosen": -2.5727593898773193, "logits/rejected": -2.439497470855713, "logps/chosen": -246.31069946289062, "logps/rejected": -274.4377746582031, "loss": 460.7343, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 71.24130249023438, "rewards/margins": -35.16503143310547, "rewards/rejected": 106.40632629394531, "step": 4660 }, { "epoch": 0.9165848871442591, "grad_norm": 4339.937447004007, "learning_rate": 1.0525378703114401e-08, "logits/chosen": -2.7839035987854004, "logits/rejected": -2.6208622455596924, "logps/chosen": -171.77706909179688, "logps/rejected": -163.43350219726562, "loss": 412.53, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 39.18108367919922, "rewards/margins": 12.46143627166748, "rewards/rejected": 26.719646453857422, "step": 4670 }, { "epoch": 0.9185475956820413, "grad_norm": 5194.796246618481, "learning_rate": 1.0039197255214238e-08, "logits/chosen": -2.656510829925537, "logits/rejected": -2.70479154586792, "logps/chosen": -157.28660583496094, "logps/rejected": -191.18362426757812, "loss": 462.9357, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 30.98434066772461, "rewards/margins": -9.275310516357422, "rewards/rejected": 40.25965118408203, "step": 4680 }, { "epoch": 0.9205103042198234, "grad_norm": 4776.008954221483, "learning_rate": 9.564281545974661e-09, "logits/chosen": -2.669079303741455, "logits/rejected": -2.7157466411590576, "logps/chosen": -225.6437225341797, "logps/rejected": -216.80502319335938, "loss": 406.8918, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 32.2359504699707, "rewards/margins": -0.8312788009643555, "rewards/rejected": 33.067230224609375, "step": 4690 }, { "epoch": 0.9224730127576055, "grad_norm": 4908.2657214789815, "learning_rate": 9.100653871854963e-09, "logits/chosen": -2.679262399673462, "logits/rejected": -2.778007984161377, "logps/chosen": -262.40435791015625, "logps/rejected": -246.1509246826172, "loss": 441.9648, "rewards/accuracies": 0.3999999761581421, "rewards/chosen": 33.58713150024414, "rewards/margins": -9.374418258666992, "rewards/rejected": 42.961551666259766, "step": 4700 }, { "epoch": 0.9244357212953876, "grad_norm": 4390.917872822957, "learning_rate": 8.648335999360934e-09, "logits/chosen": -2.6005899906158447, "logits/rejected": -2.47259783744812, "logps/chosen": -230.7547607421875, "logps/rejected": -167.08209228515625, "loss": 487.7689, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 30.501667022705078, "rewards/margins": -12.195757865905762, "rewards/rejected": 42.697425842285156, "step": 4710 }, { "epoch": 0.9263984298331698, "grad_norm": 4194.9838878661885, "learning_rate": 8.207349164023047e-09, "logits/chosen": -2.4237990379333496, "logits/rejected": -2.501603126525879, "logps/chosen": -227.91152954101562, "logps/rejected": -213.74789428710938, "loss": 416.369, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 36.72782897949219, "rewards/margins": -10.737937927246094, "rewards/rejected": 47.46576690673828, "step": 4720 }, { "epoch": 0.9283611383709519, "grad_norm": 4906.187260216281, "learning_rate": 7.777714069399532e-09, "logits/chosen": -2.566519260406494, "logits/rejected": -2.4378700256347656, "logps/chosen": -234.2556610107422, "logps/rejected": -213.920166015625, "loss": 469.0785, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 39.02256393432617, "rewards/margins": -1.7270667552947998, "rewards/rejected": 40.74962615966797, "step": 4730 }, { "epoch": 0.930323846908734, "grad_norm": 4787.2292795122075, "learning_rate": 7.359450886104263e-09, "logits/chosen": -2.572117567062378, "logits/rejected": -2.478353500366211, "logps/chosen": -258.222900390625, "logps/rejected": -196.6918487548828, "loss": 399.8398, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 39.50740432739258, "rewards/margins": -6.970947265625, "rewards/rejected": 46.47834777832031, "step": 4740 }, { "epoch": 0.9322865554465162, "grad_norm": 4652.016685552805, "learning_rate": 6.9525792508597634e-09, "logits/chosen": -2.7576687335968018, "logits/rejected": -2.7529947757720947, "logps/chosen": -238.3095703125, "logps/rejected": -252.69491577148438, "loss": 448.3941, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 50.65209197998047, "rewards/margins": 16.916269302368164, "rewards/rejected": 33.73582077026367, "step": 4750 }, { "epoch": 0.9342492639842983, "grad_norm": 4341.304306349906, "learning_rate": 6.557118265575451e-09, "logits/chosen": -2.5280609130859375, "logits/rejected": -2.5678343772888184, "logps/chosen": -267.6020812988281, "logps/rejected": -232.5404510498047, "loss": 430.607, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 37.56711959838867, "rewards/margins": -5.861638069152832, "rewards/rejected": 43.42876052856445, "step": 4760 }, { "epoch": 0.9362119725220804, "grad_norm": 5059.8930667799805, "learning_rate": 6.1730864964507636e-09, "logits/chosen": -2.683964252471924, "logits/rejected": -2.538910388946533, "logps/chosen": -254.32638549804688, "logps/rejected": -191.3366241455078, "loss": 443.235, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 42.44365692138672, "rewards/margins": -5.388759613037109, "rewards/rejected": 47.83241653442383, "step": 4770 }, { "epoch": 0.9381746810598626, "grad_norm": 4955.471916279354, "learning_rate": 5.8005019731033615e-09, "logits/chosen": -2.6290130615234375, "logits/rejected": -2.549903392791748, "logps/chosen": -240.4611053466797, "logps/rejected": -189.25515747070312, "loss": 433.3238, "rewards/accuracies": 0.5, "rewards/chosen": 39.42809295654297, "rewards/margins": 5.345437049865723, "rewards/rejected": 34.08266067504883, "step": 4780 }, { "epoch": 0.9401373895976447, "grad_norm": 4626.054506926014, "learning_rate": 5.439382187722968e-09, "logits/chosen": -2.818596124649048, "logits/rejected": -2.6862807273864746, "logps/chosen": -343.7168273925781, "logps/rejected": -248.85122680664062, "loss": 455.7912, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 39.650245666503906, "rewards/margins": 2.1781814098358154, "rewards/rejected": 37.472068786621094, "step": 4790 }, { "epoch": 0.9421000981354269, "grad_norm": 4852.35621544925, "learning_rate": 5.089744094249837e-09, "logits/chosen": -2.8111395835876465, "logits/rejected": -2.5496573448181152, "logps/chosen": -319.8478088378906, "logps/rejected": -225.66421508789062, "loss": 448.5634, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 50.069862365722656, "rewards/margins": 11.557438850402832, "rewards/rejected": 38.512420654296875, "step": 4800 }, { "epoch": 0.9440628066732091, "grad_norm": 4371.655540404004, "learning_rate": 4.751604107579077e-09, "logits/chosen": -2.7233150005340576, "logits/rejected": -2.6152617931365967, "logps/chosen": -232.55117797851562, "logps/rejected": -210.745361328125, "loss": 452.9889, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 51.05016326904297, "rewards/margins": 15.190671920776367, "rewards/rejected": 35.85948944091797, "step": 4810 }, { "epoch": 0.9460255152109912, "grad_norm": 5385.241820900966, "learning_rate": 4.424978102789661e-09, "logits/chosen": -2.448641300201416, "logits/rejected": -2.389498233795166, "logps/chosen": -313.8994445800781, "logps/rejected": -198.6295623779297, "loss": 497.7832, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 92.62660217285156, "rewards/margins": 39.63528060913086, "rewards/rejected": 52.99132537841797, "step": 4820 }, { "epoch": 0.9479882237487733, "grad_norm": 4912.7958917731285, "learning_rate": 4.109881414399524e-09, "logits/chosen": -2.6708881855010986, "logits/rejected": -2.621290683746338, "logps/chosen": -241.0809783935547, "logps/rejected": -248.0062255859375, "loss": 516.8406, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 37.55589294433594, "rewards/margins": 4.549037933349609, "rewards/rejected": 33.00685501098633, "step": 4830 }, { "epoch": 0.9499509322865555, "grad_norm": 5368.909774149902, "learning_rate": 3.806328835645272e-09, "logits/chosen": -2.4885942935943604, "logits/rejected": -2.4917941093444824, "logps/chosen": -208.9341278076172, "logps/rejected": -193.43490600585938, "loss": 475.2958, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 32.45588302612305, "rewards/margins": -3.4110305309295654, "rewards/rejected": 35.866912841796875, "step": 4840 }, { "epoch": 0.9519136408243376, "grad_norm": 4661.5821603670565, "learning_rate": 3.5143346177878565e-09, "logits/chosen": -2.6988439559936523, "logits/rejected": -2.6492531299591064, "logps/chosen": -327.31988525390625, "logps/rejected": -232.4467315673828, "loss": 494.2165, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 41.69511413574219, "rewards/margins": 4.302943229675293, "rewards/rejected": 37.39216995239258, "step": 4850 }, { "epoch": 0.9538763493621197, "grad_norm": 4980.677871646419, "learning_rate": 3.233912469443545e-09, "logits/chosen": -2.5953526496887207, "logits/rejected": -2.442674160003662, "logps/chosen": -279.9731750488281, "logps/rejected": -181.50048828125, "loss": 487.4147, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 47.79022979736328, "rewards/margins": 9.6940336227417, "rewards/rejected": 38.09619903564453, "step": 4860 }, { "epoch": 0.9558390578999019, "grad_norm": 4664.751420879302, "learning_rate": 2.9650755559401388e-09, "logits/chosen": -2.470555543899536, "logits/rejected": -2.4070048332214355, "logps/chosen": -276.84552001953125, "logps/rejected": -245.06423950195312, "loss": 445.485, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": 70.07025909423828, "rewards/margins": 1.3178901672363281, "rewards/rejected": 68.75236511230469, "step": 4870 }, { "epoch": 0.957801766437684, "grad_norm": 4302.0676887743775, "learning_rate": 2.7078364986990175e-09, "logits/chosen": -2.494755268096924, "logits/rejected": -2.3925018310546875, "logps/chosen": -353.33343505859375, "logps/rejected": -268.85394287109375, "loss": 474.6627, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 42.405967712402344, "rewards/margins": 1.3862206935882568, "rewards/rejected": 41.01974105834961, "step": 4880 }, { "epoch": 0.9597644749754661, "grad_norm": 3888.461730316749, "learning_rate": 2.4622073746426165e-09, "logits/chosen": -2.5994725227355957, "logits/rejected": -2.5952556133270264, "logps/chosen": -236.3902587890625, "logps/rejected": -188.7119598388672, "loss": 450.1432, "rewards/accuracies": 0.5, "rewards/chosen": 37.518062591552734, "rewards/margins": 1.0947520732879639, "rewards/rejected": 36.42330551147461, "step": 4890 }, { "epoch": 0.9617271835132483, "grad_norm": 4381.720003872811, "learning_rate": 2.2281997156273213e-09, "logits/chosen": -2.6201536655426025, "logits/rejected": -2.5877933502197266, "logps/chosen": -266.6863708496094, "logps/rejected": -211.7058563232422, "loss": 494.6588, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 45.659767150878906, "rewards/margins": -1.3253047466278076, "rewards/rejected": 46.985069274902344, "step": 4900 }, { "epoch": 0.9636898920510304, "grad_norm": 3346.892913249055, "learning_rate": 2.0058245079021265e-09, "logits/chosen": -2.630370616912842, "logits/rejected": -2.5561165809631348, "logps/chosen": -220.84243774414062, "logps/rejected": -161.4556121826172, "loss": 440.1361, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 38.90265655517578, "rewards/margins": -1.657293677330017, "rewards/rejected": 40.55995178222656, "step": 4910 }, { "epoch": 0.9656526005888125, "grad_norm": 5128.958349199915, "learning_rate": 1.7950921915928784e-09, "logits/chosen": -2.450956106185913, "logits/rejected": -2.4342901706695557, "logps/chosen": -206.73245239257812, "logps/rejected": -181.1966094970703, "loss": 418.8285, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 58.6185417175293, "rewards/margins": 28.910715103149414, "rewards/rejected": 29.70783042907715, "step": 4920 }, { "epoch": 0.9676153091265947, "grad_norm": 5099.162044042663, "learning_rate": 1.596012660212087e-09, "logits/chosen": -2.6458563804626465, "logits/rejected": -2.554788589477539, "logps/chosen": -279.41925048828125, "logps/rejected": -185.72958374023438, "loss": 513.4474, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 39.27458953857422, "rewards/margins": 5.483712196350098, "rewards/rejected": 33.79088592529297, "step": 4930 }, { "epoch": 0.9695780176643768, "grad_norm": 4551.279045211256, "learning_rate": 1.408595260194434e-09, "logits/chosen": -2.5972819328308105, "logits/rejected": -2.4921669960021973, "logps/chosen": -277.7696838378906, "logps/rejected": -167.14578247070312, "loss": 414.1692, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 37.349212646484375, "rewards/margins": 3.4034876823425293, "rewards/rejected": 33.94572067260742, "step": 4940 }, { "epoch": 0.971540726202159, "grad_norm": 3773.680808520389, "learning_rate": 1.2328487904580131e-09, "logits/chosen": -2.6180472373962402, "logits/rejected": -2.4990921020507812, "logps/chosen": -174.1087646484375, "logps/rejected": -185.30502319335938, "loss": 414.0831, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 39.71233367919922, "rewards/margins": 5.6509785652160645, "rewards/rejected": 34.06135177612305, "step": 4950 }, { "epoch": 0.9735034347399412, "grad_norm": 4881.977965503595, "learning_rate": 1.0687815019912173e-09, "logits/chosen": -2.5493404865264893, "logits/rejected": -2.513451337814331, "logps/chosen": -231.9395294189453, "logps/rejected": -286.641845703125, "loss": 508.2021, "rewards/accuracies": 0.5, "rewards/chosen": 48.53483200073242, "rewards/margins": 3.981222629547119, "rewards/rejected": 44.553611755371094, "step": 4960 }, { "epoch": 0.9754661432777233, "grad_norm": 3823.13046567604, "learning_rate": 9.164010974653802e-10, "logits/chosen": -2.585801362991333, "logits/rejected": -2.5278539657592773, "logps/chosen": -229.3662109375, "logps/rejected": -232.8991241455078, "loss": 405.6757, "rewards/accuracies": 0.4666666090488434, "rewards/chosen": 38.5717658996582, "rewards/margins": -1.0833070278167725, "rewards/rejected": 39.655067443847656, "step": 4970 }, { "epoch": 0.9774288518155054, "grad_norm": 4001.149224700046, "learning_rate": 7.757147308731504e-10, "logits/chosen": -2.591248035430908, "logits/rejected": -2.4500527381896973, "logps/chosen": -262.47003173828125, "logps/rejected": -236.8359832763672, "loss": 429.9303, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 45.33655548095703, "rewards/margins": 12.93742561340332, "rewards/rejected": 32.39912796020508, "step": 4980 }, { "epoch": 0.9793915603532876, "grad_norm": 4285.35894614754, "learning_rate": 6.467290071925646e-10, "logits/chosen": -2.4244279861450195, "logits/rejected": -2.5194430351257324, "logps/chosen": -187.13661193847656, "logps/rejected": -179.65377807617188, "loss": 471.227, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 26.85833168029785, "rewards/margins": -15.445712089538574, "rewards/rejected": 42.304039001464844, "step": 4990 }, { "epoch": 0.9813542688910697, "grad_norm": 4527.552369700591, "learning_rate": 5.29449982077046e-10, "logits/chosen": -2.6375205516815186, "logits/rejected": -2.586085796356201, "logps/chosen": -246.67724609375, "logps/rejected": -176.51097106933594, "loss": 446.9356, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 37.850624084472656, "rewards/margins": 0.24829140305519104, "rewards/rejected": 37.602333068847656, "step": 5000 }, { "epoch": 0.9833169774288518, "grad_norm": 5452.2882899497845, "learning_rate": 4.2388316157104806e-10, "logits/chosen": -2.5154407024383545, "logits/rejected": -2.468649387359619, "logps/chosen": -233.3128662109375, "logps/rejected": -186.05923461914062, "loss": 497.3842, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 34.57950973510742, "rewards/margins": -0.713744044303894, "rewards/rejected": 35.293251037597656, "step": 5010 }, { "epoch": 0.985279685966634, "grad_norm": 4683.8869096540675, "learning_rate": 3.300335018515676e-10, "logits/chosen": -2.5799403190612793, "logits/rejected": -2.5058860778808594, "logps/chosen": -176.56678771972656, "logps/rejected": -124.17610168457031, "loss": 461.8412, "rewards/accuracies": 0.5000000596046448, "rewards/chosen": 37.2112922668457, "rewards/margins": -3.3934874534606934, "rewards/rejected": 40.60478210449219, "step": 5020 }, { "epoch": 0.9872423945044161, "grad_norm": 4264.237376017354, "learning_rate": 2.4790540899546907e-10, "logits/chosen": -2.508977174758911, "logits/rejected": -2.5530219078063965, "logps/chosen": -184.57777404785156, "logps/rejected": -225.44070434570312, "loss": 435.1646, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": 37.472877502441406, "rewards/margins": -12.762925148010254, "rewards/rejected": 50.23580551147461, "step": 5030 }, { "epoch": 0.9892051030421982, "grad_norm": 3979.2806515385905, "learning_rate": 1.7750273877262244e-10, "logits/chosen": -2.5312910079956055, "logits/rejected": -2.4877536296844482, "logps/chosen": -250.9333953857422, "logps/rejected": -216.4891357421875, "loss": 438.17, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 37.162452697753906, "rewards/margins": -7.625374794006348, "rewards/rejected": 44.7878303527832, "step": 5040 }, { "epoch": 0.9911678115799804, "grad_norm": 4453.834332740248, "learning_rate": 1.1882879646485379e-10, "logits/chosen": -2.4382872581481934, "logits/rejected": -2.3793227672576904, "logps/chosen": -180.4657745361328, "logps/rejected": -177.47760009765625, "loss": 477.8574, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 44.736907958984375, "rewards/margins": 6.490880012512207, "rewards/rejected": 38.24602127075195, "step": 5050 }, { "epoch": 0.9931305201177625, "grad_norm": 4887.6266129665655, "learning_rate": 7.188633671079136e-11, "logits/chosen": -2.6609623432159424, "logits/rejected": -2.531956434249878, "logps/chosen": -239.935546875, "logps/rejected": -164.88461303710938, "loss": 430.8216, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 48.27210235595703, "rewards/margins": 3.368035078048706, "rewards/rejected": 44.9040641784668, "step": 5060 }, { "epoch": 0.9950932286555446, "grad_norm": 5751.1482161869735, "learning_rate": 3.6677563376580344e-11, "logits/chosen": -2.605489730834961, "logits/rejected": -2.559293270111084, "logps/chosen": -228.0939178466797, "logps/rejected": -279.6558837890625, "loss": 497.0193, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 36.726707458496094, "rewards/margins": 5.8451924324035645, "rewards/rejected": 30.881515502929688, "step": 5070 }, { "epoch": 0.9970559371933267, "grad_norm": 3951.8547590293524, "learning_rate": 1.3204129452354385e-11, "logits/chosen": -2.562615394592285, "logits/rejected": -2.5046939849853516, "logps/chosen": -234.50192260742188, "logps/rejected": -236.96835327148438, "loss": 461.9209, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 36.579288482666016, "rewards/margins": -11.703116416931152, "rewards/rejected": 48.282405853271484, "step": 5080 }, { "epoch": 0.9990186457311089, "grad_norm": 5081.291469555591, "learning_rate": 1.467136974631078e-12, "logits/chosen": -2.6177847385406494, "logits/rejected": -2.4632296562194824, "logps/chosen": -227.45236206054688, "logps/rejected": -167.18699645996094, "loss": 430.0306, "rewards/accuracies": 0.46666663885116577, "rewards/chosen": 44.46977996826172, "rewards/margins": 9.820869445800781, "rewards/rejected": 34.64891052246094, "step": 5090 }, { "epoch": 1.0, "step": 5095, "total_flos": 0.0, "train_loss": 457.40178580242, "train_runtime": 17501.0679, "train_samples_per_second": 3.493, "train_steps_per_second": 0.291 } ], "logging_steps": 10, "max_steps": 5095, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }