{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993626513702996, "eval_steps": 500, "global_step": 784, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.166666666666666e-09, "logits/chosen": -3.2249984741210938, "logits/rejected": -3.1493096351623535, "logps/chosen": -301.805419921875, "logps/rejected": -749.23681640625, "loss": 1.014, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 8.333333333333332e-09, "logits/chosen": -3.146193265914917, "logits/rejected": -3.004434108734131, "logps/chosen": -326.51904296875, "logps/rejected": -537.8560791015625, "loss": 1.0164, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.25e-08, "logits/chosen": -3.231161594390869, "logits/rejected": -3.083953857421875, "logps/chosen": -335.8543395996094, "logps/rejected": -649.480712890625, "loss": 1.0116, "rewards/accuracies": 1.0, "rewards/chosen": 0.06978759914636612, "rewards/margins": 0.07396087795495987, "rewards/rejected": -0.00417327880859375, "step": 3 }, { "epoch": 0.01, "learning_rate": 1.6666666666666664e-08, "logits/chosen": -3.1735241413116455, "logits/rejected": -3.095536708831787, "logps/chosen": -311.54852294921875, "logps/rejected": -836.0142822265625, "loss": 0.9902, "rewards/accuracies": 0.5, "rewards/chosen": -0.0035659782588481903, "rewards/margins": 0.008897393941879272, "rewards/rejected": -0.01246337965130806, "step": 4 }, { "epoch": 0.01, "learning_rate": 2.0833333333333335e-08, "logits/chosen": -3.2310121059417725, "logits/rejected": -2.858600378036499, "logps/chosen": -292.44610595703125, "logps/rejected": -2052.43701171875, "loss": 1.0199, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019805908668786287, "rewards/margins": 0.08045349270105362, "rewards/rejected": -0.08243408799171448, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.5e-08, "logits/chosen": -3.208169460296631, "logits/rejected": -3.0478270053863525, "logps/chosen": -309.3616027832031, "logps/rejected": -585.9378662109375, "loss": 1.0202, "rewards/accuracies": 0.5, "rewards/chosen": -0.02354583889245987, "rewards/margins": 0.0061767566949129105, "rewards/rejected": -0.02972259372472763, "step": 6 }, { "epoch": 0.01, "learning_rate": 2.9166666666666666e-08, "logits/chosen": -3.210592269897461, "logits/rejected": -3.085233688354492, "logps/chosen": -290.191650390625, "logps/rejected": -417.7170715332031, "loss": 1.0271, "rewards/accuracies": 0.5, "rewards/chosen": -0.0020172111690044403, "rewards/margins": -0.043182373046875, "rewards/rejected": 0.04116516187787056, "step": 7 }, { "epoch": 0.01, "learning_rate": 3.333333333333333e-08, "logits/chosen": -3.168795585632324, "logits/rejected": -3.0627212524414062, "logps/chosen": -333.8818359375, "logps/rejected": -773.6167602539062, "loss": 1.0441, "rewards/accuracies": 0.5, "rewards/chosen": 0.03167724609375, "rewards/margins": 0.03685455024242401, "rewards/rejected": -0.00517730787396431, "step": 8 }, { "epoch": 0.01, "learning_rate": 3.75e-08, "logits/chosen": -3.1084446907043457, "logits/rejected": -3.0046944618225098, "logps/chosen": -295.39178466796875, "logps/rejected": -1110.526123046875, "loss": 1.0252, "rewards/accuracies": 1.0, "rewards/chosen": -0.01828308217227459, "rewards/margins": 0.04431457445025444, "rewards/rejected": -0.06259765475988388, "step": 9 }, { "epoch": 0.01, "learning_rate": 4.166666666666667e-08, "logits/chosen": -3.1318116188049316, "logits/rejected": -3.0226166248321533, "logps/chosen": -290.97320556640625, "logps/rejected": -899.5476684570312, "loss": 0.9921, "rewards/accuracies": 1.0, "rewards/chosen": 0.02289123460650444, "rewards/margins": 0.12102356553077698, "rewards/rejected": -0.09813232719898224, "step": 10 }, { "epoch": 0.01, "learning_rate": 4.583333333333333e-08, "logits/chosen": -3.1727778911590576, "logits/rejected": -3.0235958099365234, "logps/chosen": -330.50823974609375, "logps/rejected": -925.6340942382812, "loss": 1.0406, "rewards/accuracies": 0.0, "rewards/chosen": -0.00299835205078125, "rewards/margins": -0.03201141208410263, "rewards/rejected": 0.02901306189596653, "step": 11 }, { "epoch": 0.02, "learning_rate": 5e-08, "logits/chosen": -3.135713577270508, "logits/rejected": -3.084009885787964, "logps/chosen": -326.48944091796875, "logps/rejected": -742.2381591796875, "loss": 1.0233, "rewards/accuracies": 0.5, "rewards/chosen": 0.05649567022919655, "rewards/margins": -0.01782378740608692, "rewards/rejected": 0.07431945949792862, "step": 12 }, { "epoch": 0.02, "learning_rate": 5.416666666666666e-08, "logits/chosen": -3.181818962097168, "logits/rejected": -2.976017475128174, "logps/chosen": -285.50201416015625, "logps/rejected": -1483.154296875, "loss": 1.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.0037414561957120895, "rewards/margins": 0.070098876953125, "rewards/rejected": -0.06635741889476776, "step": 13 }, { "epoch": 0.02, "learning_rate": 5.833333333333333e-08, "logits/chosen": -3.1668577194213867, "logits/rejected": -3.019139289855957, "logps/chosen": -341.5426025390625, "logps/rejected": -1347.2843017578125, "loss": 0.9789, "rewards/accuracies": 1.0, "rewards/chosen": -0.03780059888958931, "rewards/margins": 0.10509186238050461, "rewards/rejected": -0.14289246499538422, "step": 14 }, { "epoch": 0.02, "learning_rate": 6.25e-08, "logits/chosen": -3.1883792877197266, "logits/rejected": -3.1575865745544434, "logps/chosen": -339.2225646972656, "logps/rejected": -812.0121459960938, "loss": 1.041, "rewards/accuracies": 0.5, "rewards/chosen": 0.01588440127670765, "rewards/margins": -0.02966614067554474, "rewards/rejected": 0.04555054008960724, "step": 15 }, { "epoch": 0.02, "learning_rate": 6.666666666666665e-08, "logits/chosen": -3.1587257385253906, "logits/rejected": -3.0870401859283447, "logps/chosen": -312.6769104003906, "logps/rejected": -529.3914794921875, "loss": 1.0472, "rewards/accuracies": 0.5, "rewards/chosen": -0.014216613955795765, "rewards/margins": -0.032161712646484375, "rewards/rejected": 0.017945099622011185, "step": 16 }, { "epoch": 0.02, "learning_rate": 7.083333333333334e-08, "logits/chosen": -3.1896886825561523, "logits/rejected": -3.1233325004577637, "logps/chosen": -323.9560546875, "logps/rejected": -905.28369140625, "loss": 0.9814, "rewards/accuracies": 1.0, "rewards/chosen": -0.01519775390625, "rewards/margins": 0.15228576958179474, "rewards/rejected": -0.16748352348804474, "step": 17 }, { "epoch": 0.02, "learning_rate": 7.5e-08, "logits/chosen": -3.2174019813537598, "logits/rejected": -2.9764885902404785, "logps/chosen": -297.34320068359375, "logps/rejected": -2056.311767578125, "loss": 0.9731, "rewards/accuracies": 0.5, "rewards/chosen": -0.0068191527388989925, "rewards/margins": 0.001353454776108265, "rewards/rejected": -0.00817260704934597, "step": 18 }, { "epoch": 0.02, "learning_rate": 7.916666666666665e-08, "logits/chosen": -3.1254308223724365, "logits/rejected": -3.0187110900878906, "logps/chosen": -319.93780517578125, "logps/rejected": -1764.055419921875, "loss": 0.9561, "rewards/accuracies": 1.0, "rewards/chosen": 0.03852539137005806, "rewards/margins": 0.22076416015625, "rewards/rejected": -0.18223877251148224, "step": 19 }, { "epoch": 0.03, "learning_rate": 8.333333333333334e-08, "logits/chosen": -3.153252601623535, "logits/rejected": -3.0897886753082275, "logps/chosen": -293.991455078125, "logps/rejected": -1007.9874267578125, "loss": 0.9509, "rewards/accuracies": 1.0, "rewards/chosen": 0.03754730150103569, "rewards/margins": 0.1800888180732727, "rewards/rejected": -0.14254151284694672, "step": 20 }, { "epoch": 0.03, "learning_rate": 8.75e-08, "logits/chosen": -3.0951619148254395, "logits/rejected": -3.0999135971069336, "logps/chosen": -317.90863037109375, "logps/rejected": -799.510009765625, "loss": 0.9576, "rewards/accuracies": 1.0, "rewards/chosen": 0.0001739503350108862, "rewards/margins": 0.13384094834327698, "rewards/rejected": -0.1336669921875, "step": 21 }, { "epoch": 0.03, "learning_rate": 9.166666666666665e-08, "logits/chosen": -3.166450023651123, "logits/rejected": -3.0963730812072754, "logps/chosen": -261.5613708496094, "logps/rejected": -1341.11865234375, "loss": 0.9453, "rewards/accuracies": 1.0, "rewards/chosen": 0.014226531609892845, "rewards/margins": 0.08567734062671661, "rewards/rejected": -0.07145080715417862, "step": 22 }, { "epoch": 0.03, "learning_rate": 9.583333333333334e-08, "logits/chosen": -3.195559024810791, "logits/rejected": -3.127469778060913, "logps/chosen": -291.3478088378906, "logps/rejected": -754.260498046875, "loss": 0.953, "rewards/accuracies": 1.0, "rewards/chosen": 0.05744171142578125, "rewards/margins": 0.16487275063991547, "rewards/rejected": -0.10743103176355362, "step": 23 }, { "epoch": 0.03, "learning_rate": 1e-07, "logits/chosen": -3.1770012378692627, "logits/rejected": -2.8715193271636963, "logps/chosen": -311.1556701660156, "logps/rejected": -2055.337646484375, "loss": 0.899, "rewards/accuracies": 1.0, "rewards/chosen": 0.06309814751148224, "rewards/margins": 0.5045897960662842, "rewards/rejected": -0.4414916932582855, "step": 24 }, { "epoch": 0.03, "learning_rate": 9.999957281897734e-08, "logits/chosen": -3.2143921852111816, "logits/rejected": -3.0585074424743652, "logps/chosen": -336.6886901855469, "logps/rejected": -593.4904174804688, "loss": 0.9558, "rewards/accuracies": 0.5, "rewards/chosen": 0.00925598107278347, "rewards/margins": 0.09302216023206711, "rewards/rejected": -0.08376617729663849, "step": 25 }, { "epoch": 0.03, "learning_rate": 9.999829128320872e-08, "logits/chosen": -3.22650146484375, "logits/rejected": -3.0790164470672607, "logps/chosen": -328.5511779785156, "logps/rejected": -1046.105712890625, "loss": 0.9552, "rewards/accuracies": 1.0, "rewards/chosen": -0.01667480543255806, "rewards/margins": 0.18015746772289276, "rewards/rejected": -0.19683226943016052, "step": 26 }, { "epoch": 0.03, "learning_rate": 9.999615541459205e-08, "logits/chosen": -3.191080093383789, "logits/rejected": -3.099419593811035, "logps/chosen": -326.3691711425781, "logps/rejected": -625.0943603515625, "loss": 0.9987, "rewards/accuracies": 1.0, "rewards/chosen": 0.07567443698644638, "rewards/margins": 0.12075195461511612, "rewards/rejected": -0.04507751762866974, "step": 27 }, { "epoch": 0.04, "learning_rate": 9.999316524962345e-08, "logits/chosen": -3.132746696472168, "logits/rejected": -3.036904811859131, "logps/chosen": -295.0516662597656, "logps/rejected": -1148.4930419921875, "loss": 0.9057, "rewards/accuracies": 1.0, "rewards/chosen": 0.05676117539405823, "rewards/margins": 0.3011459410190582, "rewards/rejected": -0.244384765625, "step": 28 }, { "epoch": 0.04, "learning_rate": 9.998932083939655e-08, "logits/chosen": -3.2061564922332764, "logits/rejected": -3.0649170875549316, "logps/chosen": -311.3853759765625, "logps/rejected": -722.98974609375, "loss": 0.9486, "rewards/accuracies": 1.0, "rewards/chosen": -0.0073699951171875, "rewards/margins": 0.104949951171875, "rewards/rejected": -0.1123199462890625, "step": 29 }, { "epoch": 0.04, "learning_rate": 9.998462224960173e-08, "logits/chosen": -3.1454668045043945, "logits/rejected": -2.9101996421813965, "logps/chosen": -291.77984619140625, "logps/rejected": -1197.62646484375, "loss": 0.8972, "rewards/accuracies": 1.0, "rewards/chosen": 0.03359680250287056, "rewards/margins": 0.39168548583984375, "rewards/rejected": -0.3580887019634247, "step": 30 }, { "epoch": 0.04, "learning_rate": 9.997906956052494e-08, "logits/chosen": -3.1428914070129395, "logits/rejected": -2.883697748184204, "logps/chosen": -333.80523681640625, "logps/rejected": -3550.531494140625, "loss": 0.8167, "rewards/accuracies": 1.0, "rewards/chosen": 0.06267394870519638, "rewards/margins": 1.1710236072540283, "rewards/rejected": -1.1083495616912842, "step": 31 }, { "epoch": 0.04, "learning_rate": 9.99726628670463e-08, "logits/chosen": -3.143510341644287, "logits/rejected": -3.032212734222412, "logps/chosen": -315.5950012207031, "logps/rejected": -1120.73193359375, "loss": 0.8895, "rewards/accuracies": 1.0, "rewards/chosen": 0.04322967678308487, "rewards/margins": 0.4030410945415497, "rewards/rejected": -0.359811395406723, "step": 32 }, { "epoch": 0.04, "learning_rate": 9.996540227863853e-08, "logits/chosen": -3.167703151702881, "logits/rejected": -2.989516258239746, "logps/chosen": -315.71563720703125, "logps/rejected": -1541.743408203125, "loss": 0.8419, "rewards/accuracies": 1.0, "rewards/chosen": 0.06288605183362961, "rewards/margins": 0.5780136585235596, "rewards/rejected": -0.5151275992393494, "step": 33 }, { "epoch": 0.04, "learning_rate": 9.995728791936504e-08, "logits/chosen": -3.1676580905914307, "logits/rejected": -3.0280845165252686, "logps/chosen": -306.76422119140625, "logps/rejected": -284.1114196777344, "loss": 0.8609, "rewards/accuracies": 1.0, "rewards/chosen": 0.006521605886518955, "rewards/margins": 0.06946487724781036, "rewards/rejected": -0.06294326484203339, "step": 34 }, { "epoch": 0.04, "learning_rate": 9.994831992787787e-08, "logits/chosen": -3.1609086990356445, "logits/rejected": -3.1102871894836426, "logps/chosen": -295.2339172363281, "logps/rejected": -848.3267822265625, "loss": 0.8924, "rewards/accuracies": 1.0, "rewards/chosen": 0.04673156887292862, "rewards/margins": 0.25919342041015625, "rewards/rejected": -0.21246185898780823, "step": 35 }, { "epoch": 0.05, "learning_rate": 9.993849845741523e-08, "logits/chosen": -3.2044692039489746, "logits/rejected": -3.157431125640869, "logps/chosen": -296.6109619140625, "logps/rejected": -929.6781005859375, "loss": 0.8161, "rewards/accuracies": 1.0, "rewards/chosen": -0.0261688232421875, "rewards/margins": 0.27874451875686646, "rewards/rejected": -0.30491334199905396, "step": 36 }, { "epoch": 0.05, "learning_rate": 9.992782367579898e-08, "logits/chosen": -3.1619975566864014, "logits/rejected": -3.051429271697998, "logps/chosen": -290.8027648925781, "logps/rejected": -827.8645629882812, "loss": 0.896, "rewards/accuracies": 1.0, "rewards/chosen": 0.09846954047679901, "rewards/margins": 0.4867492616176605, "rewards/rejected": -0.3882797360420227, "step": 37 }, { "epoch": 0.05, "learning_rate": 9.991629576543162e-08, "logits/chosen": -3.1461408138275146, "logits/rejected": -2.9624147415161133, "logps/chosen": -273.280517578125, "logps/rejected": -869.5138549804688, "loss": 0.8606, "rewards/accuracies": 1.0, "rewards/chosen": -0.02434539794921875, "rewards/margins": 0.37226563692092896, "rewards/rejected": -0.3966110348701477, "step": 38 }, { "epoch": 0.05, "learning_rate": 9.99039149232934e-08, "logits/chosen": -3.2151553630828857, "logits/rejected": -3.0655453205108643, "logps/chosen": -336.36822509765625, "logps/rejected": -466.5371398925781, "loss": 0.9161, "rewards/accuracies": 1.0, "rewards/chosen": 0.03702392429113388, "rewards/margins": 0.2571556270122528, "rewards/rejected": -0.22013169527053833, "step": 39 }, { "epoch": 0.05, "learning_rate": 9.989068136093872e-08, "logits/chosen": -3.258430004119873, "logits/rejected": -3.081821918487549, "logps/chosen": -275.0604248046875, "logps/rejected": -492.6986999511719, "loss": 0.8307, "rewards/accuracies": 1.0, "rewards/chosen": 0.046251676976680756, "rewards/margins": 0.21889877319335938, "rewards/rejected": -0.17264708876609802, "step": 40 }, { "epoch": 0.05, "learning_rate": 9.987659530449266e-08, "logits/chosen": -3.1874489784240723, "logits/rejected": -3.0211949348449707, "logps/chosen": -328.46832275390625, "logps/rejected": -353.61407470703125, "loss": 0.792, "rewards/accuracies": 1.0, "rewards/chosen": 0.10475921630859375, "rewards/margins": 0.20680390298366547, "rewards/rejected": -0.10204467922449112, "step": 41 }, { "epoch": 0.05, "learning_rate": 9.986165699464705e-08, "logits/chosen": -3.2336575984954834, "logits/rejected": -2.9090828895568848, "logps/chosen": -313.66546630859375, "logps/rejected": -1515.11083984375, "loss": 0.7491, "rewards/accuracies": 1.0, "rewards/chosen": 0.09416427463293076, "rewards/margins": 0.8240532279014587, "rewards/rejected": -0.729888916015625, "step": 42 }, { "epoch": 0.05, "learning_rate": 9.98458666866564e-08, "logits/chosen": -3.1701323986053467, "logits/rejected": -3.104189872741699, "logps/chosen": -307.9913635253906, "logps/rejected": -1239.7489013671875, "loss": 0.8028, "rewards/accuracies": 1.0, "rewards/chosen": 0.05185394361615181, "rewards/margins": 0.7007369995117188, "rewards/rejected": -0.6488831043243408, "step": 43 }, { "epoch": 0.06, "learning_rate": 9.982922465033349e-08, "logits/chosen": -3.2232751846313477, "logits/rejected": -2.9489033222198486, "logps/chosen": -318.5318298339844, "logps/rejected": -1666.061767578125, "loss": 0.7867, "rewards/accuracies": 1.0, "rewards/chosen": 0.04532928764820099, "rewards/margins": 1.1688034534454346, "rewards/rejected": -1.12347412109375, "step": 44 }, { "epoch": 0.06, "learning_rate": 9.981173117004483e-08, "logits/chosen": -3.1307129859924316, "logits/rejected": -2.9686222076416016, "logps/chosen": -322.6069030761719, "logps/rejected": -380.3050537109375, "loss": 0.8488, "rewards/accuracies": 1.0, "rewards/chosen": 0.0469207763671875, "rewards/margins": 0.13071519136428833, "rewards/rejected": -0.08379440754652023, "step": 45 }, { "epoch": 0.06, "learning_rate": 9.979338654470568e-08, "logits/chosen": -3.1362812519073486, "logits/rejected": -3.0732340812683105, "logps/chosen": -294.1836242675781, "logps/rejected": -805.0936279296875, "loss": 0.7552, "rewards/accuracies": 1.0, "rewards/chosen": 0.06991424411535263, "rewards/margins": 0.5652450323104858, "rewards/rejected": -0.495330810546875, "step": 46 }, { "epoch": 0.06, "learning_rate": 9.977419108777513e-08, "logits/chosen": -3.1503219604492188, "logits/rejected": -3.0821824073791504, "logps/chosen": -281.01153564453125, "logps/rejected": -349.4947509765625, "loss": 0.8554, "rewards/accuracies": 1.0, "rewards/chosen": 0.04772796854376793, "rewards/margins": 0.07519836723804474, "rewards/rejected": -0.02747039869427681, "step": 47 }, { "epoch": 0.06, "learning_rate": 9.975414512725057e-08, "logits/chosen": -3.216569423675537, "logits/rejected": -3.073711395263672, "logps/chosen": -321.4179382324219, "logps/rejected": -795.11767578125, "loss": 0.8079, "rewards/accuracies": 1.0, "rewards/chosen": 0.07071685791015625, "rewards/margins": 0.5427291989326477, "rewards/rejected": -0.47201234102249146, "step": 48 }, { "epoch": 0.06, "learning_rate": 9.973324900566212e-08, "logits/chosen": -3.1911654472351074, "logits/rejected": -3.1419806480407715, "logps/chosen": -297.58673095703125, "logps/rejected": -835.0924072265625, "loss": 0.7546, "rewards/accuracies": 1.0, "rewards/chosen": 0.10965271294116974, "rewards/margins": 0.5665603876113892, "rewards/rejected": -0.4569076895713806, "step": 49 }, { "epoch": 0.06, "learning_rate": 9.971150308006688e-08, "logits/chosen": -3.200486660003662, "logits/rejected": -3.011589765548706, "logps/chosen": -281.7305908203125, "logps/rejected": -1012.6817626953125, "loss": 0.7289, "rewards/accuracies": 1.0, "rewards/chosen": 0.05294037237763405, "rewards/margins": 0.6990127563476562, "rewards/rejected": -0.6460723876953125, "step": 50 }, { "epoch": 0.07, "learning_rate": 9.968890772204271e-08, "logits/chosen": -3.176891803741455, "logits/rejected": -3.0459342002868652, "logps/chosen": -349.77178955078125, "logps/rejected": -363.2972412109375, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.14232787489891052, "rewards/margins": 0.3122970461845398, "rewards/rejected": -0.16996917128562927, "step": 51 }, { "epoch": 0.07, "learning_rate": 9.96654633176819e-08, "logits/chosen": -3.196500062942505, "logits/rejected": -3.0736465454101562, "logps/chosen": -286.8270568847656, "logps/rejected": -328.6838073730469, "loss": 0.7779, "rewards/accuracies": 1.0, "rewards/chosen": 0.03851165622472763, "rewards/margins": 0.22881624102592468, "rewards/rejected": -0.19030457735061646, "step": 52 }, { "epoch": 0.07, "learning_rate": 9.964117026758469e-08, "logits/chosen": -3.1912660598754883, "logits/rejected": -3.0806589126586914, "logps/chosen": -311.2342529296875, "logps/rejected": -430.6540222167969, "loss": 0.7643, "rewards/accuracies": 1.0, "rewards/chosen": 0.08027801662683487, "rewards/margins": 0.3427978456020355, "rewards/rejected": -0.26251983642578125, "step": 53 }, { "epoch": 0.07, "learning_rate": 9.961602898685224e-08, "logits/chosen": -3.221360206604004, "logits/rejected": -3.086759090423584, "logps/chosen": -330.12750244140625, "logps/rejected": -463.0695495605469, "loss": 0.7848, "rewards/accuracies": 1.0, "rewards/chosen": 0.07873840630054474, "rewards/margins": 0.36484071612358093, "rewards/rejected": -0.286102294921875, "step": 54 }, { "epoch": 0.07, "learning_rate": 9.959003990507971e-08, "logits/chosen": -3.2101995944976807, "logits/rejected": -2.977877378463745, "logps/chosen": -293.685546875, "logps/rejected": -1960.0701904296875, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.09671249985694885, "rewards/margins": 1.292055606842041, "rewards/rejected": -1.195343017578125, "step": 55 }, { "epoch": 0.07, "learning_rate": 9.956320346634876e-08, "logits/chosen": -3.2060670852661133, "logits/rejected": -3.136112689971924, "logps/chosen": -302.2705078125, "logps/rejected": -771.0523681640625, "loss": 0.7688, "rewards/accuracies": 1.0, "rewards/chosen": 0.15837860107421875, "rewards/margins": 0.7147537469863892, "rewards/rejected": -0.5563751459121704, "step": 56 }, { "epoch": 0.07, "learning_rate": 9.953552012922011e-08, "logits/chosen": -3.164094924926758, "logits/rejected": -3.005011558532715, "logps/chosen": -330.3134765625, "logps/rejected": -864.9485473632812, "loss": 0.7687, "rewards/accuracies": 1.0, "rewards/chosen": 0.05993194505572319, "rewards/margins": 0.6428573131561279, "rewards/rejected": -0.5829254388809204, "step": 57 }, { "epoch": 0.07, "learning_rate": 9.950699036672558e-08, "logits/chosen": -3.173696279525757, "logits/rejected": -2.9495906829833984, "logps/chosen": -277.751708984375, "logps/rejected": -1186.7330322265625, "loss": 0.7155, "rewards/accuracies": 1.0, "rewards/chosen": 0.16595458984375, "rewards/margins": 1.1458266973495483, "rewards/rejected": -0.9798721075057983, "step": 58 }, { "epoch": 0.08, "learning_rate": 9.947761466636013e-08, "logits/chosen": -3.1897473335266113, "logits/rejected": -3.0537807941436768, "logps/chosen": -330.7126770019531, "logps/rejected": -630.2939453125, "loss": 0.7819, "rewards/accuracies": 1.0, "rewards/chosen": 0.134857177734375, "rewards/margins": 0.5481231808662415, "rewards/rejected": -0.41326600313186646, "step": 59 }, { "epoch": 0.08, "learning_rate": 9.944739353007342e-08, "logits/chosen": -3.1256113052368164, "logits/rejected": -2.9480977058410645, "logps/chosen": -279.75677490234375, "logps/rejected": -1057.55810546875, "loss": 0.7373, "rewards/accuracies": 1.0, "rewards/chosen": 0.137950137257576, "rewards/margins": 0.978291392326355, "rewards/rejected": -0.8403412103652954, "step": 60 }, { "epoch": 0.08, "learning_rate": 9.941632747426128e-08, "logits/chosen": -3.201542615890503, "logits/rejected": -3.132146120071411, "logps/chosen": -318.45147705078125, "logps/rejected": -693.7222290039062, "loss": 0.7273, "rewards/accuracies": 1.0, "rewards/chosen": 0.15223082900047302, "rewards/margins": 0.617266833782196, "rewards/rejected": -0.465036004781723, "step": 61 }, { "epoch": 0.08, "learning_rate": 9.938441702975688e-08, "logits/chosen": -3.114938259124756, "logits/rejected": -2.9603705406188965, "logps/chosen": -327.34173583984375, "logps/rejected": -1001.5906982421875, "loss": 0.7469, "rewards/accuracies": 1.0, "rewards/chosen": 0.05615692213177681, "rewards/margins": 0.8922195434570312, "rewards/rejected": -0.8360626101493835, "step": 62 }, { "epoch": 0.08, "learning_rate": 9.93516627418217e-08, "logits/chosen": -3.1806511878967285, "logits/rejected": -3.0861804485321045, "logps/chosen": -311.4403381347656, "logps/rejected": -836.93017578125, "loss": 0.7781, "rewards/accuracies": 1.0, "rewards/chosen": 0.04831695556640625, "rewards/margins": 1.0133925676345825, "rewards/rejected": -0.965075671672821, "step": 63 }, { "epoch": 0.08, "learning_rate": 9.931806517013611e-08, "logits/chosen": -3.169240951538086, "logits/rejected": -2.929600715637207, "logps/chosen": -352.5500183105469, "logps/rejected": -1527.708984375, "loss": 0.588, "rewards/accuracies": 1.0, "rewards/chosen": 0.109649658203125, "rewards/margins": 1.6402220726013184, "rewards/rejected": -1.530572533607483, "step": 64 }, { "epoch": 0.08, "learning_rate": 9.928362488878995e-08, "logits/chosen": -3.144212484359741, "logits/rejected": -3.025667428970337, "logps/chosen": -335.56805419921875, "logps/rejected": -443.33587646484375, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.03352661058306694, "rewards/margins": 0.37225037813186646, "rewards/rejected": -0.3387237787246704, "step": 65 }, { "epoch": 0.08, "learning_rate": 9.924834248627259e-08, "logits/chosen": -3.1753573417663574, "logits/rejected": -2.9522151947021484, "logps/chosen": -338.23028564453125, "logps/rejected": -1572.392333984375, "loss": 0.7683, "rewards/accuracies": 1.0, "rewards/chosen": 0.156178280711174, "rewards/margins": 1.8543715476989746, "rewards/rejected": -1.6981933116912842, "step": 66 }, { "epoch": 0.09, "learning_rate": 9.921221856546293e-08, "logits/chosen": -3.1570985317230225, "logits/rejected": -3.0882997512817383, "logps/chosen": -271.7112731933594, "logps/rejected": -633.765625, "loss": 0.7553, "rewards/accuracies": 1.0, "rewards/chosen": 0.10711364448070526, "rewards/margins": 0.5735381841659546, "rewards/rejected": -0.4664245843887329, "step": 67 }, { "epoch": 0.09, "learning_rate": 9.917525374361911e-08, "logits/chosen": -3.23593807220459, "logits/rejected": -2.989205837249756, "logps/chosen": -327.2955322265625, "logps/rejected": -1355.3602294921875, "loss": 0.7278, "rewards/accuracies": 1.0, "rewards/chosen": 0.12084655463695526, "rewards/margins": 1.485453724861145, "rewards/rejected": -1.3646072149276733, "step": 68 }, { "epoch": 0.09, "learning_rate": 9.913744865236797e-08, "logits/chosen": -3.2211899757385254, "logits/rejected": -3.033085584640503, "logps/chosen": -314.1671447753906, "logps/rejected": -570.3936157226562, "loss": 0.747, "rewards/accuracies": 1.0, "rewards/chosen": 0.10247192531824112, "rewards/margins": 0.3731094300746918, "rewards/rejected": -0.27063751220703125, "step": 69 }, { "epoch": 0.09, "learning_rate": 9.909880393769419e-08, "logits/chosen": -3.1713225841522217, "logits/rejected": -3.1056480407714844, "logps/chosen": -300.5295104980469, "logps/rejected": -555.9124755859375, "loss": 0.724, "rewards/accuracies": 1.0, "rewards/chosen": 0.20963135361671448, "rewards/margins": 0.6884399652481079, "rewards/rejected": -0.47880861163139343, "step": 70 }, { "epoch": 0.09, "learning_rate": 9.905932025992931e-08, "logits/chosen": -3.1941139698028564, "logits/rejected": -3.115065574645996, "logps/chosen": -267.07379150390625, "logps/rejected": -1074.5709228515625, "loss": 0.6257, "rewards/accuracies": 1.0, "rewards/chosen": 0.14247971773147583, "rewards/margins": 1.1177451610565186, "rewards/rejected": -0.9752655029296875, "step": 71 }, { "epoch": 0.09, "learning_rate": 9.901899829374047e-08, "logits/chosen": -3.1491024494171143, "logits/rejected": -2.911647319793701, "logps/chosen": -290.2532958984375, "logps/rejected": -995.76904296875, "loss": 0.6296, "rewards/accuracies": 1.0, "rewards/chosen": 0.21207427978515625, "rewards/margins": 1.16395103931427, "rewards/rejected": -0.9518768191337585, "step": 72 }, { "epoch": 0.09, "learning_rate": 9.89778387281188e-08, "logits/chosen": -3.1779696941375732, "logits/rejected": -3.0134782791137695, "logps/chosen": -284.9223327636719, "logps/rejected": -1673.2235107421875, "loss": 0.5945, "rewards/accuracies": 1.0, "rewards/chosen": 0.10809020698070526, "rewards/margins": 1.9634613990783691, "rewards/rejected": -1.8553712368011475, "step": 73 }, { "epoch": 0.09, "learning_rate": 9.893584226636772e-08, "logits/chosen": -3.196732997894287, "logits/rejected": -3.048961639404297, "logps/chosen": -279.7977600097656, "logps/rejected": -1038.031494140625, "loss": 0.6066, "rewards/accuracies": 1.0, "rewards/chosen": 0.19439086318016052, "rewards/margins": 1.152746558189392, "rewards/rejected": -0.958355724811554, "step": 74 }, { "epoch": 0.1, "learning_rate": 9.889300962609089e-08, "logits/chosen": -3.1647043228149414, "logits/rejected": -3.1162519454956055, "logps/chosen": -305.32049560546875, "logps/rejected": -841.71923828125, "loss": 0.5938, "rewards/accuracies": 1.0, "rewards/chosen": 0.05420837551355362, "rewards/margins": 0.7930267453193665, "rewards/rejected": -0.7388184070587158, "step": 75 }, { "epoch": 0.1, "learning_rate": 9.884934153917997e-08, "logits/chosen": -3.201965808868408, "logits/rejected": -3.036243438720703, "logps/chosen": -314.16766357421875, "logps/rejected": -1415.630615234375, "loss": 0.5628, "rewards/accuracies": 1.0, "rewards/chosen": 0.1571395993232727, "rewards/margins": 1.8212509155273438, "rewards/rejected": -1.6641113758087158, "step": 76 }, { "epoch": 0.1, "learning_rate": 9.880483875180204e-08, "logits/chosen": -3.1953911781311035, "logits/rejected": -3.132089376449585, "logps/chosen": -279.3058776855469, "logps/rejected": -1149.112548828125, "loss": 0.6542, "rewards/accuracies": 1.0, "rewards/chosen": 0.13473740220069885, "rewards/margins": 1.4993095397949219, "rewards/rejected": -1.3645721673965454, "step": 77 }, { "epoch": 0.1, "learning_rate": 9.875950202438699e-08, "logits/chosen": -3.223398447036743, "logits/rejected": -3.097919464111328, "logps/chosen": -299.7601013183594, "logps/rejected": -1126.1875, "loss": 0.5831, "rewards/accuracies": 1.0, "rewards/chosen": 0.08757781982421875, "rewards/margins": 1.2484420537948608, "rewards/rejected": -1.160864233970642, "step": 78 }, { "epoch": 0.1, "learning_rate": 9.871333213161437e-08, "logits/chosen": -3.1679320335388184, "logits/rejected": -3.059769868850708, "logps/chosen": -264.73858642578125, "logps/rejected": -782.2879638671875, "loss": 0.7183, "rewards/accuracies": 1.0, "rewards/chosen": 0.22988128662109375, "rewards/margins": 1.0487396717071533, "rewards/rejected": -0.8188583254814148, "step": 79 }, { "epoch": 0.1, "learning_rate": 9.866632986240029e-08, "logits/chosen": -3.1169557571411133, "logits/rejected": -3.0497756004333496, "logps/chosen": -299.4889831542969, "logps/rejected": -577.6195678710938, "loss": 0.6353, "rewards/accuracies": 1.0, "rewards/chosen": 0.17203064262866974, "rewards/margins": 0.65203857421875, "rewards/rejected": -0.48000794649124146, "step": 80 }, { "epoch": 0.1, "learning_rate": 9.861849601988382e-08, "logits/chosen": -3.2003068923950195, "logits/rejected": -3.055177688598633, "logps/chosen": -297.44677734375, "logps/rejected": -417.418212890625, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 0.203288272023201, "rewards/margins": 0.5406494140625, "rewards/rejected": -0.3373611569404602, "step": 81 }, { "epoch": 0.1, "learning_rate": 9.856983142141337e-08, "logits/chosen": -3.201425552368164, "logits/rejected": -3.1356821060180664, "logps/chosen": -291.04278564453125, "logps/rejected": -545.7918701171875, "loss": 0.5733, "rewards/accuracies": 1.0, "rewards/chosen": 0.12079392373561859, "rewards/margins": 0.5997284054756165, "rewards/rejected": -0.47893446683883667, "step": 82 }, { "epoch": 0.11, "learning_rate": 9.852033689853267e-08, "logits/chosen": -3.139970541000366, "logits/rejected": -3.0880112648010254, "logps/chosen": -300.56787109375, "logps/rejected": -798.6337890625, "loss": 0.5741, "rewards/accuracies": 1.0, "rewards/chosen": 0.14684906601905823, "rewards/margins": 0.9613418579101562, "rewards/rejected": -0.8144928216934204, "step": 83 }, { "epoch": 0.11, "learning_rate": 9.847001329696651e-08, "logits/chosen": -3.1874310970306396, "logits/rejected": -2.9923434257507324, "logps/chosen": -266.3863525390625, "logps/rejected": -551.790283203125, "loss": 0.6482, "rewards/accuracies": 1.0, "rewards/chosen": 0.133320614695549, "rewards/margins": 0.6430771350860596, "rewards/rejected": -0.5097565054893494, "step": 84 }, { "epoch": 0.11, "learning_rate": 9.841886147660644e-08, "logits/chosen": -3.1477699279785156, "logits/rejected": -2.9644293785095215, "logps/chosen": -311.7578430175781, "logps/rejected": -1515.637451171875, "loss": 0.623, "rewards/accuracies": 1.0, "rewards/chosen": 0.16172486543655396, "rewards/margins": 2.090118408203125, "rewards/rejected": -1.9283936023712158, "step": 85 }, { "epoch": 0.11, "learning_rate": 9.836688231149591e-08, "logits/chosen": -3.20509934425354, "logits/rejected": -3.0798838138580322, "logps/chosen": -300.50360107421875, "logps/rejected": -984.5396728515625, "loss": 0.6046, "rewards/accuracies": 1.0, "rewards/chosen": 0.11804504692554474, "rewards/margins": 1.2045868635177612, "rewards/rejected": -1.086541771888733, "step": 86 }, { "epoch": 0.11, "learning_rate": 9.831407668981545e-08, "logits/chosen": -3.1603403091430664, "logits/rejected": -3.049039363861084, "logps/chosen": -260.6101989746094, "logps/rejected": -1007.2135009765625, "loss": 0.6366, "rewards/accuracies": 1.0, "rewards/chosen": 0.14610671997070312, "rewards/margins": 1.1530098915100098, "rewards/rejected": -1.0069031715393066, "step": 87 }, { "epoch": 0.11, "learning_rate": 9.826044551386743e-08, "logits/chosen": -3.148299217224121, "logits/rejected": -3.035773992538452, "logps/chosen": -334.4939270019531, "logps/rejected": -884.3077392578125, "loss": 0.6635, "rewards/accuracies": 1.0, "rewards/chosen": 0.08952179551124573, "rewards/margins": 1.0365997552871704, "rewards/rejected": -0.9470779299736023, "step": 88 }, { "epoch": 0.11, "learning_rate": 9.820598970006067e-08, "logits/chosen": -3.1548469066619873, "logits/rejected": -3.0570640563964844, "logps/chosen": -313.880126953125, "logps/rejected": -314.97747802734375, "loss": 0.6403, "rewards/accuracies": 1.0, "rewards/chosen": 0.21246491372585297, "rewards/margins": 0.43617862462997437, "rewards/rejected": -0.223713681101799, "step": 89 }, { "epoch": 0.11, "learning_rate": 9.81507101788948e-08, "logits/chosen": -3.1939706802368164, "logits/rejected": -3.1054234504699707, "logps/chosen": -285.1236877441406, "logps/rejected": -655.6411743164062, "loss": 0.6378, "rewards/accuracies": 1.0, "rewards/chosen": 0.13791199028491974, "rewards/margins": 0.8990112543106079, "rewards/rejected": -0.7610992193222046, "step": 90 }, { "epoch": 0.12, "learning_rate": 9.80946078949443e-08, "logits/chosen": -3.1734585762023926, "logits/rejected": -3.038327217102051, "logps/chosen": -265.0036926269531, "logps/rejected": -1314.807373046875, "loss": 0.5709, "rewards/accuracies": 1.0, "rewards/chosen": 0.12732239067554474, "rewards/margins": 2.0808136463165283, "rewards/rejected": -1.9534912109375, "step": 91 }, { "epoch": 0.12, "learning_rate": 9.803768380684241e-08, "logits/chosen": -3.236311435699463, "logits/rejected": -3.07564640045166, "logps/chosen": -281.0992431640625, "logps/rejected": -994.37109375, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.17062988877296448, "rewards/margins": 1.435632348060608, "rewards/rejected": -1.2650024890899658, "step": 92 }, { "epoch": 0.12, "learning_rate": 9.797993888726472e-08, "logits/chosen": -3.1539576053619385, "logits/rejected": -2.9736130237579346, "logps/chosen": -349.68011474609375, "logps/rejected": -1679.7442626953125, "loss": 0.5686, "rewards/accuracies": 1.0, "rewards/chosen": 0.14419251680374146, "rewards/margins": 1.9925811290740967, "rewards/rejected": -1.848388671875, "step": 93 }, { "epoch": 0.12, "learning_rate": 9.792137412291263e-08, "logits/chosen": -3.183964729309082, "logits/rejected": -3.062598705291748, "logps/chosen": -279.9299621582031, "logps/rejected": -321.3896484375, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.19986876845359802, "rewards/margins": 0.4040283262729645, "rewards/rejected": -0.20415955781936646, "step": 94 }, { "epoch": 0.12, "learning_rate": 9.786199051449635e-08, "logits/chosen": -3.19346284866333, "logits/rejected": -3.089428186416626, "logps/chosen": -294.04449462890625, "logps/rejected": -531.293701171875, "loss": 0.5432, "rewards/accuracies": 1.0, "rewards/chosen": 0.10500641167163849, "rewards/margins": 0.7067612409591675, "rewards/rejected": -0.6017547845840454, "step": 95 }, { "epoch": 0.12, "learning_rate": 9.780178907671787e-08, "logits/chosen": -3.1862735748291016, "logits/rejected": -3.083137035369873, "logps/chosen": -297.15606689453125, "logps/rejected": -705.2899169921875, "loss": 0.6458, "rewards/accuracies": 1.0, "rewards/chosen": 0.19854736328125, "rewards/margins": 1.099267601966858, "rewards/rejected": -0.9007202386856079, "step": 96 }, { "epoch": 0.12, "learning_rate": 9.774077083825372e-08, "logits/chosen": -3.1301450729370117, "logits/rejected": -2.9289908409118652, "logps/chosen": -308.36907958984375, "logps/rejected": -749.8486328125, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.22395172715187073, "rewards/margins": 1.034918189048767, "rewards/rejected": -0.8109664916992188, "step": 97 }, { "epoch": 0.12, "learning_rate": 9.767893684173721e-08, "logits/chosen": -3.1105282306671143, "logits/rejected": -3.012983560562134, "logps/chosen": -350.5636901855469, "logps/rejected": -530.8416137695312, "loss": 0.6034, "rewards/accuracies": 1.0, "rewards/chosen": 0.22839051485061646, "rewards/margins": 0.8692840337753296, "rewards/rejected": -0.6408935785293579, "step": 98 }, { "epoch": 0.13, "learning_rate": 9.761628814374073e-08, "logits/chosen": -3.224022388458252, "logits/rejected": -3.0747718811035156, "logps/chosen": -300.6407165527344, "logps/rejected": -634.3179931640625, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.13871917128562927, "rewards/margins": 0.9531967043876648, "rewards/rejected": -0.8144775629043579, "step": 99 }, { "epoch": 0.13, "learning_rate": 9.755282581475768e-08, "logits/chosen": -3.209427833557129, "logits/rejected": -3.1925430297851562, "logps/chosen": -274.5077209472656, "logps/rejected": -830.9178466796875, "loss": 0.5581, "rewards/accuracies": 1.0, "rewards/chosen": 0.23591232299804688, "rewards/margins": 1.3630669116973877, "rewards/rejected": -1.1271545886993408, "step": 100 }, { "epoch": 0.13, "learning_rate": 9.748855093918415e-08, "logits/chosen": -3.1754627227783203, "logits/rejected": -3.12052583694458, "logps/chosen": -306.0105285644531, "logps/rejected": -829.791748046875, "loss": 0.6161, "rewards/accuracies": 1.0, "rewards/chosen": 0.19002380967140198, "rewards/margins": 1.3093750476837158, "rewards/rejected": -1.1193511486053467, "step": 101 }, { "epoch": 0.13, "learning_rate": 9.742346461530047e-08, "logits/chosen": -3.2314319610595703, "logits/rejected": -3.1260786056518555, "logps/chosen": -334.7138977050781, "logps/rejected": -730.204833984375, "loss": 0.6192, "rewards/accuracies": 1.0, "rewards/chosen": 0.15774232149124146, "rewards/margins": 1.154144287109375, "rewards/rejected": -0.9964019656181335, "step": 102 }, { "epoch": 0.13, "learning_rate": 9.73575679552523e-08, "logits/chosen": -3.217122793197632, "logits/rejected": -3.1138224601745605, "logps/chosen": -302.53228759765625, "logps/rejected": -1306.1474609375, "loss": 0.637, "rewards/accuracies": 1.0, "rewards/chosen": 0.17429198324680328, "rewards/margins": 2.1529784202575684, "rewards/rejected": -1.9786865711212158, "step": 103 }, { "epoch": 0.13, "learning_rate": 9.729086208503173e-08, "logits/chosen": -3.1760613918304443, "logits/rejected": -2.9852185249328613, "logps/chosen": -306.879150390625, "logps/rejected": -855.8590698242188, "loss": 0.5696, "rewards/accuracies": 1.0, "rewards/chosen": 0.2941543459892273, "rewards/margins": 1.4850770235061646, "rewards/rejected": -1.1909226179122925, "step": 104 }, { "epoch": 0.13, "learning_rate": 9.722334814445807e-08, "logits/chosen": -3.1452910900115967, "logits/rejected": -3.040231227874756, "logps/chosen": -364.91839599609375, "logps/rejected": -1172.751220703125, "loss": 0.6195, "rewards/accuracies": 1.0, "rewards/chosen": 0.1998031735420227, "rewards/margins": 1.8836883306503296, "rewards/rejected": -1.6838852167129517, "step": 105 }, { "epoch": 0.14, "learning_rate": 9.715502728715826e-08, "logits/chosen": -3.1303720474243164, "logits/rejected": -3.09177303314209, "logps/chosen": -302.2375793457031, "logps/rejected": -644.7623291015625, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.23871764540672302, "rewards/margins": 1.0760986804962158, "rewards/rejected": -0.8373810052871704, "step": 106 }, { "epoch": 0.14, "learning_rate": 9.708590068054727e-08, "logits/chosen": -3.2218027114868164, "logits/rejected": -3.151235818862915, "logps/chosen": -324.8514709472656, "logps/rejected": -1079.48828125, "loss": 0.5582, "rewards/accuracies": 1.0, "rewards/chosen": 0.27400970458984375, "rewards/margins": 2.096086025238037, "rewards/rejected": -1.822076439857483, "step": 107 }, { "epoch": 0.14, "learning_rate": 9.701596950580806e-08, "logits/chosen": -3.1836509704589844, "logits/rejected": -3.025966167449951, "logps/chosen": -271.3963623046875, "logps/rejected": -1566.9482421875, "loss": 0.5021, "rewards/accuracies": 1.0, "rewards/chosen": 0.19192656874656677, "rewards/margins": 2.399531602859497, "rewards/rejected": -2.2076048851013184, "step": 108 }, { "epoch": 0.14, "learning_rate": 9.694523495787148e-08, "logits/chosen": -3.160971164703369, "logits/rejected": -3.111281156539917, "logps/chosen": -311.0094909667969, "logps/rejected": -572.3988647460938, "loss": 0.5866, "rewards/accuracies": 1.0, "rewards/chosen": 0.211863711476326, "rewards/margins": 0.9914063215255737, "rewards/rejected": -0.7795425653457642, "step": 109 }, { "epoch": 0.14, "learning_rate": 9.687369824539577e-08, "logits/chosen": -3.1838698387145996, "logits/rejected": -3.123863697052002, "logps/chosen": -293.73468017578125, "logps/rejected": -789.507568359375, "loss": 0.6077, "rewards/accuracies": 1.0, "rewards/chosen": 0.13229981064796448, "rewards/margins": 1.2912139892578125, "rewards/rejected": -1.1589142084121704, "step": 110 }, { "epoch": 0.14, "learning_rate": 9.680136059074597e-08, "logits/chosen": -3.1897339820861816, "logits/rejected": -3.1208338737487793, "logps/chosen": -316.12689208984375, "logps/rejected": -490.99505615234375, "loss": 0.5171, "rewards/accuracies": 1.0, "rewards/chosen": 0.12238617241382599, "rewards/margins": 0.7528289556503296, "rewards/rejected": -0.6304428577423096, "step": 111 }, { "epoch": 0.14, "learning_rate": 9.672822322997304e-08, "logits/chosen": -3.1391255855560303, "logits/rejected": -3.100623369216919, "logps/chosen": -296.51116943359375, "logps/rejected": -1353.2978515625, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 0.218760684132576, "rewards/margins": 2.0154526233673096, "rewards/rejected": -1.79669189453125, "step": 112 }, { "epoch": 0.14, "learning_rate": 9.665428741279266e-08, "logits/chosen": -3.163559913635254, "logits/rejected": -3.03309965133667, "logps/chosen": -281.498291015625, "logps/rejected": -1149.287353515625, "loss": 0.4199, "rewards/accuracies": 1.0, "rewards/chosen": 0.18170395493507385, "rewards/margins": 1.9521689414978027, "rewards/rejected": -1.7704651355743408, "step": 113 }, { "epoch": 0.15, "learning_rate": 9.657955440256394e-08, "logits/chosen": -3.13822865486145, "logits/rejected": -3.065958023071289, "logps/chosen": -321.07684326171875, "logps/rejected": -436.6490478515625, "loss": 0.6958, "rewards/accuracies": 1.0, "rewards/chosen": 0.2741958796977997, "rewards/margins": 0.8569122552871704, "rewards/rejected": -0.5827164053916931, "step": 114 }, { "epoch": 0.15, "learning_rate": 9.650402547626786e-08, "logits/chosen": -3.1538047790527344, "logits/rejected": -3.0229949951171875, "logps/chosen": -323.47210693359375, "logps/rejected": -1077.6041259765625, "loss": 0.555, "rewards/accuracies": 1.0, "rewards/chosen": 0.3337051570415497, "rewards/margins": 1.6813675165176392, "rewards/rejected": -1.3476624488830566, "step": 115 }, { "epoch": 0.15, "learning_rate": 9.642770192448534e-08, "logits/chosen": -3.191667079925537, "logits/rejected": -3.065495014190674, "logps/chosen": -309.8695983886719, "logps/rejected": -1002.7518920898438, "loss": 0.6229, "rewards/accuracies": 1.0, "rewards/chosen": 0.12017059326171875, "rewards/margins": 1.547227382659912, "rewards/rejected": -1.427056908607483, "step": 116 }, { "epoch": 0.15, "learning_rate": 9.635058505137534e-08, "logits/chosen": -3.1632161140441895, "logits/rejected": -3.121382713317871, "logps/chosen": -296.9832763671875, "logps/rejected": -624.9923095703125, "loss": 0.5451, "rewards/accuracies": 1.0, "rewards/chosen": 0.22241058945655823, "rewards/margins": 1.2145600318908691, "rewards/rejected": -0.9921493530273438, "step": 117 }, { "epoch": 0.15, "learning_rate": 9.627267617465242e-08, "logits/chosen": -3.1945154666900635, "logits/rejected": -3.117353916168213, "logps/chosen": -369.95306396484375, "logps/rejected": -936.5, "loss": 0.5972, "rewards/accuracies": 1.0, "rewards/chosen": 0.184794619679451, "rewards/margins": 1.640141248703003, "rewards/rejected": -1.455346703529358, "step": 118 }, { "epoch": 0.15, "learning_rate": 9.619397662556434e-08, "logits/chosen": -3.2927639484405518, "logits/rejected": -3.0766711235046387, "logps/chosen": -306.3031005859375, "logps/rejected": -445.4729919433594, "loss": 0.5393, "rewards/accuracies": 1.0, "rewards/chosen": 0.24399414658546448, "rewards/margins": 0.7625488042831421, "rewards/rejected": -0.5185546875, "step": 119 }, { "epoch": 0.15, "learning_rate": 9.611448774886923e-08, "logits/chosen": -3.2101049423217773, "logits/rejected": -3.184274435043335, "logps/chosen": -338.61181640625, "logps/rejected": -892.9679565429688, "loss": 0.5307, "rewards/accuracies": 1.0, "rewards/chosen": 0.2676956355571747, "rewards/margins": 1.6261703968048096, "rewards/rejected": -1.3584747314453125, "step": 120 }, { "epoch": 0.15, "learning_rate": 9.603421090281269e-08, "logits/chosen": -3.1711249351501465, "logits/rejected": -2.958958148956299, "logps/chosen": -297.0474853515625, "logps/rejected": -3400.0693359375, "loss": 0.5306, "rewards/accuracies": 1.0, "rewards/chosen": 0.24140778183937073, "rewards/margins": 5.868666648864746, "rewards/rejected": -5.62725830078125, "step": 121 }, { "epoch": 0.16, "learning_rate": 9.595314745910454e-08, "logits/chosen": -3.2135682106018066, "logits/rejected": -3.1208643913269043, "logps/chosen": -310.07666015625, "logps/rejected": -850.7954711914062, "loss": 0.4822, "rewards/accuracies": 1.0, "rewards/chosen": 0.23880615830421448, "rewards/margins": 1.4460572004318237, "rewards/rejected": -1.2072510719299316, "step": 122 }, { "epoch": 0.16, "learning_rate": 9.587129880289538e-08, "logits/chosen": -3.2396786212921143, "logits/rejected": -3.140011787414551, "logps/chosen": -293.2325439453125, "logps/rejected": -579.173828125, "loss": 0.5851, "rewards/accuracies": 1.0, "rewards/chosen": 0.30014342069625854, "rewards/margins": 1.1962554454803467, "rewards/rejected": -0.8961120843887329, "step": 123 }, { "epoch": 0.16, "learning_rate": 9.578866633275286e-08, "logits/chosen": -3.214778184890747, "logits/rejected": -2.8767876625061035, "logps/chosen": -292.50274658203125, "logps/rejected": -2328.9296875, "loss": 0.455, "rewards/accuracies": 1.0, "rewards/chosen": 0.2836258113384247, "rewards/margins": 4.076545715332031, "rewards/rejected": -3.7929201126098633, "step": 124 }, { "epoch": 0.16, "learning_rate": 9.570525146063798e-08, "logits/chosen": -3.1385247707366943, "logits/rejected": -3.1087429523468018, "logps/chosen": -316.48748779296875, "logps/rejected": -280.95550537109375, "loss": 0.5436, "rewards/accuracies": 1.0, "rewards/chosen": 0.26283419132232666, "rewards/margins": 0.6209030151367188, "rewards/rejected": -0.3580688536167145, "step": 125 }, { "epoch": 0.16, "learning_rate": 9.562105561188067e-08, "logits/chosen": -3.1671807765960693, "logits/rejected": -2.951056957244873, "logps/chosen": -326.326171875, "logps/rejected": -913.9244384765625, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 0.1945037841796875, "rewards/margins": 1.4417998790740967, "rewards/rejected": -1.2472960948944092, "step": 126 }, { "epoch": 0.16, "learning_rate": 9.553608022515576e-08, "logits/chosen": -3.196260452270508, "logits/rejected": -3.119661808013916, "logps/chosen": -304.29595947265625, "logps/rejected": -568.8287353515625, "loss": 0.5696, "rewards/accuracies": 1.0, "rewards/chosen": 0.3104797601699829, "rewards/margins": 1.081243872642517, "rewards/rejected": -0.770764172077179, "step": 127 }, { "epoch": 0.16, "learning_rate": 9.545032675245812e-08, "logits/chosen": -3.219020366668701, "logits/rejected": -3.048574924468994, "logps/chosen": -301.6912841796875, "logps/rejected": -307.72125244140625, "loss": 0.7076, "rewards/accuracies": 1.0, "rewards/chosen": 0.3106124997138977, "rewards/margins": 0.6999374628067017, "rewards/rejected": -0.38932496309280396, "step": 128 }, { "epoch": 0.16, "learning_rate": 9.536379665907798e-08, "logits/chosen": -3.240649938583374, "logits/rejected": -3.142881393432617, "logps/chosen": -294.18316650390625, "logps/rejected": -1003.5775756835938, "loss": 0.5256, "rewards/accuracies": 1.0, "rewards/chosen": 0.27088165283203125, "rewards/margins": 2.0299394130706787, "rewards/rejected": -1.7590577602386475, "step": 129 }, { "epoch": 0.17, "learning_rate": 9.527649142357594e-08, "logits/chosen": -3.165945529937744, "logits/rejected": -3.0238585472106934, "logps/chosen": -372.13360595703125, "logps/rejected": -334.7425231933594, "loss": 0.586, "rewards/accuracies": 1.0, "rewards/chosen": 0.13617096841335297, "rewards/margins": 0.6197845935821533, "rewards/rejected": -0.4836135804653168, "step": 130 }, { "epoch": 0.17, "learning_rate": 9.518841253775753e-08, "logits/chosen": -3.224423408508301, "logits/rejected": -3.1188879013061523, "logps/chosen": -279.64990234375, "logps/rejected": -1282.41162109375, "loss": 0.5799, "rewards/accuracies": 1.0, "rewards/chosen": 0.24613571166992188, "rewards/margins": 1.8503745794296265, "rewards/rejected": -1.6042388677597046, "step": 131 }, { "epoch": 0.17, "learning_rate": 9.509956150664795e-08, "logits/chosen": -3.111767053604126, "logits/rejected": -3.015601634979248, "logps/chosen": -337.3946838378906, "logps/rejected": -865.012451171875, "loss": 0.5399, "rewards/accuracies": 1.0, "rewards/chosen": 0.3085418939590454, "rewards/margins": 1.4745299816131592, "rewards/rejected": -1.1659882068634033, "step": 132 }, { "epoch": 0.17, "learning_rate": 9.500993984846612e-08, "logits/chosen": -3.147463083267212, "logits/rejected": -3.080206871032715, "logps/chosen": -279.31011962890625, "logps/rejected": -967.1348876953125, "loss": 0.4451, "rewards/accuracies": 1.0, "rewards/chosen": 0.3432357907295227, "rewards/margins": 2.146092414855957, "rewards/rejected": -1.8028564453125, "step": 133 }, { "epoch": 0.17, "learning_rate": 9.491954909459894e-08, "logits/chosen": -3.146892547607422, "logits/rejected": -3.0112736225128174, "logps/chosen": -282.56024169921875, "logps/rejected": -878.54931640625, "loss": 0.511, "rewards/accuracies": 1.0, "rewards/chosen": 0.2342277467250824, "rewards/margins": 1.4850761890411377, "rewards/rejected": -1.250848412513733, "step": 134 }, { "epoch": 0.17, "learning_rate": 9.482839078957499e-08, "logits/chosen": -3.1611664295196533, "logits/rejected": -2.9647622108459473, "logps/chosen": -305.19915771484375, "logps/rejected": -1410.558349609375, "loss": 0.5353, "rewards/accuracies": 1.0, "rewards/chosen": 0.3275604248046875, "rewards/margins": 3.13089919090271, "rewards/rejected": -2.8033387660980225, "step": 135 }, { "epoch": 0.17, "learning_rate": 9.473646649103817e-08, "logits/chosen": -3.129085063934326, "logits/rejected": -2.94246244430542, "logps/chosen": -389.33819580078125, "logps/rejected": -1356.5257568359375, "loss": 0.5445, "rewards/accuracies": 1.0, "rewards/chosen": 0.2980636656284332, "rewards/margins": 3.031716823577881, "rewards/rejected": -2.7336533069610596, "step": 136 }, { "epoch": 0.17, "learning_rate": 9.464377776972114e-08, "logits/chosen": -3.1956686973571777, "logits/rejected": -3.0553667545318604, "logps/chosen": -322.5648498535156, "logps/rejected": -375.11334228515625, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/chosen": 0.25546568632125854, "rewards/margins": 0.723339855670929, "rewards/rejected": -0.4678741693496704, "step": 137 }, { "epoch": 0.18, "learning_rate": 9.455032620941839e-08, "logits/chosen": -3.1898679733276367, "logits/rejected": -3.0511648654937744, "logps/chosen": -303.6807861328125, "logps/rejected": -391.760498046875, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 0.2574142515659332, "rewards/margins": 0.8870590925216675, "rewards/rejected": -0.6296447515487671, "step": 138 }, { "epoch": 0.18, "learning_rate": 9.445611340695925e-08, "logits/chosen": -3.1977014541625977, "logits/rejected": -3.027592182159424, "logps/chosen": -307.8069763183594, "logps/rejected": -599.1807861328125, "loss": 0.5838, "rewards/accuracies": 1.0, "rewards/chosen": 0.244709774851799, "rewards/margins": 1.2436859607696533, "rewards/rejected": -0.9989761114120483, "step": 139 }, { "epoch": 0.18, "learning_rate": 9.436114097218058e-08, "logits/chosen": -3.149136543273926, "logits/rejected": -3.0685458183288574, "logps/chosen": -325.85845947265625, "logps/rejected": -400.5986328125, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": 0.3126571774482727, "rewards/margins": 0.7819442749023438, "rewards/rejected": -0.46928709745407104, "step": 140 }, { "epoch": 0.18, "learning_rate": 9.426541052789925e-08, "logits/chosen": -3.201765775680542, "logits/rejected": -2.8847641944885254, "logps/chosen": -315.51947021484375, "logps/rejected": -3692.5771484375, "loss": 0.4764, "rewards/accuracies": 1.0, "rewards/chosen": 0.283529669046402, "rewards/margins": 6.836874485015869, "rewards/rejected": -6.5533447265625, "step": 141 }, { "epoch": 0.18, "learning_rate": 9.416892370988443e-08, "logits/chosen": -3.199918746948242, "logits/rejected": -2.969221830368042, "logps/chosen": -321.73040771484375, "logps/rejected": -909.3025512695312, "loss": 0.4877, "rewards/accuracies": 1.0, "rewards/chosen": 0.22735747694969177, "rewards/margins": 1.762689232826233, "rewards/rejected": -1.5353318452835083, "step": 142 }, { "epoch": 0.18, "learning_rate": 9.40716821668296e-08, "logits/chosen": -3.2314085960388184, "logits/rejected": -2.9572577476501465, "logps/chosen": -321.66815185546875, "logps/rejected": -1972.026123046875, "loss": 0.5281, "rewards/accuracies": 1.0, "rewards/chosen": 0.2714691162109375, "rewards/margins": 4.522293567657471, "rewards/rejected": -4.250824451446533, "step": 143 }, { "epoch": 0.18, "learning_rate": 9.397368756032444e-08, "logits/chosen": -3.273019313812256, "logits/rejected": -3.0089969635009766, "logps/chosen": -286.8343200683594, "logps/rejected": -1743.9630126953125, "loss": 0.4892, "rewards/accuracies": 1.0, "rewards/chosen": 0.29644012451171875, "rewards/margins": 3.5674548149108887, "rewards/rejected": -3.271014451980591, "step": 144 }, { "epoch": 0.18, "learning_rate": 9.387494156482642e-08, "logits/chosen": -3.1706767082214355, "logits/rejected": -3.072216033935547, "logps/chosen": -298.6855163574219, "logps/rejected": -1044.80859375, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 0.24984130263328552, "rewards/margins": 2.070675849914551, "rewards/rejected": -1.8208343982696533, "step": 145 }, { "epoch": 0.19, "learning_rate": 9.377544586763214e-08, "logits/chosen": -3.1706104278564453, "logits/rejected": -3.1661696434020996, "logps/chosen": -305.09881591796875, "logps/rejected": -698.9486694335938, "loss": 0.534, "rewards/accuracies": 1.0, "rewards/chosen": 0.2414703369140625, "rewards/margins": 1.457427978515625, "rewards/rejected": -1.2159576416015625, "step": 146 }, { "epoch": 0.19, "learning_rate": 9.367520216884854e-08, "logits/chosen": -3.2229061126708984, "logits/rejected": -2.9811699390411377, "logps/chosen": -294.1118469238281, "logps/rejected": -1262.804931640625, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": 0.2642669677734375, "rewards/margins": 2.4951112270355225, "rewards/rejected": -2.230844259262085, "step": 147 }, { "epoch": 0.19, "learning_rate": 9.357421218136385e-08, "logits/chosen": -3.196449041366577, "logits/rejected": -3.1877706050872803, "logps/chosen": -300.8568115234375, "logps/rejected": -824.1300048828125, "loss": 0.4317, "rewards/accuracies": 1.0, "rewards/chosen": 0.2204177975654602, "rewards/margins": 1.5985641479492188, "rewards/rejected": -1.3781464099884033, "step": 148 }, { "epoch": 0.19, "learning_rate": 9.347247763081834e-08, "logits/chosen": -3.176239013671875, "logits/rejected": -3.0687332153320312, "logps/chosen": -295.97882080078125, "logps/rejected": -1355.37744140625, "loss": 0.5446, "rewards/accuracies": 1.0, "rewards/chosen": 0.30335313081741333, "rewards/margins": 2.844118595123291, "rewards/rejected": -2.5407652854919434, "step": 149 }, { "epoch": 0.19, "learning_rate": 9.337000025557476e-08, "logits/chosen": -3.174694061279297, "logits/rejected": -3.017245054244995, "logps/chosen": -262.70367431640625, "logps/rejected": -1036.4615478515625, "loss": 0.528, "rewards/accuracies": 1.0, "rewards/chosen": 0.27755051851272583, "rewards/margins": 2.1778342723846436, "rewards/rejected": -1.9002838134765625, "step": 150 }, { "epoch": 0.19, "learning_rate": 9.32667818066887e-08, "logits/chosen": -3.195706844329834, "logits/rejected": -3.0691373348236084, "logps/chosen": -337.6896057128906, "logps/rejected": -589.3656005859375, "loss": 0.5474, "rewards/accuracies": 1.0, "rewards/chosen": 0.35773926973342896, "rewards/margins": 1.3219032287597656, "rewards/rejected": -0.9641639590263367, "step": 151 }, { "epoch": 0.19, "learning_rate": 9.316282404787869e-08, "logits/chosen": -3.2151098251342773, "logits/rejected": -3.174314498901367, "logps/chosen": -285.521240234375, "logps/rejected": -814.7307739257812, "loss": 0.4718, "rewards/accuracies": 1.0, "rewards/chosen": 0.3309219479560852, "rewards/margins": 2.0363173484802246, "rewards/rejected": -1.7053954601287842, "step": 152 }, { "epoch": 0.2, "learning_rate": 9.305812875549598e-08, "logits/chosen": -3.2136082649230957, "logits/rejected": -2.9426255226135254, "logps/chosen": -293.48004150390625, "logps/rejected": -1581.8680419921875, "loss": 0.4788, "rewards/accuracies": 1.0, "rewards/chosen": 0.2660537660121918, "rewards/margins": 3.2243452072143555, "rewards/rejected": -2.958291530609131, "step": 153 }, { "epoch": 0.2, "learning_rate": 9.295269771849425e-08, "logits/chosen": -3.2112693786621094, "logits/rejected": -3.1239545345306396, "logps/chosen": -314.3002624511719, "logps/rejected": -601.9202270507812, "loss": 0.5235, "rewards/accuracies": 1.0, "rewards/chosen": 0.35342103242874146, "rewards/margins": 1.4561188220977783, "rewards/rejected": -1.1026978492736816, "step": 154 }, { "epoch": 0.2, "learning_rate": 9.284653273839905e-08, "logits/chosen": -3.1957523822784424, "logits/rejected": -2.9913055896759033, "logps/chosen": -329.4015808105469, "logps/rejected": -945.3348388671875, "loss": 0.5454, "rewards/accuracies": 1.0, "rewards/chosen": 0.17346802353858948, "rewards/margins": 1.8218811750411987, "rewards/rejected": -1.648413062095642, "step": 155 }, { "epoch": 0.2, "learning_rate": 9.273963562927694e-08, "logits/chosen": -3.177485466003418, "logits/rejected": -3.0401406288146973, "logps/chosen": -368.9881286621094, "logps/rejected": -579.5093994140625, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 0.3402862548828125, "rewards/margins": 1.3465545177459717, "rewards/rejected": -1.0062682628631592, "step": 156 }, { "epoch": 0.2, "learning_rate": 9.26320082177046e-08, "logits/chosen": -3.1276426315307617, "logits/rejected": -3.0681357383728027, "logps/chosen": -347.9696960449219, "logps/rejected": -455.0523681640625, "loss": 0.5587, "rewards/accuracies": 1.0, "rewards/chosen": 0.32302552461624146, "rewards/margins": 0.9264801144599915, "rewards/rejected": -0.60345458984375, "step": 157 }, { "epoch": 0.2, "learning_rate": 9.252365234273753e-08, "logits/chosen": -3.098644733428955, "logits/rejected": -3.0707664489746094, "logps/chosen": -334.91217041015625, "logps/rejected": -515.2086181640625, "loss": 0.5455, "rewards/accuracies": 1.0, "rewards/chosen": 0.3436111509799957, "rewards/margins": 1.011216640472412, "rewards/rejected": -0.6676055788993835, "step": 158 }, { "epoch": 0.2, "learning_rate": 9.241456985587868e-08, "logits/chosen": -3.1839492321014404, "logits/rejected": -3.0791797637939453, "logps/chosen": -273.38995361328125, "logps/rejected": -1007.9376831054688, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": 0.2555503845214844, "rewards/margins": 1.980074405670166, "rewards/rejected": -1.7245240211486816, "step": 159 }, { "epoch": 0.2, "learning_rate": 9.230476262104676e-08, "logits/chosen": -3.22599720954895, "logits/rejected": -3.043410301208496, "logps/chosen": -304.51763916015625, "logps/rejected": -329.8388366699219, "loss": 0.6368, "rewards/accuracies": 1.0, "rewards/chosen": 0.2683761715888977, "rewards/margins": 0.6773086786270142, "rewards/rejected": -0.40893250703811646, "step": 160 }, { "epoch": 0.21, "learning_rate": 9.219423251454446e-08, "logits/chosen": -3.1622962951660156, "logits/rejected": -2.9217076301574707, "logps/chosen": -273.77532958984375, "logps/rejected": -1485.353515625, "loss": 0.4716, "rewards/accuracies": 1.0, "rewards/chosen": 0.3116287291049957, "rewards/margins": 2.9552764892578125, "rewards/rejected": -2.6436476707458496, "step": 161 }, { "epoch": 0.21, "learning_rate": 9.208298142502635e-08, "logits/chosen": -3.210597515106201, "logits/rejected": -3.0421814918518066, "logps/chosen": -288.9012145996094, "logps/rejected": -334.4779052734375, "loss": 0.533, "rewards/accuracies": 1.0, "rewards/chosen": 0.28864288330078125, "rewards/margins": 0.649615466594696, "rewards/rejected": -0.3609725832939148, "step": 162 }, { "epoch": 0.21, "learning_rate": 9.197101125346657e-08, "logits/chosen": -3.229559898376465, "logits/rejected": -3.0165181159973145, "logps/chosen": -329.9634704589844, "logps/rejected": -611.6617431640625, "loss": 0.6008, "rewards/accuracies": 1.0, "rewards/chosen": 0.3326354920864105, "rewards/margins": 1.4128632545471191, "rewards/rejected": -1.0802277326583862, "step": 163 }, { "epoch": 0.21, "learning_rate": 9.185832391312642e-08, "logits/chosen": -3.1119544506073, "logits/rejected": -2.973877429962158, "logps/chosen": -282.87969970703125, "logps/rejected": -1079.2725830078125, "loss": 0.4754, "rewards/accuracies": 1.0, "rewards/chosen": 0.38330233097076416, "rewards/margins": 2.7334916591644287, "rewards/rejected": -2.350189447402954, "step": 164 }, { "epoch": 0.21, "learning_rate": 9.174492132952165e-08, "logits/chosen": -3.2307515144348145, "logits/rejected": -3.1356136798858643, "logps/chosen": -297.04022216796875, "logps/rejected": -1830.1766357421875, "loss": 0.4512, "rewards/accuracies": 1.0, "rewards/chosen": 0.306814581155777, "rewards/margins": 4.103631973266602, "rewards/rejected": -3.7968170642852783, "step": 165 }, { "epoch": 0.21, "learning_rate": 9.163080544038952e-08, "logits/chosen": -3.21661376953125, "logits/rejected": -3.0669946670532227, "logps/chosen": -284.95166015625, "logps/rejected": -624.6632690429688, "loss": 0.5875, "rewards/accuracies": 1.0, "rewards/chosen": 0.24109497666358948, "rewards/margins": 1.4402450323104858, "rewards/rejected": -1.1991500854492188, "step": 166 }, { "epoch": 0.21, "learning_rate": 9.15159781956557e-08, "logits/chosen": -3.194049119949341, "logits/rejected": -3.004361152648926, "logps/chosen": -305.9996643066406, "logps/rejected": -1268.5496826171875, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 0.23648223280906677, "rewards/margins": 2.7457780838012695, "rewards/rejected": -2.509295701980591, "step": 167 }, { "epoch": 0.21, "learning_rate": 9.1400441557401e-08, "logits/chosen": -3.173955202102661, "logits/rejected": -3.130608558654785, "logps/chosen": -310.40447998046875, "logps/rejected": -510.1054382324219, "loss": 0.5333, "rewards/accuracies": 1.0, "rewards/chosen": 0.23483581840991974, "rewards/margins": 0.977948009967804, "rewards/rejected": -0.7431122064590454, "step": 168 }, { "epoch": 0.22, "learning_rate": 9.128419749982779e-08, "logits/chosen": -3.220134735107422, "logits/rejected": -3.0761778354644775, "logps/chosen": -287.095703125, "logps/rejected": -724.5929565429688, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 0.2717346251010895, "rewards/margins": 1.685006856918335, "rewards/rejected": -1.4132721424102783, "step": 169 }, { "epoch": 0.22, "learning_rate": 9.116724800922628e-08, "logits/chosen": -3.2018251419067383, "logits/rejected": -3.1133244037628174, "logps/chosen": -313.37908935546875, "logps/rejected": -819.5926513671875, "loss": 0.4871, "rewards/accuracies": 1.0, "rewards/chosen": 0.270376592874527, "rewards/margins": 1.9035125970840454, "rewards/rejected": -1.6331360340118408, "step": 170 }, { "epoch": 0.22, "learning_rate": 9.10495950839406e-08, "logits/chosen": -3.1902337074279785, "logits/rejected": -2.8874258995056152, "logps/chosen": -305.383544921875, "logps/rejected": -1464.7802734375, "loss": 0.5359, "rewards/accuracies": 1.0, "rewards/chosen": 0.418447881937027, "rewards/margins": 3.5803847312927246, "rewards/rejected": -3.1619369983673096, "step": 171 }, { "epoch": 0.22, "learning_rate": 9.093124073433462e-08, "logits/chosen": -3.2314977645874023, "logits/rejected": -3.189458131790161, "logps/chosen": -261.06524658203125, "logps/rejected": -735.2535400390625, "loss": 0.5287, "rewards/accuracies": 1.0, "rewards/chosen": 0.2647491693496704, "rewards/margins": 1.7515076398849487, "rewards/rejected": -1.4867584705352783, "step": 172 }, { "epoch": 0.22, "learning_rate": 9.081218698275762e-08, "logits/chosen": -3.1314713954925537, "logits/rejected": -3.0789852142333984, "logps/chosen": -337.8638916015625, "logps/rejected": -791.8680419921875, "loss": 0.4229, "rewards/accuracies": 1.0, "rewards/chosen": 0.34014892578125, "rewards/margins": 1.8456084728240967, "rewards/rejected": -1.5054595470428467, "step": 173 }, { "epoch": 0.22, "learning_rate": 9.069243586350974e-08, "logits/chosen": -3.1196396350860596, "logits/rejected": -3.0273592472076416, "logps/chosen": -336.85162353515625, "logps/rejected": -611.2289428710938, "loss": 0.4803, "rewards/accuracies": 1.0, "rewards/chosen": 0.3239730894565582, "rewards/margins": 1.5095276832580566, "rewards/rejected": -1.1855545043945312, "step": 174 }, { "epoch": 0.22, "learning_rate": 9.057198942280721e-08, "logits/chosen": -3.1980557441711426, "logits/rejected": -3.0690836906433105, "logps/chosen": -334.24652099609375, "logps/rejected": -558.0846557617188, "loss": 0.5571, "rewards/accuracies": 1.0, "rewards/chosen": 0.32322847843170166, "rewards/margins": 1.4217147827148438, "rewards/rejected": -1.098486304283142, "step": 175 }, { "epoch": 0.22, "learning_rate": 9.045084971874737e-08, "logits/chosen": -3.188248634338379, "logits/rejected": -3.037966728210449, "logps/chosen": -320.2401123046875, "logps/rejected": -628.0545654296875, "loss": 0.5189, "rewards/accuracies": 1.0, "rewards/chosen": 0.3054962158203125, "rewards/margins": 1.3142821788787842, "rewards/rejected": -1.0087859630584717, "step": 176 }, { "epoch": 0.23, "learning_rate": 9.032901882127352e-08, "logits/chosen": -3.177165985107422, "logits/rejected": -3.113628387451172, "logps/chosen": -304.43475341796875, "logps/rejected": -1165.38916015625, "loss": 0.5019, "rewards/accuracies": 1.0, "rewards/chosen": 0.22731781005859375, "rewards/margins": 2.2952682971954346, "rewards/rejected": -2.067950487136841, "step": 177 }, { "epoch": 0.23, "learning_rate": 9.020649881213957e-08, "logits/chosen": -3.168698787689209, "logits/rejected": -3.0432815551757812, "logps/chosen": -347.3075866699219, "logps/rejected": -702.0647583007812, "loss": 0.5938, "rewards/accuracies": 1.0, "rewards/chosen": 0.29272156953811646, "rewards/margins": 1.5264678001403809, "rewards/rejected": -1.2337462902069092, "step": 178 }, { "epoch": 0.23, "learning_rate": 9.008329178487441e-08, "logits/chosen": -3.222975015640259, "logits/rejected": -2.9357855319976807, "logps/chosen": -322.2657470703125, "logps/rejected": -2026.362060546875, "loss": 0.4776, "rewards/accuracies": 1.0, "rewards/chosen": 0.3402511477470398, "rewards/margins": 4.516812324523926, "rewards/rejected": -4.176560878753662, "step": 179 }, { "epoch": 0.23, "learning_rate": 8.995939984474623e-08, "logits/chosen": -3.24245023727417, "logits/rejected": -3.0682058334350586, "logps/chosen": -337.97320556640625, "logps/rejected": -578.8516845703125, "loss": 0.4806, "rewards/accuracies": 1.0, "rewards/chosen": 0.21620941162109375, "rewards/margins": 1.436366319656372, "rewards/rejected": -1.2201569080352783, "step": 180 }, { "epoch": 0.23, "learning_rate": 8.983482510872644e-08, "logits/chosen": -3.216860771179199, "logits/rejected": -3.0865163803100586, "logps/chosen": -260.472412109375, "logps/rejected": -795.1110229492188, "loss": 0.4274, "rewards/accuracies": 1.0, "rewards/chosen": 0.36563795804977417, "rewards/margins": 1.714484453201294, "rewards/rejected": -1.348846435546875, "step": 181 }, { "epoch": 0.23, "learning_rate": 8.970956970545355e-08, "logits/chosen": -3.160489320755005, "logits/rejected": -3.098374366760254, "logps/chosen": -346.75555419921875, "logps/rejected": -771.2117919921875, "loss": 0.5347, "rewards/accuracies": 1.0, "rewards/chosen": 0.47485196590423584, "rewards/margins": 1.8892563581466675, "rewards/rejected": -1.4144043922424316, "step": 182 }, { "epoch": 0.23, "learning_rate": 8.958363577519683e-08, "logits/chosen": -3.2185592651367188, "logits/rejected": -3.1287074089050293, "logps/chosen": -292.98406982421875, "logps/rejected": -604.7207641601562, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 0.4188392758369446, "rewards/margins": 1.3509269952774048, "rewards/rejected": -0.932087779045105, "step": 183 }, { "epoch": 0.23, "learning_rate": 8.945702546981968e-08, "logits/chosen": -3.1986746788024902, "logits/rejected": -3.068488597869873, "logps/chosen": -290.31103515625, "logps/rejected": -586.75732421875, "loss": 0.5309, "rewards/accuracies": 1.0, "rewards/chosen": 0.38413697481155396, "rewards/margins": 1.6791588068008423, "rewards/rejected": -1.295021891593933, "step": 184 }, { "epoch": 0.24, "learning_rate": 8.932974095274289e-08, "logits/chosen": -3.266152858734131, "logits/rejected": -3.0519206523895264, "logps/chosen": -296.3551025390625, "logps/rejected": -965.6512451171875, "loss": 0.5318, "rewards/accuracies": 1.0, "rewards/chosen": 0.45425570011138916, "rewards/margins": 2.0544114112854004, "rewards/rejected": -1.6001555919647217, "step": 185 }, { "epoch": 0.24, "learning_rate": 8.920178439890764e-08, "logits/chosen": -3.2256579399108887, "logits/rejected": -3.0981740951538086, "logps/chosen": -290.2095947265625, "logps/rejected": -901.3886108398438, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": 0.3446609377861023, "rewards/margins": 2.5229997634887695, "rewards/rejected": -2.1783385276794434, "step": 186 }, { "epoch": 0.24, "learning_rate": 8.907315799473844e-08, "logits/chosen": -3.1887052059173584, "logits/rejected": -3.084376335144043, "logps/chosen": -321.8831481933594, "logps/rejected": -511.89227294921875, "loss": 0.5405, "rewards/accuracies": 1.0, "rewards/chosen": 0.27739107608795166, "rewards/margins": 1.4154860973358154, "rewards/rejected": -1.1380951404571533, "step": 187 }, { "epoch": 0.24, "learning_rate": 8.894386393810562e-08, "logits/chosen": -3.2270395755767822, "logits/rejected": -3.01027250289917, "logps/chosen": -336.370361328125, "logps/rejected": -815.64892578125, "loss": 0.4635, "rewards/accuracies": 1.0, "rewards/chosen": 0.37109071016311646, "rewards/margins": 2.031031847000122, "rewards/rejected": -1.6599410772323608, "step": 188 }, { "epoch": 0.24, "learning_rate": 8.881390443828787e-08, "logits/chosen": -3.162266731262207, "logits/rejected": -3.0193991661071777, "logps/chosen": -347.53680419921875, "logps/rejected": -356.0137939453125, "loss": 0.5379, "rewards/accuracies": 1.0, "rewards/chosen": 0.34590452909469604, "rewards/margins": 0.90234375, "rewards/rejected": -0.556439220905304, "step": 189 }, { "epoch": 0.24, "learning_rate": 8.868328171593447e-08, "logits/chosen": -3.2049601078033447, "logits/rejected": -3.074934959411621, "logps/chosen": -283.96728515625, "logps/rejected": -946.1781005859375, "loss": 0.3942, "rewards/accuracies": 1.0, "rewards/chosen": 0.3866211175918579, "rewards/margins": 2.762991428375244, "rewards/rejected": -2.376370429992676, "step": 190 }, { "epoch": 0.24, "learning_rate": 8.855199800302735e-08, "logits/chosen": -3.255514621734619, "logits/rejected": -3.0017194747924805, "logps/chosen": -288.88116455078125, "logps/rejected": -1086.78125, "loss": 0.3615, "rewards/accuracies": 1.0, "rewards/chosen": 0.3438812494277954, "rewards/margins": 2.6475465297698975, "rewards/rejected": -2.3036651611328125, "step": 191 }, { "epoch": 0.24, "learning_rate": 8.842005554284295e-08, "logits/chosen": -3.1742117404937744, "logits/rejected": -3.1327199935913086, "logps/chosen": -299.7100830078125, "logps/rejected": -592.070556640625, "loss": 0.5149, "rewards/accuracies": 1.0, "rewards/chosen": 0.3928634822368622, "rewards/margins": 1.596785068511963, "rewards/rejected": -1.2039215564727783, "step": 192 }, { "epoch": 0.25, "learning_rate": 8.828745658991386e-08, "logits/chosen": -3.1815013885498047, "logits/rejected": -2.990596055984497, "logps/chosen": -318.9503173828125, "logps/rejected": -1588.0489501953125, "loss": 0.448, "rewards/accuracies": 1.0, "rewards/chosen": 0.4241134822368622, "rewards/margins": 4.170475959777832, "rewards/rejected": -3.7463624477386475, "step": 193 }, { "epoch": 0.25, "learning_rate": 8.815420340999033e-08, "logits/chosen": -3.1395978927612305, "logits/rejected": -3.042855739593506, "logps/chosen": -283.7027282714844, "logps/rejected": -389.2200927734375, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 0.43332597613334656, "rewards/margins": 1.1048774719238281, "rewards/rejected": -0.671551525592804, "step": 194 }, { "epoch": 0.25, "learning_rate": 8.802029828000155e-08, "logits/chosen": -3.136807918548584, "logits/rejected": -3.029937267303467, "logps/chosen": -294.83782958984375, "logps/rejected": -1035.1944580078125, "loss": 0.4992, "rewards/accuracies": 1.0, "rewards/chosen": 0.3357437252998352, "rewards/margins": 2.3474717140197754, "rewards/rejected": -2.011727809906006, "step": 195 }, { "epoch": 0.25, "learning_rate": 8.788574348801674e-08, "logits/chosen": -3.166574001312256, "logits/rejected": -2.9960289001464844, "logps/chosen": -305.96551513671875, "logps/rejected": -882.837158203125, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 0.3513946533203125, "rewards/margins": 1.8470063209533691, "rewards/rejected": -1.495611548423767, "step": 196 }, { "epoch": 0.25, "learning_rate": 8.775054133320602e-08, "logits/chosen": -3.2113380432128906, "logits/rejected": -3.0065579414367676, "logps/chosen": -294.24932861328125, "logps/rejected": -1474.91650390625, "loss": 0.4336, "rewards/accuracies": 1.0, "rewards/chosen": 0.4259994626045227, "rewards/margins": 3.195469856262207, "rewards/rejected": -2.76947021484375, "step": 197 }, { "epoch": 0.25, "learning_rate": 8.761469412580124e-08, "logits/chosen": -3.2097272872924805, "logits/rejected": -3.1494340896606445, "logps/chosen": -328.04449462890625, "logps/rejected": -766.717041015625, "loss": 0.4554, "rewards/accuracies": 1.0, "rewards/chosen": 0.39765626192092896, "rewards/margins": 2.2400269508361816, "rewards/rejected": -1.842370629310608, "step": 198 }, { "epoch": 0.25, "learning_rate": 8.74782041870563e-08, "logits/chosen": -3.2441558837890625, "logits/rejected": -3.093447685241699, "logps/chosen": -255.07485961914062, "logps/rejected": -515.2230224609375, "loss": 0.4612, "rewards/accuracies": 1.0, "rewards/chosen": 0.25839537382125854, "rewards/margins": 1.312678575515747, "rewards/rejected": -1.0542831420898438, "step": 199 }, { "epoch": 0.25, "learning_rate": 8.734107384920769e-08, "logits/chosen": -3.1633448600769043, "logits/rejected": -3.0525200366973877, "logps/chosen": -287.8890686035156, "logps/rejected": -584.0713500976562, "loss": 0.4961, "rewards/accuracies": 1.0, "rewards/chosen": 0.3514160215854645, "rewards/margins": 1.5149505138397217, "rewards/rejected": -1.1635345220565796, "step": 200 }, { "epoch": 0.26, "learning_rate": 8.720330545543453e-08, "logits/chosen": -3.192828893661499, "logits/rejected": -3.074289321899414, "logps/chosen": -271.29412841796875, "logps/rejected": -639.4959106445312, "loss": 0.5278, "rewards/accuracies": 1.0, "rewards/chosen": 0.3534805178642273, "rewards/margins": 1.5008468627929688, "rewards/rejected": -1.1473664045333862, "step": 201 }, { "epoch": 0.26, "learning_rate": 8.706490135981854e-08, "logits/chosen": -3.1958134174346924, "logits/rejected": -3.0822901725769043, "logps/chosen": -301.6406555175781, "logps/rejected": -1349.125732421875, "loss": 0.4285, "rewards/accuracies": 1.0, "rewards/chosen": 0.3664688169956207, "rewards/margins": 3.2932329177856445, "rewards/rejected": -2.9267640113830566, "step": 202 }, { "epoch": 0.26, "learning_rate": 8.692586392730385e-08, "logits/chosen": -3.0855298042297363, "logits/rejected": -3.043283700942993, "logps/chosen": -318.6641845703125, "logps/rejected": -1065.7725830078125, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 0.2737518548965454, "rewards/margins": 2.939785957336426, "rewards/rejected": -2.666033983230591, "step": 203 }, { "epoch": 0.26, "learning_rate": 8.678619553365658e-08, "logits/chosen": -3.194345235824585, "logits/rejected": -3.093803644180298, "logps/chosen": -315.56671142578125, "logps/rejected": -683.402587890625, "loss": 0.5111, "rewards/accuracies": 1.0, "rewards/chosen": 0.31668397784233093, "rewards/margins": 1.8739044666290283, "rewards/rejected": -1.557220458984375, "step": 204 }, { "epoch": 0.26, "learning_rate": 8.664589856542419e-08, "logits/chosen": -3.2287254333496094, "logits/rejected": -3.090963840484619, "logps/chosen": -305.2910461425781, "logps/rejected": -387.62646484375, "loss": 0.4251, "rewards/accuracies": 1.0, "rewards/chosen": 0.4661140441894531, "rewards/margins": 1.100263237953186, "rewards/rejected": -0.6341491937637329, "step": 205 }, { "epoch": 0.26, "learning_rate": 8.650497541989481e-08, "logits/chosen": -3.2009711265563965, "logits/rejected": -2.9402360916137695, "logps/chosen": -320.56353759765625, "logps/rejected": -847.0079956054688, "loss": 0.5404, "rewards/accuracies": 1.0, "rewards/chosen": 0.4001823663711548, "rewards/margins": 2.1064231395721436, "rewards/rejected": -1.7062408924102783, "step": 206 }, { "epoch": 0.26, "learning_rate": 8.636342850505615e-08, "logits/chosen": -3.1634693145751953, "logits/rejected": -3.082956314086914, "logps/chosen": -329.83099365234375, "logps/rejected": -1225.40771484375, "loss": 0.4776, "rewards/accuracies": 1.0, "rewards/chosen": 0.3219497799873352, "rewards/margins": 2.4023513793945312, "rewards/rejected": -2.080401659011841, "step": 207 }, { "epoch": 0.27, "learning_rate": 8.622126023955445e-08, "logits/chosen": -3.141824722290039, "logits/rejected": -3.0830273628234863, "logps/chosen": -284.5570373535156, "logps/rejected": -883.7313232421875, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 0.4273780882358551, "rewards/margins": 2.4022622108459473, "rewards/rejected": -1.974884033203125, "step": 208 }, { "epoch": 0.27, "learning_rate": 8.60784730526531e-08, "logits/chosen": -3.2322452068328857, "logits/rejected": -3.133467197418213, "logps/chosen": -298.30352783203125, "logps/rejected": -470.72015380859375, "loss": 0.5143, "rewards/accuracies": 1.0, "rewards/chosen": 0.4036957025527954, "rewards/margins": 1.480000376701355, "rewards/rejected": -1.0763046741485596, "step": 209 }, { "epoch": 0.27, "learning_rate": 8.593506938419119e-08, "logits/chosen": -3.188553810119629, "logits/rejected": -3.0198974609375, "logps/chosen": -310.6797180175781, "logps/rejected": -1072.758544921875, "loss": 0.5366, "rewards/accuracies": 1.0, "rewards/chosen": 0.3125961422920227, "rewards/margins": 2.807481288909912, "rewards/rejected": -2.4948854446411133, "step": 210 }, { "epoch": 0.27, "learning_rate": 8.579105168454172e-08, "logits/chosen": -3.2412476539611816, "logits/rejected": -2.9426205158233643, "logps/chosen": -326.3055114746094, "logps/rejected": -3838.50439453125, "loss": 0.37, "rewards/accuracies": 1.0, "rewards/chosen": 0.3506317138671875, "rewards/margins": 6.676596641540527, "rewards/rejected": -6.32596492767334, "step": 211 }, { "epoch": 0.27, "learning_rate": 8.564642241456985e-08, "logits/chosen": -3.1519813537597656, "logits/rejected": -3.1256027221679688, "logps/chosen": -335.84490966796875, "logps/rejected": -660.4705810546875, "loss": 0.5825, "rewards/accuracies": 1.0, "rewards/chosen": 0.26736605167388916, "rewards/margins": 1.6270798444747925, "rewards/rejected": -1.3597137928009033, "step": 212 }, { "epoch": 0.27, "learning_rate": 8.550118404559074e-08, "logits/chosen": -3.20867919921875, "logits/rejected": -3.1310033798217773, "logps/chosen": -333.13568115234375, "logps/rejected": -982.8037109375, "loss": 0.4083, "rewards/accuracies": 1.0, "rewards/chosen": 0.41062164306640625, "rewards/margins": 2.557298183441162, "rewards/rejected": -2.146676540374756, "step": 213 }, { "epoch": 0.27, "learning_rate": 8.535533905932736e-08, "logits/chosen": -3.1827402114868164, "logits/rejected": -3.128274440765381, "logps/chosen": -325.45269775390625, "logps/rejected": -730.94091796875, "loss": 0.4593, "rewards/accuracies": 1.0, "rewards/chosen": 0.3803543150424957, "rewards/margins": 1.810652256011963, "rewards/rejected": -1.4302978515625, "step": 214 }, { "epoch": 0.27, "learning_rate": 8.52088899478682e-08, "logits/chosen": -3.111001491546631, "logits/rejected": -3.0441107749938965, "logps/chosen": -300.48785400390625, "logps/rejected": -647.1129150390625, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": 0.511059582233429, "rewards/margins": 2.1696410179138184, "rewards/rejected": -1.6585816144943237, "step": 215 }, { "epoch": 0.28, "learning_rate": 8.506183921362442e-08, "logits/chosen": -3.2167296409606934, "logits/rejected": -3.0682506561279297, "logps/chosen": -302.20013427734375, "logps/rejected": -270.08782958984375, "loss": 0.4783, "rewards/accuracies": 1.0, "rewards/chosen": 0.4342849850654602, "rewards/margins": 0.9010025262832642, "rewards/rejected": -0.46671754121780396, "step": 216 }, { "epoch": 0.28, "learning_rate": 8.491418936928741e-08, "logits/chosen": -3.1010336875915527, "logits/rejected": -3.007777690887451, "logps/chosen": -283.88720703125, "logps/rejected": -2624.2197265625, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 0.47362369298934937, "rewards/margins": 7.011419773101807, "rewards/rejected": -6.5377960205078125, "step": 217 }, { "epoch": 0.28, "learning_rate": 8.47659429377856e-08, "logits/chosen": -3.1793899536132812, "logits/rejected": -2.977548837661743, "logps/chosen": -305.095703125, "logps/rejected": -1592.623779296875, "loss": 0.4644, "rewards/accuracies": 1.0, "rewards/chosen": 0.2760574221611023, "rewards/margins": 4.603557109832764, "rewards/rejected": -4.3274993896484375, "step": 218 }, { "epoch": 0.28, "learning_rate": 8.461710245224147e-08, "logits/chosen": -3.164832592010498, "logits/rejected": -3.070700168609619, "logps/chosen": -333.66815185546875, "logps/rejected": -636.1544799804688, "loss": 0.453, "rewards/accuracies": 1.0, "rewards/chosen": 0.5276321172714233, "rewards/margins": 2.0903396606445312, "rewards/rejected": -1.562707543373108, "step": 219 }, { "epoch": 0.28, "learning_rate": 8.446767045592829e-08, "logits/chosen": -3.1868817806243896, "logits/rejected": -3.121734142303467, "logps/chosen": -291.0616455078125, "logps/rejected": -745.1539306640625, "loss": 0.4175, "rewards/accuracies": 1.0, "rewards/chosen": 0.40918123722076416, "rewards/margins": 1.8817107677459717, "rewards/rejected": -1.472529649734497, "step": 220 }, { "epoch": 0.28, "learning_rate": 8.431764950222655e-08, "logits/chosen": -3.1209163665771484, "logits/rejected": -3.0937914848327637, "logps/chosen": -320.466796875, "logps/rejected": -705.82470703125, "loss": 0.483, "rewards/accuracies": 1.0, "rewards/chosen": 0.40373992919921875, "rewards/margins": 2.1449997425079346, "rewards/rejected": -1.7412598133087158, "step": 221 }, { "epoch": 0.28, "learning_rate": 8.416704215458041e-08, "logits/chosen": -3.1416096687316895, "logits/rejected": -3.0127768516540527, "logps/chosen": -308.8350524902344, "logps/rejected": -1499.3897705078125, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 0.353201299905777, "rewards/margins": 4.444418430328369, "rewards/rejected": -4.091217041015625, "step": 222 }, { "epoch": 0.28, "learning_rate": 8.401585098645395e-08, "logits/chosen": -3.1722002029418945, "logits/rejected": -3.1010513305664062, "logps/chosen": -290.59625244140625, "logps/rejected": -893.9110107421875, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 0.39920806884765625, "rewards/margins": 2.7101364135742188, "rewards/rejected": -2.3109283447265625, "step": 223 }, { "epoch": 0.29, "learning_rate": 8.386407858128706e-08, "logits/chosen": -3.239274501800537, "logits/rejected": -3.104538917541504, "logps/chosen": -299.3688049316406, "logps/rejected": -1313.6993408203125, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": 0.37024688720703125, "rewards/margins": 3.288588047027588, "rewards/rejected": -2.9183411598205566, "step": 224 }, { "epoch": 0.29, "learning_rate": 8.371172753245137e-08, "logits/chosen": -3.2034573554992676, "logits/rejected": -3.025956153869629, "logps/chosen": -284.72052001953125, "logps/rejected": -861.580810546875, "loss": 0.4351, "rewards/accuracies": 1.0, "rewards/chosen": 0.2962844967842102, "rewards/margins": 2.1622543334960938, "rewards/rejected": -1.8659698963165283, "step": 225 }, { "epoch": 0.29, "learning_rate": 8.355880044320598e-08, "logits/chosen": -3.186309814453125, "logits/rejected": -3.0823569297790527, "logps/chosen": -287.226806640625, "logps/rejected": -677.1670532226562, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": 0.40285032987594604, "rewards/margins": 1.7601425647735596, "rewards/rejected": -1.3572921752929688, "step": 226 }, { "epoch": 0.29, "learning_rate": 8.340529992665288e-08, "logits/chosen": -3.219883918762207, "logits/rejected": -3.182373523712158, "logps/chosen": -290.51025390625, "logps/rejected": -698.3676147460938, "loss": 0.4644, "rewards/accuracies": 1.0, "rewards/chosen": 0.39911195635795593, "rewards/margins": 1.9913406372070312, "rewards/rejected": -1.5922287702560425, "step": 227 }, { "epoch": 0.29, "learning_rate": 8.32512286056924e-08, "logits/chosen": -3.2572708129882812, "logits/rejected": -3.1509811878204346, "logps/chosen": -291.31585693359375, "logps/rejected": -824.832763671875, "loss": 0.4436, "rewards/accuracies": 1.0, "rewards/chosen": 0.30090028047561646, "rewards/margins": 2.2495696544647217, "rewards/rejected": -1.94866943359375, "step": 228 }, { "epoch": 0.29, "learning_rate": 8.309658911297832e-08, "logits/chosen": -3.2087903022766113, "logits/rejected": -3.0726311206817627, "logps/chosen": -301.10443115234375, "logps/rejected": -742.4307861328125, "loss": 0.5063, "rewards/accuracies": 1.0, "rewards/chosen": 0.40003207325935364, "rewards/margins": 1.8457839488983154, "rewards/rejected": -1.4457519054412842, "step": 229 }, { "epoch": 0.29, "learning_rate": 8.294138409087289e-08, "logits/chosen": -3.190702438354492, "logits/rejected": -2.9835166931152344, "logps/chosen": -279.91876220703125, "logps/rejected": -2996.839111328125, "loss": 0.4449, "rewards/accuracies": 1.0, "rewards/chosen": 0.53045654296875, "rewards/margins": 7.440071105957031, "rewards/rejected": -6.9096150398254395, "step": 230 }, { "epoch": 0.29, "learning_rate": 8.278561619140171e-08, "logits/chosen": -3.2375681400299072, "logits/rejected": -3.111532211303711, "logps/chosen": -305.81341552734375, "logps/rejected": -714.4471435546875, "loss": 0.5255, "rewards/accuracies": 1.0, "rewards/chosen": 0.6023589968681335, "rewards/margins": 2.2386016845703125, "rewards/rejected": -1.6362426280975342, "step": 231 }, { "epoch": 0.3, "learning_rate": 8.262928807620843e-08, "logits/chosen": -3.172877788543701, "logits/rejected": -3.114123821258545, "logps/chosen": -334.9998779296875, "logps/rejected": -540.7589111328125, "loss": 0.5165, "rewards/accuracies": 1.0, "rewards/chosen": 0.42046356201171875, "rewards/margins": 1.557580590248108, "rewards/rejected": -1.1371170282363892, "step": 232 }, { "epoch": 0.3, "learning_rate": 8.247240241650917e-08, "logits/chosen": -3.171924352645874, "logits/rejected": -3.056098461151123, "logps/chosen": -296.35687255859375, "logps/rejected": -800.5528564453125, "loss": 0.4119, "rewards/accuracies": 1.0, "rewards/chosen": 0.48618775606155396, "rewards/margins": 2.368105888366699, "rewards/rejected": -1.8819183111190796, "step": 233 }, { "epoch": 0.3, "learning_rate": 8.231496189304703e-08, "logits/chosen": -3.242809295654297, "logits/rejected": -3.2010927200317383, "logps/chosen": -321.78009033203125, "logps/rejected": -586.5162353515625, "loss": 0.5646, "rewards/accuracies": 1.0, "rewards/chosen": 0.353079229593277, "rewards/margins": 1.6197419166564941, "rewards/rejected": -1.26666259765625, "step": 234 }, { "epoch": 0.3, "learning_rate": 8.215696919604617e-08, "logits/chosen": -3.2117037773132324, "logits/rejected": -3.0278453826904297, "logps/chosen": -317.3045349121094, "logps/rejected": -866.4177856445312, "loss": 0.4423, "rewards/accuracies": 1.0, "rewards/chosen": 0.3774048089981079, "rewards/margins": 2.588955879211426, "rewards/rejected": -2.2115509510040283, "step": 235 }, { "epoch": 0.3, "learning_rate": 8.199842702516583e-08, "logits/chosen": -3.24027156829834, "logits/rejected": -3.0846264362335205, "logps/chosen": -266.98992919921875, "logps/rejected": -256.82672119140625, "loss": 0.4387, "rewards/accuracies": 1.0, "rewards/chosen": 0.3778747618198395, "rewards/margins": 1.005623698234558, "rewards/rejected": -0.6277489066123962, "step": 236 }, { "epoch": 0.3, "learning_rate": 8.18393380894543e-08, "logits/chosen": -3.2284650802612305, "logits/rejected": -2.9114110469818115, "logps/chosen": -318.7149658203125, "logps/rejected": -821.3900146484375, "loss": 0.4653, "rewards/accuracies": 1.0, "rewards/chosen": 0.3110412657260895, "rewards/margins": 2.3332061767578125, "rewards/rejected": -2.022164821624756, "step": 237 }, { "epoch": 0.3, "learning_rate": 8.167970510730252e-08, "logits/chosen": -3.1899914741516113, "logits/rejected": -3.022425651550293, "logps/chosen": -294.5552062988281, "logps/rejected": -597.2496337890625, "loss": 0.4622, "rewards/accuracies": 1.0, "rewards/chosen": 0.3180603086948395, "rewards/margins": 1.699650526046753, "rewards/rejected": -1.3815902471542358, "step": 238 }, { "epoch": 0.3, "learning_rate": 8.151953080639775e-08, "logits/chosen": -3.1844470500946045, "logits/rejected": -3.01338791847229, "logps/chosen": -361.455078125, "logps/rejected": -700.7388305664062, "loss": 0.4976, "rewards/accuracies": 1.0, "rewards/chosen": 0.3937667906284332, "rewards/margins": 2.029606580734253, "rewards/rejected": -1.6358399391174316, "step": 239 }, { "epoch": 0.31, "learning_rate": 8.135881792367685e-08, "logits/chosen": -3.1403493881225586, "logits/rejected": -3.0027685165405273, "logps/chosen": -281.55657958984375, "logps/rejected": -1033.9443359375, "loss": 0.4561, "rewards/accuracies": 1.0, "rewards/chosen": 0.39189910888671875, "rewards/margins": 3.0235657691955566, "rewards/rejected": -2.631666660308838, "step": 240 }, { "epoch": 0.31, "learning_rate": 8.119756920527954e-08, "logits/chosen": -3.2203640937805176, "logits/rejected": -2.9461116790771484, "logps/chosen": -282.46868896484375, "logps/rejected": -1589.55224609375, "loss": 0.4093, "rewards/accuracies": 1.0, "rewards/chosen": 0.41849517822265625, "rewards/margins": 4.386119365692139, "rewards/rejected": -3.9676239490509033, "step": 241 }, { "epoch": 0.31, "learning_rate": 8.103578740650156e-08, "logits/chosen": -3.239978313446045, "logits/rejected": -3.1076226234436035, "logps/chosen": -257.97491455078125, "logps/rejected": -719.4148559570312, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 0.33644333481788635, "rewards/margins": 1.859035611152649, "rewards/rejected": -1.522592306137085, "step": 242 }, { "epoch": 0.31, "learning_rate": 8.087347529174742e-08, "logits/chosen": -3.2340948581695557, "logits/rejected": -3.10042667388916, "logps/chosen": -294.8481750488281, "logps/rejected": -1241.787109375, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 0.3943374752998352, "rewards/margins": 3.698054790496826, "rewards/rejected": -3.3037171363830566, "step": 243 }, { "epoch": 0.31, "learning_rate": 8.07106356344834e-08, "logits/chosen": -3.1547842025756836, "logits/rejected": -2.9430785179138184, "logps/chosen": -379.8869934082031, "logps/rejected": -1477.2598876953125, "loss": 0.4414, "rewards/accuracies": 1.0, "rewards/chosen": 0.5726776123046875, "rewards/margins": 4.308169364929199, "rewards/rejected": -3.735491991043091, "step": 244 }, { "epoch": 0.31, "learning_rate": 8.054727121718987e-08, "logits/chosen": -3.1527626514434814, "logits/rejected": -3.0656850337982178, "logps/chosen": -279.7440185546875, "logps/rejected": -1039.111328125, "loss": 0.4543, "rewards/accuracies": 1.0, "rewards/chosen": 0.5274528861045837, "rewards/margins": 2.805278778076172, "rewards/rejected": -2.2778258323669434, "step": 245 }, { "epoch": 0.31, "learning_rate": 8.038338483131406e-08, "logits/chosen": -3.1871728897094727, "logits/rejected": -3.0673084259033203, "logps/chosen": -328.7554931640625, "logps/rejected": -910.046630859375, "loss": 0.4217, "rewards/accuracies": 1.0, "rewards/chosen": 0.42711639404296875, "rewards/margins": 2.3737382888793945, "rewards/rejected": -1.9466217756271362, "step": 246 }, { "epoch": 0.31, "learning_rate": 8.021897927722208e-08, "logits/chosen": -3.083707809448242, "logits/rejected": -3.034977912902832, "logps/chosen": -278.7366027832031, "logps/rejected": -205.09683227539062, "loss": 0.4989, "rewards/accuracies": 1.0, "rewards/chosen": 0.3319503962993622, "rewards/margins": 0.6092132329940796, "rewards/rejected": -0.2772628664970398, "step": 247 }, { "epoch": 0.32, "learning_rate": 8.005405736415125e-08, "logits/chosen": -3.1064558029174805, "logits/rejected": -2.9568798542022705, "logps/chosen": -323.35986328125, "logps/rejected": -366.4610900878906, "loss": 0.535, "rewards/accuracies": 1.0, "rewards/chosen": 0.5135253667831421, "rewards/margins": 1.1482040882110596, "rewards/rejected": -0.6346786618232727, "step": 248 }, { "epoch": 0.32, "learning_rate": 7.988862191016203e-08, "logits/chosen": -3.1793885231018066, "logits/rejected": -3.0432658195495605, "logps/chosen": -287.9894104003906, "logps/rejected": -1048.34326171875, "loss": 0.4785, "rewards/accuracies": 1.0, "rewards/chosen": 0.47931522130966187, "rewards/margins": 2.7367706298828125, "rewards/rejected": -2.257455348968506, "step": 249 }, { "epoch": 0.32, "learning_rate": 7.97226757420899e-08, "logits/chosen": -3.240239143371582, "logits/rejected": -3.054462432861328, "logps/chosen": -289.623779296875, "logps/rejected": -738.958740234375, "loss": 0.4616, "rewards/accuracies": 1.0, "rewards/chosen": 0.42137908935546875, "rewards/margins": 2.121324062347412, "rewards/rejected": -1.699945092201233, "step": 250 }, { "epoch": 0.32, "learning_rate": 7.955622169549696e-08, "logits/chosen": -3.265397548675537, "logits/rejected": -3.003241777420044, "logps/chosen": -313.3736877441406, "logps/rejected": -1002.1644287109375, "loss": 0.5071, "rewards/accuracies": 1.0, "rewards/chosen": 0.4189041256904602, "rewards/margins": 2.859998941421509, "rewards/rejected": -2.4410948753356934, "step": 251 }, { "epoch": 0.32, "learning_rate": 7.938926261462366e-08, "logits/chosen": -3.2620582580566406, "logits/rejected": -3.115530014038086, "logps/chosen": -339.23870849609375, "logps/rejected": -603.46484375, "loss": 0.4501, "rewards/accuracies": 1.0, "rewards/chosen": 0.39473116397857666, "rewards/margins": 1.9752259254455566, "rewards/rejected": -1.58049476146698, "step": 252 }, { "epoch": 0.32, "learning_rate": 7.922180135233999e-08, "logits/chosen": -3.1825928688049316, "logits/rejected": -3.1034798622131348, "logps/chosen": -344.0287170410156, "logps/rejected": -548.2366943359375, "loss": 0.4725, "rewards/accuracies": 1.0, "rewards/chosen": 0.647381603717804, "rewards/margins": 1.8871124982833862, "rewards/rejected": -1.2397308349609375, "step": 253 }, { "epoch": 0.32, "learning_rate": 7.905384077009691e-08, "logits/chosen": -3.151496410369873, "logits/rejected": -3.10984468460083, "logps/chosen": -301.0751037597656, "logps/rejected": -885.0989379882812, "loss": 0.4012, "rewards/accuracies": 1.0, "rewards/chosen": 0.39567261934280396, "rewards/margins": 2.6456298828125, "rewards/rejected": -2.249957323074341, "step": 254 }, { "epoch": 0.33, "learning_rate": 7.888538373787734e-08, "logits/chosen": -3.2037317752838135, "logits/rejected": -3.1332061290740967, "logps/chosen": -321.5391845703125, "logps/rejected": -1221.12841796875, "loss": 0.4933, "rewards/accuracies": 1.0, "rewards/chosen": 0.4278045892715454, "rewards/margins": 3.620352268218994, "rewards/rejected": -3.1925477981567383, "step": 255 }, { "epoch": 0.33, "learning_rate": 7.871643313414717e-08, "logits/chosen": -3.156106472015381, "logits/rejected": -3.1187968254089355, "logps/chosen": -254.52182006835938, "logps/rejected": -1068.4429931640625, "loss": 0.3892, "rewards/accuracies": 1.0, "rewards/chosen": 0.5172065496444702, "rewards/margins": 3.026533603668213, "rewards/rejected": -2.509326934814453, "step": 256 }, { "epoch": 0.33, "learning_rate": 7.854699184580609e-08, "logits/chosen": -3.254756450653076, "logits/rejected": -3.0626864433288574, "logps/chosen": -304.60986328125, "logps/rejected": -229.47576904296875, "loss": 0.5731, "rewards/accuracies": 1.0, "rewards/chosen": 0.4023071527481079, "rewards/margins": 0.6850143671035767, "rewards/rejected": -0.28270721435546875, "step": 257 }, { "epoch": 0.33, "learning_rate": 7.837706276813818e-08, "logits/chosen": -3.175839424133301, "logits/rejected": -3.0216164588928223, "logps/chosen": -313.3533020019531, "logps/rejected": -572.4879760742188, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 0.4118209779262543, "rewards/margins": 1.5765838623046875, "rewards/rejected": -1.1647629737854004, "step": 258 }, { "epoch": 0.33, "learning_rate": 7.820664880476255e-08, "logits/chosen": -3.1645569801330566, "logits/rejected": -3.1420228481292725, "logps/chosen": -309.967529296875, "logps/rejected": -594.2871704101562, "loss": 0.3737, "rewards/accuracies": 1.0, "rewards/chosen": 0.43959200382232666, "rewards/margins": 1.779412865638733, "rewards/rejected": -1.3398208618164062, "step": 259 }, { "epoch": 0.33, "learning_rate": 7.803575286758363e-08, "logits/chosen": -3.1707615852355957, "logits/rejected": -3.1215734481811523, "logps/chosen": -300.1448974609375, "logps/rejected": -624.2242431640625, "loss": 0.4749, "rewards/accuracies": 1.0, "rewards/chosen": 0.31672364473342896, "rewards/margins": 2.0574891567230225, "rewards/rejected": -1.7407654523849487, "step": 260 }, { "epoch": 0.33, "learning_rate": 7.786437787674148e-08, "logits/chosen": -3.1946990489959717, "logits/rejected": -2.9711506366729736, "logps/chosen": -301.2517395019531, "logps/rejected": -2410.300537109375, "loss": 0.5163, "rewards/accuracies": 1.0, "rewards/chosen": 0.3161575198173523, "rewards/margins": 6.247736930847168, "rewards/rejected": -5.93157958984375, "step": 261 }, { "epoch": 0.33, "learning_rate": 7.769252676056186e-08, "logits/chosen": -3.23768949508667, "logits/rejected": -3.108297109603882, "logps/chosen": -344.859619140625, "logps/rejected": -953.634765625, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 0.4521636962890625, "rewards/margins": 2.75669264793396, "rewards/rejected": -2.3045287132263184, "step": 262 }, { "epoch": 0.34, "learning_rate": 7.752020245550617e-08, "logits/chosen": -3.2300336360931396, "logits/rejected": -3.1675775051116943, "logps/chosen": -318.15118408203125, "logps/rejected": -463.9980773925781, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 0.5251373648643494, "rewards/margins": 1.4997880458831787, "rewards/rejected": -0.9746506214141846, "step": 263 }, { "epoch": 0.34, "learning_rate": 7.734740790612135e-08, "logits/chosen": -3.232787609100342, "logits/rejected": -3.090399742126465, "logps/chosen": -299.2611389160156, "logps/rejected": -713.0697021484375, "loss": 0.4408, "rewards/accuracies": 1.0, "rewards/chosen": 0.5529663562774658, "rewards/margins": 2.246145725250244, "rewards/rejected": -1.6931793689727783, "step": 264 }, { "epoch": 0.34, "learning_rate": 7.717414606498946e-08, "logits/chosen": -3.211059331893921, "logits/rejected": -3.031338691711426, "logps/chosen": -271.779052734375, "logps/rejected": -965.702880859375, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": 0.5372451543807983, "rewards/margins": 3.1984238624572754, "rewards/rejected": -2.6611785888671875, "step": 265 }, { "epoch": 0.34, "learning_rate": 7.700041989267735e-08, "logits/chosen": -3.230165958404541, "logits/rejected": -3.118411064147949, "logps/chosen": -312.1624755859375, "logps/rejected": -346.77459716796875, "loss": 0.5088, "rewards/accuracies": 1.0, "rewards/chosen": 0.5288284420967102, "rewards/margins": 1.3516708612442017, "rewards/rejected": -0.8228424191474915, "step": 266 }, { "epoch": 0.34, "learning_rate": 7.682623235768597e-08, "logits/chosen": -3.1531879901885986, "logits/rejected": -3.078362464904785, "logps/chosen": -290.466796875, "logps/rejected": -894.6112060546875, "loss": 0.4037, "rewards/accuracies": 1.0, "rewards/chosen": 0.45133668184280396, "rewards/margins": 2.609513759613037, "rewards/rejected": -2.158177137374878, "step": 267 }, { "epoch": 0.34, "learning_rate": 7.665158643639968e-08, "logits/chosen": -3.1789255142211914, "logits/rejected": -2.9105491638183594, "logps/chosen": -293.23822021484375, "logps/rejected": -3097.203857421875, "loss": 0.444, "rewards/accuracies": 1.0, "rewards/chosen": 0.32520753145217896, "rewards/margins": 8.161087036132812, "rewards/rejected": -7.835879802703857, "step": 268 }, { "epoch": 0.34, "learning_rate": 7.647648511303544e-08, "logits/chosen": -3.1961426734924316, "logits/rejected": -2.999044179916382, "logps/chosen": -265.6154479980469, "logps/rejected": -1265.5347900390625, "loss": 0.3839, "rewards/accuracies": 1.0, "rewards/chosen": 0.5167938470840454, "rewards/margins": 4.082941055297852, "rewards/rejected": -3.5661468505859375, "step": 269 }, { "epoch": 0.34, "learning_rate": 7.63009313795917e-08, "logits/chosen": -3.186161518096924, "logits/rejected": -2.9531478881835938, "logps/chosen": -296.60833740234375, "logps/rejected": -954.3577880859375, "loss": 0.3963, "rewards/accuracies": 1.0, "rewards/chosen": 0.39543306827545166, "rewards/margins": 2.7010498046875, "rewards/rejected": -2.305616855621338, "step": 270 }, { "epoch": 0.35, "learning_rate": 7.612492823579743e-08, "logits/chosen": -3.187504768371582, "logits/rejected": -2.864610195159912, "logps/chosen": -309.1134033203125, "logps/rejected": -2134.911376953125, "loss": 0.3622, "rewards/accuracies": 1.0, "rewards/chosen": 0.5176590085029602, "rewards/margins": 6.307649612426758, "rewards/rejected": -5.789990425109863, "step": 271 }, { "epoch": 0.35, "learning_rate": 7.594847868906076e-08, "logits/chosen": -3.1514663696289062, "logits/rejected": -3.111537456512451, "logps/chosen": -305.0625, "logps/rejected": -923.783935546875, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 0.4518798887729645, "rewards/margins": 3.092587471008301, "rewards/rejected": -2.640707492828369, "step": 272 }, { "epoch": 0.35, "learning_rate": 7.577158575441756e-08, "logits/chosen": -3.1907975673675537, "logits/rejected": -2.9711570739746094, "logps/chosen": -350.8769836425781, "logps/rejected": -1563.263671875, "loss": 0.4437, "rewards/accuracies": 1.0, "rewards/chosen": 0.5159271359443665, "rewards/margins": 4.656472682952881, "rewards/rejected": -4.140545845031738, "step": 273 }, { "epoch": 0.35, "learning_rate": 7.559425245448005e-08, "logits/chosen": -3.239628314971924, "logits/rejected": -3.119154930114746, "logps/chosen": -292.18359375, "logps/rejected": -364.6217041015625, "loss": 0.5185, "rewards/accuracies": 1.0, "rewards/chosen": 0.4305816888809204, "rewards/margins": 1.187677025794983, "rewards/rejected": -0.7570953369140625, "step": 274 }, { "epoch": 0.35, "learning_rate": 7.541648181938503e-08, "logits/chosen": -3.156550407409668, "logits/rejected": -3.137394428253174, "logps/chosen": -319.2832336425781, "logps/rejected": -686.4276733398438, "loss": 0.4125, "rewards/accuracies": 1.0, "rewards/chosen": 0.4179062247276306, "rewards/margins": 2.345912218093872, "rewards/rejected": -1.9280059337615967, "step": 275 }, { "epoch": 0.35, "learning_rate": 7.523827688674219e-08, "logits/chosen": -3.1932408809661865, "logits/rejected": -3.0858030319213867, "logps/chosen": -317.556884765625, "logps/rejected": -1378.7491455078125, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 0.43173831701278687, "rewards/margins": 3.918652296066284, "rewards/rejected": -3.4869141578674316, "step": 276 }, { "epoch": 0.35, "learning_rate": 7.505964070158213e-08, "logits/chosen": -3.188957691192627, "logits/rejected": -3.0604348182678223, "logps/chosen": -296.63385009765625, "logps/rejected": -364.438720703125, "loss": 0.4103, "rewards/accuracies": 1.0, "rewards/chosen": 0.4779403805732727, "rewards/margins": 1.308197021484375, "rewards/rejected": -0.8302567005157471, "step": 277 }, { "epoch": 0.35, "learning_rate": 7.488057631630437e-08, "logits/chosen": -3.2187302112579346, "logits/rejected": -3.160400390625, "logps/chosen": -285.0635986328125, "logps/rejected": -601.4281616210938, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": 0.4450820982456207, "rewards/margins": 1.990347146987915, "rewards/rejected": -1.5452651977539062, "step": 278 }, { "epoch": 0.36, "learning_rate": 7.47010867906252e-08, "logits/chosen": -3.2696664333343506, "logits/rejected": -3.15395450592041, "logps/chosen": -306.9798583984375, "logps/rejected": -495.1792907714844, "loss": 0.5067, "rewards/accuracies": 1.0, "rewards/chosen": 0.3780364990234375, "rewards/margins": 1.57030189037323, "rewards/rejected": -1.192265272140503, "step": 279 }, { "epoch": 0.36, "learning_rate": 7.452117519152541e-08, "logits/chosen": -3.1961417198181152, "logits/rejected": -3.0728201866149902, "logps/chosen": -315.22998046875, "logps/rejected": -1152.0810546875, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 0.5452560186386108, "rewards/margins": 3.4776229858398438, "rewards/rejected": -2.9323668479919434, "step": 280 }, { "epoch": 0.36, "learning_rate": 7.434084459319781e-08, "logits/chosen": -3.162919521331787, "logits/rejected": -3.072948455810547, "logps/chosen": -291.536865234375, "logps/rejected": -874.2633056640625, "loss": 0.4237, "rewards/accuracies": 1.0, "rewards/chosen": 0.6040443181991577, "rewards/margins": 2.7627878189086914, "rewards/rejected": -2.158743381500244, "step": 281 }, { "epoch": 0.36, "learning_rate": 7.41600980769948e-08, "logits/chosen": -3.2194535732269287, "logits/rejected": -3.0999197959899902, "logps/chosen": -286.52532958984375, "logps/rejected": -521.390869140625, "loss": 0.4204, "rewards/accuracies": 1.0, "rewards/chosen": 0.43071287870407104, "rewards/margins": 1.6562836170196533, "rewards/rejected": -1.2255706787109375, "step": 282 }, { "epoch": 0.36, "learning_rate": 7.397893873137563e-08, "logits/chosen": -3.2214760780334473, "logits/rejected": -3.1218197345733643, "logps/chosen": -315.9534912109375, "logps/rejected": -518.8146362304688, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 0.4122711420059204, "rewards/margins": 1.6100311279296875, "rewards/rejected": -1.197759985923767, "step": 283 }, { "epoch": 0.36, "learning_rate": 7.379736965185368e-08, "logits/chosen": -3.1488943099975586, "logits/rejected": -3.0918045043945312, "logps/chosen": -325.8968505859375, "logps/rejected": -740.5457763671875, "loss": 0.4144, "rewards/accuracies": 1.0, "rewards/chosen": 0.6187195181846619, "rewards/margins": 2.5115966796875, "rewards/rejected": -1.892877221107483, "step": 284 }, { "epoch": 0.36, "learning_rate": 7.361539394094355e-08, "logits/chosen": -3.200594186782837, "logits/rejected": -2.8805007934570312, "logps/chosen": -275.498046875, "logps/rejected": -2063.130859375, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 0.45897525548934937, "rewards/margins": 6.478066921234131, "rewards/rejected": -6.019091606140137, "step": 285 }, { "epoch": 0.36, "learning_rate": 7.343301470810807e-08, "logits/chosen": -3.2124557495117188, "logits/rejected": -3.077472448348999, "logps/chosen": -349.00054931640625, "logps/rejected": -848.3692626953125, "loss": 0.5066, "rewards/accuracies": 1.0, "rewards/chosen": 0.4935165345668793, "rewards/margins": 2.6234726905822754, "rewards/rejected": -2.1299562454223633, "step": 286 }, { "epoch": 0.37, "learning_rate": 7.325023506970511e-08, "logits/chosen": -3.161384344100952, "logits/rejected": -2.9854631423950195, "logps/chosen": -307.0898742675781, "logps/rejected": -1186.8179931640625, "loss": 0.3964, "rewards/accuracies": 1.0, "rewards/chosen": 0.5870590209960938, "rewards/margins": 3.494340658187866, "rewards/rejected": -2.9072816371917725, "step": 287 }, { "epoch": 0.37, "learning_rate": 7.306705814893439e-08, "logits/chosen": -3.2024946212768555, "logits/rejected": -3.027733325958252, "logps/chosen": -347.9485778808594, "logps/rejected": -280.10931396484375, "loss": 0.5399, "rewards/accuracies": 1.0, "rewards/chosen": 0.545684814453125, "rewards/margins": 1.1189277172088623, "rewards/rejected": -0.5732429623603821, "step": 288 }, { "epoch": 0.37, "learning_rate": 7.288348707578408e-08, "logits/chosen": -3.154916286468506, "logits/rejected": -3.119337797164917, "logps/chosen": -349.40777587890625, "logps/rejected": -522.712890625, "loss": 0.5609, "rewards/accuracies": 1.0, "rewards/chosen": 0.47333985567092896, "rewards/margins": 1.5256683826446533, "rewards/rejected": -1.0523284673690796, "step": 289 }, { "epoch": 0.37, "learning_rate": 7.269952498697734e-08, "logits/chosen": -3.2057666778564453, "logits/rejected": -3.104879856109619, "logps/chosen": -324.03704833984375, "logps/rejected": -652.0909423828125, "loss": 0.4445, "rewards/accuracies": 1.0, "rewards/chosen": 0.49629825353622437, "rewards/margins": 2.2532501220703125, "rewards/rejected": -1.756951928138733, "step": 290 }, { "epoch": 0.37, "learning_rate": 7.251517502591869e-08, "logits/chosen": -3.272763252258301, "logits/rejected": -3.061901569366455, "logps/chosen": -325.1474304199219, "logps/rejected": -630.0096435546875, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": 0.5567687749862671, "rewards/margins": 1.9857650995254517, "rewards/rejected": -1.4289963245391846, "step": 291 }, { "epoch": 0.37, "learning_rate": 7.233044034264033e-08, "logits/chosen": -3.234614849090576, "logits/rejected": -3.0080432891845703, "logps/chosen": -327.2720947265625, "logps/rejected": -588.0780639648438, "loss": 0.4337, "rewards/accuracies": 1.0, "rewards/chosen": 0.6154724359512329, "rewards/margins": 2.0149171352386475, "rewards/rejected": -1.3994446992874146, "step": 292 }, { "epoch": 0.37, "learning_rate": 7.214532409374828e-08, "logits/chosen": -3.1743288040161133, "logits/rejected": -3.057673931121826, "logps/chosen": -296.7602844238281, "logps/rejected": -723.2427978515625, "loss": 0.4799, "rewards/accuracies": 1.0, "rewards/chosen": 0.4191741943359375, "rewards/margins": 1.92563796043396, "rewards/rejected": -1.506463646888733, "step": 293 }, { "epoch": 0.37, "learning_rate": 7.195982944236851e-08, "logits/chosen": -3.1801466941833496, "logits/rejected": -3.1115450859069824, "logps/chosen": -313.5283203125, "logps/rejected": -648.2113037109375, "loss": 0.4576, "rewards/accuracies": 1.0, "rewards/chosen": 0.5136184692382812, "rewards/margins": 1.9931365251541138, "rewards/rejected": -1.4795180559158325, "step": 294 }, { "epoch": 0.38, "learning_rate": 7.17739595580928e-08, "logits/chosen": -3.2537879943847656, "logits/rejected": -3.048915386199951, "logps/chosen": -332.0135803222656, "logps/rejected": -1367.305419921875, "loss": 0.4446, "rewards/accuracies": 1.0, "rewards/chosen": 0.5646805167198181, "rewards/margins": 3.692028760910034, "rewards/rejected": -3.1273481845855713, "step": 295 }, { "epoch": 0.38, "learning_rate": 7.158771761692464e-08, "logits/chosen": -3.2559001445770264, "logits/rejected": -3.121368885040283, "logps/chosen": -282.0394592285156, "logps/rejected": -705.148681640625, "loss": 0.4402, "rewards/accuracies": 1.0, "rewards/chosen": 0.3875992000102997, "rewards/margins": 2.2600433826446533, "rewards/rejected": -1.8724441528320312, "step": 296 }, { "epoch": 0.38, "learning_rate": 7.140110680122495e-08, "logits/chosen": -3.245110034942627, "logits/rejected": -3.1197657585144043, "logps/chosen": -311.64892578125, "logps/rejected": -618.3832397460938, "loss": 0.4681, "rewards/accuracies": 1.0, "rewards/chosen": 0.660290539264679, "rewards/margins": 2.3905715942382812, "rewards/rejected": -1.730281114578247, "step": 297 }, { "epoch": 0.38, "learning_rate": 7.121413029965768e-08, "logits/chosen": -3.2494254112243652, "logits/rejected": -3.082028865814209, "logps/chosen": -258.56475830078125, "logps/rejected": -889.0240478515625, "loss": 0.4033, "rewards/accuracies": 1.0, "rewards/chosen": 0.44359666109085083, "rewards/margins": 2.907991886138916, "rewards/rejected": -2.464395046234131, "step": 298 }, { "epoch": 0.38, "learning_rate": 7.102679130713537e-08, "logits/chosen": -3.2354302406311035, "logits/rejected": -3.1197919845581055, "logps/chosen": -311.4739990234375, "logps/rejected": -339.64007568359375, "loss": 0.4844, "rewards/accuracies": 1.0, "rewards/chosen": 0.4727310538291931, "rewards/margins": 1.1263916492462158, "rewards/rejected": -0.6536605954170227, "step": 299 }, { "epoch": 0.38, "learning_rate": 7.083909302476451e-08, "logits/chosen": -3.216707706451416, "logits/rejected": -3.1210408210754395, "logps/chosen": -336.4608154296875, "logps/rejected": -832.471435546875, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 0.3611282408237457, "rewards/margins": 2.5386605262756348, "rewards/rejected": -2.177532196044922, "step": 300 }, { "epoch": 0.38, "learning_rate": 7.065103865979087e-08, "logits/chosen": -3.182262420654297, "logits/rejected": -3.032802104949951, "logps/chosen": -276.903076171875, "logps/rejected": -2636.367919921875, "loss": 0.3703, "rewards/accuracies": 1.0, "rewards/chosen": 0.4069320857524872, "rewards/margins": 6.945216178894043, "rewards/rejected": -6.5382843017578125, "step": 301 }, { "epoch": 0.38, "learning_rate": 7.046263142554469e-08, "logits/chosen": -3.2053186893463135, "logits/rejected": -3.04459285736084, "logps/chosen": -288.0726318359375, "logps/rejected": -494.9621276855469, "loss": 0.4814, "rewards/accuracies": 1.0, "rewards/chosen": 0.5722061395645142, "rewards/margins": 1.9882049560546875, "rewards/rejected": -1.4159988164901733, "step": 302 }, { "epoch": 0.39, "learning_rate": 7.027387454138578e-08, "logits/chosen": -3.2189810276031494, "logits/rejected": -3.1405882835388184, "logps/chosen": -297.28997802734375, "logps/rejected": -721.6944580078125, "loss": 0.4212, "rewards/accuracies": 1.0, "rewards/chosen": 0.4842010736465454, "rewards/margins": 2.643341064453125, "rewards/rejected": -2.159140110015869, "step": 303 }, { "epoch": 0.39, "learning_rate": 7.008477123264848e-08, "logits/chosen": -3.213839530944824, "logits/rejected": -2.964235544204712, "logps/chosen": -289.43780517578125, "logps/rejected": -1651.685546875, "loss": 0.4192, "rewards/accuracies": 1.0, "rewards/chosen": 0.5944916009902954, "rewards/margins": 4.832251071929932, "rewards/rejected": -4.237759590148926, "step": 304 }, { "epoch": 0.39, "learning_rate": 6.989532473058657e-08, "logits/chosen": -3.217395305633545, "logits/rejected": -3.0948526859283447, "logps/chosen": -317.0671081542969, "logps/rejected": -769.6229248046875, "loss": 0.451, "rewards/accuracies": 1.0, "rewards/chosen": 0.46319884061813354, "rewards/margins": 2.594931125640869, "rewards/rejected": -2.131732225418091, "step": 305 }, { "epoch": 0.39, "learning_rate": 6.970553827231808e-08, "logits/chosen": -3.2625489234924316, "logits/rejected": -3.113396167755127, "logps/chosen": -264.7628173828125, "logps/rejected": -529.5965576171875, "loss": 0.4328, "rewards/accuracies": 1.0, "rewards/chosen": 0.6453766226768494, "rewards/margins": 1.7521653175354004, "rewards/rejected": -1.1067886352539062, "step": 306 }, { "epoch": 0.39, "learning_rate": 6.951541510076994e-08, "logits/chosen": -3.1684460639953613, "logits/rejected": -3.0913383960723877, "logps/chosen": -310.14874267578125, "logps/rejected": -533.739990234375, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 0.32389068603515625, "rewards/margins": 1.4234176874160767, "rewards/rejected": -1.0995270013809204, "step": 307 }, { "epoch": 0.39, "learning_rate": 6.932495846462261e-08, "logits/chosen": -3.2126970291137695, "logits/rejected": -3.1210930347442627, "logps/chosen": -307.2174072265625, "logps/rejected": -458.455078125, "loss": 0.4474, "rewards/accuracies": 1.0, "rewards/chosen": 0.44346314668655396, "rewards/margins": 1.570622205734253, "rewards/rejected": -1.1271591186523438, "step": 308 }, { "epoch": 0.39, "learning_rate": 6.913417161825448e-08, "logits/chosen": -3.237246513366699, "logits/rejected": -3.086176872253418, "logps/chosen": -316.7855224609375, "logps/rejected": -1188.5302734375, "loss": 0.3588, "rewards/accuracies": 1.0, "rewards/chosen": 0.4472549557685852, "rewards/margins": 3.787776231765747, "rewards/rejected": -3.3405213356018066, "step": 309 }, { "epoch": 0.4, "learning_rate": 6.894305782168638e-08, "logits/chosen": -3.2428503036499023, "logits/rejected": -3.119485855102539, "logps/chosen": -266.57757568359375, "logps/rejected": -517.67138671875, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 0.36165618896484375, "rewards/margins": 1.5651931762695312, "rewards/rejected": -1.2035369873046875, "step": 310 }, { "epoch": 0.4, "learning_rate": 6.875162034052578e-08, "logits/chosen": -3.1927685737609863, "logits/rejected": -3.0587611198425293, "logps/chosen": -285.4232482910156, "logps/rejected": -714.4824829101562, "loss": 0.4906, "rewards/accuracies": 1.0, "rewards/chosen": 0.39855730533599854, "rewards/margins": 2.5216164588928223, "rewards/rejected": -2.1230592727661133, "step": 311 }, { "epoch": 0.4, "learning_rate": 6.855986244591103e-08, "logits/chosen": -3.173314094543457, "logits/rejected": -3.018852710723877, "logps/chosen": -302.4118347167969, "logps/rejected": -952.234375, "loss": 0.4374, "rewards/accuracies": 1.0, "rewards/chosen": 0.418304443359375, "rewards/margins": 2.917672872543335, "rewards/rejected": -2.49936842918396, "step": 312 }, { "epoch": 0.4, "learning_rate": 6.836778741445549e-08, "logits/chosen": -3.1484017372131348, "logits/rejected": -3.1268224716186523, "logps/chosen": -297.6955871582031, "logps/rejected": -786.7733154296875, "loss": 0.3878, "rewards/accuracies": 1.0, "rewards/chosen": 0.49537354707717896, "rewards/margins": 3.159487724304199, "rewards/rejected": -2.664114475250244, "step": 313 }, { "epoch": 0.4, "learning_rate": 6.817539852819148e-08, "logits/chosen": -3.2351622581481934, "logits/rejected": -3.147650957107544, "logps/chosen": -259.1064147949219, "logps/rejected": -734.3907470703125, "loss": 0.4004, "rewards/accuracies": 1.0, "rewards/chosen": 0.42841261625289917, "rewards/margins": 2.696887969970703, "rewards/rejected": -2.268475294113159, "step": 314 }, { "epoch": 0.4, "learning_rate": 6.798269907451427e-08, "logits/chosen": -3.219303846359253, "logits/rejected": -2.9629669189453125, "logps/chosen": -300.0625, "logps/rejected": -1208.94384765625, "loss": 0.4892, "rewards/accuracies": 1.0, "rewards/chosen": 0.4914703369140625, "rewards/margins": 3.932936191558838, "rewards/rejected": -3.4414658546447754, "step": 315 }, { "epoch": 0.4, "learning_rate": 6.778969234612582e-08, "logits/chosen": -3.2062907218933105, "logits/rejected": -2.9684784412384033, "logps/chosen": -354.30853271484375, "logps/rejected": -1090.8692626953125, "loss": 0.4444, "rewards/accuracies": 1.0, "rewards/chosen": 0.3961990475654602, "rewards/margins": 3.0074126720428467, "rewards/rejected": -2.6112136840820312, "step": 316 }, { "epoch": 0.4, "learning_rate": 6.759638164097861e-08, "logits/chosen": -3.173107385635376, "logits/rejected": -3.1590609550476074, "logps/chosen": -302.285400390625, "logps/rejected": -762.8922119140625, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 0.4889663755893707, "rewards/margins": 2.9515886306762695, "rewards/rejected": -2.4626221656799316, "step": 317 }, { "epoch": 0.41, "learning_rate": 6.740277026221922e-08, "logits/chosen": -3.2537617683410645, "logits/rejected": -3.1149139404296875, "logps/chosen": -293.7182312011719, "logps/rejected": -818.447509765625, "loss": 0.4467, "rewards/accuracies": 1.0, "rewards/chosen": 0.43005067110061646, "rewards/margins": 2.6824097633361816, "rewards/rejected": -2.252358913421631, "step": 318 }, { "epoch": 0.41, "learning_rate": 6.720886151813194e-08, "logits/chosen": -3.2055227756500244, "logits/rejected": -3.1270751953125, "logps/chosen": -306.1141357421875, "logps/rejected": -648.5712890625, "loss": 0.4619, "rewards/accuracies": 1.0, "rewards/chosen": 0.4433792233467102, "rewards/margins": 1.9841781854629517, "rewards/rejected": -1.5407989025115967, "step": 319 }, { "epoch": 0.41, "learning_rate": 6.701465872208215e-08, "logits/chosen": -3.159332275390625, "logits/rejected": -3.1247663497924805, "logps/chosen": -281.23284912109375, "logps/rejected": -953.4114990234375, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 0.44287338852882385, "rewards/margins": 2.8425865173339844, "rewards/rejected": -2.3997130393981934, "step": 320 }, { "epoch": 0.41, "learning_rate": 6.682016519245985e-08, "logits/chosen": -3.157062530517578, "logits/rejected": -3.146803855895996, "logps/chosen": -272.3890380859375, "logps/rejected": -963.0634765625, "loss": 0.4102, "rewards/accuracies": 1.0, "rewards/chosen": 0.5336944460868835, "rewards/margins": 2.8027405738830566, "rewards/rejected": -2.2690460681915283, "step": 321 }, { "epoch": 0.41, "learning_rate": 6.662538425262284e-08, "logits/chosen": -3.2256760597229004, "logits/rejected": -3.1039938926696777, "logps/chosen": -255.93734741210938, "logps/rejected": -842.6132202148438, "loss": 0.4262, "rewards/accuracies": 1.0, "rewards/chosen": 0.49976807832717896, "rewards/margins": 2.5472044944763184, "rewards/rejected": -2.0474367141723633, "step": 322 }, { "epoch": 0.41, "learning_rate": 6.643031923083994e-08, "logits/chosen": -3.255248785018921, "logits/rejected": -2.968074321746826, "logps/chosen": -278.13885498046875, "logps/rejected": -1335.4881591796875, "loss": 0.41, "rewards/accuracies": 1.0, "rewards/chosen": 0.4061371088027954, "rewards/margins": 3.975616693496704, "rewards/rejected": -3.569479465484619, "step": 323 }, { "epoch": 0.41, "learning_rate": 6.623497346023418e-08, "logits/chosen": -3.225923538208008, "logits/rejected": -3.0975167751312256, "logps/chosen": -322.9974670410156, "logps/rejected": -507.8825378417969, "loss": 0.4785, "rewards/accuracies": 1.0, "rewards/chosen": 0.5409622192382812, "rewards/margins": 1.8294388055801392, "rewards/rejected": -1.288476586341858, "step": 324 }, { "epoch": 0.41, "learning_rate": 6.603935027872579e-08, "logits/chosen": -3.139043092727661, "logits/rejected": -3.086012125015259, "logps/chosen": -317.0920715332031, "logps/rejected": -774.6438598632812, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 0.522308349609375, "rewards/margins": 2.559497117996216, "rewards/rejected": -2.037188768386841, "step": 325 }, { "epoch": 0.42, "learning_rate": 6.584345302897522e-08, "logits/chosen": -3.2347912788391113, "logits/rejected": -3.043266773223877, "logps/chosen": -305.8695983886719, "logps/rejected": -1773.32080078125, "loss": 0.3915, "rewards/accuracies": 1.0, "rewards/chosen": 0.42008668184280396, "rewards/margins": 5.8200225830078125, "rewards/rejected": -5.399936199188232, "step": 326 }, { "epoch": 0.42, "learning_rate": 6.564728505832595e-08, "logits/chosen": -3.164857864379883, "logits/rejected": -3.121790885925293, "logps/chosen": -312.07659912109375, "logps/rejected": -1203.7498779296875, "loss": 0.3982, "rewards/accuracies": 1.0, "rewards/chosen": 0.549102783203125, "rewards/margins": 3.778921604156494, "rewards/rejected": -3.229818820953369, "step": 327 }, { "epoch": 0.42, "learning_rate": 6.545084971874738e-08, "logits/chosen": -3.127871036529541, "logits/rejected": -3.0897231101989746, "logps/chosen": -368.34954833984375, "logps/rejected": -507.6291198730469, "loss": 0.4132, "rewards/accuracies": 1.0, "rewards/chosen": 0.5141220092773438, "rewards/margins": 1.7551345825195312, "rewards/rejected": -1.2410125732421875, "step": 328 }, { "epoch": 0.42, "learning_rate": 6.525415036677744e-08, "logits/chosen": -3.1823253631591797, "logits/rejected": -3.0887386798858643, "logps/chosen": -356.98004150390625, "logps/rejected": -1449.858642578125, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 0.4719940423965454, "rewards/margins": 4.4596099853515625, "rewards/rejected": -3.9876160621643066, "step": 329 }, { "epoch": 0.42, "learning_rate": 6.505719036346537e-08, "logits/chosen": -3.1926043033599854, "logits/rejected": -3.059253692626953, "logps/chosen": -310.3914489746094, "logps/rejected": -935.95654296875, "loss": 0.3387, "rewards/accuracies": 1.0, "rewards/chosen": 0.5625565052032471, "rewards/margins": 3.137101650238037, "rewards/rejected": -2.574545383453369, "step": 330 }, { "epoch": 0.42, "learning_rate": 6.485997307431419e-08, "logits/chosen": -3.1867260932922363, "logits/rejected": -3.1082048416137695, "logps/chosen": -293.86431884765625, "logps/rejected": -1259.6412353515625, "loss": 0.4292, "rewards/accuracies": 1.0, "rewards/chosen": 0.5440475344657898, "rewards/margins": 4.424986362457275, "rewards/rejected": -3.880938768386841, "step": 331 }, { "epoch": 0.42, "learning_rate": 6.466250186922324e-08, "logits/chosen": -3.125744104385376, "logits/rejected": -3.0673446655273438, "logps/chosen": -325.03021240234375, "logps/rejected": -676.51708984375, "loss": 0.4553, "rewards/accuracies": 1.0, "rewards/chosen": 0.46668702363967896, "rewards/margins": 2.1007583141326904, "rewards/rejected": -1.6340713500976562, "step": 332 }, { "epoch": 0.42, "learning_rate": 6.446478012243055e-08, "logits/chosen": -3.240267753601074, "logits/rejected": -2.9978504180908203, "logps/chosen": -299.1335754394531, "logps/rejected": -1395.142578125, "loss": 0.4781, "rewards/accuracies": 1.0, "rewards/chosen": 0.6062675714492798, "rewards/margins": 4.465278148651123, "rewards/rejected": -3.8590104579925537, "step": 333 }, { "epoch": 0.43, "learning_rate": 6.426681121245526e-08, "logits/chosen": -3.1781423091888428, "logits/rejected": -3.059810161590576, "logps/chosen": -340.66693115234375, "logps/rejected": -504.43365478515625, "loss": 0.514, "rewards/accuracies": 1.0, "rewards/chosen": 0.4205436706542969, "rewards/margins": 1.6471138000488281, "rewards/rejected": -1.2265701293945312, "step": 334 }, { "epoch": 0.43, "learning_rate": 6.406859852203981e-08, "logits/chosen": -3.2401766777038574, "logits/rejected": -3.1178174018859863, "logps/chosen": -294.5863037109375, "logps/rejected": -948.2916870117188, "loss": 0.463, "rewards/accuracies": 1.0, "rewards/chosen": 0.4956413209438324, "rewards/margins": 3.6721978187561035, "rewards/rejected": -3.1765565872192383, "step": 335 }, { "epoch": 0.43, "learning_rate": 6.387014543809223e-08, "logits/chosen": -3.2066402435302734, "logits/rejected": -2.9485764503479004, "logps/chosen": -312.47601318359375, "logps/rejected": -1348.9857177734375, "loss": 0.3235, "rewards/accuracies": 1.0, "rewards/chosen": 0.5144378542900085, "rewards/margins": 4.1268157958984375, "rewards/rejected": -3.6123781204223633, "step": 336 }, { "epoch": 0.43, "learning_rate": 6.367145535162812e-08, "logits/chosen": -3.2356083393096924, "logits/rejected": -3.083646774291992, "logps/chosen": -286.55987548828125, "logps/rejected": -941.4859619140625, "loss": 0.4123, "rewards/accuracies": 1.0, "rewards/chosen": 0.6187942624092102, "rewards/margins": 2.944075107574463, "rewards/rejected": -2.3252806663513184, "step": 337 }, { "epoch": 0.43, "learning_rate": 6.347253165771289e-08, "logits/chosen": -3.171553134918213, "logits/rejected": -3.1139235496520996, "logps/chosen": -295.05419921875, "logps/rejected": -1042.39599609375, "loss": 0.3492, "rewards/accuracies": 1.0, "rewards/chosen": 0.5685508847236633, "rewards/margins": 3.842665910720825, "rewards/rejected": -3.2741150856018066, "step": 338 }, { "epoch": 0.43, "learning_rate": 6.327337775540361e-08, "logits/chosen": -3.269473075866699, "logits/rejected": -3.035998821258545, "logps/chosen": -270.006591796875, "logps/rejected": -301.6492919921875, "loss": 0.4138, "rewards/accuracies": 1.0, "rewards/chosen": 0.4032478332519531, "rewards/margins": 0.9925727844238281, "rewards/rejected": -0.589324951171875, "step": 339 }, { "epoch": 0.43, "learning_rate": 6.307399704769099e-08, "logits/chosen": -3.246171236038208, "logits/rejected": -3.11867094039917, "logps/chosen": -282.40496826171875, "logps/rejected": -1033.7764892578125, "loss": 0.4256, "rewards/accuracies": 1.0, "rewards/chosen": 0.44911348819732666, "rewards/margins": 3.490828037261963, "rewards/rejected": -3.041714668273926, "step": 340 }, { "epoch": 0.43, "learning_rate": 6.287439294144119e-08, "logits/chosen": -3.228243112564087, "logits/rejected": -3.1257553100585938, "logps/chosen": -284.8995666503906, "logps/rejected": -494.117919921875, "loss": 0.4225, "rewards/accuracies": 1.0, "rewards/chosen": 0.49929505586624146, "rewards/margins": 1.8305329084396362, "rewards/rejected": -1.33123779296875, "step": 341 }, { "epoch": 0.44, "learning_rate": 6.26745688473377e-08, "logits/chosen": -3.2068586349487305, "logits/rejected": -3.093867540359497, "logps/chosen": -301.98541259765625, "logps/rejected": -443.2766418457031, "loss": 0.443, "rewards/accuracies": 1.0, "rewards/chosen": 0.47451937198638916, "rewards/margins": 1.6581283807754517, "rewards/rejected": -1.1836090087890625, "step": 342 }, { "epoch": 0.44, "learning_rate": 6.247452817982293e-08, "logits/chosen": -3.2020983695983887, "logits/rejected": -3.1271021366119385, "logps/chosen": -280.3255920410156, "logps/rejected": -621.730224609375, "loss": 0.4425, "rewards/accuracies": 1.0, "rewards/chosen": 0.41617968678474426, "rewards/margins": 2.1400885581970215, "rewards/rejected": -1.7239091396331787, "step": 343 }, { "epoch": 0.44, "learning_rate": 6.227427435703996e-08, "logits/chosen": -3.136793375015259, "logits/rejected": -3.0568058490753174, "logps/chosen": -293.35906982421875, "logps/rejected": -1128.430419921875, "loss": 0.4101, "rewards/accuracies": 1.0, "rewards/chosen": 0.5674392580986023, "rewards/margins": 3.6189041137695312, "rewards/rejected": -3.0514650344848633, "step": 344 }, { "epoch": 0.44, "learning_rate": 6.20738108007741e-08, "logits/chosen": -3.1911704540252686, "logits/rejected": -3.162592887878418, "logps/chosen": -312.2906494140625, "logps/rejected": -513.4693603515625, "loss": 0.4177, "rewards/accuracies": 1.0, "rewards/chosen": 0.5875122547149658, "rewards/margins": 1.9067094326019287, "rewards/rejected": -1.319197177886963, "step": 345 }, { "epoch": 0.44, "learning_rate": 6.187314093639443e-08, "logits/chosen": -3.231839179992676, "logits/rejected": -3.1773183345794678, "logps/chosen": -315.91778564453125, "logps/rejected": -594.143310546875, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 0.561767578125, "rewards/margins": 1.8702576160430908, "rewards/rejected": -1.3084900379180908, "step": 346 }, { "epoch": 0.44, "learning_rate": 6.167226819279528e-08, "logits/chosen": -3.2714924812316895, "logits/rejected": -2.9882519245147705, "logps/chosen": -284.3290710449219, "logps/rejected": -687.0112915039062, "loss": 0.4174, "rewards/accuracies": 1.0, "rewards/chosen": 0.504779040813446, "rewards/margins": 2.3905396461486816, "rewards/rejected": -1.8857605457305908, "step": 347 }, { "epoch": 0.44, "learning_rate": 6.147119600233758e-08, "logits/chosen": -3.21177339553833, "logits/rejected": -2.9689884185791016, "logps/chosen": -324.82293701171875, "logps/rejected": -2655.3037109375, "loss": 0.3583, "rewards/accuracies": 1.0, "rewards/chosen": 0.5545395016670227, "rewards/margins": 7.883793830871582, "rewards/rejected": -7.329254627227783, "step": 348 }, { "epoch": 0.44, "learning_rate": 6.126992780079031e-08, "logits/chosen": -3.1782169342041016, "logits/rejected": -3.1333529949188232, "logps/chosen": -291.79425048828125, "logps/rejected": -609.4161376953125, "loss": 0.3984, "rewards/accuracies": 1.0, "rewards/chosen": 0.6528595089912415, "rewards/margins": 2.352067470550537, "rewards/rejected": -1.6992080211639404, "step": 349 }, { "epoch": 0.45, "learning_rate": 6.106846702727172e-08, "logits/chosen": -3.1614363193511963, "logits/rejected": -3.105189800262451, "logps/chosen": -264.09759521484375, "logps/rejected": -737.4447631835938, "loss": 0.3727, "rewards/accuracies": 1.0, "rewards/chosen": 0.670550525188446, "rewards/margins": 2.848886013031006, "rewards/rejected": -2.178335666656494, "step": 350 }, { "epoch": 0.45, "learning_rate": 6.086681712419058e-08, "logits/chosen": -3.2550861835479736, "logits/rejected": -3.0854573249816895, "logps/chosen": -323.5987548828125, "logps/rejected": -437.5296630859375, "loss": 0.4968, "rewards/accuracies": 1.0, "rewards/chosen": 0.4456283450126648, "rewards/margins": 1.4625976085662842, "rewards/rejected": -1.0169693231582642, "step": 351 }, { "epoch": 0.45, "learning_rate": 6.066498153718735e-08, "logits/chosen": -3.2647364139556885, "logits/rejected": -3.084054470062256, "logps/chosen": -260.8603210449219, "logps/rejected": -1266.4725341796875, "loss": 0.3702, "rewards/accuracies": 1.0, "rewards/chosen": 0.6641120910644531, "rewards/margins": 3.9598276615142822, "rewards/rejected": -3.29571533203125, "step": 352 }, { "epoch": 0.45, "learning_rate": 6.046296371507533e-08, "logits/chosen": -3.198726177215576, "logits/rejected": -3.0870070457458496, "logps/chosen": -288.456298828125, "logps/rejected": -734.63671875, "loss": 0.4294, "rewards/accuracies": 1.0, "rewards/chosen": 0.39592134952545166, "rewards/margins": 2.300096035003662, "rewards/rejected": -1.9041748046875, "step": 353 }, { "epoch": 0.45, "learning_rate": 6.02607671097817e-08, "logits/chosen": -3.2159082889556885, "logits/rejected": -2.9490692615509033, "logps/chosen": -309.4449768066406, "logps/rejected": -1208.7926025390625, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": 0.5770339965820312, "rewards/margins": 3.8995132446289062, "rewards/rejected": -3.322479248046875, "step": 354 }, { "epoch": 0.45, "learning_rate": 6.005839517628861e-08, "logits/chosen": -3.1883654594421387, "logits/rejected": -3.049473285675049, "logps/chosen": -289.34393310546875, "logps/rejected": -1186.1004638671875, "loss": 0.3747, "rewards/accuracies": 1.0, "rewards/chosen": 0.5959351062774658, "rewards/margins": 3.72637939453125, "rewards/rejected": -3.1304445266723633, "step": 355 }, { "epoch": 0.45, "learning_rate": 5.985585137257401e-08, "logits/chosen": -3.1301755905151367, "logits/rejected": -2.9297738075256348, "logps/chosen": -275.662841796875, "logps/rejected": -1885.32470703125, "loss": 0.3815, "rewards/accuracies": 1.0, "rewards/chosen": 0.6412284970283508, "rewards/margins": 5.464373588562012, "rewards/rejected": -4.823144912719727, "step": 356 }, { "epoch": 0.46, "learning_rate": 5.965313915955268e-08, "logits/chosen": -3.1387076377868652, "logits/rejected": -3.0147385597229004, "logps/chosen": -288.5281982421875, "logps/rejected": -1651.32470703125, "loss": 0.3411, "rewards/accuracies": 1.0, "rewards/chosen": 0.4480941891670227, "rewards/margins": 5.168336868286133, "rewards/rejected": -4.720242500305176, "step": 357 }, { "epoch": 0.46, "learning_rate": 5.945026200101701e-08, "logits/chosen": -3.164442539215088, "logits/rejected": -2.9346213340759277, "logps/chosen": -306.4716796875, "logps/rejected": -1408.7919921875, "loss": 0.4249, "rewards/accuracies": 1.0, "rewards/chosen": 0.6446472406387329, "rewards/margins": 4.945733547210693, "rewards/rejected": -4.30108642578125, "step": 358 }, { "epoch": 0.46, "learning_rate": 5.9247223363577924e-08, "logits/chosen": -3.1809613704681396, "logits/rejected": -3.0968637466430664, "logps/chosen": -334.51263427734375, "logps/rejected": -2364.450439453125, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 0.5502792596817017, "rewards/margins": 7.2457594871521, "rewards/rejected": -6.6954803466796875, "step": 359 }, { "epoch": 0.46, "learning_rate": 5.90440267166055e-08, "logits/chosen": -3.200594902038574, "logits/rejected": -3.1262640953063965, "logps/chosen": -306.59051513671875, "logps/rejected": -933.5364379882812, "loss": 0.3586, "rewards/accuracies": 1.0, "rewards/chosen": 0.5721511840820312, "rewards/margins": 3.8732621669769287, "rewards/rejected": -3.3011109828948975, "step": 360 }, { "epoch": 0.46, "learning_rate": 5.8840675532169806e-08, "logits/chosen": -3.242427349090576, "logits/rejected": -3.0789642333984375, "logps/chosen": -269.59332275390625, "logps/rejected": -408.001220703125, "loss": 0.427, "rewards/accuracies": 1.0, "rewards/chosen": 0.39833372831344604, "rewards/margins": 1.468550205230713, "rewards/rejected": -1.070216417312622, "step": 361 }, { "epoch": 0.46, "learning_rate": 5.8637173284981525e-08, "logits/chosen": -3.2428317070007324, "logits/rejected": -2.9990756511688232, "logps/chosen": -302.956298828125, "logps/rejected": -1397.58935546875, "loss": 0.4101, "rewards/accuracies": 1.0, "rewards/chosen": 0.42158281803131104, "rewards/margins": 4.8087873458862305, "rewards/rejected": -4.387204170227051, "step": 362 }, { "epoch": 0.46, "learning_rate": 5.843352345233257e-08, "logits/chosen": -3.2510440349578857, "logits/rejected": -3.1663875579833984, "logps/chosen": -309.576416015625, "logps/rejected": -754.7469482421875, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": 0.37456971406936646, "rewards/margins": 2.4045701026916504, "rewards/rejected": -2.0300002098083496, "step": 363 }, { "epoch": 0.46, "learning_rate": 5.8229729514036697e-08, "logits/chosen": -3.248162269592285, "logits/rejected": -2.8519139289855957, "logps/chosen": -307.92926025390625, "logps/rejected": -2107.53662109375, "loss": 0.4432, "rewards/accuracies": 1.0, "rewards/chosen": 0.6793212890625, "rewards/margins": 7.593079090118408, "rewards/rejected": -6.913758277893066, "step": 364 }, { "epoch": 0.47, "learning_rate": 5.802579495237003e-08, "logits/chosen": -3.1610569953918457, "logits/rejected": -3.130971908569336, "logps/chosen": -296.58349609375, "logps/rejected": -556.6192016601562, "loss": 0.4804, "rewards/accuracies": 1.0, "rewards/chosen": 0.6802002191543579, "rewards/margins": 2.099377393722534, "rewards/rejected": -1.4191772937774658, "step": 365 }, { "epoch": 0.47, "learning_rate": 5.7821723252011546e-08, "logits/chosen": -3.1973013877868652, "logits/rejected": -2.8949573040008545, "logps/chosen": -356.1853332519531, "logps/rejected": -1248.0517578125, "loss": 0.4219, "rewards/accuracies": 1.0, "rewards/chosen": 0.32774507999420166, "rewards/margins": 4.079756259918213, "rewards/rejected": -3.7520110607147217, "step": 366 }, { "epoch": 0.47, "learning_rate": 5.7617517899983546e-08, "logits/chosen": -3.198963165283203, "logits/rejected": -3.0584235191345215, "logps/chosen": -305.49755859375, "logps/rejected": -1003.0659790039062, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 0.576812744140625, "rewards/margins": 3.6630630493164062, "rewards/rejected": -3.0862503051757812, "step": 367 }, { "epoch": 0.47, "learning_rate": 5.741318238559209e-08, "logits/chosen": -3.1088180541992188, "logits/rejected": -3.2090542316436768, "logps/chosen": -332.8783264160156, "logps/rejected": -931.2759399414062, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 0.6366760730743408, "rewards/margins": 3.3732542991638184, "rewards/rejected": -2.7365784645080566, "step": 368 }, { "epoch": 0.47, "learning_rate": 5.7208720200367334e-08, "logits/chosen": -3.1358304023742676, "logits/rejected": -3.087451457977295, "logps/chosen": -335.411865234375, "logps/rejected": -613.3777465820312, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 0.6406418085098267, "rewards/margins": 2.3040413856506348, "rewards/rejected": -1.663399577140808, "step": 369 }, { "epoch": 0.47, "learning_rate": 5.7004134838003895e-08, "logits/chosen": -3.2219905853271484, "logits/rejected": -3.1817572116851807, "logps/chosen": -303.64892578125, "logps/rejected": -738.6241455078125, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 0.6848938465118408, "rewards/margins": 3.2034544944763184, "rewards/rejected": -2.5185608863830566, "step": 370 }, { "epoch": 0.47, "learning_rate": 5.6799429794301135e-08, "logits/chosen": -3.2321670055389404, "logits/rejected": -3.040034770965576, "logps/chosen": -301.8544921875, "logps/rejected": -438.642333984375, "loss": 0.4094, "rewards/accuracies": 1.0, "rewards/chosen": 0.5522842407226562, "rewards/margins": 1.8727128505706787, "rewards/rejected": -1.320428490638733, "step": 371 }, { "epoch": 0.47, "learning_rate": 5.659460856710345e-08, "logits/chosen": -3.221762180328369, "logits/rejected": -3.129549741744995, "logps/chosen": -301.0585021972656, "logps/rejected": -971.7654418945312, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 0.6064910888671875, "rewards/margins": 3.513394355773926, "rewards/rejected": -2.9069032669067383, "step": 372 }, { "epoch": 0.48, "learning_rate": 5.63896746562405e-08, "logits/chosen": -3.188143253326416, "logits/rejected": -3.1325526237487793, "logps/chosen": -254.2765350341797, "logps/rejected": -746.89111328125, "loss": 0.4395, "rewards/accuracies": 1.0, "rewards/chosen": 0.5932877063751221, "rewards/margins": 2.7942185401916504, "rewards/rejected": -2.2009308338165283, "step": 373 }, { "epoch": 0.48, "learning_rate": 5.618463156346739e-08, "logits/chosen": -3.2358574867248535, "logits/rejected": -3.0821471214294434, "logps/chosen": -303.78179931640625, "logps/rejected": -434.3711853027344, "loss": 0.4794, "rewards/accuracies": 1.0, "rewards/chosen": 0.5801025629043579, "rewards/margins": 1.504614233970642, "rewards/rejected": -0.924511730670929, "step": 374 }, { "epoch": 0.48, "learning_rate": 5.597948279240483e-08, "logits/chosen": -3.1925768852233887, "logits/rejected": -3.08958101272583, "logps/chosen": -377.38372802734375, "logps/rejected": -631.8582763671875, "loss": 0.4424, "rewards/accuracies": 1.0, "rewards/chosen": 0.45956575870513916, "rewards/margins": 1.9082214832305908, "rewards/rejected": -1.4486557245254517, "step": 375 }, { "epoch": 0.48, "learning_rate": 5.5774231848479313e-08, "logits/chosen": -3.216506242752075, "logits/rejected": -2.9710140228271484, "logps/chosen": -336.973388671875, "logps/rejected": -1973.974365234375, "loss": 0.3584, "rewards/accuracies": 1.0, "rewards/chosen": 0.5115921497344971, "rewards/margins": 6.2433977127075195, "rewards/rejected": -5.731805801391602, "step": 376 }, { "epoch": 0.48, "learning_rate": 5.556888223886315e-08, "logits/chosen": -3.189899206161499, "logits/rejected": -3.099536418914795, "logps/chosen": -296.1923522949219, "logps/rejected": -708.3084106445312, "loss": 0.4243, "rewards/accuracies": 1.0, "rewards/chosen": 0.48807069659233093, "rewards/margins": 2.2460968494415283, "rewards/rejected": -1.758026123046875, "step": 377 }, { "epoch": 0.48, "learning_rate": 5.536343747241459e-08, "logits/chosen": -3.2063980102539062, "logits/rejected": -3.1263179779052734, "logps/chosen": -303.41064453125, "logps/rejected": -623.3209228515625, "loss": 0.4359, "rewards/accuracies": 1.0, "rewards/chosen": 0.4863540530204773, "rewards/margins": 2.2163329124450684, "rewards/rejected": -1.7299789190292358, "step": 378 }, { "epoch": 0.48, "learning_rate": 5.515790105961785e-08, "logits/chosen": -3.2091307640075684, "logits/rejected": -3.096869468688965, "logps/chosen": -308.23992919921875, "logps/rejected": -552.718017578125, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": 0.5659225583076477, "rewards/margins": 2.062835693359375, "rewards/rejected": -1.496913194656372, "step": 379 }, { "epoch": 0.48, "learning_rate": 5.495227651252314e-08, "logits/chosen": -3.170034170150757, "logits/rejected": -3.1193249225616455, "logps/chosen": -361.4864501953125, "logps/rejected": -658.8186645507812, "loss": 0.422, "rewards/accuracies": 1.0, "rewards/chosen": 0.49003908038139343, "rewards/margins": 2.149932861328125, "rewards/rejected": -1.6598937511444092, "step": 380 }, { "epoch": 0.49, "learning_rate": 5.474656734468662e-08, "logits/chosen": -3.1895508766174316, "logits/rejected": -3.010605812072754, "logps/chosen": -293.01611328125, "logps/rejected": -314.55438232421875, "loss": 0.4217, "rewards/accuracies": 1.0, "rewards/chosen": 0.6266059875488281, "rewards/margins": 1.2529778480529785, "rewards/rejected": -0.6263717412948608, "step": 381 }, { "epoch": 0.49, "learning_rate": 5.454077707111041e-08, "logits/chosen": -3.1510937213897705, "logits/rejected": -3.0209999084472656, "logps/chosen": -311.19635009765625, "logps/rejected": -1508.339599609375, "loss": 0.3846, "rewards/accuracies": 1.0, "rewards/chosen": 0.5679657459259033, "rewards/margins": 5.9072418212890625, "rewards/rejected": -5.339276313781738, "step": 382 }, { "epoch": 0.49, "learning_rate": 5.433490920818249e-08, "logits/chosen": -3.215914726257324, "logits/rejected": -3.1172261238098145, "logps/chosen": -313.872314453125, "logps/rejected": -978.7078857421875, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 0.6341918706893921, "rewards/margins": 3.608196973800659, "rewards/rejected": -2.9740049839019775, "step": 383 }, { "epoch": 0.49, "learning_rate": 5.4128967273616623e-08, "logits/chosen": -3.1991796493530273, "logits/rejected": -3.117245674133301, "logps/chosen": -338.6932067871094, "logps/rejected": -686.8824462890625, "loss": 0.3963, "rewards/accuracies": 1.0, "rewards/chosen": 0.6985352039337158, "rewards/margins": 3.1957809925079346, "rewards/rejected": -2.4972457885742188, "step": 384 }, { "epoch": 0.49, "learning_rate": 5.392295478639225e-08, "logits/chosen": -3.18943190574646, "logits/rejected": -3.1652956008911133, "logps/chosen": -296.5998229980469, "logps/rejected": -850.2589111328125, "loss": 0.3386, "rewards/accuracies": 1.0, "rewards/chosen": 0.4179534912109375, "rewards/margins": 3.12969970703125, "rewards/rejected": -2.7117462158203125, "step": 385 }, { "epoch": 0.49, "learning_rate": 5.3716875266694385e-08, "logits/chosen": -3.1747264862060547, "logits/rejected": -3.0396621227264404, "logps/chosen": -298.94476318359375, "logps/rejected": -761.3916625976562, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 0.5151214599609375, "rewards/margins": 2.423083543777466, "rewards/rejected": -1.9079620838165283, "step": 386 }, { "epoch": 0.49, "learning_rate": 5.351073223585341e-08, "logits/chosen": -3.166252851486206, "logits/rejected": -3.0785176753997803, "logps/chosen": -275.1827392578125, "logps/rejected": -813.380859375, "loss": 0.336, "rewards/accuracies": 1.0, "rewards/chosen": 0.5653831362724304, "rewards/margins": 3.1253669261932373, "rewards/rejected": -2.559983730316162, "step": 387 }, { "epoch": 0.49, "learning_rate": 5.3304529216284965e-08, "logits/chosen": -3.2658495903015137, "logits/rejected": -2.9212400913238525, "logps/chosen": -295.725830078125, "logps/rejected": -2336.459228515625, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 0.5461273193359375, "rewards/margins": 7.732700347900391, "rewards/rejected": -7.186572551727295, "step": 388 }, { "epoch": 0.5, "learning_rate": 5.309826973142973e-08, "logits/chosen": -3.2464656829833984, "logits/rejected": -3.202122211456299, "logps/chosen": -276.9617614746094, "logps/rejected": -768.829833984375, "loss": 0.3592, "rewards/accuracies": 1.0, "rewards/chosen": 0.6378067135810852, "rewards/margins": 3.044743537902832, "rewards/rejected": -2.4069366455078125, "step": 389 }, { "epoch": 0.5, "learning_rate": 5.28919573056932e-08, "logits/chosen": -3.242055892944336, "logits/rejected": -3.0543785095214844, "logps/chosen": -361.5600280761719, "logps/rejected": -3260.323486328125, "loss": 0.446, "rewards/accuracies": 1.0, "rewards/chosen": 0.42770540714263916, "rewards/margins": 8.6927490234375, "rewards/rejected": -8.265044212341309, "step": 390 }, { "epoch": 0.5, "learning_rate": 5.268559546438549e-08, "logits/chosen": -3.2106716632843018, "logits/rejected": -3.0833044052124023, "logps/chosen": -267.7562255859375, "logps/rejected": -596.6238403320312, "loss": 0.4187, "rewards/accuracies": 1.0, "rewards/chosen": 0.6427253484725952, "rewards/margins": 2.1468634605407715, "rewards/rejected": -1.5041382312774658, "step": 391 }, { "epoch": 0.5, "learning_rate": 5.2479187733661114e-08, "logits/chosen": -3.1712112426757812, "logits/rejected": -3.0280351638793945, "logps/chosen": -310.35052490234375, "logps/rejected": -811.8780517578125, "loss": 0.3965, "rewards/accuracies": 1.0, "rewards/chosen": 0.5920822620391846, "rewards/margins": 2.6683363914489746, "rewards/rejected": -2.076254367828369, "step": 392 }, { "epoch": 0.5, "learning_rate": 5.227273764045868e-08, "logits/chosen": -3.129927158355713, "logits/rejected": -2.975839614868164, "logps/chosen": -285.79180908203125, "logps/rejected": -606.9668579101562, "loss": 0.4009, "rewards/accuracies": 1.0, "rewards/chosen": 0.6272323727607727, "rewards/margins": 2.1615495681762695, "rewards/rejected": -1.5343170166015625, "step": 393 }, { "epoch": 0.5, "learning_rate": 5.2066248712440654e-08, "logits/chosen": -3.186248779296875, "logits/rejected": -2.967146873474121, "logps/chosen": -281.67767333984375, "logps/rejected": -3178.98095703125, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 0.6050910949707031, "rewards/margins": 7.412638187408447, "rewards/rejected": -6.807547092437744, "step": 394 }, { "epoch": 0.5, "learning_rate": 5.185972447793312e-08, "logits/chosen": -3.195786237716675, "logits/rejected": -3.014780044555664, "logps/chosen": -294.686279296875, "logps/rejected": -1145.607666015625, "loss": 0.378, "rewards/accuracies": 1.0, "rewards/chosen": 0.4898117184638977, "rewards/margins": 3.983384847640991, "rewards/rejected": -3.4935731887817383, "step": 395 }, { "epoch": 0.5, "learning_rate": 5.16531684658654e-08, "logits/chosen": -3.174570322036743, "logits/rejected": -2.989135980606079, "logps/chosen": -286.1412048339844, "logps/rejected": -1072.724365234375, "loss": 0.4525, "rewards/accuracies": 1.0, "rewards/chosen": 0.5179932117462158, "rewards/margins": 3.302964687347412, "rewards/rejected": -2.7849717140197754, "step": 396 }, { "epoch": 0.51, "learning_rate": 5.1446584205709856e-08, "logits/chosen": -3.1776952743530273, "logits/rejected": -2.9835612773895264, "logps/chosen": -314.298095703125, "logps/rejected": -3107.71923828125, "loss": 0.3487, "rewards/accuracies": 1.0, "rewards/chosen": 0.6482772827148438, "rewards/margins": 9.714640617370605, "rewards/rejected": -9.066363334655762, "step": 397 }, { "epoch": 0.51, "learning_rate": 5.123997522742151e-08, "logits/chosen": -3.2705726623535156, "logits/rejected": -3.1116838455200195, "logps/chosen": -295.44970703125, "logps/rejected": -472.2664794921875, "loss": 0.4006, "rewards/accuracies": 1.0, "rewards/chosen": 0.5833083987236023, "rewards/margins": 2.0314621925354004, "rewards/rejected": -1.4481537342071533, "step": 398 }, { "epoch": 0.51, "learning_rate": 5.103334506137772e-08, "logits/chosen": -3.2147769927978516, "logits/rejected": -2.916189193725586, "logps/chosen": -309.2398986816406, "logps/rejected": -1117.9913330078125, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": 0.550152599811554, "rewards/margins": 3.6230835914611816, "rewards/rejected": -3.0729310512542725, "step": 399 }, { "epoch": 0.51, "learning_rate": 5.082669723831793e-08, "logits/chosen": -3.2036144733428955, "logits/rejected": -3.099757671356201, "logps/chosen": -274.186767578125, "logps/rejected": -1155.1396484375, "loss": 0.383, "rewards/accuracies": 1.0, "rewards/chosen": 0.539276123046875, "rewards/margins": 4.467303276062012, "rewards/rejected": -3.928027391433716, "step": 400 }, { "epoch": 0.51, "learning_rate": 5.062003528928327e-08, "logits/chosen": -3.2463061809539795, "logits/rejected": -3.077441453933716, "logps/chosen": -317.15203857421875, "logps/rejected": -714.6187133789062, "loss": 0.3948, "rewards/accuracies": 1.0, "rewards/chosen": 0.6456009149551392, "rewards/margins": 2.955152988433838, "rewards/rejected": -2.309551954269409, "step": 401 }, { "epoch": 0.51, "learning_rate": 5.041336274555624e-08, "logits/chosen": -3.1866655349731445, "logits/rejected": -3.0052385330200195, "logps/chosen": -328.8551025390625, "logps/rejected": -3260.66259765625, "loss": 0.4012, "rewards/accuracies": 1.0, "rewards/chosen": 0.5886688232421875, "rewards/margins": 9.829867362976074, "rewards/rejected": -9.241198539733887, "step": 402 }, { "epoch": 0.51, "learning_rate": 5.0206683138600414e-08, "logits/chosen": -3.168546438217163, "logits/rejected": -3.1083011627197266, "logps/chosen": -395.8145751953125, "logps/rejected": -1136.02490234375, "loss": 0.4023, "rewards/accuracies": 1.0, "rewards/chosen": 0.5005966424942017, "rewards/margins": 3.662053108215332, "rewards/rejected": -3.161456346511841, "step": 403 }, { "epoch": 0.51, "learning_rate": 5e-08, "logits/chosen": -3.2205591201782227, "logits/rejected": -3.1303653717041016, "logps/chosen": -265.9959411621094, "logps/rejected": -406.21563720703125, "loss": 0.4148, "rewards/accuracies": 1.0, "rewards/chosen": 0.5852249264717102, "rewards/margins": 1.840388536453247, "rewards/rejected": -1.2551636695861816, "step": 404 }, { "epoch": 0.52, "learning_rate": 4.9793316861399595e-08, "logits/chosen": -3.2379775047302246, "logits/rejected": -3.0118536949157715, "logps/chosen": -280.91046142578125, "logps/rejected": -1451.888916015625, "loss": 0.3441, "rewards/accuracies": 1.0, "rewards/chosen": 0.6211913824081421, "rewards/margins": 4.857804775238037, "rewards/rejected": -4.236613750457764, "step": 405 }, { "epoch": 0.52, "learning_rate": 4.9586637254443753e-08, "logits/chosen": -3.2794084548950195, "logits/rejected": -3.1248879432678223, "logps/chosen": -282.2588806152344, "logps/rejected": -604.1436157226562, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 0.4979843497276306, "rewards/margins": 2.3752822875976562, "rewards/rejected": -1.8772979974746704, "step": 406 }, { "epoch": 0.52, "learning_rate": 4.937996471071675e-08, "logits/chosen": -3.1536147594451904, "logits/rejected": -3.105973720550537, "logps/chosen": -310.077880859375, "logps/rejected": -1037.4150390625, "loss": 0.4027, "rewards/accuracies": 1.0, "rewards/chosen": 0.7135879993438721, "rewards/margins": 3.826115608215332, "rewards/rejected": -3.112527370452881, "step": 407 }, { "epoch": 0.52, "learning_rate": 4.917330276168207e-08, "logits/chosen": -3.16801381111145, "logits/rejected": -2.991434097290039, "logps/chosen": -295.65625, "logps/rejected": -1225.750244140625, "loss": 0.3461, "rewards/accuracies": 1.0, "rewards/chosen": 0.6913940906524658, "rewards/margins": 4.22755765914917, "rewards/rejected": -3.536163330078125, "step": 408 }, { "epoch": 0.52, "learning_rate": 4.8966654938622295e-08, "logits/chosen": -3.183180332183838, "logits/rejected": -3.1251840591430664, "logps/chosen": -296.36968994140625, "logps/rejected": -1070.8380126953125, "loss": 0.4331, "rewards/accuracies": 1.0, "rewards/chosen": 0.6264068484306335, "rewards/margins": 4.133169651031494, "rewards/rejected": -3.506762742996216, "step": 409 }, { "epoch": 0.52, "learning_rate": 4.8760024772578495e-08, "logits/chosen": -3.204259157180786, "logits/rejected": -3.065732479095459, "logps/chosen": -297.00848388671875, "logps/rejected": -195.59640502929688, "loss": 0.4238, "rewards/accuracies": 1.0, "rewards/chosen": 0.5604599118232727, "rewards/margins": 0.9594337940216064, "rewards/rejected": -0.39897385239601135, "step": 410 }, { "epoch": 0.52, "learning_rate": 4.855341579429014e-08, "logits/chosen": -3.181199073791504, "logits/rejected": -3.0626821517944336, "logps/chosen": -357.794921875, "logps/rejected": -831.8054809570312, "loss": 0.4134, "rewards/accuracies": 1.0, "rewards/chosen": 0.6356521844863892, "rewards/margins": 2.4917068481445312, "rewards/rejected": -1.8560547828674316, "step": 411 }, { "epoch": 0.53, "learning_rate": 4.834683153413459e-08, "logits/chosen": -3.2219948768615723, "logits/rejected": -3.083454132080078, "logps/chosen": -360.8112487792969, "logps/rejected": -768.890625, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": 0.696002185344696, "rewards/margins": 2.465292453765869, "rewards/rejected": -1.7692902088165283, "step": 412 }, { "epoch": 0.53, "learning_rate": 4.814027552206689e-08, "logits/chosen": -3.1529219150543213, "logits/rejected": -3.0561656951904297, "logps/chosen": -322.585205078125, "logps/rejected": -885.2293701171875, "loss": 0.4237, "rewards/accuracies": 1.0, "rewards/chosen": 0.5888397693634033, "rewards/margins": 3.515284538269043, "rewards/rejected": -2.9264450073242188, "step": 413 }, { "epoch": 0.53, "learning_rate": 4.793375128755933e-08, "logits/chosen": -3.1592960357666016, "logits/rejected": -3.130289077758789, "logps/chosen": -342.1187744140625, "logps/rejected": -1167.467041015625, "loss": 0.4043, "rewards/accuracies": 1.0, "rewards/chosen": 0.657745361328125, "rewards/margins": 4.110632419586182, "rewards/rejected": -3.4528870582580566, "step": 414 }, { "epoch": 0.53, "learning_rate": 4.7727262359541324e-08, "logits/chosen": -3.2087855339050293, "logits/rejected": -3.1228184700012207, "logps/chosen": -283.19451904296875, "logps/rejected": -542.4149169921875, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 0.4185684323310852, "rewards/margins": 2.1915009021759033, "rewards/rejected": -1.7729324102401733, "step": 415 }, { "epoch": 0.53, "learning_rate": 4.7520812266338875e-08, "logits/chosen": -3.146057367324829, "logits/rejected": -2.904390811920166, "logps/chosen": -355.3348693847656, "logps/rejected": -920.7169189453125, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 0.6512451171875, "rewards/margins": 3.3315751552581787, "rewards/rejected": -2.6803300380706787, "step": 416 }, { "epoch": 0.53, "learning_rate": 4.7314404535614514e-08, "logits/chosen": -3.2027125358581543, "logits/rejected": -3.089214324951172, "logps/chosen": -269.82110595703125, "logps/rejected": -936.6822509765625, "loss": 0.4673, "rewards/accuracies": 1.0, "rewards/chosen": 0.4193382263183594, "rewards/margins": 3.745687246322632, "rewards/rejected": -3.3263490200042725, "step": 417 }, { "epoch": 0.53, "learning_rate": 4.7108042694306806e-08, "logits/chosen": -3.207357883453369, "logits/rejected": -3.1308059692382812, "logps/chosen": -320.91534423828125, "logps/rejected": -547.9696044921875, "loss": 0.4009, "rewards/accuracies": 1.0, "rewards/chosen": 0.5833419561386108, "rewards/margins": 2.099331855773926, "rewards/rejected": -1.5159897804260254, "step": 418 }, { "epoch": 0.53, "learning_rate": 4.690173026857027e-08, "logits/chosen": -3.11209774017334, "logits/rejected": -2.9996190071105957, "logps/chosen": -313.864990234375, "logps/rejected": -1714.847900390625, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 0.8171142935752869, "rewards/margins": 6.216732978820801, "rewards/rejected": -5.399618625640869, "step": 419 }, { "epoch": 0.54, "learning_rate": 4.669547078371503e-08, "logits/chosen": -3.173478126525879, "logits/rejected": -3.0958776473999023, "logps/chosen": -335.9696960449219, "logps/rejected": -898.0482177734375, "loss": 0.357, "rewards/accuracies": 1.0, "rewards/chosen": 0.7239792346954346, "rewards/margins": 3.7373549938201904, "rewards/rejected": -3.013375759124756, "step": 420 }, { "epoch": 0.54, "learning_rate": 4.648926776414659e-08, "logits/chosen": -3.3034157752990723, "logits/rejected": -3.069611072540283, "logps/chosen": -294.08251953125, "logps/rejected": -540.9917602539062, "loss": 0.3999, "rewards/accuracies": 1.0, "rewards/chosen": 0.5947319269180298, "rewards/margins": 1.9745888710021973, "rewards/rejected": -1.3798569440841675, "step": 421 }, { "epoch": 0.54, "learning_rate": 4.6283124733305623e-08, "logits/chosen": -3.2436113357543945, "logits/rejected": -3.1534056663513184, "logps/chosen": -350.60064697265625, "logps/rejected": -453.30560302734375, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 0.4789932370185852, "rewards/margins": 1.7662781476974487, "rewards/rejected": -1.2872848510742188, "step": 422 }, { "epoch": 0.54, "learning_rate": 4.6077045213607754e-08, "logits/chosen": -3.1710448265075684, "logits/rejected": -3.215569496154785, "logps/chosen": -272.9000244140625, "logps/rejected": -1153.1151123046875, "loss": 0.3354, "rewards/accuracies": 1.0, "rewards/chosen": 0.5614792108535767, "rewards/margins": 4.21493673324585, "rewards/rejected": -3.6534576416015625, "step": 423 }, { "epoch": 0.54, "learning_rate": 4.5871032726383385e-08, "logits/chosen": -3.251715660095215, "logits/rejected": -2.954317092895508, "logps/chosen": -310.1248779296875, "logps/rejected": -1499.52490234375, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": 0.7279952764511108, "rewards/margins": 5.327824592590332, "rewards/rejected": -4.599829196929932, "step": 424 }, { "epoch": 0.54, "learning_rate": 4.566509079181751e-08, "logits/chosen": -3.081737995147705, "logits/rejected": -2.8800411224365234, "logps/chosen": -336.6192626953125, "logps/rejected": -1676.2899169921875, "loss": 0.3943, "rewards/accuracies": 1.0, "rewards/chosen": 0.3818649351596832, "rewards/margins": 4.7561750411987305, "rewards/rejected": -4.374310493469238, "step": 425 }, { "epoch": 0.54, "learning_rate": 4.5459222928889584e-08, "logits/chosen": -3.244499444961548, "logits/rejected": -3.139596939086914, "logps/chosen": -310.95440673828125, "logps/rejected": -1041.111328125, "loss": 0.3897, "rewards/accuracies": 1.0, "rewards/chosen": 0.5178360342979431, "rewards/margins": 3.7540910243988037, "rewards/rejected": -3.236254930496216, "step": 426 }, { "epoch": 0.54, "learning_rate": 4.525343265531338e-08, "logits/chosen": -3.2109334468841553, "logits/rejected": -3.052732467651367, "logps/chosen": -333.5762634277344, "logps/rejected": -631.702392578125, "loss": 0.4454, "rewards/accuracies": 1.0, "rewards/chosen": 0.522473156452179, "rewards/margins": 2.0010955333709717, "rewards/rejected": -1.4786224365234375, "step": 427 }, { "epoch": 0.55, "learning_rate": 4.504772348747686e-08, "logits/chosen": -3.179661273956299, "logits/rejected": -3.0750999450683594, "logps/chosen": -265.528564453125, "logps/rejected": -980.9937744140625, "loss": 0.3824, "rewards/accuracies": 1.0, "rewards/chosen": 0.5891349911689758, "rewards/margins": 3.249343156814575, "rewards/rejected": -2.660208225250244, "step": 428 }, { "epoch": 0.55, "learning_rate": 4.484209894038215e-08, "logits/chosen": -3.1786696910858154, "logits/rejected": -3.0327324867248535, "logps/chosen": -308.4676818847656, "logps/rejected": -414.2895202636719, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 0.6330177187919617, "rewards/margins": 1.5574073791503906, "rewards/rejected": -0.924389660358429, "step": 429 }, { "epoch": 0.55, "learning_rate": 4.463656252758542e-08, "logits/chosen": -3.189028263092041, "logits/rejected": -2.9923415184020996, "logps/chosen": -311.518310546875, "logps/rejected": -1639.14697265625, "loss": 0.3606, "rewards/accuracies": 1.0, "rewards/chosen": 0.6845306158065796, "rewards/margins": 6.43199348449707, "rewards/rejected": -5.747462749481201, "step": 430 }, { "epoch": 0.55, "learning_rate": 4.443111776113686e-08, "logits/chosen": -3.221428155899048, "logits/rejected": -2.9014949798583984, "logps/chosen": -292.84344482421875, "logps/rejected": -1244.086181640625, "loss": 0.367, "rewards/accuracies": 1.0, "rewards/chosen": 0.49191439151763916, "rewards/margins": 4.656263828277588, "rewards/rejected": -4.164349555969238, "step": 431 }, { "epoch": 0.55, "learning_rate": 4.4225768151520695e-08, "logits/chosen": -3.1147282123565674, "logits/rejected": -3.0418295860290527, "logps/chosen": -346.7253112792969, "logps/rejected": -977.7135009765625, "loss": 0.3733, "rewards/accuracies": 1.0, "rewards/chosen": 0.4741409420967102, "rewards/margins": 3.310555934906006, "rewards/rejected": -2.8364150524139404, "step": 432 }, { "epoch": 0.55, "learning_rate": 4.402051720759518e-08, "logits/chosen": -3.1369447708129883, "logits/rejected": -3.1226706504821777, "logps/chosen": -306.972412109375, "logps/rejected": -358.23834228515625, "loss": 0.4071, "rewards/accuracies": 1.0, "rewards/chosen": 0.7444823980331421, "rewards/margins": 1.637629747390747, "rewards/rejected": -0.893147349357605, "step": 433 }, { "epoch": 0.55, "learning_rate": 4.3815368436532614e-08, "logits/chosen": -3.132817268371582, "logits/rejected": -3.0784127712249756, "logps/chosen": -303.44097900390625, "logps/rejected": -712.1101684570312, "loss": 0.3981, "rewards/accuracies": 1.0, "rewards/chosen": 0.6578658819198608, "rewards/margins": 2.787275791168213, "rewards/rejected": -2.1294097900390625, "step": 434 }, { "epoch": 0.55, "learning_rate": 4.361032534375951e-08, "logits/chosen": -3.191828727722168, "logits/rejected": -3.1145267486572266, "logps/chosen": -304.6988525390625, "logps/rejected": -435.693359375, "loss": 0.4543, "rewards/accuracies": 1.0, "rewards/chosen": 0.6216766834259033, "rewards/margins": 1.5341278314590454, "rewards/rejected": -0.9124511480331421, "step": 435 }, { "epoch": 0.56, "learning_rate": 4.340539143289655e-08, "logits/chosen": -3.2068073749542236, "logits/rejected": -3.0112967491149902, "logps/chosen": -302.16827392578125, "logps/rejected": -304.2091064453125, "loss": 0.4452, "rewards/accuracies": 1.0, "rewards/chosen": 0.6297012567520142, "rewards/margins": 1.3682372570037842, "rewards/rejected": -0.7385361194610596, "step": 436 }, { "epoch": 0.56, "learning_rate": 4.320057020569888e-08, "logits/chosen": -3.202565908432007, "logits/rejected": -3.0543136596679688, "logps/chosen": -300.71331787109375, "logps/rejected": -1472.1444091796875, "loss": 0.4445, "rewards/accuracies": 1.0, "rewards/chosen": 0.695721447467804, "rewards/margins": 5.474752902984619, "rewards/rejected": -4.779031753540039, "step": 437 }, { "epoch": 0.56, "learning_rate": 4.29958651619961e-08, "logits/chosen": -3.168929100036621, "logits/rejected": -2.994929790496826, "logps/chosen": -296.6910400390625, "logps/rejected": -913.5280151367188, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 0.6127212643623352, "rewards/margins": 2.7225356101989746, "rewards/rejected": -2.1098146438598633, "step": 438 }, { "epoch": 0.56, "learning_rate": 4.279127979963266e-08, "logits/chosen": -3.2409119606018066, "logits/rejected": -3.1529431343078613, "logps/chosen": -274.5456237792969, "logps/rejected": -521.24609375, "loss": 0.4207, "rewards/accuracies": 1.0, "rewards/chosen": 0.3792564570903778, "rewards/margins": 1.870581865310669, "rewards/rejected": -1.4913253784179688, "step": 439 }, { "epoch": 0.56, "learning_rate": 4.2586817614407896e-08, "logits/chosen": -3.1785550117492676, "logits/rejected": -3.104673147201538, "logps/chosen": -326.68988037109375, "logps/rejected": -808.768798828125, "loss": 0.457, "rewards/accuracies": 1.0, "rewards/chosen": 0.6791412234306335, "rewards/margins": 3.066537380218506, "rewards/rejected": -2.3873963356018066, "step": 440 }, { "epoch": 0.56, "learning_rate": 4.238248210001645e-08, "logits/chosen": -3.2504398822784424, "logits/rejected": -3.127967357635498, "logps/chosen": -315.4090576171875, "logps/rejected": -415.2513122558594, "loss": 0.44, "rewards/accuracies": 1.0, "rewards/chosen": 0.5789902210235596, "rewards/margins": 1.5562775135040283, "rewards/rejected": -0.9772872924804688, "step": 441 }, { "epoch": 0.56, "learning_rate": 4.217827674798844e-08, "logits/chosen": -3.2256736755371094, "logits/rejected": -2.9833106994628906, "logps/chosen": -289.737548828125, "logps/rejected": -972.8110961914062, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": 0.5857048034667969, "rewards/margins": 3.7130134105682373, "rewards/rejected": -3.1273086071014404, "step": 442 }, { "epoch": 0.56, "learning_rate": 4.197420504762997e-08, "logits/chosen": -3.203711986541748, "logits/rejected": -3.0020227432250977, "logps/chosen": -306.5966796875, "logps/rejected": -1133.640625, "loss": 0.4113, "rewards/accuracies": 1.0, "rewards/chosen": 0.5348556637763977, "rewards/margins": 4.334768772125244, "rewards/rejected": -3.799913167953491, "step": 443 }, { "epoch": 0.57, "learning_rate": 4.177027048596329e-08, "logits/chosen": -3.1563985347747803, "logits/rejected": -3.1017937660217285, "logps/chosen": -275.43475341796875, "logps/rejected": -336.95147705078125, "loss": 0.4044, "rewards/accuracies": 1.0, "rewards/chosen": 0.6941497921943665, "rewards/margins": 1.6875596046447754, "rewards/rejected": -0.9934097528457642, "step": 444 }, { "epoch": 0.57, "learning_rate": 4.156647654766743e-08, "logits/chosen": -3.158172607421875, "logits/rejected": -3.150595188140869, "logps/chosen": -347.13031005859375, "logps/rejected": -874.6608276367188, "loss": 0.4117, "rewards/accuracies": 1.0, "rewards/chosen": 0.6817825436592102, "rewards/margins": 3.7921524047851562, "rewards/rejected": -3.1103696823120117, "step": 445 }, { "epoch": 0.57, "learning_rate": 4.13628267150185e-08, "logits/chosen": -3.1888532638549805, "logits/rejected": -3.0043959617614746, "logps/chosen": -271.24346923828125, "logps/rejected": -677.084228515625, "loss": 0.3929, "rewards/accuracies": 1.0, "rewards/chosen": 0.6222320795059204, "rewards/margins": 2.6309831142425537, "rewards/rejected": -2.0087509155273438, "step": 446 }, { "epoch": 0.57, "learning_rate": 4.1159324467830196e-08, "logits/chosen": -3.1882622241973877, "logits/rejected": -3.0434422492980957, "logps/chosen": -304.9939880371094, "logps/rejected": -952.2710571289062, "loss": 0.4222, "rewards/accuracies": 1.0, "rewards/chosen": 0.7368667721748352, "rewards/margins": 3.8095717430114746, "rewards/rejected": -3.072705030441284, "step": 447 }, { "epoch": 0.57, "learning_rate": 4.095597328339452e-08, "logits/chosen": -3.1593360900878906, "logits/rejected": -2.962174892425537, "logps/chosen": -293.75006103515625, "logps/rejected": -1101.1181640625, "loss": 0.4187, "rewards/accuracies": 1.0, "rewards/chosen": 0.6844100952148438, "rewards/margins": 4.001359939575195, "rewards/rejected": -3.3169496059417725, "step": 448 }, { "epoch": 0.57, "learning_rate": 4.075277663642208e-08, "logits/chosen": -3.182586669921875, "logits/rejected": -3.0737338066101074, "logps/chosen": -284.89886474609375, "logps/rejected": -702.7561645507812, "loss": 0.4062, "rewards/accuracies": 1.0, "rewards/chosen": 0.7490936517715454, "rewards/margins": 3.0320374965667725, "rewards/rejected": -2.2829437255859375, "step": 449 }, { "epoch": 0.57, "learning_rate": 4.054973799898299e-08, "logits/chosen": -3.1790544986724854, "logits/rejected": -3.0791513919830322, "logps/chosen": -317.34722900390625, "logps/rejected": -622.22509765625, "loss": 0.4231, "rewards/accuracies": 1.0, "rewards/chosen": 0.47265928983688354, "rewards/margins": 2.1122498512268066, "rewards/rejected": -1.6395905017852783, "step": 450 }, { "epoch": 0.57, "learning_rate": 4.0346860840447325e-08, "logits/chosen": -3.1752712726593018, "logits/rejected": -3.070650100708008, "logps/chosen": -280.7152099609375, "logps/rejected": -826.495849609375, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 0.5379509329795837, "rewards/margins": 2.766033172607422, "rewards/rejected": -2.2280821800231934, "step": 451 }, { "epoch": 0.58, "learning_rate": 4.014414862742599e-08, "logits/chosen": -3.2393293380737305, "logits/rejected": -2.9934096336364746, "logps/chosen": -303.299560546875, "logps/rejected": -919.615478515625, "loss": 0.3908, "rewards/accuracies": 1.0, "rewards/chosen": 0.5678207278251648, "rewards/margins": 2.9987807273864746, "rewards/rejected": -2.430960178375244, "step": 452 }, { "epoch": 0.58, "learning_rate": 3.994160482371138e-08, "logits/chosen": -3.2610368728637695, "logits/rejected": -3.0780282020568848, "logps/chosen": -284.688720703125, "logps/rejected": -654.8016357421875, "loss": 0.3778, "rewards/accuracies": 1.0, "rewards/chosen": 0.5396965146064758, "rewards/margins": 2.5390801429748535, "rewards/rejected": -1.9993836879730225, "step": 453 }, { "epoch": 0.58, "learning_rate": 3.973923289021829e-08, "logits/chosen": -3.202742099761963, "logits/rejected": -3.017909049987793, "logps/chosen": -331.0603942871094, "logps/rejected": -699.9763793945312, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": 0.7995560169219971, "rewards/margins": 2.9844093322753906, "rewards/rejected": -2.1848533153533936, "step": 454 }, { "epoch": 0.58, "learning_rate": 3.953703628492467e-08, "logits/chosen": -3.1027908325195312, "logits/rejected": -3.055490016937256, "logps/chosen": -322.3897705078125, "logps/rejected": -992.0010375976562, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 0.6793426275253296, "rewards/margins": 3.9063751697540283, "rewards/rejected": -3.2270326614379883, "step": 455 }, { "epoch": 0.58, "learning_rate": 3.933501846281266e-08, "logits/chosen": -3.1689605712890625, "logits/rejected": -3.027801990509033, "logps/chosen": -290.0516662597656, "logps/rejected": -1322.06591796875, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 0.6314529776573181, "rewards/margins": 4.339473247528076, "rewards/rejected": -3.7080202102661133, "step": 456 }, { "epoch": 0.58, "learning_rate": 3.913318287580942e-08, "logits/chosen": -3.2049922943115234, "logits/rejected": -3.0795388221740723, "logps/chosen": -283.0895690917969, "logps/rejected": -745.4329833984375, "loss": 0.4012, "rewards/accuracies": 1.0, "rewards/chosen": 0.4507454037666321, "rewards/margins": 2.837784767150879, "rewards/rejected": -2.3870391845703125, "step": 457 }, { "epoch": 0.58, "learning_rate": 3.893153297272828e-08, "logits/chosen": -3.1524505615234375, "logits/rejected": -3.1281239986419678, "logps/chosen": -312.18890380859375, "logps/rejected": -557.7091064453125, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 0.6392883062362671, "rewards/margins": 2.206280469894409, "rewards/rejected": -1.566992163658142, "step": 458 }, { "epoch": 0.59, "learning_rate": 3.87300721992097e-08, "logits/chosen": -3.2389817237854004, "logits/rejected": -3.0834274291992188, "logps/chosen": -319.19976806640625, "logps/rejected": -447.5809020996094, "loss": 0.3822, "rewards/accuracies": 1.0, "rewards/chosen": 0.6651809811592102, "rewards/margins": 1.850947618484497, "rewards/rejected": -1.185766577720642, "step": 459 }, { "epoch": 0.59, "learning_rate": 3.8528803997662425e-08, "logits/chosen": -3.165736675262451, "logits/rejected": -3.0530195236206055, "logps/chosen": -322.0986633300781, "logps/rejected": -635.8182373046875, "loss": 0.4294, "rewards/accuracies": 1.0, "rewards/chosen": 0.4869323670864105, "rewards/margins": 2.386837959289551, "rewards/rejected": -1.8999054431915283, "step": 460 }, { "epoch": 0.59, "learning_rate": 3.8327731807204744e-08, "logits/chosen": -3.1865100860595703, "logits/rejected": -2.8939969539642334, "logps/chosen": -324.39044189453125, "logps/rejected": -2959.194091796875, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 0.7195816040039062, "rewards/margins": 9.65630054473877, "rewards/rejected": -8.936718940734863, "step": 461 }, { "epoch": 0.59, "learning_rate": 3.812685906360557e-08, "logits/chosen": -3.1790757179260254, "logits/rejected": -3.1500391960144043, "logps/chosen": -297.87213134765625, "logps/rejected": -712.6231689453125, "loss": 0.4397, "rewards/accuracies": 1.0, "rewards/chosen": 0.64471435546875, "rewards/margins": 2.7404847145080566, "rewards/rejected": -2.0957703590393066, "step": 462 }, { "epoch": 0.59, "learning_rate": 3.792618919922591e-08, "logits/chosen": -3.1598076820373535, "logits/rejected": -3.1009554862976074, "logps/chosen": -296.70208740234375, "logps/rejected": -774.1287841796875, "loss": 0.4018, "rewards/accuracies": 1.0, "rewards/chosen": 0.6656372547149658, "rewards/margins": 3.145111083984375, "rewards/rejected": -2.479473829269409, "step": 463 }, { "epoch": 0.59, "learning_rate": 3.7725725642960044e-08, "logits/chosen": -3.1343464851379395, "logits/rejected": -2.9639270305633545, "logps/chosen": -264.8961181640625, "logps/rejected": -1046.865234375, "loss": 0.3684, "rewards/accuracies": 1.0, "rewards/chosen": 0.7623039484024048, "rewards/margins": 3.4787073135375977, "rewards/rejected": -2.7164032459259033, "step": 464 }, { "epoch": 0.59, "learning_rate": 3.752547182017708e-08, "logits/chosen": -3.216053009033203, "logits/rejected": -3.166804075241089, "logps/chosen": -277.41925048828125, "logps/rejected": -900.5040893554688, "loss": 0.4248, "rewards/accuracies": 1.0, "rewards/chosen": 0.6430511474609375, "rewards/margins": 3.647305488586426, "rewards/rejected": -3.0042543411254883, "step": 465 }, { "epoch": 0.59, "learning_rate": 3.7325431152662297e-08, "logits/chosen": -3.2004311084747314, "logits/rejected": -3.0343470573425293, "logps/chosen": -315.249755859375, "logps/rejected": -865.4548950195312, "loss": 0.41, "rewards/accuracies": 1.0, "rewards/chosen": 0.5276702642440796, "rewards/margins": 3.2757370471954346, "rewards/rejected": -2.7480669021606445, "step": 466 }, { "epoch": 0.6, "learning_rate": 3.7125607058558804e-08, "logits/chosen": -3.1694695949554443, "logits/rejected": -3.005161762237549, "logps/chosen": -356.745361328125, "logps/rejected": -1254.153076171875, "loss": 0.4393, "rewards/accuracies": 1.0, "rewards/chosen": 0.5412735342979431, "rewards/margins": 4.5924272537231445, "rewards/rejected": -4.051153659820557, "step": 467 }, { "epoch": 0.6, "learning_rate": 3.692600295230901e-08, "logits/chosen": -3.122526168823242, "logits/rejected": -3.0577776432037354, "logps/chosen": -311.2394714355469, "logps/rejected": -794.132080078125, "loss": 0.3803, "rewards/accuracies": 1.0, "rewards/chosen": 0.7434555292129517, "rewards/margins": 3.4070725440979004, "rewards/rejected": -2.6636171340942383, "step": 468 }, { "epoch": 0.6, "learning_rate": 3.6726622244596394e-08, "logits/chosen": -3.248018980026245, "logits/rejected": -3.1726107597351074, "logps/chosen": -339.030517578125, "logps/rejected": -856.8536376953125, "loss": 0.4088, "rewards/accuracies": 1.0, "rewards/chosen": 0.6406234502792358, "rewards/margins": 3.4511735439300537, "rewards/rejected": -2.8105499744415283, "step": 469 }, { "epoch": 0.6, "learning_rate": 3.6527468342287096e-08, "logits/chosen": -3.206486225128174, "logits/rejected": -3.0322766304016113, "logps/chosen": -293.01806640625, "logps/rejected": -775.6672973632812, "loss": 0.436, "rewards/accuracies": 1.0, "rewards/chosen": 0.542980968952179, "rewards/margins": 2.781292676925659, "rewards/rejected": -2.238311767578125, "step": 470 }, { "epoch": 0.6, "learning_rate": 3.632854464837188e-08, "logits/chosen": -3.2725443840026855, "logits/rejected": -3.1323695182800293, "logps/chosen": -272.3056640625, "logps/rejected": -713.327880859375, "loss": 0.4119, "rewards/accuracies": 1.0, "rewards/chosen": 0.6893623471260071, "rewards/margins": 3.0571236610412598, "rewards/rejected": -2.3677613735198975, "step": 471 }, { "epoch": 0.6, "learning_rate": 3.612985456190778e-08, "logits/chosen": -3.1859400272369385, "logits/rejected": -3.1148037910461426, "logps/chosen": -273.51812744140625, "logps/rejected": -838.7601928710938, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": 0.6654754877090454, "rewards/margins": 3.3520569801330566, "rewards/rejected": -2.686581611633301, "step": 472 }, { "epoch": 0.6, "learning_rate": 3.5931401477960176e-08, "logits/chosen": -3.1956167221069336, "logits/rejected": -3.0992445945739746, "logps/chosen": -279.5893859863281, "logps/rejected": -659.1705322265625, "loss": 0.4101, "rewards/accuracies": 1.0, "rewards/chosen": 0.6706962585449219, "rewards/margins": 2.489765167236328, "rewards/rejected": -1.8190689086914062, "step": 473 }, { "epoch": 0.6, "learning_rate": 3.5733188787544745e-08, "logits/chosen": -3.2512049674987793, "logits/rejected": -3.085721015930176, "logps/chosen": -287.231689453125, "logps/rejected": -569.9952392578125, "loss": 0.3759, "rewards/accuracies": 1.0, "rewards/chosen": 0.5786300897598267, "rewards/margins": 2.3939239978790283, "rewards/rejected": -1.8152939081192017, "step": 474 }, { "epoch": 0.61, "learning_rate": 3.553521987756945e-08, "logits/chosen": -3.198422431945801, "logits/rejected": -3.050794839859009, "logps/chosen": -321.2420654296875, "logps/rejected": -758.67236328125, "loss": 0.3917, "rewards/accuracies": 1.0, "rewards/chosen": 0.5076156854629517, "rewards/margins": 2.7792129516601562, "rewards/rejected": -2.271597385406494, "step": 475 }, { "epoch": 0.61, "learning_rate": 3.5337498130776766e-08, "logits/chosen": -3.1864166259765625, "logits/rejected": -3.0390403270721436, "logps/chosen": -301.76104736328125, "logps/rejected": -1028.03955078125, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 0.7175262570381165, "rewards/margins": 3.974665880203247, "rewards/rejected": -3.2571396827697754, "step": 476 }, { "epoch": 0.61, "learning_rate": 3.5140026925685804e-08, "logits/chosen": -3.214045524597168, "logits/rejected": -2.9917454719543457, "logps/chosen": -287.192626953125, "logps/rejected": -2445.42041015625, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 0.6780288815498352, "rewards/margins": 7.556654453277588, "rewards/rejected": -6.878625392913818, "step": 477 }, { "epoch": 0.61, "learning_rate": 3.494280963653463e-08, "logits/chosen": -3.2204103469848633, "logits/rejected": -3.1125526428222656, "logps/chosen": -285.90087890625, "logps/rejected": -341.13763427734375, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 0.6825058460235596, "rewards/margins": 1.649621605873108, "rewards/rejected": -0.9671157598495483, "step": 478 }, { "epoch": 0.61, "learning_rate": 3.474584963322256e-08, "logits/chosen": -3.1974711418151855, "logits/rejected": -3.097668170928955, "logps/chosen": -294.84161376953125, "logps/rejected": -1009.5863647460938, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": 0.6558624505996704, "rewards/margins": 3.509194850921631, "rewards/rejected": -2.85333251953125, "step": 479 }, { "epoch": 0.61, "learning_rate": 3.4549150281252633e-08, "logits/chosen": -3.2087454795837402, "logits/rejected": -3.0607573986053467, "logps/chosen": -268.4227294921875, "logps/rejected": -678.0498657226562, "loss": 0.374, "rewards/accuracies": 1.0, "rewards/chosen": 0.5329391360282898, "rewards/margins": 2.8787858486175537, "rewards/rejected": -2.345846652984619, "step": 480 }, { "epoch": 0.61, "learning_rate": 3.435271494167404e-08, "logits/chosen": -3.2122387886047363, "logits/rejected": -3.1061930656433105, "logps/chosen": -302.2122802734375, "logps/rejected": -594.0101928710938, "loss": 0.4166, "rewards/accuracies": 1.0, "rewards/chosen": 0.5037262439727783, "rewards/margins": 2.3232789039611816, "rewards/rejected": -1.8195526599884033, "step": 481 }, { "epoch": 0.61, "learning_rate": 3.415654697102478e-08, "logits/chosen": -3.186624526977539, "logits/rejected": -3.1868200302124023, "logps/chosen": -295.527587890625, "logps/rejected": -1091.524169921875, "loss": 0.3357, "rewards/accuracies": 1.0, "rewards/chosen": 0.45885011553764343, "rewards/margins": 4.684148788452148, "rewards/rejected": -4.225298881530762, "step": 482 }, { "epoch": 0.62, "learning_rate": 3.396064972127421e-08, "logits/chosen": -3.2489800453186035, "logits/rejected": -3.03688907623291, "logps/chosen": -251.1661834716797, "logps/rejected": -262.5812072753906, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 0.600128173828125, "rewards/margins": 1.2590241432189941, "rewards/rejected": -0.6588958501815796, "step": 483 }, { "epoch": 0.62, "learning_rate": 3.376502653976583e-08, "logits/chosen": -3.226328134536743, "logits/rejected": -3.1134815216064453, "logps/chosen": -291.71533203125, "logps/rejected": -1190.44580078125, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": 0.5173026919364929, "rewards/margins": 3.8190574645996094, "rewards/rejected": -3.3017547130584717, "step": 484 }, { "epoch": 0.62, "learning_rate": 3.356968076916006e-08, "logits/chosen": -3.2212722301483154, "logits/rejected": -3.080720901489258, "logps/chosen": -290.01470947265625, "logps/rejected": -275.0726623535156, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 0.6795867681503296, "rewards/margins": 1.3673416376113892, "rewards/rejected": -0.6877548694610596, "step": 485 }, { "epoch": 0.62, "learning_rate": 3.337461574737716e-08, "logits/chosen": -3.2666332721710205, "logits/rejected": -2.99259614944458, "logps/chosen": -311.112548828125, "logps/rejected": -1239.3172607421875, "loss": 0.4486, "rewards/accuracies": 1.0, "rewards/chosen": 0.6678756475448608, "rewards/margins": 4.8026275634765625, "rewards/rejected": -4.134751796722412, "step": 486 }, { "epoch": 0.62, "learning_rate": 3.317983480754015e-08, "logits/chosen": -3.1639795303344727, "logits/rejected": -3.1106090545654297, "logps/chosen": -305.957275390625, "logps/rejected": -351.41363525390625, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 0.6501907110214233, "rewards/margins": 1.623274326324463, "rewards/rejected": -0.97308349609375, "step": 487 }, { "epoch": 0.62, "learning_rate": 3.298534127791784e-08, "logits/chosen": -3.150050640106201, "logits/rejected": -3.1105313301086426, "logps/chosen": -356.27349853515625, "logps/rejected": -587.836669921875, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 0.48805543780326843, "rewards/margins": 2.2207183837890625, "rewards/rejected": -1.7326629161834717, "step": 488 }, { "epoch": 0.62, "learning_rate": 3.279113848186808e-08, "logits/chosen": -3.1563405990600586, "logits/rejected": -3.1168437004089355, "logps/chosen": -301.45184326171875, "logps/rejected": -578.4769287109375, "loss": 0.4137, "rewards/accuracies": 1.0, "rewards/chosen": 0.5539001822471619, "rewards/margins": 2.2950668334960938, "rewards/rejected": -1.7411667108535767, "step": 489 }, { "epoch": 0.62, "learning_rate": 3.259722973778077e-08, "logits/chosen": -3.180234670639038, "logits/rejected": -3.075415849685669, "logps/chosen": -327.01544189453125, "logps/rejected": -1200.087158203125, "loss": 0.4355, "rewards/accuracies": 1.0, "rewards/chosen": 0.7247833609580994, "rewards/margins": 4.3583526611328125, "rewards/rejected": -3.6335694789886475, "step": 490 }, { "epoch": 0.63, "learning_rate": 3.24036183590214e-08, "logits/chosen": -3.207980155944824, "logits/rejected": -2.9735991954803467, "logps/chosen": -335.82672119140625, "logps/rejected": -1212.4122314453125, "loss": 0.358, "rewards/accuracies": 1.0, "rewards/chosen": 0.6326507925987244, "rewards/margins": 4.487835884094238, "rewards/rejected": -3.855185031890869, "step": 491 }, { "epoch": 0.63, "learning_rate": 3.221030765387417e-08, "logits/chosen": -3.195420742034912, "logits/rejected": -3.024667739868164, "logps/chosen": -308.67669677734375, "logps/rejected": -306.7330627441406, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 0.6317276358604431, "rewards/margins": 1.4126510620117188, "rewards/rejected": -0.7809234857559204, "step": 492 }, { "epoch": 0.63, "learning_rate": 3.201730092548573e-08, "logits/chosen": -3.1894588470458984, "logits/rejected": -3.155329704284668, "logps/chosen": -313.96795654296875, "logps/rejected": -733.8590087890625, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 0.7099090814590454, "rewards/margins": 2.9907593727111816, "rewards/rejected": -2.2808501720428467, "step": 493 }, { "epoch": 0.63, "learning_rate": 3.18246014718085e-08, "logits/chosen": -3.1389636993408203, "logits/rejected": -3.0566611289978027, "logps/chosen": -315.1930847167969, "logps/rejected": -3222.074951171875, "loss": 0.3718, "rewards/accuracies": 1.0, "rewards/chosen": 0.7528122067451477, "rewards/margins": 8.292238235473633, "rewards/rejected": -7.539425373077393, "step": 494 }, { "epoch": 0.63, "learning_rate": 3.16322125855445e-08, "logits/chosen": -3.2481393814086914, "logits/rejected": -3.049389123916626, "logps/chosen": -302.9310302734375, "logps/rejected": -1065.663330078125, "loss": 0.4043, "rewards/accuracies": 1.0, "rewards/chosen": 0.8289520740509033, "rewards/margins": 4.454034805297852, "rewards/rejected": -3.625082492828369, "step": 495 }, { "epoch": 0.63, "learning_rate": 3.1440137554088955e-08, "logits/chosen": -3.1807217597961426, "logits/rejected": -3.0820324420928955, "logps/chosen": -274.8067626953125, "logps/rejected": -306.20843505859375, "loss": 0.4194, "rewards/accuracies": 1.0, "rewards/chosen": 0.6049560308456421, "rewards/margins": 1.480017066001892, "rewards/rejected": -0.87506103515625, "step": 496 }, { "epoch": 0.63, "learning_rate": 3.1248379659474225e-08, "logits/chosen": -3.227687120437622, "logits/rejected": -3.088521718978882, "logps/chosen": -337.75323486328125, "logps/rejected": -597.0486450195312, "loss": 0.4317, "rewards/accuracies": 1.0, "rewards/chosen": 0.4984222650527954, "rewards/margins": 2.6008973121643066, "rewards/rejected": -2.102475166320801, "step": 497 }, { "epoch": 0.63, "learning_rate": 3.1056942178313604e-08, "logits/chosen": -3.1404337882995605, "logits/rejected": -3.161167860031128, "logps/chosen": -328.3240966796875, "logps/rejected": -796.5966186523438, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 0.7789032459259033, "rewards/margins": 3.214883327484131, "rewards/rejected": -2.4359803199768066, "step": 498 }, { "epoch": 0.64, "learning_rate": 3.086582838174551e-08, "logits/chosen": -3.2160532474517822, "logits/rejected": -3.023923397064209, "logps/chosen": -239.2784423828125, "logps/rejected": -1501.3812255859375, "loss": 0.335, "rewards/accuracies": 1.0, "rewards/chosen": 0.664435625076294, "rewards/margins": 5.29917049407959, "rewards/rejected": -4.634735107421875, "step": 499 }, { "epoch": 0.64, "learning_rate": 3.0675041535377396e-08, "logits/chosen": -3.2409939765930176, "logits/rejected": -3.1071083545684814, "logps/chosen": -280.9150085449219, "logps/rejected": -358.6431884765625, "loss": 0.4026, "rewards/accuracies": 1.0, "rewards/chosen": 0.7578185796737671, "rewards/margins": 1.7579834461212158, "rewards/rejected": -1.0001648664474487, "step": 500 }, { "epoch": 0.64, "learning_rate": 3.048458489923005e-08, "logits/chosen": -3.174217462539673, "logits/rejected": -3.1099424362182617, "logps/chosen": -299.0069580078125, "logps/rejected": -626.7015380859375, "loss": 0.4252, "rewards/accuracies": 1.0, "rewards/chosen": 0.575732409954071, "rewards/margins": 2.430899143218994, "rewards/rejected": -1.8551666736602783, "step": 501 }, { "epoch": 0.64, "learning_rate": 3.029446172768193e-08, "logits/chosen": -3.222553253173828, "logits/rejected": -3.1253914833068848, "logps/chosen": -294.47027587890625, "logps/rejected": -359.5600891113281, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 0.6058639287948608, "rewards/margins": 1.5264549255371094, "rewards/rejected": -0.9205909967422485, "step": 502 }, { "epoch": 0.64, "learning_rate": 3.0104675269413436e-08, "logits/chosen": -3.258329153060913, "logits/rejected": -3.178882598876953, "logps/chosen": -314.7869873046875, "logps/rejected": -795.1162109375, "loss": 0.382, "rewards/accuracies": 1.0, "rewards/chosen": 0.7517455816268921, "rewards/margins": 3.512439012527466, "rewards/rejected": -2.760693311691284, "step": 503 }, { "epoch": 0.64, "learning_rate": 2.991522876735154e-08, "logits/chosen": -3.234927177429199, "logits/rejected": -2.9616053104400635, "logps/chosen": -335.6416015625, "logps/rejected": -1295.107666015625, "loss": 0.4418, "rewards/accuracies": 1.0, "rewards/chosen": 0.6654708981513977, "rewards/margins": 4.76613187789917, "rewards/rejected": -4.100660800933838, "step": 504 }, { "epoch": 0.64, "learning_rate": 2.9726125458614215e-08, "logits/chosen": -3.226522922515869, "logits/rejected": -3.1322991847991943, "logps/chosen": -320.7984313964844, "logps/rejected": -535.3530883789062, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 0.7053421139717102, "rewards/margins": 2.3979110717773438, "rewards/rejected": -1.6925690174102783, "step": 505 }, { "epoch": 0.64, "learning_rate": 2.9537368574455303e-08, "logits/chosen": -3.1886935234069824, "logits/rejected": -3.0453593730926514, "logps/chosen": -292.70355224609375, "logps/rejected": -323.955810546875, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 0.6719573736190796, "rewards/margins": 1.3907744884490967, "rewards/rejected": -0.7188171744346619, "step": 506 }, { "epoch": 0.65, "learning_rate": 2.9348961340209117e-08, "logits/chosen": -3.22487211227417, "logits/rejected": -3.139822483062744, "logps/chosen": -287.74810791015625, "logps/rejected": -798.045166015625, "loss": 0.3743, "rewards/accuracies": 1.0, "rewards/chosen": 0.6260315179824829, "rewards/margins": 3.365792751312256, "rewards/rejected": -2.7397613525390625, "step": 507 }, { "epoch": 0.65, "learning_rate": 2.916090697523549e-08, "logits/chosen": -3.213870048522949, "logits/rejected": -3.1166326999664307, "logps/chosen": -318.812744140625, "logps/rejected": -1770.402099609375, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 0.50274658203125, "rewards/margins": 5.669409275054932, "rewards/rejected": -5.166662693023682, "step": 508 }, { "epoch": 0.65, "learning_rate": 2.897320869286462e-08, "logits/chosen": -3.283513069152832, "logits/rejected": -3.073971748352051, "logps/chosen": -304.54156494140625, "logps/rejected": -811.2105712890625, "loss": 0.4303, "rewards/accuracies": 1.0, "rewards/chosen": 0.3923538327217102, "rewards/margins": 2.752469062805176, "rewards/rejected": -2.3601150512695312, "step": 509 }, { "epoch": 0.65, "learning_rate": 2.8785869700342317e-08, "logits/chosen": -3.142305850982666, "logits/rejected": -3.0984549522399902, "logps/chosen": -257.202392578125, "logps/rejected": -1030.898681640625, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 0.6760787963867188, "rewards/margins": 4.5385026931762695, "rewards/rejected": -3.862423896789551, "step": 510 }, { "epoch": 0.65, "learning_rate": 2.8598893198775044e-08, "logits/chosen": -3.240079879760742, "logits/rejected": -3.1960842609405518, "logps/chosen": -298.40643310546875, "logps/rejected": -744.6553955078125, "loss": 0.4225, "rewards/accuracies": 1.0, "rewards/chosen": 0.5135498046875, "rewards/margins": 2.7533631324768066, "rewards/rejected": -2.2398133277893066, "step": 511 }, { "epoch": 0.65, "learning_rate": 2.841228238307536e-08, "logits/chosen": -3.168138027191162, "logits/rejected": -2.9043784141540527, "logps/chosen": -327.3485107421875, "logps/rejected": -1278.0804443359375, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 0.6381866931915283, "rewards/margins": 4.332284927368164, "rewards/rejected": -3.6940979957580566, "step": 512 }, { "epoch": 0.65, "learning_rate": 2.8226040441907207e-08, "logits/chosen": -3.261582374572754, "logits/rejected": -3.0910911560058594, "logps/chosen": -304.2082214355469, "logps/rejected": -484.2754821777344, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 0.6006454229354858, "rewards/margins": 2.040421962738037, "rewards/rejected": -1.4397766590118408, "step": 513 }, { "epoch": 0.66, "learning_rate": 2.8040170557631488e-08, "logits/chosen": -3.2281432151794434, "logits/rejected": -3.062053680419922, "logps/chosen": -311.663818359375, "logps/rejected": -549.0792846679688, "loss": 0.4126, "rewards/accuracies": 1.0, "rewards/chosen": 0.684246838092804, "rewards/margins": 2.237600803375244, "rewards/rejected": -1.5533539056777954, "step": 514 }, { "epoch": 0.66, "learning_rate": 2.7854675906251723e-08, "logits/chosen": -3.1807291507720947, "logits/rejected": -2.975985288619995, "logps/chosen": -325.442626953125, "logps/rejected": -1321.699462890625, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 0.6151703000068665, "rewards/margins": 5.120221138000488, "rewards/rejected": -4.5050506591796875, "step": 515 }, { "epoch": 0.66, "learning_rate": 2.7669559657359676e-08, "logits/chosen": -3.2138562202453613, "logits/rejected": -2.9481186866760254, "logps/chosen": -331.21337890625, "logps/rejected": -3609.8681640625, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": 0.6349426507949829, "rewards/margins": 8.085433959960938, "rewards/rejected": -7.450491428375244, "step": 516 }, { "epoch": 0.66, "learning_rate": 2.7484824974081323e-08, "logits/chosen": -3.2353641986846924, "logits/rejected": -2.965372085571289, "logps/chosen": -302.8134460449219, "logps/rejected": -1284.2926025390625, "loss": 0.3837, "rewards/accuracies": 1.0, "rewards/chosen": 0.7378937005996704, "rewards/margins": 4.753383159637451, "rewards/rejected": -4.015489101409912, "step": 517 }, { "epoch": 0.66, "learning_rate": 2.730047501302266e-08, "logits/chosen": -3.168485164642334, "logits/rejected": -3.0262155532836914, "logps/chosen": -323.810546875, "logps/rejected": -977.2511596679688, "loss": 0.4094, "rewards/accuracies": 1.0, "rewards/chosen": 0.6620101928710938, "rewards/margins": 3.574052572250366, "rewards/rejected": -2.9120423793792725, "step": 518 }, { "epoch": 0.66, "learning_rate": 2.711651292421593e-08, "logits/chosen": -3.25239896774292, "logits/rejected": -3.033632516860962, "logps/chosen": -288.620849609375, "logps/rejected": -1666.4749755859375, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 0.569403886795044, "rewards/margins": 6.43017053604126, "rewards/rejected": -5.860766410827637, "step": 519 }, { "epoch": 0.66, "learning_rate": 2.6932941851065616e-08, "logits/chosen": -3.207303524017334, "logits/rejected": -3.1289710998535156, "logps/chosen": -294.8376770019531, "logps/rejected": -875.3799438476562, "loss": 0.4038, "rewards/accuracies": 1.0, "rewards/chosen": 0.606091320514679, "rewards/margins": 3.1427292823791504, "rewards/rejected": -2.536637783050537, "step": 520 }, { "epoch": 0.66, "learning_rate": 2.6749764930294905e-08, "logits/chosen": -3.2497076988220215, "logits/rejected": -2.8781585693359375, "logps/chosen": -377.5392150878906, "logps/rejected": -2420.089111328125, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 0.3753921389579773, "rewards/margins": 7.84396505355835, "rewards/rejected": -7.468573093414307, "step": 521 }, { "epoch": 0.67, "learning_rate": 2.656698529189193e-08, "logits/chosen": -3.200413703918457, "logits/rejected": -3.046640157699585, "logps/chosen": -310.5372619628906, "logps/rejected": -763.46142578125, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 0.7267792224884033, "rewards/margins": 3.1238648891448975, "rewards/rejected": -2.397085666656494, "step": 522 }, { "epoch": 0.67, "learning_rate": 2.638460605905646e-08, "logits/chosen": -3.1550867557525635, "logits/rejected": -3.1070454120635986, "logps/chosen": -310.2062072753906, "logps/rejected": -613.1400146484375, "loss": 0.4067, "rewards/accuracies": 1.0, "rewards/chosen": 0.6984955072402954, "rewards/margins": 2.6872024536132812, "rewards/rejected": -1.9887069463729858, "step": 523 }, { "epoch": 0.67, "learning_rate": 2.620263034814632e-08, "logits/chosen": -3.1839632987976074, "logits/rejected": -3.1488442420959473, "logps/chosen": -284.51580810546875, "logps/rejected": -1150.5621337890625, "loss": 0.3262, "rewards/accuracies": 1.0, "rewards/chosen": 0.5897903442382812, "rewards/margins": 4.1925249099731445, "rewards/rejected": -3.6027345657348633, "step": 524 }, { "epoch": 0.67, "learning_rate": 2.6021061268624378e-08, "logits/chosen": -3.209395408630371, "logits/rejected": -3.191882848739624, "logps/chosen": -299.6375732421875, "logps/rejected": -711.489013671875, "loss": 0.3822, "rewards/accuracies": 1.0, "rewards/chosen": 0.4391235411167145, "rewards/margins": 2.585601806640625, "rewards/rejected": -2.1464784145355225, "step": 525 }, { "epoch": 0.67, "learning_rate": 2.5839901923005202e-08, "logits/chosen": -3.2236666679382324, "logits/rejected": -2.9712367057800293, "logps/chosen": -253.67901611328125, "logps/rejected": -1273.004638671875, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 0.6534652709960938, "rewards/margins": 4.95323371887207, "rewards/rejected": -4.299768447875977, "step": 526 }, { "epoch": 0.67, "learning_rate": 2.5659155406802192e-08, "logits/chosen": -3.1776413917541504, "logits/rejected": -3.131108283996582, "logps/chosen": -325.4475402832031, "logps/rejected": -722.1754150390625, "loss": 0.3785, "rewards/accuracies": 1.0, "rewards/chosen": 0.5308182239532471, "rewards/margins": 2.9596633911132812, "rewards/rejected": -2.428845167160034, "step": 527 }, { "epoch": 0.67, "learning_rate": 2.5478824808474607e-08, "logits/chosen": -3.2007699012756348, "logits/rejected": -3.0861010551452637, "logps/chosen": -301.3409118652344, "logps/rejected": -1272.3980712890625, "loss": 0.3069, "rewards/accuracies": 1.0, "rewards/chosen": 0.7039734125137329, "rewards/margins": 4.392248630523682, "rewards/rejected": -3.6882753372192383, "step": 528 }, { "epoch": 0.67, "learning_rate": 2.5298913209374804e-08, "logits/chosen": -3.201162815093994, "logits/rejected": -3.0659501552581787, "logps/chosen": -293.0525817871094, "logps/rejected": -761.3424072265625, "loss": 0.3979, "rewards/accuracies": 1.0, "rewards/chosen": 0.4473617672920227, "rewards/margins": 2.627058506011963, "rewards/rejected": -2.179696559906006, "step": 529 }, { "epoch": 0.68, "learning_rate": 2.5119423683695657e-08, "logits/chosen": -3.1094250679016113, "logits/rejected": -3.052112579345703, "logps/chosen": -278.5267333984375, "logps/rejected": -875.0875854492188, "loss": 0.4099, "rewards/accuracies": 1.0, "rewards/chosen": 0.6065841913223267, "rewards/margins": 3.2452621459960938, "rewards/rejected": -2.6386780738830566, "step": 530 }, { "epoch": 0.68, "learning_rate": 2.494035929841789e-08, "logits/chosen": -3.1778788566589355, "logits/rejected": -3.0881667137145996, "logps/chosen": -348.02178955078125, "logps/rejected": -961.6015625, "loss": 0.4452, "rewards/accuracies": 1.0, "rewards/chosen": 0.7511261105537415, "rewards/margins": 3.8344390392303467, "rewards/rejected": -3.08331298828125, "step": 531 }, { "epoch": 0.68, "learning_rate": 2.4761723113257826e-08, "logits/chosen": -3.168428659439087, "logits/rejected": -3.0778682231903076, "logps/chosen": -268.1236877441406, "logps/rejected": -706.4583740234375, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 0.5240821838378906, "rewards/margins": 3.098386526107788, "rewards/rejected": -2.5743043422698975, "step": 532 }, { "epoch": 0.68, "learning_rate": 2.458351818061497e-08, "logits/chosen": -3.2232487201690674, "logits/rejected": -3.0904088020324707, "logps/chosen": -285.81561279296875, "logps/rejected": -508.1881408691406, "loss": 0.3998, "rewards/accuracies": 1.0, "rewards/chosen": 0.6070312261581421, "rewards/margins": 2.2625136375427246, "rewards/rejected": -1.655482530593872, "step": 533 }, { "epoch": 0.68, "learning_rate": 2.4405747545519962e-08, "logits/chosen": -3.1532862186431885, "logits/rejected": -3.1098833084106445, "logps/chosen": -279.649658203125, "logps/rejected": -806.363525390625, "loss": 0.3293, "rewards/accuracies": 1.0, "rewards/chosen": 0.5274536609649658, "rewards/margins": 2.9855897426605225, "rewards/rejected": -2.4581360816955566, "step": 534 }, { "epoch": 0.68, "learning_rate": 2.422841424558244e-08, "logits/chosen": -3.245488166809082, "logits/rejected": -3.028334617614746, "logps/chosen": -297.3872985839844, "logps/rejected": -1779.322998046875, "loss": 0.3961, "rewards/accuracies": 1.0, "rewards/chosen": 0.7100814580917358, "rewards/margins": 6.948063850402832, "rewards/rejected": -6.237982273101807, "step": 535 }, { "epoch": 0.68, "learning_rate": 2.4051521310939256e-08, "logits/chosen": -3.2355501651763916, "logits/rejected": -3.1765527725219727, "logps/chosen": -278.6797180175781, "logps/rejected": -445.71868896484375, "loss": 0.4062, "rewards/accuracies": 1.0, "rewards/chosen": 0.63765949010849, "rewards/margins": 2.0806617736816406, "rewards/rejected": -1.4430023431777954, "step": 536 }, { "epoch": 0.68, "learning_rate": 2.3875071764202558e-08, "logits/chosen": -3.197939872741699, "logits/rejected": -3.0194005966186523, "logps/chosen": -293.06622314453125, "logps/rejected": -345.1778564453125, "loss": 0.4617, "rewards/accuracies": 1.0, "rewards/chosen": 0.5990188717842102, "rewards/margins": 1.4414414167404175, "rewards/rejected": -0.8424224853515625, "step": 537 }, { "epoch": 0.69, "learning_rate": 2.3699068620408304e-08, "logits/chosen": -3.214016914367676, "logits/rejected": -3.097398281097412, "logps/chosen": -303.96917724609375, "logps/rejected": -656.4412231445312, "loss": 0.4275, "rewards/accuracies": 1.0, "rewards/chosen": 0.5854644775390625, "rewards/margins": 2.6941347122192383, "rewards/rejected": -2.1086699962615967, "step": 538 }, { "epoch": 0.69, "learning_rate": 2.352351488696457e-08, "logits/chosen": -3.225841999053955, "logits/rejected": -3.0791025161743164, "logps/chosen": -329.56634521484375, "logps/rejected": -363.83892822265625, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 0.6059219241142273, "rewards/margins": 1.452906847000122, "rewards/rejected": -0.84698486328125, "step": 539 }, { "epoch": 0.69, "learning_rate": 2.3348413563600322e-08, "logits/chosen": -3.1946468353271484, "logits/rejected": -3.1131882667541504, "logps/chosen": -333.88232421875, "logps/rejected": -677.1282348632812, "loss": 0.4297, "rewards/accuracies": 1.0, "rewards/chosen": 0.6683257818222046, "rewards/margins": 2.7191452980041504, "rewards/rejected": -2.0508193969726562, "step": 540 }, { "epoch": 0.69, "learning_rate": 2.317376764231403e-08, "logits/chosen": -3.2134387493133545, "logits/rejected": -3.206509590148926, "logps/chosen": -307.0220947265625, "logps/rejected": -612.5013427734375, "loss": 0.3779, "rewards/accuracies": 1.0, "rewards/chosen": 0.6673965454101562, "rewards/margins": 2.6936843395233154, "rewards/rejected": -2.026287794113159, "step": 541 }, { "epoch": 0.69, "learning_rate": 2.2999580107322654e-08, "logits/chosen": -3.1350834369659424, "logits/rejected": -3.1297152042388916, "logps/chosen": -299.7679138183594, "logps/rejected": -650.8038940429688, "loss": 0.4138, "rewards/accuracies": 1.0, "rewards/chosen": 0.558544933795929, "rewards/margins": 2.6933655738830566, "rewards/rejected": -2.1348206996917725, "step": 542 }, { "epoch": 0.69, "learning_rate": 2.2825853935010535e-08, "logits/chosen": -3.179844856262207, "logits/rejected": -3.0460634231567383, "logps/chosen": -337.36279296875, "logps/rejected": -1154.3455810546875, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 0.6409576535224915, "rewards/margins": 4.764538764953613, "rewards/rejected": -4.1235809326171875, "step": 543 }, { "epoch": 0.69, "learning_rate": 2.2652592093878663e-08, "logits/chosen": -3.1963486671447754, "logits/rejected": -3.101998805999756, "logps/chosen": -273.0517272949219, "logps/rejected": -458.7620544433594, "loss": 0.3633, "rewards/accuracies": 1.0, "rewards/chosen": 0.7913986444473267, "rewards/margins": 2.1172897815704346, "rewards/rejected": -1.325891137123108, "step": 544 }, { "epoch": 0.69, "learning_rate": 2.2479797544493827e-08, "logits/chosen": -3.188274383544922, "logits/rejected": -3.1142849922180176, "logps/chosen": -309.5697937011719, "logps/rejected": -726.5000610351562, "loss": 0.3784, "rewards/accuracies": 1.0, "rewards/chosen": 0.6268249750137329, "rewards/margins": 2.6656296253204346, "rewards/rejected": -2.038804531097412, "step": 545 }, { "epoch": 0.7, "learning_rate": 2.2307473239438153e-08, "logits/chosen": -3.209980010986328, "logits/rejected": -3.0809547901153564, "logps/chosen": -269.77044677734375, "logps/rejected": -507.39453125, "loss": 0.3909, "rewards/accuracies": 1.0, "rewards/chosen": 0.5550888180732727, "rewards/margins": 2.0420944690704346, "rewards/rejected": -1.487005591392517, "step": 546 }, { "epoch": 0.7, "learning_rate": 2.2135622123258513e-08, "logits/chosen": -3.1820034980773926, "logits/rejected": -3.1408963203430176, "logps/chosen": -304.5548400878906, "logps/rejected": -508.6209716796875, "loss": 0.4164, "rewards/accuracies": 1.0, "rewards/chosen": 0.6897079944610596, "rewards/margins": 2.097928047180176, "rewards/rejected": -1.4082199335098267, "step": 547 }, { "epoch": 0.7, "learning_rate": 2.196424713241637e-08, "logits/chosen": -3.216374158859253, "logits/rejected": -2.850444793701172, "logps/chosen": -298.20355224609375, "logps/rejected": -2571.616943359375, "loss": 0.3889, "rewards/accuracies": 1.0, "rewards/chosen": 0.7942245602607727, "rewards/margins": 8.660962104797363, "rewards/rejected": -7.8667378425598145, "step": 548 }, { "epoch": 0.7, "learning_rate": 2.1793351195237446e-08, "logits/chosen": -3.196949005126953, "logits/rejected": -2.9126338958740234, "logps/chosen": -288.66583251953125, "logps/rejected": -2729.535888671875, "loss": 0.372, "rewards/accuracies": 1.0, "rewards/chosen": 0.5887802243232727, "rewards/margins": 9.042482376098633, "rewards/rejected": -8.453701972961426, "step": 549 }, { "epoch": 0.7, "learning_rate": 2.162293723186182e-08, "logits/chosen": -3.179318904876709, "logits/rejected": -3.0729665756225586, "logps/chosen": -306.5478515625, "logps/rejected": -1467.814208984375, "loss": 0.3837, "rewards/accuracies": 1.0, "rewards/chosen": 0.6647049188613892, "rewards/margins": 5.154506206512451, "rewards/rejected": -4.489801406860352, "step": 550 }, { "epoch": 0.7, "learning_rate": 2.1453008154193904e-08, "logits/chosen": -3.1481056213378906, "logits/rejected": -3.1690468788146973, "logps/chosen": -301.5870361328125, "logps/rejected": -616.0428466796875, "loss": 0.4203, "rewards/accuracies": 1.0, "rewards/chosen": 0.6466507315635681, "rewards/margins": 2.6097946166992188, "rewards/rejected": -1.9631439447402954, "step": 551 }, { "epoch": 0.7, "learning_rate": 2.128356686585282e-08, "logits/chosen": -3.2168774604797363, "logits/rejected": -3.0828709602355957, "logps/chosen": -343.3780517578125, "logps/rejected": -678.1688232421875, "loss": 0.3977, "rewards/accuracies": 1.0, "rewards/chosen": 0.5426605343818665, "rewards/margins": 2.6700563430786133, "rewards/rejected": -2.1273956298828125, "step": 552 }, { "epoch": 0.7, "learning_rate": 2.1114616262122648e-08, "logits/chosen": -3.2284188270568848, "logits/rejected": -3.0848803520202637, "logps/chosen": -338.2596130371094, "logps/rejected": -753.50244140625, "loss": 0.4106, "rewards/accuracies": 1.0, "rewards/chosen": 0.5096481442451477, "rewards/margins": 2.835920810699463, "rewards/rejected": -2.326272487640381, "step": 553 }, { "epoch": 0.71, "learning_rate": 2.0946159229903088e-08, "logits/chosen": -3.233768939971924, "logits/rejected": -2.962259292602539, "logps/chosen": -289.3715515136719, "logps/rejected": -1990.2413330078125, "loss": 0.4352, "rewards/accuracies": 1.0, "rewards/chosen": 0.5079300403594971, "rewards/margins": 6.782066345214844, "rewards/rejected": -6.274136543273926, "step": 554 }, { "epoch": 0.71, "learning_rate": 2.077819864766e-08, "logits/chosen": -3.2313950061798096, "logits/rejected": -3.1511545181274414, "logps/chosen": -300.439453125, "logps/rejected": -607.098876953125, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/chosen": 0.7852722406387329, "rewards/margins": 2.2513580322265625, "rewards/rejected": -1.4660859107971191, "step": 555 }, { "epoch": 0.71, "learning_rate": 2.0610737385376347e-08, "logits/chosen": -3.197509288787842, "logits/rejected": -2.959805488586426, "logps/chosen": -262.1246643066406, "logps/rejected": -737.706787109375, "loss": 0.4231, "rewards/accuracies": 1.0, "rewards/chosen": 0.660351574420929, "rewards/margins": 3.1063294410705566, "rewards/rejected": -2.4459779262542725, "step": 556 }, { "epoch": 0.71, "learning_rate": 2.0443778304503024e-08, "logits/chosen": -3.1770877838134766, "logits/rejected": -3.1136109828948975, "logps/chosen": -305.33477783203125, "logps/rejected": -872.1993408203125, "loss": 0.3988, "rewards/accuracies": 1.0, "rewards/chosen": 0.7836548089981079, "rewards/margins": 3.635336399078369, "rewards/rejected": -2.851681709289551, "step": 557 }, { "epoch": 0.71, "learning_rate": 2.0277324257910105e-08, "logits/chosen": -3.192110538482666, "logits/rejected": -3.1434326171875, "logps/chosen": -282.17352294921875, "logps/rejected": -870.3905029296875, "loss": 0.3415, "rewards/accuracies": 1.0, "rewards/chosen": 0.672741711139679, "rewards/margins": 3.1066741943359375, "rewards/rejected": -2.4339325428009033, "step": 558 }, { "epoch": 0.71, "learning_rate": 2.0111378089837954e-08, "logits/chosen": -3.2035202980041504, "logits/rejected": -2.999499797821045, "logps/chosen": -323.62481689453125, "logps/rejected": -537.3291625976562, "loss": 0.3823, "rewards/accuracies": 1.0, "rewards/chosen": 0.7835677862167358, "rewards/margins": 2.0939133167266846, "rewards/rejected": -1.3103455305099487, "step": 559 }, { "epoch": 0.71, "learning_rate": 1.9945942635848744e-08, "logits/chosen": -3.156513214111328, "logits/rejected": -3.1367034912109375, "logps/chosen": -269.69342041015625, "logps/rejected": -994.7991943359375, "loss": 0.3866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8170517086982727, "rewards/margins": 4.355531215667725, "rewards/rejected": -3.5384795665740967, "step": 560 }, { "epoch": 0.72, "learning_rate": 1.978102072277791e-08, "logits/chosen": -3.2219491004943848, "logits/rejected": -3.1053366661071777, "logps/chosen": -281.48443603515625, "logps/rejected": -794.62109375, "loss": 0.3571, "rewards/accuracies": 1.0, "rewards/chosen": 0.6210052371025085, "rewards/margins": 3.2064788341522217, "rewards/rejected": -2.5854737758636475, "step": 561 }, { "epoch": 0.72, "learning_rate": 1.961661516868594e-08, "logits/chosen": -3.171048164367676, "logits/rejected": -3.1146998405456543, "logps/chosen": -326.5294494628906, "logps/rejected": -323.4760437011719, "loss": 0.4938, "rewards/accuracies": 1.0, "rewards/chosen": 0.7022323608398438, "rewards/margins": 1.3471580743789673, "rewards/rejected": -0.6449257135391235, "step": 562 }, { "epoch": 0.72, "learning_rate": 1.9452728782810107e-08, "logits/chosen": -3.2322583198547363, "logits/rejected": -3.027923107147217, "logps/chosen": -307.30731201171875, "logps/rejected": -419.6705322265625, "loss": 0.4171, "rewards/accuracies": 1.0, "rewards/chosen": 0.47350311279296875, "rewards/margins": 1.8864548206329346, "rewards/rejected": -1.4129517078399658, "step": 563 }, { "epoch": 0.72, "learning_rate": 1.928936436551661e-08, "logits/chosen": -3.2284960746765137, "logits/rejected": -3.0939207077026367, "logps/chosen": -302.4546813964844, "logps/rejected": -1566.0389404296875, "loss": 0.4161, "rewards/accuracies": 1.0, "rewards/chosen": 0.8815399408340454, "rewards/margins": 6.025326251983643, "rewards/rejected": -5.143786430358887, "step": 564 }, { "epoch": 0.72, "learning_rate": 1.9126524708252555e-08, "logits/chosen": -3.1517200469970703, "logits/rejected": -3.075575113296509, "logps/chosen": -309.5433349609375, "logps/rejected": -828.3705444335938, "loss": 0.401, "rewards/accuracies": 1.0, "rewards/chosen": 0.725341796875, "rewards/margins": 3.356783866882324, "rewards/rejected": -2.6314423084259033, "step": 565 }, { "epoch": 0.72, "learning_rate": 1.8964212593498442e-08, "logits/chosen": -3.2500247955322266, "logits/rejected": -3.124441623687744, "logps/chosen": -313.982177734375, "logps/rejected": -551.7918701171875, "loss": 0.3743, "rewards/accuracies": 1.0, "rewards/chosen": 0.6935562491416931, "rewards/margins": 2.01483154296875, "rewards/rejected": -1.3212753534317017, "step": 566 }, { "epoch": 0.72, "learning_rate": 1.8802430794720454e-08, "logits/chosen": -3.172243118286133, "logits/rejected": -3.000232219696045, "logps/chosen": -283.85791015625, "logps/rejected": -846.5006103515625, "loss": 0.3787, "rewards/accuracies": 1.0, "rewards/chosen": 0.5262054204940796, "rewards/margins": 3.52374267578125, "rewards/rejected": -2.99753737449646, "step": 567 }, { "epoch": 0.72, "learning_rate": 1.8641182076323148e-08, "logits/chosen": -3.265712261199951, "logits/rejected": -3.186251163482666, "logps/chosen": -309.9307861328125, "logps/rejected": -1186.233642578125, "loss": 0.4062, "rewards/accuracies": 1.0, "rewards/chosen": 0.543103039264679, "rewards/margins": 4.259524345397949, "rewards/rejected": -3.716421604156494, "step": 568 }, { "epoch": 0.73, "learning_rate": 1.848046919360225e-08, "logits/chosen": -3.1793313026428223, "logits/rejected": -3.0751380920410156, "logps/chosen": -273.32489013671875, "logps/rejected": -1371.838623046875, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 0.6751663088798523, "rewards/margins": 5.145045757293701, "rewards/rejected": -4.469879150390625, "step": 569 }, { "epoch": 0.73, "learning_rate": 1.8320294892697475e-08, "logits/chosen": -3.1431403160095215, "logits/rejected": -3.151942729949951, "logps/chosen": -277.0229187011719, "logps/rejected": -593.3192138671875, "loss": 0.4463, "rewards/accuracies": 1.0, "rewards/chosen": 0.7080665826797485, "rewards/margins": 2.523831844329834, "rewards/rejected": -1.815765380859375, "step": 570 }, { "epoch": 0.73, "learning_rate": 1.8160661910545715e-08, "logits/chosen": -3.1740636825561523, "logits/rejected": -3.0155699253082275, "logps/chosen": -351.4755554199219, "logps/rejected": -616.7528076171875, "loss": 0.4242, "rewards/accuracies": 1.0, "rewards/chosen": 0.6496963500976562, "rewards/margins": 2.6570420265197754, "rewards/rejected": -2.007345676422119, "step": 571 }, { "epoch": 0.73, "learning_rate": 1.8001572974834166e-08, "logits/chosen": -3.163912296295166, "logits/rejected": -3.1030969619750977, "logps/chosen": -298.7839660644531, "logps/rejected": -759.8409423828125, "loss": 0.3955, "rewards/accuracies": 1.0, "rewards/chosen": 0.7133820056915283, "rewards/margins": 2.956744432449341, "rewards/rejected": -2.2433624267578125, "step": 572 }, { "epoch": 0.73, "learning_rate": 1.7843030803953834e-08, "logits/chosen": -3.169480562210083, "logits/rejected": -3.0860204696655273, "logps/chosen": -285.5494384765625, "logps/rejected": -713.7589721679688, "loss": 0.381, "rewards/accuracies": 1.0, "rewards/chosen": 0.6155914664268494, "rewards/margins": 3.008816719055176, "rewards/rejected": -2.3932251930236816, "step": 573 }, { "epoch": 0.73, "learning_rate": 1.768503810695295e-08, "logits/chosen": -3.197324514389038, "logits/rejected": -3.07839298248291, "logps/chosen": -328.1409912109375, "logps/rejected": -841.93701171875, "loss": 0.4007, "rewards/accuracies": 1.0, "rewards/chosen": 0.7515579462051392, "rewards/margins": 3.4638075828552246, "rewards/rejected": -2.712249755859375, "step": 574 }, { "epoch": 0.73, "learning_rate": 1.7527597583490822e-08, "logits/chosen": -3.1949310302734375, "logits/rejected": -2.9750170707702637, "logps/chosen": -302.8563232421875, "logps/rejected": -1828.4755859375, "loss": 0.3737, "rewards/accuracies": 1.0, "rewards/chosen": 0.658447265625, "rewards/margins": 6.40625, "rewards/rejected": -5.747802734375, "step": 575 }, { "epoch": 0.73, "learning_rate": 1.7370711923791564e-08, "logits/chosen": -3.252814769744873, "logits/rejected": -3.0623557567596436, "logps/chosen": -334.306396484375, "logps/rejected": -374.0791015625, "loss": 0.4176, "rewards/accuracies": 1.0, "rewards/chosen": 0.7677658200263977, "rewards/margins": 1.889912486076355, "rewards/rejected": -1.1221466064453125, "step": 576 }, { "epoch": 0.74, "learning_rate": 1.7214383808598282e-08, "logits/chosen": -3.249006509780884, "logits/rejected": -3.131883382797241, "logps/chosen": -260.5145263671875, "logps/rejected": -740.1544189453125, "loss": 0.381, "rewards/accuracies": 1.0, "rewards/chosen": 0.6197845339775085, "rewards/margins": 2.734065294265747, "rewards/rejected": -2.1142807006835938, "step": 577 }, { "epoch": 0.74, "learning_rate": 1.70586159091271e-08, "logits/chosen": -3.1785988807678223, "logits/rejected": -2.974374771118164, "logps/chosen": -352.1556701660156, "logps/rejected": -1218.466064453125, "loss": 0.3532, "rewards/accuracies": 1.0, "rewards/chosen": 0.5362091064453125, "rewards/margins": 4.590564250946045, "rewards/rejected": -4.054355144500732, "step": 578 }, { "epoch": 0.74, "learning_rate": 1.6903410887021675e-08, "logits/chosen": -3.1856517791748047, "logits/rejected": -3.0781774520874023, "logps/chosen": -306.43353271484375, "logps/rejected": -1111.63037109375, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 0.6511062979698181, "rewards/margins": 4.543357849121094, "rewards/rejected": -3.89225172996521, "step": 579 }, { "epoch": 0.74, "learning_rate": 1.674877139430758e-08, "logits/chosen": -3.225602149963379, "logits/rejected": -3.097665309906006, "logps/chosen": -252.47523498535156, "logps/rejected": -1014.3025512695312, "loss": 0.3693, "rewards/accuracies": 1.0, "rewards/chosen": 0.6732162237167358, "rewards/margins": 3.960768222808838, "rewards/rejected": -3.2875518798828125, "step": 580 }, { "epoch": 0.74, "learning_rate": 1.6594700073347112e-08, "logits/chosen": -3.186816453933716, "logits/rejected": -3.0833213329315186, "logps/chosen": -296.64764404296875, "logps/rejected": -574.7928466796875, "loss": 0.3989, "rewards/accuracies": 1.0, "rewards/chosen": 0.514819324016571, "rewards/margins": 2.423095703125, "rewards/rejected": -1.9082763195037842, "step": 581 }, { "epoch": 0.74, "learning_rate": 1.6441199556794033e-08, "logits/chosen": -3.238840103149414, "logits/rejected": -3.0051918029785156, "logps/chosen": -328.4454650878906, "logps/rejected": -590.5196533203125, "loss": 0.3974, "rewards/accuracies": 1.0, "rewards/chosen": 0.8275070190429688, "rewards/margins": 2.4020652770996094, "rewards/rejected": -1.5745582580566406, "step": 582 }, { "epoch": 0.74, "learning_rate": 1.6288272467548632e-08, "logits/chosen": -3.164496421813965, "logits/rejected": -2.9226880073547363, "logps/chosen": -272.1784973144531, "logps/rejected": -1875.50146484375, "loss": 0.3366, "rewards/accuracies": 1.0, "rewards/chosen": 0.46259766817092896, "rewards/margins": 6.813040256500244, "rewards/rejected": -6.350442409515381, "step": 583 }, { "epoch": 0.74, "learning_rate": 1.6135921418712954e-08, "logits/chosen": -3.177595376968384, "logits/rejected": -3.106581211090088, "logps/chosen": -288.7750244140625, "logps/rejected": -718.3802490234375, "loss": 0.396, "rewards/accuracies": 1.0, "rewards/chosen": 0.7301834225654602, "rewards/margins": 3.1808998584747314, "rewards/rejected": -2.450716495513916, "step": 584 }, { "epoch": 0.75, "learning_rate": 1.5984149013546046e-08, "logits/chosen": -3.1278748512268066, "logits/rejected": -3.1368112564086914, "logps/chosen": -336.1808166503906, "logps/rejected": -324.63671875, "loss": 0.4299, "rewards/accuracies": 1.0, "rewards/chosen": 0.46487122774124146, "rewards/margins": 1.25536048412323, "rewards/rejected": -0.7904891967773438, "step": 585 }, { "epoch": 0.75, "learning_rate": 1.583295784541958e-08, "logits/chosen": -3.1585350036621094, "logits/rejected": -2.9586024284362793, "logps/chosen": -310.5759582519531, "logps/rejected": -1770.3521728515625, "loss": 0.38, "rewards/accuracies": 1.0, "rewards/chosen": 0.706024169921875, "rewards/margins": 7.071271896362305, "rewards/rejected": -6.365248203277588, "step": 586 }, { "epoch": 0.75, "learning_rate": 1.568235049777345e-08, "logits/chosen": -3.2259747982025146, "logits/rejected": -3.0647358894348145, "logps/chosen": -282.6731262207031, "logps/rejected": -645.5059204101562, "loss": 0.4321, "rewards/accuracies": 1.0, "rewards/chosen": 0.6217659115791321, "rewards/margins": 2.723256826400757, "rewards/rejected": -2.1014907360076904, "step": 587 }, { "epoch": 0.75, "learning_rate": 1.553232954407171e-08, "logits/chosen": -3.2613492012023926, "logits/rejected": -3.082167148590088, "logps/chosen": -304.8526306152344, "logps/rejected": -391.41064453125, "loss": 0.3717, "rewards/accuracies": 1.0, "rewards/chosen": 0.6079742908477783, "rewards/margins": 1.8539962768554688, "rewards/rejected": -1.2460219860076904, "step": 588 }, { "epoch": 0.75, "learning_rate": 1.5382897547758512e-08, "logits/chosen": -3.247161626815796, "logits/rejected": -3.1985926628112793, "logps/chosen": -289.5562744140625, "logps/rejected": -684.0227661132812, "loss": 0.3735, "rewards/accuracies": 1.0, "rewards/chosen": 0.6008331775665283, "rewards/margins": 2.5938446521759033, "rewards/rejected": -1.993011474609375, "step": 589 }, { "epoch": 0.75, "learning_rate": 1.52340570622144e-08, "logits/chosen": -3.198961019515991, "logits/rejected": -3.0576484203338623, "logps/chosen": -324.492431640625, "logps/rejected": -1005.84423828125, "loss": 0.3877, "rewards/accuracies": 1.0, "rewards/chosen": 0.5694580078125, "rewards/margins": 3.42386794090271, "rewards/rejected": -2.854409694671631, "step": 590 }, { "epoch": 0.75, "learning_rate": 1.508581063071258e-08, "logits/chosen": -3.235337734222412, "logits/rejected": -3.165937900543213, "logps/chosen": -320.70184326171875, "logps/rejected": -541.1258544921875, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 0.6993682980537415, "rewards/margins": 2.2986130714416504, "rewards/rejected": -1.5992447137832642, "step": 591 }, { "epoch": 0.75, "learning_rate": 1.493816078637557e-08, "logits/chosen": -3.2287940979003906, "logits/rejected": -3.0664119720458984, "logps/chosen": -331.5074462890625, "logps/rejected": -413.8134460449219, "loss": 0.3654, "rewards/accuracies": 1.0, "rewards/chosen": 0.6367446780204773, "rewards/margins": 1.758030652999878, "rewards/rejected": -1.1212860345840454, "step": 592 }, { "epoch": 0.76, "learning_rate": 1.47911100521318e-08, "logits/chosen": -3.2041492462158203, "logits/rejected": -3.1454172134399414, "logps/chosen": -284.87713623046875, "logps/rejected": -461.22149658203125, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": 0.5847762823104858, "rewards/margins": 1.8279907703399658, "rewards/rejected": -1.2432143688201904, "step": 593 }, { "epoch": 0.76, "learning_rate": 1.4644660940672625e-08, "logits/chosen": -3.101226568222046, "logits/rejected": -3.0249600410461426, "logps/chosen": -297.4947814941406, "logps/rejected": -1135.284912109375, "loss": 0.3654, "rewards/accuracies": 1.0, "rewards/chosen": 0.8361923098564148, "rewards/margins": 4.788810729980469, "rewards/rejected": -3.9526185989379883, "step": 594 }, { "epoch": 0.76, "learning_rate": 1.4498815954409278e-08, "logits/chosen": -3.160712718963623, "logits/rejected": -3.077864170074463, "logps/chosen": -300.15234375, "logps/rejected": -426.2779235839844, "loss": 0.4188, "rewards/accuracies": 1.0, "rewards/chosen": 0.5366714596748352, "rewards/margins": 1.8155839443206787, "rewards/rejected": -1.2789124250411987, "step": 595 }, { "epoch": 0.76, "learning_rate": 1.4353577585430148e-08, "logits/chosen": -3.232564687728882, "logits/rejected": -3.0458602905273438, "logps/chosen": -251.73597717285156, "logps/rejected": -1157.6531982421875, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 0.5361747741699219, "rewards/margins": 4.349971771240234, "rewards/rejected": -3.8137969970703125, "step": 596 }, { "epoch": 0.76, "learning_rate": 1.4208948315458275e-08, "logits/chosen": -3.128933906555176, "logits/rejected": -2.930844306945801, "logps/chosen": -316.641357421875, "logps/rejected": -3592.85302734375, "loss": 0.4145, "rewards/accuracies": 1.0, "rewards/chosen": 0.7182838916778564, "rewards/margins": 11.807291030883789, "rewards/rejected": -11.089007377624512, "step": 597 }, { "epoch": 0.76, "learning_rate": 1.4064930615808806e-08, "logits/chosen": -3.230276346206665, "logits/rejected": -3.067732572555542, "logps/chosen": -341.46942138671875, "logps/rejected": -1117.1708984375, "loss": 0.3757, "rewards/accuracies": 1.0, "rewards/chosen": 0.43240511417388916, "rewards/margins": 3.7263078689575195, "rewards/rejected": -3.293902635574341, "step": 598 }, { "epoch": 0.76, "learning_rate": 1.39215269473469e-08, "logits/chosen": -3.203866958618164, "logits/rejected": -3.180807113647461, "logps/chosen": -324.54571533203125, "logps/rejected": -1097.523193359375, "loss": 0.4792, "rewards/accuracies": 1.0, "rewards/chosen": 0.827301025390625, "rewards/margins": 4.869989395141602, "rewards/rejected": -4.042688369750977, "step": 599 }, { "epoch": 0.76, "learning_rate": 1.3778739760445552e-08, "logits/chosen": -3.1907029151916504, "logits/rejected": -3.06864857673645, "logps/chosen": -289.54217529296875, "logps/rejected": -1083.228271484375, "loss": 0.3478, "rewards/accuracies": 1.0, "rewards/chosen": 0.666351318359375, "rewards/margins": 4.473645210266113, "rewards/rejected": -3.8072938919067383, "step": 600 }, { "epoch": 0.77, "learning_rate": 1.3636571494943861e-08, "logits/chosen": -3.216348648071289, "logits/rejected": -2.9865622520446777, "logps/chosen": -288.2769775390625, "logps/rejected": -1301.2900390625, "loss": 0.3806, "rewards/accuracies": 1.0, "rewards/chosen": 0.40893250703811646, "rewards/margins": 4.555765151977539, "rewards/rejected": -4.146832466125488, "step": 601 }, { "epoch": 0.77, "learning_rate": 1.349502458010519e-08, "logits/chosen": -3.1652350425720215, "logits/rejected": -3.16892671585083, "logps/chosen": -314.7898254394531, "logps/rejected": -1067.5556640625, "loss": 0.3398, "rewards/accuracies": 1.0, "rewards/chosen": 0.7538009881973267, "rewards/margins": 4.5405073165893555, "rewards/rejected": -3.7867064476013184, "step": 602 }, { "epoch": 0.77, "learning_rate": 1.3354101434575805e-08, "logits/chosen": -3.1747913360595703, "logits/rejected": -3.135251045227051, "logps/chosen": -293.16436767578125, "logps/rejected": -1027.9432373046875, "loss": 0.3429, "rewards/accuracies": 1.0, "rewards/chosen": 0.4745529294013977, "rewards/margins": 3.8980209827423096, "rewards/rejected": -3.4234681129455566, "step": 603 }, { "epoch": 0.77, "learning_rate": 1.321380446634342e-08, "logits/chosen": -3.230456829071045, "logits/rejected": -3.0374910831451416, "logps/chosen": -298.0791015625, "logps/rejected": -2984.79443359375, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 0.5816513299942017, "rewards/margins": 11.273740768432617, "rewards/rejected": -10.692090034484863, "step": 604 }, { "epoch": 0.77, "learning_rate": 1.3074136072696147e-08, "logits/chosen": -3.2403690814971924, "logits/rejected": -3.068743944168091, "logps/chosen": -322.991455078125, "logps/rejected": -680.3526611328125, "loss": 0.3666, "rewards/accuracies": 1.0, "rewards/chosen": 0.7957992553710938, "rewards/margins": 3.1598129272460938, "rewards/rejected": -2.364013671875, "step": 605 }, { "epoch": 0.77, "learning_rate": 1.2935098640181458e-08, "logits/chosen": -3.219139337539673, "logits/rejected": -3.0725748538970947, "logps/chosen": -276.38983154296875, "logps/rejected": -1247.24462890625, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 0.6592696905136108, "rewards/margins": 3.9346022605895996, "rewards/rejected": -3.2753326892852783, "step": 606 }, { "epoch": 0.77, "learning_rate": 1.2796694544565478e-08, "logits/chosen": -3.2551517486572266, "logits/rejected": -3.117729425430298, "logps/chosen": -306.96551513671875, "logps/rejected": -1059.1156005859375, "loss": 0.3655, "rewards/accuracies": 1.0, "rewards/chosen": 0.6375259160995483, "rewards/margins": 3.844590663909912, "rewards/rejected": -3.207064628601074, "step": 607 }, { "epoch": 0.78, "learning_rate": 1.2658926150792321e-08, "logits/chosen": -3.1876072883605957, "logits/rejected": -3.1105217933654785, "logps/chosen": -293.5586853027344, "logps/rejected": -448.782958984375, "loss": 0.4938, "rewards/accuracies": 1.0, "rewards/chosen": 0.595672607421875, "rewards/margins": 1.8718918561935425, "rewards/rejected": -1.2762192487716675, "step": 608 }, { "epoch": 0.78, "learning_rate": 1.2521795812943703e-08, "logits/chosen": -3.2520675659179688, "logits/rejected": -3.191102981567383, "logps/chosen": -294.5622863769531, "logps/rejected": -3563.8681640625, "loss": 0.3238, "rewards/accuracies": 1.0, "rewards/chosen": 0.6282013058662415, "rewards/margins": 7.7801055908203125, "rewards/rejected": -7.151904106140137, "step": 609 }, { "epoch": 0.78, "learning_rate": 1.2385305874198776e-08, "logits/chosen": -3.2237188816070557, "logits/rejected": -3.0899267196655273, "logps/chosen": -337.33154296875, "logps/rejected": -796.156005859375, "loss": 0.4165, "rewards/accuracies": 1.0, "rewards/chosen": 0.6227996945381165, "rewards/margins": 3.0379302501678467, "rewards/rejected": -2.415130615234375, "step": 610 }, { "epoch": 0.78, "learning_rate": 1.2249458666793966e-08, "logits/chosen": -3.162837505340576, "logits/rejected": -3.180877208709717, "logps/chosen": -292.28094482421875, "logps/rejected": -1216.5731201171875, "loss": 0.3402, "rewards/accuracies": 1.0, "rewards/chosen": 0.842510998249054, "rewards/margins": 4.915765762329102, "rewards/rejected": -4.073254585266113, "step": 611 }, { "epoch": 0.78, "learning_rate": 1.2114256511983274e-08, "logits/chosen": -3.1879513263702393, "logits/rejected": -3.1179792881011963, "logps/chosen": -334.3758850097656, "logps/rejected": -842.7742309570312, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 0.5041107535362244, "rewards/margins": 3.2366883754730225, "rewards/rejected": -2.7325775623321533, "step": 612 }, { "epoch": 0.78, "learning_rate": 1.1979701719998452e-08, "logits/chosen": -3.211066246032715, "logits/rejected": -3.011672019958496, "logps/chosen": -278.65240478515625, "logps/rejected": -694.3025512695312, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 0.5701980590820312, "rewards/margins": 2.622183322906494, "rewards/rejected": -2.051985263824463, "step": 613 }, { "epoch": 0.78, "learning_rate": 1.1845796590009682e-08, "logits/chosen": -3.2238192558288574, "logits/rejected": -2.979383707046509, "logps/chosen": -274.48046875, "logps/rejected": -1438.1650390625, "loss": 0.3935, "rewards/accuracies": 1.0, "rewards/chosen": 0.5493003726005554, "rewards/margins": 5.82890510559082, "rewards/rejected": -5.279604911804199, "step": 614 }, { "epoch": 0.78, "learning_rate": 1.1712543410086145e-08, "logits/chosen": -3.1785812377929688, "logits/rejected": -2.984666347503662, "logps/chosen": -263.34490966796875, "logps/rejected": -1317.18798828125, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 0.44180527329444885, "rewards/margins": 4.58309268951416, "rewards/rejected": -4.141287326812744, "step": 615 }, { "epoch": 0.79, "learning_rate": 1.157994445715706e-08, "logits/chosen": -3.199995517730713, "logits/rejected": -3.0870492458343506, "logps/chosen": -262.36993408203125, "logps/rejected": -681.9628295898438, "loss": 0.3693, "rewards/accuracies": 1.0, "rewards/chosen": 0.5514167547225952, "rewards/margins": 2.4747109413146973, "rewards/rejected": -1.9232940673828125, "step": 616 }, { "epoch": 0.79, "learning_rate": 1.1448001996972645e-08, "logits/chosen": -3.218848705291748, "logits/rejected": -3.1180012226104736, "logps/chosen": -301.41021728515625, "logps/rejected": -592.742431640625, "loss": 0.3755, "rewards/accuracies": 1.0, "rewards/chosen": 0.5736435055732727, "rewards/margins": 2.6843948364257812, "rewards/rejected": -2.1107513904571533, "step": 617 }, { "epoch": 0.79, "learning_rate": 1.1316718284065534e-08, "logits/chosen": -3.1913106441497803, "logits/rejected": -3.007892608642578, "logps/chosen": -286.28271484375, "logps/rejected": -1018.0855712890625, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 0.6660293936729431, "rewards/margins": 3.7797813415527344, "rewards/rejected": -3.1137518882751465, "step": 618 }, { "epoch": 0.79, "learning_rate": 1.1186095561712128e-08, "logits/chosen": -3.2579457759857178, "logits/rejected": -3.1007728576660156, "logps/chosen": -312.11468505859375, "logps/rejected": -736.5653686523438, "loss": 0.4254, "rewards/accuracies": 1.0, "rewards/chosen": 0.5914520025253296, "rewards/margins": 3.224246025085449, "rewards/rejected": -2.632794141769409, "step": 619 }, { "epoch": 0.79, "learning_rate": 1.1056136061894383e-08, "logits/chosen": -3.261101722717285, "logits/rejected": -3.133169174194336, "logps/chosen": -292.71551513671875, "logps/rejected": -1462.875732421875, "loss": 0.3625, "rewards/accuracies": 1.0, "rewards/chosen": 0.7436469793319702, "rewards/margins": 5.485978603363037, "rewards/rejected": -4.742331027984619, "step": 620 }, { "epoch": 0.79, "learning_rate": 1.0926842005261549e-08, "logits/chosen": -3.1444897651672363, "logits/rejected": -3.1617188453674316, "logps/chosen": -298.6170349121094, "logps/rejected": -857.9765014648438, "loss": 0.3763, "rewards/accuracies": 1.0, "rewards/chosen": 0.7347885370254517, "rewards/margins": 3.745286464691162, "rewards/rejected": -3.010498046875, "step": 621 }, { "epoch": 0.79, "learning_rate": 1.0798215601092353e-08, "logits/chosen": -3.256404161453247, "logits/rejected": -2.9615297317504883, "logps/chosen": -282.98614501953125, "logps/rejected": -2585.22802734375, "loss": 0.367, "rewards/accuracies": 1.0, "rewards/chosen": 0.6839675903320312, "rewards/margins": 8.651124954223633, "rewards/rejected": -7.967157363891602, "step": 622 }, { "epoch": 0.79, "learning_rate": 1.067025904725713e-08, "logits/chosen": -3.2144360542297363, "logits/rejected": -3.154191493988037, "logps/chosen": -293.63336181640625, "logps/rejected": -703.9183959960938, "loss": 0.3585, "rewards/accuracies": 1.0, "rewards/chosen": 0.6726547479629517, "rewards/margins": 3.114428997039795, "rewards/rejected": -2.4417741298675537, "step": 623 }, { "epoch": 0.8, "learning_rate": 1.0542974530180326e-08, "logits/chosen": -3.185856819152832, "logits/rejected": -3.102485179901123, "logps/chosen": -293.21502685546875, "logps/rejected": -638.3473510742188, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 0.7283782958984375, "rewards/margins": 2.729597568511963, "rewards/rejected": -2.0012192726135254, "step": 624 }, { "epoch": 0.8, "learning_rate": 1.0416364224803182e-08, "logits/chosen": -3.140071153640747, "logits/rejected": -3.0963966846466064, "logps/chosen": -344.5130920410156, "logps/rejected": -643.440185546875, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": 0.7475296258926392, "rewards/margins": 2.4806976318359375, "rewards/rejected": -1.733168125152588, "step": 625 }, { "epoch": 0.8, "learning_rate": 1.0290430294546448e-08, "logits/chosen": -3.2173027992248535, "logits/rejected": -3.0957908630371094, "logps/chosen": -240.49545288085938, "logps/rejected": -713.5307006835938, "loss": 0.3618, "rewards/accuracies": 1.0, "rewards/chosen": 0.5469444394111633, "rewards/margins": 2.688997745513916, "rewards/rejected": -2.1420531272888184, "step": 626 }, { "epoch": 0.8, "learning_rate": 1.016517489127357e-08, "logits/chosen": -3.1559386253356934, "logits/rejected": -3.1280295848846436, "logps/chosen": -298.00506591796875, "logps/rejected": -1668.661376953125, "loss": 0.386, "rewards/accuracies": 1.0, "rewards/chosen": 0.8507492542266846, "rewards/margins": 6.737382888793945, "rewards/rejected": -5.886633396148682, "step": 627 }, { "epoch": 0.8, "learning_rate": 1.0040600155253764e-08, "logits/chosen": -3.1657886505126953, "logits/rejected": -3.0944666862487793, "logps/chosen": -355.71258544921875, "logps/rejected": -978.0113525390625, "loss": 0.4368, "rewards/accuracies": 1.0, "rewards/chosen": 0.7419112920761108, "rewards/margins": 3.7167861461639404, "rewards/rejected": -2.974874973297119, "step": 628 }, { "epoch": 0.8, "learning_rate": 9.916708215125585e-09, "logits/chosen": -3.1956019401550293, "logits/rejected": -3.0983948707580566, "logps/chosen": -295.4072265625, "logps/rejected": -652.2484130859375, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 0.7121047973632812, "rewards/margins": 2.643660068511963, "rewards/rejected": -1.9315552711486816, "step": 629 }, { "epoch": 0.8, "learning_rate": 9.793501187860431e-09, "logits/chosen": -3.1795501708984375, "logits/rejected": -2.923694610595703, "logps/chosen": -303.447021484375, "logps/rejected": -1292.77587890625, "loss": 0.3912, "rewards/accuracies": 1.0, "rewards/chosen": 0.4927116632461548, "rewards/margins": 4.529209136962891, "rewards/rejected": -4.036497592926025, "step": 630 }, { "epoch": 0.8, "learning_rate": 9.670981178726485e-09, "logits/chosen": -3.2398881912231445, "logits/rejected": -3.131667137145996, "logps/chosen": -298.68255615234375, "logps/rejected": -431.2021484375, "loss": 0.4153, "rewards/accuracies": 1.0, "rewards/chosen": 0.5923904180526733, "rewards/margins": 1.7624390125274658, "rewards/rejected": -1.170048475265503, "step": 631 }, { "epoch": 0.81, "learning_rate": 9.549150281252633e-09, "logits/chosen": -3.1884500980377197, "logits/rejected": -3.050455093383789, "logps/chosen": -312.8995666503906, "logps/rejected": -1529.9827880859375, "loss": 0.4442, "rewards/accuracies": 1.0, "rewards/chosen": 0.6491225957870483, "rewards/margins": 6.029634475708008, "rewards/rejected": -5.380511283874512, "step": 632 }, { "epoch": 0.81, "learning_rate": 9.428010577192796e-09, "logits/chosen": -3.166779041290283, "logits/rejected": -3.0989861488342285, "logps/chosen": -315.7928771972656, "logps/rejected": -718.4888916015625, "loss": 0.4266, "rewards/accuracies": 1.0, "rewards/chosen": 0.539227306842804, "rewards/margins": 2.7135329246520996, "rewards/rejected": -2.1743056774139404, "step": 633 }, { "epoch": 0.81, "learning_rate": 9.307564136490254e-09, "logits/chosen": -3.213235855102539, "logits/rejected": -3.127086639404297, "logps/chosen": -299.2451477050781, "logps/rejected": -431.27276611328125, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 0.6102280020713806, "rewards/margins": 1.8352646827697754, "rewards/rejected": -1.22503662109375, "step": 634 }, { "epoch": 0.81, "learning_rate": 9.187813017242386e-09, "logits/chosen": -3.2164196968078613, "logits/rejected": -3.072253704071045, "logps/chosen": -298.36016845703125, "logps/rejected": -649.3165283203125, "loss": 0.4329, "rewards/accuracies": 1.0, "rewards/chosen": 0.5936005115509033, "rewards/margins": 2.5906982421875, "rewards/rejected": -1.9970978498458862, "step": 635 }, { "epoch": 0.81, "learning_rate": 9.068759265665382e-09, "logits/chosen": -3.2064032554626465, "logits/rejected": -3.1343915462493896, "logps/chosen": -353.2247314453125, "logps/rejected": -627.2993774414062, "loss": 0.4154, "rewards/accuracies": 1.0, "rewards/chosen": 0.6988846063613892, "rewards/margins": 2.401745557785034, "rewards/rejected": -1.7028610706329346, "step": 636 }, { "epoch": 0.81, "learning_rate": 8.950404916059406e-09, "logits/chosen": -3.2286949157714844, "logits/rejected": -3.0597119331359863, "logps/chosen": -275.50537109375, "logps/rejected": -512.9461669921875, "loss": 0.4201, "rewards/accuracies": 1.0, "rewards/chosen": 0.5865631103515625, "rewards/margins": 1.8351958990097046, "rewards/rejected": -1.248632788658142, "step": 637 }, { "epoch": 0.81, "learning_rate": 8.832751990773713e-09, "logits/chosen": -3.1961007118225098, "logits/rejected": -3.0305111408233643, "logps/chosen": -328.4877624511719, "logps/rejected": -443.09521484375, "loss": 0.4568, "rewards/accuracies": 1.0, "rewards/chosen": 0.6881149411201477, "rewards/margins": 1.9717285633087158, "rewards/rejected": -1.2836135625839233, "step": 638 }, { "epoch": 0.81, "learning_rate": 8.715802500172214e-09, "logits/chosen": -3.201709747314453, "logits/rejected": -3.0527853965759277, "logps/chosen": -286.9666748046875, "logps/rejected": -686.5706176757812, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 0.6477676630020142, "rewards/margins": 2.8871095180511475, "rewards/rejected": -2.2393417358398438, "step": 639 }, { "epoch": 0.82, "learning_rate": 8.599558442598998e-09, "logits/chosen": -3.1968026161193848, "logits/rejected": -2.920701026916504, "logps/chosen": -323.91827392578125, "logps/rejected": -2272.82470703125, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 0.6918350458145142, "rewards/margins": 8.648866653442383, "rewards/rejected": -7.95703125, "step": 640 }, { "epoch": 0.82, "learning_rate": 8.484021804344305e-09, "logits/chosen": -3.1589488983154297, "logits/rejected": -3.097161054611206, "logps/chosen": -299.0281982421875, "logps/rejected": -539.90478515625, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 0.6258010864257812, "rewards/margins": 2.2825005054473877, "rewards/rejected": -1.6566994190216064, "step": 641 }, { "epoch": 0.82, "learning_rate": 8.369194559610482e-09, "logits/chosen": -3.2193729877471924, "logits/rejected": -3.1034140586853027, "logps/chosen": -312.00726318359375, "logps/rejected": -600.1370849609375, "loss": 0.4184, "rewards/accuracies": 1.0, "rewards/chosen": 0.7323989868164062, "rewards/margins": 2.616734504699707, "rewards/rejected": -1.8843353986740112, "step": 642 }, { "epoch": 0.82, "learning_rate": 8.25507867047835e-09, "logits/chosen": -3.24824857711792, "logits/rejected": -3.1001033782958984, "logps/chosen": -294.35137939453125, "logps/rejected": -693.6536254882812, "loss": 0.3812, "rewards/accuracies": 1.0, "rewards/chosen": 0.6110076904296875, "rewards/margins": 2.5430235862731934, "rewards/rejected": -1.9320160150527954, "step": 643 }, { "epoch": 0.82, "learning_rate": 8.141676086873572e-09, "logits/chosen": -3.2194020748138428, "logits/rejected": -3.0117225646972656, "logps/chosen": -291.5976867675781, "logps/rejected": -1317.936767578125, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 0.6047104001045227, "rewards/margins": 4.8288254737854, "rewards/rejected": -4.224114894866943, "step": 644 }, { "epoch": 0.82, "learning_rate": 8.028988746533432e-09, "logits/chosen": -3.221467971801758, "logits/rejected": -3.0671043395996094, "logps/chosen": -281.31646728515625, "logps/rejected": -415.2162780761719, "loss": 0.4594, "rewards/accuracies": 1.0, "rewards/chosen": 0.5429031252861023, "rewards/margins": 1.5473358631134033, "rewards/rejected": -1.0044326782226562, "step": 645 }, { "epoch": 0.82, "learning_rate": 7.917018574973644e-09, "logits/chosen": -3.203296184539795, "logits/rejected": -3.038102865219116, "logps/chosen": -314.24322509765625, "logps/rejected": -670.4117431640625, "loss": 0.4193, "rewards/accuracies": 1.0, "rewards/chosen": 0.8325378894805908, "rewards/margins": 2.828110456466675, "rewards/rejected": -1.9955726861953735, "step": 646 }, { "epoch": 0.82, "learning_rate": 7.805767485455527e-09, "logits/chosen": -3.1836509704589844, "logits/rejected": -3.099478244781494, "logps/chosen": -349.501953125, "logps/rejected": -354.75140380859375, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": 0.6456451416015625, "rewards/margins": 1.6594727039337158, "rewards/rejected": -1.0138275623321533, "step": 647 }, { "epoch": 0.83, "learning_rate": 7.695237378953223e-09, "logits/chosen": -3.1789817810058594, "logits/rejected": -2.9934451580047607, "logps/chosen": -324.02691650390625, "logps/rejected": -3629.000244140625, "loss": 0.3852, "rewards/accuracies": 1.0, "rewards/chosen": 0.6175537109375, "rewards/margins": 8.326087951660156, "rewards/rejected": -7.708534240722656, "step": 648 }, { "epoch": 0.83, "learning_rate": 7.585430144121319e-09, "logits/chosen": -3.2198870182037354, "logits/rejected": -3.1139631271362305, "logps/chosen": -257.1016845703125, "logps/rejected": -675.375732421875, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 0.6456893682479858, "rewards/margins": 2.6052260398864746, "rewards/rejected": -1.9595367908477783, "step": 649 }, { "epoch": 0.83, "learning_rate": 7.476347657262455e-09, "logits/chosen": -3.229457378387451, "logits/rejected": -3.1295313835144043, "logps/chosen": -289.19000244140625, "logps/rejected": -1055.0440673828125, "loss": 0.3673, "rewards/accuracies": 1.0, "rewards/chosen": 0.5712951421737671, "rewards/margins": 3.946572780609131, "rewards/rejected": -3.3752777576446533, "step": 650 }, { "epoch": 0.83, "learning_rate": 7.367991782295391e-09, "logits/chosen": -3.1479439735412598, "logits/rejected": -3.1603617668151855, "logps/chosen": -307.9562683105469, "logps/rejected": -594.515625, "loss": 0.4131, "rewards/accuracies": 1.0, "rewards/chosen": 0.7819595336914062, "rewards/margins": 2.7565033435821533, "rewards/rejected": -1.974543809890747, "step": 651 }, { "epoch": 0.83, "learning_rate": 7.260364370723044e-09, "logits/chosen": -3.190568447113037, "logits/rejected": -2.9808664321899414, "logps/chosen": -276.9456481933594, "logps/rejected": -1836.8212890625, "loss": 0.3673, "rewards/accuracies": 1.0, "rewards/chosen": 0.5658226013183594, "rewards/margins": 6.989541530609131, "rewards/rejected": -6.423718452453613, "step": 652 }, { "epoch": 0.83, "learning_rate": 7.153467261600948e-09, "logits/chosen": -3.225680351257324, "logits/rejected": -3.116489887237549, "logps/chosen": -294.24053955078125, "logps/rejected": -1071.5364990234375, "loss": 0.3697, "rewards/accuracies": 1.0, "rewards/chosen": 0.7151840329170227, "rewards/margins": 4.5580854415893555, "rewards/rejected": -3.8429017066955566, "step": 653 }, { "epoch": 0.83, "learning_rate": 7.047302281505735e-09, "logits/chosen": -3.2185990810394287, "logits/rejected": -2.9491963386535645, "logps/chosen": -281.07147216796875, "logps/rejected": -677.61962890625, "loss": 0.4088, "rewards/accuracies": 1.0, "rewards/chosen": 0.5253158807754517, "rewards/margins": 2.6603240966796875, "rewards/rejected": -2.1350083351135254, "step": 654 }, { "epoch": 0.83, "learning_rate": 6.9418712445040165e-09, "logits/chosen": -3.18601655960083, "logits/rejected": -2.9326558113098145, "logps/chosen": -261.1278076171875, "logps/rejected": -1361.590576171875, "loss": 0.4427, "rewards/accuracies": 1.0, "rewards/chosen": 0.7044914960861206, "rewards/margins": 4.808281421661377, "rewards/rejected": -4.103790283203125, "step": 655 }, { "epoch": 0.84, "learning_rate": 6.837175952121305e-09, "logits/chosen": -3.212704658508301, "logits/rejected": -3.1013479232788086, "logps/chosen": -264.63067626953125, "logps/rejected": -776.9671630859375, "loss": 0.4008, "rewards/accuracies": 1.0, "rewards/chosen": 0.5692589282989502, "rewards/margins": 3.0238304138183594, "rewards/rejected": -2.4545717239379883, "step": 656 }, { "epoch": 0.84, "learning_rate": 6.733218193311291e-09, "logits/chosen": -3.187042474746704, "logits/rejected": -2.9553134441375732, "logps/chosen": -261.625244140625, "logps/rejected": -1304.9052734375, "loss": 0.3319, "rewards/accuracies": 1.0, "rewards/chosen": 0.5570533871650696, "rewards/margins": 4.681659698486328, "rewards/rejected": -4.124606132507324, "step": 657 }, { "epoch": 0.84, "learning_rate": 6.629999744425236e-09, "logits/chosen": -3.2095141410827637, "logits/rejected": -3.1920104026794434, "logps/chosen": -300.57037353515625, "logps/rejected": -507.1513977050781, "loss": 0.375, "rewards/accuracies": 1.0, "rewards/chosen": 0.5786407589912415, "rewards/margins": 2.3110992908477783, "rewards/rejected": -1.7324585914611816, "step": 658 }, { "epoch": 0.84, "learning_rate": 6.527522369181654e-09, "logits/chosen": -3.2011075019836426, "logits/rejected": -3.141143560409546, "logps/chosen": -267.9905700683594, "logps/rejected": -951.5579833984375, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 0.5844413638114929, "rewards/margins": 3.991020917892456, "rewards/rejected": -3.4065794944763184, "step": 659 }, { "epoch": 0.84, "learning_rate": 6.42578781863613e-09, "logits/chosen": -3.1843504905700684, "logits/rejected": -3.0849013328552246, "logps/chosen": -367.78546142578125, "logps/rejected": -446.81396484375, "loss": 0.4399, "rewards/accuracies": 1.0, "rewards/chosen": 0.7669128179550171, "rewards/margins": 2.0155515670776367, "rewards/rejected": -1.2486388683319092, "step": 660 }, { "epoch": 0.84, "learning_rate": 6.324797831151452e-09, "logits/chosen": -3.1739625930786133, "logits/rejected": -3.0056445598602295, "logps/chosen": -322.02935791015625, "logps/rejected": -1367.14501953125, "loss": 0.3876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8497527837753296, "rewards/margins": 5.065661430358887, "rewards/rejected": -4.215909004211426, "step": 661 }, { "epoch": 0.84, "learning_rate": 6.22455413236786e-09, "logits/chosen": -3.1611099243164062, "logits/rejected": -2.8947596549987793, "logps/chosen": -315.3330078125, "logps/rejected": -1370.766845703125, "loss": 0.3866, "rewards/accuracies": 1.0, "rewards/chosen": 0.9430527091026306, "rewards/margins": 4.720611572265625, "rewards/rejected": -3.7775590419769287, "step": 662 }, { "epoch": 0.85, "learning_rate": 6.125058435173569e-09, "logits/chosen": -3.2055461406707764, "logits/rejected": -2.9365530014038086, "logps/chosen": -301.68145751953125, "logps/rejected": -1670.8336181640625, "loss": 0.3558, "rewards/accuracies": 1.0, "rewards/chosen": 0.7245162725448608, "rewards/margins": 6.044398784637451, "rewards/rejected": -5.319882392883301, "step": 663 }, { "epoch": 0.85, "learning_rate": 6.026312439675552e-09, "logits/chosen": -3.191805362701416, "logits/rejected": -2.907623767852783, "logps/chosen": -379.9254455566406, "logps/rejected": -2029.1109619140625, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 0.5404312610626221, "rewards/margins": 6.867895603179932, "rewards/rejected": -6.3274641036987305, "step": 664 }, { "epoch": 0.85, "learning_rate": 5.928317833170393e-09, "logits/chosen": -3.1741297245025635, "logits/rejected": -3.0331149101257324, "logps/chosen": -273.0776672363281, "logps/rejected": -1754.444091796875, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": 0.5252349972724915, "rewards/margins": 6.554409980773926, "rewards/rejected": -6.0291748046875, "step": 665 }, { "epoch": 0.85, "learning_rate": 5.831076290115572e-09, "logits/chosen": -3.1829848289489746, "logits/rejected": -3.174759864807129, "logps/chosen": -344.27227783203125, "logps/rejected": -697.61181640625, "loss": 0.3848, "rewards/accuracies": 1.0, "rewards/chosen": 0.7314102053642273, "rewards/margins": 2.9258437156677246, "rewards/rejected": -2.1944336891174316, "step": 666 }, { "epoch": 0.85, "learning_rate": 5.734589472100737e-09, "logits/chosen": -3.227323055267334, "logits/rejected": -3.136648416519165, "logps/chosen": -295.7225341796875, "logps/rejected": -762.833740234375, "loss": 0.3772, "rewards/accuracies": 1.0, "rewards/chosen": 0.7328933477401733, "rewards/margins": 3.603895664215088, "rewards/rejected": -2.871002197265625, "step": 667 }, { "epoch": 0.85, "learning_rate": 5.638859027819409e-09, "logits/chosen": -3.173727035522461, "logits/rejected": -3.167201519012451, "logps/chosen": -287.3570556640625, "logps/rejected": -665.5151977539062, "loss": 0.3782, "rewards/accuracies": 1.0, "rewards/chosen": 0.7403457760810852, "rewards/margins": 2.984387159347534, "rewards/rejected": -2.2440414428710938, "step": 668 }, { "epoch": 0.85, "learning_rate": 5.543886593040736e-09, "logits/chosen": -3.2112481594085693, "logits/rejected": -3.0102343559265137, "logps/chosen": -317.2652587890625, "logps/rejected": -324.43731689453125, "loss": 0.4004, "rewards/accuracies": 1.0, "rewards/chosen": 0.6820632815361023, "rewards/margins": 1.4637176990509033, "rewards/rejected": -0.7816543579101562, "step": 669 }, { "epoch": 0.85, "learning_rate": 5.44967379058161e-09, "logits/chosen": -3.1375887393951416, "logits/rejected": -2.9998369216918945, "logps/chosen": -303.53778076171875, "logps/rejected": -874.317138671875, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 0.7584106922149658, "rewards/margins": 3.583639621734619, "rewards/rejected": -2.8252289295196533, "step": 670 }, { "epoch": 0.86, "learning_rate": 5.356222230278856e-09, "logits/chosen": -3.2412219047546387, "logits/rejected": -3.1876304149627686, "logps/chosen": -322.7831115722656, "logps/rejected": -686.5469970703125, "loss": 0.3621, "rewards/accuracies": 1.0, "rewards/chosen": 0.6718552112579346, "rewards/margins": 2.784724712371826, "rewards/rejected": -2.1128692626953125, "step": 671 }, { "epoch": 0.86, "learning_rate": 5.263533508961826e-09, "logits/chosen": -3.2075963020324707, "logits/rejected": -3.06126070022583, "logps/chosen": -319.6999206542969, "logps/rejected": -226.6263427734375, "loss": 0.3913, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589149713516235, "rewards/margins": 1.441750407218933, "rewards/rejected": -0.5828354358673096, "step": 672 }, { "epoch": 0.86, "learning_rate": 5.17160921042501e-09, "logits/chosen": -3.173320770263672, "logits/rejected": -3.0253348350524902, "logps/chosen": -271.0562744140625, "logps/rejected": -622.025146484375, "loss": 0.4255, "rewards/accuracies": 1.0, "rewards/chosen": 0.7230300903320312, "rewards/margins": 2.4337358474731445, "rewards/rejected": -1.7107056379318237, "step": 673 }, { "epoch": 0.86, "learning_rate": 5.080450905401057e-09, "logits/chosen": -3.2301998138427734, "logits/rejected": -3.1662395000457764, "logps/chosen": -294.4507141113281, "logps/rejected": -603.0359497070312, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 0.8338730335235596, "rewards/margins": 2.8640549182891846, "rewards/rejected": -2.030181884765625, "step": 674 }, { "epoch": 0.86, "learning_rate": 4.9900601515338705e-09, "logits/chosen": -3.205817699432373, "logits/rejected": -3.0113778114318848, "logps/chosen": -324.7476806640625, "logps/rejected": -464.58563232421875, "loss": 0.4424, "rewards/accuracies": 1.0, "rewards/chosen": 0.7953994870185852, "rewards/margins": 1.9840224981307983, "rewards/rejected": -1.188623070716858, "step": 675 }, { "epoch": 0.86, "learning_rate": 4.9004384933520545e-09, "logits/chosen": -3.162327766418457, "logits/rejected": -3.0500316619873047, "logps/chosen": -309.560302734375, "logps/rejected": -1040.028564453125, "loss": 0.4202, "rewards/accuracies": 1.0, "rewards/chosen": 0.7019668817520142, "rewards/margins": 3.998173475265503, "rewards/rejected": -3.2962067127227783, "step": 676 }, { "epoch": 0.86, "learning_rate": 4.811587462242461e-09, "logits/chosen": -3.1395297050476074, "logits/rejected": -3.1153202056884766, "logps/chosen": -315.3875732421875, "logps/rejected": -914.7925415039062, "loss": 0.4053, "rewards/accuracies": 1.0, "rewards/chosen": 0.7801392078399658, "rewards/margins": 3.7201967239379883, "rewards/rejected": -2.9400575160980225, "step": 677 }, { "epoch": 0.86, "learning_rate": 4.7235085764240625e-09, "logits/chosen": -3.166598320007324, "logits/rejected": -3.1013574600219727, "logps/chosen": -311.6807556152344, "logps/rejected": -1520.334228515625, "loss": 0.4132, "rewards/accuracies": 1.0, "rewards/chosen": 0.7898025512695312, "rewards/margins": 4.967543125152588, "rewards/rejected": -4.177740573883057, "step": 678 }, { "epoch": 0.87, "learning_rate": 4.636203340922007e-09, "logits/chosen": -3.1295456886291504, "logits/rejected": -3.099003314971924, "logps/chosen": -292.8515625, "logps/rejected": -679.4717407226562, "loss": 0.4325, "rewards/accuracies": 1.0, "rewards/chosen": 0.7147582769393921, "rewards/margins": 2.8826797008514404, "rewards/rejected": -2.167921543121338, "step": 679 }, { "epoch": 0.87, "learning_rate": 4.549673247541874e-09, "logits/chosen": -3.259054183959961, "logits/rejected": -3.102315664291382, "logps/chosen": -276.3656005859375, "logps/rejected": -417.9696044921875, "loss": 0.4061, "rewards/accuracies": 1.0, "rewards/chosen": 0.5470665097236633, "rewards/margins": 1.8681786060333252, "rewards/rejected": -1.321112036705017, "step": 680 }, { "epoch": 0.87, "learning_rate": 4.463919774844233e-09, "logits/chosen": -3.1922545433044434, "logits/rejected": -2.9245548248291016, "logps/chosen": -334.2574462890625, "logps/rejected": -1620.05712890625, "loss": 0.407, "rewards/accuracies": 1.0, "rewards/chosen": 0.8049026727676392, "rewards/margins": 5.680906772613525, "rewards/rejected": -4.876004219055176, "step": 681 }, { "epoch": 0.87, "learning_rate": 4.37894438811931e-09, "logits/chosen": -3.1363773345947266, "logits/rejected": -3.0102481842041016, "logps/chosen": -287.7495422363281, "logps/rejected": -1663.244873046875, "loss": 0.3547, "rewards/accuracies": 1.0, "rewards/chosen": 0.8029037714004517, "rewards/margins": 6.249262809753418, "rewards/rejected": -5.446359157562256, "step": 682 }, { "epoch": 0.87, "learning_rate": 4.294748539362031e-09, "logits/chosen": -3.1324610710144043, "logits/rejected": -3.092179536819458, "logps/chosen": -316.67352294921875, "logps/rejected": -449.9515075683594, "loss": 0.3814, "rewards/accuracies": 1.0, "rewards/chosen": 0.676513671875, "rewards/margins": 2.0609192848205566, "rewards/rejected": -1.384405493736267, "step": 683 }, { "epoch": 0.87, "learning_rate": 4.2113336672471245e-09, "logits/chosen": -3.240708589553833, "logits/rejected": -3.2064454555511475, "logps/chosen": -326.0115966796875, "logps/rejected": -793.60205078125, "loss": 0.3717, "rewards/accuracies": 1.0, "rewards/chosen": 0.6662399768829346, "rewards/margins": 2.9811296463012695, "rewards/rejected": -2.314889430999756, "step": 684 }, { "epoch": 0.87, "learning_rate": 4.128701197104628e-09, "logits/chosen": -3.1469757556915283, "logits/rejected": -3.035496950149536, "logps/chosen": -294.5797119140625, "logps/rejected": -1053.807373046875, "loss": 0.351, "rewards/accuracies": 1.0, "rewards/chosen": 0.5670578479766846, "rewards/margins": 4.180845737457275, "rewards/rejected": -3.61378812789917, "step": 685 }, { "epoch": 0.87, "learning_rate": 4.0468525408954454e-09, "logits/chosen": -3.2118563652038574, "logits/rejected": -3.051189661026001, "logps/chosen": -284.482666015625, "logps/rejected": -878.2967529296875, "loss": 0.4003, "rewards/accuracies": 1.0, "rewards/chosen": 0.6894134283065796, "rewards/margins": 3.2594757080078125, "rewards/rejected": -2.5700623989105225, "step": 686 }, { "epoch": 0.88, "learning_rate": 3.9657890971873e-09, "logits/chosen": -3.2550652027130127, "logits/rejected": -3.047168016433716, "logps/chosen": -280.9194030761719, "logps/rejected": -1576.694580078125, "loss": 0.3487, "rewards/accuracies": 1.0, "rewards/chosen": 0.6268417835235596, "rewards/margins": 5.901292324066162, "rewards/rejected": -5.274450778961182, "step": 687 }, { "epoch": 0.88, "learning_rate": 3.8855122511307626e-09, "logits/chosen": -3.244507312774658, "logits/rejected": -3.1207046508789062, "logps/chosen": -318.93780517578125, "logps/rejected": -371.9796447753906, "loss": 0.4766, "rewards/accuracies": 1.0, "rewards/chosen": 0.6206680536270142, "rewards/margins": 1.680220127105713, "rewards/rejected": -1.0595520734786987, "step": 688 }, { "epoch": 0.88, "learning_rate": 3.8060233744356625e-09, "logits/chosen": -3.196354866027832, "logits/rejected": -3.0705645084381104, "logps/chosen": -281.1085205078125, "logps/rejected": -607.69970703125, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 0.7124252319335938, "rewards/margins": 2.310804843902588, "rewards/rejected": -1.5983794927597046, "step": 689 }, { "epoch": 0.88, "learning_rate": 3.727323825347578e-09, "logits/chosen": -3.187636375427246, "logits/rejected": -2.952805519104004, "logps/chosen": -256.0855712890625, "logps/rejected": -1079.634521484375, "loss": 0.4758, "rewards/accuracies": 1.0, "rewards/chosen": 0.47701495885849, "rewards/margins": 4.15155553817749, "rewards/rejected": -3.6745407581329346, "step": 690 }, { "epoch": 0.88, "learning_rate": 3.649414948624652e-09, "logits/chosen": -3.2041807174682617, "logits/rejected": -3.090986967086792, "logps/chosen": -341.1267395019531, "logps/rejected": -559.1605224609375, "loss": 0.4158, "rewards/accuracies": 1.0, "rewards/chosen": 0.5978164672851562, "rewards/margins": 2.383422374725342, "rewards/rejected": -1.7856056690216064, "step": 691 }, { "epoch": 0.88, "learning_rate": 3.5722980755146515e-09, "logits/chosen": -3.200984477996826, "logits/rejected": -2.9716386795043945, "logps/chosen": -255.27783203125, "logps/rejected": -1087.7823486328125, "loss": 0.3536, "rewards/accuracies": 1.0, "rewards/chosen": 0.7224342823028564, "rewards/margins": 3.8398356437683105, "rewards/rejected": -3.117401123046875, "step": 692 }, { "epoch": 0.88, "learning_rate": 3.4959745237321427e-09, "logits/chosen": -3.2456045150756836, "logits/rejected": -3.148855686187744, "logps/chosen": -291.146484375, "logps/rejected": -320.76556396484375, "loss": 0.4491, "rewards/accuracies": 1.0, "rewards/chosen": 0.5875107049942017, "rewards/margins": 1.4597976207733154, "rewards/rejected": -0.8722870349884033, "step": 693 }, { "epoch": 0.88, "learning_rate": 3.4204455974360556e-09, "logits/chosen": -3.284578800201416, "logits/rejected": -3.1131768226623535, "logps/chosen": -266.48675537109375, "logps/rejected": -1265.3289794921875, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 0.5005782842636108, "rewards/margins": 4.811589241027832, "rewards/rejected": -4.311010837554932, "step": 694 }, { "epoch": 0.89, "learning_rate": 3.3457125872073388e-09, "logits/chosen": -3.2124900817871094, "logits/rejected": -3.127713441848755, "logps/chosen": -309.71356201171875, "logps/rejected": -686.1776123046875, "loss": 0.4402, "rewards/accuracies": 1.0, "rewards/chosen": 0.6789658069610596, "rewards/margins": 2.464150905609131, "rewards/rejected": -1.7851852178573608, "step": 695 }, { "epoch": 0.89, "learning_rate": 3.2717767700269627e-09, "logits/chosen": -3.218496322631836, "logits/rejected": -3.0609261989593506, "logps/chosen": -268.1003723144531, "logps/rejected": -448.26190185546875, "loss": 0.4097, "rewards/accuracies": 1.0, "rewards/chosen": 0.647687554359436, "rewards/margins": 1.6989006996154785, "rewards/rejected": -1.051213026046753, "step": 696 }, { "epoch": 0.89, "learning_rate": 3.198639409254017e-09, "logits/chosen": -3.199665069580078, "logits/rejected": -3.0766544342041016, "logps/chosen": -282.9046325683594, "logps/rejected": -1382.535888671875, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": 0.5880317687988281, "rewards/margins": 5.397614479064941, "rewards/rejected": -4.809582710266113, "step": 697 }, { "epoch": 0.89, "learning_rate": 3.1263017546042324e-09, "logits/chosen": -3.1841917037963867, "logits/rejected": -3.084965467453003, "logps/chosen": -317.069091796875, "logps/rejected": -728.71484375, "loss": 0.413, "rewards/accuracies": 1.0, "rewards/chosen": 0.7664825916290283, "rewards/margins": 3.131582736968994, "rewards/rejected": -2.365100145339966, "step": 698 }, { "epoch": 0.89, "learning_rate": 3.054765042128521e-09, "logits/chosen": -3.1769795417785645, "logits/rejected": -3.153965473175049, "logps/chosen": -287.5623779296875, "logps/rejected": -784.6666259765625, "loss": 0.3388, "rewards/accuracies": 1.0, "rewards/chosen": 0.5664596557617188, "rewards/margins": 3.1077804565429688, "rewards/rejected": -2.54132080078125, "step": 699 }, { "epoch": 0.89, "learning_rate": 2.9840304941919412e-09, "logits/chosen": -3.2292990684509277, "logits/rejected": -3.1292903423309326, "logps/chosen": -302.6883239746094, "logps/rejected": -543.7086791992188, "loss": 0.4598, "rewards/accuracies": 1.0, "rewards/chosen": 0.7623649835586548, "rewards/margins": 2.368380069732666, "rewards/rejected": -1.6060150861740112, "step": 700 }, { "epoch": 0.89, "learning_rate": 2.9140993194527286e-09, "logits/chosen": -3.1862854957580566, "logits/rejected": -3.0554537773132324, "logps/chosen": -303.1500244140625, "logps/rejected": -681.7243041992188, "loss": 0.4027, "rewards/accuracies": 1.0, "rewards/chosen": 0.6057373285293579, "rewards/margins": 3.0531206130981445, "rewards/rejected": -2.447383165359497, "step": 701 }, { "epoch": 0.89, "learning_rate": 2.8449727128417366e-09, "logits/chosen": -3.1896491050720215, "logits/rejected": -3.1270194053649902, "logps/chosen": -290.72308349609375, "logps/rejected": -957.2294311523438, "loss": 0.3329, "rewards/accuracies": 1.0, "rewards/chosen": 0.7721801996231079, "rewards/margins": 3.716622829437256, "rewards/rejected": -2.9444427490234375, "step": 702 }, { "epoch": 0.9, "learning_rate": 2.7766518555419394e-09, "logits/chosen": -3.2163023948669434, "logits/rejected": -3.015904188156128, "logps/chosen": -321.28143310546875, "logps/rejected": -1093.9732666015625, "loss": 0.3755, "rewards/accuracies": 1.0, "rewards/chosen": 0.6852676868438721, "rewards/margins": 4.464021682739258, "rewards/rejected": -3.7787537574768066, "step": 703 }, { "epoch": 0.9, "learning_rate": 2.709137914968268e-09, "logits/chosen": -3.2334790229797363, "logits/rejected": -3.0332934856414795, "logps/chosen": -276.35479736328125, "logps/rejected": -1034.2490234375, "loss": 0.3885, "rewards/accuracies": 1.0, "rewards/chosen": 0.6641929745674133, "rewards/margins": 3.5935065746307373, "rewards/rejected": -2.9293136596679688, "step": 704 }, { "epoch": 0.9, "learning_rate": 2.642432044747711e-09, "logits/chosen": -3.2320785522460938, "logits/rejected": -3.013669490814209, "logps/chosen": -274.4428405761719, "logps/rejected": -500.280517578125, "loss": 0.3581, "rewards/accuracies": 1.0, "rewards/chosen": 0.6112778186798096, "rewards/margins": 2.6145546436309814, "rewards/rejected": -2.003276824951172, "step": 705 }, { "epoch": 0.9, "learning_rate": 2.57653538469953e-09, "logits/chosen": -3.2305850982666016, "logits/rejected": -3.106910228729248, "logps/chosen": -272.8189392089844, "logps/rejected": -555.33447265625, "loss": 0.3713, "rewards/accuracies": 1.0, "rewards/chosen": 0.7180007696151733, "rewards/margins": 2.704519748687744, "rewards/rejected": -1.9865188598632812, "step": 706 }, { "epoch": 0.9, "learning_rate": 2.51144906081584e-09, "logits/chosen": -3.2110304832458496, "logits/rejected": -3.1262667179107666, "logps/chosen": -287.1530456542969, "logps/rejected": -516.3367919921875, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 0.5093635320663452, "rewards/margins": 2.477097272872925, "rewards/rejected": -1.9677338600158691, "step": 707 }, { "epoch": 0.9, "learning_rate": 2.4471741852423233e-09, "logits/chosen": -3.2305829524993896, "logits/rejected": -3.1171813011169434, "logps/chosen": -285.522216796875, "logps/rejected": -624.7633056640625, "loss": 0.3874, "rewards/accuracies": 1.0, "rewards/chosen": 0.7575653195381165, "rewards/margins": 2.5063796043395996, "rewards/rejected": -1.748814344406128, "step": 708 }, { "epoch": 0.9, "learning_rate": 2.3837118562592794e-09, "logits/chosen": -3.255368709564209, "logits/rejected": -3.24276065826416, "logps/chosen": -290.5984191894531, "logps/rejected": -612.07470703125, "loss": 0.4232, "rewards/accuracies": 1.0, "rewards/chosen": 0.8189681768417358, "rewards/margins": 2.8792786598205566, "rewards/rejected": -2.0603103637695312, "step": 709 }, { "epoch": 0.91, "learning_rate": 2.3210631582627928e-09, "logits/chosen": -3.145559549331665, "logits/rejected": -3.132385730743408, "logps/chosen": -279.2958068847656, "logps/rejected": -636.367431640625, "loss": 0.3922, "rewards/accuracies": 1.0, "rewards/chosen": 0.8283355832099915, "rewards/margins": 3.1568603515625, "rewards/rejected": -2.3285248279571533, "step": 710 }, { "epoch": 0.91, "learning_rate": 2.259229161746279e-09, "logits/chosen": -3.1485753059387207, "logits/rejected": -3.058232545852661, "logps/chosen": -345.04681396484375, "logps/rejected": -384.2386169433594, "loss": 0.4597, "rewards/accuracies": 1.0, "rewards/chosen": 0.6334244012832642, "rewards/margins": 1.6508164405822754, "rewards/rejected": -1.0173920392990112, "step": 711 }, { "epoch": 0.91, "learning_rate": 2.198210923282118e-09, "logits/chosen": -3.163756847381592, "logits/rejected": -3.06687593460083, "logps/chosen": -349.2109375, "logps/rejected": -769.3989868164062, "loss": 0.4091, "rewards/accuracies": 1.0, "rewards/chosen": 0.6343582272529602, "rewards/margins": 2.851637363433838, "rewards/rejected": -2.2172791957855225, "step": 712 }, { "epoch": 0.91, "learning_rate": 2.1380094855036614e-09, "logits/chosen": -3.1928811073303223, "logits/rejected": -3.090635061264038, "logps/chosen": -290.51788330078125, "logps/rejected": -735.7965698242188, "loss": 0.4715, "rewards/accuracies": 1.0, "rewards/chosen": 0.7065643668174744, "rewards/margins": 2.455777168273926, "rewards/rejected": -1.749212622642517, "step": 713 }, { "epoch": 0.91, "learning_rate": 2.0786258770873644e-09, "logits/chosen": -3.1876368522644043, "logits/rejected": -3.0704073905944824, "logps/chosen": -302.066650390625, "logps/rejected": -915.7626953125, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/chosen": 0.49805527925491333, "rewards/margins": 3.888134002685547, "rewards/rejected": -3.3900787830352783, "step": 714 }, { "epoch": 0.91, "learning_rate": 2.020061112735266e-09, "logits/chosen": -3.2656917572021484, "logits/rejected": -3.1112287044525146, "logps/chosen": -320.65625, "logps/rejected": -800.2109375, "loss": 0.4156, "rewards/accuracies": 1.0, "rewards/chosen": 0.7395523190498352, "rewards/margins": 3.4485886096954346, "rewards/rejected": -2.709036350250244, "step": 715 }, { "epoch": 0.91, "learning_rate": 1.9623161931575925e-09, "logits/chosen": -3.2395052909851074, "logits/rejected": -3.0168867111206055, "logps/chosen": -314.0843811035156, "logps/rejected": -1248.5517578125, "loss": 0.4105, "rewards/accuracies": 1.0, "rewards/chosen": 0.7494796514511108, "rewards/margins": 4.95749044418335, "rewards/rejected": -4.208010673522949, "step": 716 }, { "epoch": 0.91, "learning_rate": 1.905392105055703e-09, "logits/chosen": -3.128763198852539, "logits/rejected": -2.9955520629882812, "logps/chosen": -291.9544372558594, "logps/rejected": -1115.964599609375, "loss": 0.3985, "rewards/accuracies": 1.0, "rewards/chosen": 0.7385101318359375, "rewards/margins": 4.130877494812012, "rewards/rejected": -3.3923676013946533, "step": 717 }, { "epoch": 0.92, "learning_rate": 1.8492898211051989e-09, "logits/chosen": -3.191375970840454, "logits/rejected": -2.979489326477051, "logps/chosen": -269.20684814453125, "logps/rejected": -1172.8763427734375, "loss": 0.3595, "rewards/accuracies": 1.0, "rewards/chosen": 0.6256996393203735, "rewards/margins": 4.21213436126709, "rewards/rejected": -3.586435079574585, "step": 718 }, { "epoch": 0.92, "learning_rate": 1.7940102999393193e-09, "logits/chosen": -3.1853792667388916, "logits/rejected": -3.1820321083068848, "logps/chosen": -259.60296630859375, "logps/rejected": -503.8096923828125, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 0.5516158938407898, "rewards/margins": 2.0223495960235596, "rewards/rejected": -1.470733642578125, "step": 719 }, { "epoch": 0.92, "learning_rate": 1.7395544861325716e-09, "logits/chosen": -3.2024083137512207, "logits/rejected": -3.0915093421936035, "logps/chosen": -317.27899169921875, "logps/rejected": -1413.554931640625, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 0.6771957874298096, "rewards/margins": 5.1454758644104, "rewards/rejected": -4.468279838562012, "step": 720 }, { "epoch": 0.92, "learning_rate": 1.6859233101845506e-09, "logits/chosen": -3.120319366455078, "logits/rejected": -3.0753166675567627, "logps/chosen": -306.0942687988281, "logps/rejected": -886.9466552734375, "loss": 0.3821, "rewards/accuracies": 1.0, "rewards/chosen": 0.7180145382881165, "rewards/margins": 3.351296901702881, "rewards/rejected": -2.6332826614379883, "step": 721 }, { "epoch": 0.92, "learning_rate": 1.6331176885040876e-09, "logits/chosen": -3.1857662200927734, "logits/rejected": -3.0978641510009766, "logps/chosen": -347.28448486328125, "logps/rejected": -620.2137451171875, "loss": 0.466, "rewards/accuracies": 1.0, "rewards/chosen": 0.7213333249092102, "rewards/margins": 2.7959625720977783, "rewards/rejected": -2.074629306793213, "step": 722 }, { "epoch": 0.92, "learning_rate": 1.5811385233935548e-09, "logits/chosen": -3.2340736389160156, "logits/rejected": -3.1432278156280518, "logps/chosen": -291.7189025878906, "logps/rejected": -681.6392822265625, "loss": 0.3794, "rewards/accuracies": 1.0, "rewards/chosen": 0.6019119024276733, "rewards/margins": 2.9274673461914062, "rewards/rejected": -2.3255553245544434, "step": 723 }, { "epoch": 0.92, "learning_rate": 1.5299867030334812e-09, "logits/chosen": -3.2196555137634277, "logits/rejected": -3.0795955657958984, "logps/chosen": -268.80548095703125, "logps/rejected": -390.1972961425781, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 0.7011650204658508, "rewards/margins": 1.5963737964630127, "rewards/rejected": -0.8952087759971619, "step": 724 }, { "epoch": 0.92, "learning_rate": 1.4796631014673322e-09, "logits/chosen": -3.2224292755126953, "logits/rejected": -3.0713889598846436, "logps/chosen": -286.0936279296875, "logps/rejected": -306.79339599609375, "loss": 0.4165, "rewards/accuracies": 1.0, "rewards/chosen": 0.5686798095703125, "rewards/margins": 1.3943290710449219, "rewards/rejected": -0.8256492614746094, "step": 725 }, { "epoch": 0.93, "learning_rate": 1.4301685785866214e-09, "logits/chosen": -3.1685962677001953, "logits/rejected": -3.049055337905884, "logps/chosen": -378.30609130859375, "logps/rejected": -579.7233276367188, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 0.4987228512763977, "rewards/margins": 1.9863662719726562, "rewards/rejected": -1.4876434803009033, "step": 726 }, { "epoch": 0.93, "learning_rate": 1.3815039801161721e-09, "logits/chosen": -3.1636438369750977, "logits/rejected": -3.0560858249664307, "logps/chosen": -305.259033203125, "logps/rejected": -362.99066162109375, "loss": 0.4711, "rewards/accuracies": 1.0, "rewards/chosen": 0.5066254138946533, "rewards/margins": 1.545562744140625, "rewards/rejected": -1.0389374494552612, "step": 727 }, { "epoch": 0.93, "learning_rate": 1.3336701375997127e-09, "logits/chosen": -3.239227533340454, "logits/rejected": -3.0747621059417725, "logps/chosen": -316.3634948730469, "logps/rejected": -617.9470825195312, "loss": 0.457, "rewards/accuracies": 1.0, "rewards/chosen": 0.6519584655761719, "rewards/margins": 2.45833683013916, "rewards/rejected": -1.8063781261444092, "step": 728 }, { "epoch": 0.93, "learning_rate": 1.2866678683856268e-09, "logits/chosen": -3.2047009468078613, "logits/rejected": -3.1150074005126953, "logps/chosen": -299.3955078125, "logps/rejected": -995.9140014648438, "loss": 0.3791, "rewards/accuracies": 1.0, "rewards/chosen": 0.6497421264648438, "rewards/margins": 4.075563430786133, "rewards/rejected": -3.425820827484131, "step": 729 }, { "epoch": 0.93, "learning_rate": 1.240497975613014e-09, "logits/chosen": -3.2007150650024414, "logits/rejected": -3.0823116302490234, "logps/chosen": -352.0189514160156, "logps/rejected": -638.258056640625, "loss": 0.3977, "rewards/accuracies": 1.0, "rewards/chosen": 0.5283309817314148, "rewards/margins": 2.433955430984497, "rewards/rejected": -1.9056243896484375, "step": 730 }, { "epoch": 0.93, "learning_rate": 1.1951612481979567e-09, "logits/chosen": -3.1955575942993164, "logits/rejected": -3.170045852661133, "logps/chosen": -280.4822998046875, "logps/rejected": -943.0496826171875, "loss": 0.4086, "rewards/accuracies": 1.0, "rewards/chosen": 0.7454330325126648, "rewards/margins": 3.44016695022583, "rewards/rejected": -2.6947340965270996, "step": 731 }, { "epoch": 0.93, "learning_rate": 1.1506584608200365e-09, "logits/chosen": -3.2437222003936768, "logits/rejected": -3.108583688735962, "logps/chosen": -253.0540771484375, "logps/rejected": -578.02099609375, "loss": 0.4582, "rewards/accuracies": 1.0, "rewards/chosen": 0.622423529624939, "rewards/margins": 2.6013665199279785, "rewards/rejected": -1.97894287109375, "step": 732 }, { "epoch": 0.93, "learning_rate": 1.1069903739091002e-09, "logits/chosen": -3.1967506408691406, "logits/rejected": -2.974410057067871, "logps/chosen": -269.8782958984375, "logps/rejected": -1815.54052734375, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": 0.7586196660995483, "rewards/margins": 7.275825500488281, "rewards/rejected": -6.517206192016602, "step": 733 }, { "epoch": 0.94, "learning_rate": 1.064157733632276e-09, "logits/chosen": -3.2173094749450684, "logits/rejected": -3.0900630950927734, "logps/chosen": -267.27392578125, "logps/rejected": -516.103759765625, "loss": 0.3778, "rewards/accuracies": 1.0, "rewards/chosen": 0.4388381838798523, "rewards/margins": 1.917170763015747, "rewards/rejected": -1.47833251953125, "step": 734 }, { "epoch": 0.94, "learning_rate": 1.0221612718812e-09, "logits/chosen": -3.223905563354492, "logits/rejected": -3.030141830444336, "logps/chosen": -312.45672607421875, "logps/rejected": -493.65679931640625, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": 0.6388046741485596, "rewards/margins": 2.0924010276794434, "rewards/rejected": -1.4535964727401733, "step": 735 }, { "epoch": 0.94, "learning_rate": 9.81001706259532e-10, "logits/chosen": -3.1747093200683594, "logits/rejected": -3.15254807472229, "logps/chosen": -329.68743896484375, "logps/rejected": -1226.9739990234375, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 0.5114639401435852, "rewards/margins": 5.2485032081604, "rewards/rejected": -4.737039089202881, "step": 736 }, { "epoch": 0.94, "learning_rate": 9.40679740070688e-10, "logits/chosen": -3.223698139190674, "logits/rejected": -3.127666473388672, "logps/chosen": -306.9605712890625, "logps/rejected": -694.6029663085938, "loss": 0.406, "rewards/accuracies": 1.0, "rewards/chosen": 0.67449951171875, "rewards/margins": 2.765847682952881, "rewards/rejected": -2.091348171234131, "step": 737 }, { "epoch": 0.94, "learning_rate": 9.011960623058201e-10, "logits/chosen": -3.1586294174194336, "logits/rejected": -3.0917553901672363, "logps/chosen": -324.0772705078125, "logps/rejected": -926.000244140625, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 0.7590347528457642, "rewards/margins": 3.369035243988037, "rewards/rejected": -2.6100006103515625, "step": 738 }, { "epoch": 0.94, "learning_rate": 8.625513476320289e-10, "logits/chosen": -3.226217746734619, "logits/rejected": -3.079533576965332, "logps/chosen": -307.0830383300781, "logps/rejected": -815.183349609375, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 0.7472305297851562, "rewards/margins": 3.329831123352051, "rewards/rejected": -2.5826005935668945, "step": 739 }, { "epoch": 0.94, "learning_rate": 8.247462563808816e-10, "logits/chosen": -3.198183059692383, "logits/rejected": -3.1778945922851562, "logps/chosen": -326.77197265625, "logps/rejected": -874.4058837890625, "loss": 0.3941, "rewards/accuracies": 1.0, "rewards/chosen": 0.7213531732559204, "rewards/margins": 3.8611602783203125, "rewards/rejected": -3.1398072242736816, "step": 740 }, { "epoch": 0.94, "learning_rate": 7.877814345370715e-10, "logits/chosen": -3.1232688426971436, "logits/rejected": -2.9919917583465576, "logps/chosen": -276.6590576171875, "logps/rejected": -2012.9337158203125, "loss": 0.4311, "rewards/accuracies": 1.0, "rewards/chosen": 0.6873466372489929, "rewards/margins": 7.023268699645996, "rewards/rejected": -6.3359222412109375, "step": 741 }, { "epoch": 0.95, "learning_rate": 7.516575137274162e-10, "logits/chosen": -3.1711294651031494, "logits/rejected": -3.1236729621887207, "logps/chosen": -273.9808044433594, "logps/rejected": -838.2468872070312, "loss": 0.3691, "rewards/accuracies": 1.0, "rewards/chosen": 0.44710084795951843, "rewards/margins": 3.237539768218994, "rewards/rejected": -2.7904388904571533, "step": 742 }, { "epoch": 0.95, "learning_rate": 7.163751112100435e-10, "logits/chosen": -3.230532646179199, "logits/rejected": -3.108858108520508, "logps/chosen": -310.24945068359375, "logps/rejected": -453.00518798828125, "loss": 0.4254, "rewards/accuracies": 1.0, "rewards/chosen": 0.5498703122138977, "rewards/margins": 2.1461288928985596, "rewards/rejected": -1.5962586402893066, "step": 743 }, { "epoch": 0.95, "learning_rate": 6.819348298638839e-10, "logits/chosen": -3.199756622314453, "logits/rejected": -3.059999942779541, "logps/chosen": -333.8587951660156, "logps/rejected": -1346.556640625, "loss": 0.393, "rewards/accuracies": 1.0, "rewards/chosen": 0.6002441644668579, "rewards/margins": 5.455053806304932, "rewards/rejected": -4.854809761047363, "step": 744 }, { "epoch": 0.95, "learning_rate": 6.483372581783054e-10, "logits/chosen": -3.1975765228271484, "logits/rejected": -3.061234951019287, "logps/chosen": -292.7640380859375, "logps/rejected": -914.1372680664062, "loss": 0.3393, "rewards/accuracies": 1.0, "rewards/chosen": 0.8583008050918579, "rewards/margins": 3.8595457077026367, "rewards/rejected": -3.0012450218200684, "step": 745 }, { "epoch": 0.95, "learning_rate": 6.15582970243117e-10, "logits/chosen": -3.2178993225097656, "logits/rejected": -3.1198439598083496, "logps/chosen": -289.4781799316406, "logps/rejected": -853.135009765625, "loss": 0.3836, "rewards/accuracies": 1.0, "rewards/chosen": 0.6307388544082642, "rewards/margins": 3.1878037452697754, "rewards/rejected": -2.5570647716522217, "step": 746 }, { "epoch": 0.95, "learning_rate": 5.83672525738721e-10, "logits/chosen": -3.186641216278076, "logits/rejected": -3.0900301933288574, "logps/chosen": -298.91156005859375, "logps/rejected": -568.1196899414062, "loss": 0.4106, "rewards/accuracies": 1.0, "rewards/chosen": 0.655834972858429, "rewards/margins": 2.5903947353363037, "rewards/rejected": -1.93455970287323, "step": 747 }, { "epoch": 0.95, "learning_rate": 5.526064699265753e-10, "logits/chosen": -3.1814985275268555, "logits/rejected": -3.0743982791900635, "logps/chosen": -291.9840087890625, "logps/rejected": -904.8698120117188, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 0.6595489382743835, "rewards/margins": 3.10880446434021, "rewards/rejected": -2.4492554664611816, "step": 748 }, { "epoch": 0.95, "learning_rate": 5.223853336398632e-10, "logits/chosen": -3.215315580368042, "logits/rejected": -3.087337017059326, "logps/chosen": -331.27001953125, "logps/rejected": -890.446044921875, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 0.6234802007675171, "rewards/margins": 3.145442247390747, "rewards/rejected": -2.5219619274139404, "step": 749 }, { "epoch": 0.96, "learning_rate": 4.930096332744105e-10, "logits/chosen": -3.268667221069336, "logits/rejected": -2.9527018070220947, "logps/chosen": -290.3645935058594, "logps/rejected": -1473.3505859375, "loss": 0.3515, "rewards/accuracies": 1.0, "rewards/chosen": 0.5004104375839233, "rewards/margins": 5.650190830230713, "rewards/rejected": -5.1497802734375, "step": 750 }, { "epoch": 0.96, "learning_rate": 4.644798707798936e-10, "logits/chosen": -3.1844069957733154, "logits/rejected": -3.08846378326416, "logps/chosen": -289.44769287109375, "logps/rejected": -584.5943603515625, "loss": 0.38, "rewards/accuracies": 1.0, "rewards/chosen": 0.7022247314453125, "rewards/margins": 2.627851963043213, "rewards/rejected": -1.9256271123886108, "step": 751 }, { "epoch": 0.96, "learning_rate": 4.3679653365124024e-10, "logits/chosen": -3.2341179847717285, "logits/rejected": -3.0959038734436035, "logps/chosen": -302.2899475097656, "logps/rejected": -764.8199462890625, "loss": 0.3609, "rewards/accuracies": 1.0, "rewards/chosen": 0.7071167230606079, "rewards/margins": 3.054068088531494, "rewards/rejected": -2.346951484680176, "step": 752 }, { "epoch": 0.96, "learning_rate": 4.0996009492029195e-10, "logits/chosen": -3.2016520500183105, "logits/rejected": -3.1903700828552246, "logps/chosen": -320.98736572265625, "logps/rejected": -1045.232421875, "loss": 0.3883, "rewards/accuracies": 1.0, "rewards/chosen": 0.6153976321220398, "rewards/margins": 4.4324846267700195, "rewards/rejected": -3.817086696624756, "step": 753 }, { "epoch": 0.96, "learning_rate": 3.8397101314774914e-10, "logits/chosen": -3.193345785140991, "logits/rejected": -2.9245657920837402, "logps/chosen": -350.3182678222656, "logps/rejected": -1830.7747802734375, "loss": 0.3939, "rewards/accuracies": 1.0, "rewards/chosen": 0.9119583368301392, "rewards/margins": 6.988306999206543, "rewards/rejected": -6.076348781585693, "step": 754 }, { "epoch": 0.96, "learning_rate": 3.588297324153056e-10, "logits/chosen": -3.242316246032715, "logits/rejected": -3.064382553100586, "logps/chosen": -277.2093811035156, "logps/rejected": -625.5988159179688, "loss": 0.4189, "rewards/accuracies": 1.0, "rewards/chosen": 0.6834198236465454, "rewards/margins": 2.6008315086364746, "rewards/rejected": -1.9174118041992188, "step": 755 }, { "epoch": 0.96, "learning_rate": 3.345366823180928e-10, "logits/chosen": -3.1976585388183594, "logits/rejected": -3.2019314765930176, "logps/chosen": -321.23614501953125, "logps/rejected": -692.27880859375, "loss": 0.3534, "rewards/accuracies": 1.0, "rewards/chosen": 0.525738537311554, "rewards/margins": 3.207305908203125, "rewards/rejected": -2.681567430496216, "step": 756 }, { "epoch": 0.96, "learning_rate": 3.110922779573033e-10, "logits/chosen": -3.25972318649292, "logits/rejected": -2.8794760704040527, "logps/chosen": -304.11083984375, "logps/rejected": -1401.837890625, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": 0.4956115782260895, "rewards/margins": 4.570953369140625, "rewards/rejected": -4.075341701507568, "step": 757 }, { "epoch": 0.97, "learning_rate": 2.8849691993311776e-10, "logits/chosen": -3.2232017517089844, "logits/rejected": -3.1720387935638428, "logps/chosen": -276.08599853515625, "logps/rejected": -936.2028198242188, "loss": 0.4037, "rewards/accuracies": 1.0, "rewards/chosen": 0.6930389404296875, "rewards/margins": 3.8312530517578125, "rewards/rejected": -3.138214111328125, "step": 758 }, { "epoch": 0.97, "learning_rate": 2.667509943378721e-10, "logits/chosen": -3.150697946548462, "logits/rejected": -3.0973734855651855, "logps/chosen": -265.6333923339844, "logps/rejected": -595.3242797851562, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 0.6727607846260071, "rewards/margins": 2.2459709644317627, "rewards/rejected": -1.5732102394104004, "step": 759 }, { "epoch": 0.97, "learning_rate": 2.4585487274942915e-10, "logits/chosen": -3.1693992614746094, "logits/rejected": -3.1101512908935547, "logps/chosen": -313.1036682128906, "logps/rejected": -874.0783081054688, "loss": 0.4294, "rewards/accuracies": 1.0, "rewards/chosen": 0.6152847409248352, "rewards/margins": 3.1677322387695312, "rewards/rejected": -2.552447557449341, "step": 760 }, { "epoch": 0.97, "learning_rate": 2.2580891222485632e-10, "logits/chosen": -3.2151947021484375, "logits/rejected": -3.108241081237793, "logps/chosen": -296.72760009765625, "logps/rejected": -1057.76123046875, "loss": 0.3603, "rewards/accuracies": 1.0, "rewards/chosen": 0.559924304485321, "rewards/margins": 4.202589511871338, "rewards/rejected": -3.642665147781372, "step": 761 }, { "epoch": 0.97, "learning_rate": 2.0661345529430774e-10, "logits/chosen": -3.1744604110717773, "logits/rejected": -3.1444530487060547, "logps/chosen": -277.7886657714844, "logps/rejected": -587.2330322265625, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 0.6266372799873352, "rewards/margins": 2.6373324394226074, "rewards/rejected": -2.010694980621338, "step": 762 }, { "epoch": 0.97, "learning_rate": 1.8826882995517934e-10, "logits/chosen": -3.254056930541992, "logits/rejected": -2.995344400405884, "logps/chosen": -291.30096435546875, "logps/rejected": -2078.12255859375, "loss": 0.38, "rewards/accuracies": 1.0, "rewards/chosen": 0.6462463140487671, "rewards/margins": 8.317098617553711, "rewards/rejected": -7.6708526611328125, "step": 763 }, { "epoch": 0.97, "learning_rate": 1.7077534966650763e-10, "logits/chosen": -3.166578769683838, "logits/rejected": -3.0770671367645264, "logps/chosen": -306.1241455078125, "logps/rejected": -433.53668212890625, "loss": 0.4308, "rewards/accuracies": 1.0, "rewards/chosen": 0.7256454229354858, "rewards/margins": 2.038848876953125, "rewards/rejected": -1.3132034540176392, "step": 764 }, { "epoch": 0.98, "learning_rate": 1.541333133436018e-10, "logits/chosen": -3.161050796508789, "logits/rejected": -3.132455587387085, "logps/chosen": -335.01239013671875, "logps/rejected": -814.6708984375, "loss": 0.3997, "rewards/accuracies": 1.0, "rewards/chosen": 0.7254928350448608, "rewards/margins": 4.0282793045043945, "rewards/rejected": -3.302786350250244, "step": 765 }, { "epoch": 0.98, "learning_rate": 1.3834300535294218e-10, "logits/chosen": -3.249178886413574, "logits/rejected": -3.210875988006592, "logps/chosen": -311.4815673828125, "logps/rejected": -604.221923828125, "loss": 0.4304, "rewards/accuracies": 1.0, "rewards/chosen": 0.6562973260879517, "rewards/margins": 2.6236190795898438, "rewards/rejected": -1.9673218727111816, "step": 766 }, { "epoch": 0.98, "learning_rate": 1.2340469550733423e-10, "logits/chosen": -3.2018702030181885, "logits/rejected": -3.0305399894714355, "logps/chosen": -343.4668273925781, "logps/rejected": -1450.14892578125, "loss": 0.3689, "rewards/accuracies": 1.0, "rewards/chosen": 0.5559600591659546, "rewards/margins": 4.688418388366699, "rewards/rejected": -4.132458686828613, "step": 767 }, { "epoch": 0.98, "learning_rate": 1.0931863906127325e-10, "logits/chosen": -3.1557273864746094, "logits/rejected": -3.0095226764678955, "logps/chosen": -313.0769958496094, "logps/rejected": -1637.114990234375, "loss": 0.3763, "rewards/accuracies": 1.0, "rewards/chosen": 0.7741928100585938, "rewards/margins": 6.006004333496094, "rewards/rejected": -5.2318115234375, "step": 768 }, { "epoch": 0.98, "learning_rate": 9.608507670659238e-11, "logits/chosen": -3.221951961517334, "logits/rejected": -3.019785165786743, "logps/chosen": -296.75164794921875, "logps/rejected": -495.6778869628906, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 0.9292892813682556, "rewards/margins": 2.4586455821990967, "rewards/rejected": -1.5293563604354858, "step": 769 }, { "epoch": 0.98, "learning_rate": 8.370423456837139e-11, "logits/chosen": -3.235274314880371, "logits/rejected": -3.0252840518951416, "logps/chosen": -311.74188232421875, "logps/rejected": -1291.3594970703125, "loss": 0.3514, "rewards/accuracies": 1.0, "rewards/chosen": 0.8289031982421875, "rewards/margins": 4.814780235290527, "rewards/rejected": -3.9858765602111816, "step": 770 }, { "epoch": 0.98, "learning_rate": 7.217632420102871e-11, "logits/chosen": -3.2101941108703613, "logits/rejected": -3.006812572479248, "logps/chosen": -308.04266357421875, "logps/rejected": -1610.805908203125, "loss": 0.3706, "rewards/accuracies": 1.0, "rewards/chosen": 0.7157729864120483, "rewards/margins": 6.616945266723633, "rewards/rejected": -5.901171684265137, "step": 771 }, { "epoch": 0.98, "learning_rate": 6.150154258476314e-11, "logits/chosen": -3.2556138038635254, "logits/rejected": -3.082338809967041, "logps/chosen": -301.9775390625, "logps/rejected": -1602.870849609375, "loss": 0.3812, "rewards/accuracies": 1.0, "rewards/chosen": 0.6454574465751648, "rewards/margins": 6.705345153808594, "rewards/rejected": -6.059887886047363, "step": 772 }, { "epoch": 0.99, "learning_rate": 5.168007212212333e-11, "logits/chosen": -3.141261100769043, "logits/rejected": -3.221561908721924, "logps/chosen": -316.3329772949219, "logps/rejected": -656.3191528320312, "loss": 0.3803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8180221319198608, "rewards/margins": 2.983149766921997, "rewards/rejected": -2.165127754211426, "step": 773 }, { "epoch": 0.99, "learning_rate": 4.271208063494902e-11, "logits/chosen": -3.191789150238037, "logits/rejected": -2.9860124588012695, "logps/chosen": -319.43768310546875, "logps/rejected": -1667.4658203125, "loss": 0.3938, "rewards/accuracies": 1.0, "rewards/chosen": 0.5426498651504517, "rewards/margins": 6.958763122558594, "rewards/rejected": -6.416113376617432, "step": 774 }, { "epoch": 0.99, "learning_rate": 3.459772136146788e-11, "logits/chosen": -3.2168076038360596, "logits/rejected": -3.121039867401123, "logps/chosen": -332.55670166015625, "logps/rejected": -595.037841796875, "loss": 0.4473, "rewards/accuracies": 1.0, "rewards/chosen": 0.7808471918106079, "rewards/margins": 2.571293830871582, "rewards/rejected": -1.7904465198516846, "step": 775 }, { "epoch": 0.99, "learning_rate": 2.733713295369755e-11, "logits/chosen": -3.2130374908447266, "logits/rejected": -3.1135754585266113, "logps/chosen": -334.0561218261719, "logps/rejected": -513.86474609375, "loss": 0.4753, "rewards/accuracies": 1.0, "rewards/chosen": 0.6648727655410767, "rewards/margins": 2.0167970657348633, "rewards/rejected": -1.351924180984497, "step": 776 }, { "epoch": 0.99, "learning_rate": 2.093043947505868e-11, "logits/chosen": -3.1713690757751465, "logits/rejected": -3.059943199157715, "logps/chosen": -298.05596923828125, "logps/rejected": -469.1112976074219, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": 0.6881790161132812, "rewards/margins": 2.0587539672851562, "rewards/rejected": -1.370574951171875, "step": 777 }, { "epoch": 0.99, "learning_rate": 1.53777503982655e-11, "logits/chosen": -3.194476366043091, "logits/rejected": -3.084303379058838, "logps/chosen": -322.5533447265625, "logps/rejected": -1018.46240234375, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 0.525158703327179, "rewards/margins": 4.292025566101074, "rewards/rejected": -3.766867160797119, "step": 778 }, { "epoch": 0.99, "learning_rate": 1.0679160603449533e-11, "logits/chosen": -3.187591552734375, "logits/rejected": -3.0368316173553467, "logps/chosen": -321.7261962890625, "logps/rejected": -849.2809448242188, "loss": 0.3658, "rewards/accuracies": 1.0, "rewards/chosen": 0.7079070806503296, "rewards/margins": 3.025784492492676, "rewards/rejected": -2.3178772926330566, "step": 779 }, { "epoch": 0.99, "learning_rate": 6.834750376549792e-12, "logits/chosen": -3.185743570327759, "logits/rejected": -3.0629444122314453, "logps/chosen": -269.47332763671875, "logps/rejected": -907.3726196289062, "loss": 0.3985, "rewards/accuracies": 1.0, "rewards/chosen": 0.6079849004745483, "rewards/margins": 3.702078342437744, "rewards/rejected": -3.0940933227539062, "step": 780 }, { "epoch": 1.0, "learning_rate": 3.844585407936085e-12, "logits/chosen": -3.1324658393859863, "logits/rejected": -3.0577239990234375, "logps/chosen": -274.8044128417969, "logps/rejected": -929.41796875, "loss": 0.3405, "rewards/accuracies": 1.0, "rewards/chosen": 0.670996904373169, "rewards/margins": 3.5558815002441406, "rewards/rejected": -2.8848845958709717, "step": 781 }, { "epoch": 1.0, "learning_rate": 1.7087167912710475e-12, "logits/chosen": -3.167895793914795, "logits/rejected": -2.8354835510253906, "logps/chosen": -286.9731140136719, "logps/rejected": -2454.48779296875, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 0.585645318031311, "rewards/margins": 7.92957878112793, "rewards/rejected": -7.34393310546875, "step": 782 }, { "epoch": 1.0, "learning_rate": 4.271810226552652e-13, "logits/chosen": -3.1862082481384277, "logits/rejected": -3.0197579860687256, "logps/chosen": -307.32525634765625, "logps/rejected": -888.2658081054688, "loss": 0.3817, "rewards/accuracies": 1.0, "rewards/chosen": 0.47113344073295593, "rewards/margins": 3.056584358215332, "rewards/rejected": -2.5854508876800537, "step": 783 }, { "epoch": 1.0, "learning_rate": 0.0, "logits/chosen": -3.2104344367980957, "logits/rejected": -2.9568209648132324, "logps/chosen": -290.27178955078125, "logps/rejected": -1015.5006103515625, "loss": 0.3955, "rewards/accuracies": 1.0, "rewards/chosen": 0.7462204098701477, "rewards/margins": 4.146245002746582, "rewards/rejected": -3.400024652481079, "step": 784 }, { "epoch": 1.0, "step": 784, "total_flos": 0.0, "train_loss": 0.4745224376722258, "train_runtime": 1855.6751, "train_samples_per_second": 3.382, "train_steps_per_second": 0.422 } ], "logging_steps": 1.0, "max_steps": 784, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }