{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994495412844037, "eval_steps": 500, "global_step": 408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014678899082568808, "grad_norm": 11.81737232208252, "learning_rate": 2.439024390243903e-07, "logits/chosen": -0.9879676103591919, "logits/rejected": -1.9993298053741455, "logps/chosen": -269.27239990234375, "logps/rejected": -186.47621154785156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.029357798165137616, "grad_norm": 11.950206756591797, "learning_rate": 4.878048780487805e-07, "logits/chosen": -1.0342975854873657, "logits/rejected": -1.9880424737930298, "logps/chosen": -290.81072998046875, "logps/rejected": -204.50514221191406, "loss": 0.712, "rewards/accuracies": 0.5, "rewards/chosen": -0.009406615048646927, "rewards/margins": 0.020395996049046516, "rewards/rejected": -0.029802614822983742, "step": 4 }, { "epoch": 0.044036697247706424, "grad_norm": 11.719453811645508, "learning_rate": 7.317073170731707e-07, "logits/chosen": -1.1187832355499268, "logits/rejected": -2.125272750854492, "logps/chosen": -295.85894775390625, "logps/rejected": -203.1645050048828, "loss": 0.6642, "rewards/accuracies": 0.625, "rewards/chosen": 0.07618961483240128, "rewards/margins": 0.09595101326704025, "rewards/rejected": -0.01976138912141323, "step": 6 }, { "epoch": 0.05871559633027523, "grad_norm": 12.171574592590332, "learning_rate": 9.75609756097561e-07, "logits/chosen": -1.250899076461792, "logits/rejected": -2.1083037853240967, "logps/chosen": -252.51145935058594, "logps/rejected": -164.40138244628906, "loss": 0.7179, "rewards/accuracies": 0.53125, "rewards/chosen": 0.044141992926597595, "rewards/margins": 0.003628704696893692, "rewards/rejected": 0.040513284504413605, "step": 8 }, { "epoch": 0.07339449541284404, "grad_norm": 13.125951766967773, "learning_rate": 1.2195121951219514e-06, "logits/chosen": -1.0734999179840088, "logits/rejected": -2.204047679901123, "logps/chosen": -306.6387939453125, "logps/rejected": -158.70912170410156, "loss": 0.7397, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0127907395362854, "rewards/margins": -0.031406134366989136, "rewards/rejected": 0.01861538738012314, "step": 10 }, { "epoch": 0.08807339449541285, "grad_norm": 15.099513053894043, "learning_rate": 1.4634146341463414e-06, "logits/chosen": -1.0108157396316528, "logits/rejected": -1.977769374847412, "logps/chosen": -344.31402587890625, "logps/rejected": -223.3643798828125, "loss": 0.7584, "rewards/accuracies": 0.40625, "rewards/chosen": -0.06078364700078964, "rewards/margins": -0.0867958813905716, "rewards/rejected": 0.026012245565652847, "step": 12 }, { "epoch": 0.10275229357798166, "grad_norm": 13.785890579223633, "learning_rate": 1.707317073170732e-06, "logits/chosen": -0.9762290120124817, "logits/rejected": -1.9721505641937256, "logps/chosen": -259.58258056640625, "logps/rejected": -167.8755645751953, "loss": 0.7235, "rewards/accuracies": 0.46875, "rewards/chosen": 0.023126909509301186, "rewards/margins": -0.02050386182963848, "rewards/rejected": 0.043630778789520264, "step": 14 }, { "epoch": 0.11743119266055047, "grad_norm": 13.014513969421387, "learning_rate": 1.951219512195122e-06, "logits/chosen": -1.1472342014312744, "logits/rejected": -2.0296616554260254, "logps/chosen": -269.7952575683594, "logps/rejected": -186.65452575683594, "loss": 0.7405, "rewards/accuracies": 0.453125, "rewards/chosen": 0.004093457013368607, "rewards/margins": -0.044701721519231796, "rewards/rejected": 0.0487951785326004, "step": 16 }, { "epoch": 0.13211009174311927, "grad_norm": 12.20093059539795, "learning_rate": 2.1951219512195125e-06, "logits/chosen": -1.0266412496566772, "logits/rejected": -2.0891737937927246, "logps/chosen": -313.8085021972656, "logps/rejected": -197.85943603515625, "loss": 0.6731, "rewards/accuracies": 0.609375, "rewards/chosen": 0.09984610974788666, "rewards/margins": 0.0959281176328659, "rewards/rejected": 0.003917992115020752, "step": 18 }, { "epoch": 0.14678899082568808, "grad_norm": 12.344905853271484, "learning_rate": 2.4390243902439027e-06, "logits/chosen": -1.0662198066711426, "logits/rejected": -2.0889832973480225, "logps/chosen": -308.8189697265625, "logps/rejected": -156.6934814453125, "loss": 0.6784, "rewards/accuracies": 0.546875, "rewards/chosen": 0.08540838956832886, "rewards/margins": 0.07292439043521881, "rewards/rejected": 0.012484000064432621, "step": 20 }, { "epoch": 0.1614678899082569, "grad_norm": 11.898660659790039, "learning_rate": 2.682926829268293e-06, "logits/chosen": -1.2143007516860962, "logits/rejected": -2.262324571609497, "logps/chosen": -298.7814636230469, "logps/rejected": -186.76119995117188, "loss": 0.6781, "rewards/accuracies": 0.515625, "rewards/chosen": 0.08213196694850922, "rewards/margins": 0.08361663669347763, "rewards/rejected": -0.0014846734702587128, "step": 22 }, { "epoch": 0.1761467889908257, "grad_norm": 13.68064022064209, "learning_rate": 2.926829268292683e-06, "logits/chosen": -1.0233314037322998, "logits/rejected": -2.1899986267089844, "logps/chosen": -370.8209228515625, "logps/rejected": -156.96270751953125, "loss": 0.7306, "rewards/accuracies": 0.515625, "rewards/chosen": -0.007536953315138817, "rewards/margins": -0.009052609093487263, "rewards/rejected": 0.0015156615991145372, "step": 24 }, { "epoch": 0.1908256880733945, "grad_norm": 12.661199569702148, "learning_rate": 3.1707317073170736e-06, "logits/chosen": -1.2463735342025757, "logits/rejected": -2.1673622131347656, "logps/chosen": -326.9246520996094, "logps/rejected": -182.17701721191406, "loss": 0.7175, "rewards/accuracies": 0.546875, "rewards/chosen": -0.010405808687210083, "rewards/margins": 0.007835682481527328, "rewards/rejected": -0.01824149303138256, "step": 26 }, { "epoch": 0.20550458715596331, "grad_norm": 11.61974811553955, "learning_rate": 3.414634146341464e-06, "logits/chosen": -1.1716669797897339, "logits/rejected": -2.2106716632843018, "logps/chosen": -284.443603515625, "logps/rejected": -165.102783203125, "loss": 0.7409, "rewards/accuracies": 0.546875, "rewards/chosen": 0.007994448766112328, "rewards/margins": -0.035433441400527954, "rewards/rejected": 0.04342789575457573, "step": 28 }, { "epoch": 0.22018348623853212, "grad_norm": 10.777989387512207, "learning_rate": 3.6585365853658537e-06, "logits/chosen": -1.0662914514541626, "logits/rejected": -2.1156551837921143, "logps/chosen": -289.4057922363281, "logps/rejected": -197.46649169921875, "loss": 0.6371, "rewards/accuracies": 0.609375, "rewards/chosen": 0.1247626319527626, "rewards/margins": 0.15936096012592316, "rewards/rejected": -0.03459831699728966, "step": 30 }, { "epoch": 0.23486238532110093, "grad_norm": 12.190910339355469, "learning_rate": 3.902439024390244e-06, "logits/chosen": -1.1755316257476807, "logits/rejected": -2.1449058055877686, "logps/chosen": -288.5774841308594, "logps/rejected": -163.59588623046875, "loss": 0.6733, "rewards/accuracies": 0.546875, "rewards/chosen": 0.09892146289348602, "rewards/margins": 0.08179756253957748, "rewards/rejected": 0.01712390035390854, "step": 32 }, { "epoch": 0.24954128440366974, "grad_norm": 13.154803276062012, "learning_rate": 4.146341463414634e-06, "logits/chosen": -1.1496777534484863, "logits/rejected": -2.2045750617980957, "logps/chosen": -324.6558837890625, "logps/rejected": -164.45327758789062, "loss": 0.6478, "rewards/accuracies": 0.640625, "rewards/chosen": 0.08885271847248077, "rewards/margins": 0.1558375358581543, "rewards/rejected": -0.06698483228683472, "step": 34 }, { "epoch": 0.26422018348623855, "grad_norm": 11.07314682006836, "learning_rate": 4.390243902439025e-06, "logits/chosen": -1.1677134037017822, "logits/rejected": -2.0850350856781006, "logps/chosen": -311.2884216308594, "logps/rejected": -204.43142700195312, "loss": 0.6193, "rewards/accuracies": 0.65625, "rewards/chosen": 0.20451843738555908, "rewards/margins": 0.22681473195552826, "rewards/rejected": -0.02229629084467888, "step": 36 }, { "epoch": 0.27889908256880735, "grad_norm": 12.431696891784668, "learning_rate": 4.634146341463416e-06, "logits/chosen": -1.195428729057312, "logits/rejected": -2.197521686553955, "logps/chosen": -294.04962158203125, "logps/rejected": -200.2810516357422, "loss": 0.6598, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15507417917251587, "rewards/margins": 0.11412172019481659, "rewards/rejected": 0.04095245152711868, "step": 38 }, { "epoch": 0.29357798165137616, "grad_norm": 11.575589179992676, "learning_rate": 4.8780487804878055e-06, "logits/chosen": -1.0411652326583862, "logits/rejected": -2.03951096534729, "logps/chosen": -345.9762268066406, "logps/rejected": -181.34144592285156, "loss": 0.6186, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14570173621177673, "rewards/margins": 0.20660607516765594, "rewards/rejected": -0.06090431660413742, "step": 40 }, { "epoch": 0.30825688073394497, "grad_norm": 12.716486930847168, "learning_rate": 4.999908404322799e-06, "logits/chosen": -1.0371800661087036, "logits/rejected": -2.2317895889282227, "logps/chosen": -319.42755126953125, "logps/rejected": -172.60479736328125, "loss": 0.6248, "rewards/accuracies": 0.640625, "rewards/chosen": 0.13961191475391388, "rewards/margins": 0.19473902881145477, "rewards/rejected": -0.055127132683992386, "step": 42 }, { "epoch": 0.3229357798165138, "grad_norm": 10.400399208068848, "learning_rate": 4.999175679175577e-06, "logits/chosen": -1.1097325086593628, "logits/rejected": -2.1328647136688232, "logps/chosen": -251.92745971679688, "logps/rejected": -161.21292114257812, "loss": 0.5849, "rewards/accuracies": 0.765625, "rewards/chosen": 0.25156256556510925, "rewards/margins": 0.2756442427635193, "rewards/rejected": -0.02408166043460369, "step": 44 }, { "epoch": 0.3376146788990826, "grad_norm": 11.084893226623535, "learning_rate": 4.997710443643461e-06, "logits/chosen": -1.1712064743041992, "logits/rejected": -2.0722293853759766, "logps/chosen": -259.9323425292969, "logps/rejected": -206.37510681152344, "loss": 0.6109, "rewards/accuracies": 0.65625, "rewards/chosen": 0.22726279497146606, "rewards/margins": 0.24400296807289124, "rewards/rejected": -0.016740169376134872, "step": 46 }, { "epoch": 0.3522935779816514, "grad_norm": 13.230236053466797, "learning_rate": 4.995513127188151e-06, "logits/chosen": -1.0816175937652588, "logits/rejected": -2.215028762817383, "logps/chosen": -365.7675476074219, "logps/rejected": -183.13980102539062, "loss": 0.5456, "rewards/accuracies": 0.75, "rewards/chosen": 0.34264349937438965, "rewards/margins": 0.38213008642196655, "rewards/rejected": -0.03948655351996422, "step": 48 }, { "epoch": 0.3669724770642202, "grad_norm": 11.37851333618164, "learning_rate": 4.992584373844853e-06, "logits/chosen": -1.2096611261367798, "logits/rejected": -2.082951784133911, "logps/chosen": -345.7232971191406, "logps/rejected": -184.25949096679688, "loss": 0.5091, "rewards/accuracies": 0.8125, "rewards/chosen": 0.413723886013031, "rewards/margins": 0.4999043643474579, "rewards/rejected": -0.08618048578500748, "step": 50 }, { "epoch": 0.381651376146789, "grad_norm": 9.676469802856445, "learning_rate": 4.98892504203351e-06, "logits/chosen": -1.2248896360397339, "logits/rejected": -2.1341745853424072, "logps/chosen": -282.0457763671875, "logps/rejected": -158.89736938476562, "loss": 0.501, "rewards/accuracies": 0.921875, "rewards/chosen": 0.42736518383026123, "rewards/margins": 0.5060732960700989, "rewards/rejected": -0.07870808988809586, "step": 52 }, { "epoch": 0.3963302752293578, "grad_norm": 9.402766227722168, "learning_rate": 4.9845362043071925e-06, "logits/chosen": -1.0192848443984985, "logits/rejected": -2.0682382583618164, "logps/chosen": -290.6011962890625, "logps/rejected": -163.6627197265625, "loss": 0.4541, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5030941963195801, "rewards/margins": 0.6462306380271912, "rewards/rejected": -0.14313644170761108, "step": 54 }, { "epoch": 0.41100917431192663, "grad_norm": 10.944356918334961, "learning_rate": 4.97941914703774e-06, "logits/chosen": -1.1482800245285034, "logits/rejected": -2.151231050491333, "logps/chosen": -287.7913513183594, "logps/rejected": -201.2919464111328, "loss": 0.4487, "rewards/accuracies": 0.875, "rewards/chosen": 0.6401927471160889, "rewards/margins": 0.7009615898132324, "rewards/rejected": -0.06076894700527191, "step": 56 }, { "epoch": 0.42568807339449544, "grad_norm": 8.618446350097656, "learning_rate": 4.973575370038718e-06, "logits/chosen": -1.0707895755767822, "logits/rejected": -2.049323558807373, "logps/chosen": -305.2084045410156, "logps/rejected": -193.321533203125, "loss": 0.3851, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8177109956741333, "rewards/margins": 0.9303702116012573, "rewards/rejected": -0.11265924572944641, "step": 58 }, { "epoch": 0.44036697247706424, "grad_norm": 7.712850093841553, "learning_rate": 4.967006586125827e-06, "logits/chosen": -1.240044355392456, "logits/rejected": -2.0774481296539307, "logps/chosen": -301.3046569824219, "logps/rejected": -186.58460998535156, "loss": 0.35, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9353222846984863, "rewards/margins": 1.0043295621871948, "rewards/rejected": -0.06900733709335327, "step": 60 }, { "epoch": 0.45504587155963305, "grad_norm": 8.133475303649902, "learning_rate": 4.959714720614871e-06, "logits/chosen": -1.1756389141082764, "logits/rejected": -2.2198028564453125, "logps/chosen": -319.236083984375, "logps/rejected": -184.04647827148438, "loss": 0.3239, "rewards/accuracies": 0.953125, "rewards/chosen": 0.9475828409194946, "rewards/margins": 1.177114725112915, "rewards/rejected": -0.22953176498413086, "step": 62 }, { "epoch": 0.46972477064220186, "grad_norm": 6.613894462585449, "learning_rate": 4.951701910757446e-06, "logits/chosen": -1.1599823236465454, "logits/rejected": -2.064751148223877, "logps/chosen": -253.94537353515625, "logps/rejected": -188.87652587890625, "loss": 0.3088, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0339241027832031, "rewards/margins": 1.2678444385528564, "rewards/rejected": -0.2339203655719757, "step": 64 }, { "epoch": 0.48440366972477067, "grad_norm": 8.49493408203125, "learning_rate": 4.942970505114514e-06, "logits/chosen": -1.0440397262573242, "logits/rejected": -2.1136162281036377, "logps/chosen": -308.4583435058594, "logps/rejected": -176.34474182128906, "loss": 0.268, "rewards/accuracies": 0.984375, "rewards/chosen": 1.1736990213394165, "rewards/margins": 1.4280885457992554, "rewards/rejected": -0.25438952445983887, "step": 66 }, { "epoch": 0.4990825688073395, "grad_norm": 6.022420883178711, "learning_rate": 4.933523062868033e-06, "logits/chosen": -1.0774444341659546, "logits/rejected": -2.1658172607421875, "logps/chosen": -269.4661560058594, "logps/rejected": -164.7786865234375, "loss": 0.2372, "rewards/accuracies": 0.96875, "rewards/chosen": 1.3157860040664673, "rewards/margins": 1.5686390399932861, "rewards/rejected": -0.25285303592681885, "step": 68 }, { "epoch": 0.5137614678899083, "grad_norm": 4.839372634887695, "learning_rate": 4.923362353070859e-06, "logits/chosen": -0.8954001665115356, "logits/rejected": -2.1572980880737305, "logps/chosen": -287.38250732421875, "logps/rejected": -159.82025146484375, "loss": 0.2079, "rewards/accuracies": 0.984375, "rewards/chosen": 1.4153721332550049, "rewards/margins": 1.8501354455947876, "rewards/rejected": -0.43476346135139465, "step": 70 }, { "epoch": 0.5284403669724771, "grad_norm": 5.355666160583496, "learning_rate": 4.912491353835138e-06, "logits/chosen": -1.1590656042099, "logits/rejected": -2.088367462158203, "logps/chosen": -260.02386474609375, "logps/rejected": -185.47396850585938, "loss": 0.2185, "rewards/accuracies": 0.984375, "rewards/chosen": 1.4196313619613647, "rewards/margins": 1.858705997467041, "rewards/rejected": -0.43907448649406433, "step": 72 }, { "epoch": 0.5431192660550459, "grad_norm": 4.641209602355957, "learning_rate": 4.900913251459418e-06, "logits/chosen": -1.0761524438858032, "logits/rejected": -2.0451908111572266, "logps/chosen": -264.9051513671875, "logps/rejected": -173.16702270507812, "loss": 0.1769, "rewards/accuracies": 1.0, "rewards/chosen": 1.5420759916305542, "rewards/margins": 2.071654796600342, "rewards/rejected": -0.5295785069465637, "step": 74 }, { "epoch": 0.5577981651376147, "grad_norm": 4.564330101013184, "learning_rate": 4.8886314394947396e-06, "logits/chosen": -0.9936952590942383, "logits/rejected": -2.070539951324463, "logps/chosen": -278.8867492675781, "logps/rejected": -185.91055297851562, "loss": 0.1608, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9348686933517456, "rewards/margins": 2.52958083152771, "rewards/rejected": -0.5947118997573853, "step": 76 }, { "epoch": 0.5724770642201835, "grad_norm": 5.782593250274658, "learning_rate": 4.875649517749985e-06, "logits/chosen": -1.0427924394607544, "logits/rejected": -2.180347442626953, "logps/chosen": -282.06732177734375, "logps/rejected": -191.30137634277344, "loss": 0.1548, "rewards/accuracies": 0.96875, "rewards/chosen": 1.9499953985214233, "rewards/margins": 2.6421873569488525, "rewards/rejected": -0.6921918392181396, "step": 78 }, { "epoch": 0.5871559633027523, "grad_norm": 4.356126308441162, "learning_rate": 4.861971291236772e-06, "logits/chosen": -1.134873390197754, "logits/rejected": -2.047222852706909, "logps/chosen": -328.65509033203125, "logps/rejected": -191.76483154296875, "loss": 0.1841, "rewards/accuracies": 0.9375, "rewards/chosen": 2.2117769718170166, "rewards/margins": 2.5882744789123535, "rewards/rejected": -0.37649768590927124, "step": 80 }, { "epoch": 0.6018348623853211, "grad_norm": 3.70808482170105, "learning_rate": 4.847600769054201e-06, "logits/chosen": -1.1773045063018799, "logits/rejected": -2.071323871612549, "logps/chosen": -365.7237243652344, "logps/rejected": -221.5764923095703, "loss": 0.1093, "rewards/accuracies": 0.984375, "rewards/chosen": 2.4926247596740723, "rewards/margins": 3.0623347759246826, "rewards/rejected": -0.5697098970413208, "step": 82 }, { "epoch": 0.6165137614678899, "grad_norm": 2.8207852840423584, "learning_rate": 4.832542163213787e-06, "logits/chosen": -1.0239057540893555, "logits/rejected": -2.1960628032684326, "logps/chosen": -261.3912658691406, "logps/rejected": -155.67286682128906, "loss": 0.1073, "rewards/accuracies": 0.984375, "rewards/chosen": 2.281486749649048, "rewards/margins": 3.123940944671631, "rewards/rejected": -0.8424541354179382, "step": 84 }, { "epoch": 0.6311926605504588, "grad_norm": 2.6905996799468994, "learning_rate": 4.816799887404911e-06, "logits/chosen": -1.2185587882995605, "logits/rejected": -2.146491289138794, "logps/chosen": -300.77069091796875, "logps/rejected": -185.7276153564453, "loss": 0.1277, "rewards/accuracies": 0.96875, "rewards/chosen": 2.3313047885894775, "rewards/margins": 3.0290822982788086, "rewards/rejected": -0.6977773904800415, "step": 86 }, { "epoch": 0.6458715596330276, "grad_norm": 1.891965389251709, "learning_rate": 4.800378555701168e-06, "logits/chosen": -1.056377649307251, "logits/rejected": -2.001763343811035, "logps/chosen": -354.14990234375, "logps/rejected": -186.62448120117188, "loss": 0.1089, "rewards/accuracies": 0.953125, "rewards/chosen": 2.5092077255249023, "rewards/margins": 3.3948686122894287, "rewards/rejected": -0.8856609463691711, "step": 88 }, { "epoch": 0.6605504587155964, "grad_norm": 3.7145261764526367, "learning_rate": 4.783282981207979e-06, "logits/chosen": -1.1021761894226074, "logits/rejected": -2.2725181579589844, "logps/chosen": -296.32763671875, "logps/rejected": -169.7439727783203, "loss": 0.0866, "rewards/accuracies": 0.984375, "rewards/chosen": 2.6951088905334473, "rewards/margins": 3.6553006172180176, "rewards/rejected": -0.9601919054985046, "step": 90 }, { "epoch": 0.6752293577981652, "grad_norm": 2.50156307220459, "learning_rate": 4.765518174651864e-06, "logits/chosen": -1.1074126958847046, "logits/rejected": -2.051131248474121, "logps/chosen": -285.9756164550781, "logps/rejected": -190.58448791503906, "loss": 0.0852, "rewards/accuracies": 0.984375, "rewards/chosen": 2.7018895149230957, "rewards/margins": 3.8313865661621094, "rewards/rejected": -1.1294972896575928, "step": 92 }, { "epoch": 0.689908256880734, "grad_norm": 2.272671699523926, "learning_rate": 4.747089342911793e-06, "logits/chosen": -0.9693321585655212, "logits/rejected": -2.168473720550537, "logps/chosen": -291.7270812988281, "logps/rejected": -175.2049560546875, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 2.954824209213257, "rewards/margins": 4.128055095672607, "rewards/rejected": -1.1732308864593506, "step": 94 }, { "epoch": 0.7045871559633028, "grad_norm": 2.2310574054718018, "learning_rate": 4.728001887493048e-06, "logits/chosen": -0.9781808853149414, "logits/rejected": -2.155506134033203, "logps/chosen": -299.66314697265625, "logps/rejected": -194.56436157226562, "loss": 0.067, "rewards/accuracies": 0.984375, "rewards/chosen": 3.1443114280700684, "rewards/margins": 4.226352214813232, "rewards/rejected": -1.0820410251617432, "step": 96 }, { "epoch": 0.7192660550458716, "grad_norm": 1.7269368171691895, "learning_rate": 4.708261402944036e-06, "logits/chosen": -1.0619006156921387, "logits/rejected": -2.1256189346313477, "logps/chosen": -315.5987548828125, "logps/rejected": -188.52439880371094, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": 3.3185007572174072, "rewards/margins": 4.740314960479736, "rewards/rejected": -1.4218144416809082, "step": 98 }, { "epoch": 0.7339449541284404, "grad_norm": 3.1934289932250977, "learning_rate": 4.687873675216522e-06, "logits/chosen": -0.9534860253334045, "logits/rejected": -1.9718412160873413, "logps/chosen": -303.17181396484375, "logps/rejected": -199.40789794921875, "loss": 0.0892, "rewards/accuracies": 0.953125, "rewards/chosen": 3.4963011741638184, "rewards/margins": 4.650891304016113, "rewards/rejected": -1.1545898914337158, "step": 100 }, { "epoch": 0.7486238532110092, "grad_norm": 1.0567034482955933, "learning_rate": 4.666844679969765e-06, "logits/chosen": -1.287552833557129, "logits/rejected": -2.272284507751465, "logps/chosen": -299.2529296875, "logps/rejected": -208.53785705566406, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 3.132131814956665, "rewards/margins": 4.880558967590332, "rewards/rejected": -1.748427152633667, "step": 102 }, { "epoch": 0.763302752293578, "grad_norm": 1.3455036878585815, "learning_rate": 4.6451805808190464e-06, "logits/chosen": -1.049391508102417, "logits/rejected": -2.1182594299316406, "logps/chosen": -284.2237548828125, "logps/rejected": -176.08627319335938, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 3.437601089477539, "rewards/margins": 5.258786678314209, "rewards/rejected": -1.821185827255249, "step": 104 }, { "epoch": 0.7779816513761468, "grad_norm": 1.6037604808807373, "learning_rate": 4.622887727529104e-06, "logits/chosen": -1.0589053630828857, "logits/rejected": -2.095472812652588, "logps/chosen": -257.8381042480469, "logps/rejected": -207.0792236328125, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 3.2390363216400146, "rewards/margins": 5.350650310516357, "rewards/rejected": -2.1116137504577637, "step": 106 }, { "epoch": 0.7926605504587156, "grad_norm": 1.9005062580108643, "learning_rate": 4.599972654153018e-06, "logits/chosen": -0.9298142194747925, "logits/rejected": -2.0814666748046875, "logps/chosen": -301.68865966796875, "logps/rejected": -174.01010131835938, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 3.7089574337005615, "rewards/margins": 5.569860458374023, "rewards/rejected": -1.8609036207199097, "step": 108 }, { "epoch": 0.8073394495412844, "grad_norm": 0.712770402431488, "learning_rate": 4.5764420771170735e-06, "logits/chosen": -0.9678480625152588, "logits/rejected": -2.0447123050689697, "logps/chosen": -278.64398193359375, "logps/rejected": -192.5853729248047, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 3.513016939163208, "rewards/margins": 5.6180419921875, "rewards/rejected": -2.105024814605713, "step": 110 }, { "epoch": 0.8220183486238533, "grad_norm": 1.3919163942337036, "learning_rate": 4.552302893252166e-06, "logits/chosen": -1.2199370861053467, "logits/rejected": -2.197056293487549, "logps/chosen": -306.26080322265625, "logps/rejected": -205.06845092773438, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": 3.367818832397461, "rewards/margins": 5.316436290740967, "rewards/rejected": -1.948617696762085, "step": 112 }, { "epoch": 0.8366972477064221, "grad_norm": 3.037362575531006, "learning_rate": 4.52756217777234e-06, "logits/chosen": -1.2299991846084595, "logits/rejected": -2.1640126705169678, "logps/chosen": -311.70574951171875, "logps/rejected": -207.38746643066406, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": 3.595170021057129, "rewards/margins": 5.463950157165527, "rewards/rejected": -1.8687800168991089, "step": 114 }, { "epoch": 0.8513761467889909, "grad_norm": 0.8069730401039124, "learning_rate": 4.502227182201035e-06, "logits/chosen": -0.9528835415840149, "logits/rejected": -1.977004051208496, "logps/chosen": -264.5509033203125, "logps/rejected": -174.93551635742188, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 3.867180109024048, "rewards/margins": 6.007584571838379, "rewards/rejected": -2.140403985977173, "step": 116 }, { "epoch": 0.8660550458715597, "grad_norm": 1.4102082252502441, "learning_rate": 4.476305332245662e-06, "logits/chosen": -1.0918750762939453, "logits/rejected": -2.3146743774414062, "logps/chosen": -314.5960998535156, "logps/rejected": -152.3535614013672, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 3.734642744064331, "rewards/margins": 6.135974407196045, "rewards/rejected": -2.4013314247131348, "step": 118 }, { "epoch": 0.8807339449541285, "grad_norm": 2.8867928981781006, "learning_rate": 4.449804225621116e-06, "logits/chosen": -1.0288662910461426, "logits/rejected": -2.0701658725738525, "logps/chosen": -279.2713317871094, "logps/rejected": -180.374267578125, "loss": 0.0485, "rewards/accuracies": 0.984375, "rewards/chosen": 3.5982298851013184, "rewards/margins": 5.633719444274902, "rewards/rejected": -2.0354888439178467, "step": 120 }, { "epoch": 0.8954128440366973, "grad_norm": 0.7778434753417969, "learning_rate": 4.422731629822887e-06, "logits/chosen": -0.9540915489196777, "logits/rejected": -1.9875534772872925, "logps/chosen": -314.85003662109375, "logps/rejected": -194.16896057128906, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": 3.723828077316284, "rewards/margins": 6.088706016540527, "rewards/rejected": -2.364877462387085, "step": 122 }, { "epoch": 0.9100917431192661, "grad_norm": 1.9667764902114868, "learning_rate": 4.395095479850396e-06, "logits/chosen": -0.9676120281219482, "logits/rejected": -1.9072697162628174, "logps/chosen": -287.99981689453125, "logps/rejected": -186.82659912109375, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 3.7472550868988037, "rewards/margins": 6.025314807891846, "rewards/rejected": -2.278059482574463, "step": 124 }, { "epoch": 0.9247706422018349, "grad_norm": 0.4268924593925476, "learning_rate": 4.366903875881243e-06, "logits/chosen": -1.0968043804168701, "logits/rejected": -2.334925651550293, "logps/chosen": -275.3115234375, "logps/rejected": -164.202392578125, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 3.771523952484131, "rewards/margins": 6.7282609939575195, "rewards/rejected": -2.9567372798919678, "step": 126 }, { "epoch": 0.9394495412844037, "grad_norm": 1.4270014762878418, "learning_rate": 4.3381650808970365e-06, "logits/chosen": -1.0460113286972046, "logits/rejected": -1.9695379734039307, "logps/chosen": -254.8202667236328, "logps/rejected": -185.63243103027344, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 3.706533432006836, "rewards/margins": 6.099806785583496, "rewards/rejected": -2.39327335357666, "step": 128 }, { "epoch": 0.9541284403669725, "grad_norm": 0.6754117012023926, "learning_rate": 4.308887518261507e-06, "logits/chosen": -0.8909565210342407, "logits/rejected": -1.9432121515274048, "logps/chosen": -278.40216064453125, "logps/rejected": -195.16552734375, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": 4.237884521484375, "rewards/margins": 6.639657974243164, "rewards/rejected": -2.4017739295959473, "step": 130 }, { "epoch": 0.9688073394495413, "grad_norm": 0.7388483285903931, "learning_rate": 4.279079769251617e-06, "logits/chosen": -1.2244815826416016, "logits/rejected": -2.1885085105895996, "logps/chosen": -351.21783447265625, "logps/rejected": -210.98890686035156, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 4.041647911071777, "rewards/margins": 6.690797328948975, "rewards/rejected": -2.6491494178771973, "step": 132 }, { "epoch": 0.9834862385321101, "grad_norm": 0.7370263934135437, "learning_rate": 4.248750570542373e-06, "logits/chosen": -1.0081679821014404, "logits/rejected": -2.0711734294891357, "logps/chosen": -272.2639465332031, "logps/rejected": -179.82412719726562, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 3.885261058807373, "rewards/margins": 6.468730449676514, "rewards/rejected": -2.5834696292877197, "step": 134 }, { "epoch": 0.998165137614679, "grad_norm": 2.1839847564697266, "learning_rate": 4.21790881164611e-06, "logits/chosen": -0.9589763879776001, "logits/rejected": -2.103942394256592, "logps/chosen": -282.6980285644531, "logps/rejected": -193.8739776611328, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 4.211104869842529, "rewards/margins": 7.221211910247803, "rewards/rejected": -3.0101072788238525, "step": 136 }, { "epoch": 1.0128440366972478, "grad_norm": 2.379425525665283, "learning_rate": 4.186563532306957e-06, "logits/chosen": -0.9432098865509033, "logits/rejected": -2.0608460903167725, "logps/chosen": -288.9028625488281, "logps/rejected": -168.07359313964844, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": 4.061460494995117, "rewards/margins": 7.086147308349609, "rewards/rejected": -3.0246872901916504, "step": 138 }, { "epoch": 1.0275229357798166, "grad_norm": 2.2438290119171143, "learning_rate": 4.154723919851291e-06, "logits/chosen": -1.1197127103805542, "logits/rejected": -2.0973258018493652, "logps/chosen": -290.60296630859375, "logps/rejected": -173.36465454101562, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": 3.5277624130249023, "rewards/margins": 6.31058406829834, "rewards/rejected": -2.7828218936920166, "step": 140 }, { "epoch": 1.0422018348623854, "grad_norm": 0.38025742769241333, "learning_rate": 4.122399306494918e-06, "logits/chosen": -1.1321005821228027, "logits/rejected": -2.2533721923828125, "logps/chosen": -336.11224365234375, "logps/rejected": -198.53457641601562, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 4.0418477058410645, "rewards/margins": 7.016913890838623, "rewards/rejected": -2.975067138671875, "step": 142 }, { "epoch": 1.0568807339449542, "grad_norm": 1.0832823514938354, "learning_rate": 4.089599166607794e-06, "logits/chosen": -1.0980923175811768, "logits/rejected": -2.007105588912964, "logps/chosen": -292.0760803222656, "logps/rejected": -186.78787231445312, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 3.942317008972168, "rewards/margins": 7.760876178741455, "rewards/rejected": -3.818559169769287, "step": 144 }, { "epoch": 1.071559633027523, "grad_norm": 0.542005717754364, "learning_rate": 4.05633311393708e-06, "logits/chosen": -0.9787145853042603, "logits/rejected": -2.0150396823883057, "logps/chosen": -257.6767883300781, "logps/rejected": -172.47512817382812, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": 4.031050682067871, "rewards/margins": 7.134464263916016, "rewards/rejected": -3.1034140586853027, "step": 146 }, { "epoch": 1.0862385321100918, "grad_norm": 1.513509750366211, "learning_rate": 4.022610898789349e-06, "logits/chosen": -1.008697509765625, "logits/rejected": -2.0967135429382324, "logps/chosen": -266.4443664550781, "logps/rejected": -186.60263061523438, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 4.120553970336914, "rewards/margins": 7.502930641174316, "rewards/rejected": -3.382376194000244, "step": 148 }, { "epoch": 1.1009174311926606, "grad_norm": 1.2189836502075195, "learning_rate": 3.988442405172755e-06, "logits/chosen": -0.8885701894760132, "logits/rejected": -2.0014257431030273, "logps/chosen": -281.70147705078125, "logps/rejected": -201.9718780517578, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 4.45000696182251, "rewards/margins": 7.705287933349609, "rewards/rejected": -3.2552807331085205, "step": 150 }, { "epoch": 1.1155963302752294, "grad_norm": 0.2563473880290985, "learning_rate": 3.953837647900031e-06, "logits/chosen": -0.9757863283157349, "logits/rejected": -2.0974419116973877, "logps/chosen": -273.5846862792969, "logps/rejected": -195.75936889648438, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 4.658951759338379, "rewards/margins": 8.117878913879395, "rewards/rejected": -3.4589266777038574, "step": 152 }, { "epoch": 1.1302752293577982, "grad_norm": 2.6809535026550293, "learning_rate": 3.918806769653135e-06, "logits/chosen": -0.8756412863731384, "logits/rejected": -1.9975080490112305, "logps/chosen": -318.453857421875, "logps/rejected": -195.71372985839844, "loss": 0.0324, "rewards/accuracies": 1.0, "rewards/chosen": 4.540386199951172, "rewards/margins": 7.758340358734131, "rewards/rejected": -3.217954158782959, "step": 154 }, { "epoch": 1.144954128440367, "grad_norm": 0.34194982051849365, "learning_rate": 3.88336003801042e-06, "logits/chosen": -0.9494649171829224, "logits/rejected": -2.052715301513672, "logps/chosen": -255.02169799804688, "logps/rejected": -178.14224243164062, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 3.8328986167907715, "rewards/margins": 7.075186729431152, "rewards/rejected": -3.2422876358032227, "step": 156 }, { "epoch": 1.1596330275229358, "grad_norm": 0.37112390995025635, "learning_rate": 3.847507842437205e-06, "logits/chosen": -0.8547274470329285, "logits/rejected": -2.1034629344940186, "logps/chosen": -296.8822021484375, "logps/rejected": -171.6925048828125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 4.401730060577393, "rewards/margins": 8.271763801574707, "rewards/rejected": -3.8700337409973145, "step": 158 }, { "epoch": 1.1743119266055047, "grad_norm": 0.6065702438354492, "learning_rate": 3.811260691240604e-06, "logits/chosen": -0.894873857498169, "logits/rejected": -2.086596965789795, "logps/chosen": -340.1643981933594, "logps/rejected": -188.5568389892578, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 4.769918441772461, "rewards/margins": 8.063861846923828, "rewards/rejected": -3.293943166732788, "step": 160 }, { "epoch": 1.1889908256880735, "grad_norm": 0.38738325238227844, "learning_rate": 3.774629208489547e-06, "logits/chosen": -0.9661360383033752, "logits/rejected": -2.0905256271362305, "logps/chosen": -241.7164764404297, "logps/rejected": -172.8728790283203, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 3.948549270629883, "rewards/margins": 7.093012809753418, "rewards/rejected": -3.1444640159606934, "step": 162 }, { "epoch": 1.2036697247706423, "grad_norm": 0.4064182639122009, "learning_rate": 3.7376241309008433e-06, "logits/chosen": -1.1252474784851074, "logits/rejected": -2.123969793319702, "logps/chosen": -326.73370361328125, "logps/rejected": -183.0895233154297, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 4.725476264953613, "rewards/margins": 7.916323661804199, "rewards/rejected": -3.190847635269165, "step": 164 }, { "epoch": 1.218348623853211, "grad_norm": 0.3772048056125641, "learning_rate": 3.7002563046922502e-06, "logits/chosen": -1.0913598537445068, "logits/rejected": -2.229214668273926, "logps/chosen": -326.4932861328125, "logps/rejected": -173.02989196777344, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 4.724410533905029, "rewards/margins": 8.843596458435059, "rewards/rejected": -4.119184970855713, "step": 166 }, { "epoch": 1.2330275229357799, "grad_norm": 0.48601555824279785, "learning_rate": 3.6625366824034337e-06, "logits/chosen": -0.8681567907333374, "logits/rejected": -2.067228317260742, "logps/chosen": -279.7916259765625, "logps/rejected": -206.9775390625, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 4.7722625732421875, "rewards/margins": 9.153127670288086, "rewards/rejected": -4.380865097045898, "step": 168 }, { "epoch": 1.2477064220183487, "grad_norm": 1.1044621467590332, "learning_rate": 3.6244763196857714e-06, "logits/chosen": -0.9898172616958618, "logits/rejected": -2.130460262298584, "logps/chosen": -296.6734619140625, "logps/rejected": -181.456298828125, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 4.674115180969238, "rewards/margins": 8.792963981628418, "rewards/rejected": -4.1188483238220215, "step": 170 }, { "epoch": 1.2623853211009175, "grad_norm": 1.3868632316589355, "learning_rate": 3.5860863720619333e-06, "logits/chosen": -1.0125945806503296, "logits/rejected": -2.080739736557007, "logps/chosen": -289.3682861328125, "logps/rejected": -184.58253479003906, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": 5.030690670013428, "rewards/margins": 8.200502395629883, "rewards/rejected": -3.1698126792907715, "step": 172 }, { "epoch": 1.2770642201834863, "grad_norm": 0.4792233407497406, "learning_rate": 3.547378091656186e-06, "logits/chosen": -0.9022351503372192, "logits/rejected": -2.0722413063049316, "logps/chosen": -293.7245178222656, "logps/rejected": -173.53054809570312, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 4.439302444458008, "rewards/margins": 8.599308013916016, "rewards/rejected": -4.160006046295166, "step": 174 }, { "epoch": 1.2917431192660551, "grad_norm": 0.7374489903450012, "learning_rate": 3.5083628238963913e-06, "logits/chosen": -1.085463047027588, "logits/rejected": -1.969193935394287, "logps/chosen": -234.489013671875, "logps/rejected": -175.44613647460938, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 4.414360523223877, "rewards/margins": 7.9778642654418945, "rewards/rejected": -3.5635030269622803, "step": 176 }, { "epoch": 1.306422018348624, "grad_norm": 1.7487801313400269, "learning_rate": 3.4690520041886473e-06, "logits/chosen": -0.9150568246841431, "logits/rejected": -2.0502333641052246, "logps/chosen": -275.4502258300781, "logps/rejected": -212.3257598876953, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 4.58690071105957, "rewards/margins": 8.234541893005371, "rewards/rejected": -3.647641181945801, "step": 178 }, { "epoch": 1.3211009174311927, "grad_norm": 0.12792479991912842, "learning_rate": 3.4294571545655653e-06, "logits/chosen": -0.91706383228302, "logits/rejected": -2.196730613708496, "logps/chosen": -293.5966796875, "logps/rejected": -180.54701232910156, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 4.876993656158447, "rewards/margins": 9.287820816040039, "rewards/rejected": -4.41082763671875, "step": 180 }, { "epoch": 1.3357798165137615, "grad_norm": 1.574942708015442, "learning_rate": 3.38958988030915e-06, "logits/chosen": -1.1890692710876465, "logits/rejected": -2.066960334777832, "logps/chosen": -274.7825622558594, "logps/rejected": -224.47837829589844, "loss": 0.0592, "rewards/accuracies": 0.984375, "rewards/chosen": 4.447505950927734, "rewards/margins": 8.213159561157227, "rewards/rejected": -3.7656538486480713, "step": 182 }, { "epoch": 1.3504587155963304, "grad_norm": 1.0345042943954468, "learning_rate": 3.3494618665492833e-06, "logits/chosen": -1.1099860668182373, "logits/rejected": -2.0204684734344482, "logps/chosen": -255.40478515625, "logps/rejected": -192.52752685546875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 4.036734104156494, "rewards/margins": 7.486913681030273, "rewards/rejected": -3.4501795768737793, "step": 184 }, { "epoch": 1.3651376146788992, "grad_norm": 0.1829257309436798, "learning_rate": 3.3090848748388042e-06, "logits/chosen": -1.0115846395492554, "logits/rejected": -2.1213629245758057, "logps/chosen": -353.5410461425781, "logps/rejected": -192.9250030517578, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 4.421483993530273, "rewards/margins": 8.999296188354492, "rewards/rejected": -4.577812194824219, "step": 186 }, { "epoch": 1.379816513761468, "grad_norm": 0.3030329942703247, "learning_rate": 3.2684707397061887e-06, "logits/chosen": -1.0969910621643066, "logits/rejected": -2.0923759937286377, "logps/chosen": -293.1423645019531, "logps/rejected": -173.88784790039062, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 4.567940711975098, "rewards/margins": 8.394798278808594, "rewards/rejected": -3.8268580436706543, "step": 188 }, { "epoch": 1.3944954128440368, "grad_norm": 0.8538657426834106, "learning_rate": 3.2276313651868364e-06, "logits/chosen": -0.9523632526397705, "logits/rejected": -2.0854203701019287, "logps/chosen": -297.4543762207031, "logps/rejected": -162.13568115234375, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 4.548934459686279, "rewards/margins": 8.458111763000488, "rewards/rejected": -3.909177541732788, "step": 190 }, { "epoch": 1.4091743119266056, "grad_norm": 0.4353146553039551, "learning_rate": 3.1865787213339926e-06, "logits/chosen": -0.9564714431762695, "logits/rejected": -2.0908193588256836, "logps/chosen": -281.3487243652344, "logps/rejected": -186.54757690429688, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 4.7654829025268555, "rewards/margins": 9.230217933654785, "rewards/rejected": -4.464734077453613, "step": 192 }, { "epoch": 1.4238532110091744, "grad_norm": 0.40312162041664124, "learning_rate": 3.1453248407103156e-06, "logits/chosen": -0.9966449737548828, "logits/rejected": -2.1248295307159424, "logps/chosen": -287.3999328613281, "logps/rejected": -169.44496154785156, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 4.19376802444458, "rewards/margins": 8.428169250488281, "rewards/rejected": -4.234401702880859, "step": 194 }, { "epoch": 1.4385321100917432, "grad_norm": 1.217081904411316, "learning_rate": 3.1038818148611178e-06, "logits/chosen": -1.022183895111084, "logits/rejected": -2.0069739818573, "logps/chosen": -312.16973876953125, "logps/rejected": -181.82955932617188, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 4.685446262359619, "rewards/margins": 8.785126686096191, "rewards/rejected": -4.099679946899414, "step": 196 }, { "epoch": 1.453211009174312, "grad_norm": 0.6292124390602112, "learning_rate": 3.062261790770331e-06, "logits/chosen": -0.8997288942337036, "logits/rejected": -1.9895069599151611, "logps/chosen": -259.7031555175781, "logps/rejected": -182.4678497314453, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": 4.453324794769287, "rewards/margins": 7.995000839233398, "rewards/rejected": -3.541675567626953, "step": 198 }, { "epoch": 1.4678899082568808, "grad_norm": 0.46019911766052246, "learning_rate": 3.0204769673002123e-06, "logits/chosen": -0.981975793838501, "logits/rejected": -2.123629331588745, "logps/chosen": -333.59722900390625, "logps/rejected": -198.2655487060547, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 4.988855838775635, "rewards/margins": 8.937817573547363, "rewards/rejected": -3.948960781097412, "step": 200 }, { "epoch": 1.4825688073394496, "grad_norm": 1.5788525342941284, "learning_rate": 2.978539591615848e-06, "logits/chosen": -1.0232621431350708, "logits/rejected": -1.9014160633087158, "logps/chosen": -299.21649169921875, "logps/rejected": -196.33389282226562, "loss": 0.0167, "rewards/accuracies": 0.984375, "rewards/chosen": 4.068594932556152, "rewards/margins": 8.521183013916016, "rewards/rejected": -4.452587604522705, "step": 202 }, { "epoch": 1.4972477064220184, "grad_norm": 0.18567878007888794, "learning_rate": 2.936461955595501e-06, "logits/chosen": -1.0283303260803223, "logits/rejected": -2.1100425720214844, "logps/chosen": -298.8528137207031, "logps/rejected": -191.35086059570312, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 4.9132208824157715, "rewards/margins": 8.617488861083984, "rewards/rejected": -3.704267978668213, "step": 204 }, { "epoch": 1.5119266055045872, "grad_norm": 0.13169872760772705, "learning_rate": 2.8942563922278487e-06, "logits/chosen": -1.0413228273391724, "logits/rejected": -2.1321609020233154, "logps/chosen": -286.2435607910156, "logps/rejected": -196.66256713867188, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 4.733799457550049, "rewards/margins": 9.53592586517334, "rewards/rejected": -4.802126407623291, "step": 206 }, { "epoch": 1.526605504587156, "grad_norm": 2.0917320251464844, "learning_rate": 2.8519352719971783e-06, "logits/chosen": -1.097141981124878, "logits/rejected": -2.0799503326416016, "logps/chosen": -316.93597412109375, "logps/rejected": -201.67100524902344, "loss": 0.0309, "rewards/accuracies": 0.984375, "rewards/chosen": 4.9373931884765625, "rewards/margins": 8.860273361206055, "rewards/rejected": -3.9228808879852295, "step": 208 }, { "epoch": 1.5412844036697249, "grad_norm": 0.6296855807304382, "learning_rate": 2.8095109992575824e-06, "logits/chosen": -0.9797852039337158, "logits/rejected": -2.088029146194458, "logps/chosen": -328.76251220703125, "logps/rejected": -201.71078491210938, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 5.399056911468506, "rewards/margins": 9.34478759765625, "rewards/rejected": -3.945730209350586, "step": 210 }, { "epoch": 1.5559633027522937, "grad_norm": 0.3409838080406189, "learning_rate": 2.7669960085972407e-06, "logits/chosen": -0.9346829652786255, "logits/rejected": -2.2055399417877197, "logps/chosen": -351.57489013671875, "logps/rejected": -219.8714141845703, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 5.243146896362305, "rewards/margins": 9.388729095458984, "rewards/rejected": -4.145582675933838, "step": 212 }, { "epoch": 1.5706422018348625, "grad_norm": 0.791716456413269, "learning_rate": 2.7244027611938247e-06, "logits/chosen": -0.8380637764930725, "logits/rejected": -1.925654649734497, "logps/chosen": -251.362548828125, "logps/rejected": -220.16436767578125, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 4.47086763381958, "rewards/margins": 8.536866188049316, "rewards/rejected": -4.065998554229736, "step": 214 }, { "epoch": 1.5853211009174313, "grad_norm": 0.2506906986236572, "learning_rate": 2.6817437411621194e-06, "logits/chosen": -0.9830411076545715, "logits/rejected": -2.0578300952911377, "logps/chosen": -342.7967529296875, "logps/rejected": -237.17025756835938, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 5.036384582519531, "rewards/margins": 9.013383865356445, "rewards/rejected": -3.976999521255493, "step": 216 }, { "epoch": 1.6, "grad_norm": 0.22265683114528656, "learning_rate": 2.639031451894923e-06, "logits/chosen": -1.028990387916565, "logits/rejected": -1.9095451831817627, "logps/chosen": -330.1585998535156, "logps/rejected": -222.2620086669922, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 5.05686616897583, "rewards/margins": 9.352256774902344, "rewards/rejected": -4.2953901290893555, "step": 218 }, { "epoch": 1.614678899082569, "grad_norm": 0.857473611831665, "learning_rate": 2.5962784123982843e-06, "logits/chosen": -1.049895167350769, "logits/rejected": -2.1700665950775146, "logps/chosen": -305.7288513183594, "logps/rejected": -198.574462890625, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": 4.7103071212768555, "rewards/margins": 9.439103126525879, "rewards/rejected": -4.728795528411865, "step": 220 }, { "epoch": 1.6293577981651377, "grad_norm": 0.2698463797569275, "learning_rate": 2.5534971536221804e-06, "logits/chosen": -0.861595630645752, "logits/rejected": -1.9534931182861328, "logps/chosen": -268.7453918457031, "logps/rejected": -191.55238342285156, "loss": 0.024, "rewards/accuracies": 0.984375, "rewards/chosen": 4.435842514038086, "rewards/margins": 9.0311918258667, "rewards/rejected": -4.595349311828613, "step": 222 }, { "epoch": 1.6440366972477065, "grad_norm": 1.8740975856781006, "learning_rate": 2.5107002147876814e-06, "logits/chosen": -1.010701298713684, "logits/rejected": -1.9186618328094482, "logps/chosen": -263.1980895996094, "logps/rejected": -206.22360229492188, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 4.652899742126465, "rewards/margins": 9.141688346862793, "rewards/rejected": -4.488787651062012, "step": 224 }, { "epoch": 1.6587155963302753, "grad_norm": 2.0281364917755127, "learning_rate": 2.467900139711693e-06, "logits/chosen": -1.0440551042556763, "logits/rejected": -1.971301555633545, "logps/chosen": -272.8301696777344, "logps/rejected": -197.07269287109375, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": 4.243446350097656, "rewards/margins": 8.737079620361328, "rewards/rejected": -4.493633270263672, "step": 226 }, { "epoch": 1.6733944954128441, "grad_norm": 0.23005536198616028, "learning_rate": 2.4251094731303586e-06, "logits/chosen": -0.9269182085990906, "logits/rejected": -2.089838981628418, "logps/chosen": -291.0529479980469, "logps/rejected": -179.94895935058594, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 5.278434753417969, "rewards/margins": 9.40982723236084, "rewards/rejected": -4.131391525268555, "step": 228 }, { "epoch": 1.688073394495413, "grad_norm": 0.0681939348578453, "learning_rate": 2.3823407570221812e-06, "logits/chosen": -0.8353657126426697, "logits/rejected": -2.02689266204834, "logps/chosen": -300.3406982421875, "logps/rejected": -175.13296508789062, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 4.921877384185791, "rewards/margins": 9.119339942932129, "rewards/rejected": -4.197463035583496, "step": 230 }, { "epoch": 1.7027522935779817, "grad_norm": 0.24333705008029938, "learning_rate": 2.3396065269319655e-06, "logits/chosen": -1.0092397928237915, "logits/rejected": -2.1053268909454346, "logps/chosen": -300.02294921875, "logps/rejected": -172.78187561035156, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 4.987635612487793, "rewards/margins": 9.38530445098877, "rewards/rejected": -4.397668361663818, "step": 232 }, { "epoch": 1.7174311926605506, "grad_norm": 0.6797487139701843, "learning_rate": 2.2969193082966353e-06, "logits/chosen": -0.8851895332336426, "logits/rejected": -2.036161422729492, "logps/chosen": -285.2466735839844, "logps/rejected": -189.85882568359375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 5.067909240722656, "rewards/margins": 9.919548988342285, "rewards/rejected": -4.851640701293945, "step": 234 }, { "epoch": 1.7321100917431194, "grad_norm": 0.6367282271385193, "learning_rate": 2.2542916127740194e-06, "logits/chosen": -0.8543779253959656, "logits/rejected": -1.752845048904419, "logps/chosen": -312.6046142578125, "logps/rejected": -234.28988647460938, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 4.974181175231934, "rewards/margins": 9.56247329711914, "rewards/rejected": -4.588292121887207, "step": 236 }, { "epoch": 1.7467889908256882, "grad_norm": 0.2897071838378906, "learning_rate": 2.211735934575674e-06, "logits/chosen": -0.9410618543624878, "logits/rejected": -2.174349308013916, "logps/chosen": -281.28863525390625, "logps/rejected": -165.9616241455078, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 4.256411552429199, "rewards/margins": 9.012039184570312, "rewards/rejected": -4.755627632141113, "step": 238 }, { "epoch": 1.761467889908257, "grad_norm": 0.41199827194213867, "learning_rate": 2.1692647468048235e-06, "logits/chosen": -1.0583674907684326, "logits/rejected": -2.0003695487976074, "logps/chosen": -307.04766845703125, "logps/rejected": -206.0718231201172, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 4.970597743988037, "rewards/margins": 10.884113311767578, "rewards/rejected": -5.913515567779541, "step": 240 }, { "epoch": 1.7761467889908258, "grad_norm": 0.5175734162330627, "learning_rate": 2.126890497800477e-06, "logits/chosen": -1.0432560443878174, "logits/rejected": -1.912244439125061, "logps/chosen": -297.5209655761719, "logps/rejected": -201.7635498046875, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 4.594603538513184, "rewards/margins": 8.852926254272461, "rewards/rejected": -4.258323669433594, "step": 242 }, { "epoch": 1.7908256880733946, "grad_norm": 0.6547983288764954, "learning_rate": 2.084625607488816e-06, "logits/chosen": -0.9311404228210449, "logits/rejected": -2.1106457710266113, "logps/chosen": -275.57183837890625, "logps/rejected": -188.14370727539062, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": 5.0517778396606445, "rewards/margins": 9.953323364257812, "rewards/rejected": -4.901544094085693, "step": 244 }, { "epoch": 1.8055045871559634, "grad_norm": 0.33203306794166565, "learning_rate": 2.0424824637428995e-06, "logits/chosen": -0.9116280674934387, "logits/rejected": -2.247035026550293, "logps/chosen": -267.2120361328125, "logps/rejected": -171.6895751953125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 4.797155857086182, "rewards/margins": 9.531312942504883, "rewards/rejected": -4.734157562255859, "step": 246 }, { "epoch": 1.8201834862385322, "grad_norm": 0.5430265069007874, "learning_rate": 2.0004734187517744e-06, "logits/chosen": -1.082189917564392, "logits/rejected": -1.9552661180496216, "logps/chosen": -318.3630676269531, "logps/rejected": -176.5325469970703, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 4.9373297691345215, "rewards/margins": 9.501139640808105, "rewards/rejected": -4.563809871673584, "step": 248 }, { "epoch": 1.834862385321101, "grad_norm": 0.19558808207511902, "learning_rate": 1.9586107854000327e-06, "logits/chosen": -1.1152639389038086, "logits/rejected": -2.129647731781006, "logps/chosen": -296.6053466796875, "logps/rejected": -169.00213623046875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 4.840038299560547, "rewards/margins": 9.542232513427734, "rewards/rejected": -4.702193737030029, "step": 250 }, { "epoch": 1.8495412844036698, "grad_norm": 0.45886340737342834, "learning_rate": 1.916906833658899e-06, "logits/chosen": -0.8982828855514526, "logits/rejected": -2.0570406913757324, "logps/chosen": -324.3260803222656, "logps/rejected": -216.9913330078125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 4.856814861297607, "rewards/margins": 9.941521644592285, "rewards/rejected": -5.084706783294678, "step": 252 }, { "epoch": 1.8642201834862386, "grad_norm": 0.9119444489479065, "learning_rate": 1.8753737869898921e-06, "logits/chosen": -0.972162663936615, "logits/rejected": -2.016150951385498, "logps/chosen": -248.53463745117188, "logps/rejected": -184.32382202148438, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 4.359373092651367, "rewards/margins": 10.132734298706055, "rewards/rejected": -5.7733612060546875, "step": 254 }, { "epoch": 1.8788990825688074, "grad_norm": 0.12387188524007797, "learning_rate": 1.8340238187621185e-06, "logits/chosen": -0.8442805409431458, "logits/rejected": -1.9759818315505981, "logps/chosen": -262.56671142578125, "logps/rejected": -175.5653839111328, "loss": 0.0327, "rewards/accuracies": 0.984375, "rewards/chosen": 4.67296028137207, "rewards/margins": 9.085709571838379, "rewards/rejected": -4.412749290466309, "step": 256 }, { "epoch": 1.8935779816513763, "grad_norm": 0.5057358145713806, "learning_rate": 1.7928690486842438e-06, "logits/chosen": -1.015974760055542, "logits/rejected": -2.1081368923187256, "logps/chosen": -253.27394104003906, "logps/rejected": -160.09469604492188, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 4.807024002075195, "rewards/margins": 9.237527847290039, "rewards/rejected": -4.4305033683776855, "step": 258 }, { "epoch": 1.908256880733945, "grad_norm": 0.6048524379730225, "learning_rate": 1.7519215392522026e-06, "logits/chosen": -0.9711456298828125, "logits/rejected": -2.1203389167785645, "logps/chosen": -282.3438720703125, "logps/rejected": -166.2510528564453, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 5.09495735168457, "rewards/margins": 9.728025436401367, "rewards/rejected": -4.633067607879639, "step": 260 }, { "epoch": 1.9229357798165139, "grad_norm": 0.5542910695075989, "learning_rate": 1.7111932922136715e-06, "logits/chosen": -0.9748891592025757, "logits/rejected": -1.8318710327148438, "logps/chosen": -253.21209716796875, "logps/rejected": -202.5255889892578, "loss": 0.019, "rewards/accuracies": 0.984375, "rewards/chosen": 4.323376178741455, "rewards/margins": 9.282448768615723, "rewards/rejected": -4.959072589874268, "step": 262 }, { "epoch": 1.9376146788990827, "grad_norm": 0.24626314640045166, "learning_rate": 1.6706962450503408e-06, "logits/chosen": -0.8283478617668152, "logits/rejected": -2.0624401569366455, "logps/chosen": -282.2995300292969, "logps/rejected": -189.75595092773438, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 4.900084495544434, "rewards/margins": 10.33100414276123, "rewards/rejected": -5.4309186935424805, "step": 264 }, { "epoch": 1.9522935779816515, "grad_norm": 1.167913794517517, "learning_rate": 1.630442267479034e-06, "logits/chosen": -0.789318323135376, "logits/rejected": -1.9187240600585938, "logps/chosen": -266.4274597167969, "logps/rejected": -198.47540283203125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 4.9975690841674805, "rewards/margins": 9.762743949890137, "rewards/rejected": -4.765174865722656, "step": 266 }, { "epoch": 1.9669724770642203, "grad_norm": 0.05298791825771332, "learning_rate": 1.5904431579726837e-06, "logits/chosen": -0.9226531982421875, "logits/rejected": -2.0884995460510254, "logps/chosen": -295.6236267089844, "logps/rejected": -165.60801696777344, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 4.444765567779541, "rewards/margins": 9.557327270507812, "rewards/rejected": -5.1125617027282715, "step": 268 }, { "epoch": 1.981651376146789, "grad_norm": 0.11875250190496445, "learning_rate": 1.5507106403021897e-06, "logits/chosen": -0.8945147395133972, "logits/rejected": -2.1213436126708984, "logps/chosen": -329.32354736328125, "logps/rejected": -205.64938354492188, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 5.863536834716797, "rewards/margins": 10.598997116088867, "rewards/rejected": -4.735459804534912, "step": 270 }, { "epoch": 1.996330275229358, "grad_norm": 0.16463226079940796, "learning_rate": 1.511256360100171e-06, "logits/chosen": -0.8653547167778015, "logits/rejected": -2.120985746383667, "logps/chosen": -294.7039489746094, "logps/rejected": -191.1700897216797, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 4.6077494621276855, "rewards/margins": 9.97114372253418, "rewards/rejected": -5.363394737243652, "step": 272 }, { "epoch": 2.0110091743119267, "grad_norm": 0.5620644092559814, "learning_rate": 1.4720918814476234e-06, "logits/chosen": -1.0870428085327148, "logits/rejected": -2.203629493713379, "logps/chosen": -255.451171875, "logps/rejected": -179.9131317138672, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 4.301146984100342, "rewards/margins": 10.202719688415527, "rewards/rejected": -5.9015727043151855, "step": 274 }, { "epoch": 2.0256880733944955, "grad_norm": 0.22174260020256042, "learning_rate": 1.4332286834844792e-06, "logits/chosen": -1.1182466745376587, "logits/rejected": -2.1164536476135254, "logps/chosen": -286.516357421875, "logps/rejected": -188.3638458251953, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 4.640047550201416, "rewards/margins": 9.481383323669434, "rewards/rejected": -4.841336250305176, "step": 276 }, { "epoch": 2.0403669724770643, "grad_norm": 0.33157217502593994, "learning_rate": 1.3946781570450563e-06, "logits/chosen": -0.9743894338607788, "logits/rejected": -2.0844216346740723, "logps/chosen": -303.1180419921875, "logps/rejected": -197.849853515625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 5.488700866699219, "rewards/margins": 10.087553024291992, "rewards/rejected": -4.598852157592773, "step": 278 }, { "epoch": 2.055045871559633, "grad_norm": 0.2071988433599472, "learning_rate": 1.3564516013194023e-06, "logits/chosen": -0.7817774415016174, "logits/rejected": -1.967786431312561, "logps/chosen": -266.5663757324219, "logps/rejected": -185.63877868652344, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 4.6336894035339355, "rewards/margins": 10.230566024780273, "rewards/rejected": -5.596876621246338, "step": 280 }, { "epoch": 2.069724770642202, "grad_norm": 0.35437583923339844, "learning_rate": 1.3185602205414894e-06, "logits/chosen": -0.9503396153450012, "logits/rejected": -2.0260818004608154, "logps/chosen": -269.90093994140625, "logps/rejected": -172.7965850830078, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 4.775392532348633, "rewards/margins": 9.074084281921387, "rewards/rejected": -4.298691272735596, "step": 282 }, { "epoch": 2.0844036697247708, "grad_norm": 0.09949786216020584, "learning_rate": 1.2810151207052465e-06, "logits/chosen": -1.025212049484253, "logits/rejected": -2.090640068054199, "logps/chosen": -335.35882568359375, "logps/rejected": -221.98355102539062, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 4.815802097320557, "rewards/margins": 9.805765151977539, "rewards/rejected": -4.989964008331299, "step": 284 }, { "epoch": 2.0990825688073396, "grad_norm": 0.1902090311050415, "learning_rate": 1.2438273063093811e-06, "logits/chosen": -0.8500208854675293, "logits/rejected": -1.9380009174346924, "logps/chosen": -277.2483215332031, "logps/rejected": -168.717529296875, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 4.573535442352295, "rewards/margins": 9.026323318481445, "rewards/rejected": -4.452788352966309, "step": 286 }, { "epoch": 2.1137614678899084, "grad_norm": 0.3208858072757721, "learning_rate": 1.2070076771319536e-06, "logits/chosen": -1.082637906074524, "logits/rejected": -1.9498220682144165, "logps/chosen": -353.7499084472656, "logps/rejected": -200.4058074951172, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 4.898225784301758, "rewards/margins": 8.967299461364746, "rewards/rejected": -4.0690741539001465, "step": 288 }, { "epoch": 2.128440366972477, "grad_norm": 1.1437596082687378, "learning_rate": 1.1705670250356417e-06, "logits/chosen": -0.8648325800895691, "logits/rejected": -2.037424087524414, "logps/chosen": -311.7782287597656, "logps/rejected": -195.8933563232422, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 5.304541110992432, "rewards/margins": 10.098506927490234, "rewards/rejected": -4.7939653396606445, "step": 290 }, { "epoch": 2.143119266055046, "grad_norm": 0.16270968317985535, "learning_rate": 1.1345160308046413e-06, "logits/chosen": -0.9791809916496277, "logits/rejected": -2.24078369140625, "logps/chosen": -382.8855895996094, "logps/rejected": -205.8779754638672, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 5.505633354187012, "rewards/margins": 11.220186233520508, "rewards/rejected": -5.714553356170654, "step": 292 }, { "epoch": 2.157798165137615, "grad_norm": 2.2618370056152344, "learning_rate": 1.0988652610141154e-06, "logits/chosen": -0.9164503216743469, "logits/rejected": -1.9510498046875, "logps/chosen": -276.1203918457031, "logps/rejected": -212.5254669189453, "loss": 0.0214, "rewards/accuracies": 0.984375, "rewards/chosen": 4.720728874206543, "rewards/margins": 9.504093170166016, "rewards/rejected": -4.783364295959473, "step": 294 }, { "epoch": 2.1724770642201836, "grad_norm": 0.08572974801063538, "learning_rate": 1.063625164933124e-06, "logits/chosen": -0.8781817555427551, "logits/rejected": -2.0793867111206055, "logps/chosen": -330.6020202636719, "logps/rejected": -208.72425842285156, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 5.462750434875488, "rewards/margins": 11.203109741210938, "rewards/rejected": -5.740358352661133, "step": 296 }, { "epoch": 2.1871559633027524, "grad_norm": 0.5944895148277283, "learning_rate": 1.0288060714619359e-06, "logits/chosen": -1.1157301664352417, "logits/rejected": -2.214977502822876, "logps/chosen": -316.9060363769531, "logps/rejected": -167.6466522216797, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 5.41005277633667, "rewards/margins": 10.054335594177246, "rewards/rejected": -4.644283294677734, "step": 298 }, { "epoch": 2.2018348623853212, "grad_norm": 0.5239315629005432, "learning_rate": 9.944181861046188e-07, "logits/chosen": -0.8929880857467651, "logits/rejected": -1.9771008491516113, "logps/chosen": -334.0789489746094, "logps/rejected": -201.40476989746094, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 5.018026351928711, "rewards/margins": 10.597818374633789, "rewards/rejected": -5.579792499542236, "step": 300 }, { "epoch": 2.21651376146789, "grad_norm": 0.17582310736179352, "learning_rate": 9.604715879777986e-07, "logits/chosen": -0.9466437101364136, "logits/rejected": -2.1750948429107666, "logps/chosen": -279.5908203125, "logps/rejected": -154.12644958496094, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 5.104902744293213, "rewards/margins": 10.134696006774902, "rewards/rejected": -5.029792785644531, "step": 302 }, { "epoch": 2.231192660550459, "grad_norm": 0.44052350521087646, "learning_rate": 9.269762268564616e-07, "logits/chosen": -1.0591435432434082, "logits/rejected": -2.134446382522583, "logps/chosen": -255.2496337890625, "logps/rejected": -161.16136169433594, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 4.842598915100098, "rewards/margins": 9.835264205932617, "rewards/rejected": -4.992665767669678, "step": 304 }, { "epoch": 2.2458715596330276, "grad_norm": 0.702462375164032, "learning_rate": 8.939419202576694e-07, "logits/chosen": -0.768172025680542, "logits/rejected": -1.7977386713027954, "logps/chosen": -258.4624938964844, "logps/rejected": -183.80621337890625, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 4.580810070037842, "rewards/margins": 7.86204719543457, "rewards/rejected": -3.2812376022338867, "step": 306 }, { "epoch": 2.2605504587155965, "grad_norm": 0.4431416690349579, "learning_rate": 8.61378350563033e-07, "logits/chosen": -0.9345456957817078, "logits/rejected": -1.9868954420089722, "logps/chosen": -250.33721923828125, "logps/rejected": -193.64549255371094, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 4.593289375305176, "rewards/margins": 9.057455062866211, "rewards/rejected": -4.464165210723877, "step": 308 }, { "epoch": 2.2752293577981653, "grad_norm": 0.30388739705085754, "learning_rate": 8.292950621808022e-07, "logits/chosen": -0.9780189990997314, "logits/rejected": -2.0176703929901123, "logps/chosen": -285.4472961425781, "logps/rejected": -191.96495056152344, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 5.00777006149292, "rewards/margins": 10.013311386108398, "rewards/rejected": -5.005540370941162, "step": 310 }, { "epoch": 2.289908256880734, "grad_norm": 0.3784541189670563, "learning_rate": 7.977014587483925e-07, "logits/chosen": -1.0011767148971558, "logits/rejected": -2.0550498962402344, "logps/chosen": -273.92138671875, "logps/rejected": -228.16741943359375, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": 4.854176998138428, "rewards/margins": 9.765246391296387, "rewards/rejected": -4.911068916320801, "step": 312 }, { "epoch": 2.304587155963303, "grad_norm": 0.2992611527442932, "learning_rate": 7.666068003761684e-07, "logits/chosen": -0.9273378849029541, "logits/rejected": -2.042013645172119, "logps/chosen": -296.50616455078125, "logps/rejected": -169.5068817138672, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.002749443054199, "rewards/margins": 10.678738594055176, "rewards/rejected": -5.675989151000977, "step": 314 }, { "epoch": 2.3192660550458717, "grad_norm": 0.23903429508209229, "learning_rate": 7.360202009332993e-07, "logits/chosen": -1.0399566888809204, "logits/rejected": -2.143623113632202, "logps/chosen": -296.7044677734375, "logps/rejected": -185.7502899169922, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 4.675933361053467, "rewards/margins": 10.062166213989258, "rewards/rejected": -5.386232376098633, "step": 316 }, { "epoch": 2.3339449541284405, "grad_norm": 0.23702357709407806, "learning_rate": 7.059506253764773e-07, "logits/chosen": -0.9900916814804077, "logits/rejected": -2.093594789505005, "logps/chosen": -314.4263000488281, "logps/rejected": -194.73269653320312, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 4.672959327697754, "rewards/margins": 10.092663764953613, "rewards/rejected": -5.419704437255859, "step": 318 }, { "epoch": 2.3486238532110093, "grad_norm": 0.40923863649368286, "learning_rate": 6.764068871222825e-07, "logits/chosen": -0.7488622069358826, "logits/rejected": -1.9413087368011475, "logps/chosen": -287.322021484375, "logps/rejected": -188.12283325195312, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 5.020465850830078, "rewards/margins": 9.374917984008789, "rewards/rejected": -4.354452133178711, "step": 320 }, { "epoch": 2.363302752293578, "grad_norm": 0.19543257355690002, "learning_rate": 6.473976454639608e-07, "logits/chosen": -0.9299582839012146, "logits/rejected": -2.107851266860962, "logps/chosen": -293.9842529296875, "logps/rejected": -168.69400024414062, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 5.329434871673584, "rewards/margins": 10.282726287841797, "rewards/rejected": -4.9532904624938965, "step": 322 }, { "epoch": 2.377981651376147, "grad_norm": 1.0940320491790771, "learning_rate": 6.189314030333796e-07, "logits/chosen": -0.8577584624290466, "logits/rejected": -1.934208631515503, "logps/chosen": -280.2929382324219, "logps/rejected": -220.2890167236328, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": 4.774515628814697, "rewards/margins": 10.356229782104492, "rewards/rejected": -5.581714153289795, "step": 324 }, { "epoch": 2.3926605504587157, "grad_norm": 0.12193372845649719, "learning_rate": 5.910165033089e-07, "logits/chosen": -0.8733283281326294, "logits/rejected": -2.079462766647339, "logps/chosen": -316.6996765136719, "logps/rejected": -201.25564575195312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 5.122347831726074, "rewards/margins": 9.807957649230957, "rewards/rejected": -4.685609817504883, "step": 326 }, { "epoch": 2.4073394495412845, "grad_norm": 0.43534737825393677, "learning_rate": 5.636611281698956e-07, "logits/chosen": -0.8986641764640808, "logits/rejected": -1.9822278022766113, "logps/chosen": -262.6693420410156, "logps/rejected": -185.32846069335938, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 4.460582733154297, "rewards/margins": 9.22741413116455, "rewards/rejected": -4.766830921173096, "step": 328 }, { "epoch": 2.4220183486238533, "grad_norm": 0.17293158173561096, "learning_rate": 5.368732954986389e-07, "logits/chosen": -1.0250214338302612, "logits/rejected": -2.0870189666748047, "logps/chosen": -279.42999267578125, "logps/rejected": -196.77059936523438, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 4.643675327301025, "rewards/margins": 9.602448463439941, "rewards/rejected": -4.958773136138916, "step": 330 }, { "epoch": 2.436697247706422, "grad_norm": 0.20987118780612946, "learning_rate": 5.106608568302504e-07, "logits/chosen": -1.066097617149353, "logits/rejected": -2.057497978210449, "logps/chosen": -257.912109375, "logps/rejected": -195.58677673339844, "loss": 0.0204, "rewards/accuracies": 0.984375, "rewards/chosen": 4.727499961853027, "rewards/margins": 9.989591598510742, "rewards/rejected": -5.262092590332031, "step": 332 }, { "epoch": 2.451376146788991, "grad_norm": 1.3423670530319214, "learning_rate": 4.850314950514124e-07, "logits/chosen": -0.8067299127578735, "logits/rejected": -1.9319019317626953, "logps/chosen": -281.423583984375, "logps/rejected": -192.34666442871094, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": 4.9796462059021, "rewards/margins": 9.925731658935547, "rewards/rejected": -4.9460859298706055, "step": 334 }, { "epoch": 2.4660550458715598, "grad_norm": 0.2133161723613739, "learning_rate": 4.599927221485034e-07, "logits/chosen": -0.9198440909385681, "logits/rejected": -2.121577024459839, "logps/chosen": -277.7024230957031, "logps/rejected": -171.4612579345703, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 4.626633167266846, "rewards/margins": 9.858685493469238, "rewards/rejected": -5.232051849365234, "step": 336 }, { "epoch": 2.4807339449541286, "grad_norm": 0.16850100457668304, "learning_rate": 4.3555187700583175e-07, "logits/chosen": -0.8522999882698059, "logits/rejected": -2.053220748901367, "logps/chosen": -265.3820495605469, "logps/rejected": -188.9971160888672, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 4.703529357910156, "rewards/margins": 10.37534236907959, "rewards/rejected": -5.671813011169434, "step": 338 }, { "epoch": 2.4954128440366974, "grad_norm": 0.15111279487609863, "learning_rate": 4.1171612325460244e-07, "logits/chosen": -0.9065884351730347, "logits/rejected": -1.9212383031845093, "logps/chosen": -279.0539245605469, "logps/rejected": -185.0900115966797, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 4.483066082000732, "rewards/margins": 9.553812026977539, "rewards/rejected": -5.070746421813965, "step": 340 }, { "epoch": 2.510091743119266, "grad_norm": 0.06084302440285683, "learning_rate": 3.8849244717325206e-07, "logits/chosen": -0.9317240715026855, "logits/rejected": -1.988271713256836, "logps/chosen": -268.8980407714844, "logps/rejected": -202.74929809570312, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 5.079599380493164, "rewards/margins": 10.533794403076172, "rewards/rejected": -5.454195022583008, "step": 342 }, { "epoch": 2.524770642201835, "grad_norm": 0.8741805553436279, "learning_rate": 3.658876556397628e-07, "logits/chosen": -1.1219009160995483, "logits/rejected": -2.1234138011932373, "logps/chosen": -255.28311157226562, "logps/rejected": -171.67091369628906, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 5.061553478240967, "rewards/margins": 10.136541366577148, "rewards/rejected": -5.074987411499023, "step": 344 }, { "epoch": 2.539449541284404, "grad_norm": 0.4372842013835907, "learning_rate": 3.4390837413656256e-07, "logits/chosen": -0.9813422560691833, "logits/rejected": -2.116903781890869, "logps/chosen": -278.31292724609375, "logps/rejected": -204.9643096923828, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 5.085036277770996, "rewards/margins": 10.61630630493164, "rewards/rejected": -5.531269073486328, "step": 346 }, { "epoch": 2.5541284403669726, "grad_norm": 1.2550814151763916, "learning_rate": 3.225610448085903e-07, "logits/chosen": -0.9581831693649292, "logits/rejected": -2.0414552688598633, "logps/chosen": -270.668701171875, "logps/rejected": -183.82034301757812, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 4.899204730987549, "rewards/margins": 10.034035682678223, "rewards/rejected": -5.134830474853516, "step": 348 }, { "epoch": 2.5688073394495414, "grad_norm": 0.03559936583042145, "learning_rate": 3.018519245750989e-07, "logits/chosen": -0.9744287729263306, "logits/rejected": -1.9595189094543457, "logps/chosen": -321.4473876953125, "logps/rejected": -223.63467407226562, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 4.675147533416748, "rewards/margins": 10.108884811401367, "rewards/rejected": -5.433738708496094, "step": 350 }, { "epoch": 2.5834862385321102, "grad_norm": 0.2746826708316803, "learning_rate": 2.817870832957459e-07, "logits/chosen": -0.8869858980178833, "logits/rejected": -2.016246795654297, "logps/chosen": -259.2815856933594, "logps/rejected": -180.26258850097656, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 5.067638874053955, "rewards/margins": 10.355432510375977, "rewards/rejected": -5.287793159484863, "step": 352 }, { "epoch": 2.598165137614679, "grad_norm": 0.17304402589797974, "learning_rate": 2.6237240199151386e-07, "logits/chosen": -1.0045228004455566, "logits/rejected": -2.091106414794922, "logps/chosen": -264.62774658203125, "logps/rejected": -172.3504638671875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 4.949882507324219, "rewards/margins": 9.181565284729004, "rewards/rejected": -4.231683254241943, "step": 354 }, { "epoch": 2.612844036697248, "grad_norm": 0.9380022883415222, "learning_rate": 2.436135711209786e-07, "logits/chosen": -1.1858479976654053, "logits/rejected": -2.1570074558258057, "logps/chosen": -279.8266296386719, "logps/rejected": -165.30809020996094, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 4.504940032958984, "rewards/margins": 9.417243957519531, "rewards/rejected": -4.912304401397705, "step": 356 }, { "epoch": 2.6275229357798167, "grad_norm": 0.6058441400527954, "learning_rate": 2.2551608891243026e-07, "logits/chosen": -1.1764850616455078, "logits/rejected": -2.1525368690490723, "logps/chosen": -352.7016296386719, "logps/rejected": -213.2824249267578, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 4.686002731323242, "rewards/margins": 9.250012397766113, "rewards/rejected": -4.564009666442871, "step": 358 }, { "epoch": 2.6422018348623855, "grad_norm": 0.14885057508945465, "learning_rate": 2.0808525975233807e-07, "logits/chosen": -0.8036705255508423, "logits/rejected": -2.0143167972564697, "logps/chosen": -282.6025085449219, "logps/rejected": -200.5447540283203, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": 4.5807576179504395, "rewards/margins": 9.530784606933594, "rewards/rejected": -4.950027942657471, "step": 360 }, { "epoch": 2.6568807339449543, "grad_norm": 0.40349748730659485, "learning_rate": 1.9132619263063144e-07, "logits/chosen": -0.8986431360244751, "logits/rejected": -2.059335231781006, "logps/chosen": -346.6067810058594, "logps/rejected": -212.26348876953125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": 5.356535911560059, "rewards/margins": 10.730274200439453, "rewards/rejected": -5.373737812042236, "step": 362 }, { "epoch": 2.671559633027523, "grad_norm": 0.02223406359553337, "learning_rate": 1.7524379964325155e-07, "logits/chosen": -0.9592161774635315, "logits/rejected": -2.094557523727417, "logps/chosen": -327.5130310058594, "logps/rejected": -203.91741943359375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 5.003190517425537, "rewards/margins": 10.261466979980469, "rewards/rejected": -5.258275985717773, "step": 364 }, { "epoch": 2.686238532110092, "grad_norm": 0.21345356106758118, "learning_rate": 1.5984279455240975e-07, "logits/chosen": -0.9917050004005432, "logits/rejected": -2.0196518898010254, "logps/chosen": -282.2841796875, "logps/rejected": -191.79910278320312, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 5.118269443511963, "rewards/margins": 10.18971061706543, "rewards/rejected": -5.071441173553467, "step": 366 }, { "epoch": 2.7009174311926607, "grad_norm": 0.0874081626534462, "learning_rate": 1.451276914049818e-07, "logits/chosen": -0.9789815545082092, "logits/rejected": -2.004281997680664, "logps/chosen": -256.3831787109375, "logps/rejected": -178.12074279785156, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 4.682791233062744, "rewards/margins": 10.118135452270508, "rewards/rejected": -5.435344219207764, "step": 368 }, { "epoch": 2.7155963302752295, "grad_norm": 0.15522974729537964, "learning_rate": 1.3110280320943692e-07, "logits/chosen": -0.89200758934021, "logits/rejected": -2.112806797027588, "logps/chosen": -271.0398254394531, "logps/rejected": -171.77919006347656, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 5.064676284790039, "rewards/margins": 10.34743881225586, "rewards/rejected": -5.28276252746582, "step": 370 }, { "epoch": 2.7302752293577983, "grad_norm": 0.5010592341423035, "learning_rate": 1.1777224067169218e-07, "logits/chosen": -0.8353609442710876, "logits/rejected": -1.9892935752868652, "logps/chosen": -279.2965087890625, "logps/rejected": -191.65855407714844, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 5.053516864776611, "rewards/margins": 10.337114334106445, "rewards/rejected": -5.283597469329834, "step": 372 }, { "epoch": 2.744954128440367, "grad_norm": 0.48516571521759033, "learning_rate": 1.0513991099025872e-07, "logits/chosen": -1.016608476638794, "logits/rejected": -2.1301956176757812, "logps/chosen": -323.5552673339844, "logps/rejected": -193.77996826171875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 4.830180644989014, "rewards/margins": 9.39232349395752, "rewards/rejected": -4.562142372131348, "step": 374 }, { "epoch": 2.759633027522936, "grad_norm": 0.18496806919574738, "learning_rate": 9.320951671104194e-08, "logits/chosen": -0.9126584529876709, "logits/rejected": -2.1175155639648438, "logps/chosen": -314.1302795410156, "logps/rejected": -191.1002960205078, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 5.795359134674072, "rewards/margins": 10.335535049438477, "rewards/rejected": -4.540175914764404, "step": 376 }, { "epoch": 2.7743119266055047, "grad_norm": 0.158527210354805, "learning_rate": 8.198455464212108e-08, "logits/chosen": -0.9621077179908752, "logits/rejected": -2.066542148590088, "logps/chosen": -293.69940185546875, "logps/rejected": -176.17442321777344, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 5.256305694580078, "rewards/margins": 10.846275329589844, "rewards/rejected": -5.589971542358398, "step": 378 }, { "epoch": 2.7889908256880735, "grad_norm": 0.11951223015785217, "learning_rate": 7.146831482883115e-08, "logits/chosen": -0.7449550628662109, "logits/rejected": -2.0898332595825195, "logps/chosen": -297.49365234375, "logps/rejected": -172.2970428466797, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.16118860244751, "rewards/margins": 11.0027437210083, "rewards/rejected": -5.841555595397949, "step": 380 }, { "epoch": 2.8036697247706424, "grad_norm": 0.32903870940208435, "learning_rate": 6.16638795894492e-08, "logits/chosen": -0.9001256823539734, "logits/rejected": -1.9853109121322632, "logps/chosen": -261.6986389160156, "logps/rejected": -200.32876586914062, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 5.192695617675781, "rewards/margins": 10.185223579406738, "rewards/rejected": -4.992527961730957, "step": 382 }, { "epoch": 2.818348623853211, "grad_norm": 1.1356521844863892, "learning_rate": 5.257412261176375e-08, "logits/chosen": -1.0478947162628174, "logits/rejected": -2.031193971633911, "logps/chosen": -272.9176025390625, "logps/rejected": -191.03363037109375, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 5.408777236938477, "rewards/margins": 10.281020164489746, "rewards/rejected": -4.8722429275512695, "step": 384 }, { "epoch": 2.83302752293578, "grad_norm": 0.36936327815055847, "learning_rate": 4.4201708110795384e-08, "logits/chosen": -0.9411278963088989, "logits/rejected": -1.9795866012573242, "logps/chosen": -292.65386962890625, "logps/rejected": -204.53778076171875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 4.8684539794921875, "rewards/margins": 9.500937461853027, "rewards/rejected": -4.632482528686523, "step": 386 }, { "epoch": 2.847706422018349, "grad_norm": 0.16482090950012207, "learning_rate": 3.654909004791152e-08, "logits/chosen": -0.938539981842041, "logits/rejected": -2.1424248218536377, "logps/chosen": -293.4642333984375, "logps/rejected": -184.5415802001953, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 4.669407844543457, "rewards/margins": 10.090935707092285, "rewards/rejected": -5.421527862548828, "step": 388 }, { "epoch": 2.8623853211009176, "grad_norm": 0.13458868861198425, "learning_rate": 2.9618511411570462e-08, "logits/chosen": -1.0025708675384521, "logits/rejected": -2.083418607711792, "logps/chosen": -284.4067687988281, "logps/rejected": -172.37875366210938, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 4.38177490234375, "rewards/margins": 9.599076271057129, "rewards/rejected": -5.2173004150390625, "step": 390 }, { "epoch": 2.8770642201834864, "grad_norm": 1.0483838319778442, "learning_rate": 2.3412003559898088e-08, "logits/chosen": -0.8990004658699036, "logits/rejected": -1.8701345920562744, "logps/chosen": -271.26129150390625, "logps/rejected": -207.7753143310547, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 4.918183326721191, "rewards/margins": 9.391782760620117, "rewards/rejected": -4.473598957061768, "step": 392 }, { "epoch": 2.891743119266055, "grad_norm": 0.25216034054756165, "learning_rate": 1.793138562529634e-08, "logits/chosen": -0.971919059753418, "logits/rejected": -2.1569983959198, "logps/chosen": -346.71875, "logps/rejected": -184.22348022460938, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.533580780029297, "rewards/margins": 10.083108901977539, "rewards/rejected": -4.549527168273926, "step": 394 }, { "epoch": 2.906422018348624, "grad_norm": 0.8910009860992432, "learning_rate": 1.317826398125277e-08, "logits/chosen": -1.062324047088623, "logits/rejected": -2.1035232543945312, "logps/chosen": -293.03125, "logps/rejected": -204.35723876953125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": 5.314833641052246, "rewards/margins": 10.953380584716797, "rewards/rejected": -5.638547420501709, "step": 396 }, { "epoch": 2.921100917431193, "grad_norm": 0.3026532828807831, "learning_rate": 9.15403177151275e-09, "logits/chosen": -0.9711483716964722, "logits/rejected": -1.8983428478240967, "logps/chosen": -276.14398193359375, "logps/rejected": -217.97817993164062, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 5.347715377807617, "rewards/margins": 10.437253952026367, "rewards/rejected": -5.08953857421875, "step": 398 }, { "epoch": 2.9357798165137616, "grad_norm": 0.21673916280269623, "learning_rate": 5.85986850174608e-09, "logits/chosen": -0.8715996146202087, "logits/rejected": -2.193289279937744, "logps/chosen": -312.49847412109375, "logps/rejected": -185.29078674316406, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 5.40484619140625, "rewards/margins": 10.567187309265137, "rewards/rejected": -5.162341117858887, "step": 400 }, { "epoch": 2.9504587155963304, "grad_norm": 0.18232221901416779, "learning_rate": 3.296739693834927e-09, "logits/chosen": -1.094886302947998, "logits/rejected": -1.9882696866989136, "logps/chosen": -305.0465087890625, "logps/rejected": -184.07928466796875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 4.589092254638672, "rewards/margins": 9.142921447753906, "rewards/rejected": -4.553828239440918, "step": 402 }, { "epoch": 2.9651376146788992, "grad_norm": 0.28508853912353516, "learning_rate": 1.4653966028774225e-09, "logits/chosen": -0.9431482553482056, "logits/rejected": -1.953324556350708, "logps/chosen": -313.6567077636719, "logps/rejected": -213.5366973876953, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 4.910001277923584, "rewards/margins": 10.507518768310547, "rewards/rejected": -5.597517490386963, "step": 404 }, { "epoch": 2.979816513761468, "grad_norm": 0.14594300091266632, "learning_rate": 3.6637599699351766e-10, "logits/chosen": -0.940761387348175, "logits/rejected": -2.1918911933898926, "logps/chosen": -289.67626953125, "logps/rejected": -180.724365234375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 5.103169918060303, "rewards/margins": 9.98257827758789, "rewards/rejected": -4.879408359527588, "step": 406 }, { "epoch": 2.994495412844037, "grad_norm": 0.16170361638069153, "learning_rate": 0.0, "logits/chosen": -1.0162718296051025, "logits/rejected": -1.9849637746810913, "logps/chosen": -319.7056579589844, "logps/rejected": -213.12435913085938, "loss": 0.0316, "rewards/accuracies": 0.984375, "rewards/chosen": 5.299257755279541, "rewards/margins": 10.258605003356934, "rewards/rejected": -4.959346771240234, "step": 408 }, { "epoch": 2.994495412844037, "step": 408, "total_flos": 7.837376281021809e+17, "train_loss": 0.11720214437923905, "train_runtime": 8069.9016, "train_samples_per_second": 1.62, "train_steps_per_second": 0.051 } ], "logging_steps": 2, "max_steps": 408, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.837376281021809e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }