dpo_06230018_policy2_0.6 / trainer_state.json
WDong's picture
Upload 17 files
9c7174e verified
raw
history blame
107 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.994495412844037,
"eval_steps": 500,
"global_step": 408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014678899082568808,
"grad_norm": 11.81737232208252,
"learning_rate": 2.439024390243903e-07,
"logits/chosen": -0.9879676103591919,
"logits/rejected": -1.9993298053741455,
"logps/chosen": -269.27239990234375,
"logps/rejected": -186.47621154785156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.029357798165137616,
"grad_norm": 11.950206756591797,
"learning_rate": 4.878048780487805e-07,
"logits/chosen": -1.0342975854873657,
"logits/rejected": -1.9880424737930298,
"logps/chosen": -290.81072998046875,
"logps/rejected": -204.50514221191406,
"loss": 0.712,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.009406615048646927,
"rewards/margins": 0.020395996049046516,
"rewards/rejected": -0.029802614822983742,
"step": 4
},
{
"epoch": 0.044036697247706424,
"grad_norm": 11.719453811645508,
"learning_rate": 7.317073170731707e-07,
"logits/chosen": -1.1187832355499268,
"logits/rejected": -2.125272750854492,
"logps/chosen": -295.85894775390625,
"logps/rejected": -203.1645050048828,
"loss": 0.6642,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.07618961483240128,
"rewards/margins": 0.09595101326704025,
"rewards/rejected": -0.01976138912141323,
"step": 6
},
{
"epoch": 0.05871559633027523,
"grad_norm": 12.171574592590332,
"learning_rate": 9.75609756097561e-07,
"logits/chosen": -1.250899076461792,
"logits/rejected": -2.1083037853240967,
"logps/chosen": -252.51145935058594,
"logps/rejected": -164.40138244628906,
"loss": 0.7179,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.044141992926597595,
"rewards/margins": 0.003628704696893692,
"rewards/rejected": 0.040513284504413605,
"step": 8
},
{
"epoch": 0.07339449541284404,
"grad_norm": 13.125951766967773,
"learning_rate": 1.2195121951219514e-06,
"logits/chosen": -1.0734999179840088,
"logits/rejected": -2.204047679901123,
"logps/chosen": -306.6387939453125,
"logps/rejected": -158.70912170410156,
"loss": 0.7397,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.0127907395362854,
"rewards/margins": -0.031406134366989136,
"rewards/rejected": 0.01861538738012314,
"step": 10
},
{
"epoch": 0.08807339449541285,
"grad_norm": 15.099513053894043,
"learning_rate": 1.4634146341463414e-06,
"logits/chosen": -1.0108157396316528,
"logits/rejected": -1.977769374847412,
"logps/chosen": -344.31402587890625,
"logps/rejected": -223.3643798828125,
"loss": 0.7584,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06078364700078964,
"rewards/margins": -0.0867958813905716,
"rewards/rejected": 0.026012245565652847,
"step": 12
},
{
"epoch": 0.10275229357798166,
"grad_norm": 13.785890579223633,
"learning_rate": 1.707317073170732e-06,
"logits/chosen": -0.9762290120124817,
"logits/rejected": -1.9721505641937256,
"logps/chosen": -259.58258056640625,
"logps/rejected": -167.8755645751953,
"loss": 0.7235,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.023126909509301186,
"rewards/margins": -0.02050386182963848,
"rewards/rejected": 0.043630778789520264,
"step": 14
},
{
"epoch": 0.11743119266055047,
"grad_norm": 13.014513969421387,
"learning_rate": 1.951219512195122e-06,
"logits/chosen": -1.1472342014312744,
"logits/rejected": -2.0296616554260254,
"logps/chosen": -269.7952575683594,
"logps/rejected": -186.65452575683594,
"loss": 0.7405,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.004093457013368607,
"rewards/margins": -0.044701721519231796,
"rewards/rejected": 0.0487951785326004,
"step": 16
},
{
"epoch": 0.13211009174311927,
"grad_norm": 12.20093059539795,
"learning_rate": 2.1951219512195125e-06,
"logits/chosen": -1.0266412496566772,
"logits/rejected": -2.0891737937927246,
"logps/chosen": -313.8085021972656,
"logps/rejected": -197.85943603515625,
"loss": 0.6731,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.09984610974788666,
"rewards/margins": 0.0959281176328659,
"rewards/rejected": 0.003917992115020752,
"step": 18
},
{
"epoch": 0.14678899082568808,
"grad_norm": 12.344905853271484,
"learning_rate": 2.4390243902439027e-06,
"logits/chosen": -1.0662198066711426,
"logits/rejected": -2.0889832973480225,
"logps/chosen": -308.8189697265625,
"logps/rejected": -156.6934814453125,
"loss": 0.6784,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.08540838956832886,
"rewards/margins": 0.07292439043521881,
"rewards/rejected": 0.012484000064432621,
"step": 20
},
{
"epoch": 0.1614678899082569,
"grad_norm": 11.898660659790039,
"learning_rate": 2.682926829268293e-06,
"logits/chosen": -1.2143007516860962,
"logits/rejected": -2.262324571609497,
"logps/chosen": -298.7814636230469,
"logps/rejected": -186.76119995117188,
"loss": 0.6781,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.08213196694850922,
"rewards/margins": 0.08361663669347763,
"rewards/rejected": -0.0014846734702587128,
"step": 22
},
{
"epoch": 0.1761467889908257,
"grad_norm": 13.68064022064209,
"learning_rate": 2.926829268292683e-06,
"logits/chosen": -1.0233314037322998,
"logits/rejected": -2.1899986267089844,
"logps/chosen": -370.8209228515625,
"logps/rejected": -156.96270751953125,
"loss": 0.7306,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.007536953315138817,
"rewards/margins": -0.009052609093487263,
"rewards/rejected": 0.0015156615991145372,
"step": 24
},
{
"epoch": 0.1908256880733945,
"grad_norm": 12.661199569702148,
"learning_rate": 3.1707317073170736e-06,
"logits/chosen": -1.2463735342025757,
"logits/rejected": -2.1673622131347656,
"logps/chosen": -326.9246520996094,
"logps/rejected": -182.17701721191406,
"loss": 0.7175,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.010405808687210083,
"rewards/margins": 0.007835682481527328,
"rewards/rejected": -0.01824149303138256,
"step": 26
},
{
"epoch": 0.20550458715596331,
"grad_norm": 11.61974811553955,
"learning_rate": 3.414634146341464e-06,
"logits/chosen": -1.1716669797897339,
"logits/rejected": -2.2106716632843018,
"logps/chosen": -284.443603515625,
"logps/rejected": -165.102783203125,
"loss": 0.7409,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.007994448766112328,
"rewards/margins": -0.035433441400527954,
"rewards/rejected": 0.04342789575457573,
"step": 28
},
{
"epoch": 0.22018348623853212,
"grad_norm": 10.777989387512207,
"learning_rate": 3.6585365853658537e-06,
"logits/chosen": -1.0662914514541626,
"logits/rejected": -2.1156551837921143,
"logps/chosen": -289.4057922363281,
"logps/rejected": -197.46649169921875,
"loss": 0.6371,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.1247626319527626,
"rewards/margins": 0.15936096012592316,
"rewards/rejected": -0.03459831699728966,
"step": 30
},
{
"epoch": 0.23486238532110093,
"grad_norm": 12.190910339355469,
"learning_rate": 3.902439024390244e-06,
"logits/chosen": -1.1755316257476807,
"logits/rejected": -2.1449058055877686,
"logps/chosen": -288.5774841308594,
"logps/rejected": -163.59588623046875,
"loss": 0.6733,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.09892146289348602,
"rewards/margins": 0.08179756253957748,
"rewards/rejected": 0.01712390035390854,
"step": 32
},
{
"epoch": 0.24954128440366974,
"grad_norm": 13.154803276062012,
"learning_rate": 4.146341463414634e-06,
"logits/chosen": -1.1496777534484863,
"logits/rejected": -2.2045750617980957,
"logps/chosen": -324.6558837890625,
"logps/rejected": -164.45327758789062,
"loss": 0.6478,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.08885271847248077,
"rewards/margins": 0.1558375358581543,
"rewards/rejected": -0.06698483228683472,
"step": 34
},
{
"epoch": 0.26422018348623855,
"grad_norm": 11.07314682006836,
"learning_rate": 4.390243902439025e-06,
"logits/chosen": -1.1677134037017822,
"logits/rejected": -2.0850350856781006,
"logps/chosen": -311.2884216308594,
"logps/rejected": -204.43142700195312,
"loss": 0.6193,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.20451843738555908,
"rewards/margins": 0.22681473195552826,
"rewards/rejected": -0.02229629084467888,
"step": 36
},
{
"epoch": 0.27889908256880735,
"grad_norm": 12.431696891784668,
"learning_rate": 4.634146341463416e-06,
"logits/chosen": -1.195428729057312,
"logits/rejected": -2.197521686553955,
"logps/chosen": -294.04962158203125,
"logps/rejected": -200.2810516357422,
"loss": 0.6598,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.15507417917251587,
"rewards/margins": 0.11412172019481659,
"rewards/rejected": 0.04095245152711868,
"step": 38
},
{
"epoch": 0.29357798165137616,
"grad_norm": 11.575589179992676,
"learning_rate": 4.8780487804878055e-06,
"logits/chosen": -1.0411652326583862,
"logits/rejected": -2.03951096534729,
"logps/chosen": -345.9762268066406,
"logps/rejected": -181.34144592285156,
"loss": 0.6186,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.14570173621177673,
"rewards/margins": 0.20660607516765594,
"rewards/rejected": -0.06090431660413742,
"step": 40
},
{
"epoch": 0.30825688073394497,
"grad_norm": 12.716486930847168,
"learning_rate": 4.999908404322799e-06,
"logits/chosen": -1.0371800661087036,
"logits/rejected": -2.2317895889282227,
"logps/chosen": -319.42755126953125,
"logps/rejected": -172.60479736328125,
"loss": 0.6248,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.13961191475391388,
"rewards/margins": 0.19473902881145477,
"rewards/rejected": -0.055127132683992386,
"step": 42
},
{
"epoch": 0.3229357798165138,
"grad_norm": 10.400399208068848,
"learning_rate": 4.999175679175577e-06,
"logits/chosen": -1.1097325086593628,
"logits/rejected": -2.1328647136688232,
"logps/chosen": -251.92745971679688,
"logps/rejected": -161.21292114257812,
"loss": 0.5849,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.25156256556510925,
"rewards/margins": 0.2756442427635193,
"rewards/rejected": -0.02408166043460369,
"step": 44
},
{
"epoch": 0.3376146788990826,
"grad_norm": 11.084893226623535,
"learning_rate": 4.997710443643461e-06,
"logits/chosen": -1.1712064743041992,
"logits/rejected": -2.0722293853759766,
"logps/chosen": -259.9323425292969,
"logps/rejected": -206.37510681152344,
"loss": 0.6109,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.22726279497146606,
"rewards/margins": 0.24400296807289124,
"rewards/rejected": -0.016740169376134872,
"step": 46
},
{
"epoch": 0.3522935779816514,
"grad_norm": 13.230236053466797,
"learning_rate": 4.995513127188151e-06,
"logits/chosen": -1.0816175937652588,
"logits/rejected": -2.215028762817383,
"logps/chosen": -365.7675476074219,
"logps/rejected": -183.13980102539062,
"loss": 0.5456,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.34264349937438965,
"rewards/margins": 0.38213008642196655,
"rewards/rejected": -0.03948655351996422,
"step": 48
},
{
"epoch": 0.3669724770642202,
"grad_norm": 11.37851333618164,
"learning_rate": 4.992584373844853e-06,
"logits/chosen": -1.2096611261367798,
"logits/rejected": -2.082951784133911,
"logps/chosen": -345.7232971191406,
"logps/rejected": -184.25949096679688,
"loss": 0.5091,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.413723886013031,
"rewards/margins": 0.4999043643474579,
"rewards/rejected": -0.08618048578500748,
"step": 50
},
{
"epoch": 0.381651376146789,
"grad_norm": 9.676469802856445,
"learning_rate": 4.98892504203351e-06,
"logits/chosen": -1.2248896360397339,
"logits/rejected": -2.1341745853424072,
"logps/chosen": -282.0457763671875,
"logps/rejected": -158.89736938476562,
"loss": 0.501,
"rewards/accuracies": 0.921875,
"rewards/chosen": 0.42736518383026123,
"rewards/margins": 0.5060732960700989,
"rewards/rejected": -0.07870808988809586,
"step": 52
},
{
"epoch": 0.3963302752293578,
"grad_norm": 9.402766227722168,
"learning_rate": 4.9845362043071925e-06,
"logits/chosen": -1.0192848443984985,
"logits/rejected": -2.0682382583618164,
"logps/chosen": -290.6011962890625,
"logps/rejected": -163.6627197265625,
"loss": 0.4541,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.5030941963195801,
"rewards/margins": 0.6462306380271912,
"rewards/rejected": -0.14313644170761108,
"step": 54
},
{
"epoch": 0.41100917431192663,
"grad_norm": 10.944356918334961,
"learning_rate": 4.97941914703774e-06,
"logits/chosen": -1.1482800245285034,
"logits/rejected": -2.151231050491333,
"logps/chosen": -287.7913513183594,
"logps/rejected": -201.2919464111328,
"loss": 0.4487,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.6401927471160889,
"rewards/margins": 0.7009615898132324,
"rewards/rejected": -0.06076894700527191,
"step": 56
},
{
"epoch": 0.42568807339449544,
"grad_norm": 8.618446350097656,
"learning_rate": 4.973575370038718e-06,
"logits/chosen": -1.0707895755767822,
"logits/rejected": -2.049323558807373,
"logps/chosen": -305.2084045410156,
"logps/rejected": -193.321533203125,
"loss": 0.3851,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.8177109956741333,
"rewards/margins": 0.9303702116012573,
"rewards/rejected": -0.11265924572944641,
"step": 58
},
{
"epoch": 0.44036697247706424,
"grad_norm": 7.712850093841553,
"learning_rate": 4.967006586125827e-06,
"logits/chosen": -1.240044355392456,
"logits/rejected": -2.0774481296539307,
"logps/chosen": -301.3046569824219,
"logps/rejected": -186.58460998535156,
"loss": 0.35,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.9353222846984863,
"rewards/margins": 1.0043295621871948,
"rewards/rejected": -0.06900733709335327,
"step": 60
},
{
"epoch": 0.45504587155963305,
"grad_norm": 8.133475303649902,
"learning_rate": 4.959714720614871e-06,
"logits/chosen": -1.1756389141082764,
"logits/rejected": -2.2198028564453125,
"logps/chosen": -319.236083984375,
"logps/rejected": -184.04647827148438,
"loss": 0.3239,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.9475828409194946,
"rewards/margins": 1.177114725112915,
"rewards/rejected": -0.22953176498413086,
"step": 62
},
{
"epoch": 0.46972477064220186,
"grad_norm": 6.613894462585449,
"learning_rate": 4.951701910757446e-06,
"logits/chosen": -1.1599823236465454,
"logits/rejected": -2.064751148223877,
"logps/chosen": -253.94537353515625,
"logps/rejected": -188.87652587890625,
"loss": 0.3088,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.0339241027832031,
"rewards/margins": 1.2678444385528564,
"rewards/rejected": -0.2339203655719757,
"step": 64
},
{
"epoch": 0.48440366972477067,
"grad_norm": 8.49493408203125,
"learning_rate": 4.942970505114514e-06,
"logits/chosen": -1.0440397262573242,
"logits/rejected": -2.1136162281036377,
"logps/chosen": -308.4583435058594,
"logps/rejected": -176.34474182128906,
"loss": 0.268,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.1736990213394165,
"rewards/margins": 1.4280885457992554,
"rewards/rejected": -0.25438952445983887,
"step": 66
},
{
"epoch": 0.4990825688073395,
"grad_norm": 6.022420883178711,
"learning_rate": 4.933523062868033e-06,
"logits/chosen": -1.0774444341659546,
"logits/rejected": -2.1658172607421875,
"logps/chosen": -269.4661560058594,
"logps/rejected": -164.7786865234375,
"loss": 0.2372,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.3157860040664673,
"rewards/margins": 1.5686390399932861,
"rewards/rejected": -0.25285303592681885,
"step": 68
},
{
"epoch": 0.5137614678899083,
"grad_norm": 4.839372634887695,
"learning_rate": 4.923362353070859e-06,
"logits/chosen": -0.8954001665115356,
"logits/rejected": -2.1572980880737305,
"logps/chosen": -287.38250732421875,
"logps/rejected": -159.82025146484375,
"loss": 0.2079,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.4153721332550049,
"rewards/margins": 1.8501354455947876,
"rewards/rejected": -0.43476346135139465,
"step": 70
},
{
"epoch": 0.5284403669724771,
"grad_norm": 5.355666160583496,
"learning_rate": 4.912491353835138e-06,
"logits/chosen": -1.1590656042099,
"logits/rejected": -2.088367462158203,
"logps/chosen": -260.02386474609375,
"logps/rejected": -185.47396850585938,
"loss": 0.2185,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.4196313619613647,
"rewards/margins": 1.858705997467041,
"rewards/rejected": -0.43907448649406433,
"step": 72
},
{
"epoch": 0.5431192660550459,
"grad_norm": 4.641209602355957,
"learning_rate": 4.900913251459418e-06,
"logits/chosen": -1.0761524438858032,
"logits/rejected": -2.0451908111572266,
"logps/chosen": -264.9051513671875,
"logps/rejected": -173.16702270507812,
"loss": 0.1769,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5420759916305542,
"rewards/margins": 2.071654796600342,
"rewards/rejected": -0.5295785069465637,
"step": 74
},
{
"epoch": 0.5577981651376147,
"grad_norm": 4.564330101013184,
"learning_rate": 4.8886314394947396e-06,
"logits/chosen": -0.9936952590942383,
"logits/rejected": -2.070539951324463,
"logps/chosen": -278.8867492675781,
"logps/rejected": -185.91055297851562,
"loss": 0.1608,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.9348686933517456,
"rewards/margins": 2.52958083152771,
"rewards/rejected": -0.5947118997573853,
"step": 76
},
{
"epoch": 0.5724770642201835,
"grad_norm": 5.782593250274658,
"learning_rate": 4.875649517749985e-06,
"logits/chosen": -1.0427924394607544,
"logits/rejected": -2.180347442626953,
"logps/chosen": -282.06732177734375,
"logps/rejected": -191.30137634277344,
"loss": 0.1548,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.9499953985214233,
"rewards/margins": 2.6421873569488525,
"rewards/rejected": -0.6921918392181396,
"step": 78
},
{
"epoch": 0.5871559633027523,
"grad_norm": 4.356126308441162,
"learning_rate": 4.861971291236772e-06,
"logits/chosen": -1.134873390197754,
"logits/rejected": -2.047222852706909,
"logps/chosen": -328.65509033203125,
"logps/rejected": -191.76483154296875,
"loss": 0.1841,
"rewards/accuracies": 0.9375,
"rewards/chosen": 2.2117769718170166,
"rewards/margins": 2.5882744789123535,
"rewards/rejected": -0.37649768590927124,
"step": 80
},
{
"epoch": 0.6018348623853211,
"grad_norm": 3.70808482170105,
"learning_rate": 4.847600769054201e-06,
"logits/chosen": -1.1773045063018799,
"logits/rejected": -2.071323871612549,
"logps/chosen": -365.7237243652344,
"logps/rejected": -221.5764923095703,
"loss": 0.1093,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.4926247596740723,
"rewards/margins": 3.0623347759246826,
"rewards/rejected": -0.5697098970413208,
"step": 82
},
{
"epoch": 0.6165137614678899,
"grad_norm": 2.8207852840423584,
"learning_rate": 4.832542163213787e-06,
"logits/chosen": -1.0239057540893555,
"logits/rejected": -2.1960628032684326,
"logps/chosen": -261.3912658691406,
"logps/rejected": -155.67286682128906,
"loss": 0.1073,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.281486749649048,
"rewards/margins": 3.123940944671631,
"rewards/rejected": -0.8424541354179382,
"step": 84
},
{
"epoch": 0.6311926605504588,
"grad_norm": 2.6905996799468994,
"learning_rate": 4.816799887404911e-06,
"logits/chosen": -1.2185587882995605,
"logits/rejected": -2.146491289138794,
"logps/chosen": -300.77069091796875,
"logps/rejected": -185.7276153564453,
"loss": 0.1277,
"rewards/accuracies": 0.96875,
"rewards/chosen": 2.3313047885894775,
"rewards/margins": 3.0290822982788086,
"rewards/rejected": -0.6977773904800415,
"step": 86
},
{
"epoch": 0.6458715596330276,
"grad_norm": 1.891965389251709,
"learning_rate": 4.800378555701168e-06,
"logits/chosen": -1.056377649307251,
"logits/rejected": -2.001763343811035,
"logps/chosen": -354.14990234375,
"logps/rejected": -186.62448120117188,
"loss": 0.1089,
"rewards/accuracies": 0.953125,
"rewards/chosen": 2.5092077255249023,
"rewards/margins": 3.3948686122894287,
"rewards/rejected": -0.8856609463691711,
"step": 88
},
{
"epoch": 0.6605504587155964,
"grad_norm": 3.7145261764526367,
"learning_rate": 4.783282981207979e-06,
"logits/chosen": -1.1021761894226074,
"logits/rejected": -2.2725181579589844,
"logps/chosen": -296.32763671875,
"logps/rejected": -169.7439727783203,
"loss": 0.0866,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.6951088905334473,
"rewards/margins": 3.6553006172180176,
"rewards/rejected": -0.9601919054985046,
"step": 90
},
{
"epoch": 0.6752293577981652,
"grad_norm": 2.50156307220459,
"learning_rate": 4.765518174651864e-06,
"logits/chosen": -1.1074126958847046,
"logits/rejected": -2.051131248474121,
"logps/chosen": -285.9756164550781,
"logps/rejected": -190.58448791503906,
"loss": 0.0852,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.7018895149230957,
"rewards/margins": 3.8313865661621094,
"rewards/rejected": -1.1294972896575928,
"step": 92
},
{
"epoch": 0.689908256880734,
"grad_norm": 2.272671699523926,
"learning_rate": 4.747089342911793e-06,
"logits/chosen": -0.9693321585655212,
"logits/rejected": -2.168473720550537,
"logps/chosen": -291.7270812988281,
"logps/rejected": -175.2049560546875,
"loss": 0.0446,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.954824209213257,
"rewards/margins": 4.128055095672607,
"rewards/rejected": -1.1732308864593506,
"step": 94
},
{
"epoch": 0.7045871559633028,
"grad_norm": 2.2310574054718018,
"learning_rate": 4.728001887493048e-06,
"logits/chosen": -0.9781808853149414,
"logits/rejected": -2.155506134033203,
"logps/chosen": -299.66314697265625,
"logps/rejected": -194.56436157226562,
"loss": 0.067,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.1443114280700684,
"rewards/margins": 4.226352214813232,
"rewards/rejected": -1.0820410251617432,
"step": 96
},
{
"epoch": 0.7192660550458716,
"grad_norm": 1.7269368171691895,
"learning_rate": 4.708261402944036e-06,
"logits/chosen": -1.0619006156921387,
"logits/rejected": -2.1256189346313477,
"logps/chosen": -315.5987548828125,
"logps/rejected": -188.52439880371094,
"loss": 0.06,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3185007572174072,
"rewards/margins": 4.740314960479736,
"rewards/rejected": -1.4218144416809082,
"step": 98
},
{
"epoch": 0.7339449541284404,
"grad_norm": 3.1934289932250977,
"learning_rate": 4.687873675216522e-06,
"logits/chosen": -0.9534860253334045,
"logits/rejected": -1.9718412160873413,
"logps/chosen": -303.17181396484375,
"logps/rejected": -199.40789794921875,
"loss": 0.0892,
"rewards/accuracies": 0.953125,
"rewards/chosen": 3.4963011741638184,
"rewards/margins": 4.650891304016113,
"rewards/rejected": -1.1545898914337158,
"step": 100
},
{
"epoch": 0.7486238532110092,
"grad_norm": 1.0567034482955933,
"learning_rate": 4.666844679969765e-06,
"logits/chosen": -1.287552833557129,
"logits/rejected": -2.272284507751465,
"logps/chosen": -299.2529296875,
"logps/rejected": -208.53785705566406,
"loss": 0.0373,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.132131814956665,
"rewards/margins": 4.880558967590332,
"rewards/rejected": -1.748427152633667,
"step": 102
},
{
"epoch": 0.763302752293578,
"grad_norm": 1.3455036878585815,
"learning_rate": 4.6451805808190464e-06,
"logits/chosen": -1.049391508102417,
"logits/rejected": -2.1182594299316406,
"logps/chosen": -284.2237548828125,
"logps/rejected": -176.08627319335938,
"loss": 0.0317,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.437601089477539,
"rewards/margins": 5.258786678314209,
"rewards/rejected": -1.821185827255249,
"step": 104
},
{
"epoch": 0.7779816513761468,
"grad_norm": 1.6037604808807373,
"learning_rate": 4.622887727529104e-06,
"logits/chosen": -1.0589053630828857,
"logits/rejected": -2.095472812652588,
"logps/chosen": -257.8381042480469,
"logps/rejected": -207.0792236328125,
"loss": 0.029,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.2390363216400146,
"rewards/margins": 5.350650310516357,
"rewards/rejected": -2.1116137504577637,
"step": 106
},
{
"epoch": 0.7926605504587156,
"grad_norm": 1.9005062580108643,
"learning_rate": 4.599972654153018e-06,
"logits/chosen": -0.9298142194747925,
"logits/rejected": -2.0814666748046875,
"logps/chosen": -301.68865966796875,
"logps/rejected": -174.01010131835938,
"loss": 0.0311,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7089574337005615,
"rewards/margins": 5.569860458374023,
"rewards/rejected": -1.8609036207199097,
"step": 108
},
{
"epoch": 0.8073394495412844,
"grad_norm": 0.712770402431488,
"learning_rate": 4.5764420771170735e-06,
"logits/chosen": -0.9678480625152588,
"logits/rejected": -2.0447123050689697,
"logps/chosen": -278.64398193359375,
"logps/rejected": -192.5853729248047,
"loss": 0.0215,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.513016939163208,
"rewards/margins": 5.6180419921875,
"rewards/rejected": -2.105024814605713,
"step": 110
},
{
"epoch": 0.8220183486238533,
"grad_norm": 1.3919163942337036,
"learning_rate": 4.552302893252166e-06,
"logits/chosen": -1.2199370861053467,
"logits/rejected": -2.197056293487549,
"logps/chosen": -306.26080322265625,
"logps/rejected": -205.06845092773438,
"loss": 0.0296,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.367818832397461,
"rewards/margins": 5.316436290740967,
"rewards/rejected": -1.948617696762085,
"step": 112
},
{
"epoch": 0.8366972477064221,
"grad_norm": 3.037362575531006,
"learning_rate": 4.52756217777234e-06,
"logits/chosen": -1.2299991846084595,
"logits/rejected": -2.1640126705169678,
"logps/chosen": -311.70574951171875,
"logps/rejected": -207.38746643066406,
"loss": 0.0398,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.595170021057129,
"rewards/margins": 5.463950157165527,
"rewards/rejected": -1.8687800168991089,
"step": 114
},
{
"epoch": 0.8513761467889909,
"grad_norm": 0.8069730401039124,
"learning_rate": 4.502227182201035e-06,
"logits/chosen": -0.9528835415840149,
"logits/rejected": -1.977004051208496,
"logps/chosen": -264.5509033203125,
"logps/rejected": -174.93551635742188,
"loss": 0.0227,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.867180109024048,
"rewards/margins": 6.007584571838379,
"rewards/rejected": -2.140403985977173,
"step": 116
},
{
"epoch": 0.8660550458715597,
"grad_norm": 1.4102082252502441,
"learning_rate": 4.476305332245662e-06,
"logits/chosen": -1.0918750762939453,
"logits/rejected": -2.3146743774414062,
"logps/chosen": -314.5960998535156,
"logps/rejected": -152.3535614013672,
"loss": 0.028,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.734642744064331,
"rewards/margins": 6.135974407196045,
"rewards/rejected": -2.4013314247131348,
"step": 118
},
{
"epoch": 0.8807339449541285,
"grad_norm": 2.8867928981781006,
"learning_rate": 4.449804225621116e-06,
"logits/chosen": -1.0288662910461426,
"logits/rejected": -2.0701658725738525,
"logps/chosen": -279.2713317871094,
"logps/rejected": -180.374267578125,
"loss": 0.0485,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.5982298851013184,
"rewards/margins": 5.633719444274902,
"rewards/rejected": -2.0354888439178467,
"step": 120
},
{
"epoch": 0.8954128440366973,
"grad_norm": 0.7778434753417969,
"learning_rate": 4.422731629822887e-06,
"logits/chosen": -0.9540915489196777,
"logits/rejected": -1.9875534772872925,
"logps/chosen": -314.85003662109375,
"logps/rejected": -194.16896057128906,
"loss": 0.0315,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.723828077316284,
"rewards/margins": 6.088706016540527,
"rewards/rejected": -2.364877462387085,
"step": 122
},
{
"epoch": 0.9100917431192661,
"grad_norm": 1.9667764902114868,
"learning_rate": 4.395095479850396e-06,
"logits/chosen": -0.9676120281219482,
"logits/rejected": -1.9072697162628174,
"logps/chosen": -287.99981689453125,
"logps/rejected": -186.82659912109375,
"loss": 0.0548,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.7472550868988037,
"rewards/margins": 6.025314807891846,
"rewards/rejected": -2.278059482574463,
"step": 124
},
{
"epoch": 0.9247706422018349,
"grad_norm": 0.4268924593925476,
"learning_rate": 4.366903875881243e-06,
"logits/chosen": -1.0968043804168701,
"logits/rejected": -2.334925651550293,
"logps/chosen": -275.3115234375,
"logps/rejected": -164.202392578125,
"loss": 0.0128,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.771523952484131,
"rewards/margins": 6.7282609939575195,
"rewards/rejected": -2.9567372798919678,
"step": 126
},
{
"epoch": 0.9394495412844037,
"grad_norm": 1.4270014762878418,
"learning_rate": 4.3381650808970365e-06,
"logits/chosen": -1.0460113286972046,
"logits/rejected": -1.9695379734039307,
"logps/chosen": -254.8202667236328,
"logps/rejected": -185.63243103027344,
"loss": 0.0226,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.706533432006836,
"rewards/margins": 6.099806785583496,
"rewards/rejected": -2.39327335357666,
"step": 128
},
{
"epoch": 0.9541284403669725,
"grad_norm": 0.6754117012023926,
"learning_rate": 4.308887518261507e-06,
"logits/chosen": -0.8909565210342407,
"logits/rejected": -1.9432121515274048,
"logps/chosen": -278.40216064453125,
"logps/rejected": -195.16552734375,
"loss": 0.0194,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.237884521484375,
"rewards/margins": 6.639657974243164,
"rewards/rejected": -2.4017739295959473,
"step": 130
},
{
"epoch": 0.9688073394495413,
"grad_norm": 0.7388483285903931,
"learning_rate": 4.279079769251617e-06,
"logits/chosen": -1.2244815826416016,
"logits/rejected": -2.1885085105895996,
"logps/chosen": -351.21783447265625,
"logps/rejected": -210.98890686035156,
"loss": 0.0184,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.041647911071777,
"rewards/margins": 6.690797328948975,
"rewards/rejected": -2.6491494178771973,
"step": 132
},
{
"epoch": 0.9834862385321101,
"grad_norm": 0.7370263934135437,
"learning_rate": 4.248750570542373e-06,
"logits/chosen": -1.0081679821014404,
"logits/rejected": -2.0711734294891357,
"logps/chosen": -272.2639465332031,
"logps/rejected": -179.82412719726562,
"loss": 0.0231,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.885261058807373,
"rewards/margins": 6.468730449676514,
"rewards/rejected": -2.5834696292877197,
"step": 134
},
{
"epoch": 0.998165137614679,
"grad_norm": 2.1839847564697266,
"learning_rate": 4.21790881164611e-06,
"logits/chosen": -0.9589763879776001,
"logits/rejected": -2.103942394256592,
"logps/chosen": -282.6980285644531,
"logps/rejected": -193.8739776611328,
"loss": 0.0379,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.211104869842529,
"rewards/margins": 7.221211910247803,
"rewards/rejected": -3.0101072788238525,
"step": 136
},
{
"epoch": 1.0128440366972478,
"grad_norm": 2.379425525665283,
"learning_rate": 4.186563532306957e-06,
"logits/chosen": -0.9432098865509033,
"logits/rejected": -2.0608460903167725,
"logps/chosen": -288.9028625488281,
"logps/rejected": -168.07359313964844,
"loss": 0.028,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.061460494995117,
"rewards/margins": 7.086147308349609,
"rewards/rejected": -3.0246872901916504,
"step": 138
},
{
"epoch": 1.0275229357798166,
"grad_norm": 2.2438290119171143,
"learning_rate": 4.154723919851291e-06,
"logits/chosen": -1.1197127103805542,
"logits/rejected": -2.0973258018493652,
"logps/chosen": -290.60296630859375,
"logps/rejected": -173.36465454101562,
"loss": 0.0308,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.5277624130249023,
"rewards/margins": 6.31058406829834,
"rewards/rejected": -2.7828218936920166,
"step": 140
},
{
"epoch": 1.0422018348623854,
"grad_norm": 0.38025742769241333,
"learning_rate": 4.122399306494918e-06,
"logits/chosen": -1.1321005821228027,
"logits/rejected": -2.2533721923828125,
"logps/chosen": -336.11224365234375,
"logps/rejected": -198.53457641601562,
"loss": 0.0211,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.0418477058410645,
"rewards/margins": 7.016913890838623,
"rewards/rejected": -2.975067138671875,
"step": 142
},
{
"epoch": 1.0568807339449542,
"grad_norm": 1.0832823514938354,
"learning_rate": 4.089599166607794e-06,
"logits/chosen": -1.0980923175811768,
"logits/rejected": -2.007105588912964,
"logps/chosen": -292.0760803222656,
"logps/rejected": -186.78787231445312,
"loss": 0.0121,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.942317008972168,
"rewards/margins": 7.760876178741455,
"rewards/rejected": -3.818559169769287,
"step": 144
},
{
"epoch": 1.071559633027523,
"grad_norm": 0.542005717754364,
"learning_rate": 4.05633311393708e-06,
"logits/chosen": -0.9787145853042603,
"logits/rejected": -2.0150396823883057,
"logps/chosen": -257.6767883300781,
"logps/rejected": -172.47512817382812,
"loss": 0.0268,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.031050682067871,
"rewards/margins": 7.134464263916016,
"rewards/rejected": -3.1034140586853027,
"step": 146
},
{
"epoch": 1.0862385321100918,
"grad_norm": 1.513509750366211,
"learning_rate": 4.022610898789349e-06,
"logits/chosen": -1.008697509765625,
"logits/rejected": -2.0967135429382324,
"logps/chosen": -266.4443664550781,
"logps/rejected": -186.60263061523438,
"loss": 0.0169,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.120553970336914,
"rewards/margins": 7.502930641174316,
"rewards/rejected": -3.382376194000244,
"step": 148
},
{
"epoch": 1.1009174311926606,
"grad_norm": 1.2189836502075195,
"learning_rate": 3.988442405172755e-06,
"logits/chosen": -0.8885701894760132,
"logits/rejected": -2.0014257431030273,
"logps/chosen": -281.70147705078125,
"logps/rejected": -201.9718780517578,
"loss": 0.0121,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.45000696182251,
"rewards/margins": 7.705287933349609,
"rewards/rejected": -3.2552807331085205,
"step": 150
},
{
"epoch": 1.1155963302752294,
"grad_norm": 0.2563473880290985,
"learning_rate": 3.953837647900031e-06,
"logits/chosen": -0.9757863283157349,
"logits/rejected": -2.0974419116973877,
"logps/chosen": -273.5846862792969,
"logps/rejected": -195.75936889648438,
"loss": 0.0107,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.658951759338379,
"rewards/margins": 8.117878913879395,
"rewards/rejected": -3.4589266777038574,
"step": 152
},
{
"epoch": 1.1302752293577982,
"grad_norm": 2.6809535026550293,
"learning_rate": 3.918806769653135e-06,
"logits/chosen": -0.8756412863731384,
"logits/rejected": -1.9975080490112305,
"logps/chosen": -318.453857421875,
"logps/rejected": -195.71372985839844,
"loss": 0.0324,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.540386199951172,
"rewards/margins": 7.758340358734131,
"rewards/rejected": -3.217954158782959,
"step": 154
},
{
"epoch": 1.144954128440367,
"grad_norm": 0.34194982051849365,
"learning_rate": 3.88336003801042e-06,
"logits/chosen": -0.9494649171829224,
"logits/rejected": -2.052715301513672,
"logps/chosen": -255.02169799804688,
"logps/rejected": -178.14224243164062,
"loss": 0.0114,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.8328986167907715,
"rewards/margins": 7.075186729431152,
"rewards/rejected": -3.2422876358032227,
"step": 156
},
{
"epoch": 1.1596330275229358,
"grad_norm": 0.37112390995025635,
"learning_rate": 3.847507842437205e-06,
"logits/chosen": -0.8547274470329285,
"logits/rejected": -2.1034629344940186,
"logps/chosen": -296.8822021484375,
"logps/rejected": -171.6925048828125,
"loss": 0.007,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.401730060577393,
"rewards/margins": 8.271763801574707,
"rewards/rejected": -3.8700337409973145,
"step": 158
},
{
"epoch": 1.1743119266055047,
"grad_norm": 0.6065702438354492,
"learning_rate": 3.811260691240604e-06,
"logits/chosen": -0.894873857498169,
"logits/rejected": -2.086596965789795,
"logps/chosen": -340.1643981933594,
"logps/rejected": -188.5568389892578,
"loss": 0.009,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.769918441772461,
"rewards/margins": 8.063861846923828,
"rewards/rejected": -3.293943166732788,
"step": 160
},
{
"epoch": 1.1889908256880735,
"grad_norm": 0.38738325238227844,
"learning_rate": 3.774629208489547e-06,
"logits/chosen": -0.9661360383033752,
"logits/rejected": -2.0905256271362305,
"logps/chosen": -241.7164764404297,
"logps/rejected": -172.8728790283203,
"loss": 0.0084,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.948549270629883,
"rewards/margins": 7.093012809753418,
"rewards/rejected": -3.1444640159606934,
"step": 162
},
{
"epoch": 1.2036697247706423,
"grad_norm": 0.4064182639122009,
"learning_rate": 3.7376241309008433e-06,
"logits/chosen": -1.1252474784851074,
"logits/rejected": -2.123969793319702,
"logps/chosen": -326.73370361328125,
"logps/rejected": -183.0895233154297,
"loss": 0.0133,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.725476264953613,
"rewards/margins": 7.916323661804199,
"rewards/rejected": -3.190847635269165,
"step": 164
},
{
"epoch": 1.218348623853211,
"grad_norm": 0.3772048056125641,
"learning_rate": 3.7002563046922502e-06,
"logits/chosen": -1.0913598537445068,
"logits/rejected": -2.229214668273926,
"logps/chosen": -326.4932861328125,
"logps/rejected": -173.02989196777344,
"loss": 0.0044,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.724410533905029,
"rewards/margins": 8.843596458435059,
"rewards/rejected": -4.119184970855713,
"step": 166
},
{
"epoch": 1.2330275229357799,
"grad_norm": 0.48601555824279785,
"learning_rate": 3.6625366824034337e-06,
"logits/chosen": -0.8681567907333374,
"logits/rejected": -2.067228317260742,
"logps/chosen": -279.7916259765625,
"logps/rejected": -206.9775390625,
"loss": 0.0139,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.7722625732421875,
"rewards/margins": 9.153127670288086,
"rewards/rejected": -4.380865097045898,
"step": 168
},
{
"epoch": 1.2477064220183487,
"grad_norm": 1.1044621467590332,
"learning_rate": 3.6244763196857714e-06,
"logits/chosen": -0.9898172616958618,
"logits/rejected": -2.130460262298584,
"logps/chosen": -296.6734619140625,
"logps/rejected": -181.456298828125,
"loss": 0.013,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.674115180969238,
"rewards/margins": 8.792963981628418,
"rewards/rejected": -4.1188483238220215,
"step": 170
},
{
"epoch": 1.2623853211009175,
"grad_norm": 1.3868632316589355,
"learning_rate": 3.5860863720619333e-06,
"logits/chosen": -1.0125945806503296,
"logits/rejected": -2.080739736557007,
"logps/chosen": -289.3682861328125,
"logps/rejected": -184.58253479003906,
"loss": 0.0137,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.030690670013428,
"rewards/margins": 8.200502395629883,
"rewards/rejected": -3.1698126792907715,
"step": 172
},
{
"epoch": 1.2770642201834863,
"grad_norm": 0.4792233407497406,
"learning_rate": 3.547378091656186e-06,
"logits/chosen": -0.9022351503372192,
"logits/rejected": -2.0722413063049316,
"logps/chosen": -293.7245178222656,
"logps/rejected": -173.53054809570312,
"loss": 0.0092,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.439302444458008,
"rewards/margins": 8.599308013916016,
"rewards/rejected": -4.160006046295166,
"step": 174
},
{
"epoch": 1.2917431192660551,
"grad_norm": 0.7374489903450012,
"learning_rate": 3.5083628238963913e-06,
"logits/chosen": -1.085463047027588,
"logits/rejected": -1.969193935394287,
"logps/chosen": -234.489013671875,
"logps/rejected": -175.44613647460938,
"loss": 0.0147,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.414360523223877,
"rewards/margins": 7.9778642654418945,
"rewards/rejected": -3.5635030269622803,
"step": 176
},
{
"epoch": 1.306422018348624,
"grad_norm": 1.7487801313400269,
"learning_rate": 3.4690520041886473e-06,
"logits/chosen": -0.9150568246841431,
"logits/rejected": -2.0502333641052246,
"logps/chosen": -275.4502258300781,
"logps/rejected": -212.3257598876953,
"loss": 0.0157,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.58690071105957,
"rewards/margins": 8.234541893005371,
"rewards/rejected": -3.647641181945801,
"step": 178
},
{
"epoch": 1.3211009174311927,
"grad_norm": 0.12792479991912842,
"learning_rate": 3.4294571545655653e-06,
"logits/chosen": -0.91706383228302,
"logits/rejected": -2.196730613708496,
"logps/chosen": -293.5966796875,
"logps/rejected": -180.54701232910156,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.876993656158447,
"rewards/margins": 9.287820816040039,
"rewards/rejected": -4.41082763671875,
"step": 180
},
{
"epoch": 1.3357798165137615,
"grad_norm": 1.574942708015442,
"learning_rate": 3.38958988030915e-06,
"logits/chosen": -1.1890692710876465,
"logits/rejected": -2.066960334777832,
"logps/chosen": -274.7825622558594,
"logps/rejected": -224.47837829589844,
"loss": 0.0592,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.447505950927734,
"rewards/margins": 8.213159561157227,
"rewards/rejected": -3.7656538486480713,
"step": 182
},
{
"epoch": 1.3504587155963304,
"grad_norm": 1.0345042943954468,
"learning_rate": 3.3494618665492833e-06,
"logits/chosen": -1.1099860668182373,
"logits/rejected": -2.0204684734344482,
"logps/chosen": -255.40478515625,
"logps/rejected": -192.52752685546875,
"loss": 0.0152,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.036734104156494,
"rewards/margins": 7.486913681030273,
"rewards/rejected": -3.4501795768737793,
"step": 184
},
{
"epoch": 1.3651376146788992,
"grad_norm": 0.1829257309436798,
"learning_rate": 3.3090848748388042e-06,
"logits/chosen": -1.0115846395492554,
"logits/rejected": -2.1213629245758057,
"logps/chosen": -353.5410461425781,
"logps/rejected": -192.9250030517578,
"loss": 0.0061,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.421483993530273,
"rewards/margins": 8.999296188354492,
"rewards/rejected": -4.577812194824219,
"step": 186
},
{
"epoch": 1.379816513761468,
"grad_norm": 0.3030329942703247,
"learning_rate": 3.2684707397061887e-06,
"logits/chosen": -1.0969910621643066,
"logits/rejected": -2.0923759937286377,
"logps/chosen": -293.1423645019531,
"logps/rejected": -173.88784790039062,
"loss": 0.0124,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.567940711975098,
"rewards/margins": 8.394798278808594,
"rewards/rejected": -3.8268580436706543,
"step": 188
},
{
"epoch": 1.3944954128440368,
"grad_norm": 0.8538657426834106,
"learning_rate": 3.2276313651868364e-06,
"logits/chosen": -0.9523632526397705,
"logits/rejected": -2.0854203701019287,
"logps/chosen": -297.4543762207031,
"logps/rejected": -162.13568115234375,
"loss": 0.0139,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.548934459686279,
"rewards/margins": 8.458111763000488,
"rewards/rejected": -3.909177541732788,
"step": 190
},
{
"epoch": 1.4091743119266056,
"grad_norm": 0.4353146553039551,
"learning_rate": 3.1865787213339926e-06,
"logits/chosen": -0.9564714431762695,
"logits/rejected": -2.0908193588256836,
"logps/chosen": -281.3487243652344,
"logps/rejected": -186.54757690429688,
"loss": 0.0115,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.7654829025268555,
"rewards/margins": 9.230217933654785,
"rewards/rejected": -4.464734077453613,
"step": 192
},
{
"epoch": 1.4238532110091744,
"grad_norm": 0.40312162041664124,
"learning_rate": 3.1453248407103156e-06,
"logits/chosen": -0.9966449737548828,
"logits/rejected": -2.1248295307159424,
"logps/chosen": -287.3999328613281,
"logps/rejected": -169.44496154785156,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.19376802444458,
"rewards/margins": 8.428169250488281,
"rewards/rejected": -4.234401702880859,
"step": 194
},
{
"epoch": 1.4385321100917432,
"grad_norm": 1.217081904411316,
"learning_rate": 3.1038818148611178e-06,
"logits/chosen": -1.022183895111084,
"logits/rejected": -2.0069739818573,
"logps/chosen": -312.16973876953125,
"logps/rejected": -181.82955932617188,
"loss": 0.0132,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.685446262359619,
"rewards/margins": 8.785126686096191,
"rewards/rejected": -4.099679946899414,
"step": 196
},
{
"epoch": 1.453211009174312,
"grad_norm": 0.6292124390602112,
"learning_rate": 3.062261790770331e-06,
"logits/chosen": -0.8997288942337036,
"logits/rejected": -1.9895069599151611,
"logps/chosen": -259.7031555175781,
"logps/rejected": -182.4678497314453,
"loss": 0.0253,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.453324794769287,
"rewards/margins": 7.995000839233398,
"rewards/rejected": -3.541675567626953,
"step": 198
},
{
"epoch": 1.4678899082568808,
"grad_norm": 0.46019911766052246,
"learning_rate": 3.0204769673002123e-06,
"logits/chosen": -0.981975793838501,
"logits/rejected": -2.123629331588745,
"logps/chosen": -333.59722900390625,
"logps/rejected": -198.2655487060547,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.988855838775635,
"rewards/margins": 8.937817573547363,
"rewards/rejected": -3.948960781097412,
"step": 200
},
{
"epoch": 1.4825688073394496,
"grad_norm": 1.5788525342941284,
"learning_rate": 2.978539591615848e-06,
"logits/chosen": -1.0232621431350708,
"logits/rejected": -1.9014160633087158,
"logps/chosen": -299.21649169921875,
"logps/rejected": -196.33389282226562,
"loss": 0.0167,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.068594932556152,
"rewards/margins": 8.521183013916016,
"rewards/rejected": -4.452587604522705,
"step": 202
},
{
"epoch": 1.4972477064220184,
"grad_norm": 0.18567878007888794,
"learning_rate": 2.936461955595501e-06,
"logits/chosen": -1.0283303260803223,
"logits/rejected": -2.1100425720214844,
"logps/chosen": -298.8528137207031,
"logps/rejected": -191.35086059570312,
"loss": 0.0068,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9132208824157715,
"rewards/margins": 8.617488861083984,
"rewards/rejected": -3.704267978668213,
"step": 204
},
{
"epoch": 1.5119266055045872,
"grad_norm": 0.13169872760772705,
"learning_rate": 2.8942563922278487e-06,
"logits/chosen": -1.0413228273391724,
"logits/rejected": -2.1321609020233154,
"logps/chosen": -286.2435607910156,
"logps/rejected": -196.66256713867188,
"loss": 0.011,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.733799457550049,
"rewards/margins": 9.53592586517334,
"rewards/rejected": -4.802126407623291,
"step": 206
},
{
"epoch": 1.526605504587156,
"grad_norm": 2.0917320251464844,
"learning_rate": 2.8519352719971783e-06,
"logits/chosen": -1.097141981124878,
"logits/rejected": -2.0799503326416016,
"logps/chosen": -316.93597412109375,
"logps/rejected": -201.67100524902344,
"loss": 0.0309,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.9373931884765625,
"rewards/margins": 8.860273361206055,
"rewards/rejected": -3.9228808879852295,
"step": 208
},
{
"epoch": 1.5412844036697249,
"grad_norm": 0.6296855807304382,
"learning_rate": 2.8095109992575824e-06,
"logits/chosen": -0.9797852039337158,
"logits/rejected": -2.088029146194458,
"logps/chosen": -328.76251220703125,
"logps/rejected": -201.71078491210938,
"loss": 0.0049,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.399056911468506,
"rewards/margins": 9.34478759765625,
"rewards/rejected": -3.945730209350586,
"step": 210
},
{
"epoch": 1.5559633027522937,
"grad_norm": 0.3409838080406189,
"learning_rate": 2.7669960085972407e-06,
"logits/chosen": -0.9346829652786255,
"logits/rejected": -2.2055399417877197,
"logps/chosen": -351.57489013671875,
"logps/rejected": -219.8714141845703,
"loss": 0.0047,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.243146896362305,
"rewards/margins": 9.388729095458984,
"rewards/rejected": -4.145582675933838,
"step": 212
},
{
"epoch": 1.5706422018348625,
"grad_norm": 0.791716456413269,
"learning_rate": 2.7244027611938247e-06,
"logits/chosen": -0.8380637764930725,
"logits/rejected": -1.925654649734497,
"logps/chosen": -251.362548828125,
"logps/rejected": -220.16436767578125,
"loss": 0.0215,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.47086763381958,
"rewards/margins": 8.536866188049316,
"rewards/rejected": -4.065998554229736,
"step": 214
},
{
"epoch": 1.5853211009174313,
"grad_norm": 0.2506906986236572,
"learning_rate": 2.6817437411621194e-06,
"logits/chosen": -0.9830411076545715,
"logits/rejected": -2.0578300952911377,
"logps/chosen": -342.7967529296875,
"logps/rejected": -237.17025756835938,
"loss": 0.007,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.036384582519531,
"rewards/margins": 9.013383865356445,
"rewards/rejected": -3.976999521255493,
"step": 216
},
{
"epoch": 1.6,
"grad_norm": 0.22265683114528656,
"learning_rate": 2.639031451894923e-06,
"logits/chosen": -1.028990387916565,
"logits/rejected": -1.9095451831817627,
"logps/chosen": -330.1585998535156,
"logps/rejected": -222.2620086669922,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.05686616897583,
"rewards/margins": 9.352256774902344,
"rewards/rejected": -4.2953901290893555,
"step": 218
},
{
"epoch": 1.614678899082569,
"grad_norm": 0.857473611831665,
"learning_rate": 2.5962784123982843e-06,
"logits/chosen": -1.049895167350769,
"logits/rejected": -2.1700665950775146,
"logps/chosen": -305.7288513183594,
"logps/rejected": -198.574462890625,
"loss": 0.0122,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.7103071212768555,
"rewards/margins": 9.439103126525879,
"rewards/rejected": -4.728795528411865,
"step": 220
},
{
"epoch": 1.6293577981651377,
"grad_norm": 0.2698463797569275,
"learning_rate": 2.5534971536221804e-06,
"logits/chosen": -0.861595630645752,
"logits/rejected": -1.9534931182861328,
"logps/chosen": -268.7453918457031,
"logps/rejected": -191.55238342285156,
"loss": 0.024,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.435842514038086,
"rewards/margins": 9.0311918258667,
"rewards/rejected": -4.595349311828613,
"step": 222
},
{
"epoch": 1.6440366972477065,
"grad_norm": 1.8740975856781006,
"learning_rate": 2.5107002147876814e-06,
"logits/chosen": -1.010701298713684,
"logits/rejected": -1.9186618328094482,
"logps/chosen": -263.1980895996094,
"logps/rejected": -206.22360229492188,
"loss": 0.0135,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.652899742126465,
"rewards/margins": 9.141688346862793,
"rewards/rejected": -4.488787651062012,
"step": 224
},
{
"epoch": 1.6587155963302753,
"grad_norm": 2.0281364917755127,
"learning_rate": 2.467900139711693e-06,
"logits/chosen": -1.0440551042556763,
"logits/rejected": -1.971301555633545,
"logps/chosen": -272.8301696777344,
"logps/rejected": -197.07269287109375,
"loss": 0.0238,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.243446350097656,
"rewards/margins": 8.737079620361328,
"rewards/rejected": -4.493633270263672,
"step": 226
},
{
"epoch": 1.6733944954128441,
"grad_norm": 0.23005536198616028,
"learning_rate": 2.4251094731303586e-06,
"logits/chosen": -0.9269182085990906,
"logits/rejected": -2.089838981628418,
"logps/chosen": -291.0529479980469,
"logps/rejected": -179.94895935058594,
"loss": 0.0057,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.278434753417969,
"rewards/margins": 9.40982723236084,
"rewards/rejected": -4.131391525268555,
"step": 228
},
{
"epoch": 1.688073394495413,
"grad_norm": 0.0681939348578453,
"learning_rate": 2.3823407570221812e-06,
"logits/chosen": -0.8353657126426697,
"logits/rejected": -2.02689266204834,
"logps/chosen": -300.3406982421875,
"logps/rejected": -175.13296508789062,
"loss": 0.006,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.921877384185791,
"rewards/margins": 9.119339942932129,
"rewards/rejected": -4.197463035583496,
"step": 230
},
{
"epoch": 1.7027522935779817,
"grad_norm": 0.24333705008029938,
"learning_rate": 2.3396065269319655e-06,
"logits/chosen": -1.0092397928237915,
"logits/rejected": -2.1053268909454346,
"logps/chosen": -300.02294921875,
"logps/rejected": -172.78187561035156,
"loss": 0.0091,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.987635612487793,
"rewards/margins": 9.38530445098877,
"rewards/rejected": -4.397668361663818,
"step": 232
},
{
"epoch": 1.7174311926605506,
"grad_norm": 0.6797487139701843,
"learning_rate": 2.2969193082966353e-06,
"logits/chosen": -0.8851895332336426,
"logits/rejected": -2.036161422729492,
"logps/chosen": -285.2466735839844,
"logps/rejected": -189.85882568359375,
"loss": 0.0061,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.067909240722656,
"rewards/margins": 9.919548988342285,
"rewards/rejected": -4.851640701293945,
"step": 234
},
{
"epoch": 1.7321100917431194,
"grad_norm": 0.6367282271385193,
"learning_rate": 2.2542916127740194e-06,
"logits/chosen": -0.8543779253959656,
"logits/rejected": -1.752845048904419,
"logps/chosen": -312.6046142578125,
"logps/rejected": -234.28988647460938,
"loss": 0.0121,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.974181175231934,
"rewards/margins": 9.56247329711914,
"rewards/rejected": -4.588292121887207,
"step": 236
},
{
"epoch": 1.7467889908256882,
"grad_norm": 0.2897071838378906,
"learning_rate": 2.211735934575674e-06,
"logits/chosen": -0.9410618543624878,
"logits/rejected": -2.174349308013916,
"logps/chosen": -281.28863525390625,
"logps/rejected": -165.9616241455078,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.256411552429199,
"rewards/margins": 9.012039184570312,
"rewards/rejected": -4.755627632141113,
"step": 238
},
{
"epoch": 1.761467889908257,
"grad_norm": 0.41199827194213867,
"learning_rate": 2.1692647468048235e-06,
"logits/chosen": -1.0583674907684326,
"logits/rejected": -2.0003695487976074,
"logps/chosen": -307.04766845703125,
"logps/rejected": -206.0718231201172,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.970597743988037,
"rewards/margins": 10.884113311767578,
"rewards/rejected": -5.913515567779541,
"step": 240
},
{
"epoch": 1.7761467889908258,
"grad_norm": 0.5175734162330627,
"learning_rate": 2.126890497800477e-06,
"logits/chosen": -1.0432560443878174,
"logits/rejected": -1.912244439125061,
"logps/chosen": -297.5209655761719,
"logps/rejected": -201.7635498046875,
"loss": 0.0184,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.594603538513184,
"rewards/margins": 8.852926254272461,
"rewards/rejected": -4.258323669433594,
"step": 242
},
{
"epoch": 1.7908256880733946,
"grad_norm": 0.6547983288764954,
"learning_rate": 2.084625607488816e-06,
"logits/chosen": -0.9311404228210449,
"logits/rejected": -2.1106457710266113,
"logps/chosen": -275.57183837890625,
"logps/rejected": -188.14370727539062,
"loss": 0.0105,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.0517778396606445,
"rewards/margins": 9.953323364257812,
"rewards/rejected": -4.901544094085693,
"step": 244
},
{
"epoch": 1.8055045871559634,
"grad_norm": 0.33203306794166565,
"learning_rate": 2.0424824637428995e-06,
"logits/chosen": -0.9116280674934387,
"logits/rejected": -2.247035026550293,
"logps/chosen": -267.2120361328125,
"logps/rejected": -171.6895751953125,
"loss": 0.0084,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.797155857086182,
"rewards/margins": 9.531312942504883,
"rewards/rejected": -4.734157562255859,
"step": 246
},
{
"epoch": 1.8201834862385322,
"grad_norm": 0.5430265069007874,
"learning_rate": 2.0004734187517744e-06,
"logits/chosen": -1.082189917564392,
"logits/rejected": -1.9552661180496216,
"logps/chosen": -318.3630676269531,
"logps/rejected": -176.5325469970703,
"loss": 0.0103,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9373297691345215,
"rewards/margins": 9.501139640808105,
"rewards/rejected": -4.563809871673584,
"step": 248
},
{
"epoch": 1.834862385321101,
"grad_norm": 0.19558808207511902,
"learning_rate": 1.9586107854000327e-06,
"logits/chosen": -1.1152639389038086,
"logits/rejected": -2.129647731781006,
"logps/chosen": -296.6053466796875,
"logps/rejected": -169.00213623046875,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.840038299560547,
"rewards/margins": 9.542232513427734,
"rewards/rejected": -4.702193737030029,
"step": 250
},
{
"epoch": 1.8495412844036698,
"grad_norm": 0.45886340737342834,
"learning_rate": 1.916906833658899e-06,
"logits/chosen": -0.8982828855514526,
"logits/rejected": -2.0570406913757324,
"logps/chosen": -324.3260803222656,
"logps/rejected": -216.9913330078125,
"loss": 0.0041,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.856814861297607,
"rewards/margins": 9.941521644592285,
"rewards/rejected": -5.084706783294678,
"step": 252
},
{
"epoch": 1.8642201834862386,
"grad_norm": 0.9119444489479065,
"learning_rate": 1.8753737869898921e-06,
"logits/chosen": -0.972162663936615,
"logits/rejected": -2.016150951385498,
"logps/chosen": -248.53463745117188,
"logps/rejected": -184.32382202148438,
"loss": 0.007,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.359373092651367,
"rewards/margins": 10.132734298706055,
"rewards/rejected": -5.7733612060546875,
"step": 254
},
{
"epoch": 1.8788990825688074,
"grad_norm": 0.12387188524007797,
"learning_rate": 1.8340238187621185e-06,
"logits/chosen": -0.8442805409431458,
"logits/rejected": -1.9759818315505981,
"logps/chosen": -262.56671142578125,
"logps/rejected": -175.5653839111328,
"loss": 0.0327,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.67296028137207,
"rewards/margins": 9.085709571838379,
"rewards/rejected": -4.412749290466309,
"step": 256
},
{
"epoch": 1.8935779816513763,
"grad_norm": 0.5057358145713806,
"learning_rate": 1.7928690486842438e-06,
"logits/chosen": -1.015974760055542,
"logits/rejected": -2.1081368923187256,
"logps/chosen": -253.27394104003906,
"logps/rejected": -160.09469604492188,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.807024002075195,
"rewards/margins": 9.237527847290039,
"rewards/rejected": -4.4305033683776855,
"step": 258
},
{
"epoch": 1.908256880733945,
"grad_norm": 0.6048524379730225,
"learning_rate": 1.7519215392522026e-06,
"logits/chosen": -0.9711456298828125,
"logits/rejected": -2.1203389167785645,
"logps/chosen": -282.3438720703125,
"logps/rejected": -166.2510528564453,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.09495735168457,
"rewards/margins": 9.728025436401367,
"rewards/rejected": -4.633067607879639,
"step": 260
},
{
"epoch": 1.9229357798165139,
"grad_norm": 0.5542910695075989,
"learning_rate": 1.7111932922136715e-06,
"logits/chosen": -0.9748891592025757,
"logits/rejected": -1.8318710327148438,
"logps/chosen": -253.21209716796875,
"logps/rejected": -202.5255889892578,
"loss": 0.019,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.323376178741455,
"rewards/margins": 9.282448768615723,
"rewards/rejected": -4.959072589874268,
"step": 262
},
{
"epoch": 1.9376146788990827,
"grad_norm": 0.24626314640045166,
"learning_rate": 1.6706962450503408e-06,
"logits/chosen": -0.8283478617668152,
"logits/rejected": -2.0624401569366455,
"logps/chosen": -282.2995300292969,
"logps/rejected": -189.75595092773438,
"loss": 0.0059,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.900084495544434,
"rewards/margins": 10.33100414276123,
"rewards/rejected": -5.4309186935424805,
"step": 264
},
{
"epoch": 1.9522935779816515,
"grad_norm": 1.167913794517517,
"learning_rate": 1.630442267479034e-06,
"logits/chosen": -0.789318323135376,
"logits/rejected": -1.9187240600585938,
"logps/chosen": -266.4274597167969,
"logps/rejected": -198.47540283203125,
"loss": 0.0113,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9975690841674805,
"rewards/margins": 9.762743949890137,
"rewards/rejected": -4.765174865722656,
"step": 266
},
{
"epoch": 1.9669724770642203,
"grad_norm": 0.05298791825771332,
"learning_rate": 1.5904431579726837e-06,
"logits/chosen": -0.9226531982421875,
"logits/rejected": -2.0884995460510254,
"logps/chosen": -295.6236267089844,
"logps/rejected": -165.60801696777344,
"loss": 0.0066,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.444765567779541,
"rewards/margins": 9.557327270507812,
"rewards/rejected": -5.1125617027282715,
"step": 268
},
{
"epoch": 1.981651376146789,
"grad_norm": 0.11875250190496445,
"learning_rate": 1.5507106403021897e-06,
"logits/chosen": -0.8945147395133972,
"logits/rejected": -2.1213436126708984,
"logps/chosen": -329.32354736328125,
"logps/rejected": -205.64938354492188,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.863536834716797,
"rewards/margins": 10.598997116088867,
"rewards/rejected": -4.735459804534912,
"step": 270
},
{
"epoch": 1.996330275229358,
"grad_norm": 0.16463226079940796,
"learning_rate": 1.511256360100171e-06,
"logits/chosen": -0.8653547167778015,
"logits/rejected": -2.120985746383667,
"logps/chosen": -294.7039489746094,
"logps/rejected": -191.1700897216797,
"loss": 0.0052,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.6077494621276855,
"rewards/margins": 9.97114372253418,
"rewards/rejected": -5.363394737243652,
"step": 272
},
{
"epoch": 2.0110091743119267,
"grad_norm": 0.5620644092559814,
"learning_rate": 1.4720918814476234e-06,
"logits/chosen": -1.0870428085327148,
"logits/rejected": -2.203629493713379,
"logps/chosen": -255.451171875,
"logps/rejected": -179.9131317138672,
"loss": 0.0107,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.301146984100342,
"rewards/margins": 10.202719688415527,
"rewards/rejected": -5.9015727043151855,
"step": 274
},
{
"epoch": 2.0256880733944955,
"grad_norm": 0.22174260020256042,
"learning_rate": 1.4332286834844792e-06,
"logits/chosen": -1.1182466745376587,
"logits/rejected": -2.1164536476135254,
"logps/chosen": -286.516357421875,
"logps/rejected": -188.3638458251953,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.640047550201416,
"rewards/margins": 9.481383323669434,
"rewards/rejected": -4.841336250305176,
"step": 276
},
{
"epoch": 2.0403669724770643,
"grad_norm": 0.33157217502593994,
"learning_rate": 1.3946781570450563e-06,
"logits/chosen": -0.9743894338607788,
"logits/rejected": -2.0844216346740723,
"logps/chosen": -303.1180419921875,
"logps/rejected": -197.849853515625,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.488700866699219,
"rewards/margins": 10.087553024291992,
"rewards/rejected": -4.598852157592773,
"step": 278
},
{
"epoch": 2.055045871559633,
"grad_norm": 0.2071988433599472,
"learning_rate": 1.3564516013194023e-06,
"logits/chosen": -0.7817774415016174,
"logits/rejected": -1.967786431312561,
"logps/chosen": -266.5663757324219,
"logps/rejected": -185.63877868652344,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.6336894035339355,
"rewards/margins": 10.230566024780273,
"rewards/rejected": -5.596876621246338,
"step": 280
},
{
"epoch": 2.069724770642202,
"grad_norm": 0.35437583923339844,
"learning_rate": 1.3185602205414894e-06,
"logits/chosen": -0.9503396153450012,
"logits/rejected": -2.0260818004608154,
"logps/chosen": -269.90093994140625,
"logps/rejected": -172.7965850830078,
"loss": 0.0059,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.775392532348633,
"rewards/margins": 9.074084281921387,
"rewards/rejected": -4.298691272735596,
"step": 282
},
{
"epoch": 2.0844036697247708,
"grad_norm": 0.09949786216020584,
"learning_rate": 1.2810151207052465e-06,
"logits/chosen": -1.025212049484253,
"logits/rejected": -2.090640068054199,
"logps/chosen": -335.35882568359375,
"logps/rejected": -221.98355102539062,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.815802097320557,
"rewards/margins": 9.805765151977539,
"rewards/rejected": -4.989964008331299,
"step": 284
},
{
"epoch": 2.0990825688073396,
"grad_norm": 0.1902090311050415,
"learning_rate": 1.2438273063093811e-06,
"logits/chosen": -0.8500208854675293,
"logits/rejected": -1.9380009174346924,
"logps/chosen": -277.2483215332031,
"logps/rejected": -168.717529296875,
"loss": 0.0127,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.573535442352295,
"rewards/margins": 9.026323318481445,
"rewards/rejected": -4.452788352966309,
"step": 286
},
{
"epoch": 2.1137614678899084,
"grad_norm": 0.3208858072757721,
"learning_rate": 1.2070076771319536e-06,
"logits/chosen": -1.082637906074524,
"logits/rejected": -1.9498220682144165,
"logps/chosen": -353.7499084472656,
"logps/rejected": -200.4058074951172,
"loss": 0.0101,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.898225784301758,
"rewards/margins": 8.967299461364746,
"rewards/rejected": -4.0690741539001465,
"step": 288
},
{
"epoch": 2.128440366972477,
"grad_norm": 1.1437596082687378,
"learning_rate": 1.1705670250356417e-06,
"logits/chosen": -0.8648325800895691,
"logits/rejected": -2.037424087524414,
"logps/chosen": -311.7782287597656,
"logps/rejected": -195.8933563232422,
"loss": 0.0121,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.304541110992432,
"rewards/margins": 10.098506927490234,
"rewards/rejected": -4.7939653396606445,
"step": 290
},
{
"epoch": 2.143119266055046,
"grad_norm": 0.16270968317985535,
"learning_rate": 1.1345160308046413e-06,
"logits/chosen": -0.9791809916496277,
"logits/rejected": -2.24078369140625,
"logps/chosen": -382.8855895996094,
"logps/rejected": -205.8779754638672,
"loss": 0.0044,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.505633354187012,
"rewards/margins": 11.220186233520508,
"rewards/rejected": -5.714553356170654,
"step": 292
},
{
"epoch": 2.157798165137615,
"grad_norm": 2.2618370056152344,
"learning_rate": 1.0988652610141154e-06,
"logits/chosen": -0.9164503216743469,
"logits/rejected": -1.9510498046875,
"logps/chosen": -276.1203918457031,
"logps/rejected": -212.5254669189453,
"loss": 0.0214,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.720728874206543,
"rewards/margins": 9.504093170166016,
"rewards/rejected": -4.783364295959473,
"step": 294
},
{
"epoch": 2.1724770642201836,
"grad_norm": 0.08572974801063538,
"learning_rate": 1.063625164933124e-06,
"logits/chosen": -0.8781817555427551,
"logits/rejected": -2.0793867111206055,
"logps/chosen": -330.6020202636719,
"logps/rejected": -208.72425842285156,
"loss": 0.0081,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.462750434875488,
"rewards/margins": 11.203109741210938,
"rewards/rejected": -5.740358352661133,
"step": 296
},
{
"epoch": 2.1871559633027524,
"grad_norm": 0.5944895148277283,
"learning_rate": 1.0288060714619359e-06,
"logits/chosen": -1.1157301664352417,
"logits/rejected": -2.214977502822876,
"logps/chosen": -316.9060363769531,
"logps/rejected": -167.6466522216797,
"loss": 0.0098,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.41005277633667,
"rewards/margins": 10.054335594177246,
"rewards/rejected": -4.644283294677734,
"step": 298
},
{
"epoch": 2.2018348623853212,
"grad_norm": 0.5239315629005432,
"learning_rate": 9.944181861046188e-07,
"logits/chosen": -0.8929880857467651,
"logits/rejected": -1.9771008491516113,
"logps/chosen": -334.0789489746094,
"logps/rejected": -201.40476989746094,
"loss": 0.0112,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.018026351928711,
"rewards/margins": 10.597818374633789,
"rewards/rejected": -5.579792499542236,
"step": 300
},
{
"epoch": 2.21651376146789,
"grad_norm": 0.17582310736179352,
"learning_rate": 9.604715879777986e-07,
"logits/chosen": -0.9466437101364136,
"logits/rejected": -2.1750948429107666,
"logps/chosen": -279.5908203125,
"logps/rejected": -154.12644958496094,
"loss": 0.0055,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.104902744293213,
"rewards/margins": 10.134696006774902,
"rewards/rejected": -5.029792785644531,
"step": 302
},
{
"epoch": 2.231192660550459,
"grad_norm": 0.44052350521087646,
"learning_rate": 9.269762268564616e-07,
"logits/chosen": -1.0591435432434082,
"logits/rejected": -2.134446382522583,
"logps/chosen": -255.2496337890625,
"logps/rejected": -161.16136169433594,
"loss": 0.012,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.842598915100098,
"rewards/margins": 9.835264205932617,
"rewards/rejected": -4.992665767669678,
"step": 304
},
{
"epoch": 2.2458715596330276,
"grad_norm": 0.702462375164032,
"learning_rate": 8.939419202576694e-07,
"logits/chosen": -0.768172025680542,
"logits/rejected": -1.7977386713027954,
"logps/chosen": -258.4624938964844,
"logps/rejected": -183.80621337890625,
"loss": 0.0133,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.580810070037842,
"rewards/margins": 7.86204719543457,
"rewards/rejected": -3.2812376022338867,
"step": 306
},
{
"epoch": 2.2605504587155965,
"grad_norm": 0.4431416690349579,
"learning_rate": 8.61378350563033e-07,
"logits/chosen": -0.9345456957817078,
"logits/rejected": -1.9868954420089722,
"logps/chosen": -250.33721923828125,
"logps/rejected": -193.64549255371094,
"loss": 0.0059,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.593289375305176,
"rewards/margins": 9.057455062866211,
"rewards/rejected": -4.464165210723877,
"step": 308
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.30388739705085754,
"learning_rate": 8.292950621808022e-07,
"logits/chosen": -0.9780189990997314,
"logits/rejected": -2.0176703929901123,
"logps/chosen": -285.4472961425781,
"logps/rejected": -191.96495056152344,
"loss": 0.005,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.00777006149292,
"rewards/margins": 10.013311386108398,
"rewards/rejected": -5.005540370941162,
"step": 310
},
{
"epoch": 2.289908256880734,
"grad_norm": 0.3784541189670563,
"learning_rate": 7.977014587483925e-07,
"logits/chosen": -1.0011767148971558,
"logits/rejected": -2.0550498962402344,
"logps/chosen": -273.92138671875,
"logps/rejected": -228.16741943359375,
"loss": 0.011,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.854176998138428,
"rewards/margins": 9.765246391296387,
"rewards/rejected": -4.911068916320801,
"step": 312
},
{
"epoch": 2.304587155963303,
"grad_norm": 0.2992611527442932,
"learning_rate": 7.666068003761684e-07,
"logits/chosen": -0.9273378849029541,
"logits/rejected": -2.042013645172119,
"logps/chosen": -296.50616455078125,
"logps/rejected": -169.5068817138672,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.002749443054199,
"rewards/margins": 10.678738594055176,
"rewards/rejected": -5.675989151000977,
"step": 314
},
{
"epoch": 2.3192660550458717,
"grad_norm": 0.23903429508209229,
"learning_rate": 7.360202009332993e-07,
"logits/chosen": -1.0399566888809204,
"logits/rejected": -2.143623113632202,
"logps/chosen": -296.7044677734375,
"logps/rejected": -185.7502899169922,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.675933361053467,
"rewards/margins": 10.062166213989258,
"rewards/rejected": -5.386232376098633,
"step": 316
},
{
"epoch": 2.3339449541284405,
"grad_norm": 0.23702357709407806,
"learning_rate": 7.059506253764773e-07,
"logits/chosen": -0.9900916814804077,
"logits/rejected": -2.093594789505005,
"logps/chosen": -314.4263000488281,
"logps/rejected": -194.73269653320312,
"loss": 0.0066,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.672959327697754,
"rewards/margins": 10.092663764953613,
"rewards/rejected": -5.419704437255859,
"step": 318
},
{
"epoch": 2.3486238532110093,
"grad_norm": 0.40923863649368286,
"learning_rate": 6.764068871222825e-07,
"logits/chosen": -0.7488622069358826,
"logits/rejected": -1.9413087368011475,
"logps/chosen": -287.322021484375,
"logps/rejected": -188.12283325195312,
"loss": 0.0055,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.020465850830078,
"rewards/margins": 9.374917984008789,
"rewards/rejected": -4.354452133178711,
"step": 320
},
{
"epoch": 2.363302752293578,
"grad_norm": 0.19543257355690002,
"learning_rate": 6.473976454639608e-07,
"logits/chosen": -0.9299582839012146,
"logits/rejected": -2.107851266860962,
"logps/chosen": -293.9842529296875,
"logps/rejected": -168.69400024414062,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.329434871673584,
"rewards/margins": 10.282726287841797,
"rewards/rejected": -4.9532904624938965,
"step": 322
},
{
"epoch": 2.377981651376147,
"grad_norm": 1.0940320491790771,
"learning_rate": 6.189314030333796e-07,
"logits/chosen": -0.8577584624290466,
"logits/rejected": -1.934208631515503,
"logps/chosen": -280.2929382324219,
"logps/rejected": -220.2890167236328,
"loss": 0.0154,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.774515628814697,
"rewards/margins": 10.356229782104492,
"rewards/rejected": -5.581714153289795,
"step": 324
},
{
"epoch": 2.3926605504587157,
"grad_norm": 0.12193372845649719,
"learning_rate": 5.910165033089e-07,
"logits/chosen": -0.8733283281326294,
"logits/rejected": -2.079462766647339,
"logps/chosen": -316.6996765136719,
"logps/rejected": -201.25564575195312,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.122347831726074,
"rewards/margins": 9.807957649230957,
"rewards/rejected": -4.685609817504883,
"step": 326
},
{
"epoch": 2.4073394495412845,
"grad_norm": 0.43534737825393677,
"learning_rate": 5.636611281698956e-07,
"logits/chosen": -0.8986641764640808,
"logits/rejected": -1.9822278022766113,
"logps/chosen": -262.6693420410156,
"logps/rejected": -185.32846069335938,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.460582733154297,
"rewards/margins": 9.22741413116455,
"rewards/rejected": -4.766830921173096,
"step": 328
},
{
"epoch": 2.4220183486238533,
"grad_norm": 0.17293158173561096,
"learning_rate": 5.368732954986389e-07,
"logits/chosen": -1.0250214338302612,
"logits/rejected": -2.0870189666748047,
"logps/chosen": -279.42999267578125,
"logps/rejected": -196.77059936523438,
"loss": 0.0069,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.643675327301025,
"rewards/margins": 9.602448463439941,
"rewards/rejected": -4.958773136138916,
"step": 330
},
{
"epoch": 2.436697247706422,
"grad_norm": 0.20987118780612946,
"learning_rate": 5.106608568302504e-07,
"logits/chosen": -1.066097617149353,
"logits/rejected": -2.057497978210449,
"logps/chosen": -257.912109375,
"logps/rejected": -195.58677673339844,
"loss": 0.0204,
"rewards/accuracies": 0.984375,
"rewards/chosen": 4.727499961853027,
"rewards/margins": 9.989591598510742,
"rewards/rejected": -5.262092590332031,
"step": 332
},
{
"epoch": 2.451376146788991,
"grad_norm": 1.3423670530319214,
"learning_rate": 4.850314950514124e-07,
"logits/chosen": -0.8067299127578735,
"logits/rejected": -1.9319019317626953,
"logps/chosen": -281.423583984375,
"logps/rejected": -192.34666442871094,
"loss": 0.0104,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.9796462059021,
"rewards/margins": 9.925731658935547,
"rewards/rejected": -4.9460859298706055,
"step": 334
},
{
"epoch": 2.4660550458715598,
"grad_norm": 0.2133161723613739,
"learning_rate": 4.599927221485034e-07,
"logits/chosen": -0.9198440909385681,
"logits/rejected": -2.121577024459839,
"logps/chosen": -277.7024230957031,
"logps/rejected": -171.4612579345703,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.626633167266846,
"rewards/margins": 9.858685493469238,
"rewards/rejected": -5.232051849365234,
"step": 336
},
{
"epoch": 2.4807339449541286,
"grad_norm": 0.16850100457668304,
"learning_rate": 4.3555187700583175e-07,
"logits/chosen": -0.8522999882698059,
"logits/rejected": -2.053220748901367,
"logps/chosen": -265.3820495605469,
"logps/rejected": -188.9971160888672,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.703529357910156,
"rewards/margins": 10.37534236907959,
"rewards/rejected": -5.671813011169434,
"step": 338
},
{
"epoch": 2.4954128440366974,
"grad_norm": 0.15111279487609863,
"learning_rate": 4.1171612325460244e-07,
"logits/chosen": -0.9065884351730347,
"logits/rejected": -1.9212383031845093,
"logps/chosen": -279.0539245605469,
"logps/rejected": -185.0900115966797,
"loss": 0.0045,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.483066082000732,
"rewards/margins": 9.553812026977539,
"rewards/rejected": -5.070746421813965,
"step": 340
},
{
"epoch": 2.510091743119266,
"grad_norm": 0.06084302440285683,
"learning_rate": 3.8849244717325206e-07,
"logits/chosen": -0.9317240715026855,
"logits/rejected": -1.988271713256836,
"logps/chosen": -268.8980407714844,
"logps/rejected": -202.74929809570312,
"loss": 0.0096,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.079599380493164,
"rewards/margins": 10.533794403076172,
"rewards/rejected": -5.454195022583008,
"step": 342
},
{
"epoch": 2.524770642201835,
"grad_norm": 0.8741805553436279,
"learning_rate": 3.658876556397628e-07,
"logits/chosen": -1.1219009160995483,
"logits/rejected": -2.1234138011932373,
"logps/chosen": -255.28311157226562,
"logps/rejected": -171.67091369628906,
"loss": 0.0099,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.061553478240967,
"rewards/margins": 10.136541366577148,
"rewards/rejected": -5.074987411499023,
"step": 344
},
{
"epoch": 2.539449541284404,
"grad_norm": 0.4372842013835907,
"learning_rate": 3.4390837413656256e-07,
"logits/chosen": -0.9813422560691833,
"logits/rejected": -2.116903781890869,
"logps/chosen": -278.31292724609375,
"logps/rejected": -204.9643096923828,
"loss": 0.0083,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.085036277770996,
"rewards/margins": 10.61630630493164,
"rewards/rejected": -5.531269073486328,
"step": 346
},
{
"epoch": 2.5541284403669726,
"grad_norm": 1.2550814151763916,
"learning_rate": 3.225610448085903e-07,
"logits/chosen": -0.9581831693649292,
"logits/rejected": -2.0414552688598633,
"logps/chosen": -270.668701171875,
"logps/rejected": -183.82034301757812,
"loss": 0.0113,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.899204730987549,
"rewards/margins": 10.034035682678223,
"rewards/rejected": -5.134830474853516,
"step": 348
},
{
"epoch": 2.5688073394495414,
"grad_norm": 0.03559936583042145,
"learning_rate": 3.018519245750989e-07,
"logits/chosen": -0.9744287729263306,
"logits/rejected": -1.9595189094543457,
"logps/chosen": -321.4473876953125,
"logps/rejected": -223.63467407226562,
"loss": 0.0091,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.675147533416748,
"rewards/margins": 10.108884811401367,
"rewards/rejected": -5.433738708496094,
"step": 350
},
{
"epoch": 2.5834862385321102,
"grad_norm": 0.2746826708316803,
"learning_rate": 2.817870832957459e-07,
"logits/chosen": -0.8869858980178833,
"logits/rejected": -2.016246795654297,
"logps/chosen": -259.2815856933594,
"logps/rejected": -180.26258850097656,
"loss": 0.0056,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.067638874053955,
"rewards/margins": 10.355432510375977,
"rewards/rejected": -5.287793159484863,
"step": 352
},
{
"epoch": 2.598165137614679,
"grad_norm": 0.17304402589797974,
"learning_rate": 2.6237240199151386e-07,
"logits/chosen": -1.0045228004455566,
"logits/rejected": -2.091106414794922,
"logps/chosen": -264.62774658203125,
"logps/rejected": -172.3504638671875,
"loss": 0.0051,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.949882507324219,
"rewards/margins": 9.181565284729004,
"rewards/rejected": -4.231683254241943,
"step": 354
},
{
"epoch": 2.612844036697248,
"grad_norm": 0.9380022883415222,
"learning_rate": 2.436135711209786e-07,
"logits/chosen": -1.1858479976654053,
"logits/rejected": -2.1570074558258057,
"logps/chosen": -279.8266296386719,
"logps/rejected": -165.30809020996094,
"loss": 0.009,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.504940032958984,
"rewards/margins": 9.417243957519531,
"rewards/rejected": -4.912304401397705,
"step": 356
},
{
"epoch": 2.6275229357798167,
"grad_norm": 0.6058441400527954,
"learning_rate": 2.2551608891243026e-07,
"logits/chosen": -1.1764850616455078,
"logits/rejected": -2.1525368690490723,
"logps/chosen": -352.7016296386719,
"logps/rejected": -213.2824249267578,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.686002731323242,
"rewards/margins": 9.250012397766113,
"rewards/rejected": -4.564009666442871,
"step": 358
},
{
"epoch": 2.6422018348623855,
"grad_norm": 0.14885057508945465,
"learning_rate": 2.0808525975233807e-07,
"logits/chosen": -0.8036705255508423,
"logits/rejected": -2.0143167972564697,
"logps/chosen": -282.6025085449219,
"logps/rejected": -200.5447540283203,
"loss": 0.015,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.5807576179504395,
"rewards/margins": 9.530784606933594,
"rewards/rejected": -4.950027942657471,
"step": 360
},
{
"epoch": 2.6568807339449543,
"grad_norm": 0.40349748730659485,
"learning_rate": 1.9132619263063144e-07,
"logits/chosen": -0.8986431360244751,
"logits/rejected": -2.059335231781006,
"logps/chosen": -346.6067810058594,
"logps/rejected": -212.26348876953125,
"loss": 0.0096,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.356535911560059,
"rewards/margins": 10.730274200439453,
"rewards/rejected": -5.373737812042236,
"step": 362
},
{
"epoch": 2.671559633027523,
"grad_norm": 0.02223406359553337,
"learning_rate": 1.7524379964325155e-07,
"logits/chosen": -0.9592161774635315,
"logits/rejected": -2.094557523727417,
"logps/chosen": -327.5130310058594,
"logps/rejected": -203.91741943359375,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.003190517425537,
"rewards/margins": 10.261466979980469,
"rewards/rejected": -5.258275985717773,
"step": 364
},
{
"epoch": 2.686238532110092,
"grad_norm": 0.21345356106758118,
"learning_rate": 1.5984279455240975e-07,
"logits/chosen": -0.9917050004005432,
"logits/rejected": -2.0196518898010254,
"logps/chosen": -282.2841796875,
"logps/rejected": -191.79910278320312,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.118269443511963,
"rewards/margins": 10.18971061706543,
"rewards/rejected": -5.071441173553467,
"step": 366
},
{
"epoch": 2.7009174311926607,
"grad_norm": 0.0874081626534462,
"learning_rate": 1.451276914049818e-07,
"logits/chosen": -0.9789815545082092,
"logits/rejected": -2.004281997680664,
"logps/chosen": -256.3831787109375,
"logps/rejected": -178.12074279785156,
"loss": 0.0037,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.682791233062744,
"rewards/margins": 10.118135452270508,
"rewards/rejected": -5.435344219207764,
"step": 368
},
{
"epoch": 2.7155963302752295,
"grad_norm": 0.15522974729537964,
"learning_rate": 1.3110280320943692e-07,
"logits/chosen": -0.89200758934021,
"logits/rejected": -2.112806797027588,
"logps/chosen": -271.0398254394531,
"logps/rejected": -171.77919006347656,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.064676284790039,
"rewards/margins": 10.34743881225586,
"rewards/rejected": -5.28276252746582,
"step": 370
},
{
"epoch": 2.7302752293577983,
"grad_norm": 0.5010592341423035,
"learning_rate": 1.1777224067169218e-07,
"logits/chosen": -0.8353609442710876,
"logits/rejected": -1.9892935752868652,
"logps/chosen": -279.2965087890625,
"logps/rejected": -191.65855407714844,
"loss": 0.0049,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.053516864776611,
"rewards/margins": 10.337114334106445,
"rewards/rejected": -5.283597469329834,
"step": 372
},
{
"epoch": 2.744954128440367,
"grad_norm": 0.48516571521759033,
"learning_rate": 1.0513991099025872e-07,
"logits/chosen": -1.016608476638794,
"logits/rejected": -2.1301956176757812,
"logps/chosen": -323.5552673339844,
"logps/rejected": -193.77996826171875,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.830180644989014,
"rewards/margins": 9.39232349395752,
"rewards/rejected": -4.562142372131348,
"step": 374
},
{
"epoch": 2.759633027522936,
"grad_norm": 0.18496806919574738,
"learning_rate": 9.320951671104194e-08,
"logits/chosen": -0.9126584529876709,
"logits/rejected": -2.1175155639648438,
"logps/chosen": -314.1302795410156,
"logps/rejected": -191.1002960205078,
"loss": 0.0048,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.795359134674072,
"rewards/margins": 10.335535049438477,
"rewards/rejected": -4.540175914764404,
"step": 376
},
{
"epoch": 2.7743119266055047,
"grad_norm": 0.158527210354805,
"learning_rate": 8.198455464212108e-08,
"logits/chosen": -0.9621077179908752,
"logits/rejected": -2.066542148590088,
"logps/chosen": -293.69940185546875,
"logps/rejected": -176.17442321777344,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.256305694580078,
"rewards/margins": 10.846275329589844,
"rewards/rejected": -5.589971542358398,
"step": 378
},
{
"epoch": 2.7889908256880735,
"grad_norm": 0.11951223015785217,
"learning_rate": 7.146831482883115e-08,
"logits/chosen": -0.7449550628662109,
"logits/rejected": -2.0898332595825195,
"logps/chosen": -297.49365234375,
"logps/rejected": -172.2970428466797,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.16118860244751,
"rewards/margins": 11.0027437210083,
"rewards/rejected": -5.841555595397949,
"step": 380
},
{
"epoch": 2.8036697247706424,
"grad_norm": 0.32903870940208435,
"learning_rate": 6.16638795894492e-08,
"logits/chosen": -0.9001256823539734,
"logits/rejected": -1.9853109121322632,
"logps/chosen": -261.6986389160156,
"logps/rejected": -200.32876586914062,
"loss": 0.0049,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.192695617675781,
"rewards/margins": 10.185223579406738,
"rewards/rejected": -4.992527961730957,
"step": 382
},
{
"epoch": 2.818348623853211,
"grad_norm": 1.1356521844863892,
"learning_rate": 5.257412261176375e-08,
"logits/chosen": -1.0478947162628174,
"logits/rejected": -2.031193971633911,
"logps/chosen": -272.9176025390625,
"logps/rejected": -191.03363037109375,
"loss": 0.008,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.408777236938477,
"rewards/margins": 10.281020164489746,
"rewards/rejected": -4.8722429275512695,
"step": 384
},
{
"epoch": 2.83302752293578,
"grad_norm": 0.36936327815055847,
"learning_rate": 4.4201708110795384e-08,
"logits/chosen": -0.9411278963088989,
"logits/rejected": -1.9795866012573242,
"logps/chosen": -292.65386962890625,
"logps/rejected": -204.53778076171875,
"loss": 0.0039,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.8684539794921875,
"rewards/margins": 9.500937461853027,
"rewards/rejected": -4.632482528686523,
"step": 386
},
{
"epoch": 2.847706422018349,
"grad_norm": 0.16482090950012207,
"learning_rate": 3.654909004791152e-08,
"logits/chosen": -0.938539981842041,
"logits/rejected": -2.1424248218536377,
"logps/chosen": -293.4642333984375,
"logps/rejected": -184.5415802001953,
"loss": 0.0121,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.669407844543457,
"rewards/margins": 10.090935707092285,
"rewards/rejected": -5.421527862548828,
"step": 388
},
{
"epoch": 2.8623853211009176,
"grad_norm": 0.13458868861198425,
"learning_rate": 2.9618511411570462e-08,
"logits/chosen": -1.0025708675384521,
"logits/rejected": -2.083418607711792,
"logps/chosen": -284.4067687988281,
"logps/rejected": -172.37875366210938,
"loss": 0.0069,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.38177490234375,
"rewards/margins": 9.599076271057129,
"rewards/rejected": -5.2173004150390625,
"step": 390
},
{
"epoch": 2.8770642201834864,
"grad_norm": 1.0483838319778442,
"learning_rate": 2.3412003559898088e-08,
"logits/chosen": -0.8990004658699036,
"logits/rejected": -1.8701345920562744,
"logps/chosen": -271.26129150390625,
"logps/rejected": -207.7753143310547,
"loss": 0.0125,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.918183326721191,
"rewards/margins": 9.391782760620117,
"rewards/rejected": -4.473598957061768,
"step": 392
},
{
"epoch": 2.891743119266055,
"grad_norm": 0.25216034054756165,
"learning_rate": 1.793138562529634e-08,
"logits/chosen": -0.971919059753418,
"logits/rejected": -2.1569983959198,
"logps/chosen": -346.71875,
"logps/rejected": -184.22348022460938,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.533580780029297,
"rewards/margins": 10.083108901977539,
"rewards/rejected": -4.549527168273926,
"step": 394
},
{
"epoch": 2.906422018348624,
"grad_norm": 0.8910009860992432,
"learning_rate": 1.317826398125277e-08,
"logits/chosen": -1.062324047088623,
"logits/rejected": -2.1035232543945312,
"logps/chosen": -293.03125,
"logps/rejected": -204.35723876953125,
"loss": 0.0135,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.314833641052246,
"rewards/margins": 10.953380584716797,
"rewards/rejected": -5.638547420501709,
"step": 396
},
{
"epoch": 2.921100917431193,
"grad_norm": 0.3026532828807831,
"learning_rate": 9.15403177151275e-09,
"logits/chosen": -0.9711483716964722,
"logits/rejected": -1.8983428478240967,
"logps/chosen": -276.14398193359375,
"logps/rejected": -217.97817993164062,
"loss": 0.0064,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.347715377807617,
"rewards/margins": 10.437253952026367,
"rewards/rejected": -5.08953857421875,
"step": 398
},
{
"epoch": 2.9357798165137616,
"grad_norm": 0.21673916280269623,
"learning_rate": 5.85986850174608e-09,
"logits/chosen": -0.8715996146202087,
"logits/rejected": -2.193289279937744,
"logps/chosen": -312.49847412109375,
"logps/rejected": -185.29078674316406,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.40484619140625,
"rewards/margins": 10.567187309265137,
"rewards/rejected": -5.162341117858887,
"step": 400
},
{
"epoch": 2.9504587155963304,
"grad_norm": 0.18232221901416779,
"learning_rate": 3.296739693834927e-09,
"logits/chosen": -1.094886302947998,
"logits/rejected": -1.9882696866989136,
"logps/chosen": -305.0465087890625,
"logps/rejected": -184.07928466796875,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.589092254638672,
"rewards/margins": 9.142921447753906,
"rewards/rejected": -4.553828239440918,
"step": 402
},
{
"epoch": 2.9651376146788992,
"grad_norm": 0.28508853912353516,
"learning_rate": 1.4653966028774225e-09,
"logits/chosen": -0.9431482553482056,
"logits/rejected": -1.953324556350708,
"logps/chosen": -313.6567077636719,
"logps/rejected": -213.5366973876953,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.910001277923584,
"rewards/margins": 10.507518768310547,
"rewards/rejected": -5.597517490386963,
"step": 404
},
{
"epoch": 2.979816513761468,
"grad_norm": 0.14594300091266632,
"learning_rate": 3.6637599699351766e-10,
"logits/chosen": -0.940761387348175,
"logits/rejected": -2.1918911933898926,
"logps/chosen": -289.67626953125,
"logps/rejected": -180.724365234375,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.103169918060303,
"rewards/margins": 9.98257827758789,
"rewards/rejected": -4.879408359527588,
"step": 406
},
{
"epoch": 2.994495412844037,
"grad_norm": 0.16170361638069153,
"learning_rate": 0.0,
"logits/chosen": -1.0162718296051025,
"logits/rejected": -1.9849637746810913,
"logps/chosen": -319.7056579589844,
"logps/rejected": -213.12435913085938,
"loss": 0.0316,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.299257755279541,
"rewards/margins": 10.258605003356934,
"rewards/rejected": -4.959346771240234,
"step": 408
},
{
"epoch": 2.994495412844037,
"step": 408,
"total_flos": 7.837376281021809e+17,
"train_loss": 0.11720214437923905,
"train_runtime": 8069.9016,
"train_samples_per_second": 1.62,
"train_steps_per_second": 0.051
}
],
"logging_steps": 2,
"max_steps": 408,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.837376281021809e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}