zephyr-7b-dpo-qlora / trainer_state.json
geonmin-kim's picture
Model save
6615799 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 3.8125,
"learning_rate": 1.3054830287206268e-08,
"logits/chosen": -2.377302885055542,
"logits/rejected": -2.2193117141723633,
"logps/chosen": -290.4185485839844,
"logps/rejected": -374.6501770019531,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 2.40625,
"learning_rate": 1.3054830287206266e-07,
"logits/chosen": -2.25045108795166,
"logits/rejected": -2.052776575088501,
"logps/chosen": -279.61688232421875,
"logps/rejected": -245.4197540283203,
"loss": 0.6931,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.000771976076066494,
"rewards/margins": 0.00010288292105542496,
"rewards/rejected": 0.0006690931040793657,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 2.484375,
"learning_rate": 2.610966057441253e-07,
"logits/chosen": -2.2451391220092773,
"logits/rejected": -1.944021224975586,
"logps/chosen": -305.45184326171875,
"logps/rejected": -237.7191619873047,
"loss": 0.6926,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.004307927563786507,
"rewards/margins": 0.0011060098186135292,
"rewards/rejected": 0.003201917978003621,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 2.3125,
"learning_rate": 3.9164490861618804e-07,
"logits/chosen": -2.2053542137145996,
"logits/rejected": -2.136805772781372,
"logps/chosen": -251.1873016357422,
"logps/rejected": -251.39126586914062,
"loss": 0.692,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.012356054969131947,
"rewards/margins": 0.0023143726866692305,
"rewards/rejected": 0.010041682049632072,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 1.9453125,
"learning_rate": 5.221932114882506e-07,
"logits/chosen": -2.062053918838501,
"logits/rejected": -2.0244908332824707,
"logps/chosen": -216.23828125,
"logps/rejected": -221.68917846679688,
"loss": 0.6915,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.019059285521507263,
"rewards/margins": 0.0032902732491493225,
"rewards/rejected": 0.01576901227235794,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 2.078125,
"learning_rate": 6.527415143603135e-07,
"logits/chosen": -2.1121723651885986,
"logits/rejected": -2.1005072593688965,
"logps/chosen": -266.8817443847656,
"logps/rejected": -234.3415069580078,
"loss": 0.6905,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.030057832598686218,
"rewards/margins": 0.005467818584293127,
"rewards/rejected": 0.024590013548731804,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 2.125,
"learning_rate": 7.832898172323761e-07,
"logits/chosen": -2.0995335578918457,
"logits/rejected": -1.9425058364868164,
"logps/chosen": -252.32351684570312,
"logps/rejected": -226.69961547851562,
"loss": 0.69,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03176448494195938,
"rewards/margins": 0.006372343748807907,
"rewards/rejected": 0.025392139330506325,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 2.03125,
"learning_rate": 9.138381201044387e-07,
"logits/chosen": -2.2442469596862793,
"logits/rejected": -2.036492347717285,
"logps/chosen": -272.0433044433594,
"logps/rejected": -246.6951446533203,
"loss": 0.6879,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.04112860932946205,
"rewards/margins": 0.010742614977061749,
"rewards/rejected": 0.030385995283722878,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 2.390625,
"learning_rate": 1.0443864229765013e-06,
"logits/chosen": -2.153740882873535,
"logits/rejected": -1.977267861366272,
"logps/chosen": -257.5650329589844,
"logps/rejected": -246.85354614257812,
"loss": 0.6872,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.038635507225990295,
"rewards/margins": 0.012301743030548096,
"rewards/rejected": 0.0263337641954422,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 2.1875,
"learning_rate": 1.1749347258485642e-06,
"logits/chosen": -2.136314868927002,
"logits/rejected": -2.000256061553955,
"logps/chosen": -250.14096069335938,
"logps/rejected": -234.5118408203125,
"loss": 0.6848,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.04195228964090347,
"rewards/margins": 0.017196740955114365,
"rewards/rejected": 0.02475554868578911,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 2.125,
"learning_rate": 1.305483028720627e-06,
"logits/chosen": -2.179086208343506,
"logits/rejected": -2.068403482437134,
"logps/chosen": -246.95883178710938,
"logps/rejected": -230.7919921875,
"loss": 0.6819,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.04810682684183121,
"rewards/margins": 0.023308780044317245,
"rewards/rejected": 0.024798044934868813,
"step": 100
},
{
"epoch": 0.03,
"eval_logits/chosen": -2.095933198928833,
"eval_logits/rejected": -1.9564727544784546,
"eval_logps/chosen": -259.64715576171875,
"eval_logps/rejected": -241.9028778076172,
"eval_loss": 0.6821568012237549,
"eval_rewards/accuracies": 0.6545000076293945,
"eval_rewards/chosen": 0.05004846677184105,
"eval_rewards/margins": 0.02299799770116806,
"eval_rewards/rejected": 0.02705046720802784,
"eval_runtime": 381.806,
"eval_samples_per_second": 5.238,
"eval_steps_per_second": 0.655,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 2.3125,
"learning_rate": 1.4360313315926894e-06,
"logits/chosen": -2.1454405784606934,
"logits/rejected": -2.0017640590667725,
"logps/chosen": -284.425537109375,
"logps/rejected": -238.8695526123047,
"loss": 0.6795,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.049382902681827545,
"rewards/margins": 0.02859182097017765,
"rewards/rejected": 0.020791077986359596,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 2.140625,
"learning_rate": 1.5665796344647521e-06,
"logits/chosen": -2.1937575340270996,
"logits/rejected": -2.054399013519287,
"logps/chosen": -287.4629821777344,
"logps/rejected": -271.8957824707031,
"loss": 0.6729,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.05581967160105705,
"rewards/margins": 0.042316947132349014,
"rewards/rejected": 0.013502727262675762,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 2.765625,
"learning_rate": 1.6971279373368146e-06,
"logits/chosen": -2.208482265472412,
"logits/rejected": -2.118875503540039,
"logps/chosen": -250.0573272705078,
"logps/rejected": -252.57418823242188,
"loss": 0.6698,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.050946980714797974,
"rewards/margins": 0.049403756856918335,
"rewards/rejected": 0.0015432273503392935,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 2.484375,
"learning_rate": 1.8276762402088774e-06,
"logits/chosen": -2.2458949089050293,
"logits/rejected": -1.911431074142456,
"logps/chosen": -270.4693298339844,
"logps/rejected": -226.22677612304688,
"loss": 0.6685,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.04268602281808853,
"rewards/margins": 0.05290870741009712,
"rewards/rejected": -0.010222683660686016,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 2.640625,
"learning_rate": 1.9582245430809403e-06,
"logits/chosen": -2.2650039196014404,
"logits/rejected": -2.039114475250244,
"logps/chosen": -280.2913818359375,
"logps/rejected": -242.7501983642578,
"loss": 0.6678,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.03719528391957283,
"rewards/margins": 0.05549495667219162,
"rewards/rejected": -0.01829967275261879,
"step": 150
},
{
"epoch": 0.04,
"grad_norm": 2.671875,
"learning_rate": 2.0887728459530026e-06,
"logits/chosen": -2.1557822227478027,
"logits/rejected": -2.0535261631011963,
"logps/chosen": -256.06103515625,
"logps/rejected": -261.87261962890625,
"loss": 0.6687,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.007628369145095348,
"rewards/margins": 0.05603449419140816,
"rewards/rejected": -0.04840613156557083,
"step": 160
},
{
"epoch": 0.04,
"grad_norm": 2.875,
"learning_rate": 2.2193211488250653e-06,
"logits/chosen": -2.125109910964966,
"logits/rejected": -1.9704573154449463,
"logps/chosen": -220.9778594970703,
"logps/rejected": -228.26919555664062,
"loss": 0.671,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.0038716509006917477,
"rewards/margins": 0.05044783279299736,
"rewards/rejected": -0.05431948974728584,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 3.53125,
"learning_rate": 2.3498694516971284e-06,
"logits/chosen": -2.1243832111358643,
"logits/rejected": -1.9889084100723267,
"logps/chosen": -258.29095458984375,
"logps/rejected": -251.7142333984375,
"loss": 0.6638,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.025864282622933388,
"rewards/margins": 0.06769417971372604,
"rewards/rejected": -0.09355846792459488,
"step": 180
},
{
"epoch": 0.05,
"grad_norm": 3.015625,
"learning_rate": 2.4804177545691907e-06,
"logits/chosen": -2.2455646991729736,
"logits/rejected": -2.0299086570739746,
"logps/chosen": -272.17633056640625,
"logps/rejected": -253.8187255859375,
"loss": 0.6499,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.020921263843774796,
"rewards/margins": 0.09995204210281372,
"rewards/rejected": -0.12087330967187881,
"step": 190
},
{
"epoch": 0.05,
"grad_norm": 3.546875,
"learning_rate": 2.610966057441254e-06,
"logits/chosen": -2.1975388526916504,
"logits/rejected": -1.9570707082748413,
"logps/chosen": -264.46234130859375,
"logps/rejected": -235.4163818359375,
"loss": 0.6548,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.14753015339374542,
"rewards/margins": 0.09057016670703888,
"rewards/rejected": -0.23810029029846191,
"step": 200
},
{
"epoch": 0.05,
"eval_logits/chosen": -2.0694758892059326,
"eval_logits/rejected": -1.9328563213348389,
"eval_logps/chosen": -279.5373229980469,
"eval_logps/rejected": -269.7627868652344,
"eval_loss": 0.6499924063682556,
"eval_rewards/accuracies": 0.6779999732971191,
"eval_rewards/chosen": -0.1488528698682785,
"eval_rewards/margins": 0.10269534587860107,
"eval_rewards/rejected": -0.2515482008457184,
"eval_runtime": 382.022,
"eval_samples_per_second": 5.235,
"eval_steps_per_second": 0.654,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 3.140625,
"learning_rate": 2.741514360313316e-06,
"logits/chosen": -2.198995590209961,
"logits/rejected": -1.9819616079330444,
"logps/chosen": -271.3312072753906,
"logps/rejected": -252.93746948242188,
"loss": 0.6365,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09111092239618301,
"rewards/margins": 0.1327463835477829,
"rewards/rejected": -0.2238573133945465,
"step": 210
},
{
"epoch": 0.06,
"grad_norm": 3.1875,
"learning_rate": 2.872062663185379e-06,
"logits/chosen": -2.097423553466797,
"logits/rejected": -1.9822295904159546,
"logps/chosen": -259.9545593261719,
"logps/rejected": -246.3585662841797,
"loss": 0.6394,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.05938801169395447,
"rewards/margins": 0.12806808948516846,
"rewards/rejected": -0.18745610117912292,
"step": 220
},
{
"epoch": 0.06,
"grad_norm": 6.40625,
"learning_rate": 3.0026109660574416e-06,
"logits/chosen": -2.2377123832702637,
"logits/rejected": -2.050795078277588,
"logps/chosen": -315.82159423828125,
"logps/rejected": -288.96539306640625,
"loss": 0.6629,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.09652389585971832,
"rewards/margins": 0.08648413419723511,
"rewards/rejected": -0.18300803005695343,
"step": 230
},
{
"epoch": 0.06,
"grad_norm": 3.21875,
"learning_rate": 3.1331592689295043e-06,
"logits/chosen": -2.1486618518829346,
"logits/rejected": -1.961085319519043,
"logps/chosen": -312.89373779296875,
"logps/rejected": -312.0883483886719,
"loss": 0.6388,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.1659378707408905,
"rewards/margins": 0.1430220901966095,
"rewards/rejected": -0.3089599311351776,
"step": 240
},
{
"epoch": 0.07,
"grad_norm": 5.15625,
"learning_rate": 3.263707571801567e-06,
"logits/chosen": -2.112567186355591,
"logits/rejected": -2.012845039367676,
"logps/chosen": -277.0249938964844,
"logps/rejected": -268.689208984375,
"loss": 0.6263,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.1744508147239685,
"rewards/margins": 0.17131540179252625,
"rewards/rejected": -0.34576624631881714,
"step": 250
},
{
"epoch": 0.07,
"grad_norm": 4.53125,
"learning_rate": 3.3942558746736293e-06,
"logits/chosen": -2.1583478450775146,
"logits/rejected": -1.9551265239715576,
"logps/chosen": -310.0099792480469,
"logps/rejected": -299.52789306640625,
"loss": 0.6515,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.37605080008506775,
"rewards/margins": 0.11539731919765472,
"rewards/rejected": -0.49144816398620605,
"step": 260
},
{
"epoch": 0.07,
"grad_norm": 3.296875,
"learning_rate": 3.524804177545692e-06,
"logits/chosen": -2.0597262382507324,
"logits/rejected": -1.9347015619277954,
"logps/chosen": -287.3021545410156,
"logps/rejected": -277.96014404296875,
"loss": 0.6083,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.26627668738365173,
"rewards/margins": 0.22069358825683594,
"rewards/rejected": -0.48697033524513245,
"step": 270
},
{
"epoch": 0.07,
"grad_norm": 4.25,
"learning_rate": 3.6553524804177547e-06,
"logits/chosen": -2.125945568084717,
"logits/rejected": -1.954007863998413,
"logps/chosen": -298.900390625,
"logps/rejected": -293.0090637207031,
"loss": 0.6386,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.39951270818710327,
"rewards/margins": 0.1558128446340561,
"rewards/rejected": -0.5553255081176758,
"step": 280
},
{
"epoch": 0.08,
"grad_norm": 3.796875,
"learning_rate": 3.7859007832898174e-06,
"logits/chosen": -2.0477206707000732,
"logits/rejected": -1.9491031169891357,
"logps/chosen": -324.5054626464844,
"logps/rejected": -319.0287780761719,
"loss": 0.6271,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.38871732354164124,
"rewards/margins": 0.19628065824508667,
"rewards/rejected": -0.5849979519844055,
"step": 290
},
{
"epoch": 0.08,
"grad_norm": 4.96875,
"learning_rate": 3.9164490861618806e-06,
"logits/chosen": -2.0910630226135254,
"logits/rejected": -1.888196587562561,
"logps/chosen": -272.47198486328125,
"logps/rejected": -281.57830810546875,
"loss": 0.6084,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.28343814611434937,
"rewards/margins": 0.22831246256828308,
"rewards/rejected": -0.5117505788803101,
"step": 300
},
{
"epoch": 0.08,
"eval_logits/chosen": -2.011384963989258,
"eval_logits/rejected": -1.8770692348480225,
"eval_logps/chosen": -294.2168884277344,
"eval_logps/rejected": -294.5921325683594,
"eval_loss": 0.6213397979736328,
"eval_rewards/accuracies": 0.6809999942779541,
"eval_rewards/chosen": -0.29564887285232544,
"eval_rewards/margins": 0.20419315993785858,
"eval_rewards/rejected": -0.4998420178890228,
"eval_runtime": 381.8433,
"eval_samples_per_second": 5.238,
"eval_steps_per_second": 0.655,
"step": 300
},
{
"epoch": 0.08,
"grad_norm": 4.03125,
"learning_rate": 4.046997389033943e-06,
"logits/chosen": -2.2418582439422607,
"logits/rejected": -2.04129695892334,
"logps/chosen": -316.5093994140625,
"logps/rejected": -291.79010009765625,
"loss": 0.5836,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.27603110671043396,
"rewards/margins": 0.2892019748687744,
"rewards/rejected": -0.565233051776886,
"step": 310
},
{
"epoch": 0.08,
"grad_norm": 4.40625,
"learning_rate": 4.177545691906005e-06,
"logits/chosen": -2.1178698539733887,
"logits/rejected": -1.9309499263763428,
"logps/chosen": -298.84527587890625,
"logps/rejected": -299.9272155761719,
"loss": 0.6369,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.40581315755844116,
"rewards/margins": 0.1810055673122406,
"rewards/rejected": -0.5868188142776489,
"step": 320
},
{
"epoch": 0.09,
"grad_norm": 4.4375,
"learning_rate": 4.308093994778068e-06,
"logits/chosen": -2.046699047088623,
"logits/rejected": -1.9039798974990845,
"logps/chosen": -296.7830505371094,
"logps/rejected": -293.9065246582031,
"loss": 0.6198,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3718874454498291,
"rewards/margins": 0.21303264796733856,
"rewards/rejected": -0.5849201083183289,
"step": 330
},
{
"epoch": 0.09,
"grad_norm": 5.375,
"learning_rate": 4.4386422976501306e-06,
"logits/chosen": -2.1172854900360107,
"logits/rejected": -2.0036845207214355,
"logps/chosen": -316.01226806640625,
"logps/rejected": -323.5932922363281,
"loss": 0.5946,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.26326456665992737,
"rewards/margins": 0.28980451822280884,
"rewards/rejected": -0.5530691146850586,
"step": 340
},
{
"epoch": 0.09,
"grad_norm": 3.8125,
"learning_rate": 4.569190600522193e-06,
"logits/chosen": -2.042684555053711,
"logits/rejected": -1.8946377038955688,
"logps/chosen": -352.21502685546875,
"logps/rejected": -358.153564453125,
"loss": 0.6413,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6295667886734009,
"rewards/margins": 0.17427758872509003,
"rewards/rejected": -0.8038444519042969,
"step": 350
},
{
"epoch": 0.09,
"grad_norm": 3.015625,
"learning_rate": 4.699738903394257e-06,
"logits/chosen": -2.011836528778076,
"logits/rejected": -1.9665615558624268,
"logps/chosen": -317.6282958984375,
"logps/rejected": -318.0123291015625,
"loss": 0.6161,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7664434909820557,
"rewards/margins": 0.21231558918952942,
"rewards/rejected": -0.9787591099739075,
"step": 360
},
{
"epoch": 0.1,
"grad_norm": 4.53125,
"learning_rate": 4.8302872062663196e-06,
"logits/chosen": -2.1028566360473633,
"logits/rejected": -1.9274108409881592,
"logps/chosen": -356.88507080078125,
"logps/rejected": -335.1341857910156,
"loss": 0.6264,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.7998887300491333,
"rewards/margins": 0.22070667147636414,
"rewards/rejected": -1.0205953121185303,
"step": 370
},
{
"epoch": 0.1,
"grad_norm": 4.59375,
"learning_rate": 4.9608355091383814e-06,
"logits/chosen": -2.069827079772949,
"logits/rejected": -1.8606586456298828,
"logps/chosen": -364.96856689453125,
"logps/rejected": -353.82769775390625,
"loss": 0.6264,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6691190004348755,
"rewards/margins": 0.2223375141620636,
"rewards/rejected": -0.8914563059806824,
"step": 380
},
{
"epoch": 0.1,
"grad_norm": 4.78125,
"learning_rate": 4.9999488562447675e-06,
"logits/chosen": -2.088129997253418,
"logits/rejected": -1.971571683883667,
"logps/chosen": -316.87994384765625,
"logps/rejected": -327.4869079589844,
"loss": 0.5863,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3583374619483948,
"rewards/margins": 0.3061427175998688,
"rewards/rejected": -0.6644802093505859,
"step": 390
},
{
"epoch": 0.1,
"grad_norm": 5.125,
"learning_rate": 4.999698361256577e-06,
"logits/chosen": -2.119563341140747,
"logits/rejected": -1.8813574314117432,
"logps/chosen": -296.64593505859375,
"logps/rejected": -276.7133483886719,
"loss": 0.6237,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.31169393658638,
"rewards/margins": 0.207248717546463,
"rewards/rejected": -0.5189425945281982,
"step": 400
},
{
"epoch": 0.1,
"eval_logits/chosen": -1.9655628204345703,
"eval_logits/rejected": -1.836700201034546,
"eval_logps/chosen": -310.03485107421875,
"eval_logps/rejected": -318.6169738769531,
"eval_loss": 0.6038790345191956,
"eval_rewards/accuracies": 0.6934999823570251,
"eval_rewards/chosen": -0.45382827520370483,
"eval_rewards/margins": 0.2862620949745178,
"eval_rewards/rejected": -0.7400903105735779,
"eval_runtime": 382.0228,
"eval_samples_per_second": 5.235,
"eval_steps_per_second": 0.654,
"step": 400
},
{
"epoch": 0.11,
"grad_norm": 4.0625,
"learning_rate": 4.999239142174581e-06,
"logits/chosen": -1.988959550857544,
"logits/rejected": -1.9292503595352173,
"logps/chosen": -292.4900817871094,
"logps/rejected": -307.29473876953125,
"loss": 0.6499,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.5016773343086243,
"rewards/margins": 0.16585329174995422,
"rewards/rejected": -0.6675306558609009,
"step": 410
},
{
"epoch": 0.11,
"grad_norm": 5.375,
"learning_rate": 4.99857123734344e-06,
"logits/chosen": -2.0150246620178223,
"logits/rejected": -1.8929126262664795,
"logps/chosen": -260.4281921386719,
"logps/rejected": -280.3924865722656,
"loss": 0.5908,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3163732588291168,
"rewards/margins": 0.29044631123542786,
"rewards/rejected": -0.6068195104598999,
"step": 420
},
{
"epoch": 0.11,
"grad_norm": 4.75,
"learning_rate": 4.997694702533016e-06,
"logits/chosen": -2.0086240768432617,
"logits/rejected": -1.9487006664276123,
"logps/chosen": -308.3887634277344,
"logps/rejected": -317.20904541015625,
"loss": 0.5817,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3084072172641754,
"rewards/margins": 0.3242705166339874,
"rewards/rejected": -0.6326777338981628,
"step": 430
},
{
"epoch": 0.12,
"grad_norm": 7.90625,
"learning_rate": 4.996609610933713e-06,
"logits/chosen": -2.112046718597412,
"logits/rejected": -2.027024984359741,
"logps/chosen": -303.4664306640625,
"logps/rejected": -303.01220703125,
"loss": 0.6025,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3131754994392395,
"rewards/margins": 0.2790473401546478,
"rewards/rejected": -0.5922229290008545,
"step": 440
},
{
"epoch": 0.12,
"grad_norm": 5.8125,
"learning_rate": 4.995316053150366e-06,
"logits/chosen": -1.9543377161026,
"logits/rejected": -1.8296692371368408,
"logps/chosen": -309.422119140625,
"logps/rejected": -325.46173095703125,
"loss": 0.5577,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4158857762813568,
"rewards/margins": 0.3905051648616791,
"rewards/rejected": -0.8063910603523254,
"step": 450
},
{
"epoch": 0.12,
"grad_norm": 8.375,
"learning_rate": 4.9938141371946815e-06,
"logits/chosen": -1.9097979068756104,
"logits/rejected": -1.8239259719848633,
"logps/chosen": -370.8164978027344,
"logps/rejected": -396.86004638671875,
"loss": 0.5805,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0571677684783936,
"rewards/margins": 0.4056069254875183,
"rewards/rejected": -1.4627748727798462,
"step": 460
},
{
"epoch": 0.12,
"grad_norm": 5.40625,
"learning_rate": 4.992103988476206e-06,
"logits/chosen": -1.9127140045166016,
"logits/rejected": -1.7631990909576416,
"logps/chosen": -352.392822265625,
"logps/rejected": -381.87896728515625,
"loss": 0.5803,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0857564210891724,
"rewards/margins": 0.4201774597167969,
"rewards/rejected": -1.5059337615966797,
"step": 470
},
{
"epoch": 0.13,
"grad_norm": 5.0625,
"learning_rate": 4.990185749791866e-06,
"logits/chosen": -1.892653226852417,
"logits/rejected": -1.7571289539337158,
"logps/chosen": -333.1285095214844,
"logps/rejected": -386.10107421875,
"loss": 0.5413,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.7624952793121338,
"rewards/margins": 0.5117734670639038,
"rewards/rejected": -1.2742688655853271,
"step": 480
},
{
"epoch": 0.13,
"grad_norm": 7.0,
"learning_rate": 4.9880595813140395e-06,
"logits/chosen": -1.8925682306289673,
"logits/rejected": -1.7469890117645264,
"logps/chosen": -369.3451232910156,
"logps/rejected": -387.9554443359375,
"loss": 0.5514,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8445364832878113,
"rewards/margins": 0.4895913600921631,
"rewards/rejected": -1.3341277837753296,
"step": 490
},
{
"epoch": 0.13,
"grad_norm": 5.4375,
"learning_rate": 4.985725660577184e-06,
"logits/chosen": -1.8205528259277344,
"logits/rejected": -1.6672782897949219,
"logps/chosen": -371.17864990234375,
"logps/rejected": -382.154296875,
"loss": 0.5534,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9741867780685425,
"rewards/margins": 0.522149920463562,
"rewards/rejected": -1.4963366985321045,
"step": 500
},
{
"epoch": 0.13,
"eval_logits/chosen": -1.671244502067566,
"eval_logits/rejected": -1.5403351783752441,
"eval_logps/chosen": -356.194580078125,
"eval_logps/rejected": -383.8828430175781,
"eval_loss": 0.5691964626312256,
"eval_rewards/accuracies": 0.7049999833106995,
"eval_rewards/chosen": -0.9154260158538818,
"eval_rewards/margins": 0.4773229658603668,
"eval_rewards/rejected": -1.3927491903305054,
"eval_runtime": 382.3757,
"eval_samples_per_second": 5.23,
"eval_steps_per_second": 0.654,
"step": 500
},
{
"epoch": 0.13,
"grad_norm": 6.53125,
"learning_rate": 4.983184182463009e-06,
"logits/chosen": -1.7440261840820312,
"logits/rejected": -1.6317085027694702,
"logps/chosen": -373.0206604003906,
"logps/rejected": -391.50970458984375,
"loss": 0.5646,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.947595477104187,
"rewards/margins": 0.555194079875946,
"rewards/rejected": -1.5027896165847778,
"step": 510
},
{
"epoch": 0.14,
"grad_norm": 7.65625,
"learning_rate": 4.980435359184203e-06,
"logits/chosen": -1.7637799978256226,
"logits/rejected": -1.7051684856414795,
"logps/chosen": -361.0028991699219,
"logps/rejected": -383.77392578125,
"loss": 0.6028,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.8794111013412476,
"rewards/margins": 0.3896932005882263,
"rewards/rejected": -1.2691043615341187,
"step": 520
},
{
"epoch": 0.14,
"grad_norm": 5.3125,
"learning_rate": 4.9774794202667236e-06,
"logits/chosen": -1.7085822820663452,
"logits/rejected": -1.6667120456695557,
"logps/chosen": -398.4223327636719,
"logps/rejected": -447.1837463378906,
"loss": 0.5797,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.373207688331604,
"rewards/margins": 0.4029502272605896,
"rewards/rejected": -1.776158094406128,
"step": 530
},
{
"epoch": 0.14,
"grad_norm": 8.0625,
"learning_rate": 4.974316612530615e-06,
"logits/chosen": -1.6480659246444702,
"logits/rejected": -1.4872467517852783,
"logps/chosen": -413.641845703125,
"logps/rejected": -420.10565185546875,
"loss": 0.5292,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.3515903949737549,
"rewards/margins": 0.5323625206947327,
"rewards/rejected": -1.8839528560638428,
"step": 540
},
{
"epoch": 0.14,
"grad_norm": 9.375,
"learning_rate": 4.970947200069416e-06,
"logits/chosen": -1.6254298686981201,
"logits/rejected": -1.5536671876907349,
"logps/chosen": -402.1681213378906,
"logps/rejected": -431.54510498046875,
"loss": 0.5995,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2365509271621704,
"rewards/margins": 0.4915947914123535,
"rewards/rejected": -1.7281455993652344,
"step": 550
},
{
"epoch": 0.15,
"grad_norm": 5.90625,
"learning_rate": 4.967371464228096e-06,
"logits/chosen": -1.788649559020996,
"logits/rejected": -1.6893421411514282,
"logps/chosen": -362.63739013671875,
"logps/rejected": -421.24505615234375,
"loss": 0.5384,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0513819456100464,
"rewards/margins": 0.5819835066795349,
"rewards/rejected": -1.6333656311035156,
"step": 560
},
{
"epoch": 0.15,
"grad_norm": 7.28125,
"learning_rate": 4.963589703579569e-06,
"logits/chosen": -1.7911745309829712,
"logits/rejected": -1.6469875574111938,
"logps/chosen": -439.2314453125,
"logps/rejected": -465.60174560546875,
"loss": 0.5809,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.391915202140808,
"rewards/margins": 0.61050945520401,
"rewards/rejected": -2.002424716949463,
"step": 570
},
{
"epoch": 0.15,
"grad_norm": 8.375,
"learning_rate": 4.9596022338997615e-06,
"logits/chosen": -1.7446343898773193,
"logits/rejected": -1.5205295085906982,
"logps/chosen": -424.37664794921875,
"logps/rejected": -455.3761291503906,
"loss": 0.5342,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2658993005752563,
"rewards/margins": 0.7207783460617065,
"rewards/rejected": -1.9866775274276733,
"step": 580
},
{
"epoch": 0.15,
"grad_norm": 5.9375,
"learning_rate": 4.955409388141243e-06,
"logits/chosen": -1.5974572896957397,
"logits/rejected": -1.4778482913970947,
"logps/chosen": -365.91943359375,
"logps/rejected": -388.0648498535156,
"loss": 0.6027,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0861928462982178,
"rewards/margins": 0.45802631974220276,
"rewards/rejected": -1.5442192554473877,
"step": 590
},
{
"epoch": 0.16,
"grad_norm": 5.5625,
"learning_rate": 4.951011516405429e-06,
"logits/chosen": -1.682959794998169,
"logits/rejected": -1.6160876750946045,
"logps/chosen": -331.21978759765625,
"logps/rejected": -367.4974060058594,
"loss": 0.5613,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8046302795410156,
"rewards/margins": 0.5121658444404602,
"rewards/rejected": -1.316796064376831,
"step": 600
},
{
"epoch": 0.16,
"eval_logits/chosen": -1.5049409866333008,
"eval_logits/rejected": -1.3701001405715942,
"eval_logps/chosen": -345.8829650878906,
"eval_logps/rejected": -376.7896423339844,
"eval_loss": 0.5658991932868958,
"eval_rewards/accuracies": 0.7024999856948853,
"eval_rewards/chosen": -0.8123093843460083,
"eval_rewards/margins": 0.5095077753067017,
"eval_rewards/rejected": -1.32181715965271,
"eval_runtime": 382.004,
"eval_samples_per_second": 5.236,
"eval_steps_per_second": 0.654,
"step": 600
},
{
"epoch": 0.16,
"grad_norm": 5.375,
"learning_rate": 4.946408985913344e-06,
"logits/chosen": -1.578046202659607,
"logits/rejected": -1.4836609363555908,
"logps/chosen": -328.2045593261719,
"logps/rejected": -375.481201171875,
"loss": 0.5276,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8234087228775024,
"rewards/margins": 0.6500986218452454,
"rewards/rejected": -1.473507285118103,
"step": 610
},
{
"epoch": 0.16,
"grad_norm": 11.875,
"learning_rate": 4.941602180974958e-06,
"logits/chosen": -1.5045579671859741,
"logits/rejected": -1.2604496479034424,
"logps/chosen": -402.4884338378906,
"logps/rejected": -422.79736328125,
"loss": 0.5241,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.146611213684082,
"rewards/margins": 0.7896040678024292,
"rewards/rejected": -1.9362151622772217,
"step": 620
},
{
"epoch": 0.16,
"grad_norm": 10.5,
"learning_rate": 4.936591502957101e-06,
"logits/chosen": -1.372164249420166,
"logits/rejected": -1.2230699062347412,
"logps/chosen": -414.8818359375,
"logps/rejected": -487.482421875,
"loss": 0.538,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.667520523071289,
"rewards/margins": 0.8288544416427612,
"rewards/rejected": -2.4963748455047607,
"step": 630
},
{
"epoch": 0.17,
"grad_norm": 9.6875,
"learning_rate": 4.931377370249946e-06,
"logits/chosen": -1.3338875770568848,
"logits/rejected": -1.1355304718017578,
"logps/chosen": -483.4081115722656,
"logps/rejected": -526.1396484375,
"loss": 0.5676,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.180846691131592,
"rewards/margins": 0.6847688555717468,
"rewards/rejected": -2.8656158447265625,
"step": 640
},
{
"epoch": 0.17,
"grad_norm": 15.6875,
"learning_rate": 4.925960218232073e-06,
"logits/chosen": -1.3147588968276978,
"logits/rejected": -1.1933101415634155,
"logps/chosen": -446.49346923828125,
"logps/rejected": -517.9827270507812,
"loss": 0.5392,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.9119131565093994,
"rewards/margins": 0.8099091649055481,
"rewards/rejected": -2.721822500228882,
"step": 650
},
{
"epoch": 0.17,
"grad_norm": 8.875,
"learning_rate": 4.920340499234116e-06,
"logits/chosen": -1.3101979494094849,
"logits/rejected": -1.1101386547088623,
"logps/chosen": -426.4873046875,
"logps/rejected": -446.02801513671875,
"loss": 0.5772,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6048357486724854,
"rewards/margins": 0.5474850535392761,
"rewards/rejected": -2.152320623397827,
"step": 660
},
{
"epoch": 0.18,
"grad_norm": 7.28125,
"learning_rate": 4.914518682500995e-06,
"logits/chosen": -1.4778305292129517,
"logits/rejected": -1.3038583993911743,
"logps/chosen": -432.8035583496094,
"logps/rejected": -459.92864990234375,
"loss": 0.5359,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.4926092624664307,
"rewards/margins": 0.6784954071044922,
"rewards/rejected": -2.171104907989502,
"step": 670
},
{
"epoch": 0.18,
"grad_norm": 5.40625,
"learning_rate": 4.9084952541527315e-06,
"logits/chosen": -1.3521184921264648,
"logits/rejected": -1.1778732538223267,
"logps/chosen": -430.7608947753906,
"logps/rejected": -458.68072509765625,
"loss": 0.5029,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5742666721343994,
"rewards/margins": 0.7503162622451782,
"rewards/rejected": -2.3245832920074463,
"step": 680
},
{
"epoch": 0.18,
"grad_norm": 7.6875,
"learning_rate": 4.902270717143858e-06,
"logits/chosen": -1.3213449716567993,
"logits/rejected": -1.228070855140686,
"logps/chosen": -417.1580505371094,
"logps/rejected": -537.0321044921875,
"loss": 0.4381,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8092586994171143,
"rewards/margins": 1.0819432735443115,
"rewards/rejected": -2.8912017345428467,
"step": 690
},
{
"epoch": 0.18,
"grad_norm": 6.5,
"learning_rate": 4.895845591221427e-06,
"logits/chosen": -1.2542212009429932,
"logits/rejected": -1.1810188293457031,
"logps/chosen": -466.4949645996094,
"logps/rejected": -549.9205932617188,
"loss": 0.5139,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.131845474243164,
"rewards/margins": 0.8790606260299683,
"rewards/rejected": -3.0109057426452637,
"step": 700
},
{
"epoch": 0.18,
"eval_logits/chosen": -1.0174403190612793,
"eval_logits/rejected": -0.8923892974853516,
"eval_logps/chosen": -528.3277587890625,
"eval_logps/rejected": -591.3086547851562,
"eval_loss": 0.5571516156196594,
"eval_rewards/accuracies": 0.7145000100135803,
"eval_rewards/chosen": -2.6367568969726562,
"eval_rewards/margins": 0.8302499055862427,
"eval_rewards/rejected": -3.4670066833496094,
"eval_runtime": 382.0721,
"eval_samples_per_second": 5.235,
"eval_steps_per_second": 0.654,
"step": 700
},
{
"epoch": 0.19,
"grad_norm": 10.5625,
"learning_rate": 4.8892204128816e-06,
"logits/chosen": -1.1841003894805908,
"logits/rejected": -1.0792133808135986,
"logps/chosen": -517.9019775390625,
"logps/rejected": -578.2611083984375,
"loss": 0.5501,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.5033812522888184,
"rewards/margins": 0.7395020127296448,
"rewards/rejected": -3.2428832054138184,
"step": 710
},
{
"epoch": 0.19,
"grad_norm": 8.875,
"learning_rate": 4.882395735324864e-06,
"logits/chosen": -1.1759226322174072,
"logits/rejected": -1.0294206142425537,
"logps/chosen": -477.3987731933594,
"logps/rejected": -544.5623779296875,
"loss": 0.4985,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.1131680011749268,
"rewards/margins": 0.8433287739753723,
"rewards/rejected": -2.9564967155456543,
"step": 720
},
{
"epoch": 0.19,
"grad_norm": 8.8125,
"learning_rate": 4.87537212840983e-06,
"logits/chosen": -1.1399719715118408,
"logits/rejected": -1.0124037265777588,
"logps/chosen": -500.2403259277344,
"logps/rejected": -533.0379028320312,
"loss": 0.5509,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.3398513793945312,
"rewards/margins": 0.6334503293037415,
"rewards/rejected": -2.973301887512207,
"step": 730
},
{
"epoch": 0.19,
"grad_norm": 12.375,
"learning_rate": 4.8681501786056545e-06,
"logits/chosen": -1.0892612934112549,
"logits/rejected": -0.941753089427948,
"logps/chosen": -450.81402587890625,
"logps/rejected": -503.46636962890625,
"loss": 0.5001,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2259793281555176,
"rewards/margins": 0.8490058183670044,
"rewards/rejected": -3.0749852657318115,
"step": 740
},
{
"epoch": 0.2,
"grad_norm": 24.0,
"learning_rate": 4.860730488943068e-06,
"logits/chosen": -1.0790389776229858,
"logits/rejected": -1.0216121673583984,
"logps/chosen": -440.62109375,
"logps/rejected": -540.6531372070312,
"loss": 0.4802,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.060957431793213,
"rewards/margins": 1.019281029701233,
"rewards/rejected": -3.0802388191223145,
"step": 750
},
{
"epoch": 0.2,
"grad_norm": 7.0,
"learning_rate": 4.853113678964022e-06,
"logits/chosen": -1.1443126201629639,
"logits/rejected": -1.065063238143921,
"logps/chosen": -448.5615234375,
"logps/rejected": -542.3307495117188,
"loss": 0.505,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.76167893409729,
"rewards/margins": 1.016688346862793,
"rewards/rejected": -2.778367280960083,
"step": 760
},
{
"epoch": 0.2,
"grad_norm": 5.90625,
"learning_rate": 4.845300384669958e-06,
"logits/chosen": -1.23788583278656,
"logits/rejected": -1.1094398498535156,
"logps/chosen": -407.1124267578125,
"logps/rejected": -459.88226318359375,
"loss": 0.5488,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5390856266021729,
"rewards/margins": 0.7560388445854187,
"rewards/rejected": -2.2951245307922363,
"step": 770
},
{
"epoch": 0.2,
"grad_norm": 16.625,
"learning_rate": 4.837291258468701e-06,
"logits/chosen": -1.3532726764678955,
"logits/rejected": -1.2090624570846558,
"logps/chosen": -449.90447998046875,
"logps/rejected": -503.38067626953125,
"loss": 0.5803,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.645581603050232,
"rewards/margins": 0.783669114112854,
"rewards/rejected": -2.429250955581665,
"step": 780
},
{
"epoch": 0.21,
"grad_norm": 7.59375,
"learning_rate": 4.829086969119984e-06,
"logits/chosen": -1.2730779647827148,
"logits/rejected": -1.2738616466522217,
"logps/chosen": -398.4493103027344,
"logps/rejected": -460.91387939453125,
"loss": 0.5907,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.4772454500198364,
"rewards/margins": 0.6072799563407898,
"rewards/rejected": -2.0845253467559814,
"step": 790
},
{
"epoch": 0.21,
"grad_norm": 8.1875,
"learning_rate": 4.820688201679605e-06,
"logits/chosen": -1.559012770652771,
"logits/rejected": -1.2587218284606934,
"logps/chosen": -388.8677673339844,
"logps/rejected": -389.87957763671875,
"loss": 0.5184,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2623231410980225,
"rewards/margins": 0.6337946057319641,
"rewards/rejected": -1.8961181640625,
"step": 800
},
{
"epoch": 0.21,
"eval_logits/chosen": -1.246036410331726,
"eval_logits/rejected": -1.1140612363815308,
"eval_logps/chosen": -413.7338562011719,
"eval_logps/rejected": -463.30914306640625,
"eval_loss": 0.5373813509941101,
"eval_rewards/accuracies": 0.7160000205039978,
"eval_rewards/chosen": -1.4908183813095093,
"eval_rewards/margins": 0.6961935758590698,
"eval_rewards/rejected": -2.187012195587158,
"eval_runtime": 382.1333,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 800
},
{
"epoch": 0.21,
"grad_norm": 9.0625,
"learning_rate": 4.8120956574422315e-06,
"logits/chosen": -1.407278060913086,
"logits/rejected": -1.3845430612564087,
"logps/chosen": -428.33648681640625,
"logps/rejected": -478.8470764160156,
"loss": 0.6069,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.544116735458374,
"rewards/margins": 0.5756716132164001,
"rewards/rejected": -2.119788646697998,
"step": 810
},
{
"epoch": 0.21,
"grad_norm": 7.625,
"learning_rate": 4.803310053882831e-06,
"logits/chosen": -1.4305765628814697,
"logits/rejected": -1.4192079305648804,
"logps/chosen": -346.76165771484375,
"logps/rejected": -416.07073974609375,
"loss": 0.5573,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1743587255477905,
"rewards/margins": 0.5695887804031372,
"rewards/rejected": -1.7439473867416382,
"step": 820
},
{
"epoch": 0.22,
"grad_norm": 11.8125,
"learning_rate": 4.794332124596775e-06,
"logits/chosen": -1.4643322229385376,
"logits/rejected": -1.3541513681411743,
"logps/chosen": -378.71685791015625,
"logps/rejected": -430.7264709472656,
"loss": 0.5747,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1027584075927734,
"rewards/margins": 0.5569159984588623,
"rewards/rejected": -1.6596744060516357,
"step": 830
},
{
"epoch": 0.22,
"grad_norm": 7.28125,
"learning_rate": 4.785162619238575e-06,
"logits/chosen": -1.3610130548477173,
"logits/rejected": -1.2018978595733643,
"logps/chosen": -377.59130859375,
"logps/rejected": -424.17108154296875,
"loss": 0.516,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.2374261617660522,
"rewards/margins": 0.7390316128730774,
"rewards/rejected": -1.9764575958251953,
"step": 840
},
{
"epoch": 0.22,
"grad_norm": 7.25,
"learning_rate": 4.775802303459288e-06,
"logits/chosen": -1.230850100517273,
"logits/rejected": -1.153451919555664,
"logps/chosen": -397.7276611328125,
"logps/rejected": -469.70037841796875,
"loss": 0.5533,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.4996331930160522,
"rewards/margins": 0.7669634819030762,
"rewards/rejected": -2.266597032546997,
"step": 850
},
{
"epoch": 0.23,
"grad_norm": 10.8125,
"learning_rate": 4.766251958842589e-06,
"logits/chosen": -1.196821689605713,
"logits/rejected": -1.0929956436157227,
"logps/chosen": -442.42779541015625,
"logps/rejected": -496.02508544921875,
"loss": 0.5516,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.6992677450180054,
"rewards/margins": 0.640595018863678,
"rewards/rejected": -2.339862585067749,
"step": 860
},
{
"epoch": 0.23,
"grad_norm": 5.96875,
"learning_rate": 4.7565123828395066e-06,
"logits/chosen": -1.1287126541137695,
"logits/rejected": -1.0260584354400635,
"logps/chosen": -434.9798278808594,
"logps/rejected": -504.6143493652344,
"loss": 0.5191,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.790509819984436,
"rewards/margins": 0.7024968266487122,
"rewards/rejected": -2.493006467819214,
"step": 870
},
{
"epoch": 0.23,
"grad_norm": 9.4375,
"learning_rate": 4.746584388701831e-06,
"logits/chosen": -1.1179661750793457,
"logits/rejected": -1.0696125030517578,
"logps/chosen": -474.17364501953125,
"logps/rejected": -547.4193115234375,
"loss": 0.4941,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.1093432903289795,
"rewards/margins": 0.8745004534721375,
"rewards/rejected": -2.9838438034057617,
"step": 880
},
{
"epoch": 0.23,
"grad_norm": 11.0,
"learning_rate": 4.736468805414218e-06,
"logits/chosen": -1.0214884281158447,
"logits/rejected": -0.9855283498764038,
"logps/chosen": -477.1600646972656,
"logps/rejected": -576.8958740234375,
"loss": 0.5755,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.297248125076294,
"rewards/margins": 0.8587217330932617,
"rewards/rejected": -3.1559698581695557,
"step": 890
},
{
"epoch": 0.24,
"grad_norm": 14.9375,
"learning_rate": 4.7261664776249595e-06,
"logits/chosen": -0.8845041394233704,
"logits/rejected": -0.7875598073005676,
"logps/chosen": -482.1604919433594,
"logps/rejected": -565.8832397460938,
"loss": 0.5211,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.532474994659424,
"rewards/margins": 0.9273085594177246,
"rewards/rejected": -3.4597840309143066,
"step": 900
},
{
"epoch": 0.24,
"eval_logits/chosen": -0.9341001510620117,
"eval_logits/rejected": -0.8115790486335754,
"eval_logps/chosen": -518.949462890625,
"eval_logps/rejected": -584.0806274414062,
"eval_loss": 0.5331768989562988,
"eval_rewards/accuracies": 0.7179999947547913,
"eval_rewards/chosen": -2.5429742336273193,
"eval_rewards/margins": 0.8517529368400574,
"eval_rewards/rejected": -3.3947272300720215,
"eval_runtime": 382.1611,
"eval_samples_per_second": 5.233,
"eval_steps_per_second": 0.654,
"step": 900
},
{
"epoch": 0.24,
"grad_norm": 12.1875,
"learning_rate": 4.715678265575463e-06,
"logits/chosen": -1.1323182582855225,
"logits/rejected": -0.9318205118179321,
"logps/chosen": -521.3104248046875,
"logps/rejected": -533.2903442382812,
"loss": 0.5686,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.3703832626342773,
"rewards/margins": 0.6751216650009155,
"rewards/rejected": -3.0455050468444824,
"step": 910
},
{
"epoch": 0.24,
"grad_norm": 8.625,
"learning_rate": 4.705005045028415e-06,
"logits/chosen": -1.0868864059448242,
"logits/rejected": -0.9571698904037476,
"logps/chosen": -469.189208984375,
"logps/rejected": -530.5699462890625,
"loss": 0.5319,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.0605311393737793,
"rewards/margins": 0.7877290844917297,
"rewards/rejected": -2.8482604026794434,
"step": 920
},
{
"epoch": 0.24,
"grad_norm": 8.8125,
"learning_rate": 4.694147707194659e-06,
"logits/chosen": -1.1987128257751465,
"logits/rejected": -1.1085574626922607,
"logps/chosen": -479.1398010253906,
"logps/rejected": -532.23828125,
"loss": 0.5295,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0726170539855957,
"rewards/margins": 0.7290612459182739,
"rewards/rejected": -2.80167818069458,
"step": 930
},
{
"epoch": 0.25,
"grad_norm": 7.3125,
"learning_rate": 4.683107158658782e-06,
"logits/chosen": -1.1448571681976318,
"logits/rejected": -1.0365805625915527,
"logps/chosen": -478.0250549316406,
"logps/rejected": -530.4112548828125,
"loss": 0.5083,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8778432607650757,
"rewards/margins": 0.811355471611023,
"rewards/rejected": -2.6891987323760986,
"step": 940
},
{
"epoch": 0.25,
"grad_norm": 9.0625,
"learning_rate": 4.671884321303407e-06,
"logits/chosen": -1.2020542621612549,
"logits/rejected": -1.0928010940551758,
"logps/chosen": -440.04864501953125,
"logps/rejected": -496.198486328125,
"loss": 0.5249,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9372259378433228,
"rewards/margins": 0.7049869298934937,
"rewards/rejected": -2.6422126293182373,
"step": 950
},
{
"epoch": 0.25,
"grad_norm": 6.875,
"learning_rate": 4.660480132232224e-06,
"logits/chosen": -1.2815606594085693,
"logits/rejected": -1.1846911907196045,
"logps/chosen": -445.06915283203125,
"logps/rejected": -479.39093017578125,
"loss": 0.5773,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.7325608730316162,
"rewards/margins": 0.5843728184700012,
"rewards/rejected": -2.3169338703155518,
"step": 960
},
{
"epoch": 0.25,
"grad_norm": 9.6875,
"learning_rate": 4.6488955436917414e-06,
"logits/chosen": -1.3540565967559814,
"logits/rejected": -1.1343624591827393,
"logps/chosen": -444.31640625,
"logps/rejected": -482.2098083496094,
"loss": 0.5099,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.6433531045913696,
"rewards/margins": 0.8446812629699707,
"rewards/rejected": -2.48803448677063,
"step": 970
},
{
"epoch": 0.26,
"grad_norm": 5.75,
"learning_rate": 4.6371315229917644e-06,
"logits/chosen": -1.3197797536849976,
"logits/rejected": -1.1996195316314697,
"logps/chosen": -457.05712890625,
"logps/rejected": -514.72802734375,
"loss": 0.5217,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7364044189453125,
"rewards/margins": 0.780579149723053,
"rewards/rejected": -2.5169835090637207,
"step": 980
},
{
"epoch": 0.26,
"grad_norm": 13.6875,
"learning_rate": 4.625189052424638e-06,
"logits/chosen": -1.2102200984954834,
"logits/rejected": -1.0647470951080322,
"logps/chosen": -436.97991943359375,
"logps/rejected": -520.3751220703125,
"loss": 0.4535,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9787667989730835,
"rewards/margins": 1.061232328414917,
"rewards/rejected": -3.039999008178711,
"step": 990
},
{
"epoch": 0.26,
"grad_norm": 8.25,
"learning_rate": 4.613069129183218e-06,
"logits/chosen": -1.240464687347412,
"logits/rejected": -1.0879384279251099,
"logps/chosen": -531.1487426757812,
"logps/rejected": -574.3619384765625,
"loss": 0.5553,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.2774546146392822,
"rewards/margins": 0.7940423488616943,
"rewards/rejected": -3.0714969635009766,
"step": 1000
},
{
"epoch": 0.26,
"eval_logits/chosen": -0.981342613697052,
"eval_logits/rejected": -0.8557386994361877,
"eval_logps/chosen": -482.09930419921875,
"eval_logps/rejected": -548.8490600585938,
"eval_loss": 0.5178083777427673,
"eval_rewards/accuracies": 0.7315000295639038,
"eval_rewards/chosen": -2.1744725704193115,
"eval_rewards/margins": 0.8679391145706177,
"eval_rewards/rejected": -3.0424115657806396,
"eval_runtime": 382.1372,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 1000
},
{
"epoch": 0.26,
"grad_norm": 8.0,
"learning_rate": 4.600772765277607e-06,
"logits/chosen": -1.0305756330490112,
"logits/rejected": -0.9370132684707642,
"logps/chosen": -448.99493408203125,
"logps/rejected": -530.3275146484375,
"loss": 0.4913,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.154376983642578,
"rewards/margins": 0.8647212982177734,
"rewards/rejected": -3.0190985202789307,
"step": 1010
},
{
"epoch": 0.27,
"grad_norm": 16.75,
"learning_rate": 4.588300987450652e-06,
"logits/chosen": -1.0989015102386475,
"logits/rejected": -0.9851810336112976,
"logps/chosen": -443.59423828125,
"logps/rejected": -486.5970764160156,
"loss": 0.5542,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8985025882720947,
"rewards/margins": 0.7655047178268433,
"rewards/rejected": -2.6640071868896484,
"step": 1020
},
{
"epoch": 0.27,
"grad_norm": 5.6875,
"learning_rate": 4.5756548370922136e-06,
"logits/chosen": -1.0507217645645142,
"logits/rejected": -0.9594799280166626,
"logps/chosen": -405.2181091308594,
"logps/rejected": -487.1499938964844,
"loss": 0.4835,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.6451423168182373,
"rewards/margins": 0.9089698791503906,
"rewards/rejected": -2.554112434387207,
"step": 1030
},
{
"epoch": 0.27,
"grad_norm": 13.5625,
"learning_rate": 4.562835370152206e-06,
"logits/chosen": -1.0573441982269287,
"logits/rejected": -0.8775628209114075,
"logps/chosen": -527.5038452148438,
"logps/rejected": -620.2794189453125,
"loss": 0.4742,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.2627432346343994,
"rewards/margins": 1.2387964725494385,
"rewards/rejected": -3.501539707183838,
"step": 1040
},
{
"epoch": 0.27,
"grad_norm": 8.8125,
"learning_rate": 4.54984365705243e-06,
"logits/chosen": -0.9812475442886353,
"logits/rejected": -0.8811472654342651,
"logps/chosen": -502.1786193847656,
"logps/rejected": -618.7202758789062,
"loss": 0.4971,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.337085485458374,
"rewards/margins": 1.2312263250350952,
"rewards/rejected": -3.5683116912841797,
"step": 1050
},
{
"epoch": 0.28,
"grad_norm": 9.0,
"learning_rate": 4.536680782597191e-06,
"logits/chosen": -0.9585447311401367,
"logits/rejected": -0.8763798475265503,
"logps/chosen": -443.18878173828125,
"logps/rejected": -523.16015625,
"loss": 0.6028,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.0716359615325928,
"rewards/margins": 0.855958104133606,
"rewards/rejected": -2.9275941848754883,
"step": 1060
},
{
"epoch": 0.28,
"grad_norm": 15.4375,
"learning_rate": 4.523347845882718e-06,
"logits/chosen": -1.122159481048584,
"logits/rejected": -0.9293369054794312,
"logps/chosen": -494.13037109375,
"logps/rejected": -562.1329345703125,
"loss": 0.4613,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0596017837524414,
"rewards/margins": 1.1728570461273193,
"rewards/rejected": -3.2324588298797607,
"step": 1070
},
{
"epoch": 0.28,
"grad_norm": 8.125,
"learning_rate": 4.50984596020539e-06,
"logits/chosen": -0.8647342920303345,
"logits/rejected": -0.826617419719696,
"logps/chosen": -561.8629760742188,
"logps/rejected": -615.0023193359375,
"loss": 0.5557,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.8171119689941406,
"rewards/margins": 0.8539352416992188,
"rewards/rejected": -3.6710472106933594,
"step": 1080
},
{
"epoch": 0.29,
"grad_norm": 9.0,
"learning_rate": 4.4961762529687745e-06,
"logits/chosen": -1.0336081981658936,
"logits/rejected": -0.9252422451972961,
"logps/chosen": -563.8508911132812,
"logps/rejected": -638.390869140625,
"loss": 0.4855,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.0344927310943604,
"rewards/margins": 0.9103133082389832,
"rewards/rejected": -3.944805860519409,
"step": 1090
},
{
"epoch": 0.29,
"grad_norm": 6.9375,
"learning_rate": 4.482339865589492e-06,
"logits/chosen": -1.0671048164367676,
"logits/rejected": -0.9094209671020508,
"logps/chosen": -568.4443359375,
"logps/rejected": -596.6480712890625,
"loss": 0.5994,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.021576404571533,
"rewards/margins": 0.7217450141906738,
"rewards/rejected": -3.743321180343628,
"step": 1100
},
{
"epoch": 0.29,
"eval_logits/chosen": -0.8895747661590576,
"eval_logits/rejected": -0.7614892721176147,
"eval_logps/chosen": -514.6676635742188,
"eval_logps/rejected": -577.3698120117188,
"eval_loss": 0.520658552646637,
"eval_rewards/accuracies": 0.7300000190734863,
"eval_rewards/chosen": -2.5001566410064697,
"eval_rewards/margins": 0.8274616599082947,
"eval_rewards/rejected": -3.3276185989379883,
"eval_runtime": 382.1502,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 1100
},
{
"epoch": 0.29,
"grad_norm": 6.625,
"learning_rate": 4.468337953401909e-06,
"logits/chosen": -1.1065692901611328,
"logits/rejected": -1.0572447776794434,
"logps/chosen": -495.5409240722656,
"logps/rejected": -552.65966796875,
"loss": 0.5707,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.2518980503082275,
"rewards/margins": 0.61982262134552,
"rewards/rejected": -2.871720790863037,
"step": 1110
},
{
"epoch": 0.29,
"grad_norm": 8.875,
"learning_rate": 4.45417168556166e-06,
"logits/chosen": -1.0463123321533203,
"logits/rejected": -0.9469770193099976,
"logps/chosen": -435.6727600097656,
"logps/rejected": -518.3145751953125,
"loss": 0.5007,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9452159404754639,
"rewards/margins": 0.8327676057815552,
"rewards/rejected": -2.7779834270477295,
"step": 1120
},
{
"epoch": 0.3,
"grad_norm": 9.6875,
"learning_rate": 4.439842244948036e-06,
"logits/chosen": -1.0293817520141602,
"logits/rejected": -0.8690570592880249,
"logps/chosen": -486.1783142089844,
"logps/rejected": -559.431396484375,
"loss": 0.5565,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2511630058288574,
"rewards/margins": 0.7881690263748169,
"rewards/rejected": -3.0393319129943848,
"step": 1130
},
{
"epoch": 0.3,
"grad_norm": 14.5,
"learning_rate": 4.425350828065204e-06,
"logits/chosen": -1.0534614324569702,
"logits/rejected": -0.8575074076652527,
"logps/chosen": -497.90167236328125,
"logps/rejected": -537.9634399414062,
"loss": 0.4913,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.1381561756134033,
"rewards/margins": 0.8793197870254517,
"rewards/rejected": -3.0174758434295654,
"step": 1140
},
{
"epoch": 0.3,
"grad_norm": 9.5625,
"learning_rate": 4.410698644942303e-06,
"logits/chosen": -1.0756770372390747,
"logits/rejected": -0.9290148615837097,
"logps/chosen": -489.197265625,
"logps/rejected": -558.8743286132812,
"loss": 0.4893,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1688458919525146,
"rewards/margins": 0.9360774755477905,
"rewards/rejected": -3.1049234867095947,
"step": 1150
},
{
"epoch": 0.3,
"grad_norm": 12.25,
"learning_rate": 4.395886919032406e-06,
"logits/chosen": -0.9989307522773743,
"logits/rejected": -0.8515041470527649,
"logps/chosen": -480.94183349609375,
"logps/rejected": -542.0136108398438,
"loss": 0.5419,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.1710543632507324,
"rewards/margins": 0.8757139444351196,
"rewards/rejected": -3.0467686653137207,
"step": 1160
},
{
"epoch": 0.31,
"grad_norm": 8.625,
"learning_rate": 4.380916887110366e-06,
"logits/chosen": -1.1318533420562744,
"logits/rejected": -0.9459112286567688,
"logps/chosen": -481.12335205078125,
"logps/rejected": -544.0623779296875,
"loss": 0.5083,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.2195496559143066,
"rewards/margins": 1.032907247543335,
"rewards/rejected": -3.2524571418762207,
"step": 1170
},
{
"epoch": 0.31,
"grad_norm": 9.3125,
"learning_rate": 4.365789799169539e-06,
"logits/chosen": -0.9683933258056641,
"logits/rejected": -1.0098755359649658,
"logps/chosen": -474.65283203125,
"logps/rejected": -566.4153442382812,
"loss": 0.5468,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.280418872833252,
"rewards/margins": 0.8640033006668091,
"rewards/rejected": -3.1444220542907715,
"step": 1180
},
{
"epoch": 0.31,
"grad_norm": 11.9375,
"learning_rate": 4.350506918317416e-06,
"logits/chosen": -1.1871801614761353,
"logits/rejected": -1.0333930253982544,
"logps/chosen": -443.02716064453125,
"logps/rejected": -521.8514404296875,
"loss": 0.5037,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9543129205703735,
"rewards/margins": 0.8601529002189636,
"rewards/rejected": -2.8144659996032715,
"step": 1190
},
{
"epoch": 0.31,
"grad_norm": 9.0,
"learning_rate": 4.335069520670149e-06,
"logits/chosen": -0.9967072606086731,
"logits/rejected": -0.9244716763496399,
"logps/chosen": -455.01959228515625,
"logps/rejected": -528.6710205078125,
"loss": 0.5976,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.2530674934387207,
"rewards/margins": 0.6545962691307068,
"rewards/rejected": -2.907663583755493,
"step": 1200
},
{
"epoch": 0.31,
"eval_logits/chosen": -0.9595763087272644,
"eval_logits/rejected": -0.8350398540496826,
"eval_logps/chosen": -482.9834289550781,
"eval_logps/rejected": -543.660400390625,
"eval_loss": 0.5098230838775635,
"eval_rewards/accuracies": 0.7365000247955322,
"eval_rewards/chosen": -2.183314323425293,
"eval_rewards/margins": 0.8072100281715393,
"eval_rewards/rejected": -2.9905245304107666,
"eval_runtime": 382.4857,
"eval_samples_per_second": 5.229,
"eval_steps_per_second": 0.654,
"step": 1200
},
{
"epoch": 0.32,
"grad_norm": 6.40625,
"learning_rate": 4.319478895246e-06,
"logits/chosen": -1.070488691329956,
"logits/rejected": -0.886951744556427,
"logps/chosen": -466.0955505371094,
"logps/rejected": -520.3566284179688,
"loss": 0.4951,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.189800977706909,
"rewards/margins": 0.7895106077194214,
"rewards/rejected": -2.979311466217041,
"step": 1210
},
{
"epoch": 0.32,
"grad_norm": 11.0,
"learning_rate": 4.303736343857704e-06,
"logits/chosen": -1.0415198802947998,
"logits/rejected": -0.9387828707695007,
"logps/chosen": -499.1920471191406,
"logps/rejected": -617.3883666992188,
"loss": 0.4881,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.420851230621338,
"rewards/margins": 1.062877893447876,
"rewards/rejected": -3.483729124069214,
"step": 1220
},
{
"epoch": 0.32,
"grad_norm": 10.0,
"learning_rate": 4.287843181003772e-06,
"logits/chosen": -1.0625154972076416,
"logits/rejected": -0.9172189831733704,
"logps/chosen": -579.9913330078125,
"logps/rejected": -610.0975341796875,
"loss": 0.5905,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8613951206207275,
"rewards/margins": 0.7642954587936401,
"rewards/rejected": -3.6256909370422363,
"step": 1230
},
{
"epoch": 0.32,
"grad_norm": 7.59375,
"learning_rate": 4.27180073375873e-06,
"logits/chosen": -1.1162028312683105,
"logits/rejected": -0.9976137280464172,
"logps/chosen": -525.2400512695312,
"logps/rejected": -569.8626708984375,
"loss": 0.5269,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.3787271976470947,
"rewards/margins": 0.8617948293685913,
"rewards/rejected": -3.2405219078063965,
"step": 1240
},
{
"epoch": 0.33,
"grad_norm": 5.625,
"learning_rate": 4.255610341662304e-06,
"logits/chosen": -1.144928216934204,
"logits/rejected": -0.9519325494766235,
"logps/chosen": -472.40087890625,
"logps/rejected": -529.2858276367188,
"loss": 0.5525,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.171128511428833,
"rewards/margins": 0.767959475517273,
"rewards/rejected": -2.9390883445739746,
"step": 1250
},
{
"epoch": 0.33,
"grad_norm": 8.625,
"learning_rate": 4.2392733566075764e-06,
"logits/chosen": -1.11684250831604,
"logits/rejected": -0.9831358194351196,
"logps/chosen": -500.71484375,
"logps/rejected": -542.6422119140625,
"loss": 0.5654,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.447084903717041,
"rewards/margins": 0.5746163129806519,
"rewards/rejected": -3.0217010974884033,
"step": 1260
},
{
"epoch": 0.33,
"grad_norm": 7.65625,
"learning_rate": 4.2227911427280975e-06,
"logits/chosen": -1.0659453868865967,
"logits/rejected": -0.899361252784729,
"logps/chosen": -475.46148681640625,
"logps/rejected": -525.0037841796875,
"loss": 0.5081,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.251559257507324,
"rewards/margins": 0.823780357837677,
"rewards/rejected": -3.0753397941589355,
"step": 1270
},
{
"epoch": 0.33,
"grad_norm": 11.4375,
"learning_rate": 4.206165076283983e-06,
"logits/chosen": -1.096620798110962,
"logits/rejected": -0.9550498127937317,
"logps/chosen": -487.46136474609375,
"logps/rejected": -576.1992797851562,
"loss": 0.461,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.4152817726135254,
"rewards/margins": 1.0981849431991577,
"rewards/rejected": -3.5134663581848145,
"step": 1280
},
{
"epoch": 0.34,
"grad_norm": 10.6875,
"learning_rate": 4.189396545546995e-06,
"logits/chosen": -1.0538244247436523,
"logits/rejected": -0.9361982345581055,
"logps/chosen": -522.2523193359375,
"logps/rejected": -610.1349487304688,
"loss": 0.5054,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.730778217315674,
"rewards/margins": 1.0780103206634521,
"rewards/rejected": -3.808788776397705,
"step": 1290
},
{
"epoch": 0.34,
"grad_norm": 13.125,
"learning_rate": 4.172486950684627e-06,
"logits/chosen": -1.0185925960540771,
"logits/rejected": -0.9584161639213562,
"logps/chosen": -538.3131103515625,
"logps/rejected": -635.578369140625,
"loss": 0.5237,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.846707820892334,
"rewards/margins": 1.0040740966796875,
"rewards/rejected": -3.8507816791534424,
"step": 1300
},
{
"epoch": 0.34,
"eval_logits/chosen": -0.825871467590332,
"eval_logits/rejected": -0.7071986198425293,
"eval_logps/chosen": -574.3861694335938,
"eval_logps/rejected": -660.885009765625,
"eval_loss": 0.5165807008743286,
"eval_rewards/accuracies": 0.7350000143051147,
"eval_rewards/chosen": -3.097341775894165,
"eval_rewards/margins": 1.0654287338256836,
"eval_rewards/rejected": -4.162771224975586,
"eval_runtime": 382.0912,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 1300
},
{
"epoch": 0.34,
"grad_norm": 11.625,
"learning_rate": 4.155437703643182e-06,
"logits/chosen": -1.0443698167800903,
"logits/rejected": -0.8676601648330688,
"logps/chosen": -536.4607543945312,
"logps/rejected": -606.3543701171875,
"loss": 0.5075,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.8971712589263916,
"rewards/margins": 0.9897411465644836,
"rewards/rejected": -3.8869121074676514,
"step": 1310
},
{
"epoch": 0.35,
"grad_norm": 11.375,
"learning_rate": 4.138250228029882e-06,
"logits/chosen": -1.000579595565796,
"logits/rejected": -0.9191876649856567,
"logps/chosen": -538.9154052734375,
"logps/rejected": -649.7552490234375,
"loss": 0.4767,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.8198482990264893,
"rewards/margins": 1.0736055374145508,
"rewards/rejected": -3.893454074859619,
"step": 1320
},
{
"epoch": 0.35,
"grad_norm": 7.6875,
"learning_rate": 4.120925958993994e-06,
"logits/chosen": -0.9208280444145203,
"logits/rejected": -0.8555585741996765,
"logps/chosen": -512.56787109375,
"logps/rejected": -604.376220703125,
"loss": 0.5584,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.786665439605713,
"rewards/margins": 0.9612969160079956,
"rewards/rejected": -3.747962474822998,
"step": 1330
},
{
"epoch": 0.35,
"grad_norm": 14.0,
"learning_rate": 4.103466343106999e-06,
"logits/chosen": -1.1172326803207397,
"logits/rejected": -0.9976350665092468,
"logps/chosen": -514.8595581054688,
"logps/rejected": -575.3850708007812,
"loss": 0.5422,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.4547386169433594,
"rewards/margins": 0.8639480471611023,
"rewards/rejected": -3.3186867237091064,
"step": 1340
},
{
"epoch": 0.35,
"grad_norm": 10.125,
"learning_rate": 4.085872838241797e-06,
"logits/chosen": -1.0706989765167236,
"logits/rejected": -0.9391083717346191,
"logps/chosen": -489.779296875,
"logps/rejected": -538.4210815429688,
"loss": 0.5948,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.274151563644409,
"rewards/margins": 0.6873086094856262,
"rewards/rejected": -2.9614596366882324,
"step": 1350
},
{
"epoch": 0.36,
"grad_norm": 11.125,
"learning_rate": 4.06814691345098e-06,
"logits/chosen": -1.0508559942245483,
"logits/rejected": -0.9001902341842651,
"logps/chosen": -451.5694274902344,
"logps/rejected": -517.9208984375,
"loss": 0.4809,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.9602162837982178,
"rewards/margins": 0.884141743183136,
"rewards/rejected": -2.844357967376709,
"step": 1360
},
{
"epoch": 0.36,
"grad_norm": 14.125,
"learning_rate": 4.050290048844171e-06,
"logits/chosen": -1.129167914390564,
"logits/rejected": -1.0560190677642822,
"logps/chosen": -474.2417907714844,
"logps/rejected": -552.0899047851562,
"loss": 0.5423,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.056283473968506,
"rewards/margins": 0.8298514485359192,
"rewards/rejected": -2.886134624481201,
"step": 1370
},
{
"epoch": 0.36,
"grad_norm": 9.5,
"learning_rate": 4.032303735464422e-06,
"logits/chosen": -1.1856621503829956,
"logits/rejected": -0.9643325805664062,
"logps/chosen": -502.15814208984375,
"logps/rejected": -594.064208984375,
"loss": 0.452,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.301772356033325,
"rewards/margins": 1.1499149799346924,
"rewards/rejected": -3.4516875743865967,
"step": 1380
},
{
"epoch": 0.36,
"grad_norm": 11.6875,
"learning_rate": 4.014189475163727e-06,
"logits/chosen": -0.96733558177948,
"logits/rejected": -0.853344738483429,
"logps/chosen": -489.39990234375,
"logps/rejected": -597.2086181640625,
"loss": 0.4757,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.3474299907684326,
"rewards/margins": 1.1593117713928223,
"rewards/rejected": -3.506741762161255,
"step": 1390
},
{
"epoch": 0.37,
"grad_norm": 12.75,
"learning_rate": 3.995948780477605e-06,
"logits/chosen": -1.1000730991363525,
"logits/rejected": -0.9693312644958496,
"logps/chosen": -477.19549560546875,
"logps/rejected": -542.30615234375,
"loss": 0.516,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.0844216346740723,
"rewards/margins": 0.8978837132453918,
"rewards/rejected": -2.9823052883148193,
"step": 1400
},
{
"epoch": 0.37,
"eval_logits/chosen": -0.9127845168113708,
"eval_logits/rejected": -0.7864713668823242,
"eval_logps/chosen": -474.74249267578125,
"eval_logps/rejected": -551.2366943359375,
"eval_loss": 0.5107593536376953,
"eval_rewards/accuracies": 0.7350000143051147,
"eval_rewards/chosen": -2.100904941558838,
"eval_rewards/margins": 0.9653825163841248,
"eval_rewards/rejected": -3.0662872791290283,
"eval_runtime": 381.6083,
"eval_samples_per_second": 5.241,
"eval_steps_per_second": 0.655,
"step": 1400
},
{
"epoch": 0.37,
"grad_norm": 10.25,
"learning_rate": 3.977583174498816e-06,
"logits/chosen": -1.017508864402771,
"logits/rejected": -0.8959487676620483,
"logps/chosen": -488.11810302734375,
"logps/rejected": -602.2122802734375,
"loss": 0.3715,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.244345188140869,
"rewards/margins": 1.360781192779541,
"rewards/rejected": -3.6051268577575684,
"step": 1410
},
{
"epoch": 0.37,
"grad_norm": 12.125,
"learning_rate": 3.959094190750172e-06,
"logits/chosen": -1.0074245929718018,
"logits/rejected": -0.868901252746582,
"logps/chosen": -552.512939453125,
"logps/rejected": -637.4674072265625,
"loss": 0.4966,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.6735260486602783,
"rewards/margins": 1.1185749769210815,
"rewards/rejected": -3.7921009063720703,
"step": 1420
},
{
"epoch": 0.37,
"grad_norm": 11.6875,
"learning_rate": 3.9404833730564975e-06,
"logits/chosen": -0.8478316068649292,
"logits/rejected": -0.7511281967163086,
"logps/chosen": -535.4224853515625,
"logps/rejected": -637.5137329101562,
"loss": 0.494,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.823219060897827,
"rewards/margins": 1.1367390155792236,
"rewards/rejected": -3.9599578380584717,
"step": 1430
},
{
"epoch": 0.38,
"grad_norm": 17.125,
"learning_rate": 3.921752275415712e-06,
"logits/chosen": -0.9650063514709473,
"logits/rejected": -0.8631266355514526,
"logps/chosen": -534.4532470703125,
"logps/rejected": -645.3438720703125,
"loss": 0.4351,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.8391730785369873,
"rewards/margins": 1.3146858215332031,
"rewards/rejected": -4.1538591384887695,
"step": 1440
},
{
"epoch": 0.38,
"grad_norm": 6.53125,
"learning_rate": 3.902902461869079e-06,
"logits/chosen": -0.9252153635025024,
"logits/rejected": -0.7948675751686096,
"logps/chosen": -540.6839599609375,
"logps/rejected": -642.1290283203125,
"loss": 0.5532,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.0117030143737793,
"rewards/margins": 1.17899751663208,
"rewards/rejected": -4.190700531005859,
"step": 1450
},
{
"epoch": 0.38,
"grad_norm": 13.875,
"learning_rate": 3.883935506370605e-06,
"logits/chosen": -0.9731215238571167,
"logits/rejected": -0.8713979721069336,
"logps/chosen": -526.899658203125,
"logps/rejected": -591.6453857421875,
"loss": 0.5396,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.736586570739746,
"rewards/margins": 0.9257469177246094,
"rewards/rejected": -3.6623332500457764,
"step": 1460
},
{
"epoch": 0.38,
"grad_norm": 5.0625,
"learning_rate": 3.864852992655617e-06,
"logits/chosen": -1.115800380706787,
"logits/rejected": -1.0172771215438843,
"logps/chosen": -478.37420654296875,
"logps/rejected": -573.0581665039062,
"loss": 0.4365,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.2973954677581787,
"rewards/margins": 1.069636344909668,
"rewards/rejected": -3.3670318126678467,
"step": 1470
},
{
"epoch": 0.39,
"grad_norm": 7.0625,
"learning_rate": 3.845656514108516e-06,
"logits/chosen": -1.0454566478729248,
"logits/rejected": -0.8997499346733093,
"logps/chosen": -511.357177734375,
"logps/rejected": -557.3446655273438,
"loss": 0.4913,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.522265672683716,
"rewards/margins": 1.0096194744110107,
"rewards/rejected": -3.5318856239318848,
"step": 1480
},
{
"epoch": 0.39,
"grad_norm": 8.125,
"learning_rate": 3.826347673629738e-06,
"logits/chosen": -1.0593020915985107,
"logits/rejected": -0.8929145932197571,
"logps/chosen": -473.79302978515625,
"logps/rejected": -565.4286499023438,
"loss": 0.4657,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.2048957347869873,
"rewards/margins": 1.1790317296981812,
"rewards/rejected": -3.3839271068573,
"step": 1490
},
{
"epoch": 0.39,
"grad_norm": 12.0625,
"learning_rate": 3.8069280835019062e-06,
"logits/chosen": -1.116262674331665,
"logits/rejected": -0.9613265991210938,
"logps/chosen": -477.24810791015625,
"logps/rejected": -587.962646484375,
"loss": 0.4593,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1293580532073975,
"rewards/margins": 1.2989779710769653,
"rewards/rejected": -3.4283363819122314,
"step": 1500
},
{
"epoch": 0.39,
"eval_logits/chosen": -1.0210601091384888,
"eval_logits/rejected": -0.8902665972709656,
"eval_logps/chosen": -496.3184509277344,
"eval_logps/rejected": -587.1505737304688,
"eval_loss": 0.5173963308334351,
"eval_rewards/accuracies": 0.7304999828338623,
"eval_rewards/chosen": -2.316664218902588,
"eval_rewards/margins": 1.1087615489959717,
"eval_rewards/rejected": -3.4254260063171387,
"eval_runtime": 382.2649,
"eval_samples_per_second": 5.232,
"eval_steps_per_second": 0.654,
"step": 1500
},
{
"epoch": 0.4,
"grad_norm": 13.5,
"learning_rate": 3.7873993652552077e-06,
"logits/chosen": -1.0803442001342773,
"logits/rejected": -0.9917434453964233,
"logps/chosen": -461.2118225097656,
"logps/rejected": -549.1537475585938,
"loss": 0.593,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.274977922439575,
"rewards/margins": 0.9378048777580261,
"rewards/rejected": -3.212782621383667,
"step": 1510
},
{
"epoch": 0.4,
"grad_norm": 8.3125,
"learning_rate": 3.7677631495319953e-06,
"logits/chosen": -1.2474887371063232,
"logits/rejected": -1.145392656326294,
"logps/chosen": -428.1084899902344,
"logps/rejected": -485.67694091796875,
"loss": 0.5245,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6442362070083618,
"rewards/margins": 0.7559275031089783,
"rewards/rejected": -2.4001636505126953,
"step": 1520
},
{
"epoch": 0.4,
"grad_norm": 6.75,
"learning_rate": 3.748021075950633e-06,
"logits/chosen": -1.3161629438400269,
"logits/rejected": -1.232714295387268,
"logps/chosen": -440.6031188964844,
"logps/rejected": -481.67926025390625,
"loss": 0.5983,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.6595981121063232,
"rewards/margins": 0.5171489119529724,
"rewards/rejected": -2.1767468452453613,
"step": 1530
},
{
"epoch": 0.4,
"grad_norm": 10.625,
"learning_rate": 3.7281747929685824e-06,
"logits/chosen": -1.132124662399292,
"logits/rejected": -1.0095793008804321,
"logps/chosen": -423.98553466796875,
"logps/rejected": -478.41015625,
"loss": 0.5368,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8633050918579102,
"rewards/margins": 0.7011392712593079,
"rewards/rejected": -2.5644445419311523,
"step": 1540
},
{
"epoch": 0.41,
"grad_norm": 8.625,
"learning_rate": 3.7082259577447604e-06,
"logits/chosen": -1.2295887470245361,
"logits/rejected": -1.1187238693237305,
"logps/chosen": -489.0294494628906,
"logps/rejected": -551.4732666015625,
"loss": 0.4858,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.1783862113952637,
"rewards/margins": 0.8242964744567871,
"rewards/rejected": -3.002682685852051,
"step": 1550
},
{
"epoch": 0.41,
"grad_norm": 10.0,
"learning_rate": 3.6881762360011688e-06,
"logits/chosen": -1.241201639175415,
"logits/rejected": -1.0382106304168701,
"logps/chosen": -548.8870849609375,
"logps/rejected": -611.2633666992188,
"loss": 0.4939,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.6739068031311035,
"rewards/margins": 0.9938074350357056,
"rewards/rejected": -3.6677143573760986,
"step": 1560
},
{
"epoch": 0.41,
"grad_norm": 11.8125,
"learning_rate": 3.668027301883802e-06,
"logits/chosen": -1.154157280921936,
"logits/rejected": -1.0291301012039185,
"logps/chosen": -542.0028076171875,
"logps/rejected": -634.2523803710938,
"loss": 0.5002,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.9064033031463623,
"rewards/margins": 1.071606993675232,
"rewards/rejected": -3.9780097007751465,
"step": 1570
},
{
"epoch": 0.41,
"grad_norm": 5.46875,
"learning_rate": 3.64778083782286e-06,
"logits/chosen": -1.0966026782989502,
"logits/rejected": -1.084398627281189,
"logps/chosen": -548.9720458984375,
"logps/rejected": -668.5007934570312,
"loss": 0.5301,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.963653087615967,
"rewards/margins": 0.9051497578620911,
"rewards/rejected": -3.868802547454834,
"step": 1580
},
{
"epoch": 0.42,
"grad_norm": 9.6875,
"learning_rate": 3.627438534392268e-06,
"logits/chosen": -1.2072285413742065,
"logits/rejected": -1.1841914653778076,
"logps/chosen": -524.2724609375,
"logps/rejected": -635.7026977539062,
"loss": 0.483,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.862274169921875,
"rewards/margins": 1.047090768814087,
"rewards/rejected": -3.909365177154541,
"step": 1590
},
{
"epoch": 0.42,
"grad_norm": 7.21875,
"learning_rate": 3.607002090168506e-06,
"logits/chosen": -1.0932730436325073,
"logits/rejected": -1.0192008018493652,
"logps/chosen": -579.1436157226562,
"logps/rejected": -652.6798095703125,
"loss": 0.5545,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.1483500003814697,
"rewards/margins": 0.9495010375976562,
"rewards/rejected": -4.097850799560547,
"step": 1600
},
{
"epoch": 0.42,
"eval_logits/chosen": -1.0082374811172485,
"eval_logits/rejected": -0.8800999522209167,
"eval_logps/chosen": -564.0355224609375,
"eval_logps/rejected": -652.812255859375,
"eval_loss": 0.5032184720039368,
"eval_rewards/accuracies": 0.7369999885559082,
"eval_rewards/chosen": -2.99383544921875,
"eval_rewards/margins": 1.088207483291626,
"eval_rewards/rejected": -4.082043170928955,
"eval_runtime": 381.8998,
"eval_samples_per_second": 5.237,
"eval_steps_per_second": 0.655,
"step": 1600
},
{
"epoch": 0.42,
"grad_norm": 6.71875,
"learning_rate": 3.586473211588787e-06,
"logits/chosen": -1.1385810375213623,
"logits/rejected": -1.0679770708084106,
"logps/chosen": -523.4324340820312,
"logps/rejected": -647.1407470703125,
"loss": 0.4495,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.787372350692749,
"rewards/margins": 1.170562744140625,
"rewards/rejected": -3.957934856414795,
"step": 1610
},
{
"epoch": 0.42,
"grad_norm": 13.0,
"learning_rate": 3.5658536128085623e-06,
"logits/chosen": -1.1914455890655518,
"logits/rejected": -1.0186755657196045,
"logps/chosen": -572.4912719726562,
"logps/rejected": -637.8251953125,
"loss": 0.5878,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -3.0957980155944824,
"rewards/margins": 0.9488485455513,
"rewards/rejected": -4.044646263122559,
"step": 1620
},
{
"epoch": 0.43,
"grad_norm": 10.4375,
"learning_rate": 3.545145015558399e-06,
"logits/chosen": -0.9681538343429565,
"logits/rejected": -0.9621971249580383,
"logps/chosen": -520.1128540039062,
"logps/rejected": -614.5860595703125,
"loss": 0.5109,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.8847546577453613,
"rewards/margins": 1.0869688987731934,
"rewards/rejected": -3.971724271774292,
"step": 1630
},
{
"epoch": 0.43,
"grad_norm": 5.46875,
"learning_rate": 3.5243491490002056e-06,
"logits/chosen": -1.09974205493927,
"logits/rejected": -1.019108533859253,
"logps/chosen": -545.1671142578125,
"logps/rejected": -630.2543334960938,
"loss": 0.5719,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.9147398471832275,
"rewards/margins": 0.9028825759887695,
"rewards/rejected": -3.817622423171997,
"step": 1640
},
{
"epoch": 0.43,
"grad_norm": 8.3125,
"learning_rate": 3.503467749582857e-06,
"logits/chosen": -1.1649540662765503,
"logits/rejected": -0.9812711477279663,
"logps/chosen": -496.32757568359375,
"logps/rejected": -530.1451416015625,
"loss": 0.5901,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.4510443210601807,
"rewards/margins": 0.6782389879226685,
"rewards/rejected": -3.1292831897735596,
"step": 1650
},
{
"epoch": 0.43,
"grad_norm": 11.0,
"learning_rate": 3.4825025608971947e-06,
"logits/chosen": -1.0830554962158203,
"logits/rejected": -1.0159814357757568,
"logps/chosen": -442.962646484375,
"logps/rejected": -521.5462646484375,
"loss": 0.5191,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.2101898193359375,
"rewards/margins": 0.7478699684143066,
"rewards/rejected": -2.958059549331665,
"step": 1660
},
{
"epoch": 0.44,
"grad_norm": 7.40625,
"learning_rate": 3.4614553335304407e-06,
"logits/chosen": -1.1321473121643066,
"logits/rejected": -0.9186077117919922,
"logps/chosen": -502.3970642089844,
"logps/rejected": -575.6217041015625,
"loss": 0.4608,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.3348631858825684,
"rewards/margins": 1.0501439571380615,
"rewards/rejected": -3.385007381439209,
"step": 1670
},
{
"epoch": 0.44,
"grad_norm": 9.625,
"learning_rate": 3.4403278249200222e-06,
"logits/chosen": -1.1406095027923584,
"logits/rejected": -0.9287969470024109,
"logps/chosen": -519.1994018554688,
"logps/rejected": -603.8717041015625,
"loss": 0.4608,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.365922689437866,
"rewards/margins": 1.2659895420074463,
"rewards/rejected": -3.6319122314453125,
"step": 1680
},
{
"epoch": 0.44,
"grad_norm": 16.5,
"learning_rate": 3.4191217992068293e-06,
"logits/chosen": -1.1879878044128418,
"logits/rejected": -0.9813734292984009,
"logps/chosen": -539.6956176757812,
"logps/rejected": -599.0775146484375,
"loss": 0.5446,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.6155307292938232,
"rewards/margins": 1.0494682788848877,
"rewards/rejected": -3.664999008178711,
"step": 1690
},
{
"epoch": 0.44,
"grad_norm": 12.1875,
"learning_rate": 3.3978390270879056e-06,
"logits/chosen": -1.0190632343292236,
"logits/rejected": -0.9378607869148254,
"logps/chosen": -550.7818603515625,
"logps/rejected": -662.2818603515625,
"loss": 0.5425,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.4076619148254395,
"rewards/margins": 1.0471140146255493,
"rewards/rejected": -4.454775810241699,
"step": 1700
},
{
"epoch": 0.44,
"eval_logits/chosen": -0.9685720205307007,
"eval_logits/rejected": -0.8382174968719482,
"eval_logps/chosen": -599.6095581054688,
"eval_logps/rejected": -685.2186889648438,
"eval_loss": 0.49963250756263733,
"eval_rewards/accuracies": 0.7404999732971191,
"eval_rewards/chosen": -3.349576234817505,
"eval_rewards/margins": 1.0565321445465088,
"eval_rewards/rejected": -4.406107425689697,
"eval_runtime": 382.4342,
"eval_samples_per_second": 5.23,
"eval_steps_per_second": 0.654,
"step": 1700
},
{
"epoch": 0.45,
"grad_norm": 11.75,
"learning_rate": 3.3764812856685995e-06,
"logits/chosen": -1.0968348979949951,
"logits/rejected": -1.0862301588058472,
"logps/chosen": -530.6864013671875,
"logps/rejected": -640.4039916992188,
"loss": 0.518,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.0613017082214355,
"rewards/margins": 0.9621230959892273,
"rewards/rejected": -4.0234246253967285,
"step": 1710
},
{
"epoch": 0.45,
"grad_norm": 8.0,
"learning_rate": 3.3550503583141726e-06,
"logits/chosen": -1.2413816452026367,
"logits/rejected": -1.089429259300232,
"logps/chosen": -535.4332275390625,
"logps/rejected": -622.2586059570312,
"loss": 0.4864,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.66583251953125,
"rewards/margins": 1.01954185962677,
"rewards/rejected": -3.6853744983673096,
"step": 1720
},
{
"epoch": 0.45,
"grad_norm": 8.4375,
"learning_rate": 3.3335480345008907e-06,
"logits/chosen": -1.112958312034607,
"logits/rejected": -1.0259140729904175,
"logps/chosen": -486.234375,
"logps/rejected": -564.1868896484375,
"loss": 0.4673,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.260854721069336,
"rewards/margins": 1.0263946056365967,
"rewards/rejected": -3.2872490882873535,
"step": 1730
},
{
"epoch": 0.46,
"grad_norm": 10.4375,
"learning_rate": 3.3119761096666055e-06,
"logits/chosen": -1.1713676452636719,
"logits/rejected": -1.0070645809173584,
"logps/chosen": -514.056396484375,
"logps/rejected": -565.324951171875,
"loss": 0.5375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.390371084213257,
"rewards/margins": 0.8160451054573059,
"rewards/rejected": -3.206415891647339,
"step": 1740
},
{
"epoch": 0.46,
"grad_norm": 7.3125,
"learning_rate": 3.290336385060832e-06,
"logits/chosen": -1.3080298900604248,
"logits/rejected": -1.114485502243042,
"logps/chosen": -513.6076049804688,
"logps/rejected": -580.9697265625,
"loss": 0.5403,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.6475276947021484,
"rewards/margins": 0.8753725290298462,
"rewards/rejected": -3.522900104522705,
"step": 1750
},
{
"epoch": 0.46,
"grad_norm": 10.75,
"learning_rate": 3.268630667594348e-06,
"logits/chosen": -1.1190599203109741,
"logits/rejected": -1.0877625942230225,
"logps/chosen": -520.4367065429688,
"logps/rejected": -593.3540649414062,
"loss": 0.51,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.6478748321533203,
"rewards/margins": 0.9716035723686218,
"rewards/rejected": -3.619478225708008,
"step": 1760
},
{
"epoch": 0.46,
"grad_norm": 10.1875,
"learning_rate": 3.2468607696883147e-06,
"logits/chosen": -1.1805906295776367,
"logits/rejected": -1.1239099502563477,
"logps/chosen": -522.7432861328125,
"logps/rejected": -629.3782958984375,
"loss": 0.4844,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.695678949356079,
"rewards/margins": 1.022963285446167,
"rewards/rejected": -3.718641996383667,
"step": 1770
},
{
"epoch": 0.47,
"grad_norm": 7.0625,
"learning_rate": 3.225028509122944e-06,
"logits/chosen": -1.2425084114074707,
"logits/rejected": -1.1278479099273682,
"logps/chosen": -481.4998474121094,
"logps/rejected": -560.8279418945312,
"loss": 0.5179,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.449826717376709,
"rewards/margins": 0.9064075350761414,
"rewards/rejected": -3.356234073638916,
"step": 1780
},
{
"epoch": 0.47,
"grad_norm": 13.9375,
"learning_rate": 3.2031357088857083e-06,
"logits/chosen": -1.2350413799285889,
"logits/rejected": -1.1462427377700806,
"logps/chosen": -549.2757568359375,
"logps/rejected": -646.181640625,
"loss": 0.5022,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.7407171726226807,
"rewards/margins": 1.003739595413208,
"rewards/rejected": -3.7444565296173096,
"step": 1790
},
{
"epoch": 0.47,
"grad_norm": 14.625,
"learning_rate": 3.181184197019127e-06,
"logits/chosen": -0.9863433837890625,
"logits/rejected": -0.8817607164382935,
"logps/chosen": -533.1535034179688,
"logps/rejected": -698.6467895507812,
"loss": 0.4825,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.029388189315796,
"rewards/margins": 1.3928486108779907,
"rewards/rejected": -4.422236442565918,
"step": 1800
},
{
"epoch": 0.47,
"eval_logits/chosen": -1.0005792379379272,
"eval_logits/rejected": -0.8737620115280151,
"eval_logps/chosen": -569.109130859375,
"eval_logps/rejected": -657.4884033203125,
"eval_loss": 0.503667414188385,
"eval_rewards/accuracies": 0.7379999756813049,
"eval_rewards/chosen": -3.0445713996887207,
"eval_rewards/margins": 1.0842331647872925,
"eval_rewards/rejected": -4.1288042068481445,
"eval_runtime": 382.2565,
"eval_samples_per_second": 5.232,
"eval_steps_per_second": 0.654,
"step": 1800
},
{
"epoch": 0.47,
"grad_norm": 14.3125,
"learning_rate": 3.159175806468126e-06,
"logits/chosen": -1.0082833766937256,
"logits/rejected": -0.8253539800643921,
"logps/chosen": -556.5079956054688,
"logps/rejected": -636.0127563476562,
"loss": 0.5015,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.07832407951355,
"rewards/margins": 1.0969042778015137,
"rewards/rejected": -4.175228595733643,
"step": 1810
},
{
"epoch": 0.48,
"grad_norm": 11.1875,
"learning_rate": 3.1371123749269804e-06,
"logits/chosen": -1.1307703256607056,
"logits/rejected": -1.0529394149780273,
"logps/chosen": -595.5393676757812,
"logps/rejected": -662.37158203125,
"loss": 0.5659,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.1214325428009033,
"rewards/margins": 0.8287679553031921,
"rewards/rejected": -3.950200319290161,
"step": 1820
},
{
"epoch": 0.48,
"grad_norm": 8.5625,
"learning_rate": 3.114995744685877e-06,
"logits/chosen": -1.07692551612854,
"logits/rejected": -1.0323340892791748,
"logps/chosen": -533.2166748046875,
"logps/rejected": -612.94140625,
"loss": 0.5153,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8589041233062744,
"rewards/margins": 0.9276583790779114,
"rewards/rejected": -3.786562442779541,
"step": 1830
},
{
"epoch": 0.48,
"grad_norm": 6.40625,
"learning_rate": 3.0928277624770743e-06,
"logits/chosen": -1.2703588008880615,
"logits/rejected": -1.0852762460708618,
"logps/chosen": -551.0806274414062,
"logps/rejected": -643.0982666015625,
"loss": 0.4817,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.663365125656128,
"rewards/margins": 1.2043039798736572,
"rewards/rejected": -3.8676695823669434,
"step": 1840
},
{
"epoch": 0.48,
"grad_norm": 6.8125,
"learning_rate": 3.070610279320708e-06,
"logits/chosen": -1.248780608177185,
"logits/rejected": -1.084285020828247,
"logps/chosen": -551.0938110351562,
"logps/rejected": -643.5797729492188,
"loss": 0.4411,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.640122652053833,
"rewards/margins": 1.165264368057251,
"rewards/rejected": -3.805387020111084,
"step": 1850
},
{
"epoch": 0.49,
"grad_norm": 6.09375,
"learning_rate": 3.0483451503702264e-06,
"logits/chosen": -1.1745688915252686,
"logits/rejected": -1.0959160327911377,
"logps/chosen": -581.6795654296875,
"logps/rejected": -661.7645263671875,
"loss": 0.5518,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.9558444023132324,
"rewards/margins": 1.0012142658233643,
"rewards/rejected": -3.9570584297180176,
"step": 1860
},
{
"epoch": 0.49,
"grad_norm": 11.875,
"learning_rate": 3.0260342347574916e-06,
"logits/chosen": -1.1434388160705566,
"logits/rejected": -0.9975016713142395,
"logps/chosen": -543.2282104492188,
"logps/rejected": -666.7279052734375,
"loss": 0.4206,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.7089195251464844,
"rewards/margins": 1.425309419631958,
"rewards/rejected": -4.134228706359863,
"step": 1870
},
{
"epoch": 0.49,
"grad_norm": 11.0,
"learning_rate": 3.0036793954375358e-06,
"logits/chosen": -1.0967297554016113,
"logits/rejected": -0.9473203420639038,
"logps/chosen": -603.4558715820312,
"logps/rejected": -692.9251708984375,
"loss": 0.4466,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.3335928916931152,
"rewards/margins": 1.3170349597930908,
"rewards/rejected": -4.650628089904785,
"step": 1880
},
{
"epoch": 0.49,
"grad_norm": 13.0,
"learning_rate": 2.981282499033009e-06,
"logits/chosen": -1.0985617637634277,
"logits/rejected": -0.9863265156745911,
"logps/chosen": -607.0682373046875,
"logps/rejected": -701.697509765625,
"loss": 0.5071,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.307284116744995,
"rewards/margins": 1.200660228729248,
"rewards/rejected": -4.507944583892822,
"step": 1890
},
{
"epoch": 0.5,
"grad_norm": 10.0625,
"learning_rate": 2.9588454156783163e-06,
"logits/chosen": -1.1454726457595825,
"logits/rejected": -0.9831218719482422,
"logps/chosen": -579.2799682617188,
"logps/rejected": -706.1749877929688,
"loss": 0.4455,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.979241371154785,
"rewards/margins": 1.4865919351577759,
"rewards/rejected": -4.4658331871032715,
"step": 1900
},
{
"epoch": 0.5,
"eval_logits/chosen": -1.0213502645492554,
"eval_logits/rejected": -0.891007125377655,
"eval_logps/chosen": -566.8839721679688,
"eval_logps/rejected": -659.4305419921875,
"eval_loss": 0.49620321393013,
"eval_rewards/accuracies": 0.7419999837875366,
"eval_rewards/chosen": -3.0223195552825928,
"eval_rewards/margins": 1.1259068250656128,
"eval_rewards/rejected": -4.148226737976074,
"eval_runtime": 382.1041,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 1900
},
{
"epoch": 0.5,
"grad_norm": 10.8125,
"learning_rate": 2.9363700188634597e-06,
"logits/chosen": -1.1352207660675049,
"logits/rejected": -1.0086506605148315,
"logps/chosen": -588.1229858398438,
"logps/rejected": -648.9054565429688,
"loss": 0.5063,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.230529308319092,
"rewards/margins": 0.9782280921936035,
"rewards/rejected": -4.208757400512695,
"step": 1910
},
{
"epoch": 0.5,
"grad_norm": 13.375,
"learning_rate": 2.9138581852776053e-06,
"logits/chosen": -1.1499899625778198,
"logits/rejected": -1.0288715362548828,
"logps/chosen": -581.2144775390625,
"logps/rejected": -680.3140869140625,
"loss": 0.496,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.2150332927703857,
"rewards/margins": 1.1205800771713257,
"rewards/rejected": -4.335613250732422,
"step": 1920
},
{
"epoch": 0.51,
"grad_norm": 7.3125,
"learning_rate": 2.8913117946523805e-06,
"logits/chosen": -1.1651884317398071,
"logits/rejected": -0.9733787775039673,
"logps/chosen": -579.3433227539062,
"logps/rejected": -649.0181884765625,
"loss": 0.4634,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.129295825958252,
"rewards/margins": 1.077726125717163,
"rewards/rejected": -4.207022190093994,
"step": 1930
},
{
"epoch": 0.51,
"grad_norm": 11.375,
"learning_rate": 2.8687327296049126e-06,
"logits/chosen": -1.163464069366455,
"logits/rejected": -1.0617696046829224,
"logps/chosen": -556.2322998046875,
"logps/rejected": -651.5863037109375,
"loss": 0.5142,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.971095561981201,
"rewards/margins": 1.0506844520568848,
"rewards/rejected": -4.021780014038086,
"step": 1940
},
{
"epoch": 0.51,
"grad_norm": 12.8125,
"learning_rate": 2.8461228754806376e-06,
"logits/chosen": -1.185319185256958,
"logits/rejected": -1.0036907196044922,
"logps/chosen": -566.9384155273438,
"logps/rejected": -628.1956787109375,
"loss": 0.5404,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.911479949951172,
"rewards/margins": 0.8705935478210449,
"rewards/rejected": -3.782073497772217,
"step": 1950
},
{
"epoch": 0.51,
"grad_norm": 7.09375,
"learning_rate": 2.823484120195865e-06,
"logits/chosen": -1.3058470487594604,
"logits/rejected": -1.113465666770935,
"logps/chosen": -529.6067504882812,
"logps/rejected": -606.2987060546875,
"loss": 0.4364,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.5179548263549805,
"rewards/margins": 1.1106722354888916,
"rewards/rejected": -3.628627061843872,
"step": 1960
},
{
"epoch": 0.52,
"grad_norm": 8.75,
"learning_rate": 2.8008183540801486e-06,
"logits/chosen": -1.12172269821167,
"logits/rejected": -0.968579888343811,
"logps/chosen": -553.111083984375,
"logps/rejected": -600.1488037109375,
"loss": 0.5074,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.7947652339935303,
"rewards/margins": 0.9243541955947876,
"rewards/rejected": -3.7191195487976074,
"step": 1970
},
{
"epoch": 0.52,
"grad_norm": 10.75,
"learning_rate": 2.7781274697184353e-06,
"logits/chosen": -0.9661678075790405,
"logits/rejected": -0.9819488525390625,
"logps/chosen": -551.6143798828125,
"logps/rejected": -679.9763793945312,
"loss": 0.5141,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.2225677967071533,
"rewards/margins": 1.0803557634353638,
"rewards/rejected": -4.30292272567749,
"step": 1980
},
{
"epoch": 0.52,
"grad_norm": 7.625,
"learning_rate": 2.7554133617930397e-06,
"logits/chosen": -1.0553234815597534,
"logits/rejected": -0.9197478294372559,
"logps/chosen": -592.0967407226562,
"logps/rejected": -687.3663940429688,
"loss": 0.4817,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.442605495452881,
"rewards/margins": 1.1034131050109863,
"rewards/rejected": -4.546019077301025,
"step": 1990
},
{
"epoch": 0.52,
"grad_norm": 11.375,
"learning_rate": 2.7326779269254363e-06,
"logits/chosen": -1.1949965953826904,
"logits/rejected": -1.0267183780670166,
"logps/chosen": -653.2984619140625,
"logps/rejected": -709.1905517578125,
"loss": 0.4817,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.6396350860595703,
"rewards/margins": 1.1184080839157104,
"rewards/rejected": -4.75804328918457,
"step": 2000
},
{
"epoch": 0.52,
"eval_logits/chosen": -0.9427788257598877,
"eval_logits/rejected": -0.8139032125473022,
"eval_logps/chosen": -624.5250244140625,
"eval_logps/rejected": -711.0853271484375,
"eval_loss": 0.49741417169570923,
"eval_rewards/accuracies": 0.746999979019165,
"eval_rewards/chosen": -3.5987296104431152,
"eval_rewards/margins": 1.0660440921783447,
"eval_rewards/rejected": -4.664773941040039,
"eval_runtime": 382.3502,
"eval_samples_per_second": 5.231,
"eval_steps_per_second": 0.654,
"step": 2000
},
{
"epoch": 0.53,
"grad_norm": 10.5625,
"learning_rate": 2.7099230635178954e-06,
"logits/chosen": -1.0279147624969482,
"logits/rejected": -0.9855324625968933,
"logps/chosen": -615.8596801757812,
"logps/rejected": -704.7830200195312,
"loss": 0.5276,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.510098934173584,
"rewards/margins": 0.954069972038269,
"rewards/rejected": -4.464169025421143,
"step": 2010
},
{
"epoch": 0.53,
"grad_norm": 9.625,
"learning_rate": 2.6871506715949608e-06,
"logits/chosen": -1.177202582359314,
"logits/rejected": -1.0146461725234985,
"logps/chosen": -568.2487182617188,
"logps/rejected": -659.0941162109375,
"loss": 0.4583,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.152796745300293,
"rewards/margins": 1.0889527797698975,
"rewards/rejected": -4.2417497634887695,
"step": 2020
},
{
"epoch": 0.53,
"grad_norm": 13.6875,
"learning_rate": 2.6643626526448063e-06,
"logits/chosen": -1.2432745695114136,
"logits/rejected": -1.0716017484664917,
"logps/chosen": -619.502685546875,
"logps/rejected": -699.7628173828125,
"loss": 0.4576,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.2460086345672607,
"rewards/margins": 1.2264302968978882,
"rewards/rejected": -4.472439289093018,
"step": 2030
},
{
"epoch": 0.53,
"grad_norm": 9.875,
"learning_rate": 2.6415609094604562e-06,
"logits/chosen": -1.0596590042114258,
"logits/rejected": -1.0028278827667236,
"logps/chosen": -631.6947021484375,
"logps/rejected": -728.5841674804688,
"loss": 0.4471,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.6112823486328125,
"rewards/margins": 1.1590890884399414,
"rewards/rejected": -4.770371437072754,
"step": 2040
},
{
"epoch": 0.54,
"grad_norm": 11.375,
"learning_rate": 2.618747345980904e-06,
"logits/chosen": -1.067651629447937,
"logits/rejected": -0.8701795339584351,
"logps/chosen": -667.7681274414062,
"logps/rejected": -718.9295654296875,
"loss": 0.5561,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.1937079429626465,
"rewards/margins": 1.016485333442688,
"rewards/rejected": -5.210193634033203,
"step": 2050
},
{
"epoch": 0.54,
"grad_norm": 6.125,
"learning_rate": 2.595923867132136e-06,
"logits/chosen": -1.1067336797714233,
"logits/rejected": -0.9798781275749207,
"logps/chosen": -685.84228515625,
"logps/rejected": -784.4832763671875,
"loss": 0.4938,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -4.049218654632568,
"rewards/margins": 1.2331972122192383,
"rewards/rejected": -5.282416343688965,
"step": 2060
},
{
"epoch": 0.54,
"grad_norm": 7.9375,
"learning_rate": 2.5730923786680672e-06,
"logits/chosen": -1.017889380455017,
"logits/rejected": -1.0066477060317993,
"logps/chosen": -639.3632202148438,
"logps/rejected": -738.4698486328125,
"loss": 0.5372,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.8783206939697266,
"rewards/margins": 0.9146150350570679,
"rewards/rejected": -4.792935848236084,
"step": 2070
},
{
"epoch": 0.54,
"grad_norm": 7.3125,
"learning_rate": 2.5502547870114137e-06,
"logits/chosen": -1.1123883724212646,
"logits/rejected": -0.9572793245315552,
"logps/chosen": -607.7706909179688,
"logps/rejected": -670.916015625,
"loss": 0.5255,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.5239059925079346,
"rewards/margins": 0.9338981509208679,
"rewards/rejected": -4.457803726196289,
"step": 2080
},
{
"epoch": 0.55,
"grad_norm": 13.375,
"learning_rate": 2.527412999094507e-06,
"logits/chosen": -1.118983507156372,
"logits/rejected": -0.9597452282905579,
"logps/chosen": -620.9295043945312,
"logps/rejected": -721.0320434570312,
"loss": 0.4802,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.2717292308807373,
"rewards/margins": 1.1265954971313477,
"rewards/rejected": -4.398324489593506,
"step": 2090
},
{
"epoch": 0.55,
"grad_norm": 11.75,
"learning_rate": 2.504568922200064e-06,
"logits/chosen": -1.075067400932312,
"logits/rejected": -0.937818706035614,
"logps/chosen": -547.7574462890625,
"logps/rejected": -641.327392578125,
"loss": 0.5079,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.0641894340515137,
"rewards/margins": 1.0973466634750366,
"rewards/rejected": -4.161535739898682,
"step": 2100
},
{
"epoch": 0.55,
"eval_logits/chosen": -1.0030875205993652,
"eval_logits/rejected": -0.8739129900932312,
"eval_logps/chosen": -582.1657104492188,
"eval_logps/rejected": -667.5426025390625,
"eval_loss": 0.4922982156276703,
"eval_rewards/accuracies": 0.7519999742507935,
"eval_rewards/chosen": -3.1751370429992676,
"eval_rewards/margins": 1.0542099475860596,
"eval_rewards/rejected": -4.229346752166748,
"eval_runtime": 382.3169,
"eval_samples_per_second": 5.231,
"eval_steps_per_second": 0.654,
"step": 2100
},
{
"epoch": 0.55,
"grad_norm": 8.1875,
"learning_rate": 2.4817244638019333e-06,
"logits/chosen": -1.137091875076294,
"logits/rejected": -0.9877273440361023,
"logps/chosen": -593.8831787109375,
"logps/rejected": -648.8990478515625,
"loss": 0.5394,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.1807122230529785,
"rewards/margins": 0.9622472524642944,
"rewards/rejected": -4.1429595947265625,
"step": 2110
},
{
"epoch": 0.55,
"grad_norm": 14.1875,
"learning_rate": 2.4588815314058155e-06,
"logits/chosen": -1.117033839225769,
"logits/rejected": -1.0428097248077393,
"logps/chosen": -536.7808227539062,
"logps/rejected": -599.55908203125,
"loss": 0.4755,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.912168025970459,
"rewards/margins": 0.9705360531806946,
"rewards/rejected": -3.882704257965088,
"step": 2120
},
{
"epoch": 0.56,
"grad_norm": 9.0625,
"learning_rate": 2.4360420323899922e-06,
"logits/chosen": -1.1962370872497559,
"logits/rejected": -1.0757726430892944,
"logps/chosen": -545.7897338867188,
"logps/rejected": -594.7244873046875,
"loss": 0.5644,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.6997714042663574,
"rewards/margins": 0.8151930570602417,
"rewards/rejected": -3.5149643421173096,
"step": 2130
},
{
"epoch": 0.56,
"grad_norm": 6.75,
"learning_rate": 2.4132078738460585e-06,
"logits/chosen": -1.2405675649642944,
"logits/rejected": -1.0946118831634521,
"logps/chosen": -528.01611328125,
"logps/rejected": -594.1393432617188,
"loss": 0.4643,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6354479789733887,
"rewards/margins": 1.046671748161316,
"rewards/rejected": -3.682119846343994,
"step": 2140
},
{
"epoch": 0.56,
"grad_norm": 12.5625,
"learning_rate": 2.3903809624196826e-06,
"logits/chosen": -1.1746861934661865,
"logits/rejected": -1.0529396533966064,
"logps/chosen": -520.6478271484375,
"logps/rejected": -572.0309448242188,
"loss": 0.5516,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.8694403171539307,
"rewards/margins": 0.8386090397834778,
"rewards/rejected": -3.7080490589141846,
"step": 2150
},
{
"epoch": 0.57,
"grad_norm": 11.25,
"learning_rate": 2.3675632041513978e-06,
"logits/chosen": -1.2890937328338623,
"logits/rejected": -1.0460366010665894,
"logps/chosen": -595.07275390625,
"logps/rejected": -639.810791015625,
"loss": 0.4788,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.0806915760040283,
"rewards/margins": 1.094292402267456,
"rewards/rejected": -4.174983978271484,
"step": 2160
},
{
"epoch": 0.57,
"grad_norm": 11.5,
"learning_rate": 2.3447565043174533e-06,
"logits/chosen": -1.1292383670806885,
"logits/rejected": -0.9545844793319702,
"logps/chosen": -596.5003662109375,
"logps/rejected": -650.0792236328125,
"loss": 0.5136,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.3818931579589844,
"rewards/margins": 0.922932505607605,
"rewards/rejected": -4.304825782775879,
"step": 2170
},
{
"epoch": 0.57,
"grad_norm": 12.0,
"learning_rate": 2.321962767270724e-06,
"logits/chosen": -1.158575415611267,
"logits/rejected": -1.0298246145248413,
"logps/chosen": -583.9124755859375,
"logps/rejected": -629.5396118164062,
"loss": 0.5615,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.3395965099334717,
"rewards/margins": 0.8280007243156433,
"rewards/rejected": -4.16759729385376,
"step": 2180
},
{
"epoch": 0.57,
"grad_norm": 8.75,
"learning_rate": 2.299183896281692e-06,
"logits/chosen": -1.088763952255249,
"logits/rejected": -0.9791523218154907,
"logps/chosen": -556.0525512695312,
"logps/rejected": -641.457763671875,
"loss": 0.5181,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.0112056732177734,
"rewards/margins": 0.8770611882209778,
"rewards/rejected": -3.8882670402526855,
"step": 2190
},
{
"epoch": 0.58,
"grad_norm": 7.25,
"learning_rate": 2.2764217933795297e-06,
"logits/chosen": -1.2351996898651123,
"logits/rejected": -1.1065688133239746,
"logps/chosen": -519.6819458007812,
"logps/rejected": -608.1278686523438,
"loss": 0.477,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.5824990272521973,
"rewards/margins": 1.0897197723388672,
"rewards/rejected": -3.6722190380096436,
"step": 2200
},
{
"epoch": 0.58,
"eval_logits/chosen": -1.0880188941955566,
"eval_logits/rejected": -0.9566530585289001,
"eval_logps/chosen": -525.9181518554688,
"eval_logps/rejected": -601.7401733398438,
"eval_loss": 0.48973530530929565,
"eval_rewards/accuracies": 0.7409999966621399,
"eval_rewards/chosen": -2.612661123275757,
"eval_rewards/margins": 0.9586613774299622,
"eval_rewards/rejected": -3.571322441101074,
"eval_runtime": 382.0537,
"eval_samples_per_second": 5.235,
"eval_steps_per_second": 0.654,
"step": 2200
},
{
"epoch": 0.58,
"grad_norm": 5.71875,
"learning_rate": 2.2536783591932786e-06,
"logits/chosen": -1.2977464199066162,
"logits/rejected": -1.1296590566635132,
"logps/chosen": -553.06103515625,
"logps/rejected": -621.307861328125,
"loss": 0.5291,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.7755794525146484,
"rewards/margins": 0.8637927174568176,
"rewards/rejected": -3.6393723487854004,
"step": 2210
},
{
"epoch": 0.58,
"grad_norm": 7.84375,
"learning_rate": 2.230955492793149e-06,
"logits/chosen": -1.0942963361740112,
"logits/rejected": -1.042419195175171,
"logps/chosen": -573.537841796875,
"logps/rejected": -642.611572265625,
"loss": 0.5884,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.997450351715088,
"rewards/margins": 0.8198318481445312,
"rewards/rejected": -3.8172824382781982,
"step": 2220
},
{
"epoch": 0.58,
"grad_norm": 5.71875,
"learning_rate": 2.208255091531947e-06,
"logits/chosen": -1.1044989824295044,
"logits/rejected": -1.0208889245986938,
"logps/chosen": -553.853515625,
"logps/rejected": -632.1079711914062,
"loss": 0.4853,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.7685611248016357,
"rewards/margins": 1.132253646850586,
"rewards/rejected": -3.9008147716522217,
"step": 2230
},
{
"epoch": 0.59,
"grad_norm": 11.75,
"learning_rate": 2.1855790508866435e-06,
"logits/chosen": -1.1996960639953613,
"logits/rejected": -1.0961394309997559,
"logps/chosen": -557.0603637695312,
"logps/rejected": -641.5968017578125,
"loss": 0.5037,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.600390672683716,
"rewards/margins": 1.021994948387146,
"rewards/rejected": -3.6223855018615723,
"step": 2240
},
{
"epoch": 0.59,
"grad_norm": 6.28125,
"learning_rate": 2.162929264300107e-06,
"logits/chosen": -1.2133983373641968,
"logits/rejected": -1.109574556350708,
"logps/chosen": -511.7315979003906,
"logps/rejected": -615.6173095703125,
"loss": 0.416,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.4276764392852783,
"rewards/margins": 1.2624719142913818,
"rewards/rejected": -3.690148115158081,
"step": 2250
},
{
"epoch": 0.59,
"grad_norm": 12.1875,
"learning_rate": 2.1403076230230006e-06,
"logits/chosen": -1.1181437969207764,
"logits/rejected": -0.9982963800430298,
"logps/chosen": -565.5302124023438,
"logps/rejected": -622.5106811523438,
"loss": 0.5759,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.9434773921966553,
"rewards/margins": 0.8478938341140747,
"rewards/rejected": -3.7913711071014404,
"step": 2260
},
{
"epoch": 0.59,
"grad_norm": 7.59375,
"learning_rate": 2.11771601595586e-06,
"logits/chosen": -1.2033512592315674,
"logits/rejected": -1.0716886520385742,
"logps/chosen": -557.2864379882812,
"logps/rejected": -603.1704711914062,
"loss": 0.5099,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.752382755279541,
"rewards/margins": 0.9814404249191284,
"rewards/rejected": -3.73382306098938,
"step": 2270
},
{
"epoch": 0.6,
"grad_norm": 12.8125,
"learning_rate": 2.0951563294913737e-06,
"logits/chosen": -1.177409052848816,
"logits/rejected": -0.9869596362113953,
"logps/chosen": -525.6967163085938,
"logps/rejected": -594.2974853515625,
"loss": 0.4644,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.641634941101074,
"rewards/margins": 0.9909149408340454,
"rewards/rejected": -3.63254976272583,
"step": 2280
},
{
"epoch": 0.6,
"grad_norm": 8.0625,
"learning_rate": 2.0726304473568693e-06,
"logits/chosen": -1.1395372152328491,
"logits/rejected": -1.0176304578781128,
"logps/chosen": -522.652099609375,
"logps/rejected": -593.3766479492188,
"loss": 0.4738,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.6371326446533203,
"rewards/margins": 1.0305713415145874,
"rewards/rejected": -3.667703628540039,
"step": 2290
},
{
"epoch": 0.6,
"grad_norm": 10.4375,
"learning_rate": 2.050140250457023e-06,
"logits/chosen": -1.2590233087539673,
"logits/rejected": -1.052428960800171,
"logps/chosen": -578.8065185546875,
"logps/rejected": -654.0260009765625,
"loss": 0.4829,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0124454498291016,
"rewards/margins": 1.0927618741989136,
"rewards/rejected": -4.1052069664001465,
"step": 2300
},
{
"epoch": 0.6,
"eval_logits/chosen": -1.0313422679901123,
"eval_logits/rejected": -0.9032019972801208,
"eval_logps/chosen": -559.955810546875,
"eval_logps/rejected": -654.1510620117188,
"eval_loss": 0.4887396991252899,
"eval_rewards/accuracies": 0.7484999895095825,
"eval_rewards/chosen": -2.953037738800049,
"eval_rewards/margins": 1.1423934698104858,
"eval_rewards/rejected": -4.095431804656982,
"eval_runtime": 381.9442,
"eval_samples_per_second": 5.236,
"eval_steps_per_second": 0.655,
"step": 2300
},
{
"epoch": 0.6,
"grad_norm": 14.625,
"learning_rate": 2.0276876167168042e-06,
"logits/chosen": -1.0072084665298462,
"logits/rejected": -0.9061794281005859,
"logps/chosen": -509.2284240722656,
"logps/rejected": -580.1068725585938,
"loss": 0.5548,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8472893238067627,
"rewards/margins": 1.0346016883850098,
"rewards/rejected": -3.8818912506103516,
"step": 2310
},
{
"epoch": 0.61,
"grad_norm": 8.25,
"learning_rate": 2.0052744209246682e-06,
"logits/chosen": -1.1624600887298584,
"logits/rejected": -1.04361891746521,
"logps/chosen": -552.9761962890625,
"logps/rejected": -621.9478759765625,
"loss": 0.5046,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.9763803482055664,
"rewards/margins": 1.018448829650879,
"rewards/rejected": -3.9948291778564453,
"step": 2320
},
{
"epoch": 0.61,
"grad_norm": 10.75,
"learning_rate": 1.9829025345760127e-06,
"logits/chosen": -1.1844617128372192,
"logits/rejected": -1.1262612342834473,
"logps/chosen": -559.8540649414062,
"logps/rejected": -640.3355712890625,
"loss": 0.549,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.8085100650787354,
"rewards/margins": 0.865519642829895,
"rewards/rejected": -3.67402982711792,
"step": 2330
},
{
"epoch": 0.61,
"grad_norm": 10.0625,
"learning_rate": 1.9605738257169115e-06,
"logits/chosen": -1.1309086084365845,
"logits/rejected": -0.9911936521530151,
"logps/chosen": -502.54608154296875,
"logps/rejected": -611.60693359375,
"loss": 0.4877,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6956772804260254,
"rewards/margins": 1.1704528331756592,
"rewards/rejected": -3.8661301136016846,
"step": 2340
},
{
"epoch": 0.62,
"grad_norm": 9.1875,
"learning_rate": 1.9382901587881275e-06,
"logits/chosen": -1.196989893913269,
"logits/rejected": -1.0731130838394165,
"logps/chosen": -527.642578125,
"logps/rejected": -616.3968505859375,
"loss": 0.4233,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.6888630390167236,
"rewards/margins": 1.2105457782745361,
"rewards/rejected": -3.8994088172912598,
"step": 2350
},
{
"epoch": 0.62,
"grad_norm": 11.9375,
"learning_rate": 1.916053394469437e-06,
"logits/chosen": -1.2187442779541016,
"logits/rejected": -1.0278013944625854,
"logps/chosen": -555.1328125,
"logps/rejected": -650.1771240234375,
"loss": 0.5309,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.9184062480926514,
"rewards/margins": 1.0958768129348755,
"rewards/rejected": -4.014283180236816,
"step": 2360
},
{
"epoch": 0.62,
"grad_norm": 9.5625,
"learning_rate": 1.8938653895242604e-06,
"logits/chosen": -1.173482894897461,
"logits/rejected": -0.9950237274169922,
"logps/chosen": -563.7232666015625,
"logps/rejected": -654.51611328125,
"loss": 0.4349,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.9941768646240234,
"rewards/margins": 1.1962960958480835,
"rewards/rejected": -4.1904730796813965,
"step": 2370
},
{
"epoch": 0.62,
"grad_norm": 10.9375,
"learning_rate": 1.8717279966446267e-06,
"logits/chosen": -1.0182400941848755,
"logits/rejected": -0.9381190538406372,
"logps/chosen": -567.86376953125,
"logps/rejected": -672.0901489257812,
"loss": 0.4496,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.1928865909576416,
"rewards/margins": 1.1378134489059448,
"rewards/rejected": -4.330699920654297,
"step": 2380
},
{
"epoch": 0.63,
"grad_norm": 6.90625,
"learning_rate": 1.8496430642964698e-06,
"logits/chosen": -1.0953130722045898,
"logits/rejected": -0.9763644337654114,
"logps/chosen": -591.7195434570312,
"logps/rejected": -673.8305053710938,
"loss": 0.4954,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.1930747032165527,
"rewards/margins": 1.0575921535491943,
"rewards/rejected": -4.250667095184326,
"step": 2390
},
{
"epoch": 0.63,
"grad_norm": 7.96875,
"learning_rate": 1.827612436565286e-06,
"logits/chosen": -1.093685507774353,
"logits/rejected": -0.9428181648254395,
"logps/chosen": -569.9864501953125,
"logps/rejected": -664.4702758789062,
"loss": 0.4752,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.0702195167541504,
"rewards/margins": 1.1502970457077026,
"rewards/rejected": -4.220516681671143,
"step": 2400
},
{
"epoch": 0.63,
"eval_logits/chosen": -0.9764781594276428,
"eval_logits/rejected": -0.849520742893219,
"eval_logps/chosen": -579.4506225585938,
"eval_logps/rejected": -672.75830078125,
"eval_loss": 0.49094268679618835,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -3.147986888885498,
"eval_rewards/margins": 1.1335173845291138,
"eval_rewards/rejected": -4.281503677368164,
"eval_runtime": 382.2569,
"eval_samples_per_second": 5.232,
"eval_steps_per_second": 0.654,
"step": 2400
},
{
"epoch": 0.63,
"grad_norm": 18.625,
"learning_rate": 1.8056379530021492e-06,
"logits/chosen": -1.1393061876296997,
"logits/rejected": -1.0437672138214111,
"logps/chosen": -565.1177978515625,
"logps/rejected": -631.9932861328125,
"loss": 0.5436,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.2091946601867676,
"rewards/margins": 0.9168522953987122,
"rewards/rejected": -4.126046180725098,
"step": 2410
},
{
"epoch": 0.63,
"grad_norm": 10.375,
"learning_rate": 1.7837214484701154e-06,
"logits/chosen": -1.182935118675232,
"logits/rejected": -1.0437054634094238,
"logps/chosen": -523.6812133789062,
"logps/rejected": -616.8724975585938,
"loss": 0.4678,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.713310480117798,
"rewards/margins": 1.1654255390167236,
"rewards/rejected": -3.8787360191345215,
"step": 2420
},
{
"epoch": 0.64,
"grad_norm": 14.125,
"learning_rate": 1.7618647529910043e-06,
"logits/chosen": -1.1824162006378174,
"logits/rejected": -1.051477313041687,
"logps/chosen": -526.3547973632812,
"logps/rejected": -624.6488647460938,
"loss": 0.4987,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.6958136558532715,
"rewards/margins": 1.1019628047943115,
"rewards/rejected": -3.797776460647583,
"step": 2430
},
{
"epoch": 0.64,
"grad_norm": 9.375,
"learning_rate": 1.7400696915925996e-06,
"logits/chosen": -1.1761425733566284,
"logits/rejected": -0.9889799952507019,
"logps/chosen": -560.6347045898438,
"logps/rejected": -604.340576171875,
"loss": 0.5198,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.900243043899536,
"rewards/margins": 1.019816279411316,
"rewards/rejected": -3.9200592041015625,
"step": 2440
},
{
"epoch": 0.64,
"grad_norm": 11.8125,
"learning_rate": 1.718338084156254e-06,
"logits/chosen": -1.1455858945846558,
"logits/rejected": -0.9903894662857056,
"logps/chosen": -568.4344482421875,
"logps/rejected": -638.8942260742188,
"loss": 0.4578,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.8164334297180176,
"rewards/margins": 1.0884320735931396,
"rewards/rejected": -3.9048657417297363,
"step": 2450
},
{
"epoch": 0.64,
"grad_norm": 9.625,
"learning_rate": 1.6966717452649372e-06,
"logits/chosen": -1.2747197151184082,
"logits/rejected": -1.101963758468628,
"logps/chosen": -554.3800659179688,
"logps/rejected": -616.3612060546875,
"loss": 0.4412,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.774376392364502,
"rewards/margins": 1.1384481191635132,
"rewards/rejected": -3.9128241539001465,
"step": 2460
},
{
"epoch": 0.65,
"grad_norm": 9.5625,
"learning_rate": 1.6750724840517103e-06,
"logits/chosen": -1.2133910655975342,
"logits/rejected": -1.1471474170684814,
"logps/chosen": -530.1273193359375,
"logps/rejected": -630.1476440429688,
"loss": 0.5062,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.7732110023498535,
"rewards/margins": 0.9591614007949829,
"rewards/rejected": -3.7323715686798096,
"step": 2470
},
{
"epoch": 0.65,
"grad_norm": 11.875,
"learning_rate": 1.6535421040486686e-06,
"logits/chosen": -1.0105046033859253,
"logits/rejected": -0.9159660339355469,
"logps/chosen": -560.3009643554688,
"logps/rejected": -653.0996704101562,
"loss": 0.4182,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.069490909576416,
"rewards/margins": 1.270485520362854,
"rewards/rejected": -4.3399763107299805,
"step": 2480
},
{
"epoch": 0.65,
"grad_norm": 13.4375,
"learning_rate": 1.6320824030363458e-06,
"logits/chosen": -1.0919368267059326,
"logits/rejected": -1.0423280000686646,
"logps/chosen": -547.108154296875,
"logps/rejected": -651.2943725585938,
"loss": 0.4663,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.1227710247039795,
"rewards/margins": 1.1962798833847046,
"rewards/rejected": -4.3190507888793945,
"step": 2490
},
{
"epoch": 0.65,
"grad_norm": 14.625,
"learning_rate": 1.6106951728936028e-06,
"logits/chosen": -1.1967922449111938,
"logits/rejected": -1.0710703134536743,
"logps/chosen": -573.5470581054688,
"logps/rejected": -666.3677978515625,
"loss": 0.5249,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.163433313369751,
"rewards/margins": 1.0070708990097046,
"rewards/rejected": -4.170504093170166,
"step": 2500
},
{
"epoch": 0.65,
"eval_logits/chosen": -1.0434505939483643,
"eval_logits/rejected": -0.9135813117027283,
"eval_logps/chosen": -574.00927734375,
"eval_logps/rejected": -664.8961791992188,
"eval_loss": 0.4891022741794586,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -3.0935721397399902,
"eval_rewards/margins": 1.1093100309371948,
"eval_rewards/rejected": -4.202882289886475,
"eval_runtime": 382.3246,
"eval_samples_per_second": 5.231,
"eval_steps_per_second": 0.654,
"step": 2500
},
{
"epoch": 0.66,
"grad_norm": 9.4375,
"learning_rate": 1.5893821994479996e-06,
"logits/chosen": -1.1978858709335327,
"logits/rejected": -1.0786705017089844,
"logps/chosen": -573.3375244140625,
"logps/rejected": -648.0001831054688,
"loss": 0.4737,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.959294080734253,
"rewards/margins": 1.132147192955017,
"rewards/rejected": -4.0914411544799805,
"step": 2510
},
{
"epoch": 0.66,
"grad_norm": 7.875,
"learning_rate": 1.5681452623266868e-06,
"logits/chosen": -1.1913158893585205,
"logits/rejected": -0.9305517077445984,
"logps/chosen": -603.19873046875,
"logps/rejected": -671.5530395507812,
"loss": 0.4638,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.1415326595306396,
"rewards/margins": 1.2662583589553833,
"rewards/rejected": -4.4077911376953125,
"step": 2520
},
{
"epoch": 0.66,
"grad_norm": 6.15625,
"learning_rate": 1.5469861348078014e-06,
"logits/chosen": -1.1753239631652832,
"logits/rejected": -1.0243064165115356,
"logps/chosen": -557.4925537109375,
"logps/rejected": -671.5239868164062,
"loss": 0.4264,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.1072232723236084,
"rewards/margins": 1.246586561203003,
"rewards/rejected": -4.353809833526611,
"step": 2530
},
{
"epoch": 0.66,
"grad_norm": 8.5625,
"learning_rate": 1.5259065836724035e-06,
"logits/chosen": -1.0654633045196533,
"logits/rejected": -0.9947797656059265,
"logps/chosen": -555.5715942382812,
"logps/rejected": -674.6041259765625,
"loss": 0.428,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.1076245307922363,
"rewards/margins": 1.2515560388565063,
"rewards/rejected": -4.359180927276611,
"step": 2540
},
{
"epoch": 0.67,
"grad_norm": 17.5,
"learning_rate": 1.5049083690569456e-06,
"logits/chosen": -1.117201328277588,
"logits/rejected": -1.024710774421692,
"logps/chosen": -542.8455200195312,
"logps/rejected": -661.6935424804688,
"loss": 0.4846,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.0537521839141846,
"rewards/margins": 1.2326675653457642,
"rewards/rejected": -4.286419868469238,
"step": 2550
},
{
"epoch": 0.67,
"grad_norm": 13.75,
"learning_rate": 1.4839932443063057e-06,
"logits/chosen": -1.1161174774169922,
"logits/rejected": -0.9579364061355591,
"logps/chosen": -589.6568603515625,
"logps/rejected": -655.3709716796875,
"loss": 0.4618,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.976780414581299,
"rewards/margins": 1.220205307006836,
"rewards/rejected": -4.196985721588135,
"step": 2560
},
{
"epoch": 0.67,
"grad_norm": 18.625,
"learning_rate": 1.4631629558273803e-06,
"logits/chosen": -1.1335794925689697,
"logits/rejected": -1.004740595817566,
"logps/chosen": -549.504150390625,
"logps/rejected": -625.6862182617188,
"loss": 0.631,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -3.0777907371520996,
"rewards/margins": 0.8784114122390747,
"rewards/rejected": -3.9562020301818848,
"step": 2570
},
{
"epoch": 0.68,
"grad_norm": 6.03125,
"learning_rate": 1.4424192429432657e-06,
"logits/chosen": -1.2103271484375,
"logits/rejected": -1.1048699617385864,
"logps/chosen": -521.5680541992188,
"logps/rejected": -641.9281616210938,
"loss": 0.4666,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.6569151878356934,
"rewards/margins": 1.1702333688735962,
"rewards/rejected": -3.8271484375,
"step": 2580
},
{
"epoch": 0.68,
"grad_norm": 13.0,
"learning_rate": 1.421763837748016e-06,
"logits/chosen": -1.1741114854812622,
"logits/rejected": -1.0814844369888306,
"logps/chosen": -523.6945190429688,
"logps/rejected": -640.1383056640625,
"loss": 0.4441,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.7306861877441406,
"rewards/margins": 1.2494643926620483,
"rewards/rejected": -3.9801506996154785,
"step": 2590
},
{
"epoch": 0.68,
"grad_norm": 10.75,
"learning_rate": 1.401198464962021e-06,
"logits/chosen": -1.2068405151367188,
"logits/rejected": -1.0479636192321777,
"logps/chosen": -556.838623046875,
"logps/rejected": -625.3237915039062,
"loss": 0.4596,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.882551908493042,
"rewards/margins": 1.0710302591323853,
"rewards/rejected": -3.953582286834717,
"step": 2600
},
{
"epoch": 0.68,
"eval_logits/chosen": -1.0548917055130005,
"eval_logits/rejected": -0.9263830184936523,
"eval_logps/chosen": -559.5697631835938,
"eval_logps/rejected": -654.4569702148438,
"eval_loss": 0.493943989276886,
"eval_rewards/accuracies": 0.7400000095367432,
"eval_rewards/chosen": -2.9491782188415527,
"eval_rewards/margins": 1.149312973022461,
"eval_rewards/rejected": -4.098491191864014,
"eval_runtime": 381.8434,
"eval_samples_per_second": 5.238,
"eval_steps_per_second": 0.655,
"step": 2600
},
{
"epoch": 0.68,
"grad_norm": 8.5,
"learning_rate": 1.3807248417879896e-06,
"logits/chosen": -1.2618989944458008,
"logits/rejected": -1.1420848369598389,
"logps/chosen": -562.00146484375,
"logps/rejected": -670.0994873046875,
"loss": 0.4435,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.889355182647705,
"rewards/margins": 1.2824347019195557,
"rewards/rejected": -4.17179012298584,
"step": 2610
},
{
"epoch": 0.69,
"grad_norm": 29.125,
"learning_rate": 1.3603446777675665e-06,
"logits/chosen": -1.0890090465545654,
"logits/rejected": -0.966164767742157,
"logps/chosen": -583.3985595703125,
"logps/rejected": -678.4222412109375,
"loss": 0.5331,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.2020103931427,
"rewards/margins": 1.1710442304611206,
"rewards/rejected": -4.373054504394531,
"step": 2620
},
{
"epoch": 0.69,
"grad_norm": 6.84375,
"learning_rate": 1.3400596746385817e-06,
"logits/chosen": -1.2348748445510864,
"logits/rejected": -1.083888053894043,
"logps/chosen": -578.0357666015625,
"logps/rejected": -659.4061279296875,
"loss": 0.522,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.088522434234619,
"rewards/margins": 1.0845201015472412,
"rewards/rejected": -4.173042297363281,
"step": 2630
},
{
"epoch": 0.69,
"grad_norm": 8.6875,
"learning_rate": 1.3198715261929587e-06,
"logits/chosen": -1.1974236965179443,
"logits/rejected": -1.0507824420928955,
"logps/chosen": -558.0233764648438,
"logps/rejected": -667.1177978515625,
"loss": 0.4239,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.2156710624694824,
"rewards/margins": 1.2272260189056396,
"rewards/rejected": -4.442896842956543,
"step": 2640
},
{
"epoch": 0.69,
"grad_norm": 7.34375,
"learning_rate": 1.2997819181352823e-06,
"logits/chosen": -1.2283174991607666,
"logits/rejected": -1.0654624700546265,
"logps/chosen": -604.8272705078125,
"logps/rejected": -724.4739379882812,
"loss": 0.4118,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.079939365386963,
"rewards/margins": 1.4414037466049194,
"rewards/rejected": -4.521343231201172,
"step": 2650
},
{
"epoch": 0.7,
"grad_norm": 23.625,
"learning_rate": 1.2797925279420454e-06,
"logits/chosen": -1.1807067394256592,
"logits/rejected": -1.0574986934661865,
"logps/chosen": -610.4517822265625,
"logps/rejected": -721.9064331054688,
"loss": 0.489,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.385681629180908,
"rewards/margins": 1.2347917556762695,
"rewards/rejected": -4.620473384857178,
"step": 2660
},
{
"epoch": 0.7,
"grad_norm": 12.5,
"learning_rate": 1.2599050247215764e-06,
"logits/chosen": -1.129962682723999,
"logits/rejected": -1.0201483964920044,
"logps/chosen": -585.4744262695312,
"logps/rejected": -686.8712158203125,
"loss": 0.4794,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.2754268646240234,
"rewards/margins": 1.2443504333496094,
"rewards/rejected": -4.519776821136475,
"step": 2670
},
{
"epoch": 0.7,
"grad_norm": 12.25,
"learning_rate": 1.2401210690746705e-06,
"logits/chosen": -1.155137300491333,
"logits/rejected": -1.012924313545227,
"logps/chosen": -587.5916748046875,
"logps/rejected": -667.8207397460938,
"loss": 0.5131,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.2043495178222656,
"rewards/margins": 1.134204626083374,
"rewards/rejected": -4.338554859161377,
"step": 2680
},
{
"epoch": 0.7,
"grad_norm": 12.4375,
"learning_rate": 1.2204423129559306e-06,
"logits/chosen": -1.1951662302017212,
"logits/rejected": -1.140353798866272,
"logps/chosen": -567.091552734375,
"logps/rejected": -681.1925048828125,
"loss": 0.4925,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0157859325408936,
"rewards/margins": 1.1822477579116821,
"rewards/rejected": -4.198033332824707,
"step": 2690
},
{
"epoch": 0.71,
"grad_norm": 15.0625,
"learning_rate": 1.20087039953583e-06,
"logits/chosen": -1.2230998277664185,
"logits/rejected": -1.1086806058883667,
"logps/chosen": -558.0277099609375,
"logps/rejected": -655.5286865234375,
"loss": 0.5152,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.9380927085876465,
"rewards/margins": 1.2388523817062378,
"rewards/rejected": -4.176945209503174,
"step": 2700
},
{
"epoch": 0.71,
"eval_logits/chosen": -1.052660346031189,
"eval_logits/rejected": -0.9249356985092163,
"eval_logps/chosen": -566.6193237304688,
"eval_logps/rejected": -660.3236083984375,
"eval_loss": 0.49224671721458435,
"eval_rewards/accuracies": 0.7440000176429749,
"eval_rewards/chosen": -3.0196733474731445,
"eval_rewards/margins": 1.1374843120574951,
"eval_rewards/rejected": -4.1571574211120605,
"eval_runtime": 382.3055,
"eval_samples_per_second": 5.231,
"eval_steps_per_second": 0.654,
"step": 2700
},
{
"epoch": 0.71,
"grad_norm": 10.625,
"learning_rate": 1.181406963063507e-06,
"logits/chosen": -1.1344083547592163,
"logits/rejected": -1.0651142597198486,
"logps/chosen": -557.28125,
"logps/rejected": -663.6448974609375,
"loss": 0.5133,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.9143428802490234,
"rewards/margins": 1.0695984363555908,
"rewards/rejected": -3.9839415550231934,
"step": 2710
},
{
"epoch": 0.71,
"grad_norm": 6.84375,
"learning_rate": 1.1620536287303052e-06,
"logits/chosen": -1.2466278076171875,
"logits/rejected": -1.1265995502471924,
"logps/chosen": -571.1409301757812,
"logps/rejected": -636.3128662109375,
"loss": 0.5366,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.799872875213623,
"rewards/margins": 0.9532085657119751,
"rewards/rejected": -3.7530815601348877,
"step": 2720
},
{
"epoch": 0.71,
"grad_norm": 9.8125,
"learning_rate": 1.1428120125340717e-06,
"logits/chosen": -1.1743571758270264,
"logits/rejected": -1.024549126625061,
"logps/chosen": -524.5095825195312,
"logps/rejected": -638.3724365234375,
"loss": 0.3937,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.7037060260772705,
"rewards/margins": 1.5533134937286377,
"rewards/rejected": -4.257019519805908,
"step": 2730
},
{
"epoch": 0.72,
"grad_norm": 9.5625,
"learning_rate": 1.123683721144223e-06,
"logits/chosen": -1.186992883682251,
"logits/rejected": -1.0803272724151611,
"logps/chosen": -567.0985107421875,
"logps/rejected": -677.031005859375,
"loss": 0.4245,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.834251880645752,
"rewards/margins": 1.422716498374939,
"rewards/rejected": -4.256968021392822,
"step": 2740
},
{
"epoch": 0.72,
"grad_norm": 6.96875,
"learning_rate": 1.1046703517675848e-06,
"logits/chosen": -1.1976065635681152,
"logits/rejected": -1.1182498931884766,
"logps/chosen": -537.647216796875,
"logps/rejected": -647.6975708007812,
"loss": 0.5195,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.8253092765808105,
"rewards/margins": 1.0392690896987915,
"rewards/rejected": -3.8645782470703125,
"step": 2750
},
{
"epoch": 0.72,
"grad_norm": 10.9375,
"learning_rate": 1.085773492015028e-06,
"logits/chosen": -1.1978458166122437,
"logits/rejected": -1.0323292016983032,
"logps/chosen": -516.9109497070312,
"logps/rejected": -612.7794189453125,
"loss": 0.4273,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.7053141593933105,
"rewards/margins": 1.3017933368682861,
"rewards/rejected": -4.007107257843018,
"step": 2760
},
{
"epoch": 0.72,
"grad_norm": 18.5,
"learning_rate": 1.0669947197689034e-06,
"logits/chosen": -1.15623140335083,
"logits/rejected": -1.0121409893035889,
"logps/chosen": -561.9942626953125,
"logps/rejected": -639.5707397460938,
"loss": 0.5067,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.893584728240967,
"rewards/margins": 1.0627275705337524,
"rewards/rejected": -3.956312656402588,
"step": 2770
},
{
"epoch": 0.73,
"grad_norm": 9.4375,
"learning_rate": 1.048335603051291e-06,
"logits/chosen": -1.1546833515167236,
"logits/rejected": -1.0220603942871094,
"logps/chosen": -599.4776611328125,
"logps/rejected": -710.4974975585938,
"loss": 0.4331,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.1167383193969727,
"rewards/margins": 1.39237380027771,
"rewards/rejected": -4.5091118812561035,
"step": 2780
},
{
"epoch": 0.73,
"grad_norm": 9.75,
"learning_rate": 1.0297976998930665e-06,
"logits/chosen": -1.1516591310501099,
"logits/rejected": -1.0285645723342896,
"logps/chosen": -560.0816650390625,
"logps/rejected": -675.4591064453125,
"loss": 0.4367,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.9946718215942383,
"rewards/margins": 1.4317247867584229,
"rewards/rejected": -4.42639684677124,
"step": 2790
},
{
"epoch": 0.73,
"grad_norm": 10.9375,
"learning_rate": 1.0113825582038078e-06,
"logits/chosen": -1.1821314096450806,
"logits/rejected": -1.0650185346603394,
"logps/chosen": -576.8660278320312,
"logps/rejected": -679.5147705078125,
"loss": 0.4518,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.115962266921997,
"rewards/margins": 1.1937239170074463,
"rewards/rejected": -4.309685707092285,
"step": 2800
},
{
"epoch": 0.73,
"eval_logits/chosen": -1.053481936454773,
"eval_logits/rejected": -0.9260234236717224,
"eval_logps/chosen": -571.3138427734375,
"eval_logps/rejected": -668.0293579101562,
"eval_loss": 0.49084553122520447,
"eval_rewards/accuracies": 0.7415000200271606,
"eval_rewards/chosen": -3.066617965698242,
"eval_rewards/margins": 1.1675963401794434,
"eval_rewards/rejected": -4.2342143058776855,
"eval_runtime": 382.1708,
"eval_samples_per_second": 5.233,
"eval_steps_per_second": 0.654,
"step": 2800
},
{
"epoch": 0.74,
"grad_norm": 10.625,
"learning_rate": 9.930917156425477e-07,
"logits/chosen": -1.1559561491012573,
"logits/rejected": -1.0568530559539795,
"logps/chosen": -582.1268310546875,
"logps/rejected": -691.96875,
"loss": 0.5368,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.210031509399414,
"rewards/margins": 1.1272989511489868,
"rewards/rejected": -4.337330341339111,
"step": 2810
},
{
"epoch": 0.74,
"grad_norm": 15.25,
"learning_rate": 9.749266994893756e-07,
"logits/chosen": -1.0973955392837524,
"logits/rejected": -0.9485132098197937,
"logps/chosen": -550.6517333984375,
"logps/rejected": -629.6903686523438,
"loss": 0.5621,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.0995354652404785,
"rewards/margins": 0.9246597290039062,
"rewards/rejected": -4.024195671081543,
"step": 2820
},
{
"epoch": 0.74,
"grad_norm": 15.3125,
"learning_rate": 9.56889026517913e-07,
"logits/chosen": -1.1514110565185547,
"logits/rejected": -1.0361002683639526,
"logps/chosen": -582.6224365234375,
"logps/rejected": -664.3800659179688,
"loss": 0.5019,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.2128403186798096,
"rewards/margins": 1.0774794816970825,
"rewards/rejected": -4.290319442749023,
"step": 2830
},
{
"epoch": 0.74,
"grad_norm": 7.03125,
"learning_rate": 9.389802028686617e-07,
"logits/chosen": -1.2338387966156006,
"logits/rejected": -1.1366431713104248,
"logps/chosen": -566.8738403320312,
"logps/rejected": -616.0252685546875,
"loss": 0.5826,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0610568523406982,
"rewards/margins": 0.8211328387260437,
"rewards/rejected": -3.882189989089966,
"step": 2840
},
{
"epoch": 0.75,
"grad_norm": 9.75,
"learning_rate": 9.212017239232427e-07,
"logits/chosen": -1.1542332172393799,
"logits/rejected": -1.017268180847168,
"logps/chosen": -568.286376953125,
"logps/rejected": -668.4588623046875,
"loss": 0.4741,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.9533185958862305,
"rewards/margins": 1.2311924695968628,
"rewards/rejected": -4.184511184692383,
"step": 2850
},
{
"epoch": 0.75,
"grad_norm": 10.0625,
"learning_rate": 9.03555074179533e-07,
"logits/chosen": -1.1374441385269165,
"logits/rejected": -1.1105449199676514,
"logps/chosen": -544.0662231445312,
"logps/rejected": -676.7945556640625,
"loss": 0.446,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.872863292694092,
"rewards/margins": 1.2689627408981323,
"rewards/rejected": -4.1418256759643555,
"step": 2860
},
{
"epoch": 0.75,
"grad_norm": 14.5625,
"learning_rate": 8.860417271277067e-07,
"logits/chosen": -1.263672947883606,
"logits/rejected": -1.2044599056243896,
"logps/chosen": -563.6286010742188,
"logps/rejected": -651.6553955078125,
"loss": 0.4788,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.9439358711242676,
"rewards/margins": 0.9601505398750305,
"rewards/rejected": -3.9040865898132324,
"step": 2870
},
{
"epoch": 0.75,
"grad_norm": 8.75,
"learning_rate": 8.686631451272029e-07,
"logits/chosen": -1.2087829113006592,
"logits/rejected": -1.0665159225463867,
"logps/chosen": -564.14892578125,
"logps/rejected": -660.1915893554688,
"loss": 0.4861,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.1072099208831787,
"rewards/margins": 1.2149550914764404,
"rewards/rejected": -4.322165489196777,
"step": 2880
},
{
"epoch": 0.76,
"grad_norm": 8.625,
"learning_rate": 8.514207792846168e-07,
"logits/chosen": -1.2422146797180176,
"logits/rejected": -1.1245746612548828,
"logps/chosen": -556.6324462890625,
"logps/rejected": -642.3776245117188,
"loss": 0.4902,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.0577263832092285,
"rewards/margins": 1.1418030261993408,
"rewards/rejected": -4.19952917098999,
"step": 2890
},
{
"epoch": 0.76,
"grad_norm": 8.0625,
"learning_rate": 8.343160693325356e-07,
"logits/chosen": -1.1230237483978271,
"logits/rejected": -1.0151801109313965,
"logps/chosen": -566.5771484375,
"logps/rejected": -679.12646484375,
"loss": 0.5018,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.1102497577667236,
"rewards/margins": 1.1686756610870361,
"rewards/rejected": -4.278925895690918,
"step": 2900
},
{
"epoch": 0.76,
"eval_logits/chosen": -1.059489130973816,
"eval_logits/rejected": -0.9320334792137146,
"eval_logps/chosen": -574.426025390625,
"eval_logps/rejected": -668.4285278320312,
"eval_loss": 0.4876534342765808,
"eval_rewards/accuracies": 0.7465000152587891,
"eval_rewards/chosen": -3.0977394580841064,
"eval_rewards/margins": 1.1404662132263184,
"eval_rewards/rejected": -4.238205432891846,
"eval_runtime": 382.316,
"eval_samples_per_second": 5.231,
"eval_steps_per_second": 0.654,
"step": 2900
},
{
"epoch": 0.76,
"grad_norm": 8.125,
"learning_rate": 8.173504435093174e-07,
"logits/chosen": -1.1287494897842407,
"logits/rejected": -0.955623984336853,
"logps/chosen": -547.8873291015625,
"logps/rejected": -640.971923828125,
"loss": 0.477,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.0631890296936035,
"rewards/margins": 1.2520211935043335,
"rewards/rejected": -4.315210342407227,
"step": 2910
},
{
"epoch": 0.76,
"grad_norm": 6.5625,
"learning_rate": 8.00525318439836e-07,
"logits/chosen": -1.158349871635437,
"logits/rejected": -1.0400350093841553,
"logps/chosen": -583.4833374023438,
"logps/rejected": -674.5729370117188,
"loss": 0.5408,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0665407180786133,
"rewards/margins": 0.9871135950088501,
"rewards/rejected": -4.053654193878174,
"step": 2920
},
{
"epoch": 0.77,
"grad_norm": 7.6875,
"learning_rate": 7.838420990171927e-07,
"logits/chosen": -1.2469195127487183,
"logits/rejected": -1.0984286069869995,
"logps/chosen": -567.165283203125,
"logps/rejected": -650.6731567382812,
"loss": 0.5017,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.977949380874634,
"rewards/margins": 1.090990424156189,
"rewards/rejected": -4.068940162658691,
"step": 2930
},
{
"epoch": 0.77,
"grad_norm": 7.5625,
"learning_rate": 7.673021782854084e-07,
"logits/chosen": -1.1217727661132812,
"logits/rejected": -0.9839452505111694,
"logps/chosen": -561.6543579101562,
"logps/rejected": -643.6695556640625,
"loss": 0.4898,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.009657382965088,
"rewards/margins": 1.2389792203903198,
"rewards/rejected": -4.248636722564697,
"step": 2940
},
{
"epoch": 0.77,
"grad_norm": 9.9375,
"learning_rate": 7.509069373231039e-07,
"logits/chosen": -1.129913568496704,
"logits/rejected": -1.0110609531402588,
"logps/chosen": -554.6318969726562,
"logps/rejected": -622.6085205078125,
"loss": 0.5441,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.0194828510284424,
"rewards/margins": 0.9275726079940796,
"rewards/rejected": -3.9470553398132324,
"step": 2950
},
{
"epoch": 0.77,
"grad_norm": 7.71875,
"learning_rate": 7.346577451281822e-07,
"logits/chosen": -1.1370588541030884,
"logits/rejected": -1.0633890628814697,
"logps/chosen": -551.51123046875,
"logps/rejected": -660.9559936523438,
"loss": 0.4596,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.892915725708008,
"rewards/margins": 1.3355481624603271,
"rewards/rejected": -4.228463649749756,
"step": 2960
},
{
"epoch": 0.78,
"grad_norm": 18.625,
"learning_rate": 7.185559585035138e-07,
"logits/chosen": -1.1904377937316895,
"logits/rejected": -1.0318008661270142,
"logps/chosen": -591.028564453125,
"logps/rejected": -693.4492797851562,
"loss": 0.4733,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.0825228691101074,
"rewards/margins": 1.1828874349594116,
"rewards/rejected": -4.26540994644165,
"step": 2970
},
{
"epoch": 0.78,
"grad_norm": 7.78125,
"learning_rate": 7.026029219436504e-07,
"logits/chosen": -1.2153565883636475,
"logits/rejected": -1.0524095296859741,
"logps/chosen": -546.4449462890625,
"logps/rejected": -655.5341186523438,
"loss": 0.4637,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.9415435791015625,
"rewards/margins": 1.2188594341278076,
"rewards/rejected": -4.160403251647949,
"step": 2980
},
{
"epoch": 0.78,
"grad_norm": 9.5,
"learning_rate": 6.867999675225523e-07,
"logits/chosen": -1.2460225820541382,
"logits/rejected": -1.1109936237335205,
"logps/chosen": -518.8594970703125,
"logps/rejected": -621.4867553710938,
"loss": 0.4754,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.8794169425964355,
"rewards/margins": 1.1684167385101318,
"rewards/rejected": -4.047833442687988,
"step": 2990
},
{
"epoch": 0.79,
"grad_norm": 10.375,
"learning_rate": 6.711484147823663e-07,
"logits/chosen": -1.1477627754211426,
"logits/rejected": -1.0689526796340942,
"logps/chosen": -520.4979858398438,
"logps/rejected": -650.7647094726562,
"loss": 0.4592,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.862730026245117,
"rewards/margins": 1.2541263103485107,
"rewards/rejected": -4.116856575012207,
"step": 3000
},
{
"epoch": 0.79,
"eval_logits/chosen": -1.0787907838821411,
"eval_logits/rejected": -0.9509702324867249,
"eval_logps/chosen": -563.9876708984375,
"eval_logps/rejected": -655.9471435546875,
"eval_loss": 0.48733198642730713,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -2.993356466293335,
"eval_rewards/margins": 1.1200352907180786,
"eval_rewards/rejected": -4.113391399383545,
"eval_runtime": 382.8007,
"eval_samples_per_second": 5.225,
"eval_steps_per_second": 0.653,
"step": 3000
},
{
"epoch": 0.79,
"grad_norm": 11.25,
"learning_rate": 6.556495706232413e-07,
"logits/chosen": -1.1598658561706543,
"logits/rejected": -1.0877033472061157,
"logps/chosen": -578.8084106445312,
"logps/rejected": -665.4705200195312,
"loss": 0.5453,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.1011300086975098,
"rewards/margins": 1.050903081893921,
"rewards/rejected": -4.152032852172852,
"step": 3010
},
{
"epoch": 0.79,
"grad_norm": 9.8125,
"learning_rate": 6.403047291942057e-07,
"logits/chosen": -1.0840625762939453,
"logits/rejected": -0.9331427812576294,
"logps/chosen": -521.8424682617188,
"logps/rejected": -612.9337768554688,
"loss": 0.495,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.95574688911438,
"rewards/margins": 1.1547616720199585,
"rewards/rejected": -4.110508441925049,
"step": 3020
},
{
"epoch": 0.79,
"grad_norm": 12.375,
"learning_rate": 6.251151717851023e-07,
"logits/chosen": -1.1582403182983398,
"logits/rejected": -1.0655838251113892,
"logps/chosen": -526.1175537109375,
"logps/rejected": -627.6626586914062,
"loss": 0.4861,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.9271697998046875,
"rewards/margins": 1.1482912302017212,
"rewards/rejected": -4.075460910797119,
"step": 3030
},
{
"epoch": 0.8,
"grad_norm": 6.25,
"learning_rate": 6.100821667196041e-07,
"logits/chosen": -1.323209524154663,
"logits/rejected": -1.0637619495391846,
"logps/chosen": -561.310791015625,
"logps/rejected": -609.7546997070312,
"loss": 0.4726,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.856724500656128,
"rewards/margins": 1.1393463611602783,
"rewards/rejected": -3.9960708618164062,
"step": 3040
},
{
"epoch": 0.8,
"grad_norm": 55.5,
"learning_rate": 5.952069692493062e-07,
"logits/chosen": -1.1378008127212524,
"logits/rejected": -1.033092737197876,
"logps/chosen": -511.969482421875,
"logps/rejected": -648.4796752929688,
"loss": 0.4149,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.8365421295166016,
"rewards/margins": 1.3306509256362915,
"rewards/rejected": -4.1671929359436035,
"step": 3050
},
{
"epoch": 0.8,
"grad_norm": 10.5625,
"learning_rate": 5.80490821448918e-07,
"logits/chosen": -1.1030110120773315,
"logits/rejected": -1.0928280353546143,
"logps/chosen": -549.79052734375,
"logps/rejected": -727.48876953125,
"loss": 0.4284,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.8726837635040283,
"rewards/margins": 1.3525440692901611,
"rewards/rejected": -4.225228309631348,
"step": 3060
},
{
"epoch": 0.8,
"grad_norm": 9.0625,
"learning_rate": 5.659349521125459e-07,
"logits/chosen": -1.2849022150039673,
"logits/rejected": -1.2295571565628052,
"logps/chosen": -560.9410400390625,
"logps/rejected": -645.2173461914062,
"loss": 0.4973,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.7952258586883545,
"rewards/margins": 1.048758864402771,
"rewards/rejected": -3.843984603881836,
"step": 3070
},
{
"epoch": 0.81,
"grad_norm": 6.90625,
"learning_rate": 5.5154057665109e-07,
"logits/chosen": -1.2467188835144043,
"logits/rejected": -1.0997190475463867,
"logps/chosen": -557.9779052734375,
"logps/rejected": -661.7819213867188,
"loss": 0.4889,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.9708826541900635,
"rewards/margins": 1.3023018836975098,
"rewards/rejected": -4.273184776306152,
"step": 3080
},
{
"epoch": 0.81,
"grad_norm": 8.25,
"learning_rate": 5.373088969907586e-07,
"logits/chosen": -1.2789522409439087,
"logits/rejected": -1.0984174013137817,
"logps/chosen": -573.76123046875,
"logps/rejected": -637.1810302734375,
"loss": 0.4581,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.969475269317627,
"rewards/margins": 1.136474370956421,
"rewards/rejected": -4.105949878692627,
"step": 3090
},
{
"epoch": 0.81,
"grad_norm": 8.625,
"learning_rate": 5.23241101472709e-07,
"logits/chosen": -1.1879446506500244,
"logits/rejected": -1.0638211965560913,
"logps/chosen": -563.8876342773438,
"logps/rejected": -645.8259887695312,
"loss": 0.4905,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.8758597373962402,
"rewards/margins": 1.0394397974014282,
"rewards/rejected": -3.9152991771698,
"step": 3100
},
{
"epoch": 0.81,
"eval_logits/chosen": -1.0740700960159302,
"eval_logits/rejected": -0.9464629888534546,
"eval_logps/chosen": -562.904296875,
"eval_logps/rejected": -656.5853271484375,
"eval_loss": 0.48781928420066833,
"eval_rewards/accuracies": 0.7429999709129333,
"eval_rewards/chosen": -2.982522964477539,
"eval_rewards/margins": 1.1372504234313965,
"eval_rewards/rejected": -4.1197733879089355,
"eval_runtime": 382.0441,
"eval_samples_per_second": 5.235,
"eval_steps_per_second": 0.654,
"step": 3100
},
{
"epoch": 0.81,
"grad_norm": 8.5625,
"learning_rate": 5.09338364753818e-07,
"logits/chosen": -1.2681617736816406,
"logits/rejected": -1.0949214696884155,
"logps/chosen": -578.9161376953125,
"logps/rejected": -673.3041381835938,
"loss": 0.5304,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.9349396228790283,
"rewards/margins": 1.1018182039260864,
"rewards/rejected": -4.036757469177246,
"step": 3110
},
{
"epoch": 0.82,
"grad_norm": 10.125,
"learning_rate": 4.956018477086005e-07,
"logits/chosen": -1.2264713048934937,
"logits/rejected": -1.0714534521102905,
"logps/chosen": -574.7757568359375,
"logps/rejected": -661.6316528320312,
"loss": 0.5111,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.056429624557495,
"rewards/margins": 1.1420024633407593,
"rewards/rejected": -4.198431968688965,
"step": 3120
},
{
"epoch": 0.82,
"grad_norm": 11.625,
"learning_rate": 4.820326973322764e-07,
"logits/chosen": -1.1282936334609985,
"logits/rejected": -1.0485918521881104,
"logps/chosen": -566.1331787109375,
"logps/rejected": -665.1694946289062,
"loss": 0.5658,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.1516964435577393,
"rewards/margins": 1.0504977703094482,
"rewards/rejected": -4.202193737030029,
"step": 3130
},
{
"epoch": 0.82,
"grad_norm": 10.25,
"learning_rate": 4.686320466449981e-07,
"logits/chosen": -1.1074498891830444,
"logits/rejected": -0.9338695406913757,
"logps/chosen": -530.6743774414062,
"logps/rejected": -670.0709838867188,
"loss": 0.4495,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.914252281188965,
"rewards/margins": 1.4155068397521973,
"rewards/rejected": -4.329759120941162,
"step": 3140
},
{
"epoch": 0.82,
"grad_norm": 8.1875,
"learning_rate": 4.554010145972418e-07,
"logits/chosen": -1.2932242155075073,
"logits/rejected": -1.10805344581604,
"logps/chosen": -569.38818359375,
"logps/rejected": -671.8726806640625,
"loss": 0.551,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.0557217597961426,
"rewards/margins": 1.1381008625030518,
"rewards/rejected": -4.193822860717773,
"step": 3150
},
{
"epoch": 0.83,
"grad_norm": 8.75,
"learning_rate": 4.4234070597637455e-07,
"logits/chosen": -1.1201348304748535,
"logits/rejected": -1.0320645570755005,
"logps/chosen": -575.7613525390625,
"logps/rejected": -669.4164428710938,
"loss": 0.5272,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.0022788047790527,
"rewards/margins": 1.0558512210845947,
"rewards/rejected": -4.058130264282227,
"step": 3160
},
{
"epoch": 0.83,
"grad_norm": 6.5625,
"learning_rate": 4.2945221131440783e-07,
"logits/chosen": -1.114639401435852,
"logits/rejected": -0.9161049127578735,
"logps/chosen": -552.2017211914062,
"logps/rejected": -653.031005859375,
"loss": 0.4203,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -2.864920139312744,
"rewards/margins": 1.3247652053833008,
"rewards/rejected": -4.189684867858887,
"step": 3170
},
{
"epoch": 0.83,
"grad_norm": 9.25,
"learning_rate": 4.167366067969381e-07,
"logits/chosen": -1.216722846031189,
"logits/rejected": -1.144590139389038,
"logps/chosen": -516.3905639648438,
"logps/rejected": -639.3621826171875,
"loss": 0.4982,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9348020553588867,
"rewards/margins": 0.9914267659187317,
"rewards/rejected": -3.9262290000915527,
"step": 3180
},
{
"epoch": 0.83,
"grad_norm": 6.125,
"learning_rate": 4.041949541732826e-07,
"logits/chosen": -1.1988582611083984,
"logits/rejected": -1.1241180896759033,
"logps/chosen": -567.2083740234375,
"logps/rejected": -659.522216796875,
"loss": 0.5194,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.033240795135498,
"rewards/margins": 1.0658702850341797,
"rewards/rejected": -4.0991106033325195,
"step": 3190
},
{
"epoch": 0.84,
"grad_norm": 10.1875,
"learning_rate": 3.9182830066782614e-07,
"logits/chosen": -1.1077312231063843,
"logits/rejected": -1.0953607559204102,
"logps/chosen": -557.6238403320312,
"logps/rejected": -689.794921875,
"loss": 0.485,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.9987998008728027,
"rewards/margins": 1.1976327896118164,
"rewards/rejected": -4.196433067321777,
"step": 3200
},
{
"epoch": 0.84,
"eval_logits/chosen": -1.0807329416275024,
"eval_logits/rejected": -0.9531368613243103,
"eval_logps/chosen": -559.239990234375,
"eval_logps/rejected": -652.1516723632812,
"eval_loss": 0.4873969852924347,
"eval_rewards/accuracies": 0.7455000281333923,
"eval_rewards/chosen": -2.9458799362182617,
"eval_rewards/margins": 1.1295573711395264,
"eval_rewards/rejected": -4.075437068939209,
"eval_runtime": 381.6886,
"eval_samples_per_second": 5.24,
"eval_steps_per_second": 0.655,
"step": 3200
},
{
"epoch": 0.84,
"grad_norm": 8.3125,
"learning_rate": 3.796376788925771e-07,
"logits/chosen": -1.1163936853408813,
"logits/rejected": -1.0554332733154297,
"logps/chosen": -541.477294921875,
"logps/rejected": -619.0269165039062,
"loss": 0.4946,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.800494909286499,
"rewards/margins": 1.0160177946090698,
"rewards/rejected": -3.8165130615234375,
"step": 3210
},
{
"epoch": 0.84,
"grad_norm": 7.625,
"learning_rate": 3.676241067609465e-07,
"logits/chosen": -1.2064073085784912,
"logits/rejected": -1.0841269493103027,
"logps/chosen": -582.91259765625,
"logps/rejected": -648.9725952148438,
"loss": 0.5138,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.892125129699707,
"rewards/margins": 1.091489315032959,
"rewards/rejected": -3.983614444732666,
"step": 3220
},
{
"epoch": 0.85,
"grad_norm": 15.1875,
"learning_rate": 3.5578858740274976e-07,
"logits/chosen": -1.123425841331482,
"logits/rejected": -1.0302746295928955,
"logps/chosen": -566.611328125,
"logps/rejected": -648.7924194335938,
"loss": 0.5326,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.048774003982544,
"rewards/margins": 0.9477185010910034,
"rewards/rejected": -3.996492385864258,
"step": 3230
},
{
"epoch": 0.85,
"grad_norm": 11.5625,
"learning_rate": 3.44132109080447e-07,
"logits/chosen": -1.3182079792022705,
"logits/rejected": -1.1424782276153564,
"logps/chosen": -549.4573364257812,
"logps/rejected": -634.7244873046875,
"loss": 0.4425,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.8423376083374023,
"rewards/margins": 1.2275350093841553,
"rewards/rejected": -4.069872856140137,
"step": 3240
},
{
"epoch": 0.85,
"grad_norm": 12.125,
"learning_rate": 3.3265564510662344e-07,
"logits/chosen": -1.2581889629364014,
"logits/rejected": -1.1089788675308228,
"logps/chosen": -572.9723510742188,
"logps/rejected": -676.4666137695312,
"loss": 0.4207,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.796302318572998,
"rewards/margins": 1.2862600088119507,
"rewards/rejected": -4.082562446594238,
"step": 3250
},
{
"epoch": 0.85,
"grad_norm": 14.3125,
"learning_rate": 3.213601537627195e-07,
"logits/chosen": -1.1619012355804443,
"logits/rejected": -1.0473229885101318,
"logps/chosen": -574.4371948242188,
"logps/rejected": -662.361083984375,
"loss": 0.5456,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.1731839179992676,
"rewards/margins": 1.0502443313598633,
"rewards/rejected": -4.223428249359131,
"step": 3260
},
{
"epoch": 0.86,
"grad_norm": 12.0,
"learning_rate": 3.1024657821901063e-07,
"logits/chosen": -1.2181814908981323,
"logits/rejected": -1.1487758159637451,
"logps/chosen": -531.4067993164062,
"logps/rejected": -627.6771240234375,
"loss": 0.5005,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8366494178771973,
"rewards/margins": 1.1211111545562744,
"rewards/rejected": -3.9577605724334717,
"step": 3270
},
{
"epoch": 0.86,
"grad_norm": 14.25,
"learning_rate": 2.9931584645585654e-07,
"logits/chosen": -1.147289514541626,
"logits/rejected": -1.1335127353668213,
"logps/chosen": -557.3380737304688,
"logps/rejected": -666.0869140625,
"loss": 0.5042,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.828235626220703,
"rewards/margins": 1.0581908226013184,
"rewards/rejected": -3.8864264488220215,
"step": 3280
},
{
"epoch": 0.86,
"grad_norm": 6.96875,
"learning_rate": 2.885688711862136e-07,
"logits/chosen": -1.1895829439163208,
"logits/rejected": -1.1866552829742432,
"logps/chosen": -561.8271484375,
"logps/rejected": -686.0377197265625,
"loss": 0.51,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.007755756378174,
"rewards/margins": 1.261817216873169,
"rewards/rejected": -4.269573211669922,
"step": 3290
},
{
"epoch": 0.86,
"grad_norm": 7.6875,
"learning_rate": 2.7800654977942486e-07,
"logits/chosen": -1.1794744729995728,
"logits/rejected": -1.0672075748443604,
"logps/chosen": -547.8685302734375,
"logps/rejected": -650.7493286132812,
"loss": 0.5157,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.9051201343536377,
"rewards/margins": 1.0670777559280396,
"rewards/rejected": -3.972198009490967,
"step": 3300
},
{
"epoch": 0.86,
"eval_logits/chosen": -1.0755009651184082,
"eval_logits/rejected": -0.9480787515640259,
"eval_logps/chosen": -560.1488647460938,
"eval_logps/rejected": -652.9912109375,
"eval_loss": 0.4874354600906372,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -2.9549689292907715,
"eval_rewards/margins": 1.128864049911499,
"eval_rewards/rejected": -4.083832740783691,
"eval_runtime": 383.0008,
"eval_samples_per_second": 5.222,
"eval_steps_per_second": 0.653,
"step": 3300
},
{
"epoch": 0.87,
"grad_norm": 12.75,
"learning_rate": 2.6762976418628797e-07,
"logits/chosen": -1.1829874515533447,
"logits/rejected": -1.0443121194839478,
"logps/chosen": -508.328857421875,
"logps/rejected": -573.6398315429688,
"loss": 0.5093,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.785529851913452,
"rewards/margins": 1.0893114805221558,
"rewards/rejected": -3.8748409748077393,
"step": 3310
},
{
"epoch": 0.87,
"grad_norm": 9.375,
"learning_rate": 2.5743938086541354e-07,
"logits/chosen": -1.1776726245880127,
"logits/rejected": -1.0596325397491455,
"logps/chosen": -558.5306396484375,
"logps/rejected": -649.6300048828125,
"loss": 0.4969,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.942483425140381,
"rewards/margins": 1.159317135810852,
"rewards/rejected": -4.101800441741943,
"step": 3320
},
{
"epoch": 0.87,
"grad_norm": 13.0625,
"learning_rate": 2.4743625071087574e-07,
"logits/chosen": -1.3345047235488892,
"logits/rejected": -1.1562585830688477,
"logps/chosen": -557.7296142578125,
"logps/rejected": -661.87109375,
"loss": 0.4702,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.8015923500061035,
"rewards/margins": 1.3364170789718628,
"rewards/rejected": -4.138009548187256,
"step": 3330
},
{
"epoch": 0.87,
"grad_norm": 11.875,
"learning_rate": 2.3762120898116498e-07,
"logits/chosen": -1.1994738578796387,
"logits/rejected": -1.097899079322815,
"logps/chosen": -579.8328857421875,
"logps/rejected": -674.6861572265625,
"loss": 0.4926,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.1103997230529785,
"rewards/margins": 1.0284258127212524,
"rewards/rejected": -4.138825416564941,
"step": 3340
},
{
"epoch": 0.88,
"grad_norm": 8.0,
"learning_rate": 2.2799507522944048e-07,
"logits/chosen": -1.1523630619049072,
"logits/rejected": -1.0521692037582397,
"logps/chosen": -551.5980224609375,
"logps/rejected": -671.2841186523438,
"loss": 0.4455,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.8580057621002197,
"rewards/margins": 1.2792617082595825,
"rewards/rejected": -4.137267112731934,
"step": 3350
},
{
"epoch": 0.88,
"grad_norm": 8.5625,
"learning_rate": 2.1855865323510056e-07,
"logits/chosen": -1.2028191089630127,
"logits/rejected": -1.0033330917358398,
"logps/chosen": -563.6111450195312,
"logps/rejected": -704.59228515625,
"loss": 0.4213,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.9093270301818848,
"rewards/margins": 1.4688284397125244,
"rewards/rejected": -4.378155708312988,
"step": 3360
},
{
"epoch": 0.88,
"grad_norm": 7.3125,
"learning_rate": 2.0931273093666575e-07,
"logits/chosen": -1.1482703685760498,
"logits/rejected": -1.0027369260787964,
"logps/chosen": -540.7926635742188,
"logps/rejected": -644.9227294921875,
"loss": 0.439,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.006060838699341,
"rewards/margins": 1.2438604831695557,
"rewards/rejected": -4.2499213218688965,
"step": 3370
},
{
"epoch": 0.88,
"grad_norm": 13.625,
"learning_rate": 2.002580803659873e-07,
"logits/chosen": -1.1630356311798096,
"logits/rejected": -1.0312206745147705,
"logps/chosen": -559.203125,
"logps/rejected": -652.8720092773438,
"loss": 0.4651,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.046480417251587,
"rewards/margins": 1.1182465553283691,
"rewards/rejected": -4.164727210998535,
"step": 3380
},
{
"epoch": 0.89,
"grad_norm": 6.71875,
"learning_rate": 1.913954575837826e-07,
"logits/chosen": -1.2169429063796997,
"logits/rejected": -0.9856022596359253,
"logps/chosen": -575.2197875976562,
"logps/rejected": -634.4151000976562,
"loss": 0.4808,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.0310537815093994,
"rewards/margins": 1.0889911651611328,
"rewards/rejected": -4.120044708251953,
"step": 3390
},
{
"epoch": 0.89,
"grad_norm": 10.3125,
"learning_rate": 1.827256026165028e-07,
"logits/chosen": -1.2307440042495728,
"logits/rejected": -1.0502979755401611,
"logps/chosen": -592.2626342773438,
"logps/rejected": -664.5699462890625,
"loss": 0.4474,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.771921396255493,
"rewards/margins": 1.2936856746673584,
"rewards/rejected": -4.065607070922852,
"step": 3400
},
{
"epoch": 0.89,
"eval_logits/chosen": -1.077279806137085,
"eval_logits/rejected": -0.9499141573905945,
"eval_logps/chosen": -561.6380615234375,
"eval_logps/rejected": -654.8016967773438,
"eval_loss": 0.4871050715446472,
"eval_rewards/accuracies": 0.7434999942779541,
"eval_rewards/chosen": -2.969860553741455,
"eval_rewards/margins": 1.1320772171020508,
"eval_rewards/rejected": -4.101937770843506,
"eval_runtime": 382.1089,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 3400
},
{
"epoch": 0.89,
"grad_norm": 11.5625,
"learning_rate": 1.7424923939454274e-07,
"logits/chosen": -1.174843430519104,
"logits/rejected": -1.0021690130233765,
"logps/chosen": -579.2442626953125,
"logps/rejected": -661.9432373046875,
"loss": 0.4255,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.9789488315582275,
"rewards/margins": 1.2606755495071411,
"rewards/rejected": -4.239624500274658,
"step": 3410
},
{
"epoch": 0.9,
"grad_norm": 16.25,
"learning_rate": 1.6596707569179304e-07,
"logits/chosen": -1.2912896871566772,
"logits/rejected": -1.1392004489898682,
"logps/chosen": -576.8416748046875,
"logps/rejected": -653.64501953125,
"loss": 0.4901,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.956543207168579,
"rewards/margins": 1.119319200515747,
"rewards/rejected": -4.075861930847168,
"step": 3420
},
{
"epoch": 0.9,
"grad_norm": 9.625,
"learning_rate": 1.578798030665385e-07,
"logits/chosen": -1.2196199893951416,
"logits/rejected": -1.0388673543930054,
"logps/chosen": -565.8033447265625,
"logps/rejected": -686.4707641601562,
"loss": 0.4313,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.9110517501831055,
"rewards/margins": 1.3773367404937744,
"rewards/rejected": -4.288388729095459,
"step": 3430
},
{
"epoch": 0.9,
"grad_norm": 8.25,
"learning_rate": 1.499880968037165e-07,
"logits/chosen": -1.1975353956222534,
"logits/rejected": -1.0588737726211548,
"logps/chosen": -544.4766845703125,
"logps/rejected": -618.7376098632812,
"loss": 0.513,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.888631820678711,
"rewards/margins": 1.1052820682525635,
"rewards/rejected": -3.9939143657684326,
"step": 3440
},
{
"epoch": 0.9,
"grad_norm": 14.5625,
"learning_rate": 1.4229261585852805e-07,
"logits/chosen": -1.230802297592163,
"logits/rejected": -1.1439770460128784,
"logps/chosen": -553.6980590820312,
"logps/rejected": -644.7520751953125,
"loss": 0.4489,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.8526644706726074,
"rewards/margins": 1.1686475276947021,
"rewards/rejected": -4.0213117599487305,
"step": 3450
},
{
"epoch": 0.91,
"grad_norm": 10.6875,
"learning_rate": 1.3479400280141886e-07,
"logits/chosen": -1.1431211233139038,
"logits/rejected": -1.1035680770874023,
"logps/chosen": -544.6209106445312,
"logps/rejected": -662.7022705078125,
"loss": 0.4784,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.9836173057556152,
"rewards/margins": 1.2009574174880981,
"rewards/rejected": -4.184575080871582,
"step": 3460
},
{
"epoch": 0.91,
"grad_norm": 9.625,
"learning_rate": 1.2749288376442044e-07,
"logits/chosen": -1.2415331602096558,
"logits/rejected": -1.0389716625213623,
"logps/chosen": -587.2742309570312,
"logps/rejected": -644.300048828125,
"loss": 0.4742,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.9073996543884277,
"rewards/margins": 1.1453628540039062,
"rewards/rejected": -4.052762508392334,
"step": 3470
},
{
"epoch": 0.91,
"grad_norm": 8.875,
"learning_rate": 1.203898683888713e-07,
"logits/chosen": -1.2313424348831177,
"logits/rejected": -1.1037070751190186,
"logps/chosen": -548.36962890625,
"logps/rejected": -643.1497192382812,
"loss": 0.5615,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.043722152709961,
"rewards/margins": 0.9787699580192566,
"rewards/rejected": -4.022491931915283,
"step": 3480
},
{
"epoch": 0.91,
"grad_norm": 8.5625,
"learning_rate": 1.1348554977451132e-07,
"logits/chosen": -1.2611653804779053,
"logits/rejected": -1.1225281953811646,
"logps/chosen": -574.4703369140625,
"logps/rejected": -650.3907470703125,
"loss": 0.495,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.924811840057373,
"rewards/margins": 1.0881900787353516,
"rewards/rejected": -4.013001918792725,
"step": 3490
},
{
"epoch": 0.92,
"grad_norm": 6.875,
"learning_rate": 1.0678050442995802e-07,
"logits/chosen": -1.2225737571716309,
"logits/rejected": -1.0173273086547852,
"logps/chosen": -580.7540283203125,
"logps/rejected": -643.2467651367188,
"loss": 0.5379,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.9996070861816406,
"rewards/margins": 1.0754629373550415,
"rewards/rejected": -4.075070381164551,
"step": 3500
},
{
"epoch": 0.92,
"eval_logits/chosen": -1.074249267578125,
"eval_logits/rejected": -0.9468256831169128,
"eval_logps/chosen": -561.2808227539062,
"eval_logps/rejected": -654.5006103515625,
"eval_loss": 0.48737701773643494,
"eval_rewards/accuracies": 0.7429999709129333,
"eval_rewards/chosen": -2.9662883281707764,
"eval_rewards/margins": 1.1326382160186768,
"eval_rewards/rejected": -4.098926544189453,
"eval_runtime": 382.1229,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 3500
},
{
"epoch": 0.92,
"grad_norm": 9.1875,
"learning_rate": 1.0027529222456755e-07,
"logits/chosen": -1.1973202228546143,
"logits/rejected": -1.0237270593643188,
"logps/chosen": -544.4231567382812,
"logps/rejected": -646.541015625,
"loss": 0.4368,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.902569532394409,
"rewards/margins": 1.1967476606369019,
"rewards/rejected": -4.0993170738220215,
"step": 3510
},
{
"epoch": 0.92,
"grad_norm": 8.4375,
"learning_rate": 9.397045634168766e-08,
"logits/chosen": -1.227426290512085,
"logits/rejected": -1.1496341228485107,
"logps/chosen": -555.9089965820312,
"logps/rejected": -687.0352783203125,
"loss": 0.4491,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.873260974884033,
"rewards/margins": 1.3088066577911377,
"rewards/rejected": -4.182066917419434,
"step": 3520
},
{
"epoch": 0.92,
"grad_norm": 10.9375,
"learning_rate": 8.78665232332998e-08,
"logits/chosen": -1.1654760837554932,
"logits/rejected": -1.0858592987060547,
"logps/chosen": -537.4627685546875,
"logps/rejected": -640.0810546875,
"loss": 0.489,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.0675268173217773,
"rewards/margins": 1.0338232517242432,
"rewards/rejected": -4.101349830627441,
"step": 3530
},
{
"epoch": 0.93,
"grad_norm": 8.1875,
"learning_rate": 8.196400257606208e-08,
"logits/chosen": -1.2670751810073853,
"logits/rejected": -1.104811191558838,
"logps/chosen": -576.2312622070312,
"logps/rejected": -708.0988159179688,
"loss": 0.4292,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.9314615726470947,
"rewards/margins": 1.371382236480713,
"rewards/rejected": -4.3028435707092285,
"step": 3540
},
{
"epoch": 0.93,
"grad_norm": 9.5,
"learning_rate": 7.626338722875076e-08,
"logits/chosen": -1.1996467113494873,
"logits/rejected": -1.1349631547927856,
"logps/chosen": -546.021240234375,
"logps/rejected": -657.2860107421875,
"loss": 0.503,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.9192748069763184,
"rewards/margins": 1.102920651435852,
"rewards/rejected": -4.022195816040039,
"step": 3550
},
{
"epoch": 0.93,
"grad_norm": 5.84375,
"learning_rate": 7.076515319110688e-08,
"logits/chosen": -1.2043834924697876,
"logits/rejected": -1.0872790813446045,
"logps/chosen": -546.2125854492188,
"logps/rejected": -628.4691162109375,
"loss": 0.5091,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.899247646331787,
"rewards/margins": 1.2382572889328003,
"rewards/rejected": -4.1375041007995605,
"step": 3560
},
{
"epoch": 0.93,
"grad_norm": 8.1875,
"learning_rate": 6.54697595640899e-08,
"logits/chosen": -1.2246639728546143,
"logits/rejected": -1.1050646305084229,
"logps/chosen": -588.5670166015625,
"logps/rejected": -679.2962646484375,
"loss": 0.4803,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.9634485244750977,
"rewards/margins": 1.1589770317077637,
"rewards/rejected": -4.1224260330200195,
"step": 3570
},
{
"epoch": 0.94,
"grad_norm": 9.125,
"learning_rate": 6.037764851154426e-08,
"logits/chosen": -1.2126811742782593,
"logits/rejected": -1.1511167287826538,
"logps/chosen": -555.2306518554688,
"logps/rejected": -671.8084716796875,
"loss": 0.5096,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.911715030670166,
"rewards/margins": 1.1182584762573242,
"rewards/rejected": -4.029973030090332,
"step": 3580
},
{
"epoch": 0.94,
"grad_norm": 7.1875,
"learning_rate": 5.548924522327748e-08,
"logits/chosen": -1.1890180110931396,
"logits/rejected": -1.0672190189361572,
"logps/chosen": -549.8150634765625,
"logps/rejected": -647.8394775390625,
"loss": 0.4832,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.880577564239502,
"rewards/margins": 1.1271222829818726,
"rewards/rejected": -4.007699489593506,
"step": 3590
},
{
"epoch": 0.94,
"grad_norm": 11.0625,
"learning_rate": 5.0804957879556915e-08,
"logits/chosen": -1.109243392944336,
"logits/rejected": -1.0201053619384766,
"logps/chosen": -514.1246337890625,
"logps/rejected": -630.8916625976562,
"loss": 0.464,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.8855831623077393,
"rewards/margins": 1.122081995010376,
"rewards/rejected": -4.007665157318115,
"step": 3600
},
{
"epoch": 0.94,
"eval_logits/chosen": -1.0748145580291748,
"eval_logits/rejected": -0.9474833607673645,
"eval_logps/chosen": -561.028564453125,
"eval_logps/rejected": -654.279052734375,
"eval_loss": 0.48736903071403503,
"eval_rewards/accuracies": 0.7425000071525574,
"eval_rewards/chosen": -2.9637651443481445,
"eval_rewards/margins": 1.1329458951950073,
"eval_rewards/rejected": -4.096711158752441,
"eval_runtime": 382.7111,
"eval_samples_per_second": 5.226,
"eval_steps_per_second": 0.653,
"step": 3600
},
{
"epoch": 0.94,
"grad_norm": 9.125,
"learning_rate": 4.632517761702815e-08,
"logits/chosen": -1.1433720588684082,
"logits/rejected": -1.0008645057678223,
"logps/chosen": -530.3574829101562,
"logps/rejected": -652.87255859375,
"loss": 0.4428,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.9491429328918457,
"rewards/margins": 1.3483526706695557,
"rewards/rejected": -4.2974958419799805,
"step": 3610
},
{
"epoch": 0.95,
"grad_norm": 11.125,
"learning_rate": 4.205027849605359e-08,
"logits/chosen": -1.1681492328643799,
"logits/rejected": -1.0669422149658203,
"logps/chosen": -553.4034423828125,
"logps/rejected": -626.2314453125,
"loss": 0.5421,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.0548007488250732,
"rewards/margins": 1.0290553569793701,
"rewards/rejected": -4.083855628967285,
"step": 3620
},
{
"epoch": 0.95,
"grad_norm": 9.9375,
"learning_rate": 3.798061746947995e-08,
"logits/chosen": -1.2855480909347534,
"logits/rejected": -1.1476643085479736,
"logps/chosen": -555.6473999023438,
"logps/rejected": -633.9293823242188,
"loss": 0.4785,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.9149746894836426,
"rewards/margins": 1.1746852397918701,
"rewards/rejected": -4.089660167694092,
"step": 3630
},
{
"epoch": 0.95,
"grad_norm": 10.25,
"learning_rate": 3.411653435283158e-08,
"logits/chosen": -1.1988470554351807,
"logits/rejected": -0.9911161661148071,
"logps/chosen": -560.5934448242188,
"logps/rejected": -617.925048828125,
"loss": 0.4611,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.873368740081787,
"rewards/margins": 1.1307556629180908,
"rewards/rejected": -4.004124641418457,
"step": 3640
},
{
"epoch": 0.96,
"grad_norm": 9.5,
"learning_rate": 3.04583517959367e-08,
"logits/chosen": -1.2440365552902222,
"logits/rejected": -1.0937076807022095,
"logps/chosen": -528.578125,
"logps/rejected": -617.3880004882812,
"loss": 0.448,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.7646141052246094,
"rewards/margins": 1.2126356363296509,
"rewards/rejected": -3.9772496223449707,
"step": 3650
},
{
"epoch": 0.96,
"grad_norm": 10.0,
"learning_rate": 2.7006375255985984e-08,
"logits/chosen": -1.1879501342773438,
"logits/rejected": -1.1580005884170532,
"logps/chosen": -571.6791381835938,
"logps/rejected": -661.9193725585938,
"loss": 0.5788,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.06877064704895,
"rewards/margins": 0.8969556093215942,
"rewards/rejected": -3.965726375579834,
"step": 3660
},
{
"epoch": 0.96,
"grad_norm": 11.625,
"learning_rate": 2.3760892972027328e-08,
"logits/chosen": -1.303144931793213,
"logits/rejected": -1.1418662071228027,
"logps/chosen": -583.8892822265625,
"logps/rejected": -663.1383666992188,
"loss": 0.5326,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.113860845565796,
"rewards/margins": 1.1326040029525757,
"rewards/rejected": -4.246464729309082,
"step": 3670
},
{
"epoch": 0.96,
"grad_norm": 13.4375,
"learning_rate": 2.072217594089765e-08,
"logits/chosen": -1.156292200088501,
"logits/rejected": -1.146905541419983,
"logps/chosen": -559.3345336914062,
"logps/rejected": -672.4237060546875,
"loss": 0.4237,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.9779343605041504,
"rewards/margins": 1.253351092338562,
"rewards/rejected": -4.231285572052002,
"step": 3680
},
{
"epoch": 0.97,
"grad_norm": 8.9375,
"learning_rate": 1.789047789459375e-08,
"logits/chosen": -1.266901969909668,
"logits/rejected": -1.072322964668274,
"logps/chosen": -611.783203125,
"logps/rejected": -680.0989379882812,
"loss": 0.5071,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.9480648040771484,
"rewards/margins": 1.1735531091690063,
"rewards/rejected": -4.121617794036865,
"step": 3690
},
{
"epoch": 0.97,
"grad_norm": 8.9375,
"learning_rate": 1.5266035279088708e-08,
"logits/chosen": -1.1054164171218872,
"logits/rejected": -0.985053539276123,
"logps/chosen": -610.8778076171875,
"logps/rejected": -699.9169921875,
"loss": 0.4729,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.1426022052764893,
"rewards/margins": 1.1523752212524414,
"rewards/rejected": -4.29497766494751,
"step": 3700
},
{
"epoch": 0.97,
"eval_logits/chosen": -1.0769954919815063,
"eval_logits/rejected": -0.9495205879211426,
"eval_logps/chosen": -561.3129272460938,
"eval_logps/rejected": -654.6014404296875,
"eval_loss": 0.48729926347732544,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -2.966609239578247,
"eval_rewards/margins": 1.1333256959915161,
"eval_rewards/rejected": -4.099935054779053,
"eval_runtime": 382.1,
"eval_samples_per_second": 5.234,
"eval_steps_per_second": 0.654,
"step": 3700
},
{
"epoch": 0.97,
"grad_norm": 11.75,
"learning_rate": 1.2849067234584623e-08,
"logits/chosen": -1.0827583074569702,
"logits/rejected": -1.0175631046295166,
"logps/chosen": -534.8372192382812,
"logps/rejected": -647.8695678710938,
"loss": 0.4762,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.9847655296325684,
"rewards/margins": 1.2061764001846313,
"rewards/rejected": -4.190942287445068,
"step": 3710
},
{
"epoch": 0.97,
"grad_norm": 11.1875,
"learning_rate": 1.0639775577218625e-08,
"logits/chosen": -1.0798698663711548,
"logits/rejected": -0.9149328470230103,
"logps/chosen": -549.2965087890625,
"logps/rejected": -631.1814575195312,
"loss": 0.5133,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.001096248626709,
"rewards/margins": 1.178213119506836,
"rewards/rejected": -4.179308891296387,
"step": 3720
},
{
"epoch": 0.98,
"grad_norm": 9.5,
"learning_rate": 8.638344782207486e-09,
"logits/chosen": -1.1081641912460327,
"logits/rejected": -1.0127241611480713,
"logps/chosen": -530.3636474609375,
"logps/rejected": -619.5350341796875,
"loss": 0.4791,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.8507590293884277,
"rewards/margins": 1.1195757389068604,
"rewards/rejected": -3.97033429145813,
"step": 3730
},
{
"epoch": 0.98,
"grad_norm": 10.0,
"learning_rate": 6.84494196844715e-09,
"logits/chosen": -1.16922128200531,
"logits/rejected": -1.0506504774093628,
"logps/chosen": -563.3178100585938,
"logps/rejected": -685.6429443359375,
"loss": 0.4573,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.912113666534424,
"rewards/margins": 1.3391534090042114,
"rewards/rejected": -4.251267433166504,
"step": 3740
},
{
"epoch": 0.98,
"grad_norm": 10.1875,
"learning_rate": 5.259716884556121e-09,
"logits/chosen": -1.2230274677276611,
"logits/rejected": -1.0869773626327515,
"logps/chosen": -557.9898681640625,
"logps/rejected": -660.3572998046875,
"loss": 0.4564,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.9352307319641113,
"rewards/margins": 1.1718149185180664,
"rewards/rejected": -4.107045650482178,
"step": 3750
},
{
"epoch": 0.98,
"grad_norm": 9.75,
"learning_rate": 3.882801896372967e-09,
"logits/chosen": -1.2255470752716064,
"logits/rejected": -1.1375856399536133,
"logps/chosen": -556.98193359375,
"logps/rejected": -639.6429443359375,
"loss": 0.4908,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.91620135307312,
"rewards/margins": 1.1449532508850098,
"rewards/rejected": -4.061154842376709,
"step": 3760
},
{
"epoch": 0.99,
"grad_norm": 12.25,
"learning_rate": 2.7143119759026614e-09,
"logits/chosen": -1.242653727531433,
"logits/rejected": -1.0747482776641846,
"logps/chosen": -574.4716796875,
"logps/rejected": -665.8096313476562,
"loss": 0.4263,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.901991605758667,
"rewards/margins": 1.1683391332626343,
"rewards/rejected": -4.070330619812012,
"step": 3770
},
{
"epoch": 0.99,
"grad_norm": 9.4375,
"learning_rate": 1.754344691717591e-09,
"logits/chosen": -1.1282501220703125,
"logits/rejected": -1.0916457176208496,
"logps/chosen": -552.8446655273438,
"logps/rejected": -669.7666015625,
"loss": 0.5197,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.030531406402588,
"rewards/margins": 0.9483699798583984,
"rewards/rejected": -3.9789013862609863,
"step": 3780
},
{
"epoch": 0.99,
"grad_norm": 13.125,
"learning_rate": 1.0029802008096335e-09,
"logits/chosen": -1.1534841060638428,
"logits/rejected": -0.994836151599884,
"logps/chosen": -570.4867553710938,
"logps/rejected": -668.6637573242188,
"loss": 0.4803,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.9619479179382324,
"rewards/margins": 1.2098205089569092,
"rewards/rejected": -4.171768665313721,
"step": 3790
},
{
"epoch": 0.99,
"grad_norm": 8.5,
"learning_rate": 4.602812418974534e-10,
"logits/chosen": -1.2624783515930176,
"logits/rejected": -1.1238892078399658,
"logps/chosen": -582.1685180664062,
"logps/rejected": -673.0120239257812,
"loss": 0.5017,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.003277540206909,
"rewards/margins": 1.1538227796554565,
"rewards/rejected": -4.157099723815918,
"step": 3800
},
{
"epoch": 0.99,
"eval_logits/chosen": -1.0723599195480347,
"eval_logits/rejected": -0.9449748396873474,
"eval_logps/chosen": -561.3216552734375,
"eval_logps/rejected": -654.607177734375,
"eval_loss": 0.48731154203414917,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -2.966696262359619,
"eval_rewards/margins": 1.133296012878418,
"eval_rewards/rejected": -4.099992275238037,
"eval_runtime": 382.0182,
"eval_samples_per_second": 5.235,
"eval_steps_per_second": 0.654,
"step": 3800
},
{
"epoch": 1.0,
"grad_norm": 9.875,
"learning_rate": 1.2629313018819312e-10,
"logits/chosen": -1.171769142150879,
"logits/rejected": -1.0495896339416504,
"logps/chosen": -542.8326416015625,
"logps/rejected": -627.7073974609375,
"loss": 0.5191,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.9036014080047607,
"rewards/margins": 1.005274772644043,
"rewards/rejected": -3.9088759422302246,
"step": 3810
},
{
"epoch": 1.0,
"grad_norm": 18.25,
"learning_rate": 1.0437535929996855e-12,
"logits/chosen": -1.1617281436920166,
"logits/rejected": -0.9952475428581238,
"logps/chosen": -585.9136962890625,
"logps/rejected": -680.009521484375,
"loss": 0.4659,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.991471767425537,
"rewards/margins": 1.3875491619110107,
"rewards/rejected": -4.379020690917969,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 0.0,
"train_loss": 0.5021860111574015,
"train_runtime": 41123.41,
"train_samples_per_second": 1.487,
"train_steps_per_second": 0.093
}
],
"logging_steps": 10,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}