Qwen2-72B-Instruct-Step-DPO / trainer_state.json
xinlai's picture
upload model
ee76254
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.982222222222222,
"eval_steps": 1,
"global_step": 336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011851851851851851,
"grad_norm": 62.70548815519655,
"learning_rate": 1.4705882352941176e-08,
"logits/chosen": 0.030916133895516396,
"logits/rejected": 0.09742362797260284,
"logps/chosen": -40.58351516723633,
"logps/rejected": -58.42578887939453,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.023703703703703703,
"grad_norm": 67.3907670519946,
"learning_rate": 2.941176470588235e-08,
"logits/chosen": 0.15014928579330444,
"logits/rejected": 0.2673640847206116,
"logps/chosen": -31.35921859741211,
"logps/rejected": -54.71299743652344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.035555555555555556,
"grad_norm": 73.59075381908265,
"learning_rate": 4.411764705882353e-08,
"logits/chosen": 0.35403603315353394,
"logits/rejected": 0.3630790412425995,
"logps/chosen": -30.862504959106445,
"logps/rejected": -43.55963897705078,
"loss": 0.6991,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.023167992010712624,
"rewards/margins": 0.016683291643857956,
"rewards/rejected": 0.00648469990119338,
"step": 3
},
{
"epoch": 0.047407407407407405,
"grad_norm": 58.78689431688176,
"learning_rate": 5.88235294117647e-08,
"logits/chosen": 0.3042946457862854,
"logits/rejected": 0.25474676489830017,
"logps/chosen": -34.22315979003906,
"logps/rejected": -39.93827438354492,
"loss": 0.6941,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.021718814969062805,
"rewards/margins": -0.024322079494595528,
"rewards/rejected": 0.0026032691821455956,
"step": 4
},
{
"epoch": 0.05925925925925926,
"grad_norm": 59.402728518681464,
"learning_rate": 7.352941176470588e-08,
"logits/chosen": 0.20607078075408936,
"logits/rejected": 0.27008742094039917,
"logps/chosen": -40.86919403076172,
"logps/rejected": -51.17314910888672,
"loss": 0.6937,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.02542915567755699,
"rewards/margins": 0.01238556019961834,
"rewards/rejected": 0.013043595477938652,
"step": 5
},
{
"epoch": 0.07111111111111111,
"grad_norm": 60.3208791733097,
"learning_rate": 8.823529411764706e-08,
"logits/chosen": 0.39524325728416443,
"logits/rejected": 0.3147166669368744,
"logps/chosen": -45.889522552490234,
"logps/rejected": -47.271080017089844,
"loss": 0.6967,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.035939525812864304,
"rewards/margins": 0.035303808748722076,
"rewards/rejected": 0.0006357184611260891,
"step": 6
},
{
"epoch": 0.08296296296296296,
"grad_norm": 65.84343450368385,
"learning_rate": 1.0294117647058822e-07,
"logits/chosen": 0.19424982368946075,
"logits/rejected": 0.36947980523109436,
"logps/chosen": -32.91363525390625,
"logps/rejected": -43.79743194580078,
"loss": 0.7058,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.012993477284908295,
"rewards/margins": -0.015153911896049976,
"rewards/rejected": 0.028147388249635696,
"step": 7
},
{
"epoch": 0.09481481481481481,
"grad_norm": 68.52239761198476,
"learning_rate": 1.176470588235294e-07,
"logits/chosen": 0.1946137249469757,
"logits/rejected": 0.28064286708831787,
"logps/chosen": -32.246864318847656,
"logps/rejected": -41.628746032714844,
"loss": 0.6735,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.010772847570478916,
"rewards/margins": 0.011450938880443573,
"rewards/rejected": -0.022223783656954765,
"step": 8
},
{
"epoch": 0.10666666666666667,
"grad_norm": 77.93025419789252,
"learning_rate": 1.3235294117647057e-07,
"logits/chosen": 0.32008448243141174,
"logits/rejected": 0.21636219322681427,
"logps/chosen": -40.00132369995117,
"logps/rejected": -44.613426208496094,
"loss": 0.6976,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.008494901470839977,
"rewards/margins": 0.011597584001719952,
"rewards/rejected": -0.003102683462202549,
"step": 9
},
{
"epoch": 0.11851851851851852,
"grad_norm": 60.27023294648464,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": 0.011551467701792717,
"logits/rejected": 0.1401338428258896,
"logps/chosen": -35.68666076660156,
"logps/rejected": -47.44255065917969,
"loss": 0.6817,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.02254348061978817,
"rewards/margins": 0.029230808839201927,
"rewards/rejected": -0.0517742857336998,
"step": 10
},
{
"epoch": 0.13037037037037036,
"grad_norm": 70.77005341416117,
"learning_rate": 1.6176470588235293e-07,
"logits/chosen": 0.07894501090049744,
"logits/rejected": 0.09966235607862473,
"logps/chosen": -30.685501098632812,
"logps/rejected": -42.800785064697266,
"loss": 0.6773,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.006754852831363678,
"rewards/margins": 0.031156515702605247,
"rewards/rejected": -0.037911366671323776,
"step": 11
},
{
"epoch": 0.14222222222222222,
"grad_norm": 58.87977165275071,
"learning_rate": 1.764705882352941e-07,
"logits/chosen": 0.23514162003993988,
"logits/rejected": 0.2450232207775116,
"logps/chosen": -41.01308822631836,
"logps/rejected": -52.138641357421875,
"loss": 0.6775,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.05471659079194069,
"rewards/margins": -0.00840845424681902,
"rewards/rejected": -0.046308137476444244,
"step": 12
},
{
"epoch": 0.15407407407407409,
"grad_norm": 68.28764496281121,
"learning_rate": 1.9117647058823527e-07,
"logits/chosen": 0.13845381140708923,
"logits/rejected": 0.06714074313640594,
"logps/chosen": -36.72666549682617,
"logps/rejected": -44.98724365234375,
"loss": 0.6698,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.027201365679502487,
"rewards/margins": 0.0457880012691021,
"rewards/rejected": -0.07298936694860458,
"step": 13
},
{
"epoch": 0.16592592592592592,
"grad_norm": 67.0476123108747,
"learning_rate": 2.0588235294117645e-07,
"logits/chosen": 0.13570713996887207,
"logits/rejected": 0.02110590785741806,
"logps/chosen": -39.4144287109375,
"logps/rejected": -46.626033782958984,
"loss": 0.6694,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.05015880987048149,
"rewards/margins": 0.06351868808269501,
"rewards/rejected": -0.1136775016784668,
"step": 14
},
{
"epoch": 0.17777777777777778,
"grad_norm": 56.97224502598152,
"learning_rate": 2.2058823529411763e-07,
"logits/chosen": 0.14530636370182037,
"logits/rejected": 0.22717420756816864,
"logps/chosen": -33.9251823425293,
"logps/rejected": -47.67527770996094,
"loss": 0.6504,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07779295742511749,
"rewards/margins": 0.12380316108465195,
"rewards/rejected": -0.20159611105918884,
"step": 15
},
{
"epoch": 0.18962962962962962,
"grad_norm": 58.43872340752561,
"learning_rate": 2.352941176470588e-07,
"logits/chosen": 0.1471043974161148,
"logits/rejected": 0.26890620589256287,
"logps/chosen": -36.48509979248047,
"logps/rejected": -53.876888275146484,
"loss": 0.6503,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.12628403306007385,
"rewards/margins": 0.1476879119873047,
"rewards/rejected": -0.27397194504737854,
"step": 16
},
{
"epoch": 0.20148148148148148,
"grad_norm": 53.27712198861993,
"learning_rate": 2.5e-07,
"logits/chosen": 0.3565051555633545,
"logits/rejected": 0.25773561000823975,
"logps/chosen": -32.79841232299805,
"logps/rejected": -36.324119567871094,
"loss": 0.6414,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.06849764287471771,
"rewards/margins": 0.11378694325685501,
"rewards/rejected": -0.18228457868099213,
"step": 17
},
{
"epoch": 0.21333333333333335,
"grad_norm": 58.90356354190709,
"learning_rate": 2.6470588235294114e-07,
"logits/chosen": -0.1638399213552475,
"logits/rejected": -0.027556225657463074,
"logps/chosen": -35.214149475097656,
"logps/rejected": -54.174049377441406,
"loss": 0.6328,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.15428400039672852,
"rewards/margins": 0.18084901571273804,
"rewards/rejected": -0.33513307571411133,
"step": 18
},
{
"epoch": 0.22518518518518518,
"grad_norm": 54.674461580476404,
"learning_rate": 2.7941176470588235e-07,
"logits/chosen": 0.2881383001804352,
"logits/rejected": 0.32903048396110535,
"logps/chosen": -30.270973205566406,
"logps/rejected": -40.31577682495117,
"loss": 0.5957,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.22714650630950928,
"rewards/margins": 0.15492179989814758,
"rewards/rejected": -0.38206830620765686,
"step": 19
},
{
"epoch": 0.23703703703703705,
"grad_norm": 49.124258720104926,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": 0.1832781583070755,
"logits/rejected": 0.22061079740524292,
"logps/chosen": -31.4530029296875,
"logps/rejected": -43.574642181396484,
"loss": 0.5737,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2013498842716217,
"rewards/margins": 0.24712926149368286,
"rewards/rejected": -0.44847914576530457,
"step": 20
},
{
"epoch": 0.24888888888888888,
"grad_norm": 49.31720284300177,
"learning_rate": 3.088235294117647e-07,
"logits/chosen": 0.049509014934301376,
"logits/rejected": 0.09894949197769165,
"logps/chosen": -38.46732711791992,
"logps/rejected": -53.03515625,
"loss": 0.5683,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.25855958461761475,
"rewards/margins": 0.35330522060394287,
"rewards/rejected": -0.6118648052215576,
"step": 21
},
{
"epoch": 0.2607407407407407,
"grad_norm": 49.41172490919738,
"learning_rate": 3.2352941176470586e-07,
"logits/chosen": 0.09565885365009308,
"logits/rejected": 0.13914039731025696,
"logps/chosen": -28.3892765045166,
"logps/rejected": -40.65375518798828,
"loss": 0.5608,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2191687971353531,
"rewards/margins": 0.21512456238269806,
"rewards/rejected": -0.43429338932037354,
"step": 22
},
{
"epoch": 0.2725925925925926,
"grad_norm": 46.290056641280934,
"learning_rate": 3.3823529411764707e-07,
"logits/chosen": 0.33468642830848694,
"logits/rejected": 0.3287414312362671,
"logps/chosen": -41.56681823730469,
"logps/rejected": -49.69163131713867,
"loss": 0.5531,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6832336187362671,
"rewards/margins": 0.1509827822446823,
"rewards/rejected": -0.8342164158821106,
"step": 23
},
{
"epoch": 0.28444444444444444,
"grad_norm": 45.27726525885794,
"learning_rate": 3.529411764705882e-07,
"logits/chosen": 0.29047060012817383,
"logits/rejected": 0.25504833459854126,
"logps/chosen": -40.559715270996094,
"logps/rejected": -43.744140625,
"loss": 0.5839,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4380490183830261,
"rewards/margins": 0.21297261118888855,
"rewards/rejected": -0.6510215997695923,
"step": 24
},
{
"epoch": 0.2962962962962963,
"grad_norm": 45.96606934366563,
"learning_rate": 3.6764705882352943e-07,
"logits/chosen": 0.19954881072044373,
"logits/rejected": 0.24337519705295563,
"logps/chosen": -26.71196937561035,
"logps/rejected": -45.159339904785156,
"loss": 0.5114,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.4065704047679901,
"rewards/margins": 0.6559778451919556,
"rewards/rejected": -1.0625481605529785,
"step": 25
},
{
"epoch": 0.30814814814814817,
"grad_norm": 40.51480580129527,
"learning_rate": 3.8235294117647053e-07,
"logits/chosen": 0.18569591641426086,
"logits/rejected": 0.24005870521068573,
"logps/chosen": -32.402259826660156,
"logps/rejected": -50.5438117980957,
"loss": 0.4871,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.35379940271377563,
"rewards/margins": 0.88099205493927,
"rewards/rejected": -1.2347913980484009,
"step": 26
},
{
"epoch": 0.32,
"grad_norm": 42.88615154569158,
"learning_rate": 3.9705882352941174e-07,
"logits/chosen": 0.28236454725265503,
"logits/rejected": 0.25888901948928833,
"logps/chosen": -39.2940673828125,
"logps/rejected": -53.0938606262207,
"loss": 0.4142,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6994470953941345,
"rewards/margins": 1.0878021717071533,
"rewards/rejected": -1.7872494459152222,
"step": 27
},
{
"epoch": 0.33185185185185184,
"grad_norm": 47.29875578100757,
"learning_rate": 4.117647058823529e-07,
"logits/chosen": 0.3194928467273712,
"logits/rejected": 0.32101473212242126,
"logps/chosen": -45.893978118896484,
"logps/rejected": -52.5146484375,
"loss": 0.5076,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0526106357574463,
"rewards/margins": 0.718239426612854,
"rewards/rejected": -1.7708501815795898,
"step": 28
},
{
"epoch": 0.3437037037037037,
"grad_norm": 37.18564900832489,
"learning_rate": 4.264705882352941e-07,
"logits/chosen": 0.19094619154930115,
"logits/rejected": 0.23362189531326294,
"logps/chosen": -36.12297439575195,
"logps/rejected": -46.864871978759766,
"loss": 0.4014,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7660520076751709,
"rewards/margins": 1.0089162588119507,
"rewards/rejected": -1.774968147277832,
"step": 29
},
{
"epoch": 0.35555555555555557,
"grad_norm": 43.31791813857577,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": 0.15151675045490265,
"logits/rejected": 0.11997775733470917,
"logps/chosen": -38.58032989501953,
"logps/rejected": -48.824798583984375,
"loss": 0.4572,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2670563459396362,
"rewards/margins": 0.831606388092041,
"rewards/rejected": -2.098662853240967,
"step": 30
},
{
"epoch": 0.3674074074074074,
"grad_norm": 37.063035160208685,
"learning_rate": 4.5588235294117646e-07,
"logits/chosen": 0.2560815215110779,
"logits/rejected": 0.3223097026348114,
"logps/chosen": -33.990726470947266,
"logps/rejected": -47.65345764160156,
"loss": 0.3806,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2836799621582031,
"rewards/margins": 1.0427335500717163,
"rewards/rejected": -2.326413631439209,
"step": 31
},
{
"epoch": 0.37925925925925924,
"grad_norm": 37.90256850942511,
"learning_rate": 4.705882352941176e-07,
"logits/chosen": 0.18444910645484924,
"logits/rejected": 0.27790239453315735,
"logps/chosen": -34.04519271850586,
"logps/rejected": -58.64192199707031,
"loss": 0.3634,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8921431303024292,
"rewards/margins": 1.9067623615264893,
"rewards/rejected": -2.798905372619629,
"step": 32
},
{
"epoch": 0.39111111111111113,
"grad_norm": 38.28825801306995,
"learning_rate": 4.852941176470588e-07,
"logits/chosen": 0.21531561017036438,
"logits/rejected": 0.22173307836055756,
"logps/chosen": -38.73649978637695,
"logps/rejected": -51.891937255859375,
"loss": 0.3549,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.469386339187622,
"rewards/margins": 1.3347928524017334,
"rewards/rejected": -2.8041794300079346,
"step": 33
},
{
"epoch": 0.40296296296296297,
"grad_norm": 53.57688480965446,
"learning_rate": 5e-07,
"logits/chosen": -0.14031767845153809,
"logits/rejected": -0.009732939302921295,
"logps/chosen": -30.219802856445312,
"logps/rejected": -48.51620864868164,
"loss": 0.4584,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7153716683387756,
"rewards/margins": 2.133821487426758,
"rewards/rejected": -2.8491930961608887,
"step": 34
},
{
"epoch": 0.4148148148148148,
"grad_norm": 43.6191926386301,
"learning_rate": 4.999864732969518e-07,
"logits/chosen": 0.2249789983034134,
"logits/rejected": 0.2375878542661667,
"logps/chosen": -42.989952087402344,
"logps/rejected": -60.248451232910156,
"loss": 0.307,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.3359755277633667,
"rewards/margins": 2.923412799835205,
"rewards/rejected": -4.259388446807861,
"step": 35
},
{
"epoch": 0.4266666666666667,
"grad_norm": 41.68496640354453,
"learning_rate": 4.999458946515807e-07,
"logits/chosen": 0.04158564656972885,
"logits/rejected": 0.04120251536369324,
"logps/chosen": -47.079593658447266,
"logps/rejected": -64.6259536743164,
"loss": 0.3243,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.9504570960998535,
"rewards/margins": 2.1364314556121826,
"rewards/rejected": -4.086888313293457,
"step": 36
},
{
"epoch": 0.43851851851851853,
"grad_norm": 37.998309675066224,
"learning_rate": 4.998782684550491e-07,
"logits/chosen": 0.15689387917518616,
"logits/rejected": 0.22760489583015442,
"logps/chosen": -31.412202835083008,
"logps/rejected": -57.521270751953125,
"loss": 0.3499,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.422287940979004,
"rewards/margins": 2.10537052154541,
"rewards/rejected": -3.527658462524414,
"step": 37
},
{
"epoch": 0.45037037037037037,
"grad_norm": 39.14624538309661,
"learning_rate": 4.997836020254328e-07,
"logits/chosen": 0.09242415428161621,
"logits/rejected": 0.12390726059675217,
"logps/chosen": -38.68524932861328,
"logps/rejected": -59.322471618652344,
"loss": 0.3809,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4135229587554932,
"rewards/margins": 2.5775866508483887,
"rewards/rejected": -3.991109609603882,
"step": 38
},
{
"epoch": 0.4622222222222222,
"grad_norm": 34.10903781264625,
"learning_rate": 4.996619056069291e-07,
"logits/chosen": 0.15454381704330444,
"logits/rejected": 0.16882330179214478,
"logps/chosen": -44.294654846191406,
"logps/rejected": -66.8642578125,
"loss": 0.3106,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.2089152336120605,
"rewards/margins": 3.4627161026000977,
"rewards/rejected": -5.671631336212158,
"step": 39
},
{
"epoch": 0.4740740740740741,
"grad_norm": 41.64494289319624,
"learning_rate": 4.995131923687487e-07,
"logits/chosen": 0.03869347274303436,
"logits/rejected": 0.13989922404289246,
"logps/chosen": -48.224884033203125,
"logps/rejected": -68.6059341430664,
"loss": 0.3563,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.3006927967071533,
"rewards/margins": 3.536550521850586,
"rewards/rejected": -5.837243556976318,
"step": 40
},
{
"epoch": 0.48592592592592593,
"grad_norm": 43.27974364485945,
"learning_rate": 4.993374784036901e-07,
"logits/chosen": -0.13991862535476685,
"logits/rejected": 0.015330532565712929,
"logps/chosen": -44.3278923034668,
"logps/rejected": -62.52472686767578,
"loss": 0.4348,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.4953625202178955,
"rewards/margins": 3.03660249710083,
"rewards/rejected": -5.5319647789001465,
"step": 41
},
{
"epoch": 0.49777777777777776,
"grad_norm": 46.320431667154644,
"learning_rate": 4.991347827263982e-07,
"logits/chosen": -0.051238611340522766,
"logits/rejected": -0.0033771172165870667,
"logps/chosen": -43.90919876098633,
"logps/rejected": -65.11723327636719,
"loss": 0.4051,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.612180471420288,
"rewards/margins": 3.3464155197143555,
"rewards/rejected": -4.9585957527160645,
"step": 42
},
{
"epoch": 0.5096296296296297,
"grad_norm": 52.045033376790954,
"learning_rate": 4.989051272713069e-07,
"logits/chosen": -0.10396721214056015,
"logits/rejected": 0.10225249826908112,
"logps/chosen": -45.72818374633789,
"logps/rejected": -77.21394348144531,
"loss": 0.3283,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.5264527797698975,
"rewards/margins": 4.776139736175537,
"rewards/rejected": -7.302592754364014,
"step": 43
},
{
"epoch": 0.5214814814814814,
"grad_norm": 40.91521718681392,
"learning_rate": 4.986485368902656e-07,
"logits/chosen": -0.08990158140659332,
"logits/rejected": 0.014565035700798035,
"logps/chosen": -38.39900588989258,
"logps/rejected": -57.75357437133789,
"loss": 0.3584,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.275327205657959,
"rewards/margins": 2.417283058166504,
"rewards/rejected": -4.692610263824463,
"step": 44
},
{
"epoch": 0.5333333333333333,
"grad_norm": 50.74127412580387,
"learning_rate": 4.983650393498489e-07,
"logits/chosen": 0.037050001323223114,
"logits/rejected": -0.008276170119643211,
"logps/chosen": -50.93760299682617,
"logps/rejected": -55.42501449584961,
"loss": 0.3717,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.5346438884735107,
"rewards/margins": 1.2831640243530273,
"rewards/rejected": -3.8178083896636963,
"step": 45
},
{
"epoch": 0.5451851851851852,
"grad_norm": 38.61624054825256,
"learning_rate": 4.980546653283537e-07,
"logits/chosen": -0.41439855098724365,
"logits/rejected": -0.4113887548446655,
"logps/chosen": -41.68315124511719,
"logps/rejected": -67.61707305908203,
"loss": 0.3181,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8887252807617188,
"rewards/margins": 4.549580097198486,
"rewards/rejected": -6.438305377960205,
"step": 46
},
{
"epoch": 0.557037037037037,
"grad_norm": 38.9863916338003,
"learning_rate": 4.977174484124775e-07,
"logits/chosen": -0.009788192808628082,
"logits/rejected": -0.09325724095106125,
"logps/chosen": -46.573936462402344,
"logps/rejected": -59.36362838745117,
"loss": 0.2595,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1944921016693115,
"rewards/margins": 4.177679538726807,
"rewards/rejected": -6.372171401977539,
"step": 47
},
{
"epoch": 0.5688888888888889,
"grad_norm": 46.90129706189482,
"learning_rate": 4.97353425093685e-07,
"logits/chosen": -0.1219746470451355,
"logits/rejected": -0.10111116617918015,
"logps/chosen": -49.24342346191406,
"logps/rejected": -65.33878326416016,
"loss": 0.3779,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.4194295406341553,
"rewards/margins": 3.601774215698242,
"rewards/rejected": -6.021203994750977,
"step": 48
},
{
"epoch": 0.5807407407407408,
"grad_norm": 54.305491610936826,
"learning_rate": 4.96962634764259e-07,
"logits/chosen": -0.13463924825191498,
"logits/rejected": -0.09778477251529694,
"logps/chosen": -50.75926971435547,
"logps/rejected": -64.77645111083984,
"loss": 0.3942,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.944636344909668,
"rewards/margins": 3.1688618659973145,
"rewards/rejected": -6.113498687744141,
"step": 49
},
{
"epoch": 0.5925925925925926,
"grad_norm": 40.79377104202132,
"learning_rate": 4.965451197130372e-07,
"logits/chosen": -0.0598304346203804,
"logits/rejected": 0.03133855387568474,
"logps/chosen": -41.48918151855469,
"logps/rejected": -72.72964477539062,
"loss": 0.3083,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6170328855514526,
"rewards/margins": 4.565612316131592,
"rewards/rejected": -6.182644844055176,
"step": 50
},
{
"epoch": 0.6044444444444445,
"grad_norm": 42.877178470441834,
"learning_rate": 4.961009251208367e-07,
"logits/chosen": -0.014419106766581535,
"logits/rejected": -0.018680818378925323,
"logps/chosen": -34.062870025634766,
"logps/rejected": -66.85511779785156,
"loss": 0.2943,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.9279564023017883,
"rewards/margins": 6.2386088371276855,
"rewards/rejected": -7.166565418243408,
"step": 51
},
{
"epoch": 0.6162962962962963,
"grad_norm": 36.563663666294495,
"learning_rate": 4.956300990555643e-07,
"logits/chosen": -0.23208701610565186,
"logits/rejected": -0.15281593799591064,
"logps/chosen": -34.16992950439453,
"logps/rejected": -48.25387954711914,
"loss": 0.2707,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.390017032623291,
"rewards/margins": 2.9138500690460205,
"rewards/rejected": -4.303867340087891,
"step": 52
},
{
"epoch": 0.6281481481481481,
"grad_norm": 48.996542745383714,
"learning_rate": 4.951326924670147e-07,
"logits/chosen": -0.029582835733890533,
"logits/rejected": 0.13870403170585632,
"logps/chosen": -46.177825927734375,
"logps/rejected": -64.03628540039062,
"loss": 0.4606,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.360954761505127,
"rewards/margins": 2.338740825653076,
"rewards/rejected": -4.699695587158203,
"step": 53
},
{
"epoch": 0.64,
"grad_norm": 37.53257312713413,
"learning_rate": 4.94608759181358e-07,
"logits/chosen": -0.2242709845304489,
"logits/rejected": -0.022289041429758072,
"logps/chosen": -43.091976165771484,
"logps/rejected": -56.94826126098633,
"loss": 0.2428,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1584746837615967,
"rewards/margins": 2.6602895259857178,
"rewards/rejected": -3.8187644481658936,
"step": 54
},
{
"epoch": 0.6518518518518519,
"grad_norm": 38.495616892469634,
"learning_rate": 4.940583558953137e-07,
"logits/chosen": -0.3163710832595825,
"logits/rejected": -0.2686666250228882,
"logps/chosen": -41.02560806274414,
"logps/rejected": -75.63497924804688,
"loss": 0.3113,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4144959449768066,
"rewards/margins": 5.705674648284912,
"rewards/rejected": -7.120170593261719,
"step": 55
},
{
"epoch": 0.6637037037037037,
"grad_norm": 44.72353141808658,
"learning_rate": 4.934815421700164e-07,
"logits/chosen": -0.28492411971092224,
"logits/rejected": -0.2709801495075226,
"logps/chosen": -36.71954345703125,
"logps/rejected": -55.933624267578125,
"loss": 0.3607,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.8905187845230103,
"rewards/margins": 3.9906599521636963,
"rewards/rejected": -4.881179332733154,
"step": 56
},
{
"epoch": 0.6755555555555556,
"grad_norm": 36.35665672904958,
"learning_rate": 4.928783804245699e-07,
"logits/chosen": 0.2555558681488037,
"logits/rejected": 0.18786108493804932,
"logps/chosen": -40.80218505859375,
"logps/rejected": -54.24163055419922,
"loss": 0.2751,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8028124570846558,
"rewards/margins": 3.192255735397339,
"rewards/rejected": -3.995068073272705,
"step": 57
},
{
"epoch": 0.6874074074074074,
"grad_norm": 29.859662847563737,
"learning_rate": 4.922489359292927e-07,
"logits/chosen": -0.17547199130058289,
"logits/rejected": -0.06896121799945831,
"logps/chosen": -40.20637893676758,
"logps/rejected": -68.45681762695312,
"loss": 0.2406,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.9151208400726318,
"rewards/margins": 4.224143981933594,
"rewards/rejected": -5.139265060424805,
"step": 58
},
{
"epoch": 0.6992592592592592,
"grad_norm": 33.69014037830799,
"learning_rate": 4.915932767986551e-07,
"logits/chosen": -0.2176772654056549,
"logits/rejected": -0.14603368937969208,
"logps/chosen": -35.77494430541992,
"logps/rejected": -56.28825378417969,
"loss": 0.2639,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.7671470642089844,
"rewards/margins": 3.049193859100342,
"rewards/rejected": -3.816340923309326,
"step": 59
},
{
"epoch": 0.7111111111111111,
"grad_norm": 33.1631924356696,
"learning_rate": 4.909114739839079e-07,
"logits/chosen": -0.09617012739181519,
"logits/rejected": -0.08796259015798569,
"logps/chosen": -33.88630294799805,
"logps/rejected": -55.623878479003906,
"loss": 0.2556,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.8068188428878784,
"rewards/margins": 3.1127305030822754,
"rewards/rejected": -3.9195497035980225,
"step": 60
},
{
"epoch": 0.7229629629629629,
"grad_norm": 40.838783872872355,
"learning_rate": 4.902036012654048e-07,
"logits/chosen": 0.11093666404485703,
"logits/rejected": 0.1355137974023819,
"logps/chosen": -34.699256896972656,
"logps/rejected": -55.77449035644531,
"loss": 0.2753,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.288212776184082,
"rewards/margins": 3.224027633666992,
"rewards/rejected": -4.512240409851074,
"step": 61
},
{
"epoch": 0.7348148148148148,
"grad_norm": 35.91873621159586,
"learning_rate": 4.894697352446182e-07,
"logits/chosen": -0.10412248969078064,
"logits/rejected": -0.1209147572517395,
"logps/chosen": -34.93061447143555,
"logps/rejected": -52.149208068847656,
"loss": 0.2958,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7685253620147705,
"rewards/margins": 2.5662384033203125,
"rewards/rejected": -3.334764003753662,
"step": 62
},
{
"epoch": 0.7466666666666667,
"grad_norm": 28.279332840993643,
"learning_rate": 4.887099553358501e-07,
"logits/chosen": -0.1916661560535431,
"logits/rejected": -0.14164935052394867,
"logps/chosen": -40.58860397338867,
"logps/rejected": -50.00385284423828,
"loss": 0.229,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3654404282569885,
"rewards/margins": 2.5260181427001953,
"rewards/rejected": -2.891458511352539,
"step": 63
},
{
"epoch": 0.7585185185185185,
"grad_norm": 43.562956759714005,
"learning_rate": 4.879243437576383e-07,
"logits/chosen": -0.09250672161579132,
"logits/rejected": -0.06184221804141998,
"logps/chosen": -33.61621856689453,
"logps/rejected": -48.81718826293945,
"loss": 0.286,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7941789031028748,
"rewards/margins": 2.6042628288269043,
"rewards/rejected": -3.398441791534424,
"step": 64
},
{
"epoch": 0.7703703703703704,
"grad_norm": 43.71274039442652,
"learning_rate": 4.871129855238588e-07,
"logits/chosen": -0.1322498917579651,
"logits/rejected": -0.05121883377432823,
"logps/chosen": -40.41231918334961,
"logps/rejected": -68.0873031616211,
"loss": 0.3236,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.39779654145240784,
"rewards/margins": 3.27742600440979,
"rewards/rejected": -3.675222396850586,
"step": 65
},
{
"epoch": 0.7822222222222223,
"grad_norm": 33.295271431641005,
"learning_rate": 4.862759684345269e-07,
"logits/chosen": -0.35007691383361816,
"logits/rejected": -0.3479149341583252,
"logps/chosen": -38.88081359863281,
"logps/rejected": -49.428382873535156,
"loss": 0.245,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.42857033014297485,
"rewards/margins": 3.455155372619629,
"rewards/rejected": -3.883725643157959,
"step": 66
},
{
"epoch": 0.794074074074074,
"grad_norm": 22.368867699781628,
"learning_rate": 4.854133830662955e-07,
"logits/chosen": -0.26884549856185913,
"logits/rejected": -0.27821600437164307,
"logps/chosen": -40.386207580566406,
"logps/rejected": -55.080604553222656,
"loss": 0.1921,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1675169467926025,
"rewards/margins": 4.228756904602051,
"rewards/rejected": -5.396273136138916,
"step": 67
},
{
"epoch": 0.8059259259259259,
"grad_norm": 32.90815737667523,
"learning_rate": 4.845253227626536e-07,
"logits/chosen": 0.12926915287971497,
"logits/rejected": 0.03172997385263443,
"logps/chosen": -56.2171516418457,
"logps/rejected": -61.29096984863281,
"loss": 0.2468,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6303930282592773,
"rewards/margins": 2.693972110748291,
"rewards/rejected": -3.3243653774261475,
"step": 68
},
{
"epoch": 0.8177777777777778,
"grad_norm": 31.286678704407063,
"learning_rate": 4.836118836238252e-07,
"logits/chosen": -0.11010141670703888,
"logits/rejected": -0.09770654886960983,
"logps/chosen": -38.13661193847656,
"logps/rejected": -59.40013122558594,
"loss": 0.2512,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08025422692298889,
"rewards/margins": 2.979079246520996,
"rewards/rejected": -2.898824691772461,
"step": 69
},
{
"epoch": 0.8296296296296296,
"grad_norm": 27.574620445125923,
"learning_rate": 4.826731644963704e-07,
"logits/chosen": -0.25498491525650024,
"logits/rejected": -0.24903275072574615,
"logps/chosen": -32.7315788269043,
"logps/rejected": -47.2717399597168,
"loss": 0.222,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6683451533317566,
"rewards/margins": 3.523477554321289,
"rewards/rejected": -4.191822528839111,
"step": 70
},
{
"epoch": 0.8414814814814815,
"grad_norm": 35.73348427460115,
"learning_rate": 4.817092669624882e-07,
"logits/chosen": -0.018255462870001793,
"logits/rejected": -0.0020784977823495865,
"logps/chosen": -34.584449768066406,
"logps/rejected": -54.02947998046875,
"loss": 0.3535,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.04187864065170288,
"rewards/margins": 3.698057174682617,
"rewards/rejected": -3.739936113357544,
"step": 71
},
{
"epoch": 0.8533333333333334,
"grad_norm": 28.886553550180775,
"learning_rate": 4.807202953290243e-07,
"logits/chosen": -0.2388785183429718,
"logits/rejected": -0.1512915939092636,
"logps/chosen": -32.676937103271484,
"logps/rejected": -51.861934661865234,
"loss": 0.2502,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.43571335077285767,
"rewards/margins": 3.212143898010254,
"rewards/rejected": -3.6478569507598877,
"step": 72
},
{
"epoch": 0.8651851851851852,
"grad_norm": 31.329548468083193,
"learning_rate": 4.797063566161834e-07,
"logits/chosen": -0.016514137387275696,
"logits/rejected": -0.02128826081752777,
"logps/chosen": -42.21635818481445,
"logps/rejected": -53.1080436706543,
"loss": 0.2729,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.650065541267395,
"rewards/margins": 1.7332451343536377,
"rewards/rejected": -2.383310556411743,
"step": 73
},
{
"epoch": 0.8770370370370371,
"grad_norm": 25.241915116630672,
"learning_rate": 4.786675605459487e-07,
"logits/chosen": -0.18203724920749664,
"logits/rejected": -0.12511923909187317,
"logps/chosen": -37.860111236572266,
"logps/rejected": -67.00776672363281,
"loss": 0.2269,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.003457695245742798,
"rewards/margins": 4.446203708648682,
"rewards/rejected": -4.442746162414551,
"step": 74
},
{
"epoch": 0.8888888888888888,
"grad_norm": 26.546681076158777,
"learning_rate": 4.776040195302079e-07,
"logits/chosen": -0.20350059866905212,
"logits/rejected": -0.16488413512706757,
"logps/chosen": -29.37200164794922,
"logps/rejected": -53.26748275756836,
"loss": 0.2272,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.028457432985305786,
"rewards/margins": 4.081994533538818,
"rewards/rejected": -4.053536891937256,
"step": 75
},
{
"epoch": 0.9007407407407407,
"grad_norm": 36.79575801377004,
"learning_rate": 4.76515848658589e-07,
"logits/chosen": -0.06815146654844284,
"logits/rejected": 0.04448368400335312,
"logps/chosen": -40.69670867919922,
"logps/rejected": -58.95963668823242,
"loss": 0.3201,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6759670972824097,
"rewards/margins": 2.9853808879852295,
"rewards/rejected": -3.6613478660583496,
"step": 76
},
{
"epoch": 0.9125925925925926,
"grad_norm": 33.040470283627364,
"learning_rate": 4.754031656860059e-07,
"logits/chosen": 0.059698522090911865,
"logits/rejected": 0.1003262847661972,
"logps/chosen": -36.32437515258789,
"logps/rejected": -45.55727005004883,
"loss": 0.2451,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.1681911200284958,
"rewards/margins": 3.335202217102051,
"rewards/rejected": -3.167011022567749,
"step": 77
},
{
"epoch": 0.9244444444444444,
"grad_norm": 23.533686338243243,
"learning_rate": 4.74266091019916e-07,
"logits/chosen": -0.03607799857854843,
"logits/rejected": -0.07650981843471527,
"logps/chosen": -40.76026153564453,
"logps/rejected": -51.91889190673828,
"loss": 0.1909,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.06481152772903442,
"rewards/margins": 3.4574201107025146,
"rewards/rejected": -3.392608642578125,
"step": 78
},
{
"epoch": 0.9362962962962963,
"grad_norm": 34.00814256216538,
"learning_rate": 4.7310474770728996e-07,
"logits/chosen": -0.24094080924987793,
"logits/rejected": -0.2244417816400528,
"logps/chosen": -36.15470504760742,
"logps/rejected": -50.766448974609375,
"loss": 0.3092,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.04541383683681488,
"rewards/margins": 2.0906879901885986,
"rewards/rejected": -2.1361019611358643,
"step": 79
},
{
"epoch": 0.9481481481481482,
"grad_norm": 28.43036399847798,
"learning_rate": 4.719192614212969e-07,
"logits/chosen": 0.04508206248283386,
"logits/rejected": 0.04841914027929306,
"logps/chosen": -44.18794631958008,
"logps/rejected": -74.9228515625,
"loss": 0.1841,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.0396180152893066,
"rewards/margins": 3.7275805473327637,
"rewards/rejected": -4.76719856262207,
"step": 80
},
{
"epoch": 0.96,
"grad_norm": 32.15305104854747,
"learning_rate": 4.707097604477045e-07,
"logits/chosen": 0.10437710583209991,
"logits/rejected": 0.10021185874938965,
"logps/chosen": -41.348716735839844,
"logps/rejected": -53.517127990722656,
"loss": 0.2768,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.03640615940093994,
"rewards/margins": 3.230294704437256,
"rewards/rejected": -3.1938881874084473,
"step": 81
},
{
"epoch": 0.9718518518518519,
"grad_norm": 30.563184515965578,
"learning_rate": 4.694763756709967e-07,
"logits/chosen": -0.17636063694953918,
"logits/rejected": -0.21469731628894806,
"logps/chosen": -39.04137420654297,
"logps/rejected": -52.79186248779297,
"loss": 0.2415,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2509702444076538,
"rewards/margins": 3.593198776245117,
"rewards/rejected": -3.8441689014434814,
"step": 82
},
{
"epoch": 0.9837037037037037,
"grad_norm": 28.891738925831007,
"learning_rate": 4.6821924056021053e-07,
"logits/chosen": -0.11742343008518219,
"logits/rejected": 0.009909386746585369,
"logps/chosen": -30.184894561767578,
"logps/rejected": -65.191650390625,
"loss": 0.2097,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.2642693817615509,
"rewards/margins": 5.374275207519531,
"rewards/rejected": -5.638545036315918,
"step": 83
},
{
"epoch": 0.9955555555555555,
"grad_norm": 34.49435985459448,
"learning_rate": 4.669384911544926e-07,
"logits/chosen": -0.07497820258140564,
"logits/rejected": -0.026325395330786705,
"logps/chosen": -33.93386459350586,
"logps/rejected": -50.840423583984375,
"loss": 0.2826,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5045543909072876,
"rewards/margins": 2.2327775955200195,
"rewards/rejected": -2.7373318672180176,
"step": 84
},
{
"epoch": 1.0074074074074073,
"grad_norm": 25.60399667566007,
"learning_rate": 4.6563426604837817e-07,
"logits/chosen": -0.07658643275499344,
"logits/rejected": -0.06745781004428864,
"logps/chosen": -45.01917266845703,
"logps/rejected": -59.43114471435547,
"loss": 0.1973,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.29737135767936707,
"rewards/margins": 5.139443874359131,
"rewards/rejected": -5.4368157386779785,
"step": 85
},
{
"epoch": 1.0192592592592593,
"grad_norm": 21.460741017889536,
"learning_rate": 4.6430670637679294e-07,
"logits/chosen": -0.2205628752708435,
"logits/rejected": -0.0864521712064743,
"logps/chosen": -31.902761459350586,
"logps/rejected": -51.85260009765625,
"loss": 0.1667,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.194830521941185,
"rewards/margins": 3.8075246810913086,
"rewards/rejected": -4.002355098724365,
"step": 86
},
{
"epoch": 1.031111111111111,
"grad_norm": 15.350784705262274,
"learning_rate": 4.629559557997804e-07,
"logits/chosen": -0.12179061770439148,
"logits/rejected": -0.10868389904499054,
"logps/chosen": -40.21116638183594,
"logps/rejected": -62.20214080810547,
"loss": 0.1195,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.19310609996318817,
"rewards/margins": 4.397428512573242,
"rewards/rejected": -4.590534687042236,
"step": 87
},
{
"epoch": 1.0429629629629629,
"grad_norm": 13.188223717252573,
"learning_rate": 4.615821604869563e-07,
"logits/chosen": -0.13621510565280914,
"logits/rejected": -0.044875748455524445,
"logps/chosen": -39.49315643310547,
"logps/rejected": -65.40257263183594,
"loss": 0.1026,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10489638894796371,
"rewards/margins": 5.503582000732422,
"rewards/rejected": -5.608478546142578,
"step": 88
},
{
"epoch": 1.0548148148148149,
"grad_norm": 27.4435618488601,
"learning_rate": 4.6018546910169067e-07,
"logits/chosen": -0.2304653376340866,
"logits/rejected": -0.30428558588027954,
"logps/chosen": -38.354759216308594,
"logps/rejected": -57.675498962402344,
"loss": 0.1946,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.1100564002990723,
"rewards/margins": 4.001605033874512,
"rewards/rejected": -5.111660957336426,
"step": 89
},
{
"epoch": 1.0666666666666667,
"grad_norm": 18.807247445635458,
"learning_rate": 4.5876603278502027e-07,
"logits/chosen": -0.12274541705846786,
"logits/rejected": 0.009613536298274994,
"logps/chosen": -41.16261291503906,
"logps/rejected": -73.863037109375,
"loss": 0.1405,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8482255935668945,
"rewards/margins": 5.244417667388916,
"rewards/rejected": -6.0926432609558105,
"step": 90
},
{
"epoch": 1.0785185185185184,
"grad_norm": 24.62512250085559,
"learning_rate": 4.573240051392935e-07,
"logits/chosen": -0.1303870528936386,
"logits/rejected": -0.14886482059955597,
"logps/chosen": -39.42107009887695,
"logps/rejected": -54.32372283935547,
"loss": 0.1612,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6151587963104248,
"rewards/margins": 3.3702774047851562,
"rewards/rejected": -3.98543643951416,
"step": 91
},
{
"epoch": 1.0903703703703704,
"grad_norm": 18.29323289164473,
"learning_rate": 4.5585954221154853e-07,
"logits/chosen": -0.4573056101799011,
"logits/rejected": -0.31245023012161255,
"logps/chosen": -32.956050872802734,
"logps/rejected": -59.963035583496094,
"loss": 0.1649,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5122833251953125,
"rewards/margins": 4.091685771942139,
"rewards/rejected": -4.603969097137451,
"step": 92
},
{
"epoch": 1.1022222222222222,
"grad_norm": 19.321908739095367,
"learning_rate": 4.5437280247662646e-07,
"logits/chosen": -0.023740939795970917,
"logits/rejected": -0.0003247186541557312,
"logps/chosen": -39.27166748046875,
"logps/rejected": -55.00670623779297,
"loss": 0.1251,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7074174880981445,
"rewards/margins": 3.3870468139648438,
"rewards/rejected": -4.094464302062988,
"step": 93
},
{
"epoch": 1.114074074074074,
"grad_norm": 17.03649474940601,
"learning_rate": 4.528639468200226e-07,
"logits/chosen": 0.2261081337928772,
"logits/rejected": 0.2597027122974396,
"logps/chosen": -37.43301773071289,
"logps/rejected": -51.47749328613281,
"loss": 0.1186,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10310641676187515,
"rewards/margins": 3.6022109985351562,
"rewards/rejected": -3.4991047382354736,
"step": 94
},
{
"epoch": 1.125925925925926,
"grad_norm": 16.559378309982034,
"learning_rate": 4.5133313852047613e-07,
"logits/chosen": -0.11725334078073502,
"logits/rejected": -0.08741730451583862,
"logps/chosen": -33.64775085449219,
"logps/rejected": -55.1715087890625,
"loss": 0.1273,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.01142565906047821,
"rewards/margins": 3.431326150894165,
"rewards/rejected": -3.442751407623291,
"step": 95
},
{
"epoch": 1.1377777777777778,
"grad_norm": 20.611754629262013,
"learning_rate": 4.4978054323230144e-07,
"logits/chosen": 0.051543403416872025,
"logits/rejected": 0.11705614626407623,
"logps/chosen": -33.534278869628906,
"logps/rejected": -51.728057861328125,
"loss": 0.1365,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.07938066124916077,
"rewards/margins": 3.254304885864258,
"rewards/rejected": -3.17492413520813,
"step": 96
},
{
"epoch": 1.1496296296296296,
"grad_norm": 12.616532653018533,
"learning_rate": 4.482063289674618e-07,
"logits/chosen": -0.0702984482049942,
"logits/rejected": 0.004200812429189682,
"logps/chosen": -34.993507385253906,
"logps/rejected": -57.94596481323242,
"loss": 0.0914,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19305679202079773,
"rewards/margins": 4.31368350982666,
"rewards/rejected": -4.120626449584961,
"step": 97
},
{
"epoch": 1.1614814814814816,
"grad_norm": 12.762301209570738,
"learning_rate": 4.466106660773884e-07,
"logits/chosen": -0.1491287350654602,
"logits/rejected": -0.03166097402572632,
"logps/chosen": -39.788230895996094,
"logps/rejected": -59.54990005493164,
"loss": 0.094,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.40045109391212463,
"rewards/margins": 4.702445030212402,
"rewards/rejected": -5.102896213531494,
"step": 98
},
{
"epoch": 1.1733333333333333,
"grad_norm": 17.826250546113457,
"learning_rate": 4.44993727234546e-07,
"logits/chosen": 0.11196614801883698,
"logits/rejected": 0.07824762165546417,
"logps/chosen": -39.74019241333008,
"logps/rejected": -49.98065185546875,
"loss": 0.1199,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.37030303478240967,
"rewards/margins": 2.883664608001709,
"rewards/rejected": -3.25396728515625,
"step": 99
},
{
"epoch": 1.1851851851851851,
"grad_norm": 16.340929612227512,
"learning_rate": 4.4335568741374695e-07,
"logits/chosen": -0.3065292239189148,
"logits/rejected": -0.243222177028656,
"logps/chosen": -39.87810516357422,
"logps/rejected": -49.80472946166992,
"loss": 0.1362,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.09828188270330429,
"rewards/margins": 4.0216569900512695,
"rewards/rejected": -4.119938850402832,
"step": 100
},
{
"epoch": 1.1970370370370371,
"grad_norm": 15.511054375166765,
"learning_rate": 4.4169672387321735e-07,
"logits/chosen": -0.06340894848108292,
"logits/rejected": -0.0665612518787384,
"logps/chosen": -43.078765869140625,
"logps/rejected": -65.42969512939453,
"loss": 0.1102,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.3077806830406189,
"rewards/margins": 5.954348087310791,
"rewards/rejected": -6.262128829956055,
"step": 101
},
{
"epoch": 1.208888888888889,
"grad_norm": 17.09598873374456,
"learning_rate": 4.4001701613541454e-07,
"logits/chosen": 0.09894056618213654,
"logits/rejected": 0.11925836652517319,
"logps/chosen": -32.494293212890625,
"logps/rejected": -51.34983825683594,
"loss": 0.128,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.22729653120040894,
"rewards/margins": 4.20453405380249,
"rewards/rejected": -4.431830406188965,
"step": 102
},
{
"epoch": 1.2207407407407407,
"grad_norm": 22.603451847311774,
"learning_rate": 4.383167459676008e-07,
"logits/chosen": -0.09994232654571533,
"logits/rejected": -0.03670894354581833,
"logps/chosen": -34.782405853271484,
"logps/rejected": -57.43785095214844,
"loss": 0.1482,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.40158677101135254,
"rewards/margins": 3.9667258262634277,
"rewards/rejected": -4.368312358856201,
"step": 103
},
{
"epoch": 1.2325925925925927,
"grad_norm": 17.176654931396367,
"learning_rate": 4.365960973621734e-07,
"logits/chosen": -0.3010917007923126,
"logits/rejected": -0.2688751220703125,
"logps/chosen": -30.779415130615234,
"logps/rejected": -59.66587448120117,
"loss": 0.1077,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5755228996276855,
"rewards/margins": 5.500998497009277,
"rewards/rejected": -6.076521396636963,
"step": 104
},
{
"epoch": 1.2444444444444445,
"grad_norm": 16.54203727829074,
"learning_rate": 4.348552565167542e-07,
"logits/chosen": 0.03528839722275734,
"logits/rejected": 0.025072041898965836,
"logps/chosen": -35.11994171142578,
"logps/rejected": -49.86817169189453,
"loss": 0.1169,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.683509349822998,
"rewards/margins": 4.702343940734863,
"rewards/rejected": -5.385853290557861,
"step": 105
},
{
"epoch": 1.2562962962962962,
"grad_norm": 26.56869923226247,
"learning_rate": 4.330944118140406e-07,
"logits/chosen": -0.08033540099859238,
"logits/rejected": -0.015175499022006989,
"logps/chosen": -40.06020736694336,
"logps/rejected": -58.90888977050781,
"loss": 0.1479,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.02627614513039589,
"rewards/margins": 4.818339824676514,
"rewards/rejected": -4.7920637130737305,
"step": 106
},
{
"epoch": 1.268148148148148,
"grad_norm": 14.298587740255858,
"learning_rate": 4.313137538014198e-07,
"logits/chosen": -0.08439959585666656,
"logits/rejected": -0.15276381373405457,
"logps/chosen": -34.021949768066406,
"logps/rejected": -43.68275451660156,
"loss": 0.0835,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.03106861561536789,
"rewards/margins": 3.872872829437256,
"rewards/rejected": -3.841804265975952,
"step": 107
},
{
"epoch": 1.28,
"grad_norm": 25.029691995336584,
"learning_rate": 4.295134751703492e-07,
"logits/chosen": 0.039663467556238174,
"logits/rejected": 0.03390619903802872,
"logps/chosen": -50.46000671386719,
"logps/rejected": -61.62244415283203,
"loss": 0.1408,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.41419780254364014,
"rewards/margins": 5.797426223754883,
"rewards/rejected": -6.2116241455078125,
"step": 108
},
{
"epoch": 1.2918518518518518,
"grad_norm": 23.688132743134673,
"learning_rate": 4.276937707355044e-07,
"logits/chosen": -0.017152896150946617,
"logits/rejected": -0.053842976689338684,
"logps/chosen": -41.905521392822266,
"logps/rejected": -65.56050109863281,
"loss": 0.1163,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.573278546333313,
"rewards/margins": 6.835091590881348,
"rewards/rejected": -7.408369064331055,
"step": 109
},
{
"epoch": 1.3037037037037038,
"grad_norm": 22.26748210189321,
"learning_rate": 4.2585483741369755e-07,
"logits/chosen": -0.3594672679901123,
"logits/rejected": -0.2787400782108307,
"logps/chosen": -33.10090255737305,
"logps/rejected": -65.98353576660156,
"loss": 0.1206,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1710546016693115,
"rewards/margins": 5.390981197357178,
"rewards/rejected": -6.562036037445068,
"step": 110
},
{
"epoch": 1.3155555555555556,
"grad_norm": 12.864266715484298,
"learning_rate": 4.239968742025684e-07,
"logits/chosen": -0.1566499024629593,
"logits/rejected": -0.02022075653076172,
"logps/chosen": -31.641050338745117,
"logps/rejected": -70.607421875,
"loss": 0.0872,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5755561590194702,
"rewards/margins": 5.668292999267578,
"rewards/rejected": -6.243849277496338,
"step": 111
},
{
"epoch": 1.3274074074074074,
"grad_norm": 11.527896741156843,
"learning_rate": 4.2212008215905e-07,
"logits/chosen": -0.18084248900413513,
"logits/rejected": -0.06485521793365479,
"logps/chosen": -32.15618133544922,
"logps/rejected": -60.565914154052734,
"loss": 0.0793,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.524111270904541,
"rewards/margins": 4.708486557006836,
"rewards/rejected": -5.2325968742370605,
"step": 112
},
{
"epoch": 1.3392592592592591,
"grad_norm": 18.981341764098254,
"learning_rate": 4.2022466437761154e-07,
"logits/chosen": 0.11151312291622162,
"logits/rejected": 0.2410213202238083,
"logps/chosen": -35.73467254638672,
"logps/rejected": -61.45711135864258,
"loss": 0.1104,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.03812497854232788,
"rewards/margins": 3.9962849617004395,
"rewards/rejected": -4.034409999847412,
"step": 113
},
{
"epoch": 1.3511111111111112,
"grad_norm": 15.055020570658602,
"learning_rate": 4.18310825968281e-07,
"logits/chosen": -0.13380703330039978,
"logits/rejected": -0.056154295802116394,
"logps/chosen": -47.65923309326172,
"logps/rejected": -68.50222778320312,
"loss": 0.097,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0497931241989136,
"rewards/margins": 5.255577564239502,
"rewards/rejected": -6.305370807647705,
"step": 114
},
{
"epoch": 1.362962962962963,
"grad_norm": 15.94507937001582,
"learning_rate": 4.1637877403444923e-07,
"logits/chosen": -0.13782085478305817,
"logits/rejected": -0.1237938329577446,
"logps/chosen": -34.57268142700195,
"logps/rejected": -60.96284103393555,
"loss": 0.1328,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2373143583536148,
"rewards/margins": 5.874894142150879,
"rewards/rejected": -5.637579917907715,
"step": 115
},
{
"epoch": 1.374814814814815,
"grad_norm": 17.6499216351512,
"learning_rate": 4.144287176504582e-07,
"logits/chosen": -0.028891492635011673,
"logits/rejected": 0.010559901595115662,
"logps/chosen": -41.175785064697266,
"logps/rejected": -56.148109436035156,
"loss": 0.1233,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5991055965423584,
"rewards/margins": 4.076951503753662,
"rewards/rejected": -4.6760573387146,
"step": 116
},
{
"epoch": 1.3866666666666667,
"grad_norm": 17.440881402981706,
"learning_rate": 4.1246086783897713e-07,
"logits/chosen": -0.07107866555452347,
"logits/rejected": -0.05355262756347656,
"logps/chosen": -29.337356567382812,
"logps/rejected": -58.07263946533203,
"loss": 0.107,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.25651735067367554,
"rewards/margins": 5.097599983215332,
"rewards/rejected": -4.841082572937012,
"step": 117
},
{
"epoch": 1.3985185185185185,
"grad_norm": 20.485113734375798,
"learning_rate": 4.104754375481664e-07,
"logits/chosen": -0.027082689106464386,
"logits/rejected": 0.0022195279598236084,
"logps/chosen": -35.18750762939453,
"logps/rejected": -54.038787841796875,
"loss": 0.0963,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7344773411750793,
"rewards/margins": 4.06456995010376,
"rewards/rejected": -4.799046993255615,
"step": 118
},
{
"epoch": 1.4103703703703703,
"grad_norm": 15.73045559795655,
"learning_rate": 4.084726416286337e-07,
"logits/chosen": -0.2252836376428604,
"logits/rejected": -0.1367400735616684,
"logps/chosen": -28.606430053710938,
"logps/rejected": -54.020267486572266,
"loss": 0.0872,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.016864344477653503,
"rewards/margins": 4.223577499389648,
"rewards/rejected": -4.2067131996154785,
"step": 119
},
{
"epoch": 1.4222222222222223,
"grad_norm": 18.987468125277015,
"learning_rate": 4.0645269681018434e-07,
"logits/chosen": -0.16107773780822754,
"logits/rejected": -0.030207287520170212,
"logps/chosen": -29.666810989379883,
"logps/rejected": -61.85116958618164,
"loss": 0.119,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.07655519992113113,
"rewards/margins": 5.755571365356445,
"rewards/rejected": -5.832127571105957,
"step": 120
},
{
"epoch": 1.434074074074074,
"grad_norm": 14.503662362117073,
"learning_rate": 4.044158216783684e-07,
"logits/chosen": -0.515570878982544,
"logits/rejected": -0.3124653100967407,
"logps/chosen": -37.86850357055664,
"logps/rejected": -69.43701934814453,
"loss": 0.0985,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2003718763589859,
"rewards/margins": 7.45107364654541,
"rewards/rejected": -7.6514458656311035,
"step": 121
},
{
"epoch": 1.445925925925926,
"grad_norm": 18.569284512541163,
"learning_rate": 4.0236223665082605e-07,
"logits/chosen": -0.21073125302791595,
"logits/rejected": -0.2305406928062439,
"logps/chosen": -31.442550659179688,
"logps/rejected": -55.32566833496094,
"loss": 0.079,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6064890027046204,
"rewards/margins": 5.698796272277832,
"rewards/rejected": -6.3052849769592285,
"step": 122
},
{
"epoch": 1.4577777777777778,
"grad_norm": 17.336812678453057,
"learning_rate": 4.0029216395343617e-07,
"logits/chosen": -0.06248122453689575,
"logits/rejected": -0.05541558563709259,
"logps/chosen": -35.73994445800781,
"logps/rejected": -60.30720901489258,
"loss": 0.1032,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7068940997123718,
"rewards/margins": 5.66333532333374,
"rewards/rejected": -6.3702287673950195,
"step": 123
},
{
"epoch": 1.4696296296296296,
"grad_norm": 14.094947491287853,
"learning_rate": 3.982058275962682e-07,
"logits/chosen": -0.21389123797416687,
"logits/rejected": -0.16907303035259247,
"logps/chosen": -28.326904296875,
"logps/rejected": -56.3559455871582,
"loss": 0.0846,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10696560144424438,
"rewards/margins": 4.861944675445557,
"rewards/rejected": -4.968909740447998,
"step": 124
},
{
"epoch": 1.4814814814814814,
"grad_norm": 16.97368933164184,
"learning_rate": 3.9610345334934094e-07,
"logits/chosen": -0.13031917810440063,
"logits/rejected": -0.03798413649201393,
"logps/chosen": -41.05464172363281,
"logps/rejected": -65.29788208007812,
"loss": 0.1205,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.038674697279930115,
"rewards/margins": 6.0994062423706055,
"rewards/rejected": -6.060731887817383,
"step": 125
},
{
"epoch": 1.4933333333333334,
"grad_norm": 15.284029740961785,
"learning_rate": 3.939852687181915e-07,
"logits/chosen": -0.14361430704593658,
"logits/rejected": -0.10849830508232117,
"logps/chosen": -36.10377502441406,
"logps/rejected": -67.89326477050781,
"loss": 0.0939,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7120703458786011,
"rewards/margins": 5.921784400939941,
"rewards/rejected": -6.633854389190674,
"step": 126
},
{
"epoch": 1.5051851851851852,
"grad_norm": 18.417078359527594,
"learning_rate": 3.9185150291925585e-07,
"logits/chosen": -0.21707814931869507,
"logits/rejected": -0.20940172672271729,
"logps/chosen": -36.678348541259766,
"logps/rejected": -59.70578384399414,
"loss": 0.1107,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0581213235855103,
"rewards/margins": 5.090505123138428,
"rewards/rejected": -6.14862585067749,
"step": 127
},
{
"epoch": 1.5170370370370372,
"grad_norm": 15.713415294775897,
"learning_rate": 3.8970238685506486e-07,
"logits/chosen": -0.04535888880491257,
"logits/rejected": 0.03404983878135681,
"logps/chosen": -34.13862609863281,
"logps/rejected": -66.47319793701172,
"loss": 0.0976,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.735641360282898,
"rewards/margins": 5.098552227020264,
"rewards/rejected": -5.834194183349609,
"step": 128
},
{
"epoch": 1.528888888888889,
"grad_norm": 15.290294429948705,
"learning_rate": 3.8753815308925685e-07,
"logits/chosen": -0.44447335600852966,
"logits/rejected": -0.5022441148757935,
"logps/chosen": -34.69385528564453,
"logps/rejected": -63.007972717285156,
"loss": 0.0908,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.753921389579773,
"rewards/margins": 6.32866907119751,
"rewards/rejected": -7.082590579986572,
"step": 129
},
{
"epoch": 1.5407407407407407,
"grad_norm": 22.7399302377654,
"learning_rate": 3.8535903582141184e-07,
"logits/chosen": -0.3744094669818878,
"logits/rejected": -0.2060033231973648,
"logps/chosen": -32.91036605834961,
"logps/rejected": -61.743282318115234,
"loss": 0.1268,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4923054873943329,
"rewards/margins": 5.070975303649902,
"rewards/rejected": -5.5632805824279785,
"step": 130
},
{
"epoch": 1.5525925925925925,
"grad_norm": 19.47533309428857,
"learning_rate": 3.8316527086170727e-07,
"logits/chosen": -0.1633462905883789,
"logits/rejected": -0.07288794964551926,
"logps/chosen": -36.646484375,
"logps/rejected": -59.22307205200195,
"loss": 0.125,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2637479305267334,
"rewards/margins": 5.560683727264404,
"rewards/rejected": -5.824431419372559,
"step": 131
},
{
"epoch": 1.5644444444444443,
"grad_norm": 18.594859296902843,
"learning_rate": 3.809570956054003e-07,
"logits/chosen": -0.5365747213363647,
"logits/rejected": -0.39433979988098145,
"logps/chosen": -31.459264755249023,
"logps/rejected": -62.012969970703125,
"loss": 0.1097,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.048090934753418,
"rewards/margins": 5.831287860870361,
"rewards/rejected": -6.879378318786621,
"step": 132
},
{
"epoch": 1.5762962962962963,
"grad_norm": 14.282275106755108,
"learning_rate": 3.787347490071389e-07,
"logits/chosen": -0.20027217268943787,
"logits/rejected": -0.11141454428434372,
"logps/chosen": -39.01911163330078,
"logps/rejected": -64.89835357666016,
"loss": 0.0991,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3214702308177948,
"rewards/margins": 5.419382095336914,
"rewards/rejected": -5.740852355957031,
"step": 133
},
{
"epoch": 1.5881481481481483,
"grad_norm": 13.42088885786826,
"learning_rate": 3.764984715551031e-07,
"logits/chosen": -0.12170754373073578,
"logits/rejected": -0.046029090881347656,
"logps/chosen": -29.234134674072266,
"logps/rejected": -60.45512771606445,
"loss": 0.0895,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1851249635219574,
"rewards/margins": 6.193130970001221,
"rewards/rejected": -6.378255844116211,
"step": 134
},
{
"epoch": 1.6,
"grad_norm": 17.391251462773063,
"learning_rate": 3.7424850524498113e-07,
"logits/chosen": -0.18073627352714539,
"logits/rejected": -0.05371435731649399,
"logps/chosen": -35.178985595703125,
"logps/rejected": -62.41261672973633,
"loss": 0.112,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.027589425444602966,
"rewards/margins": 5.411984443664551,
"rewards/rejected": -5.384395599365234,
"step": 135
},
{
"epoch": 1.6118518518518519,
"grad_norm": 13.216285367583016,
"learning_rate": 3.7198509355378207e-07,
"logits/chosen": -0.3801528811454773,
"logits/rejected": -0.3222590982913971,
"logps/chosen": -40.775054931640625,
"logps/rejected": -52.93413543701172,
"loss": 0.1169,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5116928815841675,
"rewards/margins": 3.9808902740478516,
"rewards/rejected": -5.49258279800415,
"step": 136
},
{
"epoch": 1.6237037037037036,
"grad_norm": 20.45643460429083,
"learning_rate": 3.6970848141348855e-07,
"logits/chosen": -0.17568367719650269,
"logits/rejected": -0.11519981920719147,
"logps/chosen": -39.35060501098633,
"logps/rejected": -59.42844009399414,
"loss": 0.1259,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.02011704444885254,
"rewards/margins": 5.978552341461182,
"rewards/rejected": -5.998669624328613,
"step": 137
},
{
"epoch": 1.6355555555555554,
"grad_norm": 20.16029126070904,
"learning_rate": 3.6741891518455146e-07,
"logits/chosen": -0.16909295320510864,
"logits/rejected": -0.11791606992483139,
"logps/chosen": -39.2324333190918,
"logps/rejected": -67.33000183105469,
"loss": 0.0818,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9399681091308594,
"rewards/margins": 5.435766220092773,
"rewards/rejected": -6.375733375549316,
"step": 138
},
{
"epoch": 1.6474074074074074,
"grad_norm": 16.83558679733193,
"learning_rate": 3.6511664262923094e-07,
"logits/chosen": -0.2512515187263489,
"logits/rejected": -0.12882237136363983,
"logps/chosen": -27.65873908996582,
"logps/rejected": -61.08441925048828,
"loss": 0.0919,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.27784237265586853,
"rewards/margins": 6.476930141448975,
"rewards/rejected": -6.754772663116455,
"step": 139
},
{
"epoch": 1.6592592592592592,
"grad_norm": 12.155079766899926,
"learning_rate": 3.6280191288478435e-07,
"logits/chosen": -0.15503238141536713,
"logits/rejected": -0.06224162131547928,
"logps/chosen": -34.50273895263672,
"logps/rejected": -62.98163604736328,
"loss": 0.0725,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17935335636138916,
"rewards/margins": 5.786401271820068,
"rewards/rejected": -5.965755462646484,
"step": 140
},
{
"epoch": 1.6711111111111112,
"grad_norm": 14.838007532934355,
"learning_rate": 3.604749764365069e-07,
"logits/chosen": -0.2061130404472351,
"logits/rejected": -0.11659687012434006,
"logps/chosen": -27.322792053222656,
"logps/rejected": -62.887542724609375,
"loss": 0.0849,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2183775156736374,
"rewards/margins": 7.025249481201172,
"rewards/rejected": -7.243628025054932,
"step": 141
},
{
"epoch": 1.682962962962963,
"grad_norm": 18.816292977620716,
"learning_rate": 3.5813608509062526e-07,
"logits/chosen": -0.21707522869110107,
"logits/rejected": -0.09010873734951019,
"logps/chosen": -36.751190185546875,
"logps/rejected": -75.8062744140625,
"loss": 0.1009,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9384999871253967,
"rewards/margins": 6.734908580780029,
"rewards/rejected": -7.673408508300781,
"step": 142
},
{
"epoch": 1.6948148148148148,
"grad_norm": 19.8821437700751,
"learning_rate": 3.557854919470491e-07,
"logits/chosen": -0.27580782771110535,
"logits/rejected": -0.22370710968971252,
"logps/chosen": -44.46840286254883,
"logps/rejected": -56.556922912597656,
"loss": 0.1165,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.3054481744766235,
"rewards/margins": 3.6606221199035645,
"rewards/rejected": -4.966070175170898,
"step": 143
},
{
"epoch": 1.7066666666666666,
"grad_norm": 16.302116776290895,
"learning_rate": 3.5342345137198206e-07,
"logits/chosen": -0.12199485301971436,
"logits/rejected": -0.08353496342897415,
"logps/chosen": -38.30879211425781,
"logps/rejected": -51.09149932861328,
"loss": 0.112,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1275089532136917,
"rewards/margins": 3.7598049640655518,
"rewards/rejected": -3.8873140811920166,
"step": 144
},
{
"epoch": 1.7185185185185186,
"grad_norm": 15.133588204814354,
"learning_rate": 3.510502189703954e-07,
"logits/chosen": -0.10772836208343506,
"logits/rejected": -0.06885837763547897,
"logps/chosen": -40.77737045288086,
"logps/rejected": -69.48592376708984,
"loss": 0.0693,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8564466834068298,
"rewards/margins": 8.315703392028809,
"rewards/rejected": -9.172150611877441,
"step": 145
},
{
"epoch": 1.7303703703703703,
"grad_norm": 19.362732647667297,
"learning_rate": 3.486660515583691e-07,
"logits/chosen": -0.2726586163043976,
"logits/rejected": -0.21814075112342834,
"logps/chosen": -30.717233657836914,
"logps/rejected": -65.25225830078125,
"loss": 0.0885,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.21597516536712646,
"rewards/margins": 6.636316776275635,
"rewards/rejected": -6.852292060852051,
"step": 146
},
{
"epoch": 1.7422222222222223,
"grad_norm": 11.230644499357242,
"learning_rate": 3.4627120713529983e-07,
"logits/chosen": -0.2115684449672699,
"logits/rejected": -0.11108352243900299,
"logps/chosen": -34.891475677490234,
"logps/rejected": -77.10374450683594,
"loss": 0.0803,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.222525954246521,
"rewards/margins": 9.00995922088623,
"rewards/rejected": -10.232484817504883,
"step": 147
},
{
"epoch": 1.7540740740740741,
"grad_norm": 13.137309578079025,
"learning_rate": 3.438659448559825e-07,
"logits/chosen": -0.13276290893554688,
"logits/rejected": -0.10878665745258331,
"logps/chosen": -37.001060485839844,
"logps/rejected": -73.28951263427734,
"loss": 0.067,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.598874568939209,
"rewards/margins": 7.266829013824463,
"rewards/rejected": -7.865704536437988,
"step": 148
},
{
"epoch": 1.765925925925926,
"grad_norm": 21.677182436717583,
"learning_rate": 3.414505250025659e-07,
"logits/chosen": -0.04767221957445145,
"logits/rejected": 0.07465275377035141,
"logps/chosen": -40.81106185913086,
"logps/rejected": -67.47488403320312,
"loss": 0.0942,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7800440788269043,
"rewards/margins": 5.173450469970703,
"rewards/rejected": -5.953495025634766,
"step": 149
},
{
"epoch": 1.7777777777777777,
"grad_norm": 16.148540981867313,
"learning_rate": 3.390252089563867e-07,
"logits/chosen": -0.18858963251113892,
"logits/rejected": -0.20340172946453094,
"logps/chosen": -36.109954833984375,
"logps/rejected": -55.27710723876953,
"loss": 0.0796,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0519292950630188,
"rewards/margins": 5.729560852050781,
"rewards/rejected": -5.677631855010986,
"step": 150
},
{
"epoch": 1.7896296296296297,
"grad_norm": 24.312535239343674,
"learning_rate": 3.3659025916968475e-07,
"logits/chosen": -0.2818453013896942,
"logits/rejected": -0.19897550344467163,
"logps/chosen": -38.93151092529297,
"logps/rejected": -76.9036636352539,
"loss": 0.1537,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5907295346260071,
"rewards/margins": 6.914373397827148,
"rewards/rejected": -7.505102634429932,
"step": 151
},
{
"epoch": 1.8014814814814815,
"grad_norm": 20.831076322049565,
"learning_rate": 3.3414593913720155e-07,
"logits/chosen": -0.22526244819164276,
"logits/rejected": -0.18128839135169983,
"logps/chosen": -34.0900764465332,
"logps/rejected": -55.26824951171875,
"loss": 0.126,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0966421514749527,
"rewards/margins": 5.949459075927734,
"rewards/rejected": -6.046100616455078,
"step": 152
},
{
"epoch": 1.8133333333333335,
"grad_norm": 11.740893074967538,
"learning_rate": 3.3169251336766697e-07,
"logits/chosen": -0.1995951235294342,
"logits/rejected": -0.08064538240432739,
"logps/chosen": -34.26948165893555,
"logps/rejected": -62.14799499511719,
"loss": 0.0776,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.565490484237671,
"rewards/margins": 5.796334266662598,
"rewards/rejected": -7.361824989318848,
"step": 153
},
{
"epoch": 1.8251851851851852,
"grad_norm": 16.192262072429642,
"learning_rate": 3.2923024735517567e-07,
"logits/chosen": -0.3225496709346771,
"logits/rejected": -0.23174233734607697,
"logps/chosen": -31.906171798706055,
"logps/rejected": -58.93853759765625,
"loss": 0.0956,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.29122716188430786,
"rewards/margins": 5.078674793243408,
"rewards/rejected": -5.369902610778809,
"step": 154
},
{
"epoch": 1.837037037037037,
"grad_norm": 20.0031978017647,
"learning_rate": 3.2675940755045713e-07,
"logits/chosen": 0.008214278146624565,
"logits/rejected": 0.14055991172790527,
"logps/chosen": -46.526527404785156,
"logps/rejected": -81.31307983398438,
"loss": 0.1503,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6333274245262146,
"rewards/margins": 6.6069817543029785,
"rewards/rejected": -7.240309238433838,
"step": 155
},
{
"epoch": 1.8488888888888888,
"grad_norm": 21.076223385192634,
"learning_rate": 3.242802613320418e-07,
"logits/chosen": 0.0031320489943027496,
"logits/rejected": 0.012668165378272533,
"logps/chosen": -37.56488037109375,
"logps/rejected": -64.01717376708984,
"loss": 0.1549,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7534648180007935,
"rewards/margins": 6.474049091339111,
"rewards/rejected": -7.227513313293457,
"step": 156
},
{
"epoch": 1.8607407407407406,
"grad_norm": 16.652570591848395,
"learning_rate": 3.217930769773275e-07,
"logits/chosen": -0.35275697708129883,
"logits/rejected": -0.24555784463882446,
"logps/chosen": -33.54517364501953,
"logps/rejected": -62.51717758178711,
"loss": 0.0959,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.42744529247283936,
"rewards/margins": 6.722561836242676,
"rewards/rejected": -7.150007247924805,
"step": 157
},
{
"epoch": 1.8725925925925926,
"grad_norm": 12.597802388134623,
"learning_rate": 3.1929812363354764e-07,
"logits/chosen": -0.2830018103122711,
"logits/rejected": -0.1911703646183014,
"logps/chosen": -34.85630416870117,
"logps/rejected": -65.27281188964844,
"loss": 0.0536,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6973313689231873,
"rewards/margins": 5.912741184234619,
"rewards/rejected": -6.610072612762451,
"step": 158
},
{
"epoch": 1.8844444444444446,
"grad_norm": 12.10851684482882,
"learning_rate": 3.167956712886463e-07,
"logits/chosen": -0.11961568146944046,
"logits/rejected": -0.07325749099254608,
"logps/chosen": -39.96379470825195,
"logps/rejected": -55.56465148925781,
"loss": 0.061,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3007912635803223,
"rewards/margins": 4.756231307983398,
"rewards/rejected": -6.0570220947265625,
"step": 159
},
{
"epoch": 1.8962962962962964,
"grad_norm": 20.233134180855856,
"learning_rate": 3.142859907420615e-07,
"logits/chosen": -0.10496459901332855,
"logits/rejected": 0.02405383251607418,
"logps/chosen": -33.706703186035156,
"logps/rejected": -68.788818359375,
"loss": 0.1011,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.46070706844329834,
"rewards/margins": 5.680697917938232,
"rewards/rejected": -6.14140510559082,
"step": 160
},
{
"epoch": 1.9081481481481481,
"grad_norm": 19.119673878649753,
"learning_rate": 3.117693535754213e-07,
"logits/chosen": -0.10953935980796814,
"logits/rejected": -0.043600670993328094,
"logps/chosen": -32.68844985961914,
"logps/rejected": -67.97357177734375,
"loss": 0.098,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.133280411362648,
"rewards/margins": 7.29306173324585,
"rewards/rejected": -7.159781455993652,
"step": 161
},
{
"epoch": 1.92,
"grad_norm": 20.235536301916813,
"learning_rate": 3.092460321231547e-07,
"logits/chosen": -0.25959259271621704,
"logits/rejected": -0.20939543843269348,
"logps/chosen": -35.41835021972656,
"logps/rejected": -67.97672271728516,
"loss": 0.1272,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.018587589263916,
"rewards/margins": 7.818422317504883,
"rewards/rejected": -8.837010383605957,
"step": 162
},
{
"epoch": 1.9318518518518517,
"grad_norm": 13.909230300778022,
"learning_rate": 3.0671629944302164e-07,
"logits/chosen": -0.12026870250701904,
"logits/rejected": -0.1089366227388382,
"logps/chosen": -36.70647048950195,
"logps/rejected": -53.47910690307617,
"loss": 0.0742,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5273434519767761,
"rewards/margins": 5.643020153045654,
"rewards/rejected": -6.170363903045654,
"step": 163
},
{
"epoch": 1.9437037037037037,
"grad_norm": 12.864589613039078,
"learning_rate": 3.0418042928656415e-07,
"logits/chosen": -0.19225530326366425,
"logits/rejected": -0.10623307526111603,
"logps/chosen": -30.59122085571289,
"logps/rejected": -59.536502838134766,
"loss": 0.0836,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.13015054166316986,
"rewards/margins": 5.6972761154174805,
"rewards/rejected": -5.827426910400391,
"step": 164
},
{
"epoch": 1.9555555555555557,
"grad_norm": 15.975537356292229,
"learning_rate": 3.016386960694827e-07,
"logits/chosen": -0.41196244955062866,
"logits/rejected": -0.28604307770729065,
"logps/chosen": -39.74413299560547,
"logps/rejected": -67.16585540771484,
"loss": 0.1075,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.3397538661956787,
"rewards/margins": 5.61893892288208,
"rewards/rejected": -6.958693027496338,
"step": 165
},
{
"epoch": 1.9674074074074075,
"grad_norm": 32.234428734857076,
"learning_rate": 2.990913748419411e-07,
"logits/chosen": 0.07510136812925339,
"logits/rejected": 0.11186552792787552,
"logps/chosen": -41.38081741333008,
"logps/rejected": -67.78083038330078,
"loss": 0.1778,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.3183833360671997,
"rewards/margins": 5.662228107452393,
"rewards/rejected": -5.980611801147461,
"step": 166
},
{
"epoch": 1.9792592592592593,
"grad_norm": 19.969450441352706,
"learning_rate": 2.9653874125880167e-07,
"logits/chosen": -0.17725233733654022,
"logits/rejected": -0.10895150154829025,
"logps/chosen": -36.67229461669922,
"logps/rejected": -61.394744873046875,
"loss": 0.1436,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03877316415309906,
"rewards/margins": 5.38054895401001,
"rewards/rejected": -5.341775894165039,
"step": 167
},
{
"epoch": 1.991111111111111,
"grad_norm": 11.405884683324148,
"learning_rate": 2.9398107154979634e-07,
"logits/chosen": -0.21582955121994019,
"logits/rejected": -0.1547984778881073,
"logps/chosen": -33.66050338745117,
"logps/rejected": -71.53202056884766,
"loss": 0.0568,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2750725746154785,
"rewards/margins": 6.011247158050537,
"rewards/rejected": -6.286320209503174,
"step": 168
},
{
"epoch": 2.002962962962963,
"grad_norm": 10.684152961615103,
"learning_rate": 2.9141864248963427e-07,
"logits/chosen": -0.43596649169921875,
"logits/rejected": -0.3842291235923767,
"logps/chosen": -39.145042419433594,
"logps/rejected": -52.33580780029297,
"loss": 0.0686,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2999729812145233,
"rewards/margins": 5.18059778213501,
"rewards/rejected": -5.4805707931518555,
"step": 169
},
{
"epoch": 2.0148148148148146,
"grad_norm": 7.120772046885603,
"learning_rate": 2.8885173136805125e-07,
"logits/chosen": -0.0826089009642601,
"logits/rejected": 0.052907198667526245,
"logps/chosen": -34.04817199707031,
"logps/rejected": -73.21731567382812,
"loss": 0.0483,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0369320884346962,
"rewards/margins": 5.923883438110352,
"rewards/rejected": -5.960815906524658,
"step": 170
},
{
"epoch": 2.026666666666667,
"grad_norm": 6.919197934532035,
"learning_rate": 2.862806159598032e-07,
"logits/chosen": -0.45804017782211304,
"logits/rejected": -0.408170610666275,
"logps/chosen": -35.05866241455078,
"logps/rejected": -58.95463562011719,
"loss": 0.0433,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.28513020277023315,
"rewards/margins": 6.706875324249268,
"rewards/rejected": -6.421745300292969,
"step": 171
},
{
"epoch": 2.0385185185185186,
"grad_norm": 7.6734571457046075,
"learning_rate": 2.837055744946072e-07,
"logits/chosen": -0.20093482732772827,
"logits/rejected": -0.15153169631958008,
"logps/chosen": -26.53860092163086,
"logps/rejected": -60.75783157348633,
"loss": 0.0522,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1763535887002945,
"rewards/margins": 6.684775352478027,
"rewards/rejected": -6.508421421051025,
"step": 172
},
{
"epoch": 2.0503703703703704,
"grad_norm": 7.089127360949598,
"learning_rate": 2.811268856270332e-07,
"logits/chosen": -0.19816571474075317,
"logits/rejected": -0.17036175727844238,
"logps/chosen": -30.929105758666992,
"logps/rejected": -63.191619873046875,
"loss": 0.0436,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12159377336502075,
"rewards/margins": 6.226848602294922,
"rewards/rejected": -6.348442077636719,
"step": 173
},
{
"epoch": 2.062222222222222,
"grad_norm": 6.879444744601849,
"learning_rate": 2.7854482840634965e-07,
"logits/chosen": -0.3685060143470764,
"logits/rejected": -0.2508692145347595,
"logps/chosen": -30.003223419189453,
"logps/rejected": -66.00552368164062,
"loss": 0.0503,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6152498126029968,
"rewards/margins": 7.741688251495361,
"rewards/rejected": -8.356938362121582,
"step": 174
},
{
"epoch": 2.074074074074074,
"grad_norm": 7.35254551338382,
"learning_rate": 2.759596822463267e-07,
"logits/chosen": -0.23846019804477692,
"logits/rejected": -0.21598272025585175,
"logps/chosen": -35.864341735839844,
"logps/rejected": -60.58774948120117,
"loss": 0.0493,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5300172567367554,
"rewards/margins": 6.528465747833252,
"rewards/rejected": -7.0584821701049805,
"step": 175
},
{
"epoch": 2.0859259259259257,
"grad_norm": 6.0224009777317535,
"learning_rate": 2.73371726895e-07,
"logits/chosen": -0.4624573588371277,
"logits/rejected": -0.3582268953323364,
"logps/chosen": -38.6595573425293,
"logps/rejected": -68.8594970703125,
"loss": 0.035,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1498335599899292,
"rewards/margins": 6.414737224578857,
"rewards/rejected": -6.564570903778076,
"step": 176
},
{
"epoch": 2.097777777777778,
"grad_norm": 5.373444495677693,
"learning_rate": 2.7078124240439793e-07,
"logits/chosen": -0.293597012758255,
"logits/rejected": -0.18846404552459717,
"logps/chosen": -37.71804428100586,
"logps/rejected": -79.11727905273438,
"loss": 0.0295,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9366405606269836,
"rewards/margins": 8.907712936401367,
"rewards/rejected": -9.844353675842285,
"step": 177
},
{
"epoch": 2.1096296296296297,
"grad_norm": 6.426554871574788,
"learning_rate": 2.68188509100236e-07,
"logits/chosen": -0.07860371470451355,
"logits/rejected": -0.032592758536338806,
"logps/chosen": -36.26288604736328,
"logps/rejected": -67.72529602050781,
"loss": 0.041,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.021683871746063232,
"rewards/margins": 6.0264506340026855,
"rewards/rejected": -6.048134803771973,
"step": 178
},
{
"epoch": 2.1214814814814815,
"grad_norm": 7.4465908422134435,
"learning_rate": 2.6559380755158206e-07,
"logits/chosen": -0.2125643938779831,
"logits/rejected": -0.11938470602035522,
"logps/chosen": -41.93673324584961,
"logps/rejected": -67.91219329833984,
"loss": 0.0526,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.314428448677063,
"rewards/margins": 6.577666759490967,
"rewards/rejected": -6.892095565795898,
"step": 179
},
{
"epoch": 2.1333333333333333,
"grad_norm": 8.207906364980415,
"learning_rate": 2.629974185404951e-07,
"logits/chosen": -0.19172216951847076,
"logits/rejected": -0.09138239920139313,
"logps/chosen": -33.569427490234375,
"logps/rejected": -83.35604858398438,
"loss": 0.0522,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.1560230553150177,
"rewards/margins": 6.664412975311279,
"rewards/rejected": -6.820435523986816,
"step": 180
},
{
"epoch": 2.145185185185185,
"grad_norm": 7.056836784079578,
"learning_rate": 2.603996230316402e-07,
"logits/chosen": 0.04577064514160156,
"logits/rejected": 0.027345050126314163,
"logps/chosen": -28.81571388244629,
"logps/rejected": -50.763343811035156,
"loss": 0.0463,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12452198565006256,
"rewards/margins": 5.144538879394531,
"rewards/rejected": -5.269061088562012,
"step": 181
},
{
"epoch": 2.157037037037037,
"grad_norm": 11.894887100183427,
"learning_rate": 2.5780070214188474e-07,
"logits/chosen": -0.2564762532711029,
"logits/rejected": -0.17662659287452698,
"logps/chosen": -44.12029266357422,
"logps/rejected": -67.84626007080078,
"loss": 0.067,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7224655151367188,
"rewards/margins": 6.115478038787842,
"rewards/rejected": -6.837944030761719,
"step": 182
},
{
"epoch": 2.168888888888889,
"grad_norm": 7.773592692412279,
"learning_rate": 2.552009371098778e-07,
"logits/chosen": -0.22470326721668243,
"logits/rejected": -0.16598555445671082,
"logps/chosen": -40.50588607788086,
"logps/rejected": -66.48469543457031,
"loss": 0.0511,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6875913143157959,
"rewards/margins": 6.013528823852539,
"rewards/rejected": -6.7011213302612305,
"step": 183
},
{
"epoch": 2.180740740740741,
"grad_norm": 6.760102453494981,
"learning_rate": 2.5260060926561604e-07,
"logits/chosen": -0.10662063956260681,
"logits/rejected": 0.02468992955982685,
"logps/chosen": -30.439607620239258,
"logps/rejected": -68.47404479980469,
"loss": 0.042,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.1491369903087616,
"rewards/margins": 7.288580894470215,
"rewards/rejected": -7.437718391418457,
"step": 184
},
{
"epoch": 2.1925925925925926,
"grad_norm": 8.50819275884591,
"learning_rate": 2.5e-07,
"logits/chosen": -0.314007043838501,
"logits/rejected": -0.2440987378358841,
"logps/chosen": -40.95098114013672,
"logps/rejected": -58.714630126953125,
"loss": 0.061,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.0700465440750122,
"rewards/margins": 4.418706893920898,
"rewards/rejected": -5.488753795623779,
"step": 185
},
{
"epoch": 2.2044444444444444,
"grad_norm": 7.253010502441156,
"learning_rate": 2.4739939073438393e-07,
"logits/chosen": -0.3739926815032959,
"logits/rejected": -0.29570120573043823,
"logps/chosen": -46.3231201171875,
"logps/rejected": -73.8995361328125,
"loss": 0.0448,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3167559504508972,
"rewards/margins": 5.942593574523926,
"rewards/rejected": -6.259350299835205,
"step": 186
},
{
"epoch": 2.216296296296296,
"grad_norm": 10.218903811635839,
"learning_rate": 2.4479906289012216e-07,
"logits/chosen": -0.23329459130764008,
"logits/rejected": -0.16335441172122955,
"logps/chosen": -41.237979888916016,
"logps/rejected": -61.02803039550781,
"loss": 0.0585,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29829394817352295,
"rewards/margins": 6.824221611022949,
"rewards/rejected": -6.525927543640137,
"step": 187
},
{
"epoch": 2.228148148148148,
"grad_norm": 5.706051711082172,
"learning_rate": 2.421992978581152e-07,
"logits/chosen": -0.20098957419395447,
"logits/rejected": -0.1625077724456787,
"logps/chosen": -33.27192306518555,
"logps/rejected": -63.34418869018555,
"loss": 0.0339,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16527414321899414,
"rewards/margins": 7.369152069091797,
"rewards/rejected": -7.534427165985107,
"step": 188
},
{
"epoch": 2.24,
"grad_norm": 5.574632829097292,
"learning_rate": 2.3960037696835987e-07,
"logits/chosen": -0.15070542693138123,
"logits/rejected": -0.09224209934473038,
"logps/chosen": -36.12839126586914,
"logps/rejected": -78.54652404785156,
"loss": 0.0296,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1367915868759155,
"rewards/margins": 9.016799926757812,
"rewards/rejected": -10.15359115600586,
"step": 189
},
{
"epoch": 2.251851851851852,
"grad_norm": 9.222402573182322,
"learning_rate": 2.3700258145950493e-07,
"logits/chosen": -0.16236115992069244,
"logits/rejected": -0.21426168084144592,
"logps/chosen": -32.171287536621094,
"logps/rejected": -65.74127960205078,
"loss": 0.0555,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.31431764364242554,
"rewards/margins": 7.022213459014893,
"rewards/rejected": -7.336531162261963,
"step": 190
},
{
"epoch": 2.2637037037037038,
"grad_norm": 7.127001970520486,
"learning_rate": 2.3440619244841794e-07,
"logits/chosen": -0.2093840390443802,
"logits/rejected": -0.20409150421619415,
"logps/chosen": -31.866199493408203,
"logps/rejected": -57.632957458496094,
"loss": 0.0419,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5825138092041016,
"rewards/margins": 5.033719539642334,
"rewards/rejected": -5.616233825683594,
"step": 191
},
{
"epoch": 2.2755555555555556,
"grad_norm": 7.894992729942256,
"learning_rate": 2.3181149089976404e-07,
"logits/chosen": -0.07013247907161713,
"logits/rejected": -0.04877481237053871,
"logps/chosen": -33.34114456176758,
"logps/rejected": -54.995853424072266,
"loss": 0.0496,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006869621574878693,
"rewards/margins": 6.3635053634643555,
"rewards/rejected": -6.356635570526123,
"step": 192
},
{
"epoch": 2.2874074074074073,
"grad_norm": 11.705188167009563,
"learning_rate": 2.2921875759560207e-07,
"logits/chosen": -0.13075098395347595,
"logits/rejected": -0.20485468208789825,
"logps/chosen": -47.33964157104492,
"logps/rejected": -67.68054962158203,
"loss": 0.0641,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.68022620677948,
"rewards/margins": 6.593554973602295,
"rewards/rejected": -7.273780822753906,
"step": 193
},
{
"epoch": 2.299259259259259,
"grad_norm": 7.134458814614045,
"learning_rate": 2.2662827310499995e-07,
"logits/chosen": -0.2494003027677536,
"logits/rejected": -0.20588865876197815,
"logps/chosen": -36.393733978271484,
"logps/rejected": -58.504547119140625,
"loss": 0.042,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11634793132543564,
"rewards/margins": 6.01495885848999,
"rewards/rejected": -6.1313066482543945,
"step": 194
},
{
"epoch": 2.311111111111111,
"grad_norm": 12.206709819986875,
"learning_rate": 2.2404031775367332e-07,
"logits/chosen": -0.29956668615341187,
"logits/rejected": -0.21865390241146088,
"logps/chosen": -32.450687408447266,
"logps/rejected": -69.80120086669922,
"loss": 0.0409,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12443738430738449,
"rewards/margins": 8.257219314575195,
"rewards/rejected": -8.132781982421875,
"step": 195
},
{
"epoch": 2.322962962962963,
"grad_norm": 6.1999786310250675,
"learning_rate": 2.2145517159365043e-07,
"logits/chosen": -0.5200955271720886,
"logits/rejected": -0.4296617805957794,
"logps/chosen": -36.10881805419922,
"logps/rejected": -63.84122848510742,
"loss": 0.0278,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5274075865745544,
"rewards/margins": 6.768378257751465,
"rewards/rejected": -7.295785903930664,
"step": 196
},
{
"epoch": 2.334814814814815,
"grad_norm": 12.570655654481419,
"learning_rate": 2.1887311437296684e-07,
"logits/chosen": -0.31551796197891235,
"logits/rejected": -0.281019389629364,
"logps/chosen": -29.494091033935547,
"logps/rejected": -46.46226501464844,
"loss": 0.0761,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.2344491183757782,
"rewards/margins": 4.557922840118408,
"rewards/rejected": -4.3234734535217285,
"step": 197
},
{
"epoch": 2.3466666666666667,
"grad_norm": 7.628823812638006,
"learning_rate": 2.162944255053928e-07,
"logits/chosen": -0.31029197573661804,
"logits/rejected": -0.24161145091056824,
"logps/chosen": -29.65079689025879,
"logps/rejected": -57.47043228149414,
"loss": 0.0435,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04058557003736496,
"rewards/margins": 6.636114597320557,
"rewards/rejected": -6.595529079437256,
"step": 198
},
{
"epoch": 2.3585185185185185,
"grad_norm": 8.85295248102532,
"learning_rate": 2.137193840401968e-07,
"logits/chosen": -0.3979605734348297,
"logits/rejected": -0.32846879959106445,
"logps/chosen": -34.3480224609375,
"logps/rejected": -58.354896545410156,
"loss": 0.0533,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3420531153678894,
"rewards/margins": 5.625160217285156,
"rewards/rejected": -5.283106803894043,
"step": 199
},
{
"epoch": 2.3703703703703702,
"grad_norm": 6.562343692053369,
"learning_rate": 2.1114826863194878e-07,
"logits/chosen": -0.25852352380752563,
"logits/rejected": -0.17369653284549713,
"logps/chosen": -35.14963912963867,
"logps/rejected": -68.84390258789062,
"loss": 0.0393,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.06890328228473663,
"rewards/margins": 7.833587169647217,
"rewards/rejected": -7.902491092681885,
"step": 200
},
{
"epoch": 2.3822222222222225,
"grad_norm": 7.374090556871228,
"learning_rate": 2.0858135751036568e-07,
"logits/chosen": -0.347494900226593,
"logits/rejected": -0.366567462682724,
"logps/chosen": -48.93431091308594,
"logps/rejected": -69.73056030273438,
"loss": 0.033,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1343318223953247,
"rewards/margins": 7.379057884216309,
"rewards/rejected": -8.51339054107666,
"step": 201
},
{
"epoch": 2.3940740740740742,
"grad_norm": 7.264335195847015,
"learning_rate": 2.060189284502037e-07,
"logits/chosen": -0.28864482045173645,
"logits/rejected": -0.14989601075649261,
"logps/chosen": -36.92799377441406,
"logps/rejected": -68.40786743164062,
"loss": 0.039,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6765543818473816,
"rewards/margins": 6.945480823516846,
"rewards/rejected": -7.622035503387451,
"step": 202
},
{
"epoch": 2.405925925925926,
"grad_norm": 7.5132642347332155,
"learning_rate": 2.0346125874119838e-07,
"logits/chosen": -0.35054826736450195,
"logits/rejected": -0.35159796476364136,
"logps/chosen": -35.19864273071289,
"logps/rejected": -67.63153839111328,
"loss": 0.0423,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2543597221374512,
"rewards/margins": 7.7994771003723145,
"rewards/rejected": -9.053837776184082,
"step": 203
},
{
"epoch": 2.417777777777778,
"grad_norm": 10.04344534438614,
"learning_rate": 2.0090862515805895e-07,
"logits/chosen": -0.13007110357284546,
"logits/rejected": -0.1110701858997345,
"logps/chosen": -43.942832946777344,
"logps/rejected": -60.823211669921875,
"loss": 0.0693,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1511293649673462,
"rewards/margins": 6.567122936248779,
"rewards/rejected": -7.718252658843994,
"step": 204
},
{
"epoch": 2.4296296296296296,
"grad_norm": 9.49554839896552,
"learning_rate": 1.983613039305173e-07,
"logits/chosen": -0.4052940905094147,
"logits/rejected": -0.23975247144699097,
"logps/chosen": -27.93557357788086,
"logps/rejected": -65.16735076904297,
"loss": 0.0571,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3964582085609436,
"rewards/margins": 8.21355152130127,
"rewards/rejected": -8.610010147094727,
"step": 205
},
{
"epoch": 2.4414814814814814,
"grad_norm": 7.396028421519832,
"learning_rate": 1.9581957071343588e-07,
"logits/chosen": -0.3185134828090668,
"logits/rejected": -0.1753259003162384,
"logps/chosen": -45.08576202392578,
"logps/rejected": -88.92870330810547,
"loss": 0.047,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9330806732177734,
"rewards/margins": 8.258587837219238,
"rewards/rejected": -10.191668510437012,
"step": 206
},
{
"epoch": 2.453333333333333,
"grad_norm": 6.455933266927346,
"learning_rate": 1.9328370055697832e-07,
"logits/chosen": -0.20377328991889954,
"logits/rejected": -0.09884392470121384,
"logps/chosen": -31.141347885131836,
"logps/rejected": -68.2867431640625,
"loss": 0.032,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.21240955591201782,
"rewards/margins": 7.815065860748291,
"rewards/rejected": -8.027475357055664,
"step": 207
},
{
"epoch": 2.4651851851851854,
"grad_norm": 9.373669301801096,
"learning_rate": 1.907539678768453e-07,
"logits/chosen": -0.5164112448692322,
"logits/rejected": -0.4492265284061432,
"logps/chosen": -31.97795295715332,
"logps/rejected": -73.8591079711914,
"loss": 0.0554,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6608830094337463,
"rewards/margins": 7.822652816772461,
"rewards/rejected": -8.483535766601562,
"step": 208
},
{
"epoch": 2.477037037037037,
"grad_norm": 8.80264276423414,
"learning_rate": 1.8823064642457876e-07,
"logits/chosen": -0.19362421333789825,
"logits/rejected": -0.09952510893344879,
"logps/chosen": -36.87741470336914,
"logps/rejected": -76.23078918457031,
"loss": 0.0465,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9911991357803345,
"rewards/margins": 7.124609470367432,
"rewards/rejected": -8.115808486938477,
"step": 209
},
{
"epoch": 2.488888888888889,
"grad_norm": 6.872370358967133,
"learning_rate": 1.8571400925793852e-07,
"logits/chosen": -0.28052157163619995,
"logits/rejected": -0.18670235574245453,
"logps/chosen": -34.98965835571289,
"logps/rejected": -62.01945495605469,
"loss": 0.036,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1422077715396881,
"rewards/margins": 7.268811225891113,
"rewards/rejected": -7.126603126525879,
"step": 210
},
{
"epoch": 2.5007407407407407,
"grad_norm": 6.102992921343477,
"learning_rate": 1.8320432871135376e-07,
"logits/chosen": -0.012273239903151989,
"logits/rejected": 0.11726081371307373,
"logps/chosen": -41.02720642089844,
"logps/rejected": -70.41509246826172,
"loss": 0.0326,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5696543455123901,
"rewards/margins": 6.976672649383545,
"rewards/rejected": -7.546327114105225,
"step": 211
},
{
"epoch": 2.5125925925925925,
"grad_norm": 7.523877695462096,
"learning_rate": 1.8070187636645237e-07,
"logits/chosen": -0.25425052642822266,
"logits/rejected": -0.20172733068466187,
"logps/chosen": -29.920835494995117,
"logps/rejected": -58.127830505371094,
"loss": 0.0442,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9108222126960754,
"rewards/margins": 5.636702060699463,
"rewards/rejected": -6.547523498535156,
"step": 212
},
{
"epoch": 2.5244444444444447,
"grad_norm": 8.233825538587578,
"learning_rate": 1.782069230226725e-07,
"logits/chosen": -0.1111406460404396,
"logits/rejected": -0.07830701768398285,
"logps/chosen": -36.62953567504883,
"logps/rejected": -69.00572204589844,
"loss": 0.0489,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7046716809272766,
"rewards/margins": 8.098491668701172,
"rewards/rejected": -8.803162574768066,
"step": 213
},
{
"epoch": 2.536296296296296,
"grad_norm": 7.26503668987421,
"learning_rate": 1.7571973866795813e-07,
"logits/chosen": -0.3010156750679016,
"logits/rejected": -0.14240717887878418,
"logps/chosen": -28.9267635345459,
"logps/rejected": -62.530731201171875,
"loss": 0.0385,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.38880711793899536,
"rewards/margins": 7.034039497375488,
"rewards/rejected": -7.422846794128418,
"step": 214
},
{
"epoch": 2.5481481481481483,
"grad_norm": 5.978089867317629,
"learning_rate": 1.7324059244954292e-07,
"logits/chosen": -0.4227255582809448,
"logits/rejected": -0.40319374203681946,
"logps/chosen": -33.19075012207031,
"logps/rejected": -64.02904510498047,
"loss": 0.0325,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5749123096466064,
"rewards/margins": 9.057025909423828,
"rewards/rejected": -9.631938934326172,
"step": 215
},
{
"epoch": 2.56,
"grad_norm": 9.696303087236457,
"learning_rate": 1.7076975264482433e-07,
"logits/chosen": -0.37839898467063904,
"logits/rejected": -0.273608922958374,
"logps/chosen": -36.7985954284668,
"logps/rejected": -64.79552459716797,
"loss": 0.0498,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2072323560714722,
"rewards/margins": 6.241967678070068,
"rewards/rejected": -7.449199676513672,
"step": 216
},
{
"epoch": 2.571851851851852,
"grad_norm": 7.034579383121281,
"learning_rate": 1.6830748663233303e-07,
"logits/chosen": -0.25258129835128784,
"logits/rejected": -0.23720452189445496,
"logps/chosen": -31.781192779541016,
"logps/rejected": -62.308006286621094,
"loss": 0.0353,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.21115940809249878,
"rewards/margins": 7.114487648010254,
"rewards/rejected": -7.325647354125977,
"step": 217
},
{
"epoch": 2.5837037037037036,
"grad_norm": 8.754965473386916,
"learning_rate": 1.6585406086279846e-07,
"logits/chosen": -0.43037766218185425,
"logits/rejected": -0.375767320394516,
"logps/chosen": -40.90904998779297,
"logps/rejected": -76.52076721191406,
"loss": 0.0544,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7350015044212341,
"rewards/margins": 8.066032409667969,
"rewards/rejected": -8.801033973693848,
"step": 218
},
{
"epoch": 2.5955555555555554,
"grad_norm": 5.4726077818919565,
"learning_rate": 1.6340974083031523e-07,
"logits/chosen": -0.29419374465942383,
"logits/rejected": -0.28271955251693726,
"logps/chosen": -30.226680755615234,
"logps/rejected": -55.334014892578125,
"loss": 0.0302,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04286186397075653,
"rewards/margins": 6.613791465759277,
"rewards/rejected": -6.570930004119873,
"step": 219
},
{
"epoch": 2.6074074074074076,
"grad_norm": 7.600745751196765,
"learning_rate": 1.6097479104361326e-07,
"logits/chosen": -0.38003548979759216,
"logits/rejected": -0.20779910683631897,
"logps/chosen": -26.411277770996094,
"logps/rejected": -65.45418548583984,
"loss": 0.0318,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08172351866960526,
"rewards/margins": 7.309746742248535,
"rewards/rejected": -7.228023052215576,
"step": 220
},
{
"epoch": 2.6192592592592594,
"grad_norm": 5.529941543826629,
"learning_rate": 1.5854947499743413e-07,
"logits/chosen": -0.2649455666542053,
"logits/rejected": -0.1322605013847351,
"logps/chosen": -28.605796813964844,
"logps/rejected": -67.70567321777344,
"loss": 0.0316,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8953390717506409,
"rewards/margins": 7.937841892242432,
"rewards/rejected": -8.83318042755127,
"step": 221
},
{
"epoch": 2.631111111111111,
"grad_norm": 7.828964734675065,
"learning_rate": 1.5613405514401757e-07,
"logits/chosen": -0.4999098479747772,
"logits/rejected": -0.46459028124809265,
"logps/chosen": -32.34528350830078,
"logps/rejected": -64.39063262939453,
"loss": 0.0417,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7131352424621582,
"rewards/margins": 7.366223335266113,
"rewards/rejected": -9.07935905456543,
"step": 222
},
{
"epoch": 2.642962962962963,
"grad_norm": 6.684903510896331,
"learning_rate": 1.537287928647002e-07,
"logits/chosen": -0.33326905965805054,
"logits/rejected": -0.2772333025932312,
"logps/chosen": -33.04732894897461,
"logps/rejected": -55.9017448425293,
"loss": 0.034,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4501342475414276,
"rewards/margins": 6.482940196990967,
"rewards/rejected": -6.933073997497559,
"step": 223
},
{
"epoch": 2.6548148148148147,
"grad_norm": 7.774671778619875,
"learning_rate": 1.513339484416309e-07,
"logits/chosen": -0.350558876991272,
"logits/rejected": -0.28618156909942627,
"logps/chosen": -49.11450958251953,
"logps/rejected": -80.40065002441406,
"loss": 0.0421,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9198662042617798,
"rewards/margins": 8.049423217773438,
"rewards/rejected": -9.969290733337402,
"step": 224
},
{
"epoch": 2.6666666666666665,
"grad_norm": 6.147372232877328,
"learning_rate": 1.489497810296046e-07,
"logits/chosen": -0.2636696696281433,
"logits/rejected": -0.1582798808813095,
"logps/chosen": -33.32222366333008,
"logps/rejected": -88.90251159667969,
"loss": 0.0347,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8391042947769165,
"rewards/margins": 9.376372337341309,
"rewards/rejected": -10.215476036071777,
"step": 225
},
{
"epoch": 2.6785185185185183,
"grad_norm": 6.058104127594901,
"learning_rate": 1.4657654862801797e-07,
"logits/chosen": -0.3205685317516327,
"logits/rejected": -0.24160242080688477,
"logps/chosen": -28.52737045288086,
"logps/rejected": -70.28912353515625,
"loss": 0.0233,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.24154254794120789,
"rewards/margins": 6.4893083572387695,
"rewards/rejected": -6.730850696563721,
"step": 226
},
{
"epoch": 2.6903703703703705,
"grad_norm": 10.148811706386386,
"learning_rate": 1.4421450805295082e-07,
"logits/chosen": -0.33272168040275574,
"logits/rejected": -0.3017561733722687,
"logps/chosen": -40.76533889770508,
"logps/rejected": -58.748046875,
"loss": 0.0433,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.23764045536518097,
"rewards/margins": 5.5487141609191895,
"rewards/rejected": -5.7863545417785645,
"step": 227
},
{
"epoch": 2.7022222222222223,
"grad_norm": 7.063093696385122,
"learning_rate": 1.418639149093748e-07,
"logits/chosen": -0.5206415057182312,
"logits/rejected": -0.42945098876953125,
"logps/chosen": -35.94019317626953,
"logps/rejected": -51.362579345703125,
"loss": 0.0385,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5331210494041443,
"rewards/margins": 5.205883502960205,
"rewards/rejected": -5.739004611968994,
"step": 228
},
{
"epoch": 2.714074074074074,
"grad_norm": 7.873488891465637,
"learning_rate": 1.3952502356349323e-07,
"logits/chosen": -0.2090906947851181,
"logits/rejected": -0.116541787981987,
"logps/chosen": -36.972110748291016,
"logps/rejected": -70.80089569091797,
"loss": 0.0445,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16982126235961914,
"rewards/margins": 8.14144515991211,
"rewards/rejected": -8.311266899108887,
"step": 229
},
{
"epoch": 2.725925925925926,
"grad_norm": 6.247579493903217,
"learning_rate": 1.371980871152157e-07,
"logits/chosen": -0.14566001296043396,
"logits/rejected": -0.19308951497077942,
"logps/chosen": -41.63805389404297,
"logps/rejected": -75.58000183105469,
"loss": 0.0357,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.45496970415115356,
"rewards/margins": 8.866785049438477,
"rewards/rejected": -9.321755409240723,
"step": 230
},
{
"epoch": 2.7377777777777776,
"grad_norm": 6.023624860692066,
"learning_rate": 1.3488335737076911e-07,
"logits/chosen": -0.24541382491588593,
"logits/rejected": -0.2724686861038208,
"logps/chosen": -33.94440841674805,
"logps/rejected": -54.19181823730469,
"loss": 0.0349,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.641716718673706,
"rewards/margins": 6.787499904632568,
"rewards/rejected": -7.4292168617248535,
"step": 231
},
{
"epoch": 2.74962962962963,
"grad_norm": 6.829922502232146,
"learning_rate": 1.3258108481544847e-07,
"logits/chosen": -0.2750440537929535,
"logits/rejected": -0.2242782562971115,
"logps/chosen": -47.98163986206055,
"logps/rejected": -71.17113494873047,
"loss": 0.0381,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.330321192741394,
"rewards/margins": 5.847900390625,
"rewards/rejected": -7.178221702575684,
"step": 232
},
{
"epoch": 2.7614814814814816,
"grad_norm": 9.939842224750613,
"learning_rate": 1.3029151858651143e-07,
"logits/chosen": -0.4768088757991791,
"logits/rejected": -0.3678171634674072,
"logps/chosen": -28.488832473754883,
"logps/rejected": -72.2415542602539,
"loss": 0.0563,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4457303285598755,
"rewards/margins": 7.388967514038086,
"rewards/rejected": -7.834697723388672,
"step": 233
},
{
"epoch": 2.7733333333333334,
"grad_norm": 5.772137373566735,
"learning_rate": 1.2801490644621788e-07,
"logits/chosen": -0.10860705375671387,
"logits/rejected": -0.09419623762369156,
"logps/chosen": -41.14183807373047,
"logps/rejected": -73.03929138183594,
"loss": 0.0334,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0385979413986206,
"rewards/margins": 9.18422794342041,
"rewards/rejected": -10.222824096679688,
"step": 234
},
{
"epoch": 2.785185185185185,
"grad_norm": 6.872437338157428,
"learning_rate": 1.257514947550189e-07,
"logits/chosen": -0.32558369636535645,
"logits/rejected": -0.23454414308071136,
"logps/chosen": -27.442285537719727,
"logps/rejected": -47.84419250488281,
"loss": 0.0369,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5560548305511475,
"rewards/margins": 5.780346870422363,
"rewards/rejected": -6.336400985717773,
"step": 235
},
{
"epoch": 2.797037037037037,
"grad_norm": 9.526509052512719,
"learning_rate": 1.2350152844489688e-07,
"logits/chosen": -0.3666895925998688,
"logits/rejected": -0.22046907246112823,
"logps/chosen": -38.18906021118164,
"logps/rejected": -68.36451721191406,
"loss": 0.0521,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.1897006034851074,
"rewards/margins": 6.763792991638184,
"rewards/rejected": -7.953493118286133,
"step": 236
},
{
"epoch": 2.8088888888888888,
"grad_norm": 6.2325582014169205,
"learning_rate": 1.2126525099286108e-07,
"logits/chosen": -0.3752056956291199,
"logits/rejected": -0.29132628440856934,
"logps/chosen": -39.4105110168457,
"logps/rejected": -70.36639404296875,
"loss": 0.0344,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.190868854522705,
"rewards/margins": 7.782485008239746,
"rewards/rejected": -8.973353385925293,
"step": 237
},
{
"epoch": 2.8207407407407405,
"grad_norm": 6.813356776563027,
"learning_rate": 1.1904290439459971e-07,
"logits/chosen": -0.4055876135826111,
"logits/rejected": -0.34407296776771545,
"logps/chosen": -36.34687805175781,
"logps/rejected": -64.90667724609375,
"loss": 0.029,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6599448323249817,
"rewards/margins": 7.470705509185791,
"rewards/rejected": -8.130650520324707,
"step": 238
},
{
"epoch": 2.8325925925925928,
"grad_norm": 8.170401197759237,
"learning_rate": 1.1683472913829284e-07,
"logits/chosen": -0.20514726638793945,
"logits/rejected": -0.11336632817983627,
"logps/chosen": -51.73873519897461,
"logps/rejected": -71.9918441772461,
"loss": 0.0379,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8510772585868835,
"rewards/margins": 6.703639507293701,
"rewards/rejected": -7.554717063903809,
"step": 239
},
{
"epoch": 2.8444444444444446,
"grad_norm": 8.191853793447038,
"learning_rate": 1.146409641785882e-07,
"logits/chosen": -0.16900832951068878,
"logits/rejected": -0.16538989543914795,
"logps/chosen": -44.017845153808594,
"logps/rejected": -54.396583557128906,
"loss": 0.0415,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.656957983970642,
"rewards/margins": 4.629300117492676,
"rewards/rejected": -6.286258697509766,
"step": 240
},
{
"epoch": 2.8562962962962963,
"grad_norm": 7.381077568622568,
"learning_rate": 1.1246184691074314e-07,
"logits/chosen": -0.24956950545310974,
"logits/rejected": -0.24476227164268494,
"logps/chosen": -42.77149963378906,
"logps/rejected": -79.72845458984375,
"loss": 0.0398,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6535344123840332,
"rewards/margins": 9.295120239257812,
"rewards/rejected": -9.948655128479004,
"step": 241
},
{
"epoch": 2.868148148148148,
"grad_norm": 8.160459964737315,
"learning_rate": 1.1029761314493518e-07,
"logits/chosen": -0.30951178073883057,
"logits/rejected": -0.3085937201976776,
"logps/chosen": -40.81586456298828,
"logps/rejected": -60.63238525390625,
"loss": 0.0367,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3215899467468262,
"rewards/margins": 6.157529830932617,
"rewards/rejected": -7.479120254516602,
"step": 242
},
{
"epoch": 2.88,
"grad_norm": 8.037040216975743,
"learning_rate": 1.0814849708074414e-07,
"logits/chosen": -0.30921998620033264,
"logits/rejected": -0.3065870404243469,
"logps/chosen": -51.514556884765625,
"logps/rejected": -66.55127716064453,
"loss": 0.0324,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17993119359016418,
"rewards/margins": 6.790800094604492,
"rewards/rejected": -6.970730781555176,
"step": 243
},
{
"epoch": 2.891851851851852,
"grad_norm": 9.656051990182503,
"learning_rate": 1.0601473128180854e-07,
"logits/chosen": -0.2805531620979309,
"logits/rejected": -0.2533468008041382,
"logps/chosen": -46.80023193359375,
"logps/rejected": -68.6700668334961,
"loss": 0.0383,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8430509567260742,
"rewards/margins": 8.124579429626465,
"rewards/rejected": -8.967630386352539,
"step": 244
},
{
"epoch": 2.9037037037037035,
"grad_norm": 8.425061421071412,
"learning_rate": 1.0389654665065908e-07,
"logits/chosen": -0.2665305435657501,
"logits/rejected": -0.30099448561668396,
"logps/chosen": -36.76901626586914,
"logps/rejected": -62.380638122558594,
"loss": 0.0436,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7122980952262878,
"rewards/margins": 7.467397689819336,
"rewards/rejected": -8.179695129394531,
"step": 245
},
{
"epoch": 2.9155555555555557,
"grad_norm": 7.412242548453604,
"learning_rate": 1.0179417240373182e-07,
"logits/chosen": -0.32122743129730225,
"logits/rejected": -0.22737433016300201,
"logps/chosen": -52.4939079284668,
"logps/rejected": -86.98854064941406,
"loss": 0.0332,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.623575210571289,
"rewards/margins": 8.667156219482422,
"rewards/rejected": -11.290731430053711,
"step": 246
},
{
"epoch": 2.9274074074074075,
"grad_norm": 6.343263474138892,
"learning_rate": 9.970783604656383e-08,
"logits/chosen": -0.40669649839401245,
"logits/rejected": -0.2913494408130646,
"logps/chosen": -40.742733001708984,
"logps/rejected": -70.4676513671875,
"loss": 0.0345,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4790029525756836,
"rewards/margins": 7.971775054931641,
"rewards/rejected": -8.450777053833008,
"step": 247
},
{
"epoch": 2.9392592592592592,
"grad_norm": 8.350168639074793,
"learning_rate": 9.763776334917398e-08,
"logits/chosen": -0.28063714504241943,
"logits/rejected": -0.2614033818244934,
"logps/chosen": -39.683170318603516,
"logps/rejected": -56.151100158691406,
"loss": 0.0442,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.2253810167312622,
"rewards/margins": 5.419277667999268,
"rewards/rejected": -6.644659042358398,
"step": 248
},
{
"epoch": 2.951111111111111,
"grad_norm": 6.1204430667008864,
"learning_rate": 9.558417832163162e-08,
"logits/chosen": -0.13153290748596191,
"logits/rejected": -0.19138801097869873,
"logps/chosen": -38.79569625854492,
"logps/rejected": -55.324180603027344,
"loss": 0.0339,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.37145036458969116,
"rewards/margins": 6.06140661239624,
"rewards/rejected": -6.432857513427734,
"step": 249
},
{
"epoch": 2.962962962962963,
"grad_norm": 7.267482062373579,
"learning_rate": 9.354730318981561e-08,
"logits/chosen": -0.4541955590248108,
"logits/rejected": -0.3866829574108124,
"logps/chosen": -31.26105308532715,
"logps/rejected": -66.00698852539062,
"loss": 0.0438,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9516152739524841,
"rewards/margins": 7.78145170211792,
"rewards/rejected": -8.73306655883789,
"step": 250
},
{
"epoch": 2.974814814814815,
"grad_norm": 5.662458264963396,
"learning_rate": 9.15273583713663e-08,
"logits/chosen": -0.37392503023147583,
"logits/rejected": -0.29210343956947327,
"logps/chosen": -47.48450469970703,
"logps/rejected": -90.92308044433594,
"loss": 0.0351,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.219913959503174,
"rewards/margins": 12.272329330444336,
"rewards/rejected": -14.492244720458984,
"step": 251
},
{
"epoch": 2.986666666666667,
"grad_norm": 5.97436680259246,
"learning_rate": 8.95245624518336e-08,
"logits/chosen": -0.3016185760498047,
"logits/rejected": -0.2964284420013428,
"logps/chosen": -34.3846321105957,
"logps/rejected": -68.51025390625,
"loss": 0.0343,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7737685441970825,
"rewards/margins": 7.51839017868042,
"rewards/rejected": -8.292159080505371,
"step": 252
},
{
"epoch": 2.9985185185185186,
"grad_norm": 6.809722463225596,
"learning_rate": 8.753913216102285e-08,
"logits/chosen": -0.26927924156188965,
"logits/rejected": -0.09772679954767227,
"logps/chosen": -39.219181060791016,
"logps/rejected": -83.87496185302734,
"loss": 0.0378,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.874872088432312,
"rewards/margins": 7.590030193328857,
"rewards/rejected": -9.464902877807617,
"step": 253
},
{
"epoch": 3.0103703703703704,
"grad_norm": 5.7996860529913565,
"learning_rate": 8.557128234954189e-08,
"logits/chosen": -0.40512141585350037,
"logits/rejected": -0.3163852393627167,
"logps/chosen": -29.341394424438477,
"logps/rejected": -70.40370178222656,
"loss": 0.0283,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9042503833770752,
"rewards/margins": 8.99386978149414,
"rewards/rejected": -9.898119926452637,
"step": 254
},
{
"epoch": 3.022222222222222,
"grad_norm": 5.286377326614028,
"learning_rate": 8.362122596555088e-08,
"logits/chosen": -0.47114288806915283,
"logits/rejected": -0.3825288712978363,
"logps/chosen": -33.60106658935547,
"logps/rejected": -76.53120422363281,
"loss": 0.0282,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.207690954208374,
"rewards/margins": 10.646196365356445,
"rewards/rejected": -11.853886604309082,
"step": 255
},
{
"epoch": 3.034074074074074,
"grad_norm": 6.253088250116567,
"learning_rate": 8.16891740317189e-08,
"logits/chosen": -0.3761712908744812,
"logits/rejected": -0.35195398330688477,
"logps/chosen": -33.14177322387695,
"logps/rejected": -55.4194450378418,
"loss": 0.0385,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6758537888526917,
"rewards/margins": 6.090365409851074,
"rewards/rejected": -6.766219615936279,
"step": 256
},
{
"epoch": 3.0459259259259257,
"grad_norm": 6.986196206894695,
"learning_rate": 7.977533562238838e-08,
"logits/chosen": -0.4037611186504364,
"logits/rejected": -0.3634166419506073,
"logps/chosen": -34.40519332885742,
"logps/rejected": -70.6146240234375,
"loss": 0.0372,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2543308138847351,
"rewards/margins": 7.034722328186035,
"rewards/rejected": -7.289052963256836,
"step": 257
},
{
"epoch": 3.057777777777778,
"grad_norm": 5.505207911608369,
"learning_rate": 7.787991784094999e-08,
"logits/chosen": -0.2387389987707138,
"logits/rejected": -0.08904880285263062,
"logps/chosen": -36.03857421875,
"logps/rejected": -89.65563201904297,
"loss": 0.0278,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6398571133613586,
"rewards/margins": 8.411537170410156,
"rewards/rejected": -9.051393508911133,
"step": 258
},
{
"epoch": 3.0696296296296297,
"grad_norm": 6.3788891492475415,
"learning_rate": 7.60031257974316e-08,
"logits/chosen": -0.36758318543434143,
"logits/rejected": -0.28655973076820374,
"logps/chosen": -34.49348831176758,
"logps/rejected": -75.51551818847656,
"loss": 0.0368,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3124196529388428,
"rewards/margins": 9.141406059265137,
"rewards/rejected": -10.453825950622559,
"step": 259
},
{
"epoch": 3.0814814814814815,
"grad_norm": 7.213670504047207,
"learning_rate": 7.414516258630244e-08,
"logits/chosen": -0.2934122681617737,
"logits/rejected": -0.2819547653198242,
"logps/chosen": -49.67085647583008,
"logps/rejected": -82.62693786621094,
"loss": 0.0342,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0997469425201416,
"rewards/margins": 9.53528118133545,
"rewards/rejected": -10.635027885437012,
"step": 260
},
{
"epoch": 3.0933333333333333,
"grad_norm": 5.692432592478289,
"learning_rate": 7.230622926449564e-08,
"logits/chosen": -0.2739347219467163,
"logits/rejected": -0.20775115489959717,
"logps/chosen": -37.56914520263672,
"logps/rejected": -65.73677062988281,
"loss": 0.0333,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4430259466171265,
"rewards/margins": 8.155195236206055,
"rewards/rejected": -9.598221778869629,
"step": 261
},
{
"epoch": 3.105185185185185,
"grad_norm": 7.076581419281233,
"learning_rate": 7.048652482965078e-08,
"logits/chosen": -0.18068230152130127,
"logits/rejected": -0.1852649450302124,
"logps/chosen": -46.34651184082031,
"logps/rejected": -68.5006103515625,
"loss": 0.037,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1328610181808472,
"rewards/margins": 7.682218551635742,
"rewards/rejected": -8.815078735351562,
"step": 262
},
{
"epoch": 3.117037037037037,
"grad_norm": 6.338236077273577,
"learning_rate": 6.868624619858021e-08,
"logits/chosen": -0.2783002257347107,
"logits/rejected": -0.3345209062099457,
"logps/chosen": -36.772254943847656,
"logps/rejected": -83.5664291381836,
"loss": 0.0277,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4274479150772095,
"rewards/margins": 7.226326942443848,
"rewards/rejected": -7.653774261474609,
"step": 263
},
{
"epoch": 3.128888888888889,
"grad_norm": 5.495669924723658,
"learning_rate": 6.690558818595943e-08,
"logits/chosen": -0.34206265211105347,
"logits/rejected": -0.19553421437740326,
"logps/chosen": -32.38424301147461,
"logps/rejected": -85.89747619628906,
"loss": 0.0276,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1448084115982056,
"rewards/margins": 9.975635528564453,
"rewards/rejected": -11.120444297790527,
"step": 264
},
{
"epoch": 3.140740740740741,
"grad_norm": 5.0666203766090385,
"learning_rate": 6.514474348324581e-08,
"logits/chosen": -0.38811901211738586,
"logits/rejected": -0.2747833728790283,
"logps/chosen": -48.448951721191406,
"logps/rejected": -77.10838317871094,
"loss": 0.028,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6878550052642822,
"rewards/margins": 7.391786575317383,
"rewards/rejected": -9.079641342163086,
"step": 265
},
{
"epoch": 3.1525925925925926,
"grad_norm": 8.885007920801026,
"learning_rate": 6.340390263782655e-08,
"logits/chosen": -0.5093058347702026,
"logits/rejected": -0.3832343816757202,
"logps/chosen": -32.23210144042969,
"logps/rejected": -76.0536117553711,
"loss": 0.047,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.038543462753296,
"rewards/margins": 9.122876167297363,
"rewards/rejected": -10.161420822143555,
"step": 266
},
{
"epoch": 3.1644444444444444,
"grad_norm": 7.984860081690398,
"learning_rate": 6.168325403239913e-08,
"logits/chosen": -0.4433887004852295,
"logits/rejected": -0.3792242109775543,
"logps/chosen": -29.287479400634766,
"logps/rejected": -58.2768669128418,
"loss": 0.0387,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.46901828050613403,
"rewards/margins": 7.602441787719727,
"rewards/rejected": -8.071460723876953,
"step": 267
},
{
"epoch": 3.176296296296296,
"grad_norm": 4.958032867566694,
"learning_rate": 5.998298386458545e-08,
"logits/chosen": -0.22974154353141785,
"logits/rejected": -0.19992095232009888,
"logps/chosen": -38.70039367675781,
"logps/rejected": -72.84990692138672,
"loss": 0.0231,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0671855211257935,
"rewards/margins": 7.9272379875183105,
"rewards/rejected": -8.994423866271973,
"step": 268
},
{
"epoch": 3.188148148148148,
"grad_norm": 5.988243698878292,
"learning_rate": 5.830327612678265e-08,
"logits/chosen": -0.32180365920066833,
"logits/rejected": -0.3045603632926941,
"logps/chosen": -43.980316162109375,
"logps/rejected": -83.64070129394531,
"loss": 0.0285,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.710637331008911,
"rewards/margins": 9.31506061553955,
"rewards/rejected": -12.025696754455566,
"step": 269
},
{
"epoch": 3.2,
"grad_norm": 5.602823035809897,
"learning_rate": 5.6644312586253044e-08,
"logits/chosen": -0.0014043133705854416,
"logits/rejected": -0.04007536917924881,
"logps/chosen": -63.08719253540039,
"logps/rejected": -80.3905029296875,
"loss": 0.0292,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9075740575790405,
"rewards/margins": 7.626714706420898,
"rewards/rejected": -9.53428840637207,
"step": 270
},
{
"epoch": 3.211851851851852,
"grad_norm": 7.326638872216747,
"learning_rate": 5.5006272765454056e-08,
"logits/chosen": -0.43287378549575806,
"logits/rejected": -0.27151188254356384,
"logps/chosen": -33.51972579956055,
"logps/rejected": -58.82566452026367,
"loss": 0.0425,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.519016444683075,
"rewards/margins": 6.466248512268066,
"rewards/rejected": -6.985265731811523,
"step": 271
},
{
"epoch": 3.2237037037037037,
"grad_norm": 4.755215217001098,
"learning_rate": 5.338933392261158e-08,
"logits/chosen": -0.2298121452331543,
"logits/rejected": -0.17249788343906403,
"logps/chosen": -37.40292739868164,
"logps/rejected": -64.0435562133789,
"loss": 0.0245,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.865021288394928,
"rewards/margins": 6.909914016723633,
"rewards/rejected": -7.774934768676758,
"step": 272
},
{
"epoch": 3.2355555555555555,
"grad_norm": 7.328735036408531,
"learning_rate": 5.1793671032538206e-08,
"logits/chosen": -0.5466493368148804,
"logits/rejected": -0.49364450573921204,
"logps/chosen": -31.749622344970703,
"logps/rejected": -76.33462524414062,
"loss": 0.0351,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5585615634918213,
"rewards/margins": 8.347299575805664,
"rewards/rejected": -8.905860900878906,
"step": 273
},
{
"epoch": 3.2474074074074073,
"grad_norm": 4.866161229053689,
"learning_rate": 5.021945676769859e-08,
"logits/chosen": -0.5478118658065796,
"logits/rejected": -0.38123688101768494,
"logps/chosen": -26.91775131225586,
"logps/rejected": -66.9200668334961,
"loss": 0.0256,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.39309221506118774,
"rewards/margins": 8.147336959838867,
"rewards/rejected": -8.540430068969727,
"step": 274
},
{
"epoch": 3.259259259259259,
"grad_norm": 5.465123706910723,
"learning_rate": 4.866686147952387e-08,
"logits/chosen": -0.15793977677822113,
"logits/rejected": -0.13396653532981873,
"logps/chosen": -38.39078140258789,
"logps/rejected": -68.49137115478516,
"loss": 0.0332,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5564990639686584,
"rewards/margins": 7.465914726257324,
"rewards/rejected": -8.022414207458496,
"step": 275
},
{
"epoch": 3.2711111111111113,
"grad_norm": 6.892669002618101,
"learning_rate": 4.71360531799774e-08,
"logits/chosen": -0.17291544377803802,
"logits/rejected": -0.11444761604070663,
"logps/chosen": -52.285491943359375,
"logps/rejected": -84.1561508178711,
"loss": 0.0411,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.8530375957489014,
"rewards/margins": 7.457571983337402,
"rewards/rejected": -10.310609817504883,
"step": 276
},
{
"epoch": 3.282962962962963,
"grad_norm": 4.92957758138627,
"learning_rate": 4.562719752337349e-08,
"logits/chosen": -0.47689568996429443,
"logits/rejected": -0.38014689087867737,
"logps/chosen": -51.07635498046875,
"logps/rejected": -94.97547149658203,
"loss": 0.0246,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.258345603942871,
"rewards/margins": 8.710776329040527,
"rewards/rejected": -10.969121932983398,
"step": 277
},
{
"epoch": 3.294814814814815,
"grad_norm": 5.454251196984929,
"learning_rate": 4.4140457788451434e-08,
"logits/chosen": -0.3425113260746002,
"logits/rejected": -0.25056517124176025,
"logps/chosen": -31.684978485107422,
"logps/rejected": -69.71234130859375,
"loss": 0.0282,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.24049748480319977,
"rewards/margins": 7.845020771026611,
"rewards/rejected": -8.085518836975098,
"step": 278
},
{
"epoch": 3.3066666666666666,
"grad_norm": 5.883268469101933,
"learning_rate": 4.267599486070647e-08,
"logits/chosen": -0.172508105635643,
"logits/rejected": -0.20862886309623718,
"logps/chosen": -39.89122772216797,
"logps/rejected": -52.314510345458984,
"loss": 0.0292,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3147379755973816,
"rewards/margins": 5.7033185958862305,
"rewards/rejected": -6.0180559158325195,
"step": 279
},
{
"epoch": 3.3185185185185184,
"grad_norm": 6.523804321940008,
"learning_rate": 4.1233967214979764e-08,
"logits/chosen": -0.3144129812717438,
"logits/rejected": -0.22518262267112732,
"logps/chosen": -42.5799560546875,
"logps/rejected": -53.63530349731445,
"loss": 0.0332,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.022313117980957,
"rewards/margins": 4.446239471435547,
"rewards/rejected": -5.468553066253662,
"step": 280
},
{
"epoch": 3.33037037037037,
"grad_norm": 5.61217781900571,
"learning_rate": 3.9814530898309356e-08,
"logits/chosen": -0.2805265784263611,
"logits/rejected": -0.17468589544296265,
"logps/chosen": -36.89150619506836,
"logps/rejected": -73.11276245117188,
"loss": 0.0278,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6258598566055298,
"rewards/margins": 8.552447319030762,
"rewards/rejected": -9.17830753326416,
"step": 281
},
{
"epoch": 3.3422222222222224,
"grad_norm": 5.921953356785593,
"learning_rate": 3.8417839513043646e-08,
"logits/chosen": -0.28441232442855835,
"logits/rejected": -0.20493623614311218,
"logps/chosen": -41.19779586791992,
"logps/rejected": -60.01893615722656,
"loss": 0.0339,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5952982902526855,
"rewards/margins": 5.5062713623046875,
"rewards/rejected": -7.101569652557373,
"step": 282
},
{
"epoch": 3.354074074074074,
"grad_norm": 5.347638108561962,
"learning_rate": 3.704404420021956e-08,
"logits/chosen": -0.3048914670944214,
"logits/rejected": -0.17332229018211365,
"logps/chosen": -33.9517936706543,
"logps/rejected": -71.91447448730469,
"loss": 0.0351,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8167021870613098,
"rewards/margins": 8.507079124450684,
"rewards/rejected": -9.32378101348877,
"step": 283
},
{
"epoch": 3.365925925925926,
"grad_norm": 5.238374490213889,
"learning_rate": 3.569329362320708e-08,
"logits/chosen": -0.2728411853313446,
"logits/rejected": -0.2584533393383026,
"logps/chosen": -30.238557815551758,
"logps/rejected": -75.02142333984375,
"loss": 0.0269,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.49139711260795593,
"rewards/margins": 9.32203197479248,
"rewards/rejected": -9.813429832458496,
"step": 284
},
{
"epoch": 3.3777777777777778,
"grad_norm": 6.525469512599618,
"learning_rate": 3.436573395162179e-08,
"logits/chosen": -0.3524834215641022,
"logits/rejected": -0.30102020502090454,
"logps/chosen": -30.73918914794922,
"logps/rejected": -59.49354553222656,
"loss": 0.0396,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.09267859905958176,
"rewards/margins": 6.891367435455322,
"rewards/rejected": -6.984046459197998,
"step": 285
},
{
"epoch": 3.3896296296296295,
"grad_norm": 5.564012623506871,
"learning_rate": 3.306150884550732e-08,
"logits/chosen": -0.3768519461154938,
"logits/rejected": -0.3264191448688507,
"logps/chosen": -41.36799240112305,
"logps/rejected": -67.31135559082031,
"loss": 0.0329,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0806761980056763,
"rewards/margins": 7.03513240814209,
"rewards/rejected": -8.115808486938477,
"step": 286
},
{
"epoch": 3.4014814814814813,
"grad_norm": 5.931359172922728,
"learning_rate": 3.17807594397895e-08,
"logits/chosen": -0.3008241653442383,
"logits/rejected": -0.1854274868965149,
"logps/chosen": -35.424800872802734,
"logps/rejected": -75.77165222167969,
"loss": 0.0242,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3784024715423584,
"rewards/margins": 9.62315845489502,
"rewards/rejected": -11.00156021118164,
"step": 287
},
{
"epoch": 3.413333333333333,
"grad_norm": 5.696220945908767,
"learning_rate": 3.052362432900332e-08,
"logits/chosen": -0.4203820526599884,
"logits/rejected": -0.3877807557582855,
"logps/chosen": -37.054630279541016,
"logps/rejected": -61.031986236572266,
"loss": 0.0281,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9897009134292603,
"rewards/margins": 7.574882984161377,
"rewards/rejected": -8.564583778381348,
"step": 288
},
{
"epoch": 3.4251851851851853,
"grad_norm": 5.040781372610454,
"learning_rate": 2.9290239552295538e-08,
"logits/chosen": -0.04420602694153786,
"logits/rejected": -0.0779787227511406,
"logps/chosen": -49.03828811645508,
"logps/rejected": -64.86919403076172,
"loss": 0.0233,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1655118465423584,
"rewards/margins": 7.464972496032715,
"rewards/rejected": -8.630484580993652,
"step": 289
},
{
"epoch": 3.437037037037037,
"grad_norm": 6.3238698051370825,
"learning_rate": 2.8080738578703052e-08,
"logits/chosen": -0.2396240085363388,
"logits/rejected": -0.18526090681552887,
"logps/chosen": -35.554222106933594,
"logps/rejected": -80.34398651123047,
"loss": 0.0385,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.15909190475940704,
"rewards/margins": 11.187956809997559,
"rewards/rejected": -11.347049713134766,
"step": 290
},
{
"epoch": 3.448888888888889,
"grad_norm": 6.096416193339527,
"learning_rate": 2.6895252292709974e-08,
"logits/chosen": -0.3143896460533142,
"logits/rejected": -0.2961388826370239,
"logps/chosen": -45.88547897338867,
"logps/rejected": -72.7055892944336,
"loss": 0.0357,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5558788776397705,
"rewards/margins": 7.8544135093688965,
"rewards/rejected": -9.41029167175293,
"step": 291
},
{
"epoch": 3.4607407407407407,
"grad_norm": 6.109539533433742,
"learning_rate": 2.5733908980083984e-08,
"logits/chosen": -0.20717650651931763,
"logits/rejected": -0.1371382474899292,
"logps/chosen": -34.01182556152344,
"logps/rejected": -69.25910949707031,
"loss": 0.0275,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.003832221031189,
"rewards/margins": 6.742988586425781,
"rewards/rejected": -7.74682092666626,
"step": 292
},
{
"epoch": 3.4725925925925925,
"grad_norm": 5.397666624928578,
"learning_rate": 2.4596834313994037e-08,
"logits/chosen": -0.2090422511100769,
"logits/rejected": -0.1905803680419922,
"logps/chosen": -37.257659912109375,
"logps/rejected": -59.028656005859375,
"loss": 0.0267,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.009922012686729431,
"rewards/margins": 7.401028633117676,
"rewards/rejected": -7.410951614379883,
"step": 293
},
{
"epoch": 3.4844444444444447,
"grad_norm": 4.277274237028717,
"learning_rate": 2.3484151341411018e-08,
"logits/chosen": -0.28495097160339355,
"logits/rejected": -0.13665924966335297,
"logps/chosen": -30.814395904541016,
"logps/rejected": -71.38455963134766,
"loss": 0.0167,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.883634090423584,
"rewards/margins": 8.096288681030273,
"rewards/rejected": -8.979923248291016,
"step": 294
},
{
"epoch": 3.4962962962962965,
"grad_norm": 5.969698336918465,
"learning_rate": 2.23959804697921e-08,
"logits/chosen": 0.0012427568435668945,
"logits/rejected": -0.05258895084261894,
"logps/chosen": -43.88676071166992,
"logps/rejected": -69.91567993164062,
"loss": 0.0243,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0892465114593506,
"rewards/margins": 8.158609390258789,
"rewards/rejected": -9.247856140136719,
"step": 295
},
{
"epoch": 3.5081481481481482,
"grad_norm": 4.0211602970173095,
"learning_rate": 2.1332439454051277e-08,
"logits/chosen": -0.20002827048301697,
"logits/rejected": -0.1429349184036255,
"logps/chosen": -34.301002502441406,
"logps/rejected": -55.76853561401367,
"loss": 0.0213,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.09878039360046387,
"rewards/margins": 6.826313495635986,
"rewards/rejected": -6.925093650817871,
"step": 296
},
{
"epoch": 3.52,
"grad_norm": 5.481157431560272,
"learning_rate": 2.029364338381656e-08,
"logits/chosen": -0.38807621598243713,
"logits/rejected": -0.3801242709159851,
"logps/chosen": -46.397727966308594,
"logps/rejected": -55.0689582824707,
"loss": 0.0289,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3208569288253784,
"rewards/margins": 5.903377056121826,
"rewards/rejected": -6.224234580993652,
"step": 297
},
{
"epoch": 3.531851851851852,
"grad_norm": 5.352166370096892,
"learning_rate": 1.9279704670975726e-08,
"logits/chosen": -0.2355162501335144,
"logits/rejected": -0.07986889034509659,
"logps/chosen": -34.6320686340332,
"logps/rejected": -71.03166198730469,
"loss": 0.0233,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7058801651000977,
"rewards/margins": 7.573249816894531,
"rewards/rejected": -8.279129981994629,
"step": 298
},
{
"epoch": 3.5437037037037036,
"grad_norm": 5.786741095008315,
"learning_rate": 1.829073303751172e-08,
"logits/chosen": -0.3151942193508148,
"logits/rejected": -0.33904606103897095,
"logps/chosen": -29.33087921142578,
"logps/rejected": -61.94608688354492,
"loss": 0.0277,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2769862413406372,
"rewards/margins": 8.623825073242188,
"rewards/rejected": -9.900812149047852,
"step": 299
},
{
"epoch": 3.5555555555555554,
"grad_norm": 4.36518877931545,
"learning_rate": 1.732683550362954e-08,
"logits/chosen": -0.23127204179763794,
"logits/rejected": -0.15411251783370972,
"logps/chosen": -50.591552734375,
"logps/rejected": -77.29461669921875,
"loss": 0.0216,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3554518222808838,
"rewards/margins": 7.547949314117432,
"rewards/rejected": -8.903400421142578,
"step": 300
},
{
"epoch": 3.5674074074074076,
"grad_norm": 4.847754834407613,
"learning_rate": 1.6388116376174765e-08,
"logits/chosen": -0.3548241853713989,
"logits/rejected": -0.2721732556819916,
"logps/chosen": -38.101295471191406,
"logps/rejected": -78.74886322021484,
"loss": 0.0229,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.5179784297943115,
"rewards/margins": 8.753091812133789,
"rewards/rejected": -10.27107048034668,
"step": 301
},
{
"epoch": 3.5792592592592594,
"grad_norm": 7.781488916643506,
"learning_rate": 1.5474677237346468e-08,
"logits/chosen": -0.3061152994632721,
"logits/rejected": -0.3108983337879181,
"logps/chosen": -41.504512786865234,
"logps/rejected": -78.45973205566406,
"loss": 0.0451,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2482175827026367,
"rewards/margins": 8.759658813476562,
"rewards/rejected": -10.0078763961792,
"step": 302
},
{
"epoch": 3.591111111111111,
"grad_norm": 6.94980194588748,
"learning_rate": 1.4586616933704527e-08,
"logits/chosen": -0.018258891999721527,
"logits/rejected": -0.007322182413190603,
"logps/chosen": -54.701812744140625,
"logps/rejected": -73.30741882324219,
"loss": 0.0412,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3399360179901123,
"rewards/margins": 6.371276378631592,
"rewards/rejected": -7.711213111877441,
"step": 303
},
{
"epoch": 3.602962962962963,
"grad_norm": 6.423577076131564,
"learning_rate": 1.372403156547311e-08,
"logits/chosen": -0.43270713090896606,
"logits/rejected": -0.36236703395843506,
"logps/chosen": -33.90000534057617,
"logps/rejected": -60.915306091308594,
"loss": 0.0339,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.318174958229065,
"rewards/margins": 7.222588062286377,
"rewards/rejected": -8.540762901306152,
"step": 304
},
{
"epoch": 3.6148148148148147,
"grad_norm": 4.091095308871669,
"learning_rate": 1.2887014476141212e-08,
"logits/chosen": -0.30502232909202576,
"logits/rejected": -0.36349910497665405,
"logps/chosen": -40.1556510925293,
"logps/rejected": -68.93075561523438,
"loss": 0.0244,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.034249335527420044,
"rewards/margins": 9.958725929260254,
"rewards/rejected": -9.992975234985352,
"step": 305
},
{
"epoch": 3.626666666666667,
"grad_norm": 6.177303284695512,
"learning_rate": 1.2075656242361732e-08,
"logits/chosen": -0.2732085585594177,
"logits/rejected": -0.1902616024017334,
"logps/chosen": -34.206607818603516,
"logps/rejected": -68.56214904785156,
"loss": 0.0255,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0075308084487915,
"rewards/margins": 8.230666160583496,
"rewards/rejected": -9.238195419311523,
"step": 306
},
{
"epoch": 3.6385185185185183,
"grad_norm": 4.816450379471901,
"learning_rate": 1.1290044664149873e-08,
"logits/chosen": -0.11257211118936539,
"logits/rejected": -0.13122713565826416,
"logps/chosen": -48.688873291015625,
"logps/rejected": -74.23355102539062,
"loss": 0.021,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.2951655387878418,
"rewards/margins": 8.786190032958984,
"rewards/rejected": -10.081355094909668,
"step": 307
},
{
"epoch": 3.6503703703703705,
"grad_norm": 6.851539041964001,
"learning_rate": 1.0530264755381824e-08,
"logits/chosen": -0.4197385013103485,
"logits/rejected": -0.43813198804855347,
"logps/chosen": -33.74197769165039,
"logps/rejected": -56.243553161621094,
"loss": 0.0363,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6660629510879517,
"rewards/margins": 5.48460054397583,
"rewards/rejected": -6.150663375854492,
"step": 308
},
{
"epoch": 3.6622222222222223,
"grad_norm": 4.955677182707875,
"learning_rate": 9.796398734595284e-09,
"logits/chosen": -0.26788032054901123,
"logits/rejected": -0.26230883598327637,
"logps/chosen": -28.0145206451416,
"logps/rejected": -51.17897033691406,
"loss": 0.0243,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11930375546216965,
"rewards/margins": 5.247139930725098,
"rewards/rejected": -5.366443634033203,
"step": 309
},
{
"epoch": 3.674074074074074,
"grad_norm": 5.676351317770164,
"learning_rate": 9.088526016092141e-09,
"logits/chosen": -0.3300250470638275,
"logits/rejected": -0.31215977668762207,
"logps/chosen": -33.89133834838867,
"logps/rejected": -67.8143539428711,
"loss": 0.033,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.08850759267807007,
"rewards/margins": 9.25622272491455,
"rewards/rejected": -9.344730377197266,
"step": 310
},
{
"epoch": 3.685925925925926,
"grad_norm": 5.070558881430833,
"learning_rate": 8.40672320134489e-09,
"logits/chosen": -0.305147260427475,
"logits/rejected": -0.17320549488067627,
"logps/chosen": -33.947872161865234,
"logps/rejected": -74.02790832519531,
"loss": 0.0242,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0814896896481514,
"rewards/margins": 9.62078857421875,
"rewards/rejected": -9.539299011230469,
"step": 311
},
{
"epoch": 3.6977777777777776,
"grad_norm": 4.665245148624672,
"learning_rate": 7.751064070707247e-09,
"logits/chosen": -0.44625863432884216,
"logits/rejected": -0.4555772542953491,
"logps/chosen": -42.39094161987305,
"logps/rejected": -67.60736846923828,
"loss": 0.0215,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6547796130180359,
"rewards/margins": 7.189840793609619,
"rewards/rejected": -7.844620227813721,
"step": 312
},
{
"epoch": 3.70962962962963,
"grad_norm": 7.083642027360033,
"learning_rate": 7.12161957543006e-09,
"logits/chosen": -0.251234769821167,
"logits/rejected": -0.13808919489383698,
"logps/chosen": -54.0128288269043,
"logps/rejected": -92.3724365234375,
"loss": 0.035,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6177242994308472,
"rewards/margins": 8.311200141906738,
"rewards/rejected": -9.928923606872559,
"step": 313
},
{
"epoch": 3.7214814814814816,
"grad_norm": 4.713181664530663,
"learning_rate": 6.518457829983559e-09,
"logits/chosen": -0.3703707158565521,
"logits/rejected": -0.2648147940635681,
"logps/chosen": -48.93006896972656,
"logps/rejected": -66.18186950683594,
"loss": 0.0258,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7085801362991333,
"rewards/margins": 5.302105903625488,
"rewards/rejected": -6.010685920715332,
"step": 314
},
{
"epoch": 3.7333333333333334,
"grad_norm": 4.802422252346654,
"learning_rate": 5.9416441046862555e-09,
"logits/chosen": -0.3589542508125305,
"logits/rejected": -0.3287428319454193,
"logps/chosen": -27.614389419555664,
"logps/rejected": -59.38009262084961,
"loss": 0.0301,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17799943685531616,
"rewards/margins": 7.3723649978637695,
"rewards/rejected": -7.550364017486572,
"step": 315
},
{
"epoch": 3.745185185185185,
"grad_norm": 5.504211939139539,
"learning_rate": 5.3912408186420064e-09,
"logits/chosen": -0.19225972890853882,
"logits/rejected": -0.24219948053359985,
"logps/chosen": -39.883174896240234,
"logps/rejected": -57.19430160522461,
"loss": 0.026,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7933384776115417,
"rewards/margins": 7.422837257385254,
"rewards/rejected": -8.21617603302002,
"step": 316
},
{
"epoch": 3.757037037037037,
"grad_norm": 5.560326027847558,
"learning_rate": 4.867307532985227e-09,
"logits/chosen": -0.48098868131637573,
"logits/rejected": -0.3568829894065857,
"logps/chosen": -54.420379638671875,
"logps/rejected": -85.24766540527344,
"loss": 0.0297,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9853860139846802,
"rewards/margins": 7.260770320892334,
"rewards/rejected": -9.246155738830566,
"step": 317
},
{
"epoch": 3.7688888888888887,
"grad_norm": 7.196198216327019,
"learning_rate": 4.369900944435734e-09,
"logits/chosen": -0.16600579023361206,
"logits/rejected": -0.0450252965092659,
"logps/chosen": -42.486473083496094,
"logps/rejected": -81.37212371826172,
"loss": 0.0389,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9354689121246338,
"rewards/margins": 7.651963233947754,
"rewards/rejected": -8.587431907653809,
"step": 318
},
{
"epoch": 3.7807407407407405,
"grad_norm": 6.279239735558416,
"learning_rate": 3.899074879163244e-09,
"logits/chosen": -0.40112194418907166,
"logits/rejected": -0.339353084564209,
"logps/chosen": -34.84368133544922,
"logps/rejected": -61.614341735839844,
"loss": 0.0348,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.0974161624908447,
"rewards/margins": 7.075118064880371,
"rewards/rejected": -8.172533988952637,
"step": 319
},
{
"epoch": 3.7925925925925927,
"grad_norm": 6.9892724428640705,
"learning_rate": 3.4548802869627804e-09,
"logits/chosen": -0.3009003698825836,
"logits/rejected": -0.21556389331817627,
"logps/chosen": -41.47652816772461,
"logps/rejected": -67.13237762451172,
"loss": 0.0368,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2825208604335785,
"rewards/margins": 4.948946952819824,
"rewards/rejected": -5.231468200683594,
"step": 320
},
{
"epoch": 3.8044444444444445,
"grad_norm": 6.856578977189586,
"learning_rate": 3.037365235741024e-09,
"logits/chosen": -0.16049635410308838,
"logits/rejected": -0.12033607065677643,
"logps/chosen": -32.831031799316406,
"logps/rejected": -62.16967010498047,
"loss": 0.0433,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.41414308547973633,
"rewards/margins": 7.311939239501953,
"rewards/rejected": -7.726081371307373,
"step": 321
},
{
"epoch": 3.8162962962962963,
"grad_norm": 9.951539288327684,
"learning_rate": 2.6465749063149245e-09,
"logits/chosen": -0.6789449453353882,
"logits/rejected": -0.6332409381866455,
"logps/chosen": -36.879947662353516,
"logps/rejected": -80.12794494628906,
"loss": 0.0538,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3466782569885254,
"rewards/margins": 9.836599349975586,
"rewards/rejected": -11.183279037475586,
"step": 322
},
{
"epoch": 3.828148148148148,
"grad_norm": 6.85068630218398,
"learning_rate": 2.282551587522441e-09,
"logits/chosen": -0.5462524890899658,
"logits/rejected": -0.43087050318717957,
"logps/chosen": -32.245567321777344,
"logps/rejected": -56.410179138183594,
"loss": 0.0455,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1382488012313843,
"rewards/margins": 6.717283248901367,
"rewards/rejected": -7.855532169342041,
"step": 323
},
{
"epoch": 3.84,
"grad_norm": 6.505661948537336,
"learning_rate": 1.9453346716462316e-09,
"logits/chosen": -0.3759889602661133,
"logits/rejected": -0.4021185040473938,
"logps/chosen": -36.99543380737305,
"logps/rejected": -46.82232666015625,
"loss": 0.0393,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.881350576877594,
"rewards/margins": 4.79249382019043,
"rewards/rejected": -5.673844337463379,
"step": 324
},
{
"epoch": 3.851851851851852,
"grad_norm": 6.937575318919963,
"learning_rate": 1.6349606501509794e-09,
"logits/chosen": -0.2532769441604614,
"logits/rejected": -0.2831670045852661,
"logps/chosen": -41.752403259277344,
"logps/rejected": -52.2098388671875,
"loss": 0.0363,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7358890771865845,
"rewards/margins": 6.947170734405518,
"rewards/rejected": -7.6830596923828125,
"step": 325
},
{
"epoch": 3.863703703703704,
"grad_norm": 3.940918067532272,
"learning_rate": 1.351463109734441e-09,
"logits/chosen": -0.6125096082687378,
"logits/rejected": -0.3610725700855255,
"logps/chosen": -36.78853225708008,
"logps/rejected": -68.35499572753906,
"loss": 0.0198,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8846864700317383,
"rewards/margins": 8.920228004455566,
"rewards/rejected": -9.804913520812988,
"step": 326
},
{
"epoch": 3.8755555555555556,
"grad_norm": 5.731819862714682,
"learning_rate": 1.0948727286930192e-09,
"logits/chosen": -0.07925964891910553,
"logits/rejected": -0.04111175611615181,
"logps/chosen": -35.71060562133789,
"logps/rejected": -57.488651275634766,
"loss": 0.0309,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.09710303694009781,
"rewards/margins": 6.214367866516113,
"rewards/rejected": -6.311470985412598,
"step": 327
},
{
"epoch": 3.8874074074074074,
"grad_norm": 7.4093470103902845,
"learning_rate": 8.652172736017816e-10,
"logits/chosen": -0.2285485416650772,
"logits/rejected": -0.22180640697479248,
"logps/chosen": -45.58677673339844,
"logps/rejected": -74.19960021972656,
"loss": 0.0412,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.34262484312057495,
"rewards/margins": 6.549678325653076,
"rewards/rejected": -6.892302989959717,
"step": 328
},
{
"epoch": 3.899259259259259,
"grad_norm": 5.597791664125433,
"learning_rate": 6.625215963098896e-10,
"logits/chosen": -0.22412040829658508,
"logits/rejected": -0.25815126299858093,
"logps/chosen": -39.15158462524414,
"logps/rejected": -52.841060638427734,
"loss": 0.0284,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.1085604429244995,
"rewards/margins": 5.7370734214782715,
"rewards/rejected": -6.845633506774902,
"step": 329
},
{
"epoch": 3.911111111111111,
"grad_norm": 4.652252051429055,
"learning_rate": 4.868076312512515e-10,
"logits/chosen": -0.3777186870574951,
"logits/rejected": -0.3193652629852295,
"logps/chosen": -32.07898712158203,
"logps/rejected": -64.57249450683594,
"loss": 0.0241,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5756787657737732,
"rewards/margins": 7.689653396606445,
"rewards/rejected": -8.265332221984863,
"step": 330
},
{
"epoch": 3.9229629629629628,
"grad_norm": 5.81932820439485,
"learning_rate": 3.3809439307086463e-10,
"logits/chosen": -0.20917584002017975,
"logits/rejected": -0.12255613505840302,
"logps/chosen": -31.15184783935547,
"logps/rejected": -63.176910400390625,
"loss": 0.0335,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12342405319213867,
"rewards/margins": 6.832948684692383,
"rewards/rejected": -6.956372261047363,
"step": 331
},
{
"epoch": 3.934814814814815,
"grad_norm": 6.583884307182356,
"learning_rate": 2.1639797456723952e-10,
"logits/chosen": -0.3113446831703186,
"logits/rejected": -0.34424495697021484,
"logps/chosen": -50.414649963378906,
"logps/rejected": -66.15986633300781,
"loss": 0.0305,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8464060425758362,
"rewards/margins": 7.252386093139648,
"rewards/rejected": -8.098793029785156,
"step": 332
},
{
"epoch": 3.9466666666666668,
"grad_norm": 5.902458355798929,
"learning_rate": 1.21731544950876e-10,
"logits/chosen": -0.23517660796642303,
"logits/rejected": -0.3104686737060547,
"logps/chosen": -45.34723663330078,
"logps/rejected": -86.45767974853516,
"loss": 0.0269,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3326948881149292,
"rewards/margins": 9.871392250061035,
"rewards/rejected": -11.204087257385254,
"step": 333
},
{
"epoch": 3.9585185185185185,
"grad_norm": 5.025564856386988,
"learning_rate": 5.4105348419264394e-11,
"logits/chosen": -0.6560889482498169,
"logits/rejected": -0.6381913423538208,
"logps/chosen": -31.789920806884766,
"logps/rejected": -57.232948303222656,
"loss": 0.0281,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.43755847215652466,
"rewards/margins": 6.743692398071289,
"rewards/rejected": -7.181251525878906,
"step": 334
},
{
"epoch": 3.9703703703703703,
"grad_norm": 4.328140885361808,
"learning_rate": 1.3526703048216682e-11,
"logits/chosen": -0.5039613246917725,
"logits/rejected": -0.4086998999118805,
"logps/chosen": -34.5649528503418,
"logps/rejected": -85.53738403320312,
"loss": 0.0214,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7040150165557861,
"rewards/margins": 11.698722839355469,
"rewards/rejected": -12.402737617492676,
"step": 335
},
{
"epoch": 3.982222222222222,
"grad_norm": 5.989193672621692,
"learning_rate": 0.0,
"logits/chosen": -0.3169300854206085,
"logits/rejected": -0.32203078269958496,
"logps/chosen": -37.71720886230469,
"logps/rejected": -64.16942596435547,
"loss": 0.0329,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6902213096618652,
"rewards/margins": 7.074714183807373,
"rewards/rejected": -7.76493501663208,
"step": 336
},
{
"epoch": 3.982222222222222,
"step": 336,
"total_flos": 0.0,
"train_loss": 0.14947671072912358,
"train_runtime": 64472.8667,
"train_samples_per_second": 0.67,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 336,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}