{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.998828811243412,
"eval_steps": 75,
"global_step": 160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024985360140542652,
"grad_norm": 14.177000429059124,
"learning_rate": 4e-09,
"logits/chosen": -0.7216415405273438,
"logits/rejected": -0.7776755690574646,
"logps/chosen": -180.14370727539062,
"logps/rejected": -163.619384765625,
"loss": 0.5911,
"rewards/accuracies": 0.671875,
"rewards/chosen": 2.089264392852783,
"rewards/margins": 0.4413459897041321,
"rewards/rejected": 1.647918462753296,
"step": 2
},
{
"epoch": 0.049970720281085304,
"grad_norm": 13.032161582993453,
"learning_rate": 8e-09,
"logits/chosen": -0.6800286769866943,
"logits/rejected": -0.7293923497200012,
"logps/chosen": -187.60638427734375,
"logps/rejected": -170.1859893798828,
"loss": 0.5762,
"rewards/accuracies": 0.66796875,
"rewards/chosen": 2.104745388031006,
"rewards/margins": 0.4643358588218689,
"rewards/rejected": 1.6404094696044922,
"step": 4
},
{
"epoch": 0.07495608042162795,
"grad_norm": 14.424559495239313,
"learning_rate": 1.1999999999999998e-08,
"logits/chosen": -0.7123140096664429,
"logits/rejected": -0.7762022018432617,
"logps/chosen": -186.28652954101562,
"logps/rejected": -169.9141387939453,
"loss": 0.589,
"rewards/accuracies": 0.66015625,
"rewards/chosen": 2.052624464035034,
"rewards/margins": 0.40381550788879395,
"rewards/rejected": 1.6488089561462402,
"step": 6
},
{
"epoch": 0.09994144056217061,
"grad_norm": 15.25048206057953,
"learning_rate": 1.6e-08,
"logits/chosen": -0.6801178455352783,
"logits/rejected": -0.7383629083633423,
"logps/chosen": -184.20584106445312,
"logps/rejected": -170.67449951171875,
"loss": 0.5855,
"rewards/accuracies": 0.703125,
"rewards/chosen": 2.0641534328460693,
"rewards/margins": 0.4149114787578583,
"rewards/rejected": 1.6492421627044678,
"step": 8
},
{
"epoch": 0.12492680070271325,
"grad_norm": 15.020860898871753,
"learning_rate": 2e-08,
"logits/chosen": -0.6762746572494507,
"logits/rejected": -0.7306088805198669,
"logps/chosen": -181.46649169921875,
"logps/rejected": -171.62355041503906,
"loss": 0.5886,
"rewards/accuracies": 0.671875,
"rewards/chosen": 2.0547406673431396,
"rewards/margins": 0.4107271730899811,
"rewards/rejected": 1.6440132856369019,
"step": 10
},
{
"epoch": 0.1499121608432559,
"grad_norm": 15.546401340611093,
"learning_rate": 2.3999999999999997e-08,
"logits/chosen": -0.6915724873542786,
"logits/rejected": -0.7597174048423767,
"logps/chosen": -191.33026123046875,
"logps/rejected": -172.05740356445312,
"loss": 0.5952,
"rewards/accuracies": 0.68359375,
"rewards/chosen": 2.090841293334961,
"rewards/margins": 0.4058065116405487,
"rewards/rejected": 1.685034990310669,
"step": 12
},
{
"epoch": 0.17489752098379854,
"grad_norm": 15.179087180823856,
"learning_rate": 2.8000000000000003e-08,
"logits/chosen": -0.648224413394928,
"logits/rejected": -0.7036635875701904,
"logps/chosen": -181.56240844726562,
"logps/rejected": -161.77291870117188,
"loss": 0.5662,
"rewards/accuracies": 0.69140625,
"rewards/chosen": 2.07316255569458,
"rewards/margins": 0.45191100239753723,
"rewards/rejected": 1.6212515830993652,
"step": 14
},
{
"epoch": 0.19988288112434122,
"grad_norm": 14.717039853262694,
"learning_rate": 3.2e-08,
"logits/chosen": -0.678563117980957,
"logits/rejected": -0.7321793437004089,
"logps/chosen": -191.10882568359375,
"logps/rejected": -172.11471557617188,
"loss": 0.5792,
"rewards/accuracies": 0.69140625,
"rewards/chosen": 2.1359810829162598,
"rewards/margins": 0.4702029824256897,
"rewards/rejected": 1.6657780408859253,
"step": 16
},
{
"epoch": 0.22486824126488386,
"grad_norm": 15.26388916085504,
"learning_rate": 3.6e-08,
"logits/chosen": -0.6794100999832153,
"logits/rejected": -0.7467265129089355,
"logps/chosen": -190.895751953125,
"logps/rejected": -171.15126037597656,
"loss": 0.5949,
"rewards/accuracies": 0.66796875,
"rewards/chosen": 2.0863959789276123,
"rewards/margins": 0.4278351664543152,
"rewards/rejected": 1.658560872077942,
"step": 18
},
{
"epoch": 0.2498536014054265,
"grad_norm": 14.719456857161541,
"learning_rate": 4e-08,
"logits/chosen": -0.6606219410896301,
"logits/rejected": -0.7190724611282349,
"logps/chosen": -179.43295288085938,
"logps/rejected": -163.46678161621094,
"loss": 0.5819,
"rewards/accuracies": 0.71484375,
"rewards/chosen": 2.1400554180145264,
"rewards/margins": 0.5210827589035034,
"rewards/rejected": 1.6189727783203125,
"step": 20
},
{
"epoch": 0.27483896154596915,
"grad_norm": 14.799553727376024,
"learning_rate": 4.4e-08,
"logits/chosen": -0.6596983671188354,
"logits/rejected": -0.7132915258407593,
"logps/chosen": -186.89849853515625,
"logps/rejected": -177.6392364501953,
"loss": 0.5881,
"rewards/accuracies": 0.73828125,
"rewards/chosen": 2.0855584144592285,
"rewards/margins": 0.5197086334228516,
"rewards/rejected": 1.5658495426177979,
"step": 22
},
{
"epoch": 0.2998243216865118,
"grad_norm": 16.401751337438842,
"learning_rate": 4.799999999999999e-08,
"logits/chosen": -0.6935199499130249,
"logits/rejected": -0.7622916102409363,
"logps/chosen": -191.56312561035156,
"logps/rejected": -166.0808563232422,
"loss": 0.5876,
"rewards/accuracies": 0.72265625,
"rewards/chosen": 2.1759369373321533,
"rewards/margins": 0.584960401058197,
"rewards/rejected": 1.590976357460022,
"step": 24
},
{
"epoch": 0.32480968182705444,
"grad_norm": 14.092401453744207,
"learning_rate": 5.2e-08,
"logits/chosen": -0.6964302062988281,
"logits/rejected": -0.7522369623184204,
"logps/chosen": -183.28709411621094,
"logps/rejected": -176.45947265625,
"loss": 0.5887,
"rewards/accuracies": 0.71484375,
"rewards/chosen": 2.1809558868408203,
"rewards/margins": 0.4667380154132843,
"rewards/rejected": 1.7142179012298584,
"step": 26
},
{
"epoch": 0.3497950419675971,
"grad_norm": 14.992677802834425,
"learning_rate": 5.6000000000000005e-08,
"logits/chosen": -0.6727583408355713,
"logits/rejected": -0.7299581170082092,
"logps/chosen": -176.1138458251953,
"logps/rejected": -165.51553344726562,
"loss": 0.5947,
"rewards/accuracies": 0.6875,
"rewards/chosen": 2.0702695846557617,
"rewards/margins": 0.42442983388900757,
"rewards/rejected": 1.645839810371399,
"step": 28
},
{
"epoch": 0.3747804021081398,
"grad_norm": 14.234258704647342,
"learning_rate": 6e-08,
"logits/chosen": -0.6875941753387451,
"logits/rejected": -0.7378899455070496,
"logps/chosen": -179.08218383789062,
"logps/rejected": -170.3502197265625,
"loss": 0.5854,
"rewards/accuracies": 0.67578125,
"rewards/chosen": 2.12485408782959,
"rewards/margins": 0.4190685749053955,
"rewards/rejected": 1.7057857513427734,
"step": 30
},
{
"epoch": 0.39976576224868243,
"grad_norm": 15.320369672206587,
"learning_rate": 6.4e-08,
"logits/chosen": -0.6457805633544922,
"logits/rejected": -0.7087669372558594,
"logps/chosen": -174.39279174804688,
"logps/rejected": -161.18417358398438,
"loss": 0.556,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 2.1353416442871094,
"rewards/margins": 0.5485972166061401,
"rewards/rejected": 1.5867444276809692,
"step": 32
},
{
"epoch": 0.4247511223892251,
"grad_norm": 14.996986688498861,
"learning_rate": 6.8e-08,
"logits/chosen": -0.677814781665802,
"logits/rejected": -0.7355855703353882,
"logps/chosen": -184.22764587402344,
"logps/rejected": -164.98434448242188,
"loss": 0.5726,
"rewards/accuracies": 0.71484375,
"rewards/chosen": 2.0874392986297607,
"rewards/margins": 0.5497796535491943,
"rewards/rejected": 1.5376596450805664,
"step": 34
},
{
"epoch": 0.4497364825297677,
"grad_norm": 15.836133161520731,
"learning_rate": 7.2e-08,
"logits/chosen": -0.6465247273445129,
"logits/rejected": -0.7009165287017822,
"logps/chosen": -183.50096130371094,
"logps/rejected": -172.29428100585938,
"loss": 0.5909,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 2.1548638343811035,
"rewards/margins": 0.48915886878967285,
"rewards/rejected": 1.6657049655914307,
"step": 36
},
{
"epoch": 0.47472184267031037,
"grad_norm": 15.068791059044901,
"learning_rate": 7.599999999999999e-08,
"logits/chosen": -0.684742271900177,
"logits/rejected": -0.7406108379364014,
"logps/chosen": -178.16604614257812,
"logps/rejected": -172.45472717285156,
"loss": 0.5728,
"rewards/accuracies": 0.69140625,
"rewards/chosen": 2.1049206256866455,
"rewards/margins": 0.4164605736732483,
"rewards/rejected": 1.6884599924087524,
"step": 38
},
{
"epoch": 0.499707202810853,
"grad_norm": 15.172663789942417,
"learning_rate": 8e-08,
"logits/chosen": -0.6802005767822266,
"logits/rejected": -0.7308796048164368,
"logps/chosen": -178.501708984375,
"logps/rejected": -162.97750854492188,
"loss": 0.5778,
"rewards/accuracies": 0.703125,
"rewards/chosen": 2.145007848739624,
"rewards/margins": 0.5102132558822632,
"rewards/rejected": 1.6347947120666504,
"step": 40
},
{
"epoch": 0.5246925629513957,
"grad_norm": 13.193729893516823,
"learning_rate": 8.4e-08,
"logits/chosen": -0.67890864610672,
"logits/rejected": -0.7359157204627991,
"logps/chosen": -180.63043212890625,
"logps/rejected": -177.0836181640625,
"loss": 0.5706,
"rewards/accuracies": 0.70703125,
"rewards/chosen": 2.1336517333984375,
"rewards/margins": 0.4847910404205322,
"rewards/rejected": 1.6488608121871948,
"step": 42
},
{
"epoch": 0.5496779230919383,
"grad_norm": 14.543118187410414,
"learning_rate": 8.8e-08,
"logits/chosen": -0.6593753099441528,
"logits/rejected": -0.7200923562049866,
"logps/chosen": -179.75027465820312,
"logps/rejected": -161.65733337402344,
"loss": 0.5614,
"rewards/accuracies": 0.72265625,
"rewards/chosen": 2.1679983139038086,
"rewards/margins": 0.6123022437095642,
"rewards/rejected": 1.5556960105895996,
"step": 44
},
{
"epoch": 0.574663283232481,
"grad_norm": 14.478724863209543,
"learning_rate": 9.2e-08,
"logits/chosen": -0.6733975410461426,
"logits/rejected": -0.725917398929596,
"logps/chosen": -179.19137573242188,
"logps/rejected": -167.48928833007812,
"loss": 0.5721,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 2.126537561416626,
"rewards/margins": 0.5884015560150146,
"rewards/rejected": 1.5381361246109009,
"step": 46
},
{
"epoch": 0.5996486433730236,
"grad_norm": 13.598065354511457,
"learning_rate": 9.599999999999999e-08,
"logits/chosen": -0.6861451864242554,
"logits/rejected": -0.7490273118019104,
"logps/chosen": -192.40524291992188,
"logps/rejected": -166.65826416015625,
"loss": 0.5666,
"rewards/accuracies": 0.69140625,
"rewards/chosen": 2.1456832885742188,
"rewards/margins": 0.5288498997688293,
"rewards/rejected": 1.6168336868286133,
"step": 48
},
{
"epoch": 0.6246340035135662,
"grad_norm": 13.749586623653736,
"learning_rate": 1e-07,
"logits/chosen": -0.6737085580825806,
"logits/rejected": -0.7165706753730774,
"logps/chosen": -176.8297119140625,
"logps/rejected": -168.13772583007812,
"loss": 0.5656,
"rewards/accuracies": 0.734375,
"rewards/chosen": 2.0855467319488525,
"rewards/margins": 0.5343782305717468,
"rewards/rejected": 1.551168441772461,
"step": 50
},
{
"epoch": 0.6496193636541089,
"grad_norm": 14.236522635027217,
"learning_rate": 1.04e-07,
"logits/chosen": -0.6797468662261963,
"logits/rejected": -0.7432878613471985,
"logps/chosen": -180.42208862304688,
"logps/rejected": -165.42669677734375,
"loss": 0.5405,
"rewards/accuracies": 0.71875,
"rewards/chosen": 2.1683268547058105,
"rewards/margins": 0.678287148475647,
"rewards/rejected": 1.4900394678115845,
"step": 52
},
{
"epoch": 0.6746047237946515,
"grad_norm": 12.868471228668062,
"learning_rate": 1.08e-07,
"logits/chosen": -0.6745160818099976,
"logits/rejected": -0.7254283428192139,
"logps/chosen": -183.60704040527344,
"logps/rejected": -170.13792419433594,
"loss": 0.5348,
"rewards/accuracies": 0.73046875,
"rewards/chosen": 2.0506410598754883,
"rewards/margins": 0.5821288228034973,
"rewards/rejected": 1.4685120582580566,
"step": 54
},
{
"epoch": 0.6995900839351942,
"grad_norm": 13.767404224251546,
"learning_rate": 1.1200000000000001e-07,
"logits/chosen": -0.6854877471923828,
"logits/rejected": -0.746857225894928,
"logps/chosen": -178.72006225585938,
"logps/rejected": -162.08724975585938,
"loss": 0.5387,
"rewards/accuracies": 0.703125,
"rewards/chosen": 2.124311923980713,
"rewards/margins": 0.6121358871459961,
"rewards/rejected": 1.5121760368347168,
"step": 56
},
{
"epoch": 0.7245754440757368,
"grad_norm": 13.792453070210335,
"learning_rate": 1.1599999999999999e-07,
"logits/chosen": -0.6948191523551941,
"logits/rejected": -0.7636308073997498,
"logps/chosen": -195.91062927246094,
"logps/rejected": -176.2474365234375,
"loss": 0.5338,
"rewards/accuracies": 0.73046875,
"rewards/chosen": 2.136486530303955,
"rewards/margins": 0.6362313032150269,
"rewards/rejected": 1.5002549886703491,
"step": 58
},
{
"epoch": 0.7495608042162796,
"grad_norm": 12.863853198025703,
"learning_rate": 1.2e-07,
"logits/chosen": -0.6513829231262207,
"logits/rejected": -0.7188961505889893,
"logps/chosen": -190.9204864501953,
"logps/rejected": -170.77809143066406,
"loss": 0.5334,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 2.1112589836120605,
"rewards/margins": 0.6223936676979065,
"rewards/rejected": 1.4888653755187988,
"step": 60
},
{
"epoch": 0.7745461643568222,
"grad_norm": 13.104450063440881,
"learning_rate": 1.24e-07,
"logits/chosen": -0.6542866230010986,
"logits/rejected": -0.7189180254936218,
"logps/chosen": -183.43380737304688,
"logps/rejected": -169.60279846191406,
"loss": 0.538,
"rewards/accuracies": 0.7578125,
"rewards/chosen": 2.178042411804199,
"rewards/margins": 0.7206485867500305,
"rewards/rejected": 1.4573938846588135,
"step": 62
},
{
"epoch": 0.7995315244973649,
"grad_norm": 14.223685904396252,
"learning_rate": 1.28e-07,
"logits/chosen": -0.6746785640716553,
"logits/rejected": -0.7393426895141602,
"logps/chosen": -187.83718872070312,
"logps/rejected": -170.26303100585938,
"loss": 0.5409,
"rewards/accuracies": 0.71875,
"rewards/chosen": 2.0394065380096436,
"rewards/margins": 0.6107546091079712,
"rewards/rejected": 1.4286518096923828,
"step": 64
},
{
"epoch": 0.8245168846379075,
"grad_norm": 11.892937673420246,
"learning_rate": 1.32e-07,
"logits/chosen": -0.6900768280029297,
"logits/rejected": -0.7491022944450378,
"logps/chosen": -190.11602783203125,
"logps/rejected": -177.3651123046875,
"loss": 0.4913,
"rewards/accuracies": 0.7890625,
"rewards/chosen": 2.1415176391601562,
"rewards/margins": 0.7801377773284912,
"rewards/rejected": 1.3613799810409546,
"step": 66
},
{
"epoch": 0.8495022447784502,
"grad_norm": 12.90980511070953,
"learning_rate": 1.36e-07,
"logits/chosen": -0.671898603439331,
"logits/rejected": -0.7221825122833252,
"logps/chosen": -182.26194763183594,
"logps/rejected": -171.02517700195312,
"loss": 0.5013,
"rewards/accuracies": 0.78125,
"rewards/chosen": 1.9341095685958862,
"rewards/margins": 0.7295835018157959,
"rewards/rejected": 1.2045260667800903,
"step": 68
},
{
"epoch": 0.8744876049189928,
"grad_norm": 11.941560260701717,
"learning_rate": 1.3999999999999998e-07,
"logits/chosen": -0.6553590297698975,
"logits/rejected": -0.7277964353561401,
"logps/chosen": -191.17935180664062,
"logps/rejected": -182.75697326660156,
"loss": 0.4992,
"rewards/accuracies": 0.76171875,
"rewards/chosen": 1.9563246965408325,
"rewards/margins": 0.735268771648407,
"rewards/rejected": 1.2210559844970703,
"step": 70
},
{
"epoch": 0.8994729650595354,
"grad_norm": 11.363918779296476,
"learning_rate": 1.44e-07,
"logits/chosen": -0.6677660346031189,
"logits/rejected": -0.7358181476593018,
"logps/chosen": -187.28823852539062,
"logps/rejected": -175.23736572265625,
"loss": 0.4782,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 1.8920280933380127,
"rewards/margins": 0.9304031729698181,
"rewards/rejected": 0.9616249799728394,
"step": 72
},
{
"epoch": 0.9244583252000781,
"grad_norm": 11.665594088546383,
"learning_rate": 1.48e-07,
"logits/chosen": -0.6850963830947876,
"logits/rejected": -0.750001072883606,
"logps/chosen": -190.98031616210938,
"logps/rejected": -173.23446655273438,
"loss": 0.4758,
"rewards/accuracies": 0.79296875,
"rewards/chosen": 1.8105218410491943,
"rewards/margins": 0.8698927760124207,
"rewards/rejected": 0.9406291246414185,
"step": 74
},
{
"epoch": 0.9369510052703494,
"eval_logits/chosen": -0.6242849230766296,
"eval_logits/rejected": -0.7280451059341431,
"eval_logps/chosen": -193.7286376953125,
"eval_logps/rejected": -160.47738647460938,
"eval_loss": 0.5220226645469666,
"eval_rewards/accuracies": 0.8399999737739563,
"eval_rewards/chosen": 1.9179359674453735,
"eval_rewards/margins": 1.0572994947433472,
"eval_rewards/rejected": 0.8606364727020264,
"eval_runtime": 29.5424,
"eval_samples_per_second": 3.385,
"eval_steps_per_second": 0.846,
"step": 75
},
{
"epoch": 0.9494436853406207,
"grad_norm": 11.894064434017581,
"learning_rate": 1.5199999999999998e-07,
"logits/chosen": -0.6667495965957642,
"logits/rejected": -0.7195772528648376,
"logps/chosen": -188.1094512939453,
"logps/rejected": -173.92083740234375,
"loss": 0.4603,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.8090986013412476,
"rewards/margins": 0.9019326567649841,
"rewards/rejected": 0.9071658849716187,
"step": 76
},
{
"epoch": 0.9744290454811634,
"grad_norm": 12.785309864943024,
"learning_rate": 1.56e-07,
"logits/chosen": -0.6789891123771667,
"logits/rejected": -0.7438546419143677,
"logps/chosen": -190.32470703125,
"logps/rejected": -169.19207763671875,
"loss": 0.4541,
"rewards/accuracies": 0.80078125,
"rewards/chosen": 1.736232042312622,
"rewards/margins": 0.9202592372894287,
"rewards/rejected": 0.8159728050231934,
"step": 78
},
{
"epoch": 0.999414405621706,
"grad_norm": 12.07847451966405,
"learning_rate": 1.6e-07,
"logits/chosen": -0.687256395816803,
"logits/rejected": -0.7596179246902466,
"logps/chosen": -195.14768981933594,
"logps/rejected": -174.72589111328125,
"loss": 0.4471,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.6982433795928955,
"rewards/margins": 0.9969222545623779,
"rewards/rejected": 0.7013211250305176,
"step": 80
},
{
"epoch": 1.0243997657622488,
"grad_norm": 12.209730650020617,
"learning_rate": 1.6399999999999999e-07,
"logits/chosen": -0.6745339632034302,
"logits/rejected": -0.7286314368247986,
"logps/chosen": -184.5238494873047,
"logps/rejected": -180.26815795898438,
"loss": 0.4635,
"rewards/accuracies": 0.78515625,
"rewards/chosen": 1.6792489290237427,
"rewards/margins": 0.8308749198913574,
"rewards/rejected": 0.8483741879463196,
"step": 82
},
{
"epoch": 1.0493851259027913,
"grad_norm": 11.63839350311622,
"learning_rate": 1.68e-07,
"logits/chosen": -0.6981229186058044,
"logits/rejected": -0.7625120878219604,
"logps/chosen": -191.15847778320312,
"logps/rejected": -189.08364868164062,
"loss": 0.4418,
"rewards/accuracies": 0.80078125,
"rewards/chosen": 1.7394218444824219,
"rewards/margins": 1.0957342386245728,
"rewards/rejected": 0.6436874866485596,
"step": 84
},
{
"epoch": 1.074370486043334,
"grad_norm": 12.047547514582906,
"learning_rate": 1.7199999999999998e-07,
"logits/chosen": -0.6573597192764282,
"logits/rejected": -0.7058761715888977,
"logps/chosen": -189.32237243652344,
"logps/rejected": -182.69403076171875,
"loss": 0.4339,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.724177598953247,
"rewards/margins": 1.020638108253479,
"rewards/rejected": 0.7035394906997681,
"step": 86
},
{
"epoch": 1.0993558461838766,
"grad_norm": 12.65465734374361,
"learning_rate": 1.76e-07,
"logits/chosen": -0.67812180519104,
"logits/rejected": -0.7225789427757263,
"logps/chosen": -185.65040588378906,
"logps/rejected": -187.9286651611328,
"loss": 0.437,
"rewards/accuracies": 0.81640625,
"rewards/chosen": 1.5955438613891602,
"rewards/margins": 0.9583697319030762,
"rewards/rejected": 0.6371738910675049,
"step": 88
},
{
"epoch": 1.1243412063244194,
"grad_norm": 10.575082434339704,
"learning_rate": 1.8e-07,
"logits/chosen": -0.6781046986579895,
"logits/rejected": -0.7251250743865967,
"logps/chosen": -189.33551025390625,
"logps/rejected": -188.9590606689453,
"loss": 0.4156,
"rewards/accuracies": 0.8359375,
"rewards/chosen": 1.5732731819152832,
"rewards/margins": 1.1116917133331299,
"rewards/rejected": 0.4615815281867981,
"step": 90
},
{
"epoch": 1.149326566464962,
"grad_norm": 9.55392077902883,
"learning_rate": 1.84e-07,
"logits/chosen": -0.6788798570632935,
"logits/rejected": -0.7428586483001709,
"logps/chosen": -198.3631591796875,
"logps/rejected": -182.88487243652344,
"loss": 0.3982,
"rewards/accuracies": 0.82421875,
"rewards/chosen": 1.3846931457519531,
"rewards/margins": 1.0993235111236572,
"rewards/rejected": 0.2853696346282959,
"step": 92
},
{
"epoch": 1.1743119266055047,
"grad_norm": 9.670365765437687,
"learning_rate": 1.88e-07,
"logits/chosen": -0.7081943154335022,
"logits/rejected": -0.775234580039978,
"logps/chosen": -189.57760620117188,
"logps/rejected": -181.6404571533203,
"loss": 0.3911,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 1.2745850086212158,
"rewards/margins": 1.3070428371429443,
"rewards/rejected": -0.032457947731018066,
"step": 94
},
{
"epoch": 1.1992972867460472,
"grad_norm": 10.78932105106093,
"learning_rate": 1.9199999999999997e-07,
"logits/chosen": -0.6830898523330688,
"logits/rejected": -0.734713613986969,
"logps/chosen": -191.22511291503906,
"logps/rejected": -186.43077087402344,
"loss": 0.3897,
"rewards/accuracies": 0.82421875,
"rewards/chosen": 1.0674785375595093,
"rewards/margins": 1.171497106552124,
"rewards/rejected": -0.10401848703622818,
"step": 96
},
{
"epoch": 1.22428264688659,
"grad_norm": 9.097857033695211,
"learning_rate": 1.9599999999999998e-07,
"logits/chosen": -0.7023120522499084,
"logits/rejected": -0.7581274509429932,
"logps/chosen": -195.01312255859375,
"logps/rejected": -188.2948455810547,
"loss": 0.3536,
"rewards/accuracies": 0.85546875,
"rewards/chosen": 1.0987714529037476,
"rewards/margins": 1.5480579137802124,
"rewards/rejected": -0.4492865800857544,
"step": 98
},
{
"epoch": 1.2492680070271325,
"grad_norm": 12.363599073474044,
"learning_rate": 2e-07,
"logits/chosen": -0.6939373016357422,
"logits/rejected": -0.7522105574607849,
"logps/chosen": -193.21104431152344,
"logps/rejected": -190.04568481445312,
"loss": 0.3649,
"rewards/accuracies": 0.85546875,
"rewards/chosen": 0.7619870901107788,
"rewards/margins": 1.3099664449691772,
"rewards/rejected": -0.547979474067688,
"step": 100
},
{
"epoch": 1.2742533671676752,
"grad_norm": 9.956621854528622,
"learning_rate": 1.9945218953682733e-07,
"logits/chosen": -0.7141095399856567,
"logits/rejected": -0.772229015827179,
"logps/chosen": -206.34132385253906,
"logps/rejected": -199.00970458984375,
"loss": 0.3505,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.8092713952064514,
"rewards/margins": 1.6670289039611816,
"rewards/rejected": -0.8577573299407959,
"step": 102
},
{
"epoch": 1.2992387273082178,
"grad_norm": 9.301525503546692,
"learning_rate": 1.9781476007338056e-07,
"logits/chosen": -0.7332565188407898,
"logits/rejected": -0.7983365058898926,
"logps/chosen": -201.5232696533203,
"logps/rejected": -191.0933380126953,
"loss": 0.3195,
"rewards/accuracies": 0.89453125,
"rewards/chosen": 0.5621832609176636,
"rewards/margins": 1.6282891035079956,
"rewards/rejected": -1.0661057233810425,
"step": 104
},
{
"epoch": 1.3242240874487605,
"grad_norm": 9.869807635078832,
"learning_rate": 1.9510565162951537e-07,
"logits/chosen": -0.7436533570289612,
"logits/rejected": -0.8179137706756592,
"logps/chosen": -198.8864288330078,
"logps/rejected": -192.3636016845703,
"loss": 0.3368,
"rewards/accuracies": 0.87109375,
"rewards/chosen": 0.5397917628288269,
"rewards/margins": 1.7668784856796265,
"rewards/rejected": -1.2270865440368652,
"step": 106
},
{
"epoch": 1.349209447589303,
"grad_norm": 10.035654401238393,
"learning_rate": 1.9135454576426007e-07,
"logits/chosen": -0.6918727159500122,
"logits/rejected": -0.7655491828918457,
"logps/chosen": -203.7888946533203,
"logps/rejected": -215.9702606201172,
"loss": 0.3397,
"rewards/accuracies": 0.90234375,
"rewards/chosen": 0.5103797912597656,
"rewards/margins": 1.8079906702041626,
"rewards/rejected": -1.297610878944397,
"step": 108
},
{
"epoch": 1.3741948077298458,
"grad_norm": 9.358482838142306,
"learning_rate": 1.8660254037844388e-07,
"logits/chosen": -0.7152352929115295,
"logits/rejected": -0.7777791023254395,
"logps/chosen": -207.73023986816406,
"logps/rejected": -194.01126098632812,
"loss": 0.3365,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.4271202087402344,
"rewards/margins": 1.687686562538147,
"rewards/rejected": -1.2605663537979126,
"step": 110
},
{
"epoch": 1.3991801678703886,
"grad_norm": 9.789127179150574,
"learning_rate": 1.8090169943749475e-07,
"logits/chosen": -0.7317672371864319,
"logits/rejected": -0.7882843017578125,
"logps/chosen": -203.46849060058594,
"logps/rejected": -208.32135009765625,
"loss": 0.3157,
"rewards/accuracies": 0.87890625,
"rewards/chosen": 0.33698615431785583,
"rewards/margins": 1.7098716497421265,
"rewards/rejected": -1.3728857040405273,
"step": 112
},
{
"epoch": 1.424165528010931,
"grad_norm": 9.275207179944992,
"learning_rate": 1.7431448254773942e-07,
"logits/chosen": -0.7219483852386475,
"logits/rejected": -0.7694462537765503,
"logps/chosen": -199.74270629882812,
"logps/rejected": -204.81101989746094,
"loss": 0.3034,
"rewards/accuracies": 0.88671875,
"rewards/chosen": 0.40492168068885803,
"rewards/margins": 1.9214580059051514,
"rewards/rejected": -1.5165363550186157,
"step": 114
},
{
"epoch": 1.4491508881514736,
"grad_norm": 9.183521827422608,
"learning_rate": 1.669130606358858e-07,
"logits/chosen": -0.7337281107902527,
"logits/rejected": -0.7940360307693481,
"logps/chosen": -198.19046020507812,
"logps/rejected": -200.4697265625,
"loss": 0.3176,
"rewards/accuracies": 0.88671875,
"rewards/chosen": 0.3844246566295624,
"rewards/margins": 1.8816416263580322,
"rewards/rejected": -1.4972169399261475,
"step": 116
},
{
"epoch": 1.4741362482920164,
"grad_norm": 9.337739314896169,
"learning_rate": 1.5877852522924732e-07,
"logits/chosen": -0.7224444150924683,
"logits/rejected": -0.7723821401596069,
"logps/chosen": -195.0646209716797,
"logps/rejected": -201.89569091796875,
"loss": 0.2752,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3896394371986389,
"rewards/margins": 1.9735894203186035,
"rewards/rejected": -1.5839500427246094,
"step": 118
},
{
"epoch": 1.4991216084325591,
"grad_norm": 8.793369516249312,
"learning_rate": 1.5e-07,
"logits/chosen": -0.7569531202316284,
"logits/rejected": -0.8058477640151978,
"logps/chosen": -208.27215576171875,
"logps/rejected": -208.09347534179688,
"loss": 0.317,
"rewards/accuracies": 0.87890625,
"rewards/chosen": 0.2652769982814789,
"rewards/margins": 1.866006851196289,
"rewards/rejected": -1.6007298231124878,
"step": 120
},
{
"epoch": 1.5241069685731017,
"grad_norm": 9.48150415114474,
"learning_rate": 1.4067366430758004e-07,
"logits/chosen": -0.7591882348060608,
"logits/rejected": -0.8140251636505127,
"logps/chosen": -205.2285614013672,
"logps/rejected": -203.8860321044922,
"loss": 0.2965,
"rewards/accuracies": 0.86328125,
"rewards/chosen": 0.22455668449401855,
"rewards/margins": 1.875580072402954,
"rewards/rejected": -1.6510233879089355,
"step": 122
},
{
"epoch": 1.5490923287136442,
"grad_norm": 9.98138144476122,
"learning_rate": 1.3090169943749475e-07,
"logits/chosen": -0.7185292840003967,
"logits/rejected": -0.7869015336036682,
"logps/chosen": -207.1554718017578,
"logps/rejected": -239.03298950195312,
"loss": 0.301,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.21323075890541077,
"rewards/margins": 1.779089093208313,
"rewards/rejected": -1.565858244895935,
"step": 124
},
{
"epoch": 1.574077688854187,
"grad_norm": 7.9906817459937995,
"learning_rate": 1.207911690817759e-07,
"logits/chosen": -0.7233790159225464,
"logits/rejected": -0.7781089544296265,
"logps/chosen": -194.47396850585938,
"logps/rejected": -199.163330078125,
"loss": 0.3084,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.403320848941803,
"rewards/margins": 1.8897595405578613,
"rewards/rejected": -1.486438512802124,
"step": 126
},
{
"epoch": 1.5990630489947297,
"grad_norm": 9.241594419692872,
"learning_rate": 1.1045284632676535e-07,
"logits/chosen": -0.7427763342857361,
"logits/rejected": -0.811578094959259,
"logps/chosen": -211.1174774169922,
"logps/rejected": -199.56015014648438,
"loss": 0.2962,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.37835511565208435,
"rewards/margins": 2.0853826999664307,
"rewards/rejected": -1.7070273160934448,
"step": 128
},
{
"epoch": 1.6240484091352723,
"grad_norm": 9.666544829037878,
"learning_rate": 1e-07,
"logits/chosen": -0.7333863973617554,
"logits/rejected": -0.7908891439437866,
"logps/chosen": -201.62767028808594,
"logps/rejected": -204.39947509765625,
"loss": 0.3047,
"rewards/accuracies": 0.91015625,
"rewards/chosen": 0.5189218521118164,
"rewards/margins": 2.134669542312622,
"rewards/rejected": -1.6157476902008057,
"step": 130
},
{
"epoch": 1.6490337692758148,
"grad_norm": 8.419662280451101,
"learning_rate": 8.954715367323466e-08,
"logits/chosen": -0.7702259421348572,
"logits/rejected": -0.8284745812416077,
"logps/chosen": -199.48992919921875,
"logps/rejected": -219.00027465820312,
"loss": 0.2776,
"rewards/accuracies": 0.86328125,
"rewards/chosen": 0.36517998576164246,
"rewards/margins": 2.0277538299560547,
"rewards/rejected": -1.6625735759735107,
"step": 132
},
{
"epoch": 1.6740191294163576,
"grad_norm": 9.261225693287605,
"learning_rate": 7.920883091822408e-08,
"logits/chosen": -0.7342085242271423,
"logits/rejected": -0.7807326912879944,
"logps/chosen": -198.77467346191406,
"logps/rejected": -204.98635864257812,
"loss": 0.3185,
"rewards/accuracies": 0.88671875,
"rewards/chosen": 0.39617919921875,
"rewards/margins": 1.929458737373352,
"rewards/rejected": -1.533279538154602,
"step": 134
},
{
"epoch": 1.6990044895569003,
"grad_norm": 9.299697052167406,
"learning_rate": 6.909830056250527e-08,
"logits/chosen": -0.7558231949806213,
"logits/rejected": -0.8186966776847839,
"logps/chosen": -200.72955322265625,
"logps/rejected": -197.19003295898438,
"loss": 0.2807,
"rewards/accuracies": 0.88671875,
"rewards/chosen": 0.42143842577934265,
"rewards/margins": 2.1845474243164062,
"rewards/rejected": -1.7631090879440308,
"step": 136
},
{
"epoch": 1.723989849697443,
"grad_norm": 8.587815614728,
"learning_rate": 5.9326335692419996e-08,
"logits/chosen": -0.755694568157196,
"logits/rejected": -0.8112677335739136,
"logps/chosen": -198.68673706054688,
"logps/rejected": -197.39120483398438,
"loss": 0.2802,
"rewards/accuracies": 0.8671875,
"rewards/chosen": 0.23039419949054718,
"rewards/margins": 1.9206252098083496,
"rewards/rejected": -1.6902309656143188,
"step": 138
},
{
"epoch": 1.7489752098379856,
"grad_norm": 8.680976094750916,
"learning_rate": 5.000000000000002e-08,
"logits/chosen": -0.7553902864456177,
"logits/rejected": -0.8158895373344421,
"logps/chosen": -199.0127716064453,
"logps/rejected": -202.2300262451172,
"loss": 0.2768,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.5232993960380554,
"rewards/margins": 2.2032899856567383,
"rewards/rejected": -1.6799907684326172,
"step": 140
},
{
"epoch": 1.7739605699785281,
"grad_norm": 9.234907906222318,
"learning_rate": 4.1221474770752695e-08,
"logits/chosen": -0.7363643646240234,
"logits/rejected": -0.79323410987854,
"logps/chosen": -203.90921020507812,
"logps/rejected": -207.4276123046875,
"loss": 0.2921,
"rewards/accuracies": 0.85546875,
"rewards/chosen": 0.3827190697193146,
"rewards/margins": 2.0326881408691406,
"rewards/rejected": -1.6499687433242798,
"step": 142
},
{
"epoch": 1.798945930119071,
"grad_norm": 8.988965068155167,
"learning_rate": 3.3086939364114206e-08,
"logits/chosen": -0.7579203844070435,
"logits/rejected": -0.8293938636779785,
"logps/chosen": -201.67063903808594,
"logps/rejected": -223.98065185546875,
"loss": 0.2825,
"rewards/accuracies": 0.90234375,
"rewards/chosen": 0.3738960325717926,
"rewards/margins": 2.088986873626709,
"rewards/rejected": -1.7150908708572388,
"step": 144
},
{
"epoch": 1.8239312902596136,
"grad_norm": 8.4124330379094,
"learning_rate": 2.5685517452260564e-08,
"logits/chosen": -0.7071250081062317,
"logits/rejected": -0.7688826322555542,
"logps/chosen": -203.57652282714844,
"logps/rejected": -203.83291625976562,
"loss": 0.282,
"rewards/accuracies": 0.90234375,
"rewards/chosen": 0.2634541988372803,
"rewards/margins": 2.0646886825561523,
"rewards/rejected": -1.801234245300293,
"step": 146
},
{
"epoch": 1.8489166504001562,
"grad_norm": 8.280715517124031,
"learning_rate": 1.9098300562505266e-08,
"logits/chosen": -0.755478024482727,
"logits/rejected": -0.8133871555328369,
"logps/chosen": -202.27098083496094,
"logps/rejected": -195.1833038330078,
"loss": 0.2677,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.45085206627845764,
"rewards/margins": 2.2143564224243164,
"rewards/rejected": -1.7635046243667603,
"step": 148
},
{
"epoch": 1.8739020105406987,
"grad_norm": 9.365073690139782,
"learning_rate": 1.3397459621556128e-08,
"logits/chosen": -0.7708315849304199,
"logits/rejected": -0.8214279413223267,
"logps/chosen": -198.73464965820312,
"logps/rejected": -201.75244140625,
"loss": 0.2866,
"rewards/accuracies": 0.8828125,
"rewards/chosen": 0.23038014769554138,
"rewards/margins": 1.8956291675567627,
"rewards/rejected": -1.6652488708496094,
"step": 150
},
{
"epoch": 1.8739020105406987,
"eval_logits/chosen": -0.6863436102867126,
"eval_logits/rejected": -0.7882587909698486,
"eval_logps/chosen": -206.22607421875,
"eval_logps/rejected": -185.4351806640625,
"eval_loss": 0.28332585096359253,
"eval_rewards/accuracies": 0.9200000166893005,
"eval_rewards/chosen": 0.6681913137435913,
"eval_rewards/margins": 2.303332567214966,
"eval_rewards/rejected": -1.6351412534713745,
"eval_runtime": 30.6661,
"eval_samples_per_second": 3.261,
"eval_steps_per_second": 0.815,
"step": 150
},
{
"epoch": 1.8988873706812415,
"grad_norm": 8.503745608023737,
"learning_rate": 8.645454235739902e-09,
"logits/chosen": -0.7426515817642212,
"logits/rejected": -0.8051266670227051,
"logps/chosen": -195.47421264648438,
"logps/rejected": -199.013916015625,
"loss": 0.2643,
"rewards/accuracies": 0.9140625,
"rewards/chosen": 0.39815255999565125,
"rewards/margins": 2.137446403503418,
"rewards/rejected": -1.7392936944961548,
"step": 152
},
{
"epoch": 1.9238727308217842,
"grad_norm": 8.367825814127663,
"learning_rate": 4.8943483704846465e-09,
"logits/chosen": -0.7322957515716553,
"logits/rejected": -0.7974464893341064,
"logps/chosen": -193.97613525390625,
"logps/rejected": -191.2456817626953,
"loss": 0.2622,
"rewards/accuracies": 0.921875,
"rewards/chosen": 0.45596182346343994,
"rewards/margins": 2.185451030731201,
"rewards/rejected": -1.7294889688491821,
"step": 154
},
{
"epoch": 1.9488580909623268,
"grad_norm": 7.920183430972945,
"learning_rate": 2.1852399266194312e-09,
"logits/chosen": -0.7559969425201416,
"logits/rejected": -0.8131712079048157,
"logps/chosen": -203.8223876953125,
"logps/rejected": -202.947509765625,
"loss": 0.2773,
"rewards/accuracies": 0.91015625,
"rewards/chosen": 0.32228347659111023,
"rewards/margins": 2.08817458152771,
"rewards/rejected": -1.7658910751342773,
"step": 156
},
{
"epoch": 1.9738434511028693,
"grad_norm": 8.06856390519066,
"learning_rate": 5.47810463172671e-10,
"logits/chosen": -0.7470804452896118,
"logits/rejected": -0.8129448294639587,
"logps/chosen": -210.6734619140625,
"logps/rejected": -196.4785919189453,
"loss": 0.2755,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.42607438564300537,
"rewards/margins": 2.0202014446258545,
"rewards/rejected": -1.5941270589828491,
"step": 158
},
{
"epoch": 1.998828811243412,
"grad_norm": 9.773194489937959,
"learning_rate": 0.0,
"logits/chosen": -0.80845707654953,
"logits/rejected": -0.8671077489852905,
"logps/chosen": -203.4849853515625,
"logps/rejected": -204.71002197265625,
"loss": 0.2941,
"rewards/accuracies": 0.91796875,
"rewards/chosen": 0.35994353890419006,
"rewards/margins": 2.0885844230651855,
"rewards/rejected": -1.7286407947540283,
"step": 160
}
],
"logging_steps": 2,
"max_steps": 160,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}