dpo_0622_policy2 / trainer_state.json
WDong's picture
Upload 17 files
72a8e55 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.994495412844037,
"eval_steps": 500,
"global_step": 408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014678899082568808,
"grad_norm": 2.871569871902466,
"learning_rate": 2.439024390243903e-07,
"logits/chosen": -1.156640887260437,
"logits/rejected": -2.0261764526367188,
"logps/chosen": -291.95379638671875,
"logps/rejected": -199.91015625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.029357798165137616,
"grad_norm": 2.7688803672790527,
"learning_rate": 4.878048780487805e-07,
"logits/chosen": -1.1512565612792969,
"logits/rejected": -1.9958158731460571,
"logps/chosen": -313.67742919921875,
"logps/rejected": -219.4925537109375,
"loss": 0.6952,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0017839791253209114,
"rewards/margins": -0.0021596220321953297,
"rewards/rejected": 0.003943601623177528,
"step": 4
},
{
"epoch": 0.044036697247706424,
"grad_norm": 2.8300042152404785,
"learning_rate": 7.317073170731707e-07,
"logits/chosen": -1.217061996459961,
"logits/rejected": -2.1603338718414307,
"logps/chosen": -318.8204650878906,
"logps/rejected": -219.18704223632812,
"loss": 0.6906,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.003621376119554043,
"rewards/margins": 0.007228089962154627,
"rewards/rejected": -0.003606713144108653,
"step": 6
},
{
"epoch": 0.05871559633027523,
"grad_norm": 2.636244058609009,
"learning_rate": 9.75609756097561e-07,
"logits/chosen": -1.359943151473999,
"logits/rejected": -2.125555992126465,
"logps/chosen": -271.85272216796875,
"logps/rejected": -177.42059326171875,
"loss": 0.6913,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.010891949757933617,
"rewards/margins": 0.005428856238722801,
"rewards/rejected": 0.005463093984872103,
"step": 8
},
{
"epoch": 0.07339449541284404,
"grad_norm": 3.117539882659912,
"learning_rate": 1.2195121951219514e-06,
"logits/chosen": -1.1746495962142944,
"logits/rejected": -2.142481565475464,
"logps/chosen": -329.56201171875,
"logps/rejected": -171.868896484375,
"loss": 0.6837,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.019563177600502968,
"rewards/margins": 0.020585114136338234,
"rewards/rejected": -0.0010219333926215768,
"step": 10
},
{
"epoch": 0.08807339449541285,
"grad_norm": 3.573014497756958,
"learning_rate": 1.4634146341463414e-06,
"logits/chosen": -1.1120442152023315,
"logits/rejected": -1.9781230688095093,
"logps/chosen": -373.2279052734375,
"logps/rejected": -240.803955078125,
"loss": 0.6932,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.010507804341614246,
"rewards/margins": 0.00216490775346756,
"rewards/rejected": 0.008342898450791836,
"step": 12
},
{
"epoch": 0.10275229357798166,
"grad_norm": 3.1432557106018066,
"learning_rate": 1.707317073170732e-06,
"logits/chosen": -1.1176837682724,
"logits/rejected": -1.9580059051513672,
"logps/chosen": -281.2641296386719,
"logps/rejected": -181.50938415527344,
"loss": 0.6889,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.01434221863746643,
"rewards/margins": 0.010814160108566284,
"rewards/rejected": 0.0035280571319162846,
"step": 14
},
{
"epoch": 0.11743119266055047,
"grad_norm": 3.08245587348938,
"learning_rate": 1.951219512195122e-06,
"logits/chosen": -1.2329456806182861,
"logits/rejected": -2.0007548332214355,
"logps/chosen": -292.1178894042969,
"logps/rejected": -199.83258056640625,
"loss": 0.6897,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.004180246964097023,
"rewards/margins": 0.008593017235398293,
"rewards/rejected": -0.004412769805639982,
"step": 16
},
{
"epoch": 0.13211009174311927,
"grad_norm": 3.315281391143799,
"learning_rate": 2.1951219512195125e-06,
"logits/chosen": -1.1571717262268066,
"logits/rejected": -2.041630268096924,
"logps/chosen": -337.57818603515625,
"logps/rejected": -212.22586059570312,
"loss": 0.6881,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.012045616284012794,
"rewards/margins": 0.011737149208784103,
"rewards/rejected": 0.0003084660565946251,
"step": 18
},
{
"epoch": 0.14678899082568808,
"grad_norm": 3.288015127182007,
"learning_rate": 2.4390243902439027e-06,
"logits/chosen": -1.170533299446106,
"logits/rejected": -2.111523389816284,
"logps/chosen": -332.5646057128906,
"logps/rejected": -171.13861083984375,
"loss": 0.6866,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.018359623849391937,
"rewards/margins": 0.014990389347076416,
"rewards/rejected": 0.003369236597791314,
"step": 20
},
{
"epoch": 0.1614678899082569,
"grad_norm": 3.0890462398529053,
"learning_rate": 2.682926829268293e-06,
"logits/chosen": -1.326155662536621,
"logits/rejected": -2.235764265060425,
"logps/chosen": -321.82012939453125,
"logps/rejected": -199.34010314941406,
"loss": 0.6867,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.009998606517910957,
"rewards/margins": 0.014448178000748158,
"rewards/rejected": -0.0044495705515146255,
"step": 22
},
{
"epoch": 0.1761467889908257,
"grad_norm": 3.174973249435425,
"learning_rate": 2.926829268292683e-06,
"logits/chosen": -1.1311931610107422,
"logits/rejected": -2.1738736629486084,
"logps/chosen": -394.0300598144531,
"logps/rejected": -168.5726776123047,
"loss": 0.6941,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.001855961512774229,
"rewards/margins": -0.00018751714378595352,
"rewards/rejected": -0.0016684436704963446,
"step": 24
},
{
"epoch": 0.1908256880733945,
"grad_norm": 2.7846882343292236,
"learning_rate": 3.1707317073170736e-06,
"logits/chosen": -1.315462589263916,
"logits/rejected": -2.179847478866577,
"logps/chosen": -349.72467041015625,
"logps/rejected": -194.91355895996094,
"loss": 0.6842,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.020769033581018448,
"rewards/margins": 0.020399674773216248,
"rewards/rejected": 0.00036935764364898205,
"step": 26
},
{
"epoch": 0.20550458715596331,
"grad_norm": 2.960986852645874,
"learning_rate": 3.414634146341464e-06,
"logits/chosen": -1.218693733215332,
"logits/rejected": -2.219115734100342,
"logps/chosen": -303.5213928222656,
"logps/rejected": -176.81622314453125,
"loss": 0.693,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.001343409065157175,
"rewards/margins": 0.0020109512843191624,
"rewards/rejected": -0.0006675421027466655,
"step": 28
},
{
"epoch": 0.22018348623853212,
"grad_norm": 2.6187989711761475,
"learning_rate": 3.6585365853658537e-06,
"logits/chosen": -1.2147996425628662,
"logits/rejected": -2.09503173828125,
"logps/chosen": -311.60198974609375,
"logps/rejected": -211.1887664794922,
"loss": 0.6827,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.027364609763026237,
"rewards/margins": 0.022295203059911728,
"rewards/rejected": 0.0050694081000983715,
"step": 30
},
{
"epoch": 0.23486238532110093,
"grad_norm": 3.18058180809021,
"learning_rate": 3.902439024390244e-06,
"logits/chosen": -1.269258975982666,
"logits/rejected": -2.129913806915283,
"logps/chosen": -310.4969787597656,
"logps/rejected": -175.62393188476562,
"loss": 0.6784,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.02811383828520775,
"rewards/margins": 0.031415536999702454,
"rewards/rejected": -0.0033016952220350504,
"step": 32
},
{
"epoch": 0.24954128440366974,
"grad_norm": 3.44490647315979,
"learning_rate": 4.146341463414634e-06,
"logits/chosen": -1.2504366636276245,
"logits/rejected": -2.2198028564453125,
"logps/chosen": -346.65069580078125,
"logps/rejected": -176.64193725585938,
"loss": 0.6835,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.027392717078328133,
"rewards/margins": 0.02168484590947628,
"rewards/rejected": 0.0057078697718679905,
"step": 34
},
{
"epoch": 0.26422018348623855,
"grad_norm": 2.8181567192077637,
"learning_rate": 4.390243902439025e-06,
"logits/chosen": -1.2708137035369873,
"logits/rejected": -2.0570731163024902,
"logps/chosen": -332.41156005859375,
"logps/rejected": -219.01556396484375,
"loss": 0.6776,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.04396076127886772,
"rewards/margins": 0.0344666913151741,
"rewards/rejected": 0.009494070895016193,
"step": 36
},
{
"epoch": 0.27889908256880735,
"grad_norm": 3.29911208152771,
"learning_rate": 4.634146341463416e-06,
"logits/chosen": -1.2899575233459473,
"logits/rejected": -2.1684398651123047,
"logps/chosen": -316.49993896484375,
"logps/rejected": -214.9636688232422,
"loss": 0.6867,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.022868501022458076,
"rewards/margins": 0.014677047729492188,
"rewards/rejected": 0.008191454224288464,
"step": 38
},
{
"epoch": 0.29357798165137616,
"grad_norm": 2.80910325050354,
"learning_rate": 4.8780487804878055e-06,
"logits/chosen": -1.1400400400161743,
"logits/rejected": -2.0709128379821777,
"logps/chosen": -368.51824951171875,
"logps/rejected": -194.36216735839844,
"loss": 0.6702,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.04931124299764633,
"rewards/margins": 0.04868461191654205,
"rewards/rejected": 0.0006266293348744512,
"step": 40
},
{
"epoch": 0.30825688073394497,
"grad_norm": 3.187028169631958,
"learning_rate": 4.999908404322799e-06,
"logits/chosen": -1.142716646194458,
"logits/rejected": -2.20780348777771,
"logps/chosen": -343.4991760253906,
"logps/rejected": -184.4697265625,
"loss": 0.6621,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.06981995701789856,
"rewards/margins": 0.06430794298648834,
"rewards/rejected": 0.005512019619345665,
"step": 42
},
{
"epoch": 0.3229357798165138,
"grad_norm": 2.664074659347534,
"learning_rate": 4.999175679175577e-06,
"logits/chosen": -1.209214448928833,
"logits/rejected": -2.1323928833007812,
"logps/chosen": -270.0044860839844,
"logps/rejected": -171.32073974609375,
"loss": 0.656,
"rewards/accuracies": 0.828125,
"rewards/chosen": 0.07609987258911133,
"rewards/margins": 0.07787147164344788,
"rewards/rejected": -0.0017715932335704565,
"step": 44
},
{
"epoch": 0.3376146788990826,
"grad_norm": 2.661236047744751,
"learning_rate": 4.997710443643461e-06,
"logits/chosen": -1.235365629196167,
"logits/rejected": -2.0518736839294434,
"logps/chosen": -279.3170166015625,
"logps/rejected": -219.13522338867188,
"loss": 0.6659,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.0708962082862854,
"rewards/margins": 0.05834145471453667,
"rewards/rejected": 0.012554753571748734,
"step": 46
},
{
"epoch": 0.3522935779816514,
"grad_norm": 3.9819839000701904,
"learning_rate": 4.995513127188151e-06,
"logits/chosen": -1.1877082586288452,
"logits/rejected": -2.2009482383728027,
"logps/chosen": -392.36041259765625,
"logps/rejected": -197.9148406982422,
"loss": 0.661,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.10185055434703827,
"rewards/margins": 0.06925681233406067,
"rewards/rejected": 0.0325937457382679,
"step": 48
},
{
"epoch": 0.3669724770642202,
"grad_norm": 3.3627212047576904,
"learning_rate": 4.992584373844853e-06,
"logits/chosen": -1.3079514503479004,
"logits/rejected": -2.1042516231536865,
"logps/chosen": -367.0893859863281,
"logps/rejected": -195.80905151367188,
"loss": 0.6609,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.10206526517868042,
"rewards/margins": 0.06879469007253647,
"rewards/rejected": 0.03327057510614395,
"step": 50
},
{
"epoch": 0.381651376146789,
"grad_norm": 3.4364843368530273,
"learning_rate": 4.98892504203351e-06,
"logits/chosen": -1.3703242540359497,
"logits/rejected": -2.135772228240967,
"logps/chosen": -305.8392639160156,
"logps/rejected": -170.4441680908203,
"loss": 0.6426,
"rewards/accuracies": 0.921875,
"rewards/chosen": 0.13675755262374878,
"rewards/margins": 0.10614188760519028,
"rewards/rejected": 0.030615665018558502,
"step": 52
},
{
"epoch": 0.3963302752293578,
"grad_norm": 3.014284372329712,
"learning_rate": 4.9845362043071925e-06,
"logits/chosen": -1.1213593482971191,
"logits/rejected": -2.040038585662842,
"logps/chosen": -311.7105712890625,
"logps/rejected": -176.02438354492188,
"loss": 0.6448,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.13455447554588318,
"rewards/margins": 0.10230613499879837,
"rewards/rejected": 0.03224834054708481,
"step": 54
},
{
"epoch": 0.41100917431192663,
"grad_norm": 3.0536396503448486,
"learning_rate": 4.97941914703774e-06,
"logits/chosen": -1.2472190856933594,
"logits/rejected": -2.175790309906006,
"logps/chosen": -310.2051086425781,
"logps/rejected": -214.69712829589844,
"loss": 0.6303,
"rewards/accuracies": 0.859375,
"rewards/chosen": 0.18810473382472992,
"rewards/margins": 0.13538572192192078,
"rewards/rejected": 0.052719030529260635,
"step": 56
},
{
"epoch": 0.42568807339449544,
"grad_norm": 3.687453031539917,
"learning_rate": 4.973575370038718e-06,
"logits/chosen": -1.161484956741333,
"logits/rejected": -2.056807518005371,
"logps/chosen": -331.156005859375,
"logps/rejected": -206.752685546875,
"loss": 0.6109,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.22813093662261963,
"rewards/margins": 0.178737074136734,
"rewards/rejected": 0.04939386993646622,
"step": 58
},
{
"epoch": 0.44036697247706424,
"grad_norm": 2.6800389289855957,
"learning_rate": 4.967006586125827e-06,
"logits/chosen": -1.3047680854797363,
"logits/rejected": -2.1053338050842285,
"logps/chosen": -320.47052001953125,
"logps/rejected": -198.96849060058594,
"loss": 0.5949,
"rewards/accuracies": 0.984375,
"rewards/chosen": 0.25883767008781433,
"rewards/margins": 0.21324561536312103,
"rewards/rejected": 0.0455920584499836,
"step": 60
},
{
"epoch": 0.45504587155963305,
"grad_norm": 3.319866180419922,
"learning_rate": 4.959714720614871e-06,
"logits/chosen": -1.2463948726654053,
"logits/rejected": -2.2376761436462402,
"logps/chosen": -343.1983642578125,
"logps/rejected": -197.24610900878906,
"loss": 0.5745,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.3166658878326416,
"rewards/margins": 0.26522931456565857,
"rewards/rejected": 0.05143657326698303,
"step": 62
},
{
"epoch": 0.46972477064220186,
"grad_norm": 2.6847336292266846,
"learning_rate": 4.951701910757446e-06,
"logits/chosen": -1.252946138381958,
"logits/rejected": -2.0270633697509766,
"logps/chosen": -273.5660400390625,
"logps/rejected": -200.33984375,
"loss": 0.5721,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.3213706910610199,
"rewards/margins": 0.2726665437221527,
"rewards/rejected": 0.04870418459177017,
"step": 64
},
{
"epoch": 0.48440366972477067,
"grad_norm": 3.1882617473602295,
"learning_rate": 4.942970505114514e-06,
"logits/chosen": -1.1212793588638306,
"logits/rejected": -2.0485286712646484,
"logps/chosen": -329.2900390625,
"logps/rejected": -188.50067138671875,
"loss": 0.551,
"rewards/accuracies": 0.921875,
"rewards/chosen": 0.37985220551490784,
"rewards/margins": 0.32368168234825134,
"rewards/rejected": 0.056170523166656494,
"step": 66
},
{
"epoch": 0.4990825688073395,
"grad_norm": 2.6009716987609863,
"learning_rate": 4.933523062868033e-06,
"logits/chosen": -1.1749910116195679,
"logits/rejected": -2.1656789779663086,
"logps/chosen": -290.49560546875,
"logps/rejected": -177.18348693847656,
"loss": 0.5495,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.399168461561203,
"rewards/margins": 0.3291959762573242,
"rewards/rejected": 0.0699724480509758,
"step": 68
},
{
"epoch": 0.5137614678899083,
"grad_norm": 2.7933292388916016,
"learning_rate": 4.923362353070859e-06,
"logits/chosen": -0.9930830597877502,
"logits/rejected": -2.1664011478424072,
"logps/chosen": -308.12164306640625,
"logps/rejected": -170.24810791015625,
"loss": 0.5133,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.4771941602230072,
"rewards/margins": 0.4229365289211273,
"rewards/rejected": 0.05425760895013809,
"step": 70
},
{
"epoch": 0.5284403669724771,
"grad_norm": 2.4665513038635254,
"learning_rate": 4.912491353835138e-06,
"logits/chosen": -1.2331562042236328,
"logits/rejected": -2.0544230937957764,
"logps/chosen": -277.6913757324219,
"logps/rejected": -196.8771209716797,
"loss": 0.5365,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.4595210552215576,
"rewards/margins": 0.3660896122455597,
"rewards/rejected": 0.09343138337135315,
"step": 72
},
{
"epoch": 0.5431192660550459,
"grad_norm": 2.463873863220215,
"learning_rate": 4.900913251459418e-06,
"logits/chosen": -1.1638422012329102,
"logits/rejected": -2.0549814701080322,
"logps/chosen": -280.3222961425781,
"logps/rejected": -182.9549560546875,
"loss": 0.5144,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.49483105540275574,
"rewards/margins": 0.4287148714065552,
"rewards/rejected": 0.06611625105142593,
"step": 74
},
{
"epoch": 0.5577981651376147,
"grad_norm": 2.5419061183929443,
"learning_rate": 4.8886314394947396e-06,
"logits/chosen": -1.0577822923660278,
"logits/rejected": -2.03446364402771,
"logps/chosen": -299.0617980957031,
"logps/rejected": -196.64585876464844,
"loss": 0.4634,
"rewards/accuracies": 0.984375,
"rewards/chosen": 0.6822911500930786,
"rewards/margins": 0.5850739479064941,
"rewards/rejected": 0.09721729159355164,
"step": 76
},
{
"epoch": 0.5724770642201835,
"grad_norm": 2.5450778007507324,
"learning_rate": 4.875649517749985e-06,
"logits/chosen": -1.0982365608215332,
"logits/rejected": -2.1213526725769043,
"logps/chosen": -301.862548828125,
"logps/rejected": -203.84742736816406,
"loss": 0.4663,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.6690702438354492,
"rewards/margins": 0.5742719769477844,
"rewards/rejected": 0.0947982519865036,
"step": 78
},
{
"epoch": 0.5871559633027523,
"grad_norm": 2.306406259536743,
"learning_rate": 4.861971291236772e-06,
"logits/chosen": -1.243112325668335,
"logits/rejected": -2.0873706340789795,
"logps/chosen": -346.6309509277344,
"logps/rejected": -203.404052734375,
"loss": 0.4685,
"rewards/accuracies": 0.890625,
"rewards/chosen": 0.7503749132156372,
"rewards/margins": 0.5946022868156433,
"rewards/rejected": 0.15577253699302673,
"step": 80
},
{
"epoch": 0.6018348623853211,
"grad_norm": 2.5219640731811523,
"learning_rate": 4.847600769054201e-06,
"logits/chosen": -1.2759498357772827,
"logits/rejected": -2.1124911308288574,
"logps/chosen": -385.54498291015625,
"logps/rejected": -234.3006591796875,
"loss": 0.425,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.8981677293777466,
"rewards/margins": 0.7219379544258118,
"rewards/rejected": 0.1762298047542572,
"step": 82
},
{
"epoch": 0.6165137614678899,
"grad_norm": 2.297736406326294,
"learning_rate": 4.832542163213787e-06,
"logits/chosen": -1.1348319053649902,
"logits/rejected": -2.198058605194092,
"logps/chosen": -278.73016357421875,
"logps/rejected": -165.02432250976562,
"loss": 0.4082,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.8287730813026428,
"rewards/margins": 0.7731601595878601,
"rewards/rejected": 0.05561291426420212,
"step": 84
},
{
"epoch": 0.6311926605504588,
"grad_norm": 2.225381851196289,
"learning_rate": 4.816799887404911e-06,
"logits/chosen": -1.299065351486206,
"logits/rejected": -2.1710290908813477,
"logps/chosen": -316.4049377441406,
"logps/rejected": -197.56884765625,
"loss": 0.4463,
"rewards/accuracies": 0.984375,
"rewards/chosen": 0.8337810039520264,
"rewards/margins": 0.6480390429496765,
"rewards/rejected": 0.18574194610118866,
"step": 86
},
{
"epoch": 0.6458715596330276,
"grad_norm": 2.1773393154144287,
"learning_rate": 4.800378555701168e-06,
"logits/chosen": -1.145480751991272,
"logits/rejected": -2.0223851203918457,
"logps/chosen": -370.3527526855469,
"logps/rejected": -196.968505859375,
"loss": 0.4075,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.898174524307251,
"rewards/margins": 0.7861010432243347,
"rewards/rejected": 0.11207354068756104,
"step": 88
},
{
"epoch": 0.6605504587155964,
"grad_norm": 2.2705249786376953,
"learning_rate": 4.783282981207979e-06,
"logits/chosen": -1.191556453704834,
"logits/rejected": -2.307077407836914,
"logps/chosen": -312.4258728027344,
"logps/rejected": -179.88075256347656,
"loss": 0.3893,
"rewards/accuracies": 0.953125,
"rewards/chosen": 1.0026105642318726,
"rewards/margins": 0.8912415504455566,
"rewards/rejected": 0.11136899888515472,
"step": 90
},
{
"epoch": 0.6752293577981652,
"grad_norm": 1.855381965637207,
"learning_rate": 4.765518174651864e-06,
"logits/chosen": -1.1708786487579346,
"logits/rejected": -2.0928103923797607,
"logps/chosen": -301.8147277832031,
"logps/rejected": -201.09478759765625,
"loss": 0.3757,
"rewards/accuracies": 0.953125,
"rewards/chosen": 0.9942986369132996,
"rewards/margins": 0.8888772130012512,
"rewards/rejected": 0.10542140901088715,
"step": 92
},
{
"epoch": 0.689908256880734,
"grad_norm": 2.0521061420440674,
"learning_rate": 4.747089342911793e-06,
"logits/chosen": -1.011386513710022,
"logits/rejected": -2.1828246116638184,
"logps/chosen": -308.777099609375,
"logps/rejected": -185.42471313476562,
"loss": 0.3329,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.1515001058578491,
"rewards/margins": 1.04723060131073,
"rewards/rejected": 0.10426945239305496,
"step": 94
},
{
"epoch": 0.7045871559633028,
"grad_norm": 1.8322721719741821,
"learning_rate": 4.728001887493048e-06,
"logits/chosen": -1.0440161228179932,
"logits/rejected": -2.2036566734313965,
"logps/chosen": -317.36346435546875,
"logps/rejected": -204.70556640625,
"loss": 0.3371,
"rewards/accuracies": 0.953125,
"rewards/chosen": 1.257871150970459,
"rewards/margins": 1.1031622886657715,
"rewards/rejected": 0.15470871329307556,
"step": 96
},
{
"epoch": 0.7192660550458716,
"grad_norm": 1.716375708580017,
"learning_rate": 4.708261402944036e-06,
"logits/chosen": -1.1383062601089478,
"logits/rejected": -2.189666271209717,
"logps/chosen": -333.7127380371094,
"logps/rejected": -198.931884765625,
"loss": 0.2993,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.3938922882080078,
"rewards/margins": 1.2491440773010254,
"rewards/rejected": 0.14474821090698242,
"step": 98
},
{
"epoch": 0.7339449541284404,
"grad_norm": 1.7844756841659546,
"learning_rate": 4.687873675216522e-06,
"logits/chosen": -1.0265507698059082,
"logits/rejected": -1.989030122756958,
"logps/chosen": -318.661865234375,
"logps/rejected": -211.2397918701172,
"loss": 0.3127,
"rewards/accuracies": 0.953125,
"rewards/chosen": 1.453789472579956,
"rewards/margins": 1.268122673034668,
"rewards/rejected": 0.1856667846441269,
"step": 100
},
{
"epoch": 0.7486238532110092,
"grad_norm": 1.7730361223220825,
"learning_rate": 4.666844679969765e-06,
"logits/chosen": -1.3037304878234863,
"logits/rejected": -2.2598671913146973,
"logps/chosen": -312.95440673828125,
"logps/rejected": -219.03636169433594,
"loss": 0.3016,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2438600063323975,
"rewards/margins": 1.2158725261688232,
"rewards/rejected": 0.02798762172460556,
"step": 102
},
{
"epoch": 0.763302752293578,
"grad_norm": 1.6278932094573975,
"learning_rate": 4.6451805808190464e-06,
"logits/chosen": -1.1335176229476929,
"logits/rejected": -2.17392635345459,
"logps/chosen": -299.39410400390625,
"logps/rejected": -186.06622314453125,
"loss": 0.2634,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.4063000679016113,
"rewards/margins": 1.4068892002105713,
"rewards/rejected": -0.0005892012268304825,
"step": 104
},
{
"epoch": 0.7779816513761468,
"grad_norm": 1.5209800004959106,
"learning_rate": 4.622887727529104e-06,
"logits/chosen": -1.1014411449432373,
"logits/rejected": -2.1214916706085205,
"logps/chosen": -271.7640075683594,
"logps/rejected": -216.89988708496094,
"loss": 0.2627,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.386069416999817,
"rewards/margins": 1.4243448972702026,
"rewards/rejected": -0.03827540576457977,
"step": 106
},
{
"epoch": 0.7926605504587156,
"grad_norm": 1.5802730321884155,
"learning_rate": 4.599972654153018e-06,
"logits/chosen": -0.9640820026397705,
"logits/rejected": -2.146678924560547,
"logps/chosen": -315.3819885253906,
"logps/rejected": -184.68304443359375,
"loss": 0.2601,
"rewards/accuracies": 0.953125,
"rewards/chosen": 1.5327023267745972,
"rewards/margins": 1.4815881252288818,
"rewards/rejected": 0.05111423879861832,
"step": 108
},
{
"epoch": 0.8073394495412844,
"grad_norm": 1.6033107042312622,
"learning_rate": 4.5764420771170735e-06,
"logits/chosen": -0.9946492910385132,
"logits/rejected": -2.0975136756896973,
"logps/chosen": -292.52398681640625,
"logps/rejected": -202.83602905273438,
"loss": 0.2738,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.4424684047698975,
"rewards/margins": 1.4901291131973267,
"rewards/rejected": -0.04766057804226875,
"step": 110
},
{
"epoch": 0.8220183486238533,
"grad_norm": 1.6146634817123413,
"learning_rate": 4.552302893252166e-06,
"logits/chosen": -1.2488244771957397,
"logits/rejected": -2.2399239540100098,
"logps/chosen": -319.301025390625,
"logps/rejected": -215.10731506347656,
"loss": 0.2432,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.492225170135498,
"rewards/margins": 1.5009602308273315,
"rewards/rejected": -0.008735168725252151,
"step": 112
},
{
"epoch": 0.8366972477064221,
"grad_norm": 1.879619836807251,
"learning_rate": 4.52756217777234e-06,
"logits/chosen": -1.2845666408538818,
"logits/rejected": -2.2133727073669434,
"logps/chosen": -325.5247497558594,
"logps/rejected": -219.1314697265625,
"loss": 0.2626,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.571778655052185,
"rewards/margins": 1.5141938924789429,
"rewards/rejected": 0.05758478865027428,
"step": 114
},
{
"epoch": 0.8513761467889909,
"grad_norm": 1.5598102807998657,
"learning_rate": 4.502227182201035e-06,
"logits/chosen": -0.9802009463310242,
"logits/rejected": -2.0259878635406494,
"logps/chosen": -275.55816650390625,
"logps/rejected": -185.3338165283203,
"loss": 0.2275,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5974626541137695,
"rewards/margins": 1.620395541191101,
"rewards/rejected": -0.022933142259716988,
"step": 116
},
{
"epoch": 0.8660550458715597,
"grad_norm": 1.277979850769043,
"learning_rate": 4.476305332245662e-06,
"logits/chosen": -1.1266419887542725,
"logits/rejected": -2.322726249694824,
"logps/chosen": -327.810302734375,
"logps/rejected": -161.39149475097656,
"loss": 0.1984,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5967400074005127,
"rewards/margins": 1.8176448345184326,
"rewards/rejected": -0.2209048718214035,
"step": 118
},
{
"epoch": 0.8807339449541285,
"grad_norm": 1.6042323112487793,
"learning_rate": 4.449804225621116e-06,
"logits/chosen": -1.0289760828018188,
"logits/rejected": -2.102262496948242,
"logps/chosen": -291.6026611328125,
"logps/rejected": -190.6699676513672,
"loss": 0.249,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.5607997179031372,
"rewards/margins": 1.628572702407837,
"rewards/rejected": -0.06777279078960419,
"step": 120
},
{
"epoch": 0.8954128440366973,
"grad_norm": 1.430982232093811,
"learning_rate": 4.422731629822887e-06,
"logits/chosen": -0.9640188217163086,
"logits/rejected": -2.000277519226074,
"logps/chosen": -327.7152404785156,
"logps/rejected": -205.96337890625,
"loss": 0.2425,
"rewards/accuracies": 0.953125,
"rewards/chosen": 1.6806552410125732,
"rewards/margins": 1.685612440109253,
"rewards/rejected": -0.004957253113389015,
"step": 122
},
{
"epoch": 0.9100917431192661,
"grad_norm": 1.513214111328125,
"learning_rate": 4.395095479850396e-06,
"logits/chosen": -0.972959578037262,
"logits/rejected": -1.9764440059661865,
"logps/chosen": -299.74847412109375,
"logps/rejected": -197.39337158203125,
"loss": 0.2516,
"rewards/accuracies": 0.953125,
"rewards/chosen": 1.5539629459381104,
"rewards/margins": 1.6879091262817383,
"rewards/rejected": -0.13394607603549957,
"step": 124
},
{
"epoch": 0.9247706422018349,
"grad_norm": 1.2460252046585083,
"learning_rate": 4.366903875881243e-06,
"logits/chosen": -1.1148145198822021,
"logits/rejected": -2.3518619537353516,
"logps/chosen": -287.5447692871094,
"logps/rejected": -175.43360900878906,
"loss": 0.19,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5850166082382202,
"rewards/margins": 1.998946189880371,
"rewards/rejected": -0.4139295220375061,
"step": 126
},
{
"epoch": 0.9394495412844037,
"grad_norm": 1.4544743299484253,
"learning_rate": 4.3381650808970365e-06,
"logits/chosen": -1.0423675775527954,
"logits/rejected": -1.992466926574707,
"logps/chosen": -265.5049743652344,
"logps/rejected": -196.2741241455078,
"loss": 0.2207,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.5719702243804932,
"rewards/margins": 1.7563403844833374,
"rewards/rejected": -0.18437033891677856,
"step": 128
},
{
"epoch": 0.9541284403669725,
"grad_norm": 1.5302927494049072,
"learning_rate": 4.308887518261507e-06,
"logits/chosen": -0.8528121113777161,
"logits/rejected": -1.961355447769165,
"logps/chosen": -288.3016357421875,
"logps/rejected": -206.49557495117188,
"loss": 0.2075,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.7642028331756592,
"rewards/margins": 1.9030241966247559,
"rewards/rejected": -0.13882134854793549,
"step": 130
},
{
"epoch": 0.9688073394495413,
"grad_norm": 1.4101622104644775,
"learning_rate": 4.279079769251617e-06,
"logits/chosen": -1.2729012966156006,
"logits/rejected": -2.241056203842163,
"logps/chosen": -362.6707458496094,
"logps/rejected": -222.91549682617188,
"loss": 0.1861,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.8280537128448486,
"rewards/margins": 2.065840005874634,
"rewards/rejected": -0.23778626322746277,
"step": 132
},
{
"epoch": 0.9834862385321101,
"grad_norm": 1.1177998781204224,
"learning_rate": 4.248750570542373e-06,
"logits/chosen": -1.0287914276123047,
"logits/rejected": -2.1009342670440674,
"logps/chosen": -281.2322998046875,
"logps/rejected": -189.8081512451172,
"loss": 0.1931,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5686054229736328,
"rewards/margins": 1.9290703535079956,
"rewards/rejected": -0.360464870929718,
"step": 134
},
{
"epoch": 0.998165137614679,
"grad_norm": 1.2145086526870728,
"learning_rate": 4.21790881164611e-06,
"logits/chosen": -0.9554519653320312,
"logits/rejected": -2.0969762802124023,
"logps/chosen": -292.5300598144531,
"logps/rejected": -207.0960235595703,
"loss": 0.1734,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.7437012195587158,
"rewards/margins": 2.259512424468994,
"rewards/rejected": -0.5158110857009888,
"step": 136
},
{
"epoch": 1.0128440366972478,
"grad_norm": 1.095413327217102,
"learning_rate": 4.186563532306957e-06,
"logits/chosen": -0.9077868461608887,
"logits/rejected": -2.1029911041259766,
"logps/chosen": -300.1116943359375,
"logps/rejected": -180.19322204589844,
"loss": 0.1588,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7422677278518677,
"rewards/margins": 2.208810567855835,
"rewards/rejected": -0.4665430784225464,
"step": 138
},
{
"epoch": 1.0275229357798166,
"grad_norm": 1.3220425844192505,
"learning_rate": 4.154723919851291e-06,
"logits/chosen": -1.077134132385254,
"logits/rejected": -2.1211631298065186,
"logps/chosen": -300.9671325683594,
"logps/rejected": -185.4986114501953,
"loss": 0.2096,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.5104981660842896,
"rewards/margins": 1.9692846536636353,
"rewards/rejected": -0.4587865471839905,
"step": 140
},
{
"epoch": 1.0422018348623854,
"grad_norm": 0.9982088208198547,
"learning_rate": 4.122399306494918e-06,
"logits/chosen": -1.1294522285461426,
"logits/rejected": -2.265366792678833,
"logps/chosen": -348.11553955078125,
"logps/rejected": -211.96484375,
"loss": 0.1527,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.8556833267211914,
"rewards/margins": 2.2423272132873535,
"rewards/rejected": -0.38664379715919495,
"step": 142
},
{
"epoch": 1.0568807339449542,
"grad_norm": 0.9512726068496704,
"learning_rate": 4.089599166607794e-06,
"logits/chosen": -1.0260741710662842,
"logits/rejected": -2.078310489654541,
"logps/chosen": -301.7906494140625,
"logps/rejected": -200.17333984375,
"loss": 0.132,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.709304690361023,
"rewards/margins": 2.428473949432373,
"rewards/rejected": -0.7191690802574158,
"step": 144
},
{
"epoch": 1.071559633027523,
"grad_norm": 0.9233289957046509,
"learning_rate": 4.05633311393708e-06,
"logits/chosen": -0.9745887517929077,
"logits/rejected": -2.032710313796997,
"logps/chosen": -267.1161804199219,
"logps/rejected": -185.32769775390625,
"loss": 0.1508,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.708602786064148,
"rewards/margins": 2.2974541187286377,
"rewards/rejected": -0.5888515710830688,
"step": 146
},
{
"epoch": 1.0862385321100918,
"grad_norm": 0.9916685223579407,
"learning_rate": 4.022610898789349e-06,
"logits/chosen": -0.9556669592857361,
"logits/rejected": -2.117856979370117,
"logps/chosen": -277.4543762207031,
"logps/rejected": -200.896728515625,
"loss": 0.1394,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.785478115081787,
"rewards/margins": 2.374891519546509,
"rewards/rejected": -0.5894135236740112,
"step": 148
},
{
"epoch": 1.1009174311926606,
"grad_norm": 1.2182554006576538,
"learning_rate": 3.988442405172755e-06,
"logits/chosen": -0.8240389823913574,
"logits/rejected": -2.0166051387786865,
"logps/chosen": -293.0532531738281,
"logps/rejected": -215.48983764648438,
"loss": 0.121,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0409958362579346,
"rewards/margins": 2.6529250144958496,
"rewards/rejected": -0.6119292378425598,
"step": 150
},
{
"epoch": 1.1155963302752294,
"grad_norm": 1.0240944623947144,
"learning_rate": 3.953837647900031e-06,
"logits/chosen": -0.899176836013794,
"logits/rejected": -2.119375705718994,
"logps/chosen": -283.8042907714844,
"logps/rejected": -211.6457977294922,
"loss": 0.1437,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.990633249282837,
"rewards/margins": 2.620699405670166,
"rewards/rejected": -0.6300662159919739,
"step": 152
},
{
"epoch": 1.1302752293577982,
"grad_norm": 1.21559476852417,
"learning_rate": 3.918806769653135e-06,
"logits/chosen": -0.8191251754760742,
"logits/rejected": -2.017087459564209,
"logps/chosen": -331.17724609375,
"logps/rejected": -209.400146484375,
"loss": 0.152,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.9586645364761353,
"rewards/margins": 2.562222480773926,
"rewards/rejected": -0.6035579442977905,
"step": 154
},
{
"epoch": 1.144954128440367,
"grad_norm": 1.2314106225967407,
"learning_rate": 3.88336003801042e-06,
"logits/chosen": -0.9168681502342224,
"logits/rejected": -2.054666519165039,
"logps/chosen": -264.9989013671875,
"logps/rejected": -192.7652587890625,
"loss": 0.1509,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7247517108917236,
"rewards/margins": 2.3017380237579346,
"rewards/rejected": -0.5769862532615662,
"step": 156
},
{
"epoch": 1.1596330275229358,
"grad_norm": 0.9996971487998962,
"learning_rate": 3.847507842437205e-06,
"logits/chosen": -0.788710355758667,
"logits/rejected": -2.0527966022491455,
"logps/chosen": -306.01373291015625,
"logps/rejected": -187.79794311523438,
"loss": 0.1149,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9181973934173584,
"rewards/margins": 2.8037965297698975,
"rewards/rejected": -0.8855991959571838,
"step": 158
},
{
"epoch": 1.1743119266055047,
"grad_norm": 0.9679911732673645,
"learning_rate": 3.811260691240604e-06,
"logits/chosen": -0.8132730722427368,
"logits/rejected": -2.0696139335632324,
"logps/chosen": -351.917236328125,
"logps/rejected": -204.27964782714844,
"loss": 0.1011,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.159268856048584,
"rewards/margins": 2.8514890670776367,
"rewards/rejected": -0.6922197937965393,
"step": 160
},
{
"epoch": 1.1889908256880735,
"grad_norm": 0.9500184059143066,
"learning_rate": 3.774629208489547e-06,
"logits/chosen": -0.9215357899665833,
"logits/rejected": -2.1160709857940674,
"logps/chosen": -253.12631225585938,
"logps/rejected": -187.95811462402344,
"loss": 0.1173,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8172812461853027,
"rewards/margins": 2.5532562732696533,
"rewards/rejected": -0.7359753251075745,
"step": 162
},
{
"epoch": 1.2036697247706423,
"grad_norm": 1.0494946241378784,
"learning_rate": 3.7376241309008433e-06,
"logits/chosen": -1.0810823440551758,
"logits/rejected": -2.151219606399536,
"logps/chosen": -337.37255859375,
"logps/rejected": -198.8866424560547,
"loss": 0.1245,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.12947154045105,
"rewards/margins": 2.8516111373901367,
"rewards/rejected": -0.7221395373344421,
"step": 164
},
{
"epoch": 1.218348623853211,
"grad_norm": 0.822201669216156,
"learning_rate": 3.7002563046922502e-06,
"logits/chosen": -1.0325469970703125,
"logits/rejected": -2.2076807022094727,
"logps/chosen": -337.1971435546875,
"logps/rejected": -189.85455322265625,
"loss": 0.0923,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9719089269638062,
"rewards/margins": 3.0117526054382324,
"rewards/rejected": -1.0398434400558472,
"step": 166
},
{
"epoch": 1.2330275229357799,
"grad_norm": 0.681236982345581,
"learning_rate": 3.6625366824034337e-06,
"logits/chosen": -0.7656459212303162,
"logits/rejected": -2.049311399459839,
"logps/chosen": -289.5611877441406,
"logps/rejected": -224.8207550048828,
"loss": 0.0906,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0607948303222656,
"rewards/margins": 3.2540061473846436,
"rewards/rejected": -1.1932108402252197,
"step": 168
},
{
"epoch": 1.2477064220183487,
"grad_norm": 0.9994679689407349,
"learning_rate": 3.6244763196857714e-06,
"logits/chosen": -0.9609106183052063,
"logits/rejected": -2.1387076377868652,
"logps/chosen": -307.74798583984375,
"logps/rejected": -199.2579345703125,
"loss": 0.1095,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0705361366271973,
"rewards/margins": 3.1018238067626953,
"rewards/rejected": -1.0312877893447876,
"step": 170
},
{
"epoch": 1.2623853211009175,
"grad_norm": 1.2497354745864868,
"learning_rate": 3.5860863720619333e-06,
"logits/chosen": -0.9625377058982849,
"logits/rejected": -2.073275089263916,
"logps/chosen": -297.9329833984375,
"logps/rejected": -200.70681762695312,
"loss": 0.1191,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.188173532485962,
"rewards/margins": 2.902965784072876,
"rewards/rejected": -0.7147922515869141,
"step": 172
},
{
"epoch": 1.2770642201834863,
"grad_norm": 0.862918496131897,
"learning_rate": 3.547378091656186e-06,
"logits/chosen": -0.7778910994529724,
"logits/rejected": -2.1054413318634033,
"logps/chosen": -304.24798583984375,
"logps/rejected": -189.96273803710938,
"loss": 0.0918,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.911596655845642,
"rewards/margins": 3.0262608528137207,
"rewards/rejected": -1.1146641969680786,
"step": 174
},
{
"epoch": 1.2917431192660551,
"grad_norm": 0.7902020812034607,
"learning_rate": 3.5083628238963913e-06,
"logits/chosen": -1.0238415002822876,
"logits/rejected": -1.960688829421997,
"logps/chosen": -243.5750274658203,
"logps/rejected": -191.24264526367188,
"loss": 0.1312,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8321638107299805,
"rewards/margins": 2.780503988265991,
"rewards/rejected": -0.948340117931366,
"step": 176
},
{
"epoch": 1.306422018348624,
"grad_norm": 0.9199721813201904,
"learning_rate": 3.4690520041886473e-06,
"logits/chosen": -0.7949679493904114,
"logits/rejected": -2.0139424800872803,
"logps/chosen": -287.1697082519531,
"logps/rejected": -230.3143310546875,
"loss": 0.1007,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.025575876235962,
"rewards/margins": 3.0686216354370117,
"rewards/rejected": -1.0430455207824707,
"step": 178
},
{
"epoch": 1.3211009174311927,
"grad_norm": 0.6183698773384094,
"learning_rate": 3.4294571545655653e-06,
"logits/chosen": -0.8391042947769165,
"logits/rejected": -2.1887526512145996,
"logps/chosen": -302.6844482421875,
"logps/rejected": -199.70486450195312,
"loss": 0.0751,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0722954273223877,
"rewards/margins": 3.3604629039764404,
"rewards/rejected": -1.2881678342819214,
"step": 180
},
{
"epoch": 1.3357798165137615,
"grad_norm": 0.6749584674835205,
"learning_rate": 3.38958988030915e-06,
"logits/chosen": -1.1391972303390503,
"logits/rejected": -2.056378126144409,
"logps/chosen": -285.07562255859375,
"logps/rejected": -243.91146850585938,
"loss": 0.1161,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.9815781116485596,
"rewards/margins": 3.0620830059051514,
"rewards/rejected": -1.0805050134658813,
"step": 182
},
{
"epoch": 1.3504587155963304,
"grad_norm": 0.9916686415672302,
"learning_rate": 3.3494618665492833e-06,
"logits/chosen": -0.974543571472168,
"logits/rejected": -1.9790008068084717,
"logps/chosen": -265.7524719238281,
"logps/rejected": -210.968994140625,
"loss": 0.1316,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7159056663513184,
"rewards/margins": 2.7169814109802246,
"rewards/rejected": -1.0010758638381958,
"step": 184
},
{
"epoch": 1.3651376146788992,
"grad_norm": 0.7534170746803284,
"learning_rate": 3.3090848748388042e-06,
"logits/chosen": -0.9359984993934631,
"logits/rejected": -2.1165120601654053,
"logps/chosen": -365.70556640625,
"logps/rejected": -213.4051513671875,
"loss": 0.0804,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9640876054763794,
"rewards/margins": 3.4107747077941895,
"rewards/rejected": -1.4466872215270996,
"step": 186
},
{
"epoch": 1.379816513761468,
"grad_norm": 0.7047733068466187,
"learning_rate": 3.2684707397061887e-06,
"logits/chosen": -1.0234425067901611,
"logits/rejected": -2.067413806915283,
"logps/chosen": -304.1073913574219,
"logps/rejected": -191.95208740234375,
"loss": 0.0835,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.091536045074463,
"rewards/margins": 3.2847490310668945,
"rewards/rejected": -1.1932129859924316,
"step": 188
},
{
"epoch": 1.3944954128440368,
"grad_norm": 0.915761411190033,
"learning_rate": 3.2276313651868364e-06,
"logits/chosen": -0.8797706365585327,
"logits/rejected": -2.130256414413452,
"logps/chosen": -307.41839599609375,
"logps/rejected": -180.803466796875,
"loss": 0.0983,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8479622602462769,
"rewards/margins": 3.153357982635498,
"rewards/rejected": -1.3053958415985107,
"step": 190
},
{
"epoch": 1.4091743119266056,
"grad_norm": 0.7284132838249207,
"learning_rate": 3.1865787213339926e-06,
"logits/chosen": -0.8553410768508911,
"logits/rejected": -2.044377565383911,
"logps/chosen": -292.92144775390625,
"logps/rejected": -205.9124298095703,
"loss": 0.0818,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0033276081085205,
"rewards/margins": 3.4773898124694824,
"rewards/rejected": -1.474062442779541,
"step": 192
},
{
"epoch": 1.4238532110091744,
"grad_norm": 0.7230023145675659,
"learning_rate": 3.1453248407103156e-06,
"logits/chosen": -0.8956843614578247,
"logits/rejected": -2.0704410076141357,
"logps/chosen": -297.47418212890625,
"logps/rejected": -189.42091369628906,
"loss": 0.0838,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.829613208770752,
"rewards/margins": 3.2233619689941406,
"rewards/rejected": -1.3937489986419678,
"step": 194
},
{
"epoch": 1.4385321100917432,
"grad_norm": 1.092043161392212,
"learning_rate": 3.1038818148611178e-06,
"logits/chosen": -0.9160604476928711,
"logits/rejected": -1.9689029455184937,
"logps/chosen": -323.4578552246094,
"logps/rejected": -202.6251220703125,
"loss": 0.0932,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.9870991706848145,
"rewards/margins": 3.379544973373413,
"rewards/rejected": -1.3924458026885986,
"step": 196
},
{
"epoch": 1.453211009174312,
"grad_norm": 0.849423348903656,
"learning_rate": 3.062261790770331e-06,
"logits/chosen": -0.8054502010345459,
"logits/rejected": -2.017672061920166,
"logps/chosen": -268.9284973144531,
"logps/rejected": -201.11390686035156,
"loss": 0.1081,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7870291471481323,
"rewards/margins": 2.959620952606201,
"rewards/rejected": -1.1725919246673584,
"step": 198
},
{
"epoch": 1.4678899082568808,
"grad_norm": 0.5849136710166931,
"learning_rate": 3.0204769673002123e-06,
"logits/chosen": -0.8214648365974426,
"logits/rejected": -2.103921890258789,
"logps/chosen": -343.6684265136719,
"logps/rejected": -218.4034423828125,
"loss": 0.0698,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9067230224609375,
"rewards/margins": 3.2361087799072266,
"rewards/rejected": -1.3293852806091309,
"step": 200
},
{
"epoch": 1.4825688073394496,
"grad_norm": 0.8638609647750854,
"learning_rate": 2.978539591615848e-06,
"logits/chosen": -0.9360217452049255,
"logits/rejected": -1.8377161026000977,
"logps/chosen": -310.77203369140625,
"logps/rejected": -217.95361328125,
"loss": 0.0893,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8223047256469727,
"rewards/margins": 3.374411106109619,
"rewards/rejected": -1.5521066188812256,
"step": 202
},
{
"epoch": 1.4972477064220184,
"grad_norm": 0.7469986081123352,
"learning_rate": 2.936461955595501e-06,
"logits/chosen": -0.9148820638656616,
"logits/rejected": -2.0849192142486572,
"logps/chosen": -309.4117736816406,
"logps/rejected": -211.26283264160156,
"loss": 0.0876,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0167510509490967,
"rewards/margins": 3.318629503250122,
"rewards/rejected": -1.301878571510315,
"step": 204
},
{
"epoch": 1.5119266055045872,
"grad_norm": 0.48730000853538513,
"learning_rate": 2.8942563922278487e-06,
"logits/chosen": -0.8627596497535706,
"logits/rejected": -1.997396469116211,
"logps/chosen": -297.6988220214844,
"logps/rejected": -219.91180419921875,
"loss": 0.0659,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9595637321472168,
"rewards/margins": 3.744344711303711,
"rewards/rejected": -1.7847814559936523,
"step": 206
},
{
"epoch": 1.526605504587156,
"grad_norm": 1.1042286157608032,
"learning_rate": 2.8519352719971783e-06,
"logits/chosen": -0.9377632141113281,
"logits/rejected": -2.024191379547119,
"logps/chosen": -327.47027587890625,
"logps/rejected": -223.6087646484375,
"loss": 0.1017,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.0964934825897217,
"rewards/margins": 3.4565787315368652,
"rewards/rejected": -1.3600847721099854,
"step": 208
},
{
"epoch": 1.5412844036697249,
"grad_norm": 0.7358872294425964,
"learning_rate": 2.8095109992575824e-06,
"logits/chosen": -0.9008034467697144,
"logits/rejected": -2.1022136211395264,
"logps/chosen": -340.1212158203125,
"logps/rejected": -223.19918823242188,
"loss": 0.0661,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.2666172981262207,
"rewards/margins": 3.668931007385254,
"rewards/rejected": -1.4023137092590332,
"step": 210
},
{
"epoch": 1.5559633027522937,
"grad_norm": 0.823003888130188,
"learning_rate": 2.7669960085972407e-06,
"logits/chosen": -0.8504350185394287,
"logits/rejected": -2.14664888381958,
"logps/chosen": -363.5140075683594,
"logps/rejected": -241.92892456054688,
"loss": 0.068,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1337637901306152,
"rewards/margins": 3.620941638946533,
"rewards/rejected": -1.4871773719787598,
"step": 212
},
{
"epoch": 1.5706422018348625,
"grad_norm": 0.9012424349784851,
"learning_rate": 2.7244027611938247e-06,
"logits/chosen": -0.6944912672042847,
"logits/rejected": -1.8317877054214478,
"logps/chosen": -261.44049072265625,
"logps/rejected": -243.61410522460938,
"loss": 0.1016,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.750416874885559,
"rewards/margins": 3.3300774097442627,
"rewards/rejected": -1.5796607732772827,
"step": 214
},
{
"epoch": 1.5853211009174313,
"grad_norm": 0.9822458028793335,
"learning_rate": 2.6817437411621194e-06,
"logits/chosen": -0.8393555283546448,
"logits/rejected": -1.9610698223114014,
"logps/chosen": -357.4717102050781,
"logps/rejected": -259.9384765625,
"loss": 0.0773,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1349892616271973,
"rewards/margins": 3.471536636352539,
"rewards/rejected": -1.336547613143921,
"step": 216
},
{
"epoch": 1.6,
"grad_norm": 0.7191787958145142,
"learning_rate": 2.639031451894923e-06,
"logits/chosen": -0.8827037811279297,
"logits/rejected": -1.878009557723999,
"logps/chosen": -341.8013916015625,
"logps/rejected": -246.149169921875,
"loss": 0.0649,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.039233446121216,
"rewards/margins": 3.67873215675354,
"rewards/rejected": -1.6394988298416138,
"step": 218
},
{
"epoch": 1.614678899082569,
"grad_norm": 0.7397493124008179,
"learning_rate": 2.5962784123982843e-06,
"logits/chosen": -0.9270643591880798,
"logits/rejected": -2.148819923400879,
"logps/chosen": -318.17242431640625,
"logps/rejected": -221.3958282470703,
"loss": 0.077,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9221253395080566,
"rewards/margins": 3.66496205329895,
"rewards/rejected": -1.7428367137908936,
"step": 220
},
{
"epoch": 1.6293577981651377,
"grad_norm": 0.5408302545547485,
"learning_rate": 2.5534971536221804e-06,
"logits/chosen": -0.7174456715583801,
"logits/rejected": -1.9402276277542114,
"logps/chosen": -280.41265869140625,
"logps/rejected": -214.61036682128906,
"loss": 0.0679,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.8169009685516357,
"rewards/margins": 3.658087730407715,
"rewards/rejected": -1.841186761856079,
"step": 222
},
{
"epoch": 1.6440366972477065,
"grad_norm": 0.6373718976974487,
"learning_rate": 2.5107002147876814e-06,
"logits/chosen": -0.8338260650634766,
"logits/rejected": -1.8052666187286377,
"logps/chosen": -274.18408203125,
"logps/rejected": -231.13385009765625,
"loss": 0.0677,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8962229490280151,
"rewards/margins": 3.6884357929229736,
"rewards/rejected": -1.792212724685669,
"step": 224
},
{
"epoch": 1.6587155963302753,
"grad_norm": 1.006023645401001,
"learning_rate": 2.467900139711693e-06,
"logits/chosen": -0.8586325645446777,
"logits/rejected": -1.8590312004089355,
"logps/chosen": -284.29498291015625,
"logps/rejected": -219.96942138671875,
"loss": 0.1002,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.610360860824585,
"rewards/margins": 3.314877986907959,
"rewards/rejected": -1.7045170068740845,
"step": 226
},
{
"epoch": 1.6733944954128441,
"grad_norm": 0.4218728542327881,
"learning_rate": 2.4251094731303586e-06,
"logits/chosen": -0.7588306665420532,
"logits/rejected": -2.020467758178711,
"logps/chosen": -301.7962341308594,
"logps/rejected": -201.7028350830078,
"loss": 0.0583,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0787746906280518,
"rewards/margins": 3.6262543201446533,
"rewards/rejected": -1.547479271888733,
"step": 228
},
{
"epoch": 1.688073394495413,
"grad_norm": 0.44578853249549866,
"learning_rate": 2.3823407570221812e-06,
"logits/chosen": -0.681371808052063,
"logits/rejected": -2.0245919227600098,
"logps/chosen": -310.5913391113281,
"logps/rejected": -196.76724243164062,
"loss": 0.0614,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.959742784500122,
"rewards/margins": 3.5932304859161377,
"rewards/rejected": -1.633487582206726,
"step": 230
},
{
"epoch": 1.7027522935779817,
"grad_norm": 0.6186323761940002,
"learning_rate": 2.3396065269319655e-06,
"logits/chosen": -0.8481271862983704,
"logits/rejected": -2.065420150756836,
"logps/chosen": -310.2200012207031,
"logps/rejected": -194.04493713378906,
"loss": 0.0628,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0263564586639404,
"rewards/margins": 3.842787265777588,
"rewards/rejected": -1.8164305686950684,
"step": 232
},
{
"epoch": 1.7174311926605506,
"grad_norm": 0.5135802626609802,
"learning_rate": 2.2969193082966353e-06,
"logits/chosen": -0.7080973386764526,
"logits/rejected": -2.007819652557373,
"logps/chosen": -296.1108093261719,
"logps/rejected": -216.99868774414062,
"loss": 0.0534,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9229261875152588,
"rewards/margins": 3.921962022781372,
"rewards/rejected": -1.9990354776382446,
"step": 234
},
{
"epoch": 1.7321100917431194,
"grad_norm": 0.9138413071632385,
"learning_rate": 2.2542916127740194e-06,
"logits/chosen": -0.6951168775558472,
"logits/rejected": -1.6621724367141724,
"logps/chosen": -323.9538269042969,
"logps/rejected": -260.28900146484375,
"loss": 0.0728,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0283045768737793,
"rewards/margins": 3.7805428504943848,
"rewards/rejected": -1.7522385120391846,
"step": 236
},
{
"epoch": 1.7467889908256882,
"grad_norm": 0.62326979637146,
"learning_rate": 2.211735934575674e-06,
"logits/chosen": -0.7624643445014954,
"logits/rejected": -2.0803322792053223,
"logps/chosen": -293.841552734375,
"logps/rejected": -189.69631958007812,
"loss": 0.0703,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6327762603759766,
"rewards/margins": 3.612248420715332,
"rewards/rejected": -1.979472279548645,
"step": 238
},
{
"epoch": 1.761467889908257,
"grad_norm": 0.5615968108177185,
"learning_rate": 2.1692647468048235e-06,
"logits/chosen": -0.8942849636077881,
"logits/rejected": -1.9355003833770752,
"logps/chosen": -318.2629699707031,
"logps/rejected": -235.68296813964844,
"loss": 0.0618,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.87472403049469,
"rewards/margins": 4.362048149108887,
"rewards/rejected": -2.4873242378234863,
"step": 240
},
{
"epoch": 1.7761467889908258,
"grad_norm": 0.6113856434822083,
"learning_rate": 2.126890497800477e-06,
"logits/chosen": -0.9161121845245361,
"logits/rejected": -1.843569040298462,
"logps/chosen": -309.8831787109375,
"logps/rejected": -226.34967041015625,
"loss": 0.0821,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9025682210922241,
"rewards/margins": 3.5658414363861084,
"rewards/rejected": -1.6632736921310425,
"step": 242
},
{
"epoch": 1.7908256880733946,
"grad_norm": 0.7386473417282104,
"learning_rate": 2.084625607488816e-06,
"logits/chosen": -0.7687922716140747,
"logits/rejected": -1.982967734336853,
"logps/chosen": -285.9901428222656,
"logps/rejected": -213.30564880371094,
"loss": 0.0699,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0006887912750244,
"rewards/margins": 4.181596279144287,
"rewards/rejected": -2.180907726287842,
"step": 244
},
{
"epoch": 1.8055045871559634,
"grad_norm": 0.620130717754364,
"learning_rate": 2.0424824637428995e-06,
"logits/chosen": -0.7613787651062012,
"logits/rejected": -2.176778554916382,
"logps/chosen": -278.2284851074219,
"logps/rejected": -196.99716186523438,
"loss": 0.0636,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.8024476766586304,
"rewards/margins": 3.8208680152893066,
"rewards/rejected": -2.0184202194213867,
"step": 246
},
{
"epoch": 1.8201834862385322,
"grad_norm": 0.964788556098938,
"learning_rate": 2.0004734187517744e-06,
"logits/chosen": -0.9343721270561218,
"logits/rejected": -1.8525314331054688,
"logps/chosen": -328.5677795410156,
"logps/rejected": -199.83840942382812,
"loss": 0.0751,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8486145734786987,
"rewards/margins": 3.6849093437194824,
"rewards/rejected": -1.8362950086593628,
"step": 248
},
{
"epoch": 1.834862385321101,
"grad_norm": 0.3955663740634918,
"learning_rate": 1.9586107854000327e-06,
"logits/chosen": -0.9676373600959778,
"logits/rejected": -2.1090657711029053,
"logps/chosen": -307.66302490234375,
"logps/rejected": -193.6895751953125,
"loss": 0.057,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8102517127990723,
"rewards/margins": 3.7792747020721436,
"rewards/rejected": -1.9690231084823608,
"step": 250
},
{
"epoch": 1.8495412844036698,
"grad_norm": 0.6646362543106079,
"learning_rate": 1.916906833658899e-06,
"logits/chosen": -0.7113239169120789,
"logits/rejected": -1.9650328159332275,
"logps/chosen": -337.66107177734375,
"logps/rejected": -242.595703125,
"loss": 0.0528,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.94010329246521,
"rewards/margins": 4.105426788330078,
"rewards/rejected": -2.165323495864868,
"step": 252
},
{
"epoch": 1.8642201834862386,
"grad_norm": 0.6509953737258911,
"learning_rate": 1.8753737869898921e-06,
"logits/chosen": -0.794485330581665,
"logits/rejected": -1.901089072227478,
"logps/chosen": -258.750732421875,
"logps/rejected": -213.0238037109375,
"loss": 0.0543,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.602518916130066,
"rewards/margins": 4.075562477111816,
"rewards/rejected": -2.473043441772461,
"step": 254
},
{
"epoch": 1.8788990825688074,
"grad_norm": 0.4499273896217346,
"learning_rate": 1.8340238187621185e-06,
"logits/chosen": -0.7047321200370789,
"logits/rejected": -1.8908119201660156,
"logps/chosen": -273.62322998046875,
"logps/rejected": -198.1600341796875,
"loss": 0.0727,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.7582603693008423,
"rewards/margins": 3.6687071323394775,
"rewards/rejected": -1.9104465246200562,
"step": 256
},
{
"epoch": 1.8935779816513763,
"grad_norm": 0.8414962291717529,
"learning_rate": 1.7928690486842438e-06,
"logits/chosen": -0.871714174747467,
"logits/rejected": -2.0030646324157715,
"logps/chosen": -264.42059326171875,
"logps/rejected": -185.0747833251953,
"loss": 0.0654,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.900803565979004,
"rewards/margins": 3.7862067222595215,
"rewards/rejected": -1.8854031562805176,
"step": 258
},
{
"epoch": 1.908256880733945,
"grad_norm": 0.8687112927436829,
"learning_rate": 1.7519215392522026e-06,
"logits/chosen": -0.8036646246910095,
"logits/rejected": -2.0354790687561035,
"logps/chosen": -293.50811767578125,
"logps/rejected": -191.86962890625,
"loss": 0.0604,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9246323108673096,
"rewards/margins": 4.019070148468018,
"rewards/rejected": -2.094437599182129,
"step": 260
},
{
"epoch": 1.9229357798165139,
"grad_norm": 0.4811760485172272,
"learning_rate": 1.7111932922136715e-06,
"logits/chosen": -0.7815529108047485,
"logits/rejected": -1.7573397159576416,
"logps/chosen": -263.450927734375,
"logps/rejected": -229.46728515625,
"loss": 0.0629,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6779381036758423,
"rewards/margins": 3.870631217956543,
"rewards/rejected": -2.1926932334899902,
"step": 262
},
{
"epoch": 1.9376146788990827,
"grad_norm": 0.5513655543327332,
"learning_rate": 1.6706962450503408e-06,
"logits/chosen": -0.6383249759674072,
"logits/rejected": -1.9680360555648804,
"logps/chosen": -293.52130126953125,
"logps/rejected": -217.34693908691406,
"loss": 0.0472,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9342231750488281,
"rewards/margins": 4.369998931884766,
"rewards/rejected": -2.4357762336730957,
"step": 264
},
{
"epoch": 1.9522935779816515,
"grad_norm": 0.7187495827674866,
"learning_rate": 1.630442267479034e-06,
"logits/chosen": -0.6566349267959595,
"logits/rejected": -1.9347317218780518,
"logps/chosen": -277.68890380859375,
"logps/rejected": -224.22335815429688,
"loss": 0.0569,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.052359104156494,
"rewards/margins": 4.0643744468688965,
"rewards/rejected": -2.0120151042938232,
"step": 266
},
{
"epoch": 1.9669724770642203,
"grad_norm": 0.33258092403411865,
"learning_rate": 1.5904431579726837e-06,
"logits/chosen": -0.7657849192619324,
"logits/rejected": -2.030609369277954,
"logps/chosen": -306.955322265625,
"logps/rejected": -190.61703491210938,
"loss": 0.0564,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.6874788999557495,
"rewards/margins": 4.016414165496826,
"rewards/rejected": -2.328935146331787,
"step": 268
},
{
"epoch": 1.981651376146789,
"grad_norm": 0.5519306659698486,
"learning_rate": 1.5507106403021897e-06,
"logits/chosen": -0.7592746019363403,
"logits/rejected": -2.0932528972625732,
"logps/chosen": -341.2933349609375,
"logps/rejected": -232.65756225585938,
"loss": 0.0422,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3383517265319824,
"rewards/margins": 4.453994274139404,
"rewards/rejected": -2.115642547607422,
"step": 270
},
{
"epoch": 1.996330275229358,
"grad_norm": 0.5155956745147705,
"learning_rate": 1.511256360100171e-06,
"logits/chosen": -0.7073550224304199,
"logits/rejected": -2.0184946060180664,
"logps/chosen": -306.38116455078125,
"logps/rejected": -217.38668823242188,
"loss": 0.0538,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8283504247665405,
"rewards/margins": 4.175046920776367,
"rewards/rejected": -2.346696376800537,
"step": 272
},
{
"epoch": 2.0110091743119267,
"grad_norm": 0.7801055908203125,
"learning_rate": 1.4720918814476234e-06,
"logits/chosen": -0.9376870393753052,
"logits/rejected": -2.1091787815093994,
"logps/chosen": -266.37811279296875,
"logps/rejected": -209.93710327148438,
"loss": 0.0743,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6686122417449951,
"rewards/margins": 4.310929775238037,
"rewards/rejected": -2.642317771911621,
"step": 274
},
{
"epoch": 2.0256880733944955,
"grad_norm": 0.6762734055519104,
"learning_rate": 1.4332286834844792e-06,
"logits/chosen": -0.9745014309883118,
"logits/rejected": -2.0172030925750732,
"logps/chosen": -297.90997314453125,
"logps/rejected": -215.2535400390625,
"loss": 0.0594,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7038819789886475,
"rewards/margins": 3.7982983589172363,
"rewards/rejected": -2.0944161415100098,
"step": 276
},
{
"epoch": 2.0403669724770643,
"grad_norm": 0.6311278939247131,
"learning_rate": 1.3946781570450563e-06,
"logits/chosen": -0.792485773563385,
"logits/rejected": -2.0446367263793945,
"logps/chosen": -316.6257019042969,
"logps/rejected": -225.79873657226562,
"loss": 0.0436,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1141061782836914,
"rewards/margins": 4.177621841430664,
"rewards/rejected": -2.0635154247283936,
"step": 278
},
{
"epoch": 2.055045871559633,
"grad_norm": 0.4802553653717041,
"learning_rate": 1.3564516013194023e-06,
"logits/chosen": -0.5846218466758728,
"logits/rejected": -1.8708997964859009,
"logps/chosen": -278.2353515625,
"logps/rejected": -215.2820587158203,
"loss": 0.0528,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7591543197631836,
"rewards/margins": 4.34114933013916,
"rewards/rejected": -2.5819950103759766,
"step": 280
},
{
"epoch": 2.069724770642202,
"grad_norm": 0.6416748762130737,
"learning_rate": 1.3185602205414894e-06,
"logits/chosen": -0.7558883428573608,
"logits/rejected": -1.8708809614181519,
"logps/chosen": -280.8486633300781,
"logps/rejected": -198.6562042236328,
"loss": 0.0654,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9279158115386963,
"rewards/margins": 3.8500123023986816,
"rewards/rejected": -1.9220962524414062,
"step": 282
},
{
"epoch": 2.0844036697247708,
"grad_norm": 0.8262112736701965,
"learning_rate": 1.2810151207052465e-06,
"logits/chosen": -0.8148822784423828,
"logits/rejected": -1.9564712047576904,
"logps/chosen": -348.1204833984375,
"logps/rejected": -250.0408172607422,
"loss": 0.0648,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6666010618209839,
"rewards/margins": 3.944627046585083,
"rewards/rejected": -2.2780258655548096,
"step": 284
},
{
"epoch": 2.0990825688073396,
"grad_norm": 0.4954426884651184,
"learning_rate": 1.2438273063093811e-06,
"logits/chosen": -0.6735963225364685,
"logits/rejected": -1.8776307106018066,
"logps/chosen": -291.0019836425781,
"logps/rejected": -194.4311981201172,
"loss": 0.0706,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.7643864154815674,
"rewards/margins": 3.9881067276000977,
"rewards/rejected": -2.223719835281372,
"step": 286
},
{
"epoch": 2.1137614678899084,
"grad_norm": 0.8490874171257019,
"learning_rate": 1.2070076771319536e-06,
"logits/chosen": -0.9455384612083435,
"logits/rejected": -1.8131248950958252,
"logps/chosen": -365.147705078125,
"logps/rejected": -228.15090942382812,
"loss": 0.0704,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8452998399734497,
"rewards/margins": 3.8042831420898438,
"rewards/rejected": -1.958983063697815,
"step": 288
},
{
"epoch": 2.128440366972477,
"grad_norm": 0.7720925807952881,
"learning_rate": 1.1705670250356417e-06,
"logits/chosen": -0.6748377084732056,
"logits/rejected": -1.9302213191986084,
"logps/chosen": -322.6198425292969,
"logps/rejected": -224.2333526611328,
"loss": 0.053,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.065561532974243,
"rewards/margins": 4.38578987121582,
"rewards/rejected": -2.320228338241577,
"step": 290
},
{
"epoch": 2.143119266055046,
"grad_norm": 0.4491863548755646,
"learning_rate": 1.1345160308046413e-06,
"logits/chosen": -0.7005204558372498,
"logits/rejected": -2.1741456985473633,
"logps/chosen": -398.5745849609375,
"logps/rejected": -235.8988800048828,
"loss": 0.0613,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.9305046796798706,
"rewards/margins": 4.522059440612793,
"rewards/rejected": -2.591554641723633,
"step": 292
},
{
"epoch": 2.157798165137615,
"grad_norm": 0.6360311508178711,
"learning_rate": 1.0988652610141154e-06,
"logits/chosen": -0.7096176147460938,
"logits/rejected": -1.7769296169281006,
"logps/chosen": -288.8116760253906,
"logps/rejected": -241.828369140625,
"loss": 0.0548,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.8457667827606201,
"rewards/margins": 3.990344524383545,
"rewards/rejected": -2.144577980041504,
"step": 294
},
{
"epoch": 2.1724770642201836,
"grad_norm": 0.3716106116771698,
"learning_rate": 1.063625164933124e-06,
"logits/chosen": -0.6774280667304993,
"logits/rejected": -1.9684358835220337,
"logps/chosen": -345.6900939941406,
"logps/rejected": -240.35296630859375,
"loss": 0.0367,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1445114612579346,
"rewards/margins": 4.849350452423096,
"rewards/rejected": -2.704838752746582,
"step": 296
},
{
"epoch": 2.1871559633027524,
"grad_norm": 0.525005578994751,
"learning_rate": 1.0288060714619359e-06,
"logits/chosen": -0.9460769891738892,
"logits/rejected": -2.1344943046569824,
"logps/chosen": -330.6282043457031,
"logps/rejected": -195.7637481689453,
"loss": 0.0515,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1682145595550537,
"rewards/margins": 4.502593040466309,
"rewards/rejected": -2.334378480911255,
"step": 298
},
{
"epoch": 2.2018348623853212,
"grad_norm": 0.458524227142334,
"learning_rate": 9.944181861046188e-07,
"logits/chosen": -0.7203876376152039,
"logits/rejected": -1.8515840768814087,
"logps/chosen": -347.7017517089844,
"logps/rejected": -233.29393005371094,
"loss": 0.0642,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.7970588207244873,
"rewards/margins": 4.392740249633789,
"rewards/rejected": -2.5956814289093018,
"step": 300
},
{
"epoch": 2.21651376146789,
"grad_norm": 0.43879008293151855,
"learning_rate": 9.604715879777986e-07,
"logits/chosen": -0.7226991057395935,
"logits/rejected": -2.0477523803710938,
"logps/chosen": -290.74530029296875,
"logps/rejected": -181.2649688720703,
"loss": 0.0503,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9678212404251099,
"rewards/margins": 4.382383346557617,
"rewards/rejected": -2.4145617485046387,
"step": 302
},
{
"epoch": 2.231192660550459,
"grad_norm": 0.4379405081272125,
"learning_rate": 9.269762268564616e-07,
"logits/chosen": -0.8170676231384277,
"logits/rejected": -2.0070619583129883,
"logps/chosen": -265.16571044921875,
"logps/rejected": -186.93112182617188,
"loss": 0.0614,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8528399467468262,
"rewards/margins": 4.132846355438232,
"rewards/rejected": -2.2800064086914062,
"step": 304
},
{
"epoch": 2.2458715596330276,
"grad_norm": 0.8870872855186462,
"learning_rate": 8.939419202576694e-07,
"logits/chosen": -0.5970391631126404,
"logits/rejected": -1.7150076627731323,
"logps/chosen": -268.1172180175781,
"logps/rejected": -207.63734436035156,
"loss": 0.0883,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.653077244758606,
"rewards/margins": 3.3603720664978027,
"rewards/rejected": -1.7072948217391968,
"step": 306
},
{
"epoch": 2.2605504587155965,
"grad_norm": 0.737343966960907,
"learning_rate": 8.61378350563033e-07,
"logits/chosen": -0.7202005386352539,
"logits/rejected": -1.8895469903945923,
"logps/chosen": -262.6046447753906,
"logps/rejected": -221.35336303710938,
"loss": 0.0627,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.750601053237915,
"rewards/margins": 3.905064582824707,
"rewards/rejected": -2.154463529586792,
"step": 308
},
{
"epoch": 2.2752293577981653,
"grad_norm": 0.5748594403266907,
"learning_rate": 8.292950621808022e-07,
"logits/chosen": -0.7942256927490234,
"logits/rejected": -1.9462255239486694,
"logps/chosen": -297.3062438964844,
"logps/rejected": -220.7073516845703,
"loss": 0.0439,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.907147765159607,
"rewards/margins": 4.309232711791992,
"rewards/rejected": -2.402085065841675,
"step": 310
},
{
"epoch": 2.289908256880734,
"grad_norm": 0.5790998339653015,
"learning_rate": 7.977014587483925e-07,
"logits/chosen": -0.8033642768859863,
"logits/rejected": -1.9477308988571167,
"logps/chosen": -285.6184997558594,
"logps/rejected": -257.89910888671875,
"loss": 0.0548,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8432916402816772,
"rewards/margins": 4.054279804229736,
"rewards/rejected": -2.2109880447387695,
"step": 312
},
{
"epoch": 2.304587155963303,
"grad_norm": 0.6188729405403137,
"learning_rate": 7.666068003761684e-07,
"logits/chosen": -0.7408751249313354,
"logits/rejected": -1.9631062746047974,
"logps/chosen": -308.1776123046875,
"logps/rejected": -198.61410522460938,
"loss": 0.0454,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7822659015655518,
"rewards/margins": 4.317424297332764,
"rewards/rejected": -2.535158634185791,
"step": 314
},
{
"epoch": 2.3192660550458717,
"grad_norm": 0.5603534579277039,
"learning_rate": 7.360202009332993e-07,
"logits/chosen": -0.8284570574760437,
"logits/rejected": -2.0091702938079834,
"logps/chosen": -307.47088623046875,
"logps/rejected": -215.27903747558594,
"loss": 0.0523,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6950868368148804,
"rewards/margins": 4.28853178024292,
"rewards/rejected": -2.59344482421875,
"step": 316
},
{
"epoch": 2.3339449541284405,
"grad_norm": 0.3757495582103729,
"learning_rate": 7.059506253764773e-07,
"logits/chosen": -0.7530102729797363,
"logits/rejected": -1.9654746055603027,
"logps/chosen": -326.5684814453125,
"logps/rejected": -226.15786743164062,
"loss": 0.0454,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8712252378463745,
"rewards/margins": 4.424376964569092,
"rewards/rejected": -2.553151845932007,
"step": 318
},
{
"epoch": 2.3486238532110093,
"grad_norm": 0.6858806014060974,
"learning_rate": 6.764068871222825e-07,
"logits/chosen": -0.5249571204185486,
"logits/rejected": -1.8156137466430664,
"logps/chosen": -298.5492858886719,
"logps/rejected": -212.81187438964844,
"loss": 0.0564,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8946092128753662,
"rewards/margins": 4.014616012573242,
"rewards/rejected": -2.120006561279297,
"step": 320
},
{
"epoch": 2.363302752293578,
"grad_norm": 0.4449942409992218,
"learning_rate": 6.473976454639608e-07,
"logits/chosen": -0.7823415398597717,
"logits/rejected": -2.0849199295043945,
"logps/chosen": -306.5335693359375,
"logps/rejected": -197.4770050048828,
"loss": 0.0458,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.062614679336548,
"rewards/margins": 4.463562488555908,
"rewards/rejected": -2.4009478092193604,
"step": 322
},
{
"epoch": 2.377981651376147,
"grad_norm": 0.699175238609314,
"learning_rate": 6.189314030333796e-07,
"logits/chosen": -0.5810756087303162,
"logits/rejected": -1.8031431436538696,
"logps/chosen": -292.385009765625,
"logps/rejected": -252.5111083984375,
"loss": 0.0572,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7218618392944336,
"rewards/margins": 4.281242847442627,
"rewards/rejected": -2.5593810081481934,
"step": 324
},
{
"epoch": 2.3926605504587157,
"grad_norm": 0.5212377309799194,
"learning_rate": 5.910165033089e-07,
"logits/chosen": -0.6628118753433228,
"logits/rejected": -2.0212368965148926,
"logps/chosen": -328.28729248046875,
"logps/rejected": -230.48863220214844,
"loss": 0.0409,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0009829998016357,
"rewards/margins": 4.275434494018555,
"rewards/rejected": -2.274451732635498,
"step": 326
},
{
"epoch": 2.4073394495412845,
"grad_norm": 0.5146971344947815,
"learning_rate": 5.636611281698956e-07,
"logits/chosen": -0.7095816731452942,
"logits/rejected": -1.83794367313385,
"logps/chosen": -272.989990234375,
"logps/rejected": -213.9049835205078,
"loss": 0.0519,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5980645418167114,
"rewards/margins": 3.967491388320923,
"rewards/rejected": -2.369426727294922,
"step": 328
},
{
"epoch": 2.4220183486238533,
"grad_norm": 0.6070245504379272,
"learning_rate": 5.368732954986389e-07,
"logits/chosen": -0.8353590369224548,
"logits/rejected": -1.9633159637451172,
"logps/chosen": -291.64990234375,
"logps/rejected": -226.0115966796875,
"loss": 0.0595,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7673264741897583,
"rewards/margins": 4.178317070007324,
"rewards/rejected": -2.4109902381896973,
"step": 330
},
{
"epoch": 2.436697247706422,
"grad_norm": 0.411520391702652,
"learning_rate": 5.106608568302504e-07,
"logits/chosen": -0.8378889560699463,
"logits/rejected": -1.9491535425186157,
"logps/chosen": -269.3817443847656,
"logps/rejected": -226.02801513671875,
"loss": 0.0607,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.776644229888916,
"rewards/margins": 4.346107006072998,
"rewards/rejected": -2.569462537765503,
"step": 332
},
{
"epoch": 2.451376146788991,
"grad_norm": 0.538725733757019,
"learning_rate": 4.850314950514124e-07,
"logits/chosen": -0.5758827328681946,
"logits/rejected": -1.8072640895843506,
"logps/chosen": -293.7169189453125,
"logps/rejected": -222.18191528320312,
"loss": 0.0476,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8555824756622314,
"rewards/margins": 4.301581382751465,
"rewards/rejected": -2.4459989070892334,
"step": 334
},
{
"epoch": 2.4660550458715598,
"grad_norm": 0.6865962147712708,
"learning_rate": 4.599927221485034e-07,
"logits/chosen": -0.6990569233894348,
"logits/rejected": -1.9968361854553223,
"logps/chosen": -290.4656066894531,
"logps/rejected": -200.7618408203125,
"loss": 0.0467,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.737301230430603,
"rewards/margins": 4.213505744934082,
"rewards/rejected": -2.4762046337127686,
"step": 336
},
{
"epoch": 2.4807339449541286,
"grad_norm": 0.5077099204063416,
"learning_rate": 4.3555187700583175e-07,
"logits/chosen": -0.6568117141723633,
"logits/rejected": -1.949430227279663,
"logps/chosen": -277.690673828125,
"logps/rejected": -220.354248046875,
"loss": 0.0369,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8167202472686768,
"rewards/margins": 4.532853126525879,
"rewards/rejected": -2.716132879257202,
"step": 338
},
{
"epoch": 2.4954128440366974,
"grad_norm": 0.49507051706314087,
"learning_rate": 4.1171612325460244e-07,
"logits/chosen": -0.7259389162063599,
"logits/rejected": -1.808924674987793,
"logps/chosen": -290.60845947265625,
"logps/rejected": -214.2810821533203,
"loss": 0.0532,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6286503076553345,
"rewards/margins": 4.091249942779541,
"rewards/rejected": -2.462599515914917,
"step": 340
},
{
"epoch": 2.510091743119266,
"grad_norm": 0.348964124917984,
"learning_rate": 3.8849244717325206e-07,
"logits/chosen": -0.727351188659668,
"logits/rejected": -1.7707502841949463,
"logps/chosen": -281.7944030761719,
"logps/rejected": -234.1016082763672,
"loss": 0.0477,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9929590225219727,
"rewards/margins": 4.754191875457764,
"rewards/rejected": -2.761232852935791,
"step": 342
},
{
"epoch": 2.524770642201835,
"grad_norm": 0.5973061919212341,
"learning_rate": 3.658876556397628e-07,
"logits/chosen": -0.8893070816993713,
"logits/rejected": -2.0552244186401367,
"logps/chosen": -266.6897277832031,
"logps/rejected": -199.23097229003906,
"loss": 0.059,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8175071477890015,
"rewards/margins": 4.262746810913086,
"rewards/rejected": -2.445240020751953,
"step": 344
},
{
"epoch": 2.539449541284404,
"grad_norm": 0.669189453125,
"learning_rate": 3.4390837413656256e-07,
"logits/chosen": -0.7612945437431335,
"logits/rejected": -2.0197830200195312,
"logps/chosen": -289.519775390625,
"logps/rejected": -236.5748748779297,
"loss": 0.0455,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8634060621261597,
"rewards/margins": 4.5830864906311035,
"rewards/rejected": -2.7196803092956543,
"step": 346
},
{
"epoch": 2.5541284403669726,
"grad_norm": 0.49376487731933594,
"learning_rate": 3.225610448085903e-07,
"logits/chosen": -0.703992486000061,
"logits/rejected": -1.8440505266189575,
"logps/chosen": -282.47967529296875,
"logps/rejected": -213.64584350585938,
"loss": 0.0459,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8914369344711304,
"rewards/margins": 4.45714807510376,
"rewards/rejected": -2.565711498260498,
"step": 348
},
{
"epoch": 2.5688073394495414,
"grad_norm": 0.6668093204498291,
"learning_rate": 3.018519245750989e-07,
"logits/chosen": -0.775786817073822,
"logits/rejected": -1.7931033372879028,
"logps/chosen": -332.7348937988281,
"logps/rejected": -254.2784423828125,
"loss": 0.0578,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.7138545513153076,
"rewards/margins": 4.272766590118408,
"rewards/rejected": -2.5589118003845215,
"step": 350
},
{
"epoch": 2.5834862385321102,
"grad_norm": 0.46660616993904114,
"learning_rate": 2.817870832957459e-07,
"logits/chosen": -0.6354199051856995,
"logits/rejected": -1.8320108652114868,
"logps/chosen": -270.6486511230469,
"logps/rejected": -209.35401916503906,
"loss": 0.0468,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8665962219238281,
"rewards/margins": 4.460667133331299,
"rewards/rejected": -2.594071388244629,
"step": 352
},
{
"epoch": 2.598165137614679,
"grad_norm": 0.7453739047050476,
"learning_rate": 2.6237240199151386e-07,
"logits/chosen": -0.7968777418136597,
"logits/rejected": -2.040590763092041,
"logps/chosen": -278.96051025390625,
"logps/rejected": -198.50344848632812,
"loss": 0.0617,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8018391132354736,
"rewards/margins": 3.8879919052124023,
"rewards/rejected": -2.0861527919769287,
"step": 354
},
{
"epoch": 2.612844036697248,
"grad_norm": 0.6645973920822144,
"learning_rate": 2.436135711209786e-07,
"logits/chosen": -1.0428318977355957,
"logits/rejected": -2.0885515213012695,
"logps/chosen": -291.73846435546875,
"logps/rejected": -194.1337890625,
"loss": 0.0523,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.6952629089355469,
"rewards/margins": 4.137482166290283,
"rewards/rejected": -2.4422197341918945,
"step": 356
},
{
"epoch": 2.6275229357798167,
"grad_norm": 0.6470810174942017,
"learning_rate": 2.2551608891243026e-07,
"logits/chosen": -1.004224419593811,
"logits/rejected": -2.139845609664917,
"logps/chosen": -366.6258544921875,
"logps/rejected": -242.1497344970703,
"loss": 0.053,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7366106510162354,
"rewards/margins": 3.9900641441345215,
"rewards/rejected": -2.2534537315368652,
"step": 358
},
{
"epoch": 2.6422018348623855,
"grad_norm": 0.4664456248283386,
"learning_rate": 2.0808525975233807e-07,
"logits/chosen": -0.6308703422546387,
"logits/rejected": -1.8344846963882446,
"logps/chosen": -294.6444091796875,
"logps/rejected": -229.31024169921875,
"loss": 0.0686,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.5859092473983765,
"rewards/margins": 4.006678104400635,
"rewards/rejected": -2.4207687377929688,
"step": 360
},
{
"epoch": 2.6568807339449543,
"grad_norm": 0.8631575107574463,
"learning_rate": 1.9132619263063144e-07,
"logits/chosen": -0.6818079352378845,
"logits/rejected": -1.9622324705123901,
"logps/chosen": -360.9144287109375,
"logps/rejected": -245.8763885498047,
"loss": 0.0507,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.0507702827453613,
"rewards/margins": 4.704620838165283,
"rewards/rejected": -2.653850555419922,
"step": 362
},
{
"epoch": 2.671559633027523,
"grad_norm": 0.2709774672985077,
"learning_rate": 1.7524379964325155e-07,
"logits/chosen": -0.7185477614402771,
"logits/rejected": -1.9397680759429932,
"logps/chosen": -340.63604736328125,
"logps/rejected": -234.94650268554688,
"loss": 0.038,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.689025640487671,
"rewards/margins": 4.374544620513916,
"rewards/rejected": -2.685518741607666,
"step": 364
},
{
"epoch": 2.686238532110092,
"grad_norm": 0.533819854259491,
"learning_rate": 1.5984279455240975e-07,
"logits/chosen": -0.8093196153640747,
"logits/rejected": -1.8389997482299805,
"logps/chosen": -295.1790771484375,
"logps/rejected": -222.28091430664062,
"loss": 0.0453,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.990577220916748,
"rewards/margins": 4.476598262786865,
"rewards/rejected": -2.486021041870117,
"step": 366
},
{
"epoch": 2.7009174311926607,
"grad_norm": 0.49332335591316223,
"learning_rate": 1.451276914049818e-07,
"logits/chosen": -0.7148327827453613,
"logits/rejected": -1.8303236961364746,
"logps/chosen": -265.9716491699219,
"logps/rejected": -208.4017791748047,
"loss": 0.0493,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5792350769042969,
"rewards/margins": 4.226253509521484,
"rewards/rejected": -2.6470184326171875,
"step": 368
},
{
"epoch": 2.7155963302752295,
"grad_norm": 0.44270747900009155,
"learning_rate": 1.3110280320943692e-07,
"logits/chosen": -0.6963136792182922,
"logits/rejected": -2.0225512981414795,
"logps/chosen": -283.3634033203125,
"logps/rejected": -202.95513916015625,
"loss": 0.0372,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.947371244430542,
"rewards/margins": 4.56584358215332,
"rewards/rejected": -2.6184728145599365,
"step": 370
},
{
"epoch": 2.7302752293577983,
"grad_norm": 0.43144798278808594,
"learning_rate": 1.1777224067169218e-07,
"logits/chosen": -0.6372362375259399,
"logits/rejected": -1.8398162126541138,
"logps/chosen": -290.5965576171875,
"logps/rejected": -221.44293212890625,
"loss": 0.0397,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9076143503189087,
"rewards/margins": 4.488263130187988,
"rewards/rejected": -2.580648899078369,
"step": 372
},
{
"epoch": 2.744954128440367,
"grad_norm": 0.6024923920631409,
"learning_rate": 1.0513991099025872e-07,
"logits/chosen": -0.797070324420929,
"logits/rejected": -1.9885629415512085,
"logps/chosen": -334.8035888671875,
"logps/rejected": -221.12759399414062,
"loss": 0.0531,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.713030457496643,
"rewards/margins": 3.9955036640167236,
"rewards/rejected": -2.28247332572937,
"step": 374
},
{
"epoch": 2.759633027522936,
"grad_norm": 0.42591243982315063,
"learning_rate": 9.320951671104194e-08,
"logits/chosen": -0.6949442625045776,
"logits/rejected": -1.9821323156356812,
"logps/chosen": -326.1830749511719,
"logps/rejected": -219.29837036132812,
"loss": 0.0364,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.3185112476348877,
"rewards/margins": 4.512205600738525,
"rewards/rejected": -2.1936943531036377,
"step": 376
},
{
"epoch": 2.7743119266055047,
"grad_norm": 0.5629270076751709,
"learning_rate": 8.198455464212108e-08,
"logits/chosen": -0.734917402267456,
"logits/rejected": -1.9623744487762451,
"logps/chosen": -304.7027282714844,
"logps/rejected": -207.42129516601562,
"loss": 0.0376,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.003225326538086,
"rewards/margins": 4.722169876098633,
"rewards/rejected": -2.7189443111419678,
"step": 378
},
{
"epoch": 2.7889908256880735,
"grad_norm": 0.45993342995643616,
"learning_rate": 7.146831482883115e-08,
"logits/chosen": -0.5041406750679016,
"logits/rejected": -1.9728295803070068,
"logps/chosen": -310.0588073730469,
"logps/rejected": -202.70693969726562,
"loss": 0.0358,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.020038604736328,
"rewards/margins": 4.748332500457764,
"rewards/rejected": -2.7282943725585938,
"step": 380
},
{
"epoch": 2.8036697247706424,
"grad_norm": 0.772245466709137,
"learning_rate": 6.16638795894492e-08,
"logits/chosen": -0.6536301374435425,
"logits/rejected": -1.7665328979492188,
"logps/chosen": -273.5377197265625,
"logps/rejected": -230.2478485107422,
"loss": 0.0608,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9565868377685547,
"rewards/margins": 4.441068649291992,
"rewards/rejected": -2.4844815731048584,
"step": 382
},
{
"epoch": 2.818348623853211,
"grad_norm": 0.7546908855438232,
"learning_rate": 5.257412261176375e-08,
"logits/chosen": -0.8912358283996582,
"logits/rejected": -1.845367193222046,
"logps/chosen": -286.1430969238281,
"logps/rejected": -220.30615234375,
"loss": 0.0467,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.9721031188964844,
"rewards/margins": 4.426693916320801,
"rewards/rejected": -2.4545907974243164,
"step": 384
},
{
"epoch": 2.83302752293578,
"grad_norm": 0.8646131157875061,
"learning_rate": 4.4201708110795384e-08,
"logits/chosen": -0.7442179918289185,
"logits/rejected": -1.8478055000305176,
"logps/chosen": -304.1729431152344,
"logps/rejected": -233.10855102539062,
"loss": 0.0542,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.854830265045166,
"rewards/margins": 4.118017673492432,
"rewards/rejected": -2.2631874084472656,
"step": 386
},
{
"epoch": 2.847706422018349,
"grad_norm": 0.5237764120101929,
"learning_rate": 3.654909004791152e-08,
"logits/chosen": -0.7583023309707642,
"logits/rejected": -2.0417721271514893,
"logps/chosen": -305.7694091796875,
"logps/rejected": -214.91346740722656,
"loss": 0.0519,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.7464522123336792,
"rewards/margins": 4.34686279296875,
"rewards/rejected": -2.600410223007202,
"step": 388
},
{
"epoch": 2.8623853211009176,
"grad_norm": 0.4562954306602478,
"learning_rate": 2.9618511411570462e-08,
"logits/chosen": -0.8513392210006714,
"logits/rejected": -1.93741774559021,
"logps/chosen": -298.6360778808594,
"logps/rejected": -200.97865295410156,
"loss": 0.0594,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.5848456621170044,
"rewards/margins": 4.198357105255127,
"rewards/rejected": -2.613511562347412,
"step": 390
},
{
"epoch": 2.8770642201834864,
"grad_norm": 0.8423421382904053,
"learning_rate": 2.3412003559898088e-08,
"logits/chosen": -0.701295018196106,
"logits/rejected": -1.7541186809539795,
"logps/chosen": -282.74078369140625,
"logps/rejected": -235.9966278076172,
"loss": 0.0715,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.771613359451294,
"rewards/margins": 3.9545040130615234,
"rewards/rejected": -2.1828906536102295,
"step": 392
},
{
"epoch": 2.891743119266055,
"grad_norm": 0.7763597369194031,
"learning_rate": 1.793138562529634e-08,
"logits/chosen": -0.818265438079834,
"logits/rejected": -2.0317091941833496,
"logps/chosen": -358.9674377441406,
"logps/rejected": -211.57412719726562,
"loss": 0.0469,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.096813917160034,
"rewards/margins": 4.269334316253662,
"rewards/rejected": -2.172520399093628,
"step": 394
},
{
"epoch": 2.906422018348624,
"grad_norm": 0.543138325214386,
"learning_rate": 1.317826398125277e-08,
"logits/chosen": -0.8907778263092041,
"logits/rejected": -2.0098018646240234,
"logps/chosen": -306.7756652832031,
"logps/rejected": -236.91026306152344,
"loss": 0.0441,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.1162776947021484,
"rewards/margins": 4.927333831787109,
"rewards/rejected": -2.81105637550354,
"step": 396
},
{
"epoch": 2.921100917431193,
"grad_norm": 0.40663444995880127,
"learning_rate": 9.15403177151275e-09,
"logits/chosen": -0.744702160358429,
"logits/rejected": -1.7668923139572144,
"logps/chosen": -288.4136962890625,
"logps/rejected": -249.9727325439453,
"loss": 0.0399,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.962416648864746,
"rewards/margins": 4.565864562988281,
"rewards/rejected": -2.603447914123535,
"step": 398
},
{
"epoch": 2.9357798165137616,
"grad_norm": 0.4989350736141205,
"learning_rate": 5.85986850174608e-09,
"logits/chosen": -0.6515053510665894,
"logits/rejected": -2.1018004417419434,
"logps/chosen": -325.62371826171875,
"logps/rejected": -215.43890380859375,
"loss": 0.0425,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.057291030883789,
"rewards/margins": 4.5394287109375,
"rewards/rejected": -2.4821372032165527,
"step": 400
},
{
"epoch": 2.9504587155963304,
"grad_norm": 0.5903070569038391,
"learning_rate": 3.296739693834927e-09,
"logits/chosen": -0.936674952507019,
"logits/rejected": -1.8789682388305664,
"logps/chosen": -317.982666015625,
"logps/rejected": -212.38453674316406,
"loss": 0.0552,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.4384199380874634,
"rewards/margins": 3.8503003120422363,
"rewards/rejected": -2.4118804931640625,
"step": 402
},
{
"epoch": 2.9651376146788992,
"grad_norm": 0.7236863374710083,
"learning_rate": 1.4653966028774225e-09,
"logits/chosen": -0.7320691347122192,
"logits/rejected": -1.8207372426986694,
"logps/chosen": -326.4574890136719,
"logps/rejected": -244.78536987304688,
"loss": 0.0475,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.709165096282959,
"rewards/margins": 4.418180465698242,
"rewards/rejected": -2.7090158462524414,
"step": 404
},
{
"epoch": 2.979816513761468,
"grad_norm": 0.37622901797294617,
"learning_rate": 3.6637599699351766e-10,
"logits/chosen": -0.6842759847640991,
"logits/rejected": -2.033496856689453,
"logps/chosen": -302.9255065917969,
"logps/rejected": -209.06802368164062,
"loss": 0.0455,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.8500210046768188,
"rewards/margins": 4.324655532836914,
"rewards/rejected": -2.4746341705322266,
"step": 406
},
{
"epoch": 2.994495412844037,
"grad_norm": 0.5513418316841125,
"learning_rate": 0.0,
"logits/chosen": -0.8159844279289246,
"logits/rejected": -1.814368724822998,
"logps/chosen": -333.46990966796875,
"logps/rejected": -244.63433837890625,
"loss": 0.0625,
"rewards/accuracies": 0.984375,
"rewards/chosen": 1.8996886014938354,
"rewards/margins": 4.328461170196533,
"rewards/rejected": -2.4287726879119873,
"step": 408
},
{
"epoch": 2.994495412844037,
"step": 408,
"total_flos": 7.837376281021809e+17,
"train_loss": 0.2111055671474805,
"train_runtime": 8097.1834,
"train_samples_per_second": 1.614,
"train_steps_per_second": 0.05
}
],
"logging_steps": 2,
"max_steps": 408,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.837376281021809e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}