mistral-sft-simpo-cleaned-re / trainer_state.json
JW17's picture
Model save
6b1122d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 476,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01050420168067227,
"grad_norm": 18.321366845625462,
"learning_rate": 3.125e-08,
"logits/chosen": -2.9222915172576904,
"logits/rejected": -2.8865013122558594,
"logps/chosen": -0.9845348596572876,
"logps/rejected": -1.163271427154541,
"loss": 1.6281,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.9690697193145752,
"rewards/margins": 0.35747313499450684,
"rewards/rejected": -2.326542854309082,
"step": 5
},
{
"epoch": 0.02100840336134454,
"grad_norm": 17.6534655125861,
"learning_rate": 6.25e-08,
"logits/chosen": -2.9073705673217773,
"logits/rejected": -2.8619837760925293,
"logps/chosen": -0.9123918414115906,
"logps/rejected": -1.1516292095184326,
"loss": 1.5762,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8247836828231812,
"rewards/margins": 0.47847509384155273,
"rewards/rejected": -2.3032584190368652,
"step": 10
},
{
"epoch": 0.031512605042016806,
"grad_norm": 19.44309460886479,
"learning_rate": 9.375e-08,
"logits/chosen": -2.939253807067871,
"logits/rejected": -2.871269941329956,
"logps/chosen": -0.9964561462402344,
"logps/rejected": -1.157931923866272,
"loss": 1.6292,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9929122924804688,
"rewards/margins": 0.32295167446136475,
"rewards/rejected": -2.315863847732544,
"step": 15
},
{
"epoch": 0.04201680672268908,
"grad_norm": 23.00550320924175,
"learning_rate": 1.25e-07,
"logits/chosen": -2.8980793952941895,
"logits/rejected": -2.8317883014678955,
"logps/chosen": -1.0304123163223267,
"logps/rejected": -1.2014151811599731,
"loss": 1.598,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.0608246326446533,
"rewards/margins": 0.34200599789619446,
"rewards/rejected": -2.4028303623199463,
"step": 20
},
{
"epoch": 0.052521008403361345,
"grad_norm": 25.91201580448508,
"learning_rate": 1.5625e-07,
"logits/chosen": -2.89921236038208,
"logits/rejected": -2.838594913482666,
"logps/chosen": -0.9657201766967773,
"logps/rejected": -1.170414686203003,
"loss": 1.6399,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9314403533935547,
"rewards/margins": 0.40938907861709595,
"rewards/rejected": -2.340829372406006,
"step": 25
},
{
"epoch": 0.06302521008403361,
"grad_norm": 19.053951631856187,
"learning_rate": 1.875e-07,
"logits/chosen": -2.915055513381958,
"logits/rejected": -2.8307695388793945,
"logps/chosen": -1.031659722328186,
"logps/rejected": -1.2121422290802002,
"loss": 1.5382,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.063319444656372,
"rewards/margins": 0.3609650731086731,
"rewards/rejected": -2.4242844581604004,
"step": 30
},
{
"epoch": 0.07352941176470588,
"grad_norm": 22.225870405405676,
"learning_rate": 2.1874999999999997e-07,
"logits/chosen": -2.8420331478118896,
"logits/rejected": -2.8062918186187744,
"logps/chosen": -1.0356570482254028,
"logps/rejected": -1.2093064785003662,
"loss": 1.5637,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0713140964508057,
"rewards/margins": 0.34729865193367004,
"rewards/rejected": -2.4186129570007324,
"step": 35
},
{
"epoch": 0.08403361344537816,
"grad_norm": 25.66800900270909,
"learning_rate": 2.5e-07,
"logits/chosen": -2.845728635787964,
"logits/rejected": -2.8214545249938965,
"logps/chosen": -1.0431854724884033,
"logps/rejected": -1.3399583101272583,
"loss": 1.5204,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.0863709449768066,
"rewards/margins": 0.5935453176498413,
"rewards/rejected": -2.6799166202545166,
"step": 40
},
{
"epoch": 0.09453781512605042,
"grad_norm": 18.254417500947117,
"learning_rate": 2.8125e-07,
"logits/chosen": -2.8101553916931152,
"logits/rejected": -2.773531436920166,
"logps/chosen": -1.061798334121704,
"logps/rejected": -1.3759087324142456,
"loss": 1.501,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.123596668243408,
"rewards/margins": 0.6282207369804382,
"rewards/rejected": -2.751817464828491,
"step": 45
},
{
"epoch": 0.10504201680672269,
"grad_norm": 20.430861520566957,
"learning_rate": 2.999838368626891e-07,
"logits/chosen": -2.9204559326171875,
"logits/rejected": -2.878157615661621,
"logps/chosen": -1.0430495738983154,
"logps/rejected": -1.2767090797424316,
"loss": 1.5858,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.086099147796631,
"rewards/margins": 0.4673191010951996,
"rewards/rejected": -2.5534181594848633,
"step": 50
},
{
"epoch": 0.11554621848739496,
"grad_norm": 19.914448467924856,
"learning_rate": 2.9980204156901854e-07,
"logits/chosen": -2.7936322689056396,
"logits/rejected": -2.7450051307678223,
"logps/chosen": -1.1547470092773438,
"logps/rejected": -1.436762809753418,
"loss": 1.5254,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.3094940185546875,
"rewards/margins": 0.5640314817428589,
"rewards/rejected": -2.873525619506836,
"step": 55
},
{
"epoch": 0.12605042016806722,
"grad_norm": 27.25108493191,
"learning_rate": 2.994184927185504e-07,
"logits/chosen": -2.8165132999420166,
"logits/rejected": -2.765676736831665,
"logps/chosen": -1.178091287612915,
"logps/rejected": -1.3924609422683716,
"loss": 1.5556,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.35618257522583,
"rewards/margins": 0.428739458322525,
"rewards/rejected": -2.784921884536743,
"step": 60
},
{
"epoch": 0.13655462184873948,
"grad_norm": 25.118665709906168,
"learning_rate": 2.9883370687530456e-07,
"logits/chosen": -2.8244755268096924,
"logits/rejected": -2.7773241996765137,
"logps/chosen": -1.1520100831985474,
"logps/rejected": -1.447547197341919,
"loss": 1.451,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.3040201663970947,
"rewards/margins": 0.5910741090774536,
"rewards/rejected": -2.895094394683838,
"step": 65
},
{
"epoch": 0.14705882352941177,
"grad_norm": 29.16487182636346,
"learning_rate": 2.980484716295075e-07,
"logits/chosen": -2.787673234939575,
"logits/rejected": -2.726388692855835,
"logps/chosen": -1.0457687377929688,
"logps/rejected": -1.5030543804168701,
"loss": 1.4511,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0915374755859375,
"rewards/margins": 0.9145712852478027,
"rewards/rejected": -3.0061087608337402,
"step": 70
},
{
"epoch": 0.15756302521008403,
"grad_norm": 26.07757243320597,
"learning_rate": 2.970638445368648e-07,
"logits/chosen": -2.776176929473877,
"logits/rejected": -2.7326908111572266,
"logps/chosen": -1.0123913288116455,
"logps/rejected": -1.404775619506836,
"loss": 1.4303,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.024782657623291,
"rewards/margins": 0.78476881980896,
"rewards/rejected": -2.809551239013672,
"step": 75
},
{
"epoch": 0.16806722689075632,
"grad_norm": 35.195975635749924,
"learning_rate": 2.958811516942438e-07,
"logits/chosen": -2.767622470855713,
"logits/rejected": -2.7111330032348633,
"logps/chosen": -1.1310784816741943,
"logps/rejected": -1.712956428527832,
"loss": 1.3445,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.2621569633483887,
"rewards/margins": 1.1637558937072754,
"rewards/rejected": -3.425912857055664,
"step": 80
},
{
"epoch": 0.17857142857142858,
"grad_norm": 29.558096882428416,
"learning_rate": 2.9450198595368514e-07,
"logits/chosen": -2.7697668075561523,
"logits/rejected": -2.7279648780822754,
"logps/chosen": -1.150879979133606,
"logps/rejected": -1.5715720653533936,
"loss": 1.3627,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.301759958267212,
"rewards/margins": 0.8413840532302856,
"rewards/rejected": -3.143144130706787,
"step": 85
},
{
"epoch": 0.18907563025210083,
"grad_norm": 31.18138106236945,
"learning_rate": 2.929282047771477e-07,
"logits/chosen": -2.696549892425537,
"logits/rejected": -2.6848576068878174,
"logps/chosen": -1.1329095363616943,
"logps/rejected": -1.585242509841919,
"loss": 1.3747,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.2658190727233887,
"rewards/margins": 0.9046661257743835,
"rewards/rejected": -3.170485019683838,
"step": 90
},
{
"epoch": 0.19957983193277312,
"grad_norm": 91.23116963300726,
"learning_rate": 2.9116192773487665e-07,
"logits/chosen": -2.682312488555908,
"logits/rejected": -2.673649549484253,
"logps/chosen": -1.3071677684783936,
"logps/rejected": -1.7945388555526733,
"loss": 1.4405,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.614335536956787,
"rewards/margins": 0.9747417569160461,
"rewards/rejected": -3.5890777111053467,
"step": 95
},
{
"epoch": 0.21008403361344538,
"grad_norm": 38.910751298944,
"learning_rate": 2.892055336507641e-07,
"logits/chosen": -2.6822099685668945,
"logits/rejected": -2.6384642124176025,
"logps/chosen": -1.2206847667694092,
"logps/rejected": -1.8117921352386475,
"loss": 1.3468,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.4413695335388184,
"rewards/margins": 1.1822149753570557,
"rewards/rejected": -3.623584270477295,
"step": 100
},
{
"epoch": 0.22058823529411764,
"grad_norm": 27.439545713989038,
"learning_rate": 2.8706165739854637e-07,
"logits/chosen": -2.684013605117798,
"logits/rejected": -2.660853147506714,
"logps/chosen": -1.1910176277160645,
"logps/rejected": -1.6350994110107422,
"loss": 1.3852,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.382035255432129,
"rewards/margins": 0.8881640434265137,
"rewards/rejected": -3.2701988220214844,
"step": 105
},
{
"epoch": 0.23109243697478993,
"grad_norm": 29.807019016962876,
"learning_rate": 2.847331863531529e-07,
"logits/chosen": -2.6825053691864014,
"logits/rejected": -2.6679558753967285,
"logps/chosen": -1.1532232761383057,
"logps/rejected": -1.7548431158065796,
"loss": 1.2615,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.3064465522766113,
"rewards/margins": 1.203240156173706,
"rewards/rejected": -3.509686231613159,
"step": 110
},
{
"epoch": 0.2415966386554622,
"grad_norm": 47.6414807939217,
"learning_rate": 2.8222325650198677e-07,
"logits/chosen": -2.676471471786499,
"logits/rejected": -2.6575491428375244,
"logps/chosen": -1.2915210723876953,
"logps/rejected": -1.9804328680038452,
"loss": 1.3405,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5830421447753906,
"rewards/margins": 1.377823829650879,
"rewards/rejected": -3.9608657360076904,
"step": 115
},
{
"epoch": 0.25210084033613445,
"grad_norm": 33.68771160542956,
"learning_rate": 2.7953524822137317e-07,
"logits/chosen": -2.6282732486724854,
"logits/rejected": -2.6111860275268555,
"logps/chosen": -1.2532024383544922,
"logps/rejected": -2.1360292434692383,
"loss": 1.2154,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.5064048767089844,
"rewards/margins": 1.7656539678573608,
"rewards/rejected": -4.272058486938477,
"step": 120
},
{
"epoch": 0.26260504201680673,
"grad_norm": 36.94049761692212,
"learning_rate": 2.766727817238648e-07,
"logits/chosen": -2.625383138656616,
"logits/rejected": -2.5985493659973145,
"logps/chosen": -1.3159258365631104,
"logps/rejected": -1.8669437170028687,
"loss": 1.3794,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.6318516731262207,
"rewards/margins": 1.1020352840423584,
"rewards/rejected": -3.7338874340057373,
"step": 125
},
{
"epoch": 0.27310924369747897,
"grad_norm": 44.2795876444211,
"learning_rate": 2.7363971218253573e-07,
"logits/chosen": -2.585216760635376,
"logits/rejected": -2.5424036979675293,
"logps/chosen": -1.410796046257019,
"logps/rejected": -2.0416605472564697,
"loss": 1.3051,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.821592092514038,
"rewards/margins": 1.261729121208191,
"rewards/rejected": -4.0833210945129395,
"step": 130
},
{
"epoch": 0.28361344537815125,
"grad_norm": 41.62676495102148,
"learning_rate": 2.7044012453882974e-07,
"logits/chosen": -2.5913612842559814,
"logits/rejected": -2.554213047027588,
"logps/chosen": -1.5970208644866943,
"logps/rejected": -2.28006649017334,
"loss": 1.2034,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.1940417289733887,
"rewards/margins": 1.3660913705825806,
"rewards/rejected": -4.56013298034668,
"step": 135
},
{
"epoch": 0.29411764705882354,
"grad_norm": 36.45682514602446,
"learning_rate": 2.670783280009569e-07,
"logits/chosen": -2.583467960357666,
"logits/rejected": -2.563615083694458,
"logps/chosen": -1.3852840662002563,
"logps/rejected": -1.976252794265747,
"loss": 1.2209,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.7705681324005127,
"rewards/margins": 1.1819374561309814,
"rewards/rejected": -3.952505588531494,
"step": 140
},
{
"epoch": 0.30462184873949577,
"grad_norm": 32.90514134094626,
"learning_rate": 2.635588502402468e-07,
"logits/chosen": -2.6025681495666504,
"logits/rejected": -2.5791728496551514,
"logps/chosen": -1.444962978363037,
"logps/rejected": -2.082648515701294,
"loss": 1.2251,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.889925956726074,
"rewards/margins": 1.2753708362579346,
"rewards/rejected": -4.165297031402588,
"step": 145
},
{
"epoch": 0.31512605042016806,
"grad_norm": 46.925189207028446,
"learning_rate": 2.598864312932762e-07,
"logits/chosen": -2.5708370208740234,
"logits/rejected": -2.5425729751586914,
"logps/chosen": -1.558255910873413,
"logps/rejected": -2.360576629638672,
"loss": 1.2404,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.116511821746826,
"rewards/margins": 1.6046416759490967,
"rewards/rejected": -4.721153259277344,
"step": 150
},
{
"epoch": 0.32563025210084034,
"grad_norm": 44.68173396497493,
"learning_rate": 2.560660171779821e-07,
"logits/chosen": -2.5237948894500732,
"logits/rejected": -2.5131349563598633,
"logps/chosen": -1.7005817890167236,
"logps/rejected": -2.477543592453003,
"loss": 1.2383,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.4011635780334473,
"rewards/margins": 1.5539240837097168,
"rewards/rejected": -4.955087184906006,
"step": 155
},
{
"epoch": 0.33613445378151263,
"grad_norm": 42.56897964236879,
"learning_rate": 2.521027532323594e-07,
"logits/chosen": -2.50708270072937,
"logits/rejected": -2.4973719120025635,
"logps/chosen": -1.5736862421035767,
"logps/rejected": -2.4314279556274414,
"loss": 1.2177,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.1473724842071533,
"rewards/margins": 1.7154836654663086,
"rewards/rejected": -4.862855911254883,
"step": 160
},
{
"epoch": 0.34663865546218486,
"grad_norm": 42.67514136639567,
"learning_rate": 2.480019771847139e-07,
"logits/chosen": -2.4965438842773438,
"logits/rejected": -2.5141289234161377,
"logps/chosen": -1.6085281372070312,
"logps/rejected": -2.5046117305755615,
"loss": 1.1715,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.2170562744140625,
"rewards/margins": 1.79216730594635,
"rewards/rejected": -5.009223461151123,
"step": 165
},
{
"epoch": 0.35714285714285715,
"grad_norm": 56.3843788509327,
"learning_rate": 2.4376921196480405e-07,
"logits/chosen": -2.4241461753845215,
"logits/rejected": -2.4171204566955566,
"logps/chosen": -1.8740981817245483,
"logps/rejected": -2.842223644256592,
"loss": 1.1553,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.7481963634490967,
"rewards/margins": 1.9362504482269287,
"rewards/rejected": -5.684447288513184,
"step": 170
},
{
"epoch": 0.36764705882352944,
"grad_norm": 58.35243830598972,
"learning_rate": 2.3941015826555265e-07,
"logits/chosen": -2.433060646057129,
"logits/rejected": -2.4348819255828857,
"logps/chosen": -2.003147840499878,
"logps/rejected": -2.907435894012451,
"loss": 1.2262,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.006295680999756,
"rewards/margins": 1.808576226234436,
"rewards/rejected": -5.814871788024902,
"step": 175
},
{
"epoch": 0.37815126050420167,
"grad_norm": 62.00858329659252,
"learning_rate": 2.3493068686534757e-07,
"logits/chosen": -2.4191861152648926,
"logits/rejected": -2.4209141731262207,
"logps/chosen": -2.0410985946655273,
"logps/rejected": -3.1209053993225098,
"loss": 1.2189,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -4.082197189331055,
"rewards/margins": 2.159613847732544,
"rewards/rejected": -6.2418107986450195,
"step": 180
},
{
"epoch": 0.38865546218487396,
"grad_norm": 52.62029016306216,
"learning_rate": 2.3033683072127066e-07,
"logits/chosen": -2.4004642963409424,
"logits/rejected": -2.3723645210266113,
"logps/chosen": -1.9122893810272217,
"logps/rejected": -3.104297161102295,
"loss": 1.1119,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.8245787620544434,
"rewards/margins": 2.38401460647583,
"rewards/rejected": -6.20859432220459,
"step": 185
},
{
"epoch": 0.39915966386554624,
"grad_norm": 49.57165162916381,
"learning_rate": 2.2563477684390454e-07,
"logits/chosen": -2.394556999206543,
"logits/rejected": -2.4077131748199463,
"logps/chosen": -1.9445598125457764,
"logps/rejected": -3.2773900032043457,
"loss": 1.0746,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.8891196250915527,
"rewards/margins": 2.6656596660614014,
"rewards/rejected": -6.554780006408691,
"step": 190
},
{
"epoch": 0.4096638655462185,
"grad_norm": 42.22482180826213,
"learning_rate": 2.2083085796465976e-07,
"logits/chosen": -2.3444042205810547,
"logits/rejected": -2.3371148109436035,
"logps/chosen": -2.0608248710632324,
"logps/rejected": -2.9502105712890625,
"loss": 1.1684,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -4.121649742126465,
"rewards/margins": 1.7787716388702393,
"rewards/rejected": -5.900421142578125,
"step": 195
},
{
"epoch": 0.42016806722689076,
"grad_norm": 62.069592428442725,
"learning_rate": 2.1593154400684523e-07,
"logits/chosen": -2.3920085430145264,
"logits/rejected": -2.3790066242218018,
"logps/chosen": -2.172396183013916,
"logps/rejected": -3.3875110149383545,
"loss": 1.1134,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -4.344792366027832,
"rewards/margins": 2.430229663848877,
"rewards/rejected": -6.775022029876709,
"step": 200
},
{
"epoch": 0.43067226890756305,
"grad_norm": 63.80548454611886,
"learning_rate": 2.1094343337196797e-07,
"logits/chosen": -2.2799956798553467,
"logits/rejected": -2.3044838905334473,
"logps/chosen": -2.1241445541381836,
"logps/rejected": -3.2871341705322266,
"loss": 1.074,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -4.248289108276367,
"rewards/margins": 2.325979471206665,
"rewards/rejected": -6.574268341064453,
"step": 205
},
{
"epoch": 0.4411764705882353,
"grad_norm": 60.76644197865358,
"learning_rate": 2.058732440529989e-07,
"logits/chosen": -2.369267225265503,
"logits/rejected": -2.3428282737731934,
"logps/chosen": -2.2345564365386963,
"logps/rejected": -3.428501844406128,
"loss": 1.0777,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.469112873077393,
"rewards/margins": 2.3878910541534424,
"rewards/rejected": -6.857003688812256,
"step": 210
},
{
"epoch": 0.45168067226890757,
"grad_norm": 49.5591416904311,
"learning_rate": 2.0072780458657222e-07,
"logits/chosen": -2.3571441173553467,
"logits/rejected": -2.3563666343688965,
"logps/chosen": -2.1674928665161133,
"logps/rejected": -3.2230000495910645,
"loss": 1.0862,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.334985733032227,
"rewards/margins": 2.1110141277313232,
"rewards/rejected": -6.446000099182129,
"step": 215
},
{
"epoch": 0.46218487394957986,
"grad_norm": 53.25790647881489,
"learning_rate": 1.9551404485630487e-07,
"logits/chosen": -2.3252339363098145,
"logits/rejected": -2.3368701934814453,
"logps/chosen": -2.3293991088867188,
"logps/rejected": -3.515172243118286,
"loss": 1.113,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.6587982177734375,
"rewards/margins": 2.371546506881714,
"rewards/rejected": -7.030344486236572,
"step": 220
},
{
"epoch": 0.4726890756302521,
"grad_norm": 107.94133477979558,
"learning_rate": 1.9023898675962123e-07,
"logits/chosen": -2.2349350452423096,
"logits/rejected": -2.270430088043213,
"logps/chosen": -2.319396495819092,
"logps/rejected": -3.6063385009765625,
"loss": 1.0598,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.638792991638184,
"rewards/margins": 2.573883533477783,
"rewards/rejected": -7.212677001953125,
"step": 225
},
{
"epoch": 0.4831932773109244,
"grad_norm": 51.80093777317445,
"learning_rate": 1.8490973475065407e-07,
"logits/chosen": -2.2946877479553223,
"logits/rejected": -2.2905642986297607,
"logps/chosen": -2.3950748443603516,
"logps/rejected": -3.634678602218628,
"loss": 1.0982,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.790149688720703,
"rewards/margins": 2.4792075157165527,
"rewards/rejected": -7.269357204437256,
"step": 230
},
{
"epoch": 0.49369747899159666,
"grad_norm": 72.76258850252798,
"learning_rate": 1.795334662719576e-07,
"logits/chosen": -2.278480052947998,
"logits/rejected": -2.299923896789551,
"logps/chosen": -2.357292652130127,
"logps/rejected": -3.7696902751922607,
"loss": 1.0057,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -4.714585304260254,
"rewards/margins": 2.8247950077056885,
"rewards/rejected": -7.5393805503845215,
"step": 235
},
{
"epoch": 0.5042016806722689,
"grad_norm": 64.28632501194514,
"learning_rate": 1.7411742208792024e-07,
"logits/chosen": -2.2843871116638184,
"logits/rejected": -2.300901412963867,
"logps/chosen": -2.508634090423584,
"logps/rejected": -3.8370189666748047,
"loss": 1.033,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -5.017268180847168,
"rewards/margins": 2.6567699909210205,
"rewards/rejected": -7.674037933349609,
"step": 240
},
{
"epoch": 0.5147058823529411,
"grad_norm": 56.78201656922531,
"learning_rate": 1.686688965328944e-07,
"logits/chosen": -2.2179243564605713,
"logits/rejected": -2.2388010025024414,
"logps/chosen": -2.3462517261505127,
"logps/rejected": -3.506201982498169,
"loss": 0.9703,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.692503452301025,
"rewards/margins": 2.3199009895324707,
"rewards/rejected": -7.012403964996338,
"step": 245
},
{
"epoch": 0.5252100840336135,
"grad_norm": 66.31368878059381,
"learning_rate": 1.6319522768717944e-07,
"logits/chosen": -2.254875421524048,
"logits/rejected": -2.2779059410095215,
"logps/chosen": -2.398496150970459,
"logps/rejected": -3.7779440879821777,
"loss": 1.0355,
"rewards/accuracies": 0.8125,
"rewards/chosen": -4.796992301940918,
"rewards/margins": 2.758897542953491,
"rewards/rejected": -7.5558881759643555,
"step": 250
},
{
"epoch": 0.5357142857142857,
"grad_norm": 56.3335721813079,
"learning_rate": 1.5770378749408654e-07,
"logits/chosen": -2.2989799976348877,
"logits/rejected": -2.2941720485687256,
"logps/chosen": -2.581568479537964,
"logps/rejected": -3.853482723236084,
"loss": 1.0114,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -5.163136959075928,
"rewards/margins": 2.5438289642333984,
"rewards/rejected": -7.706965446472168,
"step": 255
},
{
"epoch": 0.5462184873949579,
"grad_norm": 64.04241236117856,
"learning_rate": 1.522019718313975e-07,
"logits/chosen": -2.2507102489471436,
"logits/rejected": -2.272916316986084,
"logps/chosen": -2.6012022495269775,
"logps/rejected": -4.0311384201049805,
"loss": 0.992,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.202404499053955,
"rewards/margins": 2.859873056411743,
"rewards/rejected": -8.062276840209961,
"step": 260
},
{
"epoch": 0.5567226890756303,
"grad_norm": 59.88114738443522,
"learning_rate": 1.4669719055058805e-07,
"logits/chosen": -2.2266743183135986,
"logits/rejected": -2.2351810932159424,
"logps/chosen": -2.7907989025115967,
"logps/rejected": -3.9706473350524902,
"loss": 1.0608,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -5.581597805023193,
"rewards/margins": 2.35969614982605,
"rewards/rejected": -7.9412946701049805,
"step": 265
},
{
"epoch": 0.5672268907563025,
"grad_norm": 63.37030995368488,
"learning_rate": 1.411968574972317e-07,
"logits/chosen": -2.230888843536377,
"logits/rejected": -2.2535951137542725,
"logps/chosen": -2.7027249336242676,
"logps/rejected": -4.1824774742126465,
"loss": 0.8988,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -5.405449867248535,
"rewards/margins": 2.9595046043395996,
"rewards/rejected": -8.364954948425293,
"step": 270
},
{
"epoch": 0.5777310924369747,
"grad_norm": 69.41737055216304,
"learning_rate": 1.357083805260243e-07,
"logits/chosen": -2.2285051345825195,
"logits/rejected": -2.2328968048095703,
"logps/chosen": -2.7089076042175293,
"logps/rejected": -3.9290478229522705,
"loss": 0.969,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -5.417815208435059,
"rewards/margins": 2.440279483795166,
"rewards/rejected": -7.858095645904541,
"step": 275
},
{
"epoch": 0.5882352941176471,
"grad_norm": 63.48615863862009,
"learning_rate": 1.302391515238772e-07,
"logits/chosen": -2.2015397548675537,
"logits/rejected": -2.2215192317962646,
"logps/chosen": -2.722857713699341,
"logps/rejected": -4.155056953430176,
"loss": 0.9593,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -5.445715427398682,
"rewards/margins": 2.86439847946167,
"rewards/rejected": -8.310113906860352,
"step": 280
},
{
"epoch": 0.5987394957983193,
"grad_norm": 87.6726372411929,
"learning_rate": 1.247965364545152e-07,
"logits/chosen": -2.1690385341644287,
"logits/rejected": -2.1941065788269043,
"logps/chosen": -2.697335720062256,
"logps/rejected": -4.129209995269775,
"loss": 1.0182,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -5.394671440124512,
"rewards/margins": 2.8637471199035645,
"rewards/rejected": -8.25841999053955,
"step": 285
},
{
"epoch": 0.6092436974789915,
"grad_norm": 54.49746884782157,
"learning_rate": 1.193878654379889e-07,
"logits/chosen": -2.1245057582855225,
"logits/rejected": -2.1610589027404785,
"logps/chosen": -2.6949501037597656,
"logps/rejected": -4.0747246742248535,
"loss": 1.0182,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -5.389900207519531,
"rewards/margins": 2.759549617767334,
"rewards/rejected": -8.149449348449707,
"step": 290
},
{
"epoch": 0.6197478991596639,
"grad_norm": 49.136356343546524,
"learning_rate": 1.1402042287846068e-07,
"logits/chosen": -2.1676132678985596,
"logits/rejected": -2.1930439472198486,
"logps/chosen": -2.85373592376709,
"logps/rejected": -4.212955951690674,
"loss": 1.0398,
"rewards/accuracies": 0.78125,
"rewards/chosen": -5.70747184753418,
"rewards/margins": 2.7184391021728516,
"rewards/rejected": -8.425911903381348,
"step": 295
},
{
"epoch": 0.6302521008403361,
"grad_norm": 56.2186810691314,
"learning_rate": 1.0870143765356105e-07,
"logits/chosen": -2.1709885597229004,
"logits/rejected": -2.1842150688171387,
"logps/chosen": -2.9935240745544434,
"logps/rejected": -4.36973762512207,
"loss": 1.0064,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -5.987048149108887,
"rewards/margins": 2.7524266242980957,
"rewards/rejected": -8.73947525024414,
"step": 300
},
{
"epoch": 0.6407563025210085,
"grad_norm": 74.55055606717697,
"learning_rate": 1.0343807337852794e-07,
"logits/chosen": -2.1351749897003174,
"logits/rejected": -2.1373703479766846,
"logps/chosen": -2.965303897857666,
"logps/rejected": -4.419961929321289,
"loss": 1.0268,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -5.930607795715332,
"rewards/margins": 2.9093151092529297,
"rewards/rejected": -8.839923858642578,
"step": 305
},
{
"epoch": 0.6512605042016807,
"grad_norm": 53.97579171817796,
"learning_rate": 9.82374187582421e-08,
"logits/chosen": -2.1092991828918457,
"logits/rejected": -2.133781909942627,
"logps/chosen": -2.9700093269348145,
"logps/rejected": -4.346618175506592,
"loss": 0.9648,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -5.940018653869629,
"rewards/margins": 2.753218650817871,
"rewards/rejected": -8.693236351013184,
"step": 310
},
{
"epoch": 0.6617647058823529,
"grad_norm": 66.28146153490614,
"learning_rate": 9.310647804015124e-08,
"logits/chosen": -2.133643627166748,
"logits/rejected": -2.160266637802124,
"logps/chosen": -2.9957821369171143,
"logps/rejected": -4.556756973266602,
"loss": 0.937,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -5.9915642738342285,
"rewards/margins": 3.1219494342803955,
"rewards/rejected": -9.113513946533203,
"step": 315
},
{
"epoch": 0.6722689075630253,
"grad_norm": 49.303213418937055,
"learning_rate": 8.805216158094177e-08,
"logits/chosen": -2.076920986175537,
"logits/rejected": -2.103963851928711,
"logps/chosen": -2.907010555267334,
"logps/rejected": -4.666647911071777,
"loss": 0.9387,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.814021110534668,
"rewards/margins": 3.5192761421203613,
"rewards/rejected": -9.333295822143555,
"step": 320
},
{
"epoch": 0.6827731092436975,
"grad_norm": 67.32319494946066,
"learning_rate": 8.308127653966262e-08,
"logits/chosen": -2.0415196418762207,
"logits/rejected": -2.0577666759490967,
"logps/chosen": -3.1487503051757812,
"logps/rejected": -4.704668045043945,
"loss": 0.9346,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -6.2975006103515625,
"rewards/margins": 3.111835241317749,
"rewards/rejected": -9.40933609008789,
"step": 325
},
{
"epoch": 0.6932773109243697,
"grad_norm": 60.93426199203996,
"learning_rate": 7.820051770983612e-08,
"logits/chosen": -2.0549426078796387,
"logits/rejected": -2.080475330352783,
"logps/chosen": -3.1458420753479004,
"logps/rejected": -4.8635969161987305,
"loss": 0.966,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -6.291684150695801,
"rewards/margins": 3.4355111122131348,
"rewards/rejected": -9.727193832397461,
"step": 330
},
{
"epoch": 0.7037815126050421,
"grad_norm": 72.28419657503075,
"learning_rate": 7.341645850290216e-08,
"logits/chosen": -2.1288955211639404,
"logits/rejected": -2.1594443321228027,
"logps/chosen": -3.1346468925476074,
"logps/rejected": -4.768304347991943,
"loss": 1.019,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -6.269293785095215,
"rewards/margins": 3.267315626144409,
"rewards/rejected": -9.536608695983887,
"step": 335
},
{
"epoch": 0.7142857142857143,
"grad_norm": 60.72644174180833,
"learning_rate": 6.873554209514085e-08,
"logits/chosen": -2.0705599784851074,
"logits/rejected": -2.0726349353790283,
"logps/chosen": -2.935683488845825,
"logps/rejected": -4.3867692947387695,
"loss": 0.9702,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -5.87136697769165,
"rewards/margins": 2.9021708965301514,
"rewards/rejected": -8.773538589477539,
"step": 340
},
{
"epoch": 0.7247899159663865,
"grad_norm": 52.6099555735741,
"learning_rate": 6.416407274999497e-08,
"logits/chosen": -2.113405227661133,
"logits/rejected": -2.1457953453063965,
"logps/chosen": -3.0049102306365967,
"logps/rejected": -4.615386962890625,
"loss": 0.9687,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -6.009820461273193,
"rewards/margins": 3.2209534645080566,
"rewards/rejected": -9.23077392578125,
"step": 345
},
{
"epoch": 0.7352941176470589,
"grad_norm": 69.6143506053754,
"learning_rate": 5.970820732748143e-08,
"logits/chosen": -2.145555257797241,
"logits/rejected": -2.155163288116455,
"logps/chosen": -2.938427209854126,
"logps/rejected": -4.6191511154174805,
"loss": 0.878,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -5.876854419708252,
"rewards/margins": 3.3614463806152344,
"rewards/rejected": -9.238302230834961,
"step": 350
},
{
"epoch": 0.7457983193277311,
"grad_norm": 70.71307640111154,
"learning_rate": 5.537394699212498e-08,
"logits/chosen": -2.1382346153259277,
"logits/rejected": -2.163740634918213,
"logps/chosen": -2.980686664581299,
"logps/rejected": -4.480741500854492,
"loss": 0.9898,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -5.961373329162598,
"rewards/margins": 3.0001087188720703,
"rewards/rejected": -8.961483001708984,
"step": 355
},
{
"epoch": 0.7563025210084033,
"grad_norm": 73.19945321147338,
"learning_rate": 5.1167129130583346e-08,
"logits/chosen": -2.109528064727783,
"logits/rejected": -2.1514618396759033,
"logps/chosen": -2.996703624725342,
"logps/rejected": -4.683353900909424,
"loss": 1.0311,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -5.993407249450684,
"rewards/margins": 3.373300075531006,
"rewards/rejected": -9.366707801818848,
"step": 360
},
{
"epoch": 0.7668067226890757,
"grad_norm": 70.68128938841156,
"learning_rate": 4.709341948984809e-08,
"logits/chosen": -2.0933072566986084,
"logits/rejected": -2.1408255100250244,
"logps/chosen": -2.9475154876708984,
"logps/rejected": -4.628712177276611,
"loss": 1.0051,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -5.895030975341797,
"rewards/margins": 3.3623931407928467,
"rewards/rejected": -9.257424354553223,
"step": 365
},
{
"epoch": 0.7773109243697479,
"grad_norm": 64.71452548748283,
"learning_rate": 4.315830454661059e-08,
"logits/chosen": -2.086402654647827,
"logits/rejected": -2.1012749671936035,
"logps/chosen": -2.9121134281158447,
"logps/rejected": -4.349917888641357,
"loss": 0.9727,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -5.8242268562316895,
"rewards/margins": 2.8756089210510254,
"rewards/rejected": -8.699835777282715,
"step": 370
},
{
"epoch": 0.7878151260504201,
"grad_norm": 71.60834624596436,
"learning_rate": 3.936708411806887e-08,
"logits/chosen": -2.124846935272217,
"logits/rejected": -2.1803550720214844,
"logps/chosen": -2.9349002838134766,
"logps/rejected": -4.718347549438477,
"loss": 0.9764,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -5.869800567626953,
"rewards/margins": 3.566895008087158,
"rewards/rejected": -9.436695098876953,
"step": 375
},
{
"epoch": 0.7983193277310925,
"grad_norm": 55.835007766843376,
"learning_rate": 3.572486422412786e-08,
"logits/chosen": -2.104611873626709,
"logits/rejected": -2.1398825645446777,
"logps/chosen": -2.874159336090088,
"logps/rejected": -4.522528648376465,
"loss": 0.9513,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -5.748318672180176,
"rewards/margins": 3.296739101409912,
"rewards/rejected": -9.04505729675293,
"step": 380
},
{
"epoch": 0.8088235294117647,
"grad_norm": 54.54718274731096,
"learning_rate": 3.2236550210606293e-08,
"logits/chosen": -2.13325834274292,
"logits/rejected": -2.1514346599578857,
"logps/chosen": -2.728529691696167,
"logps/rejected": -4.492846488952637,
"loss": 0.9402,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -5.457059383392334,
"rewards/margins": 3.5286338329315186,
"rewards/rejected": -8.985692977905273,
"step": 385
},
{
"epoch": 0.819327731092437,
"grad_norm": 64.73590798684994,
"learning_rate": 2.8906840142711338e-08,
"logits/chosen": -2.0870397090911865,
"logits/rejected": -2.1221370697021484,
"logps/chosen": -2.9295685291290283,
"logps/rejected": -4.712892055511475,
"loss": 0.9203,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -5.859137058258057,
"rewards/margins": 3.5666465759277344,
"rewards/rejected": -9.42578411102295,
"step": 390
},
{
"epoch": 0.8298319327731093,
"grad_norm": 56.24812000405815,
"learning_rate": 2.5740218477679143e-08,
"logits/chosen": -2.076784610748291,
"logits/rejected": -2.0827224254608154,
"logps/chosen": -2.910884141921997,
"logps/rejected": -4.398539066314697,
"loss": 0.8926,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -5.821768283843994,
"rewards/margins": 2.975309371948242,
"rewards/rejected": -8.797078132629395,
"step": 395
},
{
"epoch": 0.8403361344537815,
"grad_norm": 65.02327391971039,
"learning_rate": 2.2740950025102763e-08,
"logits/chosen": -2.0536999702453613,
"logits/rejected": -2.058232545852661,
"logps/chosen": -3.009183883666992,
"logps/rejected": -4.569349765777588,
"loss": 0.9758,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -6.018367767333984,
"rewards/margins": 3.1203320026397705,
"rewards/rejected": -9.138699531555176,
"step": 400
},
{
"epoch": 0.8508403361344538,
"grad_norm": 71.60344245483444,
"learning_rate": 1.9913074203082053e-08,
"logits/chosen": -2.0714104175567627,
"logits/rejected": -2.0895228385925293,
"logps/chosen": -3.0680434703826904,
"logps/rejected": -4.809669494628906,
"loss": 1.002,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -6.136086940765381,
"rewards/margins": 3.483250856399536,
"rewards/rejected": -9.619338989257812,
"step": 405
},
{
"epoch": 0.8613445378151261,
"grad_norm": 65.02582256297173,
"learning_rate": 1.726039959793059e-08,
"logits/chosen": -2.0531625747680664,
"logits/rejected": -2.0893194675445557,
"logps/chosen": -3.2407803535461426,
"logps/rejected": -4.729245185852051,
"loss": 0.9391,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -6.481560707092285,
"rewards/margins": 2.9769301414489746,
"rewards/rejected": -9.458490371704102,
"step": 410
},
{
"epoch": 0.8718487394957983,
"grad_norm": 66.60722999226081,
"learning_rate": 1.4786498834767618e-08,
"logits/chosen": -1.971679449081421,
"logits/rejected": -2.0226242542266846,
"logps/chosen": -2.956986427307129,
"logps/rejected": -4.357911109924316,
"loss": 0.9793,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -5.913972854614258,
"rewards/margins": 2.8018486499786377,
"rewards/rejected": -8.715822219848633,
"step": 415
},
{
"epoch": 0.8823529411764706,
"grad_norm": 67.46172075980118,
"learning_rate": 1.2494703765902337e-08,
"logits/chosen": -2.0839121341705322,
"logits/rejected": -2.104898452758789,
"logps/chosen": -3.1962718963623047,
"logps/rejected": -4.687077522277832,
"loss": 0.9073,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -6.392543792724609,
"rewards/margins": 2.9816107749938965,
"rewards/rejected": -9.374155044555664,
"step": 420
},
{
"epoch": 0.8928571428571429,
"grad_norm": 80.87130272740922,
"learning_rate": 1.0388100983491676e-08,
"logits/chosen": -2.0597221851348877,
"logits/rejected": -2.0896944999694824,
"logps/chosen": -3.026052236557007,
"logps/rejected": -4.573755741119385,
"loss": 0.9555,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -6.052104473114014,
"rewards/margins": 3.0954062938690186,
"rewards/rejected": -9.14751148223877,
"step": 425
},
{
"epoch": 0.9033613445378151,
"grad_norm": 70.56768229498226,
"learning_rate": 8.469527662514425e-09,
"logits/chosen": -2.0741794109344482,
"logits/rejected": -2.097032070159912,
"logps/chosen": -3.0541605949401855,
"logps/rejected": -4.719814777374268,
"loss": 1.0143,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -6.108321189880371,
"rewards/margins": 3.331307888031006,
"rewards/rejected": -9.439629554748535,
"step": 430
},
{
"epoch": 0.9138655462184874,
"grad_norm": 72.71200868786163,
"learning_rate": 6.7415677396608474e-09,
"logits/chosen": -2.0740599632263184,
"logits/rejected": -2.0966227054595947,
"logps/chosen": -3.1755881309509277,
"logps/rejected": -5.003739356994629,
"loss": 0.9747,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -6.3511762619018555,
"rewards/margins": 3.6563029289245605,
"rewards/rejected": -10.007478713989258,
"step": 435
},
{
"epoch": 0.9243697478991597,
"grad_norm": 68.28482752709235,
"learning_rate": 5.206548433283803e-09,
"logits/chosen": -2.015186071395874,
"logits/rejected": -2.100969076156616,
"logps/chosen": -3.135103464126587,
"logps/rejected": -4.680062294006348,
"loss": 0.9059,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -6.270206928253174,
"rewards/margins": 3.0899174213409424,
"rewards/rejected": -9.360124588012695,
"step": 440
},
{
"epoch": 0.9348739495798319,
"grad_norm": 53.32723170520827,
"learning_rate": 3.866537109098561e-09,
"logits/chosen": -2.0853240489959717,
"logits/rejected": -2.0845720767974854,
"logps/chosen": -2.9771265983581543,
"logps/rejected": -4.7920613288879395,
"loss": 0.9242,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -5.954253196716309,
"rewards/margins": 3.6298699378967285,
"rewards/rejected": -9.584122657775879,
"step": 445
},
{
"epoch": 0.9453781512605042,
"grad_norm": 77.58999305035255,
"learning_rate": 2.7233384958522676e-09,
"logits/chosen": -2.0929324626922607,
"logits/rejected": -2.088423490524292,
"logps/chosen": -3.0112125873565674,
"logps/rejected": -4.747193336486816,
"loss": 0.859,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -6.022425174713135,
"rewards/margins": 3.471961498260498,
"rewards/rejected": -9.494386672973633,
"step": 450
},
{
"epoch": 0.9558823529411765,
"grad_norm": 69.00371191627924,
"learning_rate": 1.7784922547133318e-09,
"logits/chosen": -2.03417706489563,
"logits/rejected": -2.0785162448883057,
"logps/chosen": -3.0350539684295654,
"logps/rejected": -4.6372761726379395,
"loss": 1.0211,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -6.070107936859131,
"rewards/margins": 3.204444408416748,
"rewards/rejected": -9.274552345275879,
"step": 455
},
{
"epoch": 0.9663865546218487,
"grad_norm": 80.70006340013546,
"learning_rate": 1.033270905653949e-09,
"logits/chosen": -2.077859878540039,
"logits/rejected": -2.1275644302368164,
"logps/chosen": -3.1961588859558105,
"logps/rejected": -5.026784420013428,
"loss": 0.9054,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -6.392317771911621,
"rewards/margins": 3.6612517833709717,
"rewards/rejected": -10.053568840026855,
"step": 460
},
{
"epoch": 0.976890756302521,
"grad_norm": 65.94555657144473,
"learning_rate": 4.8867811361889e-10,
"logits/chosen": -2.0415802001953125,
"logits/rejected": -2.073897123336792,
"logps/chosen": -3.136763572692871,
"logps/rejected": -4.838761329650879,
"loss": 0.9205,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -6.273527145385742,
"rewards/margins": 3.40399432182312,
"rewards/rejected": -9.677522659301758,
"step": 465
},
{
"epoch": 0.9873949579831933,
"grad_norm": 72.55584643358395,
"learning_rate": 1.454473367883291e-10,
"logits/chosen": -2.0744833946228027,
"logits/rejected": -2.1010680198669434,
"logps/chosen": -3.007612943649292,
"logps/rejected": -4.534255027770996,
"loss": 0.8893,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -6.015225887298584,
"rewards/margins": 3.0532851219177246,
"rewards/rejected": -9.068510055541992,
"step": 470
},
{
"epoch": 0.9978991596638656,
"grad_norm": 71.68265122537953,
"learning_rate": 4.040838755653419e-12,
"logits/chosen": -2.0488152503967285,
"logits/rejected": -2.0957658290863037,
"logps/chosen": -2.9260973930358887,
"logps/rejected": -4.68855619430542,
"loss": 0.9609,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -5.852194786071777,
"rewards/margins": 3.5249176025390625,
"rewards/rejected": -9.37711238861084,
"step": 475
},
{
"epoch": 1.0,
"step": 476,
"total_flos": 0.0,
"train_loss": 1.1419020675811447,
"train_runtime": 10201.3152,
"train_samples_per_second": 5.971,
"train_steps_per_second": 0.047
}
],
"logging_steps": 5,
"max_steps": 476,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}