diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5070 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.982222222222222, + "eval_steps": 1, + "global_step": 336, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.011851851851851851, + "grad_norm": 62.70548815519655, + "learning_rate": 1.4705882352941176e-08, + "logits/chosen": 0.030916133895516396, + "logits/rejected": 0.09742362797260284, + "logps/chosen": -40.58351516723633, + "logps/rejected": -58.42578887939453, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 67.3907670519946, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": 0.15014928579330444, + "logits/rejected": 0.2673640847206116, + "logps/chosen": -31.35921859741211, + "logps/rejected": -54.71299743652344, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 73.59075381908265, + "learning_rate": 4.411764705882353e-08, + "logits/chosen": 0.35403603315353394, + "logits/rejected": 0.3630790412425995, + "logps/chosen": -30.862504959106445, + "logps/rejected": -43.55963897705078, + "loss": 0.6991, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023167992010712624, + "rewards/margins": 0.016683291643857956, + "rewards/rejected": 0.00648469990119338, + "step": 3 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 58.78689431688176, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": 0.3042946457862854, + "logits/rejected": 0.25474676489830017, + "logps/chosen": -34.22315979003906, + "logps/rejected": -39.93827438354492, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021718814969062805, + "rewards/margins": -0.024322079494595528, + "rewards/rejected": 0.0026032691821455956, + "step": 4 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 59.402728518681464, + "learning_rate": 7.352941176470588e-08, + "logits/chosen": 0.20607078075408936, + "logits/rejected": 0.27008742094039917, + "logps/chosen": -40.86919403076172, + "logps/rejected": -51.17314910888672, + "loss": 0.6937, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02542915567755699, + "rewards/margins": 0.01238556019961834, + "rewards/rejected": 0.013043595477938652, + "step": 5 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 60.3208791733097, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": 0.39524325728416443, + "logits/rejected": 0.3147166669368744, + "logps/chosen": -45.889522552490234, + "logps/rejected": -47.271080017089844, + "loss": 0.6967, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.035939525812864304, + "rewards/margins": 0.035303808748722076, + "rewards/rejected": 0.0006357184611260891, + "step": 6 + }, + { + "epoch": 0.08296296296296296, + "grad_norm": 65.84343450368385, + "learning_rate": 1.0294117647058822e-07, + "logits/chosen": 0.19424982368946075, + "logits/rejected": 0.36947980523109436, + "logps/chosen": -32.91363525390625, + "logps/rejected": -43.79743194580078, + "loss": 0.7058, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012993477284908295, + "rewards/margins": -0.015153911896049976, + "rewards/rejected": 0.028147388249635696, + "step": 7 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 68.52239761198476, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": 0.1946137249469757, + "logits/rejected": 0.28064286708831787, + "logps/chosen": -32.246864318847656, + "logps/rejected": -41.628746032714844, + "loss": 0.6735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010772847570478916, + "rewards/margins": 0.011450938880443573, + "rewards/rejected": -0.022223783656954765, + "step": 8 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 77.93025419789252, + "learning_rate": 1.3235294117647057e-07, + "logits/chosen": 0.32008448243141174, + "logits/rejected": 0.21636219322681427, + "logps/chosen": -40.00132369995117, + "logps/rejected": -44.613426208496094, + "loss": 0.6976, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.008494901470839977, + "rewards/margins": 0.011597584001719952, + "rewards/rejected": -0.003102683462202549, + "step": 9 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 60.27023294648464, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": 0.011551467701792717, + "logits/rejected": 0.1401338428258896, + "logps/chosen": -35.68666076660156, + "logps/rejected": -47.44255065917969, + "loss": 0.6817, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02254348061978817, + "rewards/margins": 0.029230808839201927, + "rewards/rejected": -0.0517742857336998, + "step": 10 + }, + { + "epoch": 0.13037037037037036, + "grad_norm": 70.77005341416117, + "learning_rate": 1.6176470588235293e-07, + "logits/chosen": 0.07894501090049744, + "logits/rejected": 0.09966235607862473, + "logps/chosen": -30.685501098632812, + "logps/rejected": -42.800785064697266, + "loss": 0.6773, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.006754852831363678, + "rewards/margins": 0.031156515702605247, + "rewards/rejected": -0.037911366671323776, + "step": 11 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 58.87977165275071, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": 0.23514162003993988, + "logits/rejected": 0.2450232207775116, + "logps/chosen": -41.01308822631836, + "logps/rejected": -52.138641357421875, + "loss": 0.6775, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05471659079194069, + "rewards/margins": -0.00840845424681902, + "rewards/rejected": -0.046308137476444244, + "step": 12 + }, + { + "epoch": 0.15407407407407409, + "grad_norm": 68.28764496281121, + "learning_rate": 1.9117647058823527e-07, + "logits/chosen": 0.13845381140708923, + "logits/rejected": 0.06714074313640594, + "logps/chosen": -36.72666549682617, + "logps/rejected": -44.98724365234375, + "loss": 0.6698, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.027201365679502487, + "rewards/margins": 0.0457880012691021, + "rewards/rejected": -0.07298936694860458, + "step": 13 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 67.0476123108747, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": 0.13570713996887207, + "logits/rejected": 0.02110590785741806, + "logps/chosen": -39.4144287109375, + "logps/rejected": -46.626033782958984, + "loss": 0.6694, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05015880987048149, + "rewards/margins": 0.06351868808269501, + "rewards/rejected": -0.1136775016784668, + "step": 14 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 56.97224502598152, + "learning_rate": 2.2058823529411763e-07, + "logits/chosen": 0.14530636370182037, + "logits/rejected": 0.22717420756816864, + "logps/chosen": -33.9251823425293, + "logps/rejected": -47.67527770996094, + "loss": 0.6504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07779295742511749, + "rewards/margins": 0.12380316108465195, + "rewards/rejected": -0.20159611105918884, + "step": 15 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 58.43872340752561, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": 0.1471043974161148, + "logits/rejected": 0.26890620589256287, + "logps/chosen": -36.48509979248047, + "logps/rejected": -53.876888275146484, + "loss": 0.6503, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.12628403306007385, + "rewards/margins": 0.1476879119873047, + "rewards/rejected": -0.27397194504737854, + "step": 16 + }, + { + "epoch": 0.20148148148148148, + "grad_norm": 53.27712198861993, + "learning_rate": 2.5e-07, + "logits/chosen": 0.3565051555633545, + "logits/rejected": 0.25773561000823975, + "logps/chosen": -32.79841232299805, + "logps/rejected": -36.324119567871094, + "loss": 0.6414, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06849764287471771, + "rewards/margins": 0.11378694325685501, + "rewards/rejected": -0.18228457868099213, + "step": 17 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 58.90356354190709, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -0.1638399213552475, + "logits/rejected": -0.027556225657463074, + "logps/chosen": -35.214149475097656, + "logps/rejected": -54.174049377441406, + "loss": 0.6328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15428400039672852, + "rewards/margins": 0.18084901571273804, + "rewards/rejected": -0.33513307571411133, + "step": 18 + }, + { + "epoch": 0.22518518518518518, + "grad_norm": 54.674461580476404, + "learning_rate": 2.7941176470588235e-07, + "logits/chosen": 0.2881383001804352, + "logits/rejected": 0.32903048396110535, + "logps/chosen": -30.270973205566406, + "logps/rejected": -40.31577682495117, + "loss": 0.5957, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22714650630950928, + "rewards/margins": 0.15492179989814758, + "rewards/rejected": -0.38206830620765686, + "step": 19 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 49.124258720104926, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": 0.1832781583070755, + "logits/rejected": 0.22061079740524292, + "logps/chosen": -31.4530029296875, + "logps/rejected": -43.574642181396484, + "loss": 0.5737, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2013498842716217, + "rewards/margins": 0.24712926149368286, + "rewards/rejected": -0.44847914576530457, + "step": 20 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 49.31720284300177, + "learning_rate": 3.088235294117647e-07, + "logits/chosen": 0.049509014934301376, + "logits/rejected": 0.09894949197769165, + "logps/chosen": -38.46732711791992, + "logps/rejected": -53.03515625, + "loss": 0.5683, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.25855958461761475, + "rewards/margins": 0.35330522060394287, + "rewards/rejected": -0.6118648052215576, + "step": 21 + }, + { + "epoch": 0.2607407407407407, + "grad_norm": 49.41172490919738, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": 0.09565885365009308, + "logits/rejected": 0.13914039731025696, + "logps/chosen": -28.3892765045166, + "logps/rejected": -40.65375518798828, + "loss": 0.5608, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2191687971353531, + "rewards/margins": 0.21512456238269806, + "rewards/rejected": -0.43429338932037354, + "step": 22 + }, + { + "epoch": 0.2725925925925926, + "grad_norm": 46.290056641280934, + "learning_rate": 3.3823529411764707e-07, + "logits/chosen": 0.33468642830848694, + "logits/rejected": 0.3287414312362671, + "logps/chosen": -41.56681823730469, + "logps/rejected": -49.69163131713867, + "loss": 0.5531, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6832336187362671, + "rewards/margins": 0.1509827822446823, + "rewards/rejected": -0.8342164158821106, + "step": 23 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 45.27726525885794, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": 0.29047060012817383, + "logits/rejected": 0.25504833459854126, + "logps/chosen": -40.559715270996094, + "logps/rejected": -43.744140625, + "loss": 0.5839, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4380490183830261, + "rewards/margins": 0.21297261118888855, + "rewards/rejected": -0.6510215997695923, + "step": 24 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 45.96606934366563, + "learning_rate": 3.6764705882352943e-07, + "logits/chosen": 0.19954881072044373, + "logits/rejected": 0.24337519705295563, + "logps/chosen": -26.71196937561035, + "logps/rejected": -45.159339904785156, + "loss": 0.5114, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4065704047679901, + "rewards/margins": 0.6559778451919556, + "rewards/rejected": -1.0625481605529785, + "step": 25 + }, + { + "epoch": 0.30814814814814817, + "grad_norm": 40.51480580129527, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": 0.18569591641426086, + "logits/rejected": 0.24005870521068573, + "logps/chosen": -32.402259826660156, + "logps/rejected": -50.5438117980957, + "loss": 0.4871, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35379940271377563, + "rewards/margins": 0.88099205493927, + "rewards/rejected": -1.2347913980484009, + "step": 26 + }, + { + "epoch": 0.32, + "grad_norm": 42.88615154569158, + "learning_rate": 3.9705882352941174e-07, + "logits/chosen": 0.28236454725265503, + "logits/rejected": 0.25888901948928833, + "logps/chosen": -39.2940673828125, + "logps/rejected": -53.0938606262207, + "loss": 0.4142, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6994470953941345, + "rewards/margins": 1.0878021717071533, + "rewards/rejected": -1.7872494459152222, + "step": 27 + }, + { + "epoch": 0.33185185185185184, + "grad_norm": 47.29875578100757, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": 0.3194928467273712, + "logits/rejected": 0.32101473212242126, + "logps/chosen": -45.893978118896484, + "logps/rejected": -52.5146484375, + "loss": 0.5076, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0526106357574463, + "rewards/margins": 0.718239426612854, + "rewards/rejected": -1.7708501815795898, + "step": 28 + }, + { + "epoch": 0.3437037037037037, + "grad_norm": 37.18564900832489, + "learning_rate": 4.264705882352941e-07, + "logits/chosen": 0.19094619154930115, + "logits/rejected": 0.23362189531326294, + "logps/chosen": -36.12297439575195, + "logps/rejected": -46.864871978759766, + "loss": 0.4014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7660520076751709, + "rewards/margins": 1.0089162588119507, + "rewards/rejected": -1.774968147277832, + "step": 29 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 43.31791813857577, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": 0.15151675045490265, + "logits/rejected": 0.11997775733470917, + "logps/chosen": -38.58032989501953, + "logps/rejected": -48.824798583984375, + "loss": 0.4572, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2670563459396362, + "rewards/margins": 0.831606388092041, + "rewards/rejected": -2.098662853240967, + "step": 30 + }, + { + "epoch": 0.3674074074074074, + "grad_norm": 37.063035160208685, + "learning_rate": 4.5588235294117646e-07, + "logits/chosen": 0.2560815215110779, + "logits/rejected": 0.3223097026348114, + "logps/chosen": -33.990726470947266, + "logps/rejected": -47.65345764160156, + "loss": 0.3806, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2836799621582031, + "rewards/margins": 1.0427335500717163, + "rewards/rejected": -2.326413631439209, + "step": 31 + }, + { + "epoch": 0.37925925925925924, + "grad_norm": 37.90256850942511, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": 0.18444910645484924, + "logits/rejected": 0.27790239453315735, + "logps/chosen": -34.04519271850586, + "logps/rejected": -58.64192199707031, + "loss": 0.3634, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8921431303024292, + "rewards/margins": 1.9067623615264893, + "rewards/rejected": -2.798905372619629, + "step": 32 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 38.28825801306995, + "learning_rate": 4.852941176470588e-07, + "logits/chosen": 0.21531561017036438, + "logits/rejected": 0.22173307836055756, + "logps/chosen": -38.73649978637695, + "logps/rejected": -51.891937255859375, + "loss": 0.3549, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.469386339187622, + "rewards/margins": 1.3347928524017334, + "rewards/rejected": -2.8041794300079346, + "step": 33 + }, + { + "epoch": 0.40296296296296297, + "grad_norm": 53.57688480965446, + "learning_rate": 5e-07, + "logits/chosen": -0.14031767845153809, + "logits/rejected": -0.009732939302921295, + "logps/chosen": -30.219802856445312, + "logps/rejected": -48.51620864868164, + "loss": 0.4584, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7153716683387756, + "rewards/margins": 2.133821487426758, + "rewards/rejected": -2.8491930961608887, + "step": 34 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 43.6191926386301, + "learning_rate": 4.999864732969518e-07, + "logits/chosen": 0.2249789983034134, + "logits/rejected": 0.2375878542661667, + "logps/chosen": -42.989952087402344, + "logps/rejected": -60.248451232910156, + "loss": 0.307, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3359755277633667, + "rewards/margins": 2.923412799835205, + "rewards/rejected": -4.259388446807861, + "step": 35 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 41.68496640354453, + "learning_rate": 4.999458946515807e-07, + "logits/chosen": 0.04158564656972885, + "logits/rejected": 0.04120251536369324, + "logps/chosen": -47.079593658447266, + "logps/rejected": -64.6259536743164, + "loss": 0.3243, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9504570960998535, + "rewards/margins": 2.1364314556121826, + "rewards/rejected": -4.086888313293457, + "step": 36 + }, + { + "epoch": 0.43851851851851853, + "grad_norm": 37.998309675066224, + "learning_rate": 4.998782684550491e-07, + "logits/chosen": 0.15689387917518616, + "logits/rejected": 0.22760489583015442, + "logps/chosen": -31.412202835083008, + "logps/rejected": -57.521270751953125, + "loss": 0.3499, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.422287940979004, + "rewards/margins": 2.10537052154541, + "rewards/rejected": -3.527658462524414, + "step": 37 + }, + { + "epoch": 0.45037037037037037, + "grad_norm": 39.14624538309661, + "learning_rate": 4.997836020254328e-07, + "logits/chosen": 0.09242415428161621, + "logits/rejected": 0.12390726059675217, + "logps/chosen": -38.68524932861328, + "logps/rejected": -59.322471618652344, + "loss": 0.3809, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4135229587554932, + "rewards/margins": 2.5775866508483887, + "rewards/rejected": -3.991109609603882, + "step": 38 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 34.10903781264625, + "learning_rate": 4.996619056069291e-07, + "logits/chosen": 0.15454381704330444, + "logits/rejected": 0.16882330179214478, + "logps/chosen": -44.294654846191406, + "logps/rejected": -66.8642578125, + "loss": 0.3106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2089152336120605, + "rewards/margins": 3.4627161026000977, + "rewards/rejected": -5.671631336212158, + "step": 39 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 41.64494289319624, + "learning_rate": 4.995131923687487e-07, + "logits/chosen": 0.03869347274303436, + "logits/rejected": 0.13989922404289246, + "logps/chosen": -48.224884033203125, + "logps/rejected": -68.6059341430664, + "loss": 0.3563, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3006927967071533, + "rewards/margins": 3.536550521850586, + "rewards/rejected": -5.837243556976318, + "step": 40 + }, + { + "epoch": 0.48592592592592593, + "grad_norm": 43.27974364485945, + "learning_rate": 4.993374784036901e-07, + "logits/chosen": -0.13991862535476685, + "logits/rejected": 0.015330532565712929, + "logps/chosen": -44.3278923034668, + "logps/rejected": -62.52472686767578, + "loss": 0.4348, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4953625202178955, + "rewards/margins": 3.03660249710083, + "rewards/rejected": -5.5319647789001465, + "step": 41 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 46.320431667154644, + "learning_rate": 4.991347827263982e-07, + "logits/chosen": -0.051238611340522766, + "logits/rejected": -0.0033771172165870667, + "logps/chosen": -43.90919876098633, + "logps/rejected": -65.11723327636719, + "loss": 0.4051, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.612180471420288, + "rewards/margins": 3.3464155197143555, + "rewards/rejected": -4.9585957527160645, + "step": 42 + }, + { + "epoch": 0.5096296296296297, + "grad_norm": 52.045033376790954, + "learning_rate": 4.989051272713069e-07, + "logits/chosen": -0.10396721214056015, + "logits/rejected": 0.10225249826908112, + "logps/chosen": -45.72818374633789, + "logps/rejected": -77.21394348144531, + "loss": 0.3283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5264527797698975, + "rewards/margins": 4.776139736175537, + "rewards/rejected": -7.302592754364014, + "step": 43 + }, + { + "epoch": 0.5214814814814814, + "grad_norm": 40.91521718681392, + "learning_rate": 4.986485368902656e-07, + "logits/chosen": -0.08990158140659332, + "logits/rejected": 0.014565035700798035, + "logps/chosen": -38.39900588989258, + "logps/rejected": -57.75357437133789, + "loss": 0.3584, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.275327205657959, + "rewards/margins": 2.417283058166504, + "rewards/rejected": -4.692610263824463, + "step": 44 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 50.74127412580387, + "learning_rate": 4.983650393498489e-07, + "logits/chosen": 0.037050001323223114, + "logits/rejected": -0.008276170119643211, + "logps/chosen": -50.93760299682617, + "logps/rejected": -55.42501449584961, + "loss": 0.3717, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5346438884735107, + "rewards/margins": 1.2831640243530273, + "rewards/rejected": -3.8178083896636963, + "step": 45 + }, + { + "epoch": 0.5451851851851852, + "grad_norm": 38.61624054825256, + "learning_rate": 4.980546653283537e-07, + "logits/chosen": -0.41439855098724365, + "logits/rejected": -0.4113887548446655, + "logps/chosen": -41.68315124511719, + "logps/rejected": -67.61707305908203, + "loss": 0.3181, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8887252807617188, + "rewards/margins": 4.549580097198486, + "rewards/rejected": -6.438305377960205, + "step": 46 + }, + { + "epoch": 0.557037037037037, + "grad_norm": 38.9863916338003, + "learning_rate": 4.977174484124775e-07, + "logits/chosen": -0.009788192808628082, + "logits/rejected": -0.09325724095106125, + "logps/chosen": -46.573936462402344, + "logps/rejected": -59.36362838745117, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1944921016693115, + "rewards/margins": 4.177679538726807, + "rewards/rejected": -6.372171401977539, + "step": 47 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 46.90129706189482, + "learning_rate": 4.97353425093685e-07, + "logits/chosen": -0.1219746470451355, + "logits/rejected": -0.10111116617918015, + "logps/chosen": -49.24342346191406, + "logps/rejected": -65.33878326416016, + "loss": 0.3779, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4194295406341553, + "rewards/margins": 3.601774215698242, + "rewards/rejected": -6.021203994750977, + "step": 48 + }, + { + "epoch": 0.5807407407407408, + "grad_norm": 54.305491610936826, + "learning_rate": 4.96962634764259e-07, + "logits/chosen": -0.13463924825191498, + "logits/rejected": -0.09778477251529694, + "logps/chosen": -50.75926971435547, + "logps/rejected": -64.77645111083984, + "loss": 0.3942, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.944636344909668, + "rewards/margins": 3.1688618659973145, + "rewards/rejected": -6.113498687744141, + "step": 49 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 40.79377104202132, + "learning_rate": 4.965451197130372e-07, + "logits/chosen": -0.0598304346203804, + "logits/rejected": 0.03133855387568474, + "logps/chosen": -41.48918151855469, + "logps/rejected": -72.72964477539062, + "loss": 0.3083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6170328855514526, + "rewards/margins": 4.565612316131592, + "rewards/rejected": -6.182644844055176, + "step": 50 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 42.877178470441834, + "learning_rate": 4.961009251208367e-07, + "logits/chosen": -0.014419106766581535, + "logits/rejected": -0.018680818378925323, + "logps/chosen": -34.062870025634766, + "logps/rejected": -66.85511779785156, + "loss": 0.2943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9279564023017883, + "rewards/margins": 6.2386088371276855, + "rewards/rejected": -7.166565418243408, + "step": 51 + }, + { + "epoch": 0.6162962962962963, + "grad_norm": 36.563663666294495, + "learning_rate": 4.956300990555643e-07, + "logits/chosen": -0.23208701610565186, + "logits/rejected": -0.15281593799591064, + "logps/chosen": -34.16992950439453, + "logps/rejected": -48.25387954711914, + "loss": 0.2707, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.390017032623291, + "rewards/margins": 2.9138500690460205, + "rewards/rejected": -4.303867340087891, + "step": 52 + }, + { + "epoch": 0.6281481481481481, + "grad_norm": 48.996542745383714, + "learning_rate": 4.951326924670147e-07, + "logits/chosen": -0.029582835733890533, + "logits/rejected": 0.13870403170585632, + "logps/chosen": -46.177825927734375, + "logps/rejected": -64.03628540039062, + "loss": 0.4606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.360954761505127, + "rewards/margins": 2.338740825653076, + "rewards/rejected": -4.699695587158203, + "step": 53 + }, + { + "epoch": 0.64, + "grad_norm": 37.53257312713413, + "learning_rate": 4.94608759181358e-07, + "logits/chosen": -0.2242709845304489, + "logits/rejected": -0.022289041429758072, + "logps/chosen": -43.091976165771484, + "logps/rejected": -56.94826126098633, + "loss": 0.2428, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1584746837615967, + "rewards/margins": 2.6602895259857178, + "rewards/rejected": -3.8187644481658936, + "step": 54 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 38.495616892469634, + "learning_rate": 4.940583558953137e-07, + "logits/chosen": -0.3163710832595825, + "logits/rejected": -0.2686666250228882, + "logps/chosen": -41.02560806274414, + "logps/rejected": -75.63497924804688, + "loss": 0.3113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4144959449768066, + "rewards/margins": 5.705674648284912, + "rewards/rejected": -7.120170593261719, + "step": 55 + }, + { + "epoch": 0.6637037037037037, + "grad_norm": 44.72353141808658, + "learning_rate": 4.934815421700164e-07, + "logits/chosen": -0.28492411971092224, + "logits/rejected": -0.2709801495075226, + "logps/chosen": -36.71954345703125, + "logps/rejected": -55.933624267578125, + "loss": 0.3607, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8905187845230103, + "rewards/margins": 3.9906599521636963, + "rewards/rejected": -4.881179332733154, + "step": 56 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 36.35665672904958, + "learning_rate": 4.928783804245699e-07, + "logits/chosen": 0.2555558681488037, + "logits/rejected": 0.18786108493804932, + "logps/chosen": -40.80218505859375, + "logps/rejected": -54.24163055419922, + "loss": 0.2751, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8028124570846558, + "rewards/margins": 3.192255735397339, + "rewards/rejected": -3.995068073272705, + "step": 57 + }, + { + "epoch": 0.6874074074074074, + "grad_norm": 29.859662847563737, + "learning_rate": 4.922489359292927e-07, + "logits/chosen": -0.17547199130058289, + "logits/rejected": -0.06896121799945831, + "logps/chosen": -40.20637893676758, + "logps/rejected": -68.45681762695312, + "loss": 0.2406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9151208400726318, + "rewards/margins": 4.224143981933594, + "rewards/rejected": -5.139265060424805, + "step": 58 + }, + { + "epoch": 0.6992592592592592, + "grad_norm": 33.69014037830799, + "learning_rate": 4.915932767986551e-07, + "logits/chosen": -0.2176772654056549, + "logits/rejected": -0.14603368937969208, + "logps/chosen": -35.77494430541992, + "logps/rejected": -56.28825378417969, + "loss": 0.2639, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7671470642089844, + "rewards/margins": 3.049193859100342, + "rewards/rejected": -3.816340923309326, + "step": 59 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 33.1631924356696, + "learning_rate": 4.909114739839079e-07, + "logits/chosen": -0.09617012739181519, + "logits/rejected": -0.08796259015798569, + "logps/chosen": -33.88630294799805, + "logps/rejected": -55.623878479003906, + "loss": 0.2556, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8068188428878784, + "rewards/margins": 3.1127305030822754, + "rewards/rejected": -3.9195497035980225, + "step": 60 + }, + { + "epoch": 0.7229629629629629, + "grad_norm": 40.838783872872355, + "learning_rate": 4.902036012654048e-07, + "logits/chosen": 0.11093666404485703, + "logits/rejected": 0.1355137974023819, + "logps/chosen": -34.699256896972656, + "logps/rejected": -55.77449035644531, + "loss": 0.2753, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.288212776184082, + "rewards/margins": 3.224027633666992, + "rewards/rejected": -4.512240409851074, + "step": 61 + }, + { + "epoch": 0.7348148148148148, + "grad_norm": 35.91873621159586, + "learning_rate": 4.894697352446182e-07, + "logits/chosen": -0.10412248969078064, + "logits/rejected": -0.1209147572517395, + "logps/chosen": -34.93061447143555, + "logps/rejected": -52.149208068847656, + "loss": 0.2958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7685253620147705, + "rewards/margins": 2.5662384033203125, + "rewards/rejected": -3.334764003753662, + "step": 62 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 28.279332840993643, + "learning_rate": 4.887099553358501e-07, + "logits/chosen": -0.1916661560535431, + "logits/rejected": -0.14164935052394867, + "logps/chosen": -40.58860397338867, + "logps/rejected": -50.00385284423828, + "loss": 0.229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3654404282569885, + "rewards/margins": 2.5260181427001953, + "rewards/rejected": -2.891458511352539, + "step": 63 + }, + { + "epoch": 0.7585185185185185, + "grad_norm": 43.562956759714005, + "learning_rate": 4.879243437576383e-07, + "logits/chosen": -0.09250672161579132, + "logits/rejected": -0.06184221804141998, + "logps/chosen": -33.61621856689453, + "logps/rejected": -48.81718826293945, + "loss": 0.286, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7941789031028748, + "rewards/margins": 2.6042628288269043, + "rewards/rejected": -3.398441791534424, + "step": 64 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 43.71274039442652, + "learning_rate": 4.871129855238588e-07, + "logits/chosen": -0.1322498917579651, + "logits/rejected": -0.05121883377432823, + "logps/chosen": -40.41231918334961, + "logps/rejected": -68.0873031616211, + "loss": 0.3236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39779654145240784, + "rewards/margins": 3.27742600440979, + "rewards/rejected": -3.675222396850586, + "step": 65 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 33.295271431641005, + "learning_rate": 4.862759684345269e-07, + "logits/chosen": -0.35007691383361816, + "logits/rejected": -0.3479149341583252, + "logps/chosen": -38.88081359863281, + "logps/rejected": -49.428382873535156, + "loss": 0.245, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42857033014297485, + "rewards/margins": 3.455155372619629, + "rewards/rejected": -3.883725643157959, + "step": 66 + }, + { + "epoch": 0.794074074074074, + "grad_norm": 22.368867699781628, + "learning_rate": 4.854133830662955e-07, + "logits/chosen": -0.26884549856185913, + "logits/rejected": -0.27821600437164307, + "logps/chosen": -40.386207580566406, + "logps/rejected": -55.080604553222656, + "loss": 0.1921, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1675169467926025, + "rewards/margins": 4.228756904602051, + "rewards/rejected": -5.396273136138916, + "step": 67 + }, + { + "epoch": 0.8059259259259259, + "grad_norm": 32.90815737667523, + "learning_rate": 4.845253227626536e-07, + "logits/chosen": 0.12926915287971497, + "logits/rejected": 0.03172997385263443, + "logps/chosen": -56.2171516418457, + "logps/rejected": -61.29096984863281, + "loss": 0.2468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6303930282592773, + "rewards/margins": 2.693972110748291, + "rewards/rejected": -3.3243653774261475, + "step": 68 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 31.286678704407063, + "learning_rate": 4.836118836238252e-07, + "logits/chosen": -0.11010141670703888, + "logits/rejected": -0.09770654886960983, + "logps/chosen": -38.13661193847656, + "logps/rejected": -59.40013122558594, + "loss": 0.2512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08025422692298889, + "rewards/margins": 2.979079246520996, + "rewards/rejected": -2.898824691772461, + "step": 69 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 27.574620445125923, + "learning_rate": 4.826731644963704e-07, + "logits/chosen": -0.25498491525650024, + "logits/rejected": -0.24903275072574615, + "logps/chosen": -32.7315788269043, + "logps/rejected": -47.2717399597168, + "loss": 0.222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6683451533317566, + "rewards/margins": 3.523477554321289, + "rewards/rejected": -4.191822528839111, + "step": 70 + }, + { + "epoch": 0.8414814814814815, + "grad_norm": 35.73348427460115, + "learning_rate": 4.817092669624882e-07, + "logits/chosen": -0.018255462870001793, + "logits/rejected": -0.0020784977823495865, + "logps/chosen": -34.584449768066406, + "logps/rejected": -54.02947998046875, + "loss": 0.3535, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04187864065170288, + "rewards/margins": 3.698057174682617, + "rewards/rejected": -3.739936113357544, + "step": 71 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 28.886553550180775, + "learning_rate": 4.807202953290243e-07, + "logits/chosen": -0.2388785183429718, + "logits/rejected": -0.1512915939092636, + "logps/chosen": -32.676937103271484, + "logps/rejected": -51.861934661865234, + "loss": 0.2502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43571335077285767, + "rewards/margins": 3.212143898010254, + "rewards/rejected": -3.6478569507598877, + "step": 72 + }, + { + "epoch": 0.8651851851851852, + "grad_norm": 31.329548468083193, + "learning_rate": 4.797063566161834e-07, + "logits/chosen": -0.016514137387275696, + "logits/rejected": -0.02128826081752777, + "logps/chosen": -42.21635818481445, + "logps/rejected": -53.1080436706543, + "loss": 0.2729, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.650065541267395, + "rewards/margins": 1.7332451343536377, + "rewards/rejected": -2.383310556411743, + "step": 73 + }, + { + "epoch": 0.8770370370370371, + "grad_norm": 25.241915116630672, + "learning_rate": 4.786675605459487e-07, + "logits/chosen": -0.18203724920749664, + "logits/rejected": -0.12511923909187317, + "logps/chosen": -37.860111236572266, + "logps/rejected": -67.00776672363281, + "loss": 0.2269, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.003457695245742798, + "rewards/margins": 4.446203708648682, + "rewards/rejected": -4.442746162414551, + "step": 74 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 26.546681076158777, + "learning_rate": 4.776040195302079e-07, + "logits/chosen": -0.20350059866905212, + "logits/rejected": -0.16488413512706757, + "logps/chosen": -29.37200164794922, + "logps/rejected": -53.26748275756836, + "loss": 0.2272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.028457432985305786, + "rewards/margins": 4.081994533538818, + "rewards/rejected": -4.053536891937256, + "step": 75 + }, + { + "epoch": 0.9007407407407407, + "grad_norm": 36.79575801377004, + "learning_rate": 4.76515848658589e-07, + "logits/chosen": -0.06815146654844284, + "logits/rejected": 0.04448368400335312, + "logps/chosen": -40.69670867919922, + "logps/rejected": -58.95963668823242, + "loss": 0.3201, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6759670972824097, + "rewards/margins": 2.9853808879852295, + "rewards/rejected": -3.6613478660583496, + "step": 76 + }, + { + "epoch": 0.9125925925925926, + "grad_norm": 33.040470283627364, + "learning_rate": 4.754031656860059e-07, + "logits/chosen": 0.059698522090911865, + "logits/rejected": 0.1003262847661972, + "logps/chosen": -36.32437515258789, + "logps/rejected": -45.55727005004883, + "loss": 0.2451, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1681911200284958, + "rewards/margins": 3.335202217102051, + "rewards/rejected": -3.167011022567749, + "step": 77 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 23.533686338243243, + "learning_rate": 4.74266091019916e-07, + "logits/chosen": -0.03607799857854843, + "logits/rejected": -0.07650981843471527, + "logps/chosen": -40.76026153564453, + "logps/rejected": -51.91889190673828, + "loss": 0.1909, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.06481152772903442, + "rewards/margins": 3.4574201107025146, + "rewards/rejected": -3.392608642578125, + "step": 78 + }, + { + "epoch": 0.9362962962962963, + "grad_norm": 34.00814256216538, + "learning_rate": 4.7310474770728996e-07, + "logits/chosen": -0.24094080924987793, + "logits/rejected": -0.2244417816400528, + "logps/chosen": -36.15470504760742, + "logps/rejected": -50.766448974609375, + "loss": 0.3092, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04541383683681488, + "rewards/margins": 2.0906879901885986, + "rewards/rejected": -2.1361019611358643, + "step": 79 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 28.43036399847798, + "learning_rate": 4.719192614212969e-07, + "logits/chosen": 0.04508206248283386, + "logits/rejected": 0.04841914027929306, + "logps/chosen": -44.18794631958008, + "logps/rejected": -74.9228515625, + "loss": 0.1841, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0396180152893066, + "rewards/margins": 3.7275805473327637, + "rewards/rejected": -4.76719856262207, + "step": 80 + }, + { + "epoch": 0.96, + "grad_norm": 32.15305104854747, + "learning_rate": 4.707097604477045e-07, + "logits/chosen": 0.10437710583209991, + "logits/rejected": 0.10021185874938965, + "logps/chosen": -41.348716735839844, + "logps/rejected": -53.517127990722656, + "loss": 0.2768, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03640615940093994, + "rewards/margins": 3.230294704437256, + "rewards/rejected": -3.1938881874084473, + "step": 81 + }, + { + "epoch": 0.9718518518518519, + "grad_norm": 30.563184515965578, + "learning_rate": 4.694763756709967e-07, + "logits/chosen": -0.17636063694953918, + "logits/rejected": -0.21469731628894806, + "logps/chosen": -39.04137420654297, + "logps/rejected": -52.79186248779297, + "loss": 0.2415, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2509702444076538, + "rewards/margins": 3.593198776245117, + "rewards/rejected": -3.8441689014434814, + "step": 82 + }, + { + "epoch": 0.9837037037037037, + "grad_norm": 28.891738925831007, + "learning_rate": 4.6821924056021053e-07, + "logits/chosen": -0.11742343008518219, + "logits/rejected": 0.009909386746585369, + "logps/chosen": -30.184894561767578, + "logps/rejected": -65.191650390625, + "loss": 0.2097, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2642693817615509, + "rewards/margins": 5.374275207519531, + "rewards/rejected": -5.638545036315918, + "step": 83 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 34.49435985459448, + "learning_rate": 4.669384911544926e-07, + "logits/chosen": -0.07497820258140564, + "logits/rejected": -0.026325395330786705, + "logps/chosen": -33.93386459350586, + "logps/rejected": -50.840423583984375, + "loss": 0.2826, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5045543909072876, + "rewards/margins": 2.2327775955200195, + "rewards/rejected": -2.7373318672180176, + "step": 84 + }, + { + "epoch": 1.0074074074074073, + "grad_norm": 25.60399667566007, + "learning_rate": 4.6563426604837817e-07, + "logits/chosen": -0.07658643275499344, + "logits/rejected": -0.06745781004428864, + "logps/chosen": -45.01917266845703, + "logps/rejected": -59.43114471435547, + "loss": 0.1973, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29737135767936707, + "rewards/margins": 5.139443874359131, + "rewards/rejected": -5.4368157386779785, + "step": 85 + }, + { + "epoch": 1.0192592592592593, + "grad_norm": 21.460741017889536, + "learning_rate": 4.6430670637679294e-07, + "logits/chosen": -0.2205628752708435, + "logits/rejected": -0.0864521712064743, + "logps/chosen": -31.902761459350586, + "logps/rejected": -51.85260009765625, + "loss": 0.1667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.194830521941185, + "rewards/margins": 3.8075246810913086, + "rewards/rejected": -4.002355098724365, + "step": 86 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 15.350784705262274, + "learning_rate": 4.629559557997804e-07, + "logits/chosen": -0.12179061770439148, + "logits/rejected": -0.10868389904499054, + "logps/chosen": -40.21116638183594, + "logps/rejected": -62.20214080810547, + "loss": 0.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19310609996318817, + "rewards/margins": 4.397428512573242, + "rewards/rejected": -4.590534687042236, + "step": 87 + }, + { + "epoch": 1.0429629629629629, + "grad_norm": 13.188223717252573, + "learning_rate": 4.615821604869563e-07, + "logits/chosen": -0.13621510565280914, + "logits/rejected": -0.044875748455524445, + "logps/chosen": -39.49315643310547, + "logps/rejected": -65.40257263183594, + "loss": 0.1026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10489638894796371, + "rewards/margins": 5.503582000732422, + "rewards/rejected": -5.608478546142578, + "step": 88 + }, + { + "epoch": 1.0548148148148149, + "grad_norm": 27.4435618488601, + "learning_rate": 4.6018546910169067e-07, + "logits/chosen": -0.2304653376340866, + "logits/rejected": -0.30428558588027954, + "logps/chosen": -38.354759216308594, + "logps/rejected": -57.675498962402344, + "loss": 0.1946, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1100564002990723, + "rewards/margins": 4.001605033874512, + "rewards/rejected": -5.111660957336426, + "step": 89 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 18.807247445635458, + "learning_rate": 4.5876603278502027e-07, + "logits/chosen": -0.12274541705846786, + "logits/rejected": 0.009613536298274994, + "logps/chosen": -41.16261291503906, + "logps/rejected": -73.863037109375, + "loss": 0.1405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8482255935668945, + "rewards/margins": 5.244417667388916, + "rewards/rejected": -6.0926432609558105, + "step": 90 + }, + { + "epoch": 1.0785185185185184, + "grad_norm": 24.62512250085559, + "learning_rate": 4.573240051392935e-07, + "logits/chosen": -0.1303870528936386, + "logits/rejected": -0.14886482059955597, + "logps/chosen": -39.42107009887695, + "logps/rejected": -54.32372283935547, + "loss": 0.1612, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6151587963104248, + "rewards/margins": 3.3702774047851562, + "rewards/rejected": -3.98543643951416, + "step": 91 + }, + { + "epoch": 1.0903703703703704, + "grad_norm": 18.29323289164473, + "learning_rate": 4.5585954221154853e-07, + "logits/chosen": -0.4573056101799011, + "logits/rejected": -0.31245023012161255, + "logps/chosen": -32.956050872802734, + "logps/rejected": -59.963035583496094, + "loss": 0.1649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5122833251953125, + "rewards/margins": 4.091685771942139, + "rewards/rejected": -4.603969097137451, + "step": 92 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 19.321908739095367, + "learning_rate": 4.5437280247662646e-07, + "logits/chosen": -0.023740939795970917, + "logits/rejected": -0.0003247186541557312, + "logps/chosen": -39.27166748046875, + "logps/rejected": -55.00670623779297, + "loss": 0.1251, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7074174880981445, + "rewards/margins": 3.3870468139648438, + "rewards/rejected": -4.094464302062988, + "step": 93 + }, + { + "epoch": 1.114074074074074, + "grad_norm": 17.03649474940601, + "learning_rate": 4.528639468200226e-07, + "logits/chosen": 0.2261081337928772, + "logits/rejected": 0.2597027122974396, + "logps/chosen": -37.43301773071289, + "logps/rejected": -51.47749328613281, + "loss": 0.1186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10310641676187515, + "rewards/margins": 3.6022109985351562, + "rewards/rejected": -3.4991047382354736, + "step": 94 + }, + { + "epoch": 1.125925925925926, + "grad_norm": 16.559378309982034, + "learning_rate": 4.5133313852047613e-07, + "logits/chosen": -0.11725334078073502, + "logits/rejected": -0.08741730451583862, + "logps/chosen": -33.64775085449219, + "logps/rejected": -55.1715087890625, + "loss": 0.1273, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.01142565906047821, + "rewards/margins": 3.431326150894165, + "rewards/rejected": -3.442751407623291, + "step": 95 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 20.611754629262013, + "learning_rate": 4.4978054323230144e-07, + "logits/chosen": 0.051543403416872025, + "logits/rejected": 0.11705614626407623, + "logps/chosen": -33.534278869628906, + "logps/rejected": -51.728057861328125, + "loss": 0.1365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07938066124916077, + "rewards/margins": 3.254304885864258, + "rewards/rejected": -3.17492413520813, + "step": 96 + }, + { + "epoch": 1.1496296296296296, + "grad_norm": 12.616532653018533, + "learning_rate": 4.482063289674618e-07, + "logits/chosen": -0.0702984482049942, + "logits/rejected": 0.004200812429189682, + "logps/chosen": -34.993507385253906, + "logps/rejected": -57.94596481323242, + "loss": 0.0914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19305679202079773, + "rewards/margins": 4.31368350982666, + "rewards/rejected": -4.120626449584961, + "step": 97 + }, + { + "epoch": 1.1614814814814816, + "grad_norm": 12.762301209570738, + "learning_rate": 4.466106660773884e-07, + "logits/chosen": -0.1491287350654602, + "logits/rejected": -0.03166097402572632, + "logps/chosen": -39.788230895996094, + "logps/rejected": -59.54990005493164, + "loss": 0.094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40045109391212463, + "rewards/margins": 4.702445030212402, + "rewards/rejected": -5.102896213531494, + "step": 98 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 17.826250546113457, + "learning_rate": 4.44993727234546e-07, + "logits/chosen": 0.11196614801883698, + "logits/rejected": 0.07824762165546417, + "logps/chosen": -39.74019241333008, + "logps/rejected": -49.98065185546875, + "loss": 0.1199, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.37030303478240967, + "rewards/margins": 2.883664608001709, + "rewards/rejected": -3.25396728515625, + "step": 99 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 16.340929612227512, + "learning_rate": 4.4335568741374695e-07, + "logits/chosen": -0.3065292239189148, + "logits/rejected": -0.243222177028656, + "logps/chosen": -39.87810516357422, + "logps/rejected": -49.80472946166992, + "loss": 0.1362, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09828188270330429, + "rewards/margins": 4.0216569900512695, + "rewards/rejected": -4.119938850402832, + "step": 100 + }, + { + "epoch": 1.1970370370370371, + "grad_norm": 15.511054375166765, + "learning_rate": 4.4169672387321735e-07, + "logits/chosen": -0.06340894848108292, + "logits/rejected": -0.0665612518787384, + "logps/chosen": -43.078765869140625, + "logps/rejected": -65.42969512939453, + "loss": 0.1102, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3077806830406189, + "rewards/margins": 5.954348087310791, + "rewards/rejected": -6.262128829956055, + "step": 101 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 17.09598873374456, + "learning_rate": 4.4001701613541454e-07, + "logits/chosen": 0.09894056618213654, + "logits/rejected": 0.11925836652517319, + "logps/chosen": -32.494293212890625, + "logps/rejected": -51.34983825683594, + "loss": 0.128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22729653120040894, + "rewards/margins": 4.20453405380249, + "rewards/rejected": -4.431830406188965, + "step": 102 + }, + { + "epoch": 1.2207407407407407, + "grad_norm": 22.603451847311774, + "learning_rate": 4.383167459676008e-07, + "logits/chosen": -0.09994232654571533, + "logits/rejected": -0.03670894354581833, + "logps/chosen": -34.782405853271484, + "logps/rejected": -57.43785095214844, + "loss": 0.1482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.40158677101135254, + "rewards/margins": 3.9667258262634277, + "rewards/rejected": -4.368312358856201, + "step": 103 + }, + { + "epoch": 1.2325925925925927, + "grad_norm": 17.176654931396367, + "learning_rate": 4.365960973621734e-07, + "logits/chosen": -0.3010917007923126, + "logits/rejected": -0.2688751220703125, + "logps/chosen": -30.779415130615234, + "logps/rejected": -59.66587448120117, + "loss": 0.1077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5755228996276855, + "rewards/margins": 5.500998497009277, + "rewards/rejected": -6.076521396636963, + "step": 104 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 16.54203727829074, + "learning_rate": 4.348552565167542e-07, + "logits/chosen": 0.03528839722275734, + "logits/rejected": 0.025072041898965836, + "logps/chosen": -35.11994171142578, + "logps/rejected": -49.86817169189453, + "loss": 0.1169, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.683509349822998, + "rewards/margins": 4.702343940734863, + "rewards/rejected": -5.385853290557861, + "step": 105 + }, + { + "epoch": 1.2562962962962962, + "grad_norm": 26.56869923226247, + "learning_rate": 4.330944118140406e-07, + "logits/chosen": -0.08033540099859238, + "logits/rejected": -0.015175499022006989, + "logps/chosen": -40.06020736694336, + "logps/rejected": -58.90888977050781, + "loss": 0.1479, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02627614513039589, + "rewards/margins": 4.818339824676514, + "rewards/rejected": -4.7920637130737305, + "step": 106 + }, + { + "epoch": 1.268148148148148, + "grad_norm": 14.298587740255858, + "learning_rate": 4.313137538014198e-07, + "logits/chosen": -0.08439959585666656, + "logits/rejected": -0.15276381373405457, + "logps/chosen": -34.021949768066406, + "logps/rejected": -43.68275451660156, + "loss": 0.0835, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03106861561536789, + "rewards/margins": 3.872872829437256, + "rewards/rejected": -3.841804265975952, + "step": 107 + }, + { + "epoch": 1.28, + "grad_norm": 25.029691995336584, + "learning_rate": 4.295134751703492e-07, + "logits/chosen": 0.039663467556238174, + "logits/rejected": 0.03390619903802872, + "logps/chosen": -50.46000671386719, + "logps/rejected": -61.62244415283203, + "loss": 0.1408, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41419780254364014, + "rewards/margins": 5.797426223754883, + "rewards/rejected": -6.2116241455078125, + "step": 108 + }, + { + "epoch": 1.2918518518518518, + "grad_norm": 23.688132743134673, + "learning_rate": 4.276937707355044e-07, + "logits/chosen": -0.017152896150946617, + "logits/rejected": -0.053842976689338684, + "logps/chosen": -41.905521392822266, + "logps/rejected": -65.56050109863281, + "loss": 0.1163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.573278546333313, + "rewards/margins": 6.835091590881348, + "rewards/rejected": -7.408369064331055, + "step": 109 + }, + { + "epoch": 1.3037037037037038, + "grad_norm": 22.26748210189321, + "learning_rate": 4.2585483741369755e-07, + "logits/chosen": -0.3594672679901123, + "logits/rejected": -0.2787400782108307, + "logps/chosen": -33.10090255737305, + "logps/rejected": -65.98353576660156, + "loss": 0.1206, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1710546016693115, + "rewards/margins": 5.390981197357178, + "rewards/rejected": -6.562036037445068, + "step": 110 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 12.864266715484298, + "learning_rate": 4.239968742025684e-07, + "logits/chosen": -0.1566499024629593, + "logits/rejected": -0.02022075653076172, + "logps/chosen": -31.641050338745117, + "logps/rejected": -70.607421875, + "loss": 0.0872, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5755561590194702, + "rewards/margins": 5.668292999267578, + "rewards/rejected": -6.243849277496338, + "step": 111 + }, + { + "epoch": 1.3274074074074074, + "grad_norm": 11.527896741156843, + "learning_rate": 4.2212008215905e-07, + "logits/chosen": -0.18084248900413513, + "logits/rejected": -0.06485521793365479, + "logps/chosen": -32.15618133544922, + "logps/rejected": -60.565914154052734, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.524111270904541, + "rewards/margins": 4.708486557006836, + "rewards/rejected": -5.2325968742370605, + "step": 112 + }, + { + "epoch": 1.3392592592592591, + "grad_norm": 18.981341764098254, + "learning_rate": 4.2022466437761154e-07, + "logits/chosen": 0.11151312291622162, + "logits/rejected": 0.2410213202238083, + "logps/chosen": -35.73467254638672, + "logps/rejected": -61.45711135864258, + "loss": 0.1104, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03812497854232788, + "rewards/margins": 3.9962849617004395, + "rewards/rejected": -4.034409999847412, + "step": 113 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 15.055020570658602, + "learning_rate": 4.18310825968281e-07, + "logits/chosen": -0.13380703330039978, + "logits/rejected": -0.056154295802116394, + "logps/chosen": -47.65923309326172, + "logps/rejected": -68.50222778320312, + "loss": 0.097, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0497931241989136, + "rewards/margins": 5.255577564239502, + "rewards/rejected": -6.305370807647705, + "step": 114 + }, + { + "epoch": 1.362962962962963, + "grad_norm": 15.94507937001582, + "learning_rate": 4.1637877403444923e-07, + "logits/chosen": -0.13782085478305817, + "logits/rejected": -0.1237938329577446, + "logps/chosen": -34.57268142700195, + "logps/rejected": -60.96284103393555, + "loss": 0.1328, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2373143583536148, + "rewards/margins": 5.874894142150879, + "rewards/rejected": -5.637579917907715, + "step": 115 + }, + { + "epoch": 1.374814814814815, + "grad_norm": 17.6499216351512, + "learning_rate": 4.144287176504582e-07, + "logits/chosen": -0.028891492635011673, + "logits/rejected": 0.010559901595115662, + "logps/chosen": -41.175785064697266, + "logps/rejected": -56.148109436035156, + "loss": 0.1233, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5991055965423584, + "rewards/margins": 4.076951503753662, + "rewards/rejected": -4.6760573387146, + "step": 116 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 17.440881402981706, + "learning_rate": 4.1246086783897713e-07, + "logits/chosen": -0.07107866555452347, + "logits/rejected": -0.05355262756347656, + "logps/chosen": -29.337356567382812, + "logps/rejected": -58.07263946533203, + "loss": 0.107, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25651735067367554, + "rewards/margins": 5.097599983215332, + "rewards/rejected": -4.841082572937012, + "step": 117 + }, + { + "epoch": 1.3985185185185185, + "grad_norm": 20.485113734375798, + "learning_rate": 4.104754375481664e-07, + "logits/chosen": -0.027082689106464386, + "logits/rejected": 0.0022195279598236084, + "logps/chosen": -35.18750762939453, + "logps/rejected": -54.038787841796875, + "loss": 0.0963, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7344773411750793, + "rewards/margins": 4.06456995010376, + "rewards/rejected": -4.799046993255615, + "step": 118 + }, + { + "epoch": 1.4103703703703703, + "grad_norm": 15.73045559795655, + "learning_rate": 4.084726416286337e-07, + "logits/chosen": -0.2252836376428604, + "logits/rejected": -0.1367400735616684, + "logps/chosen": -28.606430053710938, + "logps/rejected": -54.020267486572266, + "loss": 0.0872, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016864344477653503, + "rewards/margins": 4.223577499389648, + "rewards/rejected": -4.2067131996154785, + "step": 119 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 18.987468125277015, + "learning_rate": 4.0645269681018434e-07, + "logits/chosen": -0.16107773780822754, + "logits/rejected": -0.030207287520170212, + "logps/chosen": -29.666810989379883, + "logps/rejected": -61.85116958618164, + "loss": 0.119, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07655519992113113, + "rewards/margins": 5.755571365356445, + "rewards/rejected": -5.832127571105957, + "step": 120 + }, + { + "epoch": 1.434074074074074, + "grad_norm": 14.503662362117073, + "learning_rate": 4.044158216783684e-07, + "logits/chosen": -0.515570878982544, + "logits/rejected": -0.3124653100967407, + "logps/chosen": -37.86850357055664, + "logps/rejected": -69.43701934814453, + "loss": 0.0985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2003718763589859, + "rewards/margins": 7.45107364654541, + "rewards/rejected": -7.6514458656311035, + "step": 121 + }, + { + "epoch": 1.445925925925926, + "grad_norm": 18.569284512541163, + "learning_rate": 4.0236223665082605e-07, + "logits/chosen": -0.21073125302791595, + "logits/rejected": -0.2305406928062439, + "logps/chosen": -31.442550659179688, + "logps/rejected": -55.32566833496094, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6064890027046204, + "rewards/margins": 5.698796272277832, + "rewards/rejected": -6.3052849769592285, + "step": 122 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 17.336812678453057, + "learning_rate": 4.0029216395343617e-07, + "logits/chosen": -0.06248122453689575, + "logits/rejected": -0.05541558563709259, + "logps/chosen": -35.73994445800781, + "logps/rejected": -60.30720901489258, + "loss": 0.1032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7068940997123718, + "rewards/margins": 5.66333532333374, + "rewards/rejected": -6.3702287673950195, + "step": 123 + }, + { + "epoch": 1.4696296296296296, + "grad_norm": 14.094947491287853, + "learning_rate": 3.982058275962682e-07, + "logits/chosen": -0.21389123797416687, + "logits/rejected": -0.16907303035259247, + "logps/chosen": -28.326904296875, + "logps/rejected": -56.3559455871582, + "loss": 0.0846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10696560144424438, + "rewards/margins": 4.861944675445557, + "rewards/rejected": -4.968909740447998, + "step": 124 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 16.97368933164184, + "learning_rate": 3.9610345334934094e-07, + "logits/chosen": -0.13031917810440063, + "logits/rejected": -0.03798413649201393, + "logps/chosen": -41.05464172363281, + "logps/rejected": -65.29788208007812, + "loss": 0.1205, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.038674697279930115, + "rewards/margins": 6.0994062423706055, + "rewards/rejected": -6.060731887817383, + "step": 125 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 15.284029740961785, + "learning_rate": 3.939852687181915e-07, + "logits/chosen": -0.14361430704593658, + "logits/rejected": -0.10849830508232117, + "logps/chosen": -36.10377502441406, + "logps/rejected": -67.89326477050781, + "loss": 0.0939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7120703458786011, + "rewards/margins": 5.921784400939941, + "rewards/rejected": -6.633854389190674, + "step": 126 + }, + { + "epoch": 1.5051851851851852, + "grad_norm": 18.417078359527594, + "learning_rate": 3.9185150291925585e-07, + "logits/chosen": -0.21707814931869507, + "logits/rejected": -0.20940172672271729, + "logps/chosen": -36.678348541259766, + "logps/rejected": -59.70578384399414, + "loss": 0.1107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0581213235855103, + "rewards/margins": 5.090505123138428, + "rewards/rejected": -6.14862585067749, + "step": 127 + }, + { + "epoch": 1.5170370370370372, + "grad_norm": 15.713415294775897, + "learning_rate": 3.8970238685506486e-07, + "logits/chosen": -0.04535888880491257, + "logits/rejected": 0.03404983878135681, + "logps/chosen": -34.13862609863281, + "logps/rejected": -66.47319793701172, + "loss": 0.0976, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.735641360282898, + "rewards/margins": 5.098552227020264, + "rewards/rejected": -5.834194183349609, + "step": 128 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 15.290294429948705, + "learning_rate": 3.8753815308925685e-07, + "logits/chosen": -0.44447335600852966, + "logits/rejected": -0.5022441148757935, + "logps/chosen": -34.69385528564453, + "logps/rejected": -63.007972717285156, + "loss": 0.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.753921389579773, + "rewards/margins": 6.32866907119751, + "rewards/rejected": -7.082590579986572, + "step": 129 + }, + { + "epoch": 1.5407407407407407, + "grad_norm": 22.7399302377654, + "learning_rate": 3.8535903582141184e-07, + "logits/chosen": -0.3744094669818878, + "logits/rejected": -0.2060033231973648, + "logps/chosen": -32.91036605834961, + "logps/rejected": -61.743282318115234, + "loss": 0.1268, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4923054873943329, + "rewards/margins": 5.070975303649902, + "rewards/rejected": -5.5632805824279785, + "step": 130 + }, + { + "epoch": 1.5525925925925925, + "grad_norm": 19.47533309428857, + "learning_rate": 3.8316527086170727e-07, + "logits/chosen": -0.1633462905883789, + "logits/rejected": -0.07288794964551926, + "logps/chosen": -36.646484375, + "logps/rejected": -59.22307205200195, + "loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2637479305267334, + "rewards/margins": 5.560683727264404, + "rewards/rejected": -5.824431419372559, + "step": 131 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 18.594859296902843, + "learning_rate": 3.809570956054003e-07, + "logits/chosen": -0.5365747213363647, + "logits/rejected": -0.39433979988098145, + "logps/chosen": -31.459264755249023, + "logps/rejected": -62.012969970703125, + "loss": 0.1097, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.048090934753418, + "rewards/margins": 5.831287860870361, + "rewards/rejected": -6.879378318786621, + "step": 132 + }, + { + "epoch": 1.5762962962962963, + "grad_norm": 14.282275106755108, + "learning_rate": 3.787347490071389e-07, + "logits/chosen": -0.20027217268943787, + "logits/rejected": -0.11141454428434372, + "logps/chosen": -39.01911163330078, + "logps/rejected": -64.89835357666016, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3214702308177948, + "rewards/margins": 5.419382095336914, + "rewards/rejected": -5.740852355957031, + "step": 133 + }, + { + "epoch": 1.5881481481481483, + "grad_norm": 13.42088885786826, + "learning_rate": 3.764984715551031e-07, + "logits/chosen": -0.12170754373073578, + "logits/rejected": -0.046029090881347656, + "logps/chosen": -29.234134674072266, + "logps/rejected": -60.45512771606445, + "loss": 0.0895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1851249635219574, + "rewards/margins": 6.193130970001221, + "rewards/rejected": -6.378255844116211, + "step": 134 + }, + { + "epoch": 1.6, + "grad_norm": 17.391251462773063, + "learning_rate": 3.7424850524498113e-07, + "logits/chosen": -0.18073627352714539, + "logits/rejected": -0.05371435731649399, + "logps/chosen": -35.178985595703125, + "logps/rejected": -62.41261672973633, + "loss": 0.112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027589425444602966, + "rewards/margins": 5.411984443664551, + "rewards/rejected": -5.384395599365234, + "step": 135 + }, + { + "epoch": 1.6118518518518519, + "grad_norm": 13.216285367583016, + "learning_rate": 3.7198509355378207e-07, + "logits/chosen": -0.3801528811454773, + "logits/rejected": -0.3222590982913971, + "logps/chosen": -40.775054931640625, + "logps/rejected": -52.93413543701172, + "loss": 0.1169, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5116928815841675, + "rewards/margins": 3.9808902740478516, + "rewards/rejected": -5.49258279800415, + "step": 136 + }, + { + "epoch": 1.6237037037037036, + "grad_norm": 20.45643460429083, + "learning_rate": 3.6970848141348855e-07, + "logits/chosen": -0.17568367719650269, + "logits/rejected": -0.11519981920719147, + "logps/chosen": -39.35060501098633, + "logps/rejected": -59.42844009399414, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02011704444885254, + "rewards/margins": 5.978552341461182, + "rewards/rejected": -5.998669624328613, + "step": 137 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 20.16029126070904, + "learning_rate": 3.6741891518455146e-07, + "logits/chosen": -0.16909295320510864, + "logits/rejected": -0.11791606992483139, + "logps/chosen": -39.2324333190918, + "logps/rejected": -67.33000183105469, + "loss": 0.0818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9399681091308594, + "rewards/margins": 5.435766220092773, + "rewards/rejected": -6.375733375549316, + "step": 138 + }, + { + "epoch": 1.6474074074074074, + "grad_norm": 16.83558679733193, + "learning_rate": 3.6511664262923094e-07, + "logits/chosen": -0.2512515187263489, + "logits/rejected": -0.12882237136363983, + "logps/chosen": -27.65873908996582, + "logps/rejected": -61.08441925048828, + "loss": 0.0919, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27784237265586853, + "rewards/margins": 6.476930141448975, + "rewards/rejected": -6.754772663116455, + "step": 139 + }, + { + "epoch": 1.6592592592592592, + "grad_norm": 12.155079766899926, + "learning_rate": 3.6280191288478435e-07, + "logits/chosen": -0.15503238141536713, + "logits/rejected": -0.06224162131547928, + "logps/chosen": -34.50273895263672, + "logps/rejected": -62.98163604736328, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17935335636138916, + "rewards/margins": 5.786401271820068, + "rewards/rejected": -5.965755462646484, + "step": 140 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 14.838007532934355, + "learning_rate": 3.604749764365069e-07, + "logits/chosen": -0.2061130404472351, + "logits/rejected": -0.11659687012434006, + "logps/chosen": -27.322792053222656, + "logps/rejected": -62.887542724609375, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2183775156736374, + "rewards/margins": 7.025249481201172, + "rewards/rejected": -7.243628025054932, + "step": 141 + }, + { + "epoch": 1.682962962962963, + "grad_norm": 18.816292977620716, + "learning_rate": 3.5813608509062526e-07, + "logits/chosen": -0.21707522869110107, + "logits/rejected": -0.09010873734951019, + "logps/chosen": -36.751190185546875, + "logps/rejected": -75.8062744140625, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9384999871253967, + "rewards/margins": 6.734908580780029, + "rewards/rejected": -7.673408508300781, + "step": 142 + }, + { + "epoch": 1.6948148148148148, + "grad_norm": 19.8821437700751, + "learning_rate": 3.557854919470491e-07, + "logits/chosen": -0.27580782771110535, + "logits/rejected": -0.22370710968971252, + "logps/chosen": -44.46840286254883, + "logps/rejected": -56.556922912597656, + "loss": 0.1165, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3054481744766235, + "rewards/margins": 3.6606221199035645, + "rewards/rejected": -4.966070175170898, + "step": 143 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 16.302116776290895, + "learning_rate": 3.5342345137198206e-07, + "logits/chosen": -0.12199485301971436, + "logits/rejected": -0.08353496342897415, + "logps/chosen": -38.30879211425781, + "logps/rejected": -51.09149932861328, + "loss": 0.112, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1275089532136917, + "rewards/margins": 3.7598049640655518, + "rewards/rejected": -3.8873140811920166, + "step": 144 + }, + { + "epoch": 1.7185185185185186, + "grad_norm": 15.133588204814354, + "learning_rate": 3.510502189703954e-07, + "logits/chosen": -0.10772836208343506, + "logits/rejected": -0.06885837763547897, + "logps/chosen": -40.77737045288086, + "logps/rejected": -69.48592376708984, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8564466834068298, + "rewards/margins": 8.315703392028809, + "rewards/rejected": -9.172150611877441, + "step": 145 + }, + { + "epoch": 1.7303703703703703, + "grad_norm": 19.362732647667297, + "learning_rate": 3.486660515583691e-07, + "logits/chosen": -0.2726586163043976, + "logits/rejected": -0.21814075112342834, + "logps/chosen": -30.717233657836914, + "logps/rejected": -65.25225830078125, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21597516536712646, + "rewards/margins": 6.636316776275635, + "rewards/rejected": -6.852292060852051, + "step": 146 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 11.230644499357242, + "learning_rate": 3.4627120713529983e-07, + "logits/chosen": -0.2115684449672699, + "logits/rejected": -0.11108352243900299, + "logps/chosen": -34.891475677490234, + "logps/rejected": -77.10374450683594, + "loss": 0.0803, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.222525954246521, + "rewards/margins": 9.00995922088623, + "rewards/rejected": -10.232484817504883, + "step": 147 + }, + { + "epoch": 1.7540740740740741, + "grad_norm": 13.137309578079025, + "learning_rate": 3.438659448559825e-07, + "logits/chosen": -0.13276290893554688, + "logits/rejected": -0.10878665745258331, + "logps/chosen": -37.001060485839844, + "logps/rejected": -73.28951263427734, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.598874568939209, + "rewards/margins": 7.266829013824463, + "rewards/rejected": -7.865704536437988, + "step": 148 + }, + { + "epoch": 1.765925925925926, + "grad_norm": 21.677182436717583, + "learning_rate": 3.414505250025659e-07, + "logits/chosen": -0.04767221957445145, + "logits/rejected": 0.07465275377035141, + "logps/chosen": -40.81106185913086, + "logps/rejected": -67.47488403320312, + "loss": 0.0942, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7800440788269043, + "rewards/margins": 5.173450469970703, + "rewards/rejected": -5.953495025634766, + "step": 149 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 16.148540981867313, + "learning_rate": 3.390252089563867e-07, + "logits/chosen": -0.18858963251113892, + "logits/rejected": -0.20340172946453094, + "logps/chosen": -36.109954833984375, + "logps/rejected": -55.27710723876953, + "loss": 0.0796, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0519292950630188, + "rewards/margins": 5.729560852050781, + "rewards/rejected": -5.677631855010986, + "step": 150 + }, + { + "epoch": 1.7896296296296297, + "grad_norm": 24.312535239343674, + "learning_rate": 3.3659025916968475e-07, + "logits/chosen": -0.2818453013896942, + "logits/rejected": -0.19897550344467163, + "logps/chosen": -38.93151092529297, + "logps/rejected": -76.9036636352539, + "loss": 0.1537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5907295346260071, + "rewards/margins": 6.914373397827148, + "rewards/rejected": -7.505102634429932, + "step": 151 + }, + { + "epoch": 1.8014814814814815, + "grad_norm": 20.831076322049565, + "learning_rate": 3.3414593913720155e-07, + "logits/chosen": -0.22526244819164276, + "logits/rejected": -0.18128839135169983, + "logps/chosen": -34.0900764465332, + "logps/rejected": -55.26824951171875, + "loss": 0.126, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0966421514749527, + "rewards/margins": 5.949459075927734, + "rewards/rejected": -6.046100616455078, + "step": 152 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 11.740893074967538, + "learning_rate": 3.3169251336766697e-07, + "logits/chosen": -0.1995951235294342, + "logits/rejected": -0.08064538240432739, + "logps/chosen": -34.26948165893555, + "logps/rejected": -62.14799499511719, + "loss": 0.0776, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.565490484237671, + "rewards/margins": 5.796334266662598, + "rewards/rejected": -7.361824989318848, + "step": 153 + }, + { + "epoch": 1.8251851851851852, + "grad_norm": 16.192262072429642, + "learning_rate": 3.2923024735517567e-07, + "logits/chosen": -0.3225496709346771, + "logits/rejected": -0.23174233734607697, + "logps/chosen": -31.906171798706055, + "logps/rejected": -58.93853759765625, + "loss": 0.0956, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29122716188430786, + "rewards/margins": 5.078674793243408, + "rewards/rejected": -5.369902610778809, + "step": 154 + }, + { + "epoch": 1.837037037037037, + "grad_norm": 20.0031978017647, + "learning_rate": 3.2675940755045713e-07, + "logits/chosen": 0.008214278146624565, + "logits/rejected": 0.14055991172790527, + "logps/chosen": -46.526527404785156, + "logps/rejected": -81.31307983398438, + "loss": 0.1503, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6333274245262146, + "rewards/margins": 6.6069817543029785, + "rewards/rejected": -7.240309238433838, + "step": 155 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 21.076223385192634, + "learning_rate": 3.242802613320418e-07, + "logits/chosen": 0.0031320489943027496, + "logits/rejected": 0.012668165378272533, + "logps/chosen": -37.56488037109375, + "logps/rejected": -64.01717376708984, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7534648180007935, + "rewards/margins": 6.474049091339111, + "rewards/rejected": -7.227513313293457, + "step": 156 + }, + { + "epoch": 1.8607407407407406, + "grad_norm": 16.652570591848395, + "learning_rate": 3.217930769773275e-07, + "logits/chosen": -0.35275697708129883, + "logits/rejected": -0.24555784463882446, + "logps/chosen": -33.54517364501953, + "logps/rejected": -62.51717758178711, + "loss": 0.0959, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42744529247283936, + "rewards/margins": 6.722561836242676, + "rewards/rejected": -7.150007247924805, + "step": 157 + }, + { + "epoch": 1.8725925925925926, + "grad_norm": 12.597802388134623, + "learning_rate": 3.1929812363354764e-07, + "logits/chosen": -0.2830018103122711, + "logits/rejected": -0.1911703646183014, + "logps/chosen": -34.85630416870117, + "logps/rejected": -65.27281188964844, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6973313689231873, + "rewards/margins": 5.912741184234619, + "rewards/rejected": -6.610072612762451, + "step": 158 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 12.10851684482882, + "learning_rate": 3.167956712886463e-07, + "logits/chosen": -0.11961568146944046, + "logits/rejected": -0.07325749099254608, + "logps/chosen": -39.96379470825195, + "logps/rejected": -55.56465148925781, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3007912635803223, + "rewards/margins": 4.756231307983398, + "rewards/rejected": -6.0570220947265625, + "step": 159 + }, + { + "epoch": 1.8962962962962964, + "grad_norm": 20.233134180855856, + "learning_rate": 3.142859907420615e-07, + "logits/chosen": -0.10496459901332855, + "logits/rejected": 0.02405383251607418, + "logps/chosen": -33.706703186035156, + "logps/rejected": -68.788818359375, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46070706844329834, + "rewards/margins": 5.680697917938232, + "rewards/rejected": -6.14140510559082, + "step": 160 + }, + { + "epoch": 1.9081481481481481, + "grad_norm": 19.119673878649753, + "learning_rate": 3.117693535754213e-07, + "logits/chosen": -0.10953935980796814, + "logits/rejected": -0.043600670993328094, + "logps/chosen": -32.68844985961914, + "logps/rejected": -67.97357177734375, + "loss": 0.098, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.133280411362648, + "rewards/margins": 7.29306173324585, + "rewards/rejected": -7.159781455993652, + "step": 161 + }, + { + "epoch": 1.92, + "grad_norm": 20.235536301916813, + "learning_rate": 3.092460321231547e-07, + "logits/chosen": -0.25959259271621704, + "logits/rejected": -0.20939543843269348, + "logps/chosen": -35.41835021972656, + "logps/rejected": -67.97672271728516, + "loss": 0.1272, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.018587589263916, + "rewards/margins": 7.818422317504883, + "rewards/rejected": -8.837010383605957, + "step": 162 + }, + { + "epoch": 1.9318518518518517, + "grad_norm": 13.909230300778022, + "learning_rate": 3.0671629944302164e-07, + "logits/chosen": -0.12026870250701904, + "logits/rejected": -0.1089366227388382, + "logps/chosen": -36.70647048950195, + "logps/rejected": -53.47910690307617, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5273434519767761, + "rewards/margins": 5.643020153045654, + "rewards/rejected": -6.170363903045654, + "step": 163 + }, + { + "epoch": 1.9437037037037037, + "grad_norm": 12.864589613039078, + "learning_rate": 3.0418042928656415e-07, + "logits/chosen": -0.19225530326366425, + "logits/rejected": -0.10623307526111603, + "logps/chosen": -30.59122085571289, + "logps/rejected": -59.536502838134766, + "loss": 0.0836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13015054166316986, + "rewards/margins": 5.6972761154174805, + "rewards/rejected": -5.827426910400391, + "step": 164 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 15.975537356292229, + "learning_rate": 3.016386960694827e-07, + "logits/chosen": -0.41196244955062866, + "logits/rejected": -0.28604307770729065, + "logps/chosen": -39.74413299560547, + "logps/rejected": -67.16585540771484, + "loss": 0.1075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3397538661956787, + "rewards/margins": 5.61893892288208, + "rewards/rejected": -6.958693027496338, + "step": 165 + }, + { + "epoch": 1.9674074074074075, + "grad_norm": 32.234428734857076, + "learning_rate": 2.990913748419411e-07, + "logits/chosen": 0.07510136812925339, + "logits/rejected": 0.11186552792787552, + "logps/chosen": -41.38081741333008, + "logps/rejected": -67.78083038330078, + "loss": 0.1778, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3183833360671997, + "rewards/margins": 5.662228107452393, + "rewards/rejected": -5.980611801147461, + "step": 166 + }, + { + "epoch": 1.9792592592592593, + "grad_norm": 19.969450441352706, + "learning_rate": 2.9653874125880167e-07, + "logits/chosen": -0.17725233733654022, + "logits/rejected": -0.10895150154829025, + "logps/chosen": -36.67229461669922, + "logps/rejected": -61.394744873046875, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03877316415309906, + "rewards/margins": 5.38054895401001, + "rewards/rejected": -5.341775894165039, + "step": 167 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 11.405884683324148, + "learning_rate": 2.9398107154979634e-07, + "logits/chosen": -0.21582955121994019, + "logits/rejected": -0.1547984778881073, + "logps/chosen": -33.66050338745117, + "logps/rejected": -71.53202056884766, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2750725746154785, + "rewards/margins": 6.011247158050537, + "rewards/rejected": -6.286320209503174, + "step": 168 + }, + { + "epoch": 2.002962962962963, + "grad_norm": 10.684152961615103, + "learning_rate": 2.9141864248963427e-07, + "logits/chosen": -0.43596649169921875, + "logits/rejected": -0.3842291235923767, + "logps/chosen": -39.145042419433594, + "logps/rejected": -52.33580780029297, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2999729812145233, + "rewards/margins": 5.18059778213501, + "rewards/rejected": -5.4805707931518555, + "step": 169 + }, + { + "epoch": 2.0148148148148146, + "grad_norm": 7.120772046885603, + "learning_rate": 2.8885173136805125e-07, + "logits/chosen": -0.0826089009642601, + "logits/rejected": 0.052907198667526245, + "logps/chosen": -34.04817199707031, + "logps/rejected": -73.21731567382812, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0369320884346962, + "rewards/margins": 5.923883438110352, + "rewards/rejected": -5.960815906524658, + "step": 170 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 6.919197934532035, + "learning_rate": 2.862806159598032e-07, + "logits/chosen": -0.45804017782211304, + "logits/rejected": -0.408170610666275, + "logps/chosen": -35.05866241455078, + "logps/rejected": -58.95463562011719, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28513020277023315, + "rewards/margins": 6.706875324249268, + "rewards/rejected": -6.421745300292969, + "step": 171 + }, + { + "epoch": 2.0385185185185186, + "grad_norm": 7.6734571457046075, + "learning_rate": 2.837055744946072e-07, + "logits/chosen": -0.20093482732772827, + "logits/rejected": -0.15153169631958008, + "logps/chosen": -26.53860092163086, + "logps/rejected": -60.75783157348633, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1763535887002945, + "rewards/margins": 6.684775352478027, + "rewards/rejected": -6.508421421051025, + "step": 172 + }, + { + "epoch": 2.0503703703703704, + "grad_norm": 7.089127360949598, + "learning_rate": 2.811268856270332e-07, + "logits/chosen": -0.19816571474075317, + "logits/rejected": -0.17036175727844238, + "logps/chosen": -30.929105758666992, + "logps/rejected": -63.191619873046875, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12159377336502075, + "rewards/margins": 6.226848602294922, + "rewards/rejected": -6.348442077636719, + "step": 173 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 6.879444744601849, + "learning_rate": 2.7854482840634965e-07, + "logits/chosen": -0.3685060143470764, + "logits/rejected": -0.2508692145347595, + "logps/chosen": -30.003223419189453, + "logps/rejected": -66.00552368164062, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6152498126029968, + "rewards/margins": 7.741688251495361, + "rewards/rejected": -8.356938362121582, + "step": 174 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 7.35254551338382, + "learning_rate": 2.759596822463267e-07, + "logits/chosen": -0.23846019804477692, + "logits/rejected": -0.21598272025585175, + "logps/chosen": -35.864341735839844, + "logps/rejected": -60.58774948120117, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5300172567367554, + "rewards/margins": 6.528465747833252, + "rewards/rejected": -7.0584821701049805, + "step": 175 + }, + { + "epoch": 2.0859259259259257, + "grad_norm": 6.0224009777317535, + "learning_rate": 2.73371726895e-07, + "logits/chosen": -0.4624573588371277, + "logits/rejected": -0.3582268953323364, + "logps/chosen": -38.6595573425293, + "logps/rejected": -68.8594970703125, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1498335599899292, + "rewards/margins": 6.414737224578857, + "rewards/rejected": -6.564570903778076, + "step": 176 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 5.373444495677693, + "learning_rate": 2.7078124240439793e-07, + "logits/chosen": -0.293597012758255, + "logits/rejected": -0.18846404552459717, + "logps/chosen": -37.71804428100586, + "logps/rejected": -79.11727905273438, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9366405606269836, + "rewards/margins": 8.907712936401367, + "rewards/rejected": -9.844353675842285, + "step": 177 + }, + { + "epoch": 2.1096296296296297, + "grad_norm": 6.426554871574788, + "learning_rate": 2.68188509100236e-07, + "logits/chosen": -0.07860371470451355, + "logits/rejected": -0.032592758536338806, + "logps/chosen": -36.26288604736328, + "logps/rejected": -67.72529602050781, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021683871746063232, + "rewards/margins": 6.0264506340026855, + "rewards/rejected": -6.048134803771973, + "step": 178 + }, + { + "epoch": 2.1214814814814815, + "grad_norm": 7.4465908422134435, + "learning_rate": 2.6559380755158206e-07, + "logits/chosen": -0.2125643938779831, + "logits/rejected": -0.11938470602035522, + "logps/chosen": -41.93673324584961, + "logps/rejected": -67.91219329833984, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.314428448677063, + "rewards/margins": 6.577666759490967, + "rewards/rejected": -6.892095565795898, + "step": 179 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 8.207906364980415, + "learning_rate": 2.629974185404951e-07, + "logits/chosen": -0.19172216951847076, + "logits/rejected": -0.09138239920139313, + "logps/chosen": -33.569427490234375, + "logps/rejected": -83.35604858398438, + "loss": 0.0522, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1560230553150177, + "rewards/margins": 6.664412975311279, + "rewards/rejected": -6.820435523986816, + "step": 180 + }, + { + "epoch": 2.145185185185185, + "grad_norm": 7.056836784079578, + "learning_rate": 2.603996230316402e-07, + "logits/chosen": 0.04577064514160156, + "logits/rejected": 0.027345050126314163, + "logps/chosen": -28.81571388244629, + "logps/rejected": -50.763343811035156, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12452198565006256, + "rewards/margins": 5.144538879394531, + "rewards/rejected": -5.269061088562012, + "step": 181 + }, + { + "epoch": 2.157037037037037, + "grad_norm": 11.894887100183427, + "learning_rate": 2.5780070214188474e-07, + "logits/chosen": -0.2564762532711029, + "logits/rejected": -0.17662659287452698, + "logps/chosen": -44.12029266357422, + "logps/rejected": -67.84626007080078, + "loss": 0.067, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7224655151367188, + "rewards/margins": 6.115478038787842, + "rewards/rejected": -6.837944030761719, + "step": 182 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 7.773592692412279, + "learning_rate": 2.552009371098778e-07, + "logits/chosen": -0.22470326721668243, + "logits/rejected": -0.16598555445671082, + "logps/chosen": -40.50588607788086, + "logps/rejected": -66.48469543457031, + "loss": 0.0511, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6875913143157959, + "rewards/margins": 6.013528823852539, + "rewards/rejected": -6.7011213302612305, + "step": 183 + }, + { + "epoch": 2.180740740740741, + "grad_norm": 6.760102453494981, + "learning_rate": 2.5260060926561604e-07, + "logits/chosen": -0.10662063956260681, + "logits/rejected": 0.02468992955982685, + "logps/chosen": -30.439607620239258, + "logps/rejected": -68.47404479980469, + "loss": 0.042, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1491369903087616, + "rewards/margins": 7.288580894470215, + "rewards/rejected": -7.437718391418457, + "step": 184 + }, + { + "epoch": 2.1925925925925926, + "grad_norm": 8.50819275884591, + "learning_rate": 2.5e-07, + "logits/chosen": -0.314007043838501, + "logits/rejected": -0.2440987378358841, + "logps/chosen": -40.95098114013672, + "logps/rejected": -58.714630126953125, + "loss": 0.061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0700465440750122, + "rewards/margins": 4.418706893920898, + "rewards/rejected": -5.488753795623779, + "step": 185 + }, + { + "epoch": 2.2044444444444444, + "grad_norm": 7.253010502441156, + "learning_rate": 2.4739939073438393e-07, + "logits/chosen": -0.3739926815032959, + "logits/rejected": -0.29570120573043823, + "logps/chosen": -46.3231201171875, + "logps/rejected": -73.8995361328125, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3167559504508972, + "rewards/margins": 5.942593574523926, + "rewards/rejected": -6.259350299835205, + "step": 186 + }, + { + "epoch": 2.216296296296296, + "grad_norm": 10.218903811635839, + "learning_rate": 2.4479906289012216e-07, + "logits/chosen": -0.23329459130764008, + "logits/rejected": -0.16335441172122955, + "logps/chosen": -41.237979888916016, + "logps/rejected": -61.02803039550781, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29829394817352295, + "rewards/margins": 6.824221611022949, + "rewards/rejected": -6.525927543640137, + "step": 187 + }, + { + "epoch": 2.228148148148148, + "grad_norm": 5.706051711082172, + "learning_rate": 2.421992978581152e-07, + "logits/chosen": -0.20098957419395447, + "logits/rejected": -0.1625077724456787, + "logps/chosen": -33.27192306518555, + "logps/rejected": -63.34418869018555, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16527414321899414, + "rewards/margins": 7.369152069091797, + "rewards/rejected": -7.534427165985107, + "step": 188 + }, + { + "epoch": 2.24, + "grad_norm": 5.574632829097292, + "learning_rate": 2.3960037696835987e-07, + "logits/chosen": -0.15070542693138123, + "logits/rejected": -0.09224209934473038, + "logps/chosen": -36.12839126586914, + "logps/rejected": -78.54652404785156, + "loss": 0.0296, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1367915868759155, + "rewards/margins": 9.016799926757812, + "rewards/rejected": -10.15359115600586, + "step": 189 + }, + { + "epoch": 2.251851851851852, + "grad_norm": 9.222402573182322, + "learning_rate": 2.3700258145950493e-07, + "logits/chosen": -0.16236115992069244, + "logits/rejected": -0.21426168084144592, + "logps/chosen": -32.171287536621094, + "logps/rejected": -65.74127960205078, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31431764364242554, + "rewards/margins": 7.022213459014893, + "rewards/rejected": -7.336531162261963, + "step": 190 + }, + { + "epoch": 2.2637037037037038, + "grad_norm": 7.127001970520486, + "learning_rate": 2.3440619244841794e-07, + "logits/chosen": -0.2093840390443802, + "logits/rejected": -0.20409150421619415, + "logps/chosen": -31.866199493408203, + "logps/rejected": -57.632957458496094, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5825138092041016, + "rewards/margins": 5.033719539642334, + "rewards/rejected": -5.616233825683594, + "step": 191 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 7.894992729942256, + "learning_rate": 2.3181149089976404e-07, + "logits/chosen": -0.07013247907161713, + "logits/rejected": -0.04877481237053871, + "logps/chosen": -33.34114456176758, + "logps/rejected": -54.995853424072266, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006869621574878693, + "rewards/margins": 6.3635053634643555, + "rewards/rejected": -6.356635570526123, + "step": 192 + }, + { + "epoch": 2.2874074074074073, + "grad_norm": 11.705188167009563, + "learning_rate": 2.2921875759560207e-07, + "logits/chosen": -0.13075098395347595, + "logits/rejected": -0.20485468208789825, + "logps/chosen": -47.33964157104492, + "logps/rejected": -67.68054962158203, + "loss": 0.0641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.68022620677948, + "rewards/margins": 6.593554973602295, + "rewards/rejected": -7.273780822753906, + "step": 193 + }, + { + "epoch": 2.299259259259259, + "grad_norm": 7.134458814614045, + "learning_rate": 2.2662827310499995e-07, + "logits/chosen": -0.2494003027677536, + "logits/rejected": -0.20588865876197815, + "logps/chosen": -36.393733978271484, + "logps/rejected": -58.504547119140625, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11634793132543564, + "rewards/margins": 6.01495885848999, + "rewards/rejected": -6.1313066482543945, + "step": 194 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 12.206709819986875, + "learning_rate": 2.2404031775367332e-07, + "logits/chosen": -0.29956668615341187, + "logits/rejected": -0.21865390241146088, + "logps/chosen": -32.450687408447266, + "logps/rejected": -69.80120086669922, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12443738430738449, + "rewards/margins": 8.257219314575195, + "rewards/rejected": -8.132781982421875, + "step": 195 + }, + { + "epoch": 2.322962962962963, + "grad_norm": 6.1999786310250675, + "learning_rate": 2.2145517159365043e-07, + "logits/chosen": -0.5200955271720886, + "logits/rejected": -0.4296617805957794, + "logps/chosen": -36.10881805419922, + "logps/rejected": -63.84122848510742, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5274075865745544, + "rewards/margins": 6.768378257751465, + "rewards/rejected": -7.295785903930664, + "step": 196 + }, + { + "epoch": 2.334814814814815, + "grad_norm": 12.570655654481419, + "learning_rate": 2.1887311437296684e-07, + "logits/chosen": -0.31551796197891235, + "logits/rejected": -0.281019389629364, + "logps/chosen": -29.494091033935547, + "logps/rejected": -46.46226501464844, + "loss": 0.0761, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2344491183757782, + "rewards/margins": 4.557922840118408, + "rewards/rejected": -4.3234734535217285, + "step": 197 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 7.628823812638006, + "learning_rate": 2.162944255053928e-07, + "logits/chosen": -0.31029197573661804, + "logits/rejected": -0.24161145091056824, + "logps/chosen": -29.65079689025879, + "logps/rejected": -57.47043228149414, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04058557003736496, + "rewards/margins": 6.636114597320557, + "rewards/rejected": -6.595529079437256, + "step": 198 + }, + { + "epoch": 2.3585185185185185, + "grad_norm": 8.85295248102532, + "learning_rate": 2.137193840401968e-07, + "logits/chosen": -0.3979605734348297, + "logits/rejected": -0.32846879959106445, + "logps/chosen": -34.3480224609375, + "logps/rejected": -58.354896545410156, + "loss": 0.0533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3420531153678894, + "rewards/margins": 5.625160217285156, + "rewards/rejected": -5.283106803894043, + "step": 199 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 6.562343692053369, + "learning_rate": 2.1114826863194878e-07, + "logits/chosen": -0.25852352380752563, + "logits/rejected": -0.17369653284549713, + "logps/chosen": -35.14963912963867, + "logps/rejected": -68.84390258789062, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06890328228473663, + "rewards/margins": 7.833587169647217, + "rewards/rejected": -7.902491092681885, + "step": 200 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 7.374090556871228, + "learning_rate": 2.0858135751036568e-07, + "logits/chosen": -0.347494900226593, + "logits/rejected": -0.366567462682724, + "logps/chosen": -48.93431091308594, + "logps/rejected": -69.73056030273438, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1343318223953247, + "rewards/margins": 7.379057884216309, + "rewards/rejected": -8.51339054107666, + "step": 201 + }, + { + "epoch": 2.3940740740740742, + "grad_norm": 7.264335195847015, + "learning_rate": 2.060189284502037e-07, + "logits/chosen": -0.28864482045173645, + "logits/rejected": -0.14989601075649261, + "logps/chosen": -36.92799377441406, + "logps/rejected": -68.40786743164062, + "loss": 0.039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6765543818473816, + "rewards/margins": 6.945480823516846, + "rewards/rejected": -7.622035503387451, + "step": 202 + }, + { + "epoch": 2.405925925925926, + "grad_norm": 7.5132642347332155, + "learning_rate": 2.0346125874119838e-07, + "logits/chosen": -0.35054826736450195, + "logits/rejected": -0.35159796476364136, + "logps/chosen": -35.19864273071289, + "logps/rejected": -67.63153839111328, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2543597221374512, + "rewards/margins": 7.7994771003723145, + "rewards/rejected": -9.053837776184082, + "step": 203 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 10.04344534438614, + "learning_rate": 2.0090862515805895e-07, + "logits/chosen": -0.13007110357284546, + "logits/rejected": -0.1110701858997345, + "logps/chosen": -43.942832946777344, + "logps/rejected": -60.823211669921875, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1511293649673462, + "rewards/margins": 6.567122936248779, + "rewards/rejected": -7.718252658843994, + "step": 204 + }, + { + "epoch": 2.4296296296296296, + "grad_norm": 9.49554839896552, + "learning_rate": 1.983613039305173e-07, + "logits/chosen": -0.4052940905094147, + "logits/rejected": -0.23975247144699097, + "logps/chosen": -27.93557357788086, + "logps/rejected": -65.16735076904297, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3964582085609436, + "rewards/margins": 8.21355152130127, + "rewards/rejected": -8.610010147094727, + "step": 205 + }, + { + "epoch": 2.4414814814814814, + "grad_norm": 7.396028421519832, + "learning_rate": 1.9581957071343588e-07, + "logits/chosen": -0.3185134828090668, + "logits/rejected": -0.1753259003162384, + "logps/chosen": -45.08576202392578, + "logps/rejected": -88.92870330810547, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9330806732177734, + "rewards/margins": 8.258587837219238, + "rewards/rejected": -10.191668510437012, + "step": 206 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 6.455933266927346, + "learning_rate": 1.9328370055697832e-07, + "logits/chosen": -0.20377328991889954, + "logits/rejected": -0.09884392470121384, + "logps/chosen": -31.141347885131836, + "logps/rejected": -68.2867431640625, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21240955591201782, + "rewards/margins": 7.815065860748291, + "rewards/rejected": -8.027475357055664, + "step": 207 + }, + { + "epoch": 2.4651851851851854, + "grad_norm": 9.373669301801096, + "learning_rate": 1.907539678768453e-07, + "logits/chosen": -0.5164112448692322, + "logits/rejected": -0.4492265284061432, + "logps/chosen": -31.97795295715332, + "logps/rejected": -73.8591079711914, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6608830094337463, + "rewards/margins": 7.822652816772461, + "rewards/rejected": -8.483535766601562, + "step": 208 + }, + { + "epoch": 2.477037037037037, + "grad_norm": 8.80264276423414, + "learning_rate": 1.8823064642457876e-07, + "logits/chosen": -0.19362421333789825, + "logits/rejected": -0.09952510893344879, + "logps/chosen": -36.87741470336914, + "logps/rejected": -76.23078918457031, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9911991357803345, + "rewards/margins": 7.124609470367432, + "rewards/rejected": -8.115808486938477, + "step": 209 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 6.872370358967133, + "learning_rate": 1.8571400925793852e-07, + "logits/chosen": -0.28052157163619995, + "logits/rejected": -0.18670235574245453, + "logps/chosen": -34.98965835571289, + "logps/rejected": -62.01945495605469, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1422077715396881, + "rewards/margins": 7.268811225891113, + "rewards/rejected": -7.126603126525879, + "step": 210 + }, + { + "epoch": 2.5007407407407407, + "grad_norm": 6.102992921343477, + "learning_rate": 1.8320432871135376e-07, + "logits/chosen": -0.012273239903151989, + "logits/rejected": 0.11726081371307373, + "logps/chosen": -41.02720642089844, + "logps/rejected": -70.41509246826172, + "loss": 0.0326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5696543455123901, + "rewards/margins": 6.976672649383545, + "rewards/rejected": -7.546327114105225, + "step": 211 + }, + { + "epoch": 2.5125925925925925, + "grad_norm": 7.523877695462096, + "learning_rate": 1.8070187636645237e-07, + "logits/chosen": -0.25425052642822266, + "logits/rejected": -0.20172733068466187, + "logps/chosen": -29.920835494995117, + "logps/rejected": -58.127830505371094, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9108222126960754, + "rewards/margins": 5.636702060699463, + "rewards/rejected": -6.547523498535156, + "step": 212 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 8.233825538587578, + "learning_rate": 1.782069230226725e-07, + "logits/chosen": -0.1111406460404396, + "logits/rejected": -0.07830701768398285, + "logps/chosen": -36.62953567504883, + "logps/rejected": -69.00572204589844, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7046716809272766, + "rewards/margins": 8.098491668701172, + "rewards/rejected": -8.803162574768066, + "step": 213 + }, + { + "epoch": 2.536296296296296, + "grad_norm": 7.26503668987421, + "learning_rate": 1.7571973866795813e-07, + "logits/chosen": -0.3010156750679016, + "logits/rejected": -0.14240717887878418, + "logps/chosen": -28.9267635345459, + "logps/rejected": -62.530731201171875, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38880711793899536, + "rewards/margins": 7.034039497375488, + "rewards/rejected": -7.422846794128418, + "step": 214 + }, + { + "epoch": 2.5481481481481483, + "grad_norm": 5.978089867317629, + "learning_rate": 1.7324059244954292e-07, + "logits/chosen": -0.4227255582809448, + "logits/rejected": -0.40319374203681946, + "logps/chosen": -33.19075012207031, + "logps/rejected": -64.02904510498047, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5749123096466064, + "rewards/margins": 9.057025909423828, + "rewards/rejected": -9.631938934326172, + "step": 215 + }, + { + "epoch": 2.56, + "grad_norm": 9.696303087236457, + "learning_rate": 1.7076975264482433e-07, + "logits/chosen": -0.37839898467063904, + "logits/rejected": -0.273608922958374, + "logps/chosen": -36.7985954284668, + "logps/rejected": -64.79552459716797, + "loss": 0.0498, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2072323560714722, + "rewards/margins": 6.241967678070068, + "rewards/rejected": -7.449199676513672, + "step": 216 + }, + { + "epoch": 2.571851851851852, + "grad_norm": 7.034579383121281, + "learning_rate": 1.6830748663233303e-07, + "logits/chosen": -0.25258129835128784, + "logits/rejected": -0.23720452189445496, + "logps/chosen": -31.781192779541016, + "logps/rejected": -62.308006286621094, + "loss": 0.0353, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21115940809249878, + "rewards/margins": 7.114487648010254, + "rewards/rejected": -7.325647354125977, + "step": 217 + }, + { + "epoch": 2.5837037037037036, + "grad_norm": 8.754965473386916, + "learning_rate": 1.6585406086279846e-07, + "logits/chosen": -0.43037766218185425, + "logits/rejected": -0.375767320394516, + "logps/chosen": -40.90904998779297, + "logps/rejected": -76.52076721191406, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7350015044212341, + "rewards/margins": 8.066032409667969, + "rewards/rejected": -8.801033973693848, + "step": 218 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 5.4726077818919565, + "learning_rate": 1.6340974083031523e-07, + "logits/chosen": -0.29419374465942383, + "logits/rejected": -0.28271955251693726, + "logps/chosen": -30.226680755615234, + "logps/rejected": -55.334014892578125, + "loss": 0.0302, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04286186397075653, + "rewards/margins": 6.613791465759277, + "rewards/rejected": -6.570930004119873, + "step": 219 + }, + { + "epoch": 2.6074074074074076, + "grad_norm": 7.600745751196765, + "learning_rate": 1.6097479104361326e-07, + "logits/chosen": -0.38003548979759216, + "logits/rejected": -0.20779910683631897, + "logps/chosen": -26.411277770996094, + "logps/rejected": -65.45418548583984, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08172351866960526, + "rewards/margins": 7.309746742248535, + "rewards/rejected": -7.228023052215576, + "step": 220 + }, + { + "epoch": 2.6192592592592594, + "grad_norm": 5.529941543826629, + "learning_rate": 1.5854947499743413e-07, + "logits/chosen": -0.2649455666542053, + "logits/rejected": -0.1322605013847351, + "logps/chosen": -28.605796813964844, + "logps/rejected": -67.70567321777344, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8953390717506409, + "rewards/margins": 7.937841892242432, + "rewards/rejected": -8.83318042755127, + "step": 221 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 7.828964734675065, + "learning_rate": 1.5613405514401757e-07, + "logits/chosen": -0.4999098479747772, + "logits/rejected": -0.46459028124809265, + "logps/chosen": -32.34528350830078, + "logps/rejected": -64.39063262939453, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7131352424621582, + "rewards/margins": 7.366223335266113, + "rewards/rejected": -9.07935905456543, + "step": 222 + }, + { + "epoch": 2.642962962962963, + "grad_norm": 6.684903510896331, + "learning_rate": 1.537287928647002e-07, + "logits/chosen": -0.33326905965805054, + "logits/rejected": -0.2772333025932312, + "logps/chosen": -33.04732894897461, + "logps/rejected": -55.9017448425293, + "loss": 0.034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4501342475414276, + "rewards/margins": 6.482940196990967, + "rewards/rejected": -6.933073997497559, + "step": 223 + }, + { + "epoch": 2.6548148148148147, + "grad_norm": 7.774671778619875, + "learning_rate": 1.513339484416309e-07, + "logits/chosen": -0.350558876991272, + "logits/rejected": -0.28618156909942627, + "logps/chosen": -49.11450958251953, + "logps/rejected": -80.40065002441406, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9198662042617798, + "rewards/margins": 8.049423217773438, + "rewards/rejected": -9.969290733337402, + "step": 224 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 6.147372232877328, + "learning_rate": 1.489497810296046e-07, + "logits/chosen": -0.2636696696281433, + "logits/rejected": -0.1582798808813095, + "logps/chosen": -33.32222366333008, + "logps/rejected": -88.90251159667969, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8391042947769165, + "rewards/margins": 9.376372337341309, + "rewards/rejected": -10.215476036071777, + "step": 225 + }, + { + "epoch": 2.6785185185185183, + "grad_norm": 6.058104127594901, + "learning_rate": 1.4657654862801797e-07, + "logits/chosen": -0.3205685317516327, + "logits/rejected": -0.24160242080688477, + "logps/chosen": -28.52737045288086, + "logps/rejected": -70.28912353515625, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24154254794120789, + "rewards/margins": 6.4893083572387695, + "rewards/rejected": -6.730850696563721, + "step": 226 + }, + { + "epoch": 2.6903703703703705, + "grad_norm": 10.148811706386386, + "learning_rate": 1.4421450805295082e-07, + "logits/chosen": -0.33272168040275574, + "logits/rejected": -0.3017561733722687, + "logps/chosen": -40.76533889770508, + "logps/rejected": -58.748046875, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23764045536518097, + "rewards/margins": 5.5487141609191895, + "rewards/rejected": -5.7863545417785645, + "step": 227 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 7.063093696385122, + "learning_rate": 1.418639149093748e-07, + "logits/chosen": -0.5206415057182312, + "logits/rejected": -0.42945098876953125, + "logps/chosen": -35.94019317626953, + "logps/rejected": -51.362579345703125, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5331210494041443, + "rewards/margins": 5.205883502960205, + "rewards/rejected": -5.739004611968994, + "step": 228 + }, + { + "epoch": 2.714074074074074, + "grad_norm": 7.873488891465637, + "learning_rate": 1.3952502356349323e-07, + "logits/chosen": -0.2090906947851181, + "logits/rejected": -0.116541787981987, + "logps/chosen": -36.972110748291016, + "logps/rejected": -70.80089569091797, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16982126235961914, + "rewards/margins": 8.14144515991211, + "rewards/rejected": -8.311266899108887, + "step": 229 + }, + { + "epoch": 2.725925925925926, + "grad_norm": 6.247579493903217, + "learning_rate": 1.371980871152157e-07, + "logits/chosen": -0.14566001296043396, + "logits/rejected": -0.19308951497077942, + "logps/chosen": -41.63805389404297, + "logps/rejected": -75.58000183105469, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45496970415115356, + "rewards/margins": 8.866785049438477, + "rewards/rejected": -9.321755409240723, + "step": 230 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 6.023624860692066, + "learning_rate": 1.3488335737076911e-07, + "logits/chosen": -0.24541382491588593, + "logits/rejected": -0.2724686861038208, + "logps/chosen": -33.94440841674805, + "logps/rejected": -54.19181823730469, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.641716718673706, + "rewards/margins": 6.787499904632568, + "rewards/rejected": -7.4292168617248535, + "step": 231 + }, + { + "epoch": 2.74962962962963, + "grad_norm": 6.829922502232146, + "learning_rate": 1.3258108481544847e-07, + "logits/chosen": -0.2750440537929535, + "logits/rejected": -0.2242782562971115, + "logps/chosen": -47.98163986206055, + "logps/rejected": -71.17113494873047, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.330321192741394, + "rewards/margins": 5.847900390625, + "rewards/rejected": -7.178221702575684, + "step": 232 + }, + { + "epoch": 2.7614814814814816, + "grad_norm": 9.939842224750613, + "learning_rate": 1.3029151858651143e-07, + "logits/chosen": -0.4768088757991791, + "logits/rejected": -0.3678171634674072, + "logps/chosen": -28.488832473754883, + "logps/rejected": -72.2415542602539, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4457303285598755, + "rewards/margins": 7.388967514038086, + "rewards/rejected": -7.834697723388672, + "step": 233 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 5.772137373566735, + "learning_rate": 1.2801490644621788e-07, + "logits/chosen": -0.10860705375671387, + "logits/rejected": -0.09419623762369156, + "logps/chosen": -41.14183807373047, + "logps/rejected": -73.03929138183594, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0385979413986206, + "rewards/margins": 9.18422794342041, + "rewards/rejected": -10.222824096679688, + "step": 234 + }, + { + "epoch": 2.785185185185185, + "grad_norm": 6.872437338157428, + "learning_rate": 1.257514947550189e-07, + "logits/chosen": -0.32558369636535645, + "logits/rejected": -0.23454414308071136, + "logps/chosen": -27.442285537719727, + "logps/rejected": -47.84419250488281, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5560548305511475, + "rewards/margins": 5.780346870422363, + "rewards/rejected": -6.336400985717773, + "step": 235 + }, + { + "epoch": 2.797037037037037, + "grad_norm": 9.526509052512719, + "learning_rate": 1.2350152844489688e-07, + "logits/chosen": -0.3666895925998688, + "logits/rejected": -0.22046907246112823, + "logps/chosen": -38.18906021118164, + "logps/rejected": -68.36451721191406, + "loss": 0.0521, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1897006034851074, + "rewards/margins": 6.763792991638184, + "rewards/rejected": -7.953493118286133, + "step": 236 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 6.2325582014169205, + "learning_rate": 1.2126525099286108e-07, + "logits/chosen": -0.3752056956291199, + "logits/rejected": -0.29132628440856934, + "logps/chosen": -39.4105110168457, + "logps/rejected": -70.36639404296875, + "loss": 0.0344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.190868854522705, + "rewards/margins": 7.782485008239746, + "rewards/rejected": -8.973353385925293, + "step": 237 + }, + { + "epoch": 2.8207407407407405, + "grad_norm": 6.813356776563027, + "learning_rate": 1.1904290439459971e-07, + "logits/chosen": -0.4055876135826111, + "logits/rejected": -0.34407296776771545, + "logps/chosen": -36.34687805175781, + "logps/rejected": -64.90667724609375, + "loss": 0.029, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6599448323249817, + "rewards/margins": 7.470705509185791, + "rewards/rejected": -8.130650520324707, + "step": 238 + }, + { + "epoch": 2.8325925925925928, + "grad_norm": 8.170401197759237, + "learning_rate": 1.1683472913829284e-07, + "logits/chosen": -0.20514726638793945, + "logits/rejected": -0.11336632817983627, + "logps/chosen": -51.73873519897461, + "logps/rejected": -71.9918441772461, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8510772585868835, + "rewards/margins": 6.703639507293701, + "rewards/rejected": -7.554717063903809, + "step": 239 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 8.191853793447038, + "learning_rate": 1.146409641785882e-07, + "logits/chosen": -0.16900832951068878, + "logits/rejected": -0.16538989543914795, + "logps/chosen": -44.017845153808594, + "logps/rejected": -54.396583557128906, + "loss": 0.0415, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.656957983970642, + "rewards/margins": 4.629300117492676, + "rewards/rejected": -6.286258697509766, + "step": 240 + }, + { + "epoch": 2.8562962962962963, + "grad_norm": 7.381077568622568, + "learning_rate": 1.1246184691074314e-07, + "logits/chosen": -0.24956950545310974, + "logits/rejected": -0.24476227164268494, + "logps/chosen": -42.77149963378906, + "logps/rejected": -79.72845458984375, + "loss": 0.0398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6535344123840332, + "rewards/margins": 9.295120239257812, + "rewards/rejected": -9.948655128479004, + "step": 241 + }, + { + "epoch": 2.868148148148148, + "grad_norm": 8.160459964737315, + "learning_rate": 1.1029761314493518e-07, + "logits/chosen": -0.30951178073883057, + "logits/rejected": -0.3085937201976776, + "logps/chosen": -40.81586456298828, + "logps/rejected": -60.63238525390625, + "loss": 0.0367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3215899467468262, + "rewards/margins": 6.157529830932617, + "rewards/rejected": -7.479120254516602, + "step": 242 + }, + { + "epoch": 2.88, + "grad_norm": 8.037040216975743, + "learning_rate": 1.0814849708074414e-07, + "logits/chosen": -0.30921998620033264, + "logits/rejected": -0.3065870404243469, + "logps/chosen": -51.514556884765625, + "logps/rejected": -66.55127716064453, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17993119359016418, + "rewards/margins": 6.790800094604492, + "rewards/rejected": -6.970730781555176, + "step": 243 + }, + { + "epoch": 2.891851851851852, + "grad_norm": 9.656051990182503, + "learning_rate": 1.0601473128180854e-07, + "logits/chosen": -0.2805531620979309, + "logits/rejected": -0.2533468008041382, + "logps/chosen": -46.80023193359375, + "logps/rejected": -68.6700668334961, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8430509567260742, + "rewards/margins": 8.124579429626465, + "rewards/rejected": -8.967630386352539, + "step": 244 + }, + { + "epoch": 2.9037037037037035, + "grad_norm": 8.425061421071412, + "learning_rate": 1.0389654665065908e-07, + "logits/chosen": -0.2665305435657501, + "logits/rejected": -0.30099448561668396, + "logps/chosen": -36.76901626586914, + "logps/rejected": -62.380638122558594, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7122980952262878, + "rewards/margins": 7.467397689819336, + "rewards/rejected": -8.179695129394531, + "step": 245 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 7.412242548453604, + "learning_rate": 1.0179417240373182e-07, + "logits/chosen": -0.32122743129730225, + "logits/rejected": -0.22737433016300201, + "logps/chosen": -52.4939079284668, + "logps/rejected": -86.98854064941406, + "loss": 0.0332, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.623575210571289, + "rewards/margins": 8.667156219482422, + "rewards/rejected": -11.290731430053711, + "step": 246 + }, + { + "epoch": 2.9274074074074075, + "grad_norm": 6.343263474138892, + "learning_rate": 9.970783604656383e-08, + "logits/chosen": -0.40669649839401245, + "logits/rejected": -0.2913494408130646, + "logps/chosen": -40.742733001708984, + "logps/rejected": -70.4676513671875, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4790029525756836, + "rewards/margins": 7.971775054931641, + "rewards/rejected": -8.450777053833008, + "step": 247 + }, + { + "epoch": 2.9392592592592592, + "grad_norm": 8.350168639074793, + "learning_rate": 9.763776334917398e-08, + "logits/chosen": -0.28063714504241943, + "logits/rejected": -0.2614033818244934, + "logps/chosen": -39.683170318603516, + "logps/rejected": -56.151100158691406, + "loss": 0.0442, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2253810167312622, + "rewards/margins": 5.419277667999268, + "rewards/rejected": -6.644659042358398, + "step": 248 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 6.1204430667008864, + "learning_rate": 9.558417832163162e-08, + "logits/chosen": -0.13153290748596191, + "logits/rejected": -0.19138801097869873, + "logps/chosen": -38.79569625854492, + "logps/rejected": -55.324180603027344, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37145036458969116, + "rewards/margins": 6.06140661239624, + "rewards/rejected": -6.432857513427734, + "step": 249 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 7.267482062373579, + "learning_rate": 9.354730318981561e-08, + "logits/chosen": -0.4541955590248108, + "logits/rejected": -0.3866829574108124, + "logps/chosen": -31.26105308532715, + "logps/rejected": -66.00698852539062, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9516152739524841, + "rewards/margins": 7.78145170211792, + "rewards/rejected": -8.73306655883789, + "step": 250 + }, + { + "epoch": 2.974814814814815, + "grad_norm": 5.662458264963396, + "learning_rate": 9.15273583713663e-08, + "logits/chosen": -0.37392503023147583, + "logits/rejected": -0.29210343956947327, + "logps/chosen": -47.48450469970703, + "logps/rejected": -90.92308044433594, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.219913959503174, + "rewards/margins": 12.272329330444336, + "rewards/rejected": -14.492244720458984, + "step": 251 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 5.97436680259246, + "learning_rate": 8.95245624518336e-08, + "logits/chosen": -0.3016185760498047, + "logits/rejected": -0.2964284420013428, + "logps/chosen": -34.3846321105957, + "logps/rejected": -68.51025390625, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7737685441970825, + "rewards/margins": 7.51839017868042, + "rewards/rejected": -8.292159080505371, + "step": 252 + }, + { + "epoch": 2.9985185185185186, + "grad_norm": 6.809722463225596, + "learning_rate": 8.753913216102285e-08, + "logits/chosen": -0.26927924156188965, + "logits/rejected": -0.09772679954767227, + "logps/chosen": -39.219181060791016, + "logps/rejected": -83.87496185302734, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.874872088432312, + "rewards/margins": 7.590030193328857, + "rewards/rejected": -9.464902877807617, + "step": 253 + }, + { + "epoch": 3.0103703703703704, + "grad_norm": 5.7996860529913565, + "learning_rate": 8.557128234954189e-08, + "logits/chosen": -0.40512141585350037, + "logits/rejected": -0.3163852393627167, + "logps/chosen": -29.341394424438477, + "logps/rejected": -70.40370178222656, + "loss": 0.0283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9042503833770752, + "rewards/margins": 8.99386978149414, + "rewards/rejected": -9.898119926452637, + "step": 254 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 5.286377326614028, + "learning_rate": 8.362122596555088e-08, + "logits/chosen": -0.47114288806915283, + "logits/rejected": -0.3825288712978363, + "logps/chosen": -33.60106658935547, + "logps/rejected": -76.53120422363281, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.207690954208374, + "rewards/margins": 10.646196365356445, + "rewards/rejected": -11.853886604309082, + "step": 255 + }, + { + "epoch": 3.034074074074074, + "grad_norm": 6.253088250116567, + "learning_rate": 8.16891740317189e-08, + "logits/chosen": -0.3761712908744812, + "logits/rejected": -0.35195398330688477, + "logps/chosen": -33.14177322387695, + "logps/rejected": -55.4194450378418, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6758537888526917, + "rewards/margins": 6.090365409851074, + "rewards/rejected": -6.766219615936279, + "step": 256 + }, + { + "epoch": 3.0459259259259257, + "grad_norm": 6.986196206894695, + "learning_rate": 7.977533562238838e-08, + "logits/chosen": -0.4037611186504364, + "logits/rejected": -0.3634166419506073, + "logps/chosen": -34.40519332885742, + "logps/rejected": -70.6146240234375, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2543308138847351, + "rewards/margins": 7.034722328186035, + "rewards/rejected": -7.289052963256836, + "step": 257 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 5.505207911608369, + "learning_rate": 7.787991784094999e-08, + "logits/chosen": -0.2387389987707138, + "logits/rejected": -0.08904880285263062, + "logps/chosen": -36.03857421875, + "logps/rejected": -89.65563201904297, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6398571133613586, + "rewards/margins": 8.411537170410156, + "rewards/rejected": -9.051393508911133, + "step": 258 + }, + { + "epoch": 3.0696296296296297, + "grad_norm": 6.3788891492475415, + "learning_rate": 7.60031257974316e-08, + "logits/chosen": -0.36758318543434143, + "logits/rejected": -0.28655973076820374, + "logps/chosen": -34.49348831176758, + "logps/rejected": -75.51551818847656, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3124196529388428, + "rewards/margins": 9.141406059265137, + "rewards/rejected": -10.453825950622559, + "step": 259 + }, + { + "epoch": 3.0814814814814815, + "grad_norm": 7.213670504047207, + "learning_rate": 7.414516258630244e-08, + "logits/chosen": -0.2934122681617737, + "logits/rejected": -0.2819547653198242, + "logps/chosen": -49.67085647583008, + "logps/rejected": -82.62693786621094, + "loss": 0.0342, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0997469425201416, + "rewards/margins": 9.53528118133545, + "rewards/rejected": -10.635027885437012, + "step": 260 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 5.692432592478289, + "learning_rate": 7.230622926449564e-08, + "logits/chosen": -0.2739347219467163, + "logits/rejected": -0.20775115489959717, + "logps/chosen": -37.56914520263672, + "logps/rejected": -65.73677062988281, + "loss": 0.0333, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4430259466171265, + "rewards/margins": 8.155195236206055, + "rewards/rejected": -9.598221778869629, + "step": 261 + }, + { + "epoch": 3.105185185185185, + "grad_norm": 7.076581419281233, + "learning_rate": 7.048652482965078e-08, + "logits/chosen": -0.18068230152130127, + "logits/rejected": -0.1852649450302124, + "logps/chosen": -46.34651184082031, + "logps/rejected": -68.5006103515625, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1328610181808472, + "rewards/margins": 7.682218551635742, + "rewards/rejected": -8.815078735351562, + "step": 262 + }, + { + "epoch": 3.117037037037037, + "grad_norm": 6.338236077273577, + "learning_rate": 6.868624619858021e-08, + "logits/chosen": -0.2783002257347107, + "logits/rejected": -0.3345209062099457, + "logps/chosen": -36.772254943847656, + "logps/rejected": -83.5664291381836, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4274479150772095, + "rewards/margins": 7.226326942443848, + "rewards/rejected": -7.653774261474609, + "step": 263 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 5.495669924723658, + "learning_rate": 6.690558818595943e-08, + "logits/chosen": -0.34206265211105347, + "logits/rejected": -0.19553421437740326, + "logps/chosen": -32.38424301147461, + "logps/rejected": -85.89747619628906, + "loss": 0.0276, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1448084115982056, + "rewards/margins": 9.975635528564453, + "rewards/rejected": -11.120444297790527, + "step": 264 + }, + { + "epoch": 3.140740740740741, + "grad_norm": 5.0666203766090385, + "learning_rate": 6.514474348324581e-08, + "logits/chosen": -0.38811901211738586, + "logits/rejected": -0.2747833728790283, + "logps/chosen": -48.448951721191406, + "logps/rejected": -77.10838317871094, + "loss": 0.028, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6878550052642822, + "rewards/margins": 7.391786575317383, + "rewards/rejected": -9.079641342163086, + "step": 265 + }, + { + "epoch": 3.1525925925925926, + "grad_norm": 8.885007920801026, + "learning_rate": 6.340390263782655e-08, + "logits/chosen": -0.5093058347702026, + "logits/rejected": -0.3832343816757202, + "logps/chosen": -32.23210144042969, + "logps/rejected": -76.0536117553711, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.038543462753296, + "rewards/margins": 9.122876167297363, + "rewards/rejected": -10.161420822143555, + "step": 266 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 7.984860081690398, + "learning_rate": 6.168325403239913e-08, + "logits/chosen": -0.4433887004852295, + "logits/rejected": -0.3792242109775543, + "logps/chosen": -29.287479400634766, + "logps/rejected": -58.2768669128418, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46901828050613403, + "rewards/margins": 7.602441787719727, + "rewards/rejected": -8.071460723876953, + "step": 267 + }, + { + "epoch": 3.176296296296296, + "grad_norm": 4.958032867566694, + "learning_rate": 5.998298386458545e-08, + "logits/chosen": -0.22974154353141785, + "logits/rejected": -0.19992095232009888, + "logps/chosen": -38.70039367675781, + "logps/rejected": -72.84990692138672, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0671855211257935, + "rewards/margins": 7.9272379875183105, + "rewards/rejected": -8.994423866271973, + "step": 268 + }, + { + "epoch": 3.188148148148148, + "grad_norm": 5.988243698878292, + "learning_rate": 5.830327612678265e-08, + "logits/chosen": -0.32180365920066833, + "logits/rejected": -0.3045603632926941, + "logps/chosen": -43.980316162109375, + "logps/rejected": -83.64070129394531, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.710637331008911, + "rewards/margins": 9.31506061553955, + "rewards/rejected": -12.025696754455566, + "step": 269 + }, + { + "epoch": 3.2, + "grad_norm": 5.602823035809897, + "learning_rate": 5.6644312586253044e-08, + "logits/chosen": -0.0014043133705854416, + "logits/rejected": -0.04007536917924881, + "logps/chosen": -63.08719253540039, + "logps/rejected": -80.3905029296875, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9075740575790405, + "rewards/margins": 7.626714706420898, + "rewards/rejected": -9.53428840637207, + "step": 270 + }, + { + "epoch": 3.211851851851852, + "grad_norm": 7.326638872216747, + "learning_rate": 5.5006272765454056e-08, + "logits/chosen": -0.43287378549575806, + "logits/rejected": -0.27151188254356384, + "logps/chosen": -33.51972579956055, + "logps/rejected": -58.82566452026367, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.519016444683075, + "rewards/margins": 6.466248512268066, + "rewards/rejected": -6.985265731811523, + "step": 271 + }, + { + "epoch": 3.2237037037037037, + "grad_norm": 4.755215217001098, + "learning_rate": 5.338933392261158e-08, + "logits/chosen": -0.2298121452331543, + "logits/rejected": -0.17249788343906403, + "logps/chosen": -37.40292739868164, + "logps/rejected": -64.0435562133789, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.865021288394928, + "rewards/margins": 6.909914016723633, + "rewards/rejected": -7.774934768676758, + "step": 272 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 7.328735036408531, + "learning_rate": 5.1793671032538206e-08, + "logits/chosen": -0.5466493368148804, + "logits/rejected": -0.49364450573921204, + "logps/chosen": -31.749622344970703, + "logps/rejected": -76.33462524414062, + "loss": 0.0351, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5585615634918213, + "rewards/margins": 8.347299575805664, + "rewards/rejected": -8.905860900878906, + "step": 273 + }, + { + "epoch": 3.2474074074074073, + "grad_norm": 4.866161229053689, + "learning_rate": 5.021945676769859e-08, + "logits/chosen": -0.5478118658065796, + "logits/rejected": -0.38123688101768494, + "logps/chosen": -26.91775131225586, + "logps/rejected": -66.9200668334961, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39309221506118774, + "rewards/margins": 8.147336959838867, + "rewards/rejected": -8.540430068969727, + "step": 274 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 5.465123706910723, + "learning_rate": 4.866686147952387e-08, + "logits/chosen": -0.15793977677822113, + "logits/rejected": -0.13396653532981873, + "logps/chosen": -38.39078140258789, + "logps/rejected": -68.49137115478516, + "loss": 0.0332, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5564990639686584, + "rewards/margins": 7.465914726257324, + "rewards/rejected": -8.022414207458496, + "step": 275 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 6.892669002618101, + "learning_rate": 4.71360531799774e-08, + "logits/chosen": -0.17291544377803802, + "logits/rejected": -0.11444761604070663, + "logps/chosen": -52.285491943359375, + "logps/rejected": -84.1561508178711, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8530375957489014, + "rewards/margins": 7.457571983337402, + "rewards/rejected": -10.310609817504883, + "step": 276 + }, + { + "epoch": 3.282962962962963, + "grad_norm": 4.92957758138627, + "learning_rate": 4.562719752337349e-08, + "logits/chosen": -0.47689568996429443, + "logits/rejected": -0.38014689087867737, + "logps/chosen": -51.07635498046875, + "logps/rejected": -94.97547149658203, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.258345603942871, + "rewards/margins": 8.710776329040527, + "rewards/rejected": -10.969121932983398, + "step": 277 + }, + { + "epoch": 3.294814814814815, + "grad_norm": 5.454251196984929, + "learning_rate": 4.4140457788451434e-08, + "logits/chosen": -0.3425113260746002, + "logits/rejected": -0.25056517124176025, + "logps/chosen": -31.684978485107422, + "logps/rejected": -69.71234130859375, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24049748480319977, + "rewards/margins": 7.845020771026611, + "rewards/rejected": -8.085518836975098, + "step": 278 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 5.883268469101933, + "learning_rate": 4.267599486070647e-08, + "logits/chosen": -0.172508105635643, + "logits/rejected": -0.20862886309623718, + "logps/chosen": -39.89122772216797, + "logps/rejected": -52.314510345458984, + "loss": 0.0292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3147379755973816, + "rewards/margins": 5.7033185958862305, + "rewards/rejected": -6.0180559158325195, + "step": 279 + }, + { + "epoch": 3.3185185185185184, + "grad_norm": 6.523804321940008, + "learning_rate": 4.1233967214979764e-08, + "logits/chosen": -0.3144129812717438, + "logits/rejected": -0.22518262267112732, + "logps/chosen": -42.5799560546875, + "logps/rejected": -53.63530349731445, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.022313117980957, + "rewards/margins": 4.446239471435547, + "rewards/rejected": -5.468553066253662, + "step": 280 + }, + { + "epoch": 3.33037037037037, + "grad_norm": 5.61217781900571, + "learning_rate": 3.9814530898309356e-08, + "logits/chosen": -0.2805265784263611, + "logits/rejected": -0.17468589544296265, + "logps/chosen": -36.89150619506836, + "logps/rejected": -73.11276245117188, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6258598566055298, + "rewards/margins": 8.552447319030762, + "rewards/rejected": -9.17830753326416, + "step": 281 + }, + { + "epoch": 3.3422222222222224, + "grad_norm": 5.921953356785593, + "learning_rate": 3.8417839513043646e-08, + "logits/chosen": -0.28441232442855835, + "logits/rejected": -0.20493623614311218, + "logps/chosen": -41.19779586791992, + "logps/rejected": -60.01893615722656, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5952982902526855, + "rewards/margins": 5.5062713623046875, + "rewards/rejected": -7.101569652557373, + "step": 282 + }, + { + "epoch": 3.354074074074074, + "grad_norm": 5.347638108561962, + "learning_rate": 3.704404420021956e-08, + "logits/chosen": -0.3048914670944214, + "logits/rejected": -0.17332229018211365, + "logps/chosen": -33.9517936706543, + "logps/rejected": -71.91447448730469, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8167021870613098, + "rewards/margins": 8.507079124450684, + "rewards/rejected": -9.32378101348877, + "step": 283 + }, + { + "epoch": 3.365925925925926, + "grad_norm": 5.238374490213889, + "learning_rate": 3.569329362320708e-08, + "logits/chosen": -0.2728411853313446, + "logits/rejected": -0.2584533393383026, + "logps/chosen": -30.238557815551758, + "logps/rejected": -75.02142333984375, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49139711260795593, + "rewards/margins": 9.32203197479248, + "rewards/rejected": -9.813429832458496, + "step": 284 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 6.525469512599618, + "learning_rate": 3.436573395162179e-08, + "logits/chosen": -0.3524834215641022, + "logits/rejected": -0.30102020502090454, + "logps/chosen": -30.73918914794922, + "logps/rejected": -59.49354553222656, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09267859905958176, + "rewards/margins": 6.891367435455322, + "rewards/rejected": -6.984046459197998, + "step": 285 + }, + { + "epoch": 3.3896296296296295, + "grad_norm": 5.564012623506871, + "learning_rate": 3.306150884550732e-08, + "logits/chosen": -0.3768519461154938, + "logits/rejected": -0.3264191448688507, + "logps/chosen": -41.36799240112305, + "logps/rejected": -67.31135559082031, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0806761980056763, + "rewards/margins": 7.03513240814209, + "rewards/rejected": -8.115808486938477, + "step": 286 + }, + { + "epoch": 3.4014814814814813, + "grad_norm": 5.931359172922728, + "learning_rate": 3.17807594397895e-08, + "logits/chosen": -0.3008241653442383, + "logits/rejected": -0.1854274868965149, + "logps/chosen": -35.424800872802734, + "logps/rejected": -75.77165222167969, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3784024715423584, + "rewards/margins": 9.62315845489502, + "rewards/rejected": -11.00156021118164, + "step": 287 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 5.696220945908767, + "learning_rate": 3.052362432900332e-08, + "logits/chosen": -0.4203820526599884, + "logits/rejected": -0.3877807557582855, + "logps/chosen": -37.054630279541016, + "logps/rejected": -61.031986236572266, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9897009134292603, + "rewards/margins": 7.574882984161377, + "rewards/rejected": -8.564583778381348, + "step": 288 + }, + { + "epoch": 3.4251851851851853, + "grad_norm": 5.040781372610454, + "learning_rate": 2.9290239552295538e-08, + "logits/chosen": -0.04420602694153786, + "logits/rejected": -0.0779787227511406, + "logps/chosen": -49.03828811645508, + "logps/rejected": -64.86919403076172, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1655118465423584, + "rewards/margins": 7.464972496032715, + "rewards/rejected": -8.630484580993652, + "step": 289 + }, + { + "epoch": 3.437037037037037, + "grad_norm": 6.3238698051370825, + "learning_rate": 2.8080738578703052e-08, + "logits/chosen": -0.2396240085363388, + "logits/rejected": -0.18526090681552887, + "logps/chosen": -35.554222106933594, + "logps/rejected": -80.34398651123047, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15909190475940704, + "rewards/margins": 11.187956809997559, + "rewards/rejected": -11.347049713134766, + "step": 290 + }, + { + "epoch": 3.448888888888889, + "grad_norm": 6.096416193339527, + "learning_rate": 2.6895252292709974e-08, + "logits/chosen": -0.3143896460533142, + "logits/rejected": -0.2961388826370239, + "logps/chosen": -45.88547897338867, + "logps/rejected": -72.7055892944336, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5558788776397705, + "rewards/margins": 7.8544135093688965, + "rewards/rejected": -9.41029167175293, + "step": 291 + }, + { + "epoch": 3.4607407407407407, + "grad_norm": 6.109539533433742, + "learning_rate": 2.5733908980083984e-08, + "logits/chosen": -0.20717650651931763, + "logits/rejected": -0.1371382474899292, + "logps/chosen": -34.01182556152344, + "logps/rejected": -69.25910949707031, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.003832221031189, + "rewards/margins": 6.742988586425781, + "rewards/rejected": -7.74682092666626, + "step": 292 + }, + { + "epoch": 3.4725925925925925, + "grad_norm": 5.397666624928578, + "learning_rate": 2.4596834313994037e-08, + "logits/chosen": -0.2090422511100769, + "logits/rejected": -0.1905803680419922, + "logps/chosen": -37.257659912109375, + "logps/rejected": -59.028656005859375, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.009922012686729431, + "rewards/margins": 7.401028633117676, + "rewards/rejected": -7.410951614379883, + "step": 293 + }, + { + "epoch": 3.4844444444444447, + "grad_norm": 4.277274237028717, + "learning_rate": 2.3484151341411018e-08, + "logits/chosen": -0.28495097160339355, + "logits/rejected": -0.13665924966335297, + "logps/chosen": -30.814395904541016, + "logps/rejected": -71.38455963134766, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.883634090423584, + "rewards/margins": 8.096288681030273, + "rewards/rejected": -8.979923248291016, + "step": 294 + }, + { + "epoch": 3.4962962962962965, + "grad_norm": 5.969698336918465, + "learning_rate": 2.23959804697921e-08, + "logits/chosen": 0.0012427568435668945, + "logits/rejected": -0.05258895084261894, + "logps/chosen": -43.88676071166992, + "logps/rejected": -69.91567993164062, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0892465114593506, + "rewards/margins": 8.158609390258789, + "rewards/rejected": -9.247856140136719, + "step": 295 + }, + { + "epoch": 3.5081481481481482, + "grad_norm": 4.0211602970173095, + "learning_rate": 2.1332439454051277e-08, + "logits/chosen": -0.20002827048301697, + "logits/rejected": -0.1429349184036255, + "logps/chosen": -34.301002502441406, + "logps/rejected": -55.76853561401367, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09878039360046387, + "rewards/margins": 6.826313495635986, + "rewards/rejected": -6.925093650817871, + "step": 296 + }, + { + "epoch": 3.52, + "grad_norm": 5.481157431560272, + "learning_rate": 2.029364338381656e-08, + "logits/chosen": -0.38807621598243713, + "logits/rejected": -0.3801242709159851, + "logps/chosen": -46.397727966308594, + "logps/rejected": -55.0689582824707, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3208569288253784, + "rewards/margins": 5.903377056121826, + "rewards/rejected": -6.224234580993652, + "step": 297 + }, + { + "epoch": 3.531851851851852, + "grad_norm": 5.352166370096892, + "learning_rate": 1.9279704670975726e-08, + "logits/chosen": -0.2355162501335144, + "logits/rejected": -0.07986889034509659, + "logps/chosen": -34.6320686340332, + "logps/rejected": -71.03166198730469, + "loss": 0.0233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7058801651000977, + "rewards/margins": 7.573249816894531, + "rewards/rejected": -8.279129981994629, + "step": 298 + }, + { + "epoch": 3.5437037037037036, + "grad_norm": 5.786741095008315, + "learning_rate": 1.829073303751172e-08, + "logits/chosen": -0.3151942193508148, + "logits/rejected": -0.33904606103897095, + "logps/chosen": -29.33087921142578, + "logps/rejected": -61.94608688354492, + "loss": 0.0277, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2769862413406372, + "rewards/margins": 8.623825073242188, + "rewards/rejected": -9.900812149047852, + "step": 299 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 4.36518877931545, + "learning_rate": 1.732683550362954e-08, + "logits/chosen": -0.23127204179763794, + "logits/rejected": -0.15411251783370972, + "logps/chosen": -50.591552734375, + "logps/rejected": -77.29461669921875, + "loss": 0.0216, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3554518222808838, + "rewards/margins": 7.547949314117432, + "rewards/rejected": -8.903400421142578, + "step": 300 + }, + { + "epoch": 3.5674074074074076, + "grad_norm": 4.847754834407613, + "learning_rate": 1.6388116376174765e-08, + "logits/chosen": -0.3548241853713989, + "logits/rejected": -0.2721732556819916, + "logps/chosen": -38.101295471191406, + "logps/rejected": -78.74886322021484, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5179784297943115, + "rewards/margins": 8.753091812133789, + "rewards/rejected": -10.27107048034668, + "step": 301 + }, + { + "epoch": 3.5792592592592594, + "grad_norm": 7.781488916643506, + "learning_rate": 1.5474677237346468e-08, + "logits/chosen": -0.3061152994632721, + "logits/rejected": -0.3108983337879181, + "logps/chosen": -41.504512786865234, + "logps/rejected": -78.45973205566406, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2482175827026367, + "rewards/margins": 8.759658813476562, + "rewards/rejected": -10.0078763961792, + "step": 302 + }, + { + "epoch": 3.591111111111111, + "grad_norm": 6.94980194588748, + "learning_rate": 1.4586616933704527e-08, + "logits/chosen": -0.018258891999721527, + "logits/rejected": -0.007322182413190603, + "logps/chosen": -54.701812744140625, + "logps/rejected": -73.30741882324219, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3399360179901123, + "rewards/margins": 6.371276378631592, + "rewards/rejected": -7.711213111877441, + "step": 303 + }, + { + "epoch": 3.602962962962963, + "grad_norm": 6.423577076131564, + "learning_rate": 1.372403156547311e-08, + "logits/chosen": -0.43270713090896606, + "logits/rejected": -0.36236703395843506, + "logps/chosen": -33.90000534057617, + "logps/rejected": -60.915306091308594, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.318174958229065, + "rewards/margins": 7.222588062286377, + "rewards/rejected": -8.540762901306152, + "step": 304 + }, + { + "epoch": 3.6148148148148147, + "grad_norm": 4.091095308871669, + "learning_rate": 1.2887014476141212e-08, + "logits/chosen": -0.30502232909202576, + "logits/rejected": -0.36349910497665405, + "logps/chosen": -40.1556510925293, + "logps/rejected": -68.93075561523438, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.034249335527420044, + "rewards/margins": 9.958725929260254, + "rewards/rejected": -9.992975234985352, + "step": 305 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 6.177303284695512, + "learning_rate": 1.2075656242361732e-08, + "logits/chosen": -0.2732085585594177, + "logits/rejected": -0.1902616024017334, + "logps/chosen": -34.206607818603516, + "logps/rejected": -68.56214904785156, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0075308084487915, + "rewards/margins": 8.230666160583496, + "rewards/rejected": -9.238195419311523, + "step": 306 + }, + { + "epoch": 3.6385185185185183, + "grad_norm": 4.816450379471901, + "learning_rate": 1.1290044664149873e-08, + "logits/chosen": -0.11257211118936539, + "logits/rejected": -0.13122713565826416, + "logps/chosen": -48.688873291015625, + "logps/rejected": -74.23355102539062, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2951655387878418, + "rewards/margins": 8.786190032958984, + "rewards/rejected": -10.081355094909668, + "step": 307 + }, + { + "epoch": 3.6503703703703705, + "grad_norm": 6.851539041964001, + "learning_rate": 1.0530264755381824e-08, + "logits/chosen": -0.4197385013103485, + "logits/rejected": -0.43813198804855347, + "logps/chosen": -33.74197769165039, + "logps/rejected": -56.243553161621094, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6660629510879517, + "rewards/margins": 5.48460054397583, + "rewards/rejected": -6.150663375854492, + "step": 308 + }, + { + "epoch": 3.6622222222222223, + "grad_norm": 4.955677182707875, + "learning_rate": 9.796398734595284e-09, + "logits/chosen": -0.26788032054901123, + "logits/rejected": -0.26230883598327637, + "logps/chosen": -28.0145206451416, + "logps/rejected": -51.17897033691406, + "loss": 0.0243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11930375546216965, + "rewards/margins": 5.247139930725098, + "rewards/rejected": -5.366443634033203, + "step": 309 + }, + { + "epoch": 3.674074074074074, + "grad_norm": 5.676351317770164, + "learning_rate": 9.088526016092141e-09, + "logits/chosen": -0.3300250470638275, + "logits/rejected": -0.31215977668762207, + "logps/chosen": -33.89133834838867, + "logps/rejected": -67.8143539428711, + "loss": 0.033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08850759267807007, + "rewards/margins": 9.25622272491455, + "rewards/rejected": -9.344730377197266, + "step": 310 + }, + { + "epoch": 3.685925925925926, + "grad_norm": 5.070558881430833, + "learning_rate": 8.40672320134489e-09, + "logits/chosen": -0.305147260427475, + "logits/rejected": -0.17320549488067627, + "logps/chosen": -33.947872161865234, + "logps/rejected": -74.02790832519531, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0814896896481514, + "rewards/margins": 9.62078857421875, + "rewards/rejected": -9.539299011230469, + "step": 311 + }, + { + "epoch": 3.6977777777777776, + "grad_norm": 4.665245148624672, + "learning_rate": 7.751064070707247e-09, + "logits/chosen": -0.44625863432884216, + "logits/rejected": -0.4555772542953491, + "logps/chosen": -42.39094161987305, + "logps/rejected": -67.60736846923828, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6547796130180359, + "rewards/margins": 7.189840793609619, + "rewards/rejected": -7.844620227813721, + "step": 312 + }, + { + "epoch": 3.70962962962963, + "grad_norm": 7.083642027360033, + "learning_rate": 7.12161957543006e-09, + "logits/chosen": -0.251234769821167, + "logits/rejected": -0.13808919489383698, + "logps/chosen": -54.0128288269043, + "logps/rejected": -92.3724365234375, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6177242994308472, + "rewards/margins": 8.311200141906738, + "rewards/rejected": -9.928923606872559, + "step": 313 + }, + { + "epoch": 3.7214814814814816, + "grad_norm": 4.713181664530663, + "learning_rate": 6.518457829983559e-09, + "logits/chosen": -0.3703707158565521, + "logits/rejected": -0.2648147940635681, + "logps/chosen": -48.93006896972656, + "logps/rejected": -66.18186950683594, + "loss": 0.0258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7085801362991333, + "rewards/margins": 5.302105903625488, + "rewards/rejected": -6.010685920715332, + "step": 314 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 4.802422252346654, + "learning_rate": 5.9416441046862555e-09, + "logits/chosen": -0.3589542508125305, + "logits/rejected": -0.3287428319454193, + "logps/chosen": -27.614389419555664, + "logps/rejected": -59.38009262084961, + "loss": 0.0301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17799943685531616, + "rewards/margins": 7.3723649978637695, + "rewards/rejected": -7.550364017486572, + "step": 315 + }, + { + "epoch": 3.745185185185185, + "grad_norm": 5.504211939139539, + "learning_rate": 5.3912408186420064e-09, + "logits/chosen": -0.19225972890853882, + "logits/rejected": -0.24219948053359985, + "logps/chosen": -39.883174896240234, + "logps/rejected": -57.19430160522461, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7933384776115417, + "rewards/margins": 7.422837257385254, + "rewards/rejected": -8.21617603302002, + "step": 316 + }, + { + "epoch": 3.757037037037037, + "grad_norm": 5.560326027847558, + "learning_rate": 4.867307532985227e-09, + "logits/chosen": -0.48098868131637573, + "logits/rejected": -0.3568829894065857, + "logps/chosen": -54.420379638671875, + "logps/rejected": -85.24766540527344, + "loss": 0.0297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9853860139846802, + "rewards/margins": 7.260770320892334, + "rewards/rejected": -9.246155738830566, + "step": 317 + }, + { + "epoch": 3.7688888888888887, + "grad_norm": 7.196198216327019, + "learning_rate": 4.369900944435734e-09, + "logits/chosen": -0.16600579023361206, + "logits/rejected": -0.0450252965092659, + "logps/chosen": -42.486473083496094, + "logps/rejected": -81.37212371826172, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9354689121246338, + "rewards/margins": 7.651963233947754, + "rewards/rejected": -8.587431907653809, + "step": 318 + }, + { + "epoch": 3.7807407407407405, + "grad_norm": 6.279239735558416, + "learning_rate": 3.899074879163244e-09, + "logits/chosen": -0.40112194418907166, + "logits/rejected": -0.339353084564209, + "logps/chosen": -34.84368133544922, + "logps/rejected": -61.614341735839844, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0974161624908447, + "rewards/margins": 7.075118064880371, + "rewards/rejected": -8.172533988952637, + "step": 319 + }, + { + "epoch": 3.7925925925925927, + "grad_norm": 6.9892724428640705, + "learning_rate": 3.4548802869627804e-09, + "logits/chosen": -0.3009003698825836, + "logits/rejected": -0.21556389331817627, + "logps/chosen": -41.47652816772461, + "logps/rejected": -67.13237762451172, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2825208604335785, + "rewards/margins": 4.948946952819824, + "rewards/rejected": -5.231468200683594, + "step": 320 + }, + { + "epoch": 3.8044444444444445, + "grad_norm": 6.856578977189586, + "learning_rate": 3.037365235741024e-09, + "logits/chosen": -0.16049635410308838, + "logits/rejected": -0.12033607065677643, + "logps/chosen": -32.831031799316406, + "logps/rejected": -62.16967010498047, + "loss": 0.0433, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.41414308547973633, + "rewards/margins": 7.311939239501953, + "rewards/rejected": -7.726081371307373, + "step": 321 + }, + { + "epoch": 3.8162962962962963, + "grad_norm": 9.951539288327684, + "learning_rate": 2.6465749063149245e-09, + "logits/chosen": -0.6789449453353882, + "logits/rejected": -0.6332409381866455, + "logps/chosen": -36.879947662353516, + "logps/rejected": -80.12794494628906, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3466782569885254, + "rewards/margins": 9.836599349975586, + "rewards/rejected": -11.183279037475586, + "step": 322 + }, + { + "epoch": 3.828148148148148, + "grad_norm": 6.85068630218398, + "learning_rate": 2.282551587522441e-09, + "logits/chosen": -0.5462524890899658, + "logits/rejected": -0.43087050318717957, + "logps/chosen": -32.245567321777344, + "logps/rejected": -56.410179138183594, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1382488012313843, + "rewards/margins": 6.717283248901367, + "rewards/rejected": -7.855532169342041, + "step": 323 + }, + { + "epoch": 3.84, + "grad_norm": 6.505661948537336, + "learning_rate": 1.9453346716462316e-09, + "logits/chosen": -0.3759889602661133, + "logits/rejected": -0.4021185040473938, + "logps/chosen": -36.99543380737305, + "logps/rejected": -46.82232666015625, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.881350576877594, + "rewards/margins": 4.79249382019043, + "rewards/rejected": -5.673844337463379, + "step": 324 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 6.937575318919963, + "learning_rate": 1.6349606501509794e-09, + "logits/chosen": -0.2532769441604614, + "logits/rejected": -0.2831670045852661, + "logps/chosen": -41.752403259277344, + "logps/rejected": -52.2098388671875, + "loss": 0.0363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7358890771865845, + "rewards/margins": 6.947170734405518, + "rewards/rejected": -7.6830596923828125, + "step": 325 + }, + { + "epoch": 3.863703703703704, + "grad_norm": 3.940918067532272, + "learning_rate": 1.351463109734441e-09, + "logits/chosen": -0.6125096082687378, + "logits/rejected": -0.3610725700855255, + "logps/chosen": -36.78853225708008, + "logps/rejected": -68.35499572753906, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8846864700317383, + "rewards/margins": 8.920228004455566, + "rewards/rejected": -9.804913520812988, + "step": 326 + }, + { + "epoch": 3.8755555555555556, + "grad_norm": 5.731819862714682, + "learning_rate": 1.0948727286930192e-09, + "logits/chosen": -0.07925964891910553, + "logits/rejected": -0.04111175611615181, + "logps/chosen": -35.71060562133789, + "logps/rejected": -57.488651275634766, + "loss": 0.0309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09710303694009781, + "rewards/margins": 6.214367866516113, + "rewards/rejected": -6.311470985412598, + "step": 327 + }, + { + "epoch": 3.8874074074074074, + "grad_norm": 7.4093470103902845, + "learning_rate": 8.652172736017816e-10, + "logits/chosen": -0.2285485416650772, + "logits/rejected": -0.22180640697479248, + "logps/chosen": -45.58677673339844, + "logps/rejected": -74.19960021972656, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34262484312057495, + "rewards/margins": 6.549678325653076, + "rewards/rejected": -6.892302989959717, + "step": 328 + }, + { + "epoch": 3.899259259259259, + "grad_norm": 5.597791664125433, + "learning_rate": 6.625215963098896e-10, + "logits/chosen": -0.22412040829658508, + "logits/rejected": -0.25815126299858093, + "logps/chosen": -39.15158462524414, + "logps/rejected": -52.841060638427734, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1085604429244995, + "rewards/margins": 5.7370734214782715, + "rewards/rejected": -6.845633506774902, + "step": 329 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 4.652252051429055, + "learning_rate": 4.868076312512515e-10, + "logits/chosen": -0.3777186870574951, + "logits/rejected": -0.3193652629852295, + "logps/chosen": -32.07898712158203, + "logps/rejected": -64.57249450683594, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5756787657737732, + "rewards/margins": 7.689653396606445, + "rewards/rejected": -8.265332221984863, + "step": 330 + }, + { + "epoch": 3.9229629629629628, + "grad_norm": 5.81932820439485, + "learning_rate": 3.3809439307086463e-10, + "logits/chosen": -0.20917584002017975, + "logits/rejected": -0.12255613505840302, + "logps/chosen": -31.15184783935547, + "logps/rejected": -63.176910400390625, + "loss": 0.0335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12342405319213867, + "rewards/margins": 6.832948684692383, + "rewards/rejected": -6.956372261047363, + "step": 331 + }, + { + "epoch": 3.934814814814815, + "grad_norm": 6.583884307182356, + "learning_rate": 2.1639797456723952e-10, + "logits/chosen": -0.3113446831703186, + "logits/rejected": -0.34424495697021484, + "logps/chosen": -50.414649963378906, + "logps/rejected": -66.15986633300781, + "loss": 0.0305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8464060425758362, + "rewards/margins": 7.252386093139648, + "rewards/rejected": -8.098793029785156, + "step": 332 + }, + { + "epoch": 3.9466666666666668, + "grad_norm": 5.902458355798929, + "learning_rate": 1.21731544950876e-10, + "logits/chosen": -0.23517660796642303, + "logits/rejected": -0.3104686737060547, + "logps/chosen": -45.34723663330078, + "logps/rejected": -86.45767974853516, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3326948881149292, + "rewards/margins": 9.871392250061035, + "rewards/rejected": -11.204087257385254, + "step": 333 + }, + { + "epoch": 3.9585185185185185, + "grad_norm": 5.025564856386988, + "learning_rate": 5.4105348419264394e-11, + "logits/chosen": -0.6560889482498169, + "logits/rejected": -0.6381913423538208, + "logps/chosen": -31.789920806884766, + "logps/rejected": -57.232948303222656, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43755847215652466, + "rewards/margins": 6.743692398071289, + "rewards/rejected": -7.181251525878906, + "step": 334 + }, + { + "epoch": 3.9703703703703703, + "grad_norm": 4.328140885361808, + "learning_rate": 1.3526703048216682e-11, + "logits/chosen": -0.5039613246917725, + "logits/rejected": -0.4086998999118805, + "logps/chosen": -34.5649528503418, + "logps/rejected": -85.53738403320312, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7040150165557861, + "rewards/margins": 11.698722839355469, + "rewards/rejected": -12.402737617492676, + "step": 335 + }, + { + "epoch": 3.982222222222222, + "grad_norm": 5.989193672621692, + "learning_rate": 0.0, + "logits/chosen": -0.3169300854206085, + "logits/rejected": -0.32203078269958496, + "logps/chosen": -37.71720886230469, + "logps/rejected": -64.16942596435547, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6902213096618652, + "rewards/margins": 7.074714183807373, + "rewards/rejected": -7.76493501663208, + "step": 336 + }, + { + "epoch": 3.982222222222222, + "step": 336, + "total_flos": 0.0, + "train_loss": 0.14947671072912358, + "train_runtime": 64472.8667, + "train_samples_per_second": 0.67, + "train_steps_per_second": 0.005 + } + ], + "logging_steps": 1, + "max_steps": 336, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 200, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}