{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 1910, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 2.21875, "learning_rate": 2.617801047120419e-08, "logits/chosen": -2.2882843017578125, "logits/rejected": -2.187748432159424, "logps/chosen": -352.931640625, "logps/rejected": -289.82647705078125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 1.6640625, "learning_rate": 2.617801047120419e-07, "logits/chosen": -2.276374340057373, "logits/rejected": -2.168278932571411, "logps/chosen": -297.0078430175781, "logps/rejected": -234.94876098632812, "loss": 0.6934, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": 0.0010548082645982504, "rewards/margins": -8.178578718798235e-05, "rewards/rejected": 0.001136594102717936, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 1.53125, "learning_rate": 5.235602094240838e-07, "logits/chosen": -2.2468607425689697, "logits/rejected": -2.1891446113586426, "logps/chosen": -255.3127899169922, "logps/rejected": -243.41683959960938, "loss": 0.6923, "rewards/accuracies": 0.59375, "rewards/chosen": 0.010193122550845146, "rewards/margins": 0.0027918864507228136, "rewards/rejected": 0.007401236332952976, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 1.6328125, "learning_rate": 7.853403141361258e-07, "logits/chosen": -2.149106502532959, "logits/rejected": -2.098741054534912, "logps/chosen": -269.2379455566406, "logps/rejected": -230.85171508789062, "loss": 0.691, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0259039755910635, "rewards/margins": 0.004932164214551449, "rewards/rejected": 0.020971812307834625, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 1.625, "learning_rate": 1.0471204188481676e-06, "logits/chosen": -2.2627243995666504, "logits/rejected": -2.186748504638672, "logps/chosen": -258.2460021972656, "logps/rejected": -261.2386474609375, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03576713055372238, "rewards/margins": 0.00888143666088581, "rewards/rejected": 0.02688569389283657, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 1.6171875, "learning_rate": 1.3089005235602096e-06, "logits/chosen": -2.2943031787872314, "logits/rejected": -2.185983896255493, "logps/chosen": -275.8368225097656, "logps/rejected": -245.889404296875, "loss": 0.6862, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04387350007891655, "rewards/margins": 0.017351223155856133, "rewards/rejected": 0.026522275060415268, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 1.6015625, "learning_rate": 1.5706806282722515e-06, "logits/chosen": -2.2970504760742188, "logits/rejected": -2.142427444458008, "logps/chosen": -301.9741516113281, "logps/rejected": -252.27340698242188, "loss": 0.6817, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.051615793257951736, "rewards/margins": 0.024326015263795853, "rewards/rejected": 0.027289772406220436, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 1.6484375, "learning_rate": 1.8324607329842933e-06, "logits/chosen": -2.3068978786468506, "logits/rejected": -2.151620626449585, "logps/chosen": -258.6965637207031, "logps/rejected": -226.7814483642578, "loss": 0.6773, "rewards/accuracies": 0.71875, "rewards/chosen": 0.05611586570739746, "rewards/margins": 0.03871893882751465, "rewards/rejected": 0.017396926879882812, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 1.78125, "learning_rate": 2.094240837696335e-06, "logits/chosen": -2.2305335998535156, "logits/rejected": -2.167931318283081, "logps/chosen": -272.13787841796875, "logps/rejected": -263.77557373046875, "loss": 0.6763, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05462193489074707, "rewards/margins": 0.041644386947155, "rewards/rejected": 0.012977546080946922, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 1.7109375, "learning_rate": 2.356020942408377e-06, "logits/chosen": -2.265904188156128, "logits/rejected": -2.198087215423584, "logps/chosen": -244.1629638671875, "logps/rejected": -235.1984405517578, "loss": 0.6758, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.039959438145160675, "rewards/margins": 0.034967485815286636, "rewards/rejected": 0.004991954658180475, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 1.8359375, "learning_rate": 2.617801047120419e-06, "logits/chosen": -2.284364938735962, "logits/rejected": -2.13216495513916, "logps/chosen": -236.353271484375, "logps/rejected": -198.71908569335938, "loss": 0.6635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.042473744601011276, "rewards/margins": 0.07647411525249481, "rewards/rejected": -0.034000374376773834, "step": 100 }, { "epoch": 0.05234231876472128, "eval_logits/chosen": -2.1424341201782227, "eval_logits/rejected": -2.0469071865081787, "eval_logps/chosen": -264.0175476074219, "eval_logps/rejected": -247.58309936523438, "eval_loss": 0.6640408039093018, "eval_rewards/accuracies": 0.6830000281333923, "eval_rewards/chosen": 0.012926395051181316, "eval_rewards/margins": 0.07607292383909225, "eval_rewards/rejected": -0.06314651668071747, "eval_runtime": 419.0041, "eval_samples_per_second": 4.773, "eval_steps_per_second": 0.298, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 2.0625, "learning_rate": 2.8795811518324613e-06, "logits/chosen": -2.256439447402954, "logits/rejected": -2.129594326019287, "logps/chosen": -259.8465881347656, "logps/rejected": -214.66455078125, "loss": 0.6551, "rewards/accuracies": 0.6875, "rewards/chosen": -0.001864680671133101, "rewards/margins": 0.09259083122015, "rewards/rejected": -0.09445551782846451, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 2.546875, "learning_rate": 3.141361256544503e-06, "logits/chosen": -2.2723679542541504, "logits/rejected": -2.1830503940582275, "logps/chosen": -317.4821472167969, "logps/rejected": -291.89141845703125, "loss": 0.6617, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06429656594991684, "rewards/margins": 0.06679816544055939, "rewards/rejected": -0.13109473884105682, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 2.296875, "learning_rate": 3.403141361256545e-06, "logits/chosen": -2.2017934322357178, "logits/rejected": -2.1079394817352295, "logps/chosen": -288.2923889160156, "logps/rejected": -273.95855712890625, "loss": 0.6459, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.05360643193125725, "rewards/margins": 0.12373937666416168, "rewards/rejected": -0.17734579741954803, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 2.578125, "learning_rate": 3.6649214659685865e-06, "logits/chosen": -2.1394448280334473, "logits/rejected": -2.0755527019500732, "logps/chosen": -271.42828369140625, "logps/rejected": -276.28118896484375, "loss": 0.6416, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.197081059217453, "rewards/margins": 0.14488813281059265, "rewards/rejected": -0.34196919202804565, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 3.484375, "learning_rate": 3.926701570680629e-06, "logits/chosen": -2.182858467102051, "logits/rejected": -2.0637383460998535, "logps/chosen": -291.70867919921875, "logps/rejected": -289.05181884765625, "loss": 0.6315, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20205065608024597, "rewards/margins": 0.13050489127635956, "rewards/rejected": -0.3325555920600891, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 3.5625, "learning_rate": 4.18848167539267e-06, "logits/chosen": -2.290510654449463, "logits/rejected": -2.123711109161377, "logps/chosen": -300.88226318359375, "logps/rejected": -282.322265625, "loss": 0.6245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3104555010795593, "rewards/margins": 0.1691991090774536, "rewards/rejected": -0.4796546399593353, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 2.375, "learning_rate": 4.450261780104713e-06, "logits/chosen": -2.131312131881714, "logits/rejected": -2.1103098392486572, "logps/chosen": -247.94003295898438, "logps/rejected": -265.16619873046875, "loss": 0.6256, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.07472637295722961, "rewards/margins": 0.18387499451637268, "rewards/rejected": -0.2586013674736023, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 3.078125, "learning_rate": 4.712041884816754e-06, "logits/chosen": -2.241760730743408, "logits/rejected": -2.09507417678833, "logps/chosen": -307.455810546875, "logps/rejected": -297.59942626953125, "loss": 0.6358, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18923737108707428, "rewards/margins": 0.17130598425865173, "rewards/rejected": -0.36054331064224243, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 3.890625, "learning_rate": 4.9738219895287965e-06, "logits/chosen": -2.1892220973968506, "logits/rejected": -2.068312168121338, "logps/chosen": -312.1988830566406, "logps/rejected": -304.8963928222656, "loss": 0.6112, "rewards/accuracies": 0.71875, "rewards/chosen": -0.39928480982780457, "rewards/margins": 0.23079952597618103, "rewards/rejected": -0.6300843954086304, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 3.78125, "learning_rate": 4.999661831436499e-06, "logits/chosen": -2.144768238067627, "logits/rejected": -2.0535120964050293, "logps/chosen": -311.6337890625, "logps/rejected": -324.09271240234375, "loss": 0.6119, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5136710405349731, "rewards/margins": 0.24318833649158478, "rewards/rejected": -0.7568593621253967, "step": 200 }, { "epoch": 0.10468463752944256, "eval_logits/chosen": -2.044912099838257, "eval_logits/rejected": -1.9524328708648682, "eval_logps/chosen": -320.8676452636719, "eval_logps/rejected": -323.3910827636719, "eval_loss": 0.6207260489463806, "eval_rewards/accuracies": 0.6790000200271606, "eval_rewards/chosen": -0.5555742383003235, "eval_rewards/margins": 0.2656523585319519, "eval_rewards/rejected": -0.8212265372276306, "eval_runtime": 418.3747, "eval_samples_per_second": 4.78, "eval_steps_per_second": 0.299, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 5.15625, "learning_rate": 4.9984929711403395e-06, "logits/chosen": -2.093073844909668, "logits/rejected": -1.9828155040740967, "logps/chosen": -272.14434814453125, "logps/rejected": -275.6336975097656, "loss": 0.6375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.39326217770576477, "rewards/margins": 0.1773087978363037, "rewards/rejected": -0.5705710053443909, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 4.84375, "learning_rate": 4.996489634487865e-06, "logits/chosen": -2.136625051498413, "logits/rejected": -2.05576491355896, "logps/chosen": -302.97900390625, "logps/rejected": -302.3898010253906, "loss": 0.5994, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.29794472455978394, "rewards/margins": 0.29993799328804016, "rewards/rejected": -0.5978826880455017, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 3.828125, "learning_rate": 4.9936524905772466e-06, "logits/chosen": -2.0738844871520996, "logits/rejected": -1.9929462671279907, "logps/chosen": -280.2706298828125, "logps/rejected": -310.33148193359375, "loss": 0.576, "rewards/accuracies": 0.8125, "rewards/chosen": -0.42290401458740234, "rewards/margins": 0.41287049651145935, "rewards/rejected": -0.8357745409011841, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 4.375, "learning_rate": 4.9899824869915e-06, "logits/chosen": -2.1067252159118652, "logits/rejected": -2.0308070182800293, "logps/chosen": -333.824462890625, "logps/rejected": -358.0045166015625, "loss": 0.5908, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8457317352294922, "rewards/margins": 0.3239857256412506, "rewards/rejected": -1.16971755027771, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 5.15625, "learning_rate": 4.985480849482012e-06, "logits/chosen": -2.0834193229675293, "logits/rejected": -1.948685884475708, "logps/chosen": -371.03472900390625, "logps/rejected": -339.9703674316406, "loss": 0.5867, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.684076189994812, "rewards/margins": 0.28442835807800293, "rewards/rejected": -0.9685045480728149, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 4.75, "learning_rate": 4.980149081559142e-06, "logits/chosen": -2.097219944000244, "logits/rejected": -2.031381607055664, "logps/chosen": -321.78955078125, "logps/rejected": -324.4165954589844, "loss": 0.5984, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5368736982345581, "rewards/margins": 0.34659940004348755, "rewards/rejected": -0.8834732174873352, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 4.03125, "learning_rate": 4.9739889639900655e-06, "logits/chosen": -2.0381791591644287, "logits/rejected": -1.9909448623657227, "logps/chosen": -340.8160705566406, "logps/rejected": -379.08575439453125, "loss": 0.5608, "rewards/accuracies": 0.75, "rewards/chosen": -0.7386736869812012, "rewards/margins": 0.4662117063999176, "rewards/rejected": -1.204885482788086, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 7.0, "learning_rate": 4.967002554204009e-06, "logits/chosen": -2.0414297580718994, "logits/rejected": -2.0170059204101562, "logps/chosen": -358.2557678222656, "logps/rejected": -398.809326171875, "loss": 0.5953, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.9335516691207886, "rewards/margins": 0.3366740942001343, "rewards/rejected": -1.2702257633209229, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 4.875, "learning_rate": 4.959192185605089e-06, "logits/chosen": -2.239074230194092, "logits/rejected": -2.134113311767578, "logps/chosen": -366.4399719238281, "logps/rejected": -371.07293701171875, "loss": 0.589, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5544485449790955, "rewards/margins": 0.426436185836792, "rewards/rejected": -0.9808847308158875, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 3.578125, "learning_rate": 4.950560466792969e-06, "logits/chosen": -2.1252686977386475, "logits/rejected": -2.0176823139190674, "logps/chosen": -313.9684143066406, "logps/rejected": -321.84600830078125, "loss": 0.5874, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.45316630601882935, "rewards/margins": 0.42708855867385864, "rewards/rejected": -0.880254864692688, "step": 300 }, { "epoch": 0.15702695629416383, "eval_logits/chosen": -2.0493786334991455, "eval_logits/rejected": -1.9609495401382446, "eval_logps/chosen": -307.7115173339844, "eval_logps/rejected": -321.7127685546875, "eval_loss": 0.5849232077598572, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": -0.4240134358406067, "eval_rewards/margins": 0.380429744720459, "eval_rewards/rejected": -0.8044431209564209, "eval_runtime": 417.9483, "eval_samples_per_second": 4.785, "eval_steps_per_second": 0.299, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 6.9375, "learning_rate": 4.9411102806916185e-06, "logits/chosen": -2.093439817428589, "logits/rejected": -2.018660306930542, "logps/chosen": -311.8453063964844, "logps/rejected": -316.3194885253906, "loss": 0.5748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4401545524597168, "rewards/margins": 0.3743765354156494, "rewards/rejected": -0.8145310282707214, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 7.25, "learning_rate": 4.930844783586424e-06, "logits/chosen": -2.11296010017395, "logits/rejected": -1.9415552616119385, "logps/chosen": -324.8458251953125, "logps/rejected": -336.02606201171875, "loss": 0.5613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6088758707046509, "rewards/margins": 0.4853138327598572, "rewards/rejected": -1.0941897630691528, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 4.625, "learning_rate": 4.919767404070033e-06, "logits/chosen": -2.0613181591033936, "logits/rejected": -1.933327078819275, "logps/chosen": -333.0088806152344, "logps/rejected": -363.67864990234375, "loss": 0.5743, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7408894300460815, "rewards/margins": 0.5184401869773865, "rewards/rejected": -1.2593296766281128, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 5.8125, "learning_rate": 4.907881841897216e-06, "logits/chosen": -2.1247360706329346, "logits/rejected": -2.043912172317505, "logps/chosen": -368.68804931640625, "logps/rejected": -375.9783935546875, "loss": 0.5642, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8249163627624512, "rewards/margins": 0.3822299540042877, "rewards/rejected": -1.207146406173706, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 5.96875, "learning_rate": 4.89519206674919e-06, "logits/chosen": -1.9861600399017334, "logits/rejected": -1.9854027032852173, "logps/chosen": -362.7377624511719, "logps/rejected": -466.30859375, "loss": 0.4995, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2443187236785889, "rewards/margins": 0.8100869059562683, "rewards/rejected": -2.054405689239502, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 6.75, "learning_rate": 4.881702316907769e-06, "logits/chosen": -2.0150065422058105, "logits/rejected": -1.8952875137329102, "logps/chosen": -412.15167236328125, "logps/rejected": -451.9383850097656, "loss": 0.5385, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.192842960357666, "rewards/margins": 0.7140182256698608, "rewards/rejected": -1.9068610668182373, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 6.3125, "learning_rate": 4.86741709783982e-06, "logits/chosen": -1.8865272998809814, "logits/rejected": -1.7796827554702759, "logps/chosen": -363.8604431152344, "logps/rejected": -400.41717529296875, "loss": 0.5527, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1201820373535156, "rewards/margins": 0.6793785095214844, "rewards/rejected": -1.799560785293579, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 5.71875, "learning_rate": 4.852341180692471e-06, "logits/chosen": -1.8515560626983643, "logits/rejected": -1.8265050649642944, "logps/chosen": -352.3423156738281, "logps/rejected": -427.00616455078125, "loss": 0.5161, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0433505773544312, "rewards/margins": 0.5754044055938721, "rewards/rejected": -1.6187551021575928, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 6.9375, "learning_rate": 4.836479600699579e-06, "logits/chosen": -1.855102300643921, "logits/rejected": -1.7473504543304443, "logps/chosen": -397.20770263671875, "logps/rejected": -405.9667053222656, "loss": 0.5747, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.059572696685791, "rewards/margins": 0.5552623867988586, "rewards/rejected": -1.6148353815078735, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 7.5625, "learning_rate": 4.819837655500014e-06, "logits/chosen": -1.8241382837295532, "logits/rejected": -1.753692388534546, "logps/chosen": -352.1536865234375, "logps/rejected": -386.19818115234375, "loss": 0.5608, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0556440353393555, "rewards/margins": 0.5675815343856812, "rewards/rejected": -1.6232255697250366, "step": 400 }, { "epoch": 0.2093692750588851, "eval_logits/chosen": -1.7823258638381958, "eval_logits/rejected": -1.6968703269958496, "eval_logps/chosen": -383.4810791015625, "eval_logps/rejected": -418.78936767578125, "eval_loss": 0.5606996417045593, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -1.1817084550857544, "eval_rewards/margins": 0.5935006141662598, "eval_rewards/rejected": -1.7752091884613037, "eval_runtime": 417.9932, "eval_samples_per_second": 4.785, "eval_steps_per_second": 0.299, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 6.40625, "learning_rate": 4.802420903368286e-06, "logits/chosen": -1.8709204196929932, "logits/rejected": -1.8814146518707275, "logps/chosen": -361.89141845703125, "logps/rejected": -420.5713806152344, "loss": 0.5709, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1421557664871216, "rewards/margins": 0.5057119131088257, "rewards/rejected": -1.6478675603866577, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 6.1875, "learning_rate": 4.784235161358124e-06, "logits/chosen": -1.9516538381576538, "logits/rejected": -1.870397925376892, "logps/chosen": -361.6144714355469, "logps/rejected": -400.68505859375, "loss": 0.5721, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.9393259286880493, "rewards/margins": 0.47111278772354126, "rewards/rejected": -1.4104387760162354, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 5.75, "learning_rate": 4.765286503359632e-06, "logits/chosen": -1.8665249347686768, "logits/rejected": -1.7852083444595337, "logps/chosen": -367.1800842285156, "logps/rejected": -395.7027587890625, "loss": 0.5594, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0680012702941895, "rewards/margins": 0.4708701968193054, "rewards/rejected": -1.53887140750885, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 6.03125, "learning_rate": 4.745581258070654e-06, "logits/chosen": -1.725019097328186, "logits/rejected": -1.6572344303131104, "logps/chosen": -384.79949951171875, "logps/rejected": -438.75665283203125, "loss": 0.535, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1692166328430176, "rewards/margins": 0.6970125436782837, "rewards/rejected": -1.8662292957305908, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 6.09375, "learning_rate": 4.725126006883047e-06, "logits/chosen": -1.6160274744033813, "logits/rejected": -1.5949280261993408, "logps/chosen": -354.0497131347656, "logps/rejected": -433.2997131347656, "loss": 0.5444, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2347891330718994, "rewards/margins": 0.6934897303581238, "rewards/rejected": -1.928278923034668, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 5.90625, "learning_rate": 4.70392758168454e-06, "logits/chosen": -1.8258330821990967, "logits/rejected": -1.6724697351455688, "logps/chosen": -378.1096496582031, "logps/rejected": -387.38348388671875, "loss": 0.5756, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0020791292190552, "rewards/margins": 0.5104026794433594, "rewards/rejected": -1.512481927871704, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 5.0625, "learning_rate": 4.68199306257695e-06, "logits/chosen": -1.8071792125701904, "logits/rejected": -1.7476246356964111, "logps/chosen": -400.9126892089844, "logps/rejected": -424.7237854003906, "loss": 0.5261, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0324732065200806, "rewards/margins": 0.5401113629341125, "rewards/rejected": -1.5725845098495483, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 4.875, "learning_rate": 4.659329775511478e-06, "logits/chosen": -1.7627513408660889, "logits/rejected": -1.7228662967681885, "logps/chosen": -365.78173828125, "logps/rejected": -419.61785888671875, "loss": 0.541, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1906752586364746, "rewards/margins": 0.6715251207351685, "rewards/rejected": -1.8622004985809326, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 4.78125, "learning_rate": 4.635945289841902e-06, "logits/chosen": -1.8155529499053955, "logits/rejected": -1.7483114004135132, "logps/chosen": -383.2039794921875, "logps/rejected": -413.91949462890625, "loss": 0.5285, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1114593744277954, "rewards/margins": 0.6581318378448486, "rewards/rejected": -1.7695910930633545, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 6.1875, "learning_rate": 4.611847415796476e-06, "logits/chosen": -1.7952792644500732, "logits/rejected": -1.6973575353622437, "logps/chosen": -411.4215393066406, "logps/rejected": -455.2433166503906, "loss": 0.5287, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.436229944229126, "rewards/margins": 0.7496575713157654, "rewards/rejected": -2.185887575149536, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -1.628432273864746, "eval_logits/rejected": -1.5394303798675537, "eval_logps/chosen": -437.787841796875, "eval_logps/rejected": -486.7725524902344, "eval_loss": 0.5434273481369019, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": -1.724776029586792, "eval_rewards/margins": 0.7302653193473816, "eval_rewards/rejected": -2.4550411701202393, "eval_runtime": 418.2127, "eval_samples_per_second": 4.782, "eval_steps_per_second": 0.299, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 8.5, "learning_rate": 4.587044201869378e-06, "logits/chosen": -1.6594860553741455, "logits/rejected": -1.5737857818603516, "logps/chosen": -433.07781982421875, "logps/rejected": -469.0967712402344, "loss": 0.5491, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8182029724121094, "rewards/margins": 0.6091849207878113, "rewards/rejected": -2.4273881912231445, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 5.25, "learning_rate": 4.561543932132574e-06, "logits/chosen": -1.7765411138534546, "logits/rejected": -1.683142900466919, "logps/chosen": -393.26312255859375, "logps/rejected": -443.51190185546875, "loss": 0.52, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2223466634750366, "rewards/margins": 0.7156563997268677, "rewards/rejected": -1.9380029439926147, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 14.8125, "learning_rate": 4.535355123469009e-06, "logits/chosen": -1.7674404382705688, "logits/rejected": -1.6884065866470337, "logps/chosen": -382.9159851074219, "logps/rejected": -448.69598388671875, "loss": 0.5456, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1929681301116943, "rewards/margins": 0.7450785636901855, "rewards/rejected": -1.9380466938018799, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 12.0625, "learning_rate": 4.508486522728037e-06, "logits/chosen": -1.648045301437378, "logits/rejected": -1.5743778944015503, "logps/chosen": -397.73272705078125, "logps/rejected": -434.5792541503906, "loss": 0.5224, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2816746234893799, "rewards/margins": 0.7174645662307739, "rewards/rejected": -1.999139428138733, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 9.3125, "learning_rate": 4.480947103804044e-06, "logits/chosen": -1.7355165481567383, "logits/rejected": -1.6410526037216187, "logps/chosen": -402.465576171875, "logps/rejected": -429.6012268066406, "loss": 0.5549, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3331598043441772, "rewards/margins": 0.670220673084259, "rewards/rejected": -2.003380298614502, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 5.03125, "learning_rate": 4.452746064639239e-06, "logits/chosen": -1.6943957805633545, "logits/rejected": -1.6555038690567017, "logps/chosen": -365.50982666015625, "logps/rejected": -437.04559326171875, "loss": 0.5469, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.0253641605377197, "rewards/margins": 0.7539097666740417, "rewards/rejected": -1.7792737483978271, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 5.78125, "learning_rate": 4.423892824151617e-06, "logits/chosen": -1.8259334564208984, "logits/rejected": -1.6719919443130493, "logps/chosen": -405.76190185546875, "logps/rejected": -420.0641174316406, "loss": 0.5416, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.094802737236023, "rewards/margins": 0.5915030241012573, "rewards/rejected": -1.6863057613372803, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 5.875, "learning_rate": 4.3943970190891164e-06, "logits/chosen": -1.742593765258789, "logits/rejected": -1.6240646839141846, "logps/chosen": -411.27191162109375, "logps/rejected": -426.0252380371094, "loss": 0.5299, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1612910032272339, "rewards/margins": 0.6791490316390991, "rewards/rejected": -1.840440034866333, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 7.46875, "learning_rate": 4.364268500811025e-06, "logits/chosen": -1.5770652294158936, "logits/rejected": -1.5232446193695068, "logps/chosen": -380.0727233886719, "logps/rejected": -434.089599609375, "loss": 0.5222, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2521073818206787, "rewards/margins": 0.756430447101593, "rewards/rejected": -2.008537769317627, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 6.78125, "learning_rate": 4.333517331997704e-06, "logits/chosen": -1.6388943195343018, "logits/rejected": -1.5273475646972656, "logps/chosen": -360.4878845214844, "logps/rejected": -418.6941833496094, "loss": 0.5504, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1942436695098877, "rewards/margins": 0.695041835308075, "rewards/rejected": -1.8892854452133179, "step": 600 }, { "epoch": 0.31405391258832765, "eval_logits/chosen": -1.5286706686019897, "eval_logits/rejected": -1.4438811540603638, "eval_logps/chosen": -400.715576171875, "eval_logps/rejected": -454.2872009277344, "eval_loss": 0.52777099609375, "eval_rewards/accuracies": 0.7369999885559082, "eval_rewards/chosen": -1.3540534973144531, "eval_rewards/margins": 0.7761339545249939, "eval_rewards/rejected": -2.130187511444092, "eval_runtime": 417.9932, "eval_samples_per_second": 4.785, "eval_steps_per_second": 0.299, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 6.71875, "learning_rate": 4.302153783289737e-06, "logits/chosen": -1.5999139547348022, "logits/rejected": -1.5058938264846802, "logps/chosen": -398.68878173828125, "logps/rejected": -476.72613525390625, "loss": 0.5157, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.3395607471466064, "rewards/margins": 0.7813156843185425, "rewards/rejected": -2.1208765506744385, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 6.96875, "learning_rate": 4.270188329857613e-06, "logits/chosen": -1.6522363424301147, "logits/rejected": -1.5563585758209229, "logps/chosen": -411.0440979003906, "logps/rejected": -424.77667236328125, "loss": 0.5539, "rewards/accuracies": 0.71875, "rewards/chosen": -1.190372109413147, "rewards/margins": 0.6327346563339233, "rewards/rejected": -1.8231067657470703, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 4.28125, "learning_rate": 4.237631647903115e-06, "logits/chosen": -1.6967754364013672, "logits/rejected": -1.5837347507476807, "logps/chosen": -365.4174499511719, "logps/rejected": -415.60693359375, "loss": 0.5761, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1029731035232544, "rewards/margins": 0.6401803493499756, "rewards/rejected": -1.7431533336639404, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 5.78125, "learning_rate": 4.204494611093548e-06, "logits/chosen": -1.677026391029358, "logits/rejected": -1.5535941123962402, "logps/chosen": -353.6123046875, "logps/rejected": -395.86444091796875, "loss": 0.5159, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1177022457122803, "rewards/margins": 0.6914916038513184, "rewards/rejected": -1.8091938495635986, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 6.90625, "learning_rate": 4.170788286930024e-06, "logits/chosen": -1.5932585000991821, "logits/rejected": -1.5849659442901611, "logps/chosen": -361.3525695800781, "logps/rejected": -454.78863525390625, "loss": 0.527, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2824896574020386, "rewards/margins": 0.766124963760376, "rewards/rejected": -2.048614740371704, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 7.34375, "learning_rate": 4.136523933051005e-06, "logits/chosen": -1.4587924480438232, "logits/rejected": -1.3809664249420166, "logps/chosen": -420.695556640625, "logps/rejected": -494.5306091308594, "loss": 0.5183, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6998589038848877, "rewards/margins": 0.7845654487609863, "rewards/rejected": -2.484424352645874, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 9.3125, "learning_rate": 4.101712993472348e-06, "logits/chosen": -1.5042357444763184, "logits/rejected": -1.4506601095199585, "logps/chosen": -392.58563232421875, "logps/rejected": -443.7020568847656, "loss": 0.5484, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3401987552642822, "rewards/margins": 0.688759446144104, "rewards/rejected": -2.028958320617676, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 8.375, "learning_rate": 4.066367094765091e-06, "logits/chosen": -1.506446123123169, "logits/rejected": -1.4123358726501465, "logps/chosen": -377.2737121582031, "logps/rejected": -416.51934814453125, "loss": 0.5565, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1745784282684326, "rewards/margins": 0.6505630612373352, "rewards/rejected": -1.8251415491104126, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 5.84375, "learning_rate": 4.030498042172277e-06, "logits/chosen": -1.6745965480804443, "logits/rejected": -1.6025259494781494, "logps/chosen": -371.5199279785156, "logps/rejected": -448.5033264160156, "loss": 0.5053, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0983096361160278, "rewards/margins": 0.7682539820671082, "rewards/rejected": -1.8665635585784912, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 6.875, "learning_rate": 3.994117815666095e-06, "logits/chosen": -1.6488659381866455, "logits/rejected": -1.547518253326416, "logps/chosen": -375.697265625, "logps/rejected": -433.5043029785156, "loss": 0.5243, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.112671136856079, "rewards/margins": 0.7226445078849792, "rewards/rejected": -1.835315465927124, "step": 700 }, { "epoch": 0.36639623135304894, "eval_logits/chosen": -1.5753525495529175, "eval_logits/rejected": -1.4888389110565186, "eval_logps/chosen": -364.6462097167969, "eval_logps/rejected": -415.41790771484375, "eval_loss": 0.5277618169784546, "eval_rewards/accuracies": 0.7419999837875366, "eval_rewards/chosen": -0.9933602213859558, "eval_rewards/margins": 0.7481338977813721, "eval_rewards/rejected": -1.7414941787719727, "eval_runtime": 418.0469, "eval_samples_per_second": 4.784, "eval_steps_per_second": 0.299, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 5.3125, "learning_rate": 3.957238565946672e-06, "logits/chosen": -1.6395299434661865, "logits/rejected": -1.5400069952011108, "logps/chosen": -405.160888671875, "logps/rejected": -441.86187744140625, "loss": 0.4668, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0931313037872314, "rewards/margins": 0.8092358708381653, "rewards/rejected": -1.9023672342300415, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 8.4375, "learning_rate": 3.919872610383831e-06, "logits/chosen": -1.4249577522277832, "logits/rejected": -1.4130737781524658, "logps/chosen": -369.44122314453125, "logps/rejected": -465.67474365234375, "loss": 0.4786, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.2502071857452393, "rewards/margins": 1.0794490575790405, "rewards/rejected": -2.3296561241149902, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 9.875, "learning_rate": 3.882032428903195e-06, "logits/chosen": -1.482121229171753, "logits/rejected": -1.4346544742584229, "logps/chosen": -378.8681335449219, "logps/rejected": -464.63128662109375, "loss": 0.5658, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4904229640960693, "rewards/margins": 0.9111854434013367, "rewards/rejected": -2.4016082286834717, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 5.71875, "learning_rate": 3.84373065981799e-06, "logits/chosen": -1.4889881610870361, "logits/rejected": -1.437328815460205, "logps/chosen": -399.4703063964844, "logps/rejected": -442.62994384765625, "loss": 0.4799, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4881459474563599, "rewards/margins": 0.8073347210884094, "rewards/rejected": -2.295480489730835, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 12.9375, "learning_rate": 3.8049800956079552e-06, "logits/chosen": -1.570738673210144, "logits/rejected": -1.4659771919250488, "logps/chosen": -399.2367248535156, "logps/rejected": -470.23846435546875, "loss": 0.4844, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4377801418304443, "rewards/margins": 0.8683385848999023, "rewards/rejected": -2.3061187267303467, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 6.8125, "learning_rate": 3.765793678646753e-06, "logits/chosen": -1.5883232355117798, "logits/rejected": -1.546722650527954, "logps/chosen": -324.77978515625, "logps/rejected": -381.92425537109375, "loss": 0.5781, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8907216191291809, "rewards/margins": 0.7443931698799133, "rewards/rejected": -1.6351149082183838, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 6.9375, "learning_rate": 3.726184496879323e-06, "logits/chosen": -1.607028603553772, "logits/rejected": -1.5617607831954956, "logps/chosen": -352.07916259765625, "logps/rejected": -389.9503479003906, "loss": 0.5861, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9706624746322632, "rewards/margins": 0.4382301867008209, "rewards/rejected": -1.4088926315307617, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 5.21875, "learning_rate": 3.686165779450619e-06, "logits/chosen": -1.7265808582305908, "logits/rejected": -1.6542351245880127, "logps/chosen": -362.64129638671875, "logps/rejected": -404.32843017578125, "loss": 0.506, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0118680000305176, "rewards/margins": 0.6141448616981506, "rewards/rejected": -1.6260128021240234, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 4.90625, "learning_rate": 3.645750892287178e-06, "logits/chosen": -1.6205097436904907, "logits/rejected": -1.5864428281784058, "logps/chosen": -365.2150573730469, "logps/rejected": -460.1787109375, "loss": 0.523, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1948273181915283, "rewards/margins": 0.7571195363998413, "rewards/rejected": -1.9519468545913696, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 6.1875, "learning_rate": 3.604953333633009e-06, "logits/chosen": -1.6255521774291992, "logits/rejected": -1.5807536840438843, "logps/chosen": -367.66815185546875, "logps/rejected": -433.361083984375, "loss": 0.5346, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1196119785308838, "rewards/margins": 0.7731973528862, "rewards/rejected": -1.892809271812439, "step": 800 }, { "epoch": 0.4187385501177702, "eval_logits/chosen": -1.5717787742614746, "eval_logits/rejected": -1.4860730171203613, "eval_logps/chosen": -370.40435791015625, "eval_logps/rejected": -423.1763610839844, "eval_loss": 0.5285139083862305, "eval_rewards/accuracies": 0.7360000014305115, "eval_rewards/chosen": -1.0509413480758667, "eval_rewards/margins": 0.7681376934051514, "eval_rewards/rejected": -1.8190791606903076, "eval_runtime": 417.8003, "eval_samples_per_second": 4.787, "eval_steps_per_second": 0.299, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 6.5, "learning_rate": 3.56378672954129e-06, "logits/chosen": -1.643397569656372, "logits/rejected": -1.5696380138397217, "logps/chosen": -358.78411865234375, "logps/rejected": -437.92578125, "loss": 0.5276, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0859898328781128, "rewards/margins": 0.7730637192726135, "rewards/rejected": -1.859053611755371, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 6.03125, "learning_rate": 3.5222648293233806e-06, "logits/chosen": -1.5449378490447998, "logits/rejected": -1.5294286012649536, "logps/chosen": -367.80780029296875, "logps/rejected": -437.8079528808594, "loss": 0.5418, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1534316539764404, "rewards/margins": 0.8924697041511536, "rewards/rejected": -2.045901298522949, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 8.4375, "learning_rate": 3.4804015009566573e-06, "logits/chosen": -1.6001161336898804, "logits/rejected": -1.49454665184021, "logps/chosen": -335.2679138183594, "logps/rejected": -354.86114501953125, "loss": 0.5801, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.000899076461792, "rewards/margins": 0.49975353479385376, "rewards/rejected": -1.5006526708602905, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 7.90625, "learning_rate": 3.4382107264527244e-06, "logits/chosen": -1.5931416749954224, "logits/rejected": -1.4416605234146118, "logps/chosen": -392.618896484375, "logps/rejected": -418.7245178222656, "loss": 0.4917, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9880319833755493, "rewards/margins": 0.7607929706573486, "rewards/rejected": -1.7488250732421875, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 7.125, "learning_rate": 3.3957065971875387e-06, "logits/chosen": -1.5637991428375244, "logits/rejected": -1.4544169902801514, "logps/chosen": -394.06976318359375, "logps/rejected": -438.48455810546875, "loss": 0.5502, "rewards/accuracies": 0.71875, "rewards/chosen": -1.495042085647583, "rewards/margins": 0.7206583023071289, "rewards/rejected": -2.215700149536133, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 5.5625, "learning_rate": 3.352903309194999e-06, "logits/chosen": -1.573926568031311, "logits/rejected": -1.4945261478424072, "logps/chosen": -406.85443115234375, "logps/rejected": -494.858642578125, "loss": 0.5348, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8176378011703491, "rewards/margins": 0.8436784744262695, "rewards/rejected": -2.661316394805908, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 6.4375, "learning_rate": 3.309815158425591e-06, "logits/chosen": -1.422074556350708, "logits/rejected": -1.401736855506897, "logps/chosen": -404.21826171875, "logps/rejected": -468.27728271484375, "loss": 0.5317, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5278290510177612, "rewards/margins": 0.7724257111549377, "rewards/rejected": -2.3002545833587646, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 6.8125, "learning_rate": 3.266456535971654e-06, "logits/chosen": -1.6365457773208618, "logits/rejected": -1.5819311141967773, "logps/chosen": -383.20794677734375, "logps/rejected": -434.4812927246094, "loss": 0.5318, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4176002740859985, "rewards/margins": 0.6418014764785767, "rewards/rejected": -2.059401750564575, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 5.90625, "learning_rate": 3.2228419232608692e-06, "logits/chosen": -1.6183061599731445, "logits/rejected": -1.543496012687683, "logps/chosen": -391.3949279785156, "logps/rejected": -469.3917541503906, "loss": 0.5226, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.397723913192749, "rewards/margins": 0.8512004613876343, "rewards/rejected": -2.2489242553710938, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 7.125, "learning_rate": 3.1789858872195888e-06, "logits/chosen": -1.461332082748413, "logits/rejected": -1.369974136352539, "logps/chosen": -399.14324951171875, "logps/rejected": -514.7387084960938, "loss": 0.5072, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6139371395111084, "rewards/margins": 0.9934059977531433, "rewards/rejected": -2.6073431968688965, "step": 900 }, { "epoch": 0.4710808688824915, "eval_logits/chosen": -1.4531197547912598, "eval_logits/rejected": -1.36509370803833, "eval_logps/chosen": -428.54742431640625, "eval_logps/rejected": -498.6239318847656, "eval_loss": 0.5197117924690247, "eval_rewards/accuracies": 0.7300000190734863, "eval_rewards/chosen": -1.632372260093689, "eval_rewards/margins": 0.941182017326355, "eval_rewards/rejected": -2.573554515838623, "eval_runtime": 418.1319, "eval_samples_per_second": 4.783, "eval_steps_per_second": 0.299, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 6.5, "learning_rate": 3.1349030754075945e-06, "logits/chosen": -1.4467111825942993, "logits/rejected": -1.4052071571350098, "logps/chosen": -416.12664794921875, "logps/rejected": -477.246337890625, "loss": 0.549, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6408107280731201, "rewards/margins": 0.7182919979095459, "rewards/rejected": -2.359102725982666, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 5.3125, "learning_rate": 3.0906082111259313e-06, "logits/chosen": -1.5265501737594604, "logits/rejected": -1.442169427871704, "logps/chosen": -382.52642822265625, "logps/rejected": -446.12408447265625, "loss": 0.5215, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.242361307144165, "rewards/margins": 0.8638324737548828, "rewards/rejected": -2.106193780899048, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 10.6875, "learning_rate": 3.046116088499449e-06, "logits/chosen": -1.5302464962005615, "logits/rejected": -1.4620704650878906, "logps/chosen": -413.49920654296875, "logps/rejected": -459.68157958984375, "loss": 0.5184, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.468367576599121, "rewards/margins": 0.6104483008384705, "rewards/rejected": -2.0788159370422363, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 4.96875, "learning_rate": 3.0014415675356813e-06, "logits/chosen": -1.5114811658859253, "logits/rejected": -1.3885831832885742, "logps/chosen": -448.4798889160156, "logps/rejected": -505.93133544921875, "loss": 0.4512, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7153390645980835, "rewards/margins": 0.898027241230011, "rewards/rejected": -2.6133663654327393, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 6.375, "learning_rate": 2.9565995691617242e-06, "logits/chosen": -1.4272489547729492, "logits/rejected": -1.3025448322296143, "logps/chosen": -497.10919189453125, "logps/rejected": -571.2003173828125, "loss": 0.4877, "rewards/accuracies": 0.75, "rewards/chosen": -2.0273447036743164, "rewards/margins": 1.047803282737732, "rewards/rejected": -3.075148105621338, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 8.75, "learning_rate": 2.9116050702407706e-06, "logits/chosen": -1.4145638942718506, "logits/rejected": -1.3232591152191162, "logps/chosen": -506.157958984375, "logps/rejected": -553.6846313476562, "loss": 0.5178, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3269948959350586, "rewards/margins": 0.8322129249572754, "rewards/rejected": -3.159207820892334, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 6.125, "learning_rate": 2.8664730985699537e-06, "logits/chosen": -1.4710081815719604, "logits/rejected": -1.400832176208496, "logps/chosen": -453.1319274902344, "logps/rejected": -530.8441162109375, "loss": 0.5043, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9779431819915771, "rewards/margins": 0.7908161878585815, "rewards/rejected": -2.768759250640869, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 4.875, "learning_rate": 2.8212187278611907e-06, "logits/chosen": -1.5695170164108276, "logits/rejected": -1.4374539852142334, "logps/chosen": -442.06927490234375, "logps/rejected": -488.41497802734375, "loss": 0.4929, "rewards/accuracies": 0.75, "rewards/chosen": -1.5862115621566772, "rewards/margins": 0.8322838544845581, "rewards/rejected": -2.4184956550598145, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 6.21875, "learning_rate": 2.7758570727066843e-06, "logits/chosen": -1.4076306819915771, "logits/rejected": -1.3754841089248657, "logps/chosen": -399.33367919921875, "logps/rejected": -473.54736328125, "loss": 0.5269, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4874498844146729, "rewards/margins": 0.9017409086227417, "rewards/rejected": -2.389191150665283, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 7.125, "learning_rate": 2.730403283530767e-06, "logits/chosen": -1.425042986869812, "logits/rejected": -1.3490030765533447, "logps/chosen": -425.017578125, "logps/rejected": -501.9007873535156, "loss": 0.5023, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6772304773330688, "rewards/margins": 0.9379782676696777, "rewards/rejected": -2.615208864212036, "step": 1000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -1.3778769969940186, "eval_logits/rejected": -1.2853200435638428, "eval_logps/chosen": -434.580810546875, "eval_logps/rejected": -508.81787109375, "eval_loss": 0.515815794467926, "eval_rewards/accuracies": 0.7459999918937683, "eval_rewards/chosen": -1.6927061080932617, "eval_rewards/margins": 0.9827882051467896, "eval_rewards/rejected": -2.6754941940307617, "eval_runtime": 418.9677, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.298, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 7.5, "learning_rate": 2.6848725415297888e-06, "logits/chosen": -1.4254539012908936, "logits/rejected": -1.388588547706604, "logps/chosen": -399.80242919921875, "logps/rejected": -473.43536376953125, "loss": 0.5038, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.657282829284668, "rewards/margins": 0.7969308495521545, "rewards/rejected": -2.4542136192321777, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 10.125, "learning_rate": 2.639280053601719e-06, "logits/chosen": -1.4977766275405884, "logits/rejected": -1.4331027269363403, "logps/chosen": -457.36090087890625, "logps/rejected": -515.6148681640625, "loss": 0.4737, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6834481954574585, "rewards/margins": 0.9655658006668091, "rewards/rejected": -2.6490142345428467, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 5.8125, "learning_rate": 2.59364104726716e-06, "logits/chosen": -1.3670612573623657, "logits/rejected": -1.2159000635147095, "logps/chosen": -491.13616943359375, "logps/rejected": -529.3936767578125, "loss": 0.5202, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.3312811851501465, "rewards/margins": 0.8179370760917664, "rewards/rejected": -3.1492180824279785, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 5.0625, "learning_rate": 2.547970765583491e-06, "logits/chosen": -1.352312684059143, "logits/rejected": -1.3161817789077759, "logps/chosen": -484.44305419921875, "logps/rejected": -554.0833129882812, "loss": 0.5355, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.126248836517334, "rewards/margins": 0.8889590501785278, "rewards/rejected": -3.0152080059051514, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 8.75, "learning_rate": 2.502284462053799e-06, "logits/chosen": -1.4135512113571167, "logits/rejected": -1.2861578464508057, "logps/chosen": -486.4259338378906, "logps/rejected": -548.2239990234375, "loss": 0.5047, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9501330852508545, "rewards/margins": 0.9651840925216675, "rewards/rejected": -2.9153170585632324, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 8.125, "learning_rate": 2.456597395532338e-06, "logits/chosen": -1.4719403982162476, "logits/rejected": -1.4020650386810303, "logps/chosen": -421.3262634277344, "logps/rejected": -455.7506408691406, "loss": 0.5156, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6838070154190063, "rewards/margins": 0.7241236567497253, "rewards/rejected": -2.407930374145508, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 6.8125, "learning_rate": 2.4109248251281953e-06, "logits/chosen": -1.4956601858139038, "logits/rejected": -1.4263279438018799, "logps/chosen": -441.20245361328125, "logps/rejected": -469.7433166503906, "loss": 0.5302, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.600812554359436, "rewards/margins": 0.7271523475646973, "rewards/rejected": -2.3279647827148438, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 6.3125, "learning_rate": 2.365282005108875e-06, "logits/chosen": -1.5269404649734497, "logits/rejected": -1.4040794372558594, "logps/chosen": -406.04461669921875, "logps/rejected": -427.8402404785156, "loss": 0.5438, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5406584739685059, "rewards/margins": 0.7544614672660828, "rewards/rejected": -2.2951200008392334, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 10.5625, "learning_rate": 2.319684179805491e-06, "logits/chosen": -1.5088317394256592, "logits/rejected": -1.3840144872665405, "logps/chosen": -429.84918212890625, "logps/rejected": -446.9833068847656, "loss": 0.5405, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7651935815811157, "rewards/margins": 0.6635828018188477, "rewards/rejected": -2.428776264190674, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 6.59375, "learning_rate": 2.2741465785212905e-06, "logits/chosen": -1.4707400798797607, "logits/rejected": -1.3996989727020264, "logps/chosen": -404.25128173828125, "logps/rejected": -485.39306640625, "loss": 0.4954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5357749462127686, "rewards/margins": 0.778855562210083, "rewards/rejected": -2.3146309852600098, "step": 1100 }, { "epoch": 0.575765506411934, "eval_logits/chosen": -1.4842890501022339, "eval_logits/rejected": -1.3920989036560059, "eval_logps/chosen": -411.3603210449219, "eval_logps/rejected": -474.9687805175781, "eval_loss": 0.5126367211341858, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -1.4605008363723755, "eval_rewards/margins": 0.8765026926994324, "eval_rewards/rejected": -2.337003231048584, "eval_runtime": 417.8839, "eval_samples_per_second": 4.786, "eval_steps_per_second": 0.299, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 6.34375, "learning_rate": 2.2286844104451848e-06, "logits/chosen": -1.462570309638977, "logits/rejected": -1.398058295249939, "logps/chosen": -400.0235900878906, "logps/rejected": -480.5478515625, "loss": 0.5597, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4543365240097046, "rewards/margins": 0.8828409314155579, "rewards/rejected": -2.337177276611328, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 6.90625, "learning_rate": 2.183312859572008e-06, "logits/chosen": -1.4648702144622803, "logits/rejected": -1.3869099617004395, "logps/chosen": -439.00689697265625, "logps/rejected": -483.339111328125, "loss": 0.4975, "rewards/accuracies": 0.75, "rewards/chosen": -1.566976547241211, "rewards/margins": 0.8774997591972351, "rewards/rejected": -2.444476366043091, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 7.40625, "learning_rate": 2.1380470796311843e-06, "logits/chosen": -1.4076474905014038, "logits/rejected": -1.2980060577392578, "logps/chosen": -462.13079833984375, "logps/rejected": -514.0867919921875, "loss": 0.5183, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.829978585243225, "rewards/margins": 0.8743548393249512, "rewards/rejected": -2.7043333053588867, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 13.0625, "learning_rate": 2.092902189025507e-06, "logits/chosen": -1.526375412940979, "logits/rejected": -1.3943004608154297, "logps/chosen": -463.09942626953125, "logps/rejected": -512.2612915039062, "loss": 0.5156, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.7574234008789062, "rewards/margins": 0.9618258476257324, "rewards/rejected": -2.7192490100860596, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 6.625, "learning_rate": 2.0478932657817105e-06, "logits/chosen": -1.50194251537323, "logits/rejected": -1.3701152801513672, "logps/chosen": -464.9756774902344, "logps/rejected": -515.4832763671875, "loss": 0.4941, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9383106231689453, "rewards/margins": 0.8438523411750793, "rewards/rejected": -2.782163143157959, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 6.875, "learning_rate": 2.0030353425145376e-06, "logits/chosen": -1.3441766500473022, "logits/rejected": -1.339890480041504, "logps/chosen": -419.56011962890625, "logps/rejected": -479.64178466796875, "loss": 0.5518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.932283639907837, "rewards/margins": 0.7835468649864197, "rewards/rejected": -2.7158303260803223, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 6.125, "learning_rate": 1.958343401405964e-06, "logits/chosen": -1.4456775188446045, "logits/rejected": -1.3890011310577393, "logps/chosen": -420.3235778808594, "logps/rejected": -521.9119873046875, "loss": 0.5171, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6922845840454102, "rewards/margins": 1.060473918914795, "rewards/rejected": -2.752758502960205, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 8.9375, "learning_rate": 1.9138323692012734e-06, "logits/chosen": -1.483940601348877, "logits/rejected": -1.3970738649368286, "logps/chosen": -453.9214782714844, "logps/rejected": -521.9664306640625, "loss": 0.4843, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.8300726413726807, "rewards/margins": 0.8861376047134399, "rewards/rejected": -2.71621036529541, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 5.96875, "learning_rate": 1.8695171122236443e-06, "logits/chosen": -1.3713841438293457, "logits/rejected": -1.2403717041015625, "logps/chosen": -489.3267517089844, "logps/rejected": -552.6810913085938, "loss": 0.4591, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8935291767120361, "rewards/margins": 1.1078187227249146, "rewards/rejected": -3.001347541809082, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 7.03125, "learning_rate": 1.8254124314089225e-06, "logits/chosen": -1.2732470035552979, "logits/rejected": -1.1547812223434448, "logps/chosen": -476.4207458496094, "logps/rejected": -533.1109619140625, "loss": 0.4983, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.027547836303711, "rewards/margins": 0.8173040151596069, "rewards/rejected": -2.8448517322540283, "step": 1200 }, { "epoch": 0.6281078251766553, "eval_logits/chosen": -1.2848007678985596, "eval_logits/rejected": -1.1942012310028076, "eval_logps/chosen": -470.9687194824219, "eval_logps/rejected": -548.050537109375, "eval_loss": 0.5105060935020447, "eval_rewards/accuracies": 0.7450000047683716, "eval_rewards/chosen": -2.0565850734710693, "eval_rewards/margins": 1.0112360715866089, "eval_rewards/rejected": -3.0678207874298096, "eval_runtime": 418.1749, "eval_samples_per_second": 4.783, "eval_steps_per_second": 0.299, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 10.625, "learning_rate": 1.781533057362221e-06, "logits/chosen": -1.4669840335845947, "logits/rejected": -1.4140782356262207, "logps/chosen": -458.5419921875, "logps/rejected": -524.1234741210938, "loss": 0.5147, "rewards/accuracies": 0.75, "rewards/chosen": -1.9135513305664062, "rewards/margins": 0.89533531665802, "rewards/rejected": -2.808886766433716, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 9.3125, "learning_rate": 1.7378936454380277e-06, "logits/chosen": -1.3845133781433105, "logits/rejected": -1.2915741205215454, "logps/chosen": -459.07403564453125, "logps/rejected": -527.103515625, "loss": 0.5325, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9770281314849854, "rewards/margins": 0.9904762506484985, "rewards/rejected": -2.9675042629241943, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 6.53125, "learning_rate": 1.6945087708454273e-06, "logits/chosen": -1.4331893920898438, "logits/rejected": -1.3246791362762451, "logps/chosen": -466.3059997558594, "logps/rejected": -507.17327880859375, "loss": 0.4633, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.825950026512146, "rewards/margins": 0.9006016850471497, "rewards/rejected": -2.7265517711639404, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 7.84375, "learning_rate": 1.651392923780105e-06, "logits/chosen": -1.4348574876785278, "logits/rejected": -1.368645429611206, "logps/chosen": -439.5028381347656, "logps/rejected": -533.1715698242188, "loss": 0.4936, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8169918060302734, "rewards/margins": 0.9930621981620789, "rewards/rejected": -2.810054063796997, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 6.65625, "learning_rate": 1.608560504584737e-06, "logits/chosen": -1.3752681016921997, "logits/rejected": -1.3227896690368652, "logps/chosen": -440.70843505859375, "logps/rejected": -534.2369384765625, "loss": 0.4828, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.8807871341705322, "rewards/margins": 0.9859061241149902, "rewards/rejected": -2.8666930198669434, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 4.53125, "learning_rate": 1.5660258189393945e-06, "logits/chosen": -1.468229055404663, "logits/rejected": -1.3634376525878906, "logps/chosen": -461.6888732910156, "logps/rejected": -510.41497802734375, "loss": 0.4842, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7835785150527954, "rewards/margins": 1.0229034423828125, "rewards/rejected": -2.8064818382263184, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 9.75, "learning_rate": 1.5238030730835578e-06, "logits/chosen": -1.3363033533096313, "logits/rejected": -1.269829511642456, "logps/chosen": -436.25140380859375, "logps/rejected": -529.2796630859375, "loss": 0.4368, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9228322505950928, "rewards/margins": 1.0578147172927856, "rewards/rejected": -2.980647087097168, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 10.9375, "learning_rate": 1.4819063690713565e-06, "logits/chosen": -1.3741873502731323, "logits/rejected": -1.2815735340118408, "logps/chosen": -440.1422424316406, "logps/rejected": -543.0122680664062, "loss": 0.5134, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9842660427093506, "rewards/margins": 1.0995490550994873, "rewards/rejected": -3.083815097808838, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 6.0625, "learning_rate": 1.4403497000615885e-06, "logits/chosen": -1.4114688634872437, "logits/rejected": -1.3185184001922607, "logps/chosen": -459.4325256347656, "logps/rejected": -547.32275390625, "loss": 0.538, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9570932388305664, "rewards/margins": 0.997153103351593, "rewards/rejected": -2.9542460441589355, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 7.28125, "learning_rate": 1.3991469456441273e-06, "logits/chosen": -1.4207992553710938, "logits/rejected": -1.3556959629058838, "logps/chosen": -460.5257873535156, "logps/rejected": -550.6207275390625, "loss": 0.4774, "rewards/accuracies": 0.75, "rewards/chosen": -1.9047996997833252, "rewards/margins": 1.07115638256073, "rewards/rejected": -2.9759559631347656, "step": 1300 }, { "epoch": 0.6804501439413766, "eval_logits/chosen": -1.3491259813308716, "eval_logits/rejected": -1.2574185132980347, "eval_logps/chosen": -463.32537841796875, "eval_logps/rejected": -542.3931274414062, "eval_loss": 0.5092620253562927, "eval_rewards/accuracies": 0.7509999871253967, "eval_rewards/chosen": -1.9801515340805054, "eval_rewards/margins": 1.0310951471328735, "eval_rewards/rejected": -3.011246919631958, "eval_runtime": 418.216, "eval_samples_per_second": 4.782, "eval_steps_per_second": 0.299, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 10.125, "learning_rate": 1.3583118672042441e-06, "logits/chosen": -1.4378445148468018, "logits/rejected": -1.343832015991211, "logps/chosen": -458.37139892578125, "logps/rejected": -546.48974609375, "loss": 0.5018, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0344271659851074, "rewards/margins": 1.0906623601913452, "rewards/rejected": -3.125089645385742, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 8.4375, "learning_rate": 1.3178581033264218e-06, "logits/chosen": -1.4039326906204224, "logits/rejected": -1.3160288333892822, "logps/chosen": -478.83807373046875, "logps/rejected": -551.1180419921875, "loss": 0.4764, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.2119452953338623, "rewards/margins": 0.9416030645370483, "rewards/rejected": -3.1535484790802, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 11.0625, "learning_rate": 1.2777991652391757e-06, "logits/chosen": -1.3877145051956177, "logits/rejected": -1.3046467304229736, "logps/chosen": -481.75347900390625, "logps/rejected": -591.049072265625, "loss": 0.4574, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.188952684402466, "rewards/margins": 1.0771480798721313, "rewards/rejected": -3.266101121902466, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 7.84375, "learning_rate": 1.2381484323024178e-06, "logits/chosen": -1.3359429836273193, "logits/rejected": -1.260980248451233, "logps/chosen": -475.9947204589844, "logps/rejected": -553.0256958007812, "loss": 0.4889, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2213656902313232, "rewards/margins": 1.141205072402954, "rewards/rejected": -3.3625710010528564, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 9.25, "learning_rate": 1.1989191475388518e-06, "logits/chosen": -1.422424554824829, "logits/rejected": -1.3796898126602173, "logps/chosen": -472.56756591796875, "logps/rejected": -540.7282104492188, "loss": 0.5045, "rewards/accuracies": 0.75, "rewards/chosen": -2.030181646347046, "rewards/margins": 0.8926876187324524, "rewards/rejected": -2.9228694438934326, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 8.3125, "learning_rate": 1.160124413210918e-06, "logits/chosen": -1.3512452840805054, "logits/rejected": -1.2879610061645508, "logps/chosen": -493.9940490722656, "logps/rejected": -554.204345703125, "loss": 0.5303, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9853309392929077, "rewards/margins": 0.9203692674636841, "rewards/rejected": -2.905700206756592, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 5.75, "learning_rate": 1.1217771864447396e-06, "logits/chosen": -1.4557918310165405, "logits/rejected": -1.3968652486801147, "logps/chosen": -489.9331970214844, "logps/rejected": -553.3250732421875, "loss": 0.4331, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.907529592514038, "rewards/margins": 1.206814169883728, "rewards/rejected": -3.1143438816070557, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 6.84375, "learning_rate": 1.08389027490255e-06, "logits/chosen": -1.4385818243026733, "logits/rejected": -1.3422445058822632, "logps/chosen": -444.6114196777344, "logps/rejected": -524.5347900390625, "loss": 0.4722, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9272987842559814, "rewards/margins": 0.8720090985298157, "rewards/rejected": -2.7993075847625732, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 5.625, "learning_rate": 1.046476332505036e-06, "logits/chosen": -1.3551748991012573, "logits/rejected": -1.2887896299362183, "logps/chosen": -482.06256103515625, "logps/rejected": -566.3648681640625, "loss": 0.467, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0686769485473633, "rewards/margins": 1.1158945560455322, "rewards/rejected": -3.1845717430114746, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 7.375, "learning_rate": 1.0095478552050348e-06, "logits/chosen": -1.3571363687515259, "logits/rejected": -1.3296940326690674, "logps/chosen": -464.2586364746094, "logps/rejected": -580.7628784179688, "loss": 0.4516, "rewards/accuracies": 0.78125, "rewards/chosen": -2.159075975418091, "rewards/margins": 1.1367342472076416, "rewards/rejected": -3.2958102226257324, "step": 1400 }, { "epoch": 0.7327924627060979, "eval_logits/chosen": -1.3500477075576782, "eval_logits/rejected": -1.2592344284057617, "eval_logps/chosen": -480.7001953125, "eval_logps/rejected": -561.2969360351562, "eval_loss": 0.5057799220085144, "eval_rewards/accuracies": 0.753000020980835, "eval_rewards/chosen": -2.153899908065796, "eval_rewards/margins": 1.046384334564209, "eval_rewards/rejected": -3.200284004211426, "eval_runtime": 417.9014, "eval_samples_per_second": 4.786, "eval_steps_per_second": 0.299, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 10.8125, "learning_rate": 9.731171768139808e-07, "logits/chosen": -1.3519372940063477, "logits/rejected": -1.2831850051879883, "logps/chosen": -501.4341735839844, "logps/rejected": -577.0115356445312, "loss": 0.5527, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.3738415241241455, "rewards/margins": 0.8363450765609741, "rewards/rejected": -3.210186719894409, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 6.03125, "learning_rate": 9.371964648825221e-07, "logits/chosen": -1.4183461666107178, "logits/rejected": -1.3280363082885742, "logps/chosen": -500.97491455078125, "logps/rejected": -540.0017700195312, "loss": 0.5627, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2344210147857666, "rewards/margins": 0.8289459943771362, "rewards/rejected": -3.0633671283721924, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 6.375, "learning_rate": 9.017977166366445e-07, "logits/chosen": -1.4208705425262451, "logits/rejected": -1.3211179971694946, "logps/chosen": -489.2748107910156, "logps/rejected": -585.6953735351562, "loss": 0.4729, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1298253536224365, "rewards/margins": 1.080087423324585, "rewards/rejected": -3.2099127769470215, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 6.71875, "learning_rate": 8.669327549707096e-07, "logits/chosen": -1.5535459518432617, "logits/rejected": -1.4916893243789673, "logps/chosen": -493.194091796875, "logps/rejected": -568.2269287109375, "loss": 0.4936, "rewards/accuracies": 0.75, "rewards/chosen": -2.0372254848480225, "rewards/margins": 0.8856596946716309, "rewards/rejected": -2.9228854179382324, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 6.4375, "learning_rate": 8.326132244986932e-07, "logits/chosen": -1.4485315084457397, "logits/rejected": -1.3667795658111572, "logps/chosen": -469.4761657714844, "logps/rejected": -573.5386962890625, "loss": 0.5005, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0715749263763428, "rewards/margins": 1.1704132556915283, "rewards/rejected": -3.241987943649292, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 5.28125, "learning_rate": 7.988505876649863e-07, "logits/chosen": -1.398831844329834, "logits/rejected": -1.294712781906128, "logps/chosen": -452.2408142089844, "logps/rejected": -523.3919677734375, "loss": 0.5111, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.079291820526123, "rewards/margins": 0.8256057500839233, "rewards/rejected": -2.904897451400757, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 7.09375, "learning_rate": 7.656561209160248e-07, "logits/chosen": -1.3782474994659424, "logits/rejected": -1.2796392440795898, "logps/chosen": -480.0965881347656, "logps/rejected": -540.91357421875, "loss": 0.4984, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.089986801147461, "rewards/margins": 1.0079736709594727, "rewards/rejected": -3.0979602336883545, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 6.3125, "learning_rate": 7.330409109340563e-07, "logits/chosen": -1.4387518167495728, "logits/rejected": -1.3483431339263916, "logps/chosen": -494.4837951660156, "logps/rejected": -547.7198486328125, "loss": 0.5172, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.12834095954895, "rewards/margins": 0.997391402721405, "rewards/rejected": -3.125732421875, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 7.625, "learning_rate": 7.010158509342682e-07, "logits/chosen": -1.3690738677978516, "logits/rejected": -1.3240485191345215, "logps/chosen": -485.38671875, "logps/rejected": -598.2340087890625, "loss": 0.4909, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3237967491149902, "rewards/margins": 1.0460586547851562, "rewards/rejected": -3.3698551654815674, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 7.21875, "learning_rate": 6.695916370265529e-07, "logits/chosen": -1.3943806886672974, "logits/rejected": -1.3494868278503418, "logps/chosen": -455.65185546875, "logps/rejected": -563.0969848632812, "loss": 0.4758, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.195671796798706, "rewards/margins": 1.081477165222168, "rewards/rejected": -3.277149200439453, "step": 1500 }, { "epoch": 0.7851347814708192, "eval_logits/chosen": -1.3710448741912842, "eval_logits/rejected": -1.2802826166152954, "eval_logps/chosen": -488.7256774902344, "eval_logps/rejected": -565.533935546875, "eval_loss": 0.5017847418785095, "eval_rewards/accuracies": 0.7549999952316284, "eval_rewards/chosen": -2.23415470123291, "eval_rewards/margins": 1.0084995031356812, "eval_rewards/rejected": -3.242654323577881, "eval_runtime": 418.1465, "eval_samples_per_second": 4.783, "eval_steps_per_second": 0.299, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 8.6875, "learning_rate": 6.387787646430854e-07, "logits/chosen": -1.3753058910369873, "logits/rejected": -1.3127710819244385, "logps/chosen": -496.5264587402344, "logps/rejected": -541.249755859375, "loss": 0.52, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.313178777694702, "rewards/margins": 0.8921319246292114, "rewards/rejected": -3.205310821533203, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 6.90625, "learning_rate": 6.085875250329401e-07, "logits/chosen": -1.483332872390747, "logits/rejected": -1.397125005722046, "logps/chosen": -464.541015625, "logps/rejected": -523.9359130859375, "loss": 0.5011, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2222468852996826, "rewards/margins": 0.8111170530319214, "rewards/rejected": -3.0333640575408936, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 7.875, "learning_rate": 5.79028001824894e-07, "logits/chosen": -1.3503555059432983, "logits/rejected": -1.365071177482605, "logps/chosen": -444.367431640625, "logps/rejected": -603.1524658203125, "loss": 0.4428, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.040860891342163, "rewards/margins": 1.1968605518341064, "rewards/rejected": -3.2377212047576904, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 4.6875, "learning_rate": 5.501100676595761e-07, "logits/chosen": -1.482961654663086, "logits/rejected": -1.430662751197815, "logps/chosen": -496.47113037109375, "logps/rejected": -574.3890380859375, "loss": 0.4985, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.17643666267395, "rewards/margins": 0.9874047040939331, "rewards/rejected": -3.163841485977173, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 6.21875, "learning_rate": 5.218433808920884e-07, "logits/chosen": -1.5109050273895264, "logits/rejected": -1.3711371421813965, "logps/chosen": -486.56390380859375, "logps/rejected": -531.2752685546875, "loss": 0.4784, "rewards/accuracies": 0.75, "rewards/chosen": -2.1899561882019043, "rewards/margins": 0.85688316822052, "rewards/rejected": -3.0468392372131348, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 5.8125, "learning_rate": 4.942373823661928e-07, "logits/chosen": -1.4783203601837158, "logits/rejected": -1.3000494241714478, "logps/chosen": -524.826171875, "logps/rejected": -568.1104736328125, "loss": 0.5195, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0753016471862793, "rewards/margins": 1.0555765628814697, "rewards/rejected": -3.130878210067749, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 7.28125, "learning_rate": 4.6730129226114363e-07, "logits/chosen": -1.400521159172058, "logits/rejected": -1.3176801204681396, "logps/chosen": -468.08209228515625, "logps/rejected": -573.2125854492188, "loss": 0.5045, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.281712293624878, "rewards/margins": 0.9516536593437195, "rewards/rejected": -3.2333664894104004, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 6.8125, "learning_rate": 4.4104410701222703e-07, "logits/chosen": -1.4205479621887207, "logits/rejected": -1.344589352607727, "logps/chosen": -487.1837463378906, "logps/rejected": -570.49755859375, "loss": 0.5408, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3290271759033203, "rewards/margins": 0.8474947214126587, "rewards/rejected": -3.1765220165252686, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 7.21875, "learning_rate": 4.154745963060197e-07, "logits/chosen": -1.3773494958877563, "logits/rejected": -1.2772356271743774, "logps/chosen": -450.824951171875, "logps/rejected": -527.1939697265625, "loss": 0.4705, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1658689975738525, "rewards/margins": 0.9598967432975769, "rewards/rejected": -3.125765562057495, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 6.15625, "learning_rate": 3.9060130015138863e-07, "logits/chosen": -1.3875905275344849, "logits/rejected": -1.3845030069351196, "logps/chosen": -471.2229919433594, "logps/rejected": -560.0380859375, "loss": 0.4967, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2258384227752686, "rewards/margins": 0.9428621530532837, "rewards/rejected": -3.168700695037842, "step": 1600 }, { "epoch": 0.8374771002355405, "eval_logits/chosen": -1.3836894035339355, "eval_logits/rejected": -1.2938653230667114, "eval_logps/chosen": -482.2090148925781, "eval_logps/rejected": -558.7111206054688, "eval_loss": 0.5018904805183411, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -2.16898775100708, "eval_rewards/margins": 1.005439043045044, "eval_rewards/rejected": -3.174426794052124, "eval_runtime": 417.9377, "eval_samples_per_second": 4.785, "eval_steps_per_second": 0.299, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 6.5625, "learning_rate": 3.664325260271953e-07, "logits/chosen": -1.3876458406448364, "logits/rejected": -1.3377858400344849, "logps/chosen": -499.890625, "logps/rejected": -540.2047119140625, "loss": 0.5166, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1201303005218506, "rewards/margins": 0.7784576416015625, "rewards/rejected": -2.898587942123413, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 8.125, "learning_rate": 3.429763461076677e-07, "logits/chosen": -1.5190773010253906, "logits/rejected": -1.4078010320663452, "logps/chosen": -504.95440673828125, "logps/rejected": -553.4002075195312, "loss": 0.4931, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1076488494873047, "rewards/margins": 0.9432754516601562, "rewards/rejected": -3.050924777984619, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 6.71875, "learning_rate": 3.202405945663556e-07, "logits/chosen": -1.477140188217163, "logits/rejected": -1.378173589706421, "logps/chosen": -491.18255615234375, "logps/rejected": -552.9224243164062, "loss": 0.4939, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2754175662994385, "rewards/margins": 0.8862501978874207, "rewards/rejected": -3.161667585372925, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 7.125, "learning_rate": 2.982328649595856e-07, "logits/chosen": -1.4160666465759277, "logits/rejected": -1.3694106340408325, "logps/chosen": -456.3934631347656, "logps/rejected": -542.6761474609375, "loss": 0.514, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.017909049987793, "rewards/margins": 1.0016233921051025, "rewards/rejected": -3.0195322036743164, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 5.25, "learning_rate": 2.7696050769026954e-07, "logits/chosen": -1.4229543209075928, "logits/rejected": -1.401234745979309, "logps/chosen": -478.1155700683594, "logps/rejected": -582.1157836914062, "loss": 0.5175, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1747491359710693, "rewards/margins": 1.023895263671875, "rewards/rejected": -3.1986443996429443, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 6.8125, "learning_rate": 2.564306275529341e-07, "logits/chosen": -1.401556134223938, "logits/rejected": -1.32057785987854, "logps/chosen": -453.3318786621094, "logps/rejected": -505.73553466796875, "loss": 0.5158, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.061955451965332, "rewards/margins": 0.896806538105011, "rewards/rejected": -2.9587621688842773, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 7.28125, "learning_rate": 2.3665008136077332e-07, "logits/chosen": -1.5141568183898926, "logits/rejected": -1.4125263690948486, "logps/chosen": -492.5354919433594, "logps/rejected": -581.779541015625, "loss": 0.5019, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.24076509475708, "rewards/margins": 1.061389684677124, "rewards/rejected": -3.302154541015625, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 6.5625, "learning_rate": 2.1762547565553293e-07, "logits/chosen": -1.425940990447998, "logits/rejected": -1.3354724645614624, "logps/chosen": -475.68267822265625, "logps/rejected": -578.599609375, "loss": 0.457, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0755348205566406, "rewards/margins": 1.1317123174667358, "rewards/rejected": -3.207247257232666, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 7.75, "learning_rate": 1.993631645009747e-07, "logits/chosen": -1.3983068466186523, "logits/rejected": -1.3008172512054443, "logps/chosen": -452.4076232910156, "logps/rejected": -554.677734375, "loss": 0.4582, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0816917419433594, "rewards/margins": 1.1499135494232178, "rewards/rejected": -3.2316055297851562, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 7.125, "learning_rate": 1.818692473606748e-07, "logits/chosen": -1.4285587072372437, "logits/rejected": -1.3024065494537354, "logps/chosen": -481.10028076171875, "logps/rejected": -540.8759765625, "loss": 0.4769, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1295053958892822, "rewards/margins": 0.9817113876342773, "rewards/rejected": -3.1112167835235596, "step": 1700 }, { "epoch": 0.8898194190002617, "eval_logits/chosen": -1.383370280265808, "eval_logits/rejected": -1.2936170101165771, "eval_logps/chosen": -479.62451171875, "eval_logps/rejected": -555.8690795898438, "eval_loss": 0.5018261671066284, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -2.1431422233581543, "eval_rewards/margins": 1.0028636455535889, "eval_rewards/rejected": -3.1460063457489014, "eval_runtime": 417.8262, "eval_samples_per_second": 4.787, "eval_steps_per_second": 0.299, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 9.3125, "learning_rate": 1.6514956706084885e-07, "logits/chosen": -1.514194369316101, "logits/rejected": -1.3973029851913452, "logps/chosen": -485.75274658203125, "logps/rejected": -541.7138061523438, "loss": 0.4736, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1425530910491943, "rewards/margins": 0.9163280725479126, "rewards/rejected": -3.0588815212249756, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 8.5, "learning_rate": 1.4920970783889737e-07, "logits/chosen": -1.4195497035980225, "logits/rejected": -1.301021933555603, "logps/chosen": -480.7032165527344, "logps/rejected": -547.6854248046875, "loss": 0.4827, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1341195106506348, "rewards/margins": 0.9015541076660156, "rewards/rejected": -3.0356740951538086, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 7.4375, "learning_rate": 1.340549934783164e-07, "logits/chosen": -1.3841962814331055, "logits/rejected": -1.3453882932662964, "logps/chosen": -471.51409912109375, "logps/rejected": -565.4349975585938, "loss": 0.4788, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.137712001800537, "rewards/margins": 1.1250284910202026, "rewards/rejected": -3.26274037361145, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 4.84375, "learning_rate": 1.196904855305961e-07, "logits/chosen": -1.426884412765503, "logits/rejected": -1.3216279745101929, "logps/chosen": -475.46966552734375, "logps/rejected": -533.2386474609375, "loss": 0.513, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1345527172088623, "rewards/margins": 0.9738175272941589, "rewards/rejected": -3.108370542526245, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 5.84375, "learning_rate": 1.0612098162470302e-07, "logits/chosen": -1.4746651649475098, "logits/rejected": -1.341615080833435, "logps/chosen": -508.99725341796875, "logps/rejected": -557.3981323242188, "loss": 0.5213, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2222418785095215, "rewards/margins": 0.8908487558364868, "rewards/rejected": -3.1130905151367188, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 6.09375, "learning_rate": 9.335101386471285e-08, "logits/chosen": -1.455482840538025, "logits/rejected": -1.359724760055542, "logps/chosen": -447.65460205078125, "logps/rejected": -554.7275390625, "loss": 0.4529, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0690581798553467, "rewards/margins": 1.1081968545913696, "rewards/rejected": -3.177255153656006, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 5.71875, "learning_rate": 8.138484731612273e-08, "logits/chosen": -1.383230447769165, "logits/rejected": -1.2893016338348389, "logps/chosen": -459.4063415527344, "logps/rejected": -560.9949340820312, "loss": 0.467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.167651414871216, "rewards/margins": 1.112287163734436, "rewards/rejected": -3.2799384593963623, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 4.625, "learning_rate": 7.022647858135501e-08, "logits/chosen": -1.4165370464324951, "logits/rejected": -1.4020284414291382, "logps/chosen": -465.9361877441406, "logps/rejected": -553.9609375, "loss": 0.5148, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.164100408554077, "rewards/margins": 1.0550620555877686, "rewards/rejected": -3.219162702560425, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 6.5, "learning_rate": 5.987963446492384e-08, "logits/chosen": -1.4179389476776123, "logits/rejected": -1.3730156421661377, "logps/chosen": -521.4404296875, "logps/rejected": -637.6373291015625, "loss": 0.4848, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1412482261657715, "rewards/margins": 1.2061725854873657, "rewards/rejected": -3.3474209308624268, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 10.25, "learning_rate": 5.034777072871394e-08, "logits/chosen": -1.3642308712005615, "logits/rejected": -1.286407470703125, "logps/chosen": -457.43011474609375, "logps/rejected": -546.7205810546875, "loss": 0.4843, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.126072645187378, "rewards/margins": 0.9802547693252563, "rewards/rejected": -3.106327533721924, "step": 1800 }, { "epoch": 0.942161737764983, "eval_logits/chosen": -1.3828879594802856, "eval_logits/rejected": -1.2932060956954956, "eval_logps/chosen": -480.0619812011719, "eval_logps/rejected": -556.609375, "eval_loss": 0.5018665194511414, "eval_rewards/accuracies": 0.7580000162124634, "eval_rewards/chosen": -2.147517681121826, "eval_rewards/margins": 1.0058910846710205, "eval_rewards/rejected": -3.153409004211426, "eval_runtime": 417.9301, "eval_samples_per_second": 4.785, "eval_steps_per_second": 0.299, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 6.4375, "learning_rate": 4.163407093778243e-08, "logits/chosen": -1.4097586870193481, "logits/rejected": -1.3039724826812744, "logps/chosen": -472.35009765625, "logps/rejected": -532.8004760742188, "loss": 0.51, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.165250301361084, "rewards/margins": 1.0300533771514893, "rewards/rejected": -3.1953039169311523, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 7.6875, "learning_rate": 3.37414453970758e-08, "logits/chosen": -1.48902428150177, "logits/rejected": -1.3304731845855713, "logps/chosen": -477.92327880859375, "logps/rejected": -527.3563232421875, "loss": 0.4981, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.067380905151367, "rewards/margins": 1.0794079303741455, "rewards/rejected": -3.1467888355255127, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 8.125, "learning_rate": 2.6672530179410183e-08, "logits/chosen": -1.4282177686691284, "logits/rejected": -1.3993771076202393, "logps/chosen": -457.6155700683594, "logps/rejected": -527.4920043945312, "loss": 0.5193, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.091433525085449, "rewards/margins": 0.8957453966140747, "rewards/rejected": -2.9871792793273926, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 7.5625, "learning_rate": 2.04296862450451e-08, "logits/chosen": -1.4081538915634155, "logits/rejected": -1.3607032299041748, "logps/chosen": -513.7534790039062, "logps/rejected": -570.0910034179688, "loss": 0.4811, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.3009374141693115, "rewards/margins": 0.9127146005630493, "rewards/rejected": -3.2136521339416504, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 4.75, "learning_rate": 1.501499865314171e-08, "logits/chosen": -1.34174382686615, "logits/rejected": -1.2826250791549683, "logps/chosen": -534.8573608398438, "logps/rejected": -590.8583984375, "loss": 0.5025, "rewards/accuracies": 0.71875, "rewards/chosen": -2.214351177215576, "rewards/margins": 0.9238801002502441, "rewards/rejected": -3.1382312774658203, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 7.71875, "learning_rate": 1.0430275865371265e-08, "logits/chosen": -1.3105404376983643, "logits/rejected": -1.1937357187271118, "logps/chosen": -463.4999084472656, "logps/rejected": -528.9051513671875, "loss": 0.4948, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.202223777770996, "rewards/margins": 0.9966042637825012, "rewards/rejected": -3.1988282203674316, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 9.5625, "learning_rate": 6.677049141901315e-09, "logits/chosen": -1.3534274101257324, "logits/rejected": -1.3385337591171265, "logps/chosen": -457.401123046875, "logps/rejected": -556.7625122070312, "loss": 0.485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1478359699249268, "rewards/margins": 0.9303666949272156, "rewards/rejected": -3.078202724456787, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 5.46875, "learning_rate": 3.756572029968708e-09, "logits/chosen": -1.486336350440979, "logits/rejected": -1.3973008394241333, "logps/chosen": -482.0902404785156, "logps/rejected": -552.3614501953125, "loss": 0.4895, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1343655586242676, "rewards/margins": 1.0286964178085327, "rewards/rejected": -3.163062334060669, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 8.4375, "learning_rate": 1.6698199452053199e-09, "logits/chosen": -1.4284842014312744, "logits/rejected": -1.315100073814392, "logps/chosen": -496.64324951171875, "logps/rejected": -574.592529296875, "loss": 0.4807, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.185387372970581, "rewards/margins": 0.9584697484970093, "rewards/rejected": -3.14385724067688, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 7.375, "learning_rate": 4.1748984585560094e-10, "logits/chosen": -1.4348084926605225, "logits/rejected": -1.299529790878296, "logps/chosen": -478.8993225097656, "logps/rejected": -551.3607177734375, "loss": 0.5048, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.188791036605835, "rewards/margins": 1.035099983215332, "rewards/rejected": -3.223891496658325, "step": 1900 }, { "epoch": 0.9945040565297043, "eval_logits/chosen": -1.3829154968261719, "eval_logits/rejected": -1.2933340072631836, "eval_logps/chosen": -480.1491394042969, "eval_logps/rejected": -556.6638793945312, "eval_loss": 0.5018510818481445, "eval_rewards/accuracies": 0.7590000033378601, "eval_rewards/chosen": -2.1483893394470215, "eval_rewards/margins": 1.005564570426941, "eval_rewards/rejected": -3.153954029083252, "eval_runtime": 418.0312, "eval_samples_per_second": 4.784, "eval_steps_per_second": 0.299, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 10.375, "learning_rate": 0.0, "logits/chosen": -1.4434951543807983, "logits/rejected": -1.3596076965332031, "logps/chosen": -480.58087158203125, "logps/rejected": -560.4929809570312, "loss": 0.4864, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1725149154663086, "rewards/margins": 0.8602474927902222, "rewards/rejected": -3.0327625274658203, "step": 1910 }, { "epoch": 0.9997382884061764, "step": 1910, "total_flos": 0.0, "train_loss": 0.5325085540092428, "train_runtime": 36177.5726, "train_samples_per_second": 1.69, "train_steps_per_second": 0.053 } ], "logging_steps": 10, "max_steps": 1910, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }