{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.8125, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -2.377302885055542, "logits/rejected": -2.2193117141723633, "logps/chosen": -290.4185485839844, "logps/rejected": -374.6501770019531, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 2.40625, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.25045108795166, "logits/rejected": -2.052776575088501, "logps/chosen": -279.61688232421875, "logps/rejected": -245.4197540283203, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": 0.000771976076066494, "rewards/margins": 0.00010288292105542496, "rewards/rejected": 0.0006690931040793657, "step": 10 }, { "epoch": 0.01, "grad_norm": 2.484375, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.2451391220092773, "logits/rejected": -1.944021224975586, "logps/chosen": -305.45184326171875, "logps/rejected": -237.7191619873047, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004307927563786507, "rewards/margins": 0.0011060098186135292, "rewards/rejected": 0.003201917978003621, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.3125, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.2053542137145996, "logits/rejected": -2.136805772781372, "logps/chosen": -251.1873016357422, "logps/rejected": -251.39126586914062, "loss": 0.692, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.012356054969131947, "rewards/margins": 0.0023143726866692305, "rewards/rejected": 0.010041682049632072, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.9453125, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.062053918838501, "logits/rejected": -2.0244908332824707, "logps/chosen": -216.23828125, "logps/rejected": -221.68917846679688, "loss": 0.6915, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.019059285521507263, "rewards/margins": 0.0032902732491493225, "rewards/rejected": 0.01576901227235794, "step": 40 }, { "epoch": 0.01, "grad_norm": 2.078125, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.1121723651885986, "logits/rejected": -2.1005072593688965, "logps/chosen": -266.8817443847656, "logps/rejected": -234.3415069580078, "loss": 0.6905, "rewards/accuracies": 0.59375, "rewards/chosen": 0.030057832598686218, "rewards/margins": 0.005467818584293127, "rewards/rejected": 0.024590013548731804, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.125, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.0995335578918457, "logits/rejected": -1.9425058364868164, "logps/chosen": -252.32351684570312, "logps/rejected": -226.69961547851562, "loss": 0.69, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03176448494195938, "rewards/margins": 0.006372343748807907, "rewards/rejected": 0.025392139330506325, "step": 60 }, { "epoch": 0.02, "grad_norm": 2.03125, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.2442469596862793, "logits/rejected": -2.036492347717285, "logps/chosen": -272.0433044433594, "logps/rejected": -246.6951446533203, "loss": 0.6879, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.04112860932946205, "rewards/margins": 0.010742614977061749, "rewards/rejected": 0.030385995283722878, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.390625, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.153740882873535, "logits/rejected": -1.977267861366272, "logps/chosen": -257.5650329589844, "logps/rejected": -246.85354614257812, "loss": 0.6872, "rewards/accuracies": 0.65625, "rewards/chosen": 0.038635507225990295, "rewards/margins": 0.012301743030548096, "rewards/rejected": 0.0263337641954422, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.1875, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.136314868927002, "logits/rejected": -2.000256061553955, "logps/chosen": -250.14096069335938, "logps/rejected": -234.5118408203125, "loss": 0.6848, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.04195228964090347, "rewards/margins": 0.017196740955114365, "rewards/rejected": 0.02475554868578911, "step": 90 }, { "epoch": 0.03, "grad_norm": 2.125, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.179086208343506, "logits/rejected": -2.068403482437134, "logps/chosen": -246.95883178710938, "logps/rejected": -230.7919921875, "loss": 0.6819, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.04810682684183121, "rewards/margins": 0.023308780044317245, "rewards/rejected": 0.024798044934868813, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -2.095933198928833, "eval_logits/rejected": -1.9564727544784546, "eval_logps/chosen": -259.64715576171875, "eval_logps/rejected": -241.9028778076172, "eval_loss": 0.6821568012237549, "eval_rewards/accuracies": 0.6545000076293945, "eval_rewards/chosen": 0.05004846677184105, "eval_rewards/margins": 0.02299799770116806, "eval_rewards/rejected": 0.02705046720802784, "eval_runtime": 381.806, "eval_samples_per_second": 5.238, "eval_steps_per_second": 0.655, "step": 100 }, { "epoch": 0.03, "grad_norm": 2.3125, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.1454405784606934, "logits/rejected": -2.0017640590667725, "logps/chosen": -284.425537109375, "logps/rejected": -238.8695526123047, "loss": 0.6795, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.049382902681827545, "rewards/margins": 0.02859182097017765, "rewards/rejected": 0.020791077986359596, "step": 110 }, { "epoch": 0.03, "grad_norm": 2.140625, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.1937575340270996, "logits/rejected": -2.054399013519287, "logps/chosen": -287.4629821777344, "logps/rejected": -271.8957824707031, "loss": 0.6729, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.05581967160105705, "rewards/margins": 0.042316947132349014, "rewards/rejected": 0.013502727262675762, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.765625, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.208482265472412, "logits/rejected": -2.118875503540039, "logps/chosen": -250.0573272705078, "logps/rejected": -252.57418823242188, "loss": 0.6698, "rewards/accuracies": 0.71875, "rewards/chosen": 0.050946980714797974, "rewards/margins": 0.049403756856918335, "rewards/rejected": 0.0015432273503392935, "step": 130 }, { "epoch": 0.04, "grad_norm": 2.484375, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.2458949089050293, "logits/rejected": -1.911431074142456, "logps/chosen": -270.4693298339844, "logps/rejected": -226.22677612304688, "loss": 0.6685, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04268602281808853, "rewards/margins": 0.05290870741009712, "rewards/rejected": -0.010222683660686016, "step": 140 }, { "epoch": 0.04, "grad_norm": 2.640625, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.2650039196014404, "logits/rejected": -2.039114475250244, "logps/chosen": -280.2913818359375, "logps/rejected": -242.7501983642578, "loss": 0.6678, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.03719528391957283, "rewards/margins": 0.05549495667219162, "rewards/rejected": -0.01829967275261879, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.671875, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.1557822227478027, "logits/rejected": -2.0535261631011963, "logps/chosen": -256.06103515625, "logps/rejected": -261.87261962890625, "loss": 0.6687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007628369145095348, "rewards/margins": 0.05603449419140816, "rewards/rejected": -0.04840613156557083, "step": 160 }, { "epoch": 0.04, "grad_norm": 2.875, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.125109910964966, "logits/rejected": -1.9704573154449463, "logps/chosen": -220.9778594970703, "logps/rejected": -228.26919555664062, "loss": 0.671, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0038716509006917477, "rewards/margins": 0.05044783279299736, "rewards/rejected": -0.05431948974728584, "step": 170 }, { "epoch": 0.05, "grad_norm": 3.53125, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.1243832111358643, "logits/rejected": -1.9889084100723267, "logps/chosen": -258.29095458984375, "logps/rejected": -251.7142333984375, "loss": 0.6638, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.025864282622933388, "rewards/margins": 0.06769417971372604, "rewards/rejected": -0.09355846792459488, "step": 180 }, { "epoch": 0.05, "grad_norm": 3.015625, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.2455646991729736, "logits/rejected": -2.0299086570739746, "logps/chosen": -272.17633056640625, "logps/rejected": -253.8187255859375, "loss": 0.6499, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.020921263843774796, "rewards/margins": 0.09995204210281372, "rewards/rejected": -0.12087330967187881, "step": 190 }, { "epoch": 0.05, "grad_norm": 3.546875, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.1975388526916504, "logits/rejected": -1.9570707082748413, "logps/chosen": -264.46234130859375, "logps/rejected": -235.4163818359375, "loss": 0.6548, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14753015339374542, "rewards/margins": 0.09057016670703888, "rewards/rejected": -0.23810029029846191, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.0694758892059326, "eval_logits/rejected": -1.9328563213348389, "eval_logps/chosen": -279.5373229980469, "eval_logps/rejected": -269.7627868652344, "eval_loss": 0.6499924063682556, "eval_rewards/accuracies": 0.6779999732971191, "eval_rewards/chosen": -0.1488528698682785, "eval_rewards/margins": 0.10269534587860107, "eval_rewards/rejected": -0.2515482008457184, "eval_runtime": 382.022, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.654, "step": 200 }, { "epoch": 0.05, "grad_norm": 3.140625, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.198995590209961, "logits/rejected": -1.9819616079330444, "logps/chosen": -271.3312072753906, "logps/rejected": -252.93746948242188, "loss": 0.6365, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09111092239618301, "rewards/margins": 0.1327463835477829, "rewards/rejected": -0.2238573133945465, "step": 210 }, { "epoch": 0.06, "grad_norm": 3.1875, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.097423553466797, "logits/rejected": -1.9822295904159546, "logps/chosen": -259.9545593261719, "logps/rejected": -246.3585662841797, "loss": 0.6394, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05938801169395447, "rewards/margins": 0.12806808948516846, "rewards/rejected": -0.18745610117912292, "step": 220 }, { "epoch": 0.06, "grad_norm": 6.40625, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.2377123832702637, "logits/rejected": -2.050795078277588, "logps/chosen": -315.82159423828125, "logps/rejected": -288.96539306640625, "loss": 0.6629, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09652389585971832, "rewards/margins": 0.08648413419723511, "rewards/rejected": -0.18300803005695343, "step": 230 }, { "epoch": 0.06, "grad_norm": 3.21875, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.1486618518829346, "logits/rejected": -1.961085319519043, "logps/chosen": -312.89373779296875, "logps/rejected": -312.0883483886719, "loss": 0.6388, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.1659378707408905, "rewards/margins": 0.1430220901966095, "rewards/rejected": -0.3089599311351776, "step": 240 }, { "epoch": 0.07, "grad_norm": 5.15625, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.112567186355591, "logits/rejected": -2.012845039367676, "logps/chosen": -277.0249938964844, "logps/rejected": -268.689208984375, "loss": 0.6263, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.1744508147239685, "rewards/margins": 0.17131540179252625, "rewards/rejected": -0.34576624631881714, "step": 250 }, { "epoch": 0.07, "grad_norm": 4.53125, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.1583478450775146, "logits/rejected": -1.9551265239715576, "logps/chosen": -310.0099792480469, "logps/rejected": -299.52789306640625, "loss": 0.6515, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.37605080008506775, "rewards/margins": 0.11539731919765472, "rewards/rejected": -0.49144816398620605, "step": 260 }, { "epoch": 0.07, "grad_norm": 3.296875, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.0597262382507324, "logits/rejected": -1.9347015619277954, "logps/chosen": -287.3021545410156, "logps/rejected": -277.96014404296875, "loss": 0.6083, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.26627668738365173, "rewards/margins": 0.22069358825683594, "rewards/rejected": -0.48697033524513245, "step": 270 }, { "epoch": 0.07, "grad_norm": 4.25, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.125945568084717, "logits/rejected": -1.954007863998413, "logps/chosen": -298.900390625, "logps/rejected": -293.0090637207031, "loss": 0.6386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.39951270818710327, "rewards/margins": 0.1558128446340561, "rewards/rejected": -0.5553255081176758, "step": 280 }, { "epoch": 0.08, "grad_norm": 3.796875, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.0477206707000732, "logits/rejected": -1.9491031169891357, "logps/chosen": -324.5054626464844, "logps/rejected": -319.0287780761719, "loss": 0.6271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.38871732354164124, "rewards/margins": 0.19628065824508667, "rewards/rejected": -0.5849979519844055, "step": 290 }, { "epoch": 0.08, "grad_norm": 4.96875, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.0910630226135254, "logits/rejected": -1.888196587562561, "logps/chosen": -272.47198486328125, "logps/rejected": -281.57830810546875, "loss": 0.6084, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28343814611434937, "rewards/margins": 0.22831246256828308, "rewards/rejected": -0.5117505788803101, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": -2.011384963989258, "eval_logits/rejected": -1.8770692348480225, "eval_logps/chosen": -294.2168884277344, "eval_logps/rejected": -294.5921325683594, "eval_loss": 0.6213397979736328, "eval_rewards/accuracies": 0.6809999942779541, "eval_rewards/chosen": -0.29564887285232544, "eval_rewards/margins": 0.20419315993785858, "eval_rewards/rejected": -0.4998420178890228, "eval_runtime": 381.8433, "eval_samples_per_second": 5.238, "eval_steps_per_second": 0.655, "step": 300 }, { "epoch": 0.08, "grad_norm": 4.03125, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.2418582439422607, "logits/rejected": -2.04129695892334, "logps/chosen": -316.5093994140625, "logps/rejected": -291.79010009765625, "loss": 0.5836, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.27603110671043396, "rewards/margins": 0.2892019748687744, "rewards/rejected": -0.565233051776886, "step": 310 }, { "epoch": 0.08, "grad_norm": 4.40625, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.1178698539733887, "logits/rejected": -1.9309499263763428, "logps/chosen": -298.84527587890625, "logps/rejected": -299.9272155761719, "loss": 0.6369, "rewards/accuracies": 0.65625, "rewards/chosen": -0.40581315755844116, "rewards/margins": 0.1810055673122406, "rewards/rejected": -0.5868188142776489, "step": 320 }, { "epoch": 0.09, "grad_norm": 4.4375, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.046699047088623, "logits/rejected": -1.9039798974990845, "logps/chosen": -296.7830505371094, "logps/rejected": -293.9065246582031, "loss": 0.6198, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3718874454498291, "rewards/margins": 0.21303264796733856, "rewards/rejected": -0.5849201083183289, "step": 330 }, { "epoch": 0.09, "grad_norm": 5.375, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.1172854900360107, "logits/rejected": -2.0036845207214355, "logps/chosen": -316.01226806640625, "logps/rejected": -323.5932922363281, "loss": 0.5946, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.26326456665992737, "rewards/margins": 0.28980451822280884, "rewards/rejected": -0.5530691146850586, "step": 340 }, { "epoch": 0.09, "grad_norm": 3.8125, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.042684555053711, "logits/rejected": -1.8946377038955688, "logps/chosen": -352.21502685546875, "logps/rejected": -358.153564453125, "loss": 0.6413, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6295667886734009, "rewards/margins": 0.17427758872509003, "rewards/rejected": -0.8038444519042969, "step": 350 }, { "epoch": 0.09, "grad_norm": 3.015625, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.011836528778076, "logits/rejected": -1.9665615558624268, "logps/chosen": -317.6282958984375, "logps/rejected": -318.0123291015625, "loss": 0.6161, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7664434909820557, "rewards/margins": 0.21231558918952942, "rewards/rejected": -0.9787591099739075, "step": 360 }, { "epoch": 0.1, "grad_norm": 4.53125, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.1028566360473633, "logits/rejected": -1.9274108409881592, "logps/chosen": -356.88507080078125, "logps/rejected": -335.1341857910156, "loss": 0.6264, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7998887300491333, "rewards/margins": 0.22070667147636414, "rewards/rejected": -1.0205953121185303, "step": 370 }, { "epoch": 0.1, "grad_norm": 4.59375, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.069827079772949, "logits/rejected": -1.8606586456298828, "logps/chosen": -364.96856689453125, "logps/rejected": -353.82769775390625, "loss": 0.6264, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6691190004348755, "rewards/margins": 0.2223375141620636, "rewards/rejected": -0.8914563059806824, "step": 380 }, { "epoch": 0.1, "grad_norm": 4.78125, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.088129997253418, "logits/rejected": -1.971571683883667, "logps/chosen": -316.87994384765625, "logps/rejected": -327.4869079589844, "loss": 0.5863, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3583374619483948, "rewards/margins": 0.3061427175998688, "rewards/rejected": -0.6644802093505859, "step": 390 }, { "epoch": 0.1, "grad_norm": 5.125, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.119563341140747, "logits/rejected": -1.8813574314117432, "logps/chosen": -296.64593505859375, "logps/rejected": -276.7133483886719, "loss": 0.6237, "rewards/accuracies": 0.6875, "rewards/chosen": -0.31169393658638, "rewards/margins": 0.207248717546463, "rewards/rejected": -0.5189425945281982, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -1.9655628204345703, "eval_logits/rejected": -1.836700201034546, "eval_logps/chosen": -310.03485107421875, "eval_logps/rejected": -318.6169738769531, "eval_loss": 0.6038790345191956, "eval_rewards/accuracies": 0.6934999823570251, "eval_rewards/chosen": -0.45382827520370483, "eval_rewards/margins": 0.2862620949745178, "eval_rewards/rejected": -0.7400903105735779, "eval_runtime": 382.0228, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.654, "step": 400 }, { "epoch": 0.11, "grad_norm": 4.0625, "learning_rate": 4.999239142174581e-06, "logits/chosen": -1.988959550857544, "logits/rejected": -1.9292503595352173, "logps/chosen": -292.4900817871094, "logps/rejected": -307.29473876953125, "loss": 0.6499, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5016773343086243, "rewards/margins": 0.16585329174995422, "rewards/rejected": -0.6675306558609009, "step": 410 }, { "epoch": 0.11, "grad_norm": 5.375, "learning_rate": 4.99857123734344e-06, "logits/chosen": -2.0150246620178223, "logits/rejected": -1.8929126262664795, "logps/chosen": -260.4281921386719, "logps/rejected": -280.3924865722656, "loss": 0.5908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3163732588291168, "rewards/margins": 0.29044631123542786, "rewards/rejected": -0.6068195104598999, "step": 420 }, { "epoch": 0.11, "grad_norm": 4.75, "learning_rate": 4.997694702533016e-06, "logits/chosen": -2.0086240768432617, "logits/rejected": -1.9487006664276123, "logps/chosen": -308.3887634277344, "logps/rejected": -317.20904541015625, "loss": 0.5817, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3084072172641754, "rewards/margins": 0.3242705166339874, "rewards/rejected": -0.6326777338981628, "step": 430 }, { "epoch": 0.12, "grad_norm": 7.90625, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.112046718597412, "logits/rejected": -2.027024984359741, "logps/chosen": -303.4664306640625, "logps/rejected": -303.01220703125, "loss": 0.6025, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3131754994392395, "rewards/margins": 0.2790473401546478, "rewards/rejected": -0.5922229290008545, "step": 440 }, { "epoch": 0.12, "grad_norm": 5.8125, "learning_rate": 4.995316053150366e-06, "logits/chosen": -1.9543377161026, "logits/rejected": -1.8296692371368408, "logps/chosen": -309.422119140625, "logps/rejected": -325.46173095703125, "loss": 0.5577, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4158857762813568, "rewards/margins": 0.3905051648616791, "rewards/rejected": -0.8063910603523254, "step": 450 }, { "epoch": 0.12, "grad_norm": 8.375, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -1.9097979068756104, "logits/rejected": -1.8239259719848633, "logps/chosen": -370.8164978027344, "logps/rejected": -396.86004638671875, "loss": 0.5805, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0571677684783936, "rewards/margins": 0.4056069254875183, "rewards/rejected": -1.4627748727798462, "step": 460 }, { "epoch": 0.12, "grad_norm": 5.40625, "learning_rate": 4.992103988476206e-06, "logits/chosen": -1.9127140045166016, "logits/rejected": -1.7631990909576416, "logps/chosen": -352.392822265625, "logps/rejected": -381.87896728515625, "loss": 0.5803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0857564210891724, "rewards/margins": 0.4201774597167969, "rewards/rejected": -1.5059337615966797, "step": 470 }, { "epoch": 0.13, "grad_norm": 5.0625, "learning_rate": 4.990185749791866e-06, "logits/chosen": -1.892653226852417, "logits/rejected": -1.7571289539337158, "logps/chosen": -333.1285095214844, "logps/rejected": -386.10107421875, "loss": 0.5413, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7624952793121338, "rewards/margins": 0.5117734670639038, "rewards/rejected": -1.2742688655853271, "step": 480 }, { "epoch": 0.13, "grad_norm": 7.0, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -1.8925682306289673, "logits/rejected": -1.7469890117645264, "logps/chosen": -369.3451232910156, "logps/rejected": -387.9554443359375, "loss": 0.5514, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8445364832878113, "rewards/margins": 0.4895913600921631, "rewards/rejected": -1.3341277837753296, "step": 490 }, { "epoch": 0.13, "grad_norm": 5.4375, "learning_rate": 4.985725660577184e-06, "logits/chosen": -1.8205528259277344, "logits/rejected": -1.6672782897949219, "logps/chosen": -371.17864990234375, "logps/rejected": -382.154296875, "loss": 0.5534, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9741867780685425, "rewards/margins": 0.522149920463562, "rewards/rejected": -1.4963366985321045, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": -1.671244502067566, "eval_logits/rejected": -1.5403351783752441, "eval_logps/chosen": -356.194580078125, "eval_logps/rejected": -383.8828430175781, "eval_loss": 0.5691964626312256, "eval_rewards/accuracies": 0.7049999833106995, "eval_rewards/chosen": -0.9154260158538818, "eval_rewards/margins": 0.4773229658603668, "eval_rewards/rejected": -1.3927491903305054, "eval_runtime": 382.3757, "eval_samples_per_second": 5.23, "eval_steps_per_second": 0.654, "step": 500 }, { "epoch": 0.13, "grad_norm": 6.53125, "learning_rate": 4.983184182463009e-06, "logits/chosen": -1.7440261840820312, "logits/rejected": -1.6317085027694702, "logps/chosen": -373.0206604003906, "logps/rejected": -391.50970458984375, "loss": 0.5646, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.947595477104187, "rewards/margins": 0.555194079875946, "rewards/rejected": -1.5027896165847778, "step": 510 }, { "epoch": 0.14, "grad_norm": 7.65625, "learning_rate": 4.980435359184203e-06, "logits/chosen": -1.7637799978256226, "logits/rejected": -1.7051684856414795, "logps/chosen": -361.0028991699219, "logps/rejected": -383.77392578125, "loss": 0.6028, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8794111013412476, "rewards/margins": 0.3896932005882263, "rewards/rejected": -1.2691043615341187, "step": 520 }, { "epoch": 0.14, "grad_norm": 5.3125, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -1.7085822820663452, "logits/rejected": -1.6667120456695557, "logps/chosen": -398.4223327636719, "logps/rejected": -447.1837463378906, "loss": 0.5797, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.373207688331604, "rewards/margins": 0.4029502272605896, "rewards/rejected": -1.776158094406128, "step": 530 }, { "epoch": 0.14, "grad_norm": 8.0625, "learning_rate": 4.974316612530615e-06, "logits/chosen": -1.6480659246444702, "logits/rejected": -1.4872467517852783, "logps/chosen": -413.641845703125, "logps/rejected": -420.10565185546875, "loss": 0.5292, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3515903949737549, "rewards/margins": 0.5323625206947327, "rewards/rejected": -1.8839528560638428, "step": 540 }, { "epoch": 0.14, "grad_norm": 9.375, "learning_rate": 4.970947200069416e-06, "logits/chosen": -1.6254298686981201, "logits/rejected": -1.5536671876907349, "logps/chosen": -402.1681213378906, "logps/rejected": -431.54510498046875, "loss": 0.5995, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2365509271621704, "rewards/margins": 0.4915947914123535, "rewards/rejected": -1.7281455993652344, "step": 550 }, { "epoch": 0.15, "grad_norm": 5.90625, "learning_rate": 4.967371464228096e-06, "logits/chosen": -1.788649559020996, "logits/rejected": -1.6893421411514282, "logps/chosen": -362.63739013671875, "logps/rejected": -421.24505615234375, "loss": 0.5384, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0513819456100464, "rewards/margins": 0.5819835066795349, "rewards/rejected": -1.6333656311035156, "step": 560 }, { "epoch": 0.15, "grad_norm": 7.28125, "learning_rate": 4.963589703579569e-06, "logits/chosen": -1.7911745309829712, "logits/rejected": -1.6469875574111938, "logps/chosen": -439.2314453125, "logps/rejected": -465.60174560546875, "loss": 0.5809, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.391915202140808, "rewards/margins": 0.61050945520401, "rewards/rejected": -2.002424716949463, "step": 570 }, { "epoch": 0.15, "grad_norm": 8.375, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -1.7446343898773193, "logits/rejected": -1.5205295085906982, "logps/chosen": -424.37664794921875, "logps/rejected": -455.3761291503906, "loss": 0.5342, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2658993005752563, "rewards/margins": 0.7207783460617065, "rewards/rejected": -1.9866775274276733, "step": 580 }, { "epoch": 0.15, "grad_norm": 5.9375, "learning_rate": 4.955409388141243e-06, "logits/chosen": -1.5974572896957397, "logits/rejected": -1.4778482913970947, "logps/chosen": -365.91943359375, "logps/rejected": -388.0648498535156, "loss": 0.6027, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0861928462982178, "rewards/margins": 0.45802631974220276, "rewards/rejected": -1.5442192554473877, "step": 590 }, { "epoch": 0.16, "grad_norm": 5.5625, "learning_rate": 4.951011516405429e-06, "logits/chosen": -1.682959794998169, "logits/rejected": -1.6160876750946045, "logps/chosen": -331.21978759765625, "logps/rejected": -367.4974060058594, "loss": 0.5613, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8046302795410156, "rewards/margins": 0.5121658444404602, "rewards/rejected": -1.316796064376831, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": -1.5049409866333008, "eval_logits/rejected": -1.3701001405715942, "eval_logps/chosen": -345.8829650878906, "eval_logps/rejected": -376.7896423339844, "eval_loss": 0.5658991932868958, "eval_rewards/accuracies": 0.7024999856948853, "eval_rewards/chosen": -0.8123093843460083, "eval_rewards/margins": 0.5095077753067017, "eval_rewards/rejected": -1.32181715965271, "eval_runtime": 382.004, "eval_samples_per_second": 5.236, "eval_steps_per_second": 0.654, "step": 600 }, { "epoch": 0.16, "grad_norm": 5.375, "learning_rate": 4.946408985913344e-06, "logits/chosen": -1.578046202659607, "logits/rejected": -1.4836609363555908, "logps/chosen": -328.2045593261719, "logps/rejected": -375.481201171875, "loss": 0.5276, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8234087228775024, "rewards/margins": 0.6500986218452454, "rewards/rejected": -1.473507285118103, "step": 610 }, { "epoch": 0.16, "grad_norm": 11.875, "learning_rate": 4.941602180974958e-06, "logits/chosen": -1.5045579671859741, "logits/rejected": -1.2604496479034424, "logps/chosen": -402.4884338378906, "logps/rejected": -422.79736328125, "loss": 0.5241, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.146611213684082, "rewards/margins": 0.7896040678024292, "rewards/rejected": -1.9362151622772217, "step": 620 }, { "epoch": 0.16, "grad_norm": 10.5, "learning_rate": 4.936591502957101e-06, "logits/chosen": -1.372164249420166, "logits/rejected": -1.2230699062347412, "logps/chosen": -414.8818359375, "logps/rejected": -487.482421875, "loss": 0.538, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.667520523071289, "rewards/margins": 0.8288544416427612, "rewards/rejected": -2.4963748455047607, "step": 630 }, { "epoch": 0.17, "grad_norm": 9.6875, "learning_rate": 4.931377370249946e-06, "logits/chosen": -1.3338875770568848, "logits/rejected": -1.1355304718017578, "logps/chosen": -483.4081115722656, "logps/rejected": -526.1396484375, "loss": 0.5676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.180846691131592, "rewards/margins": 0.6847688555717468, "rewards/rejected": -2.8656158447265625, "step": 640 }, { "epoch": 0.17, "grad_norm": 15.6875, "learning_rate": 4.925960218232073e-06, "logits/chosen": -1.3147588968276978, "logits/rejected": -1.1933101415634155, "logps/chosen": -446.49346923828125, "logps/rejected": -517.9827270507812, "loss": 0.5392, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9119131565093994, "rewards/margins": 0.8099091649055481, "rewards/rejected": -2.721822500228882, "step": 650 }, { "epoch": 0.17, "grad_norm": 8.875, "learning_rate": 4.920340499234116e-06, "logits/chosen": -1.3101979494094849, "logits/rejected": -1.1101386547088623, "logps/chosen": -426.4873046875, "logps/rejected": -446.02801513671875, "loss": 0.5772, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6048357486724854, "rewards/margins": 0.5474850535392761, "rewards/rejected": -2.152320623397827, "step": 660 }, { "epoch": 0.18, "grad_norm": 7.28125, "learning_rate": 4.914518682500995e-06, "logits/chosen": -1.4778305292129517, "logits/rejected": -1.3038583993911743, "logps/chosen": -432.8035583496094, "logps/rejected": -459.92864990234375, "loss": 0.5359, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.4926092624664307, "rewards/margins": 0.6784954071044922, "rewards/rejected": -2.171104907989502, "step": 670 }, { "epoch": 0.18, "grad_norm": 5.40625, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -1.3521184921264648, "logits/rejected": -1.1778732538223267, "logps/chosen": -430.7608947753906, "logps/rejected": -458.68072509765625, "loss": 0.5029, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5742666721343994, "rewards/margins": 0.7503162622451782, "rewards/rejected": -2.3245832920074463, "step": 680 }, { "epoch": 0.18, "grad_norm": 7.6875, "learning_rate": 4.902270717143858e-06, "logits/chosen": -1.3213449716567993, "logits/rejected": -1.228070855140686, "logps/chosen": -417.1580505371094, "logps/rejected": -537.0321044921875, "loss": 0.4381, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8092586994171143, "rewards/margins": 1.0819432735443115, "rewards/rejected": -2.8912017345428467, "step": 690 }, { "epoch": 0.18, "grad_norm": 6.5, "learning_rate": 4.895845591221427e-06, "logits/chosen": -1.2542212009429932, "logits/rejected": -1.1810188293457031, "logps/chosen": -466.4949645996094, "logps/rejected": -549.9205932617188, "loss": 0.5139, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.131845474243164, "rewards/margins": 0.8790606260299683, "rewards/rejected": -3.0109057426452637, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": -1.0174403190612793, "eval_logits/rejected": -0.8923892974853516, "eval_logps/chosen": -528.3277587890625, "eval_logps/rejected": -591.3086547851562, "eval_loss": 0.5571516156196594, "eval_rewards/accuracies": 0.7145000100135803, "eval_rewards/chosen": -2.6367568969726562, "eval_rewards/margins": 0.8302499055862427, "eval_rewards/rejected": -3.4670066833496094, "eval_runtime": 382.0721, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.654, "step": 700 }, { "epoch": 0.19, "grad_norm": 10.5625, "learning_rate": 4.8892204128816e-06, "logits/chosen": -1.1841003894805908, "logits/rejected": -1.0792133808135986, "logps/chosen": -517.9019775390625, "logps/rejected": -578.2611083984375, "loss": 0.5501, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5033812522888184, "rewards/margins": 0.7395020127296448, "rewards/rejected": -3.2428832054138184, "step": 710 }, { "epoch": 0.19, "grad_norm": 8.875, "learning_rate": 4.882395735324864e-06, "logits/chosen": -1.1759226322174072, "logits/rejected": -1.0294206142425537, "logps/chosen": -477.3987731933594, "logps/rejected": -544.5623779296875, "loss": 0.4985, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1131680011749268, "rewards/margins": 0.8433287739753723, "rewards/rejected": -2.9564967155456543, "step": 720 }, { "epoch": 0.19, "grad_norm": 8.8125, "learning_rate": 4.87537212840983e-06, "logits/chosen": -1.1399719715118408, "logits/rejected": -1.0124037265777588, "logps/chosen": -500.2403259277344, "logps/rejected": -533.0379028320312, "loss": 0.5509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.3398513793945312, "rewards/margins": 0.6334503293037415, "rewards/rejected": -2.973301887512207, "step": 730 }, { "epoch": 0.19, "grad_norm": 12.375, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -1.0892612934112549, "logits/rejected": -0.941753089427948, "logps/chosen": -450.81402587890625, "logps/rejected": -503.46636962890625, "loss": 0.5001, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2259793281555176, "rewards/margins": 0.8490058183670044, "rewards/rejected": -3.0749852657318115, "step": 740 }, { "epoch": 0.2, "grad_norm": 24.0, "learning_rate": 4.860730488943068e-06, "logits/chosen": -1.0790389776229858, "logits/rejected": -1.0216121673583984, "logps/chosen": -440.62109375, "logps/rejected": -540.6531372070312, "loss": 0.4802, "rewards/accuracies": 0.75, "rewards/chosen": -2.060957431793213, "rewards/margins": 1.019281029701233, "rewards/rejected": -3.0802388191223145, "step": 750 }, { "epoch": 0.2, "grad_norm": 7.0, "learning_rate": 4.853113678964022e-06, "logits/chosen": -1.1443126201629639, "logits/rejected": -1.065063238143921, "logps/chosen": -448.5615234375, "logps/rejected": -542.3307495117188, "loss": 0.505, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.76167893409729, "rewards/margins": 1.016688346862793, "rewards/rejected": -2.778367280960083, "step": 760 }, { "epoch": 0.2, "grad_norm": 5.90625, "learning_rate": 4.845300384669958e-06, "logits/chosen": -1.23788583278656, "logits/rejected": -1.1094398498535156, "logps/chosen": -407.1124267578125, "logps/rejected": -459.88226318359375, "loss": 0.5488, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5390856266021729, "rewards/margins": 0.7560388445854187, "rewards/rejected": -2.2951245307922363, "step": 770 }, { "epoch": 0.2, "grad_norm": 16.625, "learning_rate": 4.837291258468701e-06, "logits/chosen": -1.3532726764678955, "logits/rejected": -1.2090624570846558, "logps/chosen": -449.90447998046875, "logps/rejected": -503.38067626953125, "loss": 0.5803, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.645581603050232, "rewards/margins": 0.783669114112854, "rewards/rejected": -2.429250955581665, "step": 780 }, { "epoch": 0.21, "grad_norm": 7.59375, "learning_rate": 4.829086969119984e-06, "logits/chosen": -1.2730779647827148, "logits/rejected": -1.2738616466522217, "logps/chosen": -398.4493103027344, "logps/rejected": -460.91387939453125, "loss": 0.5907, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4772454500198364, "rewards/margins": 0.6072799563407898, "rewards/rejected": -2.0845253467559814, "step": 790 }, { "epoch": 0.21, "grad_norm": 8.1875, "learning_rate": 4.820688201679605e-06, "logits/chosen": -1.559012770652771, "logits/rejected": -1.2587218284606934, "logps/chosen": -388.8677673339844, "logps/rejected": -389.87957763671875, "loss": 0.5184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2623231410980225, "rewards/margins": 0.6337946057319641, "rewards/rejected": -1.8961181640625, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": -1.246036410331726, "eval_logits/rejected": -1.1140612363815308, "eval_logps/chosen": -413.7338562011719, "eval_logps/rejected": -463.30914306640625, "eval_loss": 0.5373813509941101, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": -1.4908183813095093, "eval_rewards/margins": 0.6961935758590698, "eval_rewards/rejected": -2.187012195587158, "eval_runtime": 382.1333, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 800 }, { "epoch": 0.21, "grad_norm": 9.0625, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -1.407278060913086, "logits/rejected": -1.3845430612564087, "logps/chosen": -428.33648681640625, "logps/rejected": -478.8470764160156, "loss": 0.6069, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.544116735458374, "rewards/margins": 0.5756716132164001, "rewards/rejected": -2.119788646697998, "step": 810 }, { "epoch": 0.21, "grad_norm": 7.625, "learning_rate": 4.803310053882831e-06, "logits/chosen": -1.4305765628814697, "logits/rejected": -1.4192079305648804, "logps/chosen": -346.76165771484375, "logps/rejected": -416.07073974609375, "loss": 0.5573, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.1743587255477905, "rewards/margins": 0.5695887804031372, "rewards/rejected": -1.7439473867416382, "step": 820 }, { "epoch": 0.22, "grad_norm": 11.8125, "learning_rate": 4.794332124596775e-06, "logits/chosen": -1.4643322229385376, "logits/rejected": -1.3541513681411743, "logps/chosen": -378.71685791015625, "logps/rejected": -430.7264709472656, "loss": 0.5747, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1027584075927734, "rewards/margins": 0.5569159984588623, "rewards/rejected": -1.6596744060516357, "step": 830 }, { "epoch": 0.22, "grad_norm": 7.28125, "learning_rate": 4.785162619238575e-06, "logits/chosen": -1.3610130548477173, "logits/rejected": -1.2018978595733643, "logps/chosen": -377.59130859375, "logps/rejected": -424.17108154296875, "loss": 0.516, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2374261617660522, "rewards/margins": 0.7390316128730774, "rewards/rejected": -1.9764575958251953, "step": 840 }, { "epoch": 0.22, "grad_norm": 7.25, "learning_rate": 4.775802303459288e-06, "logits/chosen": -1.230850100517273, "logits/rejected": -1.153451919555664, "logps/chosen": -397.7276611328125, "logps/rejected": -469.70037841796875, "loss": 0.5533, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.4996331930160522, "rewards/margins": 0.7669634819030762, "rewards/rejected": -2.266597032546997, "step": 850 }, { "epoch": 0.23, "grad_norm": 10.8125, "learning_rate": 4.766251958842589e-06, "logits/chosen": -1.196821689605713, "logits/rejected": -1.0929956436157227, "logps/chosen": -442.42779541015625, "logps/rejected": -496.02508544921875, "loss": 0.5516, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6992677450180054, "rewards/margins": 0.640595018863678, "rewards/rejected": -2.339862585067749, "step": 860 }, { "epoch": 0.23, "grad_norm": 5.96875, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -1.1287126541137695, "logits/rejected": -1.0260584354400635, "logps/chosen": -434.9798278808594, "logps/rejected": -504.6143493652344, "loss": 0.5191, "rewards/accuracies": 0.75, "rewards/chosen": -1.790509819984436, "rewards/margins": 0.7024968266487122, "rewards/rejected": -2.493006467819214, "step": 870 }, { "epoch": 0.23, "grad_norm": 9.4375, "learning_rate": 4.746584388701831e-06, "logits/chosen": -1.1179661750793457, "logits/rejected": -1.0696125030517578, "logps/chosen": -474.17364501953125, "logps/rejected": -547.4193115234375, "loss": 0.4941, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.1093432903289795, "rewards/margins": 0.8745004534721375, "rewards/rejected": -2.9838438034057617, "step": 880 }, { "epoch": 0.23, "grad_norm": 11.0, "learning_rate": 4.736468805414218e-06, "logits/chosen": -1.0214884281158447, "logits/rejected": -0.9855283498764038, "logps/chosen": -477.1600646972656, "logps/rejected": -576.8958740234375, "loss": 0.5755, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.297248125076294, "rewards/margins": 0.8587217330932617, "rewards/rejected": -3.1559698581695557, "step": 890 }, { "epoch": 0.24, "grad_norm": 14.9375, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -0.8845041394233704, "logits/rejected": -0.7875598073005676, "logps/chosen": -482.1604919433594, "logps/rejected": -565.8832397460938, "loss": 0.5211, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.532474994659424, "rewards/margins": 0.9273085594177246, "rewards/rejected": -3.4597840309143066, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": -0.9341001510620117, "eval_logits/rejected": -0.8115790486335754, "eval_logps/chosen": -518.949462890625, "eval_logps/rejected": -584.0806274414062, "eval_loss": 0.5331768989562988, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": -2.5429742336273193, "eval_rewards/margins": 0.8517529368400574, "eval_rewards/rejected": -3.3947272300720215, "eval_runtime": 382.1611, "eval_samples_per_second": 5.233, "eval_steps_per_second": 0.654, "step": 900 }, { "epoch": 0.24, "grad_norm": 12.1875, "learning_rate": 4.715678265575463e-06, "logits/chosen": -1.1323182582855225, "logits/rejected": -0.9318205118179321, "logps/chosen": -521.3104248046875, "logps/rejected": -533.2903442382812, "loss": 0.5686, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.3703832626342773, "rewards/margins": 0.6751216650009155, "rewards/rejected": -3.0455050468444824, "step": 910 }, { "epoch": 0.24, "grad_norm": 8.625, "learning_rate": 4.705005045028415e-06, "logits/chosen": -1.0868864059448242, "logits/rejected": -0.9571698904037476, "logps/chosen": -469.189208984375, "logps/rejected": -530.5699462890625, "loss": 0.5319, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0605311393737793, "rewards/margins": 0.7877290844917297, "rewards/rejected": -2.8482604026794434, "step": 920 }, { "epoch": 0.24, "grad_norm": 8.8125, "learning_rate": 4.694147707194659e-06, "logits/chosen": -1.1987128257751465, "logits/rejected": -1.1085574626922607, "logps/chosen": -479.1398010253906, "logps/rejected": -532.23828125, "loss": 0.5295, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0726170539855957, "rewards/margins": 0.7290612459182739, "rewards/rejected": -2.80167818069458, "step": 930 }, { "epoch": 0.25, "grad_norm": 7.3125, "learning_rate": 4.683107158658782e-06, "logits/chosen": -1.1448571681976318, "logits/rejected": -1.0365805625915527, "logps/chosen": -478.0250549316406, "logps/rejected": -530.4112548828125, "loss": 0.5083, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8778432607650757, "rewards/margins": 0.811355471611023, "rewards/rejected": -2.6891987323760986, "step": 940 }, { "epoch": 0.25, "grad_norm": 9.0625, "learning_rate": 4.671884321303407e-06, "logits/chosen": -1.2020542621612549, "logits/rejected": -1.0928010940551758, "logps/chosen": -440.04864501953125, "logps/rejected": -496.198486328125, "loss": 0.5249, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9372259378433228, "rewards/margins": 0.7049869298934937, "rewards/rejected": -2.6422126293182373, "step": 950 }, { "epoch": 0.25, "grad_norm": 6.875, "learning_rate": 4.660480132232224e-06, "logits/chosen": -1.2815606594085693, "logits/rejected": -1.1846911907196045, "logps/chosen": -445.06915283203125, "logps/rejected": -479.39093017578125, "loss": 0.5773, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.7325608730316162, "rewards/margins": 0.5843728184700012, "rewards/rejected": -2.3169338703155518, "step": 960 }, { "epoch": 0.25, "grad_norm": 9.6875, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -1.3540565967559814, "logits/rejected": -1.1343624591827393, "logps/chosen": -444.31640625, "logps/rejected": -482.2098083496094, "loss": 0.5099, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6433531045913696, "rewards/margins": 0.8446812629699707, "rewards/rejected": -2.48803448677063, "step": 970 }, { "epoch": 0.26, "grad_norm": 5.75, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -1.3197797536849976, "logits/rejected": -1.1996195316314697, "logps/chosen": -457.05712890625, "logps/rejected": -514.72802734375, "loss": 0.5217, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7364044189453125, "rewards/margins": 0.780579149723053, "rewards/rejected": -2.5169835090637207, "step": 980 }, { "epoch": 0.26, "grad_norm": 13.6875, "learning_rate": 4.625189052424638e-06, "logits/chosen": -1.2102200984954834, "logits/rejected": -1.0647470951080322, "logps/chosen": -436.97991943359375, "logps/rejected": -520.3751220703125, "loss": 0.4535, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9787667989730835, "rewards/margins": 1.061232328414917, "rewards/rejected": -3.039999008178711, "step": 990 }, { "epoch": 0.26, "grad_norm": 8.25, "learning_rate": 4.613069129183218e-06, "logits/chosen": -1.240464687347412, "logits/rejected": -1.0879384279251099, "logps/chosen": -531.1487426757812, "logps/rejected": -574.3619384765625, "loss": 0.5553, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2774546146392822, "rewards/margins": 0.7940423488616943, "rewards/rejected": -3.0714969635009766, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": -0.981342613697052, "eval_logits/rejected": -0.8557386994361877, "eval_logps/chosen": -482.09930419921875, "eval_logps/rejected": -548.8490600585938, "eval_loss": 0.5178083777427673, "eval_rewards/accuracies": 0.7315000295639038, "eval_rewards/chosen": -2.1744725704193115, "eval_rewards/margins": 0.8679391145706177, "eval_rewards/rejected": -3.0424115657806396, "eval_runtime": 382.1372, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 1000 }, { "epoch": 0.26, "grad_norm": 8.0, "learning_rate": 4.600772765277607e-06, "logits/chosen": -1.0305756330490112, "logits/rejected": -0.9370132684707642, "logps/chosen": -448.99493408203125, "logps/rejected": -530.3275146484375, "loss": 0.4913, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.154376983642578, "rewards/margins": 0.8647212982177734, "rewards/rejected": -3.0190985202789307, "step": 1010 }, { "epoch": 0.27, "grad_norm": 16.75, "learning_rate": 4.588300987450652e-06, "logits/chosen": -1.0989015102386475, "logits/rejected": -0.9851810336112976, "logps/chosen": -443.59423828125, "logps/rejected": -486.5970764160156, "loss": 0.5542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8985025882720947, "rewards/margins": 0.7655047178268433, "rewards/rejected": -2.6640071868896484, "step": 1020 }, { "epoch": 0.27, "grad_norm": 5.6875, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -1.0507217645645142, "logits/rejected": -0.9594799280166626, "logps/chosen": -405.2181091308594, "logps/rejected": -487.1499938964844, "loss": 0.4835, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6451423168182373, "rewards/margins": 0.9089698791503906, "rewards/rejected": -2.554112434387207, "step": 1030 }, { "epoch": 0.27, "grad_norm": 13.5625, "learning_rate": 4.562835370152206e-06, "logits/chosen": -1.0573441982269287, "logits/rejected": -0.8775628209114075, "logps/chosen": -527.5038452148438, "logps/rejected": -620.2794189453125, "loss": 0.4742, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2627432346343994, "rewards/margins": 1.2387964725494385, "rewards/rejected": -3.501539707183838, "step": 1040 }, { "epoch": 0.27, "grad_norm": 8.8125, "learning_rate": 4.54984365705243e-06, "logits/chosen": -0.9812475442886353, "logits/rejected": -0.8811472654342651, "logps/chosen": -502.1786193847656, "logps/rejected": -618.7202758789062, "loss": 0.4971, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.337085485458374, "rewards/margins": 1.2312263250350952, "rewards/rejected": -3.5683116912841797, "step": 1050 }, { "epoch": 0.28, "grad_norm": 9.0, "learning_rate": 4.536680782597191e-06, "logits/chosen": -0.9585447311401367, "logits/rejected": -0.8763798475265503, "logps/chosen": -443.18878173828125, "logps/rejected": -523.16015625, "loss": 0.6028, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0716359615325928, "rewards/margins": 0.855958104133606, "rewards/rejected": -2.9275941848754883, "step": 1060 }, { "epoch": 0.28, "grad_norm": 15.4375, "learning_rate": 4.523347845882718e-06, "logits/chosen": -1.122159481048584, "logits/rejected": -0.9293369054794312, "logps/chosen": -494.13037109375, "logps/rejected": -562.1329345703125, "loss": 0.4613, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0596017837524414, "rewards/margins": 1.1728570461273193, "rewards/rejected": -3.2324588298797607, "step": 1070 }, { "epoch": 0.28, "grad_norm": 8.125, "learning_rate": 4.50984596020539e-06, "logits/chosen": -0.8647342920303345, "logits/rejected": -0.826617419719696, "logps/chosen": -561.8629760742188, "logps/rejected": -615.0023193359375, "loss": 0.5557, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8171119689941406, "rewards/margins": 0.8539352416992188, "rewards/rejected": -3.6710472106933594, "step": 1080 }, { "epoch": 0.29, "grad_norm": 9.0, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -1.0336081981658936, "logits/rejected": -0.9252422451972961, "logps/chosen": -563.8508911132812, "logps/rejected": -638.390869140625, "loss": 0.4855, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0344927310943604, "rewards/margins": 0.9103133082389832, "rewards/rejected": -3.944805860519409, "step": 1090 }, { "epoch": 0.29, "grad_norm": 6.9375, "learning_rate": 4.482339865589492e-06, "logits/chosen": -1.0671048164367676, "logits/rejected": -0.9094209671020508, "logps/chosen": -568.4443359375, "logps/rejected": -596.6480712890625, "loss": 0.5994, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.021576404571533, "rewards/margins": 0.7217450141906738, "rewards/rejected": -3.743321180343628, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": -0.8895747661590576, "eval_logits/rejected": -0.7614892721176147, "eval_logps/chosen": -514.6676635742188, "eval_logps/rejected": -577.3698120117188, "eval_loss": 0.520658552646637, "eval_rewards/accuracies": 0.7300000190734863, "eval_rewards/chosen": -2.5001566410064697, "eval_rewards/margins": 0.8274616599082947, "eval_rewards/rejected": -3.3276185989379883, "eval_runtime": 382.1502, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 1100 }, { "epoch": 0.29, "grad_norm": 6.625, "learning_rate": 4.468337953401909e-06, "logits/chosen": -1.1065692901611328, "logits/rejected": -1.0572447776794434, "logps/chosen": -495.5409240722656, "logps/rejected": -552.65966796875, "loss": 0.5707, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2518980503082275, "rewards/margins": 0.61982262134552, "rewards/rejected": -2.871720790863037, "step": 1110 }, { "epoch": 0.29, "grad_norm": 8.875, "learning_rate": 4.45417168556166e-06, "logits/chosen": -1.0463123321533203, "logits/rejected": -0.9469770193099976, "logps/chosen": -435.6727600097656, "logps/rejected": -518.3145751953125, "loss": 0.5007, "rewards/accuracies": 0.75, "rewards/chosen": -1.9452159404754639, "rewards/margins": 0.8327676057815552, "rewards/rejected": -2.7779834270477295, "step": 1120 }, { "epoch": 0.3, "grad_norm": 9.6875, "learning_rate": 4.439842244948036e-06, "logits/chosen": -1.0293817520141602, "logits/rejected": -0.8690570592880249, "logps/chosen": -486.1783142089844, "logps/rejected": -559.431396484375, "loss": 0.5565, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2511630058288574, "rewards/margins": 0.7881690263748169, "rewards/rejected": -3.0393319129943848, "step": 1130 }, { "epoch": 0.3, "grad_norm": 14.5, "learning_rate": 4.425350828065204e-06, "logits/chosen": -1.0534614324569702, "logits/rejected": -0.8575074076652527, "logps/chosen": -497.90167236328125, "logps/rejected": -537.9634399414062, "loss": 0.4913, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1381561756134033, "rewards/margins": 0.8793197870254517, "rewards/rejected": -3.0174758434295654, "step": 1140 }, { "epoch": 0.3, "grad_norm": 9.5625, "learning_rate": 4.410698644942303e-06, "logits/chosen": -1.0756770372390747, "logits/rejected": -0.9290148615837097, "logps/chosen": -489.197265625, "logps/rejected": -558.8743286132812, "loss": 0.4893, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1688458919525146, "rewards/margins": 0.9360774755477905, "rewards/rejected": -3.1049234867095947, "step": 1150 }, { "epoch": 0.3, "grad_norm": 12.25, "learning_rate": 4.395886919032406e-06, "logits/chosen": -0.9989307522773743, "logits/rejected": -0.8515041470527649, "logps/chosen": -480.94183349609375, "logps/rejected": -542.0136108398438, "loss": 0.5419, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1710543632507324, "rewards/margins": 0.8757139444351196, "rewards/rejected": -3.0467686653137207, "step": 1160 }, { "epoch": 0.31, "grad_norm": 8.625, "learning_rate": 4.380916887110366e-06, "logits/chosen": -1.1318533420562744, "logits/rejected": -0.9459112286567688, "logps/chosen": -481.12335205078125, "logps/rejected": -544.0623779296875, "loss": 0.5083, "rewards/accuracies": 0.75, "rewards/chosen": -2.2195496559143066, "rewards/margins": 1.032907247543335, "rewards/rejected": -3.2524571418762207, "step": 1170 }, { "epoch": 0.31, "grad_norm": 9.3125, "learning_rate": 4.365789799169539e-06, "logits/chosen": -0.9683933258056641, "logits/rejected": -1.0098755359649658, "logps/chosen": -474.65283203125, "logps/rejected": -566.4153442382812, "loss": 0.5468, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.280418872833252, "rewards/margins": 0.8640033006668091, "rewards/rejected": -3.1444220542907715, "step": 1180 }, { "epoch": 0.31, "grad_norm": 11.9375, "learning_rate": 4.350506918317416e-06, "logits/chosen": -1.1871801614761353, "logits/rejected": -1.0333930253982544, "logps/chosen": -443.02716064453125, "logps/rejected": -521.8514404296875, "loss": 0.5037, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9543129205703735, "rewards/margins": 0.8601529002189636, "rewards/rejected": -2.8144659996032715, "step": 1190 }, { "epoch": 0.31, "grad_norm": 9.0, "learning_rate": 4.335069520670149e-06, "logits/chosen": -0.9967072606086731, "logits/rejected": -0.9244716763496399, "logps/chosen": -455.01959228515625, "logps/rejected": -528.6710205078125, "loss": 0.5976, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2530674934387207, "rewards/margins": 0.6545962691307068, "rewards/rejected": -2.907663583755493, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": -0.9595763087272644, "eval_logits/rejected": -0.8350398540496826, "eval_logps/chosen": -482.9834289550781, "eval_logps/rejected": -543.660400390625, "eval_loss": 0.5098230838775635, "eval_rewards/accuracies": 0.7365000247955322, "eval_rewards/chosen": -2.183314323425293, "eval_rewards/margins": 0.8072100281715393, "eval_rewards/rejected": -2.9905245304107666, "eval_runtime": 382.4857, "eval_samples_per_second": 5.229, "eval_steps_per_second": 0.654, "step": 1200 }, { "epoch": 0.32, "grad_norm": 6.40625, "learning_rate": 4.319478895246e-06, "logits/chosen": -1.070488691329956, "logits/rejected": -0.886951744556427, "logps/chosen": -466.0955505371094, "logps/rejected": -520.3566284179688, "loss": 0.4951, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.189800977706909, "rewards/margins": 0.7895106077194214, "rewards/rejected": -2.979311466217041, "step": 1210 }, { "epoch": 0.32, "grad_norm": 11.0, "learning_rate": 4.303736343857704e-06, "logits/chosen": -1.0415198802947998, "logits/rejected": -0.9387828707695007, "logps/chosen": -499.1920471191406, "logps/rejected": -617.3883666992188, "loss": 0.4881, "rewards/accuracies": 0.75, "rewards/chosen": -2.420851230621338, "rewards/margins": 1.062877893447876, "rewards/rejected": -3.483729124069214, "step": 1220 }, { "epoch": 0.32, "grad_norm": 10.0, "learning_rate": 4.287843181003772e-06, "logits/chosen": -1.0625154972076416, "logits/rejected": -0.9172189831733704, "logps/chosen": -579.9913330078125, "logps/rejected": -610.0975341796875, "loss": 0.5905, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8613951206207275, "rewards/margins": 0.7642954587936401, "rewards/rejected": -3.6256909370422363, "step": 1230 }, { "epoch": 0.32, "grad_norm": 7.59375, "learning_rate": 4.27180073375873e-06, "logits/chosen": -1.1162028312683105, "logits/rejected": -0.9976137280464172, "logps/chosen": -525.2400512695312, "logps/rejected": -569.8626708984375, "loss": 0.5269, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3787271976470947, "rewards/margins": 0.8617948293685913, "rewards/rejected": -3.2405219078063965, "step": 1240 }, { "epoch": 0.33, "grad_norm": 5.625, "learning_rate": 4.255610341662304e-06, "logits/chosen": -1.144928216934204, "logits/rejected": -0.9519325494766235, "logps/chosen": -472.40087890625, "logps/rejected": -529.2858276367188, "loss": 0.5525, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.171128511428833, "rewards/margins": 0.767959475517273, "rewards/rejected": -2.9390883445739746, "step": 1250 }, { "epoch": 0.33, "grad_norm": 8.625, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -1.11684250831604, "logits/rejected": -0.9831358194351196, "logps/chosen": -500.71484375, "logps/rejected": -542.6422119140625, "loss": 0.5654, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.447084903717041, "rewards/margins": 0.5746163129806519, "rewards/rejected": -3.0217010974884033, "step": 1260 }, { "epoch": 0.33, "grad_norm": 7.65625, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -1.0659453868865967, "logits/rejected": -0.899361252784729, "logps/chosen": -475.46148681640625, "logps/rejected": -525.0037841796875, "loss": 0.5081, "rewards/accuracies": 0.71875, "rewards/chosen": -2.251559257507324, "rewards/margins": 0.823780357837677, "rewards/rejected": -3.0753397941589355, "step": 1270 }, { "epoch": 0.33, "grad_norm": 11.4375, "learning_rate": 4.206165076283983e-06, "logits/chosen": -1.096620798110962, "logits/rejected": -0.9550498127937317, "logps/chosen": -487.46136474609375, "logps/rejected": -576.1992797851562, "loss": 0.461, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.4152817726135254, "rewards/margins": 1.0981849431991577, "rewards/rejected": -3.5134663581848145, "step": 1280 }, { "epoch": 0.34, "grad_norm": 10.6875, "learning_rate": 4.189396545546995e-06, "logits/chosen": -1.0538244247436523, "logits/rejected": -0.9361982345581055, "logps/chosen": -522.2523193359375, "logps/rejected": -610.1349487304688, "loss": 0.5054, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.730778217315674, "rewards/margins": 1.0780103206634521, "rewards/rejected": -3.808788776397705, "step": 1290 }, { "epoch": 0.34, "grad_norm": 13.125, "learning_rate": 4.172486950684627e-06, "logits/chosen": -1.0185925960540771, "logits/rejected": -0.9584161639213562, "logps/chosen": -538.3131103515625, "logps/rejected": -635.578369140625, "loss": 0.5237, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.846707820892334, "rewards/margins": 1.0040740966796875, "rewards/rejected": -3.8507816791534424, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": -0.825871467590332, "eval_logits/rejected": -0.7071986198425293, "eval_logps/chosen": -574.3861694335938, "eval_logps/rejected": -660.885009765625, "eval_loss": 0.5165807008743286, "eval_rewards/accuracies": 0.7350000143051147, "eval_rewards/chosen": -3.097341775894165, "eval_rewards/margins": 1.0654287338256836, "eval_rewards/rejected": -4.162771224975586, "eval_runtime": 382.0912, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 1300 }, { "epoch": 0.34, "grad_norm": 11.625, "learning_rate": 4.155437703643182e-06, "logits/chosen": -1.0443698167800903, "logits/rejected": -0.8676601648330688, "logps/chosen": -536.4607543945312, "logps/rejected": -606.3543701171875, "loss": 0.5075, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8971712589263916, "rewards/margins": 0.9897411465644836, "rewards/rejected": -3.8869121074676514, "step": 1310 }, { "epoch": 0.35, "grad_norm": 11.375, "learning_rate": 4.138250228029882e-06, "logits/chosen": -1.000579595565796, "logits/rejected": -0.9191876649856567, "logps/chosen": -538.9154052734375, "logps/rejected": -649.7552490234375, "loss": 0.4767, "rewards/accuracies": 0.75, "rewards/chosen": -2.8198482990264893, "rewards/margins": 1.0736055374145508, "rewards/rejected": -3.893454074859619, "step": 1320 }, { "epoch": 0.35, "grad_norm": 7.6875, "learning_rate": 4.120925958993994e-06, "logits/chosen": -0.9208280444145203, "logits/rejected": -0.8555585741996765, "logps/chosen": -512.56787109375, "logps/rejected": -604.376220703125, "loss": 0.5584, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.786665439605713, "rewards/margins": 0.9612969160079956, "rewards/rejected": -3.747962474822998, "step": 1330 }, { "epoch": 0.35, "grad_norm": 14.0, "learning_rate": 4.103466343106999e-06, "logits/chosen": -1.1172326803207397, "logits/rejected": -0.9976350665092468, "logps/chosen": -514.8595581054688, "logps/rejected": -575.3850708007812, "loss": 0.5422, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4547386169433594, "rewards/margins": 0.8639480471611023, "rewards/rejected": -3.3186867237091064, "step": 1340 }, { "epoch": 0.35, "grad_norm": 10.125, "learning_rate": 4.085872838241797e-06, "logits/chosen": -1.0706989765167236, "logits/rejected": -0.9391083717346191, "logps/chosen": -489.779296875, "logps/rejected": -538.4210815429688, "loss": 0.5948, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.274151563644409, "rewards/margins": 0.6873086094856262, "rewards/rejected": -2.9614596366882324, "step": 1350 }, { "epoch": 0.36, "grad_norm": 11.125, "learning_rate": 4.06814691345098e-06, "logits/chosen": -1.0508559942245483, "logits/rejected": -0.9001902341842651, "logps/chosen": -451.5694274902344, "logps/rejected": -517.9208984375, "loss": 0.4809, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9602162837982178, "rewards/margins": 0.884141743183136, "rewards/rejected": -2.844357967376709, "step": 1360 }, { "epoch": 0.36, "grad_norm": 14.125, "learning_rate": 4.050290048844171e-06, "logits/chosen": -1.129167914390564, "logits/rejected": -1.0560190677642822, "logps/chosen": -474.2417907714844, "logps/rejected": -552.0899047851562, "loss": 0.5423, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.056283473968506, "rewards/margins": 0.8298514485359192, "rewards/rejected": -2.886134624481201, "step": 1370 }, { "epoch": 0.36, "grad_norm": 9.5, "learning_rate": 4.032303735464422e-06, "logits/chosen": -1.1856621503829956, "logits/rejected": -0.9643325805664062, "logps/chosen": -502.15814208984375, "logps/rejected": -594.064208984375, "loss": 0.452, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.301772356033325, "rewards/margins": 1.1499149799346924, "rewards/rejected": -3.4516875743865967, "step": 1380 }, { "epoch": 0.36, "grad_norm": 11.6875, "learning_rate": 4.014189475163727e-06, "logits/chosen": -0.96733558177948, "logits/rejected": -0.853344738483429, "logps/chosen": -489.39990234375, "logps/rejected": -597.2086181640625, "loss": 0.4757, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.3474299907684326, "rewards/margins": 1.1593117713928223, "rewards/rejected": -3.506741762161255, "step": 1390 }, { "epoch": 0.37, "grad_norm": 12.75, "learning_rate": 3.995948780477605e-06, "logits/chosen": -1.1000730991363525, "logits/rejected": -0.9693312644958496, "logps/chosen": -477.19549560546875, "logps/rejected": -542.30615234375, "loss": 0.516, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0844216346740723, "rewards/margins": 0.8978837132453918, "rewards/rejected": -2.9823052883148193, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": -0.9127845168113708, "eval_logits/rejected": -0.7864713668823242, "eval_logps/chosen": -474.74249267578125, "eval_logps/rejected": -551.2366943359375, "eval_loss": 0.5107593536376953, "eval_rewards/accuracies": 0.7350000143051147, "eval_rewards/chosen": -2.100904941558838, "eval_rewards/margins": 0.9653825163841248, "eval_rewards/rejected": -3.0662872791290283, "eval_runtime": 381.6083, "eval_samples_per_second": 5.241, "eval_steps_per_second": 0.655, "step": 1400 }, { "epoch": 0.37, "grad_norm": 10.25, "learning_rate": 3.977583174498816e-06, "logits/chosen": -1.017508864402771, "logits/rejected": -0.8959487676620483, "logps/chosen": -488.11810302734375, "logps/rejected": -602.2122802734375, "loss": 0.3715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.244345188140869, "rewards/margins": 1.360781192779541, "rewards/rejected": -3.6051268577575684, "step": 1410 }, { "epoch": 0.37, "grad_norm": 12.125, "learning_rate": 3.959094190750172e-06, "logits/chosen": -1.0074245929718018, "logits/rejected": -0.868901252746582, "logps/chosen": -552.512939453125, "logps/rejected": -637.4674072265625, "loss": 0.4966, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6735260486602783, "rewards/margins": 1.1185749769210815, "rewards/rejected": -3.7921009063720703, "step": 1420 }, { "epoch": 0.37, "grad_norm": 11.6875, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -0.8478316068649292, "logits/rejected": -0.7511281967163086, "logps/chosen": -535.4224853515625, "logps/rejected": -637.5137329101562, "loss": 0.494, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.823219060897827, "rewards/margins": 1.1367390155792236, "rewards/rejected": -3.9599578380584717, "step": 1430 }, { "epoch": 0.38, "grad_norm": 17.125, "learning_rate": 3.921752275415712e-06, "logits/chosen": -0.9650063514709473, "logits/rejected": -0.8631266355514526, "logps/chosen": -534.4532470703125, "logps/rejected": -645.3438720703125, "loss": 0.4351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8391730785369873, "rewards/margins": 1.3146858215332031, "rewards/rejected": -4.1538591384887695, "step": 1440 }, { "epoch": 0.38, "grad_norm": 6.53125, "learning_rate": 3.902902461869079e-06, "logits/chosen": -0.9252153635025024, "logits/rejected": -0.7948675751686096, "logps/chosen": -540.6839599609375, "logps/rejected": -642.1290283203125, "loss": 0.5532, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0117030143737793, "rewards/margins": 1.17899751663208, "rewards/rejected": -4.190700531005859, "step": 1450 }, { "epoch": 0.38, "grad_norm": 13.875, "learning_rate": 3.883935506370605e-06, "logits/chosen": -0.9731215238571167, "logits/rejected": -0.8713979721069336, "logps/chosen": -526.899658203125, "logps/rejected": -591.6453857421875, "loss": 0.5396, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.736586570739746, "rewards/margins": 0.9257469177246094, "rewards/rejected": -3.6623332500457764, "step": 1460 }, { "epoch": 0.38, "grad_norm": 5.0625, "learning_rate": 3.864852992655617e-06, "logits/chosen": -1.115800380706787, "logits/rejected": -1.0172771215438843, "logps/chosen": -478.37420654296875, "logps/rejected": -573.0581665039062, "loss": 0.4365, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2973954677581787, "rewards/margins": 1.069636344909668, "rewards/rejected": -3.3670318126678467, "step": 1470 }, { "epoch": 0.39, "grad_norm": 7.0625, "learning_rate": 3.845656514108516e-06, "logits/chosen": -1.0454566478729248, "logits/rejected": -0.8997499346733093, "logps/chosen": -511.357177734375, "logps/rejected": -557.3446655273438, "loss": 0.4913, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.522265672683716, "rewards/margins": 1.0096194744110107, "rewards/rejected": -3.5318856239318848, "step": 1480 }, { "epoch": 0.39, "grad_norm": 8.125, "learning_rate": 3.826347673629738e-06, "logits/chosen": -1.0593020915985107, "logits/rejected": -0.8929145932197571, "logps/chosen": -473.79302978515625, "logps/rejected": -565.4286499023438, "loss": 0.4657, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2048957347869873, "rewards/margins": 1.1790317296981812, "rewards/rejected": -3.3839271068573, "step": 1490 }, { "epoch": 0.39, "grad_norm": 12.0625, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -1.116262674331665, "logits/rejected": -0.9613265991210938, "logps/chosen": -477.24810791015625, "logps/rejected": -587.962646484375, "loss": 0.4593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1293580532073975, "rewards/margins": 1.2989779710769653, "rewards/rejected": -3.4283363819122314, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": -1.0210601091384888, "eval_logits/rejected": -0.8902665972709656, "eval_logps/chosen": -496.3184509277344, "eval_logps/rejected": -587.1505737304688, "eval_loss": 0.5173963308334351, "eval_rewards/accuracies": 0.7304999828338623, "eval_rewards/chosen": -2.316664218902588, "eval_rewards/margins": 1.1087615489959717, "eval_rewards/rejected": -3.4254260063171387, "eval_runtime": 382.2649, "eval_samples_per_second": 5.232, "eval_steps_per_second": 0.654, "step": 1500 }, { "epoch": 0.4, "grad_norm": 13.5, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -1.0803442001342773, "logits/rejected": -0.9917434453964233, "logps/chosen": -461.2118225097656, "logps/rejected": -549.1537475585938, "loss": 0.593, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.274977922439575, "rewards/margins": 0.9378048777580261, "rewards/rejected": -3.212782621383667, "step": 1510 }, { "epoch": 0.4, "grad_norm": 8.3125, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -1.2474887371063232, "logits/rejected": -1.145392656326294, "logps/chosen": -428.1084899902344, "logps/rejected": -485.67694091796875, "loss": 0.5245, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6442362070083618, "rewards/margins": 0.7559275031089783, "rewards/rejected": -2.4001636505126953, "step": 1520 }, { "epoch": 0.4, "grad_norm": 6.75, "learning_rate": 3.748021075950633e-06, "logits/chosen": -1.3161629438400269, "logits/rejected": -1.232714295387268, "logps/chosen": -440.6031188964844, "logps/rejected": -481.67926025390625, "loss": 0.5983, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6595981121063232, "rewards/margins": 0.5171489119529724, "rewards/rejected": -2.1767468452453613, "step": 1530 }, { "epoch": 0.4, "grad_norm": 10.625, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -1.132124662399292, "logits/rejected": -1.0095793008804321, "logps/chosen": -423.98553466796875, "logps/rejected": -478.41015625, "loss": 0.5368, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8633050918579102, "rewards/margins": 0.7011392712593079, "rewards/rejected": -2.5644445419311523, "step": 1540 }, { "epoch": 0.41, "grad_norm": 8.625, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -1.2295887470245361, "logits/rejected": -1.1187238693237305, "logps/chosen": -489.0294494628906, "logps/rejected": -551.4732666015625, "loss": 0.4858, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1783862113952637, "rewards/margins": 0.8242964744567871, "rewards/rejected": -3.002682685852051, "step": 1550 }, { "epoch": 0.41, "grad_norm": 10.0, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -1.241201639175415, "logits/rejected": -1.0382106304168701, "logps/chosen": -548.8870849609375, "logps/rejected": -611.2633666992188, "loss": 0.4939, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6739068031311035, "rewards/margins": 0.9938074350357056, "rewards/rejected": -3.6677143573760986, "step": 1560 }, { "epoch": 0.41, "grad_norm": 11.8125, "learning_rate": 3.668027301883802e-06, "logits/chosen": -1.154157280921936, "logits/rejected": -1.0291301012039185, "logps/chosen": -542.0028076171875, "logps/rejected": -634.2523803710938, "loss": 0.5002, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9064033031463623, "rewards/margins": 1.071606993675232, "rewards/rejected": -3.9780097007751465, "step": 1570 }, { "epoch": 0.41, "grad_norm": 5.46875, "learning_rate": 3.64778083782286e-06, "logits/chosen": -1.0966026782989502, "logits/rejected": -1.084398627281189, "logps/chosen": -548.9720458984375, "logps/rejected": -668.5007934570312, "loss": 0.5301, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.963653087615967, "rewards/margins": 0.9051497578620911, "rewards/rejected": -3.868802547454834, "step": 1580 }, { "epoch": 0.42, "grad_norm": 9.6875, "learning_rate": 3.627438534392268e-06, "logits/chosen": -1.2072285413742065, "logits/rejected": -1.1841914653778076, "logps/chosen": -524.2724609375, "logps/rejected": -635.7026977539062, "loss": 0.483, "rewards/accuracies": 0.78125, "rewards/chosen": -2.862274169921875, "rewards/margins": 1.047090768814087, "rewards/rejected": -3.909365177154541, "step": 1590 }, { "epoch": 0.42, "grad_norm": 7.21875, "learning_rate": 3.607002090168506e-06, "logits/chosen": -1.0932730436325073, "logits/rejected": -1.0192008018493652, "logps/chosen": -579.1436157226562, "logps/rejected": -652.6798095703125, "loss": 0.5545, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.1483500003814697, "rewards/margins": 0.9495010375976562, "rewards/rejected": -4.097850799560547, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": -1.0082374811172485, "eval_logits/rejected": -0.8800999522209167, "eval_logps/chosen": -564.0355224609375, "eval_logps/rejected": -652.812255859375, "eval_loss": 0.5032184720039368, "eval_rewards/accuracies": 0.7369999885559082, "eval_rewards/chosen": -2.99383544921875, "eval_rewards/margins": 1.088207483291626, "eval_rewards/rejected": -4.082043170928955, "eval_runtime": 381.8998, "eval_samples_per_second": 5.237, "eval_steps_per_second": 0.655, "step": 1600 }, { "epoch": 0.42, "grad_norm": 6.71875, "learning_rate": 3.586473211588787e-06, "logits/chosen": -1.1385810375213623, "logits/rejected": -1.0679770708084106, "logps/chosen": -523.4324340820312, "logps/rejected": -647.1407470703125, "loss": 0.4495, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.787372350692749, "rewards/margins": 1.170562744140625, "rewards/rejected": -3.957934856414795, "step": 1610 }, { "epoch": 0.42, "grad_norm": 13.0, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -1.1914455890655518, "logits/rejected": -1.0186755657196045, "logps/chosen": -572.4912719726562, "logps/rejected": -637.8251953125, "loss": 0.5878, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.0957980155944824, "rewards/margins": 0.9488485455513, "rewards/rejected": -4.044646263122559, "step": 1620 }, { "epoch": 0.43, "grad_norm": 10.4375, "learning_rate": 3.545145015558399e-06, "logits/chosen": -0.9681538343429565, "logits/rejected": -0.9621971249580383, "logps/chosen": -520.1128540039062, "logps/rejected": -614.5860595703125, "loss": 0.5109, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8847546577453613, "rewards/margins": 1.0869688987731934, "rewards/rejected": -3.971724271774292, "step": 1630 }, { "epoch": 0.43, "grad_norm": 5.46875, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -1.09974205493927, "logits/rejected": -1.019108533859253, "logps/chosen": -545.1671142578125, "logps/rejected": -630.2543334960938, "loss": 0.5719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9147398471832275, "rewards/margins": 0.9028825759887695, "rewards/rejected": -3.817622423171997, "step": 1640 }, { "epoch": 0.43, "grad_norm": 8.3125, "learning_rate": 3.503467749582857e-06, "logits/chosen": -1.1649540662765503, "logits/rejected": -0.9812711477279663, "logps/chosen": -496.32757568359375, "logps/rejected": -530.1451416015625, "loss": 0.5901, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4510443210601807, "rewards/margins": 0.6782389879226685, "rewards/rejected": -3.1292831897735596, "step": 1650 }, { "epoch": 0.43, "grad_norm": 11.0, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -1.0830554962158203, "logits/rejected": -1.0159814357757568, "logps/chosen": -442.962646484375, "logps/rejected": -521.5462646484375, "loss": 0.5191, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2101898193359375, "rewards/margins": 0.7478699684143066, "rewards/rejected": -2.958059549331665, "step": 1660 }, { "epoch": 0.44, "grad_norm": 7.40625, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -1.1321473121643066, "logits/rejected": -0.9186077117919922, "logps/chosen": -502.3970642089844, "logps/rejected": -575.6217041015625, "loss": 0.4608, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.3348631858825684, "rewards/margins": 1.0501439571380615, "rewards/rejected": -3.385007381439209, "step": 1670 }, { "epoch": 0.44, "grad_norm": 9.625, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -1.1406095027923584, "logits/rejected": -0.9287969470024109, "logps/chosen": -519.1994018554688, "logps/rejected": -603.8717041015625, "loss": 0.4608, "rewards/accuracies": 0.75, "rewards/chosen": -2.365922689437866, "rewards/margins": 1.2659895420074463, "rewards/rejected": -3.6319122314453125, "step": 1680 }, { "epoch": 0.44, "grad_norm": 16.5, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -1.1879878044128418, "logits/rejected": -0.9813734292984009, "logps/chosen": -539.6956176757812, "logps/rejected": -599.0775146484375, "loss": 0.5446, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6155307292938232, "rewards/margins": 1.0494682788848877, "rewards/rejected": -3.664999008178711, "step": 1690 }, { "epoch": 0.44, "grad_norm": 12.1875, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -1.0190632343292236, "logits/rejected": -0.9378607869148254, "logps/chosen": -550.7818603515625, "logps/rejected": -662.2818603515625, "loss": 0.5425, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.4076619148254395, "rewards/margins": 1.0471140146255493, "rewards/rejected": -4.454775810241699, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": -0.9685720205307007, "eval_logits/rejected": -0.8382174968719482, "eval_logps/chosen": -599.6095581054688, "eval_logps/rejected": -685.2186889648438, "eval_loss": 0.49963250756263733, "eval_rewards/accuracies": 0.7404999732971191, "eval_rewards/chosen": -3.349576234817505, "eval_rewards/margins": 1.0565321445465088, "eval_rewards/rejected": -4.406107425689697, "eval_runtime": 382.4342, "eval_samples_per_second": 5.23, "eval_steps_per_second": 0.654, "step": 1700 }, { "epoch": 0.45, "grad_norm": 11.75, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -1.0968348979949951, "logits/rejected": -1.0862301588058472, "logps/chosen": -530.6864013671875, "logps/rejected": -640.4039916992188, "loss": 0.518, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0613017082214355, "rewards/margins": 0.9621230959892273, "rewards/rejected": -4.0234246253967285, "step": 1710 }, { "epoch": 0.45, "grad_norm": 8.0, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -1.2413816452026367, "logits/rejected": -1.089429259300232, "logps/chosen": -535.4332275390625, "logps/rejected": -622.2586059570312, "loss": 0.4864, "rewards/accuracies": 0.78125, "rewards/chosen": -2.66583251953125, "rewards/margins": 1.01954185962677, "rewards/rejected": -3.6853744983673096, "step": 1720 }, { "epoch": 0.45, "grad_norm": 8.4375, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -1.112958312034607, "logits/rejected": -1.0259140729904175, "logps/chosen": -486.234375, "logps/rejected": -564.1868896484375, "loss": 0.4673, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.260854721069336, "rewards/margins": 1.0263946056365967, "rewards/rejected": -3.2872490882873535, "step": 1730 }, { "epoch": 0.46, "grad_norm": 10.4375, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -1.1713676452636719, "logits/rejected": -1.0070645809173584, "logps/chosen": -514.056396484375, "logps/rejected": -565.324951171875, "loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.390371084213257, "rewards/margins": 0.8160451054573059, "rewards/rejected": -3.206415891647339, "step": 1740 }, { "epoch": 0.46, "grad_norm": 7.3125, "learning_rate": 3.290336385060832e-06, "logits/chosen": -1.3080298900604248, "logits/rejected": -1.114485502243042, "logps/chosen": -513.6076049804688, "logps/rejected": -580.9697265625, "loss": 0.5403, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6475276947021484, "rewards/margins": 0.8753725290298462, "rewards/rejected": -3.522900104522705, "step": 1750 }, { "epoch": 0.46, "grad_norm": 10.75, "learning_rate": 3.268630667594348e-06, "logits/chosen": -1.1190599203109741, "logits/rejected": -1.0877625942230225, "logps/chosen": -520.4367065429688, "logps/rejected": -593.3540649414062, "loss": 0.51, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6478748321533203, "rewards/margins": 0.9716035723686218, "rewards/rejected": -3.619478225708008, "step": 1760 }, { "epoch": 0.46, "grad_norm": 10.1875, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -1.1805906295776367, "logits/rejected": -1.1239099502563477, "logps/chosen": -522.7432861328125, "logps/rejected": -629.3782958984375, "loss": 0.4844, "rewards/accuracies": 0.75, "rewards/chosen": -2.695678949356079, "rewards/margins": 1.022963285446167, "rewards/rejected": -3.718641996383667, "step": 1770 }, { "epoch": 0.47, "grad_norm": 7.0625, "learning_rate": 3.225028509122944e-06, "logits/chosen": -1.2425084114074707, "logits/rejected": -1.1278479099273682, "logps/chosen": -481.4998474121094, "logps/rejected": -560.8279418945312, "loss": 0.5179, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.449826717376709, "rewards/margins": 0.9064075350761414, "rewards/rejected": -3.356234073638916, "step": 1780 }, { "epoch": 0.47, "grad_norm": 13.9375, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -1.2350413799285889, "logits/rejected": -1.1462427377700806, "logps/chosen": -549.2757568359375, "logps/rejected": -646.181640625, "loss": 0.5022, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7407171726226807, "rewards/margins": 1.003739595413208, "rewards/rejected": -3.7444565296173096, "step": 1790 }, { "epoch": 0.47, "grad_norm": 14.625, "learning_rate": 3.181184197019127e-06, "logits/chosen": -0.9863433837890625, "logits/rejected": -0.8817607164382935, "logps/chosen": -533.1535034179688, "logps/rejected": -698.6467895507812, "loss": 0.4825, "rewards/accuracies": 0.75, "rewards/chosen": -3.029388189315796, "rewards/margins": 1.3928486108779907, "rewards/rejected": -4.422236442565918, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": -1.0005792379379272, "eval_logits/rejected": -0.8737620115280151, "eval_logps/chosen": -569.109130859375, "eval_logps/rejected": -657.4884033203125, "eval_loss": 0.503667414188385, "eval_rewards/accuracies": 0.7379999756813049, "eval_rewards/chosen": -3.0445713996887207, "eval_rewards/margins": 1.0842331647872925, "eval_rewards/rejected": -4.1288042068481445, "eval_runtime": 382.2565, "eval_samples_per_second": 5.232, "eval_steps_per_second": 0.654, "step": 1800 }, { "epoch": 0.47, "grad_norm": 14.3125, "learning_rate": 3.159175806468126e-06, "logits/chosen": -1.0082833766937256, "logits/rejected": -0.8253539800643921, "logps/chosen": -556.5079956054688, "logps/rejected": -636.0127563476562, "loss": 0.5015, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.07832407951355, "rewards/margins": 1.0969042778015137, "rewards/rejected": -4.175228595733643, "step": 1810 }, { "epoch": 0.48, "grad_norm": 11.1875, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -1.1307703256607056, "logits/rejected": -1.0529394149780273, "logps/chosen": -595.5393676757812, "logps/rejected": -662.37158203125, "loss": 0.5659, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.1214325428009033, "rewards/margins": 0.8287679553031921, "rewards/rejected": -3.950200319290161, "step": 1820 }, { "epoch": 0.48, "grad_norm": 8.5625, "learning_rate": 3.114995744685877e-06, "logits/chosen": -1.07692551612854, "logits/rejected": -1.0323340892791748, "logps/chosen": -533.2166748046875, "logps/rejected": -612.94140625, "loss": 0.5153, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8589041233062744, "rewards/margins": 0.9276583790779114, "rewards/rejected": -3.786562442779541, "step": 1830 }, { "epoch": 0.48, "grad_norm": 6.40625, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -1.2703588008880615, "logits/rejected": -1.0852762460708618, "logps/chosen": -551.0806274414062, "logps/rejected": -643.0982666015625, "loss": 0.4817, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.663365125656128, "rewards/margins": 1.2043039798736572, "rewards/rejected": -3.8676695823669434, "step": 1840 }, { "epoch": 0.48, "grad_norm": 6.8125, "learning_rate": 3.070610279320708e-06, "logits/chosen": -1.248780608177185, "logits/rejected": -1.084285020828247, "logps/chosen": -551.0938110351562, "logps/rejected": -643.5797729492188, "loss": 0.4411, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.640122652053833, "rewards/margins": 1.165264368057251, "rewards/rejected": -3.805387020111084, "step": 1850 }, { "epoch": 0.49, "grad_norm": 6.09375, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -1.1745688915252686, "logits/rejected": -1.0959160327911377, "logps/chosen": -581.6795654296875, "logps/rejected": -661.7645263671875, "loss": 0.5518, "rewards/accuracies": 0.6875, "rewards/chosen": -2.9558444023132324, "rewards/margins": 1.0012142658233643, "rewards/rejected": -3.9570584297180176, "step": 1860 }, { "epoch": 0.49, "grad_norm": 11.875, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -1.1434388160705566, "logits/rejected": -0.9975016713142395, "logps/chosen": -543.2282104492188, "logps/rejected": -666.7279052734375, "loss": 0.4206, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.7089195251464844, "rewards/margins": 1.425309419631958, "rewards/rejected": -4.134228706359863, "step": 1870 }, { "epoch": 0.49, "grad_norm": 11.0, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -1.0967297554016113, "logits/rejected": -0.9473203420639038, "logps/chosen": -603.4558715820312, "logps/rejected": -692.9251708984375, "loss": 0.4466, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.3335928916931152, "rewards/margins": 1.3170349597930908, "rewards/rejected": -4.650628089904785, "step": 1880 }, { "epoch": 0.49, "grad_norm": 13.0, "learning_rate": 2.981282499033009e-06, "logits/chosen": -1.0985617637634277, "logits/rejected": -0.9863265156745911, "logps/chosen": -607.0682373046875, "logps/rejected": -701.697509765625, "loss": 0.5071, "rewards/accuracies": 0.71875, "rewards/chosen": -3.307284116744995, "rewards/margins": 1.200660228729248, "rewards/rejected": -4.507944583892822, "step": 1890 }, { "epoch": 0.5, "grad_norm": 10.0625, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -1.1454726457595825, "logits/rejected": -0.9831218719482422, "logps/chosen": -579.2799682617188, "logps/rejected": -706.1749877929688, "loss": 0.4455, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.979241371154785, "rewards/margins": 1.4865919351577759, "rewards/rejected": -4.4658331871032715, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": -1.0213502645492554, "eval_logits/rejected": -0.891007125377655, "eval_logps/chosen": -566.8839721679688, "eval_logps/rejected": -659.4305419921875, "eval_loss": 0.49620321393013, "eval_rewards/accuracies": 0.7419999837875366, "eval_rewards/chosen": -3.0223195552825928, "eval_rewards/margins": 1.1259068250656128, "eval_rewards/rejected": -4.148226737976074, "eval_runtime": 382.1041, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 1900 }, { "epoch": 0.5, "grad_norm": 10.8125, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -1.1352207660675049, "logits/rejected": -1.0086506605148315, "logps/chosen": -588.1229858398438, "logps/rejected": -648.9054565429688, "loss": 0.5063, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.230529308319092, "rewards/margins": 0.9782280921936035, "rewards/rejected": -4.208757400512695, "step": 1910 }, { "epoch": 0.5, "grad_norm": 13.375, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -1.1499899625778198, "logits/rejected": -1.0288715362548828, "logps/chosen": -581.2144775390625, "logps/rejected": -680.3140869140625, "loss": 0.496, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2150332927703857, "rewards/margins": 1.1205800771713257, "rewards/rejected": -4.335613250732422, "step": 1920 }, { "epoch": 0.51, "grad_norm": 7.3125, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -1.1651884317398071, "logits/rejected": -0.9733787775039673, "logps/chosen": -579.3433227539062, "logps/rejected": -649.0181884765625, "loss": 0.4634, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.129295825958252, "rewards/margins": 1.077726125717163, "rewards/rejected": -4.207022190093994, "step": 1930 }, { "epoch": 0.51, "grad_norm": 11.375, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -1.163464069366455, "logits/rejected": -1.0617696046829224, "logps/chosen": -556.2322998046875, "logps/rejected": -651.5863037109375, "loss": 0.5142, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.971095561981201, "rewards/margins": 1.0506844520568848, "rewards/rejected": -4.021780014038086, "step": 1940 }, { "epoch": 0.51, "grad_norm": 12.8125, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -1.185319185256958, "logits/rejected": -1.0036907196044922, "logps/chosen": -566.9384155273438, "logps/rejected": -628.1956787109375, "loss": 0.5404, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.911479949951172, "rewards/margins": 0.8705935478210449, "rewards/rejected": -3.782073497772217, "step": 1950 }, { "epoch": 0.51, "grad_norm": 7.09375, "learning_rate": 2.823484120195865e-06, "logits/chosen": -1.3058470487594604, "logits/rejected": -1.113465666770935, "logps/chosen": -529.6067504882812, "logps/rejected": -606.2987060546875, "loss": 0.4364, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.5179548263549805, "rewards/margins": 1.1106722354888916, "rewards/rejected": -3.628627061843872, "step": 1960 }, { "epoch": 0.52, "grad_norm": 8.75, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -1.12172269821167, "logits/rejected": -0.968579888343811, "logps/chosen": -553.111083984375, "logps/rejected": -600.1488037109375, "loss": 0.5074, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7947652339935303, "rewards/margins": 0.9243541955947876, "rewards/rejected": -3.7191195487976074, "step": 1970 }, { "epoch": 0.52, "grad_norm": 10.75, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -0.9661678075790405, "logits/rejected": -0.9819488525390625, "logps/chosen": -551.6143798828125, "logps/rejected": -679.9763793945312, "loss": 0.5141, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2225677967071533, "rewards/margins": 1.0803557634353638, "rewards/rejected": -4.30292272567749, "step": 1980 }, { "epoch": 0.52, "grad_norm": 7.625, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -1.0553234815597534, "logits/rejected": -0.9197478294372559, "logps/chosen": -592.0967407226562, "logps/rejected": -687.3663940429688, "loss": 0.4817, "rewards/accuracies": 0.71875, "rewards/chosen": -3.442605495452881, "rewards/margins": 1.1034131050109863, "rewards/rejected": -4.546019077301025, "step": 1990 }, { "epoch": 0.52, "grad_norm": 11.375, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -1.1949965953826904, "logits/rejected": -1.0267183780670166, "logps/chosen": -653.2984619140625, "logps/rejected": -709.1905517578125, "loss": 0.4817, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.6396350860595703, "rewards/margins": 1.1184080839157104, "rewards/rejected": -4.75804328918457, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": -0.9427788257598877, "eval_logits/rejected": -0.8139032125473022, "eval_logps/chosen": -624.5250244140625, "eval_logps/rejected": -711.0853271484375, "eval_loss": 0.49741417169570923, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -3.5987296104431152, "eval_rewards/margins": 1.0660440921783447, "eval_rewards/rejected": -4.664773941040039, "eval_runtime": 382.3502, "eval_samples_per_second": 5.231, "eval_steps_per_second": 0.654, "step": 2000 }, { "epoch": 0.53, "grad_norm": 10.5625, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -1.0279147624969482, "logits/rejected": -0.9855324625968933, "logps/chosen": -615.8596801757812, "logps/rejected": -704.7830200195312, "loss": 0.5276, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.510098934173584, "rewards/margins": 0.954069972038269, "rewards/rejected": -4.464169025421143, "step": 2010 }, { "epoch": 0.53, "grad_norm": 9.625, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -1.177202582359314, "logits/rejected": -1.0146461725234985, "logps/chosen": -568.2487182617188, "logps/rejected": -659.0941162109375, "loss": 0.4583, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.152796745300293, "rewards/margins": 1.0889527797698975, "rewards/rejected": -4.2417497634887695, "step": 2020 }, { "epoch": 0.53, "grad_norm": 13.6875, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -1.2432745695114136, "logits/rejected": -1.0716017484664917, "logps/chosen": -619.502685546875, "logps/rejected": -699.7628173828125, "loss": 0.4576, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.2460086345672607, "rewards/margins": 1.2264302968978882, "rewards/rejected": -4.472439289093018, "step": 2030 }, { "epoch": 0.53, "grad_norm": 9.875, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -1.0596590042114258, "logits/rejected": -1.0028278827667236, "logps/chosen": -631.6947021484375, "logps/rejected": -728.5841674804688, "loss": 0.4471, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.6112823486328125, "rewards/margins": 1.1590890884399414, "rewards/rejected": -4.770371437072754, "step": 2040 }, { "epoch": 0.54, "grad_norm": 11.375, "learning_rate": 2.618747345980904e-06, "logits/chosen": -1.067651629447937, "logits/rejected": -0.8701795339584351, "logps/chosen": -667.7681274414062, "logps/rejected": -718.9295654296875, "loss": 0.5561, "rewards/accuracies": 0.71875, "rewards/chosen": -4.1937079429626465, "rewards/margins": 1.016485333442688, "rewards/rejected": -5.210193634033203, "step": 2050 }, { "epoch": 0.54, "grad_norm": 6.125, "learning_rate": 2.595923867132136e-06, "logits/chosen": -1.1067336797714233, "logits/rejected": -0.9798781275749207, "logps/chosen": -685.84228515625, "logps/rejected": -784.4832763671875, "loss": 0.4938, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.049218654632568, "rewards/margins": 1.2331972122192383, "rewards/rejected": -5.282416343688965, "step": 2060 }, { "epoch": 0.54, "grad_norm": 7.9375, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -1.017889380455017, "logits/rejected": -1.0066477060317993, "logps/chosen": -639.3632202148438, "logps/rejected": -738.4698486328125, "loss": 0.5372, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.8783206939697266, "rewards/margins": 0.9146150350570679, "rewards/rejected": -4.792935848236084, "step": 2070 }, { "epoch": 0.54, "grad_norm": 7.3125, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -1.1123883724212646, "logits/rejected": -0.9572793245315552, "logps/chosen": -607.7706909179688, "logps/rejected": -670.916015625, "loss": 0.5255, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.5239059925079346, "rewards/margins": 0.9338981509208679, "rewards/rejected": -4.457803726196289, "step": 2080 }, { "epoch": 0.55, "grad_norm": 13.375, "learning_rate": 2.527412999094507e-06, "logits/chosen": -1.118983507156372, "logits/rejected": -0.9597452282905579, "logps/chosen": -620.9295043945312, "logps/rejected": -721.0320434570312, "loss": 0.4802, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.2717292308807373, "rewards/margins": 1.1265954971313477, "rewards/rejected": -4.398324489593506, "step": 2090 }, { "epoch": 0.55, "grad_norm": 11.75, "learning_rate": 2.504568922200064e-06, "logits/chosen": -1.075067400932312, "logits/rejected": -0.937818706035614, "logps/chosen": -547.7574462890625, "logps/rejected": -641.327392578125, "loss": 0.5079, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0641894340515137, "rewards/margins": 1.0973466634750366, "rewards/rejected": -4.161535739898682, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": -1.0030875205993652, "eval_logits/rejected": -0.8739129900932312, "eval_logps/chosen": -582.1657104492188, "eval_logps/rejected": -667.5426025390625, "eval_loss": 0.4922982156276703, "eval_rewards/accuracies": 0.7519999742507935, "eval_rewards/chosen": -3.1751370429992676, "eval_rewards/margins": 1.0542099475860596, "eval_rewards/rejected": -4.229346752166748, "eval_runtime": 382.3169, "eval_samples_per_second": 5.231, "eval_steps_per_second": 0.654, "step": 2100 }, { "epoch": 0.55, "grad_norm": 8.1875, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -1.137091875076294, "logits/rejected": -0.9877273440361023, "logps/chosen": -593.8831787109375, "logps/rejected": -648.8990478515625, "loss": 0.5394, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.1807122230529785, "rewards/margins": 0.9622472524642944, "rewards/rejected": -4.1429595947265625, "step": 2110 }, { "epoch": 0.55, "grad_norm": 14.1875, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -1.117033839225769, "logits/rejected": -1.0428097248077393, "logps/chosen": -536.7808227539062, "logps/rejected": -599.55908203125, "loss": 0.4755, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.912168025970459, "rewards/margins": 0.9705360531806946, "rewards/rejected": -3.882704257965088, "step": 2120 }, { "epoch": 0.56, "grad_norm": 9.0625, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -1.1962370872497559, "logits/rejected": -1.0757726430892944, "logps/chosen": -545.7897338867188, "logps/rejected": -594.7244873046875, "loss": 0.5644, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6997714042663574, "rewards/margins": 0.8151930570602417, "rewards/rejected": -3.5149643421173096, "step": 2130 }, { "epoch": 0.56, "grad_norm": 6.75, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -1.2405675649642944, "logits/rejected": -1.0946118831634521, "logps/chosen": -528.01611328125, "logps/rejected": -594.1393432617188, "loss": 0.4643, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6354479789733887, "rewards/margins": 1.046671748161316, "rewards/rejected": -3.682119846343994, "step": 2140 }, { "epoch": 0.56, "grad_norm": 12.5625, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -1.1746861934661865, "logits/rejected": -1.0529396533966064, "logps/chosen": -520.6478271484375, "logps/rejected": -572.0309448242188, "loss": 0.5516, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8694403171539307, "rewards/margins": 0.8386090397834778, "rewards/rejected": -3.7080490589141846, "step": 2150 }, { "epoch": 0.57, "grad_norm": 11.25, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -1.2890937328338623, "logits/rejected": -1.0460366010665894, "logps/chosen": -595.07275390625, "logps/rejected": -639.810791015625, "loss": 0.4788, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0806915760040283, "rewards/margins": 1.094292402267456, "rewards/rejected": -4.174983978271484, "step": 2160 }, { "epoch": 0.57, "grad_norm": 11.5, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -1.1292383670806885, "logits/rejected": -0.9545844793319702, "logps/chosen": -596.5003662109375, "logps/rejected": -650.0792236328125, "loss": 0.5136, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.3818931579589844, "rewards/margins": 0.922932505607605, "rewards/rejected": -4.304825782775879, "step": 2170 }, { "epoch": 0.57, "grad_norm": 12.0, "learning_rate": 2.321962767270724e-06, "logits/chosen": -1.158575415611267, "logits/rejected": -1.0298246145248413, "logps/chosen": -583.9124755859375, "logps/rejected": -629.5396118164062, "loss": 0.5615, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.3395965099334717, "rewards/margins": 0.8280007243156433, "rewards/rejected": -4.16759729385376, "step": 2180 }, { "epoch": 0.57, "grad_norm": 8.75, "learning_rate": 2.299183896281692e-06, "logits/chosen": -1.088763952255249, "logits/rejected": -0.9791523218154907, "logps/chosen": -556.0525512695312, "logps/rejected": -641.457763671875, "loss": 0.5181, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0112056732177734, "rewards/margins": 0.8770611882209778, "rewards/rejected": -3.8882670402526855, "step": 2190 }, { "epoch": 0.58, "grad_norm": 7.25, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -1.2351996898651123, "logits/rejected": -1.1065688133239746, "logps/chosen": -519.6819458007812, "logps/rejected": -608.1278686523438, "loss": 0.477, "rewards/accuracies": 0.75, "rewards/chosen": -2.5824990272521973, "rewards/margins": 1.0897197723388672, "rewards/rejected": -3.6722190380096436, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": -1.0880188941955566, "eval_logits/rejected": -0.9566530585289001, "eval_logps/chosen": -525.9181518554688, "eval_logps/rejected": -601.7401733398438, "eval_loss": 0.48973530530929565, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.612661123275757, "eval_rewards/margins": 0.9586613774299622, "eval_rewards/rejected": -3.571322441101074, "eval_runtime": 382.0537, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.654, "step": 2200 }, { "epoch": 0.58, "grad_norm": 5.71875, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -1.2977464199066162, "logits/rejected": -1.1296590566635132, "logps/chosen": -553.06103515625, "logps/rejected": -621.307861328125, "loss": 0.5291, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7755794525146484, "rewards/margins": 0.8637927174568176, "rewards/rejected": -3.6393723487854004, "step": 2210 }, { "epoch": 0.58, "grad_norm": 7.84375, "learning_rate": 2.230955492793149e-06, "logits/chosen": -1.0942963361740112, "logits/rejected": -1.042419195175171, "logps/chosen": -573.537841796875, "logps/rejected": -642.611572265625, "loss": 0.5884, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.997450351715088, "rewards/margins": 0.8198318481445312, "rewards/rejected": -3.8172824382781982, "step": 2220 }, { "epoch": 0.58, "grad_norm": 5.71875, "learning_rate": 2.208255091531947e-06, "logits/chosen": -1.1044989824295044, "logits/rejected": -1.0208889245986938, "logps/chosen": -553.853515625, "logps/rejected": -632.1079711914062, "loss": 0.4853, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7685611248016357, "rewards/margins": 1.132253646850586, "rewards/rejected": -3.9008147716522217, "step": 2230 }, { "epoch": 0.59, "grad_norm": 11.75, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -1.1996960639953613, "logits/rejected": -1.0961394309997559, "logps/chosen": -557.0603637695312, "logps/rejected": -641.5968017578125, "loss": 0.5037, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.600390672683716, "rewards/margins": 1.021994948387146, "rewards/rejected": -3.6223855018615723, "step": 2240 }, { "epoch": 0.59, "grad_norm": 6.28125, "learning_rate": 2.162929264300107e-06, "logits/chosen": -1.2133983373641968, "logits/rejected": -1.109574556350708, "logps/chosen": -511.7315979003906, "logps/rejected": -615.6173095703125, "loss": 0.416, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4276764392852783, "rewards/margins": 1.2624719142913818, "rewards/rejected": -3.690148115158081, "step": 2250 }, { "epoch": 0.59, "grad_norm": 12.1875, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -1.1181437969207764, "logits/rejected": -0.9982963800430298, "logps/chosen": -565.5302124023438, "logps/rejected": -622.5106811523438, "loss": 0.5759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9434773921966553, "rewards/margins": 0.8478938341140747, "rewards/rejected": -3.7913711071014404, "step": 2260 }, { "epoch": 0.59, "grad_norm": 7.59375, "learning_rate": 2.11771601595586e-06, "logits/chosen": -1.2033512592315674, "logits/rejected": -1.0716886520385742, "logps/chosen": -557.2864379882812, "logps/rejected": -603.1704711914062, "loss": 0.5099, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.752382755279541, "rewards/margins": 0.9814404249191284, "rewards/rejected": -3.73382306098938, "step": 2270 }, { "epoch": 0.6, "grad_norm": 12.8125, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -1.177409052848816, "logits/rejected": -0.9869596362113953, "logps/chosen": -525.6967163085938, "logps/rejected": -594.2974853515625, "loss": 0.4644, "rewards/accuracies": 0.78125, "rewards/chosen": -2.641634941101074, "rewards/margins": 0.9909149408340454, "rewards/rejected": -3.63254976272583, "step": 2280 }, { "epoch": 0.6, "grad_norm": 8.0625, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -1.1395372152328491, "logits/rejected": -1.0176304578781128, "logps/chosen": -522.652099609375, "logps/rejected": -593.3766479492188, "loss": 0.4738, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6371326446533203, "rewards/margins": 1.0305713415145874, "rewards/rejected": -3.667703628540039, "step": 2290 }, { "epoch": 0.6, "grad_norm": 10.4375, "learning_rate": 2.050140250457023e-06, "logits/chosen": -1.2590233087539673, "logits/rejected": -1.052428960800171, "logps/chosen": -578.8065185546875, "logps/rejected": -654.0260009765625, "loss": 0.4829, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0124454498291016, "rewards/margins": 1.0927618741989136, "rewards/rejected": -4.1052069664001465, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": -1.0313422679901123, "eval_logits/rejected": -0.9032019972801208, "eval_logps/chosen": -559.955810546875, "eval_logps/rejected": -654.1510620117188, "eval_loss": 0.4887396991252899, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -2.953037738800049, "eval_rewards/margins": 1.1423934698104858, "eval_rewards/rejected": -4.095431804656982, "eval_runtime": 381.9442, "eval_samples_per_second": 5.236, "eval_steps_per_second": 0.655, "step": 2300 }, { "epoch": 0.6, "grad_norm": 14.625, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -1.0072084665298462, "logits/rejected": -0.9061794281005859, "logps/chosen": -509.2284240722656, "logps/rejected": -580.1068725585938, "loss": 0.5548, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8472893238067627, "rewards/margins": 1.0346016883850098, "rewards/rejected": -3.8818912506103516, "step": 2310 }, { "epoch": 0.61, "grad_norm": 8.25, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -1.1624600887298584, "logits/rejected": -1.04361891746521, "logps/chosen": -552.9761962890625, "logps/rejected": -621.9478759765625, "loss": 0.5046, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9763803482055664, "rewards/margins": 1.018448829650879, "rewards/rejected": -3.9948291778564453, "step": 2320 }, { "epoch": 0.61, "grad_norm": 10.75, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -1.1844617128372192, "logits/rejected": -1.1262612342834473, "logps/chosen": -559.8540649414062, "logps/rejected": -640.3355712890625, "loss": 0.549, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.8085100650787354, "rewards/margins": 0.865519642829895, "rewards/rejected": -3.67402982711792, "step": 2330 }, { "epoch": 0.61, "grad_norm": 10.0625, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -1.1309086084365845, "logits/rejected": -0.9911936521530151, "logps/chosen": -502.54608154296875, "logps/rejected": -611.60693359375, "loss": 0.4877, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.6956772804260254, "rewards/margins": 1.1704528331756592, "rewards/rejected": -3.8661301136016846, "step": 2340 }, { "epoch": 0.62, "grad_norm": 9.1875, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -1.196989893913269, "logits/rejected": -1.0731130838394165, "logps/chosen": -527.642578125, "logps/rejected": -616.3968505859375, "loss": 0.4233, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.6888630390167236, "rewards/margins": 1.2105457782745361, "rewards/rejected": -3.8994088172912598, "step": 2350 }, { "epoch": 0.62, "grad_norm": 11.9375, "learning_rate": 1.916053394469437e-06, "logits/chosen": -1.2187442779541016, "logits/rejected": -1.0278013944625854, "logps/chosen": -555.1328125, "logps/rejected": -650.1771240234375, "loss": 0.5309, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9184062480926514, "rewards/margins": 1.0958768129348755, "rewards/rejected": -4.014283180236816, "step": 2360 }, { "epoch": 0.62, "grad_norm": 9.5625, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -1.173482894897461, "logits/rejected": -0.9950237274169922, "logps/chosen": -563.7232666015625, "logps/rejected": -654.51611328125, "loss": 0.4349, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.9941768646240234, "rewards/margins": 1.1962960958480835, "rewards/rejected": -4.1904730796813965, "step": 2370 }, { "epoch": 0.62, "grad_norm": 10.9375, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -1.0182400941848755, "logits/rejected": -0.9381190538406372, "logps/chosen": -567.86376953125, "logps/rejected": -672.0901489257812, "loss": 0.4496, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1928865909576416, "rewards/margins": 1.1378134489059448, "rewards/rejected": -4.330699920654297, "step": 2380 }, { "epoch": 0.63, "grad_norm": 6.90625, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -1.0953130722045898, "logits/rejected": -0.9763644337654114, "logps/chosen": -591.7195434570312, "logps/rejected": -673.8305053710938, "loss": 0.4954, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1930747032165527, "rewards/margins": 1.0575921535491943, "rewards/rejected": -4.250667095184326, "step": 2390 }, { "epoch": 0.63, "grad_norm": 7.96875, "learning_rate": 1.827612436565286e-06, "logits/chosen": -1.093685507774353, "logits/rejected": -0.9428181648254395, "logps/chosen": -569.9864501953125, "logps/rejected": -664.4702758789062, "loss": 0.4752, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0702195167541504, "rewards/margins": 1.1502970457077026, "rewards/rejected": -4.220516681671143, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": -0.9764781594276428, "eval_logits/rejected": -0.849520742893219, "eval_logps/chosen": -579.4506225585938, "eval_logps/rejected": -672.75830078125, "eval_loss": 0.49094268679618835, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -3.147986888885498, "eval_rewards/margins": 1.1335173845291138, "eval_rewards/rejected": -4.281503677368164, "eval_runtime": 382.2569, "eval_samples_per_second": 5.232, "eval_steps_per_second": 0.654, "step": 2400 }, { "epoch": 0.63, "grad_norm": 18.625, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -1.1393061876296997, "logits/rejected": -1.0437672138214111, "logps/chosen": -565.1177978515625, "logps/rejected": -631.9932861328125, "loss": 0.5436, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2091946601867676, "rewards/margins": 0.9168522953987122, "rewards/rejected": -4.126046180725098, "step": 2410 }, { "epoch": 0.63, "grad_norm": 10.375, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -1.182935118675232, "logits/rejected": -1.0437054634094238, "logps/chosen": -523.6812133789062, "logps/rejected": -616.8724975585938, "loss": 0.4678, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.713310480117798, "rewards/margins": 1.1654255390167236, "rewards/rejected": -3.8787360191345215, "step": 2420 }, { "epoch": 0.64, "grad_norm": 14.125, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -1.1824162006378174, "logits/rejected": -1.051477313041687, "logps/chosen": -526.3547973632812, "logps/rejected": -624.6488647460938, "loss": 0.4987, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6958136558532715, "rewards/margins": 1.1019628047943115, "rewards/rejected": -3.797776460647583, "step": 2430 }, { "epoch": 0.64, "grad_norm": 9.375, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -1.1761425733566284, "logits/rejected": -0.9889799952507019, "logps/chosen": -560.6347045898438, "logps/rejected": -604.340576171875, "loss": 0.5198, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.900243043899536, "rewards/margins": 1.019816279411316, "rewards/rejected": -3.9200592041015625, "step": 2440 }, { "epoch": 0.64, "grad_norm": 11.8125, "learning_rate": 1.718338084156254e-06, "logits/chosen": -1.1455858945846558, "logits/rejected": -0.9903894662857056, "logps/chosen": -568.4344482421875, "logps/rejected": -638.8942260742188, "loss": 0.4578, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8164334297180176, "rewards/margins": 1.0884320735931396, "rewards/rejected": -3.9048657417297363, "step": 2450 }, { "epoch": 0.64, "grad_norm": 9.625, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -1.2747197151184082, "logits/rejected": -1.101963758468628, "logps/chosen": -554.3800659179688, "logps/rejected": -616.3612060546875, "loss": 0.4412, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.774376392364502, "rewards/margins": 1.1384481191635132, "rewards/rejected": -3.9128241539001465, "step": 2460 }, { "epoch": 0.65, "grad_norm": 9.5625, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -1.2133910655975342, "logits/rejected": -1.1471474170684814, "logps/chosen": -530.1273193359375, "logps/rejected": -630.1476440429688, "loss": 0.5062, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7732110023498535, "rewards/margins": 0.9591614007949829, "rewards/rejected": -3.7323715686798096, "step": 2470 }, { "epoch": 0.65, "grad_norm": 11.875, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -1.0105046033859253, "logits/rejected": -0.9159660339355469, "logps/chosen": -560.3009643554688, "logps/rejected": -653.0996704101562, "loss": 0.4182, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.069490909576416, "rewards/margins": 1.270485520362854, "rewards/rejected": -4.3399763107299805, "step": 2480 }, { "epoch": 0.65, "grad_norm": 13.4375, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -1.0919368267059326, "logits/rejected": -1.0423280000686646, "logps/chosen": -547.108154296875, "logps/rejected": -651.2943725585938, "loss": 0.4663, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1227710247039795, "rewards/margins": 1.1962798833847046, "rewards/rejected": -4.3190507888793945, "step": 2490 }, { "epoch": 0.65, "grad_norm": 14.625, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -1.1967922449111938, "logits/rejected": -1.0710703134536743, "logps/chosen": -573.5470581054688, "logps/rejected": -666.3677978515625, "loss": 0.5249, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.163433313369751, "rewards/margins": 1.0070708990097046, "rewards/rejected": -4.170504093170166, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": -1.0434505939483643, "eval_logits/rejected": -0.9135813117027283, "eval_logps/chosen": -574.00927734375, "eval_logps/rejected": -664.8961791992188, "eval_loss": 0.4891022741794586, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -3.0935721397399902, "eval_rewards/margins": 1.1093100309371948, "eval_rewards/rejected": -4.202882289886475, "eval_runtime": 382.3246, "eval_samples_per_second": 5.231, "eval_steps_per_second": 0.654, "step": 2500 }, { "epoch": 0.66, "grad_norm": 9.4375, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -1.1978858709335327, "logits/rejected": -1.0786705017089844, "logps/chosen": -573.3375244140625, "logps/rejected": -648.0001831054688, "loss": 0.4737, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.959294080734253, "rewards/margins": 1.132147192955017, "rewards/rejected": -4.0914411544799805, "step": 2510 }, { "epoch": 0.66, "grad_norm": 7.875, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -1.1913158893585205, "logits/rejected": -0.9305517077445984, "logps/chosen": -603.19873046875, "logps/rejected": -671.5530395507812, "loss": 0.4638, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1415326595306396, "rewards/margins": 1.2662583589553833, "rewards/rejected": -4.4077911376953125, "step": 2520 }, { "epoch": 0.66, "grad_norm": 6.15625, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -1.1753239631652832, "logits/rejected": -1.0243064165115356, "logps/chosen": -557.4925537109375, "logps/rejected": -671.5239868164062, "loss": 0.4264, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1072232723236084, "rewards/margins": 1.246586561203003, "rewards/rejected": -4.353809833526611, "step": 2530 }, { "epoch": 0.66, "grad_norm": 8.5625, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -1.0654633045196533, "logits/rejected": -0.9947797656059265, "logps/chosen": -555.5715942382812, "logps/rejected": -674.6041259765625, "loss": 0.428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1076245307922363, "rewards/margins": 1.2515560388565063, "rewards/rejected": -4.359180927276611, "step": 2540 }, { "epoch": 0.67, "grad_norm": 17.5, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -1.117201328277588, "logits/rejected": -1.024710774421692, "logps/chosen": -542.8455200195312, "logps/rejected": -661.6935424804688, "loss": 0.4846, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.0537521839141846, "rewards/margins": 1.2326675653457642, "rewards/rejected": -4.286419868469238, "step": 2550 }, { "epoch": 0.67, "grad_norm": 13.75, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -1.1161174774169922, "logits/rejected": -0.9579364061355591, "logps/chosen": -589.6568603515625, "logps/rejected": -655.3709716796875, "loss": 0.4618, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.976780414581299, "rewards/margins": 1.220205307006836, "rewards/rejected": -4.196985721588135, "step": 2560 }, { "epoch": 0.67, "grad_norm": 18.625, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -1.1335794925689697, "logits/rejected": -1.004740595817566, "logps/chosen": -549.504150390625, "logps/rejected": -625.6862182617188, "loss": 0.631, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -3.0777907371520996, "rewards/margins": 0.8784114122390747, "rewards/rejected": -3.9562020301818848, "step": 2570 }, { "epoch": 0.68, "grad_norm": 6.03125, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -1.2103271484375, "logits/rejected": -1.1048699617385864, "logps/chosen": -521.5680541992188, "logps/rejected": -641.9281616210938, "loss": 0.4666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6569151878356934, "rewards/margins": 1.1702333688735962, "rewards/rejected": -3.8271484375, "step": 2580 }, { "epoch": 0.68, "grad_norm": 13.0, "learning_rate": 1.421763837748016e-06, "logits/chosen": -1.1741114854812622, "logits/rejected": -1.0814844369888306, "logps/chosen": -523.6945190429688, "logps/rejected": -640.1383056640625, "loss": 0.4441, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7306861877441406, "rewards/margins": 1.2494643926620483, "rewards/rejected": -3.9801506996154785, "step": 2590 }, { "epoch": 0.68, "grad_norm": 10.75, "learning_rate": 1.401198464962021e-06, "logits/chosen": -1.2068405151367188, "logits/rejected": -1.0479636192321777, "logps/chosen": -556.838623046875, "logps/rejected": -625.3237915039062, "loss": 0.4596, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.882551908493042, "rewards/margins": 1.0710302591323853, "rewards/rejected": -3.953582286834717, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": -1.0548917055130005, "eval_logits/rejected": -0.9263830184936523, "eval_logps/chosen": -559.5697631835938, "eval_logps/rejected": -654.4569702148438, "eval_loss": 0.493943989276886, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -2.9491782188415527, "eval_rewards/margins": 1.149312973022461, "eval_rewards/rejected": -4.098491191864014, "eval_runtime": 381.8434, "eval_samples_per_second": 5.238, "eval_steps_per_second": 0.655, "step": 2600 }, { "epoch": 0.68, "grad_norm": 8.5, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -1.2618989944458008, "logits/rejected": -1.1420848369598389, "logps/chosen": -562.00146484375, "logps/rejected": -670.0994873046875, "loss": 0.4435, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.889355182647705, "rewards/margins": 1.2824347019195557, "rewards/rejected": -4.17179012298584, "step": 2610 }, { "epoch": 0.69, "grad_norm": 29.125, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -1.0890090465545654, "logits/rejected": -0.966164767742157, "logps/chosen": -583.3985595703125, "logps/rejected": -678.4222412109375, "loss": 0.5331, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2020103931427, "rewards/margins": 1.1710442304611206, "rewards/rejected": -4.373054504394531, "step": 2620 }, { "epoch": 0.69, "grad_norm": 6.84375, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -1.2348748445510864, "logits/rejected": -1.083888053894043, "logps/chosen": -578.0357666015625, "logps/rejected": -659.4061279296875, "loss": 0.522, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.088522434234619, "rewards/margins": 1.0845201015472412, "rewards/rejected": -4.173042297363281, "step": 2630 }, { "epoch": 0.69, "grad_norm": 8.6875, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -1.1974236965179443, "logits/rejected": -1.0507824420928955, "logps/chosen": -558.0233764648438, "logps/rejected": -667.1177978515625, "loss": 0.4239, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.2156710624694824, "rewards/margins": 1.2272260189056396, "rewards/rejected": -4.442896842956543, "step": 2640 }, { "epoch": 0.69, "grad_norm": 7.34375, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -1.2283174991607666, "logits/rejected": -1.0654624700546265, "logps/chosen": -604.8272705078125, "logps/rejected": -724.4739379882812, "loss": 0.4118, "rewards/accuracies": 0.8125, "rewards/chosen": -3.079939365386963, "rewards/margins": 1.4414037466049194, "rewards/rejected": -4.521343231201172, "step": 2650 }, { "epoch": 0.7, "grad_norm": 23.625, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -1.1807067394256592, "logits/rejected": -1.0574986934661865, "logps/chosen": -610.4517822265625, "logps/rejected": -721.9064331054688, "loss": 0.489, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.385681629180908, "rewards/margins": 1.2347917556762695, "rewards/rejected": -4.620473384857178, "step": 2660 }, { "epoch": 0.7, "grad_norm": 12.5, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -1.129962682723999, "logits/rejected": -1.0201483964920044, "logps/chosen": -585.4744262695312, "logps/rejected": -686.8712158203125, "loss": 0.4794, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2754268646240234, "rewards/margins": 1.2443504333496094, "rewards/rejected": -4.519776821136475, "step": 2670 }, { "epoch": 0.7, "grad_norm": 12.25, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -1.155137300491333, "logits/rejected": -1.012924313545227, "logps/chosen": -587.5916748046875, "logps/rejected": -667.8207397460938, "loss": 0.5131, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.2043495178222656, "rewards/margins": 1.134204626083374, "rewards/rejected": -4.338554859161377, "step": 2680 }, { "epoch": 0.7, "grad_norm": 12.4375, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -1.1951662302017212, "logits/rejected": -1.140353798866272, "logps/chosen": -567.091552734375, "logps/rejected": -681.1925048828125, "loss": 0.4925, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0157859325408936, "rewards/margins": 1.1822477579116821, "rewards/rejected": -4.198033332824707, "step": 2690 }, { "epoch": 0.71, "grad_norm": 15.0625, "learning_rate": 1.20087039953583e-06, "logits/chosen": -1.2230998277664185, "logits/rejected": -1.1086806058883667, "logps/chosen": -558.0277099609375, "logps/rejected": -655.5286865234375, "loss": 0.5152, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9380927085876465, "rewards/margins": 1.2388523817062378, "rewards/rejected": -4.176945209503174, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": -1.052660346031189, "eval_logits/rejected": -0.9249356985092163, "eval_logps/chosen": -566.6193237304688, "eval_logps/rejected": -660.3236083984375, "eval_loss": 0.49224671721458435, "eval_rewards/accuracies": 0.7440000176429749, "eval_rewards/chosen": -3.0196733474731445, "eval_rewards/margins": 1.1374843120574951, "eval_rewards/rejected": -4.1571574211120605, "eval_runtime": 382.3055, "eval_samples_per_second": 5.231, "eval_steps_per_second": 0.654, "step": 2700 }, { "epoch": 0.71, "grad_norm": 10.625, "learning_rate": 1.181406963063507e-06, "logits/chosen": -1.1344083547592163, "logits/rejected": -1.0651142597198486, "logps/chosen": -557.28125, "logps/rejected": -663.6448974609375, "loss": 0.5133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9143428802490234, "rewards/margins": 1.0695984363555908, "rewards/rejected": -3.9839415550231934, "step": 2710 }, { "epoch": 0.71, "grad_norm": 6.84375, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -1.2466278076171875, "logits/rejected": -1.1265995502471924, "logps/chosen": -571.1409301757812, "logps/rejected": -636.3128662109375, "loss": 0.5366, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.799872875213623, "rewards/margins": 0.9532085657119751, "rewards/rejected": -3.7530815601348877, "step": 2720 }, { "epoch": 0.71, "grad_norm": 9.8125, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -1.1743571758270264, "logits/rejected": -1.024549126625061, "logps/chosen": -524.5095825195312, "logps/rejected": -638.3724365234375, "loss": 0.3937, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7037060260772705, "rewards/margins": 1.5533134937286377, "rewards/rejected": -4.257019519805908, "step": 2730 }, { "epoch": 0.72, "grad_norm": 9.5625, "learning_rate": 1.123683721144223e-06, "logits/chosen": -1.186992883682251, "logits/rejected": -1.0803272724151611, "logps/chosen": -567.0985107421875, "logps/rejected": -677.031005859375, "loss": 0.4245, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.834251880645752, "rewards/margins": 1.422716498374939, "rewards/rejected": -4.256968021392822, "step": 2740 }, { "epoch": 0.72, "grad_norm": 6.96875, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -1.1976065635681152, "logits/rejected": -1.1182498931884766, "logps/chosen": -537.647216796875, "logps/rejected": -647.6975708007812, "loss": 0.5195, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8253092765808105, "rewards/margins": 1.0392690896987915, "rewards/rejected": -3.8645782470703125, "step": 2750 }, { "epoch": 0.72, "grad_norm": 10.9375, "learning_rate": 1.085773492015028e-06, "logits/chosen": -1.1978458166122437, "logits/rejected": -1.0323292016983032, "logps/chosen": -516.9109497070312, "logps/rejected": -612.7794189453125, "loss": 0.4273, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7053141593933105, "rewards/margins": 1.3017933368682861, "rewards/rejected": -4.007107257843018, "step": 2760 }, { "epoch": 0.72, "grad_norm": 18.5, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -1.15623140335083, "logits/rejected": -1.0121409893035889, "logps/chosen": -561.9942626953125, "logps/rejected": -639.5707397460938, "loss": 0.5067, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.893584728240967, "rewards/margins": 1.0627275705337524, "rewards/rejected": -3.956312656402588, "step": 2770 }, { "epoch": 0.73, "grad_norm": 9.4375, "learning_rate": 1.048335603051291e-06, "logits/chosen": -1.1546833515167236, "logits/rejected": -1.0220603942871094, "logps/chosen": -599.4776611328125, "logps/rejected": -710.4974975585938, "loss": 0.4331, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1167383193969727, "rewards/margins": 1.39237380027771, "rewards/rejected": -4.5091118812561035, "step": 2780 }, { "epoch": 0.73, "grad_norm": 9.75, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -1.1516591310501099, "logits/rejected": -1.0285645723342896, "logps/chosen": -560.0816650390625, "logps/rejected": -675.4591064453125, "loss": 0.4367, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9946718215942383, "rewards/margins": 1.4317247867584229, "rewards/rejected": -4.42639684677124, "step": 2790 }, { "epoch": 0.73, "grad_norm": 10.9375, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -1.1821314096450806, "logits/rejected": -1.0650185346603394, "logps/chosen": -576.8660278320312, "logps/rejected": -679.5147705078125, "loss": 0.4518, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.115962266921997, "rewards/margins": 1.1937239170074463, "rewards/rejected": -4.309685707092285, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": -1.053481936454773, "eval_logits/rejected": -0.9260234236717224, "eval_logps/chosen": -571.3138427734375, "eval_logps/rejected": -668.0293579101562, "eval_loss": 0.49084553122520447, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -3.066617965698242, "eval_rewards/margins": 1.1675963401794434, "eval_rewards/rejected": -4.2342143058776855, "eval_runtime": 382.1708, "eval_samples_per_second": 5.233, "eval_steps_per_second": 0.654, "step": 2800 }, { "epoch": 0.74, "grad_norm": 10.625, "learning_rate": 9.930917156425477e-07, "logits/chosen": -1.1559561491012573, "logits/rejected": -1.0568530559539795, "logps/chosen": -582.1268310546875, "logps/rejected": -691.96875, "loss": 0.5368, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.210031509399414, "rewards/margins": 1.1272989511489868, "rewards/rejected": -4.337330341339111, "step": 2810 }, { "epoch": 0.74, "grad_norm": 15.25, "learning_rate": 9.749266994893756e-07, "logits/chosen": -1.0973955392837524, "logits/rejected": -0.9485132098197937, "logps/chosen": -550.6517333984375, "logps/rejected": -629.6903686523438, "loss": 0.5621, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0995354652404785, "rewards/margins": 0.9246597290039062, "rewards/rejected": -4.024195671081543, "step": 2820 }, { "epoch": 0.74, "grad_norm": 15.3125, "learning_rate": 9.56889026517913e-07, "logits/chosen": -1.1514110565185547, "logits/rejected": -1.0361002683639526, "logps/chosen": -582.6224365234375, "logps/rejected": -664.3800659179688, "loss": 0.5019, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.2128403186798096, "rewards/margins": 1.0774794816970825, "rewards/rejected": -4.290319442749023, "step": 2830 }, { "epoch": 0.74, "grad_norm": 7.03125, "learning_rate": 9.389802028686617e-07, "logits/chosen": -1.2338387966156006, "logits/rejected": -1.1366431713104248, "logps/chosen": -566.8738403320312, "logps/rejected": -616.0252685546875, "loss": 0.5826, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0610568523406982, "rewards/margins": 0.8211328387260437, "rewards/rejected": -3.882189989089966, "step": 2840 }, { "epoch": 0.75, "grad_norm": 9.75, "learning_rate": 9.212017239232427e-07, "logits/chosen": -1.1542332172393799, "logits/rejected": -1.017268180847168, "logps/chosen": -568.286376953125, "logps/rejected": -668.4588623046875, "loss": 0.4741, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9533185958862305, "rewards/margins": 1.2311924695968628, "rewards/rejected": -4.184511184692383, "step": 2850 }, { "epoch": 0.75, "grad_norm": 10.0625, "learning_rate": 9.03555074179533e-07, "logits/chosen": -1.1374441385269165, "logits/rejected": -1.1105449199676514, "logps/chosen": -544.0662231445312, "logps/rejected": -676.7945556640625, "loss": 0.446, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.872863292694092, "rewards/margins": 1.2689627408981323, "rewards/rejected": -4.1418256759643555, "step": 2860 }, { "epoch": 0.75, "grad_norm": 14.5625, "learning_rate": 8.860417271277067e-07, "logits/chosen": -1.263672947883606, "logits/rejected": -1.2044599056243896, "logps/chosen": -563.6286010742188, "logps/rejected": -651.6553955078125, "loss": 0.4788, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9439358711242676, "rewards/margins": 0.9601505398750305, "rewards/rejected": -3.9040865898132324, "step": 2870 }, { "epoch": 0.75, "grad_norm": 8.75, "learning_rate": 8.686631451272029e-07, "logits/chosen": -1.2087829113006592, "logits/rejected": -1.0665159225463867, "logps/chosen": -564.14892578125, "logps/rejected": -660.1915893554688, "loss": 0.4861, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1072099208831787, "rewards/margins": 1.2149550914764404, "rewards/rejected": -4.322165489196777, "step": 2880 }, { "epoch": 0.76, "grad_norm": 8.625, "learning_rate": 8.514207792846168e-07, "logits/chosen": -1.2422146797180176, "logits/rejected": -1.1245746612548828, "logps/chosen": -556.6324462890625, "logps/rejected": -642.3776245117188, "loss": 0.4902, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.0577263832092285, "rewards/margins": 1.1418030261993408, "rewards/rejected": -4.19952917098999, "step": 2890 }, { "epoch": 0.76, "grad_norm": 8.0625, "learning_rate": 8.343160693325356e-07, "logits/chosen": -1.1230237483978271, "logits/rejected": -1.0151801109313965, "logps/chosen": -566.5771484375, "logps/rejected": -679.12646484375, "loss": 0.5018, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1102497577667236, "rewards/margins": 1.1686756610870361, "rewards/rejected": -4.278925895690918, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": -1.059489130973816, "eval_logits/rejected": -0.9320334792137146, "eval_logps/chosen": -574.426025390625, "eval_logps/rejected": -668.4285278320312, "eval_loss": 0.4876534342765808, "eval_rewards/accuracies": 0.7465000152587891, "eval_rewards/chosen": -3.0977394580841064, "eval_rewards/margins": 1.1404662132263184, "eval_rewards/rejected": -4.238205432891846, "eval_runtime": 382.316, "eval_samples_per_second": 5.231, "eval_steps_per_second": 0.654, "step": 2900 }, { "epoch": 0.76, "grad_norm": 8.125, "learning_rate": 8.173504435093174e-07, "logits/chosen": -1.1287494897842407, "logits/rejected": -0.955623984336853, "logps/chosen": -547.8873291015625, "logps/rejected": -640.971923828125, "loss": 0.477, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0631890296936035, "rewards/margins": 1.2520211935043335, "rewards/rejected": -4.315210342407227, "step": 2910 }, { "epoch": 0.76, "grad_norm": 6.5625, "learning_rate": 8.00525318439836e-07, "logits/chosen": -1.158349871635437, "logits/rejected": -1.0400350093841553, "logps/chosen": -583.4833374023438, "logps/rejected": -674.5729370117188, "loss": 0.5408, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0665407180786133, "rewards/margins": 0.9871135950088501, "rewards/rejected": -4.053654193878174, "step": 2920 }, { "epoch": 0.77, "grad_norm": 7.6875, "learning_rate": 7.838420990171927e-07, "logits/chosen": -1.2469195127487183, "logits/rejected": -1.0984286069869995, "logps/chosen": -567.165283203125, "logps/rejected": -650.6731567382812, "loss": 0.5017, "rewards/accuracies": 0.75, "rewards/chosen": -2.977949380874634, "rewards/margins": 1.090990424156189, "rewards/rejected": -4.068940162658691, "step": 2930 }, { "epoch": 0.77, "grad_norm": 7.5625, "learning_rate": 7.673021782854084e-07, "logits/chosen": -1.1217727661132812, "logits/rejected": -0.9839452505111694, "logps/chosen": -561.6543579101562, "logps/rejected": -643.6695556640625, "loss": 0.4898, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.009657382965088, "rewards/margins": 1.2389792203903198, "rewards/rejected": -4.248636722564697, "step": 2940 }, { "epoch": 0.77, "grad_norm": 9.9375, "learning_rate": 7.509069373231039e-07, "logits/chosen": -1.129913568496704, "logits/rejected": -1.0110609531402588, "logps/chosen": -554.6318969726562, "logps/rejected": -622.6085205078125, "loss": 0.5441, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0194828510284424, "rewards/margins": 0.9275726079940796, "rewards/rejected": -3.9470553398132324, "step": 2950 }, { "epoch": 0.77, "grad_norm": 7.71875, "learning_rate": 7.346577451281822e-07, "logits/chosen": -1.1370588541030884, "logits/rejected": -1.0633890628814697, "logps/chosen": -551.51123046875, "logps/rejected": -660.9559936523438, "loss": 0.4596, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.892915725708008, "rewards/margins": 1.3355481624603271, "rewards/rejected": -4.228463649749756, "step": 2960 }, { "epoch": 0.78, "grad_norm": 18.625, "learning_rate": 7.185559585035138e-07, "logits/chosen": -1.1904377937316895, "logits/rejected": -1.0318008661270142, "logps/chosen": -591.028564453125, "logps/rejected": -693.4492797851562, "loss": 0.4733, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.0825228691101074, "rewards/margins": 1.1828874349594116, "rewards/rejected": -4.26540994644165, "step": 2970 }, { "epoch": 0.78, "grad_norm": 7.78125, "learning_rate": 7.026029219436504e-07, "logits/chosen": -1.2153565883636475, "logits/rejected": -1.0524095296859741, "logps/chosen": -546.4449462890625, "logps/rejected": -655.5341186523438, "loss": 0.4637, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9415435791015625, "rewards/margins": 1.2188594341278076, "rewards/rejected": -4.160403251647949, "step": 2980 }, { "epoch": 0.78, "grad_norm": 9.5, "learning_rate": 6.867999675225523e-07, "logits/chosen": -1.2460225820541382, "logits/rejected": -1.1109936237335205, "logps/chosen": -518.8594970703125, "logps/rejected": -621.4867553710938, "loss": 0.4754, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8794169425964355, "rewards/margins": 1.1684167385101318, "rewards/rejected": -4.047833442687988, "step": 2990 }, { "epoch": 0.79, "grad_norm": 10.375, "learning_rate": 6.711484147823663e-07, "logits/chosen": -1.1477627754211426, "logits/rejected": -1.0689526796340942, "logps/chosen": -520.4979858398438, "logps/rejected": -650.7647094726562, "loss": 0.4592, "rewards/accuracies": 0.75, "rewards/chosen": -2.862730026245117, "rewards/margins": 1.2541263103485107, "rewards/rejected": -4.116856575012207, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": -1.0787907838821411, "eval_logits/rejected": -0.9509702324867249, "eval_logps/chosen": -563.9876708984375, "eval_logps/rejected": -655.9471435546875, "eval_loss": 0.48733198642730713, "eval_rewards/accuracies": 0.7459999918937683, "eval_rewards/chosen": -2.993356466293335, "eval_rewards/margins": 1.1200352907180786, "eval_rewards/rejected": -4.113391399383545, "eval_runtime": 382.8007, "eval_samples_per_second": 5.225, "eval_steps_per_second": 0.653, "step": 3000 }, { "epoch": 0.79, "grad_norm": 11.25, "learning_rate": 6.556495706232413e-07, "logits/chosen": -1.1598658561706543, "logits/rejected": -1.0877033472061157, "logps/chosen": -578.8084106445312, "logps/rejected": -665.4705200195312, "loss": 0.5453, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.1011300086975098, "rewards/margins": 1.050903081893921, "rewards/rejected": -4.152032852172852, "step": 3010 }, { "epoch": 0.79, "grad_norm": 9.8125, "learning_rate": 6.403047291942057e-07, "logits/chosen": -1.0840625762939453, "logits/rejected": -0.9331427812576294, "logps/chosen": -521.8424682617188, "logps/rejected": -612.9337768554688, "loss": 0.495, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.95574688911438, "rewards/margins": 1.1547616720199585, "rewards/rejected": -4.110508441925049, "step": 3020 }, { "epoch": 0.79, "grad_norm": 12.375, "learning_rate": 6.251151717851023e-07, "logits/chosen": -1.1582403182983398, "logits/rejected": -1.0655838251113892, "logps/chosen": -526.1175537109375, "logps/rejected": -627.6626586914062, "loss": 0.4861, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9271697998046875, "rewards/margins": 1.1482912302017212, "rewards/rejected": -4.075460910797119, "step": 3030 }, { "epoch": 0.8, "grad_norm": 6.25, "learning_rate": 6.100821667196041e-07, "logits/chosen": -1.323209524154663, "logits/rejected": -1.0637619495391846, "logps/chosen": -561.310791015625, "logps/rejected": -609.7546997070312, "loss": 0.4726, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.856724500656128, "rewards/margins": 1.1393463611602783, "rewards/rejected": -3.9960708618164062, "step": 3040 }, { "epoch": 0.8, "grad_norm": 55.5, "learning_rate": 5.952069692493062e-07, "logits/chosen": -1.1378008127212524, "logits/rejected": -1.033092737197876, "logps/chosen": -511.969482421875, "logps/rejected": -648.4796752929688, "loss": 0.4149, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8365421295166016, "rewards/margins": 1.3306509256362915, "rewards/rejected": -4.1671929359436035, "step": 3050 }, { "epoch": 0.8, "grad_norm": 10.5625, "learning_rate": 5.80490821448918e-07, "logits/chosen": -1.1030110120773315, "logits/rejected": -1.0928280353546143, "logps/chosen": -549.79052734375, "logps/rejected": -727.48876953125, "loss": 0.4284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8726837635040283, "rewards/margins": 1.3525440692901611, "rewards/rejected": -4.225228309631348, "step": 3060 }, { "epoch": 0.8, "grad_norm": 9.0625, "learning_rate": 5.659349521125459e-07, "logits/chosen": -1.2849022150039673, "logits/rejected": -1.2295571565628052, "logps/chosen": -560.9410400390625, "logps/rejected": -645.2173461914062, "loss": 0.4973, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7952258586883545, "rewards/margins": 1.048758864402771, "rewards/rejected": -3.843984603881836, "step": 3070 }, { "epoch": 0.81, "grad_norm": 6.90625, "learning_rate": 5.5154057665109e-07, "logits/chosen": -1.2467188835144043, "logits/rejected": -1.0997190475463867, "logps/chosen": -557.9779052734375, "logps/rejected": -661.7819213867188, "loss": 0.4889, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9708826541900635, "rewards/margins": 1.3023018836975098, "rewards/rejected": -4.273184776306152, "step": 3080 }, { "epoch": 0.81, "grad_norm": 8.25, "learning_rate": 5.373088969907586e-07, "logits/chosen": -1.2789522409439087, "logits/rejected": -1.0984174013137817, "logps/chosen": -573.76123046875, "logps/rejected": -637.1810302734375, "loss": 0.4581, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.969475269317627, "rewards/margins": 1.136474370956421, "rewards/rejected": -4.105949878692627, "step": 3090 }, { "epoch": 0.81, "grad_norm": 8.625, "learning_rate": 5.23241101472709e-07, "logits/chosen": -1.1879446506500244, "logits/rejected": -1.0638211965560913, "logps/chosen": -563.8876342773438, "logps/rejected": -645.8259887695312, "loss": 0.4905, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.8758597373962402, "rewards/margins": 1.0394397974014282, "rewards/rejected": -3.9152991771698, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": -1.0740700960159302, "eval_logits/rejected": -0.9464629888534546, "eval_logps/chosen": -562.904296875, "eval_logps/rejected": -656.5853271484375, "eval_loss": 0.48781928420066833, "eval_rewards/accuracies": 0.7429999709129333, "eval_rewards/chosen": -2.982522964477539, "eval_rewards/margins": 1.1372504234313965, "eval_rewards/rejected": -4.1197733879089355, "eval_runtime": 382.0441, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.654, "step": 3100 }, { "epoch": 0.81, "grad_norm": 8.5625, "learning_rate": 5.09338364753818e-07, "logits/chosen": -1.2681617736816406, "logits/rejected": -1.0949214696884155, "logps/chosen": -578.9161376953125, "logps/rejected": -673.3041381835938, "loss": 0.5304, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9349396228790283, "rewards/margins": 1.1018182039260864, "rewards/rejected": -4.036757469177246, "step": 3110 }, { "epoch": 0.82, "grad_norm": 10.125, "learning_rate": 4.956018477086005e-07, "logits/chosen": -1.2264713048934937, "logits/rejected": -1.0714534521102905, "logps/chosen": -574.7757568359375, "logps/rejected": -661.6316528320312, "loss": 0.5111, "rewards/accuracies": 0.71875, "rewards/chosen": -3.056429624557495, "rewards/margins": 1.1420024633407593, "rewards/rejected": -4.198431968688965, "step": 3120 }, { "epoch": 0.82, "grad_norm": 11.625, "learning_rate": 4.820326973322764e-07, "logits/chosen": -1.1282936334609985, "logits/rejected": -1.0485918521881104, "logps/chosen": -566.1331787109375, "logps/rejected": -665.1694946289062, "loss": 0.5658, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1516964435577393, "rewards/margins": 1.0504977703094482, "rewards/rejected": -4.202193737030029, "step": 3130 }, { "epoch": 0.82, "grad_norm": 10.25, "learning_rate": 4.686320466449981e-07, "logits/chosen": -1.1074498891830444, "logits/rejected": -0.9338695406913757, "logps/chosen": -530.6743774414062, "logps/rejected": -670.0709838867188, "loss": 0.4495, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.914252281188965, "rewards/margins": 1.4155068397521973, "rewards/rejected": -4.329759120941162, "step": 3140 }, { "epoch": 0.82, "grad_norm": 8.1875, "learning_rate": 4.554010145972418e-07, "logits/chosen": -1.2932242155075073, "logits/rejected": -1.10805344581604, "logps/chosen": -569.38818359375, "logps/rejected": -671.8726806640625, "loss": 0.551, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.0557217597961426, "rewards/margins": 1.1381008625030518, "rewards/rejected": -4.193822860717773, "step": 3150 }, { "epoch": 0.83, "grad_norm": 8.75, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -1.1201348304748535, "logits/rejected": -1.0320645570755005, "logps/chosen": -575.7613525390625, "logps/rejected": -669.4164428710938, "loss": 0.5272, "rewards/accuracies": 0.75, "rewards/chosen": -3.0022788047790527, "rewards/margins": 1.0558512210845947, "rewards/rejected": -4.058130264282227, "step": 3160 }, { "epoch": 0.83, "grad_norm": 6.5625, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -1.114639401435852, "logits/rejected": -0.9161049127578735, "logps/chosen": -552.2017211914062, "logps/rejected": -653.031005859375, "loss": 0.4203, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.864920139312744, "rewards/margins": 1.3247652053833008, "rewards/rejected": -4.189684867858887, "step": 3170 }, { "epoch": 0.83, "grad_norm": 9.25, "learning_rate": 4.167366067969381e-07, "logits/chosen": -1.216722846031189, "logits/rejected": -1.144590139389038, "logps/chosen": -516.3905639648438, "logps/rejected": -639.3621826171875, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": -2.9348020553588867, "rewards/margins": 0.9914267659187317, "rewards/rejected": -3.9262290000915527, "step": 3180 }, { "epoch": 0.83, "grad_norm": 6.125, "learning_rate": 4.041949541732826e-07, "logits/chosen": -1.1988582611083984, "logits/rejected": -1.1241180896759033, "logps/chosen": -567.2083740234375, "logps/rejected": -659.522216796875, "loss": 0.5194, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.033240795135498, "rewards/margins": 1.0658702850341797, "rewards/rejected": -4.0991106033325195, "step": 3190 }, { "epoch": 0.84, "grad_norm": 10.1875, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -1.1077312231063843, "logits/rejected": -1.0953607559204102, "logps/chosen": -557.6238403320312, "logps/rejected": -689.794921875, "loss": 0.485, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.9987998008728027, "rewards/margins": 1.1976327896118164, "rewards/rejected": -4.196433067321777, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": -1.0807329416275024, "eval_logits/rejected": -0.9531368613243103, "eval_logps/chosen": -559.239990234375, "eval_logps/rejected": -652.1516723632812, "eval_loss": 0.4873969852924347, "eval_rewards/accuracies": 0.7455000281333923, "eval_rewards/chosen": -2.9458799362182617, "eval_rewards/margins": 1.1295573711395264, "eval_rewards/rejected": -4.075437068939209, "eval_runtime": 381.6886, "eval_samples_per_second": 5.24, "eval_steps_per_second": 0.655, "step": 3200 }, { "epoch": 0.84, "grad_norm": 8.3125, "learning_rate": 3.796376788925771e-07, "logits/chosen": -1.1163936853408813, "logits/rejected": -1.0554332733154297, "logps/chosen": -541.477294921875, "logps/rejected": -619.0269165039062, "loss": 0.4946, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.800494909286499, "rewards/margins": 1.0160177946090698, "rewards/rejected": -3.8165130615234375, "step": 3210 }, { "epoch": 0.84, "grad_norm": 7.625, "learning_rate": 3.676241067609465e-07, "logits/chosen": -1.2064073085784912, "logits/rejected": -1.0841269493103027, "logps/chosen": -582.91259765625, "logps/rejected": -648.9725952148438, "loss": 0.5138, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.892125129699707, "rewards/margins": 1.091489315032959, "rewards/rejected": -3.983614444732666, "step": 3220 }, { "epoch": 0.85, "grad_norm": 15.1875, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -1.123425841331482, "logits/rejected": -1.0302746295928955, "logps/chosen": -566.611328125, "logps/rejected": -648.7924194335938, "loss": 0.5326, "rewards/accuracies": 0.71875, "rewards/chosen": -3.048774003982544, "rewards/margins": 0.9477185010910034, "rewards/rejected": -3.996492385864258, "step": 3230 }, { "epoch": 0.85, "grad_norm": 11.5625, "learning_rate": 3.44132109080447e-07, "logits/chosen": -1.3182079792022705, "logits/rejected": -1.1424782276153564, "logps/chosen": -549.4573364257812, "logps/rejected": -634.7244873046875, "loss": 0.4425, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8423376083374023, "rewards/margins": 1.2275350093841553, "rewards/rejected": -4.069872856140137, "step": 3240 }, { "epoch": 0.85, "grad_norm": 12.125, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -1.2581889629364014, "logits/rejected": -1.1089788675308228, "logps/chosen": -572.9723510742188, "logps/rejected": -676.4666137695312, "loss": 0.4207, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.796302318572998, "rewards/margins": 1.2862600088119507, "rewards/rejected": -4.082562446594238, "step": 3250 }, { "epoch": 0.85, "grad_norm": 14.3125, "learning_rate": 3.213601537627195e-07, "logits/chosen": -1.1619012355804443, "logits/rejected": -1.0473229885101318, "logps/chosen": -574.4371948242188, "logps/rejected": -662.361083984375, "loss": 0.5456, "rewards/accuracies": 0.6875, "rewards/chosen": -3.1731839179992676, "rewards/margins": 1.0502443313598633, "rewards/rejected": -4.223428249359131, "step": 3260 }, { "epoch": 0.86, "grad_norm": 12.0, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -1.2181814908981323, "logits/rejected": -1.1487758159637451, "logps/chosen": -531.4067993164062, "logps/rejected": -627.6771240234375, "loss": 0.5005, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8366494178771973, "rewards/margins": 1.1211111545562744, "rewards/rejected": -3.9577605724334717, "step": 3270 }, { "epoch": 0.86, "grad_norm": 14.25, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -1.147289514541626, "logits/rejected": -1.1335127353668213, "logps/chosen": -557.3380737304688, "logps/rejected": -666.0869140625, "loss": 0.5042, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.828235626220703, "rewards/margins": 1.0581908226013184, "rewards/rejected": -3.8864264488220215, "step": 3280 }, { "epoch": 0.86, "grad_norm": 6.96875, "learning_rate": 2.885688711862136e-07, "logits/chosen": -1.1895829439163208, "logits/rejected": -1.1866552829742432, "logps/chosen": -561.8271484375, "logps/rejected": -686.0377197265625, "loss": 0.51, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -3.007755756378174, "rewards/margins": 1.261817216873169, "rewards/rejected": -4.269573211669922, "step": 3290 }, { "epoch": 0.86, "grad_norm": 7.6875, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -1.1794744729995728, "logits/rejected": -1.0672075748443604, "logps/chosen": -547.8685302734375, "logps/rejected": -650.7493286132812, "loss": 0.5157, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9051201343536377, "rewards/margins": 1.0670777559280396, "rewards/rejected": -3.972198009490967, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": -1.0755009651184082, "eval_logits/rejected": -0.9480787515640259, "eval_logps/chosen": -560.1488647460938, "eval_logps/rejected": -652.9912109375, "eval_loss": 0.4874354600906372, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -2.9549689292907715, "eval_rewards/margins": 1.128864049911499, "eval_rewards/rejected": -4.083832740783691, "eval_runtime": 383.0008, "eval_samples_per_second": 5.222, "eval_steps_per_second": 0.653, "step": 3300 }, { "epoch": 0.87, "grad_norm": 12.75, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -1.1829874515533447, "logits/rejected": -1.0443121194839478, "logps/chosen": -508.328857421875, "logps/rejected": -573.6398315429688, "loss": 0.5093, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.785529851913452, "rewards/margins": 1.0893114805221558, "rewards/rejected": -3.8748409748077393, "step": 3310 }, { "epoch": 0.87, "grad_norm": 9.375, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -1.1776726245880127, "logits/rejected": -1.0596325397491455, "logps/chosen": -558.5306396484375, "logps/rejected": -649.6300048828125, "loss": 0.4969, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.942483425140381, "rewards/margins": 1.159317135810852, "rewards/rejected": -4.101800441741943, "step": 3320 }, { "epoch": 0.87, "grad_norm": 13.0625, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -1.3345047235488892, "logits/rejected": -1.1562585830688477, "logps/chosen": -557.7296142578125, "logps/rejected": -661.87109375, "loss": 0.4702, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8015923500061035, "rewards/margins": 1.3364170789718628, "rewards/rejected": -4.138009548187256, "step": 3330 }, { "epoch": 0.87, "grad_norm": 11.875, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -1.1994738578796387, "logits/rejected": -1.097899079322815, "logps/chosen": -579.8328857421875, "logps/rejected": -674.6861572265625, "loss": 0.4926, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.1103997230529785, "rewards/margins": 1.0284258127212524, "rewards/rejected": -4.138825416564941, "step": 3340 }, { "epoch": 0.88, "grad_norm": 8.0, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -1.1523630619049072, "logits/rejected": -1.0521692037582397, "logps/chosen": -551.5980224609375, "logps/rejected": -671.2841186523438, "loss": 0.4455, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8580057621002197, "rewards/margins": 1.2792617082595825, "rewards/rejected": -4.137267112731934, "step": 3350 }, { "epoch": 0.88, "grad_norm": 8.5625, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -1.2028191089630127, "logits/rejected": -1.0033330917358398, "logps/chosen": -563.6111450195312, "logps/rejected": -704.59228515625, "loss": 0.4213, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9093270301818848, "rewards/margins": 1.4688284397125244, "rewards/rejected": -4.378155708312988, "step": 3360 }, { "epoch": 0.88, "grad_norm": 7.3125, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -1.1482703685760498, "logits/rejected": -1.0027369260787964, "logps/chosen": -540.7926635742188, "logps/rejected": -644.9227294921875, "loss": 0.439, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.006060838699341, "rewards/margins": 1.2438604831695557, "rewards/rejected": -4.2499213218688965, "step": 3370 }, { "epoch": 0.88, "grad_norm": 13.625, "learning_rate": 2.002580803659873e-07, "logits/chosen": -1.1630356311798096, "logits/rejected": -1.0312206745147705, "logps/chosen": -559.203125, "logps/rejected": -652.8720092773438, "loss": 0.4651, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.046480417251587, "rewards/margins": 1.1182465553283691, "rewards/rejected": -4.164727210998535, "step": 3380 }, { "epoch": 0.89, "grad_norm": 6.71875, "learning_rate": 1.913954575837826e-07, "logits/chosen": -1.2169429063796997, "logits/rejected": -0.9856022596359253, "logps/chosen": -575.2197875976562, "logps/rejected": -634.4151000976562, "loss": 0.4808, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0310537815093994, "rewards/margins": 1.0889911651611328, "rewards/rejected": -4.120044708251953, "step": 3390 }, { "epoch": 0.89, "grad_norm": 10.3125, "learning_rate": 1.827256026165028e-07, "logits/chosen": -1.2307440042495728, "logits/rejected": -1.0502979755401611, "logps/chosen": -592.2626342773438, "logps/rejected": -664.5699462890625, "loss": 0.4474, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.771921396255493, "rewards/margins": 1.2936856746673584, "rewards/rejected": -4.065607070922852, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": -1.077279806137085, "eval_logits/rejected": -0.9499141573905945, "eval_logps/chosen": -561.6380615234375, "eval_logps/rejected": -654.8016967773438, "eval_loss": 0.4871050715446472, "eval_rewards/accuracies": 0.7434999942779541, "eval_rewards/chosen": -2.969860553741455, "eval_rewards/margins": 1.1320772171020508, "eval_rewards/rejected": -4.101937770843506, "eval_runtime": 382.1089, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 3400 }, { "epoch": 0.89, "grad_norm": 11.5625, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -1.174843430519104, "logits/rejected": -1.0021690130233765, "logps/chosen": -579.2442626953125, "logps/rejected": -661.9432373046875, "loss": 0.4255, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.9789488315582275, "rewards/margins": 1.2606755495071411, "rewards/rejected": -4.239624500274658, "step": 3410 }, { "epoch": 0.9, "grad_norm": 16.25, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -1.2912896871566772, "logits/rejected": -1.1392004489898682, "logps/chosen": -576.8416748046875, "logps/rejected": -653.64501953125, "loss": 0.4901, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.956543207168579, "rewards/margins": 1.119319200515747, "rewards/rejected": -4.075861930847168, "step": 3420 }, { "epoch": 0.9, "grad_norm": 9.625, "learning_rate": 1.578798030665385e-07, "logits/chosen": -1.2196199893951416, "logits/rejected": -1.0388673543930054, "logps/chosen": -565.8033447265625, "logps/rejected": -686.4707641601562, "loss": 0.4313, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9110517501831055, "rewards/margins": 1.3773367404937744, "rewards/rejected": -4.288388729095459, "step": 3430 }, { "epoch": 0.9, "grad_norm": 8.25, "learning_rate": 1.499880968037165e-07, "logits/chosen": -1.1975353956222534, "logits/rejected": -1.0588737726211548, "logps/chosen": -544.4766845703125, "logps/rejected": -618.7376098632812, "loss": 0.513, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.888631820678711, "rewards/margins": 1.1052820682525635, "rewards/rejected": -3.9939143657684326, "step": 3440 }, { "epoch": 0.9, "grad_norm": 14.5625, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -1.230802297592163, "logits/rejected": -1.1439770460128784, "logps/chosen": -553.6980590820312, "logps/rejected": -644.7520751953125, "loss": 0.4489, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8526644706726074, "rewards/margins": 1.1686475276947021, "rewards/rejected": -4.0213117599487305, "step": 3450 }, { "epoch": 0.91, "grad_norm": 10.6875, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -1.1431211233139038, "logits/rejected": -1.1035680770874023, "logps/chosen": -544.6209106445312, "logps/rejected": -662.7022705078125, "loss": 0.4784, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9836173057556152, "rewards/margins": 1.2009574174880981, "rewards/rejected": -4.184575080871582, "step": 3460 }, { "epoch": 0.91, "grad_norm": 9.625, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -1.2415331602096558, "logits/rejected": -1.0389716625213623, "logps/chosen": -587.2742309570312, "logps/rejected": -644.300048828125, "loss": 0.4742, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9073996543884277, "rewards/margins": 1.1453628540039062, "rewards/rejected": -4.052762508392334, "step": 3470 }, { "epoch": 0.91, "grad_norm": 8.875, "learning_rate": 1.203898683888713e-07, "logits/chosen": -1.2313424348831177, "logits/rejected": -1.1037070751190186, "logps/chosen": -548.36962890625, "logps/rejected": -643.1497192382812, "loss": 0.5615, "rewards/accuracies": 0.6875, "rewards/chosen": -3.043722152709961, "rewards/margins": 0.9787699580192566, "rewards/rejected": -4.022491931915283, "step": 3480 }, { "epoch": 0.91, "grad_norm": 8.5625, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -1.2611653804779053, "logits/rejected": -1.1225281953811646, "logps/chosen": -574.4703369140625, "logps/rejected": -650.3907470703125, "loss": 0.495, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.924811840057373, "rewards/margins": 1.0881900787353516, "rewards/rejected": -4.013001918792725, "step": 3490 }, { "epoch": 0.92, "grad_norm": 6.875, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -1.2225737571716309, "logits/rejected": -1.0173273086547852, "logps/chosen": -580.7540283203125, "logps/rejected": -643.2467651367188, "loss": 0.5379, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9996070861816406, "rewards/margins": 1.0754629373550415, "rewards/rejected": -4.075070381164551, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": -1.074249267578125, "eval_logits/rejected": -0.9468256831169128, "eval_logps/chosen": -561.2808227539062, "eval_logps/rejected": -654.5006103515625, "eval_loss": 0.48737701773643494, "eval_rewards/accuracies": 0.7429999709129333, "eval_rewards/chosen": -2.9662883281707764, "eval_rewards/margins": 1.1326382160186768, "eval_rewards/rejected": -4.098926544189453, "eval_runtime": 382.1229, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 3500 }, { "epoch": 0.92, "grad_norm": 9.1875, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -1.1973202228546143, "logits/rejected": -1.0237270593643188, "logps/chosen": -544.4231567382812, "logps/rejected": -646.541015625, "loss": 0.4368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.902569532394409, "rewards/margins": 1.1967476606369019, "rewards/rejected": -4.0993170738220215, "step": 3510 }, { "epoch": 0.92, "grad_norm": 8.4375, "learning_rate": 9.397045634168766e-08, "logits/chosen": -1.227426290512085, "logits/rejected": -1.1496341228485107, "logps/chosen": -555.9089965820312, "logps/rejected": -687.0352783203125, "loss": 0.4491, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.873260974884033, "rewards/margins": 1.3088066577911377, "rewards/rejected": -4.182066917419434, "step": 3520 }, { "epoch": 0.92, "grad_norm": 10.9375, "learning_rate": 8.78665232332998e-08, "logits/chosen": -1.1654760837554932, "logits/rejected": -1.0858592987060547, "logps/chosen": -537.4627685546875, "logps/rejected": -640.0810546875, "loss": 0.489, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.0675268173217773, "rewards/margins": 1.0338232517242432, "rewards/rejected": -4.101349830627441, "step": 3530 }, { "epoch": 0.93, "grad_norm": 8.1875, "learning_rate": 8.196400257606208e-08, "logits/chosen": -1.2670751810073853, "logits/rejected": -1.104811191558838, "logps/chosen": -576.2312622070312, "logps/rejected": -708.0988159179688, "loss": 0.4292, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9314615726470947, "rewards/margins": 1.371382236480713, "rewards/rejected": -4.3028435707092285, "step": 3540 }, { "epoch": 0.93, "grad_norm": 9.5, "learning_rate": 7.626338722875076e-08, "logits/chosen": -1.1996467113494873, "logits/rejected": -1.1349631547927856, "logps/chosen": -546.021240234375, "logps/rejected": -657.2860107421875, "loss": 0.503, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9192748069763184, "rewards/margins": 1.102920651435852, "rewards/rejected": -4.022195816040039, "step": 3550 }, { "epoch": 0.93, "grad_norm": 5.84375, "learning_rate": 7.076515319110688e-08, "logits/chosen": -1.2043834924697876, "logits/rejected": -1.0872790813446045, "logps/chosen": -546.2125854492188, "logps/rejected": -628.4691162109375, "loss": 0.5091, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.899247646331787, "rewards/margins": 1.2382572889328003, "rewards/rejected": -4.1375041007995605, "step": 3560 }, { "epoch": 0.93, "grad_norm": 8.1875, "learning_rate": 6.54697595640899e-08, "logits/chosen": -1.2246639728546143, "logits/rejected": -1.1050646305084229, "logps/chosen": -588.5670166015625, "logps/rejected": -679.2962646484375, "loss": 0.4803, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9634485244750977, "rewards/margins": 1.1589770317077637, "rewards/rejected": -4.1224260330200195, "step": 3570 }, { "epoch": 0.94, "grad_norm": 9.125, "learning_rate": 6.037764851154426e-08, "logits/chosen": -1.2126811742782593, "logits/rejected": -1.1511167287826538, "logps/chosen": -555.2306518554688, "logps/rejected": -671.8084716796875, "loss": 0.5096, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.911715030670166, "rewards/margins": 1.1182584762573242, "rewards/rejected": -4.029973030090332, "step": 3580 }, { "epoch": 0.94, "grad_norm": 7.1875, "learning_rate": 5.548924522327748e-08, "logits/chosen": -1.1890180110931396, "logits/rejected": -1.0672190189361572, "logps/chosen": -549.8150634765625, "logps/rejected": -647.8394775390625, "loss": 0.4832, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.880577564239502, "rewards/margins": 1.1271222829818726, "rewards/rejected": -4.007699489593506, "step": 3590 }, { "epoch": 0.94, "grad_norm": 11.0625, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -1.109243392944336, "logits/rejected": -1.0201053619384766, "logps/chosen": -514.1246337890625, "logps/rejected": -630.8916625976562, "loss": 0.464, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8855831623077393, "rewards/margins": 1.122081995010376, "rewards/rejected": -4.007665157318115, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": -1.0748145580291748, "eval_logits/rejected": -0.9474833607673645, "eval_logps/chosen": -561.028564453125, "eval_logps/rejected": -654.279052734375, "eval_loss": 0.48736903071403503, "eval_rewards/accuracies": 0.7425000071525574, "eval_rewards/chosen": -2.9637651443481445, "eval_rewards/margins": 1.1329458951950073, "eval_rewards/rejected": -4.096711158752441, "eval_runtime": 382.7111, "eval_samples_per_second": 5.226, "eval_steps_per_second": 0.653, "step": 3600 }, { "epoch": 0.94, "grad_norm": 9.125, "learning_rate": 4.632517761702815e-08, "logits/chosen": -1.1433720588684082, "logits/rejected": -1.0008645057678223, "logps/chosen": -530.3574829101562, "logps/rejected": -652.87255859375, "loss": 0.4428, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9491429328918457, "rewards/margins": 1.3483526706695557, "rewards/rejected": -4.2974958419799805, "step": 3610 }, { "epoch": 0.95, "grad_norm": 11.125, "learning_rate": 4.205027849605359e-08, "logits/chosen": -1.1681492328643799, "logits/rejected": -1.0669422149658203, "logps/chosen": -553.4034423828125, "logps/rejected": -626.2314453125, "loss": 0.5421, "rewards/accuracies": 0.75, "rewards/chosen": -3.0548007488250732, "rewards/margins": 1.0290553569793701, "rewards/rejected": -4.083855628967285, "step": 3620 }, { "epoch": 0.95, "grad_norm": 9.9375, "learning_rate": 3.798061746947995e-08, "logits/chosen": -1.2855480909347534, "logits/rejected": -1.1476643085479736, "logps/chosen": -555.6473999023438, "logps/rejected": -633.9293823242188, "loss": 0.4785, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.9149746894836426, "rewards/margins": 1.1746852397918701, "rewards/rejected": -4.089660167694092, "step": 3630 }, { "epoch": 0.95, "grad_norm": 10.25, "learning_rate": 3.411653435283158e-08, "logits/chosen": -1.1988470554351807, "logits/rejected": -0.9911161661148071, "logps/chosen": -560.5934448242188, "logps/rejected": -617.925048828125, "loss": 0.4611, "rewards/accuracies": 0.75, "rewards/chosen": -2.873368740081787, "rewards/margins": 1.1307556629180908, "rewards/rejected": -4.004124641418457, "step": 3640 }, { "epoch": 0.96, "grad_norm": 9.5, "learning_rate": 3.04583517959367e-08, "logits/chosen": -1.2440365552902222, "logits/rejected": -1.0937076807022095, "logps/chosen": -528.578125, "logps/rejected": -617.3880004882812, "loss": 0.448, "rewards/accuracies": 0.75, "rewards/chosen": -2.7646141052246094, "rewards/margins": 1.2126356363296509, "rewards/rejected": -3.9772496223449707, "step": 3650 }, { "epoch": 0.96, "grad_norm": 10.0, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -1.1879501342773438, "logits/rejected": -1.1580005884170532, "logps/chosen": -571.6791381835938, "logps/rejected": -661.9193725585938, "loss": 0.5788, "rewards/accuracies": 0.6875, "rewards/chosen": -3.06877064704895, "rewards/margins": 0.8969556093215942, "rewards/rejected": -3.965726375579834, "step": 3660 }, { "epoch": 0.96, "grad_norm": 11.625, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -1.303144931793213, "logits/rejected": -1.1418662071228027, "logps/chosen": -583.8892822265625, "logps/rejected": -663.1383666992188, "loss": 0.5326, "rewards/accuracies": 0.75, "rewards/chosen": -3.113860845565796, "rewards/margins": 1.1326040029525757, "rewards/rejected": -4.246464729309082, "step": 3670 }, { "epoch": 0.96, "grad_norm": 13.4375, "learning_rate": 2.072217594089765e-08, "logits/chosen": -1.156292200088501, "logits/rejected": -1.146905541419983, "logps/chosen": -559.3345336914062, "logps/rejected": -672.4237060546875, "loss": 0.4237, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.9779343605041504, "rewards/margins": 1.253351092338562, "rewards/rejected": -4.231285572052002, "step": 3680 }, { "epoch": 0.97, "grad_norm": 8.9375, "learning_rate": 1.789047789459375e-08, "logits/chosen": -1.266901969909668, "logits/rejected": -1.072322964668274, "logps/chosen": -611.783203125, "logps/rejected": -680.0989379882812, "loss": 0.5071, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9480648040771484, "rewards/margins": 1.1735531091690063, "rewards/rejected": -4.121617794036865, "step": 3690 }, { "epoch": 0.97, "grad_norm": 8.9375, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -1.1054164171218872, "logits/rejected": -0.985053539276123, "logps/chosen": -610.8778076171875, "logps/rejected": -699.9169921875, "loss": 0.4729, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.1426022052764893, "rewards/margins": 1.1523752212524414, "rewards/rejected": -4.29497766494751, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": -1.0769954919815063, "eval_logits/rejected": -0.9495205879211426, "eval_logps/chosen": -561.3129272460938, "eval_logps/rejected": -654.6014404296875, "eval_loss": 0.48729926347732544, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -2.966609239578247, "eval_rewards/margins": 1.1333256959915161, "eval_rewards/rejected": -4.099935054779053, "eval_runtime": 382.1, "eval_samples_per_second": 5.234, "eval_steps_per_second": 0.654, "step": 3700 }, { "epoch": 0.97, "grad_norm": 11.75, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -1.0827583074569702, "logits/rejected": -1.0175631046295166, "logps/chosen": -534.8372192382812, "logps/rejected": -647.8695678710938, "loss": 0.4762, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9847655296325684, "rewards/margins": 1.2061764001846313, "rewards/rejected": -4.190942287445068, "step": 3710 }, { "epoch": 0.97, "grad_norm": 11.1875, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -1.0798698663711548, "logits/rejected": -0.9149328470230103, "logps/chosen": -549.2965087890625, "logps/rejected": -631.1814575195312, "loss": 0.5133, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.001096248626709, "rewards/margins": 1.178213119506836, "rewards/rejected": -4.179308891296387, "step": 3720 }, { "epoch": 0.98, "grad_norm": 9.5, "learning_rate": 8.638344782207486e-09, "logits/chosen": -1.1081641912460327, "logits/rejected": -1.0127241611480713, "logps/chosen": -530.3636474609375, "logps/rejected": -619.5350341796875, "loss": 0.4791, "rewards/accuracies": 0.75, "rewards/chosen": -2.8507590293884277, "rewards/margins": 1.1195757389068604, "rewards/rejected": -3.97033429145813, "step": 3730 }, { "epoch": 0.98, "grad_norm": 10.0, "learning_rate": 6.84494196844715e-09, "logits/chosen": -1.16922128200531, "logits/rejected": -1.0506504774093628, "logps/chosen": -563.3178100585938, "logps/rejected": -685.6429443359375, "loss": 0.4573, "rewards/accuracies": 0.75, "rewards/chosen": -2.912113666534424, "rewards/margins": 1.3391534090042114, "rewards/rejected": -4.251267433166504, "step": 3740 }, { "epoch": 0.98, "grad_norm": 10.1875, "learning_rate": 5.259716884556121e-09, "logits/chosen": -1.2230274677276611, "logits/rejected": -1.0869773626327515, "logps/chosen": -557.9898681640625, "logps/rejected": -660.3572998046875, "loss": 0.4564, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.9352307319641113, "rewards/margins": 1.1718149185180664, "rewards/rejected": -4.107045650482178, "step": 3750 }, { "epoch": 0.98, "grad_norm": 9.75, "learning_rate": 3.882801896372967e-09, "logits/chosen": -1.2255470752716064, "logits/rejected": -1.1375856399536133, "logps/chosen": -556.98193359375, "logps/rejected": -639.6429443359375, "loss": 0.4908, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.91620135307312, "rewards/margins": 1.1449532508850098, "rewards/rejected": -4.061154842376709, "step": 3760 }, { "epoch": 0.99, "grad_norm": 12.25, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -1.242653727531433, "logits/rejected": -1.0747482776641846, "logps/chosen": -574.4716796875, "logps/rejected": -665.8096313476562, "loss": 0.4263, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.901991605758667, "rewards/margins": 1.1683391332626343, "rewards/rejected": -4.070330619812012, "step": 3770 }, { "epoch": 0.99, "grad_norm": 9.4375, "learning_rate": 1.754344691717591e-09, "logits/chosen": -1.1282501220703125, "logits/rejected": -1.0916457176208496, "logps/chosen": -552.8446655273438, "logps/rejected": -669.7666015625, "loss": 0.5197, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.030531406402588, "rewards/margins": 0.9483699798583984, "rewards/rejected": -3.9789013862609863, "step": 3780 }, { "epoch": 0.99, "grad_norm": 13.125, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -1.1534841060638428, "logits/rejected": -0.994836151599884, "logps/chosen": -570.4867553710938, "logps/rejected": -668.6637573242188, "loss": 0.4803, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9619479179382324, "rewards/margins": 1.2098205089569092, "rewards/rejected": -4.171768665313721, "step": 3790 }, { "epoch": 0.99, "grad_norm": 8.5, "learning_rate": 4.602812418974534e-10, "logits/chosen": -1.2624783515930176, "logits/rejected": -1.1238892078399658, "logps/chosen": -582.1685180664062, "logps/rejected": -673.0120239257812, "loss": 0.5017, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.003277540206909, "rewards/margins": 1.1538227796554565, "rewards/rejected": -4.157099723815918, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": -1.0723599195480347, "eval_logits/rejected": -0.9449748396873474, "eval_logps/chosen": -561.3216552734375, "eval_logps/rejected": -654.607177734375, "eval_loss": 0.48731154203414917, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -2.966696262359619, "eval_rewards/margins": 1.133296012878418, "eval_rewards/rejected": -4.099992275238037, "eval_runtime": 382.0182, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.654, "step": 3800 }, { "epoch": 1.0, "grad_norm": 9.875, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -1.171769142150879, "logits/rejected": -1.0495896339416504, "logps/chosen": -542.8326416015625, "logps/rejected": -627.7073974609375, "loss": 0.5191, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9036014080047607, "rewards/margins": 1.005274772644043, "rewards/rejected": -3.9088759422302246, "step": 3810 }, { "epoch": 1.0, "grad_norm": 18.25, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -1.1617281436920166, "logits/rejected": -0.9952475428581238, "logps/chosen": -585.9136962890625, "logps/rejected": -680.009521484375, "loss": 0.4659, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.991471767425537, "rewards/margins": 1.3875491619110107, "rewards/rejected": -4.379020690917969, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.5021860111574015, "train_runtime": 41123.41, "train_samples_per_second": 1.487, "train_steps_per_second": 0.093 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }