{ "best_metric": 0.8629826903343201, "best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.2/lora/orpo-salt/checkpoint-1500", "epoch": 2.9969690846635686, "eval_steps": 500, "global_step": 1854, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01616488179430188, "grad_norm": 16.64879608154297, "learning_rate": 4.999648198770648e-06, "logits/chosen": -2.4989278316497803, "logits/rejected": -2.5208303928375244, "logps/chosen": -1.9139716625213623, "logps/rejected": -3.1082823276519775, "loss": 1.9977, "odds_ratio_loss": 0.8370735049247742, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.19139717519283295, "rewards/margins": 0.1194310411810875, "rewards/rejected": -0.31082823872566223, "sft_loss": 1.9139716625213623, "step": 10 }, { "epoch": 0.03232976358860376, "grad_norm": 13.894062042236328, "learning_rate": 4.998578646361359e-06, "logits/chosen": -2.5156219005584717, "logits/rejected": -2.51640248298645, "logps/chosen": -1.635488748550415, "logps/rejected": -2.132800817489624, "loss": 1.7095, "odds_ratio_loss": 0.7404953241348267, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.16354887187480927, "rewards/margins": 0.04973122477531433, "rewards/rejected": -0.2132801115512848, "sft_loss": 1.635488748550415, "step": 20 }, { "epoch": 0.04849464538290564, "grad_norm": 23.089773178100586, "learning_rate": 4.996791614004449e-06, "logits/chosen": -2.518998861312866, "logits/rejected": -2.544835090637207, "logps/chosen": -1.6531565189361572, "logps/rejected": -2.541318893432617, "loss": 1.7385, "odds_ratio_loss": 0.8539272546768188, "rewards/accuracies": 0.625, "rewards/chosen": -0.165315642952919, "rewards/margins": 0.08881621062755585, "rewards/rejected": -0.25413185358047485, "sft_loss": 1.6531565189361572, "step": 30 }, { "epoch": 0.06465952717720752, "grad_norm": 13.833389282226562, "learning_rate": 4.994287614855618e-06, "logits/chosen": -2.518852472305298, "logits/rejected": -2.551032066345215, "logps/chosen": -1.7646430730819702, "logps/rejected": -2.508850574493408, "loss": 1.8742, "odds_ratio_loss": 1.0958486795425415, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.17646430432796478, "rewards/margins": 0.07442077249288559, "rewards/rejected": -0.2508850693702698, "sft_loss": 1.7646430730819702, "step": 40 }, { "epoch": 0.0808244089715094, "grad_norm": 28.34682846069336, "learning_rate": 4.991067367951343e-06, "logits/chosen": -2.5992355346679688, "logits/rejected": -2.5891082286834717, "logps/chosen": -1.345651388168335, "logps/rejected": -2.2306911945343018, "loss": 1.4115, "odds_ratio_loss": 0.6583842039108276, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13456514477729797, "rewards/margins": 0.08850395679473877, "rewards/rejected": -0.22306910157203674, "sft_loss": 1.345651388168335, "step": 50 }, { "epoch": 0.09698929076581128, "grad_norm": 3.4724316596984863, "learning_rate": 4.987131798002389e-06, "logits/chosen": -2.539771556854248, "logits/rejected": -2.5456976890563965, "logps/chosen": -1.3674490451812744, "logps/rejected": -2.1061840057373047, "loss": 1.4542, "odds_ratio_loss": 0.8671566247940063, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.1367449164390564, "rewards/margins": 0.07387349754571915, "rewards/rejected": -0.21061840653419495, "sft_loss": 1.3674490451812744, "step": 60 }, { "epoch": 0.11315417256011315, "grad_norm": 46.33675003051758, "learning_rate": 4.982482035128285e-06, "logits/chosen": -2.5208637714385986, "logits/rejected": -2.528776168823242, "logps/chosen": -1.4248360395431519, "logps/rejected": -2.067411184310913, "loss": 1.5025, "odds_ratio_loss": 0.7764666676521301, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14248362183570862, "rewards/margins": 0.06425751000642776, "rewards/rejected": -0.2067411243915558, "sft_loss": 1.4248360395431519, "step": 70 }, { "epoch": 0.12931905435441504, "grad_norm": 25.993545532226562, "learning_rate": 4.9771194145328e-06, "logits/chosen": -2.5788090229034424, "logits/rejected": -2.572688341140747, "logps/chosen": -1.0824676752090454, "logps/rejected": -1.7445621490478516, "loss": 1.1449, "odds_ratio_loss": 0.6242043972015381, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10824675858020782, "rewards/margins": 0.06620947271585464, "rewards/rejected": -0.17445623874664307, "sft_loss": 1.0824676752090454, "step": 80 }, { "epoch": 0.1454839361487169, "grad_norm": 19.184228897094727, "learning_rate": 4.971045476120532e-06, "logits/chosen": -2.5863890647888184, "logits/rejected": -2.591404914855957, "logps/chosen": -1.080370306968689, "logps/rejected": -1.753382682800293, "loss": 1.1463, "odds_ratio_loss": 0.6591774821281433, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10803703963756561, "rewards/margins": 0.06730123609304428, "rewards/rejected": -0.1753382831811905, "sft_loss": 1.080370306968689, "step": 90 }, { "epoch": 0.1616488179430188, "grad_norm": 5.7092084884643555, "learning_rate": 4.964261964054713e-06, "logits/chosen": -2.5851123332977295, "logits/rejected": -2.5928287506103516, "logps/chosen": -1.20145583152771, "logps/rejected": -1.920117735862732, "loss": 1.2771, "odds_ratio_loss": 0.7563266754150391, "rewards/accuracies": 0.625, "rewards/chosen": -0.12014558166265488, "rewards/margins": 0.07186620682477951, "rewards/rejected": -0.1920117884874344, "sft_loss": 1.20145583152771, "step": 100 }, { "epoch": 0.17781369973732067, "grad_norm": 4.211212635040283, "learning_rate": 4.956770826256372e-06, "logits/chosen": -2.6192798614501953, "logits/rejected": -2.6177656650543213, "logps/chosen": -1.1085783243179321, "logps/rejected": -1.4738147258758545, "loss": 1.1766, "odds_ratio_loss": 0.6805119514465332, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.11085782200098038, "rewards/margins": 0.03652365505695343, "rewards/rejected": -0.1473814696073532, "sft_loss": 1.1085783243179321, "step": 110 }, { "epoch": 0.19397858153162256, "grad_norm": 3.4872381687164307, "learning_rate": 4.94857421384497e-06, "logits/chosen": -2.602118968963623, "logits/rejected": -2.6089630126953125, "logps/chosen": -1.0341213941574097, "logps/rejected": -1.5845638513565063, "loss": 1.1041, "odds_ratio_loss": 0.6995517611503601, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10341213643550873, "rewards/margins": 0.05504424497485161, "rewards/rejected": -0.15845640003681183, "sft_loss": 1.0341213941574097, "step": 120 }, { "epoch": 0.21014346332592443, "grad_norm": 5.468324661254883, "learning_rate": 4.939674480520701e-06, "logits/chosen": -2.6128063201904297, "logits/rejected": -2.6255507469177246, "logps/chosen": -0.9619969129562378, "logps/rejected": -1.390077829360962, "loss": 1.0297, "odds_ratio_loss": 0.6766607165336609, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09619969129562378, "rewards/margins": 0.04280809685587883, "rewards/rejected": -0.1390077769756317, "sft_loss": 0.9619969129562378, "step": 130 }, { "epoch": 0.2263083451202263, "grad_norm": 5.18142032623291, "learning_rate": 4.930074181888613e-06, "logits/chosen": -2.6814427375793457, "logits/rejected": -2.7020936012268066, "logps/chosen": -0.9705274701118469, "logps/rejected": -1.315450668334961, "loss": 1.0341, "odds_ratio_loss": 0.636103630065918, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09705274552106857, "rewards/margins": 0.03449232131242752, "rewards/rejected": -0.1315450817346573, "sft_loss": 0.9705274701118469, "step": 140 }, { "epoch": 0.2424732269145282, "grad_norm": 1.4752620458602905, "learning_rate": 4.91977607472475e-06, "logits/chosen": -2.704951524734497, "logits/rejected": -2.7246315479278564, "logps/chosen": -1.0248619318008423, "logps/rejected": -1.4426223039627075, "loss": 1.0895, "odds_ratio_loss": 0.6460444331169128, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.10248619318008423, "rewards/margins": 0.04177603870630264, "rewards/rejected": -0.14426222443580627, "sft_loss": 1.0248619318008423, "step": 150 }, { "epoch": 0.2586381087088301, "grad_norm": 2.9540135860443115, "learning_rate": 4.908783116184534e-06, "logits/chosen": -2.671297550201416, "logits/rejected": -2.676952838897705, "logps/chosen": -0.9303582906723022, "logps/rejected": -1.28878653049469, "loss": 0.991, "odds_ratio_loss": 0.6061214208602905, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09303583949804306, "rewards/margins": 0.03584280610084534, "rewards/rejected": -0.1288786381483078, "sft_loss": 0.9303582906723022, "step": 160 }, { "epoch": 0.27480299050313195, "grad_norm": 2.913118839263916, "learning_rate": 4.897098462953598e-06, "logits/chosen": -2.7513809204101562, "logits/rejected": -2.7600345611572266, "logps/chosen": -0.8939758539199829, "logps/rejected": -1.4527159929275513, "loss": 0.9601, "odds_ratio_loss": 0.661632239818573, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08939759433269501, "rewards/margins": 0.05587399750947952, "rewards/rejected": -0.14527159929275513, "sft_loss": 0.8939758539199829, "step": 170 }, { "epoch": 0.2909678722974338, "grad_norm": 1.985352635383606, "learning_rate": 4.884725470341331e-06, "logits/chosen": -2.7102103233337402, "logits/rejected": -2.739673137664795, "logps/chosen": -0.8302527666091919, "logps/rejected": -1.2092260122299194, "loss": 0.8851, "odds_ratio_loss": 0.5487207174301147, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08302526921033859, "rewards/margins": 0.03789733722805977, "rewards/rejected": -0.12092261016368866, "sft_loss": 0.8302527666091919, "step": 180 }, { "epoch": 0.3071327540917357, "grad_norm": 8.031681060791016, "learning_rate": 4.871667691317377e-06, "logits/chosen": -2.764559745788574, "logits/rejected": -2.767064332962036, "logps/chosen": -1.0171376466751099, "logps/rejected": -1.1592780351638794, "loss": 1.0939, "odds_ratio_loss": 0.7678386569023132, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.10171377658843994, "rewards/margins": 0.01421402208507061, "rewards/rejected": -0.1159278005361557, "sft_loss": 1.0171376466751099, "step": 190 }, { "epoch": 0.3232976358860376, "grad_norm": 4.448939323425293, "learning_rate": 4.857928875491392e-06, "logits/chosen": -2.750746965408325, "logits/rejected": -2.7596051692962646, "logps/chosen": -0.8164304494857788, "logps/rejected": -1.0888216495513916, "loss": 0.8794, "odds_ratio_loss": 0.6294754147529602, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08164305239915848, "rewards/margins": 0.027239132672548294, "rewards/rejected": -0.10888218879699707, "sft_loss": 0.8164304494857788, "step": 200 }, { "epoch": 0.33946251768033947, "grad_norm": 2.216554641723633, "learning_rate": 4.843512968036314e-06, "logits/chosen": -2.7625343799591064, "logits/rejected": -2.7599010467529297, "logps/chosen": -0.833400547504425, "logps/rejected": -1.0677030086517334, "loss": 0.8944, "odds_ratio_loss": 0.6096410751342773, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08334006369113922, "rewards/margins": 0.0234302319586277, "rewards/rejected": -0.10677029192447662, "sft_loss": 0.833400547504425, "step": 210 }, { "epoch": 0.35562739947464134, "grad_norm": 1.4112659692764282, "learning_rate": 4.828424108555486e-06, "logits/chosen": -2.807507276535034, "logits/rejected": -2.803765296936035, "logps/chosen": -1.0460469722747803, "logps/rejected": -1.4173492193222046, "loss": 1.1091, "odds_ratio_loss": 0.6301766037940979, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10460470616817474, "rewards/margins": 0.037130214273929596, "rewards/rejected": -0.14173491299152374, "sft_loss": 1.0460469722747803, "step": 220 }, { "epoch": 0.3717922812689432, "grad_norm": 0.9852223992347717, "learning_rate": 4.812666629893957e-06, "logits/chosen": -2.795703649520874, "logits/rejected": -2.8211073875427246, "logps/chosen": -0.891126275062561, "logps/rejected": -1.0855722427368164, "loss": 0.9626, "odds_ratio_loss": 0.7152143716812134, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08911262452602386, "rewards/margins": 0.019444596022367477, "rewards/rejected": -0.10855722427368164, "sft_loss": 0.891126275062561, "step": 230 }, { "epoch": 0.3879571630632451, "grad_norm": 2.482409954071045, "learning_rate": 4.796245056894273e-06, "logits/chosen": -2.757913112640381, "logits/rejected": -2.794553518295288, "logps/chosen": -0.9089745283126831, "logps/rejected": -1.3391778469085693, "loss": 0.9804, "odds_ratio_loss": 0.7146768569946289, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09089745581150055, "rewards/margins": 0.0430203452706337, "rewards/rejected": -0.13391780853271484, "sft_loss": 0.9089745283126831, "step": 240 }, { "epoch": 0.404122044857547, "grad_norm": 1.3123791217803955, "learning_rate": 4.779164105097148e-06, "logits/chosen": -2.796814441680908, "logits/rejected": -2.8013055324554443, "logps/chosen": -0.8589127659797668, "logps/rejected": -1.3229057788848877, "loss": 0.9186, "odds_ratio_loss": 0.5965861082077026, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08589127659797668, "rewards/margins": 0.046399302780628204, "rewards/rejected": -0.1322905719280243, "sft_loss": 0.8589127659797668, "step": 250 }, { "epoch": 0.42028692665184886, "grad_norm": 2.171173095703125, "learning_rate": 4.761428679387373e-06, "logits/chosen": -2.790588617324829, "logits/rejected": -2.7970798015594482, "logps/chosen": -0.8536098599433899, "logps/rejected": -1.0807464122772217, "loss": 0.9168, "odds_ratio_loss": 0.6316367387771606, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08536098152399063, "rewards/margins": 0.022713668644428253, "rewards/rejected": -0.10807464271783829, "sft_loss": 0.8536098599433899, "step": 260 }, { "epoch": 0.4364518084461507, "grad_norm": 3.753523111343384, "learning_rate": 4.7430438725853515e-06, "logits/chosen": -2.7550888061523438, "logits/rejected": -2.766615629196167, "logps/chosen": -0.913661003112793, "logps/rejected": -1.41799795627594, "loss": 0.9739, "odds_ratio_loss": 0.6024969816207886, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09136610478162766, "rewards/margins": 0.0504336841404438, "rewards/rejected": -0.14179977774620056, "sft_loss": 0.913661003112793, "step": 270 }, { "epoch": 0.4526166902404526, "grad_norm": 1.5982986688613892, "learning_rate": 4.724014963984669e-06, "logits/chosen": -2.798797130584717, "logits/rejected": -2.8145482540130615, "logps/chosen": -0.8752357363700867, "logps/rejected": -1.1694762706756592, "loss": 0.9358, "odds_ratio_loss": 0.6060217618942261, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08752357959747314, "rewards/margins": 0.029424061998724937, "rewards/rejected": -0.11694763600826263, "sft_loss": 0.8752357363700867, "step": 280 }, { "epoch": 0.4687815720347545, "grad_norm": 3.8735010623931885, "learning_rate": 4.704347417836116e-06, "logits/chosen": -2.7753589153289795, "logits/rejected": -2.829224109649658, "logps/chosen": -0.7804813385009766, "logps/rejected": -1.1957075595855713, "loss": 0.8432, "odds_ratio_loss": 0.6271591186523438, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07804813235998154, "rewards/margins": 0.04152262955904007, "rewards/rejected": -0.1195707693696022, "sft_loss": 0.7804813385009766, "step": 290 }, { "epoch": 0.4849464538290564, "grad_norm": 2.0640830993652344, "learning_rate": 4.684046881778603e-06, "logits/chosen": -2.8023476600646973, "logits/rejected": -2.8235526084899902, "logps/chosen": -0.8398802876472473, "logps/rejected": -0.9978183507919312, "loss": 0.9045, "odds_ratio_loss": 0.6464654803276062, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08398802578449249, "rewards/margins": 0.015793804079294205, "rewards/rejected": -0.099781833589077, "sft_loss": 0.8398802876472473, "step": 300 }, { "epoch": 0.5011113356233583, "grad_norm": 1.626105785369873, "learning_rate": 4.663119185217409e-06, "logits/chosen": -2.796461343765259, "logits/rejected": -2.8225197792053223, "logps/chosen": -0.8273599743843079, "logps/rejected": -1.096482515335083, "loss": 0.8875, "odds_ratio_loss": 0.6016198396682739, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08273600041866302, "rewards/margins": 0.026912260800600052, "rewards/rejected": -0.10964826494455338, "sft_loss": 0.8273599743843079, "step": 310 }, { "epoch": 0.5172762174176602, "grad_norm": 1.5098748207092285, "learning_rate": 4.641570337650232e-06, "logits/chosen": -2.847539186477661, "logits/rejected": -2.85341215133667, "logps/chosen": -0.7699432969093323, "logps/rejected": -1.0820213556289673, "loss": 0.8268, "odds_ratio_loss": 0.5688191652297974, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07699433714151382, "rewards/margins": 0.03120780549943447, "rewards/rejected": -0.10820214450359344, "sft_loss": 0.7699432969093323, "step": 320 }, { "epoch": 0.533441099211962, "grad_norm": 1.3477349281311035, "learning_rate": 4.61940652694154e-06, "logits/chosen": -2.7625374794006348, "logits/rejected": -2.8054728507995605, "logps/chosen": -0.8576439023017883, "logps/rejected": -1.2374662160873413, "loss": 0.9224, "odds_ratio_loss": 0.6476989984512329, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08576439321041107, "rewards/margins": 0.03798223286867142, "rewards/rejected": -0.12374663352966309, "sft_loss": 0.8576439023017883, "step": 330 }, { "epoch": 0.5496059810062639, "grad_norm": 2.094233274459839, "learning_rate": 4.596634117545689e-06, "logits/chosen": -2.8440895080566406, "logits/rejected": -2.8477485179901123, "logps/chosen": -0.8450831174850464, "logps/rejected": -1.1874289512634277, "loss": 0.9084, "odds_ratio_loss": 0.6333492994308472, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08450832217931747, "rewards/margins": 0.03423457592725754, "rewards/rejected": -0.11874288320541382, "sft_loss": 0.8450831174850464, "step": 340 }, { "epoch": 0.5657708628005658, "grad_norm": 1.2610398530960083, "learning_rate": 4.573259648679335e-06, "logits/chosen": -2.8393020629882812, "logits/rejected": -2.8172850608825684, "logps/chosen": -0.8293860554695129, "logps/rejected": -1.1484854221343994, "loss": 0.8924, "odds_ratio_loss": 0.6304416060447693, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08293859660625458, "rewards/margins": 0.03190993517637253, "rewards/rejected": -0.1148485392332077, "sft_loss": 0.8293860554695129, "step": 350 }, { "epoch": 0.5819357445948676, "grad_norm": 7.934630870819092, "learning_rate": 4.549289832443663e-06, "logits/chosen": -2.8159756660461426, "logits/rejected": -2.8409628868103027, "logps/chosen": -0.885659396648407, "logps/rejected": -1.2282092571258545, "loss": 0.9498, "odds_ratio_loss": 0.641811192035675, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08856594562530518, "rewards/margins": 0.03425499051809311, "rewards/rejected": -0.12282093614339828, "sft_loss": 0.885659396648407, "step": 360 }, { "epoch": 0.5981006263891695, "grad_norm": 1.7960658073425293, "learning_rate": 4.524731551896978e-06, "logits/chosen": -2.8090755939483643, "logits/rejected": -2.825777292251587, "logps/chosen": -0.7784116864204407, "logps/rejected": -0.9700002670288086, "loss": 0.8424, "odds_ratio_loss": 0.6396910548210144, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07784116268157959, "rewards/margins": 0.01915885880589485, "rewards/rejected": -0.09700002521276474, "sft_loss": 0.7784116864204407, "step": 370 }, { "epoch": 0.6142655081834714, "grad_norm": 3.268920421600342, "learning_rate": 4.4995918590781925e-06, "logits/chosen": -2.853820562362671, "logits/rejected": -2.8512935638427734, "logps/chosen": -0.8428764343261719, "logps/rejected": -1.0172072649002075, "loss": 0.9104, "odds_ratio_loss": 0.6751636266708374, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08428764343261719, "rewards/margins": 0.01743307337164879, "rewards/rejected": -0.10172072798013687, "sft_loss": 0.8428764343261719, "step": 380 }, { "epoch": 0.6304303899777733, "grad_norm": 1.0598444938659668, "learning_rate": 4.473877972981797e-06, "logits/chosen": -2.7993013858795166, "logits/rejected": -2.789777994155884, "logps/chosen": -0.8297500610351562, "logps/rejected": -1.0850985050201416, "loss": 0.8895, "odds_ratio_loss": 0.5971348881721497, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08297501504421234, "rewards/margins": 0.025534838438034058, "rewards/rejected": -0.1085098534822464, "sft_loss": 0.8297500610351562, "step": 390 }, { "epoch": 0.6465952717720752, "grad_norm": 1.9357444047927856, "learning_rate": 4.447597277484894e-06, "logits/chosen": -2.7699055671691895, "logits/rejected": -2.798750400543213, "logps/chosen": -0.7733790874481201, "logps/rejected": -0.9783531427383423, "loss": 0.8347, "odds_ratio_loss": 0.6135808825492859, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07733791321516037, "rewards/margins": 0.020497407764196396, "rewards/rejected": -0.09783531725406647, "sft_loss": 0.7733790874481201, "step": 400 }, { "epoch": 0.6627601535663771, "grad_norm": 1.4025357961654663, "learning_rate": 4.42075731922687e-06, "logits/chosen": -2.8587729930877686, "logits/rejected": -2.87328839302063, "logps/chosen": -0.9505017995834351, "logps/rejected": -1.1930662393569946, "loss": 1.0132, "odds_ratio_loss": 0.6273903250694275, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09505018591880798, "rewards/margins": 0.024256447330117226, "rewards/rejected": -0.11930663883686066, "sft_loss": 0.9505017995834351, "step": 410 }, { "epoch": 0.6789250353606789, "grad_norm": 5.174298286437988, "learning_rate": 4.3933658054423465e-06, "logits/chosen": -2.8345279693603516, "logits/rejected": -2.83827543258667, "logps/chosen": -0.80866539478302, "logps/rejected": -1.174803614616394, "loss": 0.8664, "odds_ratio_loss": 0.5777753591537476, "rewards/accuracies": 0.625, "rewards/chosen": -0.08086653053760529, "rewards/margins": 0.036613818258047104, "rewards/rejected": -0.11748035252094269, "sft_loss": 0.80866539478302, "step": 420 }, { "epoch": 0.6950899171549808, "grad_norm": 2.207981586456299, "learning_rate": 4.365430601748003e-06, "logits/chosen": -2.8343446254730225, "logits/rejected": -2.857731342315674, "logps/chosen": -0.9037211537361145, "logps/rejected": -1.0559289455413818, "loss": 0.9705, "odds_ratio_loss": 0.6677287817001343, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09037211537361145, "rewards/margins": 0.01522077340632677, "rewards/rejected": -0.10559289157390594, "sft_loss": 0.9037211537361145, "step": 430 }, { "epoch": 0.7112547989492827, "grad_norm": 15.49936580657959, "learning_rate": 4.336959729883925e-06, "logits/chosen": -2.8130838871002197, "logits/rejected": -2.8357608318328857, "logps/chosen": -0.8217814564704895, "logps/rejected": -0.9188777804374695, "loss": 0.8923, "odds_ratio_loss": 0.7047211527824402, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08217814564704895, "rewards/margins": 0.00970962829887867, "rewards/rejected": -0.09188777953386307, "sft_loss": 0.8217814564704895, "step": 440 }, { "epoch": 0.7274196807435845, "grad_norm": 1.7557275295257568, "learning_rate": 4.307961365410118e-06, "logits/chosen": -2.790027379989624, "logits/rejected": -2.809622049331665, "logps/chosen": -0.840091347694397, "logps/rejected": -1.0152480602264404, "loss": 0.9039, "odds_ratio_loss": 0.6380866169929504, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08400914072990417, "rewards/margins": 0.017515674233436584, "rewards/rejected": -0.10152481496334076, "sft_loss": 0.840091347694397, "step": 450 }, { "epoch": 0.7435845625378864, "grad_norm": 2.8990914821624756, "learning_rate": 4.278443835358854e-06, "logits/chosen": -2.812924861907959, "logits/rejected": -2.811110734939575, "logps/chosen": -0.8139681816101074, "logps/rejected": -1.0690581798553467, "loss": 0.8748, "odds_ratio_loss": 0.6082891225814819, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08139681816101074, "rewards/margins": 0.025509005412459373, "rewards/rejected": -0.10690581798553467, "sft_loss": 0.8139681816101074, "step": 460 }, { "epoch": 0.7597494443321883, "grad_norm": 2.2395644187927246, "learning_rate": 4.248415615843523e-06, "logits/chosen": -2.8422694206237793, "logits/rejected": -2.850648880004883, "logps/chosen": -0.8527294993400574, "logps/rejected": -1.0392307043075562, "loss": 0.9183, "odds_ratio_loss": 0.6552284359931946, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08527294546365738, "rewards/margins": 0.01865011267364025, "rewards/rejected": -0.10392306745052338, "sft_loss": 0.8527294993400574, "step": 470 }, { "epoch": 0.7759143261264903, "grad_norm": 2.0075857639312744, "learning_rate": 4.217885329624666e-06, "logits/chosen": -2.8313276767730713, "logits/rejected": -2.8245348930358887, "logps/chosen": -0.790324330329895, "logps/rejected": -1.0767412185668945, "loss": 0.8498, "odds_ratio_loss": 0.5943514108657837, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07903242856264114, "rewards/margins": 0.02864169515669346, "rewards/rejected": -0.10767412185668945, "sft_loss": 0.790324330329895, "step": 480 }, { "epoch": 0.7920792079207921, "grad_norm": 1.7681854963302612, "learning_rate": 4.186861743633911e-06, "logits/chosen": -2.8171868324279785, "logits/rejected": -2.8480162620544434, "logps/chosen": -0.7983497381210327, "logps/rejected": -1.1061131954193115, "loss": 0.8646, "odds_ratio_loss": 0.6626344919204712, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.07983498275279999, "rewards/margins": 0.03077634610235691, "rewards/rejected": -0.11061131954193115, "sft_loss": 0.7983497381210327, "step": 490 }, { "epoch": 0.808244089715094, "grad_norm": 1.5298829078674316, "learning_rate": 4.155353766456497e-06, "logits/chosen": -2.874368190765381, "logits/rejected": -2.8658576011657715, "logps/chosen": -0.8663871884346008, "logps/rejected": -1.0296813249588013, "loss": 0.93, "odds_ratio_loss": 0.636117160320282, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08663871884346008, "rewards/margins": 0.016329411417245865, "rewards/rejected": -0.10296813398599625, "sft_loss": 0.8663871884346008, "step": 500 }, { "epoch": 0.808244089715094, "eval_logits/chosen": -2.836843729019165, "eval_logits/rejected": -2.8441994190216064, "eval_logps/chosen": -0.8278239965438843, "eval_logps/rejected": -1.0567275285720825, "eval_loss": 0.8927881121635437, "eval_odds_ratio_loss": 0.6496399641036987, "eval_rewards/accuracies": 0.5772727131843567, "eval_rewards/chosen": -0.08278240263462067, "eval_rewards/margins": 0.02289034053683281, "eval_rewards/rejected": -0.10567274689674377, "eval_runtime": 194.5311, "eval_samples_per_second": 5.655, "eval_sft_loss": 0.8278239965438843, "eval_steps_per_second": 2.827, "step": 500 }, { "epoch": 0.8244089715093958, "grad_norm": 1.6909329891204834, "learning_rate": 4.123370445773134e-06, "logits/chosen": -2.8691649436950684, "logits/rejected": -2.8811800479888916, "logps/chosen": -0.8283156156539917, "logps/rejected": -0.9291037321090698, "loss": 0.8973, "odds_ratio_loss": 0.689969539642334, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08283156156539917, "rewards/margins": 0.01007880363613367, "rewards/rejected": -0.09291036427021027, "sft_loss": 0.8283156156539917, "step": 510 }, { "epoch": 0.8405738533036977, "grad_norm": 4.33729362487793, "learning_rate": 4.090920965761906e-06, "logits/chosen": -2.808586597442627, "logits/rejected": -2.8186278343200684, "logps/chosen": -0.8606308698654175, "logps/rejected": -1.0332623720169067, "loss": 0.9284, "odds_ratio_loss": 0.6780760884284973, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08606309443712234, "rewards/margins": 0.0172631423920393, "rewards/rejected": -0.1033262237906456, "sft_loss": 0.8606308698654175, "step": 520 }, { "epoch": 0.8567387350979996, "grad_norm": 6.002406120300293, "learning_rate": 4.058014644460991e-06, "logits/chosen": -2.833061456680298, "logits/rejected": -2.8458170890808105, "logps/chosen": -0.8242424726486206, "logps/rejected": -0.9793018102645874, "loss": 0.8862, "odds_ratio_loss": 0.6198969483375549, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08242423832416534, "rewards/margins": 0.015505945309996605, "rewards/rejected": -0.0979301929473877, "sft_loss": 0.8242424726486206, "step": 530 }, { "epoch": 0.8729036168923014, "grad_norm": 1.998780608177185, "learning_rate": 4.024660931092939e-06, "logits/chosen": -2.81856369972229, "logits/rejected": -2.8293251991271973, "logps/chosen": -0.8208298683166504, "logps/rejected": -1.0441166162490845, "loss": 0.8828, "odds_ratio_loss": 0.6198452115058899, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08208298683166504, "rewards/margins": 0.022328665480017662, "rewards/rejected": -0.10441166162490845, "sft_loss": 0.8208298683166504, "step": 540 }, { "epoch": 0.8890684986866033, "grad_norm": 2.4577414989471436, "learning_rate": 3.990869403351272e-06, "logits/chosen": -2.8507511615753174, "logits/rejected": -2.8566970825195312, "logps/chosen": -0.8117038011550903, "logps/rejected": -1.0751911401748657, "loss": 0.8674, "odds_ratio_loss": 0.5573362112045288, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08117038011550903, "rewards/margins": 0.026348743587732315, "rewards/rejected": -0.10751912742853165, "sft_loss": 0.8117038011550903, "step": 550 }, { "epoch": 0.9052333804809052, "grad_norm": 3.4686763286590576, "learning_rate": 3.956649764650206e-06, "logits/chosen": -2.881647825241089, "logits/rejected": -2.8819093704223633, "logps/chosen": -0.840446949005127, "logps/rejected": -1.052137017250061, "loss": 0.907, "odds_ratio_loss": 0.6658841967582703, "rewards/accuracies": 0.5, "rewards/chosen": -0.0840446949005127, "rewards/margins": 0.02116900309920311, "rewards/rejected": -0.1052137017250061, "sft_loss": 0.840446949005127, "step": 560 }, { "epoch": 0.9213982622752072, "grad_norm": 2.2446658611297607, "learning_rate": 3.92201184133826e-06, "logits/chosen": -2.864419460296631, "logits/rejected": -2.8783581256866455, "logps/chosen": -0.7979758381843567, "logps/rejected": -1.0608371496200562, "loss": 0.858, "odds_ratio_loss": 0.5999220609664917, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07979758828878403, "rewards/margins": 0.026286140084266663, "rewards/rejected": -0.1060837134718895, "sft_loss": 0.7979758381843567, "step": 570 }, { "epoch": 0.937563144069509, "grad_norm": 1.9976744651794434, "learning_rate": 3.886965579876572e-06, "logits/chosen": -2.900329351425171, "logits/rejected": -2.90751051902771, "logps/chosen": -0.8153482675552368, "logps/rejected": -0.9346411824226379, "loss": 0.8816, "odds_ratio_loss": 0.6620460748672485, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08153482526540756, "rewards/margins": 0.011929300613701344, "rewards/rejected": -0.09346412122249603, "sft_loss": 0.8153482675552368, "step": 580 }, { "epoch": 0.9537280258638109, "grad_norm": 1.6091820001602173, "learning_rate": 3.851521043982716e-06, "logits/chosen": -2.8917582035064697, "logits/rejected": -2.902100086212158, "logps/chosen": -0.8334836959838867, "logps/rejected": -1.004950761795044, "loss": 0.9, "odds_ratio_loss": 0.6651790738105774, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0833483636379242, "rewards/margins": 0.017146697267889977, "rewards/rejected": -0.10049506276845932, "sft_loss": 0.8334836959838867, "step": 590 }, { "epoch": 0.9698929076581128, "grad_norm": 5.672989845275879, "learning_rate": 3.81568841174086e-06, "logits/chosen": -2.861603021621704, "logits/rejected": -2.876756191253662, "logps/chosen": -0.7806357145309448, "logps/rejected": -1.1542575359344482, "loss": 0.8442, "odds_ratio_loss": 0.6360144019126892, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07806357741355896, "rewards/margins": 0.03736215457320213, "rewards/rejected": -0.1154257282614708, "sft_loss": 0.7806357145309448, "step": 600 }, { "epoch": 0.9860577894524146, "grad_norm": 1.4658279418945312, "learning_rate": 3.7794779726790664e-06, "logits/chosen": -2.845876455307007, "logits/rejected": -2.8574581146240234, "logps/chosen": -0.7789396047592163, "logps/rejected": -1.1114189624786377, "loss": 0.8409, "odds_ratio_loss": 0.6194978952407837, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07789396494626999, "rewards/margins": 0.0332479402422905, "rewards/rejected": -0.11114190518856049, "sft_loss": 0.7789396047592163, "step": 610 }, { "epoch": 1.0022226712467166, "grad_norm": 2.5747179985046387, "learning_rate": 3.7429001248146096e-06, "logits/chosen": -2.8244144916534424, "logits/rejected": -2.832597494125366, "logps/chosen": -0.7860082387924194, "logps/rejected": -1.0231492519378662, "loss": 0.8435, "odds_ratio_loss": 0.5752763748168945, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07860083132982254, "rewards/margins": 0.023714100942015648, "rewards/rejected": -0.10231492668390274, "sft_loss": 0.7860082387924194, "step": 620 }, { "epoch": 1.0183875530410185, "grad_norm": 1.24222993850708, "learning_rate": 3.7059653716681227e-06, "logits/chosen": -2.8329997062683105, "logits/rejected": -2.8287994861602783, "logps/chosen": -0.8590106964111328, "logps/rejected": -1.0588136911392212, "loss": 0.9265, "odds_ratio_loss": 0.6749905347824097, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08590107411146164, "rewards/margins": 0.01998029835522175, "rewards/rejected": -0.10588137060403824, "sft_loss": 0.8590106964111328, "step": 630 }, { "epoch": 1.0345524348353203, "grad_norm": 1.6466968059539795, "learning_rate": 3.668684319247463e-06, "logits/chosen": -2.8495888710021973, "logits/rejected": -2.872880220413208, "logps/chosen": -0.7487844824790955, "logps/rejected": -1.0430450439453125, "loss": 0.8035, "odds_ratio_loss": 0.5467280149459839, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07487844675779343, "rewards/margins": 0.02942606247961521, "rewards/rejected": -0.10430450737476349, "sft_loss": 0.7487844824790955, "step": 640 }, { "epoch": 1.0507173166296222, "grad_norm": 1.1547085046768188, "learning_rate": 3.6310676730021373e-06, "logits/chosen": -2.8986639976501465, "logits/rejected": -2.900839328765869, "logps/chosen": -0.7881689071655273, "logps/rejected": -0.9517928957939148, "loss": 0.8509, "odds_ratio_loss": 0.6268683075904846, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07881689816713333, "rewards/margins": 0.016362406313419342, "rewards/rejected": -0.09517930448055267, "sft_loss": 0.7881689071655273, "step": 650 }, { "epoch": 1.066882198423924, "grad_norm": 3.282292604446411, "learning_rate": 3.593126234749178e-06, "logits/chosen": -2.8645131587982178, "logits/rejected": -2.898613929748535, "logps/chosen": -0.9009162187576294, "logps/rejected": -1.1612458229064941, "loss": 0.9648, "odds_ratio_loss": 0.6383681297302246, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09009162336587906, "rewards/margins": 0.026032963767647743, "rewards/rejected": -0.11612458527088165, "sft_loss": 0.9009162187576294, "step": 660 }, { "epoch": 1.083047080218226, "grad_norm": 1.7910722494125366, "learning_rate": 3.554870899571343e-06, "logits/chosen": -2.8563625812530518, "logits/rejected": -2.8744523525238037, "logps/chosen": -0.8285778760910034, "logps/rejected": -1.0025149583816528, "loss": 0.8927, "odds_ratio_loss": 0.6415389776229858, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08285778015851974, "rewards/margins": 0.01739371195435524, "rewards/rejected": -0.10025149583816528, "sft_loss": 0.8285778760910034, "step": 670 }, { "epoch": 1.0992119620125278, "grad_norm": 3.5774316787719727, "learning_rate": 3.5163126526885373e-06, "logits/chosen": -2.8437960147857666, "logits/rejected": -2.870513916015625, "logps/chosen": -0.7732303142547607, "logps/rejected": -1.0101302862167358, "loss": 0.8343, "odds_ratio_loss": 0.6102721095085144, "rewards/accuracies": 0.625, "rewards/chosen": -0.07732303440570831, "rewards/margins": 0.023690002039074898, "rewards/rejected": -0.10101302713155746, "sft_loss": 0.7732303142547607, "step": 680 }, { "epoch": 1.1153768438068297, "grad_norm": 2.30400013923645, "learning_rate": 3.4774625663033484e-06, "logits/chosen": -2.849010467529297, "logits/rejected": -2.8660061359405518, "logps/chosen": -0.7853142619132996, "logps/rejected": -0.9644325971603394, "loss": 0.8466, "odds_ratio_loss": 0.6127563714981079, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0785314291715622, "rewards/margins": 0.017911842092871666, "rewards/rejected": -0.09644327312707901, "sft_loss": 0.7853142619132996, "step": 690 }, { "epoch": 1.1315417256011315, "grad_norm": 1.4719704389572144, "learning_rate": 3.4383317964216067e-06, "logits/chosen": -2.8511626720428467, "logits/rejected": -2.881286382675171, "logps/chosen": -0.7790023684501648, "logps/rejected": -0.9076374173164368, "loss": 0.8484, "odds_ratio_loss": 0.6935282945632935, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0779002457857132, "rewards/margins": 0.012863497249782085, "rewards/rejected": -0.09076374769210815, "sft_loss": 0.7790023684501648, "step": 700 }, { "epoch": 1.1477066073954334, "grad_norm": 2.1927425861358643, "learning_rate": 3.398931579648877e-06, "logits/chosen": -2.8756051063537598, "logits/rejected": -2.880699872970581, "logps/chosen": -0.8047206997871399, "logps/rejected": -1.1634694337844849, "loss": 0.8667, "odds_ratio_loss": 0.6202768087387085, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08047207444906235, "rewards/margins": 0.0358748659491539, "rewards/rejected": -0.11634693294763565, "sft_loss": 0.8047206997871399, "step": 710 }, { "epoch": 1.1638714891897353, "grad_norm": 1.4328726530075073, "learning_rate": 3.359273229963813e-06, "logits/chosen": -2.8490045070648193, "logits/rejected": -2.8502037525177, "logps/chosen": -0.7575694918632507, "logps/rejected": -0.9301745295524597, "loss": 0.821, "odds_ratio_loss": 0.6343931555747986, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07575695216655731, "rewards/margins": 0.017260495573282242, "rewards/rejected": -0.09301744401454926, "sft_loss": 0.7575694918632507, "step": 720 }, { "epoch": 1.1800363709840371, "grad_norm": 1.3170576095581055, "learning_rate": 3.319368135469285e-06, "logits/chosen": -2.8658504486083984, "logits/rejected": -2.8875842094421387, "logps/chosen": -0.8195670247077942, "logps/rejected": -1.1535929441452026, "loss": 0.8841, "odds_ratio_loss": 0.6450805068016052, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08195669949054718, "rewards/margins": 0.03340259566903114, "rewards/rejected": -0.11535929143428802, "sft_loss": 0.8195670247077942, "step": 730 }, { "epoch": 1.196201252778339, "grad_norm": 4.55858850479126, "learning_rate": 3.279227755122228e-06, "logits/chosen": -2.858372211456299, "logits/rejected": -2.860966205596924, "logps/chosen": -0.7807797193527222, "logps/rejected": -1.1492526531219482, "loss": 0.839, "odds_ratio_loss": 0.5826634764671326, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.07807797938585281, "rewards/margins": 0.03684728592634201, "rewards/rejected": -0.11492526531219482, "sft_loss": 0.7807797193527222, "step": 740 }, { "epoch": 1.2123661345726409, "grad_norm": 2.330960988998413, "learning_rate": 3.2388636154431417e-06, "logits/chosen": -2.868211507797241, "logits/rejected": -2.898150682449341, "logps/chosen": -0.8243536949157715, "logps/rejected": -1.195150375366211, "loss": 0.883, "odds_ratio_loss": 0.5866126418113708, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08243536949157715, "rewards/margins": 0.03707967326045036, "rewards/rejected": -0.11951503902673721, "sft_loss": 0.8243536949157715, "step": 750 }, { "epoch": 1.2285310163669427, "grad_norm": 2.7208411693573, "learning_rate": 3.198287307206192e-06, "logits/chosen": -2.844311237335205, "logits/rejected": -2.8444716930389404, "logps/chosen": -0.7780786752700806, "logps/rejected": -0.9966138005256653, "loss": 0.8378, "odds_ratio_loss": 0.5971704721450806, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07780785858631134, "rewards/margins": 0.021853512153029442, "rewards/rejected": -0.09966136515140533, "sft_loss": 0.7780786752700806, "step": 760 }, { "epoch": 1.2446958981612446, "grad_norm": 1.3042361736297607, "learning_rate": 3.157510482110856e-06, "logits/chosen": -2.9084322452545166, "logits/rejected": -2.905463933944702, "logps/chosen": -0.7917675971984863, "logps/rejected": -1.0798178911209106, "loss": 0.8557, "odds_ratio_loss": 0.6388932466506958, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07917676120996475, "rewards/margins": 0.02880503609776497, "rewards/rejected": -0.10798178613185883, "sft_loss": 0.7917675971984863, "step": 770 }, { "epoch": 1.2608607799555465, "grad_norm": 1.315172553062439, "learning_rate": 3.116544849436077e-06, "logits/chosen": -2.828716993331909, "logits/rejected": -2.8282887935638428, "logps/chosen": -0.8439006805419922, "logps/rejected": -1.2042268514633179, "loss": 0.9037, "odds_ratio_loss": 0.5979124307632446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0843900591135025, "rewards/margins": 0.03603263571858406, "rewards/rejected": -0.12042269855737686, "sft_loss": 0.8439006805419922, "step": 780 }, { "epoch": 1.2770256617498483, "grad_norm": 1.611197829246521, "learning_rate": 3.0754021726778848e-06, "logits/chosen": -2.84073543548584, "logits/rejected": -2.832176685333252, "logps/chosen": -0.7603198885917664, "logps/rejected": -1.202492117881775, "loss": 0.8152, "odds_ratio_loss": 0.5489572882652283, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07603198289871216, "rewards/margins": 0.044217221438884735, "rewards/rejected": -0.1202491968870163, "sft_loss": 0.7603198885917664, "step": 790 }, { "epoch": 1.2931905435441502, "grad_norm": 1.179275631904602, "learning_rate": 3.0340942661714463e-06, "logits/chosen": -2.877725839614868, "logits/rejected": -2.891244411468506, "logps/chosen": -0.8281265497207642, "logps/rejected": -1.0409139394760132, "loss": 0.8904, "odds_ratio_loss": 0.6231717467308044, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08281265199184418, "rewards/margins": 0.021278750151395798, "rewards/rejected": -0.10409140586853027, "sft_loss": 0.8281265497207642, "step": 800 }, { "epoch": 1.3093554253384523, "grad_norm": 2.1846208572387695, "learning_rate": 2.992632991698512e-06, "logits/chosen": -2.8389461040496826, "logits/rejected": -2.85896635055542, "logps/chosen": -0.8289766311645508, "logps/rejected": -1.0603488683700562, "loss": 0.8918, "odds_ratio_loss": 0.6286410093307495, "rewards/accuracies": 0.625, "rewards/chosen": -0.08289766311645508, "rewards/margins": 0.02313724346458912, "rewards/rejected": -0.10603491216897964, "sft_loss": 0.8289766311645508, "step": 810 }, { "epoch": 1.3255203071327541, "grad_norm": 1.583357334136963, "learning_rate": 2.9510302550812537e-06, "logits/chosen": -2.845541000366211, "logits/rejected": -2.8782455921173096, "logps/chosen": -0.7186457514762878, "logps/rejected": -1.0902959108352661, "loss": 0.776, "odds_ratio_loss": 0.573469340801239, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07186457514762878, "rewards/margins": 0.03716501593589783, "rewards/rejected": -0.10902959108352661, "sft_loss": 0.7186457514762878, "step": 820 }, { "epoch": 1.341685188927056, "grad_norm": 2.841128349304199, "learning_rate": 2.9092980027634325e-06, "logits/chosen": -2.8583426475524902, "logits/rejected": -2.874774217605591, "logps/chosen": -0.7276403903961182, "logps/rejected": -1.0125164985656738, "loss": 0.788, "odds_ratio_loss": 0.6034457683563232, "rewards/accuracies": 0.625, "rewards/chosen": -0.07276404649019241, "rewards/margins": 0.028487607836723328, "rewards/rejected": -0.10125164687633514, "sft_loss": 0.7276403903961182, "step": 830 }, { "epoch": 1.3578500707213579, "grad_norm": 1.7055377960205078, "learning_rate": 2.867448218379927e-06, "logits/chosen": -2.8610100746154785, "logits/rejected": -2.8836147785186768, "logps/chosen": -0.8485835790634155, "logps/rejected": -1.0031511783599854, "loss": 0.9172, "odds_ratio_loss": 0.6861482858657837, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08485837280750275, "rewards/margins": 0.015456756576895714, "rewards/rejected": -0.10031511634588242, "sft_loss": 0.8485835790634155, "step": 840 }, { "epoch": 1.3740149525156597, "grad_norm": 9.629118919372559, "learning_rate": 2.825492919315559e-06, "logits/chosen": -2.8479480743408203, "logits/rejected": -2.8763227462768555, "logps/chosen": -0.8768585324287415, "logps/rejected": -0.999729335308075, "loss": 0.9437, "odds_ratio_loss": 0.668052613735199, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08768586814403534, "rewards/margins": 0.012287073768675327, "rewards/rejected": -0.0999729260802269, "sft_loss": 0.8768585324287415, "step": 850 }, { "epoch": 1.3901798343099616, "grad_norm": 2.416870594024658, "learning_rate": 2.7834441532542482e-06, "logits/chosen": -2.8881735801696777, "logits/rejected": -2.9063100814819336, "logps/chosen": -0.7879316210746765, "logps/rejected": -1.023233413696289, "loss": 0.8456, "odds_ratio_loss": 0.5766496658325195, "rewards/accuracies": 0.625, "rewards/chosen": -0.07879316806793213, "rewards/margins": 0.023530183359980583, "rewards/rejected": -0.10232335329055786, "sft_loss": 0.7879316210746765, "step": 860 }, { "epoch": 1.4063447161042635, "grad_norm": 1.5628215074539185, "learning_rate": 2.74131399471945e-06, "logits/chosen": -2.855931520462036, "logits/rejected": -2.8686752319335938, "logps/chosen": -0.7991023063659668, "logps/rejected": -0.9908691644668579, "loss": 0.8644, "odds_ratio_loss": 0.6528818607330322, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07991023361682892, "rewards/margins": 0.01917668618261814, "rewards/rejected": -0.09908691793680191, "sft_loss": 0.7991023063659668, "step": 870 }, { "epoch": 1.4225095978985653, "grad_norm": 2.0555615425109863, "learning_rate": 2.6991145416068947e-06, "logits/chosen": -2.846782922744751, "logits/rejected": -2.8673818111419678, "logps/chosen": -0.8078680038452148, "logps/rejected": -0.9619809985160828, "loss": 0.8714, "odds_ratio_loss": 0.6356260180473328, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0807868018746376, "rewards/margins": 0.01541130244731903, "rewards/rejected": -0.09619811177253723, "sft_loss": 0.8078680038452148, "step": 880 }, { "epoch": 1.4386744796928672, "grad_norm": 0.9378024339675903, "learning_rate": 2.6568579117106143e-06, "logits/chosen": -2.8469960689544678, "logits/rejected": -2.850614070892334, "logps/chosen": -0.7744920253753662, "logps/rejected": -1.0393074750900269, "loss": 0.8347, "odds_ratio_loss": 0.6018751859664917, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07744920998811722, "rewards/margins": 0.026481550186872482, "rewards/rejected": -0.1039307564496994, "sft_loss": 0.7744920253753662, "step": 890 }, { "epoch": 1.454839361487169, "grad_norm": 0.9352036118507385, "learning_rate": 2.6145562392432544e-06, "logits/chosen": -2.875109910964966, "logits/rejected": -2.887655735015869, "logps/chosen": -0.8057360649108887, "logps/rejected": -0.9923427700996399, "loss": 0.8708, "odds_ratio_loss": 0.6502856016159058, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08057360351085663, "rewards/margins": 0.01866067573428154, "rewards/rejected": -0.09923428297042847, "sft_loss": 0.8057360649108887, "step": 900 }, { "epoch": 1.471004243281471, "grad_norm": 2.6385111808776855, "learning_rate": 2.5722216713516682e-06, "logits/chosen": -2.8550915718078613, "logits/rejected": -2.8972582817077637, "logps/chosen": -0.7460139989852905, "logps/rejected": -0.9863673448562622, "loss": 0.8057, "odds_ratio_loss": 0.5972028374671936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07460139691829681, "rewards/margins": 0.024035323411226273, "rewards/rejected": -0.09863673150539398, "sft_loss": 0.7460139989852905, "step": 910 }, { "epoch": 1.4871691250757728, "grad_norm": 1.7198817729949951, "learning_rate": 2.5298663646288064e-06, "logits/chosen": -2.8807036876678467, "logits/rejected": -2.888306140899658, "logps/chosen": -0.7764211893081665, "logps/rejected": -1.0312559604644775, "loss": 0.8377, "odds_ratio_loss": 0.6123490333557129, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07764211297035217, "rewards/margins": 0.025483474135398865, "rewards/rejected": -0.10312558710575104, "sft_loss": 0.7764211893081665, "step": 920 }, { "epoch": 1.503334006870075, "grad_norm": 2.7615318298339844, "learning_rate": 2.487502481622879e-06, "logits/chosen": -2.8637490272521973, "logits/rejected": -2.874497652053833, "logps/chosen": -0.8163179159164429, "logps/rejected": -0.9841713905334473, "loss": 0.8791, "odds_ratio_loss": 0.6274018287658691, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08163177967071533, "rewards/margins": 0.01678534969687462, "rewards/rejected": -0.09841714054346085, "sft_loss": 0.8163179159164429, "step": 930 }, { "epoch": 1.5194988886643768, "grad_norm": 1.7173594236373901, "learning_rate": 2.4451421873448253e-06, "logits/chosen": -2.8568150997161865, "logits/rejected": -2.879917621612549, "logps/chosen": -0.8009888529777527, "logps/rejected": -0.9833795428276062, "loss": 0.8678, "odds_ratio_loss": 0.667960524559021, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08009888231754303, "rewards/margins": 0.018239066004753113, "rewards/rejected": -0.09833794832229614, "sft_loss": 0.8009888529777527, "step": 940 }, { "epoch": 1.5356637704586786, "grad_norm": 3.4001808166503906, "learning_rate": 2.40279764577506e-06, "logits/chosen": -2.885816812515259, "logits/rejected": -2.9209980964660645, "logps/chosen": -0.8259257078170776, "logps/rejected": -0.9810823202133179, "loss": 0.8903, "odds_ratio_loss": 0.6437360048294067, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08259257674217224, "rewards/margins": 0.015515660867094994, "rewards/rejected": -0.09810823202133179, "sft_loss": 0.8259257078170776, "step": 950 }, { "epoch": 1.5518286522529805, "grad_norm": 3.7369155883789062, "learning_rate": 2.3604810163705242e-06, "logits/chosen": -2.878312587738037, "logits/rejected": -2.9087862968444824, "logps/chosen": -0.7468287944793701, "logps/rejected": -0.999441921710968, "loss": 0.8033, "odds_ratio_loss": 0.5650970339775085, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07468288391828537, "rewards/margins": 0.02526130899786949, "rewards/rejected": -0.09994419664144516, "sft_loss": 0.7468287944793701, "step": 960 }, { "epoch": 1.5679935340472824, "grad_norm": 1.2655407190322876, "learning_rate": 2.3182044505730364e-06, "logits/chosen": -2.872468948364258, "logits/rejected": -2.873964309692383, "logps/chosen": -0.7006109952926636, "logps/rejected": -0.9527314901351929, "loss": 0.7581, "odds_ratio_loss": 0.5752806067466736, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0700611025094986, "rewards/margins": 0.025212040171027184, "rewards/rejected": -0.09527313709259033, "sft_loss": 0.7006109952926636, "step": 970 }, { "epoch": 1.5841584158415842, "grad_norm": 2.9336001873016357, "learning_rate": 2.275980088319941e-06, "logits/chosen": -2.8779749870300293, "logits/rejected": -2.8763155937194824, "logps/chosen": -0.7721344232559204, "logps/rejected": -0.9309911727905273, "loss": 0.8406, "odds_ratio_loss": 0.6845985651016235, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07721343636512756, "rewards/margins": 0.01588568463921547, "rewards/rejected": -0.09309910982847214, "sft_loss": 0.7721344232559204, "step": 980 }, { "epoch": 1.600323297635886, "grad_norm": 2.434041738510132, "learning_rate": 2.2338200545580577e-06, "logits/chosen": -2.849057674407959, "logits/rejected": -2.873142957687378, "logps/chosen": -0.7509113550186157, "logps/rejected": -1.0347163677215576, "loss": 0.8135, "odds_ratio_loss": 0.6254162788391113, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07509114593267441, "rewards/margins": 0.02838050201535225, "rewards/rejected": -0.10347163677215576, "sft_loss": 0.7509113550186157, "step": 990 }, { "epoch": 1.616488179430188, "grad_norm": 1.686830997467041, "learning_rate": 2.191736455761947e-06, "logits/chosen": -2.8971669673919678, "logits/rejected": -2.9139630794525146, "logps/chosen": -0.7013322114944458, "logps/rejected": -0.8860443234443665, "loss": 0.7571, "odds_ratio_loss": 0.5572749972343445, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07013322412967682, "rewards/margins": 0.01847122237086296, "rewards/rejected": -0.08860443532466888, "sft_loss": 0.7013322114944458, "step": 1000 }, { "epoch": 1.616488179430188, "eval_logits/chosen": -2.8644185066223145, "eval_logits/rejected": -2.8728911876678467, "eval_logps/chosen": -0.8028324842453003, "eval_logps/rejected": -1.0336546897888184, "eval_loss": 0.8679323792457581, "eval_odds_ratio_loss": 0.6509982943534851, "eval_rewards/accuracies": 0.5699999928474426, "eval_rewards/chosen": -0.08028324693441391, "eval_rewards/margins": 0.02308221347630024, "eval_rewards/rejected": -0.1033654510974884, "eval_runtime": 194.7336, "eval_samples_per_second": 5.649, "eval_sft_loss": 0.8028324842453003, "eval_steps_per_second": 2.824, "step": 1000 }, { "epoch": 1.6326530612244898, "grad_norm": 1.7911335229873657, "learning_rate": 2.1497413764574673e-06, "logits/chosen": -2.8975167274475098, "logits/rejected": -2.8892812728881836, "logps/chosen": -0.7816007137298584, "logps/rejected": -1.069588541984558, "loss": 0.8393, "odds_ratio_loss": 0.5774248242378235, "rewards/accuracies": 0.625, "rewards/chosen": -0.07816006988286972, "rewards/margins": 0.02879878506064415, "rewards/rejected": -0.10695885121822357, "sft_loss": 0.7816007137298584, "step": 1010 }, { "epoch": 1.6488179430187917, "grad_norm": 1.912550687789917, "learning_rate": 2.1078468757516395e-06, "logits/chosen": -2.8402116298675537, "logits/rejected": -2.8773112297058105, "logps/chosen": -0.7441704273223877, "logps/rejected": -0.9479702115058899, "loss": 0.8035, "odds_ratio_loss": 0.5933586955070496, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07441703975200653, "rewards/margins": 0.02037998102605343, "rewards/rejected": -0.0947970300912857, "sft_loss": 0.7441704273223877, "step": 1020 }, { "epoch": 1.6649828248130936, "grad_norm": 2.0232253074645996, "learning_rate": 2.0660649838698145e-06, "logits/chosen": -2.8627827167510986, "logits/rejected": -2.882736921310425, "logps/chosen": -0.7718713283538818, "logps/rejected": -1.1140234470367432, "loss": 0.832, "odds_ratio_loss": 0.6009626984596252, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07718713581562042, "rewards/margins": 0.03421521559357643, "rewards/rejected": -0.11140235513448715, "sft_loss": 0.7718713283538818, "step": 1030 }, { "epoch": 1.6811477066073954, "grad_norm": 1.9653966426849365, "learning_rate": 2.0244076987011284e-06, "logits/chosen": -2.905303716659546, "logits/rejected": -2.9009556770324707, "logps/chosen": -0.827530562877655, "logps/rejected": -1.0324897766113281, "loss": 0.8888, "odds_ratio_loss": 0.6124246716499329, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08275305479764938, "rewards/margins": 0.020495926961302757, "rewards/rejected": -0.1032489761710167, "sft_loss": 0.827530562877655, "step": 1040 }, { "epoch": 1.6973125884016973, "grad_norm": 1.4363154172897339, "learning_rate": 1.982886982353251e-06, "logits/chosen": -2.888767957687378, "logits/rejected": -2.8874547481536865, "logps/chosen": -0.7899632453918457, "logps/rejected": -1.1214802265167236, "loss": 0.8526, "odds_ratio_loss": 0.6266939640045166, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07899631559848785, "rewards/margins": 0.03315168619155884, "rewards/rejected": -0.11214800179004669, "sft_loss": 0.7899632453918457, "step": 1050 }, { "epoch": 1.7134774701959992, "grad_norm": 1.8043084144592285, "learning_rate": 1.941514757717392e-06, "logits/chosen": -2.866079330444336, "logits/rejected": -2.879364490509033, "logps/chosen": -0.8468548655509949, "logps/rejected": -1.1184252500534058, "loss": 0.9022, "odds_ratio_loss": 0.552977442741394, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.08468548208475113, "rewards/margins": 0.02715705707669258, "rewards/rejected": -0.11184253543615341, "sft_loss": 0.8468548655509949, "step": 1060 }, { "epoch": 1.729642351990301, "grad_norm": 3.669512987136841, "learning_rate": 1.9003029050445953e-06, "logits/chosen": -2.8407020568847656, "logits/rejected": -2.8639755249023438, "logps/chosen": -0.8030735850334167, "logps/rejected": -0.9715849757194519, "loss": 0.8692, "odds_ratio_loss": 0.660782516002655, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08030736446380615, "rewards/margins": 0.016851136460900307, "rewards/rejected": -0.0971585065126419, "sft_loss": 0.8030735850334167, "step": 1070 }, { "epoch": 1.745807233784603, "grad_norm": 1.9885250329971313, "learning_rate": 1.8592632585342523e-06, "logits/chosen": -2.849134922027588, "logits/rejected": -2.8679654598236084, "logps/chosen": -0.7700011730194092, "logps/rejected": -1.0313342809677124, "loss": 0.8306, "odds_ratio_loss": 0.6062373518943787, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07700012624263763, "rewards/margins": 0.02613331377506256, "rewards/rejected": -0.1031334400177002, "sft_loss": 0.7700011730194092, "step": 1080 }, { "epoch": 1.7619721155789048, "grad_norm": 4.0624895095825195, "learning_rate": 1.8184076029358527e-06, "logits/chosen": -2.840611457824707, "logits/rejected": -2.8494577407836914, "logps/chosen": -0.7611902952194214, "logps/rejected": -0.9082427024841309, "loss": 0.8272, "odds_ratio_loss": 0.6598888635635376, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07611902803182602, "rewards/margins": 0.014705238863825798, "rewards/rejected": -0.09082427620887756, "sft_loss": 0.7611902952194214, "step": 1090 }, { "epoch": 1.7781369973732066, "grad_norm": 1.7686785459518433, "learning_rate": 1.7777476701649318e-06, "logits/chosen": -2.8446550369262695, "logits/rejected": -2.85874342918396, "logps/chosen": -0.7774368524551392, "logps/rejected": -1.0228512287139893, "loss": 0.8388, "odds_ratio_loss": 0.6141053438186646, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07774369418621063, "rewards/margins": 0.024541418999433517, "rewards/rejected": -0.10228510946035385, "sft_loss": 0.7774368524551392, "step": 1100 }, { "epoch": 1.7943018791675085, "grad_norm": 2.743757724761963, "learning_rate": 1.7372951359341925e-06, "logits/chosen": -2.8636326789855957, "logits/rejected": -2.8647377490997314, "logps/chosen": -0.750954806804657, "logps/rejected": -0.9340154528617859, "loss": 0.814, "odds_ratio_loss": 0.6307731866836548, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07509546726942062, "rewards/margins": 0.018306076526641846, "rewards/rejected": -0.09340154379606247, "sft_loss": 0.750954806804657, "step": 1110 }, { "epoch": 1.8104667609618104, "grad_norm": 3.9680521488189697, "learning_rate": 1.6970616164007547e-06, "logits/chosen": -2.8542914390563965, "logits/rejected": -2.8552489280700684, "logps/chosen": -0.7380022406578064, "logps/rejected": -0.9561580419540405, "loss": 0.801, "odds_ratio_loss": 0.6301542520523071, "rewards/accuracies": 0.625, "rewards/chosen": -0.0738002210855484, "rewards/margins": 0.0218155849725008, "rewards/rejected": -0.09561581164598465, "sft_loss": 0.7380022406578064, "step": 1120 }, { "epoch": 1.8266316427561122, "grad_norm": 2.8756582736968994, "learning_rate": 1.6570586648305276e-06, "logits/chosen": -2.8676905632019043, "logits/rejected": -2.895289897918701, "logps/chosen": -0.7943655252456665, "logps/rejected": -1.0809084177017212, "loss": 0.8591, "odds_ratio_loss": 0.6475063562393188, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07943655550479889, "rewards/margins": 0.028654297813773155, "rewards/rejected": -0.1080908551812172, "sft_loss": 0.7943655252456665, "step": 1130 }, { "epoch": 1.842796524550414, "grad_norm": 1.8805325031280518, "learning_rate": 1.6172977682806151e-06, "logits/chosen": -2.8678653240203857, "logits/rejected": -2.900193214416504, "logps/chosen": -0.7862238883972168, "logps/rejected": -1.0396199226379395, "loss": 0.8453, "odds_ratio_loss": 0.5909398198127747, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07862239331007004, "rewards/margins": 0.02533959411084652, "rewards/rejected": -0.1039619892835617, "sft_loss": 0.7862238883972168, "step": 1140 }, { "epoch": 1.858961406344716, "grad_norm": 1.586294174194336, "learning_rate": 1.5777903443007586e-06, "logits/chosen": -2.8388750553131104, "logits/rejected": -2.838686466217041, "logps/chosen": -0.7984446883201599, "logps/rejected": -1.093590497970581, "loss": 0.8601, "odds_ratio_loss": 0.6163803935050964, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07984446734189987, "rewards/margins": 0.029514577239751816, "rewards/rejected": -0.10935904830694199, "sft_loss": 0.7984446883201599, "step": 1150 }, { "epoch": 1.8751262881390178, "grad_norm": 3.058032751083374, "learning_rate": 1.5385477376547226e-06, "logits/chosen": -2.853109121322632, "logits/rejected": -2.863646984100342, "logps/chosen": -0.7820562124252319, "logps/rejected": -1.004570484161377, "loss": 0.8417, "odds_ratio_loss": 0.5969026684761047, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07820562273263931, "rewards/margins": 0.022251427173614502, "rewards/rejected": -0.10045703500509262, "sft_loss": 0.7820562124252319, "step": 1160 }, { "epoch": 1.89129116993332, "grad_norm": 3.296496868133545, "learning_rate": 1.4995812170625845e-06, "logits/chosen": -2.8537023067474365, "logits/rejected": -2.8620083332061768, "logps/chosen": -0.7803040742874146, "logps/rejected": -1.1614640951156616, "loss": 0.8383, "odds_ratio_loss": 0.5798701047897339, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07803040742874146, "rewards/margins": 0.038116004317998886, "rewards/rejected": -0.11614640802145004, "sft_loss": 0.7803040742874146, "step": 1170 }, { "epoch": 1.9074560517276218, "grad_norm": 2.4982151985168457, "learning_rate": 1.4609019719648666e-06, "logits/chosen": -2.8664259910583496, "logits/rejected": -2.880103826522827, "logps/chosen": -0.7934621572494507, "logps/rejected": -1.0411931276321411, "loss": 0.8522, "odds_ratio_loss": 0.5876864194869995, "rewards/accuracies": 0.625, "rewards/chosen": -0.07934621721506119, "rewards/margins": 0.024773094803094864, "rewards/rejected": -0.10411931574344635, "sft_loss": 0.7934621572494507, "step": 1180 }, { "epoch": 1.9236209335219236, "grad_norm": 4.357522964477539, "learning_rate": 1.42252110930943e-06, "logits/chosen": -2.8305060863494873, "logits/rejected": -2.850817918777466, "logps/chosen": -0.7121320962905884, "logps/rejected": -0.97893887758255, "loss": 0.7723, "odds_ratio_loss": 0.6020933389663696, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07121320813894272, "rewards/margins": 0.02668066881597042, "rewards/rejected": -0.09789387881755829, "sft_loss": 0.7121320962905884, "step": 1190 }, { "epoch": 1.9397858153162255, "grad_norm": 3.2690622806549072, "learning_rate": 1.3844496503620493e-06, "logits/chosen": -2.855846881866455, "logits/rejected": -2.885960817337036, "logps/chosen": -0.7993025779724121, "logps/rejected": -1.008312702178955, "loss": 0.8606, "odds_ratio_loss": 0.6124933362007141, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07993026077747345, "rewards/margins": 0.020901009440422058, "rewards/rejected": -0.10083127021789551, "sft_loss": 0.7993025779724121, "step": 1200 }, { "epoch": 1.9559506971105274, "grad_norm": 3.07012677192688, "learning_rate": 1.3466985275416081e-06, "logits/chosen": -2.8368687629699707, "logits/rejected": -2.8513948917388916, "logps/chosen": -0.8561896085739136, "logps/rejected": -1.0195033550262451, "loss": 0.9234, "odds_ratio_loss": 0.6718183159828186, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08561895787715912, "rewards/margins": 0.016331372782588005, "rewards/rejected": -0.10195034742355347, "sft_loss": 0.8561896085739136, "step": 1210 }, { "epoch": 1.9721155789048292, "grad_norm": 4.26687479019165, "learning_rate": 1.309278581280791e-06, "logits/chosen": -2.8606760501861572, "logits/rejected": -2.868224620819092, "logps/chosen": -0.7406347990036011, "logps/rejected": -1.0179945230484009, "loss": 0.7986, "odds_ratio_loss": 0.5793353319168091, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07406347990036011, "rewards/margins": 0.02773597277700901, "rewards/rejected": -0.10179946571588516, "sft_loss": 0.7406347990036011, "step": 1220 }, { "epoch": 1.9882804606991311, "grad_norm": 1.2442247867584229, "learning_rate": 1.272200556913199e-06, "logits/chosen": -2.8689868450164795, "logits/rejected": -2.8818325996398926, "logps/chosen": -0.812061607837677, "logps/rejected": -1.029280424118042, "loss": 0.8795, "odds_ratio_loss": 0.6747404336929321, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08120616525411606, "rewards/margins": 0.02172188088297844, "rewards/rejected": -0.102928027510643, "sft_loss": 0.812061607837677, "step": 1230 }, { "epoch": 2.004445342493433, "grad_norm": 2.5222415924072266, "learning_rate": 1.2354751015877698e-06, "logits/chosen": -2.842041015625, "logits/rejected": -2.861173629760742, "logps/chosen": -0.7999058961868286, "logps/rejected": -1.1007378101348877, "loss": 0.86, "odds_ratio_loss": 0.6008915305137634, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07999058067798615, "rewards/margins": 0.030083194375038147, "rewards/rejected": -0.11007378250360489, "sft_loss": 0.7999058961868286, "step": 1240 }, { "epoch": 2.020610224287735, "grad_norm": 3.1796367168426514, "learning_rate": 1.1991127612113945e-06, "logits/chosen": -2.860217571258545, "logits/rejected": -2.8857686519622803, "logps/chosen": -0.7788959741592407, "logps/rejected": -1.0279576778411865, "loss": 0.8366, "odds_ratio_loss": 0.5771896839141846, "rewards/accuracies": 0.625, "rewards/chosen": -0.0778895914554596, "rewards/margins": 0.02490617148578167, "rewards/rejected": -0.10279576480388641, "sft_loss": 0.7788959741592407, "step": 1250 }, { "epoch": 2.036775106082037, "grad_norm": 2.174238681793213, "learning_rate": 1.1631239774206035e-06, "logits/chosen": -2.8261468410491943, "logits/rejected": -2.8276760578155518, "logps/chosen": -0.7623487114906311, "logps/rejected": -1.0154896974563599, "loss": 0.8249, "odds_ratio_loss": 0.6253183484077454, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0762348622083664, "rewards/margins": 0.02531411312520504, "rewards/rejected": -0.10154898464679718, "sft_loss": 0.7623487114906311, "step": 1260 }, { "epoch": 2.052939987876339, "grad_norm": 3.220973253250122, "learning_rate": 1.1275190845831978e-06, "logits/chosen": -2.8474819660186768, "logits/rejected": -2.8597018718719482, "logps/chosen": -0.730771541595459, "logps/rejected": -1.0029503107070923, "loss": 0.7858, "odds_ratio_loss": 0.550129234790802, "rewards/accuracies": 0.625, "rewards/chosen": -0.07307715713977814, "rewards/margins": 0.02721787989139557, "rewards/rejected": -0.10029502958059311, "sft_loss": 0.730771541595459, "step": 1270 }, { "epoch": 2.0691048696706407, "grad_norm": 2.44575834274292, "learning_rate": 1.0923083068306778e-06, "logits/chosen": -2.8472275733947754, "logits/rejected": -2.8387467861175537, "logps/chosen": -0.7656749486923218, "logps/rejected": -1.1094231605529785, "loss": 0.8236, "odds_ratio_loss": 0.5792102813720703, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07656749337911606, "rewards/margins": 0.03437482565641403, "rewards/rejected": -0.11094231903553009, "sft_loss": 0.7656749486923218, "step": 1280 }, { "epoch": 2.0852697514649425, "grad_norm": 1.4943968057632446, "learning_rate": 1.0575017551223348e-06, "logits/chosen": -2.829378128051758, "logits/rejected": -2.8376450538635254, "logps/chosen": -0.7342156171798706, "logps/rejected": -0.9912710189819336, "loss": 0.7958, "odds_ratio_loss": 0.6156936883926392, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07342156767845154, "rewards/margins": 0.02570553496479988, "rewards/rejected": -0.09912709891796112, "sft_loss": 0.7342156171798706, "step": 1290 }, { "epoch": 2.1014346332592444, "grad_norm": 2.5311193466186523, "learning_rate": 1.023109424341833e-06, "logits/chosen": -2.8397974967956543, "logits/rejected": -2.8779385089874268, "logps/chosen": -0.7779219746589661, "logps/rejected": -1.1433827877044678, "loss": 0.8376, "odds_ratio_loss": 0.5970156192779541, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0777921974658966, "rewards/margins": 0.03654608502984047, "rewards/rejected": -0.11433827877044678, "sft_loss": 0.7779219746589661, "step": 1300 }, { "epoch": 2.1175995150535463, "grad_norm": 2.6538310050964355, "learning_rate": 9.891411904271273e-07, "logits/chosen": -2.856947422027588, "logits/rejected": -2.86110782623291, "logps/chosen": -0.7499477863311768, "logps/rejected": -0.9801033139228821, "loss": 0.8093, "odds_ratio_loss": 0.593558669090271, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0749947652220726, "rewards/margins": 0.023015562444925308, "rewards/rejected": -0.0980103388428688, "sft_loss": 0.7499477863311768, "step": 1310 }, { "epoch": 2.133764396847848, "grad_norm": 1.2850011587142944, "learning_rate": 9.556068075345363e-07, "logits/chosen": -2.8736729621887207, "logits/rejected": -2.8673884868621826, "logps/chosen": -0.7692313194274902, "logps/rejected": -0.9742280840873718, "loss": 0.8271, "odds_ratio_loss": 0.5790851712226868, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07692314684391022, "rewards/margins": 0.02049967274069786, "rewards/rejected": -0.09742281585931778, "sft_loss": 0.7692313194274902, "step": 1320 }, { "epoch": 2.14992927864215, "grad_norm": 1.7034938335418701, "learning_rate": 9.225159052377838e-07, "logits/chosen": -2.834965944290161, "logits/rejected": -2.8684887886047363, "logps/chosen": -0.796667218208313, "logps/rejected": -1.1322475671768188, "loss": 0.8554, "odds_ratio_loss": 0.587177574634552, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07966671884059906, "rewards/margins": 0.03355802968144417, "rewards/rejected": -0.11322475969791412, "sft_loss": 0.796667218208313, "step": 1330 }, { "epoch": 2.166094160436452, "grad_norm": 2.5143074989318848, "learning_rate": 8.898779857628184e-07, "logits/chosen": -2.8322224617004395, "logits/rejected": -2.8632161617279053, "logps/chosen": -0.6862845420837402, "logps/rejected": -0.923437774181366, "loss": 0.7449, "odds_ratio_loss": 0.5857266783714294, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0686284601688385, "rewards/margins": 0.02371532842516899, "rewards/rejected": -0.0923437848687172, "sft_loss": 0.6862845420837402, "step": 1340 }, { "epoch": 2.1822590422307537, "grad_norm": 1.7262011766433716, "learning_rate": 8.577024212591975e-07, "logits/chosen": -2.8671224117279053, "logits/rejected": -2.867626428604126, "logps/chosen": -0.7982193231582642, "logps/rejected": -0.9524084329605103, "loss": 0.862, "odds_ratio_loss": 0.6382196545600891, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07982192933559418, "rewards/margins": 0.01541891973465681, "rewards/rejected": -0.09524084627628326, "sft_loss": 0.7982193231582642, "step": 1350 }, { "epoch": 2.1984239240250556, "grad_norm": 1.9137386083602905, "learning_rate": 8.259984511088276e-07, "logits/chosen": -2.8300180435180664, "logits/rejected": -2.8534936904907227, "logps/chosen": -0.7877185940742493, "logps/rejected": -1.0415524244308472, "loss": 0.8505, "odds_ratio_loss": 0.6278126239776611, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07877186685800552, "rewards/margins": 0.025383388623595238, "rewards/rejected": -0.10415525734424591, "sft_loss": 0.7877185940742493, "step": 1360 }, { "epoch": 2.2145888058193575, "grad_norm": 2.398965835571289, "learning_rate": 7.947751792728237e-07, "logits/chosen": -2.8527517318725586, "logits/rejected": -2.8384506702423096, "logps/chosen": -0.7678119540214539, "logps/rejected": -1.105531930923462, "loss": 0.8275, "odds_ratio_loss": 0.5968826413154602, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07678119093179703, "rewards/margins": 0.03377201408147812, "rewards/rejected": -0.11055320501327515, "sft_loss": 0.7678119540214539, "step": 1370 }, { "epoch": 2.2307536876136593, "grad_norm": 2.101724147796631, "learning_rate": 7.640415716772626e-07, "logits/chosen": -2.8620262145996094, "logits/rejected": -2.881200075149536, "logps/chosen": -0.7912808656692505, "logps/rejected": -1.0620834827423096, "loss": 0.8546, "odds_ratio_loss": 0.6336351633071899, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07912809401750565, "rewards/margins": 0.027080247178673744, "rewards/rejected": -0.10620833933353424, "sft_loss": 0.7912808656692505, "step": 1380 }, { "epoch": 2.246918569407961, "grad_norm": 1.2350420951843262, "learning_rate": 7.338064536385722e-07, "logits/chosen": -2.839816093444824, "logits/rejected": -2.84806489944458, "logps/chosen": -0.7491471171379089, "logps/rejected": -1.098024606704712, "loss": 0.8078, "odds_ratio_loss": 0.5867569446563721, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07491471618413925, "rewards/margins": 0.03488774597644806, "rewards/rejected": -0.10980246961116791, "sft_loss": 0.7491471171379089, "step": 1390 }, { "epoch": 2.263083451202263, "grad_norm": 3.2553515434265137, "learning_rate": 7.040785073292883e-07, "logits/chosen": -2.795974016189575, "logits/rejected": -2.812316417694092, "logps/chosen": -0.8446899652481079, "logps/rejected": -1.1183385848999023, "loss": 0.9119, "odds_ratio_loss": 0.6722968220710754, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08446899801492691, "rewards/margins": 0.02736486867070198, "rewards/rejected": -0.111833855509758, "sft_loss": 0.8446899652481079, "step": 1400 }, { "epoch": 2.279248332996565, "grad_norm": 1.5375083684921265, "learning_rate": 6.748662692849297e-07, "logits/chosen": -2.8378682136535645, "logits/rejected": -2.8527588844299316, "logps/chosen": -0.7140767574310303, "logps/rejected": -1.1210377216339111, "loss": 0.7679, "odds_ratio_loss": 0.5377554893493652, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07140768319368362, "rewards/margins": 0.04069609194993973, "rewards/rejected": -0.11210376024246216, "sft_loss": 0.7140767574310303, "step": 1410 }, { "epoch": 2.295413214790867, "grad_norm": 3.371690273284912, "learning_rate": 6.46178127952686e-07, "logits/chosen": -2.8596229553222656, "logits/rejected": -2.86143159866333, "logps/chosen": -0.7527777552604675, "logps/rejected": -1.0262553691864014, "loss": 0.8073, "odds_ratio_loss": 0.5452762842178345, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07527776062488556, "rewards/margins": 0.02734777331352234, "rewards/rejected": -0.10262554883956909, "sft_loss": 0.7527777552604675, "step": 1420 }, { "epoch": 2.3115780965851687, "grad_norm": 5.5002760887146, "learning_rate": 6.180223212826289e-07, "logits/chosen": -2.8466854095458984, "logits/rejected": -2.84420108795166, "logps/chosen": -0.760028600692749, "logps/rejected": -1.0010223388671875, "loss": 0.8196, "odds_ratio_loss": 0.595847487449646, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07600285112857819, "rewards/margins": 0.024099376052618027, "rewards/rejected": -0.10010223090648651, "sft_loss": 0.760028600692749, "step": 1430 }, { "epoch": 2.3277429783794705, "grad_norm": 2.094597339630127, "learning_rate": 5.904069343621443e-07, "logits/chosen": -2.8559889793395996, "logits/rejected": -2.843318462371826, "logps/chosen": -0.7583047747612, "logps/rejected": -1.0201733112335205, "loss": 0.8157, "odds_ratio_loss": 0.5739010572433472, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07583048194646835, "rewards/margins": 0.02618684433400631, "rewards/rejected": -0.10201732814311981, "sft_loss": 0.7583047747612, "step": 1440 }, { "epoch": 2.3439078601737724, "grad_norm": 3.256753444671631, "learning_rate": 5.633398970942544e-07, "logits/chosen": -2.8187243938446045, "logits/rejected": -2.8463759422302246, "logps/chosen": -0.763822078704834, "logps/rejected": -0.9972942471504211, "loss": 0.8274, "odds_ratio_loss": 0.6356968283653259, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07638221234083176, "rewards/margins": 0.023347217589616776, "rewards/rejected": -0.09972943365573883, "sft_loss": 0.763822078704834, "step": 1450 }, { "epoch": 2.3600727419680743, "grad_norm": 2.1988418102264404, "learning_rate": 5.368289819205069e-07, "logits/chosen": -2.8621747493743896, "logits/rejected": -2.8629798889160156, "logps/chosen": -0.699676513671875, "logps/rejected": -0.9881321787834167, "loss": 0.7602, "odds_ratio_loss": 0.6056861877441406, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06996765732765198, "rewards/margins": 0.028845559805631638, "rewards/rejected": -0.09881322085857391, "sft_loss": 0.699676513671875, "step": 1460 }, { "epoch": 2.376237623762376, "grad_norm": 2.666426181793213, "learning_rate": 5.108818015890785e-07, "logits/chosen": -2.8656005859375, "logits/rejected": -2.889970302581787, "logps/chosen": -0.8437716364860535, "logps/rejected": -1.0408810377120972, "loss": 0.9052, "odds_ratio_loss": 0.6140363216400146, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08437716960906982, "rewards/margins": 0.01971094310283661, "rewards/rejected": -0.10408811271190643, "sft_loss": 0.8437716364860535, "step": 1470 }, { "epoch": 2.392402505556678, "grad_norm": 2.2777225971221924, "learning_rate": 4.855058069687291e-07, "logits/chosen": -2.834155559539795, "logits/rejected": -2.8524587154388428, "logps/chosen": -0.7329773306846619, "logps/rejected": -1.1425807476043701, "loss": 0.7861, "odds_ratio_loss": 0.5314901471138, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07329773157835007, "rewards/margins": 0.04096033796668053, "rewards/rejected": -0.11425807327032089, "sft_loss": 0.7329773306846619, "step": 1480 }, { "epoch": 2.40856738735098, "grad_norm": 2.6650478839874268, "learning_rate": 4.607082849092523e-07, "logits/chosen": -2.862356662750244, "logits/rejected": -2.864802598953247, "logps/chosen": -0.829633891582489, "logps/rejected": -1.0255271196365356, "loss": 0.8935, "odds_ratio_loss": 0.638370156288147, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08296339213848114, "rewards/margins": 0.019589336588978767, "rewards/rejected": -0.10255272686481476, "sft_loss": 0.829633891582489, "step": 1490 }, { "epoch": 2.4247322691452817, "grad_norm": 3.085514783859253, "learning_rate": 4.3649635614901405e-07, "logits/chosen": -2.8451571464538574, "logits/rejected": -2.8950095176696777, "logps/chosen": -0.7389890551567078, "logps/rejected": -0.8802745938301086, "loss": 0.8035, "odds_ratio_loss": 0.6446704864501953, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07389890402555466, "rewards/margins": 0.014128552749752998, "rewards/rejected": -0.0880274623632431, "sft_loss": 0.7389890551567078, "step": 1500 }, { "epoch": 2.4247322691452817, "eval_logits/chosen": -2.8472585678100586, "eval_logits/rejected": -2.8558220863342285, "eval_logps/chosen": -0.7975095510482788, "eval_logps/rejected": -1.0328320264816284, "eval_loss": 0.8629826903343201, "eval_odds_ratio_loss": 0.6547309160232544, "eval_rewards/accuracies": 0.5618181824684143, "eval_rewards/chosen": -0.07975095510482788, "eval_rewards/margins": 0.02353225089609623, "eval_rewards/rejected": -0.10328320413827896, "eval_runtime": 194.6849, "eval_samples_per_second": 5.65, "eval_sft_loss": 0.7975095510482788, "eval_steps_per_second": 2.825, "step": 1500 }, { "epoch": 2.4408971509395836, "grad_norm": 1.7019646167755127, "learning_rate": 4.128769732701973e-07, "logits/chosen": -2.82879638671875, "logits/rejected": -2.832578420639038, "logps/chosen": -0.7700603604316711, "logps/rejected": -0.9951756596565247, "loss": 0.8304, "odds_ratio_loss": 0.6030290722846985, "rewards/accuracies": 0.59375, "rewards/chosen": -0.077006034553051, "rewards/margins": 0.022511538118124008, "rewards/rejected": -0.0995175689458847, "sft_loss": 0.7700603604316711, "step": 1510 }, { "epoch": 2.4570620327338855, "grad_norm": 2.5611681938171387, "learning_rate": 3.8985691870233046e-07, "logits/chosen": -2.882220506668091, "logits/rejected": -2.880516529083252, "logps/chosen": -0.7692660689353943, "logps/rejected": -1.0380921363830566, "loss": 0.8284, "odds_ratio_loss": 0.5917290449142456, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07692660391330719, "rewards/margins": 0.026882609352469444, "rewards/rejected": -0.10380921512842178, "sft_loss": 0.7692660689353943, "step": 1520 }, { "epoch": 2.4732269145281873, "grad_norm": 2.6633763313293457, "learning_rate": 3.6744280277467904e-07, "logits/chosen": -2.8530020713806152, "logits/rejected": -2.8719234466552734, "logps/chosen": -0.7769867181777954, "logps/rejected": -1.0218976736068726, "loss": 0.8392, "odds_ratio_loss": 0.6218123435974121, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07769867032766342, "rewards/margins": 0.024491112679243088, "rewards/rejected": -0.10218977928161621, "sft_loss": 0.7769867181777954, "step": 1530 }, { "epoch": 2.489391796322489, "grad_norm": 2.7384212017059326, "learning_rate": 3.456410618180503e-07, "logits/chosen": -2.832824468612671, "logits/rejected": -2.856114149093628, "logps/chosen": -0.7060586810112, "logps/rejected": -1.0986192226409912, "loss": 0.7646, "odds_ratio_loss": 0.5853801965713501, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07060587406158447, "rewards/margins": 0.03925605118274689, "rewards/rejected": -0.10986192524433136, "sft_loss": 0.7060586810112, "step": 1540 }, { "epoch": 2.5055566781167915, "grad_norm": 1.9465371370315552, "learning_rate": 3.244579563165753e-07, "logits/chosen": -2.8586621284484863, "logits/rejected": -2.869255542755127, "logps/chosen": -0.7589577436447144, "logps/rejected": -1.1315686702728271, "loss": 0.8173, "odds_ratio_loss": 0.5836090445518494, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0758957713842392, "rewards/margins": 0.03726109117269516, "rewards/rejected": -0.11315685510635376, "sft_loss": 0.7589577436447144, "step": 1550 }, { "epoch": 2.521721559911093, "grad_norm": 1.2344708442687988, "learning_rate": 3.038995691099697e-07, "logits/chosen": -2.8416831493377686, "logits/rejected": -2.85313081741333, "logps/chosen": -0.7924615144729614, "logps/rejected": -1.2077696323394775, "loss": 0.8503, "odds_ratio_loss": 0.5783108472824097, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0792461559176445, "rewards/margins": 0.041530806571245193, "rewards/rejected": -0.1207769513130188, "sft_loss": 0.7924615144729614, "step": 1560 }, { "epoch": 2.5378864417053952, "grad_norm": 12.726688385009766, "learning_rate": 2.839718036468192e-07, "logits/chosen": -2.8868002891540527, "logits/rejected": -2.9153692722320557, "logps/chosen": -0.884573757648468, "logps/rejected": -1.0609769821166992, "loss": 0.9513, "odds_ratio_loss": 0.6675292253494263, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0884573832154274, "rewards/margins": 0.01764032617211342, "rewards/rejected": -0.10609769821166992, "sft_loss": 0.884573757648468, "step": 1570 }, { "epoch": 2.5540513234996967, "grad_norm": 2.5232503414154053, "learning_rate": 2.646803822893723e-07, "logits/chosen": -2.8850457668304443, "logits/rejected": -2.894557476043701, "logps/chosen": -0.8000026941299438, "logps/rejected": -1.0157983303070068, "loss": 0.8627, "odds_ratio_loss": 0.6269931793212891, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08000027388334274, "rewards/margins": 0.02157955802977085, "rewards/rejected": -0.10157983005046844, "sft_loss": 0.8000026941299438, "step": 1580 }, { "epoch": 2.570216205293999, "grad_norm": 2.3380508422851562, "learning_rate": 2.460308446703341e-07, "logits/chosen": -2.8933000564575195, "logits/rejected": -2.8834781646728516, "logps/chosen": -0.791167676448822, "logps/rejected": -0.9255102276802063, "loss": 0.8556, "odds_ratio_loss": 0.6445525884628296, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07911677658557892, "rewards/margins": 0.013434251770377159, "rewards/rejected": -0.09255101531744003, "sft_loss": 0.791167676448822, "step": 1590 }, { "epoch": 2.5863810870883004, "grad_norm": 3.6344377994537354, "learning_rate": 2.2802854610213143e-07, "logits/chosen": -2.8420848846435547, "logits/rejected": -2.8515543937683105, "logps/chosen": -0.6993797421455383, "logps/rejected": -1.0781666040420532, "loss": 0.7531, "odds_ratio_loss": 0.5369757413864136, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06993797421455383, "rewards/margins": 0.03787868469953537, "rewards/rejected": -0.1078166589140892, "sft_loss": 0.6993797421455383, "step": 1600 }, { "epoch": 2.6025459688826027, "grad_norm": 2.515239715576172, "learning_rate": 2.106786560391072e-07, "logits/chosen": -2.8365635871887207, "logits/rejected": -2.8803467750549316, "logps/chosen": -0.8032782673835754, "logps/rejected": -1.0168392658233643, "loss": 0.8638, "odds_ratio_loss": 0.6049396395683289, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0803278312087059, "rewards/margins": 0.021356089040637016, "rewards/rejected": -0.10168392956256866, "sft_loss": 0.8032782673835754, "step": 1610 }, { "epoch": 2.6187108506769046, "grad_norm": 1.520639181137085, "learning_rate": 1.9398615659308255e-07, "logits/chosen": -2.861687183380127, "logits/rejected": -2.89752459526062, "logps/chosen": -0.7549802660942078, "logps/rejected": -0.9435558319091797, "loss": 0.8181, "odds_ratio_loss": 0.6309365034103394, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07549802213907242, "rewards/margins": 0.018857568502426147, "rewards/rejected": -0.09435557574033737, "sft_loss": 0.7549802660942078, "step": 1620 }, { "epoch": 2.6348757324712064, "grad_norm": 2.2465171813964844, "learning_rate": 1.7795584110272184e-07, "logits/chosen": -2.8905723094940186, "logits/rejected": -2.877936840057373, "logps/chosen": -0.7934287786483765, "logps/rejected": -1.0050441026687622, "loss": 0.8594, "odds_ratio_loss": 0.6593586802482605, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07934287935495377, "rewards/margins": 0.021161522716283798, "rewards/rejected": -0.10050439834594727, "sft_loss": 0.7934287786483765, "step": 1630 }, { "epoch": 2.6510406142655083, "grad_norm": 4.033486366271973, "learning_rate": 1.6259231275709636e-07, "logits/chosen": -2.8982126712799072, "logits/rejected": -2.8980660438537598, "logps/chosen": -0.7681853175163269, "logps/rejected": -0.9490568041801453, "loss": 0.8356, "odds_ratio_loss": 0.6740620732307434, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0768185406923294, "rewards/margins": 0.018087133765220642, "rewards/rejected": -0.09490568190813065, "sft_loss": 0.7681853175163269, "step": 1640 }, { "epoch": 2.66720549605981, "grad_norm": 1.5368350744247437, "learning_rate": 1.478999832738548e-07, "logits/chosen": -2.8781023025512695, "logits/rejected": -2.8767361640930176, "logps/chosen": -0.7599083185195923, "logps/rejected": -1.0983332395553589, "loss": 0.82, "odds_ratio_loss": 0.601204514503479, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07599084079265594, "rewards/margins": 0.033842481672763824, "rewards/rejected": -0.10983331501483917, "sft_loss": 0.7599083185195923, "step": 1650 }, { "epoch": 2.683370377854112, "grad_norm": 1.8103063106536865, "learning_rate": 1.338830716323769e-07, "logits/chosen": -2.8456664085388184, "logits/rejected": -2.8552403450012207, "logps/chosen": -0.8041807413101196, "logps/rejected": -0.9866863489151001, "loss": 0.8687, "odds_ratio_loss": 0.6454349756240845, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08041806519031525, "rewards/margins": 0.018250569701194763, "rewards/rejected": -0.0986686423420906, "sft_loss": 0.8041807413101196, "step": 1660 }, { "epoch": 2.699535259648414, "grad_norm": 3.796130657196045, "learning_rate": 1.205456028622723e-07, "logits/chosen": -2.8858485221862793, "logits/rejected": -2.883568286895752, "logps/chosen": -0.7273125648498535, "logps/rejected": -1.0116485357284546, "loss": 0.7835, "odds_ratio_loss": 0.5615276098251343, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07273125648498535, "rewards/margins": 0.02843359112739563, "rewards/rejected": -0.10116485506296158, "sft_loss": 0.7273125648498535, "step": 1670 }, { "epoch": 2.7157001414427158, "grad_norm": 1.619040608406067, "learning_rate": 1.0789140688756805e-07, "logits/chosen": -2.8932971954345703, "logits/rejected": -2.8933002948760986, "logps/chosen": -0.7631897926330566, "logps/rejected": -1.0072143077850342, "loss": 0.8217, "odds_ratio_loss": 0.5846946239471436, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07631897926330566, "rewards/margins": 0.024402452632784843, "rewards/rejected": -0.10072143375873566, "sft_loss": 0.7631897926330566, "step": 1680 }, { "epoch": 2.7318650232370176, "grad_norm": 4.591987133026123, "learning_rate": 9.592411742693098e-08, "logits/chosen": -2.8280813694000244, "logits/rejected": -2.832314968109131, "logps/chosen": -0.7757545709609985, "logps/rejected": -0.9772068858146667, "loss": 0.845, "odds_ratio_loss": 0.6925373673439026, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0775754451751709, "rewards/margins": 0.020145252346992493, "rewards/rejected": -0.09772069752216339, "sft_loss": 0.7757545709609985, "step": 1690 }, { "epoch": 2.7480299050313195, "grad_norm": 2.0528857707977295, "learning_rate": 8.464717095022168e-08, "logits/chosen": -2.8116049766540527, "logits/rejected": -2.8237504959106445, "logps/chosen": -0.7476006746292114, "logps/rejected": -1.0309717655181885, "loss": 0.805, "odds_ratio_loss": 0.574048638343811, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0747600644826889, "rewards/margins": 0.02833711728453636, "rewards/rejected": -0.10309717804193497, "sft_loss": 0.7476006746292114, "step": 1700 }, { "epoch": 2.7641947868256214, "grad_norm": 2.445467233657837, "learning_rate": 7.406380569169841e-08, "logits/chosen": -2.860349178314209, "logits/rejected": -2.8944199085235596, "logps/chosen": -0.7957582473754883, "logps/rejected": -0.9725676774978638, "loss": 0.8593, "odds_ratio_loss": 0.6357892155647278, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07957582920789719, "rewards/margins": 0.01768093928694725, "rewards/rejected": -0.09725676476955414, "sft_loss": 0.7957582473754883, "step": 1710 }, { "epoch": 2.7803596686199232, "grad_norm": 11.543617248535156, "learning_rate": 6.417706072013808e-08, "logits/chosen": -2.8683581352233887, "logits/rejected": -2.894205331802368, "logps/chosen": -0.7598998546600342, "logps/rejected": -0.9663190841674805, "loss": 0.8231, "odds_ratio_loss": 0.6316258907318115, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07598999887704849, "rewards/margins": 0.02064192108809948, "rewards/rejected": -0.09663191437721252, "sft_loss": 0.7598998546600342, "step": 1720 }, { "epoch": 2.796524550414225, "grad_norm": 3.360384941101074, "learning_rate": 5.498977506615294e-08, "logits/chosen": -2.8601443767547607, "logits/rejected": -2.898664712905884, "logps/chosen": -0.790396511554718, "logps/rejected": -0.9606446027755737, "loss": 0.8544, "odds_ratio_loss": 0.6396982073783875, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07903965562582016, "rewards/margins": 0.01702481135725975, "rewards/rejected": -0.09606447070837021, "sft_loss": 0.790396511554718, "step": 1730 }, { "epoch": 2.812689432208527, "grad_norm": 2.132490873336792, "learning_rate": 4.6504586906947756e-08, "logits/chosen": -2.8836772441864014, "logits/rejected": -2.9003067016601562, "logps/chosen": -0.8166056871414185, "logps/rejected": -0.9932202100753784, "loss": 0.8767, "odds_ratio_loss": 0.6010292768478394, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08166056871414185, "rewards/margins": 0.01766144670546055, "rewards/rejected": -0.09932202100753784, "sft_loss": 0.8166056871414185, "step": 1740 }, { "epoch": 2.828854314002829, "grad_norm": 7.204352855682373, "learning_rate": 3.8723932808754914e-08, "logits/chosen": -2.887660503387451, "logits/rejected": -2.9059557914733887, "logps/chosen": -0.8569768667221069, "logps/rejected": -0.9907077550888062, "loss": 0.9219, "odds_ratio_loss": 0.6491862535476685, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08569768816232681, "rewards/margins": 0.01337310392409563, "rewards/rejected": -0.09907079488039017, "sft_loss": 0.8569768667221069, "step": 1750 }, { "epoch": 2.8450191957971307, "grad_norm": 3.7778828144073486, "learning_rate": 3.1650047027158014e-08, "logits/chosen": -2.8876945972442627, "logits/rejected": -2.9152872562408447, "logps/chosen": -0.7689987421035767, "logps/rejected": -0.981308102607727, "loss": 0.828, "odds_ratio_loss": 0.5896368622779846, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07689988613128662, "rewards/margins": 0.02123093418776989, "rewards/rejected": -0.09813080728054047, "sft_loss": 0.7689987421035767, "step": 1760 }, { "epoch": 2.8611840775914326, "grad_norm": 1.726138949394226, "learning_rate": 2.5284960865517848e-08, "logits/chosen": -2.851304769515991, "logits/rejected": -2.871598243713379, "logps/chosen": -0.7240949273109436, "logps/rejected": -1.0288841724395752, "loss": 0.7798, "odds_ratio_loss": 0.5571027994155884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0724094957113266, "rewards/margins": 0.030478913336992264, "rewards/rejected": -0.10288842022418976, "sft_loss": 0.7240949273109436, "step": 1770 }, { "epoch": 2.8773489593857344, "grad_norm": 2.2119297981262207, "learning_rate": 1.9630502091670388e-08, "logits/chosen": -2.8459057807922363, "logits/rejected": -2.866259813308716, "logps/chosen": -0.7477800250053406, "logps/rejected": -1.0080687999725342, "loss": 0.8054, "odds_ratio_loss": 0.5758811235427856, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.0747780054807663, "rewards/margins": 0.026028871536254883, "rewards/rejected": -0.10080687701702118, "sft_loss": 0.7477800250053406, "step": 1780 }, { "epoch": 2.8935138411800363, "grad_norm": 2.910409450531006, "learning_rate": 1.4688294413074677e-08, "logits/chosen": -2.850733757019043, "logits/rejected": -2.8780460357666016, "logps/chosen": -0.6847941279411316, "logps/rejected": -1.00661301612854, "loss": 0.7411, "odds_ratio_loss": 0.5632899403572083, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06847941130399704, "rewards/margins": 0.03218189254403114, "rewards/rejected": -0.10066130012273788, "sft_loss": 0.6847941279411316, "step": 1790 }, { "epoch": 2.909678722974338, "grad_norm": 2.044072389602661, "learning_rate": 1.0459757010556626e-08, "logits/chosen": -2.856724262237549, "logits/rejected": -2.877833366394043, "logps/chosen": -0.7718300223350525, "logps/rejected": -0.9458082914352417, "loss": 0.8346, "odds_ratio_loss": 0.6273509860038757, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07718300819396973, "rewards/margins": 0.017397824674844742, "rewards/rejected": -0.09458083659410477, "sft_loss": 0.7718300223350525, "step": 1800 }, { "epoch": 2.92584360476864, "grad_norm": 1.9232614040374756, "learning_rate": 6.94610413078306e-09, "logits/chosen": -2.8028831481933594, "logits/rejected": -2.8568198680877686, "logps/chosen": -0.8266820907592773, "logps/rejected": -1.2092140913009644, "loss": 0.8869, "odds_ratio_loss": 0.6017346382141113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08266820758581161, "rewards/margins": 0.03825319558382034, "rewards/rejected": -0.12092139571905136, "sft_loss": 0.8266820907592773, "step": 1810 }, { "epoch": 2.942008486562942, "grad_norm": 1.0960156917572021, "learning_rate": 4.14834473758563e-09, "logits/chosen": -2.8286824226379395, "logits/rejected": -2.838784694671631, "logps/chosen": -0.7189845442771912, "logps/rejected": -0.9857820272445679, "loss": 0.7756, "odds_ratio_loss": 0.5664829015731812, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0718984454870224, "rewards/margins": 0.02667975425720215, "rewards/rejected": -0.09857820719480515, "sft_loss": 0.7189845442771912, "step": 1820 }, { "epoch": 2.9581733683572438, "grad_norm": 1.63419771194458, "learning_rate": 2.067282222230349e-09, "logits/chosen": -2.8597445487976074, "logits/rejected": -2.8696541786193848, "logps/chosen": -0.7367098331451416, "logps/rejected": -1.0127137899398804, "loss": 0.7943, "odds_ratio_loss": 0.5762413740158081, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07367098331451416, "rewards/margins": 0.027600402012467384, "rewards/rejected": -0.1012713760137558, "sft_loss": 0.7367098331451416, "step": 1830 }, { "epoch": 2.9743382501515456, "grad_norm": 2.9457271099090576, "learning_rate": 7.035141727212979e-10, "logits/chosen": -2.8564071655273438, "logits/rejected": -2.8889355659484863, "logps/chosen": -0.7218343615531921, "logps/rejected": -1.0010156631469727, "loss": 0.7784, "odds_ratio_loss": 0.5654899477958679, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07218344509601593, "rewards/margins": 0.02791813388466835, "rewards/rejected": -0.10010156780481339, "sft_loss": 0.7218343615531921, "step": 1840 }, { "epoch": 2.9905031319458475, "grad_norm": 4.486654758453369, "learning_rate": 5.743220219761592e-11, "logits/chosen": -2.8505501747131348, "logits/rejected": -2.870176076889038, "logps/chosen": -0.8715106248855591, "logps/rejected": -1.054720401763916, "loss": 0.9404, "odds_ratio_loss": 0.6889584064483643, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08715107291936874, "rewards/margins": 0.0183209627866745, "rewards/rejected": -0.10547204315662384, "sft_loss": 0.8715106248855591, "step": 1850 }, { "epoch": 2.9969690846635686, "step": 1854, "total_flos": 2.1013894560546816e+18, "train_loss": 0.9013287582572352, "train_runtime": 18144.1457, "train_samples_per_second": 1.637, "train_steps_per_second": 0.102 } ], "logging_steps": 10, "max_steps": 1854, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.1013894560546816e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }