diff --git "a/QLoRa/trainer_state.json" "b/QLoRa/trainer_state.json" new file mode 100644--- /dev/null +++ "b/QLoRa/trainer_state.json" @@ -0,0 +1,43737 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12855, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 9.701138496398926, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": -1.4137420654296875, + "logits/rejected": -1.1609899997711182, + "logps/chosen": -1.056043028831482, + "logps/rejected": -0.9840900301933289, + "loss": 1.1358, + "odds_ratio_loss": 0.7978944778442383, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10560431331396103, + "rewards/margins": -0.007195314858108759, + "rewards/rejected": -0.09840899705886841, + "sft_loss": 1.056043028831482, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 14.184392929077148, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -1.4698854684829712, + "logits/rejected": -0.8033113479614258, + "logps/chosen": -1.0688271522521973, + "logps/rejected": -0.8340684771537781, + "loss": 1.1659, + "odds_ratio_loss": 0.9709033966064453, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1068827286362648, + "rewards/margins": -0.023475874215364456, + "rewards/rejected": -0.08340685069561005, + "sft_loss": 1.0688271522521973, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 5.41991662979126, + "learning_rate": 1.5e-06, + "logits/chosen": -1.3370453119277954, + "logits/rejected": -1.307146668434143, + "logps/chosen": -1.2577979564666748, + "logps/rejected": -0.7277308106422424, + "loss": 1.3774, + "odds_ratio_loss": 1.196258783340454, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12577982246875763, + "rewards/margins": -0.05300673842430115, + "rewards/rejected": -0.07277307659387589, + "sft_loss": 1.2577979564666748, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 54.984066009521484, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -1.2405481338500977, + "logits/rejected": -1.1191933155059814, + "logps/chosen": -1.3065041303634644, + "logps/rejected": -0.9139581918716431, + "loss": 1.4111, + "odds_ratio_loss": 1.045548915863037, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13065043091773987, + "rewards/margins": -0.03925459831953049, + "rewards/rejected": -0.09139582514762878, + "sft_loss": 1.3065041303634644, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 7.118112087249756, + "learning_rate": 2.5e-06, + "logits/chosen": -1.370082139968872, + "logits/rejected": -1.216050386428833, + "logps/chosen": -1.2248599529266357, + "logps/rejected": -1.151605486869812, + "loss": 1.3155, + "odds_ratio_loss": 0.9067083597183228, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.12248601019382477, + "rewards/margins": -0.007325439713895321, + "rewards/rejected": -0.11516056209802628, + "sft_loss": 1.2248599529266357, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 25.97501564025879, + "learning_rate": 3e-06, + "logits/chosen": -1.396695613861084, + "logits/rejected": -0.8267443776130676, + "logps/chosen": -0.8272935748100281, + "logps/rejected": -0.9551171064376831, + "loss": 0.8852, + "odds_ratio_loss": 0.5793362855911255, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08272935450077057, + "rewards/margins": 0.012782363221049309, + "rewards/rejected": -0.09551171958446503, + "sft_loss": 0.8272935748100281, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 6.940337657928467, + "learning_rate": 3.5e-06, + "logits/chosen": -1.3188809156417847, + "logits/rejected": -0.8736783266067505, + "logps/chosen": -1.076306939125061, + "logps/rejected": -0.9470604658126831, + "loss": 1.1587, + "odds_ratio_loss": 0.8241630792617798, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10763069242238998, + "rewards/margins": -0.012924641370773315, + "rewards/rejected": -0.09470604360103607, + "sft_loss": 1.076306939125061, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 10.32648754119873, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -1.295627236366272, + "logits/rejected": -0.9906957745552063, + "logps/chosen": -1.1894115209579468, + "logps/rejected": -0.960677444934845, + "loss": 1.28, + "odds_ratio_loss": 0.9056330919265747, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11894116550683975, + "rewards/margins": -0.02287341095507145, + "rewards/rejected": -0.09606774151325226, + "sft_loss": 1.1894115209579468, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 5.0081634521484375, + "learning_rate": 4.5e-06, + "logits/chosen": -1.487616777420044, + "logits/rejected": -1.2281392812728882, + "logps/chosen": -0.6496211290359497, + "logps/rejected": -1.5355957746505737, + "loss": 0.6776, + "odds_ratio_loss": 0.2798224091529846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06496210396289825, + "rewards/margins": 0.08859746903181076, + "rewards/rejected": -0.1535595953464508, + "sft_loss": 0.6496211290359497, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 10.065803527832031, + "learning_rate": 5e-06, + "logits/chosen": -1.2928639650344849, + "logits/rejected": -0.9908641576766968, + "logps/chosen": -0.9380599856376648, + "logps/rejected": -2.4311070442199707, + "loss": 0.9678, + "odds_ratio_loss": 0.2970461845397949, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09380599856376648, + "rewards/margins": 0.14930468797683716, + "rewards/rejected": -0.24311068654060364, + "sft_loss": 0.9380599856376648, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 5.9541497230529785, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": -1.368978500366211, + "logits/rejected": -0.7404534220695496, + "logps/chosen": -0.8750090599060059, + "logps/rejected": -0.8971788287162781, + "loss": 0.9421, + "odds_ratio_loss": 0.6712835431098938, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0875009149312973, + "rewards/margins": 0.002216977532953024, + "rewards/rejected": -0.08971788734197617, + "sft_loss": 0.8750090599060059, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 5.816754341125488, + "learning_rate": 6e-06, + "logits/chosen": -1.284854531288147, + "logits/rejected": -1.3418177366256714, + "logps/chosen": -0.9037086367607117, + "logps/rejected": -0.9571773409843445, + "loss": 0.9751, + "odds_ratio_loss": 0.713750422000885, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09037085622549057, + "rewards/margins": 0.005346869118511677, + "rewards/rejected": -0.09571772813796997, + "sft_loss": 0.9037086367607117, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 10.776530265808105, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": -1.157330870628357, + "logits/rejected": -1.152259349822998, + "logps/chosen": -1.0829627513885498, + "logps/rejected": -1.1035131216049194, + "loss": 1.1594, + "odds_ratio_loss": 0.764147937297821, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10829626023769379, + "rewards/margins": 0.002055042190477252, + "rewards/rejected": -0.1103513091802597, + "sft_loss": 1.0829627513885498, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 12.743794441223145, + "learning_rate": 7e-06, + "logits/chosen": -1.3514772653579712, + "logits/rejected": -1.089449405670166, + "logps/chosen": -0.9904440641403198, + "logps/rejected": -1.0389869213104248, + "loss": 1.0554, + "odds_ratio_loss": 0.6493497490882874, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09904440492391586, + "rewards/margins": 0.004854282829910517, + "rewards/rejected": -0.10389868915081024, + "sft_loss": 0.9904440641403198, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 7.272915840148926, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": -1.3866747617721558, + "logits/rejected": -0.6379045248031616, + "logps/chosen": -1.008975625038147, + "logps/rejected": -1.168935775756836, + "loss": 1.0743, + "odds_ratio_loss": 0.653215765953064, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10089756548404694, + "rewards/margins": 0.015996018424630165, + "rewards/rejected": -0.11689357459545135, + "sft_loss": 1.008975625038147, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 198.51878356933594, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -1.4508455991744995, + "logits/rejected": -1.2784332036972046, + "logps/chosen": -1.6414655447006226, + "logps/rejected": -2.3989181518554688, + "loss": 1.7165, + "odds_ratio_loss": 0.7502911686897278, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1641465425491333, + "rewards/margins": 0.07574529200792313, + "rewards/rejected": -0.23989181220531464, + "sft_loss": 1.6414655447006226, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 5.810291290283203, + "learning_rate": 8.5e-06, + "logits/chosen": -1.3303495645523071, + "logits/rejected": -0.7018251419067383, + "logps/chosen": -0.8492165803909302, + "logps/rejected": -0.7184539437294006, + "loss": 0.9336, + "odds_ratio_loss": 0.8441473841667175, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08492164313793182, + "rewards/margins": -0.013076257891952991, + "rewards/rejected": -0.0718453973531723, + "sft_loss": 0.8492165803909302, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 5.648757457733154, + "learning_rate": 9e-06, + "logits/chosen": -1.4343656301498413, + "logits/rejected": -1.3420077562332153, + "logps/chosen": -1.1983975172042847, + "logps/rejected": -2.7446682453155518, + "loss": 1.2826, + "odds_ratio_loss": 0.8421158790588379, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11983975023031235, + "rewards/margins": 0.15462705492973328, + "rewards/rejected": -0.2744668126106262, + "sft_loss": 1.1983975172042847, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 9.33594036102295, + "learning_rate": 9.5e-06, + "logits/chosen": -1.4645180702209473, + "logits/rejected": -1.1062164306640625, + "logps/chosen": -0.8589698672294617, + "logps/rejected": -0.9380282163619995, + "loss": 0.9372, + "odds_ratio_loss": 0.7820344567298889, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08589698374271393, + "rewards/margins": 0.007905842736363411, + "rewards/rejected": -0.09380282461643219, + "sft_loss": 0.8589698672294617, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 6.393992900848389, + "learning_rate": 1e-05, + "logits/chosen": -1.0969440937042236, + "logits/rejected": -1.246144413948059, + "logps/chosen": -1.0969411134719849, + "logps/rejected": -0.7135934829711914, + "loss": 1.2099, + "odds_ratio_loss": 1.1294300556182861, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10969410836696625, + "rewards/margins": -0.038334764540195465, + "rewards/rejected": -0.07135935127735138, + "sft_loss": 1.0969411134719849, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 13.763640403747559, + "learning_rate": 9.999996208432589e-06, + "logits/chosen": -1.2595760822296143, + "logits/rejected": -1.132385492324829, + "logps/chosen": -1.1437269449234009, + "logps/rejected": -1.4012759923934937, + "loss": 1.2214, + "odds_ratio_loss": 0.7770463824272156, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11437269300222397, + "rewards/margins": 0.025754917412996292, + "rewards/rejected": -0.14012759923934937, + "sft_loss": 1.1437269449234009, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 4.661446571350098, + "learning_rate": 9.999984833736102e-06, + "logits/chosen": -1.322127342224121, + "logits/rejected": -0.9959952235221863, + "logps/chosen": -1.2580044269561768, + "logps/rejected": -0.9094411730766296, + "loss": 1.362, + "odds_ratio_loss": 1.0395630598068237, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.12580044567584991, + "rewards/margins": -0.03485632687807083, + "rewards/rejected": -0.09094411134719849, + "sft_loss": 1.2580044269561768, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 43.42310333251953, + "learning_rate": 9.999965875927792e-06, + "logits/chosen": -1.4291982650756836, + "logits/rejected": -1.1815580129623413, + "logps/chosen": -0.7048165798187256, + "logps/rejected": -1.222245216369629, + "loss": 0.7449, + "odds_ratio_loss": 0.4009336829185486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07048166543245316, + "rewards/margins": 0.05174286291003227, + "rewards/rejected": -0.12222452461719513, + "sft_loss": 0.7048165798187256, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 8.35852336883545, + "learning_rate": 9.99993933503641e-06, + "logits/chosen": -1.299971342086792, + "logits/rejected": -0.8148666620254517, + "logps/chosen": -0.9109383821487427, + "logps/rejected": -1.1609586477279663, + "loss": 0.9817, + "odds_ratio_loss": 0.7075847387313843, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09109384566545486, + "rewards/margins": 0.025002023205161095, + "rewards/rejected": -0.11609586328268051, + "sft_loss": 0.9109383821487427, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 8.823124885559082, + "learning_rate": 9.99990521110221e-06, + "logits/chosen": -1.3256946802139282, + "logits/rejected": -0.8278160095214844, + "logps/chosen": -0.9099394679069519, + "logps/rejected": -1.1701653003692627, + "loss": 0.9708, + "odds_ratio_loss": 0.6086278557777405, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0909939557313919, + "rewards/margins": 0.02602258324623108, + "rewards/rejected": -0.11701653897762299, + "sft_loss": 0.9099394679069519, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 6.742435455322266, + "learning_rate": 9.999863504176946e-06, + "logits/chosen": -1.3721487522125244, + "logits/rejected": -0.9425897598266602, + "logps/chosen": -1.0938831567764282, + "logps/rejected": -1.0658223628997803, + "loss": 1.1687, + "odds_ratio_loss": 0.7483164072036743, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1093883141875267, + "rewards/margins": -0.0028060779441148043, + "rewards/rejected": -0.10658223927021027, + "sft_loss": 1.0938831567764282, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 22.933637619018555, + "learning_rate": 9.999814214323868e-06, + "logits/chosen": -1.4473979473114014, + "logits/rejected": -1.0455162525177002, + "logps/chosen": -0.8949346542358398, + "logps/rejected": -1.8385162353515625, + "loss": 0.9424, + "odds_ratio_loss": 0.47495514154434204, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08949346840381622, + "rewards/margins": 0.0943581610918045, + "rewards/rejected": -0.18385162949562073, + "sft_loss": 0.8949346542358398, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 24.416580200195312, + "learning_rate": 9.999757341617735e-06, + "logits/chosen": -1.2248870134353638, + "logits/rejected": -1.105491280555725, + "logps/chosen": -1.0170341730117798, + "logps/rejected": -1.9063619375228882, + "loss": 1.0536, + "odds_ratio_loss": 0.36606836318969727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10170342028141022, + "rewards/margins": 0.08893279731273651, + "rewards/rejected": -0.19063621759414673, + "sft_loss": 1.0170341730117798, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 14.371286392211914, + "learning_rate": 9.9996928861448e-06, + "logits/chosen": -1.3428990840911865, + "logits/rejected": -1.150731086730957, + "logps/chosen": -0.9745044708251953, + "logps/rejected": -1.088010549545288, + "loss": 1.0341, + "odds_ratio_loss": 0.5962523818016052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09745045006275177, + "rewards/margins": 0.011350591666996479, + "rewards/rejected": -0.10880105197429657, + "sft_loss": 0.9745044708251953, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 10.886384963989258, + "learning_rate": 9.999620848002815e-06, + "logits/chosen": -1.401794672012329, + "logits/rejected": -0.9987030029296875, + "logps/chosen": -1.0617334842681885, + "logps/rejected": -1.506756067276001, + "loss": 1.1242, + "odds_ratio_loss": 0.624306857585907, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10617335140705109, + "rewards/margins": 0.04450225830078125, + "rewards/rejected": -0.15067560970783234, + "sft_loss": 1.0617334842681885, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 7.1690449714660645, + "learning_rate": 9.99954122730104e-06, + "logits/chosen": -1.4592490196228027, + "logits/rejected": -1.0813499689102173, + "logps/chosen": -1.055484414100647, + "logps/rejected": -0.9076594114303589, + "loss": 1.143, + "odds_ratio_loss": 0.875472366809845, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1055484414100647, + "rewards/margins": -0.014782501384615898, + "rewards/rejected": -0.09076593816280365, + "sft_loss": 1.055484414100647, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 6.417614936828613, + "learning_rate": 9.999454024160225e-06, + "logits/chosen": -1.1997926235198975, + "logits/rejected": -0.984194278717041, + "logps/chosen": -1.0787403583526611, + "logps/rejected": -0.9788461923599243, + "loss": 1.1611, + "odds_ratio_loss": 0.8234724998474121, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10787403583526611, + "rewards/margins": -0.009989425539970398, + "rewards/rejected": -0.09788461774587631, + "sft_loss": 1.0787403583526611, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 5.122522830963135, + "learning_rate": 9.999359238712628e-06, + "logits/chosen": -1.332296371459961, + "logits/rejected": -0.7916947603225708, + "logps/chosen": -0.8795109987258911, + "logps/rejected": -0.7710585594177246, + "loss": 0.9758, + "odds_ratio_loss": 0.9632610082626343, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08795110136270523, + "rewards/margins": -0.01084524393081665, + "rewards/rejected": -0.07710584998130798, + "sft_loss": 0.8795109987258911, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 19.687545776367188, + "learning_rate": 9.999256871102002e-06, + "logits/chosen": -1.3000966310501099, + "logits/rejected": -1.3263441324234009, + "logps/chosen": -1.4506076574325562, + "logps/rejected": -1.1992508172988892, + "loss": 1.5467, + "odds_ratio_loss": 0.960999608039856, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14506077766418457, + "rewards/margins": -0.025135690346360207, + "rewards/rejected": -0.11992508172988892, + "sft_loss": 1.4506076574325562, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 7.098282337188721, + "learning_rate": 9.9991469214836e-06, + "logits/chosen": -1.183526873588562, + "logits/rejected": -1.3057854175567627, + "logps/chosen": -0.8797661066055298, + "logps/rejected": -0.6698340177536011, + "loss": 0.9943, + "odds_ratio_loss": 1.1449306011199951, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.0879766121506691, + "rewards/margins": -0.02099320851266384, + "rewards/rejected": -0.0669834092259407, + "sft_loss": 0.8797661066055298, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 11.24793529510498, + "learning_rate": 9.999029390024176e-06, + "logits/chosen": -1.4583027362823486, + "logits/rejected": -1.0522215366363525, + "logps/chosen": -0.7333666086196899, + "logps/rejected": -1.126899003982544, + "loss": 0.7751, + "odds_ratio_loss": 0.41780009865760803, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.073336660861969, + "rewards/margins": 0.03935323283076286, + "rewards/rejected": -0.11268989741802216, + "sft_loss": 0.7333666086196899, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 61.70880889892578, + "learning_rate": 9.99890427690198e-06, + "logits/chosen": -1.3517203330993652, + "logits/rejected": -1.0869548320770264, + "logps/chosen": -0.9237698316574097, + "logps/rejected": -1.4883028268814087, + "loss": 0.9746, + "odds_ratio_loss": 0.5079615116119385, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09237699955701828, + "rewards/margins": 0.05645329877734184, + "rewards/rejected": -0.14883029460906982, + "sft_loss": 0.9237698316574097, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 15.535794258117676, + "learning_rate": 9.998771582306763e-06, + "logits/chosen": -1.2531821727752686, + "logits/rejected": -1.177253007888794, + "logps/chosen": -1.2667360305786133, + "logps/rejected": -1.0743447542190552, + "loss": 1.3596, + "odds_ratio_loss": 0.9285286068916321, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1266736090183258, + "rewards/margins": -0.019239135086536407, + "rewards/rejected": -0.1074344664812088, + "sft_loss": 1.2667360305786133, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 16.82183265686035, + "learning_rate": 9.998631306439772e-06, + "logits/chosen": -1.4082105159759521, + "logits/rejected": -1.0227917432785034, + "logps/chosen": -0.9651159048080444, + "logps/rejected": -0.8935340046882629, + "loss": 1.0417, + "odds_ratio_loss": 0.76555335521698, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.0965115949511528, + "rewards/margins": -0.007158198393881321, + "rewards/rejected": -0.08935339748859406, + "sft_loss": 0.9651159048080444, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 6.718238353729248, + "learning_rate": 9.998483449513756e-06, + "logits/chosen": -1.2385953664779663, + "logits/rejected": -0.7646237015724182, + "logps/chosen": -1.0081157684326172, + "logps/rejected": -1.0011669397354126, + "loss": 1.0797, + "odds_ratio_loss": 0.7158805131912231, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10081157833337784, + "rewards/margins": -0.00069489108864218, + "rewards/rejected": -0.10011669248342514, + "sft_loss": 1.0081157684326172, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 9.732516288757324, + "learning_rate": 9.998328011752954e-06, + "logits/chosen": -1.3042540550231934, + "logits/rejected": -0.8548318147659302, + "logps/chosen": -1.2158184051513672, + "logps/rejected": -1.2620457410812378, + "loss": 1.2815, + "odds_ratio_loss": 0.6565437316894531, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12158183753490448, + "rewards/margins": 0.004622741136699915, + "rewards/rejected": -0.12620458006858826, + "sft_loss": 1.2158184051513672, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 15.401986122131348, + "learning_rate": 9.99816499339311e-06, + "logits/chosen": -1.1956886053085327, + "logits/rejected": -0.8465889692306519, + "logps/chosen": -1.0776029825210571, + "logps/rejected": -1.3274129629135132, + "loss": 1.1322, + "odds_ratio_loss": 0.545491099357605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10776031017303467, + "rewards/margins": 0.024980993941426277, + "rewards/rejected": -0.1327413022518158, + "sft_loss": 1.0776029825210571, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 10.379392623901367, + "learning_rate": 9.997994394681463e-06, + "logits/chosen": -1.2499234676361084, + "logits/rejected": -1.2567858695983887, + "logps/chosen": -0.9094167947769165, + "logps/rejected": -1.306290864944458, + "loss": 0.9605, + "odds_ratio_loss": 0.5112447738647461, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09094168245792389, + "rewards/margins": 0.039687395095825195, + "rewards/rejected": -0.13062907755374908, + "sft_loss": 0.9094167947769165, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 11.4965181350708, + "learning_rate": 9.997816215876746e-06, + "logits/chosen": -1.376062035560608, + "logits/rejected": -1.2260369062423706, + "logps/chosen": -0.8196055293083191, + "logps/rejected": -0.5650048851966858, + "loss": 0.9326, + "odds_ratio_loss": 1.1297613382339478, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.08196055144071579, + "rewards/margins": -0.02546006441116333, + "rewards/rejected": -0.05650048330426216, + "sft_loss": 0.8196055293083191, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 14.551139831542969, + "learning_rate": 9.99763045724919e-06, + "logits/chosen": -1.43378746509552, + "logits/rejected": -0.9187615513801575, + "logps/chosen": -1.6133413314819336, + "logps/rejected": -2.874190330505371, + "loss": 1.7014, + "odds_ratio_loss": 0.8803858757019043, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.16133412718772888, + "rewards/margins": 0.12608489394187927, + "rewards/rejected": -0.28741902112960815, + "sft_loss": 1.6133413314819336, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 6.731103420257568, + "learning_rate": 9.997437119080521e-06, + "logits/chosen": -1.3006021976470947, + "logits/rejected": -1.0998780727386475, + "logps/chosen": -1.077310562133789, + "logps/rejected": -1.445723295211792, + "loss": 1.1458, + "odds_ratio_loss": 0.6849013566970825, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10773106664419174, + "rewards/margins": 0.03684128075838089, + "rewards/rejected": -0.14457234740257263, + "sft_loss": 1.077310562133789, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 34.18952941894531, + "learning_rate": 9.997236201663962e-06, + "logits/chosen": -1.480669617652893, + "logits/rejected": -0.9708350300788879, + "logps/chosen": -0.9847885370254517, + "logps/rejected": -1.239406943321228, + "loss": 1.049, + "odds_ratio_loss": 0.6420648694038391, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09847886860370636, + "rewards/margins": 0.025461841374635696, + "rewards/rejected": -0.12394070625305176, + "sft_loss": 0.9847885370254517, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 7.8535075187683105, + "learning_rate": 9.99702770530423e-06, + "logits/chosen": -1.290428876876831, + "logits/rejected": -1.2067062854766846, + "logps/chosen": -1.3287948369979858, + "logps/rejected": -0.8534753918647766, + "loss": 1.4482, + "odds_ratio_loss": 1.1943113803863525, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13287948071956635, + "rewards/margins": -0.04753195121884346, + "rewards/rejected": -0.08534753322601318, + "sft_loss": 1.3287948369979858, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 16.53376007080078, + "learning_rate": 9.996811630317534e-06, + "logits/chosen": -1.4881622791290283, + "logits/rejected": -1.0307799577713013, + "logps/chosen": -1.1290310621261597, + "logps/rejected": -0.9766250848770142, + "loss": 1.2165, + "odds_ratio_loss": 0.8741899728775024, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11290310323238373, + "rewards/margins": -0.015240591950714588, + "rewards/rejected": -0.09766252338886261, + "sft_loss": 1.1290310621261597, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 9.521791458129883, + "learning_rate": 9.996587977031583e-06, + "logits/chosen": -1.3156846761703491, + "logits/rejected": -0.6501539945602417, + "logps/chosen": -1.225023627281189, + "logps/rejected": -1.778752088546753, + "loss": 1.285, + "odds_ratio_loss": 0.5998628735542297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12250236421823502, + "rewards/margins": 0.05537285655736923, + "rewards/rejected": -0.17787523567676544, + "sft_loss": 1.225023627281189, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 14.721240997314453, + "learning_rate": 9.996356745785572e-06, + "logits/chosen": -1.3730287551879883, + "logits/rejected": -1.036368489265442, + "logps/chosen": -1.005418062210083, + "logps/rejected": -2.0516037940979004, + "loss": 1.0814, + "odds_ratio_loss": 0.7593610286712646, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10054180771112442, + "rewards/margins": 0.10461856424808502, + "rewards/rejected": -0.20516034960746765, + "sft_loss": 1.005418062210083, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 94.28663635253906, + "learning_rate": 9.996117936930194e-06, + "logits/chosen": -1.1371415853500366, + "logits/rejected": -1.0226691961288452, + "logps/chosen": -1.1712075471878052, + "logps/rejected": -1.5980589389801025, + "loss": 1.2284, + "odds_ratio_loss": 0.5723423957824707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11712075769901276, + "rewards/margins": 0.0426851324737072, + "rewards/rejected": -0.15980589389801025, + "sft_loss": 1.1712075471878052, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 15.995462417602539, + "learning_rate": 9.995871550827632e-06, + "logits/chosen": -1.2461187839508057, + "logits/rejected": -0.6994976997375488, + "logps/chosen": -1.089324951171875, + "logps/rejected": -0.9996173977851868, + "loss": 1.1704, + "odds_ratio_loss": 0.8106604814529419, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1089324951171875, + "rewards/margins": -0.008970752358436584, + "rewards/rejected": -0.09996173530817032, + "sft_loss": 1.089324951171875, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 11.086663246154785, + "learning_rate": 9.995617587851563e-06, + "logits/chosen": -1.3579238653182983, + "logits/rejected": -0.8066130876541138, + "logps/chosen": -1.0367783308029175, + "logps/rejected": -3.150995969772339, + "loss": 1.1058, + "odds_ratio_loss": 0.6898540258407593, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10367783159017563, + "rewards/margins": 0.21142175793647766, + "rewards/rejected": -0.3150995671749115, + "sft_loss": 1.0367783308029175, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 29.14234733581543, + "learning_rate": 9.995356048387154e-06, + "logits/chosen": -1.1860498189926147, + "logits/rejected": -1.4626860618591309, + "logps/chosen": -1.1332612037658691, + "logps/rejected": -1.233254313468933, + "loss": 1.2158, + "odds_ratio_loss": 0.8249042630195618, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11332611739635468, + "rewards/margins": 0.009999324567615986, + "rewards/rejected": -0.12332544475793839, + "sft_loss": 1.1332612037658691, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 17.886398315429688, + "learning_rate": 9.995086932831063e-06, + "logits/chosen": -1.3031141757965088, + "logits/rejected": -1.1265239715576172, + "logps/chosen": -0.8843557238578796, + "logps/rejected": -0.5077162981033325, + "loss": 1.0095, + "odds_ratio_loss": 1.2514125108718872, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.0884355679154396, + "rewards/margins": -0.037663936614990234, + "rewards/rejected": -0.05077163502573967, + "sft_loss": 0.8843557238578796, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 6.501739025115967, + "learning_rate": 9.994810241591437e-06, + "logits/chosen": -1.4174010753631592, + "logits/rejected": -0.9393793344497681, + "logps/chosen": -0.9986799359321594, + "logps/rejected": -1.7344329357147217, + "loss": 1.0552, + "odds_ratio_loss": 0.5652368068695068, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09986799955368042, + "rewards/margins": 0.07357531040906906, + "rewards/rejected": -0.17344330251216888, + "sft_loss": 0.9986799359321594, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 3.8505361080169678, + "learning_rate": 9.994525975087914e-06, + "logits/chosen": -1.5343291759490967, + "logits/rejected": -0.9835977554321289, + "logps/chosen": -0.9028434753417969, + "logps/rejected": -0.9722514152526855, + "loss": 0.9772, + "odds_ratio_loss": 0.7433664798736572, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09028434753417969, + "rewards/margins": 0.006940790917724371, + "rewards/rejected": -0.0972251445055008, + "sft_loss": 0.9028434753417969, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 7.451793670654297, + "learning_rate": 9.99423413375162e-06, + "logits/chosen": -1.3480112552642822, + "logits/rejected": -0.4587880074977875, + "logps/chosen": -0.9931986927986145, + "logps/rejected": -1.0893638134002686, + "loss": 1.0575, + "odds_ratio_loss": 0.6427035331726074, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09931986033916473, + "rewards/margins": 0.009616507217288017, + "rewards/rejected": -0.1089363843202591, + "sft_loss": 0.9931986927986145, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 6.356688499450684, + "learning_rate": 9.99393471802517e-06, + "logits/chosen": -1.119459867477417, + "logits/rejected": -0.8443056344985962, + "logps/chosen": -0.929328441619873, + "logps/rejected": -0.7678232192993164, + "loss": 1.0215, + "odds_ratio_loss": 0.9219361543655396, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09293285012245178, + "rewards/margins": -0.01615052856504917, + "rewards/rejected": -0.07678232342004776, + "sft_loss": 0.929328441619873, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 7.339206695556641, + "learning_rate": 9.993627728362663e-06, + "logits/chosen": -1.1373573541641235, + "logits/rejected": -0.6757062673568726, + "logps/chosen": -1.032034158706665, + "logps/rejected": -1.0653880834579468, + "loss": 1.1037, + "odds_ratio_loss": 0.7163321375846863, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1032034158706665, + "rewards/margins": 0.003335393965244293, + "rewards/rejected": -0.1065388172864914, + "sft_loss": 1.032034158706665, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 10.089631080627441, + "learning_rate": 9.993313165229692e-06, + "logits/chosen": -1.3139688968658447, + "logits/rejected": -0.9246614575386047, + "logps/chosen": -0.921240508556366, + "logps/rejected": -0.9949884414672852, + "loss": 0.9923, + "odds_ratio_loss": 0.710671603679657, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09212405979633331, + "rewards/margins": 0.007374781183898449, + "rewards/rejected": -0.09949883818626404, + "sft_loss": 0.921240508556366, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 15.631460189819336, + "learning_rate": 9.99299102910333e-06, + "logits/chosen": -1.333685278892517, + "logits/rejected": -0.8180558085441589, + "logps/chosen": -1.2940690517425537, + "logps/rejected": -1.4365034103393555, + "loss": 1.3564, + "odds_ratio_loss": 0.623278796672821, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1294068992137909, + "rewards/margins": 0.01424344815313816, + "rewards/rejected": -0.1436503529548645, + "sft_loss": 1.2940690517425537, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 5.078014373779297, + "learning_rate": 9.992661320472139e-06, + "logits/chosen": -1.337958812713623, + "logits/rejected": -0.6402798295021057, + "logps/chosen": -0.9434254765510559, + "logps/rejected": -0.7878870964050293, + "loss": 1.0331, + "odds_ratio_loss": 0.8966773152351379, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.09434254467487335, + "rewards/margins": -0.015553837642073631, + "rewards/rejected": -0.07878871262073517, + "sft_loss": 0.9434254765510559, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 13.075796127319336, + "learning_rate": 9.992324039836161e-06, + "logits/chosen": -1.4848788976669312, + "logits/rejected": -1.2323976755142212, + "logps/chosen": -0.967354953289032, + "logps/rejected": -0.8414648771286011, + "loss": 1.0499, + "odds_ratio_loss": 0.8251625299453735, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.09673549234867096, + "rewards/margins": -0.012589002028107643, + "rewards/rejected": -0.08414648473262787, + "sft_loss": 0.967354953289032, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 12.756791114807129, + "learning_rate": 9.991979187706925e-06, + "logits/chosen": -1.4242439270019531, + "logits/rejected": -0.8965123295783997, + "logps/chosen": -1.032097339630127, + "logps/rejected": -1.1921613216400146, + "loss": 1.0902, + "odds_ratio_loss": 0.5806518793106079, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1032097339630127, + "rewards/margins": 0.01600639522075653, + "rewards/rejected": -0.11921612918376923, + "sft_loss": 1.032097339630127, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 34.52486038208008, + "learning_rate": 9.991626764607447e-06, + "logits/chosen": -1.3143986463546753, + "logits/rejected": -0.7883418798446655, + "logps/chosen": -1.0698704719543457, + "logps/rejected": -1.3327813148498535, + "loss": 1.1451, + "odds_ratio_loss": 0.7526839971542358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10698704421520233, + "rewards/margins": 0.02629108354449272, + "rewards/rejected": -0.13327813148498535, + "sft_loss": 1.0698704719543457, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 6.101346015930176, + "learning_rate": 9.991266771072219e-06, + "logits/chosen": -1.2860424518585205, + "logits/rejected": -0.9905912280082703, + "logps/chosen": -1.153584361076355, + "logps/rejected": -1.241302490234375, + "loss": 1.2418, + "odds_ratio_loss": 0.8823820948600769, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11535843461751938, + "rewards/margins": 0.00877181626856327, + "rewards/rejected": -0.1241302490234375, + "sft_loss": 1.153584361076355, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 14.8096284866333, + "learning_rate": 9.990899207647215e-06, + "logits/chosen": -1.2073997259140015, + "logits/rejected": -0.8682562708854675, + "logps/chosen": -1.5429751873016357, + "logps/rejected": -1.2009754180908203, + "loss": 1.6562, + "odds_ratio_loss": 1.1326292753219604, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.15429750084877014, + "rewards/margins": -0.034199971705675125, + "rewards/rejected": -0.12009753286838531, + "sft_loss": 1.5429751873016357, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 16.363656997680664, + "learning_rate": 9.990524074889894e-06, + "logits/chosen": -1.4137274026870728, + "logits/rejected": -1.1143226623535156, + "logps/chosen": -1.3855955600738525, + "logps/rejected": -3.2896132469177246, + "loss": 1.4376, + "odds_ratio_loss": 0.5200487375259399, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13855956494808197, + "rewards/margins": 0.1904018074274063, + "rewards/rejected": -0.3289613723754883, + "sft_loss": 1.3855955600738525, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 8.82497501373291, + "learning_rate": 9.990141373369192e-06, + "logits/chosen": -1.2464665174484253, + "logits/rejected": -1.108237385749817, + "logps/chosen": -0.788859486579895, + "logps/rejected": -0.8404549360275269, + "loss": 0.8587, + "odds_ratio_loss": 0.6983199119567871, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07888595759868622, + "rewards/margins": 0.005159544292837381, + "rewards/rejected": -0.08404550701379776, + "sft_loss": 0.788859486579895, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 5.63923454284668, + "learning_rate": 9.989751103665523e-06, + "logits/chosen": -1.2375560998916626, + "logits/rejected": -0.7072022557258606, + "logps/chosen": -0.8630796670913696, + "logps/rejected": -1.0150066614151, + "loss": 0.9281, + "odds_ratio_loss": 0.6499568819999695, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08630797266960144, + "rewards/margins": 0.015192699618637562, + "rewards/rejected": -0.10150066763162613, + "sft_loss": 0.8630796670913696, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 6.3448486328125, + "learning_rate": 9.989353266370785e-06, + "logits/chosen": -1.2494094371795654, + "logits/rejected": -0.7897502779960632, + "logps/chosen": -0.7433738112449646, + "logps/rejected": -2.6994833946228027, + "loss": 0.7757, + "odds_ratio_loss": 0.32370153069496155, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07433737814426422, + "rewards/margins": 0.19561094045639038, + "rewards/rejected": -0.2699483335018158, + "sft_loss": 0.7433738112449646, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 29.389131546020508, + "learning_rate": 9.988947862088343e-06, + "logits/chosen": -1.3360885381698608, + "logits/rejected": -1.07649564743042, + "logps/chosen": -1.17660391330719, + "logps/rejected": -1.7249990701675415, + "loss": 1.234, + "odds_ratio_loss": 0.5744453072547913, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11766038835048676, + "rewards/margins": 0.05483951419591904, + "rewards/rejected": -0.17249992489814758, + "sft_loss": 1.17660391330719, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 5.227707386016846, + "learning_rate": 9.988534891433048e-06, + "logits/chosen": -1.237786054611206, + "logits/rejected": -1.303662896156311, + "logps/chosen": -0.9617307782173157, + "logps/rejected": -1.3591482639312744, + "loss": 1.0078, + "odds_ratio_loss": 0.46071720123291016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09617307037115097, + "rewards/margins": 0.03974176198244095, + "rewards/rejected": -0.13591483235359192, + "sft_loss": 0.9617307782173157, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 6.049983501434326, + "learning_rate": 9.98811435503122e-06, + "logits/chosen": -1.420792818069458, + "logits/rejected": -1.1584433317184448, + "logps/chosen": -1.0772771835327148, + "logps/rejected": -0.9023246765136719, + "loss": 1.1665, + "odds_ratio_loss": 0.8922192454338074, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10772772133350372, + "rewards/margins": -0.01749524101614952, + "rewards/rejected": -0.0902324691414833, + "sft_loss": 1.0772771835327148, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 95.02626037597656, + "learning_rate": 9.987686253520657e-06, + "logits/chosen": -1.4909613132476807, + "logits/rejected": -1.1455066204071045, + "logps/chosen": -0.9977737665176392, + "logps/rejected": -1.1992311477661133, + "loss": 1.061, + "odds_ratio_loss": 0.6324112415313721, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09977736324071884, + "rewards/margins": 0.020145747810602188, + "rewards/rejected": -0.11992311477661133, + "sft_loss": 0.9977737665176392, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 10.860966682434082, + "learning_rate": 9.98725058755063e-06, + "logits/chosen": -1.4899775981903076, + "logits/rejected": -1.1015746593475342, + "logps/chosen": -1.325384497642517, + "logps/rejected": -1.309385895729065, + "loss": 1.3994, + "odds_ratio_loss": 0.7398003339767456, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13253845274448395, + "rewards/margins": -0.0015998601447790861, + "rewards/rejected": -0.1309386044740677, + "sft_loss": 1.325384497642517, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 5.0103983879089355, + "learning_rate": 9.986807357781878e-06, + "logits/chosen": -1.4857903718948364, + "logits/rejected": -0.9983230829238892, + "logps/chosen": -0.9025314450263977, + "logps/rejected": -0.843797504901886, + "loss": 0.9809, + "odds_ratio_loss": 0.7840155363082886, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09025314450263977, + "rewards/margins": -0.005873396061360836, + "rewards/rejected": -0.08437974750995636, + "sft_loss": 0.9025314450263977, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 12.473005294799805, + "learning_rate": 9.986356564886621e-06, + "logits/chosen": -1.3183162212371826, + "logits/rejected": -0.7969815135002136, + "logps/chosen": -1.0109480619430542, + "logps/rejected": -1.190051794052124, + "loss": 1.0721, + "odds_ratio_loss": 0.6110685467720032, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1010948047041893, + "rewards/margins": 0.017910365015268326, + "rewards/rejected": -0.11900516599416733, + "sft_loss": 1.0109480619430542, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 5.28497838973999, + "learning_rate": 9.985898209548541e-06, + "logits/chosen": -1.3139629364013672, + "logits/rejected": -0.9575953483581543, + "logps/chosen": -1.0471327304840088, + "logps/rejected": -1.388662338256836, + "loss": 1.107, + "odds_ratio_loss": 0.5983954071998596, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10471327602863312, + "rewards/margins": 0.03415297716856003, + "rewards/rejected": -0.13886624574661255, + "sft_loss": 1.0471327304840088, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 7.404033184051514, + "learning_rate": 9.98543229246279e-06, + "logits/chosen": -1.2961384057998657, + "logits/rejected": -0.8433519601821899, + "logps/chosen": -1.1629507541656494, + "logps/rejected": -1.3445327281951904, + "loss": 1.2312, + "odds_ratio_loss": 0.6821416616439819, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11629508435726166, + "rewards/margins": 0.0181582011282444, + "rewards/rejected": -0.13445329666137695, + "sft_loss": 1.1629507541656494, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 67.28243255615234, + "learning_rate": 9.984958814335995e-06, + "logits/chosen": -1.206665277481079, + "logits/rejected": -0.8332012295722961, + "logps/chosen": -1.3545682430267334, + "logps/rejected": -0.9981037378311157, + "loss": 1.4571, + "odds_ratio_loss": 1.0249886512756348, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13545681536197662, + "rewards/margins": -0.035646453499794006, + "rewards/rejected": -0.09981036931276321, + "sft_loss": 1.3545682430267334, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 10.88105583190918, + "learning_rate": 9.984477775886241e-06, + "logits/chosen": -1.2840704917907715, + "logits/rejected": -0.9073774218559265, + "logps/chosen": -1.0101227760314941, + "logps/rejected": -1.3432159423828125, + "loss": 1.0694, + "odds_ratio_loss": 0.5929387211799622, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10101227462291718, + "rewards/margins": 0.033309321850538254, + "rewards/rejected": -0.13432160019874573, + "sft_loss": 1.0101227760314941, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 7.556691646575928, + "learning_rate": 9.983989177843088e-06, + "logits/chosen": -1.2547153234481812, + "logits/rejected": -0.6640560626983643, + "logps/chosen": -0.8858125805854797, + "logps/rejected": -0.8567777872085571, + "loss": 0.9631, + "odds_ratio_loss": 0.7732707262039185, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08858126401901245, + "rewards/margins": -0.002903483808040619, + "rewards/rejected": -0.08567778021097183, + "sft_loss": 0.8858125805854797, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 18.232669830322266, + "learning_rate": 9.983493020947553e-06, + "logits/chosen": -1.4706394672393799, + "logits/rejected": -1.0486671924591064, + "logps/chosen": -1.2855408191680908, + "logps/rejected": -1.816890001296997, + "loss": 1.3368, + "odds_ratio_loss": 0.5123870968818665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1285540759563446, + "rewards/margins": 0.053134918212890625, + "rewards/rejected": -0.18168899416923523, + "sft_loss": 1.2855408191680908, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 13.52712631225586, + "learning_rate": 9.982989305952125e-06, + "logits/chosen": -1.439195990562439, + "logits/rejected": -1.036635398864746, + "logps/chosen": -1.218326449394226, + "logps/rejected": -2.1572933197021484, + "loss": 1.2839, + "odds_ratio_loss": 0.6560950875282288, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12183265388011932, + "rewards/margins": 0.09389667212963104, + "rewards/rejected": -0.21572932600975037, + "sft_loss": 1.218326449394226, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 14.365988731384277, + "learning_rate": 9.982478033620746e-06, + "logits/chosen": -1.367100477218628, + "logits/rejected": -0.7920514345169067, + "logps/chosen": -0.8387094736099243, + "logps/rejected": -4.545905590057373, + "loss": 0.8913, + "odds_ratio_loss": 0.5259830951690674, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08387093245983124, + "rewards/margins": 0.3707196116447449, + "rewards/rejected": -0.4545905590057373, + "sft_loss": 0.8387094736099243, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 9.169198989868164, + "learning_rate": 9.98195920472883e-06, + "logits/chosen": -1.3178900480270386, + "logits/rejected": -1.0445078611373901, + "logps/chosen": -0.848944365978241, + "logps/rejected": -0.767935574054718, + "loss": 0.9279, + "odds_ratio_loss": 0.7899635434150696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08489444106817245, + "rewards/margins": -0.008100892417132854, + "rewards/rejected": -0.07679355144500732, + "sft_loss": 0.848944365978241, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 7.628312587738037, + "learning_rate": 9.981432820063249e-06, + "logits/chosen": -1.4050272703170776, + "logits/rejected": -1.0616410970687866, + "logps/chosen": -0.9740702509880066, + "logps/rejected": -1.0533596277236938, + "loss": 1.0414, + "odds_ratio_loss": 0.6728119254112244, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0974070280790329, + "rewards/margins": 0.007928932085633278, + "rewards/rejected": -0.10533596575260162, + "sft_loss": 0.9740702509880066, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 22.665225982666016, + "learning_rate": 9.980898880422324e-06, + "logits/chosen": -1.3875292539596558, + "logits/rejected": -0.9992658495903015, + "logps/chosen": -1.2542369365692139, + "logps/rejected": -1.6469194889068604, + "loss": 1.31, + "odds_ratio_loss": 0.5573633909225464, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12542369961738586, + "rewards/margins": 0.03926824778318405, + "rewards/rejected": -0.1646919548511505, + "sft_loss": 1.2542369365692139, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 36.47029113769531, + "learning_rate": 9.980357386615852e-06, + "logits/chosen": -1.1831592321395874, + "logits/rejected": -0.7186304330825806, + "logps/chosen": -1.766829252243042, + "logps/rejected": -0.7457488775253296, + "loss": 1.947, + "odds_ratio_loss": 1.8014347553253174, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.17668291926383972, + "rewards/margins": -0.10210806131362915, + "rewards/rejected": -0.07457488030195236, + "sft_loss": 1.766829252243042, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 17.059722900390625, + "learning_rate": 9.97980833946507e-06, + "logits/chosen": -1.448505163192749, + "logits/rejected": -0.8654235005378723, + "logps/chosen": -1.2362231016159058, + "logps/rejected": -3.4485504627227783, + "loss": 1.273, + "odds_ratio_loss": 0.36770644783973694, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12362232059240341, + "rewards/margins": 0.22123269736766815, + "rewards/rejected": -0.34485501050949097, + "sft_loss": 1.2362231016159058, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 9.267600059509277, + "learning_rate": 9.97925173980268e-06, + "logits/chosen": -1.36885666847229, + "logits/rejected": -0.8539026379585266, + "logps/chosen": -1.0619010925292969, + "logps/rejected": -1.1658036708831787, + "loss": 1.1364, + "odds_ratio_loss": 0.7445744276046753, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10619010776281357, + "rewards/margins": 0.010390259325504303, + "rewards/rejected": -0.11658036708831787, + "sft_loss": 1.0619010925292969, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 10.483030319213867, + "learning_rate": 9.978687588472838e-06, + "logits/chosen": -1.2792476415634155, + "logits/rejected": -0.5440900921821594, + "logps/chosen": -1.1186585426330566, + "logps/rejected": -1.2156472206115723, + "loss": 1.1859, + "odds_ratio_loss": 0.6719276905059814, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11186586320400238, + "rewards/margins": 0.009698860347270966, + "rewards/rejected": -0.12156472355127335, + "sft_loss": 1.1186585426330566, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 5.348385810852051, + "learning_rate": 9.978115886331147e-06, + "logits/chosen": -1.2613639831542969, + "logits/rejected": -0.782037079334259, + "logps/chosen": -1.2296042442321777, + "logps/rejected": -1.3125030994415283, + "loss": 1.3113, + "odds_ratio_loss": 0.8173338770866394, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12296042591333389, + "rewards/margins": 0.008289876393973827, + "rewards/rejected": -0.1312503069639206, + "sft_loss": 1.2296042442321777, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 18.186199188232422, + "learning_rate": 9.977536634244668e-06, + "logits/chosen": -1.3923256397247314, + "logits/rejected": -0.8637619018554688, + "logps/chosen": -1.040005087852478, + "logps/rejected": -1.5120867490768433, + "loss": 1.0898, + "odds_ratio_loss": 0.4984281659126282, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.104000523686409, + "rewards/margins": 0.04720815271139145, + "rewards/rejected": -0.15120866894721985, + "sft_loss": 1.040005087852478, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 12.430156707763672, + "learning_rate": 9.976949833091912e-06, + "logits/chosen": -1.278136134147644, + "logits/rejected": -0.9997159838676453, + "logps/chosen": -1.2000458240509033, + "logps/rejected": -1.8299169540405273, + "loss": 1.297, + "odds_ratio_loss": 0.9691734313964844, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1200045794248581, + "rewards/margins": 0.06298711150884628, + "rewards/rejected": -0.18299169838428497, + "sft_loss": 1.2000458240509033, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 5.525488376617432, + "learning_rate": 9.976355483762836e-06, + "logits/chosen": -1.1536939144134521, + "logits/rejected": -1.0693880319595337, + "logps/chosen": -0.8165262341499329, + "logps/rejected": -0.9334398508071899, + "loss": 0.8891, + "odds_ratio_loss": 0.7261025309562683, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08165262639522552, + "rewards/margins": 0.011691351421177387, + "rewards/rejected": -0.09334397315979004, + "sft_loss": 0.8165262341499329, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 16.369741439819336, + "learning_rate": 9.975753587158845e-06, + "logits/chosen": -1.271452784538269, + "logits/rejected": -1.020087718963623, + "logps/chosen": -1.0287582874298096, + "logps/rejected": -2.0818448066711426, + "loss": 1.0777, + "odds_ratio_loss": 0.4890977442264557, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10287582874298096, + "rewards/margins": 0.10530862957239151, + "rewards/rejected": -0.20818445086479187, + "sft_loss": 1.0287582874298096, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 7.120456695556641, + "learning_rate": 9.975144144192794e-06, + "logits/chosen": -1.433354139328003, + "logits/rejected": -0.9860296249389648, + "logps/chosen": -0.8577227592468262, + "logps/rejected": -1.743486762046814, + "loss": 0.9003, + "odds_ratio_loss": 0.4252917766571045, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08577227592468262, + "rewards/margins": 0.08857639133930206, + "rewards/rejected": -0.17434866726398468, + "sft_loss": 0.8577227592468262, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 19.190956115722656, + "learning_rate": 9.97452715578898e-06, + "logits/chosen": -1.2277740240097046, + "logits/rejected": -1.2004899978637695, + "logps/chosen": -0.9218929409980774, + "logps/rejected": -1.2253445386886597, + "loss": 0.9733, + "odds_ratio_loss": 0.5138994455337524, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09218929708003998, + "rewards/margins": 0.030345162376761436, + "rewards/rejected": -0.12253445386886597, + "sft_loss": 0.9218929409980774, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 29.12042236328125, + "learning_rate": 9.973902622883142e-06, + "logits/chosen": -1.1885347366333008, + "logits/rejected": -1.1869349479675293, + "logps/chosen": -1.316017508506775, + "logps/rejected": -0.8871244192123413, + "loss": 1.4462, + "odds_ratio_loss": 1.3014862537384033, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1316017359495163, + "rewards/margins": -0.0428893081843853, + "rewards/rejected": -0.08871243894100189, + "sft_loss": 1.316017508506775, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 12.59604549407959, + "learning_rate": 9.973270546422465e-06, + "logits/chosen": -1.1499083042144775, + "logits/rejected": -1.0709830522537231, + "logps/chosen": -1.061827540397644, + "logps/rejected": -1.1412575244903564, + "loss": 1.1507, + "odds_ratio_loss": 0.888770580291748, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1061827540397644, + "rewards/margins": 0.007943002507090569, + "rewards/rejected": -0.11412575095891953, + "sft_loss": 1.061827540397644, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 29.967601776123047, + "learning_rate": 9.972630927365574e-06, + "logits/chosen": -1.269574522972107, + "logits/rejected": -0.6280655860900879, + "logps/chosen": -1.1269054412841797, + "logps/rejected": -1.688178300857544, + "loss": 1.1772, + "odds_ratio_loss": 0.5029090046882629, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11269054561853409, + "rewards/margins": 0.056127287447452545, + "rewards/rejected": -0.16881783306598663, + "sft_loss": 1.1269054412841797, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 32.36668014526367, + "learning_rate": 9.971983766682532e-06, + "logits/chosen": -1.2929702997207642, + "logits/rejected": -0.6805129647254944, + "logps/chosen": -0.9838630557060242, + "logps/rejected": -1.9721927642822266, + "loss": 1.0517, + "odds_ratio_loss": 0.6784967184066772, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09838631004095078, + "rewards/margins": 0.09883297979831696, + "rewards/rejected": -0.19721928238868713, + "sft_loss": 0.9838630557060242, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 7.508626461029053, + "learning_rate": 9.97132906535484e-06, + "logits/chosen": -1.4744285345077515, + "logits/rejected": -1.0449306964874268, + "logps/chosen": -1.0357000827789307, + "logps/rejected": -2.0317296981811523, + "loss": 1.074, + "odds_ratio_loss": 0.3831598162651062, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10357002168893814, + "rewards/margins": 0.09960293769836426, + "rewards/rejected": -0.203172966837883, + "sft_loss": 1.0357000827789307, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 5.905401706695557, + "learning_rate": 9.970666824375436e-06, + "logits/chosen": -1.3433470726013184, + "logits/rejected": -0.807905375957489, + "logps/chosen": -1.0914314985275269, + "logps/rejected": -1.157183051109314, + "loss": 1.1575, + "odds_ratio_loss": 0.660188615322113, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10914316028356552, + "rewards/margins": 0.006575146224349737, + "rewards/rejected": -0.1157183051109314, + "sft_loss": 1.0914314985275269, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 11.22401237487793, + "learning_rate": 9.969997044748691e-06, + "logits/chosen": -1.3542927503585815, + "logits/rejected": -0.971311092376709, + "logps/chosen": -1.3239521980285645, + "logps/rejected": -1.7703745365142822, + "loss": 1.4147, + "odds_ratio_loss": 0.9070507884025574, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1323952078819275, + "rewards/margins": 0.04464225098490715, + "rewards/rejected": -0.17703744769096375, + "sft_loss": 1.3239521980285645, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 7.181665897369385, + "learning_rate": 9.969319727490415e-06, + "logits/chosen": -1.4340964555740356, + "logits/rejected": -0.8164238929748535, + "logps/chosen": -0.9762603640556335, + "logps/rejected": -1.5105446577072144, + "loss": 1.0495, + "odds_ratio_loss": 0.7322009205818176, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09762604534626007, + "rewards/margins": 0.053428418934345245, + "rewards/rejected": -0.1510544717311859, + "sft_loss": 0.9762603640556335, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 6.170845031738281, + "learning_rate": 9.96863487362784e-06, + "logits/chosen": -1.3454340696334839, + "logits/rejected": -0.6722744703292847, + "logps/chosen": -0.8916047811508179, + "logps/rejected": -1.5881434679031372, + "loss": 0.945, + "odds_ratio_loss": 0.5340217351913452, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0891604870557785, + "rewards/margins": 0.06965386122465134, + "rewards/rejected": -0.15881435573101044, + "sft_loss": 0.8916047811508179, + "step": 555 + }, + { + "epoch": 0.04, + "grad_norm": 6.939874172210693, + "learning_rate": 9.967942484199638e-06, + "logits/chosen": -1.319394588470459, + "logits/rejected": -0.7050190567970276, + "logps/chosen": -0.8833843469619751, + "logps/rejected": -1.2793550491333008, + "loss": 0.9408, + "odds_ratio_loss": 0.5743966698646545, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0883384421467781, + "rewards/margins": 0.03959707170724869, + "rewards/rejected": -0.1279354989528656, + "sft_loss": 0.8833843469619751, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 215.84613037109375, + "learning_rate": 9.967242560255906e-06, + "logits/chosen": -1.2708102464675903, + "logits/rejected": -1.2701189517974854, + "logps/chosen": -0.9652470350265503, + "logps/rejected": -2.828083038330078, + "loss": 0.9969, + "odds_ratio_loss": 0.3170081079006195, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09652470052242279, + "rewards/margins": 0.18628360331058502, + "rewards/rejected": -0.2828083038330078, + "sft_loss": 0.9652470350265503, + "step": 565 + }, + { + "epoch": 0.04, + "grad_norm": 20.366199493408203, + "learning_rate": 9.966535102858163e-06, + "logits/chosen": -1.1454098224639893, + "logits/rejected": -1.2445162534713745, + "logps/chosen": -1.3247931003570557, + "logps/rejected": -0.9458759427070618, + "loss": 1.4305, + "odds_ratio_loss": 1.0566179752349854, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.13247931003570557, + "rewards/margins": -0.03789170831441879, + "rewards/rejected": -0.09458760172128677, + "sft_loss": 1.3247931003570557, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 8.253193855285645, + "learning_rate": 9.965820113079361e-06, + "logits/chosen": -1.3363704681396484, + "logits/rejected": -0.9352320432662964, + "logps/chosen": -0.9640430212020874, + "logps/rejected": -1.2558622360229492, + "loss": 1.0235, + "odds_ratio_loss": 0.5942882299423218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0964042991399765, + "rewards/margins": 0.029181916266679764, + "rewards/rejected": -0.12558622658252716, + "sft_loss": 0.9640430212020874, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 9.946394920349121, + "learning_rate": 9.965097592003874e-06, + "logits/chosen": -1.3996388912200928, + "logits/rejected": -1.0949664115905762, + "logps/chosen": -0.8815194368362427, + "logps/rejected": -2.7259840965270996, + "loss": 0.9355, + "odds_ratio_loss": 0.5393964052200317, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0881519466638565, + "rewards/margins": 0.18444648385047913, + "rewards/rejected": -0.2725984454154968, + "sft_loss": 0.8815194368362427, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 7.081384658813477, + "learning_rate": 9.964367540727492e-06, + "logits/chosen": -1.2155399322509766, + "logits/rejected": -0.9352075457572937, + "logps/chosen": -1.4964728355407715, + "logps/rejected": -2.1899776458740234, + "loss": 1.5801, + "odds_ratio_loss": 0.8366632461547852, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14964726567268372, + "rewards/margins": 0.0693504810333252, + "rewards/rejected": -0.2189977616071701, + "sft_loss": 1.4964728355407715, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 9.30331039428711, + "learning_rate": 9.963629960357438e-06, + "logits/chosen": -1.3705943822860718, + "logits/rejected": -0.8403999209403992, + "logps/chosen": -1.2652610540390015, + "logps/rejected": -1.490480661392212, + "loss": 1.3256, + "odds_ratio_loss": 0.6036292910575867, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1265261024236679, + "rewards/margins": 0.0225219689309597, + "rewards/rejected": -0.1490480750799179, + "sft_loss": 1.2652610540390015, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 11.273741722106934, + "learning_rate": 9.96288485201234e-06, + "logits/chosen": -1.4199020862579346, + "logits/rejected": -1.1455968618392944, + "logps/chosen": -0.9536746740341187, + "logps/rejected": -1.5848302841186523, + "loss": 0.9977, + "odds_ratio_loss": 0.44051140546798706, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09536747634410858, + "rewards/margins": 0.06311556696891785, + "rewards/rejected": -0.15848304331302643, + "sft_loss": 0.9536746740341187, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 12.181118965148926, + "learning_rate": 9.962132216822252e-06, + "logits/chosen": -1.4740102291107178, + "logits/rejected": -0.8962316513061523, + "logps/chosen": -1.0127413272857666, + "logps/rejected": -1.6857773065567017, + "loss": 1.066, + "odds_ratio_loss": 0.5327258706092834, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10127413272857666, + "rewards/margins": 0.0673035979270935, + "rewards/rejected": -0.16857774555683136, + "sft_loss": 1.0127413272857666, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 15.026229858398438, + "learning_rate": 9.96137205592864e-06, + "logits/chosen": -1.5482691526412964, + "logits/rejected": -1.1137874126434326, + "logps/chosen": -0.8740970492362976, + "logps/rejected": -1.5898300409317017, + "loss": 0.9234, + "odds_ratio_loss": 0.4932100176811218, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08740969747304916, + "rewards/margins": 0.07157330214977264, + "rewards/rejected": -0.1589830219745636, + "sft_loss": 0.8740970492362976, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 31.824352264404297, + "learning_rate": 9.960604370484385e-06, + "logits/chosen": -1.4865148067474365, + "logits/rejected": -1.0594385862350464, + "logps/chosen": -1.0654785633087158, + "logps/rejected": -1.3001195192337036, + "loss": 1.1611, + "odds_ratio_loss": 0.9559617042541504, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10654785484075546, + "rewards/margins": 0.023464106023311615, + "rewards/rejected": -0.13001194596290588, + "sft_loss": 1.0654785633087158, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 9.08944034576416, + "learning_rate": 9.959829161653778e-06, + "logits/chosen": -1.456381916999817, + "logits/rejected": -0.8847858309745789, + "logps/chosen": -0.9969884157180786, + "logps/rejected": -0.8793743252754211, + "loss": 1.0802, + "odds_ratio_loss": 0.8324553370475769, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09969884157180786, + "rewards/margins": -0.011761406436562538, + "rewards/rejected": -0.08793742954730988, + "sft_loss": 0.9969884157180786, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 8.12314224243164, + "learning_rate": 9.959046430612524e-06, + "logits/chosen": -1.420644760131836, + "logits/rejected": -1.0690540075302124, + "logps/chosen": -0.9774150848388672, + "logps/rejected": -0.9998822212219238, + "loss": 1.0705, + "odds_ratio_loss": 0.9305570721626282, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0977415144443512, + "rewards/margins": 0.002246710704639554, + "rewards/rejected": -0.09998822957277298, + "sft_loss": 0.9774150848388672, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 5.643470287322998, + "learning_rate": 9.958256178547734e-06, + "logits/chosen": -1.3239028453826904, + "logits/rejected": -1.0841190814971924, + "logps/chosen": -1.0758788585662842, + "logps/rejected": -2.1539533138275146, + "loss": 1.1522, + "odds_ratio_loss": 0.7633380889892578, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10758789628744125, + "rewards/margins": 0.10780743509531021, + "rewards/rejected": -0.21539533138275146, + "sft_loss": 1.0758788585662842, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 7.302849769592285, + "learning_rate": 9.957458406657924e-06, + "logits/chosen": -1.3189386129379272, + "logits/rejected": -0.8606952428817749, + "logps/chosen": -0.8432208895683289, + "logps/rejected": -0.9131423234939575, + "loss": 0.9137, + "odds_ratio_loss": 0.7045748829841614, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08432208746671677, + "rewards/margins": 0.006992141250520945, + "rewards/rejected": -0.09131423383951187, + "sft_loss": 0.8432208895683289, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 8.54017448425293, + "learning_rate": 9.956653116153015e-06, + "logits/chosen": -1.2382190227508545, + "logits/rejected": -1.2081432342529297, + "logps/chosen": -0.7250022888183594, + "logps/rejected": -0.5700157284736633, + "loss": 0.8206, + "odds_ratio_loss": 0.95604407787323, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07250023633241653, + "rewards/margins": -0.015498657710850239, + "rewards/rejected": -0.05700157210230827, + "sft_loss": 0.7250022888183594, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 40.91093444824219, + "learning_rate": 9.955840308254336e-06, + "logits/chosen": -1.2277017831802368, + "logits/rejected": -0.9123063087463379, + "logps/chosen": -1.060502529144287, + "logps/rejected": -2.4313435554504395, + "loss": 1.1204, + "odds_ratio_loss": 0.5990681052207947, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1060502678155899, + "rewards/margins": 0.13708409667015076, + "rewards/rejected": -0.24313434958457947, + "sft_loss": 1.060502529144287, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 7.556923866271973, + "learning_rate": 9.955019984194611e-06, + "logits/chosen": -1.3158109188079834, + "logits/rejected": -0.7409260272979736, + "logps/chosen": -1.08364737033844, + "logps/rejected": -1.4582345485687256, + "loss": 1.1466, + "odds_ratio_loss": 0.6292973756790161, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10836473852396011, + "rewards/margins": 0.03745872527360916, + "rewards/rejected": -0.14582346379756927, + "sft_loss": 1.08364737033844, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 15.207615852355957, + "learning_rate": 9.954192145217966e-06, + "logits/chosen": -1.3571964502334595, + "logits/rejected": -0.6947706937789917, + "logps/chosen": -1.1696298122406006, + "logps/rejected": -0.8519676327705383, + "loss": 1.2938, + "odds_ratio_loss": 1.2413297891616821, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1169629842042923, + "rewards/margins": -0.031766220927238464, + "rewards/rejected": -0.08519675582647324, + "sft_loss": 1.1696298122406006, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 5.49888277053833, + "learning_rate": 9.953356792579925e-06, + "logits/chosen": -1.4864259958267212, + "logits/rejected": -0.9483200311660767, + "logps/chosen": -1.032362461090088, + "logps/rejected": -1.8657007217407227, + "loss": 1.082, + "odds_ratio_loss": 0.4959256649017334, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10323625802993774, + "rewards/margins": 0.08333383500576019, + "rewards/rejected": -0.18657009303569794, + "sft_loss": 1.032362461090088, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 9.250358581542969, + "learning_rate": 9.952513927547405e-06, + "logits/chosen": -1.4891111850738525, + "logits/rejected": -1.4543288946151733, + "logps/chosen": -1.168774127960205, + "logps/rejected": -2.8399128913879395, + "loss": 1.2364, + "odds_ratio_loss": 0.6762497425079346, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11687741428613663, + "rewards/margins": 0.16711387038230896, + "rewards/rejected": -0.283991277217865, + "sft_loss": 1.168774127960205, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 9.42989730834961, + "learning_rate": 9.951663551398717e-06, + "logits/chosen": -1.4089422225952148, + "logits/rejected": -1.302330732345581, + "logps/chosen": -1.2792950868606567, + "logps/rejected": -1.7675468921661377, + "loss": 1.3501, + "odds_ratio_loss": 0.7083412408828735, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12792950868606567, + "rewards/margins": 0.048825182020664215, + "rewards/rejected": -0.17675469815731049, + "sft_loss": 1.2792950868606567, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 57.358638763427734, + "learning_rate": 9.950805665423566e-06, + "logits/chosen": -1.3845967054367065, + "logits/rejected": -0.9723657369613647, + "logps/chosen": -0.7079774141311646, + "logps/rejected": -3.329834461212158, + "loss": 0.7583, + "odds_ratio_loss": 0.5035191178321838, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07079773396253586, + "rewards/margins": 0.2621857225894928, + "rewards/rejected": -0.33298343420028687, + "sft_loss": 0.7079774141311646, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 14.513296127319336, + "learning_rate": 9.949940270923047e-06, + "logits/chosen": -1.201442003250122, + "logits/rejected": -0.9176227450370789, + "logps/chosen": -0.8187268972396851, + "logps/rejected": -1.3563072681427002, + "loss": 0.8591, + "odds_ratio_loss": 0.40391626954078674, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08187268674373627, + "rewards/margins": 0.05375804752111435, + "rewards/rejected": -0.1356307417154312, + "sft_loss": 0.8187268972396851, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 6.561885833740234, + "learning_rate": 9.949067369209635e-06, + "logits/chosen": -1.3256163597106934, + "logits/rejected": -1.0909916162490845, + "logps/chosen": -1.040527105331421, + "logps/rejected": -1.0341429710388184, + "loss": 1.1175, + "odds_ratio_loss": 0.7698140144348145, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10405270755290985, + "rewards/margins": -0.0006384074804373085, + "rewards/rejected": -0.10341429710388184, + "sft_loss": 1.040527105331421, + "step": 680 + }, + { + "epoch": 0.05, + "grad_norm": 4.70972204208374, + "learning_rate": 9.9481869616072e-06, + "logits/chosen": -1.390150785446167, + "logits/rejected": -0.5502735376358032, + "logps/chosen": -1.3307173252105713, + "logps/rejected": -3.408107042312622, + "loss": 1.3648, + "odds_ratio_loss": 0.34050750732421875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13307173550128937, + "rewards/margins": 0.207738995552063, + "rewards/rejected": -0.34081071615219116, + "sft_loss": 1.3307173252105713, + "step": 685 + }, + { + "epoch": 0.05, + "grad_norm": 18.70503044128418, + "learning_rate": 9.947299049450994e-06, + "logits/chosen": -1.2619187831878662, + "logits/rejected": -0.8109877705574036, + "logps/chosen": -0.7874099612236023, + "logps/rejected": -0.9367067217826843, + "loss": 0.8516, + "odds_ratio_loss": 0.6423176527023315, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07874099910259247, + "rewards/margins": 0.014929664321243763, + "rewards/rejected": -0.09367066621780396, + "sft_loss": 0.7874099612236023, + "step": 690 + }, + { + "epoch": 0.05, + "grad_norm": 7.274419784545898, + "learning_rate": 9.946403634087643e-06, + "logits/chosen": -1.3324792385101318, + "logits/rejected": -0.749333381652832, + "logps/chosen": -0.8649250864982605, + "logps/rejected": -1.0733157396316528, + "loss": 0.9191, + "odds_ratio_loss": 0.5421899557113647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08649250864982605, + "rewards/margins": 0.020839063450694084, + "rewards/rejected": -0.10733157396316528, + "sft_loss": 0.8649250864982605, + "step": 695 + }, + { + "epoch": 0.05, + "grad_norm": 9.046555519104004, + "learning_rate": 9.945500716875162e-06, + "logits/chosen": -1.4016413688659668, + "logits/rejected": -1.1516118049621582, + "logps/chosen": -1.357739806175232, + "logps/rejected": -3.03171968460083, + "loss": 1.4345, + "odds_ratio_loss": 0.7673938870429993, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13577398657798767, + "rewards/margins": 0.16739800572395325, + "rewards/rejected": -0.30317196249961853, + "sft_loss": 1.357739806175232, + "step": 700 + }, + { + "epoch": 0.05, + "grad_norm": 46.778812408447266, + "learning_rate": 9.944590299182939e-06, + "logits/chosen": -1.2562851905822754, + "logits/rejected": -1.2785260677337646, + "logps/chosen": -1.5307793617248535, + "logps/rejected": -2.245178461074829, + "loss": 1.6408, + "odds_ratio_loss": 1.1004269123077393, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15307793021202087, + "rewards/margins": 0.07143990695476532, + "rewards/rejected": -0.2245178520679474, + "sft_loss": 1.5307793617248535, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 27.53387451171875, + "learning_rate": 9.943672382391738e-06, + "logits/chosen": -1.4019700288772583, + "logits/rejected": -0.750403881072998, + "logps/chosen": -1.2429605722427368, + "logps/rejected": -1.03725266456604, + "loss": 1.3308, + "odds_ratio_loss": 0.8781029582023621, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.12429605424404144, + "rewards/margins": -0.02057078294456005, + "rewards/rejected": -0.10372526943683624, + "sft_loss": 1.2429605722427368, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 5.3482160568237305, + "learning_rate": 9.942746967893695e-06, + "logits/chosen": -1.3286592960357666, + "logits/rejected": -1.1365609169006348, + "logps/chosen": -1.1098437309265137, + "logps/rejected": -1.0078884363174438, + "loss": 1.1921, + "odds_ratio_loss": 0.8228921890258789, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11098437011241913, + "rewards/margins": -0.010195528157055378, + "rewards/rejected": -0.10078884661197662, + "sft_loss": 1.1098437309265137, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 17.593463897705078, + "learning_rate": 9.94181405709232e-06, + "logits/chosen": -1.448482871055603, + "logits/rejected": -0.9955105781555176, + "logps/chosen": -0.7879932522773743, + "logps/rejected": -1.9978437423706055, + "loss": 0.8648, + "odds_ratio_loss": 0.767823338508606, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.07879932224750519, + "rewards/margins": 0.12098504602909088, + "rewards/rejected": -0.19978436827659607, + "sft_loss": 0.7879932522773743, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 6.694170951843262, + "learning_rate": 9.94087365140249e-06, + "logits/chosen": -1.31051504611969, + "logits/rejected": -1.086120367050171, + "logps/chosen": -1.2709503173828125, + "logps/rejected": -0.9000130891799927, + "loss": 1.3821, + "odds_ratio_loss": 1.1116914749145508, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.1270950436592102, + "rewards/margins": -0.03709372878074646, + "rewards/rejected": -0.09000130742788315, + "sft_loss": 1.2709503173828125, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 39.7230110168457, + "learning_rate": 9.93992575225045e-06, + "logits/chosen": -1.38053297996521, + "logits/rejected": -1.010858178138733, + "logps/chosen": -1.0346620082855225, + "logps/rejected": -1.122179627418518, + "loss": 1.1033, + "odds_ratio_loss": 0.6862468123435974, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10346619784832001, + "rewards/margins": 0.00875175278633833, + "rewards/rejected": -0.11221794784069061, + "sft_loss": 1.0346620082855225, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 25.078582763671875, + "learning_rate": 9.93897036107381e-06, + "logits/chosen": -1.3918646574020386, + "logits/rejected": -1.1127347946166992, + "logps/chosen": -0.9163225293159485, + "logps/rejected": -3.5592429637908936, + "loss": 0.9362, + "odds_ratio_loss": 0.19927331805229187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09163224697113037, + "rewards/margins": 0.26429206132888794, + "rewards/rejected": -0.3559243083000183, + "sft_loss": 0.9163225293159485, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 18.3801212310791, + "learning_rate": 9.938007479321541e-06, + "logits/chosen": -1.1464792490005493, + "logits/rejected": -0.6045576930046082, + "logps/chosen": -0.9576179385185242, + "logps/rejected": -1.123015284538269, + "loss": 1.0146, + "odds_ratio_loss": 0.5698369145393372, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09576179087162018, + "rewards/margins": 0.016539743170142174, + "rewards/rejected": -0.1123015433549881, + "sft_loss": 0.9576179385185242, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 6.056693077087402, + "learning_rate": 9.937037108453974e-06, + "logits/chosen": -1.2240662574768066, + "logits/rejected": -0.912168025970459, + "logps/chosen": -0.9915501475334167, + "logps/rejected": -1.0677486658096313, + "loss": 1.0657, + "odds_ratio_loss": 0.741681694984436, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09915502369403839, + "rewards/margins": 0.0076198456808924675, + "rewards/rejected": -0.10677486658096313, + "sft_loss": 0.9915501475334167, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 5.033421039581299, + "learning_rate": 9.936059249942805e-06, + "logits/chosen": -1.3071390390396118, + "logits/rejected": -0.8568423986434937, + "logps/chosen": -0.8234881162643433, + "logps/rejected": -1.3216912746429443, + "loss": 0.8871, + "odds_ratio_loss": 0.6363648772239685, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08234880864620209, + "rewards/margins": 0.04982032626867294, + "rewards/rejected": -0.13216914236545563, + "sft_loss": 0.8234881162643433, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 6.3838701248168945, + "learning_rate": 9.935073905271074e-06, + "logits/chosen": -1.204453468322754, + "logits/rejected": -0.6676325798034668, + "logps/chosen": -1.0909837484359741, + "logps/rejected": -1.807064414024353, + "loss": 1.149, + "odds_ratio_loss": 0.5803178548812866, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1090983897447586, + "rewards/margins": 0.0716080591082573, + "rewards/rejected": -0.1807064414024353, + "sft_loss": 1.0909837484359741, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 11.226985931396484, + "learning_rate": 9.934081075933187e-06, + "logits/chosen": -1.3518896102905273, + "logits/rejected": -0.982338547706604, + "logps/chosen": -1.2844316959381104, + "logps/rejected": -1.6078770160675049, + "loss": 1.365, + "odds_ratio_loss": 0.8052700161933899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1284431666135788, + "rewards/margins": 0.03234453126788139, + "rewards/rejected": -0.1607877016067505, + "sft_loss": 1.2844316959381104, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 31.29439353942871, + "learning_rate": 9.93308076343489e-06, + "logits/chosen": -1.3497084379196167, + "logits/rejected": -0.9264364242553711, + "logps/chosen": -1.1290289163589478, + "logps/rejected": -1.3911527395248413, + "loss": 1.1867, + "odds_ratio_loss": 0.576228141784668, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11290287971496582, + "rewards/margins": 0.026212383061647415, + "rewards/rejected": -0.13911525905132294, + "sft_loss": 1.1290289163589478, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 10.17575740814209, + "learning_rate": 9.932072969293288e-06, + "logits/chosen": -1.3262922763824463, + "logits/rejected": -1.022761344909668, + "logps/chosen": -0.9707845449447632, + "logps/rejected": -1.6581337451934814, + "loss": 1.0134, + "odds_ratio_loss": 0.4258590340614319, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09707845747470856, + "rewards/margins": 0.06873490661382675, + "rewards/rejected": -0.1658133566379547, + "sft_loss": 0.9707845449447632, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 14.091787338256836, + "learning_rate": 9.931057695036828e-06, + "logits/chosen": -1.356696367263794, + "logits/rejected": -1.0855176448822021, + "logps/chosen": -1.1384265422821045, + "logps/rejected": -0.9075587391853333, + "loss": 1.2365, + "odds_ratio_loss": 0.9805895686149597, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11384264379739761, + "rewards/margins": -0.023086776956915855, + "rewards/rejected": -0.09075586497783661, + "sft_loss": 1.1384265422821045, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 8.655046463012695, + "learning_rate": 9.930034942205303e-06, + "logits/chosen": -1.3109716176986694, + "logits/rejected": -0.816183865070343, + "logps/chosen": -0.8931499719619751, + "logps/rejected": -1.218641996383667, + "loss": 0.9495, + "odds_ratio_loss": 0.563424825668335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08931499719619751, + "rewards/margins": 0.03254919499158859, + "rewards/rejected": -0.1218641996383667, + "sft_loss": 0.8931499719619751, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 5.672267436981201, + "learning_rate": 9.929004712349844e-06, + "logits/chosen": -1.364993929862976, + "logits/rejected": -1.0111385583877563, + "logps/chosen": -1.0240195989608765, + "logps/rejected": -1.448655605316162, + "loss": 1.0751, + "odds_ratio_loss": 0.510697066783905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1024019718170166, + "rewards/margins": 0.04246360436081886, + "rewards/rejected": -0.14486555755138397, + "sft_loss": 1.0240195989608765, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 8.213885307312012, + "learning_rate": 9.92796700703293e-06, + "logits/chosen": -1.2081053256988525, + "logits/rejected": -1.3082879781723022, + "logps/chosen": -0.8951700329780579, + "logps/rejected": -1.3031384944915771, + "loss": 0.9444, + "odds_ratio_loss": 0.49256768822669983, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08951699733734131, + "rewards/margins": 0.04079686850309372, + "rewards/rejected": -0.13031387329101562, + "sft_loss": 0.8951700329780579, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 9.180521965026855, + "learning_rate": 9.926921827828368e-06, + "logits/chosen": -1.3431509733200073, + "logits/rejected": -1.1413524150848389, + "logps/chosen": -1.030499815940857, + "logps/rejected": -0.9735819697380066, + "loss": 1.1098, + "odds_ratio_loss": 0.7934376001358032, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10304999351501465, + "rewards/margins": -0.00569178769364953, + "rewards/rejected": -0.09735819697380066, + "sft_loss": 1.030499815940857, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 21.196258544921875, + "learning_rate": 9.92586917632131e-06, + "logits/chosen": -1.408015251159668, + "logits/rejected": -1.2530816793441772, + "logps/chosen": -0.9153121709823608, + "logps/rejected": -1.3999773263931274, + "loss": 0.9899, + "odds_ratio_loss": 0.745948314666748, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09153122454881668, + "rewards/margins": 0.0484665222465992, + "rewards/rejected": -0.13999773561954498, + "sft_loss": 0.9153121709823608, + "step": 800 + }, + { + "epoch": 0.06, + "grad_norm": 16.20098114013672, + "learning_rate": 9.924809054108232e-06, + "logits/chosen": -1.193213939666748, + "logits/rejected": -1.042197585105896, + "logps/chosen": -0.8584068417549133, + "logps/rejected": -0.8795690536499023, + "loss": 0.9491, + "odds_ratio_loss": 0.9071296453475952, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08584068715572357, + "rewards/margins": 0.0021162242628633976, + "rewards/rejected": -0.08795692026615143, + "sft_loss": 0.8584068417549133, + "step": 805 + }, + { + "epoch": 0.06, + "grad_norm": 26.68222427368164, + "learning_rate": 9.923741462796947e-06, + "logits/chosen": -1.0805448293685913, + "logits/rejected": -0.7028868794441223, + "logps/chosen": -1.1113938093185425, + "logps/rejected": -1.3654364347457886, + "loss": 1.179, + "odds_ratio_loss": 0.676261305809021, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11113937944173813, + "rewards/margins": 0.02540426328778267, + "rewards/rejected": -0.1365436464548111, + "sft_loss": 1.1113938093185425, + "step": 810 + }, + { + "epoch": 0.06, + "grad_norm": 6.551802635192871, + "learning_rate": 9.922666404006592e-06, + "logits/chosen": -1.418948769569397, + "logits/rejected": -0.7345365881919861, + "logps/chosen": -0.9354351162910461, + "logps/rejected": -1.849973440170288, + "loss": 0.9887, + "odds_ratio_loss": 0.5327891111373901, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09354351460933685, + "rewards/margins": 0.09145383536815643, + "rewards/rejected": -0.1849973499774933, + "sft_loss": 0.9354351162910461, + "step": 815 + }, + { + "epoch": 0.06, + "grad_norm": 6.564281940460205, + "learning_rate": 9.921583879367627e-06, + "logits/chosen": -1.2443413734436035, + "logits/rejected": -1.0016214847564697, + "logps/chosen": -1.3639390468597412, + "logps/rejected": -1.252862572669983, + "loss": 1.4527, + "odds_ratio_loss": 0.887839674949646, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13639390468597412, + "rewards/margins": -0.011107642203569412, + "rewards/rejected": -0.1252862513065338, + "sft_loss": 1.3639390468597412, + "step": 820 + }, + { + "epoch": 0.06, + "grad_norm": 10.34186840057373, + "learning_rate": 9.920493890521842e-06, + "logits/chosen": -1.3167495727539062, + "logits/rejected": -0.8088476061820984, + "logps/chosen": -1.0269895792007446, + "logps/rejected": -1.8802833557128906, + "loss": 1.0725, + "odds_ratio_loss": 0.45481786131858826, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10269895941019058, + "rewards/margins": 0.08532937616109848, + "rewards/rejected": -0.18802833557128906, + "sft_loss": 1.0269895792007446, + "step": 825 + }, + { + "epoch": 0.06, + "grad_norm": 7.909381866455078, + "learning_rate": 9.91939643912234e-06, + "logits/chosen": -1.3650422096252441, + "logits/rejected": -0.854724109172821, + "logps/chosen": -0.98639976978302, + "logps/rejected": -1.1779707670211792, + "loss": 1.0439, + "odds_ratio_loss": 0.5750349164009094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09863997995853424, + "rewards/margins": 0.019157104194164276, + "rewards/rejected": -0.11779709160327911, + "sft_loss": 0.98639976978302, + "step": 830 + }, + { + "epoch": 0.06, + "grad_norm": 63.97218704223633, + "learning_rate": 9.918291526833548e-06, + "logits/chosen": -1.4302396774291992, + "logits/rejected": -0.8879743814468384, + "logps/chosen": -1.32662034034729, + "logps/rejected": -2.953354597091675, + "loss": 1.3797, + "odds_ratio_loss": 0.5310118198394775, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13266204297542572, + "rewards/margins": 0.16267342865467072, + "rewards/rejected": -0.29533547163009644, + "sft_loss": 1.32662034034729, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 7.25938081741333, + "learning_rate": 9.917179155331206e-06, + "logits/chosen": -1.1007791757583618, + "logits/rejected": -0.7932504415512085, + "logps/chosen": -1.3327864408493042, + "logps/rejected": -1.0754101276397705, + "loss": 1.4221, + "odds_ratio_loss": 0.8930914998054504, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.13327865302562714, + "rewards/margins": -0.025737643241882324, + "rewards/rejected": -0.10754100978374481, + "sft_loss": 1.3327864408493042, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 11.274470329284668, + "learning_rate": 9.916059326302364e-06, + "logits/chosen": -1.3314870595932007, + "logits/rejected": -0.8154805302619934, + "logps/chosen": -1.0468391180038452, + "logps/rejected": -1.227414608001709, + "loss": 1.1067, + "odds_ratio_loss": 0.5981887578964233, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10468391329050064, + "rewards/margins": 0.01805754378437996, + "rewards/rejected": -0.1227414458990097, + "sft_loss": 1.0468391180038452, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 12.382113456726074, + "learning_rate": 9.914932041445386e-06, + "logits/chosen": -1.3826451301574707, + "logits/rejected": -1.1742017269134521, + "logps/chosen": -1.1019704341888428, + "logps/rejected": -3.996877670288086, + "loss": 1.1651, + "odds_ratio_loss": 0.6314796209335327, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1101970449090004, + "rewards/margins": 0.2894907593727112, + "rewards/rejected": -0.399687796831131, + "sft_loss": 1.1019704341888428, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 9.66358757019043, + "learning_rate": 9.913797302469944e-06, + "logits/chosen": -1.204119324684143, + "logits/rejected": -1.0197612047195435, + "logps/chosen": -0.855130672454834, + "logps/rejected": -0.7614967823028564, + "loss": 0.9439, + "odds_ratio_loss": 0.888095498085022, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08551307767629623, + "rewards/margins": -0.009363390505313873, + "rewards/rejected": -0.07614968717098236, + "sft_loss": 0.855130672454834, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 5.658473968505859, + "learning_rate": 9.912655111097014e-06, + "logits/chosen": -1.2263609170913696, + "logits/rejected": -0.691985011100769, + "logps/chosen": -1.2378971576690674, + "logps/rejected": -1.0723927021026611, + "loss": 1.3234, + "odds_ratio_loss": 0.8548486828804016, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1237897127866745, + "rewards/margins": -0.016550447791814804, + "rewards/rejected": -0.1072392612695694, + "sft_loss": 1.2378971576690674, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 5.982890605926514, + "learning_rate": 9.911505469058872e-06, + "logits/chosen": -1.2504374980926514, + "logits/rejected": -1.004959225654602, + "logps/chosen": -1.328712821006775, + "logps/rejected": -1.461451530456543, + "loss": 1.4051, + "odds_ratio_loss": 0.7639222145080566, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.13287129998207092, + "rewards/margins": 0.013273855671286583, + "rewards/rejected": -0.14614513516426086, + "sft_loss": 1.328712821006775, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 5.529323101043701, + "learning_rate": 9.910348378099098e-06, + "logits/chosen": -1.312565565109253, + "logits/rejected": -0.8698539733886719, + "logps/chosen": -1.003717064857483, + "logps/rejected": -1.205470085144043, + "loss": 1.0766, + "odds_ratio_loss": 0.7283679246902466, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10037169605493546, + "rewards/margins": 0.020175311714410782, + "rewards/rejected": -0.12054701149463654, + "sft_loss": 1.003717064857483, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 94.74681854248047, + "learning_rate": 9.909183839972565e-06, + "logits/chosen": -1.375828742980957, + "logits/rejected": -1.1545583009719849, + "logps/chosen": -1.1318230628967285, + "logps/rejected": -3.6160807609558105, + "loss": 1.1897, + "odds_ratio_loss": 0.5786797404289246, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11318230628967285, + "rewards/margins": 0.24842575192451477, + "rewards/rejected": -0.36160808801651, + "sft_loss": 1.1318230628967285, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 21.118417739868164, + "learning_rate": 9.908011856445444e-06, + "logits/chosen": -1.4701459407806396, + "logits/rejected": -1.0760300159454346, + "logps/chosen": -1.4192107915878296, + "logps/rejected": -1.8034175634384155, + "loss": 1.4791, + "odds_ratio_loss": 0.598541796207428, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14192108809947968, + "rewards/margins": 0.03842068463563919, + "rewards/rejected": -0.18034176528453827, + "sft_loss": 1.4192107915878296, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 10.34803295135498, + "learning_rate": 9.906832429295199e-06, + "logits/chosen": -1.2245054244995117, + "logits/rejected": -0.9822576642036438, + "logps/chosen": -0.933951199054718, + "logps/rejected": -1.1507595777511597, + "loss": 0.9959, + "odds_ratio_loss": 0.619904100894928, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09339512884616852, + "rewards/margins": 0.021680831909179688, + "rewards/rejected": -0.1150759607553482, + "sft_loss": 0.933951199054718, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 17.897947311401367, + "learning_rate": 9.905645560310577e-06, + "logits/chosen": -1.383840799331665, + "logits/rejected": -1.3219610452651978, + "logps/chosen": -0.9768409729003906, + "logps/rejected": -1.3957569599151611, + "loss": 1.0512, + "odds_ratio_loss": 0.7432494759559631, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0976840928196907, + "rewards/margins": 0.04189159721136093, + "rewards/rejected": -0.13957569003105164, + "sft_loss": 0.9768409729003906, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 9.26870059967041, + "learning_rate": 9.90445125129162e-06, + "logits/chosen": -1.4521198272705078, + "logits/rejected": -1.1943159103393555, + "logps/chosen": -1.0366110801696777, + "logps/rejected": -1.3479697704315186, + "loss": 1.0942, + "odds_ratio_loss": 0.5757991075515747, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10366110503673553, + "rewards/margins": 0.03113587573170662, + "rewards/rejected": -0.13479699194431305, + "sft_loss": 1.0366110801696777, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 6.55729341506958, + "learning_rate": 9.903249504049645e-06, + "logits/chosen": -1.2487952709197998, + "logits/rejected": -0.6370812654495239, + "logps/chosen": -0.9395875930786133, + "logps/rejected": -0.9449079632759094, + "loss": 1.016, + "odds_ratio_loss": 0.7639524340629578, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09395875781774521, + "rewards/margins": 0.0005320362979546189, + "rewards/rejected": -0.09449080377817154, + "sft_loss": 0.9395875930786133, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 45.118629455566406, + "learning_rate": 9.902040320407258e-06, + "logits/chosen": -1.2511450052261353, + "logits/rejected": -0.8519547581672668, + "logps/chosen": -1.1160920858383179, + "logps/rejected": -1.2400343418121338, + "loss": 1.2332, + "odds_ratio_loss": 1.171097755432129, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11160922050476074, + "rewards/margins": 0.012394214980304241, + "rewards/rejected": -0.12400342524051666, + "sft_loss": 1.1160920858383179, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 5.698209285736084, + "learning_rate": 9.900823702198338e-06, + "logits/chosen": -1.47048020362854, + "logits/rejected": -0.7404271364212036, + "logps/chosen": -1.0461819171905518, + "logps/rejected": -1.5807592868804932, + "loss": 1.0939, + "odds_ratio_loss": 0.477081835269928, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10461819171905518, + "rewards/margins": 0.053457729518413544, + "rewards/rejected": -0.15807592868804932, + "sft_loss": 1.0461819171905518, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 9.900472640991211, + "learning_rate": 9.899599651268039e-06, + "logits/chosen": -1.3266518115997314, + "logits/rejected": -1.1770989894866943, + "logps/chosen": -1.1734721660614014, + "logps/rejected": -2.0207362174987793, + "loss": 1.2137, + "odds_ratio_loss": 0.40272313356399536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11734724044799805, + "rewards/margins": 0.08472639322280884, + "rewards/rejected": -0.2020736187696457, + "sft_loss": 1.1734721660614014, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 14.617350578308105, + "learning_rate": 9.898368169472794e-06, + "logits/chosen": -1.3530217409133911, + "logits/rejected": -0.8183524012565613, + "logps/chosen": -0.8375462293624878, + "logps/rejected": -2.5914151668548584, + "loss": 0.8782, + "odds_ratio_loss": 0.4067561626434326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08375462144613266, + "rewards/margins": 0.17538690567016602, + "rewards/rejected": -0.2591415345668793, + "sft_loss": 0.8375462293624878, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 38.70340347290039, + "learning_rate": 9.897129258680298e-06, + "logits/chosen": -1.382263422012329, + "logits/rejected": -1.0352758169174194, + "logps/chosen": -0.6595112681388855, + "logps/rejected": -2.8338589668273926, + "loss": 0.6854, + "odds_ratio_loss": 0.2593601644039154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06595112383365631, + "rewards/margins": 0.21743479371070862, + "rewards/rejected": -0.28338590264320374, + "sft_loss": 0.6595112681388855, + "step": 925 + }, + { + "epoch": 0.07, + "grad_norm": 30.268157958984375, + "learning_rate": 9.895882920769515e-06, + "logits/chosen": -1.3694220781326294, + "logits/rejected": -1.1358081102371216, + "logps/chosen": -1.0219380855560303, + "logps/rejected": -0.8565353155136108, + "loss": 1.1132, + "odds_ratio_loss": 0.9122053980827332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10219381004571915, + "rewards/margins": -0.016540277749300003, + "rewards/rejected": -0.08565352857112885, + "sft_loss": 1.0219380855560303, + "step": 930 + }, + { + "epoch": 0.07, + "grad_norm": 8.992692947387695, + "learning_rate": 9.89462915763068e-06, + "logits/chosen": -1.2513294219970703, + "logits/rejected": -0.9854547381401062, + "logps/chosen": -0.9552785754203796, + "logps/rejected": -1.058593511581421, + "loss": 1.0204, + "odds_ratio_loss": 0.6511629223823547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09552786499261856, + "rewards/margins": 0.010331504046916962, + "rewards/rejected": -0.10585936158895493, + "sft_loss": 0.9552785754203796, + "step": 935 + }, + { + "epoch": 0.07, + "grad_norm": 12.573531150817871, + "learning_rate": 9.893367971165279e-06, + "logits/chosen": -1.388832688331604, + "logits/rejected": -0.8999320864677429, + "logps/chosen": -1.093196988105774, + "logps/rejected": -1.4769212007522583, + "loss": 1.1543, + "odds_ratio_loss": 0.6115171313285828, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10931968688964844, + "rewards/margins": 0.03837241604924202, + "rewards/rejected": -0.14769211411476135, + "sft_loss": 1.093196988105774, + "step": 940 + }, + { + "epoch": 0.07, + "grad_norm": 23.18633460998535, + "learning_rate": 9.892099363286065e-06, + "logits/chosen": -1.479446530342102, + "logits/rejected": -1.034317135810852, + "logps/chosen": -1.3559527397155762, + "logps/rejected": -1.4490649700164795, + "loss": 1.453, + "odds_ratio_loss": 0.9702537655830383, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13559527695178986, + "rewards/margins": 0.009311218746006489, + "rewards/rejected": -0.14490649104118347, + "sft_loss": 1.3559527397155762, + "step": 945 + }, + { + "epoch": 0.07, + "grad_norm": 5.014403343200684, + "learning_rate": 9.890823335917041e-06, + "logits/chosen": -1.2923214435577393, + "logits/rejected": -0.7891890406608582, + "logps/chosen": -1.1381855010986328, + "logps/rejected": -1.3333497047424316, + "loss": 1.2146, + "odds_ratio_loss": 0.7636581659317017, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11381856352090836, + "rewards/margins": 0.019516412168741226, + "rewards/rejected": -0.1333349645137787, + "sft_loss": 1.1381855010986328, + "step": 950 + }, + { + "epoch": 0.07, + "grad_norm": 5.407112121582031, + "learning_rate": 9.889539890993467e-06, + "logits/chosen": -1.3674460649490356, + "logits/rejected": -0.7717936635017395, + "logps/chosen": -1.107033371925354, + "logps/rejected": -2.2579007148742676, + "loss": 1.1523, + "odds_ratio_loss": 0.45254993438720703, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11070333421230316, + "rewards/margins": 0.11508671194314957, + "rewards/rejected": -0.22579005360603333, + "sft_loss": 1.107033371925354, + "step": 955 + }, + { + "epoch": 0.07, + "grad_norm": 22.41192054748535, + "learning_rate": 9.888249030461845e-06, + "logits/chosen": -1.3265749216079712, + "logits/rejected": -0.8742873072624207, + "logps/chosen": -0.7914237976074219, + "logps/rejected": -2.211487293243408, + "loss": 0.8814, + "odds_ratio_loss": 0.9001585245132446, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07914238423109055, + "rewards/margins": 0.14200636744499207, + "rewards/rejected": -0.22114872932434082, + "sft_loss": 0.7914237976074219, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 51.93904495239258, + "learning_rate": 9.886950756279933e-06, + "logits/chosen": -1.2333533763885498, + "logits/rejected": -0.6385300159454346, + "logps/chosen": -1.252501368522644, + "logps/rejected": -1.7653671503067017, + "loss": 1.3013, + "odds_ratio_loss": 0.48805102705955505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12525013089179993, + "rewards/margins": 0.051286570727825165, + "rewards/rejected": -0.1765367090702057, + "sft_loss": 1.252501368522644, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 16.045190811157227, + "learning_rate": 9.885645070416728e-06, + "logits/chosen": -1.3813869953155518, + "logits/rejected": -1.1527819633483887, + "logps/chosen": -0.8400813937187195, + "logps/rejected": -1.084443211555481, + "loss": 0.8877, + "odds_ratio_loss": 0.47596150636672974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08400814235210419, + "rewards/margins": 0.024436186999082565, + "rewards/rejected": -0.10844433307647705, + "sft_loss": 0.8400813937187195, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 14.392130851745605, + "learning_rate": 9.884331974852468e-06, + "logits/chosen": -1.3196592330932617, + "logits/rejected": -1.043312430381775, + "logps/chosen": -1.1383229494094849, + "logps/rejected": -0.9930847883224487, + "loss": 1.2273, + "odds_ratio_loss": 0.8899247050285339, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11383228003978729, + "rewards/margins": -0.014523811638355255, + "rewards/rejected": -0.09930847585201263, + "sft_loss": 1.1383229494094849, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 15.830513954162598, + "learning_rate": 9.88301147157863e-06, + "logits/chosen": -1.24079167842865, + "logits/rejected": -1.3131943941116333, + "logps/chosen": -0.9807993769645691, + "logps/rejected": -1.419396996498108, + "loss": 1.0474, + "odds_ratio_loss": 0.6656183004379272, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09807994216680527, + "rewards/margins": 0.04385975003242493, + "rewards/rejected": -0.1419396847486496, + "sft_loss": 0.9807993769645691, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 11.744377136230469, + "learning_rate": 9.881683562597924e-06, + "logits/chosen": -1.2285785675048828, + "logits/rejected": -1.3326553106307983, + "logps/chosen": -0.7297677397727966, + "logps/rejected": -1.0131244659423828, + "loss": 0.7885, + "odds_ratio_loss": 0.5870744585990906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07297678291797638, + "rewards/margins": 0.02833567187190056, + "rewards/rejected": -0.10131244361400604, + "sft_loss": 0.7297677397727966, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 4.617860794067383, + "learning_rate": 9.88034824992429e-06, + "logits/chosen": -1.4039509296417236, + "logits/rejected": -0.8233305215835571, + "logps/chosen": -1.162663221359253, + "logps/rejected": -2.416837692260742, + "loss": 1.1918, + "odds_ratio_loss": 0.29158255457878113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11626632511615753, + "rewards/margins": 0.12541747093200684, + "rewards/rejected": -0.24168379604816437, + "sft_loss": 1.162663221359253, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 9.872160911560059, + "learning_rate": 9.879005535582904e-06, + "logits/chosen": -1.3318212032318115, + "logits/rejected": -1.442276954650879, + "logps/chosen": -1.0921529531478882, + "logps/rejected": -1.2366979122161865, + "loss": 1.1696, + "odds_ratio_loss": 0.7744948267936707, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10921530425548553, + "rewards/margins": 0.014454501681029797, + "rewards/rejected": -0.1236698180437088, + "sft_loss": 1.0921529531478882, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 6.297494411468506, + "learning_rate": 9.87765542161016e-06, + "logits/chosen": -1.530379056930542, + "logits/rejected": -0.9569048881530762, + "logps/chosen": -0.9592668414115906, + "logps/rejected": -1.179861307144165, + "loss": 1.0172, + "odds_ratio_loss": 0.579362690448761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0959266871213913, + "rewards/margins": 0.02205944061279297, + "rewards/rejected": -0.11798612773418427, + "sft_loss": 0.9592668414115906, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 8.824081420898438, + "learning_rate": 9.876297910053678e-06, + "logits/chosen": -1.4260704517364502, + "logits/rejected": -0.9216135740280151, + "logps/chosen": -0.927899956703186, + "logps/rejected": -1.1985846757888794, + "loss": 0.98, + "odds_ratio_loss": 0.5209888219833374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09278999269008636, + "rewards/margins": 0.027068469673395157, + "rewards/rejected": -0.11985846608877182, + "sft_loss": 0.927899956703186, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 34.00803756713867, + "learning_rate": 9.874933002972297e-06, + "logits/chosen": -1.4401054382324219, + "logits/rejected": -0.8292428851127625, + "logps/chosen": -0.8494836688041687, + "logps/rejected": -1.0477453470230103, + "loss": 0.9141, + "odds_ratio_loss": 0.6464778184890747, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08494836837053299, + "rewards/margins": 0.019826162606477737, + "rewards/rejected": -0.10477451980113983, + "sft_loss": 0.8494836688041687, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 5.185765743255615, + "learning_rate": 9.873560702436072e-06, + "logits/chosen": -1.36220383644104, + "logits/rejected": -0.8890962600708008, + "logps/chosen": -1.1309592723846436, + "logps/rejected": -2.1118993759155273, + "loss": 1.1752, + "odds_ratio_loss": 0.44284600019454956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11309593915939331, + "rewards/margins": 0.09809400886297226, + "rewards/rejected": -0.21118994057178497, + "sft_loss": 1.1309592723846436, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 31.76227378845215, + "learning_rate": 9.87218101052627e-06, + "logits/chosen": -1.484521508216858, + "logits/rejected": -1.0151021480560303, + "logps/chosen": -1.1594613790512085, + "logps/rejected": -1.8886810541152954, + "loss": 1.2184, + "odds_ratio_loss": 0.5893402099609375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11594613641500473, + "rewards/margins": 0.07292197644710541, + "rewards/rejected": -0.18886812031269073, + "sft_loss": 1.1594613790512085, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 22.267271041870117, + "learning_rate": 9.870793929335367e-06, + "logits/chosen": -1.544433832168579, + "logits/rejected": -1.0945574045181274, + "logps/chosen": -0.7931126356124878, + "logps/rejected": -1.0429248809814453, + "loss": 0.8478, + "odds_ratio_loss": 0.5466041564941406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07931126654148102, + "rewards/margins": 0.024981223046779633, + "rewards/rejected": -0.10429248958826065, + "sft_loss": 0.7931126356124878, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 6.590450286865234, + "learning_rate": 9.869399460967052e-06, + "logits/chosen": -1.3002541065216064, + "logits/rejected": -1.009333610534668, + "logps/chosen": -1.110409140586853, + "logps/rejected": -1.0610884428024292, + "loss": 1.1878, + "odds_ratio_loss": 0.7737922668457031, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11104090511798859, + "rewards/margins": -0.004932059440761805, + "rewards/rejected": -0.10610884428024292, + "sft_loss": 1.110409140586853, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 8.192142486572266, + "learning_rate": 9.867997607536212e-06, + "logits/chosen": -1.3670454025268555, + "logits/rejected": -0.8192941546440125, + "logps/chosen": -1.0318758487701416, + "logps/rejected": -0.930589497089386, + "loss": 1.1152, + "odds_ratio_loss": 0.8337259292602539, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10318758338689804, + "rewards/margins": -0.010128635913133621, + "rewards/rejected": -0.09305894374847412, + "sft_loss": 1.0318758487701416, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 5.612542152404785, + "learning_rate": 9.866588371168935e-06, + "logits/chosen": -1.2067945003509521, + "logits/rejected": -0.731787383556366, + "logps/chosen": -0.8820972442626953, + "logps/rejected": -0.7037122845649719, + "loss": 0.9745, + "odds_ratio_loss": 0.9238243103027344, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08820972591638565, + "rewards/margins": -0.017838502302765846, + "rewards/rejected": -0.07037122547626495, + "sft_loss": 0.8820972442626953, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 6.599497318267822, + "learning_rate": 9.865171754002505e-06, + "logits/chosen": -1.2855665683746338, + "logits/rejected": -0.6700264811515808, + "logps/chosen": -0.7599323987960815, + "logps/rejected": -1.3745027780532837, + "loss": 0.7921, + "odds_ratio_loss": 0.3221582770347595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07599325478076935, + "rewards/margins": 0.06145703047513962, + "rewards/rejected": -0.13745027780532837, + "sft_loss": 0.7599323987960815, + "step": 1045 + }, + { + "epoch": 0.08, + "grad_norm": 141.1793212890625, + "learning_rate": 9.863747758185405e-06, + "logits/chosen": -1.0344316959381104, + "logits/rejected": -0.966625988483429, + "logps/chosen": -1.0074121952056885, + "logps/rejected": -1.2289427518844604, + "loss": 1.0665, + "odds_ratio_loss": 0.5912154912948608, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10074122250080109, + "rewards/margins": 0.022153064608573914, + "rewards/rejected": -0.1228942722082138, + "sft_loss": 1.0074121952056885, + "step": 1050 + }, + { + "epoch": 0.08, + "grad_norm": 5.1217217445373535, + "learning_rate": 9.862316385877305e-06, + "logits/chosen": -1.3833470344543457, + "logits/rejected": -0.8735666275024414, + "logps/chosen": -0.799805760383606, + "logps/rejected": -3.1053359508514404, + "loss": 0.8412, + "odds_ratio_loss": 0.41374388337135315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07998057454824448, + "rewards/margins": 0.2305530607700348, + "rewards/rejected": -0.31053364276885986, + "sft_loss": 0.799805760383606, + "step": 1055 + }, + { + "epoch": 0.08, + "grad_norm": 9.84019660949707, + "learning_rate": 9.860877639249063e-06, + "logits/chosen": -1.3688457012176514, + "logits/rejected": -0.9568573832511902, + "logps/chosen": -1.2854890823364258, + "logps/rejected": -1.8219703435897827, + "loss": 1.3305, + "odds_ratio_loss": 0.4498967230319977, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12854890525341034, + "rewards/margins": 0.05364813655614853, + "rewards/rejected": -0.18219704926013947, + "sft_loss": 1.2854890823364258, + "step": 1060 + }, + { + "epoch": 0.08, + "grad_norm": 29.234161376953125, + "learning_rate": 9.859431520482716e-06, + "logits/chosen": -1.2526874542236328, + "logits/rejected": -0.9805940389633179, + "logps/chosen": -0.9354702234268188, + "logps/rejected": -1.7557783126831055, + "loss": 0.975, + "odds_ratio_loss": 0.3954404294490814, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09354700893163681, + "rewards/margins": 0.08203083276748657, + "rewards/rejected": -0.17557783424854279, + "sft_loss": 0.9354702234268188, + "step": 1065 + }, + { + "epoch": 0.08, + "grad_norm": 5.677125453948975, + "learning_rate": 9.857978031771494e-06, + "logits/chosen": -1.3821674585342407, + "logits/rejected": -0.8005634546279907, + "logps/chosen": -0.8023947477340698, + "logps/rejected": -0.9100324511528015, + "loss": 0.8635, + "odds_ratio_loss": 0.6106234192848206, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08023947477340698, + "rewards/margins": 0.010763769038021564, + "rewards/rejected": -0.09100324660539627, + "sft_loss": 0.8023947477340698, + "step": 1070 + }, + { + "epoch": 0.08, + "grad_norm": 7.841827869415283, + "learning_rate": 9.856517175319794e-06, + "logits/chosen": -1.4441566467285156, + "logits/rejected": -1.024996042251587, + "logps/chosen": -0.8957603573799133, + "logps/rejected": -1.3328006267547607, + "loss": 0.9526, + "odds_ratio_loss": 0.5685744881629944, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08957605063915253, + "rewards/margins": 0.04370402172207832, + "rewards/rejected": -0.13328006863594055, + "sft_loss": 0.8957603573799133, + "step": 1075 + }, + { + "epoch": 0.08, + "grad_norm": 134.03005981445312, + "learning_rate": 9.85504895334319e-06, + "logits/chosen": -1.0448005199432373, + "logits/rejected": -0.9956331253051758, + "logps/chosen": -1.2046611309051514, + "logps/rejected": -1.4932407140731812, + "loss": 1.2943, + "odds_ratio_loss": 0.895898163318634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12046612799167633, + "rewards/margins": 0.02885795198380947, + "rewards/rejected": -0.14932407438755035, + "sft_loss": 1.2046611309051514, + "step": 1080 + }, + { + "epoch": 0.08, + "grad_norm": 10.544418334960938, + "learning_rate": 9.853573368068426e-06, + "logits/chosen": -1.3604201078414917, + "logits/rejected": -0.9904428720474243, + "logps/chosen": -1.3646055459976196, + "logps/rejected": -1.1506952047348022, + "loss": 1.457, + "odds_ratio_loss": 0.9236465692520142, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1364605724811554, + "rewards/margins": -0.021391037851572037, + "rewards/rejected": -0.11506952345371246, + "sft_loss": 1.3646055459976196, + "step": 1085 + }, + { + "epoch": 0.08, + "grad_norm": 8.134223937988281, + "learning_rate": 9.852090421733416e-06, + "logits/chosen": -1.2988691329956055, + "logits/rejected": -0.8891332745552063, + "logps/chosen": -1.0880292654037476, + "logps/rejected": -0.9603763818740845, + "loss": 1.1707, + "odds_ratio_loss": 0.826417088508606, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.108802929520607, + "rewards/margins": -0.012765288352966309, + "rewards/rejected": -0.09603764116764069, + "sft_loss": 1.0880292654037476, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 14.613194465637207, + "learning_rate": 9.850600116587236e-06, + "logits/chosen": -1.2537554502487183, + "logits/rejected": -1.1704847812652588, + "logps/chosen": -0.8787897229194641, + "logps/rejected": -2.0588157176971436, + "loss": 0.9807, + "odds_ratio_loss": 1.0188149213790894, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08787896484136581, + "rewards/margins": 0.11800263077020645, + "rewards/rejected": -0.20588159561157227, + "sft_loss": 0.8787897229194641, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 10.195762634277344, + "learning_rate": 9.849102454890122e-06, + "logits/chosen": -1.3870155811309814, + "logits/rejected": -0.9877394437789917, + "logps/chosen": -1.125795602798462, + "logps/rejected": -1.0407532453536987, + "loss": 1.2027, + "odds_ratio_loss": 0.7686118483543396, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11257956176996231, + "rewards/margins": -0.008504234254360199, + "rewards/rejected": -0.10407533496618271, + "sft_loss": 1.125795602798462, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 20.717632293701172, + "learning_rate": 9.847597438913471e-06, + "logits/chosen": -1.339658498764038, + "logits/rejected": -1.0415122509002686, + "logps/chosen": -1.1699798107147217, + "logps/rejected": -1.8965017795562744, + "loss": 1.2401, + "odds_ratio_loss": 0.7011226415634155, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11699797958135605, + "rewards/margins": 0.0726521909236908, + "rewards/rejected": -0.18965016305446625, + "sft_loss": 1.1699798107147217, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 7.274383068084717, + "learning_rate": 9.846085070939829e-06, + "logits/chosen": -1.492741346359253, + "logits/rejected": -1.3278045654296875, + "logps/chosen": -1.2755863666534424, + "logps/rejected": -1.2807852029800415, + "loss": 1.3526, + "odds_ratio_loss": 0.7704776525497437, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1275586187839508, + "rewards/margins": 0.0005198910948820412, + "rewards/rejected": -0.12807850539684296, + "sft_loss": 1.2755863666534424, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 10.192612648010254, + "learning_rate": 9.844565353262892e-06, + "logits/chosen": -1.1960515975952148, + "logits/rejected": -0.8686011433601379, + "logps/chosen": -0.7996153235435486, + "logps/rejected": -1.4538285732269287, + "loss": 0.8463, + "odds_ratio_loss": 0.4663833677768707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07996153086423874, + "rewards/margins": 0.06542132049798965, + "rewards/rejected": -0.1453828513622284, + "sft_loss": 0.7996153235435486, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 9.693170547485352, + "learning_rate": 9.843038288187508e-06, + "logits/chosen": -1.298099398612976, + "logits/rejected": -1.1106765270233154, + "logps/chosen": -0.8047016263008118, + "logps/rejected": -1.351907730102539, + "loss": 0.8499, + "odds_ratio_loss": 0.4519086480140686, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08047015964984894, + "rewards/margins": 0.05472060292959213, + "rewards/rejected": -0.13519077003002167, + "sft_loss": 0.8047016263008118, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 21.928857803344727, + "learning_rate": 9.841503878029663e-06, + "logits/chosen": -1.2702182531356812, + "logits/rejected": -0.8940436244010925, + "logps/chosen": -1.007550835609436, + "logps/rejected": -2.1225733757019043, + "loss": 1.0861, + "odds_ratio_loss": 0.7852200269699097, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10075507313013077, + "rewards/margins": 0.11150224506855011, + "rewards/rejected": -0.21225731074810028, + "sft_loss": 1.007550835609436, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 7.138321876525879, + "learning_rate": 9.839962125116489e-06, + "logits/chosen": -1.3176006078720093, + "logits/rejected": -0.6804088354110718, + "logps/chosen": -1.1514666080474854, + "logps/rejected": -1.2209088802337646, + "loss": 1.2284, + "odds_ratio_loss": 0.7691280245780945, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11514665931463242, + "rewards/margins": 0.006944227032363415, + "rewards/rejected": -0.1220908910036087, + "sft_loss": 1.1514666080474854, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 6.852372169494629, + "learning_rate": 9.838413031786242e-06, + "logits/chosen": -1.281246304512024, + "logits/rejected": -0.9366620182991028, + "logps/chosen": -0.9345917701721191, + "logps/rejected": -0.9236852526664734, + "loss": 1.0247, + "odds_ratio_loss": 0.9015239477157593, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09345918148756027, + "rewards/margins": -0.001090650213882327, + "rewards/rejected": -0.09236852824687958, + "sft_loss": 0.9345917701721191, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 5.963149547576904, + "learning_rate": 9.836856600388327e-06, + "logits/chosen": -1.3900423049926758, + "logits/rejected": -0.7759448289871216, + "logps/chosen": -1.133793592453003, + "logps/rejected": -1.5137890577316284, + "loss": 1.1933, + "odds_ratio_loss": 0.5953725576400757, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11337937414646149, + "rewards/margins": 0.03799953684210777, + "rewards/rejected": -0.15137891471385956, + "sft_loss": 1.133793592453003, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 22.794923782348633, + "learning_rate": 9.835292833283265e-06, + "logits/chosen": -1.3036854267120361, + "logits/rejected": -1.2684440612792969, + "logps/chosen": -0.8737970590591431, + "logps/rejected": -2.6831753253936768, + "loss": 0.9183, + "odds_ratio_loss": 0.4450407922267914, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08737970888614655, + "rewards/margins": 0.18093781173229218, + "rewards/rejected": -0.2683175206184387, + "sft_loss": 0.8737970590591431, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 29.34295082092285, + "learning_rate": 9.833721732842709e-06, + "logits/chosen": -1.2447541952133179, + "logits/rejected": -1.1019372940063477, + "logps/chosen": -0.7875004410743713, + "logps/rejected": -0.9674968719482422, + "loss": 0.8673, + "odds_ratio_loss": 0.7978585958480835, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07875005155801773, + "rewards/margins": 0.017999637871980667, + "rewards/rejected": -0.0967496782541275, + "sft_loss": 0.7875004410743713, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 45.776451110839844, + "learning_rate": 9.83214330144943e-06, + "logits/chosen": -1.4828782081604004, + "logits/rejected": -1.3936234712600708, + "logps/chosen": -1.1248772144317627, + "logps/rejected": -1.4694101810455322, + "loss": 1.1788, + "odds_ratio_loss": 0.5393758416175842, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11248771846294403, + "rewards/margins": 0.03445331007242203, + "rewards/rejected": -0.14694103598594666, + "sft_loss": 1.1248772144317627, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 13.452958106994629, + "learning_rate": 9.830557541497324e-06, + "logits/chosen": -1.2950688600540161, + "logits/rejected": -1.086715817451477, + "logps/chosen": -0.9902938008308411, + "logps/rejected": -1.3207643032073975, + "loss": 1.0503, + "odds_ratio_loss": 0.6004306674003601, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09902938455343246, + "rewards/margins": 0.03304705396294594, + "rewards/rejected": -0.1320764124393463, + "sft_loss": 0.9902938008308411, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 5.851574420928955, + "learning_rate": 9.828964455391394e-06, + "logits/chosen": -1.3571841716766357, + "logits/rejected": -1.0242164134979248, + "logps/chosen": -1.282517671585083, + "logps/rejected": -1.1523053646087646, + "loss": 1.3857, + "odds_ratio_loss": 1.0314857959747314, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.12825177609920502, + "rewards/margins": -0.013021242804825306, + "rewards/rejected": -0.11523053795099258, + "sft_loss": 1.282517671585083, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 9.247029304504395, + "learning_rate": 9.827364045547758e-06, + "logits/chosen": -1.2307461500167847, + "logits/rejected": -0.9085014462471008, + "logps/chosen": -1.0744895935058594, + "logps/rejected": -0.807500958442688, + "loss": 1.1704, + "odds_ratio_loss": 0.9586833119392395, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10744895786046982, + "rewards/margins": -0.026698868721723557, + "rewards/rejected": -0.08075009286403656, + "sft_loss": 1.0744895935058594, + "step": 1170 + }, + { + "epoch": 0.09, + "grad_norm": 7.52495813369751, + "learning_rate": 9.825756314393642e-06, + "logits/chosen": -1.3226604461669922, + "logits/rejected": -0.9620486497879028, + "logps/chosen": -0.7998035550117493, + "logps/rejected": -1.128028392791748, + "loss": 0.8536, + "odds_ratio_loss": 0.5378514528274536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07998035848140717, + "rewards/margins": 0.03282248228788376, + "rewards/rejected": -0.11280284821987152, + "sft_loss": 0.7998035550117493, + "step": 1175 + }, + { + "epoch": 0.09, + "grad_norm": 14.74472427368164, + "learning_rate": 9.824141264367372e-06, + "logits/chosen": -1.0445303916931152, + "logits/rejected": -1.044632911682129, + "logps/chosen": -0.960924506187439, + "logps/rejected": -1.0087189674377441, + "loss": 1.0244, + "odds_ratio_loss": 0.6351147294044495, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09609244763851166, + "rewards/margins": 0.004779445938766003, + "rewards/rejected": -0.10087189823389053, + "sft_loss": 0.960924506187439, + "step": 1180 + }, + { + "epoch": 0.09, + "grad_norm": 4.776372909545898, + "learning_rate": 9.822518897918377e-06, + "logits/chosen": -1.3258750438690186, + "logits/rejected": -0.4255582392215729, + "logps/chosen": -1.1844384670257568, + "logps/rejected": -1.5825674533843994, + "loss": 1.2435, + "odds_ratio_loss": 0.590488076210022, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11844384670257568, + "rewards/margins": 0.03981289640069008, + "rewards/rejected": -0.15825673937797546, + "sft_loss": 1.1844384670257568, + "step": 1185 + }, + { + "epoch": 0.09, + "grad_norm": 7.216888904571533, + "learning_rate": 9.820889217507184e-06, + "logits/chosen": -1.306216835975647, + "logits/rejected": -0.8492482304573059, + "logps/chosen": -0.8745654225349426, + "logps/rejected": -1.1286555528640747, + "loss": 0.9287, + "odds_ratio_loss": 0.5410099029541016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08745653927326202, + "rewards/margins": 0.025409013032913208, + "rewards/rejected": -0.11286555230617523, + "sft_loss": 0.8745654225349426, + "step": 1190 + }, + { + "epoch": 0.09, + "grad_norm": 16.240503311157227, + "learning_rate": 9.819252225605409e-06, + "logits/chosen": -1.1860531568527222, + "logits/rejected": -1.108520269393921, + "logps/chosen": -1.0403330326080322, + "logps/rejected": -1.2271296977996826, + "loss": 1.1174, + "odds_ratio_loss": 0.7707337141036987, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10403330624103546, + "rewards/margins": 0.018679680302739143, + "rewards/rejected": -0.12271298468112946, + "sft_loss": 1.0403330326080322, + "step": 1195 + }, + { + "epoch": 0.09, + "grad_norm": 14.383599281311035, + "learning_rate": 9.817607924695756e-06, + "logits/chosen": -1.2943899631500244, + "logits/rejected": -1.288130283355713, + "logps/chosen": -1.0370019674301147, + "logps/rejected": -1.4093286991119385, + "loss": 1.0911, + "odds_ratio_loss": 0.5407058596611023, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10370020568370819, + "rewards/margins": 0.037232670933008194, + "rewards/rejected": -0.1409328728914261, + "sft_loss": 1.0370019674301147, + "step": 1200 + }, + { + "epoch": 0.09, + "grad_norm": 8.22785472869873, + "learning_rate": 9.81595631727202e-06, + "logits/chosen": -1.1882762908935547, + "logits/rejected": -0.9258209466934204, + "logps/chosen": -0.8339977264404297, + "logps/rejected": -1.1905823945999146, + "loss": 0.8972, + "odds_ratio_loss": 0.6322519779205322, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08339976519346237, + "rewards/margins": 0.03565846011042595, + "rewards/rejected": -0.11905822902917862, + "sft_loss": 0.8339977264404297, + "step": 1205 + }, + { + "epoch": 0.09, + "grad_norm": 11.784111976623535, + "learning_rate": 9.81429740583907e-06, + "logits/chosen": -1.306536316871643, + "logits/rejected": -0.9648883938789368, + "logps/chosen": -0.957537055015564, + "logps/rejected": -1.3278559446334839, + "loss": 1.0264, + "odds_ratio_loss": 0.6885043382644653, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09575371444225311, + "rewards/margins": 0.03703188896179199, + "rewards/rejected": -0.1327856034040451, + "sft_loss": 0.957537055015564, + "step": 1210 + }, + { + "epoch": 0.09, + "grad_norm": 46.6259880065918, + "learning_rate": 9.812631192912856e-06, + "logits/chosen": -1.2349252700805664, + "logits/rejected": -0.45999327301979065, + "logps/chosen": -0.8471899032592773, + "logps/rejected": -2.678170680999756, + "loss": 0.8966, + "odds_ratio_loss": 0.49428415298461914, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0847189873456955, + "rewards/margins": 0.18309810757637024, + "rewards/rejected": -0.26781708002090454, + "sft_loss": 0.8471899032592773, + "step": 1215 + }, + { + "epoch": 0.09, + "grad_norm": 117.32431030273438, + "learning_rate": 9.810957681020404e-06, + "logits/chosen": -1.10783052444458, + "logits/rejected": -1.2066490650177002, + "logps/chosen": -0.875723659992218, + "logps/rejected": -1.6135194301605225, + "loss": 0.9094, + "odds_ratio_loss": 0.3370504379272461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0875723659992218, + "rewards/margins": 0.07377958297729492, + "rewards/rejected": -0.16135194897651672, + "sft_loss": 0.875723659992218, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 65.14192199707031, + "learning_rate": 9.809276872699806e-06, + "logits/chosen": -1.25543212890625, + "logits/rejected": -1.2210181951522827, + "logps/chosen": -0.9082130193710327, + "logps/rejected": -6.193860054016113, + "loss": 0.9352, + "odds_ratio_loss": 0.26966392993927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09082130342721939, + "rewards/margins": 0.5285647511482239, + "rewards/rejected": -0.6193860173225403, + "sft_loss": 0.9082130193710327, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 6.9969987869262695, + "learning_rate": 9.80758877050022e-06, + "logits/chosen": -1.374524474143982, + "logits/rejected": -0.625987708568573, + "logps/chosen": -0.9819453954696655, + "logps/rejected": -1.604069709777832, + "loss": 1.0319, + "odds_ratio_loss": 0.4998885989189148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09819453954696655, + "rewards/margins": 0.06221244856715202, + "rewards/rejected": -0.16040697693824768, + "sft_loss": 0.9819453954696655, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 10.998051643371582, + "learning_rate": 9.80589337698187e-06, + "logits/chosen": -1.3153575658798218, + "logits/rejected": -0.9550518989562988, + "logps/chosen": -0.9396483302116394, + "logps/rejected": -1.1199018955230713, + "loss": 1.0017, + "odds_ratio_loss": 0.6204285621643066, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0939648374915123, + "rewards/margins": 0.01802534982562065, + "rewards/rejected": -0.11199019104242325, + "sft_loss": 0.9396483302116394, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 15.231605529785156, + "learning_rate": 9.804190694716031e-06, + "logits/chosen": -1.3787672519683838, + "logits/rejected": -0.8150347471237183, + "logps/chosen": -1.0452436208724976, + "logps/rejected": -2.0565788745880127, + "loss": 1.0783, + "odds_ratio_loss": 0.3310582637786865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10452437400817871, + "rewards/margins": 0.10113354027271271, + "rewards/rejected": -0.20565791428089142, + "sft_loss": 1.0452436208724976, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 5.462742805480957, + "learning_rate": 9.802480726285041e-06, + "logits/chosen": -1.3796197175979614, + "logits/rejected": -0.7846873998641968, + "logps/chosen": -0.6047025918960571, + "logps/rejected": -5.4197773933410645, + "loss": 0.6444, + "odds_ratio_loss": 0.3967844843864441, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06047026067972183, + "rewards/margins": 0.48150748014450073, + "rewards/rejected": -0.5419777631759644, + "sft_loss": 0.6047025918960571, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 6.896554470062256, + "learning_rate": 9.800763474282284e-06, + "logits/chosen": -1.3147039413452148, + "logits/rejected": -0.6698298454284668, + "logps/chosen": -1.0624172687530518, + "logps/rejected": -1.228693962097168, + "loss": 1.1291, + "odds_ratio_loss": 0.6668539047241211, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10624171793460846, + "rewards/margins": 0.016627687960863113, + "rewards/rejected": -0.12286939471960068, + "sft_loss": 1.0624172687530518, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 31.245380401611328, + "learning_rate": 9.79903894131219e-06, + "logits/chosen": -1.3014583587646484, + "logits/rejected": -1.0607304573059082, + "logps/chosen": -1.271507740020752, + "logps/rejected": -1.0553052425384521, + "loss": 1.3613, + "odds_ratio_loss": 0.8978258967399597, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1271507740020752, + "rewards/margins": -0.02162025310099125, + "rewards/rejected": -0.1055305227637291, + "sft_loss": 1.271507740020752, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 13.336468696594238, + "learning_rate": 9.797307129990227e-06, + "logits/chosen": -1.0399762392044067, + "logits/rejected": -1.2948471307754517, + "logps/chosen": -0.8306800723075867, + "logps/rejected": -1.1717512607574463, + "loss": 0.8946, + "odds_ratio_loss": 0.6392477750778198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08306801319122314, + "rewards/margins": 0.03410711884498596, + "rewards/rejected": -0.11717512458562851, + "sft_loss": 0.8306800723075867, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 11.68088436126709, + "learning_rate": 9.795568042942916e-06, + "logits/chosen": -1.3981082439422607, + "logits/rejected": -1.2398771047592163, + "logps/chosen": -0.8838878870010376, + "logps/rejected": -5.310142517089844, + "loss": 0.9177, + "odds_ratio_loss": 0.3379073739051819, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08838878571987152, + "rewards/margins": 0.4426254630088806, + "rewards/rejected": -0.5310143232345581, + "sft_loss": 0.8838878870010376, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 116.3707046508789, + "learning_rate": 9.793821682807797e-06, + "logits/chosen": -1.3818522691726685, + "logits/rejected": -0.6395906209945679, + "logps/chosen": -1.6790422201156616, + "logps/rejected": -2.7593679428100586, + "loss": 1.7259, + "odds_ratio_loss": 0.4688444137573242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16790422797203064, + "rewards/margins": 0.10803258419036865, + "rewards/rejected": -0.2759368121623993, + "sft_loss": 1.6790422201156616, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 5.718632698059082, + "learning_rate": 9.79206805223345e-06, + "logits/chosen": -1.3850736618041992, + "logits/rejected": -1.1277830600738525, + "logps/chosen": -1.1699743270874023, + "logps/rejected": -3.3309669494628906, + "loss": 1.2019, + "odds_ratio_loss": 0.3196646571159363, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11699743568897247, + "rewards/margins": 0.21609926223754883, + "rewards/rejected": -0.3330966830253601, + "sft_loss": 1.1699743270874023, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 7.716296195983887, + "learning_rate": 9.790307153879477e-06, + "logits/chosen": -1.3103944063186646, + "logits/rejected": -0.6352896094322205, + "logps/chosen": -1.0161986351013184, + "logps/rejected": -1.074324131011963, + "loss": 1.0845, + "odds_ratio_loss": 0.6833962798118591, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10161986202001572, + "rewards/margins": 0.005812540650367737, + "rewards/rejected": -0.10743240267038345, + "sft_loss": 1.0161986351013184, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 18.123607635498047, + "learning_rate": 9.788538990416503e-06, + "logits/chosen": -1.4223918914794922, + "logits/rejected": -1.336246132850647, + "logps/chosen": -0.8674184083938599, + "logps/rejected": -1.0089197158813477, + "loss": 0.9325, + "odds_ratio_loss": 0.6510507464408875, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08674183487892151, + "rewards/margins": 0.014150148257613182, + "rewards/rejected": -0.10089198499917984, + "sft_loss": 0.8674184083938599, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 4.782289981842041, + "learning_rate": 9.786763564526173e-06, + "logits/chosen": -1.364534616470337, + "logits/rejected": -0.5958219766616821, + "logps/chosen": -0.8636929392814636, + "logps/rejected": -1.2223838567733765, + "loss": 0.9152, + "odds_ratio_loss": 0.5149844288825989, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08636929094791412, + "rewards/margins": 0.035869091749191284, + "rewards/rejected": -0.12223838269710541, + "sft_loss": 0.8636929392814636, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 7.08095121383667, + "learning_rate": 9.78498087890115e-06, + "logits/chosen": -1.5712015628814697, + "logits/rejected": -1.1497470140457153, + "logps/chosen": -0.9990192651748657, + "logps/rejected": -1.026200294494629, + "loss": 1.0707, + "odds_ratio_loss": 0.7166789770126343, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09990192949771881, + "rewards/margins": 0.0027181021869182587, + "rewards/rejected": -0.10262002795934677, + "sft_loss": 0.9990192651748657, + "step": 1295 + }, + { + "epoch": 0.1, + "grad_norm": 12.181612014770508, + "learning_rate": 9.783190936245096e-06, + "logits/chosen": -1.2592464685440063, + "logits/rejected": -0.9734852910041809, + "logps/chosen": -0.7972999811172485, + "logps/rejected": -1.1606394052505493, + "loss": 0.8397, + "odds_ratio_loss": 0.4241735339164734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07972999662160873, + "rewards/margins": 0.03633394464850426, + "rewards/rejected": -0.11606393754482269, + "sft_loss": 0.7972999811172485, + "step": 1300 + }, + { + "epoch": 0.1, + "grad_norm": 7.701669216156006, + "learning_rate": 9.781393739272689e-06, + "logits/chosen": -1.1740028858184814, + "logits/rejected": -0.9590864181518555, + "logps/chosen": -1.0751943588256836, + "logps/rejected": -1.5750086307525635, + "loss": 1.1349, + "odds_ratio_loss": 0.5967916250228882, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10751944780349731, + "rewards/margins": 0.049981411546468735, + "rewards/rejected": -0.15750086307525635, + "sft_loss": 1.0751943588256836, + "step": 1305 + }, + { + "epoch": 0.1, + "grad_norm": 5.968133449554443, + "learning_rate": 9.779589290709607e-06, + "logits/chosen": -1.5533254146575928, + "logits/rejected": -1.0931613445281982, + "logps/chosen": -0.9862836003303528, + "logps/rejected": -1.3846126794815063, + "loss": 1.0598, + "odds_ratio_loss": 0.7355043888092041, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09862836450338364, + "rewards/margins": 0.039832908660173416, + "rewards/rejected": -0.13846126198768616, + "sft_loss": 0.9862836003303528, + "step": 1310 + }, + { + "epoch": 0.1, + "grad_norm": 74.6002426147461, + "learning_rate": 9.777777593292527e-06, + "logits/chosen": -1.1837421655654907, + "logits/rejected": -0.6344571709632874, + "logps/chosen": -0.9500184059143066, + "logps/rejected": -2.0003952980041504, + "loss": 0.9983, + "odds_ratio_loss": 0.4824226498603821, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09500183165073395, + "rewards/margins": 0.10503768920898438, + "rewards/rejected": -0.20003953576087952, + "sft_loss": 0.9500184059143066, + "step": 1315 + }, + { + "epoch": 0.1, + "grad_norm": 500.04864501953125, + "learning_rate": 9.775958649769117e-06, + "logits/chosen": -1.3982659578323364, + "logits/rejected": -1.1193958520889282, + "logps/chosen": -1.9695625305175781, + "logps/rejected": -1.2773897647857666, + "loss": 2.1121, + "odds_ratio_loss": 1.425871729850769, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.19695624709129333, + "rewards/margins": -0.0692172572016716, + "rewards/rejected": -0.12773898243904114, + "sft_loss": 1.9695625305175781, + "step": 1320 + }, + { + "epoch": 0.1, + "grad_norm": 7.868124008178711, + "learning_rate": 9.774132462898033e-06, + "logits/chosen": -1.336496353149414, + "logits/rejected": -1.1353330612182617, + "logps/chosen": -0.7955440282821655, + "logps/rejected": -1.272878646850586, + "loss": 0.8501, + "odds_ratio_loss": 0.5457112193107605, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07955440133810043, + "rewards/margins": 0.047733455896377563, + "rewards/rejected": -0.1272878646850586, + "sft_loss": 0.7955440282821655, + "step": 1325 + }, + { + "epoch": 0.1, + "grad_norm": 8.51680850982666, + "learning_rate": 9.772299035448924e-06, + "logits/chosen": -1.1256171464920044, + "logits/rejected": -0.9005386233329773, + "logps/chosen": -1.3089529275894165, + "logps/rejected": -1.4084324836730957, + "loss": 1.383, + "odds_ratio_loss": 0.7403467297554016, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13089528679847717, + "rewards/margins": 0.009947952814400196, + "rewards/rejected": -0.1408432424068451, + "sft_loss": 1.3089529275894165, + "step": 1330 + }, + { + "epoch": 0.1, + "grad_norm": 6.1593499183654785, + "learning_rate": 9.770458370202412e-06, + "logits/chosen": -1.2082802057266235, + "logits/rejected": -0.9753934741020203, + "logps/chosen": -1.1150516271591187, + "logps/rejected": -2.4671926498413086, + "loss": 1.1581, + "odds_ratio_loss": 0.4302564561367035, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1115051656961441, + "rewards/margins": 0.13521410524845123, + "rewards/rejected": -0.24671927094459534, + "sft_loss": 1.1150516271591187, + "step": 1335 + }, + { + "epoch": 0.1, + "grad_norm": 6.688883304595947, + "learning_rate": 9.7686104699501e-06, + "logits/chosen": -1.2678996324539185, + "logits/rejected": -0.839708149433136, + "logps/chosen": -0.8351173400878906, + "logps/rejected": -1.2071555852890015, + "loss": 0.8955, + "odds_ratio_loss": 0.6039993166923523, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08351172506809235, + "rewards/margins": 0.0372038260102272, + "rewards/rejected": -0.12071555852890015, + "sft_loss": 0.8351173400878906, + "step": 1340 + }, + { + "epoch": 0.1, + "grad_norm": 5.224099159240723, + "learning_rate": 9.766755337494565e-06, + "logits/chosen": -1.248985767364502, + "logits/rejected": -0.7827884554862976, + "logps/chosen": -0.9480899572372437, + "logps/rejected": -0.8805420994758606, + "loss": 1.0298, + "odds_ratio_loss": 0.8172227740287781, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09480900317430496, + "rewards/margins": -0.006754803005605936, + "rewards/rejected": -0.08805420249700546, + "sft_loss": 0.9480899572372437, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 11.390414237976074, + "learning_rate": 9.764892975649349e-06, + "logits/chosen": -1.3521184921264648, + "logits/rejected": -0.9650154113769531, + "logps/chosen": -1.0387392044067383, + "logps/rejected": -1.35995352268219, + "loss": 1.0944, + "odds_ratio_loss": 0.5570557713508606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10387392342090607, + "rewards/margins": 0.0321214385330677, + "rewards/rejected": -0.13599535822868347, + "sft_loss": 1.0387392044067383, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 6.765865325927734, + "learning_rate": 9.763023387238961e-06, + "logits/chosen": -1.3084546327590942, + "logits/rejected": -0.7172307968139648, + "logps/chosen": -1.1213195323944092, + "logps/rejected": -1.8646224737167358, + "loss": 1.2094, + "odds_ratio_loss": 0.8810539245605469, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11213195323944092, + "rewards/margins": 0.07433030009269714, + "rewards/rejected": -0.18646225333213806, + "sft_loss": 1.1213195323944092, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 32.736473083496094, + "learning_rate": 9.76114657509887e-06, + "logits/chosen": -1.277092695236206, + "logits/rejected": -0.8425081968307495, + "logps/chosen": -0.9858977198600769, + "logps/rejected": -1.1639198064804077, + "loss": 1.0578, + "odds_ratio_loss": 0.7190499305725098, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09858977794647217, + "rewards/margins": 0.017802204936742783, + "rewards/rejected": -0.11639197915792465, + "sft_loss": 0.9858977198600769, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 12.08522891998291, + "learning_rate": 9.759262542075498e-06, + "logits/chosen": -1.3010032176971436, + "logits/rejected": -0.9239484071731567, + "logps/chosen": -0.8678629994392395, + "logps/rejected": -1.2071233987808228, + "loss": 0.9279, + "odds_ratio_loss": 0.6008371114730835, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08678629994392395, + "rewards/margins": 0.03392603620886803, + "rewards/rejected": -0.12071233987808228, + "sft_loss": 0.8678629994392395, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 7.780725955963135, + "learning_rate": 9.757371291026223e-06, + "logits/chosen": -1.204240083694458, + "logits/rejected": -0.834067702293396, + "logps/chosen": -0.9842250943183899, + "logps/rejected": -1.17227303981781, + "loss": 1.0623, + "odds_ratio_loss": 0.7805746793746948, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09842251241207123, + "rewards/margins": 0.01880478672683239, + "rewards/rejected": -0.11722730100154877, + "sft_loss": 0.9842250943183899, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 6.746867656707764, + "learning_rate": 9.755472824819366e-06, + "logits/chosen": -1.333153486251831, + "logits/rejected": -1.0350208282470703, + "logps/chosen": -1.1680481433868408, + "logps/rejected": -1.410505771636963, + "loss": 1.2276, + "odds_ratio_loss": 0.5952333211898804, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1168048158288002, + "rewards/margins": 0.024245765060186386, + "rewards/rejected": -0.1410505771636963, + "sft_loss": 1.1680481433868408, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 109.35551452636719, + "learning_rate": 9.753567146334189e-06, + "logits/chosen": -1.2310402393341064, + "logits/rejected": -0.8814123272895813, + "logps/chosen": -1.1120350360870361, + "logps/rejected": -2.6216845512390137, + "loss": 1.1726, + "odds_ratio_loss": 0.6051499247550964, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11120350658893585, + "rewards/margins": 0.15096496045589447, + "rewards/rejected": -0.2621684670448303, + "sft_loss": 1.1120350360870361, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 82.70585632324219, + "learning_rate": 9.7516542584609e-06, + "logits/chosen": -1.381817102432251, + "logits/rejected": -1.098077416419983, + "logps/chosen": -1.1308685541152954, + "logps/rejected": -1.3681640625, + "loss": 1.2013, + "odds_ratio_loss": 0.7040119171142578, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11308685690164566, + "rewards/margins": 0.023729555308818817, + "rewards/rejected": -0.13681641221046448, + "sft_loss": 1.1308685541152954, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 27.114789962768555, + "learning_rate": 9.749734164100635e-06, + "logits/chosen": -0.9133816957473755, + "logits/rejected": -1.0223209857940674, + "logps/chosen": -0.8734237551689148, + "logps/rejected": -1.461368203163147, + "loss": 0.9209, + "odds_ratio_loss": 0.4743104875087738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08734238147735596, + "rewards/margins": 0.058794427663087845, + "rewards/rejected": -0.1461368054151535, + "sft_loss": 0.8734237551689148, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 22.410390853881836, + "learning_rate": 9.74780686616546e-06, + "logits/chosen": -1.1132972240447998, + "logits/rejected": -1.0505969524383545, + "logps/chosen": -1.0885108709335327, + "logps/rejected": -2.5375404357910156, + "loss": 1.1221, + "odds_ratio_loss": 0.3356505036354065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10885109752416611, + "rewards/margins": 0.14490298926830292, + "rewards/rejected": -0.2537540793418884, + "sft_loss": 1.0885108709335327, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 11.0577974319458, + "learning_rate": 9.745872367578366e-06, + "logits/chosen": -1.3365861177444458, + "logits/rejected": -0.8117850422859192, + "logps/chosen": -1.005133032798767, + "logps/rejected": -1.0743589401245117, + "loss": 1.0747, + "odds_ratio_loss": 0.6960892677307129, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10051330178976059, + "rewards/margins": 0.006922594271600246, + "rewards/rejected": -0.10743590444326401, + "sft_loss": 1.005133032798767, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 9.530302047729492, + "learning_rate": 9.743930671273269e-06, + "logits/chosen": -1.328739881515503, + "logits/rejected": -1.3169947862625122, + "logps/chosen": -1.1289832592010498, + "logps/rejected": -4.339430809020996, + "loss": 1.1739, + "odds_ratio_loss": 0.44948825240135193, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1128983274102211, + "rewards/margins": 0.3210447430610657, + "rewards/rejected": -0.43394309282302856, + "sft_loss": 1.1289832592010498, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 6.579575538635254, + "learning_rate": 9.741981780194996e-06, + "logits/chosen": -1.3872390985488892, + "logits/rejected": -1.1063064336776733, + "logps/chosen": -1.0387804508209229, + "logps/rejected": -1.274841070175171, + "loss": 1.1011, + "odds_ratio_loss": 0.6233776807785034, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10387805849313736, + "rewards/margins": 0.023606054484844208, + "rewards/rejected": -0.12748411297798157, + "sft_loss": 1.0387804508209229, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 5.642371654510498, + "learning_rate": 9.740025697299288e-06, + "logits/chosen": -1.2045328617095947, + "logits/rejected": -0.5057806968688965, + "logps/chosen": -0.9965047836303711, + "logps/rejected": -2.6621475219726562, + "loss": 1.0185, + "odds_ratio_loss": 0.22015976905822754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09965048730373383, + "rewards/margins": 0.16656428575515747, + "rewards/rejected": -0.2662147581577301, + "sft_loss": 0.9965047836303711, + "step": 1415 + }, + { + "epoch": 0.11, + "grad_norm": 24.070602416992188, + "learning_rate": 9.73806242555279e-06, + "logits/chosen": -1.1469743251800537, + "logits/rejected": -1.0040260553359985, + "logps/chosen": -0.9361482858657837, + "logps/rejected": -2.1627020835876465, + "loss": 0.9727, + "odds_ratio_loss": 0.3653944730758667, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09361483156681061, + "rewards/margins": 0.12265539169311523, + "rewards/rejected": -0.21627020835876465, + "sft_loss": 0.9361482858657837, + "step": 1420 + }, + { + "epoch": 0.11, + "grad_norm": 19.208091735839844, + "learning_rate": 9.736091967933058e-06, + "logits/chosen": -0.9995628595352173, + "logits/rejected": -1.2005598545074463, + "logps/chosen": -1.0177791118621826, + "logps/rejected": -4.385058879852295, + "loss": 1.0733, + "odds_ratio_loss": 0.5556063652038574, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10177791118621826, + "rewards/margins": 0.33672797679901123, + "rewards/rejected": -0.4385058879852295, + "sft_loss": 1.0177791118621826, + "step": 1425 + }, + { + "epoch": 0.11, + "grad_norm": 8.281458854675293, + "learning_rate": 9.73411432742854e-06, + "logits/chosen": -1.1627318859100342, + "logits/rejected": -0.9758650064468384, + "logps/chosen": -1.022214651107788, + "logps/rejected": -1.0216261148452759, + "loss": 1.0938, + "odds_ratio_loss": 0.7162176966667175, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10222147405147552, + "rewards/margins": -5.8867037296295166e-05, + "rewards/rejected": -0.10216259956359863, + "sft_loss": 1.022214651107788, + "step": 1430 + }, + { + "epoch": 0.11, + "grad_norm": 6.446768283843994, + "learning_rate": 9.732129507038576e-06, + "logits/chosen": -1.0612138509750366, + "logits/rejected": -1.085730791091919, + "logps/chosen": -1.2515077590942383, + "logps/rejected": -1.6711082458496094, + "loss": 1.3005, + "odds_ratio_loss": 0.4898054003715515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12515076994895935, + "rewards/margins": 0.04196004569530487, + "rewards/rejected": -0.16711083054542542, + "sft_loss": 1.2515077590942383, + "step": 1435 + }, + { + "epoch": 0.11, + "grad_norm": 5.9316511154174805, + "learning_rate": 9.730137509773401e-06, + "logits/chosen": -1.2018150091171265, + "logits/rejected": -0.5067328214645386, + "logps/chosen": -1.010285496711731, + "logps/rejected": -2.8801932334899902, + "loss": 1.0461, + "odds_ratio_loss": 0.35789528489112854, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10102854669094086, + "rewards/margins": 0.18699078261852264, + "rewards/rejected": -0.2880193293094635, + "sft_loss": 1.010285496711731, + "step": 1440 + }, + { + "epoch": 0.11, + "grad_norm": 5.917844772338867, + "learning_rate": 9.728138338654131e-06, + "logits/chosen": -1.0095337629318237, + "logits/rejected": -0.9432841539382935, + "logps/chosen": -1.1060152053833008, + "logps/rejected": -1.2171921730041504, + "loss": 1.1696, + "odds_ratio_loss": 0.6360725164413452, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1106015220284462, + "rewards/margins": 0.011117706075310707, + "rewards/rejected": -0.12171921879053116, + "sft_loss": 1.1060152053833008, + "step": 1445 + }, + { + "epoch": 0.11, + "grad_norm": 64.06980895996094, + "learning_rate": 9.726131996712763e-06, + "logits/chosen": -1.41555655002594, + "logits/rejected": -1.0206005573272705, + "logps/chosen": -1.3561770915985107, + "logps/rejected": -3.290637493133545, + "loss": 1.3955, + "odds_ratio_loss": 0.393208771944046, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1356177031993866, + "rewards/margins": 0.1934460550546646, + "rewards/rejected": -0.3290637731552124, + "sft_loss": 1.3561770915985107, + "step": 1450 + }, + { + "epoch": 0.11, + "grad_norm": 6.217737197875977, + "learning_rate": 9.724118486992167e-06, + "logits/chosen": -1.3447182178497314, + "logits/rejected": -0.9803462028503418, + "logps/chosen": -1.143945574760437, + "logps/rejected": -1.5074526071548462, + "loss": 1.2113, + "odds_ratio_loss": 0.6737285852432251, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11439456045627594, + "rewards/margins": 0.03635070472955704, + "rewards/rejected": -0.1507452428340912, + "sft_loss": 1.143945574760437, + "step": 1455 + }, + { + "epoch": 0.11, + "grad_norm": 4.317702770233154, + "learning_rate": 9.72209781254609e-06, + "logits/chosen": -1.3004049062728882, + "logits/rejected": -0.6786057353019714, + "logps/chosen": -0.8832548260688782, + "logps/rejected": -1.0870282649993896, + "loss": 0.941, + "odds_ratio_loss": 0.5778591632843018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08832548558712006, + "rewards/margins": 0.020377354696393013, + "rewards/rejected": -0.10870283842086792, + "sft_loss": 0.8832548260688782, + "step": 1460 + }, + { + "epoch": 0.11, + "grad_norm": 19.033479690551758, + "learning_rate": 9.720069976439138e-06, + "logits/chosen": -1.258845567703247, + "logits/rejected": -0.6938650012016296, + "logps/chosen": -1.0265159606933594, + "logps/rejected": -2.807030439376831, + "loss": 1.0617, + "odds_ratio_loss": 0.3515172600746155, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10265159606933594, + "rewards/margins": 0.1780514419078827, + "rewards/rejected": -0.28070303797721863, + "sft_loss": 1.0265159606933594, + "step": 1465 + }, + { + "epoch": 0.11, + "grad_norm": 5.5583295822143555, + "learning_rate": 9.718034981746784e-06, + "logits/chosen": -1.2468676567077637, + "logits/rejected": -0.6721242070198059, + "logps/chosen": -0.950405478477478, + "logps/rejected": -2.5747733116149902, + "loss": 1.0046, + "odds_ratio_loss": 0.5416213274002075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09504055976867676, + "rewards/margins": 0.16243679821491241, + "rewards/rejected": -0.257477343082428, + "sft_loss": 0.950405478477478, + "step": 1470 + }, + { + "epoch": 0.11, + "grad_norm": 5.851568222045898, + "learning_rate": 9.715992831555356e-06, + "logits/chosen": -1.1191495656967163, + "logits/rejected": -0.8659588694572449, + "logps/chosen": -1.1071155071258545, + "logps/rejected": -1.0805102586746216, + "loss": 1.1782, + "odds_ratio_loss": 0.7104871273040771, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11071155965328217, + "rewards/margins": -0.0026605359744280577, + "rewards/rejected": -0.10805102437734604, + "sft_loss": 1.1071155071258545, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 11.510098457336426, + "learning_rate": 9.713943528962031e-06, + "logits/chosen": -1.4745122194290161, + "logits/rejected": -1.000880479812622, + "logps/chosen": -1.1318867206573486, + "logps/rejected": -3.0544705390930176, + "loss": 1.17, + "odds_ratio_loss": 0.3810274004936218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11318866908550262, + "rewards/margins": 0.19225840270519257, + "rewards/rejected": -0.3054470717906952, + "sft_loss": 1.1318867206573486, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 128.35110473632812, + "learning_rate": 9.71188707707484e-06, + "logits/chosen": -1.1662242412567139, + "logits/rejected": -1.034181833267212, + "logps/chosen": -1.1305862665176392, + "logps/rejected": -1.239294409751892, + "loss": 1.1962, + "odds_ratio_loss": 0.6565095782279968, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11305862665176392, + "rewards/margins": 0.010870824567973614, + "rewards/rejected": -0.12392944097518921, + "sft_loss": 1.1305862665176392, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 24.923812866210938, + "learning_rate": 9.709823479012652e-06, + "logits/chosen": -1.242684245109558, + "logits/rejected": -1.0049892663955688, + "logps/chosen": -1.0039126873016357, + "logps/rejected": -1.7858537435531616, + "loss": 1.0471, + "odds_ratio_loss": 0.43143850564956665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10039126873016357, + "rewards/margins": 0.07819411903619766, + "rewards/rejected": -0.17858538031578064, + "sft_loss": 1.0039126873016357, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 7.694460868835449, + "learning_rate": 9.707752737905175e-06, + "logits/chosen": -1.4090046882629395, + "logits/rejected": -1.0745227336883545, + "logps/chosen": -0.6600391864776611, + "logps/rejected": -0.9888060688972473, + "loss": 0.7161, + "odds_ratio_loss": 0.5601866841316223, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06600391864776611, + "rewards/margins": 0.03287668898701668, + "rewards/rejected": -0.09888060390949249, + "sft_loss": 0.6600391864776611, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 15.502304077148438, + "learning_rate": 9.705674856892953e-06, + "logits/chosen": -1.4167745113372803, + "logits/rejected": -1.0435190200805664, + "logps/chosen": -0.6534953713417053, + "logps/rejected": -5.029236793518066, + "loss": 0.6772, + "odds_ratio_loss": 0.23742082715034485, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0653495341539383, + "rewards/margins": 0.4375740885734558, + "rewards/rejected": -0.5029236078262329, + "sft_loss": 0.6534953713417053, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 18.245086669921875, + "learning_rate": 9.703589839127355e-06, + "logits/chosen": -1.4975404739379883, + "logits/rejected": -0.8924884796142578, + "logps/chosen": -0.7357920408248901, + "logps/rejected": -0.905347466468811, + "loss": 0.8012, + "odds_ratio_loss": 0.6539013981819153, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07357920706272125, + "rewards/margins": 0.016955537721514702, + "rewards/rejected": -0.0905347466468811, + "sft_loss": 0.7357920408248901, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 10.855551719665527, + "learning_rate": 9.701497687770572e-06, + "logits/chosen": -1.4533500671386719, + "logits/rejected": -1.1055314540863037, + "logps/chosen": -1.0148468017578125, + "logps/rejected": -0.8302356600761414, + "loss": 1.1049, + "odds_ratio_loss": 0.9005705118179321, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10148467868566513, + "rewards/margins": -0.018461106345057487, + "rewards/rejected": -0.0830235704779625, + "sft_loss": 1.0148468017578125, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 23.625286102294922, + "learning_rate": 9.699398405995621e-06, + "logits/chosen": -1.3821407556533813, + "logits/rejected": -1.2004427909851074, + "logps/chosen": -1.1571322679519653, + "logps/rejected": -0.9725432395935059, + "loss": 1.2445, + "odds_ratio_loss": 0.8739679455757141, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.1157132238149643, + "rewards/margins": -0.018458906561136246, + "rewards/rejected": -0.09725432842969894, + "sft_loss": 1.1571322679519653, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 9.365283012390137, + "learning_rate": 9.69729199698633e-06, + "logits/chosen": -1.1097975969314575, + "logits/rejected": -0.9331648945808411, + "logps/chosen": -0.8817958831787109, + "logps/rejected": -1.2720427513122559, + "loss": 0.9337, + "odds_ratio_loss": 0.5186026692390442, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0881795883178711, + "rewards/margins": 0.03902468830347061, + "rewards/rejected": -0.1272042840719223, + "sft_loss": 0.8817958831787109, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 7.966055393218994, + "learning_rate": 9.695178463937333e-06, + "logits/chosen": -1.245266318321228, + "logits/rejected": -0.7221423387527466, + "logps/chosen": -1.0875027179718018, + "logps/rejected": -1.6951267719268799, + "loss": 1.1337, + "odds_ratio_loss": 0.46217623353004456, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10875026881694794, + "rewards/margins": 0.06076239421963692, + "rewards/rejected": -0.16951265931129456, + "sft_loss": 1.0875027179718018, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 7.182195663452148, + "learning_rate": 9.693057810054073e-06, + "logits/chosen": -1.4914124011993408, + "logits/rejected": -1.0812093019485474, + "logps/chosen": -0.942160964012146, + "logps/rejected": -3.100853204727173, + "loss": 0.9657, + "odds_ratio_loss": 0.235835000872612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09421609342098236, + "rewards/margins": 0.2158692181110382, + "rewards/rejected": -0.3100852966308594, + "sft_loss": 0.942160964012146, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 8.808905601501465, + "learning_rate": 9.69093003855279e-06, + "logits/chosen": -1.4138414859771729, + "logits/rejected": -1.0396727323532104, + "logps/chosen": -1.1173908710479736, + "logps/rejected": -3.009047746658325, + "loss": 1.1733, + "odds_ratio_loss": 0.5594109296798706, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11173909902572632, + "rewards/margins": 0.18916571140289307, + "rewards/rejected": -0.300904780626297, + "sft_loss": 1.1173908710479736, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 53.27058792114258, + "learning_rate": 9.68879515266052e-06, + "logits/chosen": -1.5334960222244263, + "logits/rejected": -1.2154088020324707, + "logps/chosen": -0.9809530973434448, + "logps/rejected": -1.6464436054229736, + "loss": 1.024, + "odds_ratio_loss": 0.43026527762413025, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09809531271457672, + "rewards/margins": 0.06654904782772064, + "rewards/rejected": -0.16464434564113617, + "sft_loss": 0.9809530973434448, + "step": 1540 + }, + { + "epoch": 0.12, + "grad_norm": 5.976083755493164, + "learning_rate": 9.686653155615089e-06, + "logits/chosen": -1.4377057552337646, + "logits/rejected": -1.030389428138733, + "logps/chosen": -0.8061229586601257, + "logps/rejected": -4.470185279846191, + "loss": 0.8256, + "odds_ratio_loss": 0.195216566324234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08061229437589645, + "rewards/margins": 0.36640629172325134, + "rewards/rejected": -0.447018563747406, + "sft_loss": 0.8061229586601257, + "step": 1545 + }, + { + "epoch": 0.12, + "grad_norm": 10.860502243041992, + "learning_rate": 9.684504050665106e-06, + "logits/chosen": -1.3172852993011475, + "logits/rejected": -1.1030348539352417, + "logps/chosen": -0.8627084493637085, + "logps/rejected": -1.4381139278411865, + "loss": 0.8992, + "odds_ratio_loss": 0.365181028842926, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08627085387706757, + "rewards/margins": 0.057540543377399445, + "rewards/rejected": -0.1438113898038864, + "sft_loss": 0.8627084493637085, + "step": 1550 + }, + { + "epoch": 0.12, + "grad_norm": 8.827674865722656, + "learning_rate": 9.682347841069961e-06, + "logits/chosen": -1.2773463726043701, + "logits/rejected": -0.7163883447647095, + "logps/chosen": -0.9320128560066223, + "logps/rejected": -2.8091297149658203, + "loss": 0.9929, + "odds_ratio_loss": 0.6083893775939941, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09320129454135895, + "rewards/margins": 0.1877116858959198, + "rewards/rejected": -0.28091296553611755, + "sft_loss": 0.9320128560066223, + "step": 1555 + }, + { + "epoch": 0.12, + "grad_norm": 11.049094200134277, + "learning_rate": 9.680184530099822e-06, + "logits/chosen": -1.1425576210021973, + "logits/rejected": -1.0668671131134033, + "logps/chosen": -1.2148054838180542, + "logps/rejected": -1.9959310293197632, + "loss": 1.2595, + "odds_ratio_loss": 0.44650644063949585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1214805468916893, + "rewards/margins": 0.07811255753040314, + "rewards/rejected": -0.19959311187267303, + "sft_loss": 1.2148054838180542, + "step": 1560 + }, + { + "epoch": 0.12, + "grad_norm": 7.441697597503662, + "learning_rate": 9.678014121035626e-06, + "logits/chosen": -1.2251560688018799, + "logits/rejected": -0.7328432202339172, + "logps/chosen": -1.2359546422958374, + "logps/rejected": -1.459623098373413, + "loss": 1.2983, + "odds_ratio_loss": 0.6238261461257935, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1235954612493515, + "rewards/margins": 0.022366849705576897, + "rewards/rejected": -0.14596232771873474, + "sft_loss": 1.2359546422958374, + "step": 1565 + }, + { + "epoch": 0.12, + "grad_norm": 11.25999641418457, + "learning_rate": 9.67583661716907e-06, + "logits/chosen": -1.2264257669448853, + "logits/rejected": -0.9383336901664734, + "logps/chosen": -1.144769549369812, + "logps/rejected": -1.3678714036941528, + "loss": 1.2072, + "odds_ratio_loss": 0.624315083026886, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11447696387767792, + "rewards/margins": 0.022310173138976097, + "rewards/rejected": -0.13678714632987976, + "sft_loss": 1.144769549369812, + "step": 1570 + }, + { + "epoch": 0.12, + "grad_norm": 6.60423469543457, + "learning_rate": 9.673652021802615e-06, + "logits/chosen": -1.313948631286621, + "logits/rejected": -0.851759135723114, + "logps/chosen": -0.7691382765769958, + "logps/rejected": -3.526015520095825, + "loss": 0.8021, + "odds_ratio_loss": 0.3292834162712097, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07691384106874466, + "rewards/margins": 0.27568769454956055, + "rewards/rejected": -0.3526015281677246, + "sft_loss": 0.7691382765769958, + "step": 1575 + }, + { + "epoch": 0.12, + "grad_norm": 9.645243644714355, + "learning_rate": 9.671460338249481e-06, + "logits/chosen": -1.3922085762023926, + "logits/rejected": -1.1013203859329224, + "logps/chosen": -0.712190568447113, + "logps/rejected": -2.190417528152466, + "loss": 0.7394, + "odds_ratio_loss": 0.27252617478370667, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0712190493941307, + "rewards/margins": 0.14782269299030304, + "rewards/rejected": -0.21904174983501434, + "sft_loss": 0.712190568447113, + "step": 1580 + }, + { + "epoch": 0.12, + "grad_norm": 12.494514465332031, + "learning_rate": 9.669261569833632e-06, + "logits/chosen": -1.3502166271209717, + "logits/rejected": -1.1128828525543213, + "logps/chosen": -1.1661348342895508, + "logps/rejected": -1.326633095741272, + "loss": 1.2252, + "odds_ratio_loss": 0.5908174514770508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1166134849190712, + "rewards/margins": 0.016049817204475403, + "rewards/rejected": -0.132663294672966, + "sft_loss": 1.1661348342895508, + "step": 1585 + }, + { + "epoch": 0.12, + "grad_norm": 5.0776567459106445, + "learning_rate": 9.667055719889778e-06, + "logits/chosen": -1.349551796913147, + "logits/rejected": -0.9270380735397339, + "logps/chosen": -0.8073859214782715, + "logps/rejected": -1.9800093173980713, + "loss": 0.8363, + "odds_ratio_loss": 0.2894620895385742, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08073858916759491, + "rewards/margins": 0.1172623261809349, + "rewards/rejected": -0.1980009377002716, + "sft_loss": 0.8073859214782715, + "step": 1590 + }, + { + "epoch": 0.12, + "grad_norm": 5.410458564758301, + "learning_rate": 9.664842791763374e-06, + "logits/chosen": -1.4142658710479736, + "logits/rejected": -0.8889997601509094, + "logps/chosen": -0.9126744270324707, + "logps/rejected": -2.523634910583496, + "loss": 0.9385, + "odds_ratio_loss": 0.25870975852012634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09126743674278259, + "rewards/margins": 0.16109606623649597, + "rewards/rejected": -0.25236350297927856, + "sft_loss": 0.9126744270324707, + "step": 1595 + }, + { + "epoch": 0.12, + "grad_norm": 5.038325786590576, + "learning_rate": 9.662622788810604e-06, + "logits/chosen": -1.31760835647583, + "logits/rejected": -0.963549017906189, + "logps/chosen": -0.8750256299972534, + "logps/rejected": -1.8355424404144287, + "loss": 0.911, + "odds_ratio_loss": 0.35989946126937866, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08750256896018982, + "rewards/margins": 0.09605169296264648, + "rewards/rejected": -0.1835542619228363, + "sft_loss": 0.8750256299972534, + "step": 1600 + }, + { + "epoch": 0.12, + "grad_norm": 7.45214319229126, + "learning_rate": 9.660395714398387e-06, + "logits/chosen": -1.283881664276123, + "logits/rejected": -0.7890142202377319, + "logps/chosen": -0.9697272181510925, + "logps/rejected": -1.381255865097046, + "loss": 1.0188, + "odds_ratio_loss": 0.49079370498657227, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09697272628545761, + "rewards/margins": 0.041152846068143845, + "rewards/rejected": -0.13812556862831116, + "sft_loss": 0.9697272181510925, + "step": 1605 + }, + { + "epoch": 0.13, + "grad_norm": 52.52288818359375, + "learning_rate": 9.65816157190436e-06, + "logits/chosen": -1.067137598991394, + "logits/rejected": -1.3262689113616943, + "logps/chosen": -1.0371512174606323, + "logps/rejected": -4.629284381866455, + "loss": 1.0658, + "odds_ratio_loss": 0.2862391173839569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10371513664722443, + "rewards/margins": 0.35921329259872437, + "rewards/rejected": -0.46292844414711, + "sft_loss": 1.0371512174606323, + "step": 1610 + }, + { + "epoch": 0.13, + "grad_norm": 17.328516006469727, + "learning_rate": 9.655920364716888e-06, + "logits/chosen": -1.2435493469238281, + "logits/rejected": -1.1730600595474243, + "logps/chosen": -1.148634433746338, + "logps/rejected": -1.3713942766189575, + "loss": 1.215, + "odds_ratio_loss": 0.6635384559631348, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11486344039440155, + "rewards/margins": 0.02227597124874592, + "rewards/rejected": -0.13713940978050232, + "sft_loss": 1.148634433746338, + "step": 1615 + }, + { + "epoch": 0.13, + "grad_norm": 8.536588668823242, + "learning_rate": 9.653672096235042e-06, + "logits/chosen": -1.2994670867919922, + "logits/rejected": -0.5064767599105835, + "logps/chosen": -0.8236101269721985, + "logps/rejected": -1.6406495571136475, + "loss": 0.8592, + "odds_ratio_loss": 0.35558241605758667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08236101269721985, + "rewards/margins": 0.08170395344495773, + "rewards/rejected": -0.16406495869159698, + "sft_loss": 0.8236101269721985, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 12.32243537902832, + "learning_rate": 9.651416769868611e-06, + "logits/chosen": -1.3300247192382812, + "logits/rejected": -0.6631767153739929, + "logps/chosen": -0.9727737307548523, + "logps/rejected": -1.7273757457733154, + "loss": 1.0122, + "odds_ratio_loss": 0.39440396428108215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09727738052606583, + "rewards/margins": 0.07546021044254303, + "rewards/rejected": -0.17273758351802826, + "sft_loss": 0.9727737307548523, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 32.79615783691406, + "learning_rate": 9.64915438903808e-06, + "logits/chosen": -1.4940061569213867, + "logits/rejected": -1.1153476238250732, + "logps/chosen": -1.0253182649612427, + "logps/rejected": -6.928043365478516, + "loss": 1.0591, + "odds_ratio_loss": 0.33826178312301636, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10253182798624039, + "rewards/margins": 0.5902725458145142, + "rewards/rejected": -0.6928043961524963, + "sft_loss": 1.0253182649612427, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 6.1653289794921875, + "learning_rate": 9.646884957174639e-06, + "logits/chosen": -1.3538758754730225, + "logits/rejected": -0.8061432838439941, + "logps/chosen": -0.9848779439926147, + "logps/rejected": -2.1277546882629395, + "loss": 1.0324, + "odds_ratio_loss": 0.47568100690841675, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09848780184984207, + "rewards/margins": 0.11428769677877426, + "rewards/rejected": -0.21277549862861633, + "sft_loss": 0.9848779439926147, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 24.2289981842041, + "learning_rate": 9.64460847772017e-06, + "logits/chosen": -1.1956393718719482, + "logits/rejected": -1.2510998249053955, + "logps/chosen": -1.0347821712493896, + "logps/rejected": -9.191658020019531, + "loss": 1.0668, + "odds_ratio_loss": 0.3206237256526947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10347823053598404, + "rewards/margins": 0.8156875371932983, + "rewards/rejected": -0.9191657900810242, + "sft_loss": 1.0347821712493896, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 13.129053115844727, + "learning_rate": 9.642324954127241e-06, + "logits/chosen": -1.4041489362716675, + "logits/rejected": -1.0233229398727417, + "logps/chosen": -1.1459414958953857, + "logps/rejected": -4.507222652435303, + "loss": 1.1969, + "odds_ratio_loss": 0.5096083283424377, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11459414660930634, + "rewards/margins": 0.3361281156539917, + "rewards/rejected": -0.45072221755981445, + "sft_loss": 1.1459414958953857, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 5.412563323974609, + "learning_rate": 9.640034389859105e-06, + "logits/chosen": -1.2122042179107666, + "logits/rejected": -0.7289305925369263, + "logps/chosen": -0.8540544509887695, + "logps/rejected": -1.4945560693740845, + "loss": 0.8913, + "odds_ratio_loss": 0.37204310297966003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08540545403957367, + "rewards/margins": 0.06405016034841537, + "rewards/rejected": -0.14945560693740845, + "sft_loss": 0.8540544509887695, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 6.666295528411865, + "learning_rate": 9.637736788389698e-06, + "logits/chosen": -1.3543541431427002, + "logits/rejected": -0.9869254231452942, + "logps/chosen": -0.9420193433761597, + "logps/rejected": -1.282476544380188, + "loss": 0.9937, + "odds_ratio_loss": 0.5166751146316528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09420192986726761, + "rewards/margins": 0.03404572606086731, + "rewards/rejected": -0.12824766337871552, + "sft_loss": 0.9420193433761597, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 5.094629287719727, + "learning_rate": 9.635432153203618e-06, + "logits/chosen": -1.3278597593307495, + "logits/rejected": -0.9322368502616882, + "logps/chosen": -0.8363531827926636, + "logps/rejected": -0.9989891052246094, + "loss": 0.8974, + "odds_ratio_loss": 0.6107999682426453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08363533020019531, + "rewards/margins": 0.016263587400317192, + "rewards/rejected": -0.09989891946315765, + "sft_loss": 0.8363531827926636, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 6.596664905548096, + "learning_rate": 9.633120487796145e-06, + "logits/chosen": -1.32304048538208, + "logits/rejected": -0.7720328569412231, + "logps/chosen": -0.904772162437439, + "logps/rejected": -1.3722435235977173, + "loss": 0.9454, + "odds_ratio_loss": 0.4057803750038147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09047721326351166, + "rewards/margins": 0.04674714058637619, + "rewards/rejected": -0.13722436130046844, + "sft_loss": 0.904772162437439, + "step": 1665 + }, + { + "epoch": 0.13, + "grad_norm": 8.84036636352539, + "learning_rate": 9.630801795673203e-06, + "logits/chosen": -1.4791336059570312, + "logits/rejected": -1.0405861139297485, + "logps/chosen": -0.7008475065231323, + "logps/rejected": -1.8262150287628174, + "loss": 0.7382, + "odds_ratio_loss": 0.3737823963165283, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07008475810289383, + "rewards/margins": 0.11253675073385239, + "rewards/rejected": -0.18262150883674622, + "sft_loss": 0.7008475065231323, + "step": 1670 + }, + { + "epoch": 0.13, + "grad_norm": 14.353055953979492, + "learning_rate": 9.628476080351392e-06, + "logits/chosen": -1.5037769079208374, + "logits/rejected": -1.0465939044952393, + "logps/chosen": -1.0048973560333252, + "logps/rejected": -1.4740705490112305, + "loss": 1.0509, + "odds_ratio_loss": 0.4605104923248291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10048973560333252, + "rewards/margins": 0.04691731557250023, + "rewards/rejected": -0.14740703999996185, + "sft_loss": 1.0048973560333252, + "step": 1675 + }, + { + "epoch": 0.13, + "grad_norm": 6.722265720367432, + "learning_rate": 9.62614334535795e-06, + "logits/chosen": -1.1800585985183716, + "logits/rejected": -1.0569149255752563, + "logps/chosen": -0.9442615509033203, + "logps/rejected": -1.088505506515503, + "loss": 1.0093, + "odds_ratio_loss": 0.6508314609527588, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09442616999149323, + "rewards/margins": 0.0144243985414505, + "rewards/rejected": -0.10885056108236313, + "sft_loss": 0.9442615509033203, + "step": 1680 + }, + { + "epoch": 0.13, + "grad_norm": 22.2547607421875, + "learning_rate": 9.623803594230768e-06, + "logits/chosen": -1.3695385456085205, + "logits/rejected": -1.2436089515686035, + "logps/chosen": -0.7560356855392456, + "logps/rejected": -1.0340205430984497, + "loss": 0.8104, + "odds_ratio_loss": 0.5438529253005981, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07560355961322784, + "rewards/margins": 0.027798492461442947, + "rewards/rejected": -0.10340205579996109, + "sft_loss": 0.7560356855392456, + "step": 1685 + }, + { + "epoch": 0.13, + "grad_norm": 7.995831489562988, + "learning_rate": 9.621456830518372e-06, + "logits/chosen": -1.3292655944824219, + "logits/rejected": -1.1388499736785889, + "logps/chosen": -1.080413579940796, + "logps/rejected": -1.0560017824172974, + "loss": 1.15, + "odds_ratio_loss": 0.6955240368843079, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10804136842489243, + "rewards/margins": -0.002441184129565954, + "rewards/rejected": -0.10560017824172974, + "sft_loss": 1.080413579940796, + "step": 1690 + }, + { + "epoch": 0.13, + "grad_norm": 11.818976402282715, + "learning_rate": 9.61910305777993e-06, + "logits/chosen": -1.2284135818481445, + "logits/rejected": -0.9383662939071655, + "logps/chosen": -1.0092649459838867, + "logps/rejected": -1.004175066947937, + "loss": 1.0855, + "odds_ratio_loss": 0.7619765996932983, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10092649608850479, + "rewards/margins": -0.0005089893820695579, + "rewards/rejected": -0.10041750967502594, + "sft_loss": 1.0092649459838867, + "step": 1695 + }, + { + "epoch": 0.13, + "grad_norm": 5.940454483032227, + "learning_rate": 9.616742279585237e-06, + "logits/chosen": -1.2692519426345825, + "logits/rejected": -0.5092897415161133, + "logps/chosen": -1.0339243412017822, + "logps/rejected": -1.1642903089523315, + "loss": 1.094, + "odds_ratio_loss": 0.6005213856697083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10339243710041046, + "rewards/margins": 0.013036603108048439, + "rewards/rejected": -0.11642904579639435, + "sft_loss": 1.0339243412017822, + "step": 1700 + }, + { + "epoch": 0.13, + "grad_norm": 5.760742664337158, + "learning_rate": 9.614374499514712e-06, + "logits/chosen": -1.2022879123687744, + "logits/rejected": -0.7368927001953125, + "logps/chosen": -1.0404551029205322, + "logps/rejected": -0.8103219866752625, + "loss": 1.1465, + "odds_ratio_loss": 1.0605835914611816, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10404551029205322, + "rewards/margins": -0.023013316094875336, + "rewards/rejected": -0.08103219419717789, + "sft_loss": 1.0404551029205322, + "step": 1705 + }, + { + "epoch": 0.13, + "grad_norm": 10.959287643432617, + "learning_rate": 9.611999721159397e-06, + "logits/chosen": -1.2368319034576416, + "logits/rejected": -1.1851609945297241, + "logps/chosen": -1.2158687114715576, + "logps/rejected": -3.8632235527038574, + "loss": 1.256, + "odds_ratio_loss": 0.4010738730430603, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.121586874127388, + "rewards/margins": 0.26473551988601685, + "rewards/rejected": -0.38632237911224365, + "sft_loss": 1.2158687114715576, + "step": 1710 + }, + { + "epoch": 0.13, + "grad_norm": 8.698751449584961, + "learning_rate": 9.609617948120939e-06, + "logits/chosen": -1.3946199417114258, + "logits/rejected": -1.0465425252914429, + "logps/chosen": -0.876213550567627, + "logps/rejected": -3.2105965614318848, + "loss": 0.9136, + "odds_ratio_loss": 0.3735765218734741, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08762134611606598, + "rewards/margins": 0.23343829810619354, + "rewards/rejected": -0.3210596740245819, + "sft_loss": 0.876213550567627, + "step": 1715 + }, + { + "epoch": 0.13, + "grad_norm": 5.588376522064209, + "learning_rate": 9.607229184011605e-06, + "logits/chosen": -1.3224467039108276, + "logits/rejected": -0.8747288584709167, + "logps/chosen": -1.0286288261413574, + "logps/rejected": -1.219234824180603, + "loss": 1.0875, + "odds_ratio_loss": 0.5890880823135376, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1028628796339035, + "rewards/margins": 0.019060594961047173, + "rewards/rejected": -0.12192346900701523, + "sft_loss": 1.0286288261413574, + "step": 1720 + }, + { + "epoch": 0.13, + "grad_norm": 11.192124366760254, + "learning_rate": 9.604833432454257e-06, + "logits/chosen": -1.3245986700057983, + "logits/rejected": -0.7286895513534546, + "logps/chosen": -1.0568475723266602, + "logps/rejected": -1.8521363735198975, + "loss": 1.0987, + "odds_ratio_loss": 0.418095201253891, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10568475723266602, + "rewards/margins": 0.07952889800071716, + "rewards/rejected": -0.18521365523338318, + "sft_loss": 1.0568475723266602, + "step": 1725 + }, + { + "epoch": 0.13, + "grad_norm": 5.898077011108398, + "learning_rate": 9.602430697082357e-06, + "logits/chosen": -1.3225600719451904, + "logits/rejected": -0.8455519676208496, + "logps/chosen": -1.1911251544952393, + "logps/rejected": -2.105005979537964, + "loss": 1.2411, + "odds_ratio_loss": 0.49952688813209534, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11911250650882721, + "rewards/margins": 0.09138808399438858, + "rewards/rejected": -0.2105005979537964, + "sft_loss": 1.1911251544952393, + "step": 1730 + }, + { + "epoch": 0.13, + "grad_norm": 17.48480987548828, + "learning_rate": 9.600020981539956e-06, + "logits/chosen": -1.2910171747207642, + "logits/rejected": -0.7443715929985046, + "logps/chosen": -1.270747423171997, + "logps/rejected": -1.583164930343628, + "loss": 1.3271, + "odds_ratio_loss": 0.5633386373519897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12707474827766418, + "rewards/margins": 0.031241733580827713, + "rewards/rejected": -0.1583164930343628, + "sft_loss": 1.270747423171997, + "step": 1735 + }, + { + "epoch": 0.14, + "grad_norm": 24.316553115844727, + "learning_rate": 9.597604289481694e-06, + "logits/chosen": -1.2967002391815186, + "logits/rejected": -1.2517060041427612, + "logps/chosen": -1.1744412183761597, + "logps/rejected": -1.2925034761428833, + "loss": 1.2822, + "odds_ratio_loss": 1.077358365058899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11744412034749985, + "rewards/margins": 0.011806219816207886, + "rewards/rejected": -0.12925033271312714, + "sft_loss": 1.1744412183761597, + "step": 1740 + }, + { + "epoch": 0.14, + "grad_norm": 8.544798851013184, + "learning_rate": 9.595180624572796e-06, + "logits/chosen": -1.1876946687698364, + "logits/rejected": -0.711455762386322, + "logps/chosen": -1.0169483423233032, + "logps/rejected": -1.6038730144500732, + "loss": 1.0643, + "odds_ratio_loss": 0.47310882806777954, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10169483721256256, + "rewards/margins": 0.05869249254465103, + "rewards/rejected": -0.160387322306633, + "sft_loss": 1.0169483423233032, + "step": 1745 + }, + { + "epoch": 0.14, + "grad_norm": 11.438017845153809, + "learning_rate": 9.59274999048905e-06, + "logits/chosen": -1.2468347549438477, + "logits/rejected": -1.1550118923187256, + "logps/chosen": -1.0963854789733887, + "logps/rejected": -1.1607807874679565, + "loss": 1.1889, + "odds_ratio_loss": 0.9248707890510559, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10963854938745499, + "rewards/margins": 0.006439529359340668, + "rewards/rejected": -0.11607807874679565, + "sft_loss": 1.0963854789733887, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 10.904440879821777, + "learning_rate": 9.590312390916827e-06, + "logits/chosen": -1.3679497241973877, + "logits/rejected": -1.2733559608459473, + "logps/chosen": -0.9566739797592163, + "logps/rejected": -1.2512528896331787, + "loss": 1.0128, + "odds_ratio_loss": 0.5608989000320435, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09566739946603775, + "rewards/margins": 0.029457902535796165, + "rewards/rejected": -0.12512531876564026, + "sft_loss": 0.9566739797592163, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 83.32542419433594, + "learning_rate": 9.587867829553055e-06, + "logits/chosen": -1.494425654411316, + "logits/rejected": -1.193587064743042, + "logps/chosen": -0.7924457788467407, + "logps/rejected": -1.9595781564712524, + "loss": 0.8297, + "odds_ratio_loss": 0.37288200855255127, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07924458384513855, + "rewards/margins": 0.11671324074268341, + "rewards/rejected": -0.19595780968666077, + "sft_loss": 0.7924457788467407, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 6.813757419586182, + "learning_rate": 9.58541631010522e-06, + "logits/chosen": -1.3695342540740967, + "logits/rejected": -0.6321445107460022, + "logps/chosen": -0.7915887236595154, + "logps/rejected": -4.873553276062012, + "loss": 0.8314, + "odds_ratio_loss": 0.39797115325927734, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07915887981653214, + "rewards/margins": 0.40819644927978516, + "rewards/rejected": -0.48735538125038147, + "sft_loss": 0.7915887236595154, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 94.25627899169922, + "learning_rate": 9.582957836291365e-06, + "logits/chosen": -1.3012911081314087, + "logits/rejected": -1.3341959714889526, + "logps/chosen": -1.2055413722991943, + "logps/rejected": -5.983010292053223, + "loss": 1.2533, + "odds_ratio_loss": 0.4779466986656189, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12055414915084839, + "rewards/margins": 0.4777469038963318, + "rewards/rejected": -0.5983010530471802, + "sft_loss": 1.2055413722991943, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 9.417245864868164, + "learning_rate": 9.580492411840074e-06, + "logits/chosen": -1.2206226587295532, + "logits/rejected": -1.2017875909805298, + "logps/chosen": -1.0882132053375244, + "logps/rejected": -2.914821147918701, + "loss": 1.1314, + "odds_ratio_loss": 0.43143337965011597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1088213175535202, + "rewards/margins": 0.1826608031988144, + "rewards/rejected": -0.291482150554657, + "sft_loss": 1.0882132053375244, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 6.386961936950684, + "learning_rate": 9.57802004049048e-06, + "logits/chosen": -1.163529396057129, + "logits/rejected": -0.9100425839424133, + "logps/chosen": -0.9114856719970703, + "logps/rejected": -1.1690971851348877, + "loss": 0.976, + "odds_ratio_loss": 0.6448505520820618, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09114857017993927, + "rewards/margins": 0.025761157274246216, + "rewards/rejected": -0.11690972000360489, + "sft_loss": 0.9114856719970703, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 8.349102020263672, + "learning_rate": 9.575540725992247e-06, + "logits/chosen": -1.204837441444397, + "logits/rejected": -0.7992308139801025, + "logps/chosen": -1.0231040716171265, + "logps/rejected": -1.9079253673553467, + "loss": 1.0584, + "odds_ratio_loss": 0.3529992401599884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10231040418148041, + "rewards/margins": 0.08848213404417038, + "rewards/rejected": -0.1907925307750702, + "sft_loss": 1.0231040716171265, + "step": 1785 + }, + { + "epoch": 0.14, + "grad_norm": 6.984671115875244, + "learning_rate": 9.573054472105569e-06, + "logits/chosen": -1.160017967224121, + "logits/rejected": -0.872658908367157, + "logps/chosen": -1.000333309173584, + "logps/rejected": -1.3170549869537354, + "loss": 1.0523, + "odds_ratio_loss": 0.5200861692428589, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10003332793712616, + "rewards/margins": 0.031672172248363495, + "rewards/rejected": -0.13170549273490906, + "sft_loss": 1.000333309173584, + "step": 1790 + }, + { + "epoch": 0.14, + "grad_norm": 122.16671752929688, + "learning_rate": 9.570561282601167e-06, + "logits/chosen": -1.2130086421966553, + "logits/rejected": -0.7942731380462646, + "logps/chosen": -1.1537272930145264, + "logps/rejected": -1.1185890436172485, + "loss": 1.3542, + "odds_ratio_loss": 2.0043020248413086, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11537273228168488, + "rewards/margins": -0.003513835370540619, + "rewards/rejected": -0.11185890436172485, + "sft_loss": 1.1537272930145264, + "step": 1795 + }, + { + "epoch": 0.14, + "grad_norm": 6.3150434494018555, + "learning_rate": 9.568061161260278e-06, + "logits/chosen": -1.4022128582000732, + "logits/rejected": -1.072725772857666, + "logps/chosen": -0.8498795628547668, + "logps/rejected": -3.3911800384521484, + "loss": 0.8751, + "odds_ratio_loss": 0.25199708342552185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08498796075582504, + "rewards/margins": 0.2541300654411316, + "rewards/rejected": -0.33911800384521484, + "sft_loss": 0.8498795628547668, + "step": 1800 + }, + { + "epoch": 0.14, + "grad_norm": 52.85586929321289, + "learning_rate": 9.565554111874656e-06, + "logits/chosen": -1.3249465227127075, + "logits/rejected": -1.126591682434082, + "logps/chosen": -1.2122596502304077, + "logps/rejected": -2.6743364334106445, + "loss": 1.2655, + "odds_ratio_loss": 0.5320409536361694, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12122596800327301, + "rewards/margins": 0.14620766043663025, + "rewards/rejected": -0.26743364334106445, + "sft_loss": 1.2122596502304077, + "step": 1805 + }, + { + "epoch": 0.14, + "grad_norm": 12.57426643371582, + "learning_rate": 9.563040138246555e-06, + "logits/chosen": -1.338123083114624, + "logits/rejected": -0.9952232241630554, + "logps/chosen": -1.2487857341766357, + "logps/rejected": -4.27414083480835, + "loss": 1.2862, + "odds_ratio_loss": 0.3736916184425354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12487858533859253, + "rewards/margins": 0.3025355041027069, + "rewards/rejected": -0.4274141192436218, + "sft_loss": 1.2487857341766357, + "step": 1810 + }, + { + "epoch": 0.14, + "grad_norm": 5.040468692779541, + "learning_rate": 9.560519244188741e-06, + "logits/chosen": -1.1609458923339844, + "logits/rejected": -1.012904405593872, + "logps/chosen": -0.879071056842804, + "logps/rejected": -2.6562893390655518, + "loss": 0.8998, + "odds_ratio_loss": 0.20734456181526184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0879070982336998, + "rewards/margins": 0.17772184312343597, + "rewards/rejected": -0.2656289339065552, + "sft_loss": 0.879071056842804, + "step": 1815 + }, + { + "epoch": 0.14, + "grad_norm": 7.037989616394043, + "learning_rate": 9.557991433524465e-06, + "logits/chosen": -1.0672191381454468, + "logits/rejected": -0.8686882257461548, + "logps/chosen": -0.988175094127655, + "logps/rejected": -3.527780055999756, + "loss": 1.0157, + "odds_ratio_loss": 0.27490872144699097, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09881751239299774, + "rewards/margins": 0.2539605498313904, + "rewards/rejected": -0.35277801752090454, + "sft_loss": 0.988175094127655, + "step": 1820 + }, + { + "epoch": 0.14, + "grad_norm": 14.428661346435547, + "learning_rate": 9.555456710087476e-06, + "logits/chosen": -1.3363722562789917, + "logits/rejected": -0.9723957777023315, + "logps/chosen": -1.0374715328216553, + "logps/rejected": -1.990022897720337, + "loss": 1.0726, + "odds_ratio_loss": 0.35115641355514526, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1037471666932106, + "rewards/margins": 0.09525513648986816, + "rewards/rejected": -0.19900229573249817, + "sft_loss": 1.0374715328216553, + "step": 1825 + }, + { + "epoch": 0.14, + "grad_norm": 8.201395988464355, + "learning_rate": 9.552915077722002e-06, + "logits/chosen": -1.3187824487686157, + "logits/rejected": -0.9559124112129211, + "logps/chosen": -0.977192759513855, + "logps/rejected": -1.3955583572387695, + "loss": 1.0304, + "odds_ratio_loss": 0.5320570468902588, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09771927446126938, + "rewards/margins": 0.041836559772491455, + "rewards/rejected": -0.13955584168434143, + "sft_loss": 0.977192759513855, + "step": 1830 + }, + { + "epoch": 0.14, + "grad_norm": 5.69422721862793, + "learning_rate": 9.550366540282753e-06, + "logits/chosen": -1.3487383127212524, + "logits/rejected": -0.5610469579696655, + "logps/chosen": -1.0741031169891357, + "logps/rejected": -1.5127952098846436, + "loss": 1.1234, + "odds_ratio_loss": 0.4929905831813812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10741032660007477, + "rewards/margins": 0.04386921599507332, + "rewards/rejected": -0.1512795388698578, + "sft_loss": 1.0741031169891357, + "step": 1835 + }, + { + "epoch": 0.14, + "grad_norm": 27.743892669677734, + "learning_rate": 9.54781110163491e-06, + "logits/chosen": -1.2624667882919312, + "logits/rejected": -0.8160993456840515, + "logps/chosen": -0.9436119794845581, + "logps/rejected": -1.8064286708831787, + "loss": 0.9741, + "odds_ratio_loss": 0.3050013780593872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09436120092868805, + "rewards/margins": 0.0862816870212555, + "rewards/rejected": -0.18064287304878235, + "sft_loss": 0.9436119794845581, + "step": 1840 + }, + { + "epoch": 0.14, + "grad_norm": 21.507509231567383, + "learning_rate": 9.545248765654116e-06, + "logits/chosen": -1.3653714656829834, + "logits/rejected": -1.1635246276855469, + "logps/chosen": -1.2835218906402588, + "logps/rejected": -4.833404541015625, + "loss": 1.3401, + "odds_ratio_loss": 0.5661024451255798, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12835219502449036, + "rewards/margins": 0.3549882471561432, + "rewards/rejected": -0.48334044218063354, + "sft_loss": 1.2835218906402588, + "step": 1845 + }, + { + "epoch": 0.14, + "grad_norm": 5.130289077758789, + "learning_rate": 9.542679536226483e-06, + "logits/chosen": -1.1667282581329346, + "logits/rejected": -0.9166663289070129, + "logps/chosen": -1.0290402173995972, + "logps/rejected": -2.8194077014923096, + "loss": 1.0678, + "odds_ratio_loss": 0.3873458802700043, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10290402173995972, + "rewards/margins": 0.17903673648834229, + "rewards/rejected": -0.281940758228302, + "sft_loss": 1.0290402173995972, + "step": 1850 + }, + { + "epoch": 0.14, + "grad_norm": 55.149497985839844, + "learning_rate": 9.540103417248572e-06, + "logits/chosen": -1.3182549476623535, + "logits/rejected": -0.9496681094169617, + "logps/chosen": -0.9547656774520874, + "logps/rejected": -1.39925217628479, + "loss": 1.0033, + "odds_ratio_loss": 0.485520601272583, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09547658264636993, + "rewards/margins": 0.04444863274693489, + "rewards/rejected": -0.13992521166801453, + "sft_loss": 0.9547656774520874, + "step": 1855 + }, + { + "epoch": 0.14, + "grad_norm": 5.448976039886475, + "learning_rate": 9.537520412627395e-06, + "logits/chosen": -1.3200477361679077, + "logits/rejected": -0.9220551252365112, + "logps/chosen": -1.221671223640442, + "logps/rejected": -3.222076416015625, + "loss": 1.2677, + "odds_ratio_loss": 0.4604717195034027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12216712534427643, + "rewards/margins": 0.2000405341386795, + "rewards/rejected": -0.32220762968063354, + "sft_loss": 1.221671223640442, + "step": 1860 + }, + { + "epoch": 0.15, + "grad_norm": 17.277822494506836, + "learning_rate": 9.534930526280406e-06, + "logits/chosen": -1.3141456842422485, + "logits/rejected": -1.0591038465499878, + "logps/chosen": -1.1151491403579712, + "logps/rejected": -5.379051208496094, + "loss": 1.1551, + "odds_ratio_loss": 0.39921683073043823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11151491105556488, + "rewards/margins": 0.42639026045799255, + "rewards/rejected": -0.5379050970077515, + "sft_loss": 1.1151491403579712, + "step": 1865 + }, + { + "epoch": 0.15, + "grad_norm": 19.059919357299805, + "learning_rate": 9.532333762135498e-06, + "logits/chosen": -1.5041942596435547, + "logits/rejected": -1.2156248092651367, + "logps/chosen": -0.9729844331741333, + "logps/rejected": -0.8604519963264465, + "loss": 1.0619, + "odds_ratio_loss": 0.8896477818489075, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09729844331741333, + "rewards/margins": -0.0112532377243042, + "rewards/rejected": -0.08604519814252853, + "sft_loss": 0.9729844331741333, + "step": 1870 + }, + { + "epoch": 0.15, + "grad_norm": 5.341587066650391, + "learning_rate": 9.52973012413099e-06, + "logits/chosen": -1.4371575117111206, + "logits/rejected": -0.9742057919502258, + "logps/chosen": -1.0960537195205688, + "logps/rejected": -1.8433376550674438, + "loss": 1.1446, + "odds_ratio_loss": 0.48561111092567444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10960537195205688, + "rewards/margins": 0.07472839951515198, + "rewards/rejected": -0.18433377146720886, + "sft_loss": 1.0960537195205688, + "step": 1875 + }, + { + "epoch": 0.15, + "grad_norm": 13.053421974182129, + "learning_rate": 9.527119616215632e-06, + "logits/chosen": -1.3161237239837646, + "logits/rejected": -1.1920946836471558, + "logps/chosen": -0.9443743824958801, + "logps/rejected": -5.979962348937988, + "loss": 0.9891, + "odds_ratio_loss": 0.4469161629676819, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09443744271993637, + "rewards/margins": 0.5035587549209595, + "rewards/rejected": -0.5979962348937988, + "sft_loss": 0.9443743824958801, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 7.233993053436279, + "learning_rate": 9.524502242348592e-06, + "logits/chosen": -1.0858581066131592, + "logits/rejected": -0.6852259635925293, + "logps/chosen": -1.0311671495437622, + "logps/rejected": -1.154486894607544, + "loss": 1.0932, + "odds_ratio_loss": 0.6205244064331055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1031167283654213, + "rewards/margins": 0.012331971898674965, + "rewards/rejected": -0.11544869095087051, + "sft_loss": 1.0311671495437622, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 6.396711826324463, + "learning_rate": 9.521878006499447e-06, + "logits/chosen": -1.2524656057357788, + "logits/rejected": -0.5958696007728577, + "logps/chosen": -1.1098922491073608, + "logps/rejected": -6.758184909820557, + "loss": 1.1436, + "odds_ratio_loss": 0.33697599172592163, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11098922789096832, + "rewards/margins": 0.5648292303085327, + "rewards/rejected": -0.6758184432983398, + "sft_loss": 1.1098922491073608, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 10.993759155273438, + "learning_rate": 9.519246912648186e-06, + "logits/chosen": -1.3063591718673706, + "logits/rejected": -0.8458372354507446, + "logps/chosen": -0.8645821809768677, + "logps/rejected": -1.1725999116897583, + "loss": 0.918, + "odds_ratio_loss": 0.5336938500404358, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.086458221077919, + "rewards/margins": 0.030801767483353615, + "rewards/rejected": -0.11725999414920807, + "sft_loss": 0.8645821809768677, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 5.961607933044434, + "learning_rate": 9.516608964785196e-06, + "logits/chosen": -1.3859764337539673, + "logits/rejected": -1.096915602684021, + "logps/chosen": -0.8092068433761597, + "logps/rejected": -2.0697178840637207, + "loss": 0.8515, + "odds_ratio_loss": 0.42318421602249146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08092068880796432, + "rewards/margins": 0.12605109810829163, + "rewards/rejected": -0.20697179436683655, + "sft_loss": 0.8092068433761597, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 7.101929187774658, + "learning_rate": 9.513964166911258e-06, + "logits/chosen": -1.4362437725067139, + "logits/rejected": -1.1196719408035278, + "logps/chosen": -0.7003141641616821, + "logps/rejected": -0.7816325426101685, + "loss": 0.7882, + "odds_ratio_loss": 0.8789259791374207, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07003141939640045, + "rewards/margins": 0.008131841197609901, + "rewards/rejected": -0.0781632512807846, + "sft_loss": 0.7003141641616821, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 6.640874862670898, + "learning_rate": 9.511312523037549e-06, + "logits/chosen": -1.416475772857666, + "logits/rejected": -0.9427222013473511, + "logps/chosen": -0.9190031290054321, + "logps/rejected": -0.7494014501571655, + "loss": 1.0176, + "odds_ratio_loss": 0.9855034947395325, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.0919003039598465, + "rewards/margins": -0.01696016639471054, + "rewards/rejected": -0.07494015246629715, + "sft_loss": 0.9190031290054321, + "step": 1910 + }, + { + "epoch": 0.15, + "grad_norm": 6.442885875701904, + "learning_rate": 9.508654037185619e-06, + "logits/chosen": -1.3226420879364014, + "logits/rejected": -1.0659013986587524, + "logps/chosen": -1.0546033382415771, + "logps/rejected": -1.6016933917999268, + "loss": 1.1363, + "odds_ratio_loss": 0.8171154260635376, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10546033084392548, + "rewards/margins": 0.054708998650312424, + "rewards/rejected": -0.1601693332195282, + "sft_loss": 1.0546033382415771, + "step": 1915 + }, + { + "epoch": 0.15, + "grad_norm": 4.770626068115234, + "learning_rate": 9.505988713387398e-06, + "logits/chosen": -1.2759541273117065, + "logits/rejected": -0.9872153997421265, + "logps/chosen": -1.1276299953460693, + "logps/rejected": -1.5661379098892212, + "loss": 1.181, + "odds_ratio_loss": 0.5340844392776489, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11276300996541977, + "rewards/margins": 0.04385078325867653, + "rewards/rejected": -0.1566137969493866, + "sft_loss": 1.1276299953460693, + "step": 1920 + }, + { + "epoch": 0.15, + "grad_norm": 12.970624923706055, + "learning_rate": 9.503316555685194e-06, + "logits/chosen": -1.363874912261963, + "logits/rejected": -0.482523113489151, + "logps/chosen": -1.0931895971298218, + "logps/rejected": -1.794382095336914, + "loss": 1.1275, + "odds_ratio_loss": 0.343106746673584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10931895673274994, + "rewards/margins": 0.07011924684047699, + "rewards/rejected": -0.17943820357322693, + "sft_loss": 1.0931895971298218, + "step": 1925 + }, + { + "epoch": 0.15, + "grad_norm": 7.856452465057373, + "learning_rate": 9.500637568131667e-06, + "logits/chosen": -1.2575715780258179, + "logits/rejected": -1.1664985418319702, + "logps/chosen": -0.9274052381515503, + "logps/rejected": -1.0278116464614868, + "loss": 0.9976, + "odds_ratio_loss": 0.7020447850227356, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09274052828550339, + "rewards/margins": 0.010040633380413055, + "rewards/rejected": -0.10278116166591644, + "sft_loss": 0.9274052381515503, + "step": 1930 + }, + { + "epoch": 0.15, + "grad_norm": 11.970571517944336, + "learning_rate": 9.497951754789847e-06, + "logits/chosen": -1.3612158298492432, + "logits/rejected": -0.9841570854187012, + "logps/chosen": -1.069284439086914, + "logps/rejected": -3.824751377105713, + "loss": 1.1041, + "odds_ratio_loss": 0.34776008129119873, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10692846775054932, + "rewards/margins": 0.275546669960022, + "rewards/rejected": -0.3824751377105713, + "sft_loss": 1.069284439086914, + "step": 1935 + }, + { + "epoch": 0.15, + "grad_norm": 32.797340393066406, + "learning_rate": 9.495259119733108e-06, + "logits/chosen": -1.3850946426391602, + "logits/rejected": -1.2886738777160645, + "logps/chosen": -0.9914947748184204, + "logps/rejected": -1.1922539472579956, + "loss": 1.0509, + "odds_ratio_loss": 0.5942121744155884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09914946556091309, + "rewards/margins": 0.020075935870409012, + "rewards/rejected": -0.1192254051566124, + "sft_loss": 0.9914947748184204, + "step": 1940 + }, + { + "epoch": 0.15, + "grad_norm": 11.763391494750977, + "learning_rate": 9.492559667045174e-06, + "logits/chosen": -1.273818016052246, + "logits/rejected": -1.1597501039505005, + "logps/chosen": -0.7548612952232361, + "logps/rejected": -8.429471015930176, + "loss": 0.7835, + "odds_ratio_loss": 0.2863296866416931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07548613101243973, + "rewards/margins": 0.7674610018730164, + "rewards/rejected": -0.8429471254348755, + "sft_loss": 0.7548612952232361, + "step": 1945 + }, + { + "epoch": 0.15, + "grad_norm": 11.239775657653809, + "learning_rate": 9.489853400820106e-06, + "logits/chosen": -1.252996563911438, + "logits/rejected": -0.5567450523376465, + "logps/chosen": -0.9976485967636108, + "logps/rejected": -2.493691921234131, + "loss": 1.0317, + "odds_ratio_loss": 0.3405466675758362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0997648537158966, + "rewards/margins": 0.14960432052612305, + "rewards/rejected": -0.24936918914318085, + "sft_loss": 0.9976485967636108, + "step": 1950 + }, + { + "epoch": 0.15, + "grad_norm": 8.288493156433105, + "learning_rate": 9.487140325162303e-06, + "logits/chosen": -1.3895283937454224, + "logits/rejected": -1.0272207260131836, + "logps/chosen": -0.6770743131637573, + "logps/rejected": -3.0709948539733887, + "loss": 0.7256, + "odds_ratio_loss": 0.48501911759376526, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06770743429660797, + "rewards/margins": 0.23939207196235657, + "rewards/rejected": -0.30709946155548096, + "sft_loss": 0.6770743131637573, + "step": 1955 + }, + { + "epoch": 0.15, + "grad_norm": 41.745906829833984, + "learning_rate": 9.484420444186486e-06, + "logits/chosen": -1.0695884227752686, + "logits/rejected": -0.9039579629898071, + "logps/chosen": -0.7909213304519653, + "logps/rejected": -0.9298604130744934, + "loss": 0.8515, + "odds_ratio_loss": 0.6061114072799683, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07909214496612549, + "rewards/margins": 0.013893899507820606, + "rewards/rejected": -0.09298603981733322, + "sft_loss": 0.7909213304519653, + "step": 1960 + }, + { + "epoch": 0.15, + "grad_norm": 5.356940746307373, + "learning_rate": 9.481693762017702e-06, + "logits/chosen": -1.3417937755584717, + "logits/rejected": -0.7301517724990845, + "logps/chosen": -1.1014509201049805, + "logps/rejected": -1.3265063762664795, + "loss": 1.1633, + "odds_ratio_loss": 0.6179971694946289, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11014509201049805, + "rewards/margins": 0.022505560889840126, + "rewards/rejected": -0.13265065848827362, + "sft_loss": 1.1014509201049805, + "step": 1965 + }, + { + "epoch": 0.15, + "grad_norm": 33.3004035949707, + "learning_rate": 9.47896028279131e-06, + "logits/chosen": -1.3251553773880005, + "logits/rejected": -0.9696298837661743, + "logps/chosen": -1.0699702501296997, + "logps/rejected": -0.8503786325454712, + "loss": 1.1599, + "odds_ratio_loss": 0.8989558219909668, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10699701309204102, + "rewards/margins": -0.021959159523248672, + "rewards/rejected": -0.08503787219524384, + "sft_loss": 1.0699702501296997, + "step": 1970 + }, + { + "epoch": 0.15, + "grad_norm": 8.4285888671875, + "learning_rate": 9.476220010652978e-06, + "logits/chosen": -1.1229819059371948, + "logits/rejected": -0.9317318201065063, + "logps/chosen": -0.8251851797103882, + "logps/rejected": -0.8069744110107422, + "loss": 0.9044, + "odds_ratio_loss": 0.7926350831985474, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08251851797103882, + "rewards/margins": -0.0018210865091532469, + "rewards/rejected": -0.0806974321603775, + "sft_loss": 0.8251851797103882, + "step": 1975 + }, + { + "epoch": 0.15, + "grad_norm": 8.010242462158203, + "learning_rate": 9.473472949758677e-06, + "logits/chosen": -1.1807481050491333, + "logits/rejected": -0.8190022706985474, + "logps/chosen": -1.0934984683990479, + "logps/rejected": -1.292314887046814, + "loss": 1.1505, + "odds_ratio_loss": 0.5697231292724609, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10934984683990479, + "rewards/margins": 0.019881639629602432, + "rewards/rejected": -0.1292314976453781, + "sft_loss": 1.0934984683990479, + "step": 1980 + }, + { + "epoch": 0.15, + "grad_norm": 5.987873554229736, + "learning_rate": 9.470719104274675e-06, + "logits/chosen": -1.3005092144012451, + "logits/rejected": -0.6768913269042969, + "logps/chosen": -1.0687867403030396, + "logps/rejected": -1.6101669073104858, + "loss": 1.1172, + "odds_ratio_loss": 0.4846063554286957, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10687867552042007, + "rewards/margins": 0.054138004779815674, + "rewards/rejected": -0.16101667284965515, + "sft_loss": 1.0687867403030396, + "step": 1985 + }, + { + "epoch": 0.15, + "grad_norm": 15.15396785736084, + "learning_rate": 9.467958478377525e-06, + "logits/chosen": -1.2687232494354248, + "logits/rejected": -0.8056892156600952, + "logps/chosen": -1.0249241590499878, + "logps/rejected": -1.5329560041427612, + "loss": 1.07, + "odds_ratio_loss": 0.4506935477256775, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10249241441488266, + "rewards/margins": 0.05080317705869675, + "rewards/rejected": -0.1532956063747406, + "sft_loss": 1.0249241590499878, + "step": 1990 + }, + { + "epoch": 0.16, + "grad_norm": 33.13812255859375, + "learning_rate": 9.465191076254067e-06, + "logits/chosen": -1.2532131671905518, + "logits/rejected": -1.1098170280456543, + "logps/chosen": -0.9342479705810547, + "logps/rejected": -1.725606918334961, + "loss": 0.9789, + "odds_ratio_loss": 0.44632625579833984, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09342481195926666, + "rewards/margins": 0.07913590222597122, + "rewards/rejected": -0.1725607067346573, + "sft_loss": 0.9342479705810547, + "step": 1995 + }, + { + "epoch": 0.16, + "grad_norm": 5.594306468963623, + "learning_rate": 9.462416902101422e-06, + "logits/chosen": -1.0992923974990845, + "logits/rejected": -0.8452315330505371, + "logps/chosen": -1.4404175281524658, + "logps/rejected": -2.1999869346618652, + "loss": 1.502, + "odds_ratio_loss": 0.6154400110244751, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1440417468547821, + "rewards/margins": 0.07595695555210114, + "rewards/rejected": -0.21999871730804443, + "sft_loss": 1.4404175281524658, + "step": 2000 + }, + { + "epoch": 0.16, + "grad_norm": 15.01916217803955, + "learning_rate": 9.459635960126973e-06, + "logits/chosen": -1.1773990392684937, + "logits/rejected": -0.8374426960945129, + "logps/chosen": -0.9732074737548828, + "logps/rejected": -1.0846574306488037, + "loss": 1.073, + "odds_ratio_loss": 0.9976279139518738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09732075035572052, + "rewards/margins": 0.011144992895424366, + "rewards/rejected": -0.10846574604511261, + "sft_loss": 0.9732074737548828, + "step": 2005 + }, + { + "epoch": 0.16, + "grad_norm": 3.13708233833313, + "learning_rate": 9.456848254548373e-06, + "logits/chosen": -1.4408618211746216, + "logits/rejected": -0.8943861722946167, + "logps/chosen": -1.1539796590805054, + "logps/rejected": -3.8530449867248535, + "loss": 1.1918, + "odds_ratio_loss": 0.37784868478775024, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11539797484874725, + "rewards/margins": 0.26990655064582825, + "rewards/rejected": -0.3853045105934143, + "sft_loss": 1.1539796590805054, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 10.954537391662598, + "learning_rate": 9.454053789593532e-06, + "logits/chosen": -1.28227698802948, + "logits/rejected": -0.8783136606216431, + "logps/chosen": -0.9702832102775574, + "logps/rejected": -2.25543475151062, + "loss": 1.0171, + "odds_ratio_loss": 0.4677762985229492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09702832996845245, + "rewards/margins": 0.12851516902446747, + "rewards/rejected": -0.22554349899291992, + "sft_loss": 0.9702832102775574, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 5.139076232910156, + "learning_rate": 9.451252569500609e-06, + "logits/chosen": -1.4350674152374268, + "logits/rejected": -1.1181226968765259, + "logps/chosen": -1.0790382623672485, + "logps/rejected": -2.726478099822998, + "loss": 1.1174, + "odds_ratio_loss": 0.38367173075675964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10790381580591202, + "rewards/margins": 0.16474398970603943, + "rewards/rejected": -0.27264779806137085, + "sft_loss": 1.0790382623672485, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 17.504884719848633, + "learning_rate": 9.448444598518013e-06, + "logits/chosen": -1.28285813331604, + "logits/rejected": -0.5891727209091187, + "logps/chosen": -1.0441431999206543, + "logps/rejected": -1.1051876544952393, + "loss": 1.1253, + "odds_ratio_loss": 0.8114008903503418, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10441432148218155, + "rewards/margins": 0.006104458123445511, + "rewards/rejected": -0.11051877588033676, + "sft_loss": 1.0441431999206543, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 8.165739059448242, + "learning_rate": 9.445629880904386e-06, + "logits/chosen": -1.2738823890686035, + "logits/rejected": -1.1569932699203491, + "logps/chosen": -0.8925234079360962, + "logps/rejected": -1.6296312808990479, + "loss": 0.925, + "odds_ratio_loss": 0.3249626159667969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08925233781337738, + "rewards/margins": 0.07371079176664352, + "rewards/rejected": -0.1629631370306015, + "sft_loss": 0.8925234079360962, + "step": 2030 + }, + { + "epoch": 0.16, + "grad_norm": 7.672553539276123, + "learning_rate": 9.442808420928606e-06, + "logits/chosen": -1.3439207077026367, + "logits/rejected": -0.768078625202179, + "logps/chosen": -0.9887423515319824, + "logps/rejected": -2.255394220352173, + "loss": 1.0238, + "odds_ratio_loss": 0.3510589003562927, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09887423366308212, + "rewards/margins": 0.1266651749610901, + "rewards/rejected": -0.2255394011735916, + "sft_loss": 0.9887423515319824, + "step": 2035 + }, + { + "epoch": 0.16, + "grad_norm": 5.961903095245361, + "learning_rate": 9.439980222869774e-06, + "logits/chosen": -1.3768771886825562, + "logits/rejected": -0.7793500423431396, + "logps/chosen": -1.2942712306976318, + "logps/rejected": -1.8602949380874634, + "loss": 1.3402, + "odds_ratio_loss": 0.4595089852809906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12942712008953094, + "rewards/margins": 0.056602366268634796, + "rewards/rejected": -0.18602950870990753, + "sft_loss": 1.2942712306976318, + "step": 2040 + }, + { + "epoch": 0.16, + "grad_norm": 33.816226959228516, + "learning_rate": 9.437145291017213e-06, + "logits/chosen": -1.2508987188339233, + "logits/rejected": -1.352567434310913, + "logps/chosen": -0.686786413192749, + "logps/rejected": -1.241456151008606, + "loss": 0.7285, + "odds_ratio_loss": 0.4171048700809479, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06867863982915878, + "rewards/margins": 0.05546698719263077, + "rewards/rejected": -0.12414562702178955, + "sft_loss": 0.686786413192749, + "step": 2045 + }, + { + "epoch": 0.16, + "grad_norm": 5.645260810852051, + "learning_rate": 9.434303629670456e-06, + "logits/chosen": -1.3583166599273682, + "logits/rejected": -0.9741853475570679, + "logps/chosen": -0.840481162071228, + "logps/rejected": -1.6484066247940063, + "loss": 0.9071, + "odds_ratio_loss": 0.6664353609085083, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08404812216758728, + "rewards/margins": 0.08079256117343903, + "rewards/rejected": -0.1648406684398651, + "sft_loss": 0.840481162071228, + "step": 2050 + }, + { + "epoch": 0.16, + "grad_norm": 26.574750900268555, + "learning_rate": 9.431455243139242e-06, + "logits/chosen": -1.1145718097686768, + "logits/rejected": -1.1763341426849365, + "logps/chosen": -1.0447132587432861, + "logps/rejected": -0.8651531934738159, + "loss": 1.1318, + "odds_ratio_loss": 0.8704781532287598, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10447131097316742, + "rewards/margins": -0.017955996096134186, + "rewards/rejected": -0.08651532232761383, + "sft_loss": 1.0447132587432861, + "step": 2055 + }, + { + "epoch": 0.16, + "grad_norm": 84.78226470947266, + "learning_rate": 9.428600135743514e-06, + "logits/chosen": -1.399448037147522, + "logits/rejected": -0.9923950433731079, + "logps/chosen": -0.8076363801956177, + "logps/rejected": -1.329791784286499, + "loss": 0.8522, + "odds_ratio_loss": 0.4460652470588684, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08076363801956177, + "rewards/margins": 0.05221554636955261, + "rewards/rejected": -0.13297918438911438, + "sft_loss": 0.8076363801956177, + "step": 2060 + }, + { + "epoch": 0.16, + "grad_norm": 11.356202125549316, + "learning_rate": 9.425738311813403e-06, + "logits/chosen": -1.3921433687210083, + "logits/rejected": -1.068137764930725, + "logps/chosen": -1.303889513015747, + "logps/rejected": -1.587685227394104, + "loss": 1.373, + "odds_ratio_loss": 0.6907719373703003, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13038896024227142, + "rewards/margins": 0.02837957814335823, + "rewards/rejected": -0.15876853466033936, + "sft_loss": 1.303889513015747, + "step": 2065 + }, + { + "epoch": 0.16, + "grad_norm": 26.550935745239258, + "learning_rate": 9.422869775689227e-06, + "logits/chosen": -1.0806728601455688, + "logits/rejected": -1.7661387920379639, + "logps/chosen": -1.0194052457809448, + "logps/rejected": -7.546328544616699, + "loss": 1.0475, + "odds_ratio_loss": 0.28096455335617065, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10194053500890732, + "rewards/margins": 0.6526923179626465, + "rewards/rejected": -0.754632830619812, + "sft_loss": 1.0194052457809448, + "step": 2070 + }, + { + "epoch": 0.16, + "grad_norm": 7.983142375946045, + "learning_rate": 9.419994531721488e-06, + "logits/chosen": -1.3471333980560303, + "logits/rejected": -1.0253812074661255, + "logps/chosen": -0.8576242327690125, + "logps/rejected": -1.83051335811615, + "loss": 0.888, + "odds_ratio_loss": 0.30377694964408875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08576242625713348, + "rewards/margins": 0.09728892892599106, + "rewards/rejected": -0.18305133283138275, + "sft_loss": 0.8576242327690125, + "step": 2075 + }, + { + "epoch": 0.16, + "grad_norm": 7.150165557861328, + "learning_rate": 9.417112584270858e-06, + "logits/chosen": -1.5308114290237427, + "logits/rejected": -1.1309223175048828, + "logps/chosen": -1.0225160121917725, + "logps/rejected": -5.807229042053223, + "loss": 1.0527, + "odds_ratio_loss": 0.3023206293582916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10225160419940948, + "rewards/margins": 0.4784712791442871, + "rewards/rejected": -0.5807229280471802, + "sft_loss": 1.0225160121917725, + "step": 2080 + }, + { + "epoch": 0.16, + "grad_norm": 36.25994110107422, + "learning_rate": 9.414223937708175e-06, + "logits/chosen": -1.3622469902038574, + "logits/rejected": -1.109290361404419, + "logps/chosen": -0.9507700204849243, + "logps/rejected": -1.2247527837753296, + "loss": 1.0037, + "odds_ratio_loss": 0.5290084481239319, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09507700055837631, + "rewards/margins": 0.027398282662034035, + "rewards/rejected": -0.1224752888083458, + "sft_loss": 0.9507700204849243, + "step": 2085 + }, + { + "epoch": 0.16, + "grad_norm": 6.515985488891602, + "learning_rate": 9.411328596414439e-06, + "logits/chosen": -1.2988791465759277, + "logits/rejected": -1.144550085067749, + "logps/chosen": -1.0585949420928955, + "logps/rejected": -1.748295783996582, + "loss": 1.1064, + "odds_ratio_loss": 0.4781731963157654, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10585949569940567, + "rewards/margins": 0.06897006928920746, + "rewards/rejected": -0.17482957243919373, + "sft_loss": 1.0585949420928955, + "step": 2090 + }, + { + "epoch": 0.16, + "grad_norm": 53.535892486572266, + "learning_rate": 9.4084265647808e-06, + "logits/chosen": -1.3115184307098389, + "logits/rejected": -1.0028080940246582, + "logps/chosen": -0.8936856389045715, + "logps/rejected": -2.9408774375915527, + "loss": 0.9272, + "odds_ratio_loss": 0.3351721465587616, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0893685594201088, + "rewards/margins": 0.2047191858291626, + "rewards/rejected": -0.2940877377986908, + "sft_loss": 0.8936856389045715, + "step": 2095 + }, + { + "epoch": 0.16, + "grad_norm": 4.780445098876953, + "learning_rate": 9.405517847208562e-06, + "logits/chosen": -1.1145271062850952, + "logits/rejected": -0.7654793858528137, + "logps/chosen": -1.0037806034088135, + "logps/rejected": -1.2181943655014038, + "loss": 1.0649, + "odds_ratio_loss": 0.6109730005264282, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10037805885076523, + "rewards/margins": 0.021441373974084854, + "rewards/rejected": -0.12181943655014038, + "sft_loss": 1.0037806034088135, + "step": 2100 + }, + { + "epoch": 0.16, + "grad_norm": 20.804689407348633, + "learning_rate": 9.402602448109163e-06, + "logits/chosen": -1.0638505220413208, + "logits/rejected": -0.7821773290634155, + "logps/chosen": -1.1113550662994385, + "logps/rejected": -1.3830729722976685, + "loss": 1.1701, + "odds_ratio_loss": 0.587774932384491, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11113552004098892, + "rewards/margins": 0.027171779423952103, + "rewards/rejected": -0.13830728828907013, + "sft_loss": 1.1113550662994385, + "step": 2105 + }, + { + "epoch": 0.16, + "grad_norm": 16.03834342956543, + "learning_rate": 9.399680371904174e-06, + "logits/chosen": -0.8229387998580933, + "logits/rejected": -1.0971095561981201, + "logps/chosen": -0.6263498067855835, + "logps/rejected": -1.5988829135894775, + "loss": 0.6522, + "odds_ratio_loss": 0.25828564167022705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06263498961925507, + "rewards/margins": 0.09725330770015717, + "rewards/rejected": -0.15988829731941223, + "sft_loss": 0.6263498067855835, + "step": 2110 + }, + { + "epoch": 0.16, + "grad_norm": 52.247398376464844, + "learning_rate": 9.396751623025297e-06, + "logits/chosen": -1.418639898300171, + "logits/rejected": -0.9515444040298462, + "logps/chosen": -1.0927083492279053, + "logps/rejected": -3.6967151165008545, + "loss": 1.1338, + "odds_ratio_loss": 0.4113377630710602, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10927083343267441, + "rewards/margins": 0.2604006826877594, + "rewards/rejected": -0.3696715235710144, + "sft_loss": 1.0927083492279053, + "step": 2115 + }, + { + "epoch": 0.16, + "grad_norm": 15.996710777282715, + "learning_rate": 9.393816205914348e-06, + "logits/chosen": -1.182966947555542, + "logits/rejected": -0.7741319537162781, + "logps/chosen": -0.864666759967804, + "logps/rejected": -1.3445454835891724, + "loss": 0.9136, + "odds_ratio_loss": 0.48978710174560547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08646667748689651, + "rewards/margins": 0.04798787087202072, + "rewards/rejected": -0.13445454835891724, + "sft_loss": 0.864666759967804, + "step": 2120 + }, + { + "epoch": 0.17, + "grad_norm": 5.824068546295166, + "learning_rate": 9.390874125023265e-06, + "logits/chosen": -1.319427490234375, + "logits/rejected": -0.6739305257797241, + "logps/chosen": -1.122070074081421, + "logps/rejected": -1.3879320621490479, + "loss": 1.178, + "odds_ratio_loss": 0.5590441823005676, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11220701038837433, + "rewards/margins": 0.026586193591356277, + "rewards/rejected": -0.1387932002544403, + "sft_loss": 1.122070074081421, + "step": 2125 + }, + { + "epoch": 0.17, + "grad_norm": 16.000850677490234, + "learning_rate": 9.387925384814083e-06, + "logits/chosen": -1.3848741054534912, + "logits/rejected": -1.0673903226852417, + "logps/chosen": -1.0577576160430908, + "logps/rejected": -4.389899253845215, + "loss": 1.1129, + "odds_ratio_loss": 0.5515421032905579, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10577578842639923, + "rewards/margins": 0.33321413397789, + "rewards/rejected": -0.43898987770080566, + "sft_loss": 1.0577576160430908, + "step": 2130 + }, + { + "epoch": 0.17, + "grad_norm": 39.926612854003906, + "learning_rate": 9.384969989758942e-06, + "logits/chosen": -1.1923563480377197, + "logits/rejected": -0.7410465478897095, + "logps/chosen": -1.1001938581466675, + "logps/rejected": -1.2848488092422485, + "loss": 1.1697, + "odds_ratio_loss": 0.695135772228241, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11001940071582794, + "rewards/margins": 0.01846548728644848, + "rewards/rejected": -0.12848487496376038, + "sft_loss": 1.1001938581466675, + "step": 2135 + }, + { + "epoch": 0.17, + "grad_norm": 11.095691680908203, + "learning_rate": 9.382007944340075e-06, + "logits/chosen": -1.323016881942749, + "logits/rejected": -0.8939719200134277, + "logps/chosen": -1.0650465488433838, + "logps/rejected": -0.9738779067993164, + "loss": 1.148, + "odds_ratio_loss": 0.8292935490608215, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1065046414732933, + "rewards/margins": -0.009116856381297112, + "rewards/rejected": -0.09738779067993164, + "sft_loss": 1.0650465488433838, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 4.360654830932617, + "learning_rate": 9.379039253049798e-06, + "logits/chosen": -1.243947148323059, + "logits/rejected": -0.8853607177734375, + "logps/chosen": -1.1823415756225586, + "logps/rejected": -1.3976190090179443, + "loss": 1.2821, + "odds_ratio_loss": 0.9972108006477356, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11823415756225586, + "rewards/margins": 0.021527742967009544, + "rewards/rejected": -0.13976189494132996, + "sft_loss": 1.1823415756225586, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 4.95953369140625, + "learning_rate": 9.376063920390509e-06, + "logits/chosen": -1.208055853843689, + "logits/rejected": -0.985071063041687, + "logps/chosen": -0.8940626978874207, + "logps/rejected": -1.6095507144927979, + "loss": 0.9281, + "odds_ratio_loss": 0.34006446599960327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08940626680850983, + "rewards/margins": 0.07154880464076996, + "rewards/rejected": -0.16095507144927979, + "sft_loss": 0.8940626978874207, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 52.542457580566406, + "learning_rate": 9.373081950874678e-06, + "logits/chosen": -1.2736427783966064, + "logits/rejected": -0.7791846990585327, + "logps/chosen": -1.151737093925476, + "logps/rejected": -7.746405601501465, + "loss": 1.243, + "odds_ratio_loss": 0.9124053120613098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11517371982336044, + "rewards/margins": 0.6594668626785278, + "rewards/rejected": -0.7746405601501465, + "sft_loss": 1.151737093925476, + "step": 2155 + }, + { + "epoch": 0.17, + "grad_norm": 9.984672546386719, + "learning_rate": 9.370093349024842e-06, + "logits/chosen": -1.3483428955078125, + "logits/rejected": -1.1995983123779297, + "logps/chosen": -1.030102014541626, + "logps/rejected": -5.066445827484131, + "loss": 1.0975, + "odds_ratio_loss": 0.6737374067306519, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10301021486520767, + "rewards/margins": 0.40363436937332153, + "rewards/rejected": -0.506644606590271, + "sft_loss": 1.030102014541626, + "step": 2160 + }, + { + "epoch": 0.17, + "grad_norm": 8.059449195861816, + "learning_rate": 9.367098119373592e-06, + "logits/chosen": -1.26102614402771, + "logits/rejected": -0.9494965672492981, + "logps/chosen": -0.9010990262031555, + "logps/rejected": -1.2991758584976196, + "loss": 0.9497, + "odds_ratio_loss": 0.48647040128707886, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09010989964008331, + "rewards/margins": 0.03980768844485283, + "rewards/rejected": -0.12991759181022644, + "sft_loss": 0.9010990262031555, + "step": 2165 + }, + { + "epoch": 0.17, + "grad_norm": 7.4642815589904785, + "learning_rate": 9.364096266463577e-06, + "logits/chosen": -1.4591320753097534, + "logits/rejected": -1.0818897485733032, + "logps/chosen": -0.9885584115982056, + "logps/rejected": -1.1860976219177246, + "loss": 1.051, + "odds_ratio_loss": 0.6242485642433167, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09885583817958832, + "rewards/margins": 0.019753912463784218, + "rewards/rejected": -0.11860976368188858, + "sft_loss": 0.9885584115982056, + "step": 2170 + }, + { + "epoch": 0.17, + "grad_norm": 8.31343936920166, + "learning_rate": 9.361087794847485e-06, + "logits/chosen": -1.3828623294830322, + "logits/rejected": -1.0185619592666626, + "logps/chosen": -0.8504158854484558, + "logps/rejected": -1.4988811016082764, + "loss": 0.9257, + "odds_ratio_loss": 0.7529224157333374, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0850415974855423, + "rewards/margins": 0.06484649330377579, + "rewards/rejected": -0.14988809823989868, + "sft_loss": 0.8504158854484558, + "step": 2175 + }, + { + "epoch": 0.17, + "grad_norm": 7.262338638305664, + "learning_rate": 9.358072709088046e-06, + "logits/chosen": -1.2764170169830322, + "logits/rejected": -1.0059707164764404, + "logps/chosen": -0.6619844436645508, + "logps/rejected": -7.922823905944824, + "loss": 0.6878, + "odds_ratio_loss": 0.2583473324775696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0661984458565712, + "rewards/margins": 0.7260838747024536, + "rewards/rejected": -0.7922824025154114, + "sft_loss": 0.6619844436645508, + "step": 2180 + }, + { + "epoch": 0.17, + "grad_norm": 26.565011978149414, + "learning_rate": 9.355051013758023e-06, + "logits/chosen": -1.3052898645401, + "logits/rejected": -1.0967880487442017, + "logps/chosen": -0.8731967210769653, + "logps/rejected": -1.453786849975586, + "loss": 0.9169, + "odds_ratio_loss": 0.43690699338912964, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08731966465711594, + "rewards/margins": 0.05805901437997818, + "rewards/rejected": -0.14537867903709412, + "sft_loss": 0.8731967210769653, + "step": 2185 + }, + { + "epoch": 0.17, + "grad_norm": 33.61377716064453, + "learning_rate": 9.352022713440198e-06, + "logits/chosen": -1.225475549697876, + "logits/rejected": -1.052595853805542, + "logps/chosen": -0.8050892949104309, + "logps/rejected": -0.9161909222602844, + "loss": 0.8753, + "odds_ratio_loss": 0.7018290758132935, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08050892502069473, + "rewards/margins": 0.011110165156424046, + "rewards/rejected": -0.0916190966963768, + "sft_loss": 0.8050892949104309, + "step": 2190 + }, + { + "epoch": 0.17, + "grad_norm": 8.001112937927246, + "learning_rate": 9.348987812727375e-06, + "logits/chosen": -1.2792341709136963, + "logits/rejected": -0.8740849494934082, + "logps/chosen": -1.581862211227417, + "logps/rejected": -1.0725221633911133, + "loss": 1.7016, + "odds_ratio_loss": 1.1973588466644287, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15818621218204498, + "rewards/margins": -0.050933998078107834, + "rewards/rejected": -0.10725221782922745, + "sft_loss": 1.581862211227417, + "step": 2195 + }, + { + "epoch": 0.17, + "grad_norm": 11.436029434204102, + "learning_rate": 9.345946316222365e-06, + "logits/chosen": -1.3950872421264648, + "logits/rejected": -1.0151147842407227, + "logps/chosen": -0.9989234209060669, + "logps/rejected": -0.9934768676757812, + "loss": 1.0715, + "odds_ratio_loss": 0.7261602282524109, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09989233314990997, + "rewards/margins": -0.0005446464056149125, + "rewards/rejected": -0.09934769570827484, + "sft_loss": 0.9989234209060669, + "step": 2200 + }, + { + "epoch": 0.17, + "grad_norm": 30.081104278564453, + "learning_rate": 9.342898228537983e-06, + "logits/chosen": -1.218121886253357, + "logits/rejected": -1.3421285152435303, + "logps/chosen": -1.7342615127563477, + "logps/rejected": -1.285917043685913, + "loss": 1.8419, + "odds_ratio_loss": 1.0760008096694946, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.17342616617679596, + "rewards/margins": -0.04483447223901749, + "rewards/rejected": -0.12859170138835907, + "sft_loss": 1.7342615127563477, + "step": 2205 + }, + { + "epoch": 0.17, + "grad_norm": 9.11589241027832, + "learning_rate": 9.339843554297042e-06, + "logits/chosen": -1.4177284240722656, + "logits/rejected": -0.9231816530227661, + "logps/chosen": -0.9941714406013489, + "logps/rejected": -1.2196388244628906, + "loss": 1.0538, + "odds_ratio_loss": 0.5962220430374146, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09941715002059937, + "rewards/margins": 0.02254674583673477, + "rewards/rejected": -0.12196389585733414, + "sft_loss": 0.9941714406013489, + "step": 2210 + }, + { + "epoch": 0.17, + "grad_norm": 10.992576599121094, + "learning_rate": 9.33678229813234e-06, + "logits/chosen": -1.3786756992340088, + "logits/rejected": -1.3493220806121826, + "logps/chosen": -0.9669061899185181, + "logps/rejected": -1.7660000324249268, + "loss": 1.0027, + "odds_ratio_loss": 0.3575536608695984, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09669061005115509, + "rewards/margins": 0.07990939170122147, + "rewards/rejected": -0.17660000920295715, + "sft_loss": 0.9669061899185181, + "step": 2215 + }, + { + "epoch": 0.17, + "grad_norm": 8.534214973449707, + "learning_rate": 9.333714464686668e-06, + "logits/chosen": -1.202580213546753, + "logits/rejected": -1.1010208129882812, + "logps/chosen": -1.1651172637939453, + "logps/rejected": -1.1718440055847168, + "loss": 1.2364, + "odds_ratio_loss": 0.7130334973335266, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11651172488927841, + "rewards/margins": 0.0006726846331730485, + "rewards/rejected": -0.11718441545963287, + "sft_loss": 1.1651172637939453, + "step": 2220 + }, + { + "epoch": 0.17, + "grad_norm": 15.713763236999512, + "learning_rate": 9.330640058612777e-06, + "logits/chosen": -1.5429356098175049, + "logits/rejected": -0.8350217938423157, + "logps/chosen": -1.4011694192886353, + "logps/rejected": -1.835808515548706, + "loss": 1.469, + "odds_ratio_loss": 0.6785898804664612, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.14011695981025696, + "rewards/margins": 0.043463896960020065, + "rewards/rejected": -0.18358086049556732, + "sft_loss": 1.4011694192886353, + "step": 2225 + }, + { + "epoch": 0.17, + "grad_norm": 7.8334126472473145, + "learning_rate": 9.327559084573399e-06, + "logits/chosen": -1.346500277519226, + "logits/rejected": -1.1512044668197632, + "logps/chosen": -1.2371644973754883, + "logps/rejected": -1.9674861431121826, + "loss": 1.2897, + "odds_ratio_loss": 0.5257105827331543, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12371645122766495, + "rewards/margins": 0.07303217798471451, + "rewards/rejected": -0.19674862921237946, + "sft_loss": 1.2371644973754883, + "step": 2230 + }, + { + "epoch": 0.17, + "grad_norm": 10.72829818725586, + "learning_rate": 9.32447154724122e-06, + "logits/chosen": -1.3767178058624268, + "logits/rejected": -1.1266549825668335, + "logps/chosen": -1.1838889122009277, + "logps/rejected": -3.7046380043029785, + "loss": 1.2507, + "odds_ratio_loss": 0.6676278114318848, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11838889122009277, + "rewards/margins": 0.2520748972892761, + "rewards/rejected": -0.3704637885093689, + "sft_loss": 1.1838889122009277, + "step": 2235 + }, + { + "epoch": 0.17, + "grad_norm": 5.583121299743652, + "learning_rate": 9.321377451298886e-06, + "logits/chosen": -1.2876561880111694, + "logits/rejected": -1.1324607133865356, + "logps/chosen": -0.8333019018173218, + "logps/rejected": -1.9429174661636353, + "loss": 0.8603, + "odds_ratio_loss": 0.27039480209350586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0833301991224289, + "rewards/margins": 0.11096155643463135, + "rewards/rejected": -0.19429175555706024, + "sft_loss": 0.8333019018173218, + "step": 2240 + }, + { + "epoch": 0.17, + "grad_norm": 6.8710856437683105, + "learning_rate": 9.318276801438981e-06, + "logits/chosen": -1.5717952251434326, + "logits/rejected": -0.9774179458618164, + "logps/chosen": -0.9509202837944031, + "logps/rejected": -1.7160125970840454, + "loss": 1.0017, + "odds_ratio_loss": 0.5077941417694092, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0950920358300209, + "rewards/margins": 0.07650923728942871, + "rewards/rejected": -0.17160126566886902, + "sft_loss": 0.9509202837944031, + "step": 2245 + }, + { + "epoch": 0.18, + "grad_norm": 26.359830856323242, + "learning_rate": 9.315169602364038e-06, + "logits/chosen": -1.438494324684143, + "logits/rejected": -1.2772901058197021, + "logps/chosen": -0.963545024394989, + "logps/rejected": -1.3646507263183594, + "loss": 1.018, + "odds_ratio_loss": 0.5441502928733826, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09635450690984726, + "rewards/margins": 0.04011055827140808, + "rewards/rejected": -0.13646505773067474, + "sft_loss": 0.963545024394989, + "step": 2250 + }, + { + "epoch": 0.18, + "grad_norm": 11.73903751373291, + "learning_rate": 9.312055858786517e-06, + "logits/chosen": -1.4635202884674072, + "logits/rejected": -1.1016645431518555, + "logps/chosen": -0.8518376350402832, + "logps/rejected": -1.5293066501617432, + "loss": 0.8947, + "odds_ratio_loss": 0.42870035767555237, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0851837620139122, + "rewards/margins": 0.06774689257144928, + "rewards/rejected": -0.15293066203594208, + "sft_loss": 0.8518376350402832, + "step": 2255 + }, + { + "epoch": 0.18, + "grad_norm": 18.999237060546875, + "learning_rate": 9.308935575428808e-06, + "logits/chosen": -1.4547194242477417, + "logits/rejected": -1.178485631942749, + "logps/chosen": -1.1227895021438599, + "logps/rejected": -6.287472724914551, + "loss": 1.1341, + "odds_ratio_loss": 0.11270429193973541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11227895319461823, + "rewards/margins": 0.5164682269096375, + "rewards/rejected": -0.6287471652030945, + "sft_loss": 1.1227895021438599, + "step": 2260 + }, + { + "epoch": 0.18, + "grad_norm": 8.228962898254395, + "learning_rate": 9.305808757023213e-06, + "logits/chosen": -1.4437748193740845, + "logits/rejected": -0.9789659380912781, + "logps/chosen": -1.308319330215454, + "logps/rejected": -4.175393104553223, + "loss": 1.3224, + "odds_ratio_loss": 0.14052268862724304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13083192706108093, + "rewards/margins": 0.28670734167099, + "rewards/rejected": -0.4175392985343933, + "sft_loss": 1.308319330215454, + "step": 2265 + }, + { + "epoch": 0.18, + "grad_norm": 10.382689476013184, + "learning_rate": 9.302675408311953e-06, + "logits/chosen": -1.3628900051116943, + "logits/rejected": -1.0054218769073486, + "logps/chosen": -0.9282622337341309, + "logps/rejected": -1.898972511291504, + "loss": 0.9669, + "odds_ratio_loss": 0.38678181171417236, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0928262248635292, + "rewards/margins": 0.09707103669643402, + "rewards/rejected": -0.18989725410938263, + "sft_loss": 0.9282622337341309, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 5.989121437072754, + "learning_rate": 9.299535534047145e-06, + "logits/chosen": -1.4441674947738647, + "logits/rejected": -1.094054937362671, + "logps/chosen": -0.8899669647216797, + "logps/rejected": -1.7062129974365234, + "loss": 0.9281, + "odds_ratio_loss": 0.3816324770450592, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08899669349193573, + "rewards/margins": 0.0816246047616005, + "rewards/rejected": -0.17062130570411682, + "sft_loss": 0.8899669647216797, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 12.697397232055664, + "learning_rate": 9.296389138990812e-06, + "logits/chosen": -1.1660821437835693, + "logits/rejected": -1.038727045059204, + "logps/chosen": -1.1280267238616943, + "logps/rejected": -1.5812492370605469, + "loss": 1.1791, + "odds_ratio_loss": 0.5110870003700256, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11280268430709839, + "rewards/margins": 0.045322246849536896, + "rewards/rejected": -0.1581249237060547, + "sft_loss": 1.1280267238616943, + "step": 2280 + }, + { + "epoch": 0.18, + "grad_norm": 17.71702766418457, + "learning_rate": 9.293236227914856e-06, + "logits/chosen": -1.3867652416229248, + "logits/rejected": -0.7695621252059937, + "logps/chosen": -1.1080429553985596, + "logps/rejected": -1.8552356958389282, + "loss": 1.1608, + "odds_ratio_loss": 0.5275697708129883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11080429702997208, + "rewards/margins": 0.07471926510334015, + "rewards/rejected": -0.18552353978157043, + "sft_loss": 1.1080429553985596, + "step": 2285 + }, + { + "epoch": 0.18, + "grad_norm": 6.90964937210083, + "learning_rate": 9.290076805601071e-06, + "logits/chosen": -1.4995615482330322, + "logits/rejected": -0.8228855133056641, + "logps/chosen": -1.0126750469207764, + "logps/rejected": -1.122462511062622, + "loss": 1.0914, + "odds_ratio_loss": 0.7877473831176758, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10126751661300659, + "rewards/margins": 0.010978740639984608, + "rewards/rejected": -0.11224625259637833, + "sft_loss": 1.0126750469207764, + "step": 2290 + }, + { + "epoch": 0.18, + "grad_norm": 5.955761432647705, + "learning_rate": 9.286910876841122e-06, + "logits/chosen": -1.4855306148529053, + "logits/rejected": -1.0339566469192505, + "logps/chosen": -1.0420253276824951, + "logps/rejected": -1.2085959911346436, + "loss": 1.105, + "odds_ratio_loss": 0.629336953163147, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10420253127813339, + "rewards/margins": 0.016657069325447083, + "rewards/rejected": -0.12085960805416107, + "sft_loss": 1.0420253276824951, + "step": 2295 + }, + { + "epoch": 0.18, + "grad_norm": 65.14389038085938, + "learning_rate": 9.28373844643654e-06, + "logits/chosen": -1.0751597881317139, + "logits/rejected": -1.0112690925598145, + "logps/chosen": -0.9169561266899109, + "logps/rejected": -1.040330171585083, + "loss": 0.9989, + "odds_ratio_loss": 0.8191441297531128, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.09169561415910721, + "rewards/margins": 0.012337413616478443, + "rewards/rejected": -0.10403303056955338, + "sft_loss": 0.9169561266899109, + "step": 2300 + }, + { + "epoch": 0.18, + "grad_norm": 17.593547821044922, + "learning_rate": 9.28055951919872e-06, + "logits/chosen": -1.4363292455673218, + "logits/rejected": -1.0941526889801025, + "logps/chosen": -0.7269886136054993, + "logps/rejected": -1.8268687725067139, + "loss": 0.7623, + "odds_ratio_loss": 0.3534363806247711, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07269886881113052, + "rewards/margins": 0.1099880188703537, + "rewards/rejected": -0.18268688023090363, + "sft_loss": 0.7269886136054993, + "step": 2305 + }, + { + "epoch": 0.18, + "grad_norm": 7.644922256469727, + "learning_rate": 9.277374099948908e-06, + "logits/chosen": -1.32466721534729, + "logits/rejected": -0.9058082699775696, + "logps/chosen": -1.083791732788086, + "logps/rejected": -2.586423873901367, + "loss": 1.1446, + "odds_ratio_loss": 0.6080135107040405, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10837917029857635, + "rewards/margins": 0.1502632200717926, + "rewards/rejected": -0.25864237546920776, + "sft_loss": 1.083791732788086, + "step": 2310 + }, + { + "epoch": 0.18, + "grad_norm": 6.352106094360352, + "learning_rate": 9.274182193518195e-06, + "logits/chosen": -1.3612511157989502, + "logits/rejected": -1.0367705821990967, + "logps/chosen": -0.9755949974060059, + "logps/rejected": -1.4311678409576416, + "loss": 1.0182, + "odds_ratio_loss": 0.42629605531692505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09755949676036835, + "rewards/margins": 0.045557279139757156, + "rewards/rejected": -0.1431167870759964, + "sft_loss": 0.9755949974060059, + "step": 2315 + }, + { + "epoch": 0.18, + "grad_norm": 8.281800270080566, + "learning_rate": 9.270983804747516e-06, + "logits/chosen": -1.256667137145996, + "logits/rejected": -0.8662792444229126, + "logps/chosen": -1.1143862009048462, + "logps/rejected": -0.9090407490730286, + "loss": 1.2097, + "odds_ratio_loss": 0.9536363482475281, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11143863201141357, + "rewards/margins": -0.02053455263376236, + "rewards/rejected": -0.09090407192707062, + "sft_loss": 1.1143862009048462, + "step": 2320 + }, + { + "epoch": 0.18, + "grad_norm": 6.306697845458984, + "learning_rate": 9.267778938487633e-06, + "logits/chosen": -1.4480441808700562, + "logits/rejected": -0.8130139112472534, + "logps/chosen": -1.0359063148498535, + "logps/rejected": -1.931692123413086, + "loss": 1.0838, + "odds_ratio_loss": 0.47853994369506836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10359062999486923, + "rewards/margins": 0.08957858383655548, + "rewards/rejected": -0.19316920638084412, + "sft_loss": 1.0359063148498535, + "step": 2325 + }, + { + "epoch": 0.18, + "grad_norm": 53.15751266479492, + "learning_rate": 9.264567599599129e-06, + "logits/chosen": -1.4305864572525024, + "logits/rejected": -1.1423676013946533, + "logps/chosen": -1.1684757471084595, + "logps/rejected": -1.9025484323501587, + "loss": 1.2308, + "odds_ratio_loss": 0.6231135129928589, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.11684757471084595, + "rewards/margins": 0.07340727746486664, + "rewards/rejected": -0.19025485217571259, + "sft_loss": 1.1684757471084595, + "step": 2330 + }, + { + "epoch": 0.18, + "grad_norm": 14.97216510772705, + "learning_rate": 9.26134979295241e-06, + "logits/chosen": -1.1160290241241455, + "logits/rejected": -0.7148804664611816, + "logps/chosen": -0.8799691200256348, + "logps/rejected": -1.4380155801773071, + "loss": 0.9225, + "odds_ratio_loss": 0.4255514144897461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08799691498279572, + "rewards/margins": 0.05580463260412216, + "rewards/rejected": -0.14380155503749847, + "sft_loss": 0.8799691200256348, + "step": 2335 + }, + { + "epoch": 0.18, + "grad_norm": 5.440201282501221, + "learning_rate": 9.25812552342769e-06, + "logits/chosen": -1.1734294891357422, + "logits/rejected": -0.7869594693183899, + "logps/chosen": -1.125270128250122, + "logps/rejected": -0.9226048588752747, + "loss": 1.217, + "odds_ratio_loss": 0.9175472259521484, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1125270277261734, + "rewards/margins": -0.020266540348529816, + "rewards/rejected": -0.09226048737764359, + "sft_loss": 1.125270128250122, + "step": 2340 + }, + { + "epoch": 0.18, + "grad_norm": 6.525335788726807, + "learning_rate": 9.254894795914979e-06, + "logits/chosen": -1.0414087772369385, + "logits/rejected": -1.1698774099349976, + "logps/chosen": -1.0201406478881836, + "logps/rejected": -1.202549695968628, + "loss": 1.0814, + "odds_ratio_loss": 0.6127591133117676, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10201406478881836, + "rewards/margins": 0.018240921199321747, + "rewards/rejected": -0.12025497853755951, + "sft_loss": 1.0201406478881836, + "step": 2345 + }, + { + "epoch": 0.18, + "grad_norm": 10.264086723327637, + "learning_rate": 9.251657615314088e-06, + "logits/chosen": -1.3329168558120728, + "logits/rejected": -1.095649003982544, + "logps/chosen": -0.8811469078063965, + "logps/rejected": -1.50531804561615, + "loss": 0.9181, + "odds_ratio_loss": 0.369753360748291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08811469376087189, + "rewards/margins": 0.06241711229085922, + "rewards/rejected": -0.1505317986011505, + "sft_loss": 0.8811469078063965, + "step": 2350 + }, + { + "epoch": 0.18, + "grad_norm": 7.230742931365967, + "learning_rate": 9.248413986534612e-06, + "logits/chosen": -1.4222882986068726, + "logits/rejected": -1.0978469848632812, + "logps/chosen": -0.7870491743087769, + "logps/rejected": -2.3897218704223633, + "loss": 0.8448, + "odds_ratio_loss": 0.577102780342102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07870491594076157, + "rewards/margins": 0.16026729345321655, + "rewards/rejected": -0.23897221684455872, + "sft_loss": 0.7870491743087769, + "step": 2355 + }, + { + "epoch": 0.18, + "grad_norm": 7.951897621154785, + "learning_rate": 9.245163914495926e-06, + "logits/chosen": -1.176154613494873, + "logits/rejected": -0.6637752056121826, + "logps/chosen": -1.075390100479126, + "logps/rejected": -2.4893295764923096, + "loss": 1.1414, + "odds_ratio_loss": 0.6604292988777161, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10753902047872543, + "rewards/margins": 0.14139392971992493, + "rewards/rejected": -0.24893295764923096, + "sft_loss": 1.075390100479126, + "step": 2360 + }, + { + "epoch": 0.18, + "grad_norm": 9.111823081970215, + "learning_rate": 9.241907404127176e-06, + "logits/chosen": -1.1562446355819702, + "logits/rejected": -0.9415414929389954, + "logps/chosen": -0.654162585735321, + "logps/rejected": -2.4974708557128906, + "loss": 0.6713, + "odds_ratio_loss": 0.17174024879932404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06541626155376434, + "rewards/margins": 0.18433082103729248, + "rewards/rejected": -0.2497471123933792, + "sft_loss": 0.654162585735321, + "step": 2365 + }, + { + "epoch": 0.18, + "grad_norm": 7.270651340484619, + "learning_rate": 9.238644460367274e-06, + "logits/chosen": -1.2659519910812378, + "logits/rejected": -0.7697997093200684, + "logps/chosen": -0.9518246650695801, + "logps/rejected": -2.2645275592803955, + "loss": 0.9776, + "odds_ratio_loss": 0.2575392723083496, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09518247097730637, + "rewards/margins": 0.13127027451992035, + "rewards/rejected": -0.2264527529478073, + "sft_loss": 0.9518246650695801, + "step": 2370 + }, + { + "epoch": 0.18, + "grad_norm": 12.318182945251465, + "learning_rate": 9.235375088164891e-06, + "logits/chosen": -1.2376809120178223, + "logits/rejected": -0.8642924427986145, + "logps/chosen": -1.155277967453003, + "logps/rejected": -1.1210805177688599, + "loss": 1.2366, + "odds_ratio_loss": 0.8130849599838257, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11552779376506805, + "rewards/margins": -0.0034197538625448942, + "rewards/rejected": -0.11210803687572479, + "sft_loss": 1.155277967453003, + "step": 2375 + }, + { + "epoch": 0.19, + "grad_norm": 15.424654960632324, + "learning_rate": 9.23209929247844e-06, + "logits/chosen": -1.2814226150512695, + "logits/rejected": -0.8389566540718079, + "logps/chosen": -0.8976057767868042, + "logps/rejected": -1.801372766494751, + "loss": 0.9418, + "odds_ratio_loss": 0.4419097304344177, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08976057916879654, + "rewards/margins": 0.09037669003009796, + "rewards/rejected": -0.1801372617483139, + "sft_loss": 0.8976057767868042, + "step": 2380 + }, + { + "epoch": 0.19, + "grad_norm": 6.769630432128906, + "learning_rate": 9.228817078276084e-06, + "logits/chosen": -1.233152151107788, + "logits/rejected": -0.6566184759140015, + "logps/chosen": -1.1040191650390625, + "logps/rejected": -1.585508108139038, + "loss": 1.1519, + "odds_ratio_loss": 0.47855883836746216, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11040191352367401, + "rewards/margins": 0.04814889281988144, + "rewards/rejected": -0.15855081379413605, + "sft_loss": 1.1040191650390625, + "step": 2385 + }, + { + "epoch": 0.19, + "grad_norm": 9.030645370483398, + "learning_rate": 9.225528450535718e-06, + "logits/chosen": -1.408438801765442, + "logits/rejected": -0.8707016706466675, + "logps/chosen": -1.164117693901062, + "logps/rejected": -2.1822171211242676, + "loss": 1.2221, + "odds_ratio_loss": 0.5799840092658997, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11641178280115128, + "rewards/margins": 0.10180990397930145, + "rewards/rejected": -0.21822169423103333, + "sft_loss": 1.164117693901062, + "step": 2390 + }, + { + "epoch": 0.19, + "grad_norm": 5.576879501342773, + "learning_rate": 9.222233414244963e-06, + "logits/chosen": -1.548218011856079, + "logits/rejected": -1.1314003467559814, + "logps/chosen": -0.889999270439148, + "logps/rejected": -1.3307504653930664, + "loss": 0.9389, + "odds_ratio_loss": 0.48932284116744995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08899993449449539, + "rewards/margins": 0.044075123965740204, + "rewards/rejected": -0.1330750584602356, + "sft_loss": 0.889999270439148, + "step": 2395 + }, + { + "epoch": 0.19, + "grad_norm": 30.624387741088867, + "learning_rate": 9.218931974401158e-06, + "logits/chosen": -1.4214167594909668, + "logits/rejected": -0.949417769908905, + "logps/chosen": -1.1055647134780884, + "logps/rejected": -1.0172256231307983, + "loss": 1.1906, + "odds_ratio_loss": 0.850671112537384, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11055648326873779, + "rewards/margins": -0.008833914995193481, + "rewards/rejected": -0.10172256082296371, + "sft_loss": 1.1055647134780884, + "step": 2400 + }, + { + "epoch": 0.19, + "grad_norm": 4.737048149108887, + "learning_rate": 9.21562413601136e-06, + "logits/chosen": -1.3176019191741943, + "logits/rejected": -0.5910056829452515, + "logps/chosen": -0.7685356140136719, + "logps/rejected": -1.1007511615753174, + "loss": 0.819, + "odds_ratio_loss": 0.5050622224807739, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07685355842113495, + "rewards/margins": 0.03322155401110649, + "rewards/rejected": -0.11007511615753174, + "sft_loss": 0.7685356140136719, + "step": 2405 + }, + { + "epoch": 0.19, + "grad_norm": 11.416131973266602, + "learning_rate": 9.21230990409232e-06, + "logits/chosen": -1.397815227508545, + "logits/rejected": -1.1700330972671509, + "logps/chosen": -0.7989002466201782, + "logps/rejected": -2.4671411514282227, + "loss": 0.8525, + "odds_ratio_loss": 0.5361326336860657, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07989002764225006, + "rewards/margins": 0.1668240875005722, + "rewards/rejected": -0.24671411514282227, + "sft_loss": 0.7989002466201782, + "step": 2410 + }, + { + "epoch": 0.19, + "grad_norm": 70.81454467773438, + "learning_rate": 9.208989283670498e-06, + "logits/chosen": -1.350322961807251, + "logits/rejected": -1.126625895500183, + "logps/chosen": -1.1136561632156372, + "logps/rejected": -1.5111583471298218, + "loss": 1.161, + "odds_ratio_loss": 0.4734528660774231, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11136561632156372, + "rewards/margins": 0.039750225841999054, + "rewards/rejected": -0.15111584961414337, + "sft_loss": 1.1136561632156372, + "step": 2415 + }, + { + "epoch": 0.19, + "grad_norm": 11.097872734069824, + "learning_rate": 9.20566227978203e-06, + "logits/chosen": -1.2976523637771606, + "logits/rejected": -0.7848803997039795, + "logps/chosen": -1.0063698291778564, + "logps/rejected": -0.9979653358459473, + "loss": 1.0846, + "odds_ratio_loss": 0.7826443910598755, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10063699632883072, + "rewards/margins": -0.0008404649561271071, + "rewards/rejected": -0.09979653358459473, + "sft_loss": 1.0063698291778564, + "step": 2420 + }, + { + "epoch": 0.19, + "grad_norm": 10.51763916015625, + "learning_rate": 9.202328897472746e-06, + "logits/chosen": -1.3862828016281128, + "logits/rejected": -1.381044626235962, + "logps/chosen": -0.9244590997695923, + "logps/rejected": -1.8402154445648193, + "loss": 0.9667, + "odds_ratio_loss": 0.42251425981521606, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09244590997695923, + "rewards/margins": 0.09157565236091614, + "rewards/rejected": -0.18402156233787537, + "sft_loss": 0.9244590997695923, + "step": 2425 + }, + { + "epoch": 0.19, + "grad_norm": 48.170528411865234, + "learning_rate": 9.198989141798138e-06, + "logits/chosen": -1.2233293056488037, + "logits/rejected": -0.9823969006538391, + "logps/chosen": -1.0954564809799194, + "logps/rejected": -1.1875765323638916, + "loss": 1.1602, + "odds_ratio_loss": 0.6473854184150696, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10954566299915314, + "rewards/margins": 0.009211997501552105, + "rewards/rejected": -0.11875765025615692, + "sft_loss": 1.0954564809799194, + "step": 2430 + }, + { + "epoch": 0.19, + "grad_norm": 6.970828533172607, + "learning_rate": 9.195643017823374e-06, + "logits/chosen": -1.1448237895965576, + "logits/rejected": -1.0005139112472534, + "logps/chosen": -1.251326084136963, + "logps/rejected": -4.398923873901367, + "loss": 1.3175, + "odds_ratio_loss": 0.6618432998657227, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12513260543346405, + "rewards/margins": 0.3147597908973694, + "rewards/rejected": -0.43989238142967224, + "sft_loss": 1.251326084136963, + "step": 2435 + }, + { + "epoch": 0.19, + "grad_norm": 11.294906616210938, + "learning_rate": 9.192290530623274e-06, + "logits/chosen": -1.1162798404693604, + "logits/rejected": -0.7499058842658997, + "logps/chosen": -1.2006309032440186, + "logps/rejected": -1.4552767276763916, + "loss": 1.2631, + "odds_ratio_loss": 0.6247986555099487, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12006310373544693, + "rewards/margins": 0.025464575737714767, + "rewards/rejected": -0.1455276906490326, + "sft_loss": 1.2006309032440186, + "step": 2440 + }, + { + "epoch": 0.19, + "grad_norm": 72.77327728271484, + "learning_rate": 9.18893168528231e-06, + "logits/chosen": -1.0885790586471558, + "logits/rejected": -0.7597593665122986, + "logps/chosen": -1.6269006729125977, + "logps/rejected": -1.4666858911514282, + "loss": 1.7196, + "odds_ratio_loss": 0.9270656704902649, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16269007325172424, + "rewards/margins": -0.016021480783820152, + "rewards/rejected": -0.14666858315467834, + "sft_loss": 1.6269006729125977, + "step": 2445 + }, + { + "epoch": 0.19, + "grad_norm": 10.69701862335205, + "learning_rate": 9.185566486894597e-06, + "logits/chosen": -1.1007755994796753, + "logits/rejected": -1.287514090538025, + "logps/chosen": -0.8829814791679382, + "logps/rejected": -1.010026216506958, + "loss": 0.9429, + "odds_ratio_loss": 0.5991403460502625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08829815685749054, + "rewards/margins": 0.012704456225037575, + "rewards/rejected": -0.10100261121988297, + "sft_loss": 0.8829814791679382, + "step": 2450 + }, + { + "epoch": 0.19, + "grad_norm": 13.927633285522461, + "learning_rate": 9.182194940563887e-06, + "logits/chosen": -0.9834834933280945, + "logits/rejected": -1.1958738565444946, + "logps/chosen": -0.699195921421051, + "logps/rejected": -1.6200673580169678, + "loss": 0.741, + "odds_ratio_loss": 0.4177609086036682, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06991958618164062, + "rewards/margins": 0.09208716452121735, + "rewards/rejected": -0.16200676560401917, + "sft_loss": 0.699195921421051, + "step": 2455 + }, + { + "epoch": 0.19, + "grad_norm": 8.999503135681152, + "learning_rate": 9.178817051403556e-06, + "logits/chosen": -1.2043085098266602, + "logits/rejected": -0.9950903058052063, + "logps/chosen": -0.9628440141677856, + "logps/rejected": -1.3881338834762573, + "loss": 1.0108, + "odds_ratio_loss": 0.47996312379837036, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0962844043970108, + "rewards/margins": 0.042528990656137466, + "rewards/rejected": -0.13881340622901917, + "sft_loss": 0.9628440141677856, + "step": 2460 + }, + { + "epoch": 0.19, + "grad_norm": 26.092164993286133, + "learning_rate": 9.175432824536604e-06, + "logits/chosen": -1.362648606300354, + "logits/rejected": -1.0125370025634766, + "logps/chosen": -1.0991069078445435, + "logps/rejected": -1.0862895250320435, + "loss": 1.1755, + "odds_ratio_loss": 0.7640587687492371, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10991068929433823, + "rewards/margins": -0.0012817397946491838, + "rewards/rejected": -0.10862895101308823, + "sft_loss": 1.0991069078445435, + "step": 2465 + }, + { + "epoch": 0.19, + "grad_norm": 32.62111282348633, + "learning_rate": 9.17204226509564e-06, + "logits/chosen": -1.08524751663208, + "logits/rejected": -0.9146364331245422, + "logps/chosen": -1.142344355583191, + "logps/rejected": -0.99430912733078, + "loss": 1.232, + "odds_ratio_loss": 0.8967956304550171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11423443257808685, + "rewards/margins": -0.01480352133512497, + "rewards/rejected": -0.09943092614412308, + "sft_loss": 1.142344355583191, + "step": 2470 + }, + { + "epoch": 0.19, + "grad_norm": 6.865599155426025, + "learning_rate": 9.16864537822288e-06, + "logits/chosen": -1.104943871498108, + "logits/rejected": -1.0251874923706055, + "logps/chosen": -0.8822401762008667, + "logps/rejected": -0.8117051124572754, + "loss": 0.9565, + "odds_ratio_loss": 0.7424853444099426, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08822402358055115, + "rewards/margins": -0.007053514011204243, + "rewards/rejected": -0.08117051422595978, + "sft_loss": 0.8822401762008667, + "step": 2475 + }, + { + "epoch": 0.19, + "grad_norm": 11.853043556213379, + "learning_rate": 9.165242169070129e-06, + "logits/chosen": -1.2353646755218506, + "logits/rejected": -0.8373567461967468, + "logps/chosen": -0.9056793451309204, + "logps/rejected": -1.1309173107147217, + "loss": 0.9665, + "odds_ratio_loss": 0.6080256104469299, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0905679315328598, + "rewards/margins": 0.02252379059791565, + "rewards/rejected": -0.11309172958135605, + "sft_loss": 0.9056793451309204, + "step": 2480 + }, + { + "epoch": 0.19, + "grad_norm": 7.211149215698242, + "learning_rate": 9.16183264279879e-06, + "logits/chosen": -1.17579984664917, + "logits/rejected": -0.982610821723938, + "logps/chosen": -0.7722324728965759, + "logps/rejected": -2.1497650146484375, + "loss": 0.8077, + "odds_ratio_loss": 0.35467711091041565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07722325623035431, + "rewards/margins": 0.13775324821472168, + "rewards/rejected": -0.214976504445076, + "sft_loss": 0.7722324728965759, + "step": 2485 + }, + { + "epoch": 0.19, + "grad_norm": 8.517918586730957, + "learning_rate": 9.158416804579841e-06, + "logits/chosen": -1.2026476860046387, + "logits/rejected": -0.8483761548995972, + "logps/chosen": -1.1936149597167969, + "logps/rejected": -1.2055301666259766, + "loss": 1.2698, + "odds_ratio_loss": 0.7616353034973145, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1193614974617958, + "rewards/margins": 0.0011915117502212524, + "rewards/rejected": -0.12055301666259766, + "sft_loss": 1.1936149597167969, + "step": 2490 + }, + { + "epoch": 0.19, + "grad_norm": 7.186039447784424, + "learning_rate": 9.154994659593836e-06, + "logits/chosen": -1.1555768251419067, + "logits/rejected": -0.8785859942436218, + "logps/chosen": -0.7830021381378174, + "logps/rejected": -1.2585175037384033, + "loss": 0.8254, + "odds_ratio_loss": 0.4238009452819824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07830022275447845, + "rewards/margins": 0.04755154997110367, + "rewards/rejected": -0.12585176527500153, + "sft_loss": 0.7830021381378174, + "step": 2495 + }, + { + "epoch": 0.19, + "grad_norm": 45.5631217956543, + "learning_rate": 9.151566213030891e-06, + "logits/chosen": -1.4677820205688477, + "logits/rejected": -1.2527766227722168, + "logps/chosen": -1.0443228483200073, + "logps/rejected": -3.552145004272461, + "loss": 1.0872, + "odds_ratio_loss": 0.428349107503891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10443229973316193, + "rewards/margins": 0.25078222155570984, + "rewards/rejected": -0.35521453619003296, + "sft_loss": 1.0443228483200073, + "step": 2500 + }, + { + "epoch": 0.19, + "grad_norm": 22.740446090698242, + "learning_rate": 9.14813147009068e-06, + "logits/chosen": -1.301882266998291, + "logits/rejected": -1.150469183921814, + "logps/chosen": -0.7994235754013062, + "logps/rejected": -1.0263789892196655, + "loss": 0.855, + "odds_ratio_loss": 0.5557239055633545, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07994236052036285, + "rewards/margins": 0.022695545107126236, + "rewards/rejected": -0.10263790935277939, + "sft_loss": 0.7994235754013062, + "step": 2505 + }, + { + "epoch": 0.2, + "grad_norm": 2.394989252090454, + "learning_rate": 9.144690435982427e-06, + "logits/chosen": -1.2863742113113403, + "logits/rejected": -0.9021209478378296, + "logps/chosen": -1.0736862421035767, + "logps/rejected": -2.3909831047058105, + "loss": 1.1261, + "odds_ratio_loss": 0.5244934558868408, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10736862570047379, + "rewards/margins": 0.13172969222068787, + "rewards/rejected": -0.23909831047058105, + "sft_loss": 1.0736862421035767, + "step": 2510 + }, + { + "epoch": 0.2, + "grad_norm": 37.13337326049805, + "learning_rate": 9.141243115924898e-06, + "logits/chosen": -1.3705681562423706, + "logits/rejected": -0.9290679693222046, + "logps/chosen": -0.7407183647155762, + "logps/rejected": -1.197664499282837, + "loss": 0.8096, + "odds_ratio_loss": 0.6890398859977722, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07407183945178986, + "rewards/margins": 0.04569462686777115, + "rewards/rejected": -0.11976645886898041, + "sft_loss": 0.7407183647155762, + "step": 2515 + }, + { + "epoch": 0.2, + "grad_norm": 10.020195960998535, + "learning_rate": 9.13778951514639e-06, + "logits/chosen": -1.2581623792648315, + "logits/rejected": -0.8416546583175659, + "logps/chosen": -1.1621432304382324, + "logps/rejected": -2.305757522583008, + "loss": 1.2145, + "odds_ratio_loss": 0.5237524509429932, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.116214320063591, + "rewards/margins": 0.11436142027378082, + "rewards/rejected": -0.23057575523853302, + "sft_loss": 1.1621432304382324, + "step": 2520 + }, + { + "epoch": 0.2, + "grad_norm": 2.6576640605926514, + "learning_rate": 9.134329638884729e-06, + "logits/chosen": -1.2983006238937378, + "logits/rejected": -0.7410690784454346, + "logps/chosen": -0.8040630221366882, + "logps/rejected": -2.982477903366089, + "loss": 0.8338, + "odds_ratio_loss": 0.2976140081882477, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0804063007235527, + "rewards/margins": 0.2178414762020111, + "rewards/rejected": -0.2982478141784668, + "sft_loss": 0.8040630221366882, + "step": 2525 + }, + { + "epoch": 0.2, + "grad_norm": 18.113977432250977, + "learning_rate": 9.130863492387254e-06, + "logits/chosen": -1.171718716621399, + "logits/rejected": -1.1481155157089233, + "logps/chosen": -1.0631039142608643, + "logps/rejected": -1.413201928138733, + "loss": 1.1453, + "odds_ratio_loss": 0.8220809698104858, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1063103899359703, + "rewards/margins": 0.03500979021191597, + "rewards/rejected": -0.14132019877433777, + "sft_loss": 1.0631039142608643, + "step": 2530 + }, + { + "epoch": 0.2, + "grad_norm": 6.383808612823486, + "learning_rate": 9.12739108091082e-06, + "logits/chosen": -1.3552782535552979, + "logits/rejected": -0.7787104249000549, + "logps/chosen": -1.0695326328277588, + "logps/rejected": -1.128353238105774, + "loss": 1.1368, + "odds_ratio_loss": 0.6724019050598145, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10695326328277588, + "rewards/margins": 0.005882044322788715, + "rewards/rejected": -0.11283531039953232, + "sft_loss": 1.0695326328277588, + "step": 2535 + }, + { + "epoch": 0.2, + "grad_norm": 94.40227508544922, + "learning_rate": 9.123912409721777e-06, + "logits/chosen": -1.1765425205230713, + "logits/rejected": -0.8706881403923035, + "logps/chosen": -0.9728943109512329, + "logps/rejected": -0.8993405103683472, + "loss": 1.0499, + "odds_ratio_loss": 0.7700883150100708, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09728943556547165, + "rewards/margins": -0.007355389650911093, + "rewards/rejected": -0.08993404358625412, + "sft_loss": 0.9728943109512329, + "step": 2540 + }, + { + "epoch": 0.2, + "grad_norm": 13.701937675476074, + "learning_rate": 9.120427484095972e-06, + "logits/chosen": -1.2000676393508911, + "logits/rejected": -0.7381768822669983, + "logps/chosen": -0.7090678215026855, + "logps/rejected": -1.2819874286651611, + "loss": 0.743, + "odds_ratio_loss": 0.3396037220954895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07090678811073303, + "rewards/margins": 0.05729196220636368, + "rewards/rejected": -0.1281987428665161, + "sft_loss": 0.7090678215026855, + "step": 2545 + }, + { + "epoch": 0.2, + "grad_norm": 5.354995250701904, + "learning_rate": 9.116936309318739e-06, + "logits/chosen": -1.2523038387298584, + "logits/rejected": -0.9723421335220337, + "logps/chosen": -0.747650146484375, + "logps/rejected": -1.4843276739120483, + "loss": 0.7891, + "odds_ratio_loss": 0.4141773581504822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07476501911878586, + "rewards/margins": 0.07366775721311569, + "rewards/rejected": -0.14843276143074036, + "sft_loss": 0.747650146484375, + "step": 2550 + }, + { + "epoch": 0.2, + "grad_norm": 6.774299621582031, + "learning_rate": 9.113438890684886e-06, + "logits/chosen": -1.3876330852508545, + "logits/rejected": -0.5892521142959595, + "logps/chosen": -0.8726062774658203, + "logps/rejected": -1.361816167831421, + "loss": 0.9561, + "odds_ratio_loss": 0.8344847559928894, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08726062625646591, + "rewards/margins": 0.048920996487140656, + "rewards/rejected": -0.13618160784244537, + "sft_loss": 0.8726062774658203, + "step": 2555 + }, + { + "epoch": 0.2, + "grad_norm": 6.449535369873047, + "learning_rate": 9.10993523349869e-06, + "logits/chosen": -1.2600324153900146, + "logits/rejected": -0.6542048454284668, + "logps/chosen": -1.1139378547668457, + "logps/rejected": -2.1798148155212402, + "loss": 1.1481, + "odds_ratio_loss": 0.3412316143512726, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1113937720656395, + "rewards/margins": 0.10658769309520721, + "rewards/rejected": -0.2179814875125885, + "sft_loss": 1.1139378547668457, + "step": 2560 + }, + { + "epoch": 0.2, + "grad_norm": 12.766084671020508, + "learning_rate": 9.106425343073897e-06, + "logits/chosen": -1.1841919422149658, + "logits/rejected": -1.0210702419281006, + "logps/chosen": -0.8251067996025085, + "logps/rejected": -1.2712657451629639, + "loss": 0.873, + "odds_ratio_loss": 0.47851771116256714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08251067250967026, + "rewards/margins": 0.04461590573191643, + "rewards/rejected": -0.12712658941745758, + "sft_loss": 0.8251067996025085, + "step": 2565 + }, + { + "epoch": 0.2, + "grad_norm": 11.313969612121582, + "learning_rate": 9.1029092247337e-06, + "logits/chosen": -1.2296737432479858, + "logits/rejected": -0.9485149383544922, + "logps/chosen": -0.928150475025177, + "logps/rejected": -2.3807272911071777, + "loss": 0.9677, + "odds_ratio_loss": 0.3950374722480774, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09281504154205322, + "rewards/margins": 0.14525768160820007, + "rewards/rejected": -0.2380727082490921, + "sft_loss": 0.928150475025177, + "step": 2570 + }, + { + "epoch": 0.2, + "grad_norm": 12.160032272338867, + "learning_rate": 9.099386883810736e-06, + "logits/chosen": -1.254393219947815, + "logits/rejected": -0.8775162696838379, + "logps/chosen": -1.248295545578003, + "logps/rejected": -1.8063290119171143, + "loss": 1.3072, + "odds_ratio_loss": 0.5886574983596802, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12482955306768417, + "rewards/margins": 0.05580334737896919, + "rewards/rejected": -0.18063290417194366, + "sft_loss": 1.248295545578003, + "step": 2575 + }, + { + "epoch": 0.2, + "grad_norm": 12.303729057312012, + "learning_rate": 9.095858325647084e-06, + "logits/chosen": -1.2073808908462524, + "logits/rejected": -0.872015118598938, + "logps/chosen": -1.0176329612731934, + "logps/rejected": -1.3849756717681885, + "loss": 1.0668, + "odds_ratio_loss": 0.49167561531066895, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1017632931470871, + "rewards/margins": 0.03673427551984787, + "rewards/rejected": -0.13849757611751556, + "sft_loss": 1.0176329612731934, + "step": 2580 + }, + { + "epoch": 0.2, + "grad_norm": 7.670443534851074, + "learning_rate": 9.092323555594254e-06, + "logits/chosen": -1.3451400995254517, + "logits/rejected": -1.0177253484725952, + "logps/chosen": -1.0468345880508423, + "logps/rejected": -1.6150470972061157, + "loss": 1.1329, + "odds_ratio_loss": 0.8601625561714172, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10468345880508423, + "rewards/margins": 0.056821249425411224, + "rewards/rejected": -0.16150471568107605, + "sft_loss": 1.0468345880508423, + "step": 2585 + }, + { + "epoch": 0.2, + "grad_norm": 17.628047943115234, + "learning_rate": 9.088782579013167e-06, + "logits/chosen": -1.3400976657867432, + "logits/rejected": -0.9388322830200195, + "logps/chosen": -0.7216960787773132, + "logps/rejected": -0.7665186524391174, + "loss": 0.8021, + "odds_ratio_loss": 0.8036432266235352, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07216961681842804, + "rewards/margins": 0.004482255782932043, + "rewards/rejected": -0.07665187120437622, + "sft_loss": 0.7216960787773132, + "step": 2590 + }, + { + "epoch": 0.2, + "grad_norm": 15.716256141662598, + "learning_rate": 9.08523540127417e-06, + "logits/chosen": -1.2134828567504883, + "logits/rejected": -1.1170094013214111, + "logps/chosen": -0.9080262184143066, + "logps/rejected": -1.2125194072723389, + "loss": 0.9598, + "odds_ratio_loss": 0.5175421237945557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0908026248216629, + "rewards/margins": 0.030449315905570984, + "rewards/rejected": -0.12125194072723389, + "sft_loss": 0.9080262184143066, + "step": 2595 + }, + { + "epoch": 0.2, + "grad_norm": 16.95549201965332, + "learning_rate": 9.081682027757001e-06, + "logits/chosen": -1.1575896739959717, + "logits/rejected": -0.7381139993667603, + "logps/chosen": -0.824720025062561, + "logps/rejected": -4.392837047576904, + "loss": 0.8607, + "odds_ratio_loss": 0.3601614534854889, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08247199654579163, + "rewards/margins": 0.3568117022514343, + "rewards/rejected": -0.4392837584018707, + "sft_loss": 0.824720025062561, + "step": 2600 + }, + { + "epoch": 0.2, + "grad_norm": 16.999509811401367, + "learning_rate": 9.07812246385081e-06, + "logits/chosen": -1.338104248046875, + "logits/rejected": -0.9916426539421082, + "logps/chosen": -1.3346434831619263, + "logps/rejected": -4.51638650894165, + "loss": 1.4249, + "odds_ratio_loss": 0.9022731781005859, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.13346433639526367, + "rewards/margins": 0.31817427277565, + "rewards/rejected": -0.4516386091709137, + "sft_loss": 1.3346434831619263, + "step": 2605 + }, + { + "epoch": 0.2, + "grad_norm": 95.34765625, + "learning_rate": 9.074556714954121e-06, + "logits/chosen": -1.1742122173309326, + "logits/rejected": -0.8790088891983032, + "logps/chosen": -0.996563732624054, + "logps/rejected": -1.647459626197815, + "loss": 1.0471, + "odds_ratio_loss": 0.5054280757904053, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0996563732624054, + "rewards/margins": 0.06508957594633102, + "rewards/rejected": -0.16474595665931702, + "sft_loss": 0.996563732624054, + "step": 2610 + }, + { + "epoch": 0.2, + "grad_norm": 9.325736045837402, + "learning_rate": 9.07098478647485e-06, + "logits/chosen": -1.3385982513427734, + "logits/rejected": -0.8874040842056274, + "logps/chosen": -1.1331145763397217, + "logps/rejected": -1.6950451135635376, + "loss": 1.1968, + "odds_ratio_loss": 0.6363669633865356, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11331145465373993, + "rewards/margins": 0.056193046271800995, + "rewards/rejected": -0.16950449347496033, + "sft_loss": 1.1331145763397217, + "step": 2615 + }, + { + "epoch": 0.2, + "grad_norm": 22.75711441040039, + "learning_rate": 9.067406683830278e-06, + "logits/chosen": -1.2250087261199951, + "logits/rejected": -0.9773575663566589, + "logps/chosen": -1.0626041889190674, + "logps/rejected": -1.3051557540893555, + "loss": 1.1219, + "odds_ratio_loss": 0.5928469896316528, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10626041889190674, + "rewards/margins": 0.02425515279173851, + "rewards/rejected": -0.13051557540893555, + "sft_loss": 1.0626041889190674, + "step": 2620 + }, + { + "epoch": 0.2, + "grad_norm": 6.514204978942871, + "learning_rate": 9.06382241244705e-06, + "logits/chosen": -1.1473455429077148, + "logits/rejected": -0.6742517352104187, + "logps/chosen": -1.03928804397583, + "logps/rejected": -3.828688144683838, + "loss": 1.0807, + "odds_ratio_loss": 0.41433659195899963, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10392878949642181, + "rewards/margins": 0.27894002199172974, + "rewards/rejected": -0.38286882638931274, + "sft_loss": 1.03928804397583, + "step": 2625 + }, + { + "epoch": 0.2, + "grad_norm": 81.91191864013672, + "learning_rate": 9.060231977761173e-06, + "logits/chosen": -1.4000599384307861, + "logits/rejected": -0.8785271644592285, + "logps/chosen": -0.8743749856948853, + "logps/rejected": -4.196818828582764, + "loss": 0.9417, + "odds_ratio_loss": 0.673501193523407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08743749558925629, + "rewards/margins": 0.3322443962097168, + "rewards/rejected": -0.4196818470954895, + "sft_loss": 0.8743749856948853, + "step": 2630 + }, + { + "epoch": 0.2, + "grad_norm": 25.8214111328125, + "learning_rate": 9.056635385217994e-06, + "logits/chosen": -1.306239366531372, + "logits/rejected": -1.0091142654418945, + "logps/chosen": -0.8158831596374512, + "logps/rejected": -1.0368363857269287, + "loss": 0.8713, + "odds_ratio_loss": 0.5539509057998657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08158832043409348, + "rewards/margins": 0.022095322608947754, + "rewards/rejected": -0.10368363559246063, + "sft_loss": 0.8158831596374512, + "step": 2635 + }, + { + "epoch": 0.21, + "grad_norm": 6.849333763122559, + "learning_rate": 9.053032640272202e-06, + "logits/chosen": -1.411442756652832, + "logits/rejected": -1.0196300745010376, + "logps/chosen": -0.845949649810791, + "logps/rejected": -1.260571002960205, + "loss": 0.8894, + "odds_ratio_loss": 0.43468767404556274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0845949649810791, + "rewards/margins": 0.041462142020463943, + "rewards/rejected": -0.12605710327625275, + "sft_loss": 0.845949649810791, + "step": 2640 + }, + { + "epoch": 0.21, + "grad_norm": 14.452634811401367, + "learning_rate": 9.049423748387819e-06, + "logits/chosen": -1.3134500980377197, + "logits/rejected": -0.8436886668205261, + "logps/chosen": -1.2949182987213135, + "logps/rejected": -4.814360618591309, + "loss": 1.337, + "odds_ratio_loss": 0.4204350411891937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12949183583259583, + "rewards/margins": 0.351944237947464, + "rewards/rejected": -0.4814360737800598, + "sft_loss": 1.2949182987213135, + "step": 2645 + }, + { + "epoch": 0.21, + "grad_norm": 29.886714935302734, + "learning_rate": 9.045808715038184e-06, + "logits/chosen": -1.2806081771850586, + "logits/rejected": -0.842139720916748, + "logps/chosen": -0.9247692227363586, + "logps/rejected": -1.1130586862564087, + "loss": 0.982, + "odds_ratio_loss": 0.5720853209495544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09247691929340363, + "rewards/margins": 0.018828941509127617, + "rewards/rejected": -0.1113058552145958, + "sft_loss": 0.9247692227363586, + "step": 2650 + }, + { + "epoch": 0.21, + "grad_norm": 33.8629035949707, + "learning_rate": 9.04218754570596e-06, + "logits/chosen": -1.1466368436813354, + "logits/rejected": -0.9131369590759277, + "logps/chosen": -1.3172786235809326, + "logps/rejected": -3.089834690093994, + "loss": 1.3603, + "odds_ratio_loss": 0.43001121282577515, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13172785937786102, + "rewards/margins": 0.17725564539432526, + "rewards/rejected": -0.3089835047721863, + "sft_loss": 1.3172786235809326, + "step": 2655 + }, + { + "epoch": 0.21, + "grad_norm": 15.455634117126465, + "learning_rate": 9.038560245883105e-06, + "logits/chosen": -1.412030577659607, + "logits/rejected": -1.102386236190796, + "logps/chosen": -1.2090859413146973, + "logps/rejected": -1.5070087909698486, + "loss": 1.2694, + "odds_ratio_loss": 0.6028513312339783, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12090860307216644, + "rewards/margins": 0.02979227900505066, + "rewards/rejected": -0.1507008820772171, + "sft_loss": 1.2090859413146973, + "step": 2660 + }, + { + "epoch": 0.21, + "grad_norm": 5.274667739868164, + "learning_rate": 9.034926821070883e-06, + "logits/chosen": -1.2878334522247314, + "logits/rejected": -0.8525265455245972, + "logps/chosen": -1.1998794078826904, + "logps/rejected": -1.4124677181243896, + "loss": 1.2567, + "odds_ratio_loss": 0.5681849718093872, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11998794227838516, + "rewards/margins": 0.02125883661210537, + "rewards/rejected": -0.1412467658519745, + "sft_loss": 1.1998794078826904, + "step": 2665 + }, + { + "epoch": 0.21, + "grad_norm": 17.122329711914062, + "learning_rate": 9.03128727677984e-06, + "logits/chosen": -1.2906736135482788, + "logits/rejected": -0.612545907497406, + "logps/chosen": -1.0701696872711182, + "logps/rejected": -1.8918075561523438, + "loss": 1.1072, + "odds_ratio_loss": 0.370770126581192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10701696574687958, + "rewards/margins": 0.08216379582881927, + "rewards/rejected": -0.18918077647686005, + "sft_loss": 1.0701696872711182, + "step": 2670 + }, + { + "epoch": 0.21, + "grad_norm": 13.244214057922363, + "learning_rate": 9.027641618529813e-06, + "logits/chosen": -1.4152615070343018, + "logits/rejected": -1.1327699422836304, + "logps/chosen": -1.1863515377044678, + "logps/rejected": -1.7167526483535767, + "loss": 1.2365, + "odds_ratio_loss": 0.5016016960144043, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1186351552605629, + "rewards/margins": 0.05304010584950447, + "rewards/rejected": -0.17167526483535767, + "sft_loss": 1.1863515377044678, + "step": 2675 + }, + { + "epoch": 0.21, + "grad_norm": 6.127563953399658, + "learning_rate": 9.023989851849899e-06, + "logits/chosen": -1.3840445280075073, + "logits/rejected": -1.0111591815948486, + "logps/chosen": -1.281292200088501, + "logps/rejected": -2.281388282775879, + "loss": 1.3298, + "odds_ratio_loss": 0.4846586287021637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12812921404838562, + "rewards/margins": 0.10000962018966675, + "rewards/rejected": -0.22813884913921356, + "sft_loss": 1.281292200088501, + "step": 2680 + }, + { + "epoch": 0.21, + "grad_norm": 6.400108814239502, + "learning_rate": 9.02033198227847e-06, + "logits/chosen": -1.47408127784729, + "logits/rejected": -0.6259415745735168, + "logps/chosen": -0.9586564898490906, + "logps/rejected": -4.168588161468506, + "loss": 0.9967, + "odds_ratio_loss": 0.38054126501083374, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0958656519651413, + "rewards/margins": 0.32099318504333496, + "rewards/rejected": -0.41685882210731506, + "sft_loss": 0.9586564898490906, + "step": 2685 + }, + { + "epoch": 0.21, + "grad_norm": 27.631380081176758, + "learning_rate": 9.01666801536315e-06, + "logits/chosen": -1.2730721235275269, + "logits/rejected": -1.0189874172210693, + "logps/chosen": -1.5083661079406738, + "logps/rejected": -2.5043952465057373, + "loss": 1.5848, + "odds_ratio_loss": 0.7645183205604553, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15083661675453186, + "rewards/margins": 0.09960292279720306, + "rewards/rejected": -0.25043952465057373, + "sft_loss": 1.5083661079406738, + "step": 2690 + }, + { + "epoch": 0.21, + "grad_norm": 48.11613082885742, + "learning_rate": 9.012997956660807e-06, + "logits/chosen": -1.2310947179794312, + "logits/rejected": -0.9574726819992065, + "logps/chosen": -0.809436023235321, + "logps/rejected": -1.5579251050949097, + "loss": 0.8551, + "odds_ratio_loss": 0.45699796080589294, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08094360679388046, + "rewards/margins": 0.07484889030456543, + "rewards/rejected": -0.1557925045490265, + "sft_loss": 0.809436023235321, + "step": 2695 + }, + { + "epoch": 0.21, + "grad_norm": 8.816434860229492, + "learning_rate": 9.009321811737553e-06, + "logits/chosen": -1.2568022012710571, + "logits/rejected": -0.9708755612373352, + "logps/chosen": -0.8139735460281372, + "logps/rejected": -1.1872222423553467, + "loss": 0.868, + "odds_ratio_loss": 0.5405431985855103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08139735460281372, + "rewards/margins": 0.03732486814260483, + "rewards/rejected": -0.11872222274541855, + "sft_loss": 0.8139735460281372, + "step": 2700 + }, + { + "epoch": 0.21, + "grad_norm": 21.41962242126465, + "learning_rate": 9.005639586168728e-06, + "logits/chosen": -1.076005458831787, + "logits/rejected": -1.0931252241134644, + "logps/chosen": -0.6707652807235718, + "logps/rejected": -2.1130595207214355, + "loss": 0.6878, + "odds_ratio_loss": 0.16990558803081512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06707652658224106, + "rewards/margins": 0.14422942698001862, + "rewards/rejected": -0.21130594611167908, + "sft_loss": 0.6707652807235718, + "step": 2705 + }, + { + "epoch": 0.21, + "grad_norm": 31.207767486572266, + "learning_rate": 9.001951285538897e-06, + "logits/chosen": -1.1946260929107666, + "logits/rejected": -0.9534111022949219, + "logps/chosen": -1.2287012338638306, + "logps/rejected": -2.725151538848877, + "loss": 1.2922, + "odds_ratio_loss": 0.634853720664978, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12287012487649918, + "rewards/margins": 0.14964503049850464, + "rewards/rejected": -0.2725151479244232, + "sft_loss": 1.2287012338638306, + "step": 2710 + }, + { + "epoch": 0.21, + "grad_norm": 36.257164001464844, + "learning_rate": 8.998256915441831e-06, + "logits/chosen": -1.2600607872009277, + "logits/rejected": -0.5680662393569946, + "logps/chosen": -0.9931353330612183, + "logps/rejected": -2.1860270500183105, + "loss": 1.0308, + "odds_ratio_loss": 0.37645816802978516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09931354224681854, + "rewards/margins": 0.11928915977478027, + "rewards/rejected": -0.21860270202159882, + "sft_loss": 0.9931353330612183, + "step": 2715 + }, + { + "epoch": 0.21, + "grad_norm": 9.290372848510742, + "learning_rate": 8.994556481480517e-06, + "logits/chosen": -0.9715999364852905, + "logits/rejected": -0.8543729782104492, + "logps/chosen": -0.6672913432121277, + "logps/rejected": -1.5520168542861938, + "loss": 0.6947, + "odds_ratio_loss": 0.2744672894477844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06672913581132889, + "rewards/margins": 0.08847255259752274, + "rewards/rejected": -0.15520168840885162, + "sft_loss": 0.6672913432121277, + "step": 2720 + }, + { + "epoch": 0.21, + "grad_norm": 29.265583038330078, + "learning_rate": 8.990849989267127e-06, + "logits/chosen": -1.2258832454681396, + "logits/rejected": -1.1225849390029907, + "logps/chosen": -0.988205075263977, + "logps/rejected": -0.9803763628005981, + "loss": 1.063, + "odds_ratio_loss": 0.7477890849113464, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0988205075263977, + "rewards/margins": -0.0007828705129213631, + "rewards/rejected": -0.09803762286901474, + "sft_loss": 0.988205075263977, + "step": 2725 + }, + { + "epoch": 0.21, + "grad_norm": 30.148481369018555, + "learning_rate": 8.987137444423033e-06, + "logits/chosen": -1.217212438583374, + "logits/rejected": -0.7578709125518799, + "logps/chosen": -0.9597604870796204, + "logps/rejected": -1.3635588884353638, + "loss": 1.0054, + "odds_ratio_loss": 0.4561527669429779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09597603976726532, + "rewards/margins": 0.0403798371553421, + "rewards/rejected": -0.13635587692260742, + "sft_loss": 0.9597604870796204, + "step": 2730 + }, + { + "epoch": 0.21, + "grad_norm": 6.763095378875732, + "learning_rate": 8.983418852578776e-06, + "logits/chosen": -1.2369617223739624, + "logits/rejected": -0.9568303823471069, + "logps/chosen": -1.1668567657470703, + "logps/rejected": -1.3419616222381592, + "loss": 1.2252, + "odds_ratio_loss": 0.5838689804077148, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11668567359447479, + "rewards/margins": 0.01751049794256687, + "rewards/rejected": -0.1341961771249771, + "sft_loss": 1.1668567657470703, + "step": 2735 + }, + { + "epoch": 0.21, + "grad_norm": 6.214346885681152, + "learning_rate": 8.979694219374076e-06, + "logits/chosen": -1.202335238456726, + "logits/rejected": -1.1979544162750244, + "logps/chosen": -0.9494991302490234, + "logps/rejected": -2.23736834526062, + "loss": 0.9914, + "odds_ratio_loss": 0.4188031256198883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09494991600513458, + "rewards/margins": 0.12878692150115967, + "rewards/rejected": -0.22373683750629425, + "sft_loss": 0.9494991302490234, + "step": 2740 + }, + { + "epoch": 0.21, + "grad_norm": 6.824653625488281, + "learning_rate": 8.975963550457809e-06, + "logits/chosen": -1.456319808959961, + "logits/rejected": -1.2307958602905273, + "logps/chosen": -0.7797650694847107, + "logps/rejected": -1.6499179601669312, + "loss": 0.8556, + "odds_ratio_loss": 0.7585657835006714, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.07797650992870331, + "rewards/margins": 0.087015300989151, + "rewards/rejected": -0.1649918109178543, + "sft_loss": 0.7797650694847107, + "step": 2745 + }, + { + "epoch": 0.21, + "grad_norm": 12.145956993103027, + "learning_rate": 8.97222685148801e-06, + "logits/chosen": -1.3519519567489624, + "logits/rejected": -1.1805908679962158, + "logps/chosen": -1.2413402795791626, + "logps/rejected": -3.3706047534942627, + "loss": 1.284, + "odds_ratio_loss": 0.4268109202384949, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12413404136896133, + "rewards/margins": 0.2129264622926712, + "rewards/rejected": -0.33706048130989075, + "sft_loss": 1.2413402795791626, + "step": 2750 + }, + { + "epoch": 0.21, + "grad_norm": 11.599871635437012, + "learning_rate": 8.968484128131858e-06, + "logits/chosen": -1.253807544708252, + "logits/rejected": -0.717187225818634, + "logps/chosen": -1.1322736740112305, + "logps/rejected": -4.610743045806885, + "loss": 1.1829, + "odds_ratio_loss": 0.5058093070983887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11322736740112305, + "rewards/margins": 0.34784695506095886, + "rewards/rejected": -0.4610742926597595, + "sft_loss": 1.1322736740112305, + "step": 2755 + }, + { + "epoch": 0.21, + "grad_norm": 5.39821720123291, + "learning_rate": 8.964735386065669e-06, + "logits/chosen": -1.196275234222412, + "logits/rejected": -0.8260028958320618, + "logps/chosen": -1.1362543106079102, + "logps/rejected": -1.7856251001358032, + "loss": 1.1734, + "odds_ratio_loss": 0.3714643120765686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11362544447183609, + "rewards/margins": 0.06493707001209259, + "rewards/rejected": -0.1785624921321869, + "sft_loss": 1.1362543106079102, + "step": 2760 + }, + { + "epoch": 0.22, + "grad_norm": 14.481295585632324, + "learning_rate": 8.960980630974881e-06, + "logits/chosen": -0.6916632056236267, + "logits/rejected": -1.1656701564788818, + "logps/chosen": -1.1203669309616089, + "logps/rejected": -1.38091242313385, + "loss": 1.1758, + "odds_ratio_loss": 0.5547033548355103, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11203669011592865, + "rewards/margins": 0.026054542511701584, + "rewards/rejected": -0.13809125125408173, + "sft_loss": 1.1203669309616089, + "step": 2765 + }, + { + "epoch": 0.22, + "grad_norm": 5.397483825683594, + "learning_rate": 8.957219868554064e-06, + "logits/chosen": -1.1858707666397095, + "logits/rejected": -0.7736166715621948, + "logps/chosen": -0.6137800812721252, + "logps/rejected": -2.991201877593994, + "loss": 0.6472, + "odds_ratio_loss": 0.3342156410217285, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06137801334261894, + "rewards/margins": 0.23774215579032898, + "rewards/rejected": -0.2991201877593994, + "sft_loss": 0.6137800812721252, + "step": 2770 + }, + { + "epoch": 0.22, + "grad_norm": 25.549015045166016, + "learning_rate": 8.953453104506886e-06, + "logits/chosen": -1.0718984603881836, + "logits/rejected": -1.097247838973999, + "logps/chosen": -1.0510852336883545, + "logps/rejected": -1.8239879608154297, + "loss": 1.0958, + "odds_ratio_loss": 0.4471181035041809, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10510852187871933, + "rewards/margins": 0.07729027420282364, + "rewards/rejected": -0.18239879608154297, + "sft_loss": 1.0510852336883545, + "step": 2775 + }, + { + "epoch": 0.22, + "grad_norm": 24.154788970947266, + "learning_rate": 8.949680344546125e-06, + "logits/chosen": -1.3380266427993774, + "logits/rejected": -1.0320308208465576, + "logps/chosen": -1.1754895448684692, + "logps/rejected": -1.2273073196411133, + "loss": 1.2414, + "odds_ratio_loss": 0.6591736674308777, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11754894256591797, + "rewards/margins": 0.0051817819476127625, + "rewards/rejected": -0.12273073196411133, + "sft_loss": 1.1754895448684692, + "step": 2780 + }, + { + "epoch": 0.22, + "grad_norm": 6.844038009643555, + "learning_rate": 8.94590159439365e-06, + "logits/chosen": -1.368798851966858, + "logits/rejected": -0.6834617257118225, + "logps/chosen": -1.0847288370132446, + "logps/rejected": -1.476498007774353, + "loss": 1.1346, + "odds_ratio_loss": 0.49874448776245117, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10847288370132446, + "rewards/margins": 0.03917691111564636, + "rewards/rejected": -0.14764979481697083, + "sft_loss": 1.0847288370132446, + "step": 2785 + }, + { + "epoch": 0.22, + "grad_norm": 16.110488891601562, + "learning_rate": 8.942116859780416e-06, + "logits/chosen": -1.2514593601226807, + "logits/rejected": -1.0386936664581299, + "logps/chosen": -0.8438073396682739, + "logps/rejected": -3.5511767864227295, + "loss": 0.88, + "odds_ratio_loss": 0.3621874451637268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08438073098659515, + "rewards/margins": 0.270736962556839, + "rewards/rejected": -0.35511770844459534, + "sft_loss": 0.8438073396682739, + "step": 2790 + }, + { + "epoch": 0.22, + "grad_norm": 6.961461067199707, + "learning_rate": 8.938326146446455e-06, + "logits/chosen": -1.1863247156143188, + "logits/rejected": -0.8467245101928711, + "logps/chosen": -1.0176557302474976, + "logps/rejected": -1.874770164489746, + "loss": 1.0542, + "odds_ratio_loss": 0.3654174208641052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10176558792591095, + "rewards/margins": 0.08571141958236694, + "rewards/rejected": -0.1874770075082779, + "sft_loss": 1.0176557302474976, + "step": 2795 + }, + { + "epoch": 0.22, + "grad_norm": 5.351832389831543, + "learning_rate": 8.934529460140864e-06, + "logits/chosen": -1.2538938522338867, + "logits/rejected": -0.8942509889602661, + "logps/chosen": -0.9106897115707397, + "logps/rejected": -2.4164175987243652, + "loss": 0.94, + "odds_ratio_loss": 0.2929394841194153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09106897562742233, + "rewards/margins": 0.1505727767944336, + "rewards/rejected": -0.24164175987243652, + "sft_loss": 0.9106897115707397, + "step": 2800 + }, + { + "epoch": 0.22, + "grad_norm": 15.171923637390137, + "learning_rate": 8.930726806621797e-06, + "logits/chosen": -1.1682040691375732, + "logits/rejected": -0.8433181643486023, + "logps/chosen": -1.0888586044311523, + "logps/rejected": -4.776035785675049, + "loss": 1.1557, + "odds_ratio_loss": 0.6680582761764526, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.10888586193323135, + "rewards/margins": 0.3687177300453186, + "rewards/rejected": -0.47760358452796936, + "sft_loss": 1.0888586044311523, + "step": 2805 + }, + { + "epoch": 0.22, + "grad_norm": 18.85120391845703, + "learning_rate": 8.926918191656465e-06, + "logits/chosen": -1.31667160987854, + "logits/rejected": -1.06211256980896, + "logps/chosen": -1.033383846282959, + "logps/rejected": -3.6757144927978516, + "loss": 1.072, + "odds_ratio_loss": 0.3861328065395355, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10333838313817978, + "rewards/margins": 0.2642330825328827, + "rewards/rejected": -0.36757150292396545, + "sft_loss": 1.033383846282959, + "step": 2810 + }, + { + "epoch": 0.22, + "grad_norm": 50.54446792602539, + "learning_rate": 8.923103621021114e-06, + "logits/chosen": -1.196171522140503, + "logits/rejected": -1.0727007389068604, + "logps/chosen": -1.1526124477386475, + "logps/rejected": -1.1024290323257446, + "loss": 1.2311, + "odds_ratio_loss": 0.7850225567817688, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1152612566947937, + "rewards/margins": -0.0050183548592031, + "rewards/rejected": -0.11024288833141327, + "sft_loss": 1.1526124477386475, + "step": 2815 + }, + { + "epoch": 0.22, + "grad_norm": 5.652276515960693, + "learning_rate": 8.919283100501025e-06, + "logits/chosen": -1.1610941886901855, + "logits/rejected": -0.6269224882125854, + "logps/chosen": -0.9240479469299316, + "logps/rejected": -1.1618537902832031, + "loss": 0.9806, + "odds_ratio_loss": 0.5652657151222229, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0924047976732254, + "rewards/margins": 0.02378058061003685, + "rewards/rejected": -0.11618538200855255, + "sft_loss": 0.9240479469299316, + "step": 2820 + }, + { + "epoch": 0.22, + "grad_norm": 6.214611530303955, + "learning_rate": 8.915456635890503e-06, + "logits/chosen": -1.1411340236663818, + "logits/rejected": -0.859778881072998, + "logps/chosen": -0.7868792414665222, + "logps/rejected": -5.329236030578613, + "loss": 0.8132, + "odds_ratio_loss": 0.26342785358428955, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07868792116641998, + "rewards/margins": 0.45423564314842224, + "rewards/rejected": -0.5329235792160034, + "sft_loss": 0.7868792414665222, + "step": 2825 + }, + { + "epoch": 0.22, + "grad_norm": 14.912748336791992, + "learning_rate": 8.911624232992867e-06, + "logits/chosen": -1.376704454421997, + "logits/rejected": -0.46073848009109497, + "logps/chosen": -1.1049047708511353, + "logps/rejected": -1.2518284320831299, + "loss": 1.1677, + "odds_ratio_loss": 0.6280218362808228, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11049047857522964, + "rewards/margins": 0.014692360535264015, + "rewards/rejected": -0.1251828372478485, + "sft_loss": 1.1049047708511353, + "step": 2830 + }, + { + "epoch": 0.22, + "grad_norm": 12.590785026550293, + "learning_rate": 8.90778589762044e-06, + "logits/chosen": -1.136904001235962, + "logits/rejected": -0.8878492116928101, + "logps/chosen": -1.616204023361206, + "logps/rejected": -1.5891444683074951, + "loss": 1.7234, + "odds_ratio_loss": 1.0716639757156372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1616203933954239, + "rewards/margins": -0.0027059554122388363, + "rewards/rejected": -0.1589144468307495, + "sft_loss": 1.616204023361206, + "step": 2835 + }, + { + "epoch": 0.22, + "grad_norm": 8.816327095031738, + "learning_rate": 8.90394163559455e-06, + "logits/chosen": -1.3756773471832275, + "logits/rejected": -1.093867301940918, + "logps/chosen": -0.8899661302566528, + "logps/rejected": -5.7786455154418945, + "loss": 0.9543, + "odds_ratio_loss": 0.6429457664489746, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08899661153554916, + "rewards/margins": 0.48886799812316895, + "rewards/rejected": -0.5778645277023315, + "sft_loss": 0.8899661302566528, + "step": 2840 + }, + { + "epoch": 0.22, + "grad_norm": 7.251855373382568, + "learning_rate": 8.900091452745506e-06, + "logits/chosen": -1.2806169986724854, + "logits/rejected": -1.050431251525879, + "logps/chosen": -0.8310653567314148, + "logps/rejected": -0.6103538274765015, + "loss": 0.9344, + "odds_ratio_loss": 1.0331388711929321, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.08310654014348984, + "rewards/margins": -0.02207115665078163, + "rewards/rejected": -0.06103537604212761, + "sft_loss": 0.8310653567314148, + "step": 2845 + }, + { + "epoch": 0.22, + "grad_norm": 10.528026580810547, + "learning_rate": 8.896235354912597e-06, + "logits/chosen": -1.1204943656921387, + "logits/rejected": -1.0685988664627075, + "logps/chosen": -1.2023366689682007, + "logps/rejected": -1.5722219944000244, + "loss": 1.2622, + "odds_ratio_loss": 0.598936140537262, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12023366987705231, + "rewards/margins": 0.03698853403329849, + "rewards/rejected": -0.1572222113609314, + "sft_loss": 1.2023366689682007, + "step": 2850 + }, + { + "epoch": 0.22, + "grad_norm": 74.59951782226562, + "learning_rate": 8.892373347944088e-06, + "logits/chosen": -1.1229302883148193, + "logits/rejected": -0.9611026644706726, + "logps/chosen": -1.1025960445404053, + "logps/rejected": -2.4425206184387207, + "loss": 1.1763, + "odds_ratio_loss": 0.7375203967094421, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11025960743427277, + "rewards/margins": 0.13399241864681244, + "rewards/rejected": -0.2442520409822464, + "sft_loss": 1.1025960445404053, + "step": 2855 + }, + { + "epoch": 0.22, + "grad_norm": 16.174827575683594, + "learning_rate": 8.888505437697201e-06, + "logits/chosen": -1.128024697303772, + "logits/rejected": -0.9245980381965637, + "logps/chosen": -0.8279545903205872, + "logps/rejected": -3.0395867824554443, + "loss": 0.8703, + "odds_ratio_loss": 0.4236716330051422, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08279546350240707, + "rewards/margins": 0.22116322815418243, + "rewards/rejected": -0.3039587140083313, + "sft_loss": 0.8279545903205872, + "step": 2860 + }, + { + "epoch": 0.22, + "grad_norm": 22.010711669921875, + "learning_rate": 8.884631630038117e-06, + "logits/chosen": -1.1404074430465698, + "logits/rejected": -0.6384499669075012, + "logps/chosen": -1.0815197229385376, + "logps/rejected": -2.512594699859619, + "loss": 1.111, + "odds_ratio_loss": 0.29465144872665405, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10815198719501495, + "rewards/margins": 0.14310748875141144, + "rewards/rejected": -0.2512594759464264, + "sft_loss": 1.0815197229385376, + "step": 2865 + }, + { + "epoch": 0.22, + "grad_norm": 7.166597843170166, + "learning_rate": 8.88075193084195e-06, + "logits/chosen": -1.188714861869812, + "logits/rejected": -0.8500461578369141, + "logps/chosen": -0.9702251553535461, + "logps/rejected": -1.9410117864608765, + "loss": 1.0235, + "odds_ratio_loss": 0.5329502820968628, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09702251851558685, + "rewards/margins": 0.09707866609096527, + "rewards/rejected": -0.19410118460655212, + "sft_loss": 0.9702251553535461, + "step": 2870 + }, + { + "epoch": 0.22, + "grad_norm": 7.927453994750977, + "learning_rate": 8.876866345992762e-06, + "logits/chosen": -1.1141269207000732, + "logits/rejected": -0.5961653590202332, + "logps/chosen": -0.9098241925239563, + "logps/rejected": -1.1537498235702515, + "loss": 0.9654, + "odds_ratio_loss": 0.5554467439651489, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09098242223262787, + "rewards/margins": 0.024392560124397278, + "rewards/rejected": -0.11537498235702515, + "sft_loss": 0.9098241925239563, + "step": 2875 + }, + { + "epoch": 0.22, + "grad_norm": 7.518460750579834, + "learning_rate": 8.872974881383535e-06, + "logits/chosen": -1.0662617683410645, + "logits/rejected": -1.094089388847351, + "logps/chosen": -0.6162663698196411, + "logps/rejected": -1.763864517211914, + "loss": 0.6453, + "odds_ratio_loss": 0.2900220453739166, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06162664294242859, + "rewards/margins": 0.11475982517004013, + "rewards/rejected": -0.17638644576072693, + "sft_loss": 0.6162663698196411, + "step": 2880 + }, + { + "epoch": 0.22, + "grad_norm": 5.836972236633301, + "learning_rate": 8.869077542916167e-06, + "logits/chosen": -1.1317639350891113, + "logits/rejected": -0.6082301735877991, + "logps/chosen": -0.9128414392471313, + "logps/rejected": -1.3924942016601562, + "loss": 0.9544, + "odds_ratio_loss": 0.4153948426246643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0912841409444809, + "rewards/margins": 0.04796527698636055, + "rewards/rejected": -0.13924942910671234, + "sft_loss": 0.9128414392471313, + "step": 2885 + }, + { + "epoch": 0.22, + "grad_norm": 8.127260208129883, + "learning_rate": 8.86517433650147e-06, + "logits/chosen": -1.1355172395706177, + "logits/rejected": -0.9076636433601379, + "logps/chosen": -0.9803677797317505, + "logps/rejected": -0.9864256978034973, + "loss": 1.0539, + "odds_ratio_loss": 0.7349095344543457, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09803678095340729, + "rewards/margins": 0.0006057933205738664, + "rewards/rejected": -0.09864256531000137, + "sft_loss": 0.9803677797317505, + "step": 2890 + }, + { + "epoch": 0.23, + "grad_norm": 33.14106750488281, + "learning_rate": 8.86126526805915e-06, + "logits/chosen": -1.4334921836853027, + "logits/rejected": -1.1608220338821411, + "logps/chosen": -0.9514063000679016, + "logps/rejected": -1.1313843727111816, + "loss": 1.0146, + "odds_ratio_loss": 0.6314960718154907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09514062106609344, + "rewards/margins": 0.017997819930315018, + "rewards/rejected": -0.11313845217227936, + "sft_loss": 0.9514063000679016, + "step": 2895 + }, + { + "epoch": 0.23, + "grad_norm": 5.668753147125244, + "learning_rate": 8.857350343517804e-06, + "logits/chosen": -1.2259176969528198, + "logits/rejected": -0.9829230308532715, + "logps/chosen": -0.9202004671096802, + "logps/rejected": -1.180841326713562, + "loss": 0.9764, + "odds_ratio_loss": 0.5624373555183411, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09202004969120026, + "rewards/margins": 0.02606409788131714, + "rewards/rejected": -0.1180841475725174, + "sft_loss": 0.9202004671096802, + "step": 2900 + }, + { + "epoch": 0.23, + "grad_norm": 19.66217803955078, + "learning_rate": 8.853429568814913e-06, + "logits/chosen": -1.197495698928833, + "logits/rejected": -0.9274722933769226, + "logps/chosen": -0.8191972970962524, + "logps/rejected": -1.4239537715911865, + "loss": 0.8657, + "odds_ratio_loss": 0.4648253917694092, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08191972225904465, + "rewards/margins": 0.060475654900074005, + "rewards/rejected": -0.14239537715911865, + "sft_loss": 0.8191972970962524, + "step": 2905 + }, + { + "epoch": 0.23, + "grad_norm": 6.289087295532227, + "learning_rate": 8.849502949896831e-06, + "logits/chosen": -1.064294695854187, + "logits/rejected": -1.1289174556732178, + "logps/chosen": -0.6857664585113525, + "logps/rejected": -4.31264591217041, + "loss": 0.7191, + "odds_ratio_loss": 0.3333652913570404, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06857664883136749, + "rewards/margins": 0.362687885761261, + "rewards/rejected": -0.4312645494937897, + "sft_loss": 0.6857664585113525, + "step": 2910 + }, + { + "epoch": 0.23, + "grad_norm": 17.237995147705078, + "learning_rate": 8.845570492718776e-06, + "logits/chosen": -1.1497989892959595, + "logits/rejected": -1.2945317029953003, + "logps/chosen": -0.995174765586853, + "logps/rejected": -1.2497352361679077, + "loss": 1.0528, + "odds_ratio_loss": 0.5761274099349976, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09951747953891754, + "rewards/margins": 0.02545604668557644, + "rewards/rejected": -0.12497353553771973, + "sft_loss": 0.995174765586853, + "step": 2915 + }, + { + "epoch": 0.23, + "grad_norm": 6.107137203216553, + "learning_rate": 8.841632203244813e-06, + "logits/chosen": -1.2816154956817627, + "logits/rejected": -0.6775996088981628, + "logps/chosen": -0.8471586108207703, + "logps/rejected": -4.0009355545043945, + "loss": 0.8722, + "odds_ratio_loss": 0.2508474588394165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08471586555242538, + "rewards/margins": 0.3153776526451111, + "rewards/rejected": -0.40009355545043945, + "sft_loss": 0.8471586108207703, + "step": 2920 + }, + { + "epoch": 0.23, + "grad_norm": 10.699955940246582, + "learning_rate": 8.837688087447862e-06, + "logits/chosen": -1.3370736837387085, + "logits/rejected": -0.5626960396766663, + "logps/chosen": -1.0836195945739746, + "logps/rejected": -9.543293952941895, + "loss": 1.1216, + "odds_ratio_loss": 0.3799092769622803, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10836195945739746, + "rewards/margins": 0.8459674715995789, + "rewards/rejected": -0.9543293714523315, + "sft_loss": 1.0836195945739746, + "step": 2925 + }, + { + "epoch": 0.23, + "grad_norm": 12.650343894958496, + "learning_rate": 8.833738151309677e-06, + "logits/chosen": -1.4565495252609253, + "logits/rejected": -1.268577218055725, + "logps/chosen": -0.9817711710929871, + "logps/rejected": -4.19735860824585, + "loss": 1.0108, + "odds_ratio_loss": 0.2902795970439911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09817712008953094, + "rewards/margins": 0.32155877351760864, + "rewards/rejected": -0.4197359085083008, + "sft_loss": 0.9817711710929871, + "step": 2930 + }, + { + "epoch": 0.23, + "grad_norm": 9.384466171264648, + "learning_rate": 8.829782400820833e-06, + "logits/chosen": -1.376010537147522, + "logits/rejected": -0.5250366926193237, + "logps/chosen": -0.8629180192947388, + "logps/rejected": -1.7529948949813843, + "loss": 0.8961, + "odds_ratio_loss": 0.33201155066490173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08629179745912552, + "rewards/margins": 0.08900769799947739, + "rewards/rejected": -0.1752994954586029, + "sft_loss": 0.8629180192947388, + "step": 2935 + }, + { + "epoch": 0.23, + "grad_norm": 6.656403541564941, + "learning_rate": 8.825820841980729e-06, + "logits/chosen": -1.4098700284957886, + "logits/rejected": -1.2165441513061523, + "logps/chosen": -0.8539711833000183, + "logps/rejected": -2.8836426734924316, + "loss": 0.8861, + "odds_ratio_loss": 0.321702778339386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08539711683988571, + "rewards/margins": 0.20296712219715118, + "rewards/rejected": -0.2883642315864563, + "sft_loss": 0.8539711833000183, + "step": 2940 + }, + { + "epoch": 0.23, + "grad_norm": 12.5855073928833, + "learning_rate": 8.821853480797574e-06, + "logits/chosen": -1.4244425296783447, + "logits/rejected": -1.0870120525360107, + "logps/chosen": -1.1766611337661743, + "logps/rejected": -1.3328752517700195, + "loss": 1.2367, + "odds_ratio_loss": 0.6007108688354492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11766611039638519, + "rewards/margins": 0.015621413476765156, + "rewards/rejected": -0.13328751921653748, + "sft_loss": 1.1766611337661743, + "step": 2945 + }, + { + "epoch": 0.23, + "grad_norm": 7.5176310539245605, + "learning_rate": 8.817880323288376e-06, + "logits/chosen": -1.2485527992248535, + "logits/rejected": -1.1338937282562256, + "logps/chosen": -1.0665452480316162, + "logps/rejected": -1.3328628540039062, + "loss": 1.1206, + "odds_ratio_loss": 0.5406354665756226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10665452480316162, + "rewards/margins": 0.026631761342287064, + "rewards/rejected": -0.13328629732131958, + "sft_loss": 1.0665452480316162, + "step": 2950 + }, + { + "epoch": 0.23, + "grad_norm": 10.515802383422852, + "learning_rate": 8.813901375478928e-06, + "logits/chosen": -1.373665452003479, + "logits/rejected": -1.2379958629608154, + "logps/chosen": -0.9441508054733276, + "logps/rejected": -3.6124789714813232, + "loss": 0.9766, + "odds_ratio_loss": 0.32453179359436035, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.094415083527565, + "rewards/margins": 0.2668328285217285, + "rewards/rejected": -0.3612478971481323, + "sft_loss": 0.9441508054733276, + "step": 2955 + }, + { + "epoch": 0.23, + "grad_norm": 8.03735065460205, + "learning_rate": 8.809916643403813e-06, + "logits/chosen": -1.3460569381713867, + "logits/rejected": -0.6389718651771545, + "logps/chosen": -1.1100437641143799, + "logps/rejected": -1.21809983253479, + "loss": 1.1721, + "odds_ratio_loss": 0.6203465461730957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11100438982248306, + "rewards/margins": 0.010805593803524971, + "rewards/rejected": -0.12180998176336288, + "sft_loss": 1.1100437641143799, + "step": 2960 + }, + { + "epoch": 0.23, + "grad_norm": 6.766620635986328, + "learning_rate": 8.805926133106382e-06, + "logits/chosen": -1.2804914712905884, + "logits/rejected": -0.7814501523971558, + "logps/chosen": -1.7104488611221313, + "logps/rejected": -5.756571292877197, + "loss": 1.7352, + "odds_ratio_loss": 0.24745836853981018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17104490101337433, + "rewards/margins": 0.4046122431755066, + "rewards/rejected": -0.5756571888923645, + "sft_loss": 1.7104488611221313, + "step": 2965 + }, + { + "epoch": 0.23, + "grad_norm": 16.29014778137207, + "learning_rate": 8.80192985063875e-06, + "logits/chosen": -1.2973188161849976, + "logits/rejected": -1.2357347011566162, + "logps/chosen": -0.8642646670341492, + "logps/rejected": -1.2275454998016357, + "loss": 0.9122, + "odds_ratio_loss": 0.4792478680610657, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08642647415399551, + "rewards/margins": 0.036328066140413284, + "rewards/rejected": -0.1227545365691185, + "sft_loss": 0.8642646670341492, + "step": 2970 + }, + { + "epoch": 0.23, + "grad_norm": 6.046884059906006, + "learning_rate": 8.797927802061791e-06, + "logits/chosen": -1.42294442653656, + "logits/rejected": -0.8688791990280151, + "logps/chosen": -0.9156146049499512, + "logps/rejected": -0.9524520635604858, + "loss": 0.9983, + "odds_ratio_loss": 0.8266631364822388, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09156147390604019, + "rewards/margins": 0.0036837488878518343, + "rewards/rejected": -0.09524521231651306, + "sft_loss": 0.9156146049499512, + "step": 2975 + }, + { + "epoch": 0.23, + "grad_norm": 5.413948059082031, + "learning_rate": 8.793919993445114e-06, + "logits/chosen": -1.3873004913330078, + "logits/rejected": -0.7676628828048706, + "logps/chosen": -1.1006882190704346, + "logps/rejected": -1.3490431308746338, + "loss": 1.1669, + "odds_ratio_loss": 0.6618945002555847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11006882041692734, + "rewards/margins": 0.024835485965013504, + "rewards/rejected": -0.13490431010723114, + "sft_loss": 1.1006882190704346, + "step": 2980 + }, + { + "epoch": 0.23, + "grad_norm": 19.28611183166504, + "learning_rate": 8.789906430867073e-06, + "logits/chosen": -1.3977916240692139, + "logits/rejected": -1.0037000179290771, + "logps/chosen": -2.9157166481018066, + "logps/rejected": -4.102760314941406, + "loss": 3.0109, + "odds_ratio_loss": 0.9522919654846191, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2915716767311096, + "rewards/margins": 0.11870436370372772, + "rewards/rejected": -0.41027602553367615, + "sft_loss": 2.9157166481018066, + "step": 2985 + }, + { + "epoch": 0.23, + "grad_norm": 6.863065719604492, + "learning_rate": 8.785887120414744e-06, + "logits/chosen": -1.468731164932251, + "logits/rejected": -0.7206265330314636, + "logps/chosen": -1.1206532716751099, + "logps/rejected": -1.4424384832382202, + "loss": 1.1806, + "odds_ratio_loss": 0.5996042490005493, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11206533014774323, + "rewards/margins": 0.03217853978276253, + "rewards/rejected": -0.14424386620521545, + "sft_loss": 1.1206532716751099, + "step": 2990 + }, + { + "epoch": 0.23, + "grad_norm": 25.038049697875977, + "learning_rate": 8.781862068183922e-06, + "logits/chosen": -1.0874309539794922, + "logits/rejected": -1.0442769527435303, + "logps/chosen": -0.9091469049453735, + "logps/rejected": -8.371574401855469, + "loss": 0.9341, + "odds_ratio_loss": 0.2496640980243683, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09091468900442123, + "rewards/margins": 0.7462427020072937, + "rewards/rejected": -0.8371574282646179, + "sft_loss": 0.9091469049453735, + "step": 2995 + }, + { + "epoch": 0.23, + "grad_norm": 8.39319133758545, + "learning_rate": 8.77783128027911e-06, + "logits/chosen": -1.457811713218689, + "logits/rejected": -0.886703610420227, + "logps/chosen": -1.1325379610061646, + "logps/rejected": -1.4151691198349, + "loss": 1.1911, + "odds_ratio_loss": 0.5853082537651062, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11325379461050034, + "rewards/margins": 0.02826312743127346, + "rewards/rejected": -0.14151692390441895, + "sft_loss": 1.1325379610061646, + "step": 3000 + }, + { + "epoch": 0.23, + "grad_norm": 9.52746868133545, + "learning_rate": 8.773794762813507e-06, + "logits/chosen": -1.4067108631134033, + "logits/rejected": -1.0598171949386597, + "logps/chosen": -0.8269651532173157, + "logps/rejected": -1.1110790967941284, + "loss": 0.877, + "odds_ratio_loss": 0.5001311302185059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08269651234149933, + "rewards/margins": 0.02841140702366829, + "rewards/rejected": -0.11110792309045792, + "sft_loss": 0.8269651532173157, + "step": 3005 + }, + { + "epoch": 0.23, + "grad_norm": 12.703961372375488, + "learning_rate": 8.76975252190901e-06, + "logits/chosen": -1.2674983739852905, + "logits/rejected": -1.1456859111785889, + "logps/chosen": -1.1914570331573486, + "logps/rejected": -10.400805473327637, + "loss": 1.2233, + "odds_ratio_loss": 0.3181864321231842, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1191457062959671, + "rewards/margins": 0.9209348559379578, + "rewards/rejected": -1.0400805473327637, + "sft_loss": 1.1914570331573486, + "step": 3010 + }, + { + "epoch": 0.23, + "grad_norm": 5.381165027618408, + "learning_rate": 8.765704563696187e-06, + "logits/chosen": -1.3665794134140015, + "logits/rejected": -0.9956024885177612, + "logps/chosen": -1.2555207014083862, + "logps/rejected": -1.8276020288467407, + "loss": 1.3062, + "odds_ratio_loss": 0.5069686770439148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12555207312107086, + "rewards/margins": 0.05720812827348709, + "rewards/rejected": -0.18276020884513855, + "sft_loss": 1.2555207014083862, + "step": 3015 + }, + { + "epoch": 0.23, + "grad_norm": 13.621868133544922, + "learning_rate": 8.761650894314278e-06, + "logits/chosen": -1.299678921699524, + "logits/rejected": -0.7859451174736023, + "logps/chosen": -1.0665977001190186, + "logps/rejected": -2.9078383445739746, + "loss": 1.0893, + "odds_ratio_loss": 0.22742700576782227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10665978491306305, + "rewards/margins": 0.18412408232688904, + "rewards/rejected": -0.2907838523387909, + "sft_loss": 1.0665977001190186, + "step": 3020 + }, + { + "epoch": 0.24, + "grad_norm": 25.114072799682617, + "learning_rate": 8.757591519911192e-06, + "logits/chosen": -1.3486571311950684, + "logits/rejected": -1.2607526779174805, + "logps/chosen": -0.8685344457626343, + "logps/rejected": -1.245876431465149, + "loss": 0.9133, + "odds_ratio_loss": 0.4474593698978424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08685345202684402, + "rewards/margins": 0.03773418813943863, + "rewards/rejected": -0.12458764016628265, + "sft_loss": 0.8685344457626343, + "step": 3025 + }, + { + "epoch": 0.24, + "grad_norm": 16.62470054626465, + "learning_rate": 8.753526446643483e-06, + "logits/chosen": -1.3854665756225586, + "logits/rejected": -0.668252170085907, + "logps/chosen": -0.8873245120048523, + "logps/rejected": -9.595453262329102, + "loss": 0.9257, + "odds_ratio_loss": 0.3835451602935791, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08873245120048523, + "rewards/margins": 0.8708128929138184, + "rewards/rejected": -0.9595453143119812, + "sft_loss": 0.8873245120048523, + "step": 3030 + }, + { + "epoch": 0.24, + "grad_norm": 9.757149696350098, + "learning_rate": 8.74945568067635e-06, + "logits/chosen": -1.3147366046905518, + "logits/rejected": -1.037381649017334, + "logps/chosen": -1.0702860355377197, + "logps/rejected": -5.288527011871338, + "loss": 1.1103, + "odds_ratio_loss": 0.3997670114040375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10702860355377197, + "rewards/margins": 0.4218241274356842, + "rewards/rejected": -0.5288527011871338, + "sft_loss": 1.0702860355377197, + "step": 3035 + }, + { + "epoch": 0.24, + "grad_norm": 8.116015434265137, + "learning_rate": 8.74537922818363e-06, + "logits/chosen": -1.3341821432113647, + "logits/rejected": -0.8699405789375305, + "logps/chosen": -1.1210837364196777, + "logps/rejected": -5.426255702972412, + "loss": 1.1637, + "odds_ratio_loss": 0.42615023255348206, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11210837215185165, + "rewards/margins": 0.4305172562599182, + "rewards/rejected": -0.5426255464553833, + "sft_loss": 1.1210837364196777, + "step": 3040 + }, + { + "epoch": 0.24, + "grad_norm": 17.718124389648438, + "learning_rate": 8.741297095347779e-06, + "logits/chosen": -1.3567863702774048, + "logits/rejected": -0.9880765676498413, + "logps/chosen": -1.135987639427185, + "logps/rejected": -1.6871782541275024, + "loss": 1.2054, + "odds_ratio_loss": 0.6937232613563538, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1135987788438797, + "rewards/margins": 0.055119067430496216, + "rewards/rejected": -0.16871783137321472, + "sft_loss": 1.135987639427185, + "step": 3045 + }, + { + "epoch": 0.24, + "grad_norm": 4.6651716232299805, + "learning_rate": 8.737209288359868e-06, + "logits/chosen": -1.4298700094223022, + "logits/rejected": -0.8255916833877563, + "logps/chosen": -0.9301006197929382, + "logps/rejected": -1.5776050090789795, + "loss": 0.9788, + "odds_ratio_loss": 0.48722043633461, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0930100604891777, + "rewards/margins": 0.06475045531988144, + "rewards/rejected": -0.15776051580905914, + "sft_loss": 0.9301006197929382, + "step": 3050 + }, + { + "epoch": 0.24, + "grad_norm": 9.877421379089355, + "learning_rate": 8.733115813419575e-06, + "logits/chosen": -1.4384605884552002, + "logits/rejected": -1.1984599828720093, + "logps/chosen": -0.6977513432502747, + "logps/rejected": -1.4935929775238037, + "loss": 0.732, + "odds_ratio_loss": 0.3421218693256378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06977514922618866, + "rewards/margins": 0.07958415895700455, + "rewards/rejected": -0.1493593007326126, + "sft_loss": 0.6977513432502747, + "step": 3055 + }, + { + "epoch": 0.24, + "grad_norm": 62.607574462890625, + "learning_rate": 8.729016676735179e-06, + "logits/chosen": -1.392884373664856, + "logits/rejected": -1.0296696424484253, + "logps/chosen": -0.9279215931892395, + "logps/rejected": -1.295037031173706, + "loss": 0.9826, + "odds_ratio_loss": 0.5466240644454956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09279216080904007, + "rewards/margins": 0.03671155124902725, + "rewards/rejected": -0.12950369715690613, + "sft_loss": 0.9279215931892395, + "step": 3060 + }, + { + "epoch": 0.24, + "grad_norm": 25.62282943725586, + "learning_rate": 8.724911884523537e-06, + "logits/chosen": -1.4473998546600342, + "logits/rejected": -0.888154149055481, + "logps/chosen": -1.002514123916626, + "logps/rejected": -1.8997751474380493, + "loss": 1.0388, + "odds_ratio_loss": 0.3630914092063904, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10025142133235931, + "rewards/margins": 0.08972609043121338, + "rewards/rejected": -0.1899775117635727, + "sft_loss": 1.002514123916626, + "step": 3065 + }, + { + "epoch": 0.24, + "grad_norm": 28.85422706604004, + "learning_rate": 8.720801443010089e-06, + "logits/chosen": -1.3426685333251953, + "logits/rejected": -0.944126307964325, + "logps/chosen": -1.0599967241287231, + "logps/rejected": -1.6001322269439697, + "loss": 1.0998, + "odds_ratio_loss": 0.3978647291660309, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1059996709227562, + "rewards/margins": 0.054013561457395554, + "rewards/rejected": -0.16001322865486145, + "sft_loss": 1.0599967241287231, + "step": 3070 + }, + { + "epoch": 0.24, + "grad_norm": 10.21401309967041, + "learning_rate": 8.71668535842884e-06, + "logits/chosen": -1.4288160800933838, + "logits/rejected": -1.0084912776947021, + "logps/chosen": -1.1480839252471924, + "logps/rejected": -1.619964838027954, + "loss": 1.1981, + "odds_ratio_loss": 0.4998813569545746, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11480840295553207, + "rewards/margins": 0.047188084572553635, + "rewards/rejected": -0.1619964838027954, + "sft_loss": 1.1480839252471924, + "step": 3075 + }, + { + "epoch": 0.24, + "grad_norm": 7.576767921447754, + "learning_rate": 8.712563637022357e-06, + "logits/chosen": -1.3043601512908936, + "logits/rejected": -1.127294898033142, + "logps/chosen": -0.9208580851554871, + "logps/rejected": -7.681375026702881, + "loss": 0.9638, + "odds_ratio_loss": 0.42907315492630005, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0920858085155487, + "rewards/margins": 0.6760517358779907, + "rewards/rejected": -0.7681375741958618, + "sft_loss": 0.9208580851554871, + "step": 3080 + }, + { + "epoch": 0.24, + "grad_norm": 6.175205230712891, + "learning_rate": 8.708436285041755e-06, + "logits/chosen": -1.278227686882019, + "logits/rejected": -0.896086573600769, + "logps/chosen": -0.9899203181266785, + "logps/rejected": -1.603487253189087, + "loss": 1.0352, + "odds_ratio_loss": 0.45269575715065, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09899203479290009, + "rewards/margins": 0.06135668605566025, + "rewards/rejected": -0.16034871339797974, + "sft_loss": 0.9899203181266785, + "step": 3085 + }, + { + "epoch": 0.24, + "grad_norm": 9.487650871276855, + "learning_rate": 8.704303308746684e-06, + "logits/chosen": -1.3824490308761597, + "logits/rejected": -0.6026290655136108, + "logps/chosen": -1.0155322551727295, + "logps/rejected": -1.7905362844467163, + "loss": 1.0567, + "odds_ratio_loss": 0.4116109311580658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10155323892831802, + "rewards/margins": 0.07750038802623749, + "rewards/rejected": -0.1790536344051361, + "sft_loss": 1.0155322551727295, + "step": 3090 + }, + { + "epoch": 0.24, + "grad_norm": 235.64552307128906, + "learning_rate": 8.700164714405328e-06, + "logits/chosen": -1.2964307069778442, + "logits/rejected": -0.9890910983085632, + "logps/chosen": -1.991641640663147, + "logps/rejected": -2.6005265712738037, + "loss": 2.0885, + "odds_ratio_loss": 0.968436062335968, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19916416704654694, + "rewards/margins": 0.06088850647211075, + "rewards/rejected": -0.2600526511669159, + "sft_loss": 1.991641640663147, + "step": 3095 + }, + { + "epoch": 0.24, + "grad_norm": 7.392255783081055, + "learning_rate": 8.696020508294391e-06, + "logits/chosen": -1.440700888633728, + "logits/rejected": -0.8366183042526245, + "logps/chosen": -0.9562617540359497, + "logps/rejected": -1.2703158855438232, + "loss": 1.0096, + "odds_ratio_loss": 0.5336211919784546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09562616795301437, + "rewards/margins": 0.03140542656183243, + "rewards/rejected": -0.1270315945148468, + "sft_loss": 0.9562617540359497, + "step": 3100 + }, + { + "epoch": 0.24, + "grad_norm": 9.974418640136719, + "learning_rate": 8.69187069669909e-06, + "logits/chosen": -1.4482066631317139, + "logits/rejected": -1.3463423252105713, + "logps/chosen": -1.2112513780593872, + "logps/rejected": -1.9238742589950562, + "loss": 1.2708, + "odds_ratio_loss": 0.595005989074707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12112513929605484, + "rewards/margins": 0.07126231491565704, + "rewards/rejected": -0.1923874318599701, + "sft_loss": 1.2112513780593872, + "step": 3105 + }, + { + "epoch": 0.24, + "grad_norm": 20.881736755371094, + "learning_rate": 8.687715285913138e-06, + "logits/chosen": -1.4098975658416748, + "logits/rejected": -0.9576647877693176, + "logps/chosen": -0.7991948127746582, + "logps/rejected": -1.363541841506958, + "loss": 0.8475, + "odds_ratio_loss": 0.4834299683570862, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0799194872379303, + "rewards/margins": 0.05643470212817192, + "rewards/rejected": -0.13635417819023132, + "sft_loss": 0.7991948127746582, + "step": 3110 + }, + { + "epoch": 0.24, + "grad_norm": 23.569698333740234, + "learning_rate": 8.683554282238746e-06, + "logits/chosen": -1.1757612228393555, + "logits/rejected": -1.2875800132751465, + "logps/chosen": -1.008616328239441, + "logps/rejected": -1.3941243886947632, + "loss": 1.0608, + "odds_ratio_loss": 0.5214596390724182, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10086163133382797, + "rewards/margins": 0.03855080530047417, + "rewards/rejected": -0.13941244781017303, + "sft_loss": 1.008616328239441, + "step": 3115 + }, + { + "epoch": 0.24, + "grad_norm": 10.971246719360352, + "learning_rate": 8.6793876919866e-06, + "logits/chosen": -1.4144350290298462, + "logits/rejected": -1.1336814165115356, + "logps/chosen": -1.0561202764511108, + "logps/rejected": -8.628296852111816, + "loss": 1.1081, + "odds_ratio_loss": 0.5193870663642883, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10561203956604004, + "rewards/margins": 0.7572176456451416, + "rewards/rejected": -0.8628296852111816, + "sft_loss": 1.0561202764511108, + "step": 3120 + }, + { + "epoch": 0.24, + "grad_norm": 9.794310569763184, + "learning_rate": 8.675215521475868e-06, + "logits/chosen": -1.2344005107879639, + "logits/rejected": -1.2249268293380737, + "logps/chosen": -0.9627717733383179, + "logps/rejected": -1.757939100265503, + "loss": 0.9977, + "odds_ratio_loss": 0.34921079874038696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09627718478441238, + "rewards/margins": 0.07951673120260239, + "rewards/rejected": -0.17579391598701477, + "sft_loss": 0.9627717733383179, + "step": 3125 + }, + { + "epoch": 0.24, + "grad_norm": 6.747648239135742, + "learning_rate": 8.671037777034173e-06, + "logits/chosen": -1.3628339767456055, + "logits/rejected": -1.1204272508621216, + "logps/chosen": -1.2489184141159058, + "logps/rejected": -6.822798728942871, + "loss": 1.2942, + "odds_ratio_loss": 0.4523259997367859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12489183992147446, + "rewards/margins": 0.5573880076408386, + "rewards/rejected": -0.6822798848152161, + "sft_loss": 1.2489184141159058, + "step": 3130 + }, + { + "epoch": 0.24, + "grad_norm": 16.021392822265625, + "learning_rate": 8.666854464997596e-06, + "logits/chosen": -1.3786375522613525, + "logits/rejected": -0.7975913286209106, + "logps/chosen": -0.9592100381851196, + "logps/rejected": -2.387404203414917, + "loss": 1.0103, + "odds_ratio_loss": 0.5111249685287476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09592099487781525, + "rewards/margins": 0.14281943440437317, + "rewards/rejected": -0.2387404441833496, + "sft_loss": 0.9592100381851196, + "step": 3135 + }, + { + "epoch": 0.24, + "grad_norm": 24.078792572021484, + "learning_rate": 8.662665591710661e-06, + "logits/chosen": -1.2728703022003174, + "logits/rejected": -0.894081711769104, + "logps/chosen": -0.9345654249191284, + "logps/rejected": -2.127600908279419, + "loss": 0.9621, + "odds_ratio_loss": 0.27542421221733093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09345654398202896, + "rewards/margins": 0.11930353939533234, + "rewards/rejected": -0.2127600908279419, + "sft_loss": 0.9345654249191284, + "step": 3140 + }, + { + "epoch": 0.24, + "grad_norm": 208.22991943359375, + "learning_rate": 8.658471163526327e-06, + "logits/chosen": -1.2844762802124023, + "logits/rejected": -1.1959311962127686, + "logps/chosen": -1.1571857929229736, + "logps/rejected": -1.600608468055725, + "loss": 1.2147, + "odds_ratio_loss": 0.5752911567687988, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11571858078241348, + "rewards/margins": 0.0443422757089138, + "rewards/rejected": -0.160060852766037, + "sft_loss": 1.1571857929229736, + "step": 3145 + }, + { + "epoch": 0.25, + "grad_norm": 9.317758560180664, + "learning_rate": 8.654271186805974e-06, + "logits/chosen": -1.3937506675720215, + "logits/rejected": -0.9512116312980652, + "logps/chosen": -0.7833994626998901, + "logps/rejected": -1.622886300086975, + "loss": 0.817, + "odds_ratio_loss": 0.33581018447875977, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07833994925022125, + "rewards/margins": 0.08394867926836014, + "rewards/rejected": -0.1622886210680008, + "sft_loss": 0.7833994626998901, + "step": 3150 + }, + { + "epoch": 0.25, + "grad_norm": 10.393624305725098, + "learning_rate": 8.650065667919402e-06, + "logits/chosen": -1.2336599826812744, + "logits/rejected": -1.0274940729141235, + "logps/chosen": -0.948900580406189, + "logps/rejected": -1.548330545425415, + "loss": 0.9981, + "odds_ratio_loss": 0.49193769693374634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09489007294178009, + "rewards/margins": 0.05994298309087753, + "rewards/rejected": -0.15483304858207703, + "sft_loss": 0.948900580406189, + "step": 3155 + }, + { + "epoch": 0.25, + "grad_norm": 5.0471272468566895, + "learning_rate": 8.645854613244817e-06, + "logits/chosen": -1.374237298965454, + "logits/rejected": -0.5605775117874146, + "logps/chosen": -0.8872843980789185, + "logps/rejected": -1.1447935104370117, + "loss": 0.9446, + "odds_ratio_loss": 0.5729075074195862, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08872843533754349, + "rewards/margins": 0.025750914588570595, + "rewards/rejected": -0.11447934806346893, + "sft_loss": 0.8872843980789185, + "step": 3160 + }, + { + "epoch": 0.25, + "grad_norm": 11.45893669128418, + "learning_rate": 8.641638029168812e-06, + "logits/chosen": -1.2924164533615112, + "logits/rejected": -1.0458002090454102, + "logps/chosen": -0.9896078109741211, + "logps/rejected": -1.7145917415618896, + "loss": 1.0336, + "odds_ratio_loss": 0.43977856636047363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09896077960729599, + "rewards/margins": 0.0724983960390091, + "rewards/rejected": -0.1714591681957245, + "sft_loss": 0.9896078109741211, + "step": 3165 + }, + { + "epoch": 0.25, + "grad_norm": 5.857996463775635, + "learning_rate": 8.637415922086377e-06, + "logits/chosen": -1.2714940309524536, + "logits/rejected": -0.8355283737182617, + "logps/chosen": -1.2923920154571533, + "logps/rejected": -2.2515740394592285, + "loss": 1.3668, + "odds_ratio_loss": 0.7443500757217407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12923920154571533, + "rewards/margins": 0.0959181934595108, + "rewards/rejected": -0.22515738010406494, + "sft_loss": 1.2923920154571533, + "step": 3170 + }, + { + "epoch": 0.25, + "grad_norm": 8.047304153442383, + "learning_rate": 8.633188298400872e-06, + "logits/chosen": -1.4827300310134888, + "logits/rejected": -1.222534418106079, + "logps/chosen": -0.8443825840950012, + "logps/rejected": -1.3143736124038696, + "loss": 0.911, + "odds_ratio_loss": 0.6657195687294006, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0844382643699646, + "rewards/margins": 0.04699909687042236, + "rewards/rejected": -0.13143734633922577, + "sft_loss": 0.8443825840950012, + "step": 3175 + }, + { + "epoch": 0.25, + "grad_norm": 6.139329433441162, + "learning_rate": 8.628955164524024e-06, + "logits/chosen": -1.412903904914856, + "logits/rejected": -0.6597784757614136, + "logps/chosen": -1.1135563850402832, + "logps/rejected": -1.4750511646270752, + "loss": 1.1709, + "odds_ratio_loss": 0.5732403993606567, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11135563999414444, + "rewards/margins": 0.03614946827292442, + "rewards/rejected": -0.14750510454177856, + "sft_loss": 1.1135563850402832, + "step": 3180 + }, + { + "epoch": 0.25, + "grad_norm": 7.063668727874756, + "learning_rate": 8.62471652687592e-06, + "logits/chosen": -1.3726282119750977, + "logits/rejected": -0.7232956886291504, + "logps/chosen": -0.9939814805984497, + "logps/rejected": -1.3079441785812378, + "loss": 1.0462, + "odds_ratio_loss": 0.5219636559486389, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09939815104007721, + "rewards/margins": 0.03139626979827881, + "rewards/rejected": -0.1307944357395172, + "sft_loss": 0.9939814805984497, + "step": 3185 + }, + { + "epoch": 0.25, + "grad_norm": 22.926136016845703, + "learning_rate": 8.62047239188499e-06, + "logits/chosen": -1.2752636671066284, + "logits/rejected": -0.9812234044075012, + "logps/chosen": -0.9917522668838501, + "logps/rejected": -1.4488723278045654, + "loss": 1.0384, + "odds_ratio_loss": 0.46636566519737244, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09917521476745605, + "rewards/margins": 0.04571203142404556, + "rewards/rejected": -0.14488725364208221, + "sft_loss": 0.9917522668838501, + "step": 3190 + }, + { + "epoch": 0.25, + "grad_norm": 9.83376693725586, + "learning_rate": 8.616222765988006e-06, + "logits/chosen": -1.2776237726211548, + "logits/rejected": -1.3784388303756714, + "logps/chosen": -0.9015542268753052, + "logps/rejected": -5.178496360778809, + "loss": 0.9514, + "odds_ratio_loss": 0.498735249042511, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09015541523694992, + "rewards/margins": 0.427694171667099, + "rewards/rejected": -0.5178496241569519, + "sft_loss": 0.9015542268753052, + "step": 3195 + }, + { + "epoch": 0.25, + "grad_norm": 5.757855415344238, + "learning_rate": 8.611967655630062e-06, + "logits/chosen": -1.3458263874053955, + "logits/rejected": -0.8651115298271179, + "logps/chosen": -1.189414381980896, + "logps/rejected": -9.91465950012207, + "loss": 1.2112, + "odds_ratio_loss": 0.21760015189647675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11894144117832184, + "rewards/margins": 0.872524619102478, + "rewards/rejected": -0.9914659261703491, + "sft_loss": 1.189414381980896, + "step": 3200 + }, + { + "epoch": 0.25, + "grad_norm": 8.271770477294922, + "learning_rate": 8.607707067264577e-06, + "logits/chosen": -1.2520349025726318, + "logits/rejected": -0.9718472361564636, + "logps/chosen": -1.1497652530670166, + "logps/rejected": -1.762926459312439, + "loss": 1.212, + "odds_ratio_loss": 0.622275710105896, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11497652530670166, + "rewards/margins": 0.06131613999605179, + "rewards/rejected": -0.17629265785217285, + "sft_loss": 1.1497652530670166, + "step": 3205 + }, + { + "epoch": 0.25, + "grad_norm": 29.484317779541016, + "learning_rate": 8.603441007353271e-06, + "logits/chosen": -1.3252924680709839, + "logits/rejected": -1.2272132635116577, + "logps/chosen": -1.0914915800094604, + "logps/rejected": -1.3656795024871826, + "loss": 1.1633, + "odds_ratio_loss": 0.7178690433502197, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10914917290210724, + "rewards/margins": 0.027418773621320724, + "rewards/rejected": -0.13656795024871826, + "sft_loss": 1.0914915800094604, + "step": 3210 + }, + { + "epoch": 0.25, + "grad_norm": 10.599020957946777, + "learning_rate": 8.599169482366167e-06, + "logits/chosen": -1.3540681600570679, + "logits/rejected": -1.0985815525054932, + "logps/chosen": -0.8688950538635254, + "logps/rejected": -1.479015588760376, + "loss": 0.9184, + "odds_ratio_loss": 0.495300829410553, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08688951283693314, + "rewards/margins": 0.06101206690073013, + "rewards/rejected": -0.14790156483650208, + "sft_loss": 0.8688950538635254, + "step": 3215 + }, + { + "epoch": 0.25, + "grad_norm": 13.34872817993164, + "learning_rate": 8.594892498781574e-06, + "logits/chosen": -1.3374733924865723, + "logits/rejected": -0.5042542815208435, + "logps/chosen": -1.165950059890747, + "logps/rejected": -12.713995933532715, + "loss": 1.1996, + "odds_ratio_loss": 0.33608299493789673, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11659500747919083, + "rewards/margins": 1.1548045873641968, + "rewards/rejected": -1.271399736404419, + "sft_loss": 1.165950059890747, + "step": 3220 + }, + { + "epoch": 0.25, + "grad_norm": 35.187660217285156, + "learning_rate": 8.590610063086082e-06, + "logits/chosen": -1.0710675716400146, + "logits/rejected": -0.9008657336235046, + "logps/chosen": -1.040565013885498, + "logps/rejected": -1.2980402708053589, + "loss": 1.0972, + "odds_ratio_loss": 0.5668312907218933, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10405649244785309, + "rewards/margins": 0.025747528299689293, + "rewards/rejected": -0.12980404496192932, + "sft_loss": 1.040565013885498, + "step": 3225 + }, + { + "epoch": 0.25, + "grad_norm": 5.274808883666992, + "learning_rate": 8.586322181774547e-06, + "logits/chosen": -1.3774149417877197, + "logits/rejected": -0.7884630560874939, + "logps/chosen": -0.9635206460952759, + "logps/rejected": -12.23045825958252, + "loss": 0.9828, + "odds_ratio_loss": 0.1926787793636322, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09635205566883087, + "rewards/margins": 1.1266937255859375, + "rewards/rejected": -1.223045825958252, + "sft_loss": 0.9635206460952759, + "step": 3230 + }, + { + "epoch": 0.25, + "grad_norm": 11.046777725219727, + "learning_rate": 8.582028861350086e-06, + "logits/chosen": -1.2464903593063354, + "logits/rejected": -1.1636230945587158, + "logps/chosen": -0.8882688283920288, + "logps/rejected": -1.5485460758209229, + "loss": 0.926, + "odds_ratio_loss": 0.3775942623615265, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08882687985897064, + "rewards/margins": 0.06602771580219269, + "rewards/rejected": -0.15485459566116333, + "sft_loss": 0.8882688283920288, + "step": 3235 + }, + { + "epoch": 0.25, + "grad_norm": 14.48009204864502, + "learning_rate": 8.577730108324067e-06, + "logits/chosen": -1.3417062759399414, + "logits/rejected": -0.6446546316146851, + "logps/chosen": -1.0239213705062866, + "logps/rejected": -2.4988975524902344, + "loss": 1.0451, + "odds_ratio_loss": 0.21227788925170898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10239215195178986, + "rewards/margins": 0.14749760925769806, + "rewards/rejected": -0.24988976120948792, + "sft_loss": 1.0239213705062866, + "step": 3240 + }, + { + "epoch": 0.25, + "grad_norm": 14.136868476867676, + "learning_rate": 8.57342592921609e-06, + "logits/chosen": -1.2961242198944092, + "logits/rejected": -0.7371039390563965, + "logps/chosen": -1.0956108570098877, + "logps/rejected": -1.7069292068481445, + "loss": 1.1371, + "odds_ratio_loss": 0.41481003165245056, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10956110060214996, + "rewards/margins": 0.061131834983825684, + "rewards/rejected": -0.17069292068481445, + "sft_loss": 1.0956108570098877, + "step": 3245 + }, + { + "epoch": 0.25, + "grad_norm": 13.239011764526367, + "learning_rate": 8.569116330553992e-06, + "logits/chosen": -1.3907157182693481, + "logits/rejected": -1.0792735815048218, + "logps/chosen": -1.1127643585205078, + "logps/rejected": -2.486975908279419, + "loss": 1.1428, + "odds_ratio_loss": 0.30066484212875366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11127644777297974, + "rewards/margins": 0.1374211311340332, + "rewards/rejected": -0.24869759380817413, + "sft_loss": 1.1127643585205078, + "step": 3250 + }, + { + "epoch": 0.25, + "grad_norm": 61.88528823852539, + "learning_rate": 8.564801318873826e-06, + "logits/chosen": -1.3922154903411865, + "logits/rejected": -1.0898916721343994, + "logps/chosen": -1.3009440898895264, + "logps/rejected": -2.3645083904266357, + "loss": 1.3433, + "odds_ratio_loss": 0.4238888621330261, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13009440898895264, + "rewards/margins": 0.1063564270734787, + "rewards/rejected": -0.23645083606243134, + "sft_loss": 1.3009440898895264, + "step": 3255 + }, + { + "epoch": 0.25, + "grad_norm": 15.953457832336426, + "learning_rate": 8.560480900719855e-06, + "logits/chosen": -1.2886173725128174, + "logits/rejected": -0.7558988928794861, + "logps/chosen": -1.145691990852356, + "logps/rejected": -1.2577075958251953, + "loss": 1.2173, + "odds_ratio_loss": 0.7161797881126404, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11456920206546783, + "rewards/margins": 0.011201570741832256, + "rewards/rejected": -0.12577076256275177, + "sft_loss": 1.145691990852356, + "step": 3260 + }, + { + "epoch": 0.25, + "grad_norm": 24.60605812072754, + "learning_rate": 8.556155082644542e-06, + "logits/chosen": -1.0742292404174805, + "logits/rejected": -0.9177080988883972, + "logps/chosen": -0.989599883556366, + "logps/rejected": -8.721120834350586, + "loss": 1.0172, + "odds_ratio_loss": 0.2762053608894348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09895999729633331, + "rewards/margins": 0.7731519937515259, + "rewards/rejected": -0.8721119165420532, + "sft_loss": 0.989599883556366, + "step": 3265 + }, + { + "epoch": 0.25, + "grad_norm": 17.272823333740234, + "learning_rate": 8.55182387120854e-06, + "logits/chosen": -1.202138900756836, + "logits/rejected": -1.0315873622894287, + "logps/chosen": -0.9821723699569702, + "logps/rejected": -1.4622547626495361, + "loss": 1.0229, + "odds_ratio_loss": 0.407528817653656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09821723401546478, + "rewards/margins": 0.04800824820995331, + "rewards/rejected": -0.1462254822254181, + "sft_loss": 0.9821723699569702, + "step": 3270 + }, + { + "epoch": 0.25, + "grad_norm": 15.39669418334961, + "learning_rate": 8.547487272980679e-06, + "logits/chosen": -1.3310105800628662, + "logits/rejected": -0.7271682620048523, + "logps/chosen": -1.3114240169525146, + "logps/rejected": -1.6460965871810913, + "loss": 1.382, + "odds_ratio_loss": 0.7060557007789612, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13114240765571594, + "rewards/margins": 0.03346724808216095, + "rewards/rejected": -0.1646096557378769, + "sft_loss": 1.3114240169525146, + "step": 3275 + }, + { + "epoch": 0.26, + "grad_norm": 21.61895179748535, + "learning_rate": 8.543145294537963e-06, + "logits/chosen": -1.2018988132476807, + "logits/rejected": -1.2045353651046753, + "logps/chosen": -1.3924777507781982, + "logps/rejected": -2.1691339015960693, + "loss": 1.4376, + "odds_ratio_loss": 0.4510256350040436, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13924778997898102, + "rewards/margins": 0.07766561955213547, + "rewards/rejected": -0.21691341698169708, + "sft_loss": 1.3924777507781982, + "step": 3280 + }, + { + "epoch": 0.26, + "grad_norm": 72.43243408203125, + "learning_rate": 8.538797942465551e-06, + "logits/chosen": -1.2349824905395508, + "logits/rejected": -0.9436966776847839, + "logps/chosen": -1.0088491439819336, + "logps/rejected": -6.553753852844238, + "loss": 1.0257, + "odds_ratio_loss": 0.16876181960105896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10088489949703217, + "rewards/margins": 0.5544905066490173, + "rewards/rejected": -0.6553754210472107, + "sft_loss": 1.0088491439819336, + "step": 3285 + }, + { + "epoch": 0.26, + "grad_norm": 4.700754165649414, + "learning_rate": 8.534445223356756e-06, + "logits/chosen": -1.1828210353851318, + "logits/rejected": -0.9301977157592773, + "logps/chosen": -0.8841454386711121, + "logps/rejected": -1.6752599477767944, + "loss": 0.9198, + "odds_ratio_loss": 0.3564664423465729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08841454237699509, + "rewards/margins": 0.07911147177219391, + "rewards/rejected": -0.1675260066986084, + "sft_loss": 0.8841454386711121, + "step": 3290 + }, + { + "epoch": 0.26, + "grad_norm": 5.469188690185547, + "learning_rate": 8.53008714381303e-06, + "logits/chosen": -1.4232442378997803, + "logits/rejected": -0.569993793964386, + "logps/chosen": -0.8220561146736145, + "logps/rejected": -2.83996844291687, + "loss": 0.8424, + "odds_ratio_loss": 0.20346903800964355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08220561593770981, + "rewards/margins": 0.20179125666618347, + "rewards/rejected": -0.2839968800544739, + "sft_loss": 0.8220561146736145, + "step": 3295 + }, + { + "epoch": 0.26, + "grad_norm": 20.554473876953125, + "learning_rate": 8.525723710443953e-06, + "logits/chosen": -1.4156343936920166, + "logits/rejected": -1.068035364151001, + "logps/chosen": -1.096895456314087, + "logps/rejected": -2.0811915397644043, + "loss": 1.1371, + "odds_ratio_loss": 0.40177327394485474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10968954861164093, + "rewards/margins": 0.0984296202659607, + "rewards/rejected": -0.20811918377876282, + "sft_loss": 1.096895456314087, + "step": 3300 + }, + { + "epoch": 0.26, + "grad_norm": 461.81805419921875, + "learning_rate": 8.521354929867227e-06, + "logits/chosen": -1.2747472524642944, + "logits/rejected": -1.0539219379425049, + "logps/chosen": -0.9387380480766296, + "logps/rejected": -8.72998046875, + "loss": 0.9679, + "odds_ratio_loss": 0.2919756770133972, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09387381374835968, + "rewards/margins": 0.7791242599487305, + "rewards/rejected": -0.872998058795929, + "sft_loss": 0.9387380480766296, + "step": 3305 + }, + { + "epoch": 0.26, + "grad_norm": 13.743693351745605, + "learning_rate": 8.516980808708659e-06, + "logits/chosen": -1.359903335571289, + "logits/rejected": -1.051998496055603, + "logps/chosen": -1.1269054412841797, + "logps/rejected": -9.247424125671387, + "loss": 1.1874, + "odds_ratio_loss": 0.6050975918769836, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11269054561853409, + "rewards/margins": 0.8120518922805786, + "rewards/rejected": -0.9247424006462097, + "sft_loss": 1.1269054412841797, + "step": 3310 + }, + { + "epoch": 0.26, + "grad_norm": 9.05522632598877, + "learning_rate": 8.512601353602164e-06, + "logits/chosen": -1.4644567966461182, + "logits/rejected": -0.9861732721328735, + "logps/chosen": -1.1417715549468994, + "logps/rejected": -1.4305145740509033, + "loss": 1.1992, + "odds_ratio_loss": 0.574771523475647, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1141771674156189, + "rewards/margins": 0.028874289244413376, + "rewards/rejected": -0.14305146038532257, + "sft_loss": 1.1417715549468994, + "step": 3315 + }, + { + "epoch": 0.26, + "grad_norm": 7.766719818115234, + "learning_rate": 8.508216571189737e-06, + "logits/chosen": -1.3620727062225342, + "logits/rejected": -0.8981539607048035, + "logps/chosen": -0.9097458124160767, + "logps/rejected": -5.282686710357666, + "loss": 0.9658, + "odds_ratio_loss": 0.5603520274162292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0909745842218399, + "rewards/margins": 0.437294065952301, + "rewards/rejected": -0.5282686948776245, + "sft_loss": 0.9097458124160767, + "step": 3320 + }, + { + "epoch": 0.26, + "grad_norm": 45.20925521850586, + "learning_rate": 8.50382646812146e-06, + "logits/chosen": -1.3043216466903687, + "logits/rejected": -1.3080909252166748, + "logps/chosen": -0.7670945525169373, + "logps/rejected": -1.146458625793457, + "loss": 0.8246, + "odds_ratio_loss": 0.5751861929893494, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07670946419239044, + "rewards/margins": 0.03793640434741974, + "rewards/rejected": -0.11464586108922958, + "sft_loss": 0.7670945525169373, + "step": 3325 + }, + { + "epoch": 0.26, + "grad_norm": 20.774234771728516, + "learning_rate": 8.49943105105548e-06, + "logits/chosen": -1.447458267211914, + "logits/rejected": -0.8410998582839966, + "logps/chosen": -0.9892905354499817, + "logps/rejected": -2.2154524326324463, + "loss": 1.0132, + "odds_ratio_loss": 0.23863506317138672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09892904758453369, + "rewards/margins": 0.12261620908975601, + "rewards/rejected": -0.2215452492237091, + "sft_loss": 0.9892905354499817, + "step": 3330 + }, + { + "epoch": 0.26, + "grad_norm": 7.589955806732178, + "learning_rate": 8.495030326658007e-06, + "logits/chosen": -1.45345139503479, + "logits/rejected": -0.8291786313056946, + "logps/chosen": -1.0652108192443848, + "logps/rejected": -11.067428588867188, + "loss": 1.1067, + "odds_ratio_loss": 0.41494470834732056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10652108490467072, + "rewards/margins": 1.000221848487854, + "rewards/rejected": -1.1067428588867188, + "sft_loss": 1.0652108192443848, + "step": 3335 + }, + { + "epoch": 0.26, + "grad_norm": 9.080081939697266, + "learning_rate": 8.490624301603296e-06, + "logits/chosen": -1.0828771591186523, + "logits/rejected": -1.0897125005722046, + "logps/chosen": -0.8774884343147278, + "logps/rejected": -1.6583712100982666, + "loss": 0.9108, + "odds_ratio_loss": 0.33312711119651794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08774884045124054, + "rewards/margins": 0.07808827608823776, + "rewards/rejected": -0.1658371239900589, + "sft_loss": 0.8774884343147278, + "step": 3340 + }, + { + "epoch": 0.26, + "grad_norm": 15.120588302612305, + "learning_rate": 8.486212982573648e-06, + "logits/chosen": -1.0031670331954956, + "logits/rejected": -0.9409330487251282, + "logps/chosen": -1.1631263494491577, + "logps/rejected": -2.286313533782959, + "loss": 1.2246, + "odds_ratio_loss": 0.6149234175682068, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11631264537572861, + "rewards/margins": 0.11231871694326401, + "rewards/rejected": -0.22863134741783142, + "sft_loss": 1.1631263494491577, + "step": 3345 + }, + { + "epoch": 0.26, + "grad_norm": 7.824958801269531, + "learning_rate": 8.481796376259382e-06, + "logits/chosen": -1.2014691829681396, + "logits/rejected": -0.8414332270622253, + "logps/chosen": -1.1760876178741455, + "logps/rejected": -1.5616389513015747, + "loss": 1.2254, + "odds_ratio_loss": 0.4928853511810303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11760877072811127, + "rewards/margins": 0.03855512663722038, + "rewards/rejected": -0.15616390109062195, + "sft_loss": 1.1760876178741455, + "step": 3350 + }, + { + "epoch": 0.26, + "grad_norm": 8.375786781311035, + "learning_rate": 8.477374489358845e-06, + "logits/chosen": -1.5305713415145874, + "logits/rejected": -1.3281760215759277, + "logps/chosen": -1.110764503479004, + "logps/rejected": -10.505553245544434, + "loss": 1.2078, + "odds_ratio_loss": 0.9702065587043762, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1110764592885971, + "rewards/margins": 0.9394787549972534, + "rewards/rejected": -1.0505553483963013, + "sft_loss": 1.110764503479004, + "step": 3355 + }, + { + "epoch": 0.26, + "grad_norm": 18.455425262451172, + "learning_rate": 8.472947328578392e-06, + "logits/chosen": -1.4160370826721191, + "logits/rejected": -0.7809430956840515, + "logps/chosen": -1.1917507648468018, + "logps/rejected": -1.7635633945465088, + "loss": 1.2442, + "odds_ratio_loss": 0.5245550870895386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11917507648468018, + "rewards/margins": 0.05718127638101578, + "rewards/rejected": -0.17635634541511536, + "sft_loss": 1.1917507648468018, + "step": 3360 + }, + { + "epoch": 0.26, + "grad_norm": 9.328863143920898, + "learning_rate": 8.46851490063237e-06, + "logits/chosen": -1.3257181644439697, + "logits/rejected": -1.2680017948150635, + "logps/chosen": -1.1743090152740479, + "logps/rejected": -1.757370948791504, + "loss": 1.2341, + "odds_ratio_loss": 0.5981367826461792, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11743088811635971, + "rewards/margins": 0.05830620601773262, + "rewards/rejected": -0.17573709785938263, + "sft_loss": 1.1743090152740479, + "step": 3365 + }, + { + "epoch": 0.26, + "grad_norm": 19.936983108520508, + "learning_rate": 8.464077212243125e-06, + "logits/chosen": -1.33879816532135, + "logits/rejected": -1.112377405166626, + "logps/chosen": -1.3328278064727783, + "logps/rejected": -4.014039039611816, + "loss": 1.3689, + "odds_ratio_loss": 0.36111459136009216, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13328281044960022, + "rewards/margins": 0.2681211233139038, + "rewards/rejected": -0.40140390396118164, + "sft_loss": 1.3328278064727783, + "step": 3370 + }, + { + "epoch": 0.26, + "grad_norm": 69.97361755371094, + "learning_rate": 8.459634270140968e-06, + "logits/chosen": -1.3100478649139404, + "logits/rejected": -0.6492315530776978, + "logps/chosen": -1.3336145877838135, + "logps/rejected": -4.281968116760254, + "loss": 1.355, + "odds_ratio_loss": 0.21349510550498962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13336145877838135, + "rewards/margins": 0.2948353886604309, + "rewards/rejected": -0.42819681763648987, + "sft_loss": 1.3336145877838135, + "step": 3375 + }, + { + "epoch": 0.26, + "grad_norm": 11.706873893737793, + "learning_rate": 8.45518608106419e-06, + "logits/chosen": -1.2311418056488037, + "logits/rejected": -0.7156813740730286, + "logps/chosen": -1.0485591888427734, + "logps/rejected": -2.773469924926758, + "loss": 1.0833, + "odds_ratio_loss": 0.3475884795188904, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10485591739416122, + "rewards/margins": 0.17249107360839844, + "rewards/rejected": -0.27734702825546265, + "sft_loss": 1.0485591888427734, + "step": 3380 + }, + { + "epoch": 0.26, + "grad_norm": 95.40128326416016, + "learning_rate": 8.450732651759033e-06, + "logits/chosen": -1.3298120498657227, + "logits/rejected": -1.1620497703552246, + "logps/chosen": -0.8674956560134888, + "logps/rejected": -1.8080475330352783, + "loss": 0.899, + "odds_ratio_loss": 0.31545546650886536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08674956858158112, + "rewards/margins": 0.0940551608800888, + "rewards/rejected": -0.1808047592639923, + "sft_loss": 0.8674956560134888, + "step": 3385 + }, + { + "epoch": 0.26, + "grad_norm": 20.017311096191406, + "learning_rate": 8.446273988979686e-06, + "logits/chosen": -1.4703214168548584, + "logits/rejected": -1.171466588973999, + "logps/chosen": -0.9976893663406372, + "logps/rejected": -1.4772334098815918, + "loss": 1.0456, + "odds_ratio_loss": 0.4789748787879944, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09976892173290253, + "rewards/margins": 0.047954410314559937, + "rewards/rejected": -0.14772334694862366, + "sft_loss": 0.9976893663406372, + "step": 3390 + }, + { + "epoch": 0.26, + "grad_norm": 7.699336051940918, + "learning_rate": 8.441810099488279e-06, + "logits/chosen": -1.2824268341064453, + "logits/rejected": -0.9811736941337585, + "logps/chosen": -0.8096511960029602, + "logps/rejected": -1.0340986251831055, + "loss": 0.8644, + "odds_ratio_loss": 0.5473746657371521, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08096511662006378, + "rewards/margins": 0.022444751113653183, + "rewards/rejected": -0.10340987145900726, + "sft_loss": 0.8096511960029602, + "step": 3395 + }, + { + "epoch": 0.26, + "grad_norm": 13.585868835449219, + "learning_rate": 8.437340990054868e-06, + "logits/chosen": -1.4865134954452515, + "logits/rejected": -1.2178223133087158, + "logps/chosen": -0.6663314700126648, + "logps/rejected": -1.67364501953125, + "loss": 0.6956, + "odds_ratio_loss": 0.292599618434906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06663314998149872, + "rewards/margins": 0.10073135793209076, + "rewards/rejected": -0.16736450791358948, + "sft_loss": 0.6663314700126648, + "step": 3400 + }, + { + "epoch": 0.26, + "grad_norm": 9.500743865966797, + "learning_rate": 8.432866667457423e-06, + "logits/chosen": -1.4319857358932495, + "logits/rejected": -0.9485819935798645, + "logps/chosen": -0.9060953855514526, + "logps/rejected": -7.988076686859131, + "loss": 0.955, + "odds_ratio_loss": 0.48911604285240173, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09060955047607422, + "rewards/margins": 0.7081981897354126, + "rewards/rejected": -0.7988077402114868, + "sft_loss": 0.9060953855514526, + "step": 3405 + }, + { + "epoch": 0.27, + "grad_norm": 6.2479753494262695, + "learning_rate": 8.428387138481825e-06, + "logits/chosen": -1.4581882953643799, + "logits/rejected": -0.7471655607223511, + "logps/chosen": -1.04079270362854, + "logps/rejected": -1.4583923816680908, + "loss": 1.0885, + "odds_ratio_loss": 0.4766160845756531, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10407926887273788, + "rewards/margins": 0.04175996407866478, + "rewards/rejected": -0.14583922922611237, + "sft_loss": 1.04079270362854, + "step": 3410 + }, + { + "epoch": 0.27, + "grad_norm": 19.998077392578125, + "learning_rate": 8.423902409921842e-06, + "logits/chosen": -1.1454991102218628, + "logits/rejected": -0.9668058156967163, + "logps/chosen": -1.0157272815704346, + "logps/rejected": -1.1806867122650146, + "loss": 1.0831, + "odds_ratio_loss": 0.6740451455116272, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10157272964715958, + "rewards/margins": 0.01649593934416771, + "rewards/rejected": -0.11806867271661758, + "sft_loss": 1.0157272815704346, + "step": 3415 + }, + { + "epoch": 0.27, + "grad_norm": 253.7983856201172, + "learning_rate": 8.419412488579142e-06, + "logits/chosen": -1.4023807048797607, + "logits/rejected": -1.1779649257659912, + "logps/chosen": -0.9201675653457642, + "logps/rejected": -2.4253060817718506, + "loss": 0.9598, + "odds_ratio_loss": 0.39633411169052124, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09201676398515701, + "rewards/margins": 0.15051385760307312, + "rewards/rejected": -0.24253061413764954, + "sft_loss": 0.9201675653457642, + "step": 3420 + }, + { + "epoch": 0.27, + "grad_norm": 12.650860786437988, + "learning_rate": 8.414917381263256e-06, + "logits/chosen": -1.4328114986419678, + "logits/rejected": -0.7475396394729614, + "logps/chosen": -1.0361485481262207, + "logps/rejected": -2.0335729122161865, + "loss": 1.0612, + "odds_ratio_loss": 0.2504461109638214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10361485183238983, + "rewards/margins": 0.09974244982004166, + "rewards/rejected": -0.20335730910301208, + "sft_loss": 1.0361485481262207, + "step": 3425 + }, + { + "epoch": 0.27, + "grad_norm": 12.29416561126709, + "learning_rate": 8.410417094791587e-06, + "logits/chosen": -1.2249650955200195, + "logits/rejected": -1.1467955112457275, + "logps/chosen": -1.2544214725494385, + "logps/rejected": -1.503396987915039, + "loss": 1.3107, + "odds_ratio_loss": 0.5625152587890625, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12544213235378265, + "rewards/margins": 0.024897579103708267, + "rewards/rejected": -0.15033970773220062, + "sft_loss": 1.2544214725494385, + "step": 3430 + }, + { + "epoch": 0.27, + "grad_norm": 9.909943580627441, + "learning_rate": 8.405911635989391e-06, + "logits/chosen": -1.3390843868255615, + "logits/rejected": -0.8556186556816101, + "logps/chosen": -1.392810583114624, + "logps/rejected": -0.9943065643310547, + "loss": 1.5033, + "odds_ratio_loss": 1.1048606634140015, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13928106427192688, + "rewards/margins": -0.03985040634870529, + "rewards/rejected": -0.09943065792322159, + "sft_loss": 1.392810583114624, + "step": 3435 + }, + { + "epoch": 0.27, + "grad_norm": 13.671736717224121, + "learning_rate": 8.40140101168977e-06, + "logits/chosen": -1.1585992574691772, + "logits/rejected": -0.9479349851608276, + "logps/chosen": -0.7876306176185608, + "logps/rejected": -2.8406918048858643, + "loss": 0.8062, + "odds_ratio_loss": 0.18568609654903412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07876305282115936, + "rewards/margins": 0.20530612766742706, + "rewards/rejected": -0.28406915068626404, + "sft_loss": 0.7876306176185608, + "step": 3440 + }, + { + "epoch": 0.27, + "grad_norm": 8.141471862792969, + "learning_rate": 8.396885228733651e-06, + "logits/chosen": -1.4388540983200073, + "logits/rejected": -0.9490545392036438, + "logps/chosen": -0.7759628891944885, + "logps/rejected": -6.000257968902588, + "loss": 0.8164, + "odds_ratio_loss": 0.40440258383750916, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07759629189968109, + "rewards/margins": 0.5224294662475586, + "rewards/rejected": -0.6000257730484009, + "sft_loss": 0.7759628891944885, + "step": 3445 + }, + { + "epoch": 0.27, + "grad_norm": 7.394799709320068, + "learning_rate": 8.392364293969802e-06, + "logits/chosen": -1.2691454887390137, + "logits/rejected": -0.830244243144989, + "logps/chosen": -1.0301793813705444, + "logps/rejected": -1.5118746757507324, + "loss": 1.0849, + "odds_ratio_loss": 0.547632098197937, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10301794111728668, + "rewards/margins": 0.04816952347755432, + "rewards/rejected": -0.151187464594841, + "sft_loss": 1.0301793813705444, + "step": 3450 + }, + { + "epoch": 0.27, + "grad_norm": 10.756146430969238, + "learning_rate": 8.387838214254787e-06, + "logits/chosen": -1.463841438293457, + "logits/rejected": -0.8923279047012329, + "logps/chosen": -1.1739076375961304, + "logps/rejected": -3.4276280403137207, + "loss": 1.232, + "odds_ratio_loss": 0.5810686945915222, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11739076673984528, + "rewards/margins": 0.2253720462322235, + "rewards/rejected": -0.34276282787323, + "sft_loss": 1.1739076375961304, + "step": 3455 + }, + { + "epoch": 0.27, + "grad_norm": 7.481341361999512, + "learning_rate": 8.383306996452984e-06, + "logits/chosen": -1.2822167873382568, + "logits/rejected": -1.111701250076294, + "logps/chosen": -0.847335934638977, + "logps/rejected": -1.5403449535369873, + "loss": 0.8853, + "odds_ratio_loss": 0.3792897164821625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08473359048366547, + "rewards/margins": 0.06930090487003326, + "rewards/rejected": -0.15403451025485992, + "sft_loss": 0.847335934638977, + "step": 3460 + }, + { + "epoch": 0.27, + "grad_norm": 59.56509017944336, + "learning_rate": 8.378770647436558e-06, + "logits/chosen": -1.377863883972168, + "logits/rejected": -1.1741714477539062, + "logps/chosen": -1.4446465969085693, + "logps/rejected": -3.021998882293701, + "loss": 1.4683, + "odds_ratio_loss": 0.2365054190158844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1444646567106247, + "rewards/margins": 0.15773524343967438, + "rewards/rejected": -0.3021999001502991, + "sft_loss": 1.4446465969085693, + "step": 3465 + }, + { + "epoch": 0.27, + "grad_norm": 7.632749080657959, + "learning_rate": 8.374229174085462e-06, + "logits/chosen": -1.4123557806015015, + "logits/rejected": -1.2731513977050781, + "logps/chosen": -1.1025629043579102, + "logps/rejected": -1.815313696861267, + "loss": 1.1558, + "odds_ratio_loss": 0.5323792695999146, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11025629192590714, + "rewards/margins": 0.07127507776021957, + "rewards/rejected": -0.1815313696861267, + "sft_loss": 1.1025629043579102, + "step": 3470 + }, + { + "epoch": 0.27, + "grad_norm": 6.778805255889893, + "learning_rate": 8.369682583287414e-06, + "logits/chosen": -1.2414586544036865, + "logits/rejected": -0.7293938398361206, + "logps/chosen": -0.9718669652938843, + "logps/rejected": -3.2794570922851562, + "loss": 1.0039, + "odds_ratio_loss": 0.32015174627304077, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09718669950962067, + "rewards/margins": 0.23075899481773376, + "rewards/rejected": -0.3279457092285156, + "sft_loss": 0.9718669652938843, + "step": 3475 + }, + { + "epoch": 0.27, + "grad_norm": 46.404876708984375, + "learning_rate": 8.365130881937897e-06, + "logits/chosen": -1.34323251247406, + "logits/rejected": -1.000976800918579, + "logps/chosen": -1.2657513618469238, + "logps/rejected": -1.1252360343933105, + "loss": 1.3463, + "odds_ratio_loss": 0.8055798411369324, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.12657514214515686, + "rewards/margins": -0.014051536098122597, + "rewards/rejected": -0.11252360045909882, + "sft_loss": 1.2657513618469238, + "step": 3480 + }, + { + "epoch": 0.27, + "grad_norm": 14.280263900756836, + "learning_rate": 8.360574076940143e-06, + "logits/chosen": -1.4023492336273193, + "logits/rejected": -1.4027321338653564, + "logps/chosen": -1.1064332723617554, + "logps/rejected": -2.0462806224823, + "loss": 1.1417, + "odds_ratio_loss": 0.35309427976608276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11064332723617554, + "rewards/margins": 0.09398476034402847, + "rewards/rejected": -0.2046280801296234, + "sft_loss": 1.1064332723617554, + "step": 3485 + }, + { + "epoch": 0.27, + "grad_norm": 14.520868301391602, + "learning_rate": 8.356012175205127e-06, + "logits/chosen": -1.339352011680603, + "logits/rejected": -0.829668402671814, + "logps/chosen": -1.151982069015503, + "logps/rejected": -2.6990561485290527, + "loss": 1.1743, + "odds_ratio_loss": 0.22352364659309387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11519820988178253, + "rewards/margins": 0.1547074317932129, + "rewards/rejected": -0.26990562677383423, + "sft_loss": 1.151982069015503, + "step": 3490 + }, + { + "epoch": 0.27, + "grad_norm": 6.273999214172363, + "learning_rate": 8.351445183651552e-06, + "logits/chosen": -1.5008800029754639, + "logits/rejected": -1.093679666519165, + "logps/chosen": -0.9768564105033875, + "logps/rejected": -5.830966472625732, + "loss": 0.9964, + "odds_ratio_loss": 0.19528701901435852, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09768564254045486, + "rewards/margins": 0.48541101813316345, + "rewards/rejected": -0.5830966234207153, + "sft_loss": 0.9768564105033875, + "step": 3495 + }, + { + "epoch": 0.27, + "grad_norm": 11.410566329956055, + "learning_rate": 8.34687310920584e-06, + "logits/chosen": -1.493222951889038, + "logits/rejected": -1.0385332107543945, + "logps/chosen": -1.2644648551940918, + "logps/rejected": -2.5637965202331543, + "loss": 1.2901, + "odds_ratio_loss": 0.2563869059085846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12644650042057037, + "rewards/margins": 0.129933163523674, + "rewards/rejected": -0.2563796639442444, + "sft_loss": 1.2644648551940918, + "step": 3500 + }, + { + "epoch": 0.27, + "grad_norm": 7.278417110443115, + "learning_rate": 8.34229595880212e-06, + "logits/chosen": -1.4679405689239502, + "logits/rejected": -0.689369797706604, + "logps/chosen": -1.1459770202636719, + "logps/rejected": -2.5427379608154297, + "loss": 1.1695, + "odds_ratio_loss": 0.23483486473560333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11459771543741226, + "rewards/margins": 0.13967609405517578, + "rewards/rejected": -0.25427383184432983, + "sft_loss": 1.1459770202636719, + "step": 3505 + }, + { + "epoch": 0.27, + "grad_norm": 9.49854564666748, + "learning_rate": 8.337713739382224e-06, + "logits/chosen": -1.3915735483169556, + "logits/rejected": -1.2456294298171997, + "logps/chosen": -1.202357530593872, + "logps/rejected": -5.146681785583496, + "loss": 1.2341, + "odds_ratio_loss": 0.3172472417354584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12023575603961945, + "rewards/margins": 0.3944324851036072, + "rewards/rejected": -0.5146682858467102, + "sft_loss": 1.202357530593872, + "step": 3510 + }, + { + "epoch": 0.27, + "grad_norm": 7.210588455200195, + "learning_rate": 8.333126457895673e-06, + "logits/chosen": -1.4028444290161133, + "logits/rejected": -0.9360781908035278, + "logps/chosen": -1.0781782865524292, + "logps/rejected": -2.235292911529541, + "loss": 1.1052, + "odds_ratio_loss": 0.2698212265968323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10781782865524292, + "rewards/margins": 0.11571145057678223, + "rewards/rejected": -0.22352926433086395, + "sft_loss": 1.0781782865524292, + "step": 3515 + }, + { + "epoch": 0.27, + "grad_norm": 5.604741096496582, + "learning_rate": 8.328534121299654e-06, + "logits/chosen": -1.2505098581314087, + "logits/rejected": -0.5317830443382263, + "logps/chosen": -0.8969619870185852, + "logps/rejected": -2.316410779953003, + "loss": 0.9218, + "odds_ratio_loss": 0.2478959560394287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08969619125127792, + "rewards/margins": 0.14194490015506744, + "rewards/rejected": -0.23164109885692596, + "sft_loss": 0.8969619870185852, + "step": 3520 + }, + { + "epoch": 0.27, + "grad_norm": 57.90606689453125, + "learning_rate": 8.323936736559038e-06, + "logits/chosen": -1.0537947416305542, + "logits/rejected": -1.2932230234146118, + "logps/chosen": -2.3485894203186035, + "logps/rejected": -2.0282821655273438, + "loss": 2.4812, + "odds_ratio_loss": 1.3260843753814697, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.23485895991325378, + "rewards/margins": -0.03203073889017105, + "rewards/rejected": -0.20282821357250214, + "sft_loss": 2.3485894203186035, + "step": 3525 + }, + { + "epoch": 0.27, + "grad_norm": 6.8659539222717285, + "learning_rate": 8.319334310646335e-06, + "logits/chosen": -1.393730878829956, + "logits/rejected": -0.7838658690452576, + "logps/chosen": -1.2052452564239502, + "logps/rejected": -1.4900472164154053, + "loss": 1.27, + "odds_ratio_loss": 0.6480141878128052, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12052451074123383, + "rewards/margins": 0.02848021313548088, + "rewards/rejected": -0.149004727602005, + "sft_loss": 1.2052452564239502, + "step": 3530 + }, + { + "epoch": 0.27, + "grad_norm": 5.410351276397705, + "learning_rate": 8.31472685054171e-06, + "logits/chosen": -1.2084213495254517, + "logits/rejected": -1.0660079717636108, + "logps/chosen": -1.127166986465454, + "logps/rejected": -2.948442220687866, + "loss": 1.1443, + "odds_ratio_loss": 0.17111781239509583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11271669715642929, + "rewards/margins": 0.18212752044200897, + "rewards/rejected": -0.29484421014785767, + "sft_loss": 1.127166986465454, + "step": 3535 + }, + { + "epoch": 0.28, + "grad_norm": 313.609130859375, + "learning_rate": 8.310114363232961e-06, + "logits/chosen": -1.1509945392608643, + "logits/rejected": -1.0163276195526123, + "logps/chosen": -1.3601996898651123, + "logps/rejected": -5.512236595153809, + "loss": 1.3712, + "odds_ratio_loss": 0.10965617001056671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1360199749469757, + "rewards/margins": 0.415203720331192, + "rewards/rejected": -0.551223635673523, + "sft_loss": 1.3601996898651123, + "step": 3540 + }, + { + "epoch": 0.28, + "grad_norm": 43.61051940917969, + "learning_rate": 8.305496855715515e-06, + "logits/chosen": -1.3698691129684448, + "logits/rejected": -1.2788420915603638, + "logps/chosen": -0.764731764793396, + "logps/rejected": -3.356771469116211, + "loss": 0.779, + "odds_ratio_loss": 0.14261171221733093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0764731839299202, + "rewards/margins": 0.2592040002346039, + "rewards/rejected": -0.3356771767139435, + "sft_loss": 0.764731764793396, + "step": 3545 + }, + { + "epoch": 0.28, + "grad_norm": 24.839813232421875, + "learning_rate": 8.300874334992404e-06, + "logits/chosen": -1.3410618305206299, + "logits/rejected": -1.0158249139785767, + "logps/chosen": -0.9953804016113281, + "logps/rejected": -3.262022018432617, + "loss": 1.0184, + "odds_ratio_loss": 0.2305566519498825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09953804314136505, + "rewards/margins": 0.22666415572166443, + "rewards/rejected": -0.3262022137641907, + "sft_loss": 0.9953804016113281, + "step": 3550 + }, + { + "epoch": 0.28, + "grad_norm": 26.78093719482422, + "learning_rate": 8.296246808074268e-06, + "logits/chosen": -1.4354108572006226, + "logits/rejected": -1.1755014657974243, + "logps/chosen": -0.6938873529434204, + "logps/rejected": -6.733367919921875, + "loss": 0.7041, + "odds_ratio_loss": 0.10260869562625885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0693887323141098, + "rewards/margins": 0.6039480566978455, + "rewards/rejected": -0.6733368039131165, + "sft_loss": 0.6938873529434204, + "step": 3555 + }, + { + "epoch": 0.28, + "grad_norm": 7.194347381591797, + "learning_rate": 8.291614281979339e-06, + "logits/chosen": -1.406665563583374, + "logits/rejected": -1.10782790184021, + "logps/chosen": -1.9012788534164429, + "logps/rejected": -1.8412885665893555, + "loss": 2.0278, + "odds_ratio_loss": 1.265174150466919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.190127894282341, + "rewards/margins": -0.005999034736305475, + "rewards/rejected": -0.18412885069847107, + "sft_loss": 1.9012788534164429, + "step": 3560 + }, + { + "epoch": 0.28, + "grad_norm": 28.549922943115234, + "learning_rate": 8.286976763733433e-06, + "logits/chosen": -1.2915570735931396, + "logits/rejected": -0.8109747171401978, + "logps/chosen": -1.170531153678894, + "logps/rejected": -2.9936161041259766, + "loss": 1.2096, + "odds_ratio_loss": 0.39067792892456055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11705312877893448, + "rewards/margins": 0.18230850994586945, + "rewards/rejected": -0.2993616461753845, + "sft_loss": 1.170531153678894, + "step": 3565 + }, + { + "epoch": 0.28, + "grad_norm": 28.323139190673828, + "learning_rate": 8.282334260369934e-06, + "logits/chosen": -1.463327407836914, + "logits/rejected": -1.1677974462509155, + "logps/chosen": -1.2482119798660278, + "logps/rejected": -4.510406017303467, + "loss": 1.285, + "odds_ratio_loss": 0.3678421378135681, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12482120841741562, + "rewards/margins": 0.32621946930885315, + "rewards/rejected": -0.451040655374527, + "sft_loss": 1.2482119798660278, + "step": 3570 + }, + { + "epoch": 0.28, + "grad_norm": 43.18751907348633, + "learning_rate": 8.277686778929786e-06, + "logits/chosen": -1.5234771966934204, + "logits/rejected": -1.2108821868896484, + "logps/chosen": -1.2944618463516235, + "logps/rejected": -4.193190097808838, + "loss": 1.3536, + "odds_ratio_loss": 0.5915259122848511, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12944617867469788, + "rewards/margins": 0.28987282514572144, + "rewards/rejected": -0.4193190634250641, + "sft_loss": 1.2944618463516235, + "step": 3575 + }, + { + "epoch": 0.28, + "grad_norm": 26.376367568969727, + "learning_rate": 8.273034326461489e-06, + "logits/chosen": -1.3317598104476929, + "logits/rejected": -0.8073261976242065, + "logps/chosen": -1.051276445388794, + "logps/rejected": -1.787710428237915, + "loss": 1.0856, + "odds_ratio_loss": 0.34296557307243347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10512763261795044, + "rewards/margins": 0.07364340126514435, + "rewards/rejected": -0.1787710338830948, + "sft_loss": 1.051276445388794, + "step": 3580 + }, + { + "epoch": 0.28, + "grad_norm": 23.10160255432129, + "learning_rate": 8.268376910021075e-06, + "logits/chosen": -1.2411911487579346, + "logits/rejected": -1.3177855014801025, + "logps/chosen": -0.8805097341537476, + "logps/rejected": -1.6542412042617798, + "loss": 0.9218, + "odds_ratio_loss": 0.41270628571510315, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.088050976395607, + "rewards/margins": 0.07737313210964203, + "rewards/rejected": -0.16542412340641022, + "sft_loss": 0.8805097341537476, + "step": 3585 + }, + { + "epoch": 0.28, + "grad_norm": 171.03265380859375, + "learning_rate": 8.263714536672105e-06, + "logits/chosen": -1.3680837154388428, + "logits/rejected": -0.7903397679328918, + "logps/chosen": -1.1044056415557861, + "logps/rejected": -3.1359059810638428, + "loss": 1.1217, + "odds_ratio_loss": 0.17297251522541046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11044056713581085, + "rewards/margins": 0.20315006375312805, + "rewards/rejected": -0.3135906159877777, + "sft_loss": 1.1044056415557861, + "step": 3590 + }, + { + "epoch": 0.28, + "grad_norm": 25.135984420776367, + "learning_rate": 8.259047213485664e-06, + "logits/chosen": -1.1868219375610352, + "logits/rejected": -1.4525445699691772, + "logps/chosen": -1.0119667053222656, + "logps/rejected": -1.5798547267913818, + "loss": 1.0506, + "odds_ratio_loss": 0.38638943433761597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10119666904211044, + "rewards/margins": 0.056788813322782516, + "rewards/rejected": -0.15798547863960266, + "sft_loss": 1.0119667053222656, + "step": 3595 + }, + { + "epoch": 0.28, + "grad_norm": 7.753753185272217, + "learning_rate": 8.25437494754034e-06, + "logits/chosen": -1.278234839439392, + "logits/rejected": -1.153738021850586, + "logps/chosen": -0.9590956568717957, + "logps/rejected": -1.9769725799560547, + "loss": 0.9844, + "odds_ratio_loss": 0.25261688232421875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09590956568717957, + "rewards/margins": 0.10178768634796143, + "rewards/rejected": -0.197697252035141, + "sft_loss": 0.9590956568717957, + "step": 3600 + }, + { + "epoch": 0.28, + "grad_norm": 9.523035049438477, + "learning_rate": 8.249697745922216e-06, + "logits/chosen": -1.2500884532928467, + "logits/rejected": -1.2859524488449097, + "logps/chosen": -1.1764044761657715, + "logps/rejected": -3.144897937774658, + "loss": 1.2349, + "odds_ratio_loss": 0.5853737592697144, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11764045059680939, + "rewards/margins": 0.19684937596321106, + "rewards/rejected": -0.31448981165885925, + "sft_loss": 1.1764044761657715, + "step": 3605 + }, + { + "epoch": 0.28, + "grad_norm": 7.400860786437988, + "learning_rate": 8.245015615724862e-06, + "logits/chosen": -1.287461280822754, + "logits/rejected": -0.8889628648757935, + "logps/chosen": -1.0676617622375488, + "logps/rejected": -1.123071551322937, + "loss": 1.1391, + "odds_ratio_loss": 0.7140272855758667, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10676617920398712, + "rewards/margins": 0.00554098654538393, + "rewards/rejected": -0.11230716854333878, + "sft_loss": 1.0676617622375488, + "step": 3610 + }, + { + "epoch": 0.28, + "grad_norm": 15.867348670959473, + "learning_rate": 8.240328564049326e-06, + "logits/chosen": -1.1854329109191895, + "logits/rejected": -0.8505358695983887, + "logps/chosen": -1.1343297958374023, + "logps/rejected": -1.3808737993240356, + "loss": 1.1879, + "odds_ratio_loss": 0.5355452299118042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11343298107385635, + "rewards/margins": 0.024654392153024673, + "rewards/rejected": -0.13808736205101013, + "sft_loss": 1.1343297958374023, + "step": 3615 + }, + { + "epoch": 0.28, + "grad_norm": 11.455201148986816, + "learning_rate": 8.235636598004112e-06, + "logits/chosen": -1.4110326766967773, + "logits/rejected": -1.1761510372161865, + "logps/chosen": -1.0385137796401978, + "logps/rejected": -1.8720951080322266, + "loss": 1.0757, + "odds_ratio_loss": 0.37180137634277344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10385137796401978, + "rewards/margins": 0.08335815370082855, + "rewards/rejected": -0.18720951676368713, + "sft_loss": 1.0385137796401978, + "step": 3620 + }, + { + "epoch": 0.28, + "grad_norm": 5.2311811447143555, + "learning_rate": 8.230939724705185e-06, + "logits/chosen": -1.3805992603302002, + "logits/rejected": -0.5798208713531494, + "logps/chosen": -1.0771076679229736, + "logps/rejected": -1.4178438186645508, + "loss": 1.1278, + "odds_ratio_loss": 0.5071910619735718, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10771076381206512, + "rewards/margins": 0.0340736024081707, + "rewards/rejected": -0.14178438484668732, + "sft_loss": 1.0771076679229736, + "step": 3625 + }, + { + "epoch": 0.28, + "grad_norm": 14.495864868164062, + "learning_rate": 8.226237951275951e-06, + "logits/chosen": -1.4785919189453125, + "logits/rejected": -0.8613845705986023, + "logps/chosen": -0.7386495471000671, + "logps/rejected": -1.0128322839736938, + "loss": 0.7915, + "odds_ratio_loss": 0.528670072555542, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07386495918035507, + "rewards/margins": 0.027418266981840134, + "rewards/rejected": -0.1012832298874855, + "sft_loss": 0.7386495471000671, + "step": 3630 + }, + { + "epoch": 0.28, + "grad_norm": 6.944512844085693, + "learning_rate": 8.221531284847242e-06, + "logits/chosen": -1.2884480953216553, + "logits/rejected": -0.8778683543205261, + "logps/chosen": -0.7235269546508789, + "logps/rejected": -1.7379268407821655, + "loss": 0.7619, + "odds_ratio_loss": 0.38415372371673584, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07235269248485565, + "rewards/margins": 0.10144001245498657, + "rewards/rejected": -0.17379269003868103, + "sft_loss": 0.7235269546508789, + "step": 3635 + }, + { + "epoch": 0.28, + "grad_norm": 22.363006591796875, + "learning_rate": 8.21681973255732e-06, + "logits/chosen": -1.457991123199463, + "logits/rejected": -0.9291761517524719, + "logps/chosen": -1.1872960329055786, + "logps/rejected": -1.5329262018203735, + "loss": 1.2416, + "odds_ratio_loss": 0.543427586555481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1187296062707901, + "rewards/margins": 0.03456302359700203, + "rewards/rejected": -0.15329262614250183, + "sft_loss": 1.1872960329055786, + "step": 3640 + }, + { + "epoch": 0.28, + "grad_norm": 10.626898765563965, + "learning_rate": 8.212103301551851e-06, + "logits/chosen": -1.3206452131271362, + "logits/rejected": -0.890534520149231, + "logps/chosen": -1.128233551979065, + "logps/rejected": -1.826210379600525, + "loss": 1.1681, + "odds_ratio_loss": 0.3984006345272064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11282335221767426, + "rewards/margins": 0.06979767978191376, + "rewards/rejected": -0.182621031999588, + "sft_loss": 1.128233551979065, + "step": 3645 + }, + { + "epoch": 0.28, + "grad_norm": 8.859610557556152, + "learning_rate": 8.207381998983897e-06, + "logits/chosen": -1.4563844203948975, + "logits/rejected": -0.7494536638259888, + "logps/chosen": -1.0328868627548218, + "logps/rejected": -2.5515682697296143, + "loss": 1.0563, + "odds_ratio_loss": 0.23427622020244598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1032886877655983, + "rewards/margins": 0.15186813473701477, + "rewards/rejected": -0.25515681505203247, + "sft_loss": 1.0328868627548218, + "step": 3650 + }, + { + "epoch": 0.28, + "grad_norm": 9.665281295776367, + "learning_rate": 8.202655832013919e-06, + "logits/chosen": -1.413213849067688, + "logits/rejected": -1.1829755306243896, + "logps/chosen": -0.9151542782783508, + "logps/rejected": -5.067257404327393, + "loss": 0.9737, + "odds_ratio_loss": 0.5854582786560059, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0915154367685318, + "rewards/margins": 0.4152103364467621, + "rewards/rejected": -0.5067256689071655, + "sft_loss": 0.9151542782783508, + "step": 3655 + }, + { + "epoch": 0.28, + "grad_norm": 19.42445945739746, + "learning_rate": 8.197924807809747e-06, + "logits/chosen": -1.3678677082061768, + "logits/rejected": -1.3623011112213135, + "logps/chosen": -1.2780225276947021, + "logps/rejected": -1.6961390972137451, + "loss": 1.3304, + "odds_ratio_loss": 0.5240920186042786, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12780225276947021, + "rewards/margins": 0.04181166738271713, + "rewards/rejected": -0.16961391270160675, + "sft_loss": 1.2780225276947021, + "step": 3660 + }, + { + "epoch": 0.29, + "grad_norm": 34.14183044433594, + "learning_rate": 8.193188933546579e-06, + "logits/chosen": -1.334039330482483, + "logits/rejected": -1.05949866771698, + "logps/chosen": -0.9566957354545593, + "logps/rejected": -4.501104831695557, + "loss": 0.9723, + "odds_ratio_loss": 0.15578912198543549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09566958248615265, + "rewards/margins": 0.35444092750549316, + "rewards/rejected": -0.450110524892807, + "sft_loss": 0.9566957354545593, + "step": 3665 + }, + { + "epoch": 0.29, + "grad_norm": 30.650644302368164, + "learning_rate": 8.188448216406971e-06, + "logits/chosen": -1.3179060220718384, + "logits/rejected": -1.1395162343978882, + "logps/chosen": -1.471545934677124, + "logps/rejected": -2.721754550933838, + "loss": 1.53, + "odds_ratio_loss": 0.5845457315444946, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14715459942817688, + "rewards/margins": 0.12502089142799377, + "rewards/rejected": -0.27217546105384827, + "sft_loss": 1.471545934677124, + "step": 3670 + }, + { + "epoch": 0.29, + "grad_norm": 10.528597831726074, + "learning_rate": 8.183702663580822e-06, + "logits/chosen": -1.3393423557281494, + "logits/rejected": -1.333606481552124, + "logps/chosen": -1.123405933380127, + "logps/rejected": -4.076834678649902, + "loss": 1.1327, + "odds_ratio_loss": 0.09317772090435028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11234060674905777, + "rewards/margins": 0.2953428626060486, + "rewards/rejected": -0.40768346190452576, + "sft_loss": 1.123405933380127, + "step": 3675 + }, + { + "epoch": 0.29, + "grad_norm": 271.1390075683594, + "learning_rate": 8.178952282265364e-06, + "logits/chosen": -1.3709100484848022, + "logits/rejected": -1.0029327869415283, + "logps/chosen": -1.405590295791626, + "logps/rejected": -2.507824420928955, + "loss": 1.4632, + "odds_ratio_loss": 0.5756229758262634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14055903255939484, + "rewards/margins": 0.1102234274148941, + "rewards/rejected": -0.25078245997428894, + "sft_loss": 1.405590295791626, + "step": 3680 + }, + { + "epoch": 0.29, + "grad_norm": 57.85871124267578, + "learning_rate": 8.174197079665153e-06, + "logits/chosen": -1.3550955057144165, + "logits/rejected": -0.8531472086906433, + "logps/chosen": -1.132922649383545, + "logps/rejected": -3.7029755115509033, + "loss": 1.1647, + "odds_ratio_loss": 0.3179894983768463, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11329226195812225, + "rewards/margins": 0.2570053040981293, + "rewards/rejected": -0.37029752135276794, + "sft_loss": 1.132922649383545, + "step": 3685 + }, + { + "epoch": 0.29, + "grad_norm": 8.095747947692871, + "learning_rate": 8.169437062992061e-06, + "logits/chosen": -1.401653528213501, + "logits/rejected": -0.91447913646698, + "logps/chosen": -0.8620834350585938, + "logps/rejected": -2.502750873565674, + "loss": 0.9076, + "odds_ratio_loss": 0.45480185747146606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08620833605527878, + "rewards/margins": 0.16406671702861786, + "rewards/rejected": -0.2502750754356384, + "sft_loss": 0.8620834350585938, + "step": 3690 + }, + { + "epoch": 0.29, + "grad_norm": 5.202536106109619, + "learning_rate": 8.164672239465254e-06, + "logits/chosen": -1.190614938735962, + "logits/rejected": -0.9084379076957703, + "logps/chosen": -0.8757231831550598, + "logps/rejected": -1.483604073524475, + "loss": 0.9161, + "odds_ratio_loss": 0.4041404128074646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08757232129573822, + "rewards/margins": 0.060788094997406006, + "rewards/rejected": -0.14836041629314423, + "sft_loss": 0.8757231831550598, + "step": 3695 + }, + { + "epoch": 0.29, + "grad_norm": 25.508304595947266, + "learning_rate": 8.159902616311195e-06, + "logits/chosen": -1.2143833637237549, + "logits/rejected": -1.231711745262146, + "logps/chosen": -1.2228883504867554, + "logps/rejected": -1.3752474784851074, + "loss": 1.2946, + "odds_ratio_loss": 0.7171187400817871, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12228883802890778, + "rewards/margins": 0.015235906466841698, + "rewards/rejected": -0.13752475380897522, + "sft_loss": 1.2228883504867554, + "step": 3700 + }, + { + "epoch": 0.29, + "grad_norm": 13.769211769104004, + "learning_rate": 8.155128200763623e-06, + "logits/chosen": -1.255629539489746, + "logits/rejected": -0.8488641977310181, + "logps/chosen": -1.5895591974258423, + "logps/rejected": -1.4509578943252563, + "loss": 1.6858, + "odds_ratio_loss": 0.9623721837997437, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15895593166351318, + "rewards/margins": -0.013860151171684265, + "rewards/rejected": -0.14509578049182892, + "sft_loss": 1.5895591974258423, + "step": 3705 + }, + { + "epoch": 0.29, + "grad_norm": 7.28419303894043, + "learning_rate": 8.15034900006354e-06, + "logits/chosen": -1.1626012325286865, + "logits/rejected": -0.8984651565551758, + "logps/chosen": -1.3513168096542358, + "logps/rejected": -1.5219471454620361, + "loss": 1.4389, + "odds_ratio_loss": 0.8761366009712219, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13513168692588806, + "rewards/margins": 0.017063047736883163, + "rewards/rejected": -0.15219472348690033, + "sft_loss": 1.3513168096542358, + "step": 3710 + }, + { + "epoch": 0.29, + "grad_norm": 7.7843708992004395, + "learning_rate": 8.145565021459217e-06, + "logits/chosen": -1.2671594619750977, + "logits/rejected": -0.9397374987602234, + "logps/chosen": -1.4201005697250366, + "logps/rejected": -1.5774786472320557, + "loss": 1.4891, + "odds_ratio_loss": 0.6898049116134644, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14201006293296814, + "rewards/margins": 0.015737801790237427, + "rewards/rejected": -0.15774787962436676, + "sft_loss": 1.4201005697250366, + "step": 3715 + }, + { + "epoch": 0.29, + "grad_norm": 18.914749145507812, + "learning_rate": 8.140776272206161e-06, + "logits/chosen": -1.2724708318710327, + "logits/rejected": -1.3933824300765991, + "logps/chosen": -0.5651779174804688, + "logps/rejected": -2.1759657859802246, + "loss": 0.6014, + "odds_ratio_loss": 0.3619091808795929, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.056517791002988815, + "rewards/margins": 0.1610788106918335, + "rewards/rejected": -0.2175966054201126, + "sft_loss": 0.5651779174804688, + "step": 3720 + }, + { + "epoch": 0.29, + "grad_norm": 6.794033050537109, + "learning_rate": 8.135982759567121e-06, + "logits/chosen": -1.3524110317230225, + "logits/rejected": -0.882132887840271, + "logps/chosen": -0.7997108697891235, + "logps/rejected": -1.584331750869751, + "loss": 0.832, + "odds_ratio_loss": 0.32256340980529785, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07997108995914459, + "rewards/margins": 0.07846207171678543, + "rewards/rejected": -0.15843316912651062, + "sft_loss": 0.7997108697891235, + "step": 3725 + }, + { + "epoch": 0.29, + "grad_norm": 31.65180015563965, + "learning_rate": 8.131184490812064e-06, + "logits/chosen": -1.186091661453247, + "logits/rejected": -0.8857123255729675, + "logps/chosen": -1.0445191860198975, + "logps/rejected": -4.812319278717041, + "loss": 1.0873, + "odds_ratio_loss": 0.4277670979499817, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10445191711187363, + "rewards/margins": 0.3767800033092499, + "rewards/rejected": -0.4812319278717041, + "sft_loss": 1.0445191860198975, + "step": 3730 + }, + { + "epoch": 0.29, + "grad_norm": 7.225113868713379, + "learning_rate": 8.126381473218179e-06, + "logits/chosen": -1.2817249298095703, + "logits/rejected": -1.080437421798706, + "logps/chosen": -1.0402967929840088, + "logps/rejected": -5.129024982452393, + "loss": 1.0936, + "odds_ratio_loss": 0.5332490801811218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10402967035770416, + "rewards/margins": 0.4088728427886963, + "rewards/rejected": -0.5129024982452393, + "sft_loss": 1.0402967929840088, + "step": 3735 + }, + { + "epoch": 0.29, + "grad_norm": 17.7869873046875, + "learning_rate": 8.121573714069848e-06, + "logits/chosen": -1.3492562770843506, + "logits/rejected": -0.9287340044975281, + "logps/chosen": -0.6982876658439636, + "logps/rejected": -1.885575532913208, + "loss": 0.7366, + "odds_ratio_loss": 0.3826819360256195, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06982876360416412, + "rewards/margins": 0.11872879415750504, + "rewards/rejected": -0.18855755031108856, + "sft_loss": 0.6982876658439636, + "step": 3740 + }, + { + "epoch": 0.29, + "grad_norm": 8.037618637084961, + "learning_rate": 8.116761220658649e-06, + "logits/chosen": -1.4126262664794922, + "logits/rejected": -1.4442805051803589, + "logps/chosen": -0.7851252555847168, + "logps/rejected": -1.5594263076782227, + "loss": 0.8182, + "odds_ratio_loss": 0.33029070496559143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0785125270485878, + "rewards/margins": 0.0774301066994667, + "rewards/rejected": -0.1559426337480545, + "sft_loss": 0.7851252555847168, + "step": 3745 + }, + { + "epoch": 0.29, + "grad_norm": 9.552249908447266, + "learning_rate": 8.111944000283339e-06, + "logits/chosen": -1.382643461227417, + "logits/rejected": -1.1251006126403809, + "logps/chosen": -0.8634234666824341, + "logps/rejected": -2.6441121101379395, + "loss": 0.8849, + "odds_ratio_loss": 0.21498659253120422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08634234964847565, + "rewards/margins": 0.17806890606880188, + "rewards/rejected": -0.26441124081611633, + "sft_loss": 0.8634234666824341, + "step": 3750 + }, + { + "epoch": 0.29, + "grad_norm": 30.744495391845703, + "learning_rate": 8.107122060249846e-06, + "logits/chosen": -1.0354880094528198, + "logits/rejected": -0.7640705108642578, + "logps/chosen": -1.2475351095199585, + "logps/rejected": -1.5779125690460205, + "loss": 1.304, + "odds_ratio_loss": 0.5643216371536255, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12475351989269257, + "rewards/margins": 0.03303774073719978, + "rewards/rejected": -0.15779125690460205, + "sft_loss": 1.2475351095199585, + "step": 3755 + }, + { + "epoch": 0.29, + "grad_norm": 17.847753524780273, + "learning_rate": 8.102295407871252e-06, + "logits/chosen": -1.40513014793396, + "logits/rejected": -1.1785567998886108, + "logps/chosen": -0.9754989743232727, + "logps/rejected": -3.8326869010925293, + "loss": 1.0581, + "odds_ratio_loss": 0.825912594795227, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09754989296197891, + "rewards/margins": 0.28571879863739014, + "rewards/rejected": -0.38326868414878845, + "sft_loss": 0.9754989743232727, + "step": 3760 + }, + { + "epoch": 0.29, + "grad_norm": 14.9716215133667, + "learning_rate": 8.097464050467788e-06, + "logits/chosen": -1.2403470277786255, + "logits/rejected": -0.671155571937561, + "logps/chosen": -1.0241820812225342, + "logps/rejected": -2.359799861907959, + "loss": 1.0488, + "odds_ratio_loss": 0.24615927040576935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10241822153329849, + "rewards/margins": 0.13356177508831024, + "rewards/rejected": -0.23597998917102814, + "sft_loss": 1.0241820812225342, + "step": 3765 + }, + { + "epoch": 0.29, + "grad_norm": 7.668431758880615, + "learning_rate": 8.092627995366824e-06, + "logits/chosen": -1.3898518085479736, + "logits/rejected": -0.8233474493026733, + "logps/chosen": -0.8895101547241211, + "logps/rejected": -1.942580223083496, + "loss": 0.9232, + "odds_ratio_loss": 0.3369949758052826, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08895101398229599, + "rewards/margins": 0.10530702769756317, + "rewards/rejected": -0.19425803422927856, + "sft_loss": 0.8895101547241211, + "step": 3770 + }, + { + "epoch": 0.29, + "grad_norm": 34.52479553222656, + "learning_rate": 8.08778724990285e-06, + "logits/chosen": -1.2432185411453247, + "logits/rejected": -1.1982190608978271, + "logps/chosen": -1.1835172176361084, + "logps/rejected": -1.8407405614852905, + "loss": 1.2383, + "odds_ratio_loss": 0.5474838018417358, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11835173517465591, + "rewards/margins": 0.06572232395410538, + "rewards/rejected": -0.1840740442276001, + "sft_loss": 1.1835172176361084, + "step": 3775 + }, + { + "epoch": 0.29, + "grad_norm": 7.304516315460205, + "learning_rate": 8.082941821417469e-06, + "logits/chosen": -1.2247803211212158, + "logits/rejected": -0.9323280453681946, + "logps/chosen": -1.0696570873260498, + "logps/rejected": -3.683121919631958, + "loss": 1.0933, + "odds_ratio_loss": 0.23642174899578094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10696570575237274, + "rewards/margins": 0.2613464891910553, + "rewards/rejected": -0.36831218004226685, + "sft_loss": 1.0696570873260498, + "step": 3780 + }, + { + "epoch": 0.29, + "grad_norm": 15.417840957641602, + "learning_rate": 8.07809171725939e-06, + "logits/chosen": -1.3001465797424316, + "logits/rejected": -0.7907955646514893, + "logps/chosen": -1.1750476360321045, + "logps/rejected": -1.312461018562317, + "loss": 1.2469, + "odds_ratio_loss": 0.7185543179512024, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11750476062297821, + "rewards/margins": 0.013741342350840569, + "rewards/rejected": -0.13124610483646393, + "sft_loss": 1.1750476360321045, + "step": 3785 + }, + { + "epoch": 0.29, + "grad_norm": 8.207767486572266, + "learning_rate": 8.073236944784415e-06, + "logits/chosen": -1.3569071292877197, + "logits/rejected": -1.307720422744751, + "logps/chosen": -1.0300140380859375, + "logps/rejected": -2.8602893352508545, + "loss": 1.0538, + "odds_ratio_loss": 0.23768818378448486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10300140082836151, + "rewards/margins": 0.18302753567695618, + "rewards/rejected": -0.2860289514064789, + "sft_loss": 1.0300140380859375, + "step": 3790 + }, + { + "epoch": 0.3, + "grad_norm": 10.38357925415039, + "learning_rate": 8.068377511355418e-06, + "logits/chosen": -1.2362945079803467, + "logits/rejected": -1.1578267812728882, + "logps/chosen": -1.007973074913025, + "logps/rejected": -1.5636718273162842, + "loss": 1.054, + "odds_ratio_loss": 0.46053171157836914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10079731792211533, + "rewards/margins": 0.05556987598538399, + "rewards/rejected": -0.15636718273162842, + "sft_loss": 1.007973074913025, + "step": 3795 + }, + { + "epoch": 0.3, + "grad_norm": 12.893205642700195, + "learning_rate": 8.063513424342348e-06, + "logits/chosen": -1.183700680732727, + "logits/rejected": -1.1352441310882568, + "logps/chosen": -1.0947539806365967, + "logps/rejected": -3.030123472213745, + "loss": 1.1066, + "odds_ratio_loss": 0.11800430715084076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10947538912296295, + "rewards/margins": 0.19353696703910828, + "rewards/rejected": -0.3030123710632324, + "sft_loss": 1.0947539806365967, + "step": 3800 + }, + { + "epoch": 0.3, + "grad_norm": 10.757180213928223, + "learning_rate": 8.058644691122211e-06, + "logits/chosen": -1.3270838260650635, + "logits/rejected": -0.9902740716934204, + "logps/chosen": -0.880314826965332, + "logps/rejected": -2.9867103099823, + "loss": 0.8953, + "odds_ratio_loss": 0.14989587664604187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08803148567676544, + "rewards/margins": 0.21063955128192902, + "rewards/rejected": -0.29867103695869446, + "sft_loss": 0.880314826965332, + "step": 3805 + }, + { + "epoch": 0.3, + "grad_norm": 8.997293472290039, + "learning_rate": 8.053771319079061e-06, + "logits/chosen": -1.4080528020858765, + "logits/rejected": -0.9664722681045532, + "logps/chosen": -1.167017936706543, + "logps/rejected": -2.621018171310425, + "loss": 1.1972, + "odds_ratio_loss": 0.30143603682518005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11670179665088654, + "rewards/margins": 0.1454000174999237, + "rewards/rejected": -0.26210182905197144, + "sft_loss": 1.167017936706543, + "step": 3810 + }, + { + "epoch": 0.3, + "grad_norm": 9.430560111999512, + "learning_rate": 8.048893315603982e-06, + "logits/chosen": -1.350351095199585, + "logits/rejected": -0.7604211568832397, + "logps/chosen": -1.1834475994110107, + "logps/rejected": -5.396286964416504, + "loss": 1.2188, + "odds_ratio_loss": 0.35320568084716797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11834476888179779, + "rewards/margins": 0.42128387093544006, + "rewards/rejected": -0.5396286249160767, + "sft_loss": 1.1834475994110107, + "step": 3815 + }, + { + "epoch": 0.3, + "grad_norm": 13.44129467010498, + "learning_rate": 8.044010688095089e-06, + "logits/chosen": -1.2833950519561768, + "logits/rejected": -0.6708613634109497, + "logps/chosen": -1.1539959907531738, + "logps/rejected": -2.1586103439331055, + "loss": 1.196, + "odds_ratio_loss": 0.4198933243751526, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11539959907531738, + "rewards/margins": 0.100461445748806, + "rewards/rejected": -0.21586103737354279, + "sft_loss": 1.1539959907531738, + "step": 3820 + }, + { + "epoch": 0.3, + "grad_norm": 7.030882358551025, + "learning_rate": 8.039123443957503e-06, + "logits/chosen": -1.4119694232940674, + "logits/rejected": -0.6684118509292603, + "logps/chosen": -0.9586877822875977, + "logps/rejected": -3.244650363922119, + "loss": 0.9943, + "odds_ratio_loss": 0.35587188601493835, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.095868781208992, + "rewards/margins": 0.2285962849855423, + "rewards/rejected": -0.3244650661945343, + "sft_loss": 0.9586877822875977, + "step": 3825 + }, + { + "epoch": 0.3, + "grad_norm": 5.365910530090332, + "learning_rate": 8.034231590603355e-06, + "logits/chosen": -1.3894189596176147, + "logits/rejected": -1.2239296436309814, + "logps/chosen": -1.6029672622680664, + "logps/rejected": -5.419643878936768, + "loss": 1.6304, + "odds_ratio_loss": 0.2738359570503235, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1602967083454132, + "rewards/margins": 0.3816676735877991, + "rewards/rejected": -0.5419644117355347, + "sft_loss": 1.6029672622680664, + "step": 3830 + }, + { + "epoch": 0.3, + "grad_norm": 27.208940505981445, + "learning_rate": 8.029335135451756e-06, + "logits/chosen": -1.4428378343582153, + "logits/rejected": -1.114614725112915, + "logps/chosen": -0.8311885595321655, + "logps/rejected": -1.4355841875076294, + "loss": 0.8732, + "odds_ratio_loss": 0.4203735291957855, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08311885595321655, + "rewards/margins": 0.060439564287662506, + "rewards/rejected": -0.14355841279029846, + "sft_loss": 0.8311885595321655, + "step": 3835 + }, + { + "epoch": 0.3, + "grad_norm": 12.32146167755127, + "learning_rate": 8.024434085928806e-06, + "logits/chosen": -1.5364830493927002, + "logits/rejected": -1.250301718711853, + "logps/chosen": -0.8794133067131042, + "logps/rejected": -4.063249111175537, + "loss": 0.9059, + "odds_ratio_loss": 0.2653045356273651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08794133365154266, + "rewards/margins": 0.3183836042881012, + "rewards/rejected": -0.40632495284080505, + "sft_loss": 0.8794133067131042, + "step": 3840 + }, + { + "epoch": 0.3, + "grad_norm": 9.93622875213623, + "learning_rate": 8.019528449467566e-06, + "logits/chosen": -1.114686369895935, + "logits/rejected": -1.0291764736175537, + "logps/chosen": -0.8959754109382629, + "logps/rejected": -1.8957700729370117, + "loss": 0.9519, + "odds_ratio_loss": 0.5592709183692932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08959753811359406, + "rewards/margins": 0.0999794751405716, + "rewards/rejected": -0.18957701325416565, + "sft_loss": 0.8959754109382629, + "step": 3845 + }, + { + "epoch": 0.3, + "grad_norm": 8.939530372619629, + "learning_rate": 8.01461823350806e-06, + "logits/chosen": -1.2906373739242554, + "logits/rejected": -0.6907309293746948, + "logps/chosen": -0.8869168162345886, + "logps/rejected": -1.3133783340454102, + "loss": 0.9384, + "odds_ratio_loss": 0.515034556388855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08869167417287827, + "rewards/margins": 0.04264615476131439, + "rewards/rejected": -0.13133783638477325, + "sft_loss": 0.8869168162345886, + "step": 3850 + }, + { + "epoch": 0.3, + "grad_norm": 6.442677021026611, + "learning_rate": 8.009703445497252e-06, + "logits/chosen": -1.286516785621643, + "logits/rejected": -1.0029170513153076, + "logps/chosen": -1.1681463718414307, + "logps/rejected": -2.1528007984161377, + "loss": 1.1983, + "odds_ratio_loss": 0.30186447501182556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11681463569402695, + "rewards/margins": 0.0984654426574707, + "rewards/rejected": -0.21528008580207825, + "sft_loss": 1.1681463718414307, + "step": 3855 + }, + { + "epoch": 0.3, + "grad_norm": 6.5835981369018555, + "learning_rate": 8.004784092889043e-06, + "logits/chosen": -1.2807387113571167, + "logits/rejected": -0.9215117692947388, + "logps/chosen": -1.4025729894638062, + "logps/rejected": -3.206958770751953, + "loss": 1.4735, + "odds_ratio_loss": 0.7089608907699585, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14025728404521942, + "rewards/margins": 0.18043860793113708, + "rewards/rejected": -0.3206959068775177, + "sft_loss": 1.4025729894638062, + "step": 3860 + }, + { + "epoch": 0.3, + "grad_norm": 44.311912536621094, + "learning_rate": 7.999860183144251e-06, + "logits/chosen": -1.3083603382110596, + "logits/rejected": -0.9558000564575195, + "logps/chosen": -1.020721197128296, + "logps/rejected": -3.8038506507873535, + "loss": 1.0512, + "odds_ratio_loss": 0.30526265501976013, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10207213461399078, + "rewards/margins": 0.27831295132637024, + "rewards/rejected": -0.38038507103919983, + "sft_loss": 1.020721197128296, + "step": 3865 + }, + { + "epoch": 0.3, + "grad_norm": 16.797744750976562, + "learning_rate": 7.994931723730617e-06, + "logits/chosen": -1.3893539905548096, + "logits/rejected": -0.7881430387496948, + "logps/chosen": -1.0593688488006592, + "logps/rejected": -1.619405746459961, + "loss": 1.1074, + "odds_ratio_loss": 0.47998982667922974, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10593689978122711, + "rewards/margins": 0.056003689765930176, + "rewards/rejected": -0.16194060444831848, + "sft_loss": 1.0593688488006592, + "step": 3870 + }, + { + "epoch": 0.3, + "grad_norm": 7.763211727142334, + "learning_rate": 7.989998722122771e-06, + "logits/chosen": -1.3000319004058838, + "logits/rejected": -1.0927565097808838, + "logps/chosen": -1.4487582445144653, + "logps/rejected": -1.6110073328018188, + "loss": 1.5211, + "odds_ratio_loss": 0.7234418988227844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14487579464912415, + "rewards/margins": 0.016224917024374008, + "rewards/rejected": -0.16110071539878845, + "sft_loss": 1.4487582445144653, + "step": 3875 + }, + { + "epoch": 0.3, + "grad_norm": 11.750642776489258, + "learning_rate": 7.98506118580224e-06, + "logits/chosen": -1.2582849264144897, + "logits/rejected": -0.896298885345459, + "logps/chosen": -1.1716492176055908, + "logps/rejected": -2.2447562217712402, + "loss": 1.2204, + "odds_ratio_loss": 0.4879697859287262, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11716492474079132, + "rewards/margins": 0.1073107123374939, + "rewards/rejected": -0.2244756519794464, + "sft_loss": 1.1716492176055908, + "step": 3880 + }, + { + "epoch": 0.3, + "grad_norm": 9.925634384155273, + "learning_rate": 7.98011912225742e-06, + "logits/chosen": -1.3496580123901367, + "logits/rejected": -1.125966191291809, + "logps/chosen": -0.737195611000061, + "logps/rejected": -2.681872844696045, + "loss": 0.7503, + "odds_ratio_loss": 0.1313866823911667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0737195536494255, + "rewards/margins": 0.194467693567276, + "rewards/rejected": -0.2681872844696045, + "sft_loss": 0.737195611000061, + "step": 3885 + }, + { + "epoch": 0.3, + "grad_norm": 9.05740737915039, + "learning_rate": 7.975172538983583e-06, + "logits/chosen": -1.3671633005142212, + "logits/rejected": -0.9348441362380981, + "logps/chosen": -1.0503031015396118, + "logps/rejected": -4.125294208526611, + "loss": 1.0606, + "odds_ratio_loss": 0.10310007631778717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10503031313419342, + "rewards/margins": 0.30749911069869995, + "rewards/rejected": -0.4125294089317322, + "sft_loss": 1.0503031015396118, + "step": 3890 + }, + { + "epoch": 0.3, + "grad_norm": 9.612077713012695, + "learning_rate": 7.970221443482847e-06, + "logits/chosen": -1.3794662952423096, + "logits/rejected": -0.8071687817573547, + "logps/chosen": -1.513962745666504, + "logps/rejected": -3.9997737407684326, + "loss": 1.5482, + "odds_ratio_loss": 0.3426254689693451, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1513962745666504, + "rewards/margins": 0.24858109652996063, + "rewards/rejected": -0.3999773859977722, + "sft_loss": 1.513962745666504, + "step": 3895 + }, + { + "epoch": 0.3, + "grad_norm": 11.001276969909668, + "learning_rate": 7.965265843264178e-06, + "logits/chosen": -1.3838317394256592, + "logits/rejected": -1.0965882539749146, + "logps/chosen": -0.969623863697052, + "logps/rejected": -3.6703083515167236, + "loss": 1.003, + "odds_ratio_loss": 0.33327335119247437, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09696237742900848, + "rewards/margins": 0.2700684666633606, + "rewards/rejected": -0.3670308589935303, + "sft_loss": 0.969623863697052, + "step": 3900 + }, + { + "epoch": 0.3, + "grad_norm": 12.948336601257324, + "learning_rate": 7.960305745843374e-06, + "logits/chosen": -1.4322015047073364, + "logits/rejected": -0.9059526324272156, + "logps/chosen": -1.9296623468399048, + "logps/rejected": -3.271130084991455, + "loss": 1.9571, + "odds_ratio_loss": 0.27451637387275696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1929662525653839, + "rewards/margins": 0.1341467797756195, + "rewards/rejected": -0.32711300253868103, + "sft_loss": 1.9296623468399048, + "step": 3905 + }, + { + "epoch": 0.3, + "grad_norm": 25.758337020874023, + "learning_rate": 7.955341158743048e-06, + "logits/chosen": -1.312316656112671, + "logits/rejected": -1.235752820968628, + "logps/chosen": -0.7140806913375854, + "logps/rejected": -3.0332131385803223, + "loss": 0.7353, + "odds_ratio_loss": 0.21259894967079163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07140807062387466, + "rewards/margins": 0.2319132536649704, + "rewards/rejected": -0.30332133173942566, + "sft_loss": 0.7140806913375854, + "step": 3910 + }, + { + "epoch": 0.3, + "grad_norm": 17.689552307128906, + "learning_rate": 7.950372089492634e-06, + "logits/chosen": -1.3138293027877808, + "logits/rejected": -0.8663791418075562, + "logps/chosen": -0.8342885971069336, + "logps/rejected": -1.7863073348999023, + "loss": 0.8596, + "odds_ratio_loss": 0.25299352407455444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08342885971069336, + "rewards/margins": 0.09520186483860016, + "rewards/rejected": -0.1786307394504547, + "sft_loss": 0.8342885971069336, + "step": 3915 + }, + { + "epoch": 0.3, + "grad_norm": 6.6880574226379395, + "learning_rate": 7.94539854562835e-06, + "logits/chosen": -1.354337453842163, + "logits/rejected": -0.8866574168205261, + "logps/chosen": -1.0239028930664062, + "logps/rejected": -2.7309670448303223, + "loss": 1.0524, + "odds_ratio_loss": 0.28486576676368713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10239030420780182, + "rewards/margins": 0.17070642113685608, + "rewards/rejected": -0.2730966806411743, + "sft_loss": 1.0239028930664062, + "step": 3920 + }, + { + "epoch": 0.31, + "grad_norm": 16.98858070373535, + "learning_rate": 7.94042053469321e-06, + "logits/chosen": -1.1966888904571533, + "logits/rejected": -0.6225118637084961, + "logps/chosen": -0.9805110692977905, + "logps/rejected": -1.7540674209594727, + "loss": 1.0401, + "odds_ratio_loss": 0.5959800481796265, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09805111587047577, + "rewards/margins": 0.07735563069581985, + "rewards/rejected": -0.17540673911571503, + "sft_loss": 0.9805110692977905, + "step": 3925 + }, + { + "epoch": 0.31, + "grad_norm": 12.183523178100586, + "learning_rate": 7.935438064236998e-06, + "logits/chosen": -1.2711061239242554, + "logits/rejected": -0.860754132270813, + "logps/chosen": -0.8835781812667847, + "logps/rejected": -2.7187657356262207, + "loss": 0.9118, + "odds_ratio_loss": 0.2823113799095154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0883578211069107, + "rewards/margins": 0.18351872265338898, + "rewards/rejected": -0.2718765437602997, + "sft_loss": 0.8835781812667847, + "step": 3930 + }, + { + "epoch": 0.31, + "grad_norm": 16.06085968017578, + "learning_rate": 7.930451141816264e-06, + "logits/chosen": -1.203362226486206, + "logits/rejected": -0.922333836555481, + "logps/chosen": -1.0303407907485962, + "logps/rejected": -3.09401273727417, + "loss": 1.0493, + "odds_ratio_loss": 0.189732626080513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10303407907485962, + "rewards/margins": 0.20636720955371857, + "rewards/rejected": -0.3094013035297394, + "sft_loss": 1.0303407907485962, + "step": 3935 + }, + { + "epoch": 0.31, + "grad_norm": 11.412202835083008, + "learning_rate": 7.925459774994311e-06, + "logits/chosen": -1.3621621131896973, + "logits/rejected": -1.251784086227417, + "logps/chosen": -1.0007188320159912, + "logps/rejected": -2.657543897628784, + "loss": 1.0286, + "odds_ratio_loss": 0.27897682785987854, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10007189214229584, + "rewards/margins": 0.16568250954151154, + "rewards/rejected": -0.2657544016838074, + "sft_loss": 1.0007188320159912, + "step": 3940 + }, + { + "epoch": 0.31, + "grad_norm": 19.832815170288086, + "learning_rate": 7.920463971341175e-06, + "logits/chosen": -1.4012175798416138, + "logits/rejected": -0.8629199862480164, + "logps/chosen": -0.9235948324203491, + "logps/rejected": -5.943634986877441, + "loss": 0.9452, + "odds_ratio_loss": 0.2163792848587036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09235947579145432, + "rewards/margins": 0.5020040273666382, + "rewards/rejected": -0.5943635106086731, + "sft_loss": 0.9235948324203491, + "step": 3945 + }, + { + "epoch": 0.31, + "grad_norm": 30.11895751953125, + "learning_rate": 7.915463738433633e-06, + "logits/chosen": -1.422339677810669, + "logits/rejected": -1.1737186908721924, + "logps/chosen": -1.0355141162872314, + "logps/rejected": -2.3156991004943848, + "loss": 1.0611, + "odds_ratio_loss": 0.2554258108139038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10355141013860703, + "rewards/margins": 0.12801849842071533, + "rewards/rejected": -0.23156991600990295, + "sft_loss": 1.0355141162872314, + "step": 3950 + }, + { + "epoch": 0.31, + "grad_norm": 10.107759475708008, + "learning_rate": 7.910459083855169e-06, + "logits/chosen": -1.4450719356536865, + "logits/rejected": -0.7962635159492493, + "logps/chosen": -1.148455023765564, + "logps/rejected": -3.3044426441192627, + "loss": 1.162, + "odds_ratio_loss": 0.13495874404907227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11484551429748535, + "rewards/margins": 0.21559877693653107, + "rewards/rejected": -0.3304442763328552, + "sft_loss": 1.148455023765564, + "step": 3955 + }, + { + "epoch": 0.31, + "grad_norm": 3.7334766387939453, + "learning_rate": 7.905450015195977e-06, + "logits/chosen": -1.6166236400604248, + "logits/rejected": -0.9999248385429382, + "logps/chosen": -1.002211570739746, + "logps/rejected": -2.378195285797119, + "loss": 1.0485, + "odds_ratio_loss": 0.46309876441955566, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10022115707397461, + "rewards/margins": 0.13759836554527283, + "rewards/rejected": -0.23781950771808624, + "sft_loss": 1.002211570739746, + "step": 3960 + }, + { + "epoch": 0.31, + "grad_norm": 15.674083709716797, + "learning_rate": 7.900436540052947e-06, + "logits/chosen": -1.3820528984069824, + "logits/rejected": -1.1322020292282104, + "logps/chosen": -1.0542728900909424, + "logps/rejected": -1.2473478317260742, + "loss": 1.1235, + "odds_ratio_loss": 0.6921383142471313, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10542728751897812, + "rewards/margins": 0.019307482987642288, + "rewards/rejected": -0.1247347742319107, + "sft_loss": 1.0542728900909424, + "step": 3965 + }, + { + "epoch": 0.31, + "grad_norm": 9.685905456542969, + "learning_rate": 7.89541866602965e-06, + "logits/chosen": -1.4918893575668335, + "logits/rejected": -1.1254994869232178, + "logps/chosen": -1.0190510749816895, + "logps/rejected": -1.6521360874176025, + "loss": 1.0692, + "odds_ratio_loss": 0.5017727017402649, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10190512239933014, + "rewards/margins": 0.06330849975347519, + "rewards/rejected": -0.16521361470222473, + "sft_loss": 1.0190510749816895, + "step": 3970 + }, + { + "epoch": 0.31, + "grad_norm": 23.007606506347656, + "learning_rate": 7.89039640073633e-06, + "logits/chosen": -1.2623697519302368, + "logits/rejected": -0.8954647779464722, + "logps/chosen": -1.166612148284912, + "logps/rejected": -1.269649624824524, + "loss": 1.2286, + "odds_ratio_loss": 0.6198663711547852, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11666121333837509, + "rewards/margins": 0.010303747840225697, + "rewards/rejected": -0.12696495652198792, + "sft_loss": 1.166612148284912, + "step": 3975 + }, + { + "epoch": 0.31, + "grad_norm": 11.377882957458496, + "learning_rate": 7.88536975178989e-06, + "logits/chosen": -1.3877617120742798, + "logits/rejected": -0.8028934597969055, + "logps/chosen": -1.045810580253601, + "logps/rejected": -1.1776955127716064, + "loss": 1.1115, + "odds_ratio_loss": 0.6572698950767517, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1045810729265213, + "rewards/margins": 0.013188472017645836, + "rewards/rejected": -0.11776953935623169, + "sft_loss": 1.045810580253601, + "step": 3980 + }, + { + "epoch": 0.31, + "grad_norm": 4.312228202819824, + "learning_rate": 7.880338726813878e-06, + "logits/chosen": -1.4793024063110352, + "logits/rejected": -1.0138689279556274, + "logps/chosen": -0.8670511245727539, + "logps/rejected": -1.6700611114501953, + "loss": 0.8964, + "odds_ratio_loss": 0.29342907667160034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08670511841773987, + "rewards/margins": 0.08030100166797638, + "rewards/rejected": -0.16700613498687744, + "sft_loss": 0.8670511245727539, + "step": 3985 + }, + { + "epoch": 0.31, + "grad_norm": 9.054874420166016, + "learning_rate": 7.875303333438488e-06, + "logits/chosen": -1.4101994037628174, + "logits/rejected": -0.9546709060668945, + "logps/chosen": -1.5213371515274048, + "logps/rejected": -1.413010835647583, + "loss": 1.6373, + "odds_ratio_loss": 1.1592390537261963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15213371813297272, + "rewards/margins": -0.010832617059350014, + "rewards/rejected": -0.14130108058452606, + "sft_loss": 1.5213371515274048, + "step": 3990 + }, + { + "epoch": 0.31, + "grad_norm": 6.566228866577148, + "learning_rate": 7.870263579300527e-06, + "logits/chosen": -1.2961866855621338, + "logits/rejected": -1.0387169122695923, + "logps/chosen": -1.1798770427703857, + "logps/rejected": -3.3426384925842285, + "loss": 1.2468, + "odds_ratio_loss": 0.6694552302360535, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11798770725727081, + "rewards/margins": 0.2162761390209198, + "rewards/rejected": -0.3342638611793518, + "sft_loss": 1.1798770427703857, + "step": 3995 + }, + { + "epoch": 0.31, + "grad_norm": 10.971648216247559, + "learning_rate": 7.865219472043429e-06, + "logits/chosen": -1.2647547721862793, + "logits/rejected": -0.7655025124549866, + "logps/chosen": -1.0178542137145996, + "logps/rejected": -3.2869560718536377, + "loss": 1.0542, + "odds_ratio_loss": 0.36323755979537964, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10178543627262115, + "rewards/margins": 0.22691015899181366, + "rewards/rejected": -0.3286955952644348, + "sft_loss": 1.0178542137145996, + "step": 4000 + }, + { + "epoch": 0.31, + "grad_norm": 7.956406116485596, + "learning_rate": 7.860171019317215e-06, + "logits/chosen": -1.2619847059249878, + "logits/rejected": -0.626349151134491, + "logps/chosen": -1.2772443294525146, + "logps/rejected": -2.4285922050476074, + "loss": 1.3216, + "odds_ratio_loss": 0.44381484389305115, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12772443890571594, + "rewards/margins": 0.11513479053974152, + "rewards/rejected": -0.24285921454429626, + "sft_loss": 1.2772443294525146, + "step": 4005 + }, + { + "epoch": 0.31, + "grad_norm": 12.475284576416016, + "learning_rate": 7.855118228778511e-06, + "logits/chosen": -1.344545841217041, + "logits/rejected": -0.8007782101631165, + "logps/chosen": -1.6138957738876343, + "logps/rejected": -3.235304355621338, + "loss": 1.6714, + "odds_ratio_loss": 0.5750583410263062, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16138958930969238, + "rewards/margins": 0.1621408313512802, + "rewards/rejected": -0.3235304355621338, + "sft_loss": 1.6138957738876343, + "step": 4010 + }, + { + "epoch": 0.31, + "grad_norm": 187.03219604492188, + "learning_rate": 7.850061108090514e-06, + "logits/chosen": -1.1676523685455322, + "logits/rejected": -1.0269989967346191, + "logps/chosen": -1.3574920892715454, + "logps/rejected": -2.018044948577881, + "loss": 1.4053, + "odds_ratio_loss": 0.4782230854034424, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1357492059469223, + "rewards/margins": 0.06605527549982071, + "rewards/rejected": -0.2018044888973236, + "sft_loss": 1.3574920892715454, + "step": 4015 + }, + { + "epoch": 0.31, + "grad_norm": 11.270816802978516, + "learning_rate": 7.844999664922987e-06, + "logits/chosen": -1.146503210067749, + "logits/rejected": -1.0891648530960083, + "logps/chosen": -1.6510696411132812, + "logps/rejected": -4.126519203186035, + "loss": 1.669, + "odds_ratio_loss": 0.17917956411838531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16510698199272156, + "rewards/margins": 0.24754495918750763, + "rewards/rejected": -0.412651926279068, + "sft_loss": 1.6510696411132812, + "step": 4020 + }, + { + "epoch": 0.31, + "grad_norm": 130.6370086669922, + "learning_rate": 7.839933906952252e-06, + "logits/chosen": -1.077316164970398, + "logits/rejected": -1.1368393898010254, + "logps/chosen": -1.3282784223556519, + "logps/rejected": -2.2032151222229004, + "loss": 1.3634, + "odds_ratio_loss": 0.3508017957210541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13282786309719086, + "rewards/margins": 0.08749367296695709, + "rewards/rejected": -0.22032153606414795, + "sft_loss": 1.3282784223556519, + "step": 4025 + }, + { + "epoch": 0.31, + "grad_norm": 8.202735900878906, + "learning_rate": 7.834863841861178e-06, + "logits/chosen": -1.162980079650879, + "logits/rejected": -0.596443772315979, + "logps/chosen": -1.7192405462265015, + "logps/rejected": -6.087828159332275, + "loss": 1.7602, + "odds_ratio_loss": 0.40991973876953125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17192408442497253, + "rewards/margins": 0.43685880303382874, + "rewards/rejected": -0.6087828874588013, + "sft_loss": 1.7192405462265015, + "step": 4030 + }, + { + "epoch": 0.31, + "grad_norm": 46.36093521118164, + "learning_rate": 7.829789477339157e-06, + "logits/chosen": -1.1748098134994507, + "logits/rejected": -1.0097134113311768, + "logps/chosen": -1.247901439666748, + "logps/rejected": -2.3685479164123535, + "loss": 1.2843, + "odds_ratio_loss": 0.36368483304977417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12479015439748764, + "rewards/margins": 0.11206464469432831, + "rewards/rejected": -0.23685479164123535, + "sft_loss": 1.247901439666748, + "step": 4035 + }, + { + "epoch": 0.31, + "grad_norm": 7.1221842765808105, + "learning_rate": 7.824710821082111e-06, + "logits/chosen": -1.271332025527954, + "logits/rejected": -0.8087499737739563, + "logps/chosen": -0.9981080293655396, + "logps/rejected": -1.9443790912628174, + "loss": 1.0292, + "odds_ratio_loss": 0.3106873631477356, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09981080144643784, + "rewards/margins": 0.09462710469961166, + "rewards/rejected": -0.1944379061460495, + "sft_loss": 0.9981080293655396, + "step": 4040 + }, + { + "epoch": 0.31, + "grad_norm": 8.026951789855957, + "learning_rate": 7.819627880792465e-06, + "logits/chosen": -1.1581884622573853, + "logits/rejected": -0.9363598823547363, + "logps/chosen": -1.0807740688323975, + "logps/rejected": -2.251509428024292, + "loss": 1.109, + "odds_ratio_loss": 0.28213122487068176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10807742178440094, + "rewards/margins": 0.11707352101802826, + "rewards/rejected": -0.2251509726047516, + "sft_loss": 1.0807740688323975, + "step": 4045 + }, + { + "epoch": 0.32, + "grad_norm": 8.195876121520996, + "learning_rate": 7.814540664179143e-06, + "logits/chosen": -1.2763893604278564, + "logits/rejected": -0.44702619314193726, + "logps/chosen": -0.9961471557617188, + "logps/rejected": -3.151249885559082, + "loss": 1.0107, + "odds_ratio_loss": 0.14514710009098053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.099614717066288, + "rewards/margins": 0.2155102789402008, + "rewards/rejected": -0.3151249885559082, + "sft_loss": 0.9961471557617188, + "step": 4050 + }, + { + "epoch": 0.32, + "grad_norm": 23.922773361206055, + "learning_rate": 7.809449178957558e-06, + "logits/chosen": -1.2120903730392456, + "logits/rejected": -1.00906240940094, + "logps/chosen": -1.162179946899414, + "logps/rejected": -1.754237174987793, + "loss": 1.2073, + "odds_ratio_loss": 0.4514268934726715, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11621799319982529, + "rewards/margins": 0.059205733239650726, + "rewards/rejected": -0.17542371153831482, + "sft_loss": 1.162179946899414, + "step": 4055 + }, + { + "epoch": 0.32, + "grad_norm": 7.789605617523193, + "learning_rate": 7.80435343284959e-06, + "logits/chosen": -1.2017956972122192, + "logits/rejected": -0.8519023060798645, + "logps/chosen": -0.9169862866401672, + "logps/rejected": -2.8535914421081543, + "loss": 0.9404, + "odds_ratio_loss": 0.23436644673347473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09169862419366837, + "rewards/margins": 0.19366052746772766, + "rewards/rejected": -0.28535914421081543, + "sft_loss": 0.9169862866401672, + "step": 4060 + }, + { + "epoch": 0.32, + "grad_norm": 25.745149612426758, + "learning_rate": 7.799253433583585e-06, + "logits/chosen": -1.3630164861679077, + "logits/rejected": -1.2323427200317383, + "logps/chosen": -0.45543041825294495, + "logps/rejected": -2.0177087783813477, + "loss": 0.5283, + "odds_ratio_loss": 0.7285929918289185, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.045543037354946136, + "rewards/margins": 0.15622785687446594, + "rewards/rejected": -0.2017708718776703, + "sft_loss": 0.45543041825294495, + "step": 4065 + }, + { + "epoch": 0.32, + "grad_norm": 14.55126953125, + "learning_rate": 7.794149188894344e-06, + "logits/chosen": -1.3345615863800049, + "logits/rejected": -1.1428083181381226, + "logps/chosen": -1.3834812641143799, + "logps/rejected": -2.4057507514953613, + "loss": 1.4419, + "odds_ratio_loss": 0.5839170217514038, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13834811747074127, + "rewards/margins": 0.10222695022821426, + "rewards/rejected": -0.24057507514953613, + "sft_loss": 1.3834812641143799, + "step": 4070 + }, + { + "epoch": 0.32, + "grad_norm": 7.617037773132324, + "learning_rate": 7.789040706523097e-06, + "logits/chosen": -1.1593537330627441, + "logits/rejected": -0.9324871301651001, + "logps/chosen": -1.2534054517745972, + "logps/rejected": -1.9832494258880615, + "loss": 1.3098, + "odds_ratio_loss": 0.5639361143112183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1253405511379242, + "rewards/margins": 0.07298441231250763, + "rewards/rejected": -0.19832494854927063, + "sft_loss": 1.2534054517745972, + "step": 4075 + }, + { + "epoch": 0.32, + "grad_norm": 17.690900802612305, + "learning_rate": 7.78392799421751e-06, + "logits/chosen": -1.249637484550476, + "logits/rejected": -1.1855614185333252, + "logps/chosen": -0.6458396315574646, + "logps/rejected": -2.4688522815704346, + "loss": 0.6594, + "odds_ratio_loss": 0.13548722863197327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06458397209644318, + "rewards/margins": 0.18230126798152924, + "rewards/rejected": -0.24688522517681122, + "sft_loss": 0.6458396315574646, + "step": 4080 + }, + { + "epoch": 0.32, + "grad_norm": 5.836312770843506, + "learning_rate": 7.778811059731656e-06, + "logits/chosen": -1.353683590888977, + "logits/rejected": -0.6263580322265625, + "logps/chosen": -1.063247561454773, + "logps/rejected": -1.5036994218826294, + "loss": 1.1098, + "odds_ratio_loss": 0.46563464403152466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10632475465536118, + "rewards/margins": 0.044045187532901764, + "rewards/rejected": -0.15036995708942413, + "sft_loss": 1.063247561454773, + "step": 4085 + }, + { + "epoch": 0.32, + "grad_norm": 5.767371654510498, + "learning_rate": 7.773689910826019e-06, + "logits/chosen": -1.4103326797485352, + "logits/rejected": -1.1369997262954712, + "logps/chosen": -0.8699052929878235, + "logps/rejected": -1.9617269039154053, + "loss": 0.9044, + "odds_ratio_loss": 0.34492164850234985, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08699052780866623, + "rewards/margins": 0.10918216407299042, + "rewards/rejected": -0.19617268443107605, + "sft_loss": 0.8699052929878235, + "step": 4090 + }, + { + "epoch": 0.32, + "grad_norm": 15.891542434692383, + "learning_rate": 7.768564555267473e-06, + "logits/chosen": -1.466572880744934, + "logits/rejected": -1.3226369619369507, + "logps/chosen": -1.081580400466919, + "logps/rejected": -2.9113516807556152, + "loss": 1.0987, + "odds_ratio_loss": 0.1707053780555725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10815805196762085, + "rewards/margins": 0.18297713994979858, + "rewards/rejected": -0.29113519191741943, + "sft_loss": 1.081580400466919, + "step": 4095 + }, + { + "epoch": 0.32, + "grad_norm": 121.89453125, + "learning_rate": 7.763435000829267e-06, + "logits/chosen": -1.290968656539917, + "logits/rejected": -1.1049576997756958, + "logps/chosen": -1.3791439533233643, + "logps/rejected": -3.2474513053894043, + "loss": 1.4425, + "odds_ratio_loss": 0.6333954930305481, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13791438937187195, + "rewards/margins": 0.18683074414730072, + "rewards/rejected": -0.3247451186180115, + "sft_loss": 1.3791439533233643, + "step": 4100 + }, + { + "epoch": 0.32, + "grad_norm": 28.180767059326172, + "learning_rate": 7.758301255291022e-06, + "logits/chosen": -1.1826337575912476, + "logits/rejected": -0.9726226925849915, + "logps/chosen": -1.1570237874984741, + "logps/rejected": -1.7642463445663452, + "loss": 1.1979, + "odds_ratio_loss": 0.409106969833374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11570239067077637, + "rewards/margins": 0.06072225421667099, + "rewards/rejected": -0.17642465233802795, + "sft_loss": 1.1570237874984741, + "step": 4105 + }, + { + "epoch": 0.32, + "grad_norm": 9.990339279174805, + "learning_rate": 7.753163326438716e-06, + "logits/chosen": -1.3788506984710693, + "logits/rejected": -0.790817379951477, + "logps/chosen": -0.9605264663696289, + "logps/rejected": -4.762506484985352, + "loss": 0.9911, + "odds_ratio_loss": 0.30601152777671814, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09605265408754349, + "rewards/margins": 0.38019803166389465, + "rewards/rejected": -0.47625064849853516, + "sft_loss": 0.9605264663696289, + "step": 4110 + }, + { + "epoch": 0.32, + "grad_norm": 12.46078109741211, + "learning_rate": 7.74802122206467e-06, + "logits/chosen": -1.1677567958831787, + "logits/rejected": -0.9421844482421875, + "logps/chosen": -0.9032756686210632, + "logps/rejected": -5.672630310058594, + "loss": 0.9356, + "odds_ratio_loss": 0.32349324226379395, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09032757580280304, + "rewards/margins": 0.476935476064682, + "rewards/rejected": -0.5672630071640015, + "sft_loss": 0.9032756686210632, + "step": 4115 + }, + { + "epoch": 0.32, + "grad_norm": 16.93182373046875, + "learning_rate": 7.74287494996754e-06, + "logits/chosen": -1.1149744987487793, + "logits/rejected": -1.004250407218933, + "logps/chosen": -0.9445215463638306, + "logps/rejected": -2.6246485710144043, + "loss": 0.9833, + "odds_ratio_loss": 0.38825660943984985, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0944521576166153, + "rewards/margins": 0.16801270842552185, + "rewards/rejected": -0.26246488094329834, + "sft_loss": 0.9445215463638306, + "step": 4120 + }, + { + "epoch": 0.32, + "grad_norm": 8.397687911987305, + "learning_rate": 7.737724517952298e-06, + "logits/chosen": -1.1781234741210938, + "logits/rejected": -0.7847142219543457, + "logps/chosen": -0.8613178133964539, + "logps/rejected": -2.36332368850708, + "loss": 0.8903, + "odds_ratio_loss": 0.2901086211204529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08613178133964539, + "rewards/margins": 0.15020060539245605, + "rewards/rejected": -0.23633238673210144, + "sft_loss": 0.8613178133964539, + "step": 4125 + }, + { + "epoch": 0.32, + "grad_norm": 34.25830078125, + "learning_rate": 7.732569933830229e-06, + "logits/chosen": -1.1180028915405273, + "logits/rejected": -0.9175459146499634, + "logps/chosen": -1.0336300134658813, + "logps/rejected": -2.0328879356384277, + "loss": 1.0612, + "odds_ratio_loss": 0.27564844489097595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10336299985647202, + "rewards/margins": 0.09992580860853195, + "rewards/rejected": -0.20328882336616516, + "sft_loss": 1.0336300134658813, + "step": 4130 + }, + { + "epoch": 0.32, + "grad_norm": 23.185644149780273, + "learning_rate": 7.727411205418917e-06, + "logits/chosen": -1.1140944957733154, + "logits/rejected": -0.915080189704895, + "logps/chosen": -1.1209285259246826, + "logps/rejected": -1.5422135591506958, + "loss": 1.1705, + "odds_ratio_loss": 0.4952412545681, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11209283769130707, + "rewards/margins": 0.042128510773181915, + "rewards/rejected": -0.15422135591506958, + "sft_loss": 1.1209285259246826, + "step": 4135 + }, + { + "epoch": 0.32, + "grad_norm": 14.448271751403809, + "learning_rate": 7.722248340542224e-06, + "logits/chosen": -1.1452420949935913, + "logits/rejected": -1.0613524913787842, + "logps/chosen": -1.0193283557891846, + "logps/rejected": -3.6547303199768066, + "loss": 1.0453, + "odds_ratio_loss": 0.2597232460975647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1019328385591507, + "rewards/margins": 0.2635401487350464, + "rewards/rejected": -0.3654729723930359, + "sft_loss": 1.0193283557891846, + "step": 4140 + }, + { + "epoch": 0.32, + "grad_norm": 13.385184288024902, + "learning_rate": 7.717081347030295e-06, + "logits/chosen": -1.1069594621658325, + "logits/rejected": -0.6620140075683594, + "logps/chosen": -1.2712277173995972, + "logps/rejected": -2.2989327907562256, + "loss": 1.3054, + "odds_ratio_loss": 0.3419440686702728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12712277472019196, + "rewards/margins": 0.10277052223682404, + "rewards/rejected": -0.229893296957016, + "sft_loss": 1.2712277173995972, + "step": 4145 + }, + { + "epoch": 0.32, + "grad_norm": 9.037135124206543, + "learning_rate": 7.711910232719526e-06, + "logits/chosen": -1.2756738662719727, + "logits/rejected": -1.017519474029541, + "logps/chosen": -0.8686092495918274, + "logps/rejected": -3.832780122756958, + "loss": 0.8839, + "odds_ratio_loss": 0.15329131484031677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08686093240976334, + "rewards/margins": 0.29641711711883545, + "rewards/rejected": -0.3832780420780182, + "sft_loss": 0.8686092495918274, + "step": 4150 + }, + { + "epoch": 0.32, + "grad_norm": 17.691438674926758, + "learning_rate": 7.706735005452574e-06, + "logits/chosen": -1.1250547170639038, + "logits/rejected": -1.2741339206695557, + "logps/chosen": -1.1006823778152466, + "logps/rejected": -1.6535568237304688, + "loss": 1.147, + "odds_ratio_loss": 0.4627433717250824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11006822437047958, + "rewards/margins": 0.05528745800256729, + "rewards/rejected": -0.16535568237304688, + "sft_loss": 1.1006823778152466, + "step": 4155 + }, + { + "epoch": 0.32, + "grad_norm": 10.580764770507812, + "learning_rate": 7.701555673078324e-06, + "logits/chosen": -1.3162877559661865, + "logits/rejected": -0.5796257257461548, + "logps/chosen": -1.0522043704986572, + "logps/rejected": -5.691858291625977, + "loss": 1.0987, + "odds_ratio_loss": 0.46448415517807007, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10522043704986572, + "rewards/margins": 0.46396535634994507, + "rewards/rejected": -0.5691858530044556, + "sft_loss": 1.0522043704986572, + "step": 4160 + }, + { + "epoch": 0.32, + "grad_norm": 29.99909019470215, + "learning_rate": 7.696372243451894e-06, + "logits/chosen": -1.4073673486709595, + "logits/rejected": -1.3161767721176147, + "logps/chosen": -0.8386504054069519, + "logps/rejected": -1.499929666519165, + "loss": 0.905, + "odds_ratio_loss": 0.663795530796051, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08386505395174026, + "rewards/margins": 0.06612792611122131, + "rewards/rejected": -0.14999297261238098, + "sft_loss": 0.8386504054069519, + "step": 4165 + }, + { + "epoch": 0.32, + "grad_norm": 6.166081428527832, + "learning_rate": 7.691184724434613e-06, + "logits/chosen": -1.2340881824493408, + "logits/rejected": -0.5184012651443481, + "logps/chosen": -0.8715030550956726, + "logps/rejected": -1.367983102798462, + "loss": 0.9184, + "odds_ratio_loss": 0.46853357553482056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08715032041072845, + "rewards/margins": 0.049647994339466095, + "rewards/rejected": -0.13679829239845276, + "sft_loss": 0.8715030550956726, + "step": 4170 + }, + { + "epoch": 0.32, + "grad_norm": 10.00060749053955, + "learning_rate": 7.685993123894008e-06, + "logits/chosen": -1.2512257099151611, + "logits/rejected": -0.9624239206314087, + "logps/chosen": -1.1072269678115845, + "logps/rejected": -2.8060803413391113, + "loss": 1.1424, + "odds_ratio_loss": 0.3513997197151184, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11072269827127457, + "rewards/margins": 0.16988535225391388, + "rewards/rejected": -0.28060805797576904, + "sft_loss": 1.1072269678115845, + "step": 4175 + }, + { + "epoch": 0.33, + "grad_norm": 7.813191890716553, + "learning_rate": 7.680797449703808e-06, + "logits/chosen": -1.23801589012146, + "logits/rejected": -1.0573147535324097, + "logps/chosen": -1.1597460508346558, + "logps/rejected": -2.114023208618164, + "loss": 1.1948, + "odds_ratio_loss": 0.3502116799354553, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11597461998462677, + "rewards/margins": 0.09542771428823471, + "rewards/rejected": -0.2114022970199585, + "sft_loss": 1.1597460508346558, + "step": 4180 + }, + { + "epoch": 0.33, + "grad_norm": 65.03484344482422, + "learning_rate": 7.675597709743906e-06, + "logits/chosen": -1.2186355590820312, + "logits/rejected": -1.2111581563949585, + "logps/chosen": -0.8498256802558899, + "logps/rejected": -4.716329097747803, + "loss": 0.8842, + "odds_ratio_loss": 0.343315064907074, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08498255908489227, + "rewards/margins": 0.38665035367012024, + "rewards/rejected": -0.4716328978538513, + "sft_loss": 0.8498256802558899, + "step": 4185 + }, + { + "epoch": 0.33, + "grad_norm": 5.66574239730835, + "learning_rate": 7.67039391190037e-06, + "logits/chosen": -1.1516528129577637, + "logits/rejected": -1.02261483669281, + "logps/chosen": -1.4871667623519897, + "logps/rejected": -4.307422161102295, + "loss": 1.5184, + "odds_ratio_loss": 0.31264442205429077, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14871668815612793, + "rewards/margins": 0.2820255160331726, + "rewards/rejected": -0.43074217438697815, + "sft_loss": 1.4871667623519897, + "step": 4190 + }, + { + "epoch": 0.33, + "grad_norm": 7.576369762420654, + "learning_rate": 7.665186064065419e-06, + "logits/chosen": -1.2199019193649292, + "logits/rejected": -0.5204739570617676, + "logps/chosen": -1.1422882080078125, + "logps/rejected": -3.815093994140625, + "loss": 1.1608, + "odds_ratio_loss": 0.18513749539852142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11422882974147797, + "rewards/margins": 0.26728057861328125, + "rewards/rejected": -0.381509393453598, + "sft_loss": 1.1422882080078125, + "step": 4195 + }, + { + "epoch": 0.33, + "grad_norm": 51.00697326660156, + "learning_rate": 7.659974174137418e-06, + "logits/chosen": -1.257304310798645, + "logits/rejected": -1.0916509628295898, + "logps/chosen": -1.3007371425628662, + "logps/rejected": -2.1112220287323, + "loss": 1.3443, + "odds_ratio_loss": 0.4351831376552582, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13007371127605438, + "rewards/margins": 0.08104848861694336, + "rewards/rejected": -0.21112219989299774, + "sft_loss": 1.3007371425628662, + "step": 4200 + }, + { + "epoch": 0.33, + "grad_norm": 19.428726196289062, + "learning_rate": 7.654758250020858e-06, + "logits/chosen": -1.285024642944336, + "logits/rejected": -1.0297925472259521, + "logps/chosen": -0.8580648303031921, + "logps/rejected": -3.976184129714966, + "loss": 0.8923, + "odds_ratio_loss": 0.3418591618537903, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08580649644136429, + "rewards/margins": 0.3118119239807129, + "rewards/rejected": -0.3976184129714966, + "sft_loss": 0.8580648303031921, + "step": 4205 + }, + { + "epoch": 0.33, + "grad_norm": 15.734051704406738, + "learning_rate": 7.64953829962635e-06, + "logits/chosen": -1.305315375328064, + "logits/rejected": -0.7829909324645996, + "logps/chosen": -1.1611686944961548, + "logps/rejected": -7.541050910949707, + "loss": 1.1728, + "odds_ratio_loss": 0.11647912114858627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11611686646938324, + "rewards/margins": 0.6379882097244263, + "rewards/rejected": -0.7541050314903259, + "sft_loss": 1.1611686944961548, + "step": 4210 + }, + { + "epoch": 0.33, + "grad_norm": 9.167802810668945, + "learning_rate": 7.644314330870614e-06, + "logits/chosen": -1.268035650253296, + "logits/rejected": -1.1111148595809937, + "logps/chosen": -1.1187152862548828, + "logps/rejected": -5.738256454467773, + "loss": 1.1216, + "odds_ratio_loss": 0.028980012983083725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11187154054641724, + "rewards/margins": 0.4619540572166443, + "rewards/rejected": -0.5738255977630615, + "sft_loss": 1.1187152862548828, + "step": 4215 + }, + { + "epoch": 0.33, + "grad_norm": 18.73733139038086, + "learning_rate": 7.63908635167646e-06, + "logits/chosen": -1.3024123907089233, + "logits/rejected": -0.8621677160263062, + "logps/chosen": -1.0445382595062256, + "logps/rejected": -2.0720951557159424, + "loss": 1.0909, + "odds_ratio_loss": 0.4636809229850769, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10445381700992584, + "rewards/margins": 0.10275568813085556, + "rewards/rejected": -0.207209512591362, + "sft_loss": 1.0445382595062256, + "step": 4220 + }, + { + "epoch": 0.33, + "grad_norm": 7.220019817352295, + "learning_rate": 7.633854369972779e-06, + "logits/chosen": -1.1436474323272705, + "logits/rejected": -1.0087378025054932, + "logps/chosen": -0.9033777117729187, + "logps/rejected": -2.1096222400665283, + "loss": 0.9352, + "odds_ratio_loss": 0.31843090057373047, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09033779054880142, + "rewards/margins": 0.12062442302703857, + "rewards/rejected": -0.2109622210264206, + "sft_loss": 0.9033777117729187, + "step": 4225 + }, + { + "epoch": 0.33, + "grad_norm": 7.440292835235596, + "learning_rate": 7.628618393694543e-06, + "logits/chosen": -1.2576261758804321, + "logits/rejected": -1.3421485424041748, + "logps/chosen": -0.9210315942764282, + "logps/rejected": -1.9578853845596313, + "loss": 0.9629, + "odds_ratio_loss": 0.4189070761203766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09210315346717834, + "rewards/margins": 0.1036853939294815, + "rewards/rejected": -0.19578854739665985, + "sft_loss": 0.9210315942764282, + "step": 4230 + }, + { + "epoch": 0.33, + "grad_norm": 10.724273681640625, + "learning_rate": 7.623378430782768e-06, + "logits/chosen": -1.3568766117095947, + "logits/rejected": -1.1589086055755615, + "logps/chosen": -0.8155969381332397, + "logps/rejected": -3.8145556449890137, + "loss": 0.8491, + "odds_ratio_loss": 0.3349160850048065, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0815596953034401, + "rewards/margins": 0.2998958230018616, + "rewards/rejected": -0.38145551085472107, + "sft_loss": 0.8155969381332397, + "step": 4235 + }, + { + "epoch": 0.33, + "grad_norm": 10.178549766540527, + "learning_rate": 7.618134489184527e-06, + "logits/chosen": -1.283182978630066, + "logits/rejected": -1.1112940311431885, + "logps/chosen": -1.0650850534439087, + "logps/rejected": -1.8539899587631226, + "loss": 1.0975, + "odds_ratio_loss": 0.32411864399909973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10650850832462311, + "rewards/margins": 0.07889048755168915, + "rewards/rejected": -0.18539901077747345, + "sft_loss": 1.0650850534439087, + "step": 4240 + }, + { + "epoch": 0.33, + "grad_norm": 7.90317440032959, + "learning_rate": 7.612886576852921e-06, + "logits/chosen": -1.2977828979492188, + "logits/rejected": -1.011577844619751, + "logps/chosen": -1.0474843978881836, + "logps/rejected": -1.0922882556915283, + "loss": 1.1261, + "odds_ratio_loss": 0.7860459089279175, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.1047484427690506, + "rewards/margins": 0.004480388946831226, + "rewards/rejected": -0.10922882705926895, + "sft_loss": 1.0474843978881836, + "step": 4245 + }, + { + "epoch": 0.33, + "grad_norm": 14.927838325500488, + "learning_rate": 7.607634701747076e-06, + "logits/chosen": -1.3175591230392456, + "logits/rejected": -1.2058073282241821, + "logps/chosen": -1.086005687713623, + "logps/rejected": -6.419415473937988, + "loss": 1.0937, + "odds_ratio_loss": 0.07735596597194672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10860057175159454, + "rewards/margins": 0.5333409309387207, + "rewards/rejected": -0.6419415473937988, + "sft_loss": 1.086005687713623, + "step": 4250 + }, + { + "epoch": 0.33, + "grad_norm": 9.913247108459473, + "learning_rate": 7.602378871832126e-06, + "logits/chosen": -1.297736406326294, + "logits/rejected": -0.5890460014343262, + "logps/chosen": -0.9432314038276672, + "logps/rejected": -4.475759029388428, + "loss": 0.9813, + "odds_ratio_loss": 0.3807455599308014, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09432314336299896, + "rewards/margins": 0.3532527983188629, + "rewards/rejected": -0.44757595658302307, + "sft_loss": 0.9432314038276672, + "step": 4255 + }, + { + "epoch": 0.33, + "grad_norm": 5.066327095031738, + "learning_rate": 7.597119095079209e-06, + "logits/chosen": -1.2584011554718018, + "logits/rejected": -0.6905049681663513, + "logps/chosen": -1.0048248767852783, + "logps/rejected": -1.8865995407104492, + "loss": 1.0372, + "odds_ratio_loss": 0.323311984539032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1004825010895729, + "rewards/margins": 0.08817745000123978, + "rewards/rejected": -0.1886599361896515, + "sft_loss": 1.0048248767852783, + "step": 4260 + }, + { + "epoch": 0.33, + "grad_norm": 35.4716796875, + "learning_rate": 7.5918553794654405e-06, + "logits/chosen": -1.2576793432235718, + "logits/rejected": -1.0605818033218384, + "logps/chosen": -1.1159580945968628, + "logps/rejected": -1.926154375076294, + "loss": 1.153, + "odds_ratio_loss": 0.37044578790664673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11159580945968628, + "rewards/margins": 0.08101961761713028, + "rewards/rejected": -0.19261543452739716, + "sft_loss": 1.1159580945968628, + "step": 4265 + }, + { + "epoch": 0.33, + "grad_norm": 6.506315231323242, + "learning_rate": 7.586587732973914e-06, + "logits/chosen": -1.404921293258667, + "logits/rejected": -0.6291941404342651, + "logps/chosen": -0.815272331237793, + "logps/rejected": -2.1161012649536133, + "loss": 0.8365, + "odds_ratio_loss": 0.21181544661521912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0815272331237793, + "rewards/margins": 0.13008292019367218, + "rewards/rejected": -0.21161015331745148, + "sft_loss": 0.815272331237793, + "step": 4270 + }, + { + "epoch": 0.33, + "grad_norm": 5.738877296447754, + "learning_rate": 7.581316163593684e-06, + "logits/chosen": -1.2702836990356445, + "logits/rejected": -0.9186004400253296, + "logps/chosen": -0.8667852282524109, + "logps/rejected": -2.548339605331421, + "loss": 0.8981, + "odds_ratio_loss": 0.3134905695915222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08667852729558945, + "rewards/margins": 0.1681554615497589, + "rewards/rejected": -0.25483399629592896, + "sft_loss": 0.8667852282524109, + "step": 4275 + }, + { + "epoch": 0.33, + "grad_norm": 64.36282348632812, + "learning_rate": 7.576040679319755e-06, + "logits/chosen": -1.251603126525879, + "logits/rejected": -0.6915744543075562, + "logps/chosen": -0.9854947328567505, + "logps/rejected": -5.712364673614502, + "loss": 0.9989, + "odds_ratio_loss": 0.1339355707168579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09854947775602341, + "rewards/margins": 0.4726869463920593, + "rewards/rejected": -0.5712364912033081, + "sft_loss": 0.9854947328567505, + "step": 4280 + }, + { + "epoch": 0.33, + "grad_norm": 145.11282348632812, + "learning_rate": 7.570761288153069e-06, + "logits/chosen": -1.1111905574798584, + "logits/rejected": -0.8975087404251099, + "logps/chosen": -0.9915755987167358, + "logps/rejected": -1.3429906368255615, + "loss": 1.0428, + "odds_ratio_loss": 0.5118028521537781, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09915756434202194, + "rewards/margins": 0.035141509026288986, + "rewards/rejected": -0.13429906964302063, + "sft_loss": 0.9915755987167358, + "step": 4285 + }, + { + "epoch": 0.33, + "grad_norm": 6.5045084953308105, + "learning_rate": 7.565477998100494e-06, + "logits/chosen": -1.4710153341293335, + "logits/rejected": -0.905910849571228, + "logps/chosen": -0.8476712107658386, + "logps/rejected": -2.0717978477478027, + "loss": 0.873, + "odds_ratio_loss": 0.2529553472995758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08476711809635162, + "rewards/margins": 0.12241265922784805, + "rewards/rejected": -0.20717978477478027, + "sft_loss": 0.8476712107658386, + "step": 4290 + }, + { + "epoch": 0.33, + "grad_norm": 8.989347457885742, + "learning_rate": 7.560190817174808e-06, + "logits/chosen": -1.3830236196517944, + "logits/rejected": -0.9063690900802612, + "logps/chosen": -0.9198230504989624, + "logps/rejected": -2.5667905807495117, + "loss": 0.9511, + "odds_ratio_loss": 0.31292420625686646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09198231250047684, + "rewards/margins": 0.16469675302505493, + "rewards/rejected": -0.25667905807495117, + "sft_loss": 0.9198230504989624, + "step": 4295 + }, + { + "epoch": 0.33, + "grad_norm": 4.432363510131836, + "learning_rate": 7.554899753394696e-06, + "logits/chosen": -1.2827117443084717, + "logits/rejected": -0.44630032777786255, + "logps/chosen": -0.8856255412101746, + "logps/rejected": -3.802992582321167, + "loss": 0.9092, + "odds_ratio_loss": 0.23532943427562714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08856256306171417, + "rewards/margins": 0.2917366921901703, + "rewards/rejected": -0.38029927015304565, + "sft_loss": 0.8856255412101746, + "step": 4300 + }, + { + "epoch": 0.33, + "grad_norm": 8.772881507873535, + "learning_rate": 7.549604814784721e-06, + "logits/chosen": -1.3126475811004639, + "logits/rejected": -0.811437726020813, + "logps/chosen": -0.6851466298103333, + "logps/rejected": -1.9268734455108643, + "loss": 0.7091, + "odds_ratio_loss": 0.2396661341190338, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06851466745138168, + "rewards/margins": 0.12417266517877579, + "rewards/rejected": -0.19268734753131866, + "sft_loss": 0.6851466298103333, + "step": 4305 + }, + { + "epoch": 0.34, + "grad_norm": 6.346287250518799, + "learning_rate": 7.544306009375335e-06, + "logits/chosen": -1.3210334777832031, + "logits/rejected": -0.9792496562004089, + "logps/chosen": -0.7893930077552795, + "logps/rejected": -1.422093152999878, + "loss": 0.8408, + "odds_ratio_loss": 0.5139185190200806, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07893930375576019, + "rewards/margins": 0.06327001750469208, + "rewards/rejected": -0.14220932126045227, + "sft_loss": 0.7893930077552795, + "step": 4310 + }, + { + "epoch": 0.34, + "grad_norm": 37.05595779418945, + "learning_rate": 7.53900334520285e-06, + "logits/chosen": -1.3776777982711792, + "logits/rejected": -1.3684583902359009, + "logps/chosen": -1.2751832008361816, + "logps/rejected": -2.9548025131225586, + "loss": 1.2969, + "odds_ratio_loss": 0.2176557332277298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12751832604408264, + "rewards/margins": 0.16796192526817322, + "rewards/rejected": -0.29548028111457825, + "sft_loss": 1.2751832008361816, + "step": 4315 + }, + { + "epoch": 0.34, + "grad_norm": 20.400171279907227, + "learning_rate": 7.533696830309427e-06, + "logits/chosen": -1.42886221408844, + "logits/rejected": -1.1865837574005127, + "logps/chosen": -0.9903708696365356, + "logps/rejected": -1.2984185218811035, + "loss": 1.0777, + "odds_ratio_loss": 0.87353515625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09903708845376968, + "rewards/margins": 0.030804771929979324, + "rewards/rejected": -0.1298418492078781, + "sft_loss": 0.9903708696365356, + "step": 4320 + }, + { + "epoch": 0.34, + "grad_norm": 16.031402587890625, + "learning_rate": 7.52838647274307e-06, + "logits/chosen": -1.289991855621338, + "logits/rejected": -0.9572579264640808, + "logps/chosen": -1.0513736009597778, + "logps/rejected": -1.9264914989471436, + "loss": 1.0815, + "odds_ratio_loss": 0.30108073353767395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10513736307621002, + "rewards/margins": 0.08751179277896881, + "rewards/rejected": -0.19264915585517883, + "sft_loss": 1.0513736009597778, + "step": 4325 + }, + { + "epoch": 0.34, + "grad_norm": 8.47230339050293, + "learning_rate": 7.5230722805576105e-06, + "logits/chosen": -1.1652015447616577, + "logits/rejected": -1.0036388635635376, + "logps/chosen": -1.0877656936645508, + "logps/rejected": -1.7497972249984741, + "loss": 1.1352, + "odds_ratio_loss": 0.4747348725795746, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10877655446529388, + "rewards/margins": 0.06620316207408905, + "rewards/rejected": -0.17497971653938293, + "sft_loss": 1.0877656936645508, + "step": 4330 + }, + { + "epoch": 0.34, + "grad_norm": 28.46778678894043, + "learning_rate": 7.517754261812695e-06, + "logits/chosen": -1.2815945148468018, + "logits/rejected": -0.7410744428634644, + "logps/chosen": -1.0584745407104492, + "logps/rejected": -2.4538943767547607, + "loss": 1.1028, + "odds_ratio_loss": 0.44320353865623474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10584745556116104, + "rewards/margins": 0.13954199850559235, + "rewards/rejected": -0.24538946151733398, + "sft_loss": 1.0584745407104492, + "step": 4335 + }, + { + "epoch": 0.34, + "grad_norm": 7.073278427124023, + "learning_rate": 7.512432424573777e-06, + "logits/chosen": -1.2740113735198975, + "logits/rejected": -0.702880859375, + "logps/chosen": -1.0853495597839355, + "logps/rejected": -2.0686047077178955, + "loss": 1.1248, + "odds_ratio_loss": 0.3948959708213806, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10853495448827744, + "rewards/margins": 0.09832551330327988, + "rewards/rejected": -0.2068604677915573, + "sft_loss": 1.0853495597839355, + "step": 4340 + }, + { + "epoch": 0.34, + "grad_norm": 24.61713218688965, + "learning_rate": 7.507106776912094e-06, + "logits/chosen": -1.336874008178711, + "logits/rejected": -0.7619214057922363, + "logps/chosen": -0.7855367064476013, + "logps/rejected": -2.529465913772583, + "loss": 0.8186, + "odds_ratio_loss": 0.3307104706764221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07855366915464401, + "rewards/margins": 0.1743929386138916, + "rewards/rejected": -0.2529466152191162, + "sft_loss": 0.7855367064476013, + "step": 4345 + }, + { + "epoch": 0.34, + "grad_norm": 8.270886421203613, + "learning_rate": 7.501777326904671e-06, + "logits/chosen": -1.267987847328186, + "logits/rejected": -0.8105506896972656, + "logps/chosen": -1.2178547382354736, + "logps/rejected": -2.15023136138916, + "loss": 1.2791, + "odds_ratio_loss": 0.6123980283737183, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1217854842543602, + "rewards/margins": 0.09323765337467194, + "rewards/rejected": -0.21502313017845154, + "sft_loss": 1.2178547382354736, + "step": 4350 + }, + { + "epoch": 0.34, + "grad_norm": 15.25228214263916, + "learning_rate": 7.4964440826342925e-06, + "logits/chosen": -1.4595444202423096, + "logits/rejected": -1.180437445640564, + "logps/chosen": -1.1525750160217285, + "logps/rejected": -2.413696765899658, + "loss": 1.1908, + "odds_ratio_loss": 0.382191002368927, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11525748670101166, + "rewards/margins": 0.1261121779680252, + "rewards/rejected": -0.24136967957019806, + "sft_loss": 1.1525750160217285, + "step": 4355 + }, + { + "epoch": 0.34, + "grad_norm": 37.01783752441406, + "learning_rate": 7.4911070521895015e-06, + "logits/chosen": -1.1667481660842896, + "logits/rejected": -1.1027100086212158, + "logps/chosen": -1.0879366397857666, + "logps/rejected": -1.8874828815460205, + "loss": 1.1492, + "odds_ratio_loss": 0.6123815774917603, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10879365354776382, + "rewards/margins": 0.0799546092748642, + "rewards/rejected": -0.18874827027320862, + "sft_loss": 1.0879366397857666, + "step": 4360 + }, + { + "epoch": 0.34, + "grad_norm": 6.166680335998535, + "learning_rate": 7.485766243664583e-06, + "logits/chosen": -1.2937138080596924, + "logits/rejected": -0.8818578720092773, + "logps/chosen": -0.8433329463005066, + "logps/rejected": -10.050994873046875, + "loss": 0.8687, + "odds_ratio_loss": 0.2538653016090393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08433329313993454, + "rewards/margins": 0.9207661747932434, + "rewards/rejected": -1.0050995349884033, + "sft_loss": 0.8433329463005066, + "step": 4365 + }, + { + "epoch": 0.34, + "grad_norm": 93.40538024902344, + "learning_rate": 7.480421665159551e-06, + "logits/chosen": -1.0598971843719482, + "logits/rejected": -1.1688032150268555, + "logps/chosen": -1.1783250570297241, + "logps/rejected": -2.901451349258423, + "loss": 1.2068, + "odds_ratio_loss": 0.2851187586784363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11783250421285629, + "rewards/margins": 0.1723126471042633, + "rewards/rejected": -0.2901450991630554, + "sft_loss": 1.1783250570297241, + "step": 4370 + }, + { + "epoch": 0.34, + "grad_norm": 4.833317756652832, + "learning_rate": 7.475073324780138e-06, + "logits/chosen": -1.4235963821411133, + "logits/rejected": -0.9377709627151489, + "logps/chosen": -1.1065336465835571, + "logps/rejected": -2.8160653114318848, + "loss": 1.1591, + "odds_ratio_loss": 0.5254218578338623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1106533631682396, + "rewards/margins": 0.17095312476158142, + "rewards/rejected": -0.2816064953804016, + "sft_loss": 1.1065336465835571, + "step": 4375 + }, + { + "epoch": 0.34, + "grad_norm": 16.7370662689209, + "learning_rate": 7.4697212306377785e-06, + "logits/chosen": -1.1766154766082764, + "logits/rejected": -1.238294243812561, + "logps/chosen": -0.9507826566696167, + "logps/rejected": -1.9046952724456787, + "loss": 0.9904, + "odds_ratio_loss": 0.396290123462677, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09507826715707779, + "rewards/margins": 0.09539126604795456, + "rewards/rejected": -0.19046953320503235, + "sft_loss": 0.9507826566696167, + "step": 4380 + }, + { + "epoch": 0.34, + "grad_norm": 10.103072166442871, + "learning_rate": 7.464365390849606e-06, + "logits/chosen": -1.1430937051773071, + "logits/rejected": -0.7908920049667358, + "logps/chosen": -1.3288061618804932, + "logps/rejected": -6.744845390319824, + "loss": 1.3698, + "odds_ratio_loss": 0.40968722105026245, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13288059830665588, + "rewards/margins": 0.5416039228439331, + "rewards/rejected": -0.6744846105575562, + "sft_loss": 1.3288061618804932, + "step": 4385 + }, + { + "epoch": 0.34, + "grad_norm": 9.449183464050293, + "learning_rate": 7.45900581353843e-06, + "logits/chosen": -1.0516716241836548, + "logits/rejected": -1.0172231197357178, + "logps/chosen": -1.2207443714141846, + "logps/rejected": -1.3754041194915771, + "loss": 1.3082, + "odds_ratio_loss": 0.8747666478157043, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1220744401216507, + "rewards/margins": 0.015465967357158661, + "rewards/rejected": -0.13754041492938995, + "sft_loss": 1.2207443714141846, + "step": 4390 + }, + { + "epoch": 0.34, + "grad_norm": 110.34417724609375, + "learning_rate": 7.45364250683273e-06, + "logits/chosen": -1.2138774394989014, + "logits/rejected": -1.1199032068252563, + "logps/chosen": -1.302486538887024, + "logps/rejected": -2.1626267433166504, + "loss": 1.3895, + "odds_ratio_loss": 0.8700674772262573, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.13024863600730896, + "rewards/margins": 0.086014024913311, + "rewards/rejected": -0.21626269817352295, + "sft_loss": 1.302486538887024, + "step": 4395 + }, + { + "epoch": 0.34, + "grad_norm": 5.204502582550049, + "learning_rate": 7.448275478866642e-06, + "logits/chosen": -1.2459137439727783, + "logits/rejected": -0.7799798846244812, + "logps/chosen": -1.1113945245742798, + "logps/rejected": -2.155890703201294, + "loss": 1.1465, + "odds_ratio_loss": 0.35096412897109985, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1111394390463829, + "rewards/margins": 0.10444964468479156, + "rewards/rejected": -0.21558907628059387, + "sft_loss": 1.1113945245742798, + "step": 4400 + }, + { + "epoch": 0.34, + "grad_norm": 17.26323127746582, + "learning_rate": 7.4429047377799455e-06, + "logits/chosen": -1.2751134634017944, + "logits/rejected": -0.9901043772697449, + "logps/chosen": -1.141135573387146, + "logps/rejected": -1.833753228187561, + "loss": 1.1797, + "odds_ratio_loss": 0.3855925500392914, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11411355435848236, + "rewards/margins": 0.06926177442073822, + "rewards/rejected": -0.18337532877922058, + "sft_loss": 1.141135573387146, + "step": 4405 + }, + { + "epoch": 0.34, + "grad_norm": 8.824706077575684, + "learning_rate": 7.437530291718051e-06, + "logits/chosen": -1.2793327569961548, + "logits/rejected": -0.8037842512130737, + "logps/chosen": -0.9298388361930847, + "logps/rejected": -4.0271196365356445, + "loss": 0.9434, + "odds_ratio_loss": 0.13532523810863495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09298388659954071, + "rewards/margins": 0.30972808599472046, + "rewards/rejected": -0.40271201729774475, + "sft_loss": 0.9298388361930847, + "step": 4410 + }, + { + "epoch": 0.34, + "grad_norm": 6.073675632476807, + "learning_rate": 7.432152148831988e-06, + "logits/chosen": -1.2977871894836426, + "logits/rejected": -0.4883858561515808, + "logps/chosen": -0.9581543207168579, + "logps/rejected": -2.2285006046295166, + "loss": 0.9896, + "odds_ratio_loss": 0.31483057141304016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09581543505191803, + "rewards/margins": 0.12703463435173035, + "rewards/rejected": -0.22285005450248718, + "sft_loss": 0.9581543207168579, + "step": 4415 + }, + { + "epoch": 0.34, + "grad_norm": 31.437864303588867, + "learning_rate": 7.426770317278392e-06, + "logits/chosen": -1.2266685962677002, + "logits/rejected": -0.8512941598892212, + "logps/chosen": -1.4204190969467163, + "logps/rejected": -3.2539901733398438, + "loss": 1.4582, + "odds_ratio_loss": 0.37796148657798767, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14204192161560059, + "rewards/margins": 0.1833570897579193, + "rewards/rejected": -0.3253989815711975, + "sft_loss": 1.4204190969467163, + "step": 4420 + }, + { + "epoch": 0.34, + "grad_norm": 4.3960676193237305, + "learning_rate": 7.4213848052194955e-06, + "logits/chosen": -0.9935970306396484, + "logits/rejected": -0.7201655507087708, + "logps/chosen": -0.8070360422134399, + "logps/rejected": -2.0757639408111572, + "loss": 0.8231, + "odds_ratio_loss": 0.16081495583057404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08070359379053116, + "rewards/margins": 0.12687279284000397, + "rewards/rejected": -0.20757639408111572, + "sft_loss": 0.8070360422134399, + "step": 4425 + }, + { + "epoch": 0.34, + "grad_norm": 9.1425199508667, + "learning_rate": 7.415995620823113e-06, + "logits/chosen": -1.2669662237167358, + "logits/rejected": -0.7052954435348511, + "logps/chosen": -0.99391108751297, + "logps/rejected": -4.0261054039001465, + "loss": 1.0137, + "odds_ratio_loss": 0.1983099728822708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09939111024141312, + "rewards/margins": 0.3032194972038269, + "rewards/rejected": -0.40261054039001465, + "sft_loss": 0.99391108751297, + "step": 4430 + }, + { + "epoch": 0.35, + "grad_norm": 5.359577178955078, + "learning_rate": 7.410602772262623e-06, + "logits/chosen": -1.266448974609375, + "logits/rejected": -0.5301991701126099, + "logps/chosen": -1.100752830505371, + "logps/rejected": -2.557126045227051, + "loss": 1.137, + "odds_ratio_loss": 0.3625568747520447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11007529497146606, + "rewards/margins": 0.14563728868961334, + "rewards/rejected": -0.2557125687599182, + "sft_loss": 1.100752830505371, + "step": 4435 + }, + { + "epoch": 0.35, + "grad_norm": 6.489447593688965, + "learning_rate": 7.4052062677169675e-06, + "logits/chosen": -1.280884027481079, + "logits/rejected": -0.6861027479171753, + "logps/chosen": -0.9087278246879578, + "logps/rejected": -4.286924362182617, + "loss": 0.9492, + "odds_ratio_loss": 0.4042681157588959, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09087278693914413, + "rewards/margins": 0.3378196358680725, + "rewards/rejected": -0.42869243025779724, + "sft_loss": 0.9087278246879578, + "step": 4440 + }, + { + "epoch": 0.35, + "grad_norm": 5.1577863693237305, + "learning_rate": 7.399806115370629e-06, + "logits/chosen": -1.3704392910003662, + "logits/rejected": -0.6230510473251343, + "logps/chosen": -0.894112765789032, + "logps/rejected": -2.8345396518707275, + "loss": 0.9103, + "odds_ratio_loss": 0.16220757365226746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08941127359867096, + "rewards/margins": 0.19404269754886627, + "rewards/rejected": -0.2834540009498596, + "sft_loss": 0.894112765789032, + "step": 4445 + }, + { + "epoch": 0.35, + "grad_norm": 10.279149055480957, + "learning_rate": 7.394402323413626e-06, + "logits/chosen": -1.4479832649230957, + "logits/rejected": -0.8625136613845825, + "logps/chosen": -1.0798912048339844, + "logps/rejected": -1.4926217794418335, + "loss": 1.1357, + "odds_ratio_loss": 0.5580379366874695, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1079891175031662, + "rewards/margins": 0.041273050010204315, + "rewards/rejected": -0.14926216006278992, + "sft_loss": 1.0798912048339844, + "step": 4450 + }, + { + "epoch": 0.35, + "grad_norm": 101.64732360839844, + "learning_rate": 7.388994900041495e-06, + "logits/chosen": -1.2341253757476807, + "logits/rejected": -1.146302580833435, + "logps/chosen": -0.8642631769180298, + "logps/rejected": -3.996840238571167, + "loss": 0.8875, + "odds_ratio_loss": 0.23205271363258362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08642631024122238, + "rewards/margins": 0.31325775384902954, + "rewards/rejected": -0.39968404173851013, + "sft_loss": 0.8642631769180298, + "step": 4455 + }, + { + "epoch": 0.35, + "grad_norm": 16.124719619750977, + "learning_rate": 7.383583853455278e-06, + "logits/chosen": -1.4076511859893799, + "logits/rejected": -1.1392637491226196, + "logps/chosen": -0.960048496723175, + "logps/rejected": -3.5500710010528564, + "loss": 0.9826, + "odds_ratio_loss": 0.22579637169837952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09600485861301422, + "rewards/margins": 0.2590022683143616, + "rewards/rejected": -0.3550071120262146, + "sft_loss": 0.960048496723175, + "step": 4460 + }, + { + "epoch": 0.35, + "grad_norm": 10.780272483825684, + "learning_rate": 7.378169191861517e-06, + "logits/chosen": -1.3396289348602295, + "logits/rejected": -0.7209702134132385, + "logps/chosen": -1.2006075382232666, + "logps/rejected": -2.211906909942627, + "loss": 1.2583, + "odds_ratio_loss": 0.5772491693496704, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1200607642531395, + "rewards/margins": 0.101129911839962, + "rewards/rejected": -0.2211906909942627, + "sft_loss": 1.2006075382232666, + "step": 4465 + }, + { + "epoch": 0.35, + "grad_norm": 43.170448303222656, + "learning_rate": 7.372750923472232e-06, + "logits/chosen": -1.3405921459197998, + "logits/rejected": -0.9898471832275391, + "logps/chosen": -1.0272200107574463, + "logps/rejected": -3.1108803749084473, + "loss": 1.0559, + "odds_ratio_loss": 0.2867039740085602, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10272200405597687, + "rewards/margins": 0.20836606621742249, + "rewards/rejected": -0.31108805537223816, + "sft_loss": 1.0272200107574463, + "step": 4470 + }, + { + "epoch": 0.35, + "grad_norm": 6.963412761688232, + "learning_rate": 7.367329056504915e-06, + "logits/chosen": -1.2340214252471924, + "logits/rejected": -0.9777275323867798, + "logps/chosen": -1.541316270828247, + "logps/rejected": -1.94304621219635, + "loss": 1.618, + "odds_ratio_loss": 0.7671359777450562, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15413162112236023, + "rewards/margins": 0.04017297178506851, + "rewards/rejected": -0.19430460035800934, + "sft_loss": 1.541316270828247, + "step": 4475 + }, + { + "epoch": 0.35, + "grad_norm": 5.4368896484375, + "learning_rate": 7.361903599182516e-06, + "logits/chosen": -1.361820936203003, + "logits/rejected": -0.7715870141983032, + "logps/chosen": -0.9218884706497192, + "logps/rejected": -3.4433655738830566, + "loss": 0.9409, + "odds_ratio_loss": 0.1898377537727356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09218885004520416, + "rewards/margins": 0.2521476745605469, + "rewards/rejected": -0.34433650970458984, + "sft_loss": 0.9218884706497192, + "step": 4480 + }, + { + "epoch": 0.35, + "grad_norm": 171.33653259277344, + "learning_rate": 7.35647455973343e-06, + "logits/chosen": -1.2617411613464355, + "logits/rejected": -1.2783212661743164, + "logps/chosen": -1.4098418951034546, + "logps/rejected": -1.3931834697723389, + "loss": 1.4959, + "odds_ratio_loss": 0.8600964546203613, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1409842073917389, + "rewards/margins": -0.0016658693784847856, + "rewards/rejected": -0.1393183171749115, + "sft_loss": 1.4098418951034546, + "step": 4485 + }, + { + "epoch": 0.35, + "grad_norm": 13.803630828857422, + "learning_rate": 7.351041946391485e-06, + "logits/chosen": -1.4524598121643066, + "logits/rejected": -0.7367189526557922, + "logps/chosen": -0.9599382281303406, + "logps/rejected": -1.7124487161636353, + "loss": 1.0182, + "odds_ratio_loss": 0.5826715230941772, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09599383175373077, + "rewards/margins": 0.07525105774402618, + "rewards/rejected": -0.17124485969543457, + "sft_loss": 0.9599382281303406, + "step": 4490 + }, + { + "epoch": 0.35, + "grad_norm": 6.795773029327393, + "learning_rate": 7.345605767395929e-06, + "logits/chosen": -1.3077876567840576, + "logits/rejected": -0.9576922655105591, + "logps/chosen": -0.985217273235321, + "logps/rejected": -1.5327274799346924, + "loss": 1.0325, + "odds_ratio_loss": 0.4725615084171295, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09852172434329987, + "rewards/margins": 0.05475100874900818, + "rewards/rejected": -0.15327273309230804, + "sft_loss": 0.985217273235321, + "step": 4495 + }, + { + "epoch": 0.35, + "grad_norm": 6.262613773345947, + "learning_rate": 7.340166030991416e-06, + "logits/chosen": -1.1966087818145752, + "logits/rejected": -0.7058561444282532, + "logps/chosen": -1.084021806716919, + "logps/rejected": -1.4973636865615845, + "loss": 1.135, + "odds_ratio_loss": 0.5095903873443604, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10840219259262085, + "rewards/margins": 0.041334182024002075, + "rewards/rejected": -0.14973635971546173, + "sft_loss": 1.084021806716919, + "step": 4500 + }, + { + "epoch": 0.35, + "grad_norm": 8.660277366638184, + "learning_rate": 7.334722745427998e-06, + "logits/chosen": -1.308048963546753, + "logits/rejected": -0.8562232255935669, + "logps/chosen": -1.1090872287750244, + "logps/rejected": -2.0640666484832764, + "loss": 1.1438, + "odds_ratio_loss": 0.3471030592918396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11090872436761856, + "rewards/margins": 0.0954979658126831, + "rewards/rejected": -0.20640668272972107, + "sft_loss": 1.1090872287750244, + "step": 4505 + }, + { + "epoch": 0.35, + "grad_norm": 5.754500865936279, + "learning_rate": 7.3292759189611075e-06, + "logits/chosen": -1.0905884504318237, + "logits/rejected": -0.6774468421936035, + "logps/chosen": -0.8998796343803406, + "logps/rejected": -3.464883804321289, + "loss": 0.9201, + "odds_ratio_loss": 0.2017499953508377, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08998797088861465, + "rewards/margins": 0.2565004229545593, + "rewards/rejected": -0.3464883863925934, + "sft_loss": 0.8998796343803406, + "step": 4510 + }, + { + "epoch": 0.35, + "grad_norm": 5.588654041290283, + "learning_rate": 7.3238255598515495e-06, + "logits/chosen": -1.3673207759857178, + "logits/rejected": -0.8214467763900757, + "logps/chosen": -1.0710903406143188, + "logps/rejected": -3.6640121936798096, + "loss": 1.0837, + "odds_ratio_loss": 0.12623175978660583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10710903257131577, + "rewards/margins": 0.25929221510887146, + "rewards/rejected": -0.3664012551307678, + "sft_loss": 1.0710903406143188, + "step": 4515 + }, + { + "epoch": 0.35, + "grad_norm": 4.794607162475586, + "learning_rate": 7.318371676365487e-06, + "logits/chosen": -1.3031909465789795, + "logits/rejected": -0.343318372964859, + "logps/chosen": -0.9971429705619812, + "logps/rejected": -9.828311920166016, + "loss": 1.0109, + "odds_ratio_loss": 0.13710226118564606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09971430152654648, + "rewards/margins": 0.8831169009208679, + "rewards/rejected": -0.9828311800956726, + "sft_loss": 0.9971429705619812, + "step": 4520 + }, + { + "epoch": 0.35, + "grad_norm": 16.599647521972656, + "learning_rate": 7.3129142767744266e-06, + "logits/chosen": -1.424988031387329, + "logits/rejected": -0.8827608227729797, + "logps/chosen": -0.8135870099067688, + "logps/rejected": -2.6405673027038574, + "loss": 0.8426, + "odds_ratio_loss": 0.29016706347465515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08135870844125748, + "rewards/margins": 0.18269802629947662, + "rewards/rejected": -0.2640567421913147, + "sft_loss": 0.8135870099067688, + "step": 4525 + }, + { + "epoch": 0.35, + "grad_norm": 27.2139949798584, + "learning_rate": 7.307453369355204e-06, + "logits/chosen": -1.3583816289901733, + "logits/rejected": -1.0602794885635376, + "logps/chosen": -1.0894434452056885, + "logps/rejected": -4.239919662475586, + "loss": 1.1087, + "odds_ratio_loss": 0.19260287284851074, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10894433408975601, + "rewards/margins": 0.31504765152931213, + "rewards/rejected": -0.42399197816848755, + "sft_loss": 1.0894434452056885, + "step": 4530 + }, + { + "epoch": 0.35, + "grad_norm": 6.667520999908447, + "learning_rate": 7.301988962389982e-06, + "logits/chosen": -1.3098249435424805, + "logits/rejected": -1.0471160411834717, + "logps/chosen": -0.9988832473754883, + "logps/rejected": -3.9677734375, + "loss": 1.0147, + "odds_ratio_loss": 0.1579330563545227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09988833218812943, + "rewards/margins": 0.296889066696167, + "rewards/rejected": -0.3967773914337158, + "sft_loss": 0.9988832473754883, + "step": 4535 + }, + { + "epoch": 0.35, + "grad_norm": 4.398935317993164, + "learning_rate": 7.2965210641662265e-06, + "logits/chosen": -1.5169909000396729, + "logits/rejected": -0.7749336361885071, + "logps/chosen": -1.6878244876861572, + "logps/rejected": -3.0113415718078613, + "loss": 1.7534, + "odds_ratio_loss": 0.6560280919075012, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16878245770931244, + "rewards/margins": 0.13235166668891907, + "rewards/rejected": -0.3011341392993927, + "sft_loss": 1.6878244876861572, + "step": 4540 + }, + { + "epoch": 0.35, + "grad_norm": 7.342326641082764, + "learning_rate": 7.2910496829767e-06, + "logits/chosen": -1.2548385858535767, + "logits/rejected": -1.3211010694503784, + "logps/chosen": -0.7158026099205017, + "logps/rejected": -2.429844617843628, + "loss": 0.7398, + "odds_ratio_loss": 0.240126371383667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07158026844263077, + "rewards/margins": 0.1714041829109192, + "rewards/rejected": -0.24298445880413055, + "sft_loss": 0.7158026099205017, + "step": 4545 + }, + { + "epoch": 0.35, + "grad_norm": 7.4929728507995605, + "learning_rate": 7.285574827119446e-06, + "logits/chosen": -1.2970168590545654, + "logits/rejected": -1.3728965520858765, + "logps/chosen": -1.0939594507217407, + "logps/rejected": -1.9339323043823242, + "loss": 1.1435, + "odds_ratio_loss": 0.4956664443016052, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10939594358205795, + "rewards/margins": 0.08399729430675507, + "rewards/rejected": -0.19339323043823242, + "sft_loss": 1.0939594507217407, + "step": 4550 + }, + { + "epoch": 0.35, + "grad_norm": 7.341314792633057, + "learning_rate": 7.280096504897778e-06, + "logits/chosen": -1.408686876296997, + "logits/rejected": -0.9580324292182922, + "logps/chosen": -0.9844639897346497, + "logps/rejected": -2.457996129989624, + "loss": 1.008, + "odds_ratio_loss": 0.23529568314552307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09844639152288437, + "rewards/margins": 0.14735323190689087, + "rewards/rejected": -0.24579963088035583, + "sft_loss": 0.9844639897346497, + "step": 4555 + }, + { + "epoch": 0.35, + "grad_norm": 7.616289138793945, + "learning_rate": 7.274614724620269e-06, + "logits/chosen": -1.401800513267517, + "logits/rejected": -0.6990433931350708, + "logps/chosen": -1.2157708406448364, + "logps/rejected": -2.5214405059814453, + "loss": 1.2819, + "odds_ratio_loss": 0.6614880561828613, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12157706916332245, + "rewards/margins": 0.13056698441505432, + "rewards/rejected": -0.2521440386772156, + "sft_loss": 1.2157708406448364, + "step": 4560 + }, + { + "epoch": 0.36, + "grad_norm": 15.146883964538574, + "learning_rate": 7.269129494600733e-06, + "logits/chosen": -1.0967886447906494, + "logits/rejected": -1.1191096305847168, + "logps/chosen": -1.009189248085022, + "logps/rejected": -1.9709656238555908, + "loss": 1.0466, + "odds_ratio_loss": 0.3742437958717346, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10091892629861832, + "rewards/margins": 0.09617763012647629, + "rewards/rejected": -0.1970965564250946, + "sft_loss": 1.009189248085022, + "step": 4565 + }, + { + "epoch": 0.36, + "grad_norm": 34.097381591796875, + "learning_rate": 7.2636408231582204e-06, + "logits/chosen": -1.341217279434204, + "logits/rejected": -0.9640189409255981, + "logps/chosen": -0.8832473754882812, + "logps/rejected": -2.830765962600708, + "loss": 0.914, + "odds_ratio_loss": 0.3079011142253876, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08832474052906036, + "rewards/margins": 0.19475185871124268, + "rewards/rejected": -0.28307658433914185, + "sft_loss": 0.8832473754882812, + "step": 4570 + }, + { + "epoch": 0.36, + "grad_norm": 6.27688455581665, + "learning_rate": 7.258148718616994e-06, + "logits/chosen": -1.4276528358459473, + "logits/rejected": -0.7665327787399292, + "logps/chosen": -0.7643810510635376, + "logps/rejected": -8.876727104187012, + "loss": 0.7689, + "odds_ratio_loss": 0.045427560806274414, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07643811404705048, + "rewards/margins": 0.8112346529960632, + "rewards/rejected": -0.8876727819442749, + "sft_loss": 0.7643810510635376, + "step": 4575 + }, + { + "epoch": 0.36, + "grad_norm": 46.2465705871582, + "learning_rate": 7.2526531893065314e-06, + "logits/chosen": -1.293874979019165, + "logits/rejected": -1.0485689640045166, + "logps/chosen": -0.9439376592636108, + "logps/rejected": -2.3564789295196533, + "loss": 0.9699, + "odds_ratio_loss": 0.25933530926704407, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0943937674164772, + "rewards/margins": 0.14125414192676544, + "rewards/rejected": -0.23564788699150085, + "sft_loss": 0.9439376592636108, + "step": 4580 + }, + { + "epoch": 0.36, + "grad_norm": 6.722151279449463, + "learning_rate": 7.2471542435615e-06, + "logits/chosen": -1.140699028968811, + "logits/rejected": -0.7710426449775696, + "logps/chosen": -0.9066115617752075, + "logps/rejected": -2.0762696266174316, + "loss": 0.9346, + "odds_ratio_loss": 0.2797589898109436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09066115319728851, + "rewards/margins": 0.11696581542491913, + "rewards/rejected": -0.20762696862220764, + "sft_loss": 0.9066115617752075, + "step": 4585 + }, + { + "epoch": 0.36, + "grad_norm": 84.71833038330078, + "learning_rate": 7.241651889721746e-06, + "logits/chosen": -1.4722684621810913, + "logits/rejected": -1.06831955909729, + "logps/chosen": -0.9850140810012817, + "logps/rejected": -5.990573883056641, + "loss": 1.0269, + "odds_ratio_loss": 0.41919898986816406, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09850140661001205, + "rewards/margins": 0.5005560517311096, + "rewards/rejected": -0.5990574359893799, + "sft_loss": 0.9850140810012817, + "step": 4590 + }, + { + "epoch": 0.36, + "grad_norm": 19.882492065429688, + "learning_rate": 7.236146136132292e-06, + "logits/chosen": -1.313164472579956, + "logits/rejected": -0.8736522793769836, + "logps/chosen": -1.2232404947280884, + "logps/rejected": -2.109252452850342, + "loss": 1.2705, + "odds_ratio_loss": 0.4722273349761963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12232404947280884, + "rewards/margins": 0.08860119432210922, + "rewards/rejected": -0.21092525124549866, + "sft_loss": 1.2232404947280884, + "step": 4595 + }, + { + "epoch": 0.36, + "grad_norm": 8.371238708496094, + "learning_rate": 7.230636991143309e-06, + "logits/chosen": -1.3490852117538452, + "logits/rejected": -0.9498292207717896, + "logps/chosen": -1.0545085668563843, + "logps/rejected": -1.9465465545654297, + "loss": 1.0866, + "odds_ratio_loss": 0.32094138860702515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10545085370540619, + "rewards/margins": 0.08920378983020782, + "rewards/rejected": -0.194654643535614, + "sft_loss": 1.0545085668563843, + "step": 4600 + }, + { + "epoch": 0.36, + "grad_norm": 15.105850219726562, + "learning_rate": 7.225124463110118e-06, + "logits/chosen": -1.3464066982269287, + "logits/rejected": -1.1369407176971436, + "logps/chosen": -1.1444499492645264, + "logps/rejected": -1.7896219491958618, + "loss": 1.2127, + "odds_ratio_loss": 0.6820577383041382, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11444501578807831, + "rewards/margins": 0.06451719999313354, + "rewards/rejected": -0.17896220088005066, + "sft_loss": 1.1444499492645264, + "step": 4605 + }, + { + "epoch": 0.36, + "grad_norm": 19.077436447143555, + "learning_rate": 7.219608560393166e-06, + "logits/chosen": -1.2334355115890503, + "logits/rejected": -1.1886484622955322, + "logps/chosen": -0.7450493574142456, + "logps/rejected": -1.6827375888824463, + "loss": 0.7747, + "odds_ratio_loss": 0.2968628704547882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07450493425130844, + "rewards/margins": 0.09376882016658783, + "rewards/rejected": -0.16827376186847687, + "sft_loss": 0.7450493574142456, + "step": 4610 + }, + { + "epoch": 0.36, + "grad_norm": 279.6209716796875, + "learning_rate": 7.2140892913580174e-06, + "logits/chosen": -1.3830896615982056, + "logits/rejected": -1.0050561428070068, + "logps/chosen": -1.1624782085418701, + "logps/rejected": -1.718340277671814, + "loss": 1.2271, + "odds_ratio_loss": 0.6462377905845642, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11624781787395477, + "rewards/margins": 0.05558621138334274, + "rewards/rejected": -0.1718340367078781, + "sft_loss": 1.1624782085418701, + "step": 4615 + }, + { + "epoch": 0.36, + "grad_norm": 7.198555946350098, + "learning_rate": 7.2085666643753475e-06, + "logits/chosen": -1.2989271879196167, + "logits/rejected": -1.0027689933776855, + "logps/chosen": -0.7517678141593933, + "logps/rejected": -1.4274622201919556, + "loss": 0.7941, + "odds_ratio_loss": 0.4230150580406189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07517679035663605, + "rewards/margins": 0.06756944954395294, + "rewards/rejected": -0.142746239900589, + "sft_loss": 0.7517678141593933, + "step": 4620 + }, + { + "epoch": 0.36, + "grad_norm": 21.27382469177246, + "learning_rate": 7.20304068782092e-06, + "logits/chosen": -1.3403156995773315, + "logits/rejected": -1.0457615852355957, + "logps/chosen": -1.1798444986343384, + "logps/rejected": -2.0619351863861084, + "loss": 1.2119, + "odds_ratio_loss": 0.320948988199234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11798445880413055, + "rewards/margins": 0.08820907026529312, + "rewards/rejected": -0.20619352161884308, + "sft_loss": 1.1798444986343384, + "step": 4625 + }, + { + "epoch": 0.36, + "grad_norm": 103.77518463134766, + "learning_rate": 7.197511370075581e-06, + "logits/chosen": -1.200635552406311, + "logits/rejected": -0.9515268206596375, + "logps/chosen": -1.196729302406311, + "logps/rejected": -2.4454774856567383, + "loss": 1.2169, + "odds_ratio_loss": 0.20212802290916443, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11967293173074722, + "rewards/margins": 0.12487481534481049, + "rewards/rejected": -0.2445477545261383, + "sft_loss": 1.196729302406311, + "step": 4630 + }, + { + "epoch": 0.36, + "grad_norm": 6.4753313064575195, + "learning_rate": 7.191978719525243e-06, + "logits/chosen": -1.4594614505767822, + "logits/rejected": -1.1107757091522217, + "logps/chosen": -1.047928810119629, + "logps/rejected": -2.2635796070098877, + "loss": 1.0693, + "odds_ratio_loss": 0.21393127739429474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10479287803173065, + "rewards/margins": 0.12156505882740021, + "rewards/rejected": -0.22635793685913086, + "sft_loss": 1.047928810119629, + "step": 4635 + }, + { + "epoch": 0.36, + "grad_norm": 13.064247131347656, + "learning_rate": 7.186442744560873e-06, + "logits/chosen": -1.4128029346466064, + "logits/rejected": -1.398961067199707, + "logps/chosen": -0.8758260607719421, + "logps/rejected": -3.4516899585723877, + "loss": 0.9049, + "odds_ratio_loss": 0.2910856306552887, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08758260309696198, + "rewards/margins": 0.25758641958236694, + "rewards/rejected": -0.3451690077781677, + "sft_loss": 0.8758260607719421, + "step": 4640 + }, + { + "epoch": 0.36, + "grad_norm": 7.154684066772461, + "learning_rate": 7.1809034535784785e-06, + "logits/chosen": -1.2984259128570557, + "logits/rejected": -1.042232871055603, + "logps/chosen": -0.9784517288208008, + "logps/rejected": -1.5918056964874268, + "loss": 1.0242, + "odds_ratio_loss": 0.4576658308506012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0978451743721962, + "rewards/margins": 0.06133540719747543, + "rewards/rejected": -0.15918058156967163, + "sft_loss": 0.9784517288208008, + "step": 4645 + }, + { + "epoch": 0.36, + "grad_norm": 19.701332092285156, + "learning_rate": 7.1753608549790985e-06, + "logits/chosen": -1.2546695470809937, + "logits/rejected": -1.4402790069580078, + "logps/chosen": -0.7282959222793579, + "logps/rejected": -1.5314674377441406, + "loss": 0.7842, + "odds_ratio_loss": 0.5590053796768188, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07282960414886475, + "rewards/margins": 0.08031713962554932, + "rewards/rejected": -0.15314674377441406, + "sft_loss": 0.7282959222793579, + "step": 4650 + }, + { + "epoch": 0.36, + "grad_norm": 5.952921390533447, + "learning_rate": 7.169814957168786e-06, + "logits/chosen": -1.4671968221664429, + "logits/rejected": -0.9394590258598328, + "logps/chosen": -1.2331550121307373, + "logps/rejected": -1.6452100276947021, + "loss": 1.2829, + "odds_ratio_loss": 0.49736180901527405, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12331549823284149, + "rewards/margins": 0.041205499321222305, + "rewards/rejected": -0.1645210087299347, + "sft_loss": 1.2331550121307373, + "step": 4655 + }, + { + "epoch": 0.36, + "grad_norm": 4.4238104820251465, + "learning_rate": 7.164265768558603e-06, + "logits/chosen": -1.4732682704925537, + "logits/rejected": -1.223921537399292, + "logps/chosen": -0.7300776243209839, + "logps/rejected": -0.9684303402900696, + "loss": 0.7961, + "odds_ratio_loss": 0.6600145697593689, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07300776988267899, + "rewards/margins": 0.02383527345955372, + "rewards/rejected": -0.09684304147958755, + "sft_loss": 0.7300776243209839, + "step": 4660 + }, + { + "epoch": 0.36, + "grad_norm": 13.20688533782959, + "learning_rate": 7.158713297564595e-06, + "logits/chosen": -1.493577480316162, + "logits/rejected": -0.8166548013687134, + "logps/chosen": -0.8863444328308105, + "logps/rejected": -6.545098304748535, + "loss": 0.902, + "odds_ratio_loss": 0.15621954202651978, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0886344462633133, + "rewards/margins": 0.5658753514289856, + "rewards/rejected": -0.6545097827911377, + "sft_loss": 0.8863444328308105, + "step": 4665 + }, + { + "epoch": 0.36, + "grad_norm": 4.963711261749268, + "learning_rate": 7.153157552607789e-06, + "logits/chosen": -1.436767578125, + "logits/rejected": -0.8550936579704285, + "logps/chosen": -1.089935064315796, + "logps/rejected": -2.3934664726257324, + "loss": 1.1425, + "odds_ratio_loss": 0.5253725647926331, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10899350792169571, + "rewards/margins": 0.1303531378507614, + "rewards/rejected": -0.23934665322303772, + "sft_loss": 1.089935064315796, + "step": 4670 + }, + { + "epoch": 0.36, + "grad_norm": 8.91596794128418, + "learning_rate": 7.14759854211418e-06, + "logits/chosen": -1.413935899734497, + "logits/rejected": -0.7103008031845093, + "logps/chosen": -1.0255316495895386, + "logps/rejected": -2.3229520320892334, + "loss": 1.0548, + "odds_ratio_loss": 0.29279404878616333, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10255316644906998, + "rewards/margins": 0.12974204123020172, + "rewards/rejected": -0.2322952300310135, + "sft_loss": 1.0255316495895386, + "step": 4675 + }, + { + "epoch": 0.36, + "grad_norm": 28.46976661682129, + "learning_rate": 7.142036274514712e-06, + "logits/chosen": -1.3273677825927734, + "logits/rejected": -1.1330890655517578, + "logps/chosen": -0.9897178411483765, + "logps/rejected": -2.691734790802002, + "loss": 1.0195, + "odds_ratio_loss": 0.2983167767524719, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09897179901599884, + "rewards/margins": 0.17020167410373688, + "rewards/rejected": -0.26917344331741333, + "sft_loss": 0.9897178411483765, + "step": 4680 + }, + { + "epoch": 0.36, + "grad_norm": 33.62302017211914, + "learning_rate": 7.1364707582452705e-06, + "logits/chosen": -1.3759291172027588, + "logits/rejected": -1.1289112567901611, + "logps/chosen": -0.8859437704086304, + "logps/rejected": -4.888989448547363, + "loss": 0.9001, + "odds_ratio_loss": 0.141677588224411, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08859437704086304, + "rewards/margins": 0.40030455589294434, + "rewards/rejected": -0.4888989329338074, + "sft_loss": 0.8859437704086304, + "step": 4685 + }, + { + "epoch": 0.36, + "grad_norm": 12.877079010009766, + "learning_rate": 7.130902001746667e-06, + "logits/chosen": -1.2970373630523682, + "logits/rejected": -1.0835976600646973, + "logps/chosen": -1.139439344406128, + "logps/rejected": -4.003388404846191, + "loss": 1.1481, + "odds_ratio_loss": 0.08658437430858612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11394394934177399, + "rewards/margins": 0.28639495372772217, + "rewards/rejected": -0.40033888816833496, + "sft_loss": 1.139439344406128, + "step": 4690 + }, + { + "epoch": 0.37, + "grad_norm": 7.1802873611450195, + "learning_rate": 7.125330013464629e-06, + "logits/chosen": -1.3899294137954712, + "logits/rejected": -0.9751816987991333, + "logps/chosen": -1.1031863689422607, + "logps/rejected": -2.300921678543091, + "loss": 1.1432, + "odds_ratio_loss": 0.3997074365615845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11031864583492279, + "rewards/margins": 0.11977354437112808, + "rewards/rejected": -0.23009219765663147, + "sft_loss": 1.1031863689422607, + "step": 4695 + }, + { + "epoch": 0.37, + "grad_norm": 17.34177017211914, + "learning_rate": 7.119754801849782e-06, + "logits/chosen": -1.5370855331420898, + "logits/rejected": -1.0702970027923584, + "logps/chosen": -0.8118973970413208, + "logps/rejected": -1.677382230758667, + "loss": 0.8715, + "odds_ratio_loss": 0.5956419706344604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08118973672389984, + "rewards/margins": 0.08654849231243134, + "rewards/rejected": -0.16773822903633118, + "sft_loss": 0.8118973970413208, + "step": 4700 + }, + { + "epoch": 0.37, + "grad_norm": 15.636608123779297, + "learning_rate": 7.1141763753576435e-06, + "logits/chosen": -1.1833438873291016, + "logits/rejected": -0.9127056002616882, + "logps/chosen": -1.0889067649841309, + "logps/rejected": -1.704662561416626, + "loss": 1.1468, + "odds_ratio_loss": 0.5786373019218445, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10889067500829697, + "rewards/margins": 0.06157558411359787, + "rewards/rejected": -0.17046627402305603, + "sft_loss": 1.0889067649841309, + "step": 4705 + }, + { + "epoch": 0.37, + "grad_norm": 5.713825702667236, + "learning_rate": 7.1085947424486045e-06, + "logits/chosen": -1.3846780061721802, + "logits/rejected": -0.7796342372894287, + "logps/chosen": -0.8368937373161316, + "logps/rejected": -1.4111164808273315, + "loss": 0.8761, + "odds_ratio_loss": 0.3923702836036682, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0836893767118454, + "rewards/margins": 0.05742228031158447, + "rewards/rejected": -0.14111164212226868, + "sft_loss": 0.8368937373161316, + "step": 4710 + }, + { + "epoch": 0.37, + "grad_norm": 6.6504364013671875, + "learning_rate": 7.103009911587923e-06, + "logits/chosen": -1.2653753757476807, + "logits/rejected": -1.285905122756958, + "logps/chosen": -0.8329108357429504, + "logps/rejected": -1.9709625244140625, + "loss": 0.8781, + "odds_ratio_loss": 0.45160213112831116, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08329108357429504, + "rewards/margins": 0.11380515992641449, + "rewards/rejected": -0.19709625840187073, + "sft_loss": 0.8329108357429504, + "step": 4715 + }, + { + "epoch": 0.37, + "grad_norm": 18.44487953186035, + "learning_rate": 7.097421891245701e-06, + "logits/chosen": -1.3122217655181885, + "logits/rejected": -1.0286943912506104, + "logps/chosen": -1.413613200187683, + "logps/rejected": -7.518275260925293, + "loss": 1.4387, + "odds_ratio_loss": 0.2513591945171356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14136134088039398, + "rewards/margins": 0.6104661822319031, + "rewards/rejected": -0.7518275380134583, + "sft_loss": 1.413613200187683, + "step": 4720 + }, + { + "epoch": 0.37, + "grad_norm": 11.627037048339844, + "learning_rate": 7.091830689896883e-06, + "logits/chosen": -1.3183649778366089, + "logits/rejected": -0.8607378005981445, + "logps/chosen": -1.0398062467575073, + "logps/rejected": -2.5570998191833496, + "loss": 1.0621, + "odds_ratio_loss": 0.22267043590545654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10398062318563461, + "rewards/margins": 0.15172937512397766, + "rewards/rejected": -0.2557099759578705, + "sft_loss": 1.0398062467575073, + "step": 4725 + }, + { + "epoch": 0.37, + "grad_norm": 20.778667449951172, + "learning_rate": 7.086236316021232e-06, + "logits/chosen": -0.8270589709281921, + "logits/rejected": -1.1936814785003662, + "logps/chosen": -0.9919061660766602, + "logps/rejected": -1.9597247838974, + "loss": 1.0642, + "odds_ratio_loss": 0.7229019403457642, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0991906225681305, + "rewards/margins": 0.09678187221288681, + "rewards/rejected": -0.1959724873304367, + "sft_loss": 0.9919061660766602, + "step": 4730 + }, + { + "epoch": 0.37, + "grad_norm": 12.020709037780762, + "learning_rate": 7.080638778103331e-06, + "logits/chosen": -1.2841196060180664, + "logits/rejected": -0.7834513783454895, + "logps/chosen": -0.962006688117981, + "logps/rejected": -4.6492919921875, + "loss": 1.0038, + "odds_ratio_loss": 0.41799622774124146, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09620066732168198, + "rewards/margins": 0.3687285780906677, + "rewards/rejected": -0.4649292528629303, + "sft_loss": 0.962006688117981, + "step": 4735 + }, + { + "epoch": 0.37, + "grad_norm": 15.375065803527832, + "learning_rate": 7.075038084632554e-06, + "logits/chosen": -1.399659514427185, + "logits/rejected": -1.212721586227417, + "logps/chosen": -0.8884924054145813, + "logps/rejected": -1.693756103515625, + "loss": 0.9439, + "odds_ratio_loss": 0.5542975664138794, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08884923905134201, + "rewards/margins": 0.08052637428045273, + "rewards/rejected": -0.16937562823295593, + "sft_loss": 0.8884924054145813, + "step": 4740 + }, + { + "epoch": 0.37, + "grad_norm": 6.555948734283447, + "learning_rate": 7.069434244103064e-06, + "logits/chosen": -1.293717384338379, + "logits/rejected": -1.2064396142959595, + "logps/chosen": -0.8203238248825073, + "logps/rejected": -3.335233211517334, + "loss": 0.8504, + "odds_ratio_loss": 0.3007420301437378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08203238993883133, + "rewards/margins": 0.2514909505844116, + "rewards/rejected": -0.33352333307266235, + "sft_loss": 0.8203238248825073, + "step": 4745 + }, + { + "epoch": 0.37, + "grad_norm": 10.171930313110352, + "learning_rate": 7.063827265013798e-06, + "logits/chosen": -1.341202974319458, + "logits/rejected": -1.1988797187805176, + "logps/chosen": -0.875900149345398, + "logps/rejected": -2.420520067214966, + "loss": 0.9075, + "odds_ratio_loss": 0.3163735270500183, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08759002387523651, + "rewards/margins": 0.15446197986602783, + "rewards/rejected": -0.24205198884010315, + "sft_loss": 0.875900149345398, + "step": 4750 + }, + { + "epoch": 0.37, + "grad_norm": 7.426060199737549, + "learning_rate": 7.058217155868452e-06, + "logits/chosen": -1.4535651206970215, + "logits/rejected": -0.9479306936264038, + "logps/chosen": -1.0767772197723389, + "logps/rejected": -3.3065083026885986, + "loss": 1.152, + "odds_ratio_loss": 0.7525271773338318, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10767773538827896, + "rewards/margins": 0.2229730784893036, + "rewards/rejected": -0.33065083622932434, + "sft_loss": 1.0767772197723389, + "step": 4755 + }, + { + "epoch": 0.37, + "grad_norm": 5.857900142669678, + "learning_rate": 7.052603925175466e-06, + "logits/chosen": -1.2869961261749268, + "logits/rejected": -0.8082895278930664, + "logps/chosen": -0.7936242818832397, + "logps/rejected": -1.4054462909698486, + "loss": 0.8341, + "odds_ratio_loss": 0.405081182718277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0793624296784401, + "rewards/margins": 0.06118218973278999, + "rewards/rejected": -0.14054462313652039, + "sft_loss": 0.7936242818832397, + "step": 4760 + }, + { + "epoch": 0.37, + "grad_norm": 9.901500701904297, + "learning_rate": 7.04698758144802e-06, + "logits/chosen": -1.3234273195266724, + "logits/rejected": -1.113125205039978, + "logps/chosen": -1.0561497211456299, + "logps/rejected": -2.484020948410034, + "loss": 1.0788, + "odds_ratio_loss": 0.226668119430542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10561498254537582, + "rewards/margins": 0.1427871435880661, + "rewards/rejected": -0.24840211868286133, + "sft_loss": 1.0561497211456299, + "step": 4765 + }, + { + "epoch": 0.37, + "grad_norm": 56.50979995727539, + "learning_rate": 7.04136813320401e-06, + "logits/chosen": -1.2841142416000366, + "logits/rejected": -0.7560856938362122, + "logps/chosen": -1.0950348377227783, + "logps/rejected": -7.483965873718262, + "loss": 1.118, + "odds_ratio_loss": 0.2300182282924652, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10950347036123276, + "rewards/margins": 0.638893187046051, + "rewards/rejected": -0.748396635055542, + "sft_loss": 1.0950348377227783, + "step": 4770 + }, + { + "epoch": 0.37, + "grad_norm": 6.978331565856934, + "learning_rate": 7.0357455889660445e-06, + "logits/chosen": -1.3573987483978271, + "logits/rejected": -1.0475003719329834, + "logps/chosen": -0.8214191198348999, + "logps/rejected": -2.1504311561584473, + "loss": 0.8564, + "odds_ratio_loss": 0.35030627250671387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08214191347360611, + "rewards/margins": 0.13290122151374817, + "rewards/rejected": -0.21504314243793488, + "sft_loss": 0.8214191198348999, + "step": 4775 + }, + { + "epoch": 0.37, + "grad_norm": 148.0516815185547, + "learning_rate": 7.030119957261425e-06, + "logits/chosen": -1.2780089378356934, + "logits/rejected": -0.9989121556282043, + "logps/chosen": -0.9051804542541504, + "logps/rejected": -4.656085014343262, + "loss": 0.915, + "odds_ratio_loss": 0.09867776930332184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0905180498957634, + "rewards/margins": 0.37509050965309143, + "rewards/rejected": -0.46560850739479065, + "sft_loss": 0.9051804542541504, + "step": 4780 + }, + { + "epoch": 0.37, + "grad_norm": 6.404755592346191, + "learning_rate": 7.024491246622135e-06, + "logits/chosen": -1.3146905899047852, + "logits/rejected": -0.586790144443512, + "logps/chosen": -0.9470073580741882, + "logps/rejected": -2.3955118656158447, + "loss": 0.9698, + "odds_ratio_loss": 0.227756068110466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09470073133707047, + "rewards/margins": 0.1448504477739334, + "rewards/rejected": -0.23955118656158447, + "sft_loss": 0.9470073580741882, + "step": 4785 + }, + { + "epoch": 0.37, + "grad_norm": 5.560925483703613, + "learning_rate": 7.018859465584832e-06, + "logits/chosen": -1.2906675338745117, + "logits/rejected": -0.7796791195869446, + "logps/chosen": -1.1187167167663574, + "logps/rejected": -3.7551982402801514, + "loss": 1.1369, + "odds_ratio_loss": 0.1814715564250946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11187167465686798, + "rewards/margins": 0.2636481523513794, + "rewards/rejected": -0.37551984190940857, + "sft_loss": 1.1187167167663574, + "step": 4790 + }, + { + "epoch": 0.37, + "grad_norm": 4.41677713394165, + "learning_rate": 7.013224622690823e-06, + "logits/chosen": -1.2547338008880615, + "logits/rejected": -0.9723116159439087, + "logps/chosen": -0.8910681009292603, + "logps/rejected": -1.8260596990585327, + "loss": 0.9175, + "odds_ratio_loss": 0.26385509967803955, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08910682052373886, + "rewards/margins": 0.09349914640188217, + "rewards/rejected": -0.18260596692562103, + "sft_loss": 0.8910681009292603, + "step": 4795 + }, + { + "epoch": 0.37, + "grad_norm": 14.212723731994629, + "learning_rate": 7.007586726486066e-06, + "logits/chosen": -1.4153473377227783, + "logits/rejected": -0.8781019449234009, + "logps/chosen": -0.974286675453186, + "logps/rejected": -3.054361343383789, + "loss": 1.0106, + "odds_ratio_loss": 0.3631802797317505, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09742867201566696, + "rewards/margins": 0.20800745487213135, + "rewards/rejected": -0.3054361343383789, + "sft_loss": 0.974286675453186, + "step": 4800 + }, + { + "epoch": 0.37, + "grad_norm": 5.878570079803467, + "learning_rate": 7.001945785521145e-06, + "logits/chosen": -1.2006781101226807, + "logits/rejected": -1.1574538946151733, + "logps/chosen": -0.8742920756340027, + "logps/rejected": -5.729536056518555, + "loss": 0.9238, + "odds_ratio_loss": 0.4947918951511383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08742920309305191, + "rewards/margins": 0.48552441596984863, + "rewards/rejected": -0.5729536414146423, + "sft_loss": 0.8742920756340027, + "step": 4805 + }, + { + "epoch": 0.37, + "grad_norm": 5.804833889007568, + "learning_rate": 6.996301808351264e-06, + "logits/chosen": -1.2291743755340576, + "logits/rejected": -0.7988919019699097, + "logps/chosen": -0.962105393409729, + "logps/rejected": -2.536154270172119, + "loss": 0.9884, + "odds_ratio_loss": 0.26280778646469116, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0962105393409729, + "rewards/margins": 0.15740486979484558, + "rewards/rejected": -0.2536154091358185, + "sft_loss": 0.962105393409729, + "step": 4810 + }, + { + "epoch": 0.37, + "grad_norm": 14.966432571411133, + "learning_rate": 6.99065480353623e-06, + "logits/chosen": -1.2544745206832886, + "logits/rejected": -1.0695239305496216, + "logps/chosen": -0.9016950726509094, + "logps/rejected": -4.076902866363525, + "loss": 0.9405, + "odds_ratio_loss": 0.3877166211605072, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0901695042848587, + "rewards/margins": 0.31752076745033264, + "rewards/rejected": -0.40769022703170776, + "sft_loss": 0.9016950726509094, + "step": 4815 + }, + { + "epoch": 0.37, + "grad_norm": 8.97038745880127, + "learning_rate": 6.985004779640442e-06, + "logits/chosen": -1.3023059368133545, + "logits/rejected": -0.5928922891616821, + "logps/chosen": -0.9082862138748169, + "logps/rejected": -4.716104984283447, + "loss": 0.9207, + "odds_ratio_loss": 0.12394730001688004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09082861244678497, + "rewards/margins": 0.3807818591594696, + "rewards/rejected": -0.47161048650741577, + "sft_loss": 0.9082862138748169, + "step": 4820 + }, + { + "epoch": 0.38, + "grad_norm": 7.581333160400391, + "learning_rate": 6.979351745232879e-06, + "logits/chosen": -1.2428420782089233, + "logits/rejected": -0.9282558560371399, + "logps/chosen": -1.1406314373016357, + "logps/rejected": -5.644349575042725, + "loss": 1.1639, + "odds_ratio_loss": 0.23303601145744324, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11406314373016357, + "rewards/margins": 0.4503718912601471, + "rewards/rejected": -0.5644350051879883, + "sft_loss": 1.1406314373016357, + "step": 4825 + }, + { + "epoch": 0.38, + "grad_norm": 291.5585021972656, + "learning_rate": 6.973695708887088e-06, + "logits/chosen": -1.479901909828186, + "logits/rejected": -1.1979395151138306, + "logps/chosen": -1.0565932989120483, + "logps/rejected": -6.788051605224609, + "loss": 1.0746, + "odds_ratio_loss": 0.1802714765071869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10565934330224991, + "rewards/margins": 0.5731458067893982, + "rewards/rejected": -0.6788051724433899, + "sft_loss": 1.0565932989120483, + "step": 4830 + }, + { + "epoch": 0.38, + "grad_norm": 209.37506103515625, + "learning_rate": 6.968036679181164e-06, + "logits/chosen": -1.5039844512939453, + "logits/rejected": -1.1092535257339478, + "logps/chosen": -1.1655082702636719, + "logps/rejected": -2.8518729209899902, + "loss": 1.1909, + "odds_ratio_loss": 0.2540439963340759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11655082553625107, + "rewards/margins": 0.1686364710330963, + "rewards/rejected": -0.285187304019928, + "sft_loss": 1.1655082702636719, + "step": 4835 + }, + { + "epoch": 0.38, + "grad_norm": 95.15951538085938, + "learning_rate": 6.962374664697744e-06, + "logits/chosen": -1.3616310358047485, + "logits/rejected": -0.9117149114608765, + "logps/chosen": -1.0936627388000488, + "logps/rejected": -2.5025813579559326, + "loss": 1.1546, + "odds_ratio_loss": 0.6098529696464539, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.109366275370121, + "rewards/margins": 0.1408918797969818, + "rewards/rejected": -0.2502581477165222, + "sft_loss": 1.0936627388000488, + "step": 4840 + }, + { + "epoch": 0.38, + "grad_norm": 8.825847625732422, + "learning_rate": 6.956709674023991e-06, + "logits/chosen": -1.2413387298583984, + "logits/rejected": -0.9800017476081848, + "logps/chosen": -1.249328851699829, + "logps/rejected": -1.4448187351226807, + "loss": 1.3213, + "odds_ratio_loss": 0.7200738191604614, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.12493288516998291, + "rewards/margins": 0.019548993557691574, + "rewards/rejected": -0.1444818675518036, + "sft_loss": 1.249328851699829, + "step": 4845 + }, + { + "epoch": 0.38, + "grad_norm": 86.56720733642578, + "learning_rate": 6.951041715751585e-06, + "logits/chosen": -1.232062578201294, + "logits/rejected": -1.0836049318313599, + "logps/chosen": -1.107206106185913, + "logps/rejected": -4.509480953216553, + "loss": 1.1338, + "odds_ratio_loss": 0.2663348317146301, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11072063446044922, + "rewards/margins": 0.34022751450538635, + "rewards/rejected": -0.45094814896583557, + "sft_loss": 1.107206106185913, + "step": 4850 + }, + { + "epoch": 0.38, + "grad_norm": 5.162984848022461, + "learning_rate": 6.945370798476704e-06, + "logits/chosen": -1.1838353872299194, + "logits/rejected": -0.9465975761413574, + "logps/chosen": -1.0141656398773193, + "logps/rejected": -4.33822774887085, + "loss": 1.0332, + "odds_ratio_loss": 0.1907617598772049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10141657292842865, + "rewards/margins": 0.3324061930179596, + "rewards/rejected": -0.43382278084754944, + "sft_loss": 1.0141656398773193, + "step": 4855 + }, + { + "epoch": 0.38, + "grad_norm": 7.11558198928833, + "learning_rate": 6.939696930800012e-06, + "logits/chosen": -1.2861100435256958, + "logits/rejected": -1.096118688583374, + "logps/chosen": -1.4758632183074951, + "logps/rejected": -4.466700077056885, + "loss": 1.4866, + "odds_ratio_loss": 0.10705895721912384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14758633077144623, + "rewards/margins": 0.2990837097167969, + "rewards/rejected": -0.4466700553894043, + "sft_loss": 1.4758632183074951, + "step": 4860 + }, + { + "epoch": 0.38, + "grad_norm": 6.247501850128174, + "learning_rate": 6.934020121326651e-06, + "logits/chosen": -1.4358515739440918, + "logits/rejected": -1.1405222415924072, + "logps/chosen": -1.2349154949188232, + "logps/rejected": -5.667292594909668, + "loss": 1.2594, + "odds_ratio_loss": 0.24501347541809082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1234915480017662, + "rewards/margins": 0.4432377219200134, + "rewards/rejected": -0.5667292475700378, + "sft_loss": 1.2349154949188232, + "step": 4865 + }, + { + "epoch": 0.38, + "grad_norm": 6.55000114440918, + "learning_rate": 6.928340378666225e-06, + "logits/chosen": -1.4754161834716797, + "logits/rejected": -0.9751527905464172, + "logps/chosen": -1.3910658359527588, + "logps/rejected": -6.125715732574463, + "loss": 1.4068, + "odds_ratio_loss": 0.15740497410297394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13910658657550812, + "rewards/margins": 0.4734649658203125, + "rewards/rejected": -0.6125715970993042, + "sft_loss": 1.3910658359527588, + "step": 4870 + }, + { + "epoch": 0.38, + "grad_norm": 47.819461822509766, + "learning_rate": 6.922657711432781e-06, + "logits/chosen": -1.452848196029663, + "logits/rejected": -1.0147812366485596, + "logps/chosen": -1.0927129983901978, + "logps/rejected": -2.6984667778015137, + "loss": 1.1157, + "odds_ratio_loss": 0.2295651137828827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10927130281925201, + "rewards/margins": 0.16057537496089935, + "rewards/rejected": -0.26984667778015137, + "sft_loss": 1.0927129983901978, + "step": 4875 + }, + { + "epoch": 0.38, + "grad_norm": 10.338000297546387, + "learning_rate": 6.9169721282448075e-06, + "logits/chosen": -1.2707912921905518, + "logits/rejected": -0.8893852233886719, + "logps/chosen": -1.0967166423797607, + "logps/rejected": -1.765512466430664, + "loss": 1.1472, + "odds_ratio_loss": 0.5050911903381348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10967166721820831, + "rewards/margins": 0.06687958538532257, + "rewards/rejected": -0.17655125260353088, + "sft_loss": 1.0967166423797607, + "step": 4880 + }, + { + "epoch": 0.38, + "grad_norm": 12.873258590698242, + "learning_rate": 6.9112836377252136e-06, + "logits/chosen": -1.247232437133789, + "logits/rejected": -1.4507074356079102, + "logps/chosen": -0.669430136680603, + "logps/rejected": -6.086930274963379, + "loss": 0.7079, + "odds_ratio_loss": 0.38430362939834595, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06694300472736359, + "rewards/margins": 0.5417500734329224, + "rewards/rejected": -0.60869300365448, + "sft_loss": 0.669430136680603, + "step": 4885 + }, + { + "epoch": 0.38, + "grad_norm": 54.19504165649414, + "learning_rate": 6.905592248501318e-06, + "logits/chosen": -1.216870665550232, + "logits/rejected": -0.9719651937484741, + "logps/chosen": -1.1346818208694458, + "logps/rejected": -3.1256916522979736, + "loss": 1.2082, + "odds_ratio_loss": 0.7348722219467163, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11346818506717682, + "rewards/margins": 0.19910097122192383, + "rewards/rejected": -0.31256914138793945, + "sft_loss": 1.1346818208694458, + "step": 4890 + }, + { + "epoch": 0.38, + "grad_norm": 5.423599720001221, + "learning_rate": 6.899897969204834e-06, + "logits/chosen": -1.3522473573684692, + "logits/rejected": -1.0995229482650757, + "logps/chosen": -0.8513563871383667, + "logps/rejected": -3.2106711864471436, + "loss": 0.8677, + "odds_ratio_loss": 0.16294452548027039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08513564616441727, + "rewards/margins": 0.23593148589134216, + "rewards/rejected": -0.32106712460517883, + "sft_loss": 0.8513563871383667, + "step": 4895 + }, + { + "epoch": 0.38, + "grad_norm": 5.703450679779053, + "learning_rate": 6.894200808471858e-06, + "logits/chosen": -1.3011819124221802, + "logits/rejected": -0.487908273935318, + "logps/chosen": -0.9999151229858398, + "logps/rejected": -2.380300283432007, + "loss": 1.0476, + "odds_ratio_loss": 0.4769115447998047, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09999152272939682, + "rewards/margins": 0.1380385160446167, + "rewards/rejected": -0.23803003132343292, + "sft_loss": 0.9999151229858398, + "step": 4900 + }, + { + "epoch": 0.38, + "grad_norm": 27.872135162353516, + "learning_rate": 6.88850077494286e-06, + "logits/chosen": -1.3154263496398926, + "logits/rejected": -0.9825452566146851, + "logps/chosen": -1.085605263710022, + "logps/rejected": -3.069535255432129, + "loss": 1.1015, + "odds_ratio_loss": 0.1588580310344696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10856052488088608, + "rewards/margins": 0.19839301705360413, + "rewards/rejected": -0.3069535195827484, + "sft_loss": 1.085605263710022, + "step": 4905 + }, + { + "epoch": 0.38, + "grad_norm": 62.25426483154297, + "learning_rate": 6.882797877262663e-06, + "logits/chosen": -1.3177497386932373, + "logits/rejected": -1.1975306272506714, + "logps/chosen": -1.0411581993103027, + "logps/rejected": -1.4796054363250732, + "loss": 1.0933, + "odds_ratio_loss": 0.5210880637168884, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10411582142114639, + "rewards/margins": 0.043844711035490036, + "rewards/rejected": -0.14796052873134613, + "sft_loss": 1.0411581993103027, + "step": 4910 + }, + { + "epoch": 0.38, + "grad_norm": 8.837223052978516, + "learning_rate": 6.877092124080435e-06, + "logits/chosen": -1.432607650756836, + "logits/rejected": -1.0251507759094238, + "logps/chosen": -1.138588547706604, + "logps/rejected": -7.440249443054199, + "loss": 1.2108, + "odds_ratio_loss": 0.7222912311553955, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11385886371135712, + "rewards/margins": 0.6301661729812622, + "rewards/rejected": -0.744024932384491, + "sft_loss": 1.138588547706604, + "step": 4915 + }, + { + "epoch": 0.38, + "grad_norm": 10.964761734008789, + "learning_rate": 6.8713835240496776e-06, + "logits/chosen": -1.2825660705566406, + "logits/rejected": -0.9963976144790649, + "logps/chosen": -1.039621353149414, + "logps/rejected": -4.55959415435791, + "loss": 1.0755, + "odds_ratio_loss": 0.3583175241947174, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10396213829517365, + "rewards/margins": 0.3519973158836365, + "rewards/rejected": -0.45595940947532654, + "sft_loss": 1.039621353149414, + "step": 4920 + }, + { + "epoch": 0.38, + "grad_norm": 69.33670806884766, + "learning_rate": 6.865672085828205e-06, + "logits/chosen": -1.3993297815322876, + "logits/rejected": -1.4595104455947876, + "logps/chosen": -0.8613026738166809, + "logps/rejected": -3.6366093158721924, + "loss": 0.9189, + "odds_ratio_loss": 0.5756146907806396, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08613026142120361, + "rewards/margins": 0.2775306701660156, + "rewards/rejected": -0.36366090178489685, + "sft_loss": 0.8613026738166809, + "step": 4925 + }, + { + "epoch": 0.38, + "grad_norm": 13.625251770019531, + "learning_rate": 6.859957818078139e-06, + "logits/chosen": -1.4369404315948486, + "logits/rejected": -0.8882350921630859, + "logps/chosen": -1.2669563293457031, + "logps/rejected": -8.768199920654297, + "loss": 1.3147, + "odds_ratio_loss": 0.47707176208496094, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1266956329345703, + "rewards/margins": 0.7501242756843567, + "rewards/rejected": -0.8768199682235718, + "sft_loss": 1.2669563293457031, + "step": 4930 + }, + { + "epoch": 0.38, + "grad_norm": 6.9091267585754395, + "learning_rate": 6.854240729465892e-06, + "logits/chosen": -1.4900411367416382, + "logits/rejected": -1.119840383529663, + "logps/chosen": -1.02559494972229, + "logps/rejected": -6.696244239807129, + "loss": 1.0567, + "odds_ratio_loss": 0.31142452359199524, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10255949199199677, + "rewards/margins": 0.5670649409294128, + "rewards/rejected": -0.6696244478225708, + "sft_loss": 1.02559494972229, + "step": 4935 + }, + { + "epoch": 0.38, + "grad_norm": 7.601396083831787, + "learning_rate": 6.848520828662155e-06, + "logits/chosen": -1.3692182302474976, + "logits/rejected": -1.2401628494262695, + "logps/chosen": -1.0997951030731201, + "logps/rejected": -7.7627716064453125, + "loss": 1.1207, + "odds_ratio_loss": 0.2095102071762085, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10997951030731201, + "rewards/margins": 0.6662976145744324, + "rewards/rejected": -0.7762770652770996, + "sft_loss": 1.0997951030731201, + "step": 4940 + }, + { + "epoch": 0.38, + "grad_norm": 8.860038757324219, + "learning_rate": 6.8427981243418866e-06, + "logits/chosen": -1.3120964765548706, + "logits/rejected": -1.2637927532196045, + "logps/chosen": -0.7979799509048462, + "logps/rejected": -8.659887313842773, + "loss": 0.7987, + "odds_ratio_loss": 0.006944218184798956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07979799807071686, + "rewards/margins": 0.7861906886100769, + "rewards/rejected": -0.8659887313842773, + "sft_loss": 0.7979799509048462, + "step": 4945 + }, + { + "epoch": 0.39, + "grad_norm": 18.52011489868164, + "learning_rate": 6.83707262518429e-06, + "logits/chosen": -1.3004642724990845, + "logits/rejected": -0.9895919561386108, + "logps/chosen": -0.9648422002792358, + "logps/rejected": -2.168957471847534, + "loss": 1.0539, + "odds_ratio_loss": 0.8904326558113098, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0964842289686203, + "rewards/margins": 0.12041151523590088, + "rewards/rejected": -0.21689574420452118, + "sft_loss": 0.9648422002792358, + "step": 4950 + }, + { + "epoch": 0.39, + "grad_norm": 3.977808952331543, + "learning_rate": 6.831344339872813e-06, + "logits/chosen": -1.1475770473480225, + "logits/rejected": -0.7214670181274414, + "logps/chosen": -0.7406553030014038, + "logps/rejected": -13.014932632446289, + "loss": 0.7411, + "odds_ratio_loss": 0.004104848951101303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07406553626060486, + "rewards/margins": 1.2274277210235596, + "rewards/rejected": -1.3014931678771973, + "sft_loss": 0.7406553030014038, + "step": 4955 + }, + { + "epoch": 0.39, + "grad_norm": 9.07308578491211, + "learning_rate": 6.825613277095129e-06, + "logits/chosen": -1.2775825262069702, + "logits/rejected": -1.0573843717575073, + "logps/chosen": -0.9088460803031921, + "logps/rejected": -2.937030792236328, + "loss": 0.9436, + "odds_ratio_loss": 0.3471711277961731, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09088461101055145, + "rewards/margins": 0.20281848311424255, + "rewards/rejected": -0.2937030792236328, + "sft_loss": 0.9088460803031921, + "step": 4960 + }, + { + "epoch": 0.39, + "grad_norm": 7.698314189910889, + "learning_rate": 6.8198794455431205e-06, + "logits/chosen": -1.3673810958862305, + "logits/rejected": -0.930639386177063, + "logps/chosen": -0.8960191011428833, + "logps/rejected": -10.090847969055176, + "loss": 0.9147, + "odds_ratio_loss": 0.1867930293083191, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08960190415382385, + "rewards/margins": 0.9194828867912292, + "rewards/rejected": -1.009084701538086, + "sft_loss": 0.8960191011428833, + "step": 4965 + }, + { + "epoch": 0.39, + "grad_norm": 12.36298656463623, + "learning_rate": 6.814142853912873e-06, + "logits/chosen": -1.4559195041656494, + "logits/rejected": -1.2453267574310303, + "logps/chosen": -1.1622923612594604, + "logps/rejected": -2.9156875610351562, + "loss": 1.2222, + "odds_ratio_loss": 0.5988325476646423, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11622923612594604, + "rewards/margins": 0.17533953487873077, + "rewards/rejected": -0.2915687561035156, + "sft_loss": 1.1622923612594604, + "step": 4970 + }, + { + "epoch": 0.39, + "grad_norm": 9.976313591003418, + "learning_rate": 6.808403510904653e-06, + "logits/chosen": -1.503354787826538, + "logits/rejected": -1.2309849262237549, + "logps/chosen": -0.7218309640884399, + "logps/rejected": -3.757450580596924, + "loss": 0.7453, + "odds_ratio_loss": 0.23489134013652802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07218309491872787, + "rewards/margins": 0.3035619854927063, + "rewards/rejected": -0.3757449984550476, + "sft_loss": 0.7218309640884399, + "step": 4975 + }, + { + "epoch": 0.39, + "grad_norm": 591.4327392578125, + "learning_rate": 6.802661425222907e-06, + "logits/chosen": -1.31455659866333, + "logits/rejected": -0.5907676815986633, + "logps/chosen": -1.8325262069702148, + "logps/rejected": -3.8407177925109863, + "loss": 1.8731, + "odds_ratio_loss": 0.40590929985046387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18325261771678925, + "rewards/margins": 0.20081916451454163, + "rewards/rejected": -0.3840717673301697, + "sft_loss": 1.8325262069702148, + "step": 4980 + }, + { + "epoch": 0.39, + "grad_norm": 5.556074619293213, + "learning_rate": 6.796916605576235e-06, + "logits/chosen": -1.4891436100006104, + "logits/rejected": -1.0084350109100342, + "logps/chosen": -0.8972042798995972, + "logps/rejected": -3.8865675926208496, + "loss": 0.9218, + "odds_ratio_loss": 0.2455102503299713, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08972042798995972, + "rewards/margins": 0.29893630743026733, + "rewards/rejected": -0.38865676522254944, + "sft_loss": 0.8972042798995972, + "step": 4985 + }, + { + "epoch": 0.39, + "grad_norm": 11.468494415283203, + "learning_rate": 6.7911690606773836e-06, + "logits/chosen": -1.3948386907577515, + "logits/rejected": -1.2727683782577515, + "logps/chosen": -0.6499780416488647, + "logps/rejected": -6.216355800628662, + "loss": 0.6515, + "odds_ratio_loss": 0.015562218613922596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06499779969453812, + "rewards/margins": 0.5566378831863403, + "rewards/rejected": -0.6216356158256531, + "sft_loss": 0.6499780416488647, + "step": 4990 + }, + { + "epoch": 0.39, + "grad_norm": 61.68426513671875, + "learning_rate": 6.785418799243238e-06, + "logits/chosen": -1.1920238733291626, + "logits/rejected": -1.3612303733825684, + "logps/chosen": -0.7148585319519043, + "logps/rejected": -4.0645341873168945, + "loss": 0.7558, + "odds_ratio_loss": 0.40968450903892517, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07148585468530655, + "rewards/margins": 0.33496764302253723, + "rewards/rejected": -0.4064534604549408, + "sft_loss": 0.7148585319519043, + "step": 4995 + }, + { + "epoch": 0.39, + "grad_norm": 32.57492446899414, + "learning_rate": 6.7796658299947946e-06, + "logits/chosen": -1.2153874635696411, + "logits/rejected": -1.1897119283676147, + "logps/chosen": -0.749336838722229, + "logps/rejected": -2.0858726501464844, + "loss": 0.7786, + "odds_ratio_loss": 0.2922513484954834, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07493368536233902, + "rewards/margins": 0.13365359604358673, + "rewards/rejected": -0.20858728885650635, + "sft_loss": 0.749336838722229, + "step": 5000 + }, + { + "epoch": 0.39, + "grad_norm": 24.377872467041016, + "learning_rate": 6.7739101616571675e-06, + "logits/chosen": -1.4823286533355713, + "logits/rejected": -1.0391753911972046, + "logps/chosen": -0.9777601957321167, + "logps/rejected": -2.9456517696380615, + "loss": 0.9917, + "odds_ratio_loss": 0.1392892450094223, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09777601808309555, + "rewards/margins": 0.19678914546966553, + "rewards/rejected": -0.2945651412010193, + "sft_loss": 0.9777601957321167, + "step": 5005 + }, + { + "epoch": 0.39, + "grad_norm": 44.49354553222656, + "learning_rate": 6.768151802959556e-06, + "logits/chosen": -1.5044395923614502, + "logits/rejected": -1.249171495437622, + "logps/chosen": -0.8982955813407898, + "logps/rejected": -2.9409663677215576, + "loss": 0.9328, + "odds_ratio_loss": 0.34488964080810547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08982956409454346, + "rewards/margins": 0.20426709949970245, + "rewards/rejected": -0.2940966486930847, + "sft_loss": 0.8982955813407898, + "step": 5010 + }, + { + "epoch": 0.39, + "grad_norm": 7.909432411193848, + "learning_rate": 6.76239076263524e-06, + "logits/chosen": -1.410831093788147, + "logits/rejected": -0.765641450881958, + "logps/chosen": -1.078161358833313, + "logps/rejected": -6.150436878204346, + "loss": 1.0887, + "odds_ratio_loss": 0.1049346923828125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10781614482402802, + "rewards/margins": 0.5072275400161743, + "rewards/rejected": -0.6150436997413635, + "sft_loss": 1.078161358833313, + "step": 5015 + }, + { + "epoch": 0.39, + "grad_norm": 27.489490509033203, + "learning_rate": 6.756627049421572e-06, + "logits/chosen": -1.393526315689087, + "logits/rejected": -1.0082769393920898, + "logps/chosen": -1.1783392429351807, + "logps/rejected": -6.130882263183594, + "loss": 1.196, + "odds_ratio_loss": 0.1762438714504242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11783391237258911, + "rewards/margins": 0.49525442719459534, + "rewards/rejected": -0.6130883097648621, + "sft_loss": 1.1783392429351807, + "step": 5020 + }, + { + "epoch": 0.39, + "grad_norm": 8.808501243591309, + "learning_rate": 6.7508606720599535e-06, + "logits/chosen": -1.321639895439148, + "logits/rejected": -0.853980541229248, + "logps/chosen": -0.7712258100509644, + "logps/rejected": -4.877452373504639, + "loss": 0.7797, + "odds_ratio_loss": 0.08445506542921066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07712258398532867, + "rewards/margins": 0.4106226861476898, + "rewards/rejected": -0.4877452254295349, + "sft_loss": 0.7712258100509644, + "step": 5025 + }, + { + "epoch": 0.39, + "grad_norm": 6.824776649475098, + "learning_rate": 6.745091639295827e-06, + "logits/chosen": -1.277091383934021, + "logits/rejected": -0.9593345522880554, + "logps/chosen": -0.764047384262085, + "logps/rejected": -2.850830554962158, + "loss": 0.7783, + "odds_ratio_loss": 0.1427466869354248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07640473544597626, + "rewards/margins": 0.20867832005023956, + "rewards/rejected": -0.28508302569389343, + "sft_loss": 0.764047384262085, + "step": 5030 + }, + { + "epoch": 0.39, + "grad_norm": 4.839137554168701, + "learning_rate": 6.7393199598786655e-06, + "logits/chosen": -1.359311819076538, + "logits/rejected": -0.8653494119644165, + "logps/chosen": -1.2023526430130005, + "logps/rejected": -4.120873928070068, + "loss": 1.2166, + "odds_ratio_loss": 0.14224644005298615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12023527920246124, + "rewards/margins": 0.2918521761894226, + "rewards/rejected": -0.41208744049072266, + "sft_loss": 1.2023526430130005, + "step": 5035 + }, + { + "epoch": 0.39, + "grad_norm": 5.2521586418151855, + "learning_rate": 6.7335456425619515e-06, + "logits/chosen": -1.3867168426513672, + "logits/rejected": -0.7938799262046814, + "logps/chosen": -0.961976170539856, + "logps/rejected": -5.642143726348877, + "loss": 0.9809, + "odds_ratio_loss": 0.189554363489151, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09619762003421783, + "rewards/margins": 0.4680168032646179, + "rewards/rejected": -0.5642144083976746, + "sft_loss": 0.961976170539856, + "step": 5040 + }, + { + "epoch": 0.39, + "grad_norm": 13.384309768676758, + "learning_rate": 6.72776869610317e-06, + "logits/chosen": -1.4045675992965698, + "logits/rejected": -0.8836909532546997, + "logps/chosen": -0.9754483103752136, + "logps/rejected": -3.191079616546631, + "loss": 1.0381, + "odds_ratio_loss": 0.6266669631004333, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0975448340177536, + "rewards/margins": 0.22156314551830292, + "rewards/rejected": -0.3191079795360565, + "sft_loss": 0.9754483103752136, + "step": 5045 + }, + { + "epoch": 0.39, + "grad_norm": 25.794885635375977, + "learning_rate": 6.721989129263797e-06, + "logits/chosen": -1.2337299585342407, + "logits/rejected": -0.8713550567626953, + "logps/chosen": -1.0439655780792236, + "logps/rejected": -7.396246910095215, + "loss": 1.055, + "odds_ratio_loss": 0.11006517708301544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10439654439687729, + "rewards/margins": 0.635228157043457, + "rewards/rejected": -0.7396246790885925, + "sft_loss": 1.0439655780792236, + "step": 5050 + }, + { + "epoch": 0.39, + "grad_norm": 20.35993766784668, + "learning_rate": 6.716206950809274e-06, + "logits/chosen": -1.396535873413086, + "logits/rejected": -0.8851677775382996, + "logps/chosen": -1.0421764850616455, + "logps/rejected": -3.7658889293670654, + "loss": 1.0814, + "odds_ratio_loss": 0.3919692039489746, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10421766340732574, + "rewards/margins": 0.27237120270729065, + "rewards/rejected": -0.3765888512134552, + "sft_loss": 1.0421764850616455, + "step": 5055 + }, + { + "epoch": 0.39, + "grad_norm": 41.810333251953125, + "learning_rate": 6.710422169509015e-06, + "logits/chosen": -1.5096509456634521, + "logits/rejected": -1.0556962490081787, + "logps/chosen": -1.108420968055725, + "logps/rejected": -3.083578586578369, + "loss": 1.1268, + "odds_ratio_loss": 0.1836351901292801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11084209382534027, + "rewards/margins": 0.19751577079296112, + "rewards/rejected": -0.308357834815979, + "sft_loss": 1.108420968055725, + "step": 5060 + }, + { + "epoch": 0.39, + "grad_norm": 23.982271194458008, + "learning_rate": 6.7046347941363706e-06, + "logits/chosen": -1.4725430011749268, + "logits/rejected": -0.8947264552116394, + "logps/chosen": -0.822553277015686, + "logps/rejected": -4.3091607093811035, + "loss": 0.8316, + "odds_ratio_loss": 0.09055305272340775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08225533366203308, + "rewards/margins": 0.34866076707839966, + "rewards/rejected": -0.43091607093811035, + "sft_loss": 0.822553277015686, + "step": 5065 + }, + { + "epoch": 0.39, + "grad_norm": 15.915932655334473, + "learning_rate": 6.698844833468633e-06, + "logits/chosen": -1.500983476638794, + "logits/rejected": -1.2363684177398682, + "logps/chosen": -0.8845119476318359, + "logps/rejected": -3.9846088886260986, + "loss": 0.9087, + "odds_ratio_loss": 0.24226252734661102, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08845119178295135, + "rewards/margins": 0.3100096583366394, + "rewards/rejected": -0.39846086502075195, + "sft_loss": 0.8845119476318359, + "step": 5070 + }, + { + "epoch": 0.39, + "grad_norm": 5.520943641662598, + "learning_rate": 6.693052296287011e-06, + "logits/chosen": -1.409310221672058, + "logits/rejected": -0.7601202726364136, + "logps/chosen": -0.9198876619338989, + "logps/rejected": -2.2644782066345215, + "loss": 0.9456, + "odds_ratio_loss": 0.25737714767456055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09198875725269318, + "rewards/margins": 0.13445906341075897, + "rewards/rejected": -0.22644782066345215, + "sft_loss": 0.9198876619338989, + "step": 5075 + }, + { + "epoch": 0.4, + "grad_norm": 48.577247619628906, + "learning_rate": 6.687257191376624e-06, + "logits/chosen": -1.348859429359436, + "logits/rejected": -0.6082226037979126, + "logps/chosen": -1.1002042293548584, + "logps/rejected": -1.5409919023513794, + "loss": 1.1681, + "odds_ratio_loss": 0.6787872314453125, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11002041399478912, + "rewards/margins": 0.0440787747502327, + "rewards/rejected": -0.15409919619560242, + "sft_loss": 1.1002042293548584, + "step": 5080 + }, + { + "epoch": 0.4, + "grad_norm": 4.986753463745117, + "learning_rate": 6.681459527526484e-06, + "logits/chosen": -1.1812330484390259, + "logits/rejected": -0.9130045175552368, + "logps/chosen": -0.7792029976844788, + "logps/rejected": -2.0369057655334473, + "loss": 0.8078, + "odds_ratio_loss": 0.28569847345352173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07792030274868011, + "rewards/margins": 0.12577028572559357, + "rewards/rejected": -0.2036905735731125, + "sft_loss": 0.7792029976844788, + "step": 5085 + }, + { + "epoch": 0.4, + "grad_norm": 7.481835842132568, + "learning_rate": 6.675659313529482e-06, + "logits/chosen": -1.4299769401550293, + "logits/rejected": -1.0885009765625, + "logps/chosen": -1.0667505264282227, + "logps/rejected": -4.983901500701904, + "loss": 1.0969, + "odds_ratio_loss": 0.30162471532821655, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10667506605386734, + "rewards/margins": 0.39171507954597473, + "rewards/rejected": -0.4983901381492615, + "sft_loss": 1.0667505264282227, + "step": 5090 + }, + { + "epoch": 0.4, + "grad_norm": 6.409983158111572, + "learning_rate": 6.669856558182384e-06, + "logits/chosen": -1.3709847927093506, + "logits/rejected": -0.7447436451911926, + "logps/chosen": -0.974819004535675, + "logps/rejected": -3.636500597000122, + "loss": 0.9995, + "odds_ratio_loss": 0.2470639944076538, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09748189151287079, + "rewards/margins": 0.26616817712783813, + "rewards/rejected": -0.36365005373954773, + "sft_loss": 0.974819004535675, + "step": 5095 + }, + { + "epoch": 0.4, + "grad_norm": 6.735576629638672, + "learning_rate": 6.664051270285801e-06, + "logits/chosen": -1.391518473625183, + "logits/rejected": -0.8891741633415222, + "logps/chosen": -1.0563715696334839, + "logps/rejected": -3.8365464210510254, + "loss": 1.0792, + "odds_ratio_loss": 0.22799015045166016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10563715547323227, + "rewards/margins": 0.27801746129989624, + "rewards/rejected": -0.3836546242237091, + "sft_loss": 1.0563715696334839, + "step": 5100 + }, + { + "epoch": 0.4, + "grad_norm": 11.849318504333496, + "learning_rate": 6.658243458644189e-06, + "logits/chosen": -1.4003181457519531, + "logits/rejected": -1.0954400300979614, + "logps/chosen": -0.8720696568489075, + "logps/rejected": -2.591235399246216, + "loss": 0.89, + "odds_ratio_loss": 0.17881350219249725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08720696717500687, + "rewards/margins": 0.17191657423973083, + "rewards/rejected": -0.2591235339641571, + "sft_loss": 0.8720696568489075, + "step": 5105 + }, + { + "epoch": 0.4, + "grad_norm": 23.594053268432617, + "learning_rate": 6.652433132065834e-06, + "logits/chosen": -1.3882420063018799, + "logits/rejected": -0.9964066743850708, + "logps/chosen": -1.1310899257659912, + "logps/rejected": -11.7948637008667, + "loss": 1.1367, + "odds_ratio_loss": 0.05562058836221695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11310900747776031, + "rewards/margins": 1.0663774013519287, + "rewards/rejected": -1.1794865131378174, + "sft_loss": 1.1310899257659912, + "step": 5110 + }, + { + "epoch": 0.4, + "grad_norm": 59.16744613647461, + "learning_rate": 6.646620299362833e-06, + "logits/chosen": -1.3670847415924072, + "logits/rejected": -1.0468709468841553, + "logps/chosen": -0.8871728181838989, + "logps/rejected": -5.707052230834961, + "loss": 0.942, + "odds_ratio_loss": 0.5480446815490723, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08871728926897049, + "rewards/margins": 0.4819878935813904, + "rewards/rejected": -0.5707052946090698, + "sft_loss": 0.8871728181838989, + "step": 5115 + }, + { + "epoch": 0.4, + "grad_norm": 8.881211280822754, + "learning_rate": 6.640804969351086e-06, + "logits/chosen": -1.3376655578613281, + "logits/rejected": -1.367163896560669, + "logps/chosen": -1.4067643880844116, + "logps/rejected": -4.877480506896973, + "loss": 1.4234, + "odds_ratio_loss": 0.16628073155879974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14067645370960236, + "rewards/margins": 0.3470715880393982, + "rewards/rejected": -0.48774799704551697, + "sft_loss": 1.4067643880844116, + "step": 5120 + }, + { + "epoch": 0.4, + "grad_norm": 1220.0985107421875, + "learning_rate": 6.63498715085028e-06, + "logits/chosen": -1.1132049560546875, + "logits/rejected": -1.5372873544692993, + "logps/chosen": -2.2209229469299316, + "logps/rejected": -6.843510627746582, + "loss": 2.2332, + "odds_ratio_loss": 0.12279321998357773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22209230065345764, + "rewards/margins": 0.46225887537002563, + "rewards/rejected": -0.6843510866165161, + "sft_loss": 2.2209229469299316, + "step": 5125 + }, + { + "epoch": 0.4, + "grad_norm": 9.45090103149414, + "learning_rate": 6.62916685268387e-06, + "logits/chosen": -1.3377411365509033, + "logits/rejected": -0.7452336549758911, + "logps/chosen": -0.7855364680290222, + "logps/rejected": -2.1455330848693848, + "loss": 0.8336, + "odds_ratio_loss": 0.4805460572242737, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07855365425348282, + "rewards/margins": 0.1359996497631073, + "rewards/rejected": -0.21455331146717072, + "sft_loss": 0.7855364680290222, + "step": 5130 + }, + { + "epoch": 0.4, + "grad_norm": 10.302745819091797, + "learning_rate": 6.623344083679082e-06, + "logits/chosen": -1.4268461465835571, + "logits/rejected": -1.1838544607162476, + "logps/chosen": -1.1485592126846313, + "logps/rejected": -6.531126499176025, + "loss": 1.1592, + "odds_ratio_loss": 0.10628004372119904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11485592275857925, + "rewards/margins": 0.5382567644119263, + "rewards/rejected": -0.6531126499176025, + "sft_loss": 1.1485592126846313, + "step": 5135 + }, + { + "epoch": 0.4, + "grad_norm": 11.357346534729004, + "learning_rate": 6.617518852666883e-06, + "logits/chosen": -1.4212384223937988, + "logits/rejected": -0.9435787200927734, + "logps/chosen": -1.0578309297561646, + "logps/rejected": -3.755695343017578, + "loss": 1.0795, + "odds_ratio_loss": 0.21710722148418427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10578310489654541, + "rewards/margins": 0.26978641748428345, + "rewards/rejected": -0.37556952238082886, + "sft_loss": 1.0578309297561646, + "step": 5140 + }, + { + "epoch": 0.4, + "grad_norm": 18.677724838256836, + "learning_rate": 6.611691168481976e-06, + "logits/chosen": -1.4505398273468018, + "logits/rejected": -1.397452712059021, + "logps/chosen": -2.092315435409546, + "logps/rejected": -2.6640334129333496, + "loss": 2.2108, + "odds_ratio_loss": 1.1846152544021606, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20923154056072235, + "rewards/margins": 0.05717180296778679, + "rewards/rejected": -0.2664033770561218, + "sft_loss": 2.092315435409546, + "step": 5145 + }, + { + "epoch": 0.4, + "grad_norm": 7.536815643310547, + "learning_rate": 6.605861039962785e-06, + "logits/chosen": -1.3447933197021484, + "logits/rejected": -0.9513217210769653, + "logps/chosen": -1.1527773141860962, + "logps/rejected": -5.865206241607666, + "loss": 1.1605, + "odds_ratio_loss": 0.07718654721975327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1152777299284935, + "rewards/margins": 0.47124290466308594, + "rewards/rejected": -0.5865205526351929, + "sft_loss": 1.1527773141860962, + "step": 5150 + }, + { + "epoch": 0.4, + "grad_norm": 6.230055332183838, + "learning_rate": 6.600028475951438e-06, + "logits/chosen": -1.4554965496063232, + "logits/rejected": -1.0521572828292847, + "logps/chosen": -1.1424763202667236, + "logps/rejected": -6.107195854187012, + "loss": 1.1544, + "odds_ratio_loss": 0.11930395662784576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1142476350069046, + "rewards/margins": 0.49647197127342224, + "rewards/rejected": -0.610719621181488, + "sft_loss": 1.1424763202667236, + "step": 5155 + }, + { + "epoch": 0.4, + "grad_norm": 11.935591697692871, + "learning_rate": 6.594193485293758e-06, + "logits/chosen": -1.4703266620635986, + "logits/rejected": -1.0110032558441162, + "logps/chosen": -1.0646206140518188, + "logps/rejected": -4.367392539978027, + "loss": 1.0918, + "odds_ratio_loss": 0.2720407247543335, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10646207630634308, + "rewards/margins": 0.3302771747112274, + "rewards/rejected": -0.4367392957210541, + "sft_loss": 1.0646206140518188, + "step": 5160 + }, + { + "epoch": 0.4, + "grad_norm": 9.436556816101074, + "learning_rate": 6.5883560768392544e-06, + "logits/chosen": -1.4851438999176025, + "logits/rejected": -1.1477575302124023, + "logps/chosen": -0.8953951001167297, + "logps/rejected": -4.988603591918945, + "loss": 0.9186, + "odds_ratio_loss": 0.23208299279212952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08953951299190521, + "rewards/margins": 0.4093208312988281, + "rewards/rejected": -0.49886035919189453, + "sft_loss": 0.8953951001167297, + "step": 5165 + }, + { + "epoch": 0.4, + "grad_norm": 4.220683574676514, + "learning_rate": 6.5825162594410914e-06, + "logits/chosen": -1.4410722255706787, + "logits/rejected": -0.8721886873245239, + "logps/chosen": -0.7598224878311157, + "logps/rejected": -1.8489675521850586, + "loss": 0.7871, + "odds_ratio_loss": 0.27249783277511597, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07598225027322769, + "rewards/margins": 0.10891450941562653, + "rewards/rejected": -0.18489676713943481, + "sft_loss": 0.7598224878311157, + "step": 5170 + }, + { + "epoch": 0.4, + "grad_norm": 5.179640293121338, + "learning_rate": 6.576674041956099e-06, + "logits/chosen": -1.3173493146896362, + "logits/rejected": -0.913482666015625, + "logps/chosen": -1.0375252962112427, + "logps/rejected": -11.861112594604492, + "loss": 1.0394, + "odds_ratio_loss": 0.018778596073389053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10375253111124039, + "rewards/margins": 1.082358956336975, + "rewards/rejected": -1.186111330986023, + "sft_loss": 1.0375252962112427, + "step": 5175 + }, + { + "epoch": 0.4, + "grad_norm": 14.923868179321289, + "learning_rate": 6.5708294332447385e-06, + "logits/chosen": -1.3341882228851318, + "logits/rejected": -0.9410299062728882, + "logps/chosen": -0.9867004156112671, + "logps/rejected": -1.1775720119476318, + "loss": 1.0457, + "odds_ratio_loss": 0.5900734663009644, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09867005050182343, + "rewards/margins": 0.01908714324235916, + "rewards/rejected": -0.11775718629360199, + "sft_loss": 0.9867004156112671, + "step": 5180 + }, + { + "epoch": 0.4, + "grad_norm": 18.309810638427734, + "learning_rate": 6.564982442171103e-06, + "logits/chosen": -1.444608449935913, + "logits/rejected": -1.1630117893218994, + "logps/chosen": -0.8816467523574829, + "logps/rejected": -3.7716174125671387, + "loss": 0.901, + "odds_ratio_loss": 0.19364431500434875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08816467970609665, + "rewards/margins": 0.2889971137046814, + "rewards/rejected": -0.37716180086135864, + "sft_loss": 0.8816467523574829, + "step": 5185 + }, + { + "epoch": 0.4, + "grad_norm": 10.51563835144043, + "learning_rate": 6.559133077602895e-06, + "logits/chosen": -1.471680998802185, + "logits/rejected": -0.9202741384506226, + "logps/chosen": -1.4591476917266846, + "logps/rejected": -7.845271110534668, + "loss": 1.4825, + "odds_ratio_loss": 0.23371830582618713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14591476321220398, + "rewards/margins": 0.6386123895645142, + "rewards/rejected": -0.7845271229743958, + "sft_loss": 1.4591476917266846, + "step": 5190 + }, + { + "epoch": 0.4, + "grad_norm": 58.147037506103516, + "learning_rate": 6.55328134841142e-06, + "logits/chosen": -1.2235281467437744, + "logits/rejected": -0.6818448305130005, + "logps/chosen": -1.0554659366607666, + "logps/rejected": -10.09231948852539, + "loss": 1.0665, + "odds_ratio_loss": 0.11016272008419037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10554659366607666, + "rewards/margins": 0.903685450553894, + "rewards/rejected": -1.0092319250106812, + "sft_loss": 1.0554659366607666, + "step": 5195 + }, + { + "epoch": 0.4, + "grad_norm": 72.4473648071289, + "learning_rate": 6.5474272634715675e-06, + "logits/chosen": -1.3383867740631104, + "logits/rejected": -0.9712162017822266, + "logps/chosen": -1.0566487312316895, + "logps/rejected": -6.484910488128662, + "loss": 1.0775, + "odds_ratio_loss": 0.20842652022838593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10566486418247223, + "rewards/margins": 0.542826235294342, + "rewards/rejected": -0.6484910845756531, + "sft_loss": 1.0566487312316895, + "step": 5200 + }, + { + "epoch": 0.4, + "grad_norm": 41.163124084472656, + "learning_rate": 6.541570831661802e-06, + "logits/chosen": -1.4022243022918701, + "logits/rejected": -1.1713143587112427, + "logps/chosen": -1.5733802318572998, + "logps/rejected": -8.44092082977295, + "loss": 1.5746, + "odds_ratio_loss": 0.01258087158203125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15733802318572998, + "rewards/margins": 0.686754047870636, + "rewards/rejected": -0.844092071056366, + "sft_loss": 1.5733802318572998, + "step": 5205 + }, + { + "epoch": 0.41, + "grad_norm": 7.19493293762207, + "learning_rate": 6.535712061864144e-06, + "logits/chosen": -1.549918293952942, + "logits/rejected": -1.0348308086395264, + "logps/chosen": -1.1737009286880493, + "logps/rejected": -2.1659064292907715, + "loss": 1.2126, + "odds_ratio_loss": 0.3890572190284729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11737009137868881, + "rewards/margins": 0.09922054409980774, + "rewards/rejected": -0.21659064292907715, + "sft_loss": 1.1737009286880493, + "step": 5210 + }, + { + "epoch": 0.41, + "grad_norm": 23.68317222595215, + "learning_rate": 6.529850962964164e-06, + "logits/chosen": -1.4581935405731201, + "logits/rejected": -1.1173006296157837, + "logps/chosen": -0.9190993309020996, + "logps/rejected": -5.095643043518066, + "loss": 0.9294, + "odds_ratio_loss": 0.10286466777324677, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09190993010997772, + "rewards/margins": 0.417654424905777, + "rewards/rejected": -0.5095642805099487, + "sft_loss": 0.9190993309020996, + "step": 5215 + }, + { + "epoch": 0.41, + "grad_norm": 12.550178527832031, + "learning_rate": 6.523987543850959e-06, + "logits/chosen": -1.3836402893066406, + "logits/rejected": -1.049728274345398, + "logps/chosen": -0.9922205209732056, + "logps/rejected": -2.728982925415039, + "loss": 1.0135, + "odds_ratio_loss": 0.21254947781562805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09922204911708832, + "rewards/margins": 0.1736762523651123, + "rewards/rejected": -0.27289828658103943, + "sft_loss": 0.9922205209732056, + "step": 5220 + }, + { + "epoch": 0.41, + "grad_norm": 15.591679573059082, + "learning_rate": 6.518121813417151e-06, + "logits/chosen": -1.3791759014129639, + "logits/rejected": -0.9591207504272461, + "logps/chosen": -1.084330677986145, + "logps/rejected": -3.4170398712158203, + "loss": 1.1007, + "odds_ratio_loss": 0.16401781141757965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10843305289745331, + "rewards/margins": 0.23327095806598663, + "rewards/rejected": -0.34170401096343994, + "sft_loss": 1.084330677986145, + "step": 5225 + }, + { + "epoch": 0.41, + "grad_norm": 16.50823402404785, + "learning_rate": 6.5122537805588655e-06, + "logits/chosen": -1.3790065050125122, + "logits/rejected": -1.202798843383789, + "logps/chosen": -0.7999995946884155, + "logps/rejected": -6.863905906677246, + "loss": 0.8023, + "odds_ratio_loss": 0.02324349619448185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07999996095895767, + "rewards/margins": 0.6063905954360962, + "rewards/rejected": -0.6863905787467957, + "sft_loss": 0.7999995946884155, + "step": 5230 + }, + { + "epoch": 0.41, + "grad_norm": 4.774572372436523, + "learning_rate": 6.50638345417572e-06, + "logits/chosen": -1.3010079860687256, + "logits/rejected": -1.19615638256073, + "logps/chosen": -1.311397671699524, + "logps/rejected": -9.872981071472168, + "loss": 1.331, + "odds_ratio_loss": 0.1961272656917572, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13113977015018463, + "rewards/margins": 0.856158435344696, + "rewards/rejected": -0.9872981309890747, + "sft_loss": 1.311397671699524, + "step": 5235 + }, + { + "epoch": 0.41, + "grad_norm": 19.700647354125977, + "learning_rate": 6.500510843170808e-06, + "logits/chosen": -1.4291975498199463, + "logits/rejected": -1.0822474956512451, + "logps/chosen": -1.1227763891220093, + "logps/rejected": -15.763837814331055, + "loss": 1.1311, + "odds_ratio_loss": 0.08306220918893814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11227764934301376, + "rewards/margins": 1.4641063213348389, + "rewards/rejected": -1.5763839483261108, + "sft_loss": 1.1227763891220093, + "step": 5240 + }, + { + "epoch": 0.41, + "grad_norm": 17.024803161621094, + "learning_rate": 6.494635956450688e-06, + "logits/chosen": -1.4313437938690186, + "logits/rejected": -0.7469003796577454, + "logps/chosen": -0.8249877691268921, + "logps/rejected": -3.162346601486206, + "loss": 0.8581, + "odds_ratio_loss": 0.33130815625190735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08249877393245697, + "rewards/margins": 0.23373588919639587, + "rewards/rejected": -0.31623467803001404, + "sft_loss": 0.8249877691268921, + "step": 5245 + }, + { + "epoch": 0.41, + "grad_norm": 8.548990249633789, + "learning_rate": 6.488758802925373e-06, + "logits/chosen": -1.1915152072906494, + "logits/rejected": -1.0353416204452515, + "logps/chosen": -0.6875573992729187, + "logps/rejected": -3.8330788612365723, + "loss": 0.7024, + "odds_ratio_loss": 0.14798401296138763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06875574588775635, + "rewards/margins": 0.3145521581172943, + "rewards/rejected": -0.38330790400505066, + "sft_loss": 0.6875573992729187, + "step": 5250 + }, + { + "epoch": 0.41, + "grad_norm": 7.481503963470459, + "learning_rate": 6.482879391508317e-06, + "logits/chosen": -1.4916441440582275, + "logits/rejected": -1.2765228748321533, + "logps/chosen": -0.8017631769180298, + "logps/rejected": -6.369771957397461, + "loss": 0.8202, + "odds_ratio_loss": 0.18478266894817352, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08017632365226746, + "rewards/margins": 0.5568008422851562, + "rewards/rejected": -0.6369771957397461, + "sft_loss": 0.8017631769180298, + "step": 5255 + }, + { + "epoch": 0.41, + "grad_norm": 6.256443977355957, + "learning_rate": 6.476997731116386e-06, + "logits/chosen": -1.363231897354126, + "logits/rejected": -0.838513195514679, + "logps/chosen": -0.9848578572273254, + "logps/rejected": -4.066619396209717, + "loss": 1.0037, + "odds_ratio_loss": 0.1887107640504837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0984857901930809, + "rewards/margins": 0.308176189661026, + "rewards/rejected": -0.4066619873046875, + "sft_loss": 0.9848578572273254, + "step": 5260 + }, + { + "epoch": 0.41, + "grad_norm": 15.482699394226074, + "learning_rate": 6.471113830669872e-06, + "logits/chosen": -1.4585330486297607, + "logits/rejected": -1.3035043478012085, + "logps/chosen": -1.1082388162612915, + "logps/rejected": -6.114068508148193, + "loss": 1.161, + "odds_ratio_loss": 0.5274852514266968, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11082388460636139, + "rewards/margins": 0.5005830526351929, + "rewards/rejected": -0.6114069223403931, + "sft_loss": 1.1082388162612915, + "step": 5265 + }, + { + "epoch": 0.41, + "grad_norm": 13.043397903442383, + "learning_rate": 6.465227699092452e-06, + "logits/chosen": -1.4129165410995483, + "logits/rejected": -0.8737384080886841, + "logps/chosen": -1.0952141284942627, + "logps/rejected": -7.8089280128479, + "loss": 1.1194, + "odds_ratio_loss": 0.2414560317993164, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10952140390872955, + "rewards/margins": 0.671371340751648, + "rewards/rejected": -0.7808927893638611, + "sft_loss": 1.0952141284942627, + "step": 5270 + }, + { + "epoch": 0.41, + "grad_norm": 14.405864715576172, + "learning_rate": 6.459339345311194e-06, + "logits/chosen": -1.2031883001327515, + "logits/rejected": -1.09691321849823, + "logps/chosen": -1.3176788091659546, + "logps/rejected": -2.5202293395996094, + "loss": 1.3548, + "odds_ratio_loss": 0.37098708748817444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1317678838968277, + "rewards/margins": 0.12025503814220428, + "rewards/rejected": -0.252022922039032, + "sft_loss": 1.3176788091659546, + "step": 5275 + }, + { + "epoch": 0.41, + "grad_norm": 5.715144157409668, + "learning_rate": 6.4534487782565346e-06, + "logits/chosen": -1.4191725254058838, + "logits/rejected": -0.5255266427993774, + "logps/chosen": -0.8059912919998169, + "logps/rejected": -7.7237420082092285, + "loss": 0.831, + "odds_ratio_loss": 0.24965138733386993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08059912174940109, + "rewards/margins": 0.6917750835418701, + "rewards/rejected": -0.7723742127418518, + "sft_loss": 0.8059912919998169, + "step": 5280 + }, + { + "epoch": 0.41, + "grad_norm": 6.963302135467529, + "learning_rate": 6.447556006862266e-06, + "logits/chosen": -1.422668218612671, + "logits/rejected": -1.079911470413208, + "logps/chosen": -1.1761057376861572, + "logps/rejected": -5.600368022918701, + "loss": 1.2187, + "odds_ratio_loss": 0.4255724549293518, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11761058866977692, + "rewards/margins": 0.4424262046813965, + "rewards/rejected": -0.5600367784500122, + "sft_loss": 1.1761057376861572, + "step": 5285 + }, + { + "epoch": 0.41, + "grad_norm": 33.12194061279297, + "learning_rate": 6.441661040065523e-06, + "logits/chosen": -1.6109205484390259, + "logits/rejected": -1.3014240264892578, + "logps/chosen": -0.7997133135795593, + "logps/rejected": -3.3593928813934326, + "loss": 0.8455, + "odds_ratio_loss": 0.45820364356040955, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0799713283777237, + "rewards/margins": 0.2559679448604584, + "rewards/rejected": -0.33593928813934326, + "sft_loss": 0.7997133135795593, + "step": 5290 + }, + { + "epoch": 0.41, + "grad_norm": 515.0574340820312, + "learning_rate": 6.435763886806774e-06, + "logits/chosen": -1.4338130950927734, + "logits/rejected": -1.323038101196289, + "logps/chosen": -1.8791614770889282, + "logps/rejected": -4.376596450805664, + "loss": 1.9784, + "odds_ratio_loss": 0.992806613445282, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1879161298274994, + "rewards/margins": 0.2497435063123703, + "rewards/rejected": -0.4376596510410309, + "sft_loss": 1.8791614770889282, + "step": 5295 + }, + { + "epoch": 0.41, + "grad_norm": 39.97163391113281, + "learning_rate": 6.4298645560297976e-06, + "logits/chosen": -1.5293049812316895, + "logits/rejected": -1.3366972208023071, + "logps/chosen": -0.8182582855224609, + "logps/rejected": -1.072858452796936, + "loss": 0.9715, + "odds_ratio_loss": 1.532416582107544, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08182583749294281, + "rewards/margins": 0.025460004806518555, + "rewards/rejected": -0.10728584229946136, + "sft_loss": 0.8182582855224609, + "step": 5300 + }, + { + "epoch": 0.41, + "grad_norm": 9.11904239654541, + "learning_rate": 6.42396305668168e-06, + "logits/chosen": -1.2009570598602295, + "logits/rejected": -1.1244704723358154, + "logps/chosen": -1.0162278413772583, + "logps/rejected": -5.84080696105957, + "loss": 1.021, + "odds_ratio_loss": 0.04781056195497513, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10162278264760971, + "rewards/margins": 0.48245781660079956, + "rewards/rejected": -0.5840806365013123, + "sft_loss": 1.0162278413772583, + "step": 5305 + }, + { + "epoch": 0.41, + "grad_norm": 10.741803169250488, + "learning_rate": 6.418059397712792e-06, + "logits/chosen": -1.3627127408981323, + "logits/rejected": -0.805992603302002, + "logps/chosen": -0.977192223072052, + "logps/rejected": -3.0407779216766357, + "loss": 0.9869, + "odds_ratio_loss": 0.09715723246335983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0977192148566246, + "rewards/margins": 0.20635858178138733, + "rewards/rejected": -0.30407780408859253, + "sft_loss": 0.977192223072052, + "step": 5310 + }, + { + "epoch": 0.41, + "grad_norm": 13.376646041870117, + "learning_rate": 6.412153588076785e-06, + "logits/chosen": -1.2463794946670532, + "logits/rejected": -0.7775799036026001, + "logps/chosen": -0.7438236474990845, + "logps/rejected": -6.562718868255615, + "loss": 0.7586, + "odds_ratio_loss": 0.14801888167858124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07438236474990845, + "rewards/margins": 0.5818895697593689, + "rewards/rejected": -0.6562718749046326, + "sft_loss": 0.7438236474990845, + "step": 5315 + }, + { + "epoch": 0.41, + "grad_norm": 11.7168607711792, + "learning_rate": 6.406245636730568e-06, + "logits/chosen": -1.44731867313385, + "logits/rejected": -0.870677649974823, + "logps/chosen": -1.2856199741363525, + "logps/rejected": -4.146407127380371, + "loss": 1.311, + "odds_ratio_loss": 0.25399282574653625, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12856200337409973, + "rewards/margins": 0.2860787510871887, + "rewards/rejected": -0.41464075446128845, + "sft_loss": 1.2856199741363525, + "step": 5320 + }, + { + "epoch": 0.41, + "grad_norm": 7.065237045288086, + "learning_rate": 6.4003355526342995e-06, + "logits/chosen": -1.4006376266479492, + "logits/rejected": -1.0352108478546143, + "logps/chosen": -1.1861366033554077, + "logps/rejected": -3.761228084564209, + "loss": 1.2033, + "odds_ratio_loss": 0.17140784859657288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11861366033554077, + "rewards/margins": 0.2575092017650604, + "rewards/rejected": -0.3761228621006012, + "sft_loss": 1.1861366033554077, + "step": 5325 + }, + { + "epoch": 0.41, + "grad_norm": 13.023364067077637, + "learning_rate": 6.39442334475137e-06, + "logits/chosen": -1.2521476745605469, + "logits/rejected": -1.1121388673782349, + "logps/chosen": -1.2779855728149414, + "logps/rejected": -4.648769855499268, + "loss": 1.3339, + "odds_ratio_loss": 0.5596238970756531, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12779855728149414, + "rewards/margins": 0.33707842230796814, + "rewards/rejected": -0.4648769795894623, + "sft_loss": 1.2779855728149414, + "step": 5330 + }, + { + "epoch": 0.42, + "grad_norm": 26.406526565551758, + "learning_rate": 6.388509022048396e-06, + "logits/chosen": -1.4632203578948975, + "logits/rejected": -1.218110203742981, + "logps/chosen": -0.8516885638237, + "logps/rejected": -5.130523681640625, + "loss": 0.886, + "odds_ratio_loss": 0.34357717633247375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08516885340213776, + "rewards/margins": 0.4278835356235504, + "rewards/rejected": -0.5130523443222046, + "sft_loss": 0.8516885638237, + "step": 5335 + }, + { + "epoch": 0.42, + "grad_norm": 183.27879333496094, + "learning_rate": 6.3825925934951986e-06, + "logits/chosen": -1.382738709449768, + "logits/rejected": -0.9956004023551941, + "logps/chosen": -1.4411137104034424, + "logps/rejected": -5.785558223724365, + "loss": 1.4701, + "odds_ratio_loss": 0.29017865657806396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14411136507987976, + "rewards/margins": 0.43444448709487915, + "rewards/rejected": -0.5785558819770813, + "sft_loss": 1.4411137104034424, + "step": 5340 + }, + { + "epoch": 0.42, + "grad_norm": 8.949464797973633, + "learning_rate": 6.376674068064792e-06, + "logits/chosen": -1.3090190887451172, + "logits/rejected": -1.3267171382904053, + "logps/chosen": -0.7871259450912476, + "logps/rejected": -3.755983352661133, + "loss": 0.8014, + "odds_ratio_loss": 0.14319057762622833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0787125900387764, + "rewards/margins": 0.29688572883605957, + "rewards/rejected": -0.37559834122657776, + "sft_loss": 0.7871259450912476, + "step": 5345 + }, + { + "epoch": 0.42, + "grad_norm": 15.70522689819336, + "learning_rate": 6.370753454733371e-06, + "logits/chosen": -1.4009630680084229, + "logits/rejected": -1.197137713432312, + "logps/chosen": -0.9578830599784851, + "logps/rejected": -4.780231475830078, + "loss": 0.9862, + "odds_ratio_loss": 0.2829952836036682, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09578831493854523, + "rewards/margins": 0.3822348415851593, + "rewards/rejected": -0.47802314162254333, + "sft_loss": 0.9578830599784851, + "step": 5350 + }, + { + "epoch": 0.42, + "grad_norm": 23.035987854003906, + "learning_rate": 6.3648307624803e-06, + "logits/chosen": -1.314186692237854, + "logits/rejected": -1.559711217880249, + "logps/chosen": -0.8188357353210449, + "logps/rejected": -9.391572952270508, + "loss": 0.8198, + "odds_ratio_loss": 0.010050063952803612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08188357204198837, + "rewards/margins": 0.8572737574577332, + "rewards/rejected": -0.9391573071479797, + "sft_loss": 0.8188357353210449, + "step": 5355 + }, + { + "epoch": 0.42, + "grad_norm": 24.58151626586914, + "learning_rate": 6.358906000288091e-06, + "logits/chosen": -1.376570463180542, + "logits/rejected": -1.3238173723220825, + "logps/chosen": -0.7370100021362305, + "logps/rejected": -5.758879661560059, + "loss": 0.7583, + "odds_ratio_loss": 0.21306517720222473, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07370099425315857, + "rewards/margins": 0.5021870136260986, + "rewards/rejected": -0.5758880376815796, + "sft_loss": 0.7370100021362305, + "step": 5360 + }, + { + "epoch": 0.42, + "grad_norm": 12.155942916870117, + "learning_rate": 6.352979177142399e-06, + "logits/chosen": -1.461085557937622, + "logits/rejected": -0.873151957988739, + "logps/chosen": -0.9852715730667114, + "logps/rejected": -9.310620307922363, + "loss": 0.9927, + "odds_ratio_loss": 0.07405222952365875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09852716326713562, + "rewards/margins": 0.832534909248352, + "rewards/rejected": -0.9310620427131653, + "sft_loss": 0.9852715730667114, + "step": 5365 + }, + { + "epoch": 0.42, + "grad_norm": 10.086821556091309, + "learning_rate": 6.347050302032005e-06, + "logits/chosen": -1.2039637565612793, + "logits/rejected": -1.0078706741333008, + "logps/chosen": -0.9459770917892456, + "logps/rejected": -4.682739734649658, + "loss": 0.9889, + "odds_ratio_loss": 0.42916393280029297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0945977121591568, + "rewards/margins": 0.37367627024650574, + "rewards/rejected": -0.4682740271091461, + "sft_loss": 0.9459770917892456, + "step": 5370 + }, + { + "epoch": 0.42, + "grad_norm": 13.609196662902832, + "learning_rate": 6.341119383948799e-06, + "logits/chosen": -1.5448615550994873, + "logits/rejected": -1.2496637105941772, + "logps/chosen": -0.7335996627807617, + "logps/rejected": -1.3010228872299194, + "loss": 0.7845, + "odds_ratio_loss": 0.5093629956245422, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07335996627807617, + "rewards/margins": 0.05674232169985771, + "rewards/rejected": -0.13010229170322418, + "sft_loss": 0.7335996627807617, + "step": 5375 + }, + { + "epoch": 0.42, + "grad_norm": 37.54430389404297, + "learning_rate": 6.335186431887772e-06, + "logits/chosen": -1.4095587730407715, + "logits/rejected": -1.1966378688812256, + "logps/chosen": -1.336415410041809, + "logps/rejected": -1.9364235401153564, + "loss": 1.4304, + "odds_ratio_loss": 0.9395570755004883, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1336415410041809, + "rewards/margins": 0.060000818222761154, + "rewards/rejected": -0.19364234805107117, + "sft_loss": 1.336415410041809, + "step": 5380 + }, + { + "epoch": 0.42, + "grad_norm": 8.126177787780762, + "learning_rate": 6.329251454847e-06, + "logits/chosen": -1.42582368850708, + "logits/rejected": -0.7910014986991882, + "logps/chosen": -0.8624337911605835, + "logps/rejected": -1.4641921520233154, + "loss": 0.9135, + "odds_ratio_loss": 0.5102102756500244, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08624337613582611, + "rewards/margins": 0.06017584353685379, + "rewards/rejected": -0.1464192271232605, + "sft_loss": 0.8624337911605835, + "step": 5385 + }, + { + "epoch": 0.42, + "grad_norm": 5.362218856811523, + "learning_rate": 6.3233144618276265e-06, + "logits/chosen": -1.4800853729248047, + "logits/rejected": -0.9077849388122559, + "logps/chosen": -0.9694632291793823, + "logps/rejected": -10.608955383300781, + "loss": 0.9849, + "odds_ratio_loss": 0.15407711267471313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09694632887840271, + "rewards/margins": 0.9639492034912109, + "rewards/rejected": -1.060895562171936, + "sft_loss": 0.9694632291793823, + "step": 5390 + }, + { + "epoch": 0.42, + "grad_norm": 5.7052083015441895, + "learning_rate": 6.317375461833859e-06, + "logits/chosen": -1.4369672536849976, + "logits/rejected": -1.213986873626709, + "logps/chosen": -0.9510200619697571, + "logps/rejected": -4.338289260864258, + "loss": 0.9789, + "odds_ratio_loss": 0.278502881526947, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09510200470685959, + "rewards/margins": 0.3387269377708435, + "rewards/rejected": -0.4338289797306061, + "sft_loss": 0.9510200619697571, + "step": 5395 + }, + { + "epoch": 0.42, + "grad_norm": 6.42952823638916, + "learning_rate": 6.311434463872941e-06, + "logits/chosen": -1.4391467571258545, + "logits/rejected": -0.944113552570343, + "logps/chosen": -0.8725773692131042, + "logps/rejected": -2.7120556831359863, + "loss": 0.9056, + "odds_ratio_loss": 0.3303033709526062, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0872577428817749, + "rewards/margins": 0.18394781649112701, + "rewards/rejected": -0.2712055742740631, + "sft_loss": 0.8725773692131042, + "step": 5400 + }, + { + "epoch": 0.42, + "grad_norm": 8.709639549255371, + "learning_rate": 6.305491476955154e-06, + "logits/chosen": -1.294272541999817, + "logits/rejected": -1.2669130563735962, + "logps/chosen": -1.11087965965271, + "logps/rejected": -3.0580954551696777, + "loss": 1.1525, + "odds_ratio_loss": 0.41599932312965393, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11108797788619995, + "rewards/margins": 0.19472156465053558, + "rewards/rejected": -0.30580952763557434, + "sft_loss": 1.11087965965271, + "step": 5405 + }, + { + "epoch": 0.42, + "grad_norm": 12.110684394836426, + "learning_rate": 6.299546510093791e-06, + "logits/chosen": -1.3638485670089722, + "logits/rejected": -1.0348577499389648, + "logps/chosen": -1.0306559801101685, + "logps/rejected": -4.842162132263184, + "loss": 1.0958, + "odds_ratio_loss": 0.6510428190231323, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1030655950307846, + "rewards/margins": 0.38115063309669495, + "rewards/rejected": -0.48421621322631836, + "sft_loss": 1.0306559801101685, + "step": 5410 + }, + { + "epoch": 0.42, + "grad_norm": 5.196389675140381, + "learning_rate": 6.293599572305147e-06, + "logits/chosen": -1.3511767387390137, + "logits/rejected": -0.735332190990448, + "logps/chosen": -1.0389249324798584, + "logps/rejected": -15.559942245483398, + "loss": 1.0391, + "odds_ratio_loss": 0.0016535281902179122, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1038924902677536, + "rewards/margins": 1.452101707458496, + "rewards/rejected": -1.5559942722320557, + "sft_loss": 1.0389249324798584, + "step": 5415 + }, + { + "epoch": 0.42, + "grad_norm": 13.545232772827148, + "learning_rate": 6.287650672608512e-06, + "logits/chosen": -1.0957015752792358, + "logits/rejected": -1.0863041877746582, + "logps/chosen": -0.9821407198905945, + "logps/rejected": -4.82470178604126, + "loss": 0.9899, + "odds_ratio_loss": 0.0780685544013977, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09821407496929169, + "rewards/margins": 0.38425612449645996, + "rewards/rejected": -0.48247018456459045, + "sft_loss": 0.9821407198905945, + "step": 5420 + }, + { + "epoch": 0.42, + "grad_norm": 13.815802574157715, + "learning_rate": 6.281699820026144e-06, + "logits/chosen": -1.2515912055969238, + "logits/rejected": -0.8593171834945679, + "logps/chosen": -1.2910552024841309, + "logps/rejected": -4.360851287841797, + "loss": 1.3075, + "odds_ratio_loss": 0.16433535516262054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12910553812980652, + "rewards/margins": 0.30697956681251526, + "rewards/rejected": -0.43608513474464417, + "sft_loss": 1.2910552024841309, + "step": 5425 + }, + { + "epoch": 0.42, + "grad_norm": 120.34965515136719, + "learning_rate": 6.275747023583266e-06, + "logits/chosen": -1.3192626237869263, + "logits/rejected": -1.2971595525741577, + "logps/chosen": -0.8860888481140137, + "logps/rejected": -7.4777069091796875, + "loss": 0.9032, + "odds_ratio_loss": 0.1707981675863266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08860887587070465, + "rewards/margins": 0.6591618657112122, + "rewards/rejected": -0.7477707266807556, + "sft_loss": 0.8860888481140137, + "step": 5430 + }, + { + "epoch": 0.42, + "grad_norm": 14.62352466583252, + "learning_rate": 6.269792292308054e-06, + "logits/chosen": -1.5659847259521484, + "logits/rejected": -1.1328372955322266, + "logps/chosen": -0.748722493648529, + "logps/rejected": -3.743081569671631, + "loss": 0.7556, + "odds_ratio_loss": 0.06894762814044952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07487224042415619, + "rewards/margins": 0.29943591356277466, + "rewards/rejected": -0.37430819869041443, + "sft_loss": 0.748722493648529, + "step": 5435 + }, + { + "epoch": 0.42, + "grad_norm": 12.701285362243652, + "learning_rate": 6.263835635231612e-06, + "logits/chosen": -1.4254283905029297, + "logits/rejected": -0.7170482873916626, + "logps/chosen": -1.096375823020935, + "logps/rejected": -8.693497657775879, + "loss": 1.1015, + "odds_ratio_loss": 0.05131556838750839, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10963758081197739, + "rewards/margins": 0.7597121596336365, + "rewards/rejected": -0.8693498373031616, + "sft_loss": 1.096375823020935, + "step": 5440 + }, + { + "epoch": 0.42, + "grad_norm": 18.322486877441406, + "learning_rate": 6.257877061387966e-06, + "logits/chosen": -1.403956651687622, + "logits/rejected": -1.0851489305496216, + "logps/chosen": -0.80475914478302, + "logps/rejected": -8.924013137817383, + "loss": 0.8272, + "odds_ratio_loss": 0.22463825345039368, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08047591149806976, + "rewards/margins": 0.8119255304336548, + "rewards/rejected": -0.8924013376235962, + "sft_loss": 0.80475914478302, + "step": 5445 + }, + { + "epoch": 0.42, + "grad_norm": 61.04814147949219, + "learning_rate": 6.25191657981405e-06, + "logits/chosen": -1.3005506992340088, + "logits/rejected": -1.1514785289764404, + "logps/chosen": -0.8403207063674927, + "logps/rejected": -9.946390151977539, + "loss": 0.8522, + "odds_ratio_loss": 0.11916428804397583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0840320736169815, + "rewards/margins": 0.9106069803237915, + "rewards/rejected": -0.9946390390396118, + "sft_loss": 0.8403207063674927, + "step": 5450 + }, + { + "epoch": 0.42, + "grad_norm": 25.489612579345703, + "learning_rate": 6.24595419954969e-06, + "logits/chosen": -1.4496216773986816, + "logits/rejected": -0.9528179168701172, + "logps/chosen": -1.1614158153533936, + "logps/rejected": -1.9910621643066406, + "loss": 1.2025, + "odds_ratio_loss": 0.41043177247047424, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11614159494638443, + "rewards/margins": 0.08296463638544083, + "rewards/rejected": -0.19910623133182526, + "sft_loss": 1.1614158153533936, + "step": 5455 + }, + { + "epoch": 0.42, + "grad_norm": 6.294222354888916, + "learning_rate": 6.239989929637595e-06, + "logits/chosen": -1.4808690547943115, + "logits/rejected": -0.9784132838249207, + "logps/chosen": -0.863582968711853, + "logps/rejected": -1.8838698863983154, + "loss": 0.8951, + "odds_ratio_loss": 0.3148866593837738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08635830134153366, + "rewards/margins": 0.10202869027853012, + "rewards/rejected": -0.18838700652122498, + "sft_loss": 0.863582968711853, + "step": 5460 + }, + { + "epoch": 0.43, + "grad_norm": 29.04526710510254, + "learning_rate": 6.234023779123337e-06, + "logits/chosen": -1.4023199081420898, + "logits/rejected": -1.3432037830352783, + "logps/chosen": -0.9532560110092163, + "logps/rejected": -3.891387462615967, + "loss": 0.9783, + "odds_ratio_loss": 0.25030574202537537, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09532561153173447, + "rewards/margins": 0.29381316900253296, + "rewards/rejected": -0.38913875818252563, + "sft_loss": 0.9532560110092163, + "step": 5465 + }, + { + "epoch": 0.43, + "grad_norm": 26.674266815185547, + "learning_rate": 6.228055757055339e-06, + "logits/chosen": -1.380295991897583, + "logits/rejected": -1.1915805339813232, + "logps/chosen": -0.8241817355155945, + "logps/rejected": -4.265925407409668, + "loss": 0.8417, + "odds_ratio_loss": 0.1755792200565338, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08241816610097885, + "rewards/margins": 0.3441743552684784, + "rewards/rejected": -0.42659252882003784, + "sft_loss": 0.8241817355155945, + "step": 5470 + }, + { + "epoch": 0.43, + "grad_norm": 6.6642560958862305, + "learning_rate": 6.222085872484867e-06, + "logits/chosen": -1.4019719362258911, + "logits/rejected": -1.0525364875793457, + "logps/chosen": -1.1212289333343506, + "logps/rejected": -5.022209167480469, + "loss": 1.1447, + "odds_ratio_loss": 0.2343233823776245, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11212290823459625, + "rewards/margins": 0.39009803533554077, + "rewards/rejected": -0.5022209286689758, + "sft_loss": 1.1212289333343506, + "step": 5475 + }, + { + "epoch": 0.43, + "grad_norm": 11.184894561767578, + "learning_rate": 6.216114134466005e-06, + "logits/chosen": -1.342789888381958, + "logits/rejected": -1.2364380359649658, + "logps/chosen": -0.923974335193634, + "logps/rejected": -0.862908661365509, + "loss": 1.0542, + "odds_ratio_loss": 1.3019092082977295, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.09239742904901505, + "rewards/margins": -0.006106560118496418, + "rewards/rejected": -0.0862908661365509, + "sft_loss": 0.923974335193634, + "step": 5480 + }, + { + "epoch": 0.43, + "grad_norm": 49.761600494384766, + "learning_rate": 6.210140552055656e-06, + "logits/chosen": -1.1327351331710815, + "logits/rejected": -1.489861011505127, + "logps/chosen": -0.8854734301567078, + "logps/rejected": -8.11977481842041, + "loss": 0.898, + "odds_ratio_loss": 0.12550660967826843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08854734897613525, + "rewards/margins": 0.7234302163124084, + "rewards/rejected": -0.8119775652885437, + "sft_loss": 0.8854734301567078, + "step": 5485 + }, + { + "epoch": 0.43, + "grad_norm": 5.152491092681885, + "learning_rate": 6.204165134313514e-06, + "logits/chosen": -1.3554954528808594, + "logits/rejected": -0.7023124694824219, + "logps/chosen": -0.9967812299728394, + "logps/rejected": -10.33210563659668, + "loss": 0.997, + "odds_ratio_loss": 0.0022422696929425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09967813640832901, + "rewards/margins": 0.9335324168205261, + "rewards/rejected": -1.0332105159759521, + "sft_loss": 0.9967812299728394, + "step": 5490 + }, + { + "epoch": 0.43, + "grad_norm": 26.3110408782959, + "learning_rate": 6.198187890302059e-06, + "logits/chosen": -1.0693590641021729, + "logits/rejected": -0.8451949954032898, + "logps/chosen": -1.0392181873321533, + "logps/rejected": -6.271853446960449, + "loss": 1.057, + "odds_ratio_loss": 0.1776437610387802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10392183065414429, + "rewards/margins": 0.5232634544372559, + "rewards/rejected": -0.6271853446960449, + "sft_loss": 1.0392181873321533, + "step": 5495 + }, + { + "epoch": 0.43, + "grad_norm": 8.049160957336426, + "learning_rate": 6.19220882908654e-06, + "logits/chosen": -1.2815943956375122, + "logits/rejected": -1.4038329124450684, + "logps/chosen": -1.0970063209533691, + "logps/rejected": -12.82702922821045, + "loss": 1.1206, + "odds_ratio_loss": 0.2361186295747757, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10970063507556915, + "rewards/margins": 1.1730024814605713, + "rewards/rejected": -1.282702922821045, + "sft_loss": 1.0970063209533691, + "step": 5500 + }, + { + "epoch": 0.43, + "grad_norm": 147.5748291015625, + "learning_rate": 6.1862279597349625e-06, + "logits/chosen": -1.405278205871582, + "logits/rejected": -1.2007181644439697, + "logps/chosen": -1.237056016921997, + "logps/rejected": -9.997976303100586, + "loss": 1.2397, + "odds_ratio_loss": 0.026751240715384483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12370558828115463, + "rewards/margins": 0.8760920763015747, + "rewards/rejected": -0.9997976422309875, + "sft_loss": 1.237056016921997, + "step": 5505 + }, + { + "epoch": 0.43, + "grad_norm": 24.764318466186523, + "learning_rate": 6.180245291318074e-06, + "logits/chosen": -1.4413025379180908, + "logits/rejected": -0.7700681686401367, + "logps/chosen": -0.8640216588973999, + "logps/rejected": -5.9592132568359375, + "loss": 0.8991, + "odds_ratio_loss": 0.3511553108692169, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08640217781066895, + "rewards/margins": 0.5095191597938538, + "rewards/rejected": -0.5959213376045227, + "sft_loss": 0.8640216588973999, + "step": 5510 + }, + { + "epoch": 0.43, + "grad_norm": 6.59911584854126, + "learning_rate": 6.174260832909355e-06, + "logits/chosen": -1.4317357540130615, + "logits/rejected": -0.7478057146072388, + "logps/chosen": -0.8837820291519165, + "logps/rejected": -1.948574423789978, + "loss": 0.9202, + "odds_ratio_loss": 0.36417558789253235, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08837820589542389, + "rewards/margins": 0.10647924989461899, + "rewards/rejected": -0.19485744833946228, + "sft_loss": 0.8837820291519165, + "step": 5515 + }, + { + "epoch": 0.43, + "grad_norm": 189.65814208984375, + "learning_rate": 6.168274593584991e-06, + "logits/chosen": -1.2420986890792847, + "logits/rejected": -1.0643844604492188, + "logps/chosen": -0.9428080320358276, + "logps/rejected": -6.143196105957031, + "loss": 0.9637, + "odds_ratio_loss": 0.20935705304145813, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09428079426288605, + "rewards/margins": 0.5200387835502625, + "rewards/rejected": -0.6143196225166321, + "sft_loss": 0.9428080320358276, + "step": 5520 + }, + { + "epoch": 0.43, + "grad_norm": 13.59572696685791, + "learning_rate": 6.162286582423876e-06, + "logits/chosen": -1.3535109758377075, + "logits/rejected": -0.9895867109298706, + "logps/chosen": -1.0450531244277954, + "logps/rejected": -9.30381965637207, + "loss": 1.0603, + "odds_ratio_loss": 0.1525220423936844, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10450531542301178, + "rewards/margins": 0.8258765935897827, + "rewards/rejected": -0.9303819537162781, + "sft_loss": 1.0450531244277954, + "step": 5525 + }, + { + "epoch": 0.43, + "grad_norm": 21.94033432006836, + "learning_rate": 6.156296808507588e-06, + "logits/chosen": -1.4038699865341187, + "logits/rejected": -1.224493384361267, + "logps/chosen": -0.7861626744270325, + "logps/rejected": -1.9210304021835327, + "loss": 0.8439, + "odds_ratio_loss": 0.5771653652191162, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07861627638339996, + "rewards/margins": 0.11348676681518555, + "rewards/rejected": -0.1921030431985855, + "sft_loss": 0.7861626744270325, + "step": 5530 + }, + { + "epoch": 0.43, + "grad_norm": 25.767139434814453, + "learning_rate": 6.150305280920381e-06, + "logits/chosen": -1.4617096185684204, + "logits/rejected": -0.8789995312690735, + "logps/chosen": -0.8968402147293091, + "logps/rejected": -4.829245567321777, + "loss": 0.9022, + "odds_ratio_loss": 0.0539650022983551, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08968402445316315, + "rewards/margins": 0.3932405710220337, + "rewards/rejected": -0.48292461037635803, + "sft_loss": 0.8968402147293091, + "step": 5535 + }, + { + "epoch": 0.43, + "grad_norm": 8.61695671081543, + "learning_rate": 6.144312008749168e-06, + "logits/chosen": -1.2894651889801025, + "logits/rejected": -0.5578645467758179, + "logps/chosen": -0.9731871485710144, + "logps/rejected": -2.8847477436065674, + "loss": 0.9916, + "odds_ratio_loss": 0.18387068808078766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09731872379779816, + "rewards/margins": 0.1911560595035553, + "rewards/rejected": -0.28847479820251465, + "sft_loss": 0.9731871485710144, + "step": 5540 + }, + { + "epoch": 0.43, + "grad_norm": 18.54728889465332, + "learning_rate": 6.138317001083505e-06, + "logits/chosen": -1.400864839553833, + "logits/rejected": -1.461357831954956, + "logps/chosen": -0.6988080143928528, + "logps/rejected": -4.805953025817871, + "loss": 0.7053, + "odds_ratio_loss": 0.06458644568920135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06988079845905304, + "rewards/margins": 0.4107145369052887, + "rewards/rejected": -0.48059535026550293, + "sft_loss": 0.6988080143928528, + "step": 5545 + }, + { + "epoch": 0.43, + "grad_norm": 13.36061954498291, + "learning_rate": 6.132320267015586e-06, + "logits/chosen": -1.3715206384658813, + "logits/rejected": -1.4396508932113647, + "logps/chosen": -0.9623751640319824, + "logps/rejected": -8.231966018676758, + "loss": 0.9782, + "odds_ratio_loss": 0.1581466943025589, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09623752534389496, + "rewards/margins": 0.7269589900970459, + "rewards/rejected": -0.823196530342102, + "sft_loss": 0.9623751640319824, + "step": 5550 + }, + { + "epoch": 0.43, + "grad_norm": 9.104194641113281, + "learning_rate": 6.126321815640215e-06, + "logits/chosen": -1.454085111618042, + "logits/rejected": -0.7264108061790466, + "logps/chosen": -0.8256096839904785, + "logps/rejected": -6.499675750732422, + "loss": 0.8393, + "odds_ratio_loss": 0.13687697052955627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08256097137928009, + "rewards/margins": 0.5674066543579102, + "rewards/rejected": -0.6499676704406738, + "sft_loss": 0.8256096839904785, + "step": 5555 + }, + { + "epoch": 0.43, + "grad_norm": 5.782400131225586, + "learning_rate": 6.1203216560548076e-06, + "logits/chosen": -1.333268165588379, + "logits/rejected": -0.6403939127922058, + "logps/chosen": -0.9271313548088074, + "logps/rejected": -9.616800308227539, + "loss": 0.9338, + "odds_ratio_loss": 0.06707803159952164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0927131399512291, + "rewards/margins": 0.8689668774604797, + "rewards/rejected": -0.9616800546646118, + "sft_loss": 0.9271313548088074, + "step": 5560 + }, + { + "epoch": 0.43, + "grad_norm": 14.873298645019531, + "learning_rate": 6.114319797359367e-06, + "logits/chosen": -1.2211421728134155, + "logits/rejected": -1.1900720596313477, + "logps/chosen": -1.1753790378570557, + "logps/rejected": -5.317530155181885, + "loss": 1.2106, + "odds_ratio_loss": 0.3520776629447937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11753790080547333, + "rewards/margins": 0.414215087890625, + "rewards/rejected": -0.5317530035972595, + "sft_loss": 1.1753790378570557, + "step": 5565 + }, + { + "epoch": 0.43, + "grad_norm": 39.33988952636719, + "learning_rate": 6.108316248656474e-06, + "logits/chosen": -1.3350688219070435, + "logits/rejected": -1.342743158340454, + "logps/chosen": -0.8205651044845581, + "logps/rejected": -3.8720860481262207, + "loss": 0.8352, + "odds_ratio_loss": 0.14677473902702332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08205651491880417, + "rewards/margins": 0.305152028799057, + "rewards/rejected": -0.38720858097076416, + "sft_loss": 0.8205651044845581, + "step": 5570 + }, + { + "epoch": 0.43, + "grad_norm": 22.19184684753418, + "learning_rate": 6.102311019051274e-06, + "logits/chosen": -1.1321027278900146, + "logits/rejected": -1.1962878704071045, + "logps/chosen": -1.1018847227096558, + "logps/rejected": -3.9455478191375732, + "loss": 1.1527, + "odds_ratio_loss": 0.5081930160522461, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11018846929073334, + "rewards/margins": 0.28436630964279175, + "rewards/rejected": -0.3945547938346863, + "sft_loss": 1.1018847227096558, + "step": 5575 + }, + { + "epoch": 0.43, + "grad_norm": 9.592340469360352, + "learning_rate": 6.096304117651457e-06, + "logits/chosen": -1.4173847436904907, + "logits/rejected": -1.5834704637527466, + "logps/chosen": -1.1477190256118774, + "logps/rejected": -9.274484634399414, + "loss": 1.161, + "odds_ratio_loss": 0.1331491768360138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11477188766002655, + "rewards/margins": 0.8126765489578247, + "rewards/rejected": -0.9274484515190125, + "sft_loss": 1.1477190256118774, + "step": 5580 + }, + { + "epoch": 0.43, + "grad_norm": 8.10434627532959, + "learning_rate": 6.090295553567254e-06, + "logits/chosen": -1.3882704973220825, + "logits/rejected": -1.2911784648895264, + "logps/chosen": -1.1263264417648315, + "logps/rejected": -2.811375856399536, + "loss": 1.1581, + "odds_ratio_loss": 0.31728702783584595, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1126326471567154, + "rewards/margins": 0.16850493848323822, + "rewards/rejected": -0.2811375558376312, + "sft_loss": 1.1263264417648315, + "step": 5585 + }, + { + "epoch": 0.43, + "grad_norm": 4.900169372558594, + "learning_rate": 6.084285335911415e-06, + "logits/chosen": -1.4652645587921143, + "logits/rejected": -1.4055920839309692, + "logps/chosen": -1.030133605003357, + "logps/rejected": -13.148248672485352, + "loss": 1.0402, + "odds_ratio_loss": 0.10082044452428818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10301337391138077, + "rewards/margins": 1.2118113040924072, + "rewards/rejected": -1.3148248195648193, + "sft_loss": 1.030133605003357, + "step": 5590 + }, + { + "epoch": 0.44, + "grad_norm": 6.193985462188721, + "learning_rate": 6.0782734737991965e-06, + "logits/chosen": -1.4546271562576294, + "logits/rejected": -1.2658965587615967, + "logps/chosen": -1.2212754487991333, + "logps/rejected": -9.927604675292969, + "loss": 1.2485, + "odds_ratio_loss": 0.2723880708217621, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12212755531072617, + "rewards/margins": 0.8706329464912415, + "rewards/rejected": -0.9927604794502258, + "sft_loss": 1.2212754487991333, + "step": 5595 + }, + { + "epoch": 0.44, + "grad_norm": 15.977096557617188, + "learning_rate": 6.072259976348353e-06, + "logits/chosen": -1.3369700908660889, + "logits/rejected": -1.2094776630401611, + "logps/chosen": -0.8101118803024292, + "logps/rejected": -2.6742115020751953, + "loss": 0.827, + "odds_ratio_loss": 0.16866515576839447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08101119101047516, + "rewards/margins": 0.18640998005867004, + "rewards/rejected": -0.267421156167984, + "sft_loss": 0.8101118803024292, + "step": 5600 + }, + { + "epoch": 0.44, + "grad_norm": 5.28997278213501, + "learning_rate": 6.066244852679117e-06, + "logits/chosen": -1.4884275197982788, + "logits/rejected": -1.1082899570465088, + "logps/chosen": -1.3736873865127563, + "logps/rejected": -6.204714775085449, + "loss": 1.3932, + "odds_ratio_loss": 0.1947067677974701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13736873865127563, + "rewards/margins": 0.4831027090549469, + "rewards/rejected": -0.6204714179039001, + "sft_loss": 1.3736873865127563, + "step": 5605 + }, + { + "epoch": 0.44, + "grad_norm": 6.113433837890625, + "learning_rate": 6.060228111914186e-06, + "logits/chosen": -1.356977939605713, + "logits/rejected": -0.836715042591095, + "logps/chosen": -1.1353957653045654, + "logps/rejected": -8.683342933654785, + "loss": 1.1709, + "odds_ratio_loss": 0.3551942706108093, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11353959143161774, + "rewards/margins": 0.754794716835022, + "rewards/rejected": -0.8683342933654785, + "sft_loss": 1.1353957653045654, + "step": 5610 + }, + { + "epoch": 0.44, + "grad_norm": 6.144330024719238, + "learning_rate": 6.054209763178711e-06, + "logits/chosen": -1.475420355796814, + "logits/rejected": -1.0890623331069946, + "logps/chosen": -1.471876859664917, + "logps/rejected": -4.547472953796387, + "loss": 1.5125, + "odds_ratio_loss": 0.4066528379917145, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14718768000602722, + "rewards/margins": 0.30755966901779175, + "rewards/rejected": -0.45474734902381897, + "sft_loss": 1.471876859664917, + "step": 5615 + }, + { + "epoch": 0.44, + "grad_norm": 67.3975830078125, + "learning_rate": 6.048189815600281e-06, + "logits/chosen": -1.257453203201294, + "logits/rejected": -1.1578747034072876, + "logps/chosen": -1.1685110330581665, + "logps/rejected": -13.333429336547852, + "loss": 1.1729, + "odds_ratio_loss": 0.043477244675159454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11685110628604889, + "rewards/margins": 1.2164918184280396, + "rewards/rejected": -1.3333427906036377, + "sft_loss": 1.1685110330581665, + "step": 5620 + }, + { + "epoch": 0.44, + "grad_norm": 18.023799896240234, + "learning_rate": 6.042168278308913e-06, + "logits/chosen": -1.3366608619689941, + "logits/rejected": -1.1345504522323608, + "logps/chosen": -1.0113810300827026, + "logps/rejected": -9.117487907409668, + "loss": 1.0589, + "odds_ratio_loss": 0.4751584529876709, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10113811492919922, + "rewards/margins": 0.8106106519699097, + "rewards/rejected": -0.9117487668991089, + "sft_loss": 1.0113810300827026, + "step": 5625 + }, + { + "epoch": 0.44, + "grad_norm": 10.662079811096191, + "learning_rate": 6.0361451604370335e-06, + "logits/chosen": -1.32015061378479, + "logits/rejected": -1.0003557205200195, + "logps/chosen": -1.0010217428207397, + "logps/rejected": -3.2925896644592285, + "loss": 1.0352, + "odds_ratio_loss": 0.34152549505233765, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10010218620300293, + "rewards/margins": 0.22915680706501007, + "rewards/rejected": -0.3292589783668518, + "sft_loss": 1.0010217428207397, + "step": 5630 + }, + { + "epoch": 0.44, + "grad_norm": 6.930639743804932, + "learning_rate": 6.030120471119464e-06, + "logits/chosen": -1.3336502313613892, + "logits/rejected": -1.0396268367767334, + "logps/chosen": -1.0695290565490723, + "logps/rejected": -6.448616027832031, + "loss": 1.1032, + "odds_ratio_loss": 0.3368968069553375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10695289075374603, + "rewards/margins": 0.537908673286438, + "rewards/rejected": -0.6448616981506348, + "sft_loss": 1.0695290565490723, + "step": 5635 + }, + { + "epoch": 0.44, + "grad_norm": 7.778130531311035, + "learning_rate": 6.02409421949341e-06, + "logits/chosen": -1.3733274936676025, + "logits/rejected": -1.3728525638580322, + "logps/chosen": -0.7616759538650513, + "logps/rejected": -9.624399185180664, + "loss": 0.7976, + "odds_ratio_loss": 0.359219491481781, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07616759836673737, + "rewards/margins": 0.8862722516059875, + "rewards/rejected": -0.9624398946762085, + "sft_loss": 0.7616759538650513, + "step": 5640 + }, + { + "epoch": 0.44, + "grad_norm": 55.70398712158203, + "learning_rate": 6.018066414698448e-06, + "logits/chosen": -1.2639678716659546, + "logits/rejected": -0.809512734413147, + "logps/chosen": -1.1552722454071045, + "logps/rejected": -17.194225311279297, + "loss": 1.1611, + "odds_ratio_loss": 0.05807274580001831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11552723497152328, + "rewards/margins": 1.6038951873779297, + "rewards/rejected": -1.7194225788116455, + "sft_loss": 1.1552722454071045, + "step": 5645 + }, + { + "epoch": 0.44, + "grad_norm": 29.567472457885742, + "learning_rate": 6.012037065876509e-06, + "logits/chosen": -1.5392589569091797, + "logits/rejected": -1.446018934249878, + "logps/chosen": -1.339097261428833, + "logps/rejected": -3.9835457801818848, + "loss": 1.4159, + "odds_ratio_loss": 0.768239438533783, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13390973210334778, + "rewards/margins": 0.26444482803344727, + "rewards/rejected": -0.39835453033447266, + "sft_loss": 1.339097261428833, + "step": 5650 + }, + { + "epoch": 0.44, + "grad_norm": 7.155758380889893, + "learning_rate": 6.006006182171868e-06, + "logits/chosen": -1.4382535219192505, + "logits/rejected": -1.5007126331329346, + "logps/chosen": -0.9383573532104492, + "logps/rejected": -17.310665130615234, + "loss": 0.9634, + "odds_ratio_loss": 0.25030335783958435, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0938357338309288, + "rewards/margins": 1.6372311115264893, + "rewards/rejected": -1.7310667037963867, + "sft_loss": 0.9383573532104492, + "step": 5655 + }, + { + "epoch": 0.44, + "grad_norm": 18.916534423828125, + "learning_rate": 5.999973772731121e-06, + "logits/chosen": -1.0211951732635498, + "logits/rejected": -1.2356466054916382, + "logps/chosen": -0.9551981091499329, + "logps/rejected": -7.499606132507324, + "loss": 0.9926, + "odds_ratio_loss": 0.3744484484195709, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09551980346441269, + "rewards/margins": 0.6544408202171326, + "rewards/rejected": -0.7499606609344482, + "sft_loss": 0.9551981091499329, + "step": 5660 + }, + { + "epoch": 0.44, + "grad_norm": 1061.22900390625, + "learning_rate": 5.993939846703189e-06, + "logits/chosen": -1.481950283050537, + "logits/rejected": -1.5191516876220703, + "logps/chosen": -2.560549736022949, + "logps/rejected": -10.782699584960938, + "loss": 2.6494, + "odds_ratio_loss": 0.8887947797775269, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.25605496764183044, + "rewards/margins": 0.8222150802612305, + "rewards/rejected": -1.0782700777053833, + "sft_loss": 2.560549736022949, + "step": 5665 + }, + { + "epoch": 0.44, + "grad_norm": 12.319400787353516, + "learning_rate": 5.987904413239284e-06, + "logits/chosen": -1.3272775411605835, + "logits/rejected": -1.3253281116485596, + "logps/chosen": -0.9079964756965637, + "logps/rejected": -6.848818778991699, + "loss": 0.9143, + "odds_ratio_loss": 0.06282417476177216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09079965204000473, + "rewards/margins": 0.594082236289978, + "rewards/rejected": -0.6848819851875305, + "sft_loss": 0.9079964756965637, + "step": 5670 + }, + { + "epoch": 0.44, + "grad_norm": 7.039941310882568, + "learning_rate": 5.981867481492906e-06, + "logits/chosen": -1.4510374069213867, + "logits/rejected": -1.0965907573699951, + "logps/chosen": -0.9913382530212402, + "logps/rejected": -12.328478813171387, + "loss": 1.0036, + "odds_ratio_loss": 0.12262825667858124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09913383424282074, + "rewards/margins": 1.1337140798568726, + "rewards/rejected": -1.2328479290008545, + "sft_loss": 0.9913382530212402, + "step": 5675 + }, + { + "epoch": 0.44, + "grad_norm": 18.226097106933594, + "learning_rate": 5.97582906061983e-06, + "logits/chosen": -1.4995019435882568, + "logits/rejected": -1.8552892208099365, + "logps/chosen": -0.7538286447525024, + "logps/rejected": -6.817639350891113, + "loss": 0.7843, + "odds_ratio_loss": 0.3042234778404236, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07538286596536636, + "rewards/margins": 0.6063810586929321, + "rewards/rejected": -0.6817639470100403, + "sft_loss": 0.7538286447525024, + "step": 5680 + }, + { + "epoch": 0.44, + "grad_norm": 83.86195373535156, + "learning_rate": 5.969789159778086e-06, + "logits/chosen": -1.4456707239151, + "logits/rejected": -1.1688287258148193, + "logps/chosen": -1.2338379621505737, + "logps/rejected": -7.469872951507568, + "loss": 1.2458, + "odds_ratio_loss": 0.11980722099542618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12338379770517349, + "rewards/margins": 0.6236035227775574, + "rewards/rejected": -0.7469873428344727, + "sft_loss": 1.2338379621505737, + "step": 5685 + }, + { + "epoch": 0.44, + "grad_norm": 6.135641574859619, + "learning_rate": 5.963747788127954e-06, + "logits/chosen": -1.4537721872329712, + "logits/rejected": -1.4107811450958252, + "logps/chosen": -1.9091577529907227, + "logps/rejected": -6.800889015197754, + "loss": 1.926, + "odds_ratio_loss": 0.16891498863697052, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1909157782793045, + "rewards/margins": 0.48917311429977417, + "rewards/rejected": -0.6800888776779175, + "sft_loss": 1.9091577529907227, + "step": 5690 + }, + { + "epoch": 0.44, + "grad_norm": 17.577470779418945, + "learning_rate": 5.9577049548319385e-06, + "logits/chosen": -1.1830053329467773, + "logits/rejected": -0.9701796770095825, + "logps/chosen": -1.0823400020599365, + "logps/rejected": -12.730480194091797, + "loss": 1.085, + "odds_ratio_loss": 0.02667110227048397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1082339882850647, + "rewards/margins": 1.1648141145706177, + "rewards/rejected": -1.2730480432510376, + "sft_loss": 1.0823400020599365, + "step": 5695 + }, + { + "epoch": 0.44, + "grad_norm": 5.481243133544922, + "learning_rate": 5.951660669054764e-06, + "logits/chosen": -1.2399911880493164, + "logits/rejected": -0.9875618815422058, + "logps/chosen": -0.9826416969299316, + "logps/rejected": -2.611884832382202, + "loss": 1.0209, + "odds_ratio_loss": 0.38236701488494873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0982641652226448, + "rewards/margins": 0.16292431950569153, + "rewards/rejected": -0.2611885070800781, + "sft_loss": 0.9826416969299316, + "step": 5700 + }, + { + "epoch": 0.44, + "grad_norm": 20.629961013793945, + "learning_rate": 5.945614939963358e-06, + "logits/chosen": -1.4003746509552002, + "logits/rejected": -0.9283599853515625, + "logps/chosen": -1.3317302465438843, + "logps/rejected": -7.419297695159912, + "loss": 1.3534, + "odds_ratio_loss": 0.21676786243915558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13317301869392395, + "rewards/margins": 0.6087567806243896, + "rewards/rejected": -0.741929829120636, + "sft_loss": 1.3317302465438843, + "step": 5705 + }, + { + "epoch": 0.44, + "grad_norm": 16.150772094726562, + "learning_rate": 5.939567776726834e-06, + "logits/chosen": -1.344822883605957, + "logits/rejected": -1.3809988498687744, + "logps/chosen": -1.1117184162139893, + "logps/rejected": -4.634824752807617, + "loss": 1.1244, + "odds_ratio_loss": 0.1271774023771286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11117184162139893, + "rewards/margins": 0.3523106276988983, + "rewards/rejected": -0.46348246932029724, + "sft_loss": 1.1117184162139893, + "step": 5710 + }, + { + "epoch": 0.44, + "grad_norm": 5.546013355255127, + "learning_rate": 5.933519188516485e-06, + "logits/chosen": -1.3635753393173218, + "logits/rejected": -1.1241014003753662, + "logps/chosen": -0.9175945520401001, + "logps/rejected": -2.8002352714538574, + "loss": 0.9335, + "odds_ratio_loss": 0.15915009379386902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09175945818424225, + "rewards/margins": 0.18826408684253693, + "rewards/rejected": -0.2800235450267792, + "sft_loss": 0.9175945520401001, + "step": 5715 + }, + { + "epoch": 0.44, + "grad_norm": 5.130252838134766, + "learning_rate": 5.927469184505762e-06, + "logits/chosen": -1.347988247871399, + "logits/rejected": -1.227783203125, + "logps/chosen": -2.2398881912231445, + "logps/rejected": -10.506120681762695, + "loss": 2.2613, + "odds_ratio_loss": 0.21373744308948517, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.22398881614208221, + "rewards/margins": 0.826623260974884, + "rewards/rejected": -1.0506120920181274, + "sft_loss": 2.2398881912231445, + "step": 5720 + }, + { + "epoch": 0.45, + "grad_norm": 6.612664222717285, + "learning_rate": 5.921417773870266e-06, + "logits/chosen": -1.4397814273834229, + "logits/rejected": -1.322584629058838, + "logps/chosen": -0.8262338638305664, + "logps/rejected": -5.6490397453308105, + "loss": 0.8352, + "odds_ratio_loss": 0.0899687260389328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08262337744235992, + "rewards/margins": 0.48228058218955994, + "rewards/rejected": -0.564903974533081, + "sft_loss": 0.8262338638305664, + "step": 5725 + }, + { + "epoch": 0.45, + "grad_norm": 7.2765727043151855, + "learning_rate": 5.915364965787728e-06, + "logits/chosen": -1.3210567235946655, + "logits/rejected": -1.1719744205474854, + "logps/chosen": -1.021377444267273, + "logps/rejected": -2.6403896808624268, + "loss": 1.0411, + "odds_ratio_loss": 0.19730141758918762, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1021377444267273, + "rewards/margins": 0.16190123558044434, + "rewards/rejected": -0.26403898000717163, + "sft_loss": 1.021377444267273, + "step": 5730 + }, + { + "epoch": 0.45, + "grad_norm": 7.26978063583374, + "learning_rate": 5.909310769437999e-06, + "logits/chosen": -1.3003871440887451, + "logits/rejected": -0.99261873960495, + "logps/chosen": -0.7958885431289673, + "logps/rejected": -9.100659370422363, + "loss": 0.8235, + "odds_ratio_loss": 0.2762227952480316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07958885282278061, + "rewards/margins": 0.8304770588874817, + "rewards/rejected": -0.9100659489631653, + "sft_loss": 0.7958885431289673, + "step": 5735 + }, + { + "epoch": 0.45, + "grad_norm": 39.95994186401367, + "learning_rate": 5.903255194003037e-06, + "logits/chosen": -1.4754865169525146, + "logits/rejected": -1.2973945140838623, + "logps/chosen": -0.8494400978088379, + "logps/rejected": -4.790776252746582, + "loss": 0.8841, + "odds_ratio_loss": 0.3469342589378357, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08494400978088379, + "rewards/margins": 0.39413362741470337, + "rewards/rejected": -0.47907763719558716, + "sft_loss": 0.8494400978088379, + "step": 5740 + }, + { + "epoch": 0.45, + "grad_norm": 5.832455635070801, + "learning_rate": 5.897198248666893e-06, + "logits/chosen": -1.30601966381073, + "logits/rejected": -0.9545402526855469, + "logps/chosen": -1.1280930042266846, + "logps/rejected": -3.4375717639923096, + "loss": 1.191, + "odds_ratio_loss": 0.6293038725852966, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11280930042266846, + "rewards/margins": 0.2309478521347046, + "rewards/rejected": -0.34375715255737305, + "sft_loss": 1.1280930042266846, + "step": 5745 + }, + { + "epoch": 0.45, + "grad_norm": 9.348652839660645, + "learning_rate": 5.891139942615693e-06, + "logits/chosen": -1.252437949180603, + "logits/rejected": -0.779747724533081, + "logps/chosen": -0.9001814723014832, + "logps/rejected": -3.424330472946167, + "loss": 0.9164, + "odds_ratio_loss": 0.16236189007759094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0900181457400322, + "rewards/margins": 0.25241488218307495, + "rewards/rejected": -0.34243303537368774, + "sft_loss": 0.9001814723014832, + "step": 5750 + }, + { + "epoch": 0.45, + "grad_norm": 15.176481246948242, + "learning_rate": 5.8850802850376245e-06, + "logits/chosen": -1.0830439329147339, + "logits/rejected": -0.5726959109306335, + "logps/chosen": -0.9458778500556946, + "logps/rejected": -3.5730719566345215, + "loss": 0.9607, + "odds_ratio_loss": 0.14806696772575378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0945877879858017, + "rewards/margins": 0.26271945238113403, + "rewards/rejected": -0.3573072552680969, + "sft_loss": 0.9458778500556946, + "step": 5755 + }, + { + "epoch": 0.45, + "grad_norm": 7.540465354919434, + "learning_rate": 5.87901928512293e-06, + "logits/chosen": -1.3506877422332764, + "logits/rejected": -1.2902274131774902, + "logps/chosen": -1.1145894527435303, + "logps/rejected": -3.1862120628356934, + "loss": 1.1339, + "odds_ratio_loss": 0.19300493597984314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11145894229412079, + "rewards/margins": 0.2071622908115387, + "rewards/rejected": -0.3186212182044983, + "sft_loss": 1.1145894527435303, + "step": 5760 + }, + { + "epoch": 0.45, + "grad_norm": 29.251178741455078, + "learning_rate": 5.872956952063885e-06, + "logits/chosen": -1.3513835668563843, + "logits/rejected": -1.3920825719833374, + "logps/chosen": -0.811087429523468, + "logps/rejected": -6.7181830406188965, + "loss": 0.8188, + "odds_ratio_loss": 0.07711207121610641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08110874146223068, + "rewards/margins": 0.5907095670700073, + "rewards/rejected": -0.6718182563781738, + "sft_loss": 0.811087429523468, + "step": 5765 + }, + { + "epoch": 0.45, + "grad_norm": 8.553319931030273, + "learning_rate": 5.866893295054788e-06, + "logits/chosen": -1.4405219554901123, + "logits/rejected": -1.3422691822052002, + "logps/chosen": -1.048010230064392, + "logps/rejected": -8.861117362976074, + "loss": 1.0655, + "odds_ratio_loss": 0.17444069683551788, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10480103641748428, + "rewards/margins": 0.7813106775283813, + "rewards/rejected": -0.8861117362976074, + "sft_loss": 1.048010230064392, + "step": 5770 + }, + { + "epoch": 0.45, + "grad_norm": 5.439850807189941, + "learning_rate": 5.860828323291943e-06, + "logits/chosen": -1.3938419818878174, + "logits/rejected": -1.003071665763855, + "logps/chosen": -0.8749563097953796, + "logps/rejected": -10.783254623413086, + "loss": 0.8823, + "odds_ratio_loss": 0.07309209555387497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08749563992023468, + "rewards/margins": 0.9908298254013062, + "rewards/rejected": -1.0783255100250244, + "sft_loss": 0.8749563097953796, + "step": 5775 + }, + { + "epoch": 0.45, + "grad_norm": 8.660329818725586, + "learning_rate": 5.854762045973652e-06, + "logits/chosen": -1.3360105752944946, + "logits/rejected": -1.2421538829803467, + "logps/chosen": -1.4898730516433716, + "logps/rejected": -5.747361660003662, + "loss": 1.5518, + "odds_ratio_loss": 0.6197172403335571, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1489873081445694, + "rewards/margins": 0.42574888467788696, + "rewards/rejected": -0.5747361779212952, + "sft_loss": 1.4898730516433716, + "step": 5780 + }, + { + "epoch": 0.45, + "grad_norm": 6.601789474487305, + "learning_rate": 5.8486944723001926e-06, + "logits/chosen": -1.5374510288238525, + "logits/rejected": -1.1397758722305298, + "logps/chosen": -1.0105637311935425, + "logps/rejected": -3.0968735218048096, + "loss": 1.0427, + "odds_ratio_loss": 0.3209215998649597, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10105638206005096, + "rewards/margins": 0.2086309939622879, + "rewards/rejected": -0.30968737602233887, + "sft_loss": 1.0105637311935425, + "step": 5785 + }, + { + "epoch": 0.45, + "grad_norm": 6.6358160972595215, + "learning_rate": 5.842625611473811e-06, + "logits/chosen": -1.3155019283294678, + "logits/rejected": -1.1303455829620361, + "logps/chosen": -0.8606014251708984, + "logps/rejected": -10.288931846618652, + "loss": 0.8908, + "odds_ratio_loss": 0.3021547198295593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08606014400720596, + "rewards/margins": 0.9428330659866333, + "rewards/rejected": -1.028893232345581, + "sft_loss": 0.8606014251708984, + "step": 5790 + }, + { + "epoch": 0.45, + "grad_norm": 19.110700607299805, + "learning_rate": 5.836555472698707e-06, + "logits/chosen": -1.28428316116333, + "logits/rejected": -1.1632674932479858, + "logps/chosen": -0.964084804058075, + "logps/rejected": -10.772963523864746, + "loss": 0.9645, + "odds_ratio_loss": 0.004020442720502615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09640847891569138, + "rewards/margins": 0.9808878898620605, + "rewards/rejected": -1.077296495437622, + "sft_loss": 0.964084804058075, + "step": 5795 + }, + { + "epoch": 0.45, + "grad_norm": 9.711421966552734, + "learning_rate": 5.830484065181015e-06, + "logits/chosen": -1.2102570533752441, + "logits/rejected": -1.208566665649414, + "logps/chosen": -1.4164773225784302, + "logps/rejected": -2.8852813243865967, + "loss": 1.4742, + "odds_ratio_loss": 0.5771785974502563, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.14164772629737854, + "rewards/margins": 0.1468804031610489, + "rewards/rejected": -0.28852811455726624, + "sft_loss": 1.4164773225784302, + "step": 5800 + }, + { + "epoch": 0.45, + "grad_norm": 47.696739196777344, + "learning_rate": 5.824411398128795e-06, + "logits/chosen": -1.4307831525802612, + "logits/rejected": -1.4838054180145264, + "logps/chosen": -0.7359696626663208, + "logps/rejected": -5.552895545959473, + "loss": 0.7465, + "odds_ratio_loss": 0.10489847511053085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07359696924686432, + "rewards/margins": 0.4816926121711731, + "rewards/rejected": -0.5552895069122314, + "sft_loss": 0.7359696626663208, + "step": 5805 + }, + { + "epoch": 0.45, + "grad_norm": 23.042871475219727, + "learning_rate": 5.81833748075202e-06, + "logits/chosen": -1.394331693649292, + "logits/rejected": -0.6417025327682495, + "logps/chosen": -1.1504552364349365, + "logps/rejected": -2.6900577545166016, + "loss": 1.1776, + "odds_ratio_loss": 0.2714696228504181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11504554748535156, + "rewards/margins": 0.15396027266979218, + "rewards/rejected": -0.26900583505630493, + "sft_loss": 1.1504552364349365, + "step": 5810 + }, + { + "epoch": 0.45, + "grad_norm": 12.432936668395996, + "learning_rate": 5.812262322262554e-06, + "logits/chosen": -1.3173372745513916, + "logits/rejected": -1.1222119331359863, + "logps/chosen": -0.9128435850143433, + "logps/rejected": -1.756940484046936, + "loss": 0.9457, + "odds_ratio_loss": 0.32820624113082886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0912843644618988, + "rewards/margins": 0.0844096764922142, + "rewards/rejected": -0.1756940335035324, + "sft_loss": 0.9128435850143433, + "step": 5815 + }, + { + "epoch": 0.45, + "grad_norm": 7.906182765960693, + "learning_rate": 5.806185931874148e-06, + "logits/chosen": -1.3855034112930298, + "logits/rejected": -0.9745451211929321, + "logps/chosen": -1.174140453338623, + "logps/rejected": -10.40697193145752, + "loss": 1.1841, + "odds_ratio_loss": 0.09939200431108475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11741403490304947, + "rewards/margins": 0.9232832193374634, + "rewards/rejected": -1.0406970977783203, + "sft_loss": 1.174140453338623, + "step": 5820 + }, + { + "epoch": 0.45, + "grad_norm": 17.768505096435547, + "learning_rate": 5.800108318802418e-06, + "logits/chosen": -1.29469895362854, + "logits/rejected": -1.390100121498108, + "logps/chosen": -1.0928549766540527, + "logps/rejected": -5.852885723114014, + "loss": 1.1167, + "odds_ratio_loss": 0.2388814389705658, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10928548872470856, + "rewards/margins": 0.47600308060646057, + "rewards/rejected": -0.5852886438369751, + "sft_loss": 1.0928549766540527, + "step": 5825 + }, + { + "epoch": 0.45, + "grad_norm": 6.824836730957031, + "learning_rate": 5.7940294922648365e-06, + "logits/chosen": -1.4115650653839111, + "logits/rejected": -1.1731384992599487, + "logps/chosen": -1.6221885681152344, + "logps/rejected": -6.723604679107666, + "loss": 1.6568, + "odds_ratio_loss": 0.34624502062797546, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1622188687324524, + "rewards/margins": 0.5101416110992432, + "rewards/rejected": -0.6723604798316956, + "sft_loss": 1.6221885681152344, + "step": 5830 + }, + { + "epoch": 0.45, + "grad_norm": 10.766157150268555, + "learning_rate": 5.787949461480717e-06, + "logits/chosen": -1.1654672622680664, + "logits/rejected": -1.4366929531097412, + "logps/chosen": -1.0175927877426147, + "logps/rejected": -9.682588577270508, + "loss": 1.0291, + "odds_ratio_loss": 0.11466507613658905, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10175929218530655, + "rewards/margins": 0.8664995431900024, + "rewards/rejected": -0.9682588577270508, + "sft_loss": 1.0175927877426147, + "step": 5835 + }, + { + "epoch": 0.45, + "grad_norm": 6.221446514129639, + "learning_rate": 5.781868235671197e-06, + "logits/chosen": -1.406813383102417, + "logits/rejected": -0.8143359422683716, + "logps/chosen": -0.9683381915092468, + "logps/rejected": -4.046400547027588, + "loss": 0.9999, + "odds_ratio_loss": 0.31571096181869507, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09683381766080856, + "rewards/margins": 0.30780622363090515, + "rewards/rejected": -0.4046400487422943, + "sft_loss": 0.9683381915092468, + "step": 5840 + }, + { + "epoch": 0.45, + "grad_norm": 14.350464820861816, + "learning_rate": 5.775785824059228e-06, + "logits/chosen": -1.3366856575012207, + "logits/rejected": -1.1903644800186157, + "logps/chosen": -0.9601635932922363, + "logps/rejected": -1.6902868747711182, + "loss": 0.9953, + "odds_ratio_loss": 0.35140281915664673, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09601635485887527, + "rewards/margins": 0.0730123296380043, + "rewards/rejected": -0.16902866959571838, + "sft_loss": 0.9601635932922363, + "step": 5845 + }, + { + "epoch": 0.46, + "grad_norm": 7.248025417327881, + "learning_rate": 5.7697022358695595e-06, + "logits/chosen": -1.3718547821044922, + "logits/rejected": -1.3155597448349, + "logps/chosen": -0.9688811302185059, + "logps/rejected": -6.297783851623535, + "loss": 0.9867, + "odds_ratio_loss": 0.17819690704345703, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09688811004161835, + "rewards/margins": 0.5328903794288635, + "rewards/rejected": -0.6297784447669983, + "sft_loss": 0.9688811302185059, + "step": 5850 + }, + { + "epoch": 0.46, + "grad_norm": 8.421185493469238, + "learning_rate": 5.763617480328725e-06, + "logits/chosen": -1.3368728160858154, + "logits/rejected": -1.1470426321029663, + "logps/chosen": -1.2153146266937256, + "logps/rejected": -8.046792030334473, + "loss": 1.2225, + "odds_ratio_loss": 0.07202035933732986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12153146415948868, + "rewards/margins": 0.6831477284431458, + "rewards/rejected": -0.8046790957450867, + "sft_loss": 1.2153146266937256, + "step": 5855 + }, + { + "epoch": 0.46, + "grad_norm": 469.78997802734375, + "learning_rate": 5.757531566665029e-06, + "logits/chosen": -1.4922001361846924, + "logits/rejected": -1.472839117050171, + "logps/chosen": -2.2965407371520996, + "logps/rejected": -17.437091827392578, + "loss": 2.2972, + "odds_ratio_loss": 0.007058045826852322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22965407371520996, + "rewards/margins": 1.5140551328659058, + "rewards/rejected": -1.7437093257904053, + "sft_loss": 2.2965407371520996, + "step": 5860 + }, + { + "epoch": 0.46, + "grad_norm": 7.139679431915283, + "learning_rate": 5.751444504108532e-06, + "logits/chosen": -1.504317045211792, + "logits/rejected": -1.1596872806549072, + "logps/chosen": -1.9202678203582764, + "logps/rejected": -15.623100280761719, + "loss": 1.9275, + "odds_ratio_loss": 0.07225911319255829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1920267790555954, + "rewards/margins": 1.3702832460403442, + "rewards/rejected": -1.562309980392456, + "sft_loss": 1.9202678203582764, + "step": 5865 + }, + { + "epoch": 0.46, + "grad_norm": 9.28328800201416, + "learning_rate": 5.745356301891036e-06, + "logits/chosen": -1.3105159997940063, + "logits/rejected": -0.9987856149673462, + "logps/chosen": -0.8882701992988586, + "logps/rejected": -6.670027256011963, + "loss": 0.8993, + "odds_ratio_loss": 0.11003986746072769, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08882702887058258, + "rewards/margins": 0.5781757235527039, + "rewards/rejected": -0.66700279712677, + "sft_loss": 0.8882701992988586, + "step": 5870 + }, + { + "epoch": 0.46, + "grad_norm": 14.096705436706543, + "learning_rate": 5.739266969246077e-06, + "logits/chosen": -1.4479315280914307, + "logits/rejected": -1.2866063117980957, + "logps/chosen": -1.385507345199585, + "logps/rejected": -4.894179344177246, + "loss": 1.3972, + "odds_ratio_loss": 0.11661858856678009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13855072855949402, + "rewards/margins": 0.35086721181869507, + "rewards/rejected": -0.4894179403781891, + "sft_loss": 1.385507345199585, + "step": 5875 + }, + { + "epoch": 0.46, + "grad_norm": 6.6659746170043945, + "learning_rate": 5.733176515408896e-06, + "logits/chosen": -1.3389222621917725, + "logits/rejected": -0.9701216816902161, + "logps/chosen": -1.0844879150390625, + "logps/rejected": -11.632673263549805, + "loss": 1.0921, + "odds_ratio_loss": 0.07584533095359802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10844878852367401, + "rewards/margins": 1.0548183917999268, + "rewards/rejected": -1.1632672548294067, + "sft_loss": 1.0844879150390625, + "step": 5880 + }, + { + "epoch": 0.46, + "grad_norm": 195.03273010253906, + "learning_rate": 5.727084949616443e-06, + "logits/chosen": -1.4234966039657593, + "logits/rejected": -1.4936151504516602, + "logps/chosen": -1.540858507156372, + "logps/rejected": -5.37454891204834, + "loss": 1.5697, + "odds_ratio_loss": 0.28838053345680237, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15408584475517273, + "rewards/margins": 0.3833690583705902, + "rewards/rejected": -0.5374549627304077, + "sft_loss": 1.540858507156372, + "step": 5885 + }, + { + "epoch": 0.46, + "grad_norm": 7.396588325500488, + "learning_rate": 5.720992281107347e-06, + "logits/chosen": -1.4567421674728394, + "logits/rejected": -0.7531291842460632, + "logps/chosen": -1.057483196258545, + "logps/rejected": -1.95417058467865, + "loss": 1.1194, + "odds_ratio_loss": 0.6192514300346375, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10574833303689957, + "rewards/margins": 0.08966872841119766, + "rewards/rejected": -0.19541704654693604, + "sft_loss": 1.057483196258545, + "step": 5890 + }, + { + "epoch": 0.46, + "grad_norm": 17.07513427734375, + "learning_rate": 5.714898519121919e-06, + "logits/chosen": -1.527635097503662, + "logits/rejected": -1.0351780652999878, + "logps/chosen": -0.8463979959487915, + "logps/rejected": -3.124258518218994, + "loss": 0.876, + "odds_ratio_loss": 0.2964860796928406, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08463980257511139, + "rewards/margins": 0.22778606414794922, + "rewards/rejected": -0.3124258816242218, + "sft_loss": 0.8463979959487915, + "step": 5895 + }, + { + "epoch": 0.46, + "grad_norm": 21.76619529724121, + "learning_rate": 5.708803672902119e-06, + "logits/chosen": -1.3705116510391235, + "logits/rejected": -1.0004701614379883, + "logps/chosen": -0.9214539527893066, + "logps/rejected": -3.389974594116211, + "loss": 0.937, + "odds_ratio_loss": 0.15547111630439758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0921453982591629, + "rewards/margins": 0.2468520700931549, + "rewards/rejected": -0.338997483253479, + "sft_loss": 0.9214539527893066, + "step": 5900 + }, + { + "epoch": 0.46, + "grad_norm": 10.555737495422363, + "learning_rate": 5.7027077516915544e-06, + "logits/chosen": -1.3661435842514038, + "logits/rejected": -1.0756629705429077, + "logps/chosen": -0.8574056625366211, + "logps/rejected": -2.659536600112915, + "loss": 0.8867, + "odds_ratio_loss": 0.2932834029197693, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08574056625366211, + "rewards/margins": 0.180213063955307, + "rewards/rejected": -0.2659536302089691, + "sft_loss": 0.8574056625366211, + "step": 5905 + }, + { + "epoch": 0.46, + "grad_norm": 8.062660217285156, + "learning_rate": 5.6966107647354655e-06, + "logits/chosen": -1.5043424367904663, + "logits/rejected": -1.2339164018630981, + "logps/chosen": -1.0549910068511963, + "logps/rejected": -2.0557339191436768, + "loss": 1.081, + "odds_ratio_loss": 0.26006263494491577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10549911111593246, + "rewards/margins": 0.10007427632808685, + "rewards/rejected": -0.20557339489459991, + "sft_loss": 1.0549910068511963, + "step": 5910 + }, + { + "epoch": 0.46, + "grad_norm": 38.601078033447266, + "learning_rate": 5.690512721280707e-06, + "logits/chosen": -1.3967393636703491, + "logits/rejected": -0.752194881439209, + "logps/chosen": -1.0708661079406738, + "logps/rejected": -1.977912187576294, + "loss": 1.1081, + "odds_ratio_loss": 0.3719109892845154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10708661377429962, + "rewards/margins": 0.09070460498332977, + "rewards/rejected": -0.1977912038564682, + "sft_loss": 1.0708661079406738, + "step": 5915 + }, + { + "epoch": 0.46, + "grad_norm": 24.084102630615234, + "learning_rate": 5.684413630575737e-06, + "logits/chosen": -1.4691526889801025, + "logits/rejected": -1.2201991081237793, + "logps/chosen": -0.8691366314888, + "logps/rejected": -2.033484935760498, + "loss": 0.9264, + "odds_ratio_loss": 0.5723375678062439, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08691366016864777, + "rewards/margins": 0.11643485724925995, + "rewards/rejected": -0.20334851741790771, + "sft_loss": 0.8691366314888, + "step": 5920 + }, + { + "epoch": 0.46, + "grad_norm": 11.225747108459473, + "learning_rate": 5.678313501870599e-06, + "logits/chosen": -1.2836955785751343, + "logits/rejected": -1.284705400466919, + "logps/chosen": -0.8444737195968628, + "logps/rejected": -4.509748458862305, + "loss": 0.8763, + "odds_ratio_loss": 0.31828147172927856, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08444737643003464, + "rewards/margins": 0.3665274977684021, + "rewards/rejected": -0.45097485184669495, + "sft_loss": 0.8444737195968628, + "step": 5925 + }, + { + "epoch": 0.46, + "grad_norm": 10.136873245239258, + "learning_rate": 5.672212344416912e-06, + "logits/chosen": -1.4180656671524048, + "logits/rejected": -1.0987080335617065, + "logps/chosen": -0.887566864490509, + "logps/rejected": -8.7537841796875, + "loss": 0.9074, + "odds_ratio_loss": 0.1985785961151123, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08875668048858643, + "rewards/margins": 0.7866216897964478, + "rewards/rejected": -0.875378429889679, + "sft_loss": 0.887566864490509, + "step": 5930 + }, + { + "epoch": 0.46, + "grad_norm": 8.497200965881348, + "learning_rate": 5.666110167467858e-06, + "logits/chosen": -1.3332171440124512, + "logits/rejected": -1.3454488515853882, + "logps/chosen": -1.0007665157318115, + "logps/rejected": -4.486929416656494, + "loss": 1.0361, + "odds_ratio_loss": 0.3537340760231018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10007666051387787, + "rewards/margins": 0.34861627221107483, + "rewards/rejected": -0.4486928880214691, + "sft_loss": 1.0007665157318115, + "step": 5935 + }, + { + "epoch": 0.46, + "grad_norm": 13.362495422363281, + "learning_rate": 5.6600069802781634e-06, + "logits/chosen": -1.440659999847412, + "logits/rejected": -0.8654731512069702, + "logps/chosen": -0.9633086323738098, + "logps/rejected": -3.2738614082336426, + "loss": 0.9943, + "odds_ratio_loss": 0.30953115224838257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09633086621761322, + "rewards/margins": 0.23105528950691223, + "rewards/rejected": -0.32738617062568665, + "sft_loss": 0.9633086323738098, + "step": 5940 + }, + { + "epoch": 0.46, + "grad_norm": 25.45018768310547, + "learning_rate": 5.6539027921040836e-06, + "logits/chosen": -1.2397971153259277, + "logits/rejected": -0.7677778005599976, + "logps/chosen": -0.9034073948860168, + "logps/rejected": -4.984935760498047, + "loss": 0.9151, + "odds_ratio_loss": 0.11679281294345856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0903407484292984, + "rewards/margins": 0.40815287828445435, + "rewards/rejected": -0.49849358201026917, + "sft_loss": 0.9034073948860168, + "step": 5945 + }, + { + "epoch": 0.46, + "grad_norm": 5.295702934265137, + "learning_rate": 5.647797612203399e-06, + "logits/chosen": -1.394932508468628, + "logits/rejected": -1.2857708930969238, + "logps/chosen": -0.8075026273727417, + "logps/rejected": -8.137491226196289, + "loss": 0.8361, + "odds_ratio_loss": 0.2856552302837372, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08075026422739029, + "rewards/margins": 0.7329989671707153, + "rewards/rejected": -0.8137491941452026, + "sft_loss": 0.8075026273727417, + "step": 5950 + }, + { + "epoch": 0.46, + "grad_norm": 5.818515300750732, + "learning_rate": 5.641691449835387e-06, + "logits/chosen": -1.3131741285324097, + "logits/rejected": -1.2235875129699707, + "logps/chosen": -0.9159797430038452, + "logps/rejected": -6.6266608238220215, + "loss": 0.9419, + "odds_ratio_loss": 0.258743017911911, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09159798920154572, + "rewards/margins": 0.5710681676864624, + "rewards/rejected": -0.6626661419868469, + "sft_loss": 0.9159797430038452, + "step": 5955 + }, + { + "epoch": 0.46, + "grad_norm": 17.876649856567383, + "learning_rate": 5.635584314260818e-06, + "logits/chosen": -1.0992735624313354, + "logits/rejected": -1.378377079963684, + "logps/chosen": -0.7821237444877625, + "logps/rejected": -14.559664726257324, + "loss": 0.7915, + "odds_ratio_loss": 0.0936480313539505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07821237295866013, + "rewards/margins": 1.3777542114257812, + "rewards/rejected": -1.455966591835022, + "sft_loss": 0.7821237444877625, + "step": 5960 + }, + { + "epoch": 0.46, + "grad_norm": 5.123594284057617, + "learning_rate": 5.629476214741941e-06, + "logits/chosen": -1.41877281665802, + "logits/rejected": -0.8851032257080078, + "logps/chosen": -1.00931715965271, + "logps/rejected": -9.06434440612793, + "loss": 1.0301, + "odds_ratio_loss": 0.20743639767169952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10093171894550323, + "rewards/margins": 0.8055028915405273, + "rewards/rejected": -0.9064345359802246, + "sft_loss": 1.00931715965271, + "step": 5965 + }, + { + "epoch": 0.46, + "grad_norm": 21.803184509277344, + "learning_rate": 5.6233671605424625e-06, + "logits/chosen": -1.2566730976104736, + "logits/rejected": -1.1595937013626099, + "logps/chosen": -0.8769540786743164, + "logps/rejected": -1.283287763595581, + "loss": 0.9293, + "odds_ratio_loss": 0.5238395929336548, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08769541233778, + "rewards/margins": 0.04063335806131363, + "rewards/rejected": -0.12832877039909363, + "sft_loss": 0.8769540786743164, + "step": 5970 + }, + { + "epoch": 0.46, + "grad_norm": 20.731637954711914, + "learning_rate": 5.617257160927539e-06, + "logits/chosen": -1.3091537952423096, + "logits/rejected": -0.9598578214645386, + "logps/chosen": -1.185593843460083, + "logps/rejected": -11.909768104553223, + "loss": 1.2584, + "odds_ratio_loss": 0.7278609275817871, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11855938285589218, + "rewards/margins": 1.0724174976348877, + "rewards/rejected": -1.190976858139038, + "sft_loss": 1.185593843460083, + "step": 5975 + }, + { + "epoch": 0.47, + "grad_norm": 27.39661407470703, + "learning_rate": 5.611146225163762e-06, + "logits/chosen": -1.1741008758544922, + "logits/rejected": -0.8947650194168091, + "logps/chosen": -0.9251441955566406, + "logps/rejected": -5.459146022796631, + "loss": 0.9395, + "odds_ratio_loss": 0.14343713223934174, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09251442551612854, + "rewards/margins": 0.453400194644928, + "rewards/rejected": -0.5459145903587341, + "sft_loss": 0.9251441955566406, + "step": 5980 + }, + { + "epoch": 0.47, + "grad_norm": 5.630356311798096, + "learning_rate": 5.6050343625191385e-06, + "logits/chosen": -1.3278236389160156, + "logits/rejected": -1.176492691040039, + "logps/chosen": -0.9233428835868835, + "logps/rejected": -3.4967033863067627, + "loss": 0.9333, + "odds_ratio_loss": 0.0994301587343216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09233428537845612, + "rewards/margins": 0.2573360800743103, + "rewards/rejected": -0.3496703505516052, + "sft_loss": 0.9233428835868835, + "step": 5985 + }, + { + "epoch": 0.47, + "grad_norm": 17.32468605041504, + "learning_rate": 5.598921582263087e-06, + "logits/chosen": -1.1766599416732788, + "logits/rejected": -1.1519089937210083, + "logps/chosen": -1.043341040611267, + "logps/rejected": -6.694605827331543, + "loss": 1.103, + "odds_ratio_loss": 0.5967239737510681, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10433411598205566, + "rewards/margins": 0.5651265382766724, + "rewards/rejected": -0.669460654258728, + "sft_loss": 1.043341040611267, + "step": 5990 + }, + { + "epoch": 0.47, + "grad_norm": 5.883633613586426, + "learning_rate": 5.592807893666413e-06, + "logits/chosen": -1.2702155113220215, + "logits/rejected": -1.0136901140213013, + "logps/chosen": -0.5562310814857483, + "logps/rejected": -4.595681667327881, + "loss": 0.5879, + "odds_ratio_loss": 0.316739022731781, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05562310293316841, + "rewards/margins": 0.40394511818885803, + "rewards/rejected": -0.45956820249557495, + "sft_loss": 0.5562310814857483, + "step": 5995 + }, + { + "epoch": 0.47, + "grad_norm": 784.6876220703125, + "learning_rate": 5.586693306001303e-06, + "logits/chosen": -1.3234854936599731, + "logits/rejected": -1.0099961757659912, + "logps/chosen": -3.74432110786438, + "logps/rejected": -5.342950820922852, + "loss": 3.938, + "odds_ratio_loss": 1.9367377758026123, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.37443211674690247, + "rewards/margins": 0.1598629653453827, + "rewards/rejected": -0.5342950820922852, + "sft_loss": 3.74432110786438, + "step": 6000 + }, + { + "epoch": 0.47, + "grad_norm": 10.769394874572754, + "learning_rate": 5.580577828541306e-06, + "logits/chosen": -1.3043229579925537, + "logits/rejected": -1.1818064451217651, + "logps/chosen": -0.7584089040756226, + "logps/rejected": -6.9647536277771, + "loss": 0.7731, + "odds_ratio_loss": 0.1465327888727188, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07584089040756226, + "rewards/margins": 0.6206345558166504, + "rewards/rejected": -0.6964754462242126, + "sft_loss": 0.7584089040756226, + "step": 6005 + }, + { + "epoch": 0.47, + "grad_norm": 8.166546821594238, + "learning_rate": 5.5744614705613185e-06, + "logits/chosen": -1.3184517621994019, + "logits/rejected": -0.8232451677322388, + "logps/chosen": -1.2114698886871338, + "logps/rejected": -2.8581385612487793, + "loss": 1.2571, + "odds_ratio_loss": 0.45625367760658264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12114699184894562, + "rewards/margins": 0.16466687619686127, + "rewards/rejected": -0.2858138680458069, + "sft_loss": 1.2114698886871338, + "step": 6010 + }, + { + "epoch": 0.47, + "grad_norm": 4.491515159606934, + "learning_rate": 5.568344241337575e-06, + "logits/chosen": -1.3477157354354858, + "logits/rejected": -1.0697810649871826, + "logps/chosen": -0.9576603174209595, + "logps/rejected": -1.879073143005371, + "loss": 1.0139, + "odds_ratio_loss": 0.5625036358833313, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09576603025197983, + "rewards/margins": 0.092141292989254, + "rewards/rejected": -0.18790733814239502, + "sft_loss": 0.9576603174209595, + "step": 6015 + }, + { + "epoch": 0.47, + "grad_norm": 5.953708648681641, + "learning_rate": 5.562226150147629e-06, + "logits/chosen": -1.3620707988739014, + "logits/rejected": -1.260248064994812, + "logps/chosen": -0.64806067943573, + "logps/rejected": -1.206020712852478, + "loss": 0.6885, + "odds_ratio_loss": 0.4039697051048279, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06480606645345688, + "rewards/margins": 0.05579598993062973, + "rewards/rejected": -0.12060205638408661, + "sft_loss": 0.64806067943573, + "step": 6020 + }, + { + "epoch": 0.47, + "grad_norm": 16.700389862060547, + "learning_rate": 5.5561072062703426e-06, + "logits/chosen": -1.267134428024292, + "logits/rejected": -1.0757681131362915, + "logps/chosen": -0.8285772204399109, + "logps/rejected": -2.7522521018981934, + "loss": 0.847, + "odds_ratio_loss": 0.18403898179531097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08285772055387497, + "rewards/margins": 0.1923675239086151, + "rewards/rejected": -0.2752252519130707, + "sft_loss": 0.8285772204399109, + "step": 6025 + }, + { + "epoch": 0.47, + "grad_norm": 6.605173110961914, + "learning_rate": 5.549987418985873e-06, + "logits/chosen": -1.3399419784545898, + "logits/rejected": -0.7796742916107178, + "logps/chosen": -0.9934493899345398, + "logps/rejected": -2.5662717819213867, + "loss": 1.0154, + "odds_ratio_loss": 0.2192169427871704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09934493154287338, + "rewards/margins": 0.1572822481393814, + "rewards/rejected": -0.2566271722316742, + "sft_loss": 0.9934493899345398, + "step": 6030 + }, + { + "epoch": 0.47, + "grad_norm": 5.67198371887207, + "learning_rate": 5.543866797575653e-06, + "logits/chosen": -1.2885167598724365, + "logits/rejected": -0.9590283632278442, + "logps/chosen": -1.0691325664520264, + "logps/rejected": -3.0669302940368652, + "loss": 1.0897, + "odds_ratio_loss": 0.20612020790576935, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10691328346729279, + "rewards/margins": 0.19977974891662598, + "rewards/rejected": -0.30669301748275757, + "sft_loss": 1.0691325664520264, + "step": 6035 + }, + { + "epoch": 0.47, + "grad_norm": 15.120502471923828, + "learning_rate": 5.537745351322382e-06, + "logits/chosen": -1.2657688856124878, + "logits/rejected": -0.9663764834403992, + "logps/chosen": -0.5593496561050415, + "logps/rejected": -4.728362560272217, + "loss": 0.567, + "odds_ratio_loss": 0.07687228918075562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05593496561050415, + "rewards/margins": 0.41690129041671753, + "rewards/rejected": -0.4728362560272217, + "sft_loss": 0.5593496561050415, + "step": 6040 + }, + { + "epoch": 0.47, + "grad_norm": 18.173458099365234, + "learning_rate": 5.531623089510011e-06, + "logits/chosen": -1.3215463161468506, + "logits/rejected": -0.9034023284912109, + "logps/chosen": -1.3617818355560303, + "logps/rejected": -5.3735671043396, + "loss": 1.3972, + "odds_ratio_loss": 0.35413289070129395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1361781805753708, + "rewards/margins": 0.4011785387992859, + "rewards/rejected": -0.5373567342758179, + "sft_loss": 1.3617818355560303, + "step": 6045 + }, + { + "epoch": 0.47, + "grad_norm": 16.87181854248047, + "learning_rate": 5.525500021423726e-06, + "logits/chosen": -1.3848448991775513, + "logits/rejected": -1.0528861284255981, + "logps/chosen": -1.08721923828125, + "logps/rejected": -6.1181511878967285, + "loss": 1.1468, + "odds_ratio_loss": 0.5962321162223816, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10872192680835724, + "rewards/margins": 0.5030931830406189, + "rewards/rejected": -0.6118150949478149, + "sft_loss": 1.08721923828125, + "step": 6050 + }, + { + "epoch": 0.47, + "grad_norm": 7.121865272521973, + "learning_rate": 5.519376156349942e-06, + "logits/chosen": -1.3546545505523682, + "logits/rejected": -0.6967934370040894, + "logps/chosen": -0.7989253997802734, + "logps/rejected": -5.922381401062012, + "loss": 0.8182, + "odds_ratio_loss": 0.1925160139799118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07989253848791122, + "rewards/margins": 0.5123456716537476, + "rewards/rejected": -0.592238187789917, + "sft_loss": 0.7989253997802734, + "step": 6055 + }, + { + "epoch": 0.47, + "grad_norm": 10.315966606140137, + "learning_rate": 5.513251503576271e-06, + "logits/chosen": -1.4382257461547852, + "logits/rejected": -0.8289369344711304, + "logps/chosen": -0.7345893383026123, + "logps/rejected": -3.5460829734802246, + "loss": 0.7703, + "odds_ratio_loss": 0.35676613450050354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07345893979072571, + "rewards/margins": 0.28114938735961914, + "rewards/rejected": -0.35460832715034485, + "sft_loss": 0.7345893383026123, + "step": 6060 + }, + { + "epoch": 0.47, + "grad_norm": 12.880824089050293, + "learning_rate": 5.507126072391531e-06, + "logits/chosen": -1.294734239578247, + "logits/rejected": -0.965386688709259, + "logps/chosen": -0.9744323492050171, + "logps/rejected": -3.9022727012634277, + "loss": 0.986, + "odds_ratio_loss": 0.11532683670520782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09744324535131454, + "rewards/margins": 0.29278403520584106, + "rewards/rejected": -0.3902273178100586, + "sft_loss": 0.9744323492050171, + "step": 6065 + }, + { + "epoch": 0.47, + "grad_norm": 4.685819149017334, + "learning_rate": 5.500999872085716e-06, + "logits/chosen": -1.4079697132110596, + "logits/rejected": -1.3205724954605103, + "logps/chosen": -3.4261250495910645, + "logps/rejected": -5.981120586395264, + "loss": 3.6766, + "odds_ratio_loss": 2.5042619705200195, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.34261250495910645, + "rewards/margins": 0.25549960136413574, + "rewards/rejected": -0.5981121063232422, + "sft_loss": 3.4261250495910645, + "step": 6070 + }, + { + "epoch": 0.47, + "grad_norm": 9.858994483947754, + "learning_rate": 5.494872911949984e-06, + "logits/chosen": -1.4069186449050903, + "logits/rejected": -1.3357126712799072, + "logps/chosen": -1.2475851774215698, + "logps/rejected": -9.896788597106934, + "loss": 1.2574, + "odds_ratio_loss": 0.0983632430434227, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1247585192322731, + "rewards/margins": 0.8649203181266785, + "rewards/rejected": -0.9896788597106934, + "sft_loss": 1.2475851774215698, + "step": 6075 + }, + { + "epoch": 0.47, + "grad_norm": 7.707642078399658, + "learning_rate": 5.488745201276651e-06, + "logits/chosen": -1.3376123905181885, + "logits/rejected": -0.8407508730888367, + "logps/chosen": -0.7080657482147217, + "logps/rejected": -3.033329963684082, + "loss": 0.7166, + "odds_ratio_loss": 0.0854514017701149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0708065778017044, + "rewards/margins": 0.23252645134925842, + "rewards/rejected": -0.303333044052124, + "sft_loss": 0.7080657482147217, + "step": 6080 + }, + { + "epoch": 0.47, + "grad_norm": 8.964948654174805, + "learning_rate": 5.482616749359165e-06, + "logits/chosen": -1.382056474685669, + "logits/rejected": -0.9311412572860718, + "logps/chosen": -1.1271944046020508, + "logps/rejected": -3.809868335723877, + "loss": 1.1583, + "odds_ratio_loss": 0.31120091676712036, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11271943897008896, + "rewards/margins": 0.26826736330986023, + "rewards/rejected": -0.3809867799282074, + "sft_loss": 1.1271944046020508, + "step": 6085 + }, + { + "epoch": 0.47, + "grad_norm": 12.242707252502441, + "learning_rate": 5.476487565492105e-06, + "logits/chosen": -1.3703018426895142, + "logits/rejected": -1.1052435636520386, + "logps/chosen": -0.9987226724624634, + "logps/rejected": -7.2316131591796875, + "loss": 1.0014, + "odds_ratio_loss": 0.026345301419496536, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09987227618694305, + "rewards/margins": 0.6232890486717224, + "rewards/rejected": -0.7231613397598267, + "sft_loss": 0.9987226724624634, + "step": 6090 + }, + { + "epoch": 0.47, + "grad_norm": 6.250059127807617, + "learning_rate": 5.4703576589711534e-06, + "logits/chosen": -1.2467752695083618, + "logits/rejected": -1.3182735443115234, + "logps/chosen": -0.82000333070755, + "logps/rejected": -5.343472957611084, + "loss": 0.8405, + "odds_ratio_loss": 0.20504280924797058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08200033009052277, + "rewards/margins": 0.45234689116477966, + "rewards/rejected": -0.5343472957611084, + "sft_loss": 0.82000333070755, + "step": 6095 + }, + { + "epoch": 0.47, + "grad_norm": 19.261926651000977, + "learning_rate": 5.464227039093093e-06, + "logits/chosen": -1.3524301052093506, + "logits/rejected": -1.0798319578170776, + "logps/chosen": -1.0138263702392578, + "logps/rejected": -7.436760902404785, + "loss": 1.0177, + "odds_ratio_loss": 0.03829532116651535, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10138263553380966, + "rewards/margins": 0.6422935128211975, + "rewards/rejected": -0.7436760663986206, + "sft_loss": 1.0138263702392578, + "step": 6100 + }, + { + "epoch": 0.47, + "grad_norm": 11.354137420654297, + "learning_rate": 5.458095715155788e-06, + "logits/chosen": -1.3508212566375732, + "logits/rejected": -1.3399690389633179, + "logps/chosen": -1.062408685684204, + "logps/rejected": -5.051278591156006, + "loss": 1.0874, + "odds_ratio_loss": 0.24966564774513245, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1062408834695816, + "rewards/margins": 0.3988869786262512, + "rewards/rejected": -0.5051278471946716, + "sft_loss": 1.062408685684204, + "step": 6105 + }, + { + "epoch": 0.48, + "grad_norm": 6.081448554992676, + "learning_rate": 5.451963696458168e-06, + "logits/chosen": -1.338024377822876, + "logits/rejected": -1.045498251914978, + "logps/chosen": -0.9185785055160522, + "logps/rejected": -5.6302618980407715, + "loss": 0.9524, + "odds_ratio_loss": 0.33786827325820923, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09185785055160522, + "rewards/margins": 0.4711683690547943, + "rewards/rejected": -0.5630262494087219, + "sft_loss": 0.9185785055160522, + "step": 6110 + }, + { + "epoch": 0.48, + "grad_norm": 9.321093559265137, + "learning_rate": 5.445830992300218e-06, + "logits/chosen": -1.4295237064361572, + "logits/rejected": -0.8577009439468384, + "logps/chosen": -0.8555432558059692, + "logps/rejected": -4.726191520690918, + "loss": 0.867, + "odds_ratio_loss": 0.11437414586544037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08555431663990021, + "rewards/margins": 0.38706478476524353, + "rewards/rejected": -0.4726191461086273, + "sft_loss": 0.8555432558059692, + "step": 6115 + }, + { + "epoch": 0.48, + "grad_norm": 51.58865737915039, + "learning_rate": 5.439697611982966e-06, + "logits/chosen": -1.3293185234069824, + "logits/rejected": -1.09556245803833, + "logps/chosen": -0.684034526348114, + "logps/rejected": -5.386687278747559, + "loss": 0.6868, + "odds_ratio_loss": 0.028074750676751137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.068403460085392, + "rewards/margins": 0.47026529908180237, + "rewards/rejected": -0.5386687517166138, + "sft_loss": 0.684034526348114, + "step": 6120 + }, + { + "epoch": 0.48, + "grad_norm": 7.41538143157959, + "learning_rate": 5.4335635648084586e-06, + "logits/chosen": -1.3778337240219116, + "logits/rejected": -1.0915286540985107, + "logps/chosen": -0.9108161926269531, + "logps/rejected": -6.016547203063965, + "loss": 0.932, + "odds_ratio_loss": 0.21229729056358337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09108161926269531, + "rewards/margins": 0.5105730891227722, + "rewards/rejected": -0.6016547083854675, + "sft_loss": 0.9108161926269531, + "step": 6125 + }, + { + "epoch": 0.48, + "grad_norm": 5.8430352210998535, + "learning_rate": 5.4274288600797575e-06, + "logits/chosen": -1.5422677993774414, + "logits/rejected": -1.1286903619766235, + "logps/chosen": -0.8631542921066284, + "logps/rejected": -6.0735392570495605, + "loss": 0.8843, + "odds_ratio_loss": 0.21111269295215607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08631541579961777, + "rewards/margins": 0.5210385322570801, + "rewards/rejected": -0.607353925704956, + "sft_loss": 0.8631542921066284, + "step": 6130 + }, + { + "epoch": 0.48, + "grad_norm": 117.86041259765625, + "learning_rate": 5.42129350710092e-06, + "logits/chosen": -1.4739691019058228, + "logits/rejected": -1.2402070760726929, + "logps/chosen": -1.0958428382873535, + "logps/rejected": -8.138407707214355, + "loss": 1.1265, + "odds_ratio_loss": 0.30624285340309143, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10958428680896759, + "rewards/margins": 0.7042564749717712, + "rewards/rejected": -0.8138407468795776, + "sft_loss": 1.0958428382873535, + "step": 6135 + }, + { + "epoch": 0.48, + "grad_norm": 7.562963008880615, + "learning_rate": 5.41515751517699e-06, + "logits/chosen": -1.413245439529419, + "logits/rejected": -1.022647500038147, + "logps/chosen": -0.8586851358413696, + "logps/rejected": -4.1241044998168945, + "loss": 0.8875, + "odds_ratio_loss": 0.287839412689209, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08586851507425308, + "rewards/margins": 0.326541930437088, + "rewards/rejected": -0.4124104380607605, + "sft_loss": 0.8586851358413696, + "step": 6140 + }, + { + "epoch": 0.48, + "grad_norm": 8.0787353515625, + "learning_rate": 5.409020893613979e-06, + "logits/chosen": -1.4699350595474243, + "logits/rejected": -1.1512773036956787, + "logps/chosen": -1.0986006259918213, + "logps/rejected": -2.7863099575042725, + "loss": 1.1517, + "odds_ratio_loss": 0.5305068492889404, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10986007750034332, + "rewards/margins": 0.1687709391117096, + "rewards/rejected": -0.2786310017108917, + "sft_loss": 1.0986006259918213, + "step": 6145 + }, + { + "epoch": 0.48, + "grad_norm": 29.433841705322266, + "learning_rate": 5.402883651718851e-06, + "logits/chosen": -1.394988775253296, + "logits/rejected": -1.1701581478118896, + "logps/chosen": -0.9054506421089172, + "logps/rejected": -3.5625643730163574, + "loss": 0.94, + "odds_ratio_loss": 0.34540730714797974, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09054505825042725, + "rewards/margins": 0.26571139693260193, + "rewards/rejected": -0.3562564551830292, + "sft_loss": 0.9054506421089172, + "step": 6150 + }, + { + "epoch": 0.48, + "grad_norm": 5.943950176239014, + "learning_rate": 5.396745798799513e-06, + "logits/chosen": -1.378969669342041, + "logits/rejected": -1.0622516870498657, + "logps/chosen": -0.8753480911254883, + "logps/rejected": -5.49657678604126, + "loss": 0.8861, + "odds_ratio_loss": 0.10759921371936798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08753480017185211, + "rewards/margins": 0.4621228277683258, + "rewards/rejected": -0.5496576428413391, + "sft_loss": 0.8753480911254883, + "step": 6155 + }, + { + "epoch": 0.48, + "grad_norm": 42.807716369628906, + "learning_rate": 5.390607344164799e-06, + "logits/chosen": -1.5209299325942993, + "logits/rejected": -0.9063835144042969, + "logps/chosen": -0.9058001637458801, + "logps/rejected": -2.6679656505584717, + "loss": 0.9345, + "odds_ratio_loss": 0.28699612617492676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09058002382516861, + "rewards/margins": 0.17621657252311707, + "rewards/rejected": -0.2667965590953827, + "sft_loss": 0.9058001637458801, + "step": 6160 + }, + { + "epoch": 0.48, + "grad_norm": 35.36753845214844, + "learning_rate": 5.384468297124452e-06, + "logits/chosen": -1.4319443702697754, + "logits/rejected": -1.0489351749420166, + "logps/chosen": -0.871281623840332, + "logps/rejected": -8.540701866149902, + "loss": 0.8721, + "odds_ratio_loss": 0.008153039962053299, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0871281623840332, + "rewards/margins": 0.7669421434402466, + "rewards/rejected": -0.8540701866149902, + "sft_loss": 0.871281623840332, + "step": 6165 + }, + { + "epoch": 0.48, + "grad_norm": 17.795150756835938, + "learning_rate": 5.378328666989121e-06, + "logits/chosen": -1.5011329650878906, + "logits/rejected": -1.4191067218780518, + "logps/chosen": -0.7918688058853149, + "logps/rejected": -5.1941094398498535, + "loss": 0.8059, + "odds_ratio_loss": 0.1406429409980774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07918687909841537, + "rewards/margins": 0.4402240216732025, + "rewards/rejected": -0.5194109678268433, + "sft_loss": 0.7918688058853149, + "step": 6170 + }, + { + "epoch": 0.48, + "grad_norm": 48.25984573364258, + "learning_rate": 5.37218846307033e-06, + "logits/chosen": -1.117694616317749, + "logits/rejected": -1.5141966342926025, + "logps/chosen": -1.1156768798828125, + "logps/rejected": -4.8804168701171875, + "loss": 1.1522, + "odds_ratio_loss": 0.3651631474494934, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1115676760673523, + "rewards/margins": 0.376473993062973, + "rewards/rejected": -0.4880416989326477, + "sft_loss": 1.1156768798828125, + "step": 6175 + }, + { + "epoch": 0.48, + "grad_norm": 13.673327445983887, + "learning_rate": 5.36604769468048e-06, + "logits/chosen": -1.465071678161621, + "logits/rejected": -1.0160772800445557, + "logps/chosen": -0.8801183700561523, + "logps/rejected": -9.317262649536133, + "loss": 0.9039, + "odds_ratio_loss": 0.23795120418071747, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08801184594631195, + "rewards/margins": 0.8437144160270691, + "rewards/rejected": -0.9317262768745422, + "sft_loss": 0.8801183700561523, + "step": 6180 + }, + { + "epoch": 0.48, + "grad_norm": 9.324233055114746, + "learning_rate": 5.359906371132828e-06, + "logits/chosen": -1.3929476737976074, + "logits/rejected": -0.9207280278205872, + "logps/chosen": -1.0257680416107178, + "logps/rejected": -5.169532299041748, + "loss": 1.0405, + "odds_ratio_loss": 0.14743469655513763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10257680714130402, + "rewards/margins": 0.414376437664032, + "rewards/rejected": -0.5169532299041748, + "sft_loss": 1.0257680416107178, + "step": 6185 + }, + { + "epoch": 0.48, + "grad_norm": 26.96074676513672, + "learning_rate": 5.3537645017414666e-06, + "logits/chosen": -1.2415544986724854, + "logits/rejected": -0.9335296750068665, + "logps/chosen": -0.7189784049987793, + "logps/rejected": -7.029814720153809, + "loss": 0.7201, + "odds_ratio_loss": 0.010862020775675774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07189784198999405, + "rewards/margins": 0.631083607673645, + "rewards/rejected": -0.7029815316200256, + "sft_loss": 0.7189784049987793, + "step": 6190 + }, + { + "epoch": 0.48, + "grad_norm": 10.321436882019043, + "learning_rate": 5.347622095821324e-06, + "logits/chosen": -1.3559714555740356, + "logits/rejected": -1.0688087940216064, + "logps/chosen": -1.0005199909210205, + "logps/rejected": -4.694094657897949, + "loss": 1.0338, + "odds_ratio_loss": 0.332375168800354, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10005201399326324, + "rewards/margins": 0.36935749650001526, + "rewards/rejected": -0.4694095253944397, + "sft_loss": 1.0005199909210205, + "step": 6195 + }, + { + "epoch": 0.48, + "grad_norm": 10.095077514648438, + "learning_rate": 5.3414791626881355e-06, + "logits/chosen": -1.275702714920044, + "logits/rejected": -1.1703498363494873, + "logps/chosen": -1.2571362257003784, + "logps/rejected": -11.217076301574707, + "loss": 1.2676, + "odds_ratio_loss": 0.10499601066112518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12571361660957336, + "rewards/margins": 0.9959939122200012, + "rewards/rejected": -1.121707558631897, + "sft_loss": 1.2571362257003784, + "step": 6200 + }, + { + "epoch": 0.48, + "grad_norm": 13.795586585998535, + "learning_rate": 5.335335711658443e-06, + "logits/chosen": -1.413290023803711, + "logits/rejected": -1.6291453838348389, + "logps/chosen": -0.8038554191589355, + "logps/rejected": -13.573580741882324, + "loss": 0.8133, + "odds_ratio_loss": 0.09486626088619232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08038554340600967, + "rewards/margins": 1.2769726514816284, + "rewards/rejected": -1.3573582172393799, + "sft_loss": 0.8038554191589355, + "step": 6205 + }, + { + "epoch": 0.48, + "grad_norm": 7.473656177520752, + "learning_rate": 5.329191752049567e-06, + "logits/chosen": -1.356372594833374, + "logits/rejected": -0.8928033709526062, + "logps/chosen": -0.9042119979858398, + "logps/rejected": -6.3035078048706055, + "loss": 0.9056, + "odds_ratio_loss": 0.013910293579101562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09042120724916458, + "rewards/margins": 0.5399295687675476, + "rewards/rejected": -0.6303507685661316, + "sft_loss": 0.9042119979858398, + "step": 6210 + }, + { + "epoch": 0.48, + "grad_norm": 7.009920597076416, + "learning_rate": 5.3230472931796015e-06, + "logits/chosen": -1.2536189556121826, + "logits/rejected": -0.914240837097168, + "logps/chosen": -1.2416796684265137, + "logps/rejected": -11.471125602722168, + "loss": 1.2476, + "odds_ratio_loss": 0.058856308460235596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12416797876358032, + "rewards/margins": 1.022944688796997, + "rewards/rejected": -1.1471126079559326, + "sft_loss": 1.2416796684265137, + "step": 6215 + }, + { + "epoch": 0.48, + "grad_norm": 257.0062255859375, + "learning_rate": 5.316902344367403e-06, + "logits/chosen": -1.0821647644042969, + "logits/rejected": -1.439767599105835, + "logps/chosen": -2.010810375213623, + "logps/rejected": -7.0934295654296875, + "loss": 2.0161, + "odds_ratio_loss": 0.05243242532014847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2010810375213623, + "rewards/margins": 0.5082619786262512, + "rewards/rejected": -0.7093430161476135, + "sft_loss": 2.010810375213623, + "step": 6220 + }, + { + "epoch": 0.48, + "grad_norm": 11.290236473083496, + "learning_rate": 5.310756914932562e-06, + "logits/chosen": -1.4713438749313354, + "logits/rejected": -1.341290831565857, + "logps/chosen": -1.0983505249023438, + "logps/rejected": -3.4779162406921387, + "loss": 1.1325, + "odds_ratio_loss": 0.3412621021270752, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10983506590127945, + "rewards/margins": 0.23795659840106964, + "rewards/rejected": -0.3477916717529297, + "sft_loss": 1.0983505249023438, + "step": 6225 + }, + { + "epoch": 0.48, + "grad_norm": 6.028042316436768, + "learning_rate": 5.304611014195404e-06, + "logits/chosen": -1.4126899242401123, + "logits/rejected": -0.7001927495002747, + "logps/chosen": -1.079026699066162, + "logps/rejected": -5.179287910461426, + "loss": 1.0821, + "odds_ratio_loss": 0.030901432037353516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10790266841650009, + "rewards/margins": 0.4100261330604553, + "rewards/rejected": -0.5179287791252136, + "sft_loss": 1.079026699066162, + "step": 6230 + }, + { + "epoch": 0.49, + "grad_norm": 11.563214302062988, + "learning_rate": 5.298464651476969e-06, + "logits/chosen": -1.3443890810012817, + "logits/rejected": -0.8630617260932922, + "logps/chosen": -1.0741831064224243, + "logps/rejected": -2.882352352142334, + "loss": 1.1107, + "odds_ratio_loss": 0.3656443953514099, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10741831362247467, + "rewards/margins": 0.18081694841384888, + "rewards/rejected": -0.28823524713516235, + "sft_loss": 1.0741831064224243, + "step": 6235 + }, + { + "epoch": 0.49, + "grad_norm": 8.393163681030273, + "learning_rate": 5.292317836098996e-06, + "logits/chosen": -1.4747278690338135, + "logits/rejected": -0.805279552936554, + "logps/chosen": -0.876976490020752, + "logps/rejected": -2.44502329826355, + "loss": 0.9467, + "odds_ratio_loss": 0.69696044921875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08769764751195908, + "rewards/margins": 0.15680469572544098, + "rewards/rejected": -0.24450235068798065, + "sft_loss": 0.876976490020752, + "step": 6240 + }, + { + "epoch": 0.49, + "grad_norm": 37.72679138183594, + "learning_rate": 5.286170577383909e-06, + "logits/chosen": -1.3478347063064575, + "logits/rejected": -0.7419066429138184, + "logps/chosen": -1.057916522026062, + "logps/rejected": -2.9338550567626953, + "loss": 1.1178, + "odds_ratio_loss": 0.5986490249633789, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10579165071249008, + "rewards/margins": 0.18759389221668243, + "rewards/rejected": -0.2933855354785919, + "sft_loss": 1.057916522026062, + "step": 6245 + }, + { + "epoch": 0.49, + "grad_norm": 6.0733795166015625, + "learning_rate": 5.280022884654809e-06, + "logits/chosen": -1.2470510005950928, + "logits/rejected": -0.6841954588890076, + "logps/chosen": -0.8482405543327332, + "logps/rejected": -1.3880754709243774, + "loss": 0.8984, + "odds_ratio_loss": 0.5011777281761169, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08482404798269272, + "rewards/margins": 0.053983498364686966, + "rewards/rejected": -0.13880756497383118, + "sft_loss": 0.8482405543327332, + "step": 6250 + }, + { + "epoch": 0.49, + "grad_norm": 87.05939483642578, + "learning_rate": 5.27387476723545e-06, + "logits/chosen": -1.4118794202804565, + "logits/rejected": -1.2081636190414429, + "logps/chosen": -1.1015691757202148, + "logps/rejected": -4.243358612060547, + "loss": 1.1348, + "odds_ratio_loss": 0.332470178604126, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11015691608190536, + "rewards/margins": 0.3141789436340332, + "rewards/rejected": -0.42433586716651917, + "sft_loss": 1.1015691757202148, + "step": 6255 + }, + { + "epoch": 0.49, + "grad_norm": 5.861100673675537, + "learning_rate": 5.267726234450236e-06, + "logits/chosen": -1.2835099697113037, + "logits/rejected": -0.6652259826660156, + "logps/chosen": -0.7800506949424744, + "logps/rejected": -3.8315799236297607, + "loss": 0.8021, + "odds_ratio_loss": 0.2206587791442871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07800506800413132, + "rewards/margins": 0.30515292286872864, + "rewards/rejected": -0.38315796852111816, + "sft_loss": 0.7800506949424744, + "step": 6260 + }, + { + "epoch": 0.49, + "grad_norm": 43.585899353027344, + "learning_rate": 5.261577295624194e-06, + "logits/chosen": -1.4102215766906738, + "logits/rejected": -0.8064863085746765, + "logps/chosen": -1.0556681156158447, + "logps/rejected": -4.5865912437438965, + "loss": 1.07, + "odds_ratio_loss": 0.14332325756549835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10556681454181671, + "rewards/margins": 0.3530922830104828, + "rewards/rejected": -0.4586590826511383, + "sft_loss": 1.0556681156158447, + "step": 6265 + }, + { + "epoch": 0.49, + "grad_norm": 7.329597473144531, + "learning_rate": 5.2554279600829714e-06, + "logits/chosen": -1.498291254043579, + "logits/rejected": -1.073769211769104, + "logps/chosen": -1.0253098011016846, + "logps/rejected": -1.5684717893600464, + "loss": 1.0742, + "odds_ratio_loss": 0.48867884278297424, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10253097862005234, + "rewards/margins": 0.0543162003159523, + "rewards/rejected": -0.15684717893600464, + "sft_loss": 1.0253098011016846, + "step": 6270 + }, + { + "epoch": 0.49, + "grad_norm": 21.413816452026367, + "learning_rate": 5.24927823715282e-06, + "logits/chosen": -1.4272135496139526, + "logits/rejected": -1.0142067670822144, + "logps/chosen": -0.8830957412719727, + "logps/rejected": -3.962498903274536, + "loss": 0.8897, + "odds_ratio_loss": 0.06568063795566559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08830957859754562, + "rewards/margins": 0.3079403340816498, + "rewards/rejected": -0.396249920129776, + "sft_loss": 0.8830957412719727, + "step": 6275 + }, + { + "epoch": 0.49, + "grad_norm": 60.13251876831055, + "learning_rate": 5.243128136160569e-06, + "logits/chosen": -1.362410306930542, + "logits/rejected": -1.2184256315231323, + "logps/chosen": -1.1046503782272339, + "logps/rejected": -2.781712055206299, + "loss": 1.1461, + "odds_ratio_loss": 0.4149476885795593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11046504974365234, + "rewards/margins": 0.1677061766386032, + "rewards/rejected": -0.27817121148109436, + "sft_loss": 1.1046503782272339, + "step": 6280 + }, + { + "epoch": 0.49, + "grad_norm": 50.41915512084961, + "learning_rate": 5.236977666433633e-06, + "logits/chosen": -1.3632569313049316, + "logits/rejected": -1.3883846998214722, + "logps/chosen": -1.651738166809082, + "logps/rejected": -3.164759635925293, + "loss": 1.6967, + "odds_ratio_loss": 0.44982948899269104, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16517381370067596, + "rewards/margins": 0.15130215883255005, + "rewards/rejected": -0.3164759576320648, + "sft_loss": 1.651738166809082, + "step": 6285 + }, + { + "epoch": 0.49, + "grad_norm": 28.82509422302246, + "learning_rate": 5.230826837299976e-06, + "logits/chosen": -1.3222404718399048, + "logits/rejected": -1.2658565044403076, + "logps/chosen": -1.2086848020553589, + "logps/rejected": -2.9066264629364014, + "loss": 1.2547, + "odds_ratio_loss": 0.46052518486976624, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1208684891462326, + "rewards/margins": 0.16979417204856873, + "rewards/rejected": -0.2906626760959625, + "sft_loss": 1.2086848020553589, + "step": 6290 + }, + { + "epoch": 0.49, + "grad_norm": 8.832147598266602, + "learning_rate": 5.224675658088115e-06, + "logits/chosen": -1.4218961000442505, + "logits/rejected": -1.237674593925476, + "logps/chosen": -0.9825714826583862, + "logps/rejected": -4.558527946472168, + "loss": 1.0425, + "odds_ratio_loss": 0.5992078185081482, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0982571467757225, + "rewards/margins": 0.35759562253952026, + "rewards/rejected": -0.45585280656814575, + "sft_loss": 0.9825714826583862, + "step": 6295 + }, + { + "epoch": 0.49, + "grad_norm": 10.061553001403809, + "learning_rate": 5.218524138127092e-06, + "logits/chosen": -1.4015705585479736, + "logits/rejected": -0.6733571887016296, + "logps/chosen": -0.9295048713684082, + "logps/rejected": -1.6065304279327393, + "loss": 0.9707, + "odds_ratio_loss": 0.41185611486434937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0929504856467247, + "rewards/margins": 0.06770254671573639, + "rewards/rejected": -0.1606530249118805, + "sft_loss": 0.9295048713684082, + "step": 6300 + }, + { + "epoch": 0.49, + "grad_norm": 7.776479244232178, + "learning_rate": 5.212372286746469e-06, + "logits/chosen": -1.291441559791565, + "logits/rejected": -1.0221130847930908, + "logps/chosen": -1.084132432937622, + "logps/rejected": -3.2497684955596924, + "loss": 1.1144, + "odds_ratio_loss": 0.30258169770240784, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10841324180364609, + "rewards/margins": 0.2165636122226715, + "rewards/rejected": -0.3249768614768982, + "sft_loss": 1.084132432937622, + "step": 6305 + }, + { + "epoch": 0.49, + "grad_norm": 8.815446853637695, + "learning_rate": 5.206220113276309e-06, + "logits/chosen": -1.3701775074005127, + "logits/rejected": -0.9479221105575562, + "logps/chosen": -1.3828586339950562, + "logps/rejected": -3.8318514823913574, + "loss": 1.4137, + "odds_ratio_loss": 0.30864453315734863, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13828587532043457, + "rewards/margins": 0.24489931762218475, + "rewards/rejected": -0.38318517804145813, + "sft_loss": 1.3828586339950562, + "step": 6310 + }, + { + "epoch": 0.49, + "grad_norm": 16.117143630981445, + "learning_rate": 5.200067627047164e-06, + "logits/chosen": -1.4318413734436035, + "logits/rejected": -1.2156895399093628, + "logps/chosen": -1.0816423892974854, + "logps/rejected": -2.959960460662842, + "loss": 1.1072, + "odds_ratio_loss": 0.2559763193130493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1081642359495163, + "rewards/margins": 0.1878318041563034, + "rewards/rejected": -0.2959960401058197, + "sft_loss": 1.0816423892974854, + "step": 6315 + }, + { + "epoch": 0.49, + "grad_norm": 15.240541458129883, + "learning_rate": 5.193914837390062e-06, + "logits/chosen": -1.1071289777755737, + "logits/rejected": -1.1513497829437256, + "logps/chosen": -0.6938591003417969, + "logps/rejected": -2.251840591430664, + "loss": 0.7126, + "odds_ratio_loss": 0.18697881698608398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06938590854406357, + "rewards/margins": 0.15579816699028015, + "rewards/rejected": -0.22518405318260193, + "sft_loss": 0.6938591003417969, + "step": 6320 + }, + { + "epoch": 0.49, + "grad_norm": 13.213371276855469, + "learning_rate": 5.187761753636488e-06, + "logits/chosen": -1.2028157711029053, + "logits/rejected": -0.8299140930175781, + "logps/chosen": -1.0828067064285278, + "logps/rejected": -2.840242862701416, + "loss": 1.1056, + "odds_ratio_loss": 0.2279554158449173, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10828067362308502, + "rewards/margins": 0.17574363946914673, + "rewards/rejected": -0.28402429819107056, + "sft_loss": 1.0828067064285278, + "step": 6325 + }, + { + "epoch": 0.49, + "grad_norm": 5.185869216918945, + "learning_rate": 5.181608385118375e-06, + "logits/chosen": -1.275193452835083, + "logits/rejected": -0.7473627924919128, + "logps/chosen": -0.817855179309845, + "logps/rejected": -3.1327741146087646, + "loss": 0.847, + "odds_ratio_loss": 0.2910774052143097, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08178551495075226, + "rewards/margins": 0.23149189352989197, + "rewards/rejected": -0.3132774233818054, + "sft_loss": 0.817855179309845, + "step": 6330 + }, + { + "epoch": 0.49, + "grad_norm": 5.627729892730713, + "learning_rate": 5.175454741168088e-06, + "logits/chosen": -1.3115571737289429, + "logits/rejected": -0.943062424659729, + "logps/chosen": -0.9332623481750488, + "logps/rejected": -1.7769415378570557, + "loss": 0.9732, + "odds_ratio_loss": 0.3995886743068695, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09332623332738876, + "rewards/margins": 0.08436791598796844, + "rewards/rejected": -0.1776941567659378, + "sft_loss": 0.9332623481750488, + "step": 6335 + }, + { + "epoch": 0.49, + "grad_norm": 11.997739791870117, + "learning_rate": 5.169300831118411e-06, + "logits/chosen": -1.3951687812805176, + "logits/rejected": -1.3034172058105469, + "logps/chosen": -1.0512475967407227, + "logps/rejected": -2.227795124053955, + "loss": 1.0738, + "odds_ratio_loss": 0.22538113594055176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10512475669384003, + "rewards/margins": 0.11765476316213608, + "rewards/rejected": -0.2227795124053955, + "sft_loss": 1.0512475967407227, + "step": 6340 + }, + { + "epoch": 0.49, + "grad_norm": 7.004037380218506, + "learning_rate": 5.163146664302526e-06, + "logits/chosen": -1.308970332145691, + "logits/rejected": -0.8894938230514526, + "logps/chosen": -1.0626189708709717, + "logps/rejected": -2.5290350914001465, + "loss": 1.0829, + "odds_ratio_loss": 0.20267066359519958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10626189410686493, + "rewards/margins": 0.14664161205291748, + "rewards/rejected": -0.2529035210609436, + "sft_loss": 1.0626189708709717, + "step": 6345 + }, + { + "epoch": 0.49, + "grad_norm": 7.914247512817383, + "learning_rate": 5.156992250054012e-06, + "logits/chosen": -1.1687589883804321, + "logits/rejected": -1.1605756282806396, + "logps/chosen": -0.959539532661438, + "logps/rejected": -4.1293487548828125, + "loss": 0.9736, + "odds_ratio_loss": 0.14065495133399963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09595395624637604, + "rewards/margins": 0.31698092818260193, + "rewards/rejected": -0.4129348695278168, + "sft_loss": 0.959539532661438, + "step": 6350 + }, + { + "epoch": 0.49, + "grad_norm": 15.616728782653809, + "learning_rate": 5.15083759770682e-06, + "logits/chosen": -1.275852918624878, + "logits/rejected": -0.7614493370056152, + "logps/chosen": -1.340222954750061, + "logps/rejected": -1.3793169260025024, + "loss": 1.4223, + "odds_ratio_loss": 0.8204905390739441, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1340222954750061, + "rewards/margins": 0.003909393213689327, + "rewards/rejected": -0.137931689620018, + "sft_loss": 1.340222954750061, + "step": 6355 + }, + { + "epoch": 0.49, + "grad_norm": 14.889914512634277, + "learning_rate": 5.144682716595257e-06, + "logits/chosen": -1.1798590421676636, + "logits/rejected": -1.333195686340332, + "logps/chosen": -0.8878809213638306, + "logps/rejected": -4.077374458312988, + "loss": 0.8987, + "odds_ratio_loss": 0.10868176072835922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08878809958696365, + "rewards/margins": 0.31894931197166443, + "rewards/rejected": -0.4077374339103699, + "sft_loss": 0.8878809213638306, + "step": 6360 + }, + { + "epoch": 0.5, + "grad_norm": 32.431114196777344, + "learning_rate": 5.138527616053988e-06, + "logits/chosen": -1.4382386207580566, + "logits/rejected": -1.2178415060043335, + "logps/chosen": -0.9835016131401062, + "logps/rejected": -4.400031089782715, + "loss": 1.0424, + "odds_ratio_loss": 0.5888200998306274, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0983501672744751, + "rewards/margins": 0.3416529893875122, + "rewards/rejected": -0.4400032162666321, + "sft_loss": 0.9835016131401062, + "step": 6365 + }, + { + "epoch": 0.5, + "grad_norm": 22.511014938354492, + "learning_rate": 5.132372305417997e-06, + "logits/chosen": -1.2628698348999023, + "logits/rejected": -1.4043775796890259, + "logps/chosen": -1.1757280826568604, + "logps/rejected": -9.438942909240723, + "loss": 1.1913, + "odds_ratio_loss": 0.15536382794380188, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11757279932498932, + "rewards/margins": 0.8263214826583862, + "rewards/rejected": -0.9438942670822144, + "sft_loss": 1.1757280826568604, + "step": 6370 + }, + { + "epoch": 0.5, + "grad_norm": 25.160930633544922, + "learning_rate": 5.126216794022601e-06, + "logits/chosen": -1.431951642036438, + "logits/rejected": -1.627963662147522, + "logps/chosen": -1.1234722137451172, + "logps/rejected": -3.777705430984497, + "loss": 1.1563, + "odds_ratio_loss": 0.327811062335968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11234722286462784, + "rewards/margins": 0.26542332768440247, + "rewards/rejected": -0.3777705729007721, + "sft_loss": 1.1234722137451172, + "step": 6375 + }, + { + "epoch": 0.5, + "grad_norm": 15.66126537322998, + "learning_rate": 5.120061091203412e-06, + "logits/chosen": -1.46817946434021, + "logits/rejected": -1.135692834854126, + "logps/chosen": -1.0381033420562744, + "logps/rejected": -2.4826722145080566, + "loss": 1.1139, + "odds_ratio_loss": 0.7575585246086121, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10381032526493073, + "rewards/margins": 0.1444568783044815, + "rewards/rejected": -0.24826720356941223, + "sft_loss": 1.0381033420562744, + "step": 6380 + }, + { + "epoch": 0.5, + "grad_norm": 27.64168357849121, + "learning_rate": 5.1139052062963335e-06, + "logits/chosen": -1.3704216480255127, + "logits/rejected": -0.9594882726669312, + "logps/chosen": -0.8388217687606812, + "logps/rejected": -6.790257453918457, + "loss": 0.8457, + "odds_ratio_loss": 0.06886833161115646, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08388218283653259, + "rewards/margins": 0.5951434969902039, + "rewards/rejected": -0.6790256500244141, + "sft_loss": 0.8388217687606812, + "step": 6385 + }, + { + "epoch": 0.5, + "grad_norm": 9.125890731811523, + "learning_rate": 5.1077491486375475e-06, + "logits/chosen": -1.2376512289047241, + "logits/rejected": -1.1638411283493042, + "logps/chosen": -1.0288476943969727, + "logps/rejected": -2.5409605503082275, + "loss": 1.0533, + "odds_ratio_loss": 0.24471692740917206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10288476943969727, + "rewards/margins": 0.15121129155158997, + "rewards/rejected": -0.25409606099128723, + "sft_loss": 1.0288476943969727, + "step": 6390 + }, + { + "epoch": 0.5, + "grad_norm": 9.3634614944458, + "learning_rate": 5.101592927563498e-06, + "logits/chosen": -1.1934869289398193, + "logits/rejected": -1.513981580734253, + "logps/chosen": -0.8802944421768188, + "logps/rejected": -3.766958713531494, + "loss": 0.8927, + "odds_ratio_loss": 0.12386783212423325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08802944421768188, + "rewards/margins": 0.28866642713546753, + "rewards/rejected": -0.3766958713531494, + "sft_loss": 0.8802944421768188, + "step": 6395 + }, + { + "epoch": 0.5, + "grad_norm": 15.044342041015625, + "learning_rate": 5.095436552410874e-06, + "logits/chosen": -1.3474657535552979, + "logits/rejected": -0.7830338478088379, + "logps/chosen": -1.0011160373687744, + "logps/rejected": -2.68935227394104, + "loss": 1.0384, + "odds_ratio_loss": 0.37255430221557617, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10011158883571625, + "rewards/margins": 0.16882364451885223, + "rewards/rejected": -0.2689352333545685, + "sft_loss": 1.0011160373687744, + "step": 6400 + }, + { + "epoch": 0.5, + "grad_norm": 22.000795364379883, + "learning_rate": 5.089280032516601e-06, + "logits/chosen": -1.305230736732483, + "logits/rejected": -1.1422834396362305, + "logps/chosen": -0.9740656018257141, + "logps/rejected": -5.649570465087891, + "loss": 0.9822, + "odds_ratio_loss": 0.08109962940216064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09740655869245529, + "rewards/margins": 0.4675505757331848, + "rewards/rejected": -0.5649570822715759, + "sft_loss": 0.9740656018257141, + "step": 6405 + }, + { + "epoch": 0.5, + "grad_norm": 8.750536918640137, + "learning_rate": 5.083123377217826e-06, + "logits/chosen": -1.2674823999404907, + "logits/rejected": -0.589019775390625, + "logps/chosen": -0.9614855051040649, + "logps/rejected": -3.0463080406188965, + "loss": 0.9959, + "odds_ratio_loss": 0.3439742922782898, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0961485505104065, + "rewards/margins": 0.2084822654724121, + "rewards/rejected": -0.3046308159828186, + "sft_loss": 0.9614855051040649, + "step": 6410 + }, + { + "epoch": 0.5, + "grad_norm": 13.309040069580078, + "learning_rate": 5.076966595851894e-06, + "logits/chosen": -1.3427834510803223, + "logits/rejected": -1.3737881183624268, + "logps/chosen": -0.7586371302604675, + "logps/rejected": -7.239819526672363, + "loss": 0.7824, + "odds_ratio_loss": 0.23804640769958496, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07586371898651123, + "rewards/margins": 0.6481181979179382, + "rewards/rejected": -0.7239819765090942, + "sft_loss": 0.7586371302604675, + "step": 6415 + }, + { + "epoch": 0.5, + "grad_norm": 7.2068610191345215, + "learning_rate": 5.070809697756347e-06, + "logits/chosen": -1.2784476280212402, + "logits/rejected": -1.0112392902374268, + "logps/chosen": -0.8584426641464233, + "logps/rejected": -8.016109466552734, + "loss": 0.8745, + "odds_ratio_loss": 0.16093608736991882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08584427088499069, + "rewards/margins": 0.7157666683197021, + "rewards/rejected": -0.8016109466552734, + "sft_loss": 0.8584426641464233, + "step": 6420 + }, + { + "epoch": 0.5, + "grad_norm": 28.779720306396484, + "learning_rate": 5.064652692268902e-06, + "logits/chosen": -1.3163127899169922, + "logits/rejected": -0.7983849048614502, + "logps/chosen": -0.8614311218261719, + "logps/rejected": -10.967259407043457, + "loss": 0.8856, + "odds_ratio_loss": 0.24148115515708923, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0861431136727333, + "rewards/margins": 1.0105828046798706, + "rewards/rejected": -1.0967258214950562, + "sft_loss": 0.8614311218261719, + "step": 6425 + }, + { + "epoch": 0.5, + "grad_norm": 7.035211563110352, + "learning_rate": 5.0584955887274425e-06, + "logits/chosen": -1.3263068199157715, + "logits/rejected": -1.1194041967391968, + "logps/chosen": -1.0586429834365845, + "logps/rejected": -4.127714157104492, + "loss": 1.0917, + "odds_ratio_loss": 0.3301314115524292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10586428642272949, + "rewards/margins": 0.30690711736679077, + "rewards/rejected": -0.41277140378952026, + "sft_loss": 1.0586429834365845, + "step": 6430 + }, + { + "epoch": 0.5, + "grad_norm": 9.80324935913086, + "learning_rate": 5.0523383964699955e-06, + "logits/chosen": -1.414355993270874, + "logits/rejected": -1.308115005493164, + "logps/chosen": -0.8374627232551575, + "logps/rejected": -5.090659141540527, + "loss": 0.8747, + "odds_ratio_loss": 0.3718825578689575, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08374626934528351, + "rewards/margins": 0.4253196716308594, + "rewards/rejected": -0.5090659260749817, + "sft_loss": 0.8374627232551575, + "step": 6435 + }, + { + "epoch": 0.5, + "grad_norm": 3.8619236946105957, + "learning_rate": 5.0461811248347245e-06, + "logits/chosen": -1.4813154935836792, + "logits/rejected": -0.9258764982223511, + "logps/chosen": -0.9134475588798523, + "logps/rejected": -2.8745944499969482, + "loss": 0.9478, + "odds_ratio_loss": 0.3434610962867737, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09134475886821747, + "rewards/margins": 0.1961146891117096, + "rewards/rejected": -0.28745943307876587, + "sft_loss": 0.9134475588798523, + "step": 6440 + }, + { + "epoch": 0.5, + "grad_norm": 12.92715072631836, + "learning_rate": 5.040023783159914e-06, + "logits/chosen": -1.3870189189910889, + "logits/rejected": -1.30267333984375, + "logps/chosen": -1.0112378597259521, + "logps/rejected": -6.614487648010254, + "loss": 1.0638, + "odds_ratio_loss": 0.5260337591171265, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10112377256155014, + "rewards/margins": 0.5603249669075012, + "rewards/rejected": -0.6614487171173096, + "sft_loss": 1.0112378597259521, + "step": 6445 + }, + { + "epoch": 0.5, + "grad_norm": 12.387499809265137, + "learning_rate": 5.033866380783955e-06, + "logits/chosen": -1.4080413579940796, + "logits/rejected": -1.0505297183990479, + "logps/chosen": -0.9702507257461548, + "logps/rejected": -4.5742621421813965, + "loss": 0.9856, + "odds_ratio_loss": 0.1531083881855011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.097025066614151, + "rewards/margins": 0.3604011833667755, + "rewards/rejected": -0.4574262201786041, + "sft_loss": 0.9702507257461548, + "step": 6450 + }, + { + "epoch": 0.5, + "grad_norm": 30.352596282958984, + "learning_rate": 5.027708927045331e-06, + "logits/chosen": -1.4215143918991089, + "logits/rejected": -1.3137174844741821, + "logps/chosen": -0.7543405890464783, + "logps/rejected": -5.948849678039551, + "loss": 0.7989, + "odds_ratio_loss": 0.44525426626205444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07543405890464783, + "rewards/margins": 0.5194509029388428, + "rewards/rejected": -0.594884991645813, + "sft_loss": 0.7543405890464783, + "step": 6455 + }, + { + "epoch": 0.5, + "grad_norm": 7.94644832611084, + "learning_rate": 5.021551431282599e-06, + "logits/chosen": -1.3876312971115112, + "logits/rejected": -0.3571079969406128, + "logps/chosen": -0.9345148801803589, + "logps/rejected": -4.167781352996826, + "loss": 0.9487, + "odds_ratio_loss": 0.14198826253414154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09345149248838425, + "rewards/margins": 0.32332664728164673, + "rewards/rejected": -0.4167781472206116, + "sft_loss": 0.9345148801803589, + "step": 6460 + }, + { + "epoch": 0.5, + "grad_norm": 8.467122077941895, + "learning_rate": 5.0153939028343855e-06, + "logits/chosen": -1.2451789379119873, + "logits/rejected": -1.2449508905410767, + "logps/chosen": -0.7295305132865906, + "logps/rejected": -3.314526081085205, + "loss": 0.763, + "odds_ratio_loss": 0.33510622382164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07295306026935577, + "rewards/margins": 0.2584995627403259, + "rewards/rejected": -0.3314525783061981, + "sft_loss": 0.7295305132865906, + "step": 6465 + }, + { + "epoch": 0.5, + "grad_norm": 11.057312965393066, + "learning_rate": 5.009236351039366e-06, + "logits/chosen": -1.4403380155563354, + "logits/rejected": -1.1567604541778564, + "logps/chosen": -0.9498114585876465, + "logps/rejected": -6.46216344833374, + "loss": 0.9606, + "odds_ratio_loss": 0.10795946419239044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09498114883899689, + "rewards/margins": 0.5512352585792542, + "rewards/rejected": -0.6462163925170898, + "sft_loss": 0.9498114585876465, + "step": 6470 + }, + { + "epoch": 0.5, + "grad_norm": 5.785250663757324, + "learning_rate": 5.003078785236245e-06, + "logits/chosen": -1.288499355316162, + "logits/rejected": -1.5583384037017822, + "logps/chosen": -1.2569966316223145, + "logps/rejected": -6.098020076751709, + "loss": 1.2615, + "odds_ratio_loss": 0.0446125753223896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12569966912269592, + "rewards/margins": 0.484102338552475, + "rewards/rejected": -0.6098020672798157, + "sft_loss": 1.2569966316223145, + "step": 6475 + }, + { + "epoch": 0.5, + "grad_norm": 7.866329669952393, + "learning_rate": 4.996921214763755e-06, + "logits/chosen": -1.4628150463104248, + "logits/rejected": -0.8090093731880188, + "logps/chosen": -0.8259984850883484, + "logps/rejected": -7.853868007659912, + "loss": 0.8338, + "odds_ratio_loss": 0.07781483232975006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08259985595941544, + "rewards/margins": 0.7027870416641235, + "rewards/rejected": -0.7853869199752808, + "sft_loss": 0.8259984850883484, + "step": 6480 + }, + { + "epoch": 0.5, + "grad_norm": 26.424274444580078, + "learning_rate": 4.990763648960636e-06, + "logits/chosen": -1.3772542476654053, + "logits/rejected": -0.935505211353302, + "logps/chosen": -1.1864153146743774, + "logps/rejected": -6.842642307281494, + "loss": 1.1887, + "odds_ratio_loss": 0.02320820465683937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11864154040813446, + "rewards/margins": 0.5656226873397827, + "rewards/rejected": -0.6842642426490784, + "sft_loss": 1.1864153146743774, + "step": 6485 + }, + { + "epoch": 0.5, + "grad_norm": 17.74825096130371, + "learning_rate": 4.984606097165615e-06, + "logits/chosen": -1.3706109523773193, + "logits/rejected": -1.016859531402588, + "logps/chosen": -0.7705894708633423, + "logps/rejected": -3.3283610343933105, + "loss": 0.7842, + "odds_ratio_loss": 0.13585834205150604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07705894857645035, + "rewards/margins": 0.2557772099971771, + "rewards/rejected": -0.3328361511230469, + "sft_loss": 0.7705894708633423, + "step": 6490 + }, + { + "epoch": 0.51, + "grad_norm": 5.96090030670166, + "learning_rate": 4.978448568717402e-06, + "logits/chosen": -1.1399333477020264, + "logits/rejected": -1.155973196029663, + "logps/chosen": -1.3984425067901611, + "logps/rejected": -7.212183952331543, + "loss": 1.4215, + "odds_ratio_loss": 0.23092961311340332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13984423875808716, + "rewards/margins": 0.5813741683959961, + "rewards/rejected": -0.7212185263633728, + "sft_loss": 1.3984425067901611, + "step": 6495 + }, + { + "epoch": 0.51, + "grad_norm": 5.749975681304932, + "learning_rate": 4.972291072954672e-06, + "logits/chosen": -1.350691795349121, + "logits/rejected": -1.1734728813171387, + "logps/chosen": -1.0730262994766235, + "logps/rejected": -4.12339973449707, + "loss": 1.0974, + "odds_ratio_loss": 0.24394991993904114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10730264335870743, + "rewards/margins": 0.30503731966018677, + "rewards/rejected": -0.412339985370636, + "sft_loss": 1.0730262994766235, + "step": 6500 + }, + { + "epoch": 0.51, + "grad_norm": 29.49888801574707, + "learning_rate": 4.966133619216047e-06, + "logits/chosen": -1.090889573097229, + "logits/rejected": -1.3663781881332397, + "logps/chosen": -1.12447988986969, + "logps/rejected": -6.730982780456543, + "loss": 1.1604, + "odds_ratio_loss": 0.3587406575679779, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11244799196720123, + "rewards/margins": 0.5606502890586853, + "rewards/rejected": -0.6730983853340149, + "sft_loss": 1.12447988986969, + "step": 6505 + }, + { + "epoch": 0.51, + "grad_norm": 8.06817626953125, + "learning_rate": 4.959976216840088e-06, + "logits/chosen": -1.4288852214813232, + "logits/rejected": -1.4129282236099243, + "logps/chosen": -1.0631098747253418, + "logps/rejected": -14.163995742797852, + "loss": 1.0644, + "odds_ratio_loss": 0.01322208158671856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10631098598241806, + "rewards/margins": 1.3100885152816772, + "rewards/rejected": -1.4163994789123535, + "sft_loss": 1.0631098747253418, + "step": 6510 + }, + { + "epoch": 0.51, + "grad_norm": 7.540499210357666, + "learning_rate": 4.953818875165276e-06, + "logits/chosen": -1.4435449838638306, + "logits/rejected": -1.3639262914657593, + "logps/chosen": -0.7851355671882629, + "logps/rejected": -3.850607395172119, + "loss": 0.8093, + "odds_ratio_loss": 0.24117080867290497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07851354777812958, + "rewards/margins": 0.3065471649169922, + "rewards/rejected": -0.38506072759628296, + "sft_loss": 0.7851355671882629, + "step": 6515 + }, + { + "epoch": 0.51, + "grad_norm": 12.225910186767578, + "learning_rate": 4.947661603530006e-06, + "logits/chosen": -1.300437569618225, + "logits/rejected": -1.2396416664123535, + "logps/chosen": -1.1051909923553467, + "logps/rejected": -6.844620704650879, + "loss": 1.1282, + "odds_ratio_loss": 0.2299909144639969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11051911115646362, + "rewards/margins": 0.5739429593086243, + "rewards/rejected": -0.6844619512557983, + "sft_loss": 1.1051909923553467, + "step": 6520 + }, + { + "epoch": 0.51, + "grad_norm": 11.778657913208008, + "learning_rate": 4.941504411272559e-06, + "logits/chosen": -1.1931259632110596, + "logits/rejected": -1.3879821300506592, + "logps/chosen": -1.3136595487594604, + "logps/rejected": -6.5946364402771, + "loss": 1.3388, + "odds_ratio_loss": 0.2512947916984558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13136595487594604, + "rewards/margins": 0.5280976891517639, + "rewards/rejected": -0.6594635844230652, + "sft_loss": 1.3136595487594604, + "step": 6525 + }, + { + "epoch": 0.51, + "grad_norm": 31.723094940185547, + "learning_rate": 4.9353473077310985e-06, + "logits/chosen": -1.382830023765564, + "logits/rejected": -1.0908111333847046, + "logps/chosen": -1.1355063915252686, + "logps/rejected": -11.42387580871582, + "loss": 1.1552, + "odds_ratio_loss": 0.19670705497264862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11355062574148178, + "rewards/margins": 1.028836965560913, + "rewards/rejected": -1.1423876285552979, + "sft_loss": 1.1355063915252686, + "step": 6530 + }, + { + "epoch": 0.51, + "grad_norm": 18.334205627441406, + "learning_rate": 4.929190302243655e-06, + "logits/chosen": -1.3520934581756592, + "logits/rejected": -1.506186842918396, + "logps/chosen": -1.0280715227127075, + "logps/rejected": -9.33717155456543, + "loss": 1.0397, + "odds_ratio_loss": 0.11600234359502792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10280714929103851, + "rewards/margins": 0.8309100866317749, + "rewards/rejected": -0.9337173700332642, + "sft_loss": 1.0280715227127075, + "step": 6535 + }, + { + "epoch": 0.51, + "grad_norm": 22.152624130249023, + "learning_rate": 4.9230334041481085e-06, + "logits/chosen": -1.3186109066009521, + "logits/rejected": -1.539162278175354, + "logps/chosen": -0.8228222727775574, + "logps/rejected": -12.513707160949707, + "loss": 0.8367, + "odds_ratio_loss": 0.13917812705039978, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08228223025798798, + "rewards/margins": 1.16908860206604, + "rewards/rejected": -1.2513707876205444, + "sft_loss": 0.8228222727775574, + "step": 6540 + }, + { + "epoch": 0.51, + "grad_norm": 19.71617889404297, + "learning_rate": 4.916876622782176e-06, + "logits/chosen": -1.2768125534057617, + "logits/rejected": -1.4440394639968872, + "logps/chosen": -0.863481879234314, + "logps/rejected": -8.750720024108887, + "loss": 0.8689, + "odds_ratio_loss": 0.054228268563747406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08634819090366364, + "rewards/margins": 0.7887238264083862, + "rewards/rejected": -0.8750720024108887, + "sft_loss": 0.863481879234314, + "step": 6545 + }, + { + "epoch": 0.51, + "grad_norm": 136.82469177246094, + "learning_rate": 4.9107199674833995e-06, + "logits/chosen": -1.065739631652832, + "logits/rejected": -1.2996230125427246, + "logps/chosen": -0.8647258877754211, + "logps/rejected": -6.739018440246582, + "loss": 0.8745, + "odds_ratio_loss": 0.0975758507847786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08647258579730988, + "rewards/margins": 0.5874292850494385, + "rewards/rejected": -0.6739019155502319, + "sft_loss": 0.8647258877754211, + "step": 6550 + }, + { + "epoch": 0.51, + "grad_norm": 20.120380401611328, + "learning_rate": 4.904563447589128e-06, + "logits/chosen": -1.1406917572021484, + "logits/rejected": -1.0273816585540771, + "logps/chosen": -0.9011715054512024, + "logps/rejected": -5.575922966003418, + "loss": 0.956, + "odds_ratio_loss": 0.5483925342559814, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09011714160442352, + "rewards/margins": 0.46747517585754395, + "rewards/rejected": -0.5575922727584839, + "sft_loss": 0.9011715054512024, + "step": 6555 + }, + { + "epoch": 0.51, + "grad_norm": 241.13442993164062, + "learning_rate": 4.898407072436503e-06, + "logits/chosen": -1.3336999416351318, + "logits/rejected": -0.5124937295913696, + "logps/chosen": -1.1472948789596558, + "logps/rejected": -3.278719663619995, + "loss": 1.1716, + "odds_ratio_loss": 0.24312250316143036, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11472950130701065, + "rewards/margins": 0.2131424844264984, + "rewards/rejected": -0.32787197828292847, + "sft_loss": 1.1472948789596558, + "step": 6560 + }, + { + "epoch": 0.51, + "grad_norm": 8.492508888244629, + "learning_rate": 4.892250851362453e-06, + "logits/chosen": -1.4392950534820557, + "logits/rejected": -1.133285641670227, + "logps/chosen": -0.6859443783760071, + "logps/rejected": -2.3474297523498535, + "loss": 0.7659, + "odds_ratio_loss": 0.7996104955673218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06859444081783295, + "rewards/margins": 0.16614854335784912, + "rewards/rejected": -0.23474296927452087, + "sft_loss": 0.6859443783760071, + "step": 6565 + }, + { + "epoch": 0.51, + "grad_norm": 11.404682159423828, + "learning_rate": 4.886094793703668e-06, + "logits/chosen": -1.47762930393219, + "logits/rejected": -1.2724297046661377, + "logps/chosen": -0.7437477111816406, + "logps/rejected": -6.391782283782959, + "loss": 0.7519, + "odds_ratio_loss": 0.0815412700176239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07437478005886078, + "rewards/margins": 0.5648034811019897, + "rewards/rejected": -0.6391782760620117, + "sft_loss": 0.7437477111816406, + "step": 6570 + }, + { + "epoch": 0.51, + "grad_norm": 26.6160888671875, + "learning_rate": 4.87993890879659e-06, + "logits/chosen": -1.1805088520050049, + "logits/rejected": -0.6844810247421265, + "logps/chosen": -1.189537763595581, + "logps/rejected": -5.205257415771484, + "loss": 1.233, + "odds_ratio_loss": 0.4342225193977356, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11895380169153214, + "rewards/margins": 0.40157192945480347, + "rewards/rejected": -0.5205257534980774, + "sft_loss": 1.189537763595581, + "step": 6575 + }, + { + "epoch": 0.51, + "grad_norm": 68.27629089355469, + "learning_rate": 4.8737832059773996e-06, + "logits/chosen": -1.1993681192398071, + "logits/rejected": -1.2758736610412598, + "logps/chosen": -0.9924455881118774, + "logps/rejected": -5.342121601104736, + "loss": 1.0453, + "odds_ratio_loss": 0.5286139249801636, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09924455732107162, + "rewards/margins": 0.43496760725975037, + "rewards/rejected": -0.5342121720314026, + "sft_loss": 0.9924455881118774, + "step": 6580 + }, + { + "epoch": 0.51, + "grad_norm": 7.67741060256958, + "learning_rate": 4.867627694582004e-06, + "logits/chosen": -1.3459727764129639, + "logits/rejected": -1.067171335220337, + "logps/chosen": -0.9003369212150574, + "logps/rejected": -9.823450088500977, + "loss": 0.9058, + "odds_ratio_loss": 0.05429646372795105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09003370255231857, + "rewards/margins": 0.8923112750053406, + "rewards/rejected": -0.9823449850082397, + "sft_loss": 0.9003369212150574, + "step": 6585 + }, + { + "epoch": 0.51, + "grad_norm": 12.953150749206543, + "learning_rate": 4.861472383946016e-06, + "logits/chosen": -1.2747457027435303, + "logits/rejected": -0.9779289364814758, + "logps/chosen": -1.1248152256011963, + "logps/rejected": -8.683989524841309, + "loss": 1.1304, + "odds_ratio_loss": 0.0556270070374012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11248151957988739, + "rewards/margins": 0.7559173703193665, + "rewards/rejected": -0.8683989644050598, + "sft_loss": 1.1248152256011963, + "step": 6590 + }, + { + "epoch": 0.51, + "grad_norm": 11.446743965148926, + "learning_rate": 4.855317283404742e-06, + "logits/chosen": -1.3993771076202393, + "logits/rejected": -1.2163068056106567, + "logps/chosen": -1.0514163970947266, + "logps/rejected": -6.497828006744385, + "loss": 1.0756, + "odds_ratio_loss": 0.24144919216632843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10514162480831146, + "rewards/margins": 0.5446411371231079, + "rewards/rejected": -0.6497827768325806, + "sft_loss": 1.0514163970947266, + "step": 6595 + }, + { + "epoch": 0.51, + "grad_norm": 7.466728210449219, + "learning_rate": 4.849162402293182e-06, + "logits/chosen": -1.4714564085006714, + "logits/rejected": -0.9703958630561829, + "logps/chosen": -0.8602334260940552, + "logps/rejected": -5.671200275421143, + "loss": 0.8751, + "odds_ratio_loss": 0.14834879338741302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08602333068847656, + "rewards/margins": 0.48109668493270874, + "rewards/rejected": -0.5671200156211853, + "sft_loss": 0.8602334260940552, + "step": 6600 + }, + { + "epoch": 0.51, + "grad_norm": 4.813689231872559, + "learning_rate": 4.8430077499459885e-06, + "logits/chosen": -1.4296540021896362, + "logits/rejected": -1.2248282432556152, + "logps/chosen": -0.9535678625106812, + "logps/rejected": -8.022564888000488, + "loss": 1.0042, + "odds_ratio_loss": 0.5060486793518066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09535679221153259, + "rewards/margins": 0.7068997025489807, + "rewards/rejected": -0.8022565841674805, + "sft_loss": 0.9535678625106812, + "step": 6605 + }, + { + "epoch": 0.51, + "grad_norm": 10.360968589782715, + "learning_rate": 4.836853335697474e-06, + "logits/chosen": -1.4981950521469116, + "logits/rejected": -1.1309444904327393, + "logps/chosen": -0.9048945307731628, + "logps/rejected": -7.258805274963379, + "loss": 0.9231, + "odds_ratio_loss": 0.18187430500984192, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.090489462018013, + "rewards/margins": 0.6353910565376282, + "rewards/rejected": -0.72588050365448, + "sft_loss": 0.9048945307731628, + "step": 6610 + }, + { + "epoch": 0.51, + "grad_norm": 242.73080444335938, + "learning_rate": 4.830699168881591e-06, + "logits/chosen": -1.413058876991272, + "logits/rejected": -1.4513750076293945, + "logps/chosen": -1.963711142539978, + "logps/rejected": -5.550110816955566, + "loss": 1.9724, + "odds_ratio_loss": 0.08717626333236694, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19637110829353333, + "rewards/margins": 0.35864001512527466, + "rewards/rejected": -0.5550111532211304, + "sft_loss": 1.963711142539978, + "step": 6615 + }, + { + "epoch": 0.51, + "grad_norm": 7.038618564605713, + "learning_rate": 4.824545258831913e-06, + "logits/chosen": -1.2381751537322998, + "logits/rejected": -0.697541356086731, + "logps/chosen": -0.9126702547073364, + "logps/rejected": -7.588592529296875, + "loss": 0.9215, + "odds_ratio_loss": 0.08810234069824219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09126702696084976, + "rewards/margins": 0.6675922274589539, + "rewards/rejected": -0.7588592767715454, + "sft_loss": 0.9126702547073364, + "step": 6620 + }, + { + "epoch": 0.52, + "grad_norm": 10.980341911315918, + "learning_rate": 4.818391614881625e-06, + "logits/chosen": -1.4112461805343628, + "logits/rejected": -0.8747898936271667, + "logps/chosen": -1.4316513538360596, + "logps/rejected": -9.764450073242188, + "loss": 1.4621, + "odds_ratio_loss": 0.30480560660362244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14316514134407043, + "rewards/margins": 0.8332799077033997, + "rewards/rejected": -0.9764450192451477, + "sft_loss": 1.4316513538360596, + "step": 6625 + }, + { + "epoch": 0.52, + "grad_norm": 6.748105049133301, + "learning_rate": 4.812238246363513e-06, + "logits/chosen": -1.410508394241333, + "logits/rejected": -1.1683613061904907, + "logps/chosen": -1.2869287729263306, + "logps/rejected": -1.9622493982315063, + "loss": 1.3375, + "odds_ratio_loss": 0.505769670009613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1286928802728653, + "rewards/margins": 0.06753206253051758, + "rewards/rejected": -0.19622494280338287, + "sft_loss": 1.2869287729263306, + "step": 6630 + }, + { + "epoch": 0.52, + "grad_norm": 63.81284713745117, + "learning_rate": 4.80608516260994e-06, + "logits/chosen": -1.3456947803497314, + "logits/rejected": -1.339935064315796, + "logps/chosen": -3.2150325775146484, + "logps/rejected": -10.46107292175293, + "loss": 3.2597, + "odds_ratio_loss": 0.44674915075302124, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.321503221988678, + "rewards/margins": 0.724604070186615, + "rewards/rejected": -1.046107292175293, + "sft_loss": 3.2150325775146484, + "step": 6635 + }, + { + "epoch": 0.52, + "grad_norm": 16.180912017822266, + "learning_rate": 4.799932372952838e-06, + "logits/chosen": -1.3194289207458496, + "logits/rejected": -0.9958184361457825, + "logps/chosen": -1.0840357542037964, + "logps/rejected": -8.959443092346191, + "loss": 1.0964, + "odds_ratio_loss": 0.123465895652771, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10840357840061188, + "rewards/margins": 0.7875407338142395, + "rewards/rejected": -0.8959442973136902, + "sft_loss": 1.0840357542037964, + "step": 6640 + }, + { + "epoch": 0.52, + "grad_norm": 10.999001502990723, + "learning_rate": 4.793779886723693e-06, + "logits/chosen": -1.0370042324066162, + "logits/rejected": -1.355830430984497, + "logps/chosen": -1.304805040359497, + "logps/rejected": -4.567163944244385, + "loss": 1.3344, + "odds_ratio_loss": 0.29614123702049255, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13048049807548523, + "rewards/margins": 0.32623592019081116, + "rewards/rejected": -0.456716388463974, + "sft_loss": 1.304805040359497, + "step": 6645 + }, + { + "epoch": 0.52, + "grad_norm": 15.348061561584473, + "learning_rate": 4.787627713253533e-06, + "logits/chosen": -1.0208803415298462, + "logits/rejected": -1.2926498651504517, + "logps/chosen": -0.6741073727607727, + "logps/rejected": -5.330902099609375, + "loss": 0.6904, + "odds_ratio_loss": 0.16298021376132965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06741073727607727, + "rewards/margins": 0.46567949652671814, + "rewards/rejected": -0.5330902338027954, + "sft_loss": 0.6741073727607727, + "step": 6650 + }, + { + "epoch": 0.52, + "grad_norm": 4.856533050537109, + "learning_rate": 4.78147586187291e-06, + "logits/chosen": -1.4575669765472412, + "logits/rejected": -0.7827960848808289, + "logps/chosen": -1.1962621212005615, + "logps/rejected": -14.277833938598633, + "loss": 1.205, + "odds_ratio_loss": 0.08750542253255844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11962622404098511, + "rewards/margins": 1.308157205581665, + "rewards/rejected": -1.4277832508087158, + "sft_loss": 1.1962621212005615, + "step": 6655 + }, + { + "epoch": 0.52, + "grad_norm": 13.62650203704834, + "learning_rate": 4.775324341911887e-06, + "logits/chosen": -1.4796792268753052, + "logits/rejected": -1.3119274377822876, + "logps/chosen": -2.3290915489196777, + "logps/rejected": -4.825216770172119, + "loss": 2.3427, + "odds_ratio_loss": 0.13567480444908142, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23290912806987762, + "rewards/margins": 0.24961252510547638, + "rewards/rejected": -0.4825216829776764, + "sft_loss": 2.3290915489196777, + "step": 6660 + }, + { + "epoch": 0.52, + "grad_norm": 18.167985916137695, + "learning_rate": 4.769173162700025e-06, + "logits/chosen": -1.3903343677520752, + "logits/rejected": -1.4637458324432373, + "logps/chosen": -0.8073641061782837, + "logps/rejected": -8.170225143432617, + "loss": 0.8118, + "odds_ratio_loss": 0.04413865879178047, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08073641359806061, + "rewards/margins": 0.7362861037254333, + "rewards/rejected": -0.8170225024223328, + "sft_loss": 0.8073641061782837, + "step": 6665 + }, + { + "epoch": 0.52, + "grad_norm": 8.094761848449707, + "learning_rate": 4.76302233356637e-06, + "logits/chosen": -1.473113775253296, + "logits/rejected": -1.0856086015701294, + "logps/chosen": -0.7588493227958679, + "logps/rejected": -11.757055282592773, + "loss": 0.7646, + "odds_ratio_loss": 0.05762838199734688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07588493824005127, + "rewards/margins": 1.0998207330703735, + "rewards/rejected": -1.1757055521011353, + "sft_loss": 0.7588493227958679, + "step": 6670 + }, + { + "epoch": 0.52, + "grad_norm": 5.533440113067627, + "learning_rate": 4.756871863839431e-06, + "logits/chosen": -1.4507572650909424, + "logits/rejected": -1.3853797912597656, + "logps/chosen": -0.7131852507591248, + "logps/rejected": -2.202475070953369, + "loss": 0.7341, + "odds_ratio_loss": 0.20953097939491272, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07131852954626083, + "rewards/margins": 0.14892897009849548, + "rewards/rejected": -0.22024747729301453, + "sft_loss": 0.7131852507591248, + "step": 6675 + }, + { + "epoch": 0.52, + "grad_norm": 6.879472255706787, + "learning_rate": 4.750721762847182e-06, + "logits/chosen": -1.523850440979004, + "logits/rejected": -1.277369737625122, + "logps/chosen": -1.0856904983520508, + "logps/rejected": -7.033446311950684, + "loss": 1.0938, + "odds_ratio_loss": 0.08136365562677383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10856904834508896, + "rewards/margins": 0.5947756767272949, + "rewards/rejected": -0.7033447027206421, + "sft_loss": 1.0856904983520508, + "step": 6680 + }, + { + "epoch": 0.52, + "grad_norm": 5.807984352111816, + "learning_rate": 4.744572039917029e-06, + "logits/chosen": -1.2817108631134033, + "logits/rejected": -1.3534901142120361, + "logps/chosen": -1.1051974296569824, + "logps/rejected": -5.682037830352783, + "loss": 1.1592, + "odds_ratio_loss": 0.5402711033821106, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11051974445581436, + "rewards/margins": 0.4576840400695801, + "rewards/rejected": -0.5682038068771362, + "sft_loss": 1.1051974296569824, + "step": 6685 + }, + { + "epoch": 0.52, + "grad_norm": 40.49241638183594, + "learning_rate": 4.738422704375807e-06, + "logits/chosen": -1.3966052532196045, + "logits/rejected": -0.7777081727981567, + "logps/chosen": -1.0331476926803589, + "logps/rejected": -12.39820671081543, + "loss": 1.0353, + "odds_ratio_loss": 0.021451503038406372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10331475734710693, + "rewards/margins": 1.1365059614181519, + "rewards/rejected": -1.2398207187652588, + "sft_loss": 1.0331476926803589, + "step": 6690 + }, + { + "epoch": 0.52, + "grad_norm": 10.272732734680176, + "learning_rate": 4.732273765549766e-06, + "logits/chosen": -1.4794766902923584, + "logits/rejected": -1.0296047925949097, + "logps/chosen": -1.1400551795959473, + "logps/rejected": -6.162641525268555, + "loss": 1.1564, + "odds_ratio_loss": 0.16316184401512146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11400550603866577, + "rewards/margins": 0.5022586584091187, + "rewards/rejected": -0.6162641644477844, + "sft_loss": 1.1400551795959473, + "step": 6695 + }, + { + "epoch": 0.52, + "grad_norm": 9.777023315429688, + "learning_rate": 4.726125232764551e-06, + "logits/chosen": -1.5177513360977173, + "logits/rejected": -1.1050008535385132, + "logps/chosen": -0.8778587579727173, + "logps/rejected": -5.110650062561035, + "loss": 0.9289, + "odds_ratio_loss": 0.5105150938034058, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08778587728738785, + "rewards/margins": 0.4232791066169739, + "rewards/rejected": -0.5110650062561035, + "sft_loss": 0.8778587579727173, + "step": 6700 + }, + { + "epoch": 0.52, + "grad_norm": 8.837748527526855, + "learning_rate": 4.719977115345194e-06, + "logits/chosen": -1.5516029596328735, + "logits/rejected": -1.4659761190414429, + "logps/chosen": -1.1462736129760742, + "logps/rejected": -12.577552795410156, + "loss": 1.1534, + "odds_ratio_loss": 0.0709126815199852, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11462736129760742, + "rewards/margins": 1.1431279182434082, + "rewards/rejected": -1.2577552795410156, + "sft_loss": 1.1462736129760742, + "step": 6705 + }, + { + "epoch": 0.52, + "grad_norm": 56.009029388427734, + "learning_rate": 4.713829422616091e-06, + "logits/chosen": -1.3187446594238281, + "logits/rejected": -1.0171353816986084, + "logps/chosen": -1.2572777271270752, + "logps/rejected": -4.165177822113037, + "loss": 1.2969, + "odds_ratio_loss": 0.395923376083374, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1257277876138687, + "rewards/margins": 0.29078999161720276, + "rewards/rejected": -0.41651779413223267, + "sft_loss": 1.2572777271270752, + "step": 6710 + }, + { + "epoch": 0.52, + "grad_norm": 8.197124481201172, + "learning_rate": 4.7076821639010055e-06, + "logits/chosen": -1.488896131515503, + "logits/rejected": -0.8135197758674622, + "logps/chosen": -0.9947648048400879, + "logps/rejected": -6.5308356285095215, + "loss": 1.0008, + "odds_ratio_loss": 0.060649238526821136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09947647899389267, + "rewards/margins": 0.5536071062088013, + "rewards/rejected": -0.6530836224555969, + "sft_loss": 0.9947648048400879, + "step": 6715 + }, + { + "epoch": 0.52, + "grad_norm": 8.14523983001709, + "learning_rate": 4.701535348523032e-06, + "logits/chosen": -1.4530843496322632, + "logits/rejected": -1.2772367000579834, + "logps/chosen": -0.9320453405380249, + "logps/rejected": -3.5855605602264404, + "loss": 0.9633, + "odds_ratio_loss": 0.31262874603271484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0932045429944992, + "rewards/margins": 0.2653515338897705, + "rewards/rejected": -0.35855603218078613, + "sft_loss": 0.9320453405380249, + "step": 6720 + }, + { + "epoch": 0.52, + "grad_norm": 11.718148231506348, + "learning_rate": 4.695388985804597e-06, + "logits/chosen": -1.4163570404052734, + "logits/rejected": -1.3425827026367188, + "logps/chosen": -0.9410263895988464, + "logps/rejected": -11.246500015258789, + "loss": 0.9429, + "odds_ratio_loss": 0.018949458375573158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0941026359796524, + "rewards/margins": 1.0305473804473877, + "rewards/rejected": -1.124650001525879, + "sft_loss": 0.9410263895988464, + "step": 6725 + }, + { + "epoch": 0.52, + "grad_norm": 64.7518539428711, + "learning_rate": 4.689243085067439e-06, + "logits/chosen": -1.2837133407592773, + "logits/rejected": -1.119122862815857, + "logps/chosen": -0.6392735242843628, + "logps/rejected": -7.493457794189453, + "loss": 0.641, + "odds_ratio_loss": 0.01754852756857872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06392735242843628, + "rewards/margins": 0.685418426990509, + "rewards/rejected": -0.7493457794189453, + "sft_loss": 0.6392735242843628, + "step": 6730 + }, + { + "epoch": 0.52, + "grad_norm": 11.195655822753906, + "learning_rate": 4.6830976556325995e-06, + "logits/chosen": -1.4340559244155884, + "logits/rejected": -1.155472755432129, + "logps/chosen": -2.2630090713500977, + "logps/rejected": -4.744475364685059, + "loss": 2.2716, + "odds_ratio_loss": 0.08597750961780548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.226300910115242, + "rewards/margins": 0.24814662337303162, + "rewards/rejected": -0.4744475483894348, + "sft_loss": 2.2630090713500977, + "step": 6735 + }, + { + "epoch": 0.52, + "grad_norm": 10.269758224487305, + "learning_rate": 4.676952706820398e-06, + "logits/chosen": -1.336328148841858, + "logits/rejected": -1.4924776554107666, + "logps/chosen": -1.1410115957260132, + "logps/rejected": -12.29522705078125, + "loss": 1.1414, + "odds_ratio_loss": 0.004271526355296373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11410115659236908, + "rewards/margins": 1.1154215335845947, + "rewards/rejected": -1.229522705078125, + "sft_loss": 1.1410115957260132, + "step": 6740 + }, + { + "epoch": 0.52, + "grad_norm": 9.63022232055664, + "learning_rate": 4.670808247950435e-06, + "logits/chosen": -1.1530532836914062, + "logits/rejected": -1.0261757373809814, + "logps/chosen": -1.232412338256836, + "logps/rejected": -6.78533935546875, + "loss": 1.2401, + "odds_ratio_loss": 0.0765550285577774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12324123084545135, + "rewards/margins": 0.5552927851676941, + "rewards/rejected": -0.6785339117050171, + "sft_loss": 1.232412338256836, + "step": 6745 + }, + { + "epoch": 0.53, + "grad_norm": 5.54863166809082, + "learning_rate": 4.664664288341559e-06, + "logits/chosen": -0.951921284198761, + "logits/rejected": -1.042023777961731, + "logps/chosen": -1.1395822763442993, + "logps/rejected": -9.88968276977539, + "loss": 1.1433, + "odds_ratio_loss": 0.037159256637096405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1139582172036171, + "rewards/margins": 0.8750101327896118, + "rewards/rejected": -0.9889682531356812, + "sft_loss": 1.1395822763442993, + "step": 6750 + }, + { + "epoch": 0.53, + "grad_norm": 41.68501663208008, + "learning_rate": 4.658520837311865e-06, + "logits/chosen": -1.3054512739181519, + "logits/rejected": -0.7933279871940613, + "logps/chosen": -0.9936810731887817, + "logps/rejected": -3.1271064281463623, + "loss": 1.0154, + "odds_ratio_loss": 0.21723489463329315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09936810284852982, + "rewards/margins": 0.2133425772190094, + "rewards/rejected": -0.312710702419281, + "sft_loss": 0.9936810731887817, + "step": 6755 + }, + { + "epoch": 0.53, + "grad_norm": 6.217238426208496, + "learning_rate": 4.652377904178677e-06, + "logits/chosen": -1.4349156618118286, + "logits/rejected": -0.9307647943496704, + "logps/chosen": -1.1279726028442383, + "logps/rejected": -7.524369716644287, + "loss": 1.1818, + "odds_ratio_loss": 0.5385130047798157, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11279726028442383, + "rewards/margins": 0.6396397352218628, + "rewards/rejected": -0.7524369955062866, + "sft_loss": 1.1279726028442383, + "step": 6760 + }, + { + "epoch": 0.53, + "grad_norm": 9.548029899597168, + "learning_rate": 4.646235498258534e-06, + "logits/chosen": -1.3545303344726562, + "logits/rejected": -1.1169371604919434, + "logps/chosen": -0.8710645437240601, + "logps/rejected": -3.7101237773895264, + "loss": 0.8912, + "odds_ratio_loss": 0.20155823230743408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08710645884275436, + "rewards/margins": 0.28390592336654663, + "rewards/rejected": -0.3710123896598816, + "sft_loss": 0.8710645437240601, + "step": 6765 + }, + { + "epoch": 0.53, + "grad_norm": 33.7285041809082, + "learning_rate": 4.6400936288671746e-06, + "logits/chosen": -1.2456495761871338, + "logits/rejected": -1.3842296600341797, + "logps/chosen": -0.8866599798202515, + "logps/rejected": -4.286937236785889, + "loss": 0.9015, + "odds_ratio_loss": 0.14859908819198608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08866600692272186, + "rewards/margins": 0.34002774953842163, + "rewards/rejected": -0.4286937713623047, + "sft_loss": 0.8866599798202515, + "step": 6770 + }, + { + "epoch": 0.53, + "grad_norm": 5.694341659545898, + "learning_rate": 4.6339523053195204e-06, + "logits/chosen": -1.4577521085739136, + "logits/rejected": -0.9543673396110535, + "logps/chosen": -0.9715708494186401, + "logps/rejected": -3.9672107696533203, + "loss": 1.0079, + "odds_ratio_loss": 0.36367106437683105, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09715709090232849, + "rewards/margins": 0.2995639443397522, + "rewards/rejected": -0.3967210352420807, + "sft_loss": 0.9715708494186401, + "step": 6775 + }, + { + "epoch": 0.53, + "grad_norm": 5.955051422119141, + "learning_rate": 4.6278115369296715e-06, + "logits/chosen": -1.4041407108306885, + "logits/rejected": -0.504332423210144, + "logps/chosen": -0.9568581581115723, + "logps/rejected": -5.345180988311768, + "loss": 0.9666, + "odds_ratio_loss": 0.09708087146282196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09568581730127335, + "rewards/margins": 0.43883222341537476, + "rewards/rejected": -0.5345180630683899, + "sft_loss": 0.9568581581115723, + "step": 6780 + }, + { + "epoch": 0.53, + "grad_norm": 14.830814361572266, + "learning_rate": 4.621671333010882e-06, + "logits/chosen": -1.4793100357055664, + "logits/rejected": -1.2408435344696045, + "logps/chosen": -1.2611196041107178, + "logps/rejected": -3.2941811084747314, + "loss": 1.31, + "odds_ratio_loss": 0.48905545473098755, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1261119544506073, + "rewards/margins": 0.2033061534166336, + "rewards/rejected": -0.3294180929660797, + "sft_loss": 1.2611196041107178, + "step": 6785 + }, + { + "epoch": 0.53, + "grad_norm": 10.222573280334473, + "learning_rate": 4.6155317028755484e-06, + "logits/chosen": -1.5000641345977783, + "logits/rejected": -1.1039273738861084, + "logps/chosen": -1.0030043125152588, + "logps/rejected": -5.1502885818481445, + "loss": 1.0356, + "odds_ratio_loss": 0.3260100185871124, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10030041635036469, + "rewards/margins": 0.41472840309143066, + "rewards/rejected": -0.5150288343429565, + "sft_loss": 1.0030043125152588, + "step": 6790 + }, + { + "epoch": 0.53, + "grad_norm": 11.880167007446289, + "learning_rate": 4.609392655835203e-06, + "logits/chosen": -1.4088056087493896, + "logits/rejected": -1.3552888631820679, + "logps/chosen": -1.0446202754974365, + "logps/rejected": -10.438191413879395, + "loss": 1.0486, + "odds_ratio_loss": 0.0402403399348259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10446202754974365, + "rewards/margins": 0.9393571019172668, + "rewards/rejected": -1.0438191890716553, + "sft_loss": 1.0446202754974365, + "step": 6795 + }, + { + "epoch": 0.53, + "grad_norm": 4.905477046966553, + "learning_rate": 4.603254201200489e-06, + "logits/chosen": -1.3935184478759766, + "logits/rejected": -0.6968734860420227, + "logps/chosen": -0.8401309251785278, + "logps/rejected": -5.342778205871582, + "loss": 0.8616, + "odds_ratio_loss": 0.21473821997642517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08401308953762054, + "rewards/margins": 0.45026469230651855, + "rewards/rejected": -0.5342777967453003, + "sft_loss": 0.8401309251785278, + "step": 6800 + }, + { + "epoch": 0.53, + "grad_norm": 32.383575439453125, + "learning_rate": 4.59711634828115e-06, + "logits/chosen": -1.3621585369110107, + "logits/rejected": -1.4311379194259644, + "logps/chosen": -0.9719074964523315, + "logps/rejected": -3.5250167846679688, + "loss": 1.0021, + "odds_ratio_loss": 0.30197301506996155, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0971907526254654, + "rewards/margins": 0.25531092286109924, + "rewards/rejected": -0.35250169038772583, + "sft_loss": 0.9719074964523315, + "step": 6805 + }, + { + "epoch": 0.53, + "grad_norm": 157.6527557373047, + "learning_rate": 4.5909791063860225e-06, + "logits/chosen": -1.343641757965088, + "logits/rejected": -1.1741821765899658, + "logps/chosen": -1.591430902481079, + "logps/rejected": -9.511109352111816, + "loss": 1.5978, + "odds_ratio_loss": 0.06342393904924393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15914307534694672, + "rewards/margins": 0.7919678688049316, + "rewards/rejected": -0.9511110186576843, + "sft_loss": 1.591430902481079, + "step": 6810 + }, + { + "epoch": 0.53, + "grad_norm": 10.832589149475098, + "learning_rate": 4.584842484823011e-06, + "logits/chosen": -1.4096262454986572, + "logits/rejected": -1.4636614322662354, + "logps/chosen": -0.7325119376182556, + "logps/rejected": -8.924284934997559, + "loss": 0.7382, + "odds_ratio_loss": 0.056823063641786575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07325119525194168, + "rewards/margins": 0.8191774487495422, + "rewards/rejected": -0.8924285769462585, + "sft_loss": 0.7325119376182556, + "step": 6815 + }, + { + "epoch": 0.53, + "grad_norm": 5.163097858428955, + "learning_rate": 4.578706492899082e-06, + "logits/chosen": -1.421924352645874, + "logits/rejected": -0.8165718913078308, + "logps/chosen": -1.1318721771240234, + "logps/rejected": -12.301141738891602, + "loss": 1.1504, + "odds_ratio_loss": 0.18545493483543396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11318721622228622, + "rewards/margins": 1.116927146911621, + "rewards/rejected": -1.230114221572876, + "sft_loss": 1.1318721771240234, + "step": 6820 + }, + { + "epoch": 0.53, + "grad_norm": 6.831690788269043, + "learning_rate": 4.572571139920244e-06, + "logits/chosen": -1.389262318611145, + "logits/rejected": -0.9818083047866821, + "logps/chosen": -0.8672173619270325, + "logps/rejected": -2.39469313621521, + "loss": 0.9093, + "odds_ratio_loss": 0.42063069343566895, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0867217406630516, + "rewards/margins": 0.15274758636951447, + "rewards/rejected": -0.23946928977966309, + "sft_loss": 0.8672173619270325, + "step": 6825 + }, + { + "epoch": 0.53, + "grad_norm": 15.991640090942383, + "learning_rate": 4.566436435191543e-06, + "logits/chosen": -1.4767954349517822, + "logits/rejected": -0.9890767931938171, + "logps/chosen": -0.7637965083122253, + "logps/rejected": -3.739415407180786, + "loss": 0.7842, + "odds_ratio_loss": 0.20384731888771057, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07637965679168701, + "rewards/margins": 0.2975618839263916, + "rewards/rejected": -0.3739415109157562, + "sft_loss": 0.7637965083122253, + "step": 6830 + }, + { + "epoch": 0.53, + "grad_norm": 17.176393508911133, + "learning_rate": 4.5603023880170355e-06, + "logits/chosen": -1.3882057666778564, + "logits/rejected": -1.4111310243606567, + "logps/chosen": -0.9381965398788452, + "logps/rejected": -6.771797180175781, + "loss": 0.95, + "odds_ratio_loss": 0.1176723837852478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09381966292858124, + "rewards/margins": 0.5833600163459778, + "rewards/rejected": -0.6771796941757202, + "sft_loss": 0.9381965398788452, + "step": 6835 + }, + { + "epoch": 0.53, + "grad_norm": 5.813627243041992, + "learning_rate": 4.554169007699782e-06, + "logits/chosen": -1.424593210220337, + "logits/rejected": -1.1305902004241943, + "logps/chosen": -1.1007654666900635, + "logps/rejected": -6.940686225891113, + "loss": 1.1305, + "odds_ratio_loss": 0.2974892556667328, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11007654666900635, + "rewards/margins": 0.5839920043945312, + "rewards/rejected": -0.6940685510635376, + "sft_loss": 1.1007654666900635, + "step": 6840 + }, + { + "epoch": 0.53, + "grad_norm": 6.086479663848877, + "learning_rate": 4.548036303541834e-06, + "logits/chosen": -1.4879542589187622, + "logits/rejected": -1.0121488571166992, + "logps/chosen": -1.0957660675048828, + "logps/rejected": -2.0224268436431885, + "loss": 1.1405, + "odds_ratio_loss": 0.4473304748535156, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10957660526037216, + "rewards/margins": 0.09266607463359833, + "rewards/rejected": -0.2022426575422287, + "sft_loss": 1.0957660675048828, + "step": 6845 + }, + { + "epoch": 0.53, + "grad_norm": 15.212538719177246, + "learning_rate": 4.541904284844214e-06, + "logits/chosen": -1.3758288621902466, + "logits/rejected": -1.157755970954895, + "logps/chosen": -0.7310072183609009, + "logps/rejected": -3.5768425464630127, + "loss": 0.7626, + "odds_ratio_loss": 0.3155810832977295, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07310072332620621, + "rewards/margins": 0.28458356857299805, + "rewards/rejected": -0.35768428444862366, + "sft_loss": 0.7310072183609009, + "step": 6850 + }, + { + "epoch": 0.53, + "grad_norm": 8.853516578674316, + "learning_rate": 4.535772960906907e-06, + "logits/chosen": -1.384530782699585, + "logits/rejected": -1.0214731693267822, + "logps/chosen": -1.0086669921875, + "logps/rejected": -2.941141128540039, + "loss": 1.0577, + "odds_ratio_loss": 0.49013274908065796, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10086669027805328, + "rewards/margins": 0.19324742257595062, + "rewards/rejected": -0.2941141128540039, + "sft_loss": 1.0086669921875, + "step": 6855 + }, + { + "epoch": 0.53, + "grad_norm": 26.822484970092773, + "learning_rate": 4.529642341028847e-06, + "logits/chosen": -1.4491193294525146, + "logits/rejected": -1.2508859634399414, + "logps/chosen": -1.03815495967865, + "logps/rejected": -8.596417427062988, + "loss": 1.0479, + "odds_ratio_loss": 0.09718579053878784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10381549596786499, + "rewards/margins": 0.7558261752128601, + "rewards/rejected": -0.8596416711807251, + "sft_loss": 1.03815495967865, + "step": 6860 + }, + { + "epoch": 0.53, + "grad_norm": 15.145536422729492, + "learning_rate": 4.523512434507897e-06, + "logits/chosen": -1.2769041061401367, + "logits/rejected": -1.0685583353042603, + "logps/chosen": -1.0163986682891846, + "logps/rejected": -8.629759788513184, + "loss": 1.0191, + "odds_ratio_loss": 0.026911329478025436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10163986682891846, + "rewards/margins": 0.7613360285758972, + "rewards/rejected": -0.8629759550094604, + "sft_loss": 1.0163986682891846, + "step": 6865 + }, + { + "epoch": 0.53, + "grad_norm": 6.6643757820129395, + "learning_rate": 4.517383250640836e-06, + "logits/chosen": -1.304107666015625, + "logits/rejected": -0.9942609667778015, + "logps/chosen": -0.9170368313789368, + "logps/rejected": -3.983776569366455, + "loss": 0.9379, + "odds_ratio_loss": 0.20818495750427246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09170368313789368, + "rewards/margins": 0.30667397379875183, + "rewards/rejected": -0.3983776867389679, + "sft_loss": 0.9170368313789368, + "step": 6870 + }, + { + "epoch": 0.53, + "grad_norm": 25.229204177856445, + "learning_rate": 4.511254798723351e-06, + "logits/chosen": -1.3273017406463623, + "logits/rejected": -1.038374900817871, + "logps/chosen": -1.1942617893218994, + "logps/rejected": -10.367258071899414, + "loss": 1.2342, + "odds_ratio_loss": 0.3994136452674866, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1194261908531189, + "rewards/margins": 0.9172995686531067, + "rewards/rejected": -1.0367257595062256, + "sft_loss": 1.1942617893218994, + "step": 6875 + }, + { + "epoch": 0.54, + "grad_norm": 12.490259170532227, + "learning_rate": 4.505127088050018e-06, + "logits/chosen": -1.1804896593093872, + "logits/rejected": -1.1206060647964478, + "logps/chosen": -1.0563093423843384, + "logps/rejected": -8.599047660827637, + "loss": 1.0905, + "odds_ratio_loss": 0.34177225828170776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10563093423843384, + "rewards/margins": 0.7542737722396851, + "rewards/rejected": -0.8599047660827637, + "sft_loss": 1.0563093423843384, + "step": 6880 + }, + { + "epoch": 0.54, + "grad_norm": 6.394746780395508, + "learning_rate": 4.499000127914286e-06, + "logits/chosen": -1.455170750617981, + "logits/rejected": -1.206276297569275, + "logps/chosen": -0.9744254946708679, + "logps/rejected": -7.1251678466796875, + "loss": 0.9991, + "odds_ratio_loss": 0.24634718894958496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09744254499673843, + "rewards/margins": 0.6150742769241333, + "rewards/rejected": -0.7125169038772583, + "sft_loss": 0.9744254946708679, + "step": 6885 + }, + { + "epoch": 0.54, + "grad_norm": 8.001043319702148, + "learning_rate": 4.49287392760847e-06, + "logits/chosen": -1.1762328147888184, + "logits/rejected": -1.4600152969360352, + "logps/chosen": -1.2551082372665405, + "logps/rejected": -5.348014831542969, + "loss": 1.275, + "odds_ratio_loss": 0.1989041119813919, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12551084160804749, + "rewards/margins": 0.4092906415462494, + "rewards/rejected": -0.5348014831542969, + "sft_loss": 1.2551082372665405, + "step": 6890 + }, + { + "epoch": 0.54, + "grad_norm": 9.176358222961426, + "learning_rate": 4.48674849642373e-06, + "logits/chosen": -1.4300482273101807, + "logits/rejected": -1.6007181406021118, + "logps/chosen": -1.1959967613220215, + "logps/rejected": -6.814823150634766, + "loss": 1.2665, + "odds_ratio_loss": 0.7045713663101196, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11959967762231827, + "rewards/margins": 0.5618826150894165, + "rewards/rejected": -0.6814823746681213, + "sft_loss": 1.1959967613220215, + "step": 6895 + }, + { + "epoch": 0.54, + "grad_norm": 16.08429718017578, + "learning_rate": 4.480623843650061e-06, + "logits/chosen": -1.4096542596817017, + "logits/rejected": -1.1567366123199463, + "logps/chosen": -0.8154433965682983, + "logps/rejected": -2.1745846271514893, + "loss": 0.9268, + "odds_ratio_loss": 1.113632082939148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08154434710741043, + "rewards/margins": 0.1359141319990158, + "rewards/rejected": -0.21745848655700684, + "sft_loss": 0.8154433965682983, + "step": 6900 + }, + { + "epoch": 0.54, + "grad_norm": 23.622493743896484, + "learning_rate": 4.474499978576274e-06, + "logits/chosen": -1.3838725090026855, + "logits/rejected": -0.9869769811630249, + "logps/chosen": -0.7865809202194214, + "logps/rejected": -3.4415574073791504, + "loss": 0.8022, + "odds_ratio_loss": 0.15634959936141968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0786580964922905, + "rewards/margins": 0.2654976546764374, + "rewards/rejected": -0.34415578842163086, + "sft_loss": 0.7865809202194214, + "step": 6905 + }, + { + "epoch": 0.54, + "grad_norm": 9.461322784423828, + "learning_rate": 4.4683769104899905e-06, + "logits/chosen": -1.5936346054077148, + "logits/rejected": -1.3136579990386963, + "logps/chosen": -0.9142619967460632, + "logps/rejected": -6.642866611480713, + "loss": 0.9418, + "odds_ratio_loss": 0.27570387721061707, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09142619371414185, + "rewards/margins": 0.5728604197502136, + "rewards/rejected": -0.6642866134643555, + "sft_loss": 0.9142619967460632, + "step": 6910 + }, + { + "epoch": 0.54, + "grad_norm": 5.414888381958008, + "learning_rate": 4.46225464867762e-06, + "logits/chosen": -1.4732753038406372, + "logits/rejected": -1.5724318027496338, + "logps/chosen": -1.603231430053711, + "logps/rejected": -10.14500617980957, + "loss": 1.623, + "odds_ratio_loss": 0.19741734862327576, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1603231579065323, + "rewards/margins": 0.8541774749755859, + "rewards/rejected": -1.014500617980957, + "sft_loss": 1.603231430053711, + "step": 6915 + }, + { + "epoch": 0.54, + "grad_norm": 4.721510410308838, + "learning_rate": 4.456133202424349e-06, + "logits/chosen": -1.4627020359039307, + "logits/rejected": -1.1769201755523682, + "logps/chosen": -1.2980149984359741, + "logps/rejected": -12.185821533203125, + "loss": 1.2998, + "odds_ratio_loss": 0.018322288990020752, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12980152666568756, + "rewards/margins": 1.0887806415557861, + "rewards/rejected": -1.2185821533203125, + "sft_loss": 1.2980149984359741, + "step": 6920 + }, + { + "epoch": 0.54, + "grad_norm": 8.03736400604248, + "learning_rate": 4.450012581014129e-06, + "logits/chosen": -1.320770025253296, + "logits/rejected": -1.0708155632019043, + "logps/chosen": -1.1251728534698486, + "logps/rejected": -2.3679592609405518, + "loss": 1.2122, + "odds_ratio_loss": 0.8706277012825012, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11251727491617203, + "rewards/margins": 0.12427864223718643, + "rewards/rejected": -0.23679594695568085, + "sft_loss": 1.1251728534698486, + "step": 6925 + }, + { + "epoch": 0.54, + "grad_norm": 13.415603637695312, + "learning_rate": 4.443892793729659e-06, + "logits/chosen": -1.4199503660202026, + "logits/rejected": -1.0905369520187378, + "logps/chosen": -1.3850047588348389, + "logps/rejected": -6.778962135314941, + "loss": 1.4477, + "odds_ratio_loss": 0.6268793940544128, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13850049674510956, + "rewards/margins": 0.5393957495689392, + "rewards/rejected": -0.67789626121521, + "sft_loss": 1.3850047588348389, + "step": 6930 + }, + { + "epoch": 0.54, + "grad_norm": 7.6707892417907715, + "learning_rate": 4.437773849852371e-06, + "logits/chosen": -1.3866112232208252, + "logits/rejected": -1.0096814632415771, + "logps/chosen": -0.9563423991203308, + "logps/rejected": -4.1207780838012695, + "loss": 0.9747, + "odds_ratio_loss": 0.18360617756843567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09563424438238144, + "rewards/margins": 0.3164435923099518, + "rewards/rejected": -0.41207781434059143, + "sft_loss": 0.9563423991203308, + "step": 6935 + }, + { + "epoch": 0.54, + "grad_norm": 15.404536247253418, + "learning_rate": 4.431655758662426e-06, + "logits/chosen": -1.465071439743042, + "logits/rejected": -0.9004141688346863, + "logps/chosen": -1.0210731029510498, + "logps/rejected": -5.286839485168457, + "loss": 1.0485, + "odds_ratio_loss": 0.27380725741386414, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10210730880498886, + "rewards/margins": 0.4265766143798828, + "rewards/rejected": -0.5286839604377747, + "sft_loss": 1.0210731029510498, + "step": 6940 + }, + { + "epoch": 0.54, + "grad_norm": 9.173187255859375, + "learning_rate": 4.425538529438682e-06, + "logits/chosen": -1.4739725589752197, + "logits/rejected": -1.0984523296356201, + "logps/chosen": -1.8843532800674438, + "logps/rejected": -9.872546195983887, + "loss": 1.8863, + "odds_ratio_loss": 0.01930277980864048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18843533098697662, + "rewards/margins": 0.798819363117218, + "rewards/rejected": -0.9872547388076782, + "sft_loss": 1.8843532800674438, + "step": 6945 + }, + { + "epoch": 0.54, + "grad_norm": 29.652374267578125, + "learning_rate": 4.419422171458695e-06, + "logits/chosen": -1.5096216201782227, + "logits/rejected": -1.1181193590164185, + "logps/chosen": -0.8287372589111328, + "logps/rejected": -3.0704493522644043, + "loss": 0.8485, + "odds_ratio_loss": 0.19784730672836304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08287372440099716, + "rewards/margins": 0.22417119145393372, + "rewards/rejected": -0.3070449233055115, + "sft_loss": 0.8287372589111328, + "step": 6950 + }, + { + "epoch": 0.54, + "grad_norm": 27.080087661743164, + "learning_rate": 4.413306693998697e-06, + "logits/chosen": -1.5426450967788696, + "logits/rejected": -1.2096660137176514, + "logps/chosen": -1.0421079397201538, + "logps/rejected": -1.9055150747299194, + "loss": 1.1174, + "odds_ratio_loss": 0.7526431083679199, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10421079397201538, + "rewards/margins": 0.08634071052074432, + "rewards/rejected": -0.1905515044927597, + "sft_loss": 1.0421079397201538, + "step": 6955 + }, + { + "epoch": 0.54, + "grad_norm": 39.57759094238281, + "learning_rate": 4.407192106333588e-06, + "logits/chosen": -1.599490761756897, + "logits/rejected": -1.2356152534484863, + "logps/chosen": -0.9575656056404114, + "logps/rejected": -4.758363246917725, + "loss": 0.965, + "odds_ratio_loss": 0.0743914544582367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09575656801462173, + "rewards/margins": 0.3800797462463379, + "rewards/rejected": -0.4758363366127014, + "sft_loss": 0.9575656056404114, + "step": 6960 + }, + { + "epoch": 0.54, + "grad_norm": 28.02701187133789, + "learning_rate": 4.401078417736915e-06, + "logits/chosen": -1.3408010005950928, + "logits/rejected": -0.9972102046012878, + "logps/chosen": -0.8788027763366699, + "logps/rejected": -3.238997220993042, + "loss": 0.9065, + "odds_ratio_loss": 0.2769816815853119, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08788027614355087, + "rewards/margins": 0.23601946234703064, + "rewards/rejected": -0.3238997459411621, + "sft_loss": 0.8788027763366699, + "step": 6965 + }, + { + "epoch": 0.54, + "grad_norm": 9.541007041931152, + "learning_rate": 4.394965637480862e-06, + "logits/chosen": -1.416936993598938, + "logits/rejected": -0.934307873249054, + "logps/chosen": -0.6852422952651978, + "logps/rejected": -3.539524555206299, + "loss": 0.6948, + "odds_ratio_loss": 0.09601191431283951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06852422654628754, + "rewards/margins": 0.2854282259941101, + "rewards/rejected": -0.35395246744155884, + "sft_loss": 0.6852422952651978, + "step": 6970 + }, + { + "epoch": 0.54, + "grad_norm": 11.116430282592773, + "learning_rate": 4.38885377483624e-06, + "logits/chosen": -1.503397822380066, + "logits/rejected": -1.2043156623840332, + "logps/chosen": -1.1527019739151, + "logps/rejected": -10.427370071411133, + "loss": 1.1733, + "odds_ratio_loss": 0.20619888603687286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1152702122926712, + "rewards/margins": 0.9274666905403137, + "rewards/rejected": -1.0427368879318237, + "sft_loss": 1.1527019739151, + "step": 6975 + }, + { + "epoch": 0.54, + "grad_norm": 5.822205543518066, + "learning_rate": 4.3827428390724625e-06, + "logits/chosen": -1.523048758506775, + "logits/rejected": -0.8828157186508179, + "logps/chosen": -1.0152368545532227, + "logps/rejected": -6.31392765045166, + "loss": 1.0357, + "odds_ratio_loss": 0.20493432879447937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10152368247509003, + "rewards/margins": 0.529869019985199, + "rewards/rejected": -0.631392776966095, + "sft_loss": 1.0152368545532227, + "step": 6980 + }, + { + "epoch": 0.54, + "grad_norm": 31.748807907104492, + "learning_rate": 4.376632839457538e-06, + "logits/chosen": -1.49868905544281, + "logits/rejected": -0.8400151133537292, + "logps/chosen": -1.004535436630249, + "logps/rejected": -4.5008649826049805, + "loss": 1.0315, + "odds_ratio_loss": 0.2695631682872772, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10045354068279266, + "rewards/margins": 0.34963294863700867, + "rewards/rejected": -0.4500865042209625, + "sft_loss": 1.004535436630249, + "step": 6985 + }, + { + "epoch": 0.54, + "grad_norm": 19.450061798095703, + "learning_rate": 4.37052378525806e-06, + "logits/chosen": -1.3124674558639526, + "logits/rejected": -1.3847830295562744, + "logps/chosen": -0.941940426826477, + "logps/rejected": -3.816953182220459, + "loss": 0.9567, + "odds_ratio_loss": 0.14801433682441711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09419403970241547, + "rewards/margins": 0.2875012755393982, + "rewards/rejected": -0.38169533014297485, + "sft_loss": 0.941940426826477, + "step": 6990 + }, + { + "epoch": 0.54, + "grad_norm": 9.087126731872559, + "learning_rate": 4.364415685739183e-06, + "logits/chosen": -1.498969316482544, + "logits/rejected": -1.654484510421753, + "logps/chosen": -0.8401981592178345, + "logps/rejected": -13.466900825500488, + "loss": 0.8517, + "odds_ratio_loss": 0.11514023691415787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08401981741189957, + "rewards/margins": 1.2626702785491943, + "rewards/rejected": -1.346690058708191, + "sft_loss": 0.8401981592178345, + "step": 6995 + }, + { + "epoch": 0.54, + "grad_norm": 18.640514373779297, + "learning_rate": 4.358308550164616e-06, + "logits/chosen": -1.3498393297195435, + "logits/rejected": -1.2818952798843384, + "logps/chosen": -1.1116533279418945, + "logps/rejected": -9.627163887023926, + "loss": 1.1231, + "odds_ratio_loss": 0.11460292339324951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11116534471511841, + "rewards/margins": 0.8515509366989136, + "rewards/rejected": -0.9627164006233215, + "sft_loss": 1.1116533279418945, + "step": 7000 + }, + { + "epoch": 0.54, + "grad_norm": 6.592902183532715, + "learning_rate": 4.352202387796602e-06, + "logits/chosen": -1.4888490438461304, + "logits/rejected": -0.92115318775177, + "logps/chosen": -1.0439634323120117, + "logps/rejected": -7.5101637840271, + "loss": 1.0464, + "odds_ratio_loss": 0.023921433836221695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10439634323120117, + "rewards/margins": 0.6466200351715088, + "rewards/rejected": -0.7510164380073547, + "sft_loss": 1.0439634323120117, + "step": 7005 + }, + { + "epoch": 0.55, + "grad_norm": 19.845417022705078, + "learning_rate": 4.346097207895917e-06, + "logits/chosen": -1.3229620456695557, + "logits/rejected": -1.0637264251708984, + "logps/chosen": -1.2682688236236572, + "logps/rejected": -11.699102401733398, + "loss": 1.2807, + "odds_ratio_loss": 0.12445087730884552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12682689726352692, + "rewards/margins": 1.0430833101272583, + "rewards/rejected": -1.169910192489624, + "sft_loss": 1.2682688236236572, + "step": 7010 + }, + { + "epoch": 0.55, + "grad_norm": 7.902325630187988, + "learning_rate": 4.339993019721839e-06, + "logits/chosen": -1.4001775979995728, + "logits/rejected": -0.9012830853462219, + "logps/chosen": -0.9438796043395996, + "logps/rejected": -6.606081485748291, + "loss": 1.0343, + "odds_ratio_loss": 0.9038652181625366, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0943879634141922, + "rewards/margins": 0.5662201642990112, + "rewards/rejected": -0.660608172416687, + "sft_loss": 0.9438796043395996, + "step": 7015 + }, + { + "epoch": 0.55, + "grad_norm": 7.624948978424072, + "learning_rate": 4.333889832532142e-06, + "logits/chosen": -1.5406997203826904, + "logits/rejected": -1.1605985164642334, + "logps/chosen": -1.087935447692871, + "logps/rejected": -6.6805596351623535, + "loss": 1.1146, + "odds_ratio_loss": 0.26643556356430054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10879355669021606, + "rewards/margins": 0.5592623949050903, + "rewards/rejected": -0.6680558919906616, + "sft_loss": 1.087935447692871, + "step": 7020 + }, + { + "epoch": 0.55, + "grad_norm": 11.003002166748047, + "learning_rate": 4.327787655583089e-06, + "logits/chosen": -1.5272648334503174, + "logits/rejected": -0.9672309756278992, + "logps/chosen": -1.0818145275115967, + "logps/rejected": -5.941993713378906, + "loss": 1.1105, + "odds_ratio_loss": 0.28728801012039185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10818145424127579, + "rewards/margins": 0.48601800203323364, + "rewards/rejected": -0.5941994190216064, + "sft_loss": 1.0818145275115967, + "step": 7025 + }, + { + "epoch": 0.55, + "grad_norm": 20.621822357177734, + "learning_rate": 4.321686498129404e-06, + "logits/chosen": -1.5019843578338623, + "logits/rejected": -1.1324069499969482, + "logps/chosen": -1.1026496887207031, + "logps/rejected": -8.86848258972168, + "loss": 1.1324, + "odds_ratio_loss": 0.2977941036224365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11026497185230255, + "rewards/margins": 0.7765833139419556, + "rewards/rejected": -0.8868482708930969, + "sft_loss": 1.1026496887207031, + "step": 7030 + }, + { + "epoch": 0.55, + "grad_norm": 15.918548583984375, + "learning_rate": 4.315586369424265e-06, + "logits/chosen": -1.4344863891601562, + "logits/rejected": -0.7808696627616882, + "logps/chosen": -0.8610206842422485, + "logps/rejected": -4.585818290710449, + "loss": 0.8939, + "odds_ratio_loss": 0.3287752866744995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08610206097364426, + "rewards/margins": 0.37247976660728455, + "rewards/rejected": -0.4585818350315094, + "sft_loss": 0.8610206842422485, + "step": 7035 + }, + { + "epoch": 0.55, + "grad_norm": 45.551788330078125, + "learning_rate": 4.309487278719294e-06, + "logits/chosen": -1.3992388248443604, + "logits/rejected": -0.9382654428482056, + "logps/chosen": -1.136695146560669, + "logps/rejected": -4.336820125579834, + "loss": 1.1791, + "odds_ratio_loss": 0.4242839813232422, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11366952955722809, + "rewards/margins": 0.32001250982284546, + "rewards/rejected": -0.43368202447891235, + "sft_loss": 1.136695146560669, + "step": 7040 + }, + { + "epoch": 0.55, + "grad_norm": 18.471166610717773, + "learning_rate": 4.303389235264536e-06, + "logits/chosen": -1.2700679302215576, + "logits/rejected": -0.6969276070594788, + "logps/chosen": -0.911207377910614, + "logps/rejected": -1.6082451343536377, + "loss": 0.9555, + "odds_ratio_loss": 0.44246095418930054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09112074226140976, + "rewards/margins": 0.06970375031232834, + "rewards/rejected": -0.1608245074748993, + "sft_loss": 0.911207377910614, + "step": 7045 + }, + { + "epoch": 0.55, + "grad_norm": 17.821630477905273, + "learning_rate": 4.297292248308446e-06, + "logits/chosen": -1.3343194723129272, + "logits/rejected": -0.8145080804824829, + "logps/chosen": -0.9822369813919067, + "logps/rejected": -6.457598686218262, + "loss": 0.996, + "odds_ratio_loss": 0.1377163976430893, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09822369366884232, + "rewards/margins": 0.5475361943244934, + "rewards/rejected": -0.6457598805427551, + "sft_loss": 0.9822369813919067, + "step": 7050 + }, + { + "epoch": 0.55, + "grad_norm": 10.077775955200195, + "learning_rate": 4.291196327097883e-06, + "logits/chosen": -1.3986437320709229, + "logits/rejected": -0.9943048357963562, + "logps/chosen": -1.0187883377075195, + "logps/rejected": -6.489192962646484, + "loss": 1.0399, + "odds_ratio_loss": 0.2116069495677948, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10187884420156479, + "rewards/margins": 0.5470404624938965, + "rewards/rejected": -0.6489192247390747, + "sft_loss": 1.0187883377075195, + "step": 7055 + }, + { + "epoch": 0.55, + "grad_norm": 4.37607479095459, + "learning_rate": 4.285101480878083e-06, + "logits/chosen": -1.4077237844467163, + "logits/rejected": -1.0786534547805786, + "logps/chosen": -1.0912609100341797, + "logps/rejected": -7.178577423095703, + "loss": 1.1096, + "odds_ratio_loss": 0.18366660177707672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10912607610225677, + "rewards/margins": 0.6087316870689392, + "rewards/rejected": -0.7178577184677124, + "sft_loss": 1.0912609100341797, + "step": 7060 + }, + { + "epoch": 0.55, + "grad_norm": 8.530895233154297, + "learning_rate": 4.279007718892654e-06, + "logits/chosen": -1.3784153461456299, + "logits/rejected": -1.297706961631775, + "logps/chosen": -1.073346495628357, + "logps/rejected": -7.513908386230469, + "loss": 1.0977, + "odds_ratio_loss": 0.24378827214241028, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10733465105295181, + "rewards/margins": 0.6440561413764954, + "rewards/rejected": -0.7513907551765442, + "sft_loss": 1.073346495628357, + "step": 7065 + }, + { + "epoch": 0.55, + "grad_norm": 28.515674591064453, + "learning_rate": 4.272915050383559e-06, + "logits/chosen": -1.4155004024505615, + "logits/rejected": -1.6100749969482422, + "logps/chosen": -1.1467911005020142, + "logps/rejected": -6.216679573059082, + "loss": 1.1542, + "odds_ratio_loss": 0.07433410733938217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11467909812927246, + "rewards/margins": 0.5069888830184937, + "rewards/rejected": -0.6216680407524109, + "sft_loss": 1.1467911005020142, + "step": 7070 + }, + { + "epoch": 0.55, + "grad_norm": 5.460112571716309, + "learning_rate": 4.266823484591106e-06, + "logits/chosen": -1.4371813535690308, + "logits/rejected": -1.3315809965133667, + "logps/chosen": -0.8955456018447876, + "logps/rejected": -4.667305946350098, + "loss": 0.9309, + "odds_ratio_loss": 0.353960245847702, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.089554563164711, + "rewards/margins": 0.37717610597610474, + "rewards/rejected": -0.46673065423965454, + "sft_loss": 0.8955456018447876, + "step": 7075 + }, + { + "epoch": 0.55, + "grad_norm": 26.168041229248047, + "learning_rate": 4.260733030753926e-06, + "logits/chosen": -1.454080581665039, + "logits/rejected": -1.0602571964263916, + "logps/chosen": -1.140600562095642, + "logps/rejected": -5.033745765686035, + "loss": 1.1835, + "odds_ratio_loss": 0.429409921169281, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11406006664037704, + "rewards/margins": 0.38931459188461304, + "rewards/rejected": -0.5033746361732483, + "sft_loss": 1.140600562095642, + "step": 7080 + }, + { + "epoch": 0.55, + "grad_norm": 27.514564514160156, + "learning_rate": 4.254643698108963e-06, + "logits/chosen": -1.4346212148666382, + "logits/rejected": -0.9562221765518188, + "logps/chosen": -0.9034181833267212, + "logps/rejected": -3.7591049671173096, + "loss": 0.9172, + "odds_ratio_loss": 0.1378771811723709, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09034182131290436, + "rewards/margins": 0.2855686545372009, + "rewards/rejected": -0.3759104609489441, + "sft_loss": 0.9034181833267212, + "step": 7085 + }, + { + "epoch": 0.55, + "grad_norm": 23.025066375732422, + "learning_rate": 4.2485554958914695e-06, + "logits/chosen": -1.4962995052337646, + "logits/rejected": -0.750170111656189, + "logps/chosen": -1.304537057876587, + "logps/rejected": -6.853229522705078, + "loss": 1.3358, + "odds_ratio_loss": 0.3125781714916229, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1304537057876587, + "rewards/margins": 0.5548692941665649, + "rewards/rejected": -0.6853229999542236, + "sft_loss": 1.304537057876587, + "step": 7090 + }, + { + "epoch": 0.55, + "grad_norm": 91.04281616210938, + "learning_rate": 4.2424684333349725e-06, + "logits/chosen": -1.413205862045288, + "logits/rejected": -1.2042224407196045, + "logps/chosen": -1.3621070384979248, + "logps/rejected": -4.676912307739258, + "loss": 1.3821, + "odds_ratio_loss": 0.19968989491462708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13621070981025696, + "rewards/margins": 0.3314804434776306, + "rewards/rejected": -0.4676911234855652, + "sft_loss": 1.3621070384979248, + "step": 7095 + }, + { + "epoch": 0.55, + "grad_norm": 5.257567882537842, + "learning_rate": 4.236382519671276e-06, + "logits/chosen": -1.3120917081832886, + "logits/rejected": -1.0594903230667114, + "logps/chosen": -1.0127789974212646, + "logps/rejected": -10.14476490020752, + "loss": 1.0129, + "odds_ratio_loss": 0.0008429423905909061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10127788782119751, + "rewards/margins": 0.9131986498832703, + "rewards/rejected": -1.0144765377044678, + "sft_loss": 1.0127789974212646, + "step": 7100 + }, + { + "epoch": 0.55, + "grad_norm": 39.47378921508789, + "learning_rate": 4.230297764130441e-06, + "logits/chosen": -1.1948641538619995, + "logits/rejected": -1.540321707725525, + "logps/chosen": -0.913619875907898, + "logps/rejected": -8.393054962158203, + "loss": 0.9149, + "odds_ratio_loss": 0.012436440214514732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09136199951171875, + "rewards/margins": 0.7479435205459595, + "rewards/rejected": -0.8393055200576782, + "sft_loss": 0.913619875907898, + "step": 7105 + }, + { + "epoch": 0.55, + "grad_norm": 7.7148756980896, + "learning_rate": 4.224214175940773e-06, + "logits/chosen": -1.5498994588851929, + "logits/rejected": -0.8623272776603699, + "logps/chosen": -1.627855658531189, + "logps/rejected": -3.274567127227783, + "loss": 1.6546, + "odds_ratio_loss": 0.2678052484989166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16278555989265442, + "rewards/margins": 0.1646711379289627, + "rewards/rejected": -0.3274567127227783, + "sft_loss": 1.627855658531189, + "step": 7110 + }, + { + "epoch": 0.55, + "grad_norm": 30.301809310913086, + "learning_rate": 4.218131764328802e-06, + "logits/chosen": -1.5537168979644775, + "logits/rejected": -1.0891371965408325, + "logps/chosen": -0.9233828783035278, + "logps/rejected": -5.229593753814697, + "loss": 0.9502, + "odds_ratio_loss": 0.2679787278175354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09233828634023666, + "rewards/margins": 0.43062108755111694, + "rewards/rejected": -0.5229593515396118, + "sft_loss": 0.9233828783035278, + "step": 7115 + }, + { + "epoch": 0.55, + "grad_norm": 11.14027214050293, + "learning_rate": 4.2120505385192835e-06, + "logits/chosen": -1.4834104776382446, + "logits/rejected": -1.1844459772109985, + "logps/chosen": -1.6177314519882202, + "logps/rejected": -3.180201530456543, + "loss": 1.676, + "odds_ratio_loss": 0.5829809904098511, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16177313029766083, + "rewards/margins": 0.15624700486660004, + "rewards/rejected": -0.31802016496658325, + "sft_loss": 1.6177314519882202, + "step": 7120 + }, + { + "epoch": 0.55, + "grad_norm": 8.944758415222168, + "learning_rate": 4.205970507735165e-06, + "logits/chosen": -1.4102011919021606, + "logits/rejected": -1.3182289600372314, + "logps/chosen": -0.9230417013168335, + "logps/rejected": -3.186675548553467, + "loss": 0.9337, + "odds_ratio_loss": 0.1070776954293251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09230418503284454, + "rewards/margins": 0.2263633906841278, + "rewards/rejected": -0.31866756081581116, + "sft_loss": 0.9230417013168335, + "step": 7125 + }, + { + "epoch": 0.55, + "grad_norm": 12.016209602355957, + "learning_rate": 4.199891681197585e-06, + "logits/chosen": -1.3144750595092773, + "logits/rejected": -1.002539873123169, + "logps/chosen": -1.2599903345108032, + "logps/rejected": -3.335179090499878, + "loss": 1.2785, + "odds_ratio_loss": 0.18462809920310974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12599903345108032, + "rewards/margins": 0.20751889050006866, + "rewards/rejected": -0.3335179388523102, + "sft_loss": 1.2599903345108032, + "step": 7130 + }, + { + "epoch": 0.56, + "grad_norm": 6.1627631187438965, + "learning_rate": 4.193814068125854e-06, + "logits/chosen": -1.3824846744537354, + "logits/rejected": -0.680531919002533, + "logps/chosen": -0.9973615407943726, + "logps/rejected": -3.085211992263794, + "loss": 1.0166, + "odds_ratio_loss": 0.19264943897724152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09973615407943726, + "rewards/margins": 0.2087850272655487, + "rewards/rejected": -0.30852121114730835, + "sft_loss": 0.9973615407943726, + "step": 7135 + }, + { + "epoch": 0.56, + "grad_norm": 8.950169563293457, + "learning_rate": 4.187737677737448e-06, + "logits/chosen": -1.4213409423828125, + "logits/rejected": -1.0563170909881592, + "logps/chosen": -1.1497620344161987, + "logps/rejected": -5.6467132568359375, + "loss": 1.1613, + "odds_ratio_loss": 0.11577478796243668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11497620493173599, + "rewards/margins": 0.44969505071640015, + "rewards/rejected": -0.5646712779998779, + "sft_loss": 1.1497620344161987, + "step": 7140 + }, + { + "epoch": 0.56, + "grad_norm": 5.027875900268555, + "learning_rate": 4.181662519247983e-06, + "logits/chosen": -1.4923940896987915, + "logits/rejected": -0.8564295768737793, + "logps/chosen": -1.015838861465454, + "logps/rejected": -4.046143531799316, + "loss": 1.0314, + "odds_ratio_loss": 0.1554424911737442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10158388316631317, + "rewards/margins": 0.30303049087524414, + "rewards/rejected": -0.4046143591403961, + "sft_loss": 1.015838861465454, + "step": 7145 + }, + { + "epoch": 0.56, + "grad_norm": 7.206550121307373, + "learning_rate": 4.175588601871206e-06, + "logits/chosen": -1.4180933237075806, + "logits/rejected": -0.9299715757369995, + "logps/chosen": -0.8494499921798706, + "logps/rejected": -3.278174877166748, + "loss": 0.8656, + "odds_ratio_loss": 0.16184845566749573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08494500815868378, + "rewards/margins": 0.24287250638008118, + "rewards/rejected": -0.32781749963760376, + "sft_loss": 0.8494499921798706, + "step": 7150 + }, + { + "epoch": 0.56, + "grad_norm": 5.67272424697876, + "learning_rate": 4.169515934818987e-06, + "logits/chosen": -1.3000648021697998, + "logits/rejected": -0.8862468600273132, + "logps/chosen": -1.3208826780319214, + "logps/rejected": -2.5383241176605225, + "loss": 1.3589, + "odds_ratio_loss": 0.3805355131626129, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13208827376365662, + "rewards/margins": 0.12174415588378906, + "rewards/rejected": -0.2538324296474457, + "sft_loss": 1.3208826780319214, + "step": 7155 + }, + { + "epoch": 0.56, + "grad_norm": 14.509788513183594, + "learning_rate": 4.163444527301296e-06, + "logits/chosen": -1.4848687648773193, + "logits/rejected": -1.325732946395874, + "logps/chosen": -0.8232647180557251, + "logps/rejected": -5.763245105743408, + "loss": 0.8311, + "odds_ratio_loss": 0.07884080708026886, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08232647180557251, + "rewards/margins": 0.49399805068969727, + "rewards/rejected": -0.5763245224952698, + "sft_loss": 0.8232647180557251, + "step": 7160 + }, + { + "epoch": 0.56, + "grad_norm": 11.16965389251709, + "learning_rate": 4.157374388526189e-06, + "logits/chosen": -1.4290255308151245, + "logits/rejected": -1.3512481451034546, + "logps/chosen": -0.9287931323051453, + "logps/rejected": -3.3987338542938232, + "loss": 0.9486, + "odds_ratio_loss": 0.19854740798473358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09287931025028229, + "rewards/margins": 0.24699406325817108, + "rewards/rejected": -0.33987337350845337, + "sft_loss": 0.9287931323051453, + "step": 7165 + }, + { + "epoch": 0.56, + "grad_norm": 28.541109085083008, + "learning_rate": 4.151305527699808e-06, + "logits/chosen": -1.3716208934783936, + "logits/rejected": -1.2792634963989258, + "logps/chosen": -0.7543950080871582, + "logps/rejected": -6.4099860191345215, + "loss": 0.7629, + "odds_ratio_loss": 0.08536773175001144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07543949782848358, + "rewards/margins": 0.5655592083930969, + "rewards/rejected": -0.6409986615180969, + "sft_loss": 0.7543950080871582, + "step": 7170 + }, + { + "epoch": 0.56, + "grad_norm": 9.904860496520996, + "learning_rate": 4.1452379540263495e-06, + "logits/chosen": -1.4210548400878906, + "logits/rejected": -1.423855185508728, + "logps/chosen": -2.2208778858184814, + "logps/rejected": -5.382429599761963, + "loss": 2.279, + "odds_ratio_loss": 0.5807275176048279, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2220877707004547, + "rewards/margins": 0.31615522503852844, + "rewards/rejected": -0.5382429957389832, + "sft_loss": 2.2208778858184814, + "step": 7175 + }, + { + "epoch": 0.56, + "grad_norm": 5.127261161804199, + "learning_rate": 4.139171676708057e-06, + "logits/chosen": -1.3772279024124146, + "logits/rejected": -0.9157557487487793, + "logps/chosen": -0.9644227027893066, + "logps/rejected": -4.304810523986816, + "loss": 1.0178, + "odds_ratio_loss": 0.5333045125007629, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09644227474927902, + "rewards/margins": 0.33403879404067993, + "rewards/rejected": -0.43048110604286194, + "sft_loss": 0.9644227027893066, + "step": 7180 + }, + { + "epoch": 0.56, + "grad_norm": 14.579924583435059, + "learning_rate": 4.1331067049452134e-06, + "logits/chosen": -1.3941256999969482, + "logits/rejected": -1.0850193500518799, + "logps/chosen": -0.9973868131637573, + "logps/rejected": -6.488180637359619, + "loss": 1.0413, + "odds_ratio_loss": 0.4392642080783844, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09973867982625961, + "rewards/margins": 0.549079418182373, + "rewards/rejected": -0.6488181352615356, + "sft_loss": 0.9973868131637573, + "step": 7185 + }, + { + "epoch": 0.56, + "grad_norm": 12.829072952270508, + "learning_rate": 4.127043047936116e-06, + "logits/chosen": -1.446582555770874, + "logits/rejected": -1.1274049282073975, + "logps/chosen": -1.1398777961730957, + "logps/rejected": -5.959358215332031, + "loss": 1.1571, + "odds_ratio_loss": 0.17211410403251648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11398778855800629, + "rewards/margins": 0.48194804787635803, + "rewards/rejected": -0.5959358215332031, + "sft_loss": 1.1398777961730957, + "step": 7190 + }, + { + "epoch": 0.56, + "grad_norm": 17.90237808227539, + "learning_rate": 4.120980714877072e-06, + "logits/chosen": -1.4279298782348633, + "logits/rejected": -1.0952799320220947, + "logps/chosen": -0.9773966073989868, + "logps/rejected": -12.761029243469238, + "loss": 0.9786, + "odds_ratio_loss": 0.012265295721590519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09773966670036316, + "rewards/margins": 1.1783632040023804, + "rewards/rejected": -1.2761030197143555, + "sft_loss": 0.9773966073989868, + "step": 7195 + }, + { + "epoch": 0.56, + "grad_norm": 34.895118713378906, + "learning_rate": 4.114919714962376e-06, + "logits/chosen": -1.4511187076568604, + "logits/rejected": -1.0730235576629639, + "logps/chosen": -1.06996750831604, + "logps/rejected": -10.693025588989258, + "loss": 1.0754, + "odds_ratio_loss": 0.05451556295156479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10699673742055893, + "rewards/margins": 0.9623057246208191, + "rewards/rejected": -1.0693025588989258, + "sft_loss": 1.06996750831604, + "step": 7200 + }, + { + "epoch": 0.56, + "grad_norm": 24.5500431060791, + "learning_rate": 4.108860057384309e-06, + "logits/chosen": -1.476646065711975, + "logits/rejected": -1.196157455444336, + "logps/chosen": -1.0712162256240845, + "logps/rejected": -8.418623924255371, + "loss": 1.0888, + "odds_ratio_loss": 0.17610427737236023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10712162405252457, + "rewards/margins": 0.7347409129142761, + "rewards/rejected": -0.8418625593185425, + "sft_loss": 1.0712162256240845, + "step": 7205 + }, + { + "epoch": 0.56, + "grad_norm": 11.665763854980469, + "learning_rate": 4.1028017513331084e-06, + "logits/chosen": -1.2933409214019775, + "logits/rejected": -1.0372425317764282, + "logps/chosen": -1.2550567388534546, + "logps/rejected": -4.989316463470459, + "loss": 1.3087, + "odds_ratio_loss": 0.536535918712616, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12550568580627441, + "rewards/margins": 0.3734259605407715, + "rewards/rejected": -0.4989316463470459, + "sft_loss": 1.2550567388534546, + "step": 7210 + }, + { + "epoch": 0.56, + "grad_norm": 186.90316772460938, + "learning_rate": 4.096744805996964e-06, + "logits/chosen": -1.380571722984314, + "logits/rejected": -1.3022321462631226, + "logps/chosen": -1.0640513896942139, + "logps/rejected": -10.813997268676758, + "loss": 1.1025, + "odds_ratio_loss": 0.3841148018836975, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10640515387058258, + "rewards/margins": 0.9749946594238281, + "rewards/rejected": -1.08139967918396, + "sft_loss": 1.0640513896942139, + "step": 7215 + }, + { + "epoch": 0.56, + "grad_norm": 14.99019718170166, + "learning_rate": 4.090689230562003e-06, + "logits/chosen": -1.0091339349746704, + "logits/rejected": -1.4177144765853882, + "logps/chosen": -1.391606092453003, + "logps/rejected": -7.845396995544434, + "loss": 1.3975, + "odds_ratio_loss": 0.05848420783877373, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13916060328483582, + "rewards/margins": 0.6453791856765747, + "rewards/rejected": -0.7845398187637329, + "sft_loss": 1.391606092453003, + "step": 7220 + }, + { + "epoch": 0.56, + "grad_norm": 9.070110321044922, + "learning_rate": 4.0846350342122746e-06, + "logits/chosen": -1.4411298036575317, + "logits/rejected": -1.2582954168319702, + "logps/chosen": -0.8484258651733398, + "logps/rejected": -8.242280960083008, + "loss": 0.8629, + "odds_ratio_loss": 0.14485874772071838, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08484258502721786, + "rewards/margins": 0.7393856644630432, + "rewards/rejected": -0.8242281675338745, + "sft_loss": 0.8484258651733398, + "step": 7225 + }, + { + "epoch": 0.56, + "grad_norm": 31.72794532775879, + "learning_rate": 4.078582226129735e-06, + "logits/chosen": -1.2538648843765259, + "logits/rejected": -1.2442595958709717, + "logps/chosen": -0.754325270652771, + "logps/rejected": -8.127649307250977, + "loss": 0.7984, + "odds_ratio_loss": 0.4406220018863678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07543252408504486, + "rewards/margins": 0.737332284450531, + "rewards/rejected": -0.8127648234367371, + "sft_loss": 0.754325270652771, + "step": 7230 + }, + { + "epoch": 0.56, + "grad_norm": 8.865771293640137, + "learning_rate": 4.0725308154942395e-06, + "logits/chosen": -1.4666943550109863, + "logits/rejected": -1.2816884517669678, + "logps/chosen": -1.100095272064209, + "logps/rejected": -2.0770721435546875, + "loss": 1.1549, + "odds_ratio_loss": 0.5476905107498169, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11000952869653702, + "rewards/margins": 0.09769769012928009, + "rewards/rejected": -0.2077072113752365, + "sft_loss": 1.100095272064209, + "step": 7235 + }, + { + "epoch": 0.56, + "grad_norm": 21.18962860107422, + "learning_rate": 4.066480811483518e-06, + "logits/chosen": -1.3888037204742432, + "logits/rejected": -0.7071020007133484, + "logps/chosen": -0.9945389032363892, + "logps/rejected": -5.384260177612305, + "loss": 1.0017, + "odds_ratio_loss": 0.07145805656909943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0994538888335228, + "rewards/margins": 0.4389721751213074, + "rewards/rejected": -0.5384260416030884, + "sft_loss": 0.9945389032363892, + "step": 7240 + }, + { + "epoch": 0.56, + "grad_norm": 335.705810546875, + "learning_rate": 4.060432223273169e-06, + "logits/chosen": -1.4586912393569946, + "logits/rejected": -1.1600415706634521, + "logps/chosen": -1.4400997161865234, + "logps/rejected": -2.9663405418395996, + "loss": 1.4646, + "odds_ratio_loss": 0.2453519105911255, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14400997757911682, + "rewards/margins": 0.15262408554553986, + "rewards/rejected": -0.2966340482234955, + "sft_loss": 1.4400997161865234, + "step": 7245 + }, + { + "epoch": 0.56, + "grad_norm": 11.154559135437012, + "learning_rate": 4.0543850600366444e-06, + "logits/chosen": -1.4864270687103271, + "logits/rejected": -1.2301304340362549, + "logps/chosen": -1.3381130695343018, + "logps/rejected": -2.762622833251953, + "loss": 1.3802, + "odds_ratio_loss": 0.42071300745010376, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13381129503250122, + "rewards/margins": 0.1424509882926941, + "rewards/rejected": -0.2762622833251953, + "sft_loss": 1.3381130695343018, + "step": 7250 + }, + { + "epoch": 0.56, + "grad_norm": 38.43938446044922, + "learning_rate": 4.048339330945238e-06, + "logits/chosen": -1.1480789184570312, + "logits/rejected": -0.9946290254592896, + "logps/chosen": -0.8657535314559937, + "logps/rejected": -6.532303810119629, + "loss": 0.8688, + "odds_ratio_loss": 0.03087017312645912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08657535910606384, + "rewards/margins": 0.5666549801826477, + "rewards/rejected": -0.6532303690910339, + "sft_loss": 0.8657535314559937, + "step": 7255 + }, + { + "epoch": 0.56, + "grad_norm": 5.126986980438232, + "learning_rate": 4.042295045168064e-06, + "logits/chosen": -1.4064311981201172, + "logits/rejected": -0.7540382146835327, + "logps/chosen": -1.1285769939422607, + "logps/rejected": -5.043487548828125, + "loss": 1.1424, + "odds_ratio_loss": 0.1382966786623001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11285771429538727, + "rewards/margins": 0.3914910852909088, + "rewards/rejected": -0.5043487548828125, + "sft_loss": 1.1285769939422607, + "step": 7260 + }, + { + "epoch": 0.57, + "grad_norm": 8.36670970916748, + "learning_rate": 4.036252211872047e-06, + "logits/chosen": -1.245692253112793, + "logits/rejected": -1.0072195529937744, + "logps/chosen": -0.7486428022384644, + "logps/rejected": -5.857022285461426, + "loss": 0.7628, + "odds_ratio_loss": 0.14205805957317352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07486427575349808, + "rewards/margins": 0.5108379125595093, + "rewards/rejected": -0.5857021808624268, + "sft_loss": 0.7486428022384644, + "step": 7265 + }, + { + "epoch": 0.57, + "grad_norm": 1.4005019664764404, + "learning_rate": 4.030210840221915e-06, + "logits/chosen": -1.2849702835083008, + "logits/rejected": -1.1393325328826904, + "logps/chosen": -0.7389216423034668, + "logps/rejected": -4.627842426300049, + "loss": 0.7584, + "odds_ratio_loss": 0.19480088353157043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07389216125011444, + "rewards/margins": 0.3888920843601227, + "rewards/rejected": -0.4627842307090759, + "sft_loss": 0.7389216423034668, + "step": 7270 + }, + { + "epoch": 0.57, + "grad_norm": 14.450980186462402, + "learning_rate": 4.024170939380172e-06, + "logits/chosen": -1.3889250755310059, + "logits/rejected": -1.1132410764694214, + "logps/chosen": -1.0050677061080933, + "logps/rejected": -5.148253440856934, + "loss": 1.0419, + "odds_ratio_loss": 0.36850666999816895, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10050676763057709, + "rewards/margins": 0.4143185615539551, + "rewards/rejected": -0.5148253440856934, + "sft_loss": 1.0050677061080933, + "step": 7275 + }, + { + "epoch": 0.57, + "grad_norm": 5.981418609619141, + "learning_rate": 4.018132518507095e-06, + "logits/chosen": -1.227460265159607, + "logits/rejected": -1.2487907409667969, + "logps/chosen": -0.8153915405273438, + "logps/rejected": -3.5961098670959473, + "loss": 0.8424, + "odds_ratio_loss": 0.26995450258255005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08153915405273438, + "rewards/margins": 0.2780718207359314, + "rewards/rejected": -0.35961097478866577, + "sft_loss": 0.8153915405273438, + "step": 7280 + }, + { + "epoch": 0.57, + "grad_norm": 88.4582748413086, + "learning_rate": 4.012095586760718e-06, + "logits/chosen": -1.1984305381774902, + "logits/rejected": -1.5410726070404053, + "logps/chosen": -1.7070376873016357, + "logps/rejected": -4.591531276702881, + "loss": 1.745, + "odds_ratio_loss": 0.3800917863845825, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17070379853248596, + "rewards/margins": 0.28844937682151794, + "rewards/rejected": -0.4591531753540039, + "sft_loss": 1.7070376873016357, + "step": 7285 + }, + { + "epoch": 0.57, + "grad_norm": 22.212352752685547, + "learning_rate": 4.006060153296812e-06, + "logits/chosen": -1.34489107131958, + "logits/rejected": -1.3902560472488403, + "logps/chosen": -0.8336170315742493, + "logps/rejected": -2.8464300632476807, + "loss": 0.8576, + "odds_ratio_loss": 0.23946678638458252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08336170762777328, + "rewards/margins": 0.20128127932548523, + "rewards/rejected": -0.2846429944038391, + "sft_loss": 0.8336170315742493, + "step": 7290 + }, + { + "epoch": 0.57, + "grad_norm": 16.92748260498047, + "learning_rate": 4.000026227268878e-06, + "logits/chosen": -1.4852524995803833, + "logits/rejected": -1.3302969932556152, + "logps/chosen": -1.6487728357315063, + "logps/rejected": -8.657622337341309, + "loss": 1.7169, + "odds_ratio_loss": 0.6810190081596375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1648772954940796, + "rewards/margins": 0.7008849382400513, + "rewards/rejected": -0.8657622337341309, + "sft_loss": 1.6487728357315063, + "step": 7295 + }, + { + "epoch": 0.57, + "grad_norm": 40.37065887451172, + "learning_rate": 3.993993817828134e-06, + "logits/chosen": -1.383131742477417, + "logits/rejected": -1.620224952697754, + "logps/chosen": -1.0177613496780396, + "logps/rejected": -8.058442115783691, + "loss": 1.0251, + "odds_ratio_loss": 0.07358211278915405, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1017761379480362, + "rewards/margins": 0.7040680646896362, + "rewards/rejected": -0.8058441281318665, + "sft_loss": 1.0177613496780396, + "step": 7300 + }, + { + "epoch": 0.57, + "grad_norm": 9.612100601196289, + "learning_rate": 3.9879629341234925e-06, + "logits/chosen": -1.328662633895874, + "logits/rejected": -0.8497580289840698, + "logps/chosen": -0.9461971521377563, + "logps/rejected": -2.255105495452881, + "loss": 0.9915, + "odds_ratio_loss": 0.4530237317085266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09461971372365952, + "rewards/margins": 0.1308908313512802, + "rewards/rejected": -0.22551055252552032, + "sft_loss": 0.9461971521377563, + "step": 7305 + }, + { + "epoch": 0.57, + "grad_norm": 8.014002799987793, + "learning_rate": 3.981933585301555e-06, + "logits/chosen": -1.294582486152649, + "logits/rejected": -1.406262755393982, + "logps/chosen": -0.8181141018867493, + "logps/rejected": -8.624747276306152, + "loss": 0.8296, + "odds_ratio_loss": 0.11477307975292206, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08181141316890717, + "rewards/margins": 0.7806633710861206, + "rewards/rejected": -0.862474799156189, + "sft_loss": 0.8181141018867493, + "step": 7310 + }, + { + "epoch": 0.57, + "grad_norm": 16.23911476135254, + "learning_rate": 3.975905780506591e-06, + "logits/chosen": -1.2872745990753174, + "logits/rejected": -1.3416489362716675, + "logps/chosen": -0.9090186357498169, + "logps/rejected": -5.8794755935668945, + "loss": 0.9113, + "odds_ratio_loss": 0.022481894120573997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09090186655521393, + "rewards/margins": 0.4970456659793854, + "rewards/rejected": -0.5879475474357605, + "sft_loss": 0.9090186357498169, + "step": 7315 + }, + { + "epoch": 0.57, + "grad_norm": 8.851611137390137, + "learning_rate": 3.9698795288805375e-06, + "logits/chosen": -1.352212905883789, + "logits/rejected": -0.933361828327179, + "logps/chosen": -0.7633185386657715, + "logps/rejected": -4.345231056213379, + "loss": 0.7798, + "odds_ratio_loss": 0.1644791066646576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07633186131715775, + "rewards/margins": 0.35819125175476074, + "rewards/rejected": -0.43452316522598267, + "sft_loss": 0.7633185386657715, + "step": 7320 + }, + { + "epoch": 0.57, + "grad_norm": 7.002867698669434, + "learning_rate": 3.963854839562968e-06, + "logits/chosen": -1.4375827312469482, + "logits/rejected": -0.7585693001747131, + "logps/chosen": -0.97300785779953, + "logps/rejected": -3.8505489826202393, + "loss": 0.9821, + "odds_ratio_loss": 0.09092002362012863, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09730079770088196, + "rewards/margins": 0.2877541184425354, + "rewards/rejected": -0.38505488634109497, + "sft_loss": 0.97300785779953, + "step": 7325 + }, + { + "epoch": 0.57, + "grad_norm": 5.4109978675842285, + "learning_rate": 3.957831721691086e-06, + "logits/chosen": -1.3101989030838013, + "logits/rejected": -1.083234190940857, + "logps/chosen": -0.9228937029838562, + "logps/rejected": -4.4622979164123535, + "loss": 0.9336, + "odds_ratio_loss": 0.10741086304187775, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09228937327861786, + "rewards/margins": 0.3539404273033142, + "rewards/rejected": -0.4462297856807709, + "sft_loss": 0.9228937029838562, + "step": 7330 + }, + { + "epoch": 0.57, + "grad_norm": 27.10301399230957, + "learning_rate": 3.95181018439972e-06, + "logits/chosen": -1.1312278509140015, + "logits/rejected": -1.573578119277954, + "logps/chosen": -1.2438294887542725, + "logps/rejected": -6.356667995452881, + "loss": 1.2918, + "odds_ratio_loss": 0.47944560647010803, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12438295781612396, + "rewards/margins": 0.5112838745117188, + "rewards/rejected": -0.6356668472290039, + "sft_loss": 1.2438294887542725, + "step": 7335 + }, + { + "epoch": 0.57, + "grad_norm": 16.846214294433594, + "learning_rate": 3.945790236821292e-06, + "logits/chosen": -1.4615051746368408, + "logits/rejected": -1.215092420578003, + "logps/chosen": -0.9354572296142578, + "logps/rejected": -4.896553993225098, + "loss": 0.9407, + "odds_ratio_loss": 0.05228148028254509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09354571998119354, + "rewards/margins": 0.3961096405982971, + "rewards/rejected": -0.48965540528297424, + "sft_loss": 0.9354572296142578, + "step": 7340 + }, + { + "epoch": 0.57, + "grad_norm": 9.336442947387695, + "learning_rate": 3.939771888085815e-06, + "logits/chosen": -1.3849732875823975, + "logits/rejected": -1.2363064289093018, + "logps/chosen": -0.8775486946105957, + "logps/rejected": -3.843815326690674, + "loss": 0.9062, + "odds_ratio_loss": 0.286162793636322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08775487542152405, + "rewards/margins": 0.2966266870498657, + "rewards/rejected": -0.3843815326690674, + "sft_loss": 0.8775486946105957, + "step": 7345 + }, + { + "epoch": 0.57, + "grad_norm": 12.346092224121094, + "learning_rate": 3.933755147320884e-06, + "logits/chosen": -1.4507148265838623, + "logits/rejected": -1.133741855621338, + "logps/chosen": -1.1160967350006104, + "logps/rejected": -5.653210639953613, + "loss": 1.1234, + "odds_ratio_loss": 0.07335402816534042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11160967499017715, + "rewards/margins": 0.4537113308906555, + "rewards/rejected": -0.5653210878372192, + "sft_loss": 1.1160967350006104, + "step": 7350 + }, + { + "epoch": 0.57, + "grad_norm": 9.299036979675293, + "learning_rate": 3.927740023651648e-06, + "logits/chosen": -1.4600021839141846, + "logits/rejected": -0.7748432755470276, + "logps/chosen": -0.8878051042556763, + "logps/rejected": -2.7120416164398193, + "loss": 0.9038, + "odds_ratio_loss": 0.15970009565353394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08878050744533539, + "rewards/margins": 0.1824236661195755, + "rewards/rejected": -0.2712041735649109, + "sft_loss": 0.8878051042556763, + "step": 7355 + }, + { + "epoch": 0.57, + "grad_norm": 29.598922729492188, + "learning_rate": 3.921726526200803e-06, + "logits/chosen": -1.5297552347183228, + "logits/rejected": -1.322796106338501, + "logps/chosen": -0.7642945051193237, + "logps/rejected": -3.3941707611083984, + "loss": 0.8392, + "odds_ratio_loss": 0.7493532299995422, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07642945647239685, + "rewards/margins": 0.2629876136779785, + "rewards/rejected": -0.33941707015037537, + "sft_loss": 0.7642945051193237, + "step": 7360 + }, + { + "epoch": 0.57, + "grad_norm": 6.58057165145874, + "learning_rate": 3.915714664088586e-06, + "logits/chosen": -1.465050220489502, + "logits/rejected": -0.8858221769332886, + "logps/chosen": -1.0814603567123413, + "logps/rejected": -8.027473449707031, + "loss": 1.0989, + "odds_ratio_loss": 0.17474313080310822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10814603418111801, + "rewards/margins": 0.6946013569831848, + "rewards/rejected": -0.8027472496032715, + "sft_loss": 1.0814603567123413, + "step": 7365 + }, + { + "epoch": 0.57, + "grad_norm": 47.6906852722168, + "learning_rate": 3.909704446432748e-06, + "logits/chosen": -1.4229741096496582, + "logits/rejected": -1.2047765254974365, + "logps/chosen": -1.368956208229065, + "logps/rejected": -4.672844409942627, + "loss": 1.387, + "odds_ratio_loss": 0.1804140955209732, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13689562678337097, + "rewards/margins": 0.33038878440856934, + "rewards/rejected": -0.4672844409942627, + "sft_loss": 1.368956208229065, + "step": 7370 + }, + { + "epoch": 0.57, + "grad_norm": 13.896078109741211, + "learning_rate": 3.903695882348545e-06, + "logits/chosen": -1.5089856386184692, + "logits/rejected": -1.2921054363250732, + "logps/chosen": -1.1158466339111328, + "logps/rejected": -2.5853195190429688, + "loss": 1.1437, + "odds_ratio_loss": 0.27874505519866943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11158466339111328, + "rewards/margins": 0.14694729447364807, + "rewards/rejected": -0.25853198766708374, + "sft_loss": 1.1158466339111328, + "step": 7375 + }, + { + "epoch": 0.57, + "grad_norm": 6.536021709442139, + "learning_rate": 3.897688980948729e-06, + "logits/chosen": -1.3587908744812012, + "logits/rejected": -1.3771960735321045, + "logps/chosen": -1.0086991786956787, + "logps/rejected": -5.609964847564697, + "loss": 1.0279, + "odds_ratio_loss": 0.19150960445404053, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10086993128061295, + "rewards/margins": 0.4601265490055084, + "rewards/rejected": -0.5609964728355408, + "sft_loss": 1.0086991786956787, + "step": 7380 + }, + { + "epoch": 0.57, + "grad_norm": 4.9608283042907715, + "learning_rate": 3.891683751343528e-06, + "logits/chosen": -1.3274409770965576, + "logits/rejected": -0.616326630115509, + "logps/chosen": -1.0203479528427124, + "logps/rejected": -10.478364944458008, + "loss": 1.0275, + "odds_ratio_loss": 0.07164148986339569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1020348072052002, + "rewards/margins": 0.945801854133606, + "rewards/rejected": -1.0478365421295166, + "sft_loss": 1.0203479528427124, + "step": 7385 + }, + { + "epoch": 0.57, + "grad_norm": 20.92188835144043, + "learning_rate": 3.8856802026406355e-06, + "logits/chosen": -1.3875770568847656, + "logits/rejected": -1.0865342617034912, + "logps/chosen": -0.9741819500923157, + "logps/rejected": -10.687253952026367, + "loss": 0.9878, + "odds_ratio_loss": 0.1360916644334793, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09741818159818649, + "rewards/margins": 0.971307098865509, + "rewards/rejected": -1.068725347518921, + "sft_loss": 0.9741819500923157, + "step": 7390 + }, + { + "epoch": 0.58, + "grad_norm": 22.333799362182617, + "learning_rate": 3.879678343945193e-06, + "logits/chosen": -1.153346300125122, + "logits/rejected": -1.1226739883422852, + "logps/chosen": -3.0002946853637695, + "logps/rejected": -3.063096284866333, + "loss": 3.1454, + "odds_ratio_loss": 1.450598955154419, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3000294864177704, + "rewards/margins": 0.006280136294662952, + "rewards/rejected": -0.30630961060523987, + "sft_loss": 3.0002946853637695, + "step": 7395 + }, + { + "epoch": 0.58, + "grad_norm": 7.445085048675537, + "learning_rate": 3.873678184359787e-06, + "logits/chosen": -1.3794811964035034, + "logits/rejected": -1.1986474990844727, + "logps/chosen": -0.830730140209198, + "logps/rejected": -10.583611488342285, + "loss": 0.8329, + "odds_ratio_loss": 0.021612081676721573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08307301253080368, + "rewards/margins": 0.9752882122993469, + "rewards/rejected": -1.058361291885376, + "sft_loss": 0.830730140209198, + "step": 7400 + }, + { + "epoch": 0.58, + "grad_norm": 17.45865249633789, + "learning_rate": 3.867679732984417e-06, + "logits/chosen": -1.4199925661087036, + "logits/rejected": -1.3598154783248901, + "logps/chosen": -1.0561368465423584, + "logps/rejected": -4.694639682769775, + "loss": 1.1353, + "odds_ratio_loss": 0.7915878295898438, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10561369359493256, + "rewards/margins": 0.36385026574134827, + "rewards/rejected": -0.469463974237442, + "sft_loss": 1.0561368465423584, + "step": 7405 + }, + { + "epoch": 0.58, + "grad_norm": 7.087828159332275, + "learning_rate": 3.861682998916495e-06, + "logits/chosen": -1.4668176174163818, + "logits/rejected": -1.0923296213150024, + "logps/chosen": -1.2441160678863525, + "logps/rejected": -5.291169166564941, + "loss": 1.2933, + "odds_ratio_loss": 0.4921974539756775, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12441160529851913, + "rewards/margins": 0.4047052264213562, + "rewards/rejected": -0.5291168689727783, + "sft_loss": 1.2441160678863525, + "step": 7410 + }, + { + "epoch": 0.58, + "grad_norm": 20.64565658569336, + "learning_rate": 3.855687991250833e-06, + "logits/chosen": -1.3258016109466553, + "logits/rejected": -0.9141836166381836, + "logps/chosen": -1.0131573677062988, + "logps/rejected": -4.818573951721191, + "loss": 1.0382, + "odds_ratio_loss": 0.250355064868927, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10131573677062988, + "rewards/margins": 0.38054168224334717, + "rewards/rejected": -0.48185744881629944, + "sft_loss": 1.0131573677062988, + "step": 7415 + }, + { + "epoch": 0.58, + "grad_norm": 31.21662712097168, + "learning_rate": 3.84969471907962e-06, + "logits/chosen": -1.4796231985092163, + "logits/rejected": -1.3987324237823486, + "logps/chosen": -1.1544537544250488, + "logps/rejected": -7.364099025726318, + "loss": 1.2128, + "odds_ratio_loss": 0.5836321115493774, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.11544537544250488, + "rewards/margins": 0.6209646463394165, + "rewards/rejected": -0.7364099621772766, + "sft_loss": 1.1544537544250488, + "step": 7420 + }, + { + "epoch": 0.58, + "grad_norm": 12.404463768005371, + "learning_rate": 3.843703191492412e-06, + "logits/chosen": -1.3646366596221924, + "logits/rejected": -0.7645974159240723, + "logps/chosen": -0.9818285703659058, + "logps/rejected": -2.8497519493103027, + "loss": 1.0267, + "odds_ratio_loss": 0.44823575019836426, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09818285703659058, + "rewards/margins": 0.18679234385490417, + "rewards/rejected": -0.28497520089149475, + "sft_loss": 0.9818285703659058, + "step": 7425 + }, + { + "epoch": 0.58, + "grad_norm": 10.508298873901367, + "learning_rate": 3.837713417576125e-06, + "logits/chosen": -1.3647921085357666, + "logits/rejected": -0.8251982927322388, + "logps/chosen": -1.1779931783676147, + "logps/rejected": -9.686933517456055, + "loss": 1.1988, + "odds_ratio_loss": 0.20841288566589355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11779932677745819, + "rewards/margins": 0.8508940935134888, + "rewards/rejected": -0.9686933755874634, + "sft_loss": 1.1779931783676147, + "step": 7430 + }, + { + "epoch": 0.58, + "grad_norm": 105.03116607666016, + "learning_rate": 3.831725406415011e-06, + "logits/chosen": -1.3835184574127197, + "logits/rejected": -1.4062840938568115, + "logps/chosen": -1.071958303451538, + "logps/rejected": -2.3874545097351074, + "loss": 1.1013, + "odds_ratio_loss": 0.29344767332077026, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10719583183526993, + "rewards/margins": 0.1315496265888214, + "rewards/rejected": -0.23874545097351074, + "sft_loss": 1.071958303451538, + "step": 7435 + }, + { + "epoch": 0.58, + "grad_norm": 13.58053207397461, + "learning_rate": 3.825739167090648e-06, + "logits/chosen": -1.4159271717071533, + "logits/rejected": -0.8211624026298523, + "logps/chosen": -1.19142484664917, + "logps/rejected": -3.6063523292541504, + "loss": 1.2506, + "odds_ratio_loss": 0.5918790102005005, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11914249509572983, + "rewards/margins": 0.24149274826049805, + "rewards/rejected": -0.36063528060913086, + "sft_loss": 1.19142484664917, + "step": 7440 + }, + { + "epoch": 0.58, + "grad_norm": 28.174692153930664, + "learning_rate": 3.819754708681925e-06, + "logits/chosen": -1.5469629764556885, + "logits/rejected": -1.4011662006378174, + "logps/chosen": -1.069461703300476, + "logps/rejected": -7.015917778015137, + "loss": 1.0704, + "odds_ratio_loss": 0.009362577460706234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1069461852312088, + "rewards/margins": 0.594645619392395, + "rewards/rejected": -0.7015917897224426, + "sft_loss": 1.069461703300476, + "step": 7445 + }, + { + "epoch": 0.58, + "grad_norm": 4.565928936004639, + "learning_rate": 3.813772040265039e-06, + "logits/chosen": -1.329349398612976, + "logits/rejected": -1.0876749753952026, + "logps/chosen": -0.9946697354316711, + "logps/rejected": -4.11696720123291, + "loss": 1.0389, + "odds_ratio_loss": 0.4418897032737732, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.099466972053051, + "rewards/margins": 0.31222978234291077, + "rewards/rejected": -0.41169673204421997, + "sft_loss": 0.9946697354316711, + "step": 7450 + }, + { + "epoch": 0.58, + "grad_norm": 16.70859146118164, + "learning_rate": 3.8077911709134625e-06, + "logits/chosen": -1.4293928146362305, + "logits/rejected": -1.2371962070465088, + "logps/chosen": -0.7824566960334778, + "logps/rejected": -3.111362934112549, + "loss": 0.7949, + "odds_ratio_loss": 0.12397966533899307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07824566960334778, + "rewards/margins": 0.23289060592651367, + "rewards/rejected": -0.31113627552986145, + "sft_loss": 0.7824566960334778, + "step": 7455 + }, + { + "epoch": 0.58, + "grad_norm": 5.1738200187683105, + "learning_rate": 3.8018121096979432e-06, + "logits/chosen": -1.3276052474975586, + "logits/rejected": -1.1583489179611206, + "logps/chosen": -0.8614734411239624, + "logps/rejected": -13.287282943725586, + "loss": 0.8701, + "odds_ratio_loss": 0.08607557415962219, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08614735305309296, + "rewards/margins": 1.2425811290740967, + "rewards/rejected": -1.328728437423706, + "sft_loss": 0.8614734411239624, + "step": 7460 + }, + { + "epoch": 0.58, + "grad_norm": 8.078058242797852, + "learning_rate": 3.7958348656864883e-06, + "logits/chosen": -1.433990240097046, + "logits/rejected": -0.9150580167770386, + "logps/chosen": -0.9640260934829712, + "logps/rejected": -3.0823981761932373, + "loss": 0.9896, + "odds_ratio_loss": 0.2554711699485779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0964026004076004, + "rewards/margins": 0.21183724701404572, + "rewards/rejected": -0.3082398474216461, + "sft_loss": 0.9640260934829712, + "step": 7465 + }, + { + "epoch": 0.58, + "grad_norm": 93.4781494140625, + "learning_rate": 3.7898594479443467e-06, + "logits/chosen": -1.2909369468688965, + "logits/rejected": -1.0158801078796387, + "logps/chosen": -1.3513132333755493, + "logps/rejected": -6.237979888916016, + "loss": 1.3717, + "odds_ratio_loss": 0.2035665214061737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1351313292980194, + "rewards/margins": 0.48866668343544006, + "rewards/rejected": -0.6237980127334595, + "sft_loss": 1.3513132333755493, + "step": 7470 + }, + { + "epoch": 0.58, + "grad_norm": 7.653111457824707, + "learning_rate": 3.7838858655339956e-06, + "logits/chosen": -1.4438180923461914, + "logits/rejected": -1.0234767198562622, + "logps/chosen": -0.9316795468330383, + "logps/rejected": -6.642031669616699, + "loss": 0.9507, + "odds_ratio_loss": 0.19040152430534363, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09316794574260712, + "rewards/margins": 0.5710352659225464, + "rewards/rejected": -0.6642031669616699, + "sft_loss": 0.9316795468330383, + "step": 7475 + }, + { + "epoch": 0.58, + "grad_norm": 24.714967727661133, + "learning_rate": 3.777914127515135e-06, + "logits/chosen": -1.0196001529693604, + "logits/rejected": -1.8837283849716187, + "logps/chosen": -1.2547192573547363, + "logps/rejected": -16.984920501708984, + "loss": 1.2547, + "odds_ratio_loss": 0.00013136306370142847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12547191977500916, + "rewards/margins": 1.573020339012146, + "rewards/rejected": -1.6984922885894775, + "sft_loss": 1.2547192573547363, + "step": 7480 + }, + { + "epoch": 0.58, + "grad_norm": 18.038217544555664, + "learning_rate": 3.7719442429446624e-06, + "logits/chosen": -1.3554340600967407, + "logits/rejected": -1.111193299293518, + "logps/chosen": -1.7390228509902954, + "logps/rejected": -7.009710788726807, + "loss": 1.7415, + "odds_ratio_loss": 0.024865852668881416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17390227317810059, + "rewards/margins": 0.5270687937736511, + "rewards/rejected": -0.7009710669517517, + "sft_loss": 1.7390228509902954, + "step": 7485 + }, + { + "epoch": 0.58, + "grad_norm": 6.1074676513671875, + "learning_rate": 3.7659762208766653e-06, + "logits/chosen": -1.3575494289398193, + "logits/rejected": -0.9257568120956421, + "logps/chosen": -1.1369520425796509, + "logps/rejected": -8.370031356811523, + "loss": 1.1644, + "odds_ratio_loss": 0.27419567108154297, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11369520425796509, + "rewards/margins": 0.7233079075813293, + "rewards/rejected": -0.8370031118392944, + "sft_loss": 1.1369520425796509, + "step": 7490 + }, + { + "epoch": 0.58, + "grad_norm": 17.318450927734375, + "learning_rate": 3.760010070362406e-06, + "logits/chosen": -1.277895212173462, + "logits/rejected": -1.1367418766021729, + "logps/chosen": -0.588483989238739, + "logps/rejected": -6.119777202606201, + "loss": 0.5932, + "odds_ratio_loss": 0.04762459173798561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05884840339422226, + "rewards/margins": 0.5531293749809265, + "rewards/rejected": -0.6119778156280518, + "sft_loss": 0.588483989238739, + "step": 7495 + }, + { + "epoch": 0.58, + "grad_norm": 9.198938369750977, + "learning_rate": 3.754045800450311e-06, + "logits/chosen": -1.3233236074447632, + "logits/rejected": -1.4493837356567383, + "logps/chosen": -1.0046014785766602, + "logps/rejected": -7.714364528656006, + "loss": 1.0113, + "odds_ratio_loss": 0.06727956980466843, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10046014934778214, + "rewards/margins": 0.6709764003753662, + "rewards/rejected": -0.7714365124702454, + "sft_loss": 1.0046014785766602, + "step": 7500 + }, + { + "epoch": 0.58, + "grad_norm": 9.631113052368164, + "learning_rate": 3.7480834201859527e-06, + "logits/chosen": -1.3423171043395996, + "logits/rejected": -0.9379371404647827, + "logps/chosen": -0.8321939706802368, + "logps/rejected": -4.562464237213135, + "loss": 0.8415, + "odds_ratio_loss": 0.09319966286420822, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08321939408779144, + "rewards/margins": 0.3730270266532898, + "rewards/rejected": -0.45624643564224243, + "sft_loss": 0.8321939706802368, + "step": 7505 + }, + { + "epoch": 0.58, + "grad_norm": 7.952220916748047, + "learning_rate": 3.7421229386120352e-06, + "logits/chosen": -1.4574992656707764, + "logits/rejected": -1.5705512762069702, + "logps/chosen": -1.4259287118911743, + "logps/rejected": -13.200403213500977, + "loss": 1.4282, + "odds_ratio_loss": 0.022520998492836952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1425928771495819, + "rewards/margins": 1.1774473190307617, + "rewards/rejected": -1.3200403451919556, + "sft_loss": 1.4259287118911743, + "step": 7510 + }, + { + "epoch": 0.58, + "grad_norm": 4.627284049987793, + "learning_rate": 3.7361643647683887e-06, + "logits/chosen": -1.4829736948013306, + "logits/rejected": -1.054227590560913, + "logps/chosen": -0.9570187330245972, + "logps/rejected": -5.006585121154785, + "loss": 0.9731, + "odds_ratio_loss": 0.16131040453910828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09570188075304031, + "rewards/margins": 0.4049566388130188, + "rewards/rejected": -0.5006585121154785, + "sft_loss": 0.9570187330245972, + "step": 7515 + }, + { + "epoch": 0.58, + "grad_norm": 13.184547424316406, + "learning_rate": 3.7302077076919463e-06, + "logits/chosen": -1.3098251819610596, + "logits/rejected": -1.133541226387024, + "logps/chosen": -1.29055917263031, + "logps/rejected": -8.865058898925781, + "loss": 1.3587, + "odds_ratio_loss": 0.6810811758041382, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.129055917263031, + "rewards/margins": 0.7574499845504761, + "rewards/rejected": -0.8865059018135071, + "sft_loss": 1.29055917263031, + "step": 7520 + }, + { + "epoch": 0.59, + "grad_norm": 20.698753356933594, + "learning_rate": 3.7242529764167336e-06, + "logits/chosen": -1.2861082553863525, + "logits/rejected": -1.3361252546310425, + "logps/chosen": -1.0132089853286743, + "logps/rejected": -2.560932159423828, + "loss": 1.0499, + "odds_ratio_loss": 0.36650410294532776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10132090002298355, + "rewards/margins": 0.1547723114490509, + "rewards/rejected": -0.25609320402145386, + "sft_loss": 1.0132089853286743, + "step": 7525 + }, + { + "epoch": 0.59, + "grad_norm": 20.249622344970703, + "learning_rate": 3.7183001799738583e-06, + "logits/chosen": -1.3709007501602173, + "logits/rejected": -1.1397546529769897, + "logps/chosen": -0.8171932101249695, + "logps/rejected": -14.227322578430176, + "loss": 0.8206, + "odds_ratio_loss": 0.03364390507340431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08171931654214859, + "rewards/margins": 1.341012954711914, + "rewards/rejected": -1.4227323532104492, + "sft_loss": 0.8171932101249695, + "step": 7530 + }, + { + "epoch": 0.59, + "grad_norm": 11.157368659973145, + "learning_rate": 3.7123493273914913e-06, + "logits/chosen": -1.4680503606796265, + "logits/rejected": -1.0822898149490356, + "logps/chosen": -0.9170717000961304, + "logps/rejected": -6.117377758026123, + "loss": 0.9235, + "odds_ratio_loss": 0.0642508715391159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09170717746019363, + "rewards/margins": 0.520030677318573, + "rewards/rejected": -0.611737847328186, + "sft_loss": 0.9170717000961304, + "step": 7535 + }, + { + "epoch": 0.59, + "grad_norm": 21.787038803100586, + "learning_rate": 3.706400427694853e-06, + "logits/chosen": -1.509522795677185, + "logits/rejected": -1.2455151081085205, + "logps/chosen": -0.729500949382782, + "logps/rejected": -4.698983669281006, + "loss": 0.7769, + "odds_ratio_loss": 0.47362834215164185, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0729500949382782, + "rewards/margins": 0.39694833755493164, + "rewards/rejected": -0.4698984622955322, + "sft_loss": 0.729500949382782, + "step": 7540 + }, + { + "epoch": 0.59, + "grad_norm": 7.119065284729004, + "learning_rate": 3.70045348990621e-06, + "logits/chosen": -1.234168291091919, + "logits/rejected": -1.2230815887451172, + "logps/chosen": -0.9674831628799438, + "logps/rejected": -6.17057991027832, + "loss": 0.9699, + "odds_ratio_loss": 0.023831719532608986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09674831479787827, + "rewards/margins": 0.5203096866607666, + "rewards/rejected": -0.6170579791069031, + "sft_loss": 0.9674831628799438, + "step": 7545 + }, + { + "epoch": 0.59, + "grad_norm": 5.420483112335205, + "learning_rate": 3.694508523044847e-06, + "logits/chosen": -1.41762375831604, + "logits/rejected": -1.1230498552322388, + "logps/chosen": -0.95631343126297, + "logps/rejected": -2.7387397289276123, + "loss": 0.9797, + "odds_ratio_loss": 0.23361878097057343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09563133865594864, + "rewards/margins": 0.17824262380599976, + "rewards/rejected": -0.2738739848136902, + "sft_loss": 0.95631343126297, + "step": 7550 + }, + { + "epoch": 0.59, + "grad_norm": 16.290616989135742, + "learning_rate": 3.68856553612706e-06, + "logits/chosen": -1.213830590248108, + "logits/rejected": -0.8929702043533325, + "logps/chosen": -0.9933937191963196, + "logps/rejected": -7.666130065917969, + "loss": 1.0055, + "odds_ratio_loss": 0.12060995399951935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09933938086032867, + "rewards/margins": 0.6672737002372742, + "rewards/rejected": -0.7666130661964417, + "sft_loss": 0.9933937191963196, + "step": 7555 + }, + { + "epoch": 0.59, + "grad_norm": 14.791046142578125, + "learning_rate": 3.682624538166143e-06, + "logits/chosen": -1.2562824487686157, + "logits/rejected": -1.0913159847259521, + "logps/chosen": -1.080479383468628, + "logps/rejected": -4.734437465667725, + "loss": 1.0864, + "odds_ratio_loss": 0.059390682727098465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10804794728755951, + "rewards/margins": 0.36539584398269653, + "rewards/rejected": -0.47344380617141724, + "sft_loss": 1.080479383468628, + "step": 7560 + }, + { + "epoch": 0.59, + "grad_norm": 367.5003356933594, + "learning_rate": 3.6766855381723756e-06, + "logits/chosen": -1.426210641860962, + "logits/rejected": -1.2520123720169067, + "logps/chosen": -1.1438326835632324, + "logps/rejected": -3.119784116744995, + "loss": 1.1587, + "odds_ratio_loss": 0.1487005054950714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1143832802772522, + "rewards/margins": 0.19759513437747955, + "rewards/rejected": -0.31197839975357056, + "sft_loss": 1.1438326835632324, + "step": 7565 + }, + { + "epoch": 0.59, + "grad_norm": 16.09073257446289, + "learning_rate": 3.6707485451530035e-06, + "logits/chosen": -1.3527429103851318, + "logits/rejected": -0.7134038209915161, + "logps/chosen": -0.8533148765563965, + "logps/rejected": -2.4193506240844727, + "loss": 0.87, + "odds_ratio_loss": 0.16703931987285614, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.085331492125988, + "rewards/margins": 0.1566035896539688, + "rewards/rejected": -0.24193505942821503, + "sft_loss": 0.8533148765563965, + "step": 7570 + }, + { + "epoch": 0.59, + "grad_norm": 5.578355312347412, + "learning_rate": 3.66481356811223e-06, + "logits/chosen": -1.459567666053772, + "logits/rejected": -0.7954663038253784, + "logps/chosen": -0.8611226081848145, + "logps/rejected": -5.201925754547119, + "loss": 0.8792, + "odds_ratio_loss": 0.18051791191101074, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08611226081848145, + "rewards/margins": 0.4340802729129791, + "rewards/rejected": -0.520192563533783, + "sft_loss": 0.8611226081848145, + "step": 7575 + }, + { + "epoch": 0.59, + "grad_norm": 21.39191436767578, + "learning_rate": 3.658880616051204e-06, + "logits/chosen": -1.3536994457244873, + "logits/rejected": -1.4201124906539917, + "logps/chosen": -1.0324753522872925, + "logps/rejected": -2.731003522872925, + "loss": 1.0648, + "odds_ratio_loss": 0.32371786236763, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10324753820896149, + "rewards/margins": 0.16985280811786652, + "rewards/rejected": -0.273100346326828, + "sft_loss": 1.0324753522872925, + "step": 7580 + }, + { + "epoch": 0.59, + "grad_norm": 4.845981121063232, + "learning_rate": 3.652949697967998e-06, + "logits/chosen": -1.3896377086639404, + "logits/rejected": -0.6858119368553162, + "logps/chosen": -0.9702582359313965, + "logps/rejected": -11.030991554260254, + "loss": 0.993, + "odds_ratio_loss": 0.22788207232952118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09702581912279129, + "rewards/margins": 1.0060734748840332, + "rewards/rejected": -1.1030992269515991, + "sft_loss": 0.9702582359313965, + "step": 7585 + }, + { + "epoch": 0.59, + "grad_norm": 8.864776611328125, + "learning_rate": 3.6470208228576017e-06, + "logits/chosen": -1.3436756134033203, + "logits/rejected": -1.6786384582519531, + "logps/chosen": -0.9208037257194519, + "logps/rejected": -13.882339477539062, + "loss": 0.9308, + "odds_ratio_loss": 0.09978379309177399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09208037704229355, + "rewards/margins": 1.2961535453796387, + "rewards/rejected": -1.3882339000701904, + "sft_loss": 0.9208037257194519, + "step": 7590 + }, + { + "epoch": 0.59, + "grad_norm": 9.687556266784668, + "learning_rate": 3.6410939997119097e-06, + "logits/chosen": -1.390998125076294, + "logits/rejected": -1.4484277963638306, + "logps/chosen": -1.0292510986328125, + "logps/rejected": -4.382474422454834, + "loss": 1.0741, + "odds_ratio_loss": 0.44898781180381775, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10292510688304901, + "rewards/margins": 0.3353223204612732, + "rewards/rejected": -0.4382474422454834, + "sft_loss": 1.0292510986328125, + "step": 7595 + }, + { + "epoch": 0.59, + "grad_norm": 39.966758728027344, + "learning_rate": 3.6351692375197018e-06, + "logits/chosen": -1.3556313514709473, + "logits/rejected": -1.0924326181411743, + "logps/chosen": -1.3269069194793701, + "logps/rejected": -8.789861679077148, + "loss": 1.3478, + "odds_ratio_loss": 0.2092055082321167, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1326906979084015, + "rewards/margins": 0.7462955713272095, + "rewards/rejected": -0.8789861798286438, + "sft_loss": 1.3269069194793701, + "step": 7600 + }, + { + "epoch": 0.59, + "grad_norm": 14.269389152526855, + "learning_rate": 3.629246545266629e-06, + "logits/chosen": -1.324946641921997, + "logits/rejected": -1.335137128829956, + "logps/chosen": -1.1242374181747437, + "logps/rejected": -7.613337516784668, + "loss": 1.1367, + "odds_ratio_loss": 0.1243775486946106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11242374032735825, + "rewards/margins": 0.6489100456237793, + "rewards/rejected": -0.7613337635993958, + "sft_loss": 1.1242374181747437, + "step": 7605 + }, + { + "epoch": 0.59, + "grad_norm": 7.829433917999268, + "learning_rate": 3.6233259319352098e-06, + "logits/chosen": -1.3544420003890991, + "logits/rejected": -1.10740327835083, + "logps/chosen": -1.101051926612854, + "logps/rejected": -2.454871416091919, + "loss": 1.1316, + "odds_ratio_loss": 0.30578336119651794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11010519415140152, + "rewards/margins": 0.13538196682929993, + "rewards/rejected": -0.24548716843128204, + "sft_loss": 1.101051926612854, + "step": 7610 + }, + { + "epoch": 0.59, + "grad_norm": 108.93167114257812, + "learning_rate": 3.6174074065048035e-06, + "logits/chosen": -1.2685353755950928, + "logits/rejected": -1.3089756965637207, + "logps/chosen": -0.8494631052017212, + "logps/rejected": -5.920241832733154, + "loss": 0.8577, + "odds_ratio_loss": 0.08219350874423981, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08494630455970764, + "rewards/margins": 0.5070779323577881, + "rewards/rejected": -0.5920242071151733, + "sft_loss": 0.8494631052017212, + "step": 7615 + }, + { + "epoch": 0.59, + "grad_norm": 24.65989875793457, + "learning_rate": 3.611490977951606e-06, + "logits/chosen": -1.4084243774414062, + "logits/rejected": -1.1269807815551758, + "logps/chosen": -1.041349172592163, + "logps/rejected": -4.547548294067383, + "loss": 1.0507, + "odds_ratio_loss": 0.09329565614461899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10413491725921631, + "rewards/margins": 0.35061994194984436, + "rewards/rejected": -0.45475488901138306, + "sft_loss": 1.041349172592163, + "step": 7620 + }, + { + "epoch": 0.59, + "grad_norm": 6.330051898956299, + "learning_rate": 3.6055766552486304e-06, + "logits/chosen": -1.4215527772903442, + "logits/rejected": -1.018003225326538, + "logps/chosen": -1.4540941715240479, + "logps/rejected": -3.3146910667419434, + "loss": 1.4778, + "odds_ratio_loss": 0.23683035373687744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14540942013263702, + "rewards/margins": 0.18605968356132507, + "rewards/rejected": -0.3314690589904785, + "sft_loss": 1.4540941715240479, + "step": 7625 + }, + { + "epoch": 0.59, + "grad_norm": 10.327467918395996, + "learning_rate": 3.5996644473657026e-06, + "logits/chosen": -1.3508819341659546, + "logits/rejected": -1.580324411392212, + "logps/chosen": -0.6978863477706909, + "logps/rejected": -7.110561370849609, + "loss": 0.7048, + "odds_ratio_loss": 0.06865421682596207, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06978863477706909, + "rewards/margins": 0.6412675380706787, + "rewards/rejected": -0.7110562324523926, + "sft_loss": 0.6978863477706909, + "step": 7630 + }, + { + "epoch": 0.59, + "grad_norm": 29.400272369384766, + "learning_rate": 3.593754363269434e-06, + "logits/chosen": -1.2756131887435913, + "logits/rejected": -1.0476934909820557, + "logps/chosen": -0.9183648824691772, + "logps/rejected": -1.5128974914550781, + "loss": 0.9769, + "odds_ratio_loss": 0.5853749513626099, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09183648973703384, + "rewards/margins": 0.05945326015353203, + "rewards/rejected": -0.15128974616527557, + "sft_loss": 0.9183648824691772, + "step": 7635 + }, + { + "epoch": 0.59, + "grad_norm": 17.112735748291016, + "learning_rate": 3.587846411923215e-06, + "logits/chosen": -1.3515738248825073, + "logits/rejected": -1.3550692796707153, + "logps/chosen": -1.1381123065948486, + "logps/rejected": -5.934088230133057, + "loss": 1.1593, + "odds_ratio_loss": 0.21174244582653046, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11381123960018158, + "rewards/margins": 0.47959762811660767, + "rewards/rejected": -0.5934088230133057, + "sft_loss": 1.1381123065948486, + "step": 7640 + }, + { + "epoch": 0.59, + "grad_norm": 15.542469024658203, + "learning_rate": 3.581940602287208e-06, + "logits/chosen": -1.2181787490844727, + "logits/rejected": -1.3325717449188232, + "logps/chosen": -0.8493123054504395, + "logps/rejected": -5.115633487701416, + "loss": 0.8657, + "odds_ratio_loss": 0.16385197639465332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08493123948574066, + "rewards/margins": 0.4266320765018463, + "rewards/rejected": -0.5115633606910706, + "sft_loss": 0.8493123054504395, + "step": 7645 + }, + { + "epoch": 0.6, + "grad_norm": 6.590150356292725, + "learning_rate": 3.576036943318322e-06, + "logits/chosen": -1.3251354694366455, + "logits/rejected": -0.8361200094223022, + "logps/chosen": -1.0244470834732056, + "logps/rejected": -3.8385987281799316, + "loss": 1.0434, + "odds_ratio_loss": 0.1891406774520874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10244472324848175, + "rewards/margins": 0.2814151644706726, + "rewards/rejected": -0.38385987281799316, + "sft_loss": 1.0244470834732056, + "step": 7650 + }, + { + "epoch": 0.6, + "grad_norm": 21.922080993652344, + "learning_rate": 3.570135443970203e-06, + "logits/chosen": -1.3503806591033936, + "logits/rejected": -1.3192524909973145, + "logps/chosen": -0.9770252108573914, + "logps/rejected": -4.227506160736084, + "loss": 0.9929, + "odds_ratio_loss": 0.15905950963497162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0977025181055069, + "rewards/margins": 0.3250480592250824, + "rewards/rejected": -0.4227506220340729, + "sft_loss": 0.9770252108573914, + "step": 7655 + }, + { + "epoch": 0.6, + "grad_norm": 11.335700988769531, + "learning_rate": 3.5642361131932274e-06, + "logits/chosen": -1.413520097732544, + "logits/rejected": -1.1838710308074951, + "logps/chosen": -1.6299006938934326, + "logps/rejected": -4.449075222015381, + "loss": 1.7312, + "odds_ratio_loss": 1.0133378505706787, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16299009323120117, + "rewards/margins": 0.2819174826145172, + "rewards/rejected": -0.4449075162410736, + "sft_loss": 1.6299006938934326, + "step": 7660 + }, + { + "epoch": 0.6, + "grad_norm": 33.036643981933594, + "learning_rate": 3.5583389599344775e-06, + "logits/chosen": -1.4875004291534424, + "logits/rejected": -1.414227843284607, + "logps/chosen": -0.8248567581176758, + "logps/rejected": -17.03693199157715, + "loss": 0.8278, + "odds_ratio_loss": 0.029040660709142685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08248567581176758, + "rewards/margins": 1.6212074756622314, + "rewards/rejected": -1.703693151473999, + "sft_loss": 0.8248567581176758, + "step": 7665 + }, + { + "epoch": 0.6, + "grad_norm": 21.289030075073242, + "learning_rate": 3.552443993137735e-06, + "logits/chosen": -1.4467661380767822, + "logits/rejected": -0.9300572276115417, + "logps/chosen": -1.0593516826629639, + "logps/rejected": -2.2387285232543945, + "loss": 1.0911, + "odds_ratio_loss": 0.31711629033088684, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10593517869710922, + "rewards/margins": 0.11793768405914307, + "rewards/rejected": -0.2238728553056717, + "sft_loss": 1.0593516826629639, + "step": 7670 + }, + { + "epoch": 0.6, + "grad_norm": 4.732230186462402, + "learning_rate": 3.5465512217434663e-06, + "logits/chosen": -1.2538764476776123, + "logits/rejected": -1.163688063621521, + "logps/chosen": -1.445475697517395, + "logps/rejected": -8.06837272644043, + "loss": 1.4526, + "odds_ratio_loss": 0.07167679071426392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14454758167266846, + "rewards/margins": 0.6622897386550903, + "rewards/rejected": -0.8068373799324036, + "sft_loss": 1.445475697517395, + "step": 7675 + }, + { + "epoch": 0.6, + "grad_norm": 5.590085506439209, + "learning_rate": 3.5406606546888072e-06, + "logits/chosen": -1.4303420782089233, + "logits/rejected": -1.1193432807922363, + "logps/chosen": -1.1342713832855225, + "logps/rejected": -5.6936726570129395, + "loss": 1.1835, + "odds_ratio_loss": 0.49192532896995544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11342713981866837, + "rewards/margins": 0.4559400975704193, + "rewards/rejected": -0.5693672895431519, + "sft_loss": 1.1342713832855225, + "step": 7680 + }, + { + "epoch": 0.6, + "grad_norm": 5.165656089782715, + "learning_rate": 3.5347723009075496e-06, + "logits/chosen": -1.3710882663726807, + "logits/rejected": -1.0793540477752686, + "logps/chosen": -0.9359593391418457, + "logps/rejected": -7.214070796966553, + "loss": 0.9424, + "odds_ratio_loss": 0.06422661244869232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09359593689441681, + "rewards/margins": 0.627811074256897, + "rewards/rejected": -0.7214070558547974, + "sft_loss": 0.9359593391418457, + "step": 7685 + }, + { + "epoch": 0.6, + "grad_norm": 12.285545349121094, + "learning_rate": 3.52888616933013e-06, + "logits/chosen": -1.3175373077392578, + "logits/rejected": -1.4724985361099243, + "logps/chosen": -1.6350700855255127, + "logps/rejected": -7.500203609466553, + "loss": 1.6442, + "odds_ratio_loss": 0.09161634743213654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16350701451301575, + "rewards/margins": 0.586513340473175, + "rewards/rejected": -0.7500203251838684, + "sft_loss": 1.6350700855255127, + "step": 7690 + }, + { + "epoch": 0.6, + "grad_norm": 6.217746257781982, + "learning_rate": 3.523002268883615e-06, + "logits/chosen": -1.2274795770645142, + "logits/rejected": -1.316551923751831, + "logps/chosen": -0.7977105975151062, + "logps/rejected": -9.286924362182617, + "loss": 0.816, + "odds_ratio_loss": 0.1833195835351944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07977106422185898, + "rewards/margins": 0.8489214181900024, + "rewards/rejected": -0.9286924600601196, + "sft_loss": 0.7977105975151062, + "step": 7695 + }, + { + "epoch": 0.6, + "grad_norm": 9.02701187133789, + "learning_rate": 3.5171206084916865e-06, + "logits/chosen": -1.4551115036010742, + "logits/rejected": -0.822216808795929, + "logps/chosen": -1.205783486366272, + "logps/rejected": -16.62922477722168, + "loss": 1.206, + "odds_ratio_loss": 0.002027118345722556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1205783486366272, + "rewards/margins": 1.542344331741333, + "rewards/rejected": -1.6629226207733154, + "sft_loss": 1.205783486366272, + "step": 7700 + }, + { + "epoch": 0.6, + "grad_norm": 25.063343048095703, + "learning_rate": 3.5112411970746263e-06, + "logits/chosen": -1.3958866596221924, + "logits/rejected": -0.9426444172859192, + "logps/chosen": -0.8656539916992188, + "logps/rejected": -8.09262466430664, + "loss": 0.8748, + "odds_ratio_loss": 0.0913071259856224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08656540513038635, + "rewards/margins": 0.7226970791816711, + "rewards/rejected": -0.8092624545097351, + "sft_loss": 0.8656539916992188, + "step": 7705 + }, + { + "epoch": 0.6, + "grad_norm": 10.038653373718262, + "learning_rate": 3.5053640435493136e-06, + "logits/chosen": -1.1906545162200928, + "logits/rejected": -1.2683629989624023, + "logps/chosen": -0.6818944215774536, + "logps/rejected": -5.234747886657715, + "loss": 0.7035, + "odds_ratio_loss": 0.21561065316200256, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06818944960832596, + "rewards/margins": 0.45528537034988403, + "rewards/rejected": -0.5234748125076294, + "sft_loss": 0.6818944215774536, + "step": 7710 + }, + { + "epoch": 0.6, + "grad_norm": 6.408538818359375, + "learning_rate": 3.4994891568291955e-06, + "logits/chosen": -1.3831173181533813, + "logits/rejected": -0.8486045002937317, + "logps/chosen": -0.8704649209976196, + "logps/rejected": -2.675227403640747, + "loss": 0.9098, + "odds_ratio_loss": 0.39366015791893005, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08704648911952972, + "rewards/margins": 0.18047623336315155, + "rewards/rejected": -0.26752275228500366, + "sft_loss": 0.8704649209976196, + "step": 7715 + }, + { + "epoch": 0.6, + "grad_norm": 58.75336456298828, + "learning_rate": 3.4936165458242817e-06, + "logits/chosen": -1.4805707931518555, + "logits/rejected": -1.1970739364624023, + "logps/chosen": -0.9983084797859192, + "logps/rejected": -4.621636867523193, + "loss": 1.0295, + "odds_ratio_loss": 0.31158146262168884, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09983084350824356, + "rewards/margins": 0.36233288049697876, + "rewards/rejected": -0.4621637463569641, + "sft_loss": 0.9983084797859192, + "step": 7720 + }, + { + "epoch": 0.6, + "grad_norm": 4.402565956115723, + "learning_rate": 3.487746219441135e-06, + "logits/chosen": -1.3285706043243408, + "logits/rejected": -0.7722166180610657, + "logps/chosen": -1.158427357673645, + "logps/rejected": -6.5065107345581055, + "loss": 1.1778, + "odds_ratio_loss": 0.19363968074321747, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11584273725748062, + "rewards/margins": 0.534808337688446, + "rewards/rejected": -0.6506510376930237, + "sft_loss": 1.158427357673645, + "step": 7725 + }, + { + "epoch": 0.6, + "grad_norm": 13.3212308883667, + "learning_rate": 3.48187818658285e-06, + "logits/chosen": -1.465038537979126, + "logits/rejected": -0.6518747806549072, + "logps/chosen": -1.0956566333770752, + "logps/rejected": -5.187521934509277, + "loss": 1.1181, + "odds_ratio_loss": 0.2242608368396759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10956566035747528, + "rewards/margins": 0.4091865122318268, + "rewards/rejected": -0.5187522172927856, + "sft_loss": 1.0956566333770752, + "step": 7730 + }, + { + "epoch": 0.6, + "grad_norm": 4.018584251403809, + "learning_rate": 3.476012456149043e-06, + "logits/chosen": -1.2446355819702148, + "logits/rejected": -1.1161531209945679, + "logps/chosen": -0.7302519083023071, + "logps/rejected": -2.7706727981567383, + "loss": 0.7551, + "odds_ratio_loss": 0.2483852356672287, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0730251893401146, + "rewards/margins": 0.20404212176799774, + "rewards/rejected": -0.27706727385520935, + "sft_loss": 0.7302519083023071, + "step": 7735 + }, + { + "epoch": 0.6, + "grad_norm": 4.515651702880859, + "learning_rate": 3.4701490370358375e-06, + "logits/chosen": -1.3448576927185059, + "logits/rejected": -1.0877039432525635, + "logps/chosen": -1.065352201461792, + "logps/rejected": -7.31790828704834, + "loss": 1.0974, + "odds_ratio_loss": 0.3203319013118744, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10653523355722427, + "rewards/margins": 0.6252555847167969, + "rewards/rejected": -0.7317909002304077, + "sft_loss": 1.065352201461792, + "step": 7740 + }, + { + "epoch": 0.6, + "grad_norm": 7.58742094039917, + "learning_rate": 3.464287938135857e-06, + "logits/chosen": -1.4285423755645752, + "logits/rejected": -0.9537609815597534, + "logps/chosen": -0.7986385226249695, + "logps/rejected": -4.994203090667725, + "loss": 0.8187, + "odds_ratio_loss": 0.2002413272857666, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07986384630203247, + "rewards/margins": 0.4195564389228821, + "rewards/rejected": -0.49942031502723694, + "sft_loss": 0.7986385226249695, + "step": 7745 + }, + { + "epoch": 0.6, + "grad_norm": 15.741750717163086, + "learning_rate": 3.4584291683382e-06, + "logits/chosen": -1.4255239963531494, + "logits/rejected": -0.8259667158126831, + "logps/chosen": -0.9038424491882324, + "logps/rejected": -9.146833419799805, + "loss": 0.9213, + "odds_ratio_loss": 0.17470580339431763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09038425981998444, + "rewards/margins": 0.824299156665802, + "rewards/rejected": -0.91468346118927, + "sft_loss": 0.9038424491882324, + "step": 7750 + }, + { + "epoch": 0.6, + "grad_norm": 39.928260803222656, + "learning_rate": 3.452572736528433e-06, + "logits/chosen": -1.365174651145935, + "logits/rejected": -1.3115659952163696, + "logps/chosen": -1.3342235088348389, + "logps/rejected": -2.6797633171081543, + "loss": 1.3923, + "odds_ratio_loss": 0.5805569887161255, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1334223449230194, + "rewards/margins": 0.13455398380756378, + "rewards/rejected": -0.2679763436317444, + "sft_loss": 1.3342235088348389, + "step": 7755 + }, + { + "epoch": 0.6, + "grad_norm": 7.70234489440918, + "learning_rate": 3.4467186515885816e-06, + "logits/chosen": -1.4198806285858154, + "logits/rejected": -0.9018239974975586, + "logps/chosen": -1.1378166675567627, + "logps/rejected": -9.194976806640625, + "loss": 1.1409, + "odds_ratio_loss": 0.03051009215414524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1137816533446312, + "rewards/margins": 0.8057159185409546, + "rewards/rejected": -0.9194976687431335, + "sft_loss": 1.1378166675567627, + "step": 7760 + }, + { + "epoch": 0.6, + "grad_norm": 8.322214126586914, + "learning_rate": 3.440866922397107e-06, + "logits/chosen": -1.3290245532989502, + "logits/rejected": -1.185469150543213, + "logps/chosen": -1.0787431001663208, + "logps/rejected": -3.751833438873291, + "loss": 1.1054, + "odds_ratio_loss": 0.266160786151886, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1078743115067482, + "rewards/margins": 0.2673090696334839, + "rewards/rejected": -0.3751833736896515, + "sft_loss": 1.0787431001663208, + "step": 7765 + }, + { + "epoch": 0.6, + "grad_norm": 22.85993003845215, + "learning_rate": 3.435017557828898e-06, + "logits/chosen": -1.079116702079773, + "logits/rejected": -0.8420137166976929, + "logps/chosen": -1.093390941619873, + "logps/rejected": -5.883313179016113, + "loss": 1.1123, + "odds_ratio_loss": 0.18955589830875397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10933909565210342, + "rewards/margins": 0.4789922833442688, + "rewards/rejected": -0.5883313417434692, + "sft_loss": 1.093390941619873, + "step": 7770 + }, + { + "epoch": 0.6, + "grad_norm": 3.7217321395874023, + "learning_rate": 3.4291705667552623e-06, + "logits/chosen": -1.2628605365753174, + "logits/rejected": -0.8282767534255981, + "logps/chosen": -0.9780300259590149, + "logps/rejected": -7.870736122131348, + "loss": 0.9806, + "odds_ratio_loss": 0.026035413146018982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0978030115365982, + "rewards/margins": 0.689270555973053, + "rewards/rejected": -0.78707355260849, + "sft_loss": 0.9780300259590149, + "step": 7775 + }, + { + "epoch": 0.61, + "grad_norm": 22.497316360473633, + "learning_rate": 3.423325958043903e-06, + "logits/chosen": -0.9871518015861511, + "logits/rejected": -1.1091338396072388, + "logps/chosen": -1.0703319311141968, + "logps/rejected": -2.1713783740997314, + "loss": 1.107, + "odds_ratio_loss": 0.3669655919075012, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10703320801258087, + "rewards/margins": 0.11010462045669556, + "rewards/rejected": -0.21713781356811523, + "sft_loss": 1.0703319311141968, + "step": 7780 + }, + { + "epoch": 0.61, + "grad_norm": 6.523499011993408, + "learning_rate": 3.417483740558909e-06, + "logits/chosen": -1.3533470630645752, + "logits/rejected": -0.9151613116264343, + "logps/chosen": -0.9925488233566284, + "logps/rejected": -6.206988334655762, + "loss": 0.9971, + "odds_ratio_loss": 0.045540980994701385, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09925489127635956, + "rewards/margins": 0.5214439630508423, + "rewards/rejected": -0.6206988096237183, + "sft_loss": 0.9925488233566284, + "step": 7785 + }, + { + "epoch": 0.61, + "grad_norm": 5.865688800811768, + "learning_rate": 3.411643923160748e-06, + "logits/chosen": -1.4720577001571655, + "logits/rejected": -1.1578564643859863, + "logps/chosen": -0.9088132977485657, + "logps/rejected": -3.383127212524414, + "loss": 0.9215, + "odds_ratio_loss": 0.12700459361076355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09088132530450821, + "rewards/margins": 0.24743135273456573, + "rewards/rejected": -0.33831268548965454, + "sft_loss": 0.9088132977485657, + "step": 7790 + }, + { + "epoch": 0.61, + "grad_norm": 5.861291885375977, + "learning_rate": 3.4058065147062423e-06, + "logits/chosen": -1.3404837846755981, + "logits/rejected": -0.8289557695388794, + "logps/chosen": -1.0182613134384155, + "logps/rejected": -7.828372955322266, + "loss": 1.0499, + "odds_ratio_loss": 0.3160038888454437, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10182613134384155, + "rewards/margins": 0.6810111403465271, + "rewards/rejected": -0.7828372716903687, + "sft_loss": 1.0182613134384155, + "step": 7795 + }, + { + "epoch": 0.61, + "grad_norm": 12.210633277893066, + "learning_rate": 3.3999715240485643e-06, + "logits/chosen": -1.4205188751220703, + "logits/rejected": -1.0572283267974854, + "logps/chosen": -0.8838884234428406, + "logps/rejected": -20.02041244506836, + "loss": 0.8892, + "odds_ratio_loss": 0.05316939204931259, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0883888453245163, + "rewards/margins": 1.9136524200439453, + "rewards/rejected": -2.0020413398742676, + "sft_loss": 0.8838884234428406, + "step": 7800 + }, + { + "epoch": 0.61, + "grad_norm": 4.870990753173828, + "learning_rate": 3.3941389600372166e-06, + "logits/chosen": -1.3752845525741577, + "logits/rejected": -1.2575510740280151, + "logps/chosen": -0.6893249750137329, + "logps/rejected": -13.15601634979248, + "loss": 0.6917, + "odds_ratio_loss": 0.024077700451016426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06893249601125717, + "rewards/margins": 1.2466691732406616, + "rewards/rejected": -1.3156015872955322, + "sft_loss": 0.6893249750137329, + "step": 7805 + }, + { + "epoch": 0.61, + "grad_norm": 27.354835510253906, + "learning_rate": 3.3883088315180252e-06, + "logits/chosen": -1.4232900142669678, + "logits/rejected": -0.8507736325263977, + "logps/chosen": -1.111433506011963, + "logps/rejected": -5.32407283782959, + "loss": 1.1536, + "odds_ratio_loss": 0.4220332205295563, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11114335060119629, + "rewards/margins": 0.4212639331817627, + "rewards/rejected": -0.532407283782959, + "sft_loss": 1.111433506011963, + "step": 7810 + }, + { + "epoch": 0.61, + "grad_norm": 228.3631134033203, + "learning_rate": 3.3824811473331187e-06, + "logits/chosen": -1.4606685638427734, + "logits/rejected": -1.24955153465271, + "logps/chosen": -1.389299988746643, + "logps/rejected": -14.165156364440918, + "loss": 1.3971, + "odds_ratio_loss": 0.07790975272655487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13892999291419983, + "rewards/margins": 1.2775856256484985, + "rewards/rejected": -1.4165157079696655, + "sft_loss": 1.389299988746643, + "step": 7815 + }, + { + "epoch": 0.61, + "grad_norm": 23.588632583618164, + "learning_rate": 3.3766559163209187e-06, + "logits/chosen": -1.5015602111816406, + "logits/rejected": -1.1102159023284912, + "logps/chosen": -0.867430567741394, + "logps/rejected": -4.193625450134277, + "loss": 0.9145, + "odds_ratio_loss": 0.47047194838523865, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08674306422472, + "rewards/margins": 0.33261948823928833, + "rewards/rejected": -0.41936248540878296, + "sft_loss": 0.867430567741394, + "step": 7820 + }, + { + "epoch": 0.61, + "grad_norm": 6.6012983322143555, + "learning_rate": 3.3708331473161314e-06, + "logits/chosen": -1.3114466667175293, + "logits/rejected": -1.0500717163085938, + "logps/chosen": -1.4407585859298706, + "logps/rejected": -3.9645659923553467, + "loss": 1.4913, + "odds_ratio_loss": 0.5049613118171692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14407587051391602, + "rewards/margins": 0.25238072872161865, + "rewards/rejected": -0.39645659923553467, + "sft_loss": 1.4407585859298706, + "step": 7825 + }, + { + "epoch": 0.61, + "grad_norm": 14.379029273986816, + "learning_rate": 3.3650128491497235e-06, + "logits/chosen": -1.4079885482788086, + "logits/rejected": -1.43578040599823, + "logps/chosen": -1.104499101638794, + "logps/rejected": -11.191978454589844, + "loss": 1.1123, + "odds_ratio_loss": 0.07822314649820328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1104498952627182, + "rewards/margins": 1.008747935295105, + "rewards/rejected": -1.1191978454589844, + "sft_loss": 1.104499101638794, + "step": 7830 + }, + { + "epoch": 0.61, + "grad_norm": 71.34578704833984, + "learning_rate": 3.3591950306489144e-06, + "logits/chosen": -1.327294945716858, + "logits/rejected": -1.5994123220443726, + "logps/chosen": -0.9668186902999878, + "logps/rejected": -12.173527717590332, + "loss": 0.9724, + "odds_ratio_loss": 0.055557817220687866, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0966818779706955, + "rewards/margins": 1.1206709146499634, + "rewards/rejected": -1.2173527479171753, + "sft_loss": 0.9668186902999878, + "step": 7835 + }, + { + "epoch": 0.61, + "grad_norm": 157.09326171875, + "learning_rate": 3.353379700637167e-06, + "logits/chosen": -1.5097315311431885, + "logits/rejected": -1.1535099744796753, + "logps/chosen": -1.4491995573043823, + "logps/rejected": -6.186688423156738, + "loss": 1.4735, + "odds_ratio_loss": 0.2425946295261383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14491994678974152, + "rewards/margins": 0.47374892234802246, + "rewards/rejected": -0.6186689138412476, + "sft_loss": 1.4491995573043823, + "step": 7840 + }, + { + "epoch": 0.61, + "grad_norm": 7.054977893829346, + "learning_rate": 3.3475668679341678e-06, + "logits/chosen": -1.1684037446975708, + "logits/rejected": -1.116753101348877, + "logps/chosen": -1.122152328491211, + "logps/rejected": -12.81079387664795, + "loss": 1.1305, + "odds_ratio_loss": 0.08336828649044037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11221524327993393, + "rewards/margins": 1.1688642501831055, + "rewards/rejected": -1.2810795307159424, + "sft_loss": 1.122152328491211, + "step": 7845 + }, + { + "epoch": 0.61, + "grad_norm": 52.86894226074219, + "learning_rate": 3.341756541355811e-06, + "logits/chosen": -1.309032678604126, + "logits/rejected": -1.1107581853866577, + "logps/chosen": -1.2603918313980103, + "logps/rejected": -10.970057487487793, + "loss": 1.2608, + "odds_ratio_loss": 0.003665325464680791, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12603919208049774, + "rewards/margins": 0.970966637134552, + "rewards/rejected": -1.0970057249069214, + "sft_loss": 1.2603918313980103, + "step": 7850 + }, + { + "epoch": 0.61, + "grad_norm": 231.02935791015625, + "learning_rate": 3.3359487297142014e-06, + "logits/chosen": -1.4242833852767944, + "logits/rejected": -0.9572874903678894, + "logps/chosen": -1.0314557552337646, + "logps/rejected": -6.515725612640381, + "loss": 1.0588, + "odds_ratio_loss": 0.27349480986595154, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10314557701349258, + "rewards/margins": 0.5484269857406616, + "rewards/rejected": -0.6515725255012512, + "sft_loss": 1.0314557552337646, + "step": 7855 + }, + { + "epoch": 0.61, + "grad_norm": 12.31454849243164, + "learning_rate": 3.330143441817618e-06, + "logits/chosen": -1.397132158279419, + "logits/rejected": -1.1553051471710205, + "logps/chosen": -1.918043851852417, + "logps/rejected": -10.284468650817871, + "loss": 2.0299, + "odds_ratio_loss": 1.1188890933990479, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19180439412593842, + "rewards/margins": 0.8366424441337585, + "rewards/rejected": -1.028446912765503, + "sft_loss": 1.918043851852417, + "step": 7860 + }, + { + "epoch": 0.61, + "grad_norm": 36.400726318359375, + "learning_rate": 3.3243406864705193e-06, + "logits/chosen": -1.3096789121627808, + "logits/rejected": -1.1991100311279297, + "logps/chosen": -0.822433590888977, + "logps/rejected": -8.101971626281738, + "loss": 0.8241, + "odds_ratio_loss": 0.01621662639081478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08224336802959442, + "rewards/margins": 0.7279537916183472, + "rewards/rejected": -0.8101971745491028, + "sft_loss": 0.822433590888977, + "step": 7865 + }, + { + "epoch": 0.61, + "grad_norm": 48.41091537475586, + "learning_rate": 3.318540472473518e-06, + "logits/chosen": -1.3551379442214966, + "logits/rejected": -1.2822260856628418, + "logps/chosen": -1.39948308467865, + "logps/rejected": -5.699338912963867, + "loss": 1.4311, + "odds_ratio_loss": 0.31617268919944763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13994832336902618, + "rewards/margins": 0.42998558282852173, + "rewards/rejected": -0.5699338912963867, + "sft_loss": 1.39948308467865, + "step": 7870 + }, + { + "epoch": 0.61, + "grad_norm": 6.471083164215088, + "learning_rate": 3.312742808623378e-06, + "logits/chosen": -1.265817403793335, + "logits/rejected": -0.9677835702896118, + "logps/chosen": -0.7799075841903687, + "logps/rejected": -8.898427963256836, + "loss": 0.8006, + "odds_ratio_loss": 0.20699651539325714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07799076288938522, + "rewards/margins": 0.8118520975112915, + "rewards/rejected": -0.8898428678512573, + "sft_loss": 0.7799075841903687, + "step": 7875 + }, + { + "epoch": 0.61, + "grad_norm": 16.40986442565918, + "learning_rate": 3.306947703712991e-06, + "logits/chosen": -1.280167579650879, + "logits/rejected": -0.9758197665214539, + "logps/chosen": -0.8813843727111816, + "logps/rejected": -6.394897937774658, + "loss": 0.8844, + "odds_ratio_loss": 0.030038166791200638, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08813843131065369, + "rewards/margins": 0.5513514280319214, + "rewards/rejected": -0.6394897699356079, + "sft_loss": 0.8813843727111816, + "step": 7880 + }, + { + "epoch": 0.61, + "grad_norm": 15.605619430541992, + "learning_rate": 3.301155166531368e-06, + "logits/chosen": -1.4267349243164062, + "logits/rejected": -1.3646270036697388, + "logps/chosen": -1.072645664215088, + "logps/rejected": -7.3083038330078125, + "loss": 1.1466, + "odds_ratio_loss": 0.7396418452262878, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10726456344127655, + "rewards/margins": 0.6235657930374146, + "rewards/rejected": -0.7308304309844971, + "sft_loss": 1.072645664215088, + "step": 7885 + }, + { + "epoch": 0.61, + "grad_norm": 7.9199981689453125, + "learning_rate": 3.29536520586363e-06, + "logits/chosen": -1.3843305110931396, + "logits/rejected": -0.7178108096122742, + "logps/chosen": -1.484226107597351, + "logps/rejected": -17.763320922851562, + "loss": 1.4988, + "odds_ratio_loss": 0.14559972286224365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14842261373996735, + "rewards/margins": 1.6279094219207764, + "rewards/rejected": -1.776332139968872, + "sft_loss": 1.484226107597351, + "step": 7890 + }, + { + "epoch": 0.61, + "grad_norm": 115.85680389404297, + "learning_rate": 3.2895778304909865e-06, + "logits/chosen": -1.3865851163864136, + "logits/rejected": -0.9108538627624512, + "logps/chosen": -0.9720360040664673, + "logps/rejected": -9.35875415802002, + "loss": 0.9808, + "odds_ratio_loss": 0.08749934285879135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09720361232757568, + "rewards/margins": 0.8386718034744263, + "rewards/rejected": -0.935875415802002, + "sft_loss": 0.9720360040664673, + "step": 7895 + }, + { + "epoch": 0.61, + "grad_norm": 6.711180686950684, + "learning_rate": 3.2837930491907255e-06, + "logits/chosen": -1.41152024269104, + "logits/rejected": -0.9695557355880737, + "logps/chosen": -1.0238697528839111, + "logps/rejected": -5.494019508361816, + "loss": 1.044, + "odds_ratio_loss": 0.20168164372444153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.102386973798275, + "rewards/margins": 0.4470149874687195, + "rewards/rejected": -0.5494019389152527, + "sft_loss": 1.0238697528839111, + "step": 7900 + }, + { + "epoch": 0.61, + "grad_norm": 16.122793197631836, + "learning_rate": 3.278010870736205e-06, + "logits/chosen": -1.319061279296875, + "logits/rejected": -1.18667733669281, + "logps/chosen": -1.2115809917449951, + "logps/rejected": -2.368157386779785, + "loss": 1.2463, + "odds_ratio_loss": 0.34768372774124146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12115810066461563, + "rewards/margins": 0.11565764993429184, + "rewards/rejected": -0.23681576550006866, + "sft_loss": 1.2115809917449951, + "step": 7905 + }, + { + "epoch": 0.62, + "grad_norm": 10.517181396484375, + "learning_rate": 3.2722313038968312e-06, + "logits/chosen": -1.5105429887771606, + "logits/rejected": -1.256507396697998, + "logps/chosen": -0.960302472114563, + "logps/rejected": -5.137580871582031, + "loss": 0.9908, + "odds_ratio_loss": 0.30541473627090454, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09603025019168854, + "rewards/margins": 0.4177277684211731, + "rewards/rejected": -0.5137580633163452, + "sft_loss": 0.960302472114563, + "step": 7910 + }, + { + "epoch": 0.62, + "grad_norm": 6.5809502601623535, + "learning_rate": 3.2664543574380493e-06, + "logits/chosen": -1.3719203472137451, + "logits/rejected": -1.3545509576797485, + "logps/chosen": -0.741712212562561, + "logps/rejected": -5.936234474182129, + "loss": 0.7698, + "odds_ratio_loss": 0.2805303633213043, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07417122274637222, + "rewards/margins": 0.5194522142410278, + "rewards/rejected": -0.5936234593391418, + "sft_loss": 0.741712212562561, + "step": 7915 + }, + { + "epoch": 0.62, + "grad_norm": 5.230719566345215, + "learning_rate": 3.260680040121336e-06, + "logits/chosen": -1.4700971841812134, + "logits/rejected": -0.9396473169326782, + "logps/chosen": -1.037864327430725, + "logps/rejected": -2.747180461883545, + "loss": 1.0811, + "odds_ratio_loss": 0.43252936005592346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10378643125295639, + "rewards/margins": 0.1709316372871399, + "rewards/rejected": -0.2747180759906769, + "sft_loss": 1.037864327430725, + "step": 7920 + }, + { + "epoch": 0.62, + "grad_norm": 5.8517866134643555, + "learning_rate": 3.2549083607041743e-06, + "logits/chosen": -1.2298557758331299, + "logits/rejected": -1.1291000843048096, + "logps/chosen": -0.8646572232246399, + "logps/rejected": -5.618631839752197, + "loss": 0.9033, + "odds_ratio_loss": 0.38687339425086975, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08646572381258011, + "rewards/margins": 0.4753974378108978, + "rewards/rejected": -0.561863124370575, + "sft_loss": 0.8646572232246399, + "step": 7925 + }, + { + "epoch": 0.62, + "grad_norm": 214.61062622070312, + "learning_rate": 3.249139327940049e-06, + "logits/chosen": -1.2199280261993408, + "logits/rejected": -1.067989706993103, + "logps/chosen": -1.4785853624343872, + "logps/rejected": -3.4969024658203125, + "loss": 1.5057, + "odds_ratio_loss": 0.2707682251930237, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14785853028297424, + "rewards/margins": 0.20183169841766357, + "rewards/rejected": -0.3496902585029602, + "sft_loss": 1.4785853624343872, + "step": 7930 + }, + { + "epoch": 0.62, + "grad_norm": 8.33104133605957, + "learning_rate": 3.2433729505784283e-06, + "logits/chosen": -1.3628246784210205, + "logits/rejected": -1.022838830947876, + "logps/chosen": -0.7507215738296509, + "logps/rejected": -3.201002597808838, + "loss": 0.7839, + "odds_ratio_loss": 0.33168500661849976, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07507215440273285, + "rewards/margins": 0.24502809345722198, + "rewards/rejected": -0.32010024785995483, + "sft_loss": 0.7507215738296509, + "step": 7935 + }, + { + "epoch": 0.62, + "grad_norm": 5.3491363525390625, + "learning_rate": 3.2376092373647604e-06, + "logits/chosen": -1.3530322313308716, + "logits/rejected": -0.5111430287361145, + "logps/chosen": -0.979842483997345, + "logps/rejected": -7.742257595062256, + "loss": 1.0023, + "odds_ratio_loss": 0.2243957221508026, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09798424690961838, + "rewards/margins": 0.6762415766716003, + "rewards/rejected": -0.7742258310317993, + "sft_loss": 0.979842483997345, + "step": 7940 + }, + { + "epoch": 0.62, + "grad_norm": 16.427143096923828, + "learning_rate": 3.231848197040446e-06, + "logits/chosen": -1.4268476963043213, + "logits/rejected": -1.4312551021575928, + "logps/chosen": -0.7136311531066895, + "logps/rejected": -4.48763370513916, + "loss": 0.7225, + "odds_ratio_loss": 0.08858003467321396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07136311382055283, + "rewards/margins": 0.377400279045105, + "rewards/rejected": -0.4487634301185608, + "sft_loss": 0.7136311531066895, + "step": 7945 + }, + { + "epoch": 0.62, + "grad_norm": 10.100354194641113, + "learning_rate": 3.226089838342833e-06, + "logits/chosen": -1.4485676288604736, + "logits/rejected": -1.1894410848617554, + "logps/chosen": -0.9481072425842285, + "logps/rejected": -9.236673355102539, + "loss": 0.9591, + "odds_ratio_loss": 0.11030948162078857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09481072425842285, + "rewards/margins": 0.8288565874099731, + "rewards/rejected": -0.923667311668396, + "sft_loss": 0.9481072425842285, + "step": 7950 + }, + { + "epoch": 0.62, + "grad_norm": 30.110383987426758, + "learning_rate": 3.220334170005206e-06, + "logits/chosen": -1.4325348138809204, + "logits/rejected": -1.4311264753341675, + "logps/chosen": -1.306010127067566, + "logps/rejected": -3.6128711700439453, + "loss": 1.3562, + "odds_ratio_loss": 0.5021435022354126, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13060101866722107, + "rewards/margins": 0.23068614304065704, + "rewards/rejected": -0.3612871766090393, + "sft_loss": 1.306010127067566, + "step": 7955 + }, + { + "epoch": 0.62, + "grad_norm": 7.615264892578125, + "learning_rate": 3.214581200756765e-06, + "logits/chosen": -1.4307386875152588, + "logits/rejected": -1.1426076889038086, + "logps/chosen": -0.6602781414985657, + "logps/rejected": -5.599646091461182, + "loss": 0.6888, + "odds_ratio_loss": 0.2852723300457001, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06602781265974045, + "rewards/margins": 0.49393683671951294, + "rewards/rejected": -0.559964656829834, + "sft_loss": 0.6602781414985657, + "step": 7960 + }, + { + "epoch": 0.62, + "grad_norm": 8.364228248596191, + "learning_rate": 3.208830939322617e-06, + "logits/chosen": -1.3908792734146118, + "logits/rejected": -1.1232960224151611, + "logps/chosen": -0.7094835042953491, + "logps/rejected": -10.671346664428711, + "loss": 0.7219, + "odds_ratio_loss": 0.1239551529288292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07094834744930267, + "rewards/margins": 0.996186375617981, + "rewards/rejected": -1.0671346187591553, + "sft_loss": 0.7094835042953491, + "step": 7965 + }, + { + "epoch": 0.62, + "grad_norm": 21.444847106933594, + "learning_rate": 3.203083394423766e-06, + "logits/chosen": -1.3634107112884521, + "logits/rejected": -1.0703684091567993, + "logps/chosen": -1.527782678604126, + "logps/rejected": -2.9710288047790527, + "loss": 1.5714, + "odds_ratio_loss": 0.43602442741394043, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1527782678604126, + "rewards/margins": 0.1443246304988861, + "rewards/rejected": -0.2971028685569763, + "sft_loss": 1.527782678604126, + "step": 7970 + }, + { + "epoch": 0.62, + "grad_norm": 9.083678245544434, + "learning_rate": 3.197338574777094e-06, + "logits/chosen": -1.4616883993148804, + "logits/rejected": -0.8809460401535034, + "logps/chosen": -1.1917169094085693, + "logps/rejected": -8.423823356628418, + "loss": 1.1971, + "odds_ratio_loss": 0.05426154285669327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11917171627283096, + "rewards/margins": 0.7232107520103455, + "rewards/rejected": -0.8423824310302734, + "sft_loss": 1.1917169094085693, + "step": 7975 + }, + { + "epoch": 0.62, + "grad_norm": 5.379354953765869, + "learning_rate": 3.191596489095348e-06, + "logits/chosen": -1.2065356969833374, + "logits/rejected": -1.0963044166564941, + "logps/chosen": -1.290281057357788, + "logps/rejected": -7.7128400802612305, + "loss": 1.3069, + "odds_ratio_loss": 0.16593563556671143, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1290281116962433, + "rewards/margins": 0.6422559022903442, + "rewards/rejected": -0.7712840437889099, + "sft_loss": 1.290281057357788, + "step": 7980 + }, + { + "epoch": 0.62, + "grad_norm": 12.42475414276123, + "learning_rate": 3.1858571460871284e-06, + "logits/chosen": -1.3918287754058838, + "logits/rejected": -1.3030837774276733, + "logps/chosen": -0.8681005239486694, + "logps/rejected": -8.609440803527832, + "loss": 0.8756, + "odds_ratio_loss": 0.07548637688159943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08681005239486694, + "rewards/margins": 0.7741340398788452, + "rewards/rejected": -0.8609440922737122, + "sft_loss": 0.8681005239486694, + "step": 7985 + }, + { + "epoch": 0.62, + "grad_norm": 15.7763032913208, + "learning_rate": 3.1801205544568816e-06, + "logits/chosen": -1.2245934009552002, + "logits/rejected": -1.5365649461746216, + "logps/chosen": -1.184861421585083, + "logps/rejected": -6.469397068023682, + "loss": 1.2113, + "odds_ratio_loss": 0.26478278636932373, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11848614364862442, + "rewards/margins": 0.528453528881073, + "rewards/rejected": -0.646939754486084, + "sft_loss": 1.184861421585083, + "step": 7990 + }, + { + "epoch": 0.62, + "grad_norm": 5.70435094833374, + "learning_rate": 3.1743867229048734e-06, + "logits/chosen": -1.4829033613204956, + "logits/rejected": -0.8051961660385132, + "logps/chosen": -1.914612054824829, + "logps/rejected": -3.3208343982696533, + "loss": 2.0397, + "odds_ratio_loss": 1.2508898973464966, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1914612054824829, + "rewards/margins": 0.14062225818634033, + "rewards/rejected": -0.33208346366882324, + "sft_loss": 1.914612054824829, + "step": 7995 + }, + { + "epoch": 0.62, + "grad_norm": 8.394847869873047, + "learning_rate": 3.168655660127188e-06, + "logits/chosen": -1.3032631874084473, + "logits/rejected": -1.1171185970306396, + "logps/chosen": -1.410510778427124, + "logps/rejected": -6.261443138122559, + "loss": 1.4635, + "odds_ratio_loss": 0.5297115445137024, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14105108380317688, + "rewards/margins": 0.4850931763648987, + "rewards/rejected": -0.6261442303657532, + "sft_loss": 1.410510778427124, + "step": 8000 + }, + { + "epoch": 0.62, + "grad_norm": 8.586276054382324, + "learning_rate": 3.162927374815712e-06, + "logits/chosen": -1.455913782119751, + "logits/rejected": -1.1645771265029907, + "logps/chosen": -1.0394628047943115, + "logps/rejected": -3.195157766342163, + "loss": 1.0581, + "odds_ratio_loss": 0.1859007179737091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10394628345966339, + "rewards/margins": 0.21556949615478516, + "rewards/rejected": -0.31951576471328735, + "sft_loss": 1.0394628047943115, + "step": 8005 + }, + { + "epoch": 0.62, + "grad_norm": 14.099467277526855, + "learning_rate": 3.157201875658116e-06, + "logits/chosen": -1.3971182107925415, + "logits/rejected": -0.8789178133010864, + "logps/chosen": -1.0584465265274048, + "logps/rejected": -5.310263156890869, + "loss": 1.0896, + "odds_ratio_loss": 0.31155240535736084, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1058446541428566, + "rewards/margins": 0.42518168687820435, + "rewards/rejected": -0.531026303768158, + "sft_loss": 1.0584465265274048, + "step": 8010 + }, + { + "epoch": 0.62, + "grad_norm": 72.75445556640625, + "learning_rate": 3.1514791713378443e-06, + "logits/chosen": -1.4388090372085571, + "logits/rejected": -0.8740331530570984, + "logps/chosen": -0.9836138486862183, + "logps/rejected": -3.6888020038604736, + "loss": 1.0094, + "odds_ratio_loss": 0.2580524981021881, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09836138039827347, + "rewards/margins": 0.27051883935928345, + "rewards/rejected": -0.3688802123069763, + "sft_loss": 0.9836138486862183, + "step": 8015 + }, + { + "epoch": 0.62, + "grad_norm": 9.547337532043457, + "learning_rate": 3.1457592705341088e-06, + "logits/chosen": -1.4236394166946411, + "logits/rejected": -0.8941730260848999, + "logps/chosen": -1.2104017734527588, + "logps/rejected": -4.095586776733398, + "loss": 1.2312, + "odds_ratio_loss": 0.20825231075286865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12104018032550812, + "rewards/margins": 0.2885185182094574, + "rewards/rejected": -0.4095586836338043, + "sft_loss": 1.2104017734527588, + "step": 8020 + }, + { + "epoch": 0.62, + "grad_norm": 5.413477420806885, + "learning_rate": 3.140042181921863e-06, + "logits/chosen": -1.3875473737716675, + "logits/rejected": -1.175796389579773, + "logps/chosen": -1.0537807941436768, + "logps/rejected": -10.514541625976562, + "loss": 1.0618, + "odds_ratio_loss": 0.08046818524599075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10537807643413544, + "rewards/margins": 0.9460762143135071, + "rewards/rejected": -1.0514543056488037, + "sft_loss": 1.0537807941436768, + "step": 8025 + }, + { + "epoch": 0.62, + "grad_norm": 57.867000579833984, + "learning_rate": 3.1343279141717957e-06, + "logits/chosen": -1.2222039699554443, + "logits/rejected": -1.3033952713012695, + "logps/chosen": -1.596881628036499, + "logps/rejected": -6.566547393798828, + "loss": 1.6783, + "odds_ratio_loss": 0.8144750595092773, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.15968818962574005, + "rewards/margins": 0.4969666004180908, + "rewards/rejected": -0.6566547155380249, + "sft_loss": 1.596881628036499, + "step": 8030 + }, + { + "epoch": 0.63, + "grad_norm": 9.617599487304688, + "learning_rate": 3.1286164759503245e-06, + "logits/chosen": -1.3700895309448242, + "logits/rejected": -1.3072712421417236, + "logps/chosen": -1.059555172920227, + "logps/rejected": -6.857283115386963, + "loss": 1.0932, + "odds_ratio_loss": 0.3364062011241913, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10595552623271942, + "rewards/margins": 0.5797727704048157, + "rewards/rejected": -0.6857283711433411, + "sft_loss": 1.059555172920227, + "step": 8035 + }, + { + "epoch": 0.63, + "grad_norm": 9.33051586151123, + "learning_rate": 3.122907875919567e-06, + "logits/chosen": -1.3103891611099243, + "logits/rejected": -1.573803186416626, + "logps/chosen": -1.0470023155212402, + "logps/rejected": -11.741785049438477, + "loss": 1.0516, + "odds_ratio_loss": 0.04594879597425461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1047002449631691, + "rewards/margins": 1.0694782733917236, + "rewards/rejected": -1.1741784811019897, + "sft_loss": 1.0470023155212402, + "step": 8040 + }, + { + "epoch": 0.63, + "grad_norm": 8.176512718200684, + "learning_rate": 3.11720212273734e-06, + "logits/chosen": -1.3815648555755615, + "logits/rejected": -0.7566605806350708, + "logps/chosen": -1.0081384181976318, + "logps/rejected": -7.439121246337891, + "loss": 1.0207, + "odds_ratio_loss": 0.125940203666687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1008138507604599, + "rewards/margins": 0.6430982351303101, + "rewards/rejected": -0.7439121007919312, + "sft_loss": 1.0081384181976318, + "step": 8045 + }, + { + "epoch": 0.63, + "grad_norm": 52.099769592285156, + "learning_rate": 3.1114992250571415e-06, + "logits/chosen": -1.2227838039398193, + "logits/rejected": -0.7810730934143066, + "logps/chosen": -1.0308719873428345, + "logps/rejected": -8.28508186340332, + "loss": 1.0421, + "odds_ratio_loss": 0.11267662048339844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10308720171451569, + "rewards/margins": 0.7254210114479065, + "rewards/rejected": -0.828508198261261, + "sft_loss": 1.0308719873428345, + "step": 8050 + }, + { + "epoch": 0.63, + "grad_norm": 10.428442001342773, + "learning_rate": 3.105799191528144e-06, + "logits/chosen": -1.2264387607574463, + "logits/rejected": -1.103314995765686, + "logps/chosen": -0.7794146537780762, + "logps/rejected": -5.476459980010986, + "loss": 0.8488, + "odds_ratio_loss": 0.6937298774719238, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07794146984815598, + "rewards/margins": 0.4697045385837555, + "rewards/rejected": -0.5476459264755249, + "sft_loss": 0.7794146537780762, + "step": 8055 + }, + { + "epoch": 0.63, + "grad_norm": 14.65079116821289, + "learning_rate": 3.1001020307951684e-06, + "logits/chosen": -1.411501169204712, + "logits/rejected": -1.2931959629058838, + "logps/chosen": -1.372267484664917, + "logps/rejected": -9.434979438781738, + "loss": 1.381, + "odds_ratio_loss": 0.08757570385932922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13722673058509827, + "rewards/margins": 0.8062711954116821, + "rewards/rejected": -0.9434979557991028, + "sft_loss": 1.372267484664917, + "step": 8060 + }, + { + "epoch": 0.63, + "grad_norm": 8.443821907043457, + "learning_rate": 3.0944077514986837e-06, + "logits/chosen": -1.2568533420562744, + "logits/rejected": -1.489222526550293, + "logps/chosen": -0.9871482849121094, + "logps/rejected": -9.15410327911377, + "loss": 0.9907, + "odds_ratio_loss": 0.0351276621222496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09871482849121094, + "rewards/margins": 0.8166955709457397, + "rewards/rejected": -0.9154103994369507, + "sft_loss": 0.9871482849121094, + "step": 8065 + }, + { + "epoch": 0.63, + "grad_norm": 6.821056842803955, + "learning_rate": 3.0887163622747873e-06, + "logits/chosen": -1.3466415405273438, + "logits/rejected": -1.1124385595321655, + "logps/chosen": -1.423811674118042, + "logps/rejected": -5.253532886505127, + "loss": 1.4455, + "odds_ratio_loss": 0.21701247990131378, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14238117635250092, + "rewards/margins": 0.3829721510410309, + "rewards/rejected": -0.5253533124923706, + "sft_loss": 1.423811674118042, + "step": 8070 + }, + { + "epoch": 0.63, + "grad_norm": 12.706870079040527, + "learning_rate": 3.083027871755194e-06, + "logits/chosen": -1.3704214096069336, + "logits/rejected": -0.7175769805908203, + "logps/chosen": -1.1178873777389526, + "logps/rejected": -6.501991271972656, + "loss": 1.1327, + "odds_ratio_loss": 0.14839690923690796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11178873479366302, + "rewards/margins": 0.5384103059768677, + "rewards/rejected": -0.6501990556716919, + "sft_loss": 1.1178873777389526, + "step": 8075 + }, + { + "epoch": 0.63, + "grad_norm": 15.04248332977295, + "learning_rate": 3.07734228856722e-06, + "logits/chosen": -1.2834182977676392, + "logits/rejected": -1.0528788566589355, + "logps/chosen": -1.3128502368927002, + "logps/rejected": -3.2048892974853516, + "loss": 1.3444, + "odds_ratio_loss": 0.31525570154190063, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13128502666950226, + "rewards/margins": 0.1892039030790329, + "rewards/rejected": -0.32048892974853516, + "sft_loss": 1.3128502368927002, + "step": 8080 + }, + { + "epoch": 0.63, + "grad_norm": 5.70639181137085, + "learning_rate": 3.071659621333777e-06, + "logits/chosen": -1.3128175735473633, + "logits/rejected": -0.9505087733268738, + "logps/chosen": -1.1137568950653076, + "logps/rejected": -4.190314292907715, + "loss": 1.147, + "odds_ratio_loss": 0.33274489641189575, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11137568950653076, + "rewards/margins": 0.3076557219028473, + "rewards/rejected": -0.41903138160705566, + "sft_loss": 1.1137568950653076, + "step": 8085 + }, + { + "epoch": 0.63, + "grad_norm": 6.7080607414245605, + "learning_rate": 3.0659798786733497e-06, + "logits/chosen": -1.3358865976333618, + "logits/rejected": -1.1260464191436768, + "logps/chosen": -0.951252281665802, + "logps/rejected": -3.221665143966675, + "loss": 0.9724, + "odds_ratio_loss": 0.21111583709716797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0951252207159996, + "rewards/margins": 0.22704128921031952, + "rewards/rejected": -0.3221665322780609, + "sft_loss": 0.951252281665802, + "step": 8090 + }, + { + "epoch": 0.63, + "grad_norm": 30.165071487426758, + "learning_rate": 3.0603030691999885e-06, + "logits/chosen": -1.3660657405853271, + "logits/rejected": -1.3970402479171753, + "logps/chosen": -0.8544301986694336, + "logps/rejected": -3.3595690727233887, + "loss": 0.8623, + "odds_ratio_loss": 0.07849614322185516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08544301986694336, + "rewards/margins": 0.2505139112472534, + "rewards/rejected": -0.3359569311141968, + "sft_loss": 0.8544301986694336, + "step": 8095 + }, + { + "epoch": 0.63, + "grad_norm": 7.015862464904785, + "learning_rate": 3.054629201523297e-06, + "logits/chosen": -1.2655341625213623, + "logits/rejected": -0.8376556634902954, + "logps/chosen": -1.1147531270980835, + "logps/rejected": -4.9825758934021, + "loss": 1.1226, + "odds_ratio_loss": 0.07891669124364853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11147532612085342, + "rewards/margins": 0.3867822587490082, + "rewards/rejected": -0.498257577419281, + "sft_loss": 1.1147531270980835, + "step": 8100 + }, + { + "epoch": 0.63, + "grad_norm": 97.99761962890625, + "learning_rate": 3.0489582842484155e-06, + "logits/chosen": -1.1555439233779907, + "logits/rejected": -0.9075528383255005, + "logps/chosen": -1.1690194606781006, + "logps/rejected": -5.805201530456543, + "loss": 1.1894, + "odds_ratio_loss": 0.20332148671150208, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1169019490480423, + "rewards/margins": 0.46361827850341797, + "rewards/rejected": -0.5805202126502991, + "sft_loss": 1.1690194606781006, + "step": 8105 + }, + { + "epoch": 0.63, + "grad_norm": 7.906203746795654, + "learning_rate": 3.0432903259760103e-06, + "logits/chosen": -1.3602170944213867, + "logits/rejected": -0.9214040637016296, + "logps/chosen": -1.0084471702575684, + "logps/rejected": -8.591350555419922, + "loss": 1.0197, + "odds_ratio_loss": 0.11236198246479034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10084471851587296, + "rewards/margins": 0.7582904100418091, + "rewards/rejected": -0.8591351509094238, + "sft_loss": 1.0084471702575684, + "step": 8110 + }, + { + "epoch": 0.63, + "grad_norm": 115.37716674804688, + "learning_rate": 3.0376253353022565e-06, + "logits/chosen": -1.4454014301300049, + "logits/rejected": -1.1584488153457642, + "logps/chosen": -1.3010876178741455, + "logps/rejected": -4.5600905418396, + "loss": 1.3204, + "odds_ratio_loss": 0.19311176240444183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1301087737083435, + "rewards/margins": 0.3259003162384033, + "rewards/rejected": -0.4560090899467468, + "sft_loss": 1.3010876178741455, + "step": 8115 + }, + { + "epoch": 0.63, + "grad_norm": 266.7450256347656, + "learning_rate": 3.031963320818837e-06, + "logits/chosen": -1.3959786891937256, + "logits/rejected": -1.0264484882354736, + "logps/chosen": -1.1273565292358398, + "logps/rejected": -7.558053016662598, + "loss": 1.1292, + "odds_ratio_loss": 0.018850315362215042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11273566633462906, + "rewards/margins": 0.6430696249008179, + "rewards/rejected": -0.7558053135871887, + "sft_loss": 1.1273565292358398, + "step": 8120 + }, + { + "epoch": 0.63, + "grad_norm": 8.022828102111816, + "learning_rate": 3.026304291112914e-06, + "logits/chosen": -1.48981773853302, + "logits/rejected": -1.5464773178100586, + "logps/chosen": -0.7510837316513062, + "logps/rejected": -8.108277320861816, + "loss": 0.7574, + "odds_ratio_loss": 0.06287384778261185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07510837912559509, + "rewards/margins": 0.7357193827629089, + "rewards/rejected": -0.8108277320861816, + "sft_loss": 0.7510837316513062, + "step": 8125 + }, + { + "epoch": 0.63, + "grad_norm": 174.53338623046875, + "learning_rate": 3.020648254767121e-06, + "logits/chosen": -1.2474849224090576, + "logits/rejected": -1.1400511264801025, + "logps/chosen": -1.2461121082305908, + "logps/rejected": -13.192431449890137, + "loss": 1.2663, + "odds_ratio_loss": 0.20185616612434387, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12461121380329132, + "rewards/margins": 1.1946319341659546, + "rewards/rejected": -1.3192431926727295, + "sft_loss": 1.2461121082305908, + "step": 8130 + }, + { + "epoch": 0.63, + "grad_norm": 15.472785949707031, + "learning_rate": 3.01499522035956e-06, + "logits/chosen": -1.3132785558700562, + "logits/rejected": -1.2090857028961182, + "logps/chosen": -1.164563775062561, + "logps/rejected": -8.720190048217773, + "loss": 1.1775, + "odds_ratio_loss": 0.12893368303775787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11645637452602386, + "rewards/margins": 0.7555626630783081, + "rewards/rejected": -0.8720189929008484, + "sft_loss": 1.164563775062561, + "step": 8135 + }, + { + "epoch": 0.63, + "grad_norm": 11.70803165435791, + "learning_rate": 3.009345196463773e-06, + "logits/chosen": -1.291359782218933, + "logits/rejected": -0.9156149625778198, + "logps/chosen": -0.9766289591789246, + "logps/rejected": -5.966506004333496, + "loss": 0.9985, + "odds_ratio_loss": 0.2186693698167801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09766290336847305, + "rewards/margins": 0.49898767471313477, + "rewards/rejected": -0.5966506004333496, + "sft_loss": 0.9766289591789246, + "step": 8140 + }, + { + "epoch": 0.63, + "grad_norm": 8.548707962036133, + "learning_rate": 3.0036981916487366e-06, + "logits/chosen": -1.440680742263794, + "logits/rejected": -1.3260940313339233, + "logps/chosen": -0.929384708404541, + "logps/rejected": -15.503623962402344, + "loss": 0.935, + "odds_ratio_loss": 0.055864494293928146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09293847531080246, + "rewards/margins": 1.4574239253997803, + "rewards/rejected": -1.550362467765808, + "sft_loss": 0.929384708404541, + "step": 8145 + }, + { + "epoch": 0.63, + "grad_norm": 12.262228012084961, + "learning_rate": 2.9980542144788564e-06, + "logits/chosen": -1.4146068096160889, + "logits/rejected": -0.9239387512207031, + "logps/chosen": -0.9395908117294312, + "logps/rejected": -4.714138984680176, + "loss": 0.9445, + "odds_ratio_loss": 0.04877752065658569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09395908564329147, + "rewards/margins": 0.37745481729507446, + "rewards/rejected": -0.4714139401912689, + "sft_loss": 0.9395908117294312, + "step": 8150 + }, + { + "epoch": 0.63, + "grad_norm": 23.141952514648438, + "learning_rate": 2.9924132735139357e-06, + "logits/chosen": -1.4843209981918335, + "logits/rejected": -1.102158784866333, + "logps/chosen": -0.9500603675842285, + "logps/rejected": -4.377799987792969, + "loss": 0.9585, + "odds_ratio_loss": 0.08394896239042282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09500603377819061, + "rewards/margins": 0.342773973941803, + "rewards/rejected": -0.4377799928188324, + "sft_loss": 0.9500603675842285, + "step": 8155 + }, + { + "epoch": 0.63, + "grad_norm": 9.641731262207031, + "learning_rate": 2.9867753773091766e-06, + "logits/chosen": -1.342581033706665, + "logits/rejected": -1.1286296844482422, + "logps/chosen": -0.7172707915306091, + "logps/rejected": -10.408893585205078, + "loss": 0.7257, + "odds_ratio_loss": 0.08466891944408417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07172708213329315, + "rewards/margins": 0.9691622853279114, + "rewards/rejected": -1.0408892631530762, + "sft_loss": 0.7172707915306091, + "step": 8160 + }, + { + "epoch": 0.64, + "grad_norm": 7.909165859222412, + "learning_rate": 2.9811405344151702e-06, + "logits/chosen": -1.615762710571289, + "logits/rejected": -1.6490952968597412, + "logps/chosen": -0.8346630930900574, + "logps/rejected": -12.535600662231445, + "loss": 0.8405, + "odds_ratio_loss": 0.05827382951974869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0834663137793541, + "rewards/margins": 1.1700937747955322, + "rewards/rejected": -1.2535600662231445, + "sft_loss": 0.8346630930900574, + "step": 8165 + }, + { + "epoch": 0.64, + "grad_norm": 47.357765197753906, + "learning_rate": 2.975508753377866e-06, + "logits/chosen": -1.412506341934204, + "logits/rejected": -1.051579236984253, + "logps/chosen": -1.016878604888916, + "logps/rejected": -4.126791954040527, + "loss": 1.0491, + "odds_ratio_loss": 0.3220589756965637, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10168787091970444, + "rewards/margins": 0.3109913766384125, + "rewards/rejected": -0.4126792550086975, + "sft_loss": 1.016878604888916, + "step": 8170 + }, + { + "epoch": 0.64, + "grad_norm": 8.349559783935547, + "learning_rate": 2.9698800427385775e-06, + "logits/chosen": -1.2682634592056274, + "logits/rejected": -1.3927457332611084, + "logps/chosen": -0.7896240949630737, + "logps/rejected": -8.555567741394043, + "loss": 0.7956, + "odds_ratio_loss": 0.059539467096328735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07896241545677185, + "rewards/margins": 0.7765944600105286, + "rewards/rejected": -0.855556845664978, + "sft_loss": 0.7896240949630737, + "step": 8175 + }, + { + "epoch": 0.64, + "grad_norm": 9.098254203796387, + "learning_rate": 2.964254411033957e-06, + "logits/chosen": -1.5638911724090576, + "logits/rejected": -1.143739104270935, + "logps/chosen": -0.834095299243927, + "logps/rejected": -4.712614059448242, + "loss": 0.8587, + "odds_ratio_loss": 0.24565191566944122, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08340953290462494, + "rewards/margins": 0.38785186409950256, + "rewards/rejected": -0.4712614119052887, + "sft_loss": 0.834095299243927, + "step": 8180 + }, + { + "epoch": 0.64, + "grad_norm": 423.8319396972656, + "learning_rate": 2.9586318667959917e-06, + "logits/chosen": -1.2335931062698364, + "logits/rejected": -1.4510310888290405, + "logps/chosen": -1.3426623344421387, + "logps/rejected": -13.651086807250977, + "loss": 1.3431, + "odds_ratio_loss": 0.004761195741593838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13426624238491058, + "rewards/margins": 1.2308424711227417, + "rewards/rejected": -1.3651087284088135, + "sft_loss": 1.3426623344421387, + "step": 8185 + }, + { + "epoch": 0.64, + "grad_norm": 57.52622604370117, + "learning_rate": 2.9530124185519824e-06, + "logits/chosen": -1.390239953994751, + "logits/rejected": -0.8772698640823364, + "logps/chosen": -1.1922757625579834, + "logps/rejected": -13.38306713104248, + "loss": 1.2134, + "odds_ratio_loss": 0.2110356092453003, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1192275881767273, + "rewards/margins": 1.2190791368484497, + "rewards/rejected": -1.3383066654205322, + "sft_loss": 1.1922757625579834, + "step": 8190 + }, + { + "epoch": 0.64, + "grad_norm": 5.419862270355225, + "learning_rate": 2.9473960748245344e-06, + "logits/chosen": -1.2417716979980469, + "logits/rejected": -1.1405150890350342, + "logps/chosen": -0.9968358278274536, + "logps/rejected": -12.170660972595215, + "loss": 1.0025, + "odds_ratio_loss": 0.056391291320323944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09968358278274536, + "rewards/margins": 1.117382526397705, + "rewards/rejected": -1.2170660495758057, + "sft_loss": 0.9968358278274536, + "step": 8195 + }, + { + "epoch": 0.64, + "grad_norm": 15.264373779296875, + "learning_rate": 2.9417828441315493e-06, + "logits/chosen": -1.4618908166885376, + "logits/rejected": -1.2241955995559692, + "logps/chosen": -1.154833436012268, + "logps/rejected": -5.7115254402160645, + "loss": 1.192, + "odds_ratio_loss": 0.37174850702285767, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1154833436012268, + "rewards/margins": 0.45566922426223755, + "rewards/rejected": -0.5711525678634644, + "sft_loss": 1.154833436012268, + "step": 8200 + }, + { + "epoch": 0.64, + "grad_norm": 6.008475303649902, + "learning_rate": 2.9361727349862025e-06, + "logits/chosen": -1.318110704421997, + "logits/rejected": -0.7864333987236023, + "logps/chosen": -0.9257491230964661, + "logps/rejected": -9.724273681640625, + "loss": 0.9338, + "odds_ratio_loss": 0.08093155920505524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09257491677999496, + "rewards/margins": 0.8798524737358093, + "rewards/rejected": -0.9724273681640625, + "sft_loss": 0.9257491230964661, + "step": 8205 + }, + { + "epoch": 0.64, + "grad_norm": 5.947242259979248, + "learning_rate": 2.930565755896936e-06, + "logits/chosen": -1.3611711263656616, + "logits/rejected": -0.9525600671768188, + "logps/chosen": -1.174300193786621, + "logps/rejected": -10.041463851928711, + "loss": 1.1828, + "odds_ratio_loss": 0.08481469750404358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11743001639842987, + "rewards/margins": 0.8867164850234985, + "rewards/rejected": -1.0041465759277344, + "sft_loss": 1.174300193786621, + "step": 8210 + }, + { + "epoch": 0.64, + "grad_norm": 11.08918285369873, + "learning_rate": 2.9249619153674475e-06, + "logits/chosen": -1.2925969362258911, + "logits/rejected": -1.3019688129425049, + "logps/chosen": -0.9571875333786011, + "logps/rejected": -1.910681128501892, + "loss": 0.9907, + "odds_ratio_loss": 0.33518046140670776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09571875631809235, + "rewards/margins": 0.09534934908151627, + "rewards/rejected": -0.1910681277513504, + "sft_loss": 0.9571875333786011, + "step": 8215 + }, + { + "epoch": 0.64, + "grad_norm": 4.53801965713501, + "learning_rate": 2.919361221896671e-06, + "logits/chosen": -1.3792939186096191, + "logits/rejected": -0.9149982333183289, + "logps/chosen": -1.1741918325424194, + "logps/rejected": -8.287775039672852, + "loss": 1.2071, + "odds_ratio_loss": 0.3294121325016022, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11741918325424194, + "rewards/margins": 0.711358368396759, + "rewards/rejected": -0.8287774920463562, + "sft_loss": 1.1741918325424194, + "step": 8220 + }, + { + "epoch": 0.64, + "grad_norm": 9.603503227233887, + "learning_rate": 2.913763683978768e-06, + "logits/chosen": -1.4316697120666504, + "logits/rejected": -0.9715169668197632, + "logps/chosen": -1.007556438446045, + "logps/rejected": -3.0127406120300293, + "loss": 1.019, + "odds_ratio_loss": 0.1143096312880516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10075564682483673, + "rewards/margins": 0.2005184143781662, + "rewards/rejected": -0.30127406120300293, + "sft_loss": 1.007556438446045, + "step": 8225 + }, + { + "epoch": 0.64, + "grad_norm": 47.24553680419922, + "learning_rate": 2.9081693101031193e-06, + "logits/chosen": -1.402808666229248, + "logits/rejected": -0.8077438473701477, + "logps/chosen": -1.1371103525161743, + "logps/rejected": -5.126477241516113, + "loss": 1.1479, + "odds_ratio_loss": 0.10753113031387329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11371102184057236, + "rewards/margins": 0.3989366888999939, + "rewards/rejected": -0.5126477479934692, + "sft_loss": 1.1371103525161743, + "step": 8230 + }, + { + "epoch": 0.64, + "grad_norm": 70.96250915527344, + "learning_rate": 2.9025781087543004e-06, + "logits/chosen": -1.37126886844635, + "logits/rejected": -0.8173860311508179, + "logps/chosen": -1.2457082271575928, + "logps/rejected": -5.8854546546936035, + "loss": 1.25, + "odds_ratio_loss": 0.04337408393621445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.124570831656456, + "rewards/margins": 0.46397465467453003, + "rewards/rejected": -0.5885455012321472, + "sft_loss": 1.2457082271575928, + "step": 8235 + }, + { + "epoch": 0.64, + "grad_norm": 14.82772445678711, + "learning_rate": 2.8969900884120794e-06, + "logits/chosen": -1.3153047561645508, + "logits/rejected": -1.1230190992355347, + "logps/chosen": -0.9597269892692566, + "logps/rejected": -7.153435707092285, + "loss": 0.984, + "odds_ratio_loss": 0.24292488396167755, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0959727019071579, + "rewards/margins": 0.6193707585334778, + "rewards/rejected": -0.7153435349464417, + "sft_loss": 0.9597269892692566, + "step": 8240 + }, + { + "epoch": 0.64, + "grad_norm": 31.036779403686523, + "learning_rate": 2.891405257551395e-06, + "logits/chosen": -1.388417363166809, + "logits/rejected": -1.0832921266555786, + "logps/chosen": -1.0220638513565063, + "logps/rejected": -4.166182994842529, + "loss": 1.0276, + "odds_ratio_loss": 0.05575251579284668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10220638662576675, + "rewards/margins": 0.3144119381904602, + "rewards/rejected": -0.41661834716796875, + "sft_loss": 1.0220638513565063, + "step": 8245 + }, + { + "epoch": 0.64, + "grad_norm": 3.953531265258789, + "learning_rate": 2.8858236246423577e-06, + "logits/chosen": -1.4015694856643677, + "logits/rejected": -0.9179224967956543, + "logps/chosen": -0.8062652349472046, + "logps/rejected": -9.229033470153809, + "loss": 0.8104, + "odds_ratio_loss": 0.04182159900665283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08062653243541718, + "rewards/margins": 0.8422768712043762, + "rewards/rejected": -0.9229034185409546, + "sft_loss": 0.8062652349472046, + "step": 8250 + }, + { + "epoch": 0.64, + "grad_norm": 29.65342903137207, + "learning_rate": 2.8802451981502215e-06, + "logits/chosen": -1.290475606918335, + "logits/rejected": -1.1075242757797241, + "logps/chosen": -0.8379208445549011, + "logps/rejected": -1.7874600887298584, + "loss": 0.9005, + "odds_ratio_loss": 0.6255687475204468, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08379209041595459, + "rewards/margins": 0.09495393186807632, + "rewards/rejected": -0.1787460297346115, + "sft_loss": 0.8379208445549011, + "step": 8255 + }, + { + "epoch": 0.64, + "grad_norm": 7.534077167510986, + "learning_rate": 2.8746699865353735e-06, + "logits/chosen": -1.4055616855621338, + "logits/rejected": -0.6370494961738586, + "logps/chosen": -0.9448844194412231, + "logps/rejected": -3.3409061431884766, + "loss": 0.9652, + "odds_ratio_loss": 0.20320256054401398, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09448844194412231, + "rewards/margins": 0.23960216343402863, + "rewards/rejected": -0.33409062027931213, + "sft_loss": 0.9448844194412231, + "step": 8260 + }, + { + "epoch": 0.64, + "grad_norm": 2.534180164337158, + "learning_rate": 2.869097998253335e-06, + "logits/chosen": -1.3750858306884766, + "logits/rejected": -1.4756567478179932, + "logps/chosen": -0.6586253643035889, + "logps/rejected": -3.427091121673584, + "loss": 0.6749, + "odds_ratio_loss": 0.16277530789375305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06586253643035889, + "rewards/margins": 0.27684658765792847, + "rewards/rejected": -0.34270912408828735, + "sft_loss": 0.6586253643035889, + "step": 8265 + }, + { + "epoch": 0.64, + "grad_norm": 11.438091278076172, + "learning_rate": 2.8635292417547316e-06, + "logits/chosen": -1.2504771947860718, + "logits/rejected": -1.1825075149536133, + "logps/chosen": -0.7640448212623596, + "logps/rejected": -2.9363181591033936, + "loss": 0.7794, + "odds_ratio_loss": 0.15384219586849213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07640448957681656, + "rewards/margins": 0.21722733974456787, + "rewards/rejected": -0.2936318516731262, + "sft_loss": 0.7640448212623596, + "step": 8270 + }, + { + "epoch": 0.64, + "grad_norm": 7.1938157081604, + "learning_rate": 2.857963725485289e-06, + "logits/chosen": -1.3243801593780518, + "logits/rejected": -0.9105132222175598, + "logps/chosen": -1.1438543796539307, + "logps/rejected": -12.201528549194336, + "loss": 1.1494, + "odds_ratio_loss": 0.055164773017168045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1143854483962059, + "rewards/margins": 1.1057674884796143, + "rewards/rejected": -1.2201528549194336, + "sft_loss": 1.1438543796539307, + "step": 8275 + }, + { + "epoch": 0.64, + "grad_norm": 93.03870391845703, + "learning_rate": 2.8524014578858212e-06, + "logits/chosen": -1.358870267868042, + "logits/rejected": -1.174807071685791, + "logps/chosen": -1.59250009059906, + "logps/rejected": -4.908066272735596, + "loss": 1.6391, + "odds_ratio_loss": 0.4656241536140442, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15925002098083496, + "rewards/margins": 0.33155661821365356, + "rewards/rejected": -0.4908066391944885, + "sft_loss": 1.59250009059906, + "step": 8280 + }, + { + "epoch": 0.64, + "grad_norm": 7.601529598236084, + "learning_rate": 2.846842447392212e-06, + "logits/chosen": -1.2130687236785889, + "logits/rejected": -0.8104062080383301, + "logps/chosen": -1.2862504720687866, + "logps/rejected": -5.174140453338623, + "loss": 1.3071, + "odds_ratio_loss": 0.20863895118236542, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1286250501871109, + "rewards/margins": 0.38878899812698364, + "rewards/rejected": -0.5174140334129333, + "sft_loss": 1.2862504720687866, + "step": 8285 + }, + { + "epoch": 0.64, + "grad_norm": 9.283108711242676, + "learning_rate": 2.841286702435408e-06, + "logits/chosen": -1.1821047067642212, + "logits/rejected": -0.8123686909675598, + "logps/chosen": -1.0744366645812988, + "logps/rejected": -2.654536724090576, + "loss": 1.1324, + "odds_ratio_loss": 0.5792650580406189, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.107443667948246, + "rewards/margins": 0.15800999104976654, + "rewards/rejected": -0.26545366644859314, + "sft_loss": 1.0744366645812988, + "step": 8290 + }, + { + "epoch": 0.65, + "grad_norm": 66.2201919555664, + "learning_rate": 2.835734231441398e-06, + "logits/chosen": -1.4393401145935059, + "logits/rejected": -1.3402307033538818, + "logps/chosen": -1.0411721467971802, + "logps/rejected": -6.103694438934326, + "loss": 1.046, + "odds_ratio_loss": 0.04821588844060898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10411719977855682, + "rewards/margins": 0.5062522292137146, + "rewards/rejected": -0.6103695034980774, + "sft_loss": 1.0411721467971802, + "step": 8295 + }, + { + "epoch": 0.65, + "grad_norm": 12.559268951416016, + "learning_rate": 2.830185042831214e-06, + "logits/chosen": -1.3654600381851196, + "logits/rejected": -0.9841594696044922, + "logps/chosen": -0.9705682992935181, + "logps/rejected": -2.2234740257263184, + "loss": 0.9994, + "odds_ratio_loss": 0.2879412770271301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09705682843923569, + "rewards/margins": 0.12529058754444122, + "rewards/rejected": -0.2223474234342575, + "sft_loss": 0.9705682992935181, + "step": 8300 + }, + { + "epoch": 0.65, + "grad_norm": 10.072586059570312, + "learning_rate": 2.824639145020903e-06, + "logits/chosen": -1.1897691488265991, + "logits/rejected": -1.0593864917755127, + "logps/chosen": -0.8479114770889282, + "logps/rejected": -2.3160197734832764, + "loss": 0.8767, + "odds_ratio_loss": 0.28744903206825256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0847911462187767, + "rewards/margins": 0.14681082963943481, + "rewards/rejected": -0.23160198330879211, + "sft_loss": 0.8479114770889282, + "step": 8305 + }, + { + "epoch": 0.65, + "grad_norm": 32.56950378417969, + "learning_rate": 2.8190965464215236e-06, + "logits/chosen": -1.2548611164093018, + "logits/rejected": -1.3413254022598267, + "logps/chosen": -0.9717265963554382, + "logps/rejected": -10.914091110229492, + "loss": 0.9799, + "odds_ratio_loss": 0.0821453332901001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09717267006635666, + "rewards/margins": 0.9942364692687988, + "rewards/rejected": -1.0914090871810913, + "sft_loss": 0.9717265963554382, + "step": 8310 + }, + { + "epoch": 0.65, + "grad_norm": 4.923733234405518, + "learning_rate": 2.8135572554391287e-06, + "logits/chosen": -1.285402536392212, + "logits/rejected": -0.6866047978401184, + "logps/chosen": -0.892236590385437, + "logps/rejected": -6.376672267913818, + "loss": 0.9094, + "odds_ratio_loss": 0.17121195793151855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08922366797924042, + "rewards/margins": 0.548443615436554, + "rewards/rejected": -0.6376672983169556, + "sft_loss": 0.892236590385437, + "step": 8315 + }, + { + "epoch": 0.65, + "grad_norm": 11.997573852539062, + "learning_rate": 2.8080212804747587e-06, + "logits/chosen": -1.3283909559249878, + "logits/rejected": -1.0296344757080078, + "logps/chosen": -0.7988203763961792, + "logps/rejected": -10.331366539001465, + "loss": 0.8117, + "odds_ratio_loss": 0.12899550795555115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07988204061985016, + "rewards/margins": 0.9532546997070312, + "rewards/rejected": -1.0331367254257202, + "sft_loss": 0.7988203763961792, + "step": 8320 + }, + { + "epoch": 0.65, + "grad_norm": 5.004197120666504, + "learning_rate": 2.802488629924419e-06, + "logits/chosen": -1.3715083599090576, + "logits/rejected": -1.0818204879760742, + "logps/chosen": -1.191572666168213, + "logps/rejected": -2.8655409812927246, + "loss": 1.2159, + "odds_ratio_loss": 0.24345561861991882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11915726959705353, + "rewards/margins": 0.16739685833454132, + "rewards/rejected": -0.28655415773391724, + "sft_loss": 1.191572666168213, + "step": 8325 + }, + { + "epoch": 0.65, + "grad_norm": 4.403100490570068, + "learning_rate": 2.7969593121790804e-06, + "logits/chosen": -1.1697700023651123, + "logits/rejected": -0.8697908520698547, + "logps/chosen": -0.9482523798942566, + "logps/rejected": -17.272789001464844, + "loss": 0.9486, + "odds_ratio_loss": 0.003111905185505748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09482523798942566, + "rewards/margins": 1.6324536800384521, + "rewards/rejected": -1.7272790670394897, + "sft_loss": 0.9482523798942566, + "step": 8330 + }, + { + "epoch": 0.65, + "grad_norm": 21.261398315429688, + "learning_rate": 2.7914333356246546e-06, + "logits/chosen": -1.2155040502548218, + "logits/rejected": -1.5798513889312744, + "logps/chosen": -1.0860846042633057, + "logps/rejected": -12.345155715942383, + "loss": 1.1098, + "odds_ratio_loss": 0.23688821494579315, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10860846191644669, + "rewards/margins": 1.1259071826934814, + "rewards/rejected": -1.23451566696167, + "sft_loss": 1.0860846042633057, + "step": 8335 + }, + { + "epoch": 0.65, + "grad_norm": 16.56570053100586, + "learning_rate": 2.7859107086419834e-06, + "logits/chosen": -1.4490469694137573, + "logits/rejected": -1.29300057888031, + "logps/chosen": -1.2202339172363281, + "logps/rejected": -7.131228446960449, + "loss": 1.2332, + "odds_ratio_loss": 0.1300738900899887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12202338874340057, + "rewards/margins": 0.5910994410514832, + "rewards/rejected": -0.7131228446960449, + "sft_loss": 1.2202339172363281, + "step": 8340 + }, + { + "epoch": 0.65, + "grad_norm": 6.498528003692627, + "learning_rate": 2.7803914396068365e-06, + "logits/chosen": -1.3390071392059326, + "logits/rejected": -0.6799716353416443, + "logps/chosen": -0.9280698895454407, + "logps/rejected": -5.283810615539551, + "loss": 0.9328, + "odds_ratio_loss": 0.047305118292570114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09280698001384735, + "rewards/margins": 0.4355740547180176, + "rewards/rejected": -0.5283809900283813, + "sft_loss": 0.9280698895454407, + "step": 8345 + }, + { + "epoch": 0.65, + "grad_norm": 5.988420486450195, + "learning_rate": 2.774875536889884e-06, + "logits/chosen": -1.4749171733856201, + "logits/rejected": -1.2055251598358154, + "logps/chosen": -1.0997960567474365, + "logps/rejected": -5.799482822418213, + "loss": 1.1034, + "odds_ratio_loss": 0.03586059808731079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10997961461544037, + "rewards/margins": 0.46996861696243286, + "rewards/rejected": -0.5799482464790344, + "sft_loss": 1.0997960567474365, + "step": 8350 + }, + { + "epoch": 0.65, + "grad_norm": 6.665837287902832, + "learning_rate": 2.7693630088566927e-06, + "logits/chosen": -1.4081647396087646, + "logits/rejected": -1.2188732624053955, + "logps/chosen": -0.9331458210945129, + "logps/rejected": -5.226067543029785, + "loss": 0.9604, + "odds_ratio_loss": 0.2725379168987274, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09331458806991577, + "rewards/margins": 0.4292922019958496, + "rewards/rejected": -0.5226067900657654, + "sft_loss": 0.9331458210945129, + "step": 8355 + }, + { + "epoch": 0.65, + "grad_norm": 12.168383598327637, + "learning_rate": 2.763853863867708e-06, + "logits/chosen": -1.4207031726837158, + "logits/rejected": -1.0859181880950928, + "logps/chosen": -0.7639963626861572, + "logps/rejected": -1.5648362636566162, + "loss": 0.79, + "odds_ratio_loss": 0.26012665033340454, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07639963924884796, + "rewards/margins": 0.08008398115634918, + "rewards/rejected": -0.15648362040519714, + "sft_loss": 0.7639963626861572, + "step": 8360 + }, + { + "epoch": 0.65, + "grad_norm": 5.338338851928711, + "learning_rate": 2.758348110278254e-06, + "logits/chosen": -1.3823060989379883, + "logits/rejected": -1.1109060049057007, + "logps/chosen": -0.8929181098937988, + "logps/rejected": -10.302949905395508, + "loss": 0.8986, + "odds_ratio_loss": 0.05642740800976753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08929181098937988, + "rewards/margins": 0.9410032033920288, + "rewards/rejected": -1.0302950143814087, + "sft_loss": 0.8929181098937988, + "step": 8365 + }, + { + "epoch": 0.65, + "grad_norm": 12.998638153076172, + "learning_rate": 2.7528457564385036e-06, + "logits/chosen": -1.2639782428741455, + "logits/rejected": -1.2002880573272705, + "logps/chosen": -1.0140728950500488, + "logps/rejected": -4.678340911865234, + "loss": 1.0299, + "odds_ratio_loss": 0.15787038207054138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10140728950500488, + "rewards/margins": 0.3664267957210541, + "rewards/rejected": -0.46783414483070374, + "sft_loss": 1.0140728950500488, + "step": 8370 + }, + { + "epoch": 0.65, + "grad_norm": 15.505352020263672, + "learning_rate": 2.74734681069347e-06, + "logits/chosen": -1.1000642776489258, + "logits/rejected": -1.1080691814422607, + "logps/chosen": -0.7868161797523499, + "logps/rejected": -5.477625370025635, + "loss": 0.8012, + "odds_ratio_loss": 0.14340201020240784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07868161052465439, + "rewards/margins": 0.4690808653831482, + "rewards/rejected": -0.5477625131607056, + "sft_loss": 0.7868161797523499, + "step": 8375 + }, + { + "epoch": 0.65, + "grad_norm": 8.352320671081543, + "learning_rate": 2.7418512813830077e-06, + "logits/chosen": -1.3259737491607666, + "logits/rejected": -1.0996830463409424, + "logps/chosen": -1.5631464719772339, + "logps/rejected": -6.814671516418457, + "loss": 1.6234, + "odds_ratio_loss": 0.6024779081344604, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1563146561384201, + "rewards/margins": 0.5251525044441223, + "rewards/rejected": -0.6814671754837036, + "sft_loss": 1.5631464719772339, + "step": 8380 + }, + { + "epoch": 0.65, + "grad_norm": 4.832741737365723, + "learning_rate": 2.7363591768417825e-06, + "logits/chosen": -1.333432912826538, + "logits/rejected": -1.1490983963012695, + "logps/chosen": -0.8162245750427246, + "logps/rejected": -7.255775451660156, + "loss": 0.8201, + "odds_ratio_loss": 0.03829885274171829, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08162246644496918, + "rewards/margins": 0.6439551115036011, + "rewards/rejected": -0.7255775332450867, + "sft_loss": 0.8162245750427246, + "step": 8385 + }, + { + "epoch": 0.65, + "grad_norm": 9.65146255493164, + "learning_rate": 2.730870505399267e-06, + "logits/chosen": -1.4474256038665771, + "logits/rejected": -1.1180304288864136, + "logps/chosen": -0.7661795020103455, + "logps/rejected": -4.420128345489502, + "loss": 0.7841, + "odds_ratio_loss": 0.17968173325061798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07661795616149902, + "rewards/margins": 0.3653948903083801, + "rewards/rejected": -0.4420127868652344, + "sft_loss": 0.7661795020103455, + "step": 8390 + }, + { + "epoch": 0.65, + "grad_norm": 247.20248413085938, + "learning_rate": 2.7253852753797315e-06, + "logits/chosen": -1.2072155475616455, + "logits/rejected": -1.2853707075119019, + "logps/chosen": -1.166669249534607, + "logps/rejected": -9.404703140258789, + "loss": 1.1773, + "odds_ratio_loss": 0.1061633825302124, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11666693538427353, + "rewards/margins": 0.8238033056259155, + "rewards/rejected": -0.9404702186584473, + "sft_loss": 1.166669249534607, + "step": 8395 + }, + { + "epoch": 0.65, + "grad_norm": 9.424349784851074, + "learning_rate": 2.719903495102223e-06, + "logits/chosen": -1.3545876741409302, + "logits/rejected": -1.0364861488342285, + "logps/chosen": -1.0813119411468506, + "logps/rejected": -8.046969413757324, + "loss": 1.095, + "odds_ratio_loss": 0.13737812638282776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10813118517398834, + "rewards/margins": 0.6965658068656921, + "rewards/rejected": -0.8046969175338745, + "sft_loss": 1.0813119411468506, + "step": 8400 + }, + { + "epoch": 0.65, + "grad_norm": 4.774113178253174, + "learning_rate": 2.714425172880554e-06, + "logits/chosen": -1.3201053142547607, + "logits/rejected": -0.8986188769340515, + "logps/chosen": -0.7992849946022034, + "logps/rejected": -4.5283308029174805, + "loss": 0.8134, + "odds_ratio_loss": 0.14078517258167267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07992849498987198, + "rewards/margins": 0.37290462851524353, + "rewards/rejected": -0.4528331160545349, + "sft_loss": 0.7992849946022034, + "step": 8405 + }, + { + "epoch": 0.65, + "grad_norm": 6.051595687866211, + "learning_rate": 2.7089503170233e-06, + "logits/chosen": -1.2442939281463623, + "logits/rejected": -1.1158170700073242, + "logps/chosen": -1.9196693897247314, + "logps/rejected": -6.476452827453613, + "loss": 1.9491, + "odds_ratio_loss": 0.2945200800895691, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1919669210910797, + "rewards/margins": 0.45567837357521057, + "rewards/rejected": -0.6476452946662903, + "sft_loss": 1.9196693897247314, + "step": 8410 + }, + { + "epoch": 0.65, + "grad_norm": 5.730443000793457, + "learning_rate": 2.7034789358337743e-06, + "logits/chosen": -1.3807923793792725, + "logits/rejected": -0.6011480689048767, + "logps/chosen": -0.8076326251029968, + "logps/rejected": -7.6511664390563965, + "loss": 0.8141, + "odds_ratio_loss": 0.06483285129070282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08076325803995132, + "rewards/margins": 0.6843534111976624, + "rewards/rejected": -0.7651166915893555, + "sft_loss": 0.8076326251029968, + "step": 8415 + }, + { + "epoch": 0.65, + "grad_norm": 8.752543449401855, + "learning_rate": 2.6980110376100187e-06, + "logits/chosen": -1.373929738998413, + "logits/rejected": -0.8665214776992798, + "logps/chosen": -0.9539247751235962, + "logps/rejected": -7.036995887756348, + "loss": 0.9627, + "odds_ratio_loss": 0.08804039657115936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09539248049259186, + "rewards/margins": 0.6083070635795593, + "rewards/rejected": -0.7036995887756348, + "sft_loss": 0.9539247751235962, + "step": 8420 + }, + { + "epoch": 0.66, + "grad_norm": 13.451896667480469, + "learning_rate": 2.692546630644797e-06, + "logits/chosen": -1.3927185535430908, + "logits/rejected": -1.1633400917053223, + "logps/chosen": -0.5796695947647095, + "logps/rejected": -7.085219383239746, + "loss": 0.6154, + "odds_ratio_loss": 0.3574976623058319, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05796695873141289, + "rewards/margins": 0.6505550146102905, + "rewards/rejected": -0.7085219621658325, + "sft_loss": 0.5796695947647095, + "step": 8425 + }, + { + "epoch": 0.66, + "grad_norm": 43.32596206665039, + "learning_rate": 2.6870857232255764e-06, + "logits/chosen": -1.469041109085083, + "logits/rejected": -0.9503668546676636, + "logps/chosen": -1.0551387071609497, + "logps/rejected": -6.713972568511963, + "loss": 1.1806, + "odds_ratio_loss": 1.255061388015747, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10551387071609497, + "rewards/margins": 0.5658833384513855, + "rewards/rejected": -0.6713972091674805, + "sft_loss": 1.0551387071609497, + "step": 8430 + }, + { + "epoch": 0.66, + "grad_norm": 4.562943458557129, + "learning_rate": 2.6816283236345143e-06, + "logits/chosen": -1.1838380098342896, + "logits/rejected": -1.3772586584091187, + "logps/chosen": -0.830003559589386, + "logps/rejected": -10.05511474609375, + "loss": 0.8466, + "odds_ratio_loss": 0.165578693151474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08300035446882248, + "rewards/margins": 0.922511100769043, + "rewards/rejected": -1.0055115222930908, + "sft_loss": 0.830003559589386, + "step": 8435 + }, + { + "epoch": 0.66, + "grad_norm": 9.076092720031738, + "learning_rate": 2.67617444014845e-06, + "logits/chosen": -1.3671481609344482, + "logits/rejected": -0.9834438562393188, + "logps/chosen": -0.981173038482666, + "logps/rejected": -4.925403594970703, + "loss": 0.9975, + "odds_ratio_loss": 0.1635417342185974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09811730682849884, + "rewards/margins": 0.39442306756973267, + "rewards/rejected": -0.4925404191017151, + "sft_loss": 0.981173038482666, + "step": 8440 + }, + { + "epoch": 0.66, + "grad_norm": 31.632305145263672, + "learning_rate": 2.6707240810388933e-06, + "logits/chosen": -1.2191075086593628, + "logits/rejected": -1.3692381381988525, + "logps/chosen": -0.8065007925033569, + "logps/rejected": -2.877467393875122, + "loss": 0.8654, + "odds_ratio_loss": 0.5888963341712952, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08065007627010345, + "rewards/margins": 0.2070966213941574, + "rewards/rejected": -0.28774672746658325, + "sft_loss": 0.8065007925033569, + "step": 8445 + }, + { + "epoch": 0.66, + "grad_norm": 32.29814910888672, + "learning_rate": 2.665277254572005e-06, + "logits/chosen": -1.227386236190796, + "logits/rejected": -0.7855269312858582, + "logps/chosen": -1.0738370418548584, + "logps/rejected": -2.9455013275146484, + "loss": 1.0988, + "odds_ratio_loss": 0.24924306571483612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10738371312618256, + "rewards/margins": 0.18716642260551453, + "rewards/rejected": -0.2945501208305359, + "sft_loss": 1.0738370418548584, + "step": 8450 + }, + { + "epoch": 0.66, + "grad_norm": 6.155773639678955, + "learning_rate": 2.659833969008585e-06, + "logits/chosen": -1.3078839778900146, + "logits/rejected": -0.8803592920303345, + "logps/chosen": -0.9628534317016602, + "logps/rejected": -6.471461296081543, + "loss": 0.9819, + "odds_ratio_loss": 0.19075781106948853, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09628535062074661, + "rewards/margins": 0.5508608222007751, + "rewards/rejected": -0.6471462249755859, + "sft_loss": 0.9628534317016602, + "step": 8455 + }, + { + "epoch": 0.66, + "grad_norm": 11.189478874206543, + "learning_rate": 2.6543942326040728e-06, + "logits/chosen": -1.4053690433502197, + "logits/rejected": -0.7008453607559204, + "logps/chosen": -1.1526126861572266, + "logps/rejected": -7.940212249755859, + "loss": 1.1566, + "odds_ratio_loss": 0.039906956255435944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1152612715959549, + "rewards/margins": 0.6787599921226501, + "rewards/rejected": -0.7940212488174438, + "sft_loss": 1.1526126861572266, + "step": 8460 + }, + { + "epoch": 0.66, + "grad_norm": 6.9324517250061035, + "learning_rate": 2.6489580536085163e-06, + "logits/chosen": -1.4052064418792725, + "logits/rejected": -1.0391250848770142, + "logps/chosen": -0.8362232446670532, + "logps/rejected": -7.123330116271973, + "loss": 0.8524, + "odds_ratio_loss": 0.16156096756458282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08362232148647308, + "rewards/margins": 0.6287106871604919, + "rewards/rejected": -0.712333083152771, + "sft_loss": 0.8362232446670532, + "step": 8465 + }, + { + "epoch": 0.66, + "grad_norm": 4.802675247192383, + "learning_rate": 2.6435254402665695e-06, + "logits/chosen": -1.352170705795288, + "logits/rejected": -0.9224128723144531, + "logps/chosen": -0.982093334197998, + "logps/rejected": -5.067374229431152, + "loss": 0.9844, + "odds_ratio_loss": 0.02332504466176033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09820933640003204, + "rewards/margins": 0.40852808952331543, + "rewards/rejected": -0.5067374110221863, + "sft_loss": 0.982093334197998, + "step": 8470 + }, + { + "epoch": 0.66, + "grad_norm": 14.048432350158691, + "learning_rate": 2.6380964008174836e-06, + "logits/chosen": -1.1690585613250732, + "logits/rejected": -1.1874010562896729, + "logps/chosen": -1.1043365001678467, + "logps/rejected": -4.214531898498535, + "loss": 1.1143, + "odds_ratio_loss": 0.09924004226922989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1104336529970169, + "rewards/margins": 0.31101956963539124, + "rewards/rejected": -0.42145317792892456, + "sft_loss": 1.1043365001678467, + "step": 8475 + }, + { + "epoch": 0.66, + "grad_norm": 26.32798194885254, + "learning_rate": 2.632670943495086e-06, + "logits/chosen": -1.2947049140930176, + "logits/rejected": -0.8347790837287903, + "logps/chosen": -1.1103200912475586, + "logps/rejected": -4.632498741149902, + "loss": 1.1331, + "odds_ratio_loss": 0.22751355171203613, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11103200912475586, + "rewards/margins": 0.3522178828716278, + "rewards/rejected": -0.4632498621940613, + "sft_loss": 1.1103200912475586, + "step": 8480 + }, + { + "epoch": 0.66, + "grad_norm": 9.29808235168457, + "learning_rate": 2.6272490765277716e-06, + "logits/chosen": -1.3132994174957275, + "logits/rejected": -1.065861463546753, + "logps/chosen": -1.1188445091247559, + "logps/rejected": -5.130213260650635, + "loss": 1.1395, + "odds_ratio_loss": 0.20684120059013367, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1118844598531723, + "rewards/margins": 0.4011368751525879, + "rewards/rejected": -0.5130213499069214, + "sft_loss": 1.1188445091247559, + "step": 8485 + }, + { + "epoch": 0.66, + "grad_norm": 10.75184154510498, + "learning_rate": 2.621830808138485e-06, + "logits/chosen": -1.3912122249603271, + "logits/rejected": -1.1479923725128174, + "logps/chosen": -1.0153714418411255, + "logps/rejected": -7.170912742614746, + "loss": 1.0245, + "odds_ratio_loss": 0.0912996158003807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10153714567422867, + "rewards/margins": 0.6155540943145752, + "rewards/rejected": -0.7170912027359009, + "sft_loss": 1.0153714418411255, + "step": 8490 + }, + { + "epoch": 0.66, + "grad_norm": 29.56340789794922, + "learning_rate": 2.6164161465447235e-06, + "logits/chosen": -1.2630358934402466, + "logits/rejected": -0.7791035771369934, + "logps/chosen": -1.0668691396713257, + "logps/rejected": -2.717613935470581, + "loss": 1.0958, + "odds_ratio_loss": 0.28952598571777344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10668691247701645, + "rewards/margins": 0.16507449746131897, + "rewards/rejected": -0.27176138758659363, + "sft_loss": 1.0668691396713257, + "step": 8495 + }, + { + "epoch": 0.66, + "grad_norm": 9.063817977905273, + "learning_rate": 2.611005099958508e-06, + "logits/chosen": -1.3539373874664307, + "logits/rejected": -0.797394871711731, + "logps/chosen": -0.9918476343154907, + "logps/rejected": -2.190828323364258, + "loss": 1.0311, + "odds_ratio_loss": 0.39265957474708557, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09918475151062012, + "rewards/margins": 0.11989808082580566, + "rewards/rejected": -0.21908283233642578, + "sft_loss": 0.9918476343154907, + "step": 8500 + }, + { + "epoch": 0.66, + "grad_norm": 79.72722625732422, + "learning_rate": 2.6055976765863744e-06, + "logits/chosen": -1.391193151473999, + "logits/rejected": -0.9862115979194641, + "logps/chosen": -1.048738718032837, + "logps/rejected": -6.968371391296387, + "loss": 1.109, + "odds_ratio_loss": 0.6028006672859192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10487387329339981, + "rewards/margins": 0.5919632315635681, + "rewards/rejected": -0.6968370676040649, + "sft_loss": 1.048738718032837, + "step": 8505 + }, + { + "epoch": 0.66, + "grad_norm": 21.11467933654785, + "learning_rate": 2.6001938846293717e-06, + "logits/chosen": -1.4116090536117554, + "logits/rejected": -1.1980160474777222, + "logps/chosen": -1.0626676082611084, + "logps/rejected": -5.518164157867432, + "loss": 1.0703, + "odds_ratio_loss": 0.07616396248340607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10626678168773651, + "rewards/margins": 0.4455496370792389, + "rewards/rejected": -0.5518164038658142, + "sft_loss": 1.0626676082611084, + "step": 8510 + }, + { + "epoch": 0.66, + "grad_norm": 28.986602783203125, + "learning_rate": 2.5947937322830346e-06, + "logits/chosen": -1.4573067426681519, + "logits/rejected": -0.9762972593307495, + "logps/chosen": -1.0367553234100342, + "logps/rejected": -5.620980739593506, + "loss": 1.0498, + "odds_ratio_loss": 0.12997016310691833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10367554426193237, + "rewards/margins": 0.45842257142066956, + "rewards/rejected": -0.5620980858802795, + "sft_loss": 1.0367553234100342, + "step": 8515 + }, + { + "epoch": 0.66, + "grad_norm": 4.6948652267456055, + "learning_rate": 2.589397227737377e-06, + "logits/chosen": -1.3029415607452393, + "logits/rejected": -0.7049711346626282, + "logps/chosen": -1.0776302814483643, + "logps/rejected": -4.658875942230225, + "loss": 1.0874, + "odds_ratio_loss": 0.09810274839401245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10776302963495255, + "rewards/margins": 0.35812458395957947, + "rewards/rejected": -0.4658876359462738, + "sft_loss": 1.0776302814483643, + "step": 8520 + }, + { + "epoch": 0.66, + "grad_norm": 75.1124267578125, + "learning_rate": 2.5840043791768876e-06, + "logits/chosen": -1.0878616571426392, + "logits/rejected": -1.3250956535339355, + "logps/chosen": -1.1645736694335938, + "logps/rejected": -5.899996280670166, + "loss": 1.1723, + "odds_ratio_loss": 0.07772447168827057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11645736545324326, + "rewards/margins": 0.4735422730445862, + "rewards/rejected": -0.5899996161460876, + "sft_loss": 1.1645736694335938, + "step": 8525 + }, + { + "epoch": 0.66, + "grad_norm": 6.188475608825684, + "learning_rate": 2.5786151947805045e-06, + "logits/chosen": -1.4203815460205078, + "logits/rejected": -0.9625504612922668, + "logps/chosen": -0.9616947174072266, + "logps/rejected": -6.568534851074219, + "loss": 0.9895, + "odds_ratio_loss": 0.2777538299560547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09616947174072266, + "rewards/margins": 0.560684084892273, + "rewards/rejected": -0.6568534970283508, + "sft_loss": 0.9616947174072266, + "step": 8530 + }, + { + "epoch": 0.66, + "grad_norm": 25.69866180419922, + "learning_rate": 2.5732296827216086e-06, + "logits/chosen": -1.266811490058899, + "logits/rejected": -1.3779981136322021, + "logps/chosen": -1.1356754302978516, + "logps/rejected": -10.114517211914062, + "loss": 1.1773, + "odds_ratio_loss": 0.4164787232875824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1135675460100174, + "rewards/margins": 0.8978842496871948, + "rewards/rejected": -1.0114517211914062, + "sft_loss": 1.1356754302978516, + "step": 8535 + }, + { + "epoch": 0.66, + "grad_norm": 8.122133255004883, + "learning_rate": 2.5678478511680143e-06, + "logits/chosen": -1.3327136039733887, + "logits/rejected": -0.841677188873291, + "logps/chosen": -0.6438709497451782, + "logps/rejected": -6.755641937255859, + "loss": 0.6519, + "odds_ratio_loss": 0.08069080114364624, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06438709795475006, + "rewards/margins": 0.6111770868301392, + "rewards/rejected": -0.675564169883728, + "sft_loss": 0.6438709497451782, + "step": 8540 + }, + { + "epoch": 0.66, + "grad_norm": 7.290009498596191, + "learning_rate": 2.5624697082819517e-06, + "logits/chosen": -1.1562501192092896, + "logits/rejected": -1.0644404888153076, + "logps/chosen": -0.6335257291793823, + "logps/rejected": -8.344648361206055, + "loss": 0.6375, + "odds_ratio_loss": 0.03943305462598801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06335257738828659, + "rewards/margins": 0.771112322807312, + "rewards/rejected": -0.8344649076461792, + "sft_loss": 0.6335257291793823, + "step": 8545 + }, + { + "epoch": 0.67, + "grad_norm": 10.739457130432129, + "learning_rate": 2.5570952622200575e-06, + "logits/chosen": -1.4692208766937256, + "logits/rejected": -1.1127398014068604, + "logps/chosen": -1.4549614191055298, + "logps/rejected": -10.238380432128906, + "loss": 1.4863, + "odds_ratio_loss": 0.31321436166763306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14549614489078522, + "rewards/margins": 0.8783419728279114, + "rewards/rejected": -1.0238380432128906, + "sft_loss": 1.4549614191055298, + "step": 8550 + }, + { + "epoch": 0.67, + "grad_norm": 24.13983917236328, + "learning_rate": 2.5517245211333585e-06, + "logits/chosen": -1.2598793506622314, + "logits/rejected": -1.0114340782165527, + "logps/chosen": -1.1584948301315308, + "logps/rejected": -4.865040302276611, + "loss": 1.1745, + "odds_ratio_loss": 0.16014239192008972, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11584948003292084, + "rewards/margins": 0.37065452337265015, + "rewards/rejected": -0.4865039885044098, + "sft_loss": 1.1584948301315308, + "step": 8555 + }, + { + "epoch": 0.67, + "grad_norm": 18.325902938842773, + "learning_rate": 2.5463574931672714e-06, + "logits/chosen": -1.2867510318756104, + "logits/rejected": -1.064446210861206, + "logps/chosen": -0.9027272462844849, + "logps/rejected": -6.976473808288574, + "loss": 0.9577, + "odds_ratio_loss": 0.5501552820205688, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09027273207902908, + "rewards/margins": 0.6073746681213379, + "rewards/rejected": -0.6976473927497864, + "sft_loss": 0.9027272462844849, + "step": 8560 + }, + { + "epoch": 0.67, + "grad_norm": 26.868846893310547, + "learning_rate": 2.5409941864615717e-06, + "logits/chosen": -1.2793635129928589, + "logits/rejected": -1.092800259590149, + "logps/chosen": -0.8769108057022095, + "logps/rejected": -6.240958213806152, + "loss": 0.8987, + "odds_ratio_loss": 0.21805354952812195, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08769109100103378, + "rewards/margins": 0.5364047884941101, + "rewards/rejected": -0.6240958571434021, + "sft_loss": 0.8769108057022095, + "step": 8565 + }, + { + "epoch": 0.67, + "grad_norm": 16.86786460876465, + "learning_rate": 2.535634609150395e-06, + "logits/chosen": -1.327836275100708, + "logits/rejected": -1.0574593544006348, + "logps/chosen": -1.1664834022521973, + "logps/rejected": -4.941313743591309, + "loss": 1.1825, + "odds_ratio_loss": 0.1604529619216919, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11664833873510361, + "rewards/margins": 0.3774830996990204, + "rewards/rejected": -0.494131475687027, + "sft_loss": 1.1664834022521973, + "step": 8570 + }, + { + "epoch": 0.67, + "grad_norm": 4.323856830596924, + "learning_rate": 2.5302787693622223e-06, + "logits/chosen": -1.4250985383987427, + "logits/rejected": -1.1745866537094116, + "logps/chosen": -0.7151682376861572, + "logps/rejected": -4.566773891448975, + "loss": 0.7262, + "odds_ratio_loss": 0.11058475822210312, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07151682674884796, + "rewards/margins": 0.3851606249809265, + "rewards/rejected": -0.4566774368286133, + "sft_loss": 0.7151682376861572, + "step": 8575 + }, + { + "epoch": 0.67, + "grad_norm": 4.649173259735107, + "learning_rate": 2.5249266752198644e-06, + "logits/chosen": -1.2564361095428467, + "logits/rejected": -0.82867431640625, + "logps/chosen": -0.9718457460403442, + "logps/rejected": -3.516380786895752, + "loss": 1.0028, + "odds_ratio_loss": 0.3100363314151764, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09718457609415054, + "rewards/margins": 0.25445348024368286, + "rewards/rejected": -0.3516380786895752, + "sft_loss": 0.9718457460403442, + "step": 8580 + }, + { + "epoch": 0.67, + "grad_norm": 6.03109884262085, + "learning_rate": 2.519578334840449e-06, + "logits/chosen": -1.2383915185928345, + "logits/rejected": -0.9514943957328796, + "logps/chosen": -1.175445318222046, + "logps/rejected": -7.592044830322266, + "loss": 1.1958, + "odds_ratio_loss": 0.20304739475250244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11754453182220459, + "rewards/margins": 0.6416599154472351, + "rewards/rejected": -0.7592045068740845, + "sft_loss": 1.175445318222046, + "step": 8585 + }, + { + "epoch": 0.67, + "grad_norm": 13.900643348693848, + "learning_rate": 2.514233756335417e-06, + "logits/chosen": -1.415035605430603, + "logits/rejected": -1.2320356369018555, + "logps/chosen": -0.7346863746643066, + "logps/rejected": -3.777554750442505, + "loss": 0.7469, + "odds_ratio_loss": 0.12233449518680573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0734686404466629, + "rewards/margins": 0.3042868673801422, + "rewards/rejected": -0.37775546312332153, + "sft_loss": 0.7346863746643066, + "step": 8590 + }, + { + "epoch": 0.67, + "grad_norm": 25.50760269165039, + "learning_rate": 2.5088929478104993e-06, + "logits/chosen": -1.3608187437057495, + "logits/rejected": -0.8603233098983765, + "logps/chosen": -0.862570583820343, + "logps/rejected": -9.544593811035156, + "loss": 0.9183, + "odds_ratio_loss": 0.5568982362747192, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08625705540180206, + "rewards/margins": 0.8682023882865906, + "rewards/rejected": -0.9544594883918762, + "sft_loss": 0.862570583820343, + "step": 8595 + }, + { + "epoch": 0.67, + "grad_norm": 11.82734489440918, + "learning_rate": 2.503555917365711e-06, + "logits/chosen": -1.062488079071045, + "logits/rejected": -1.4695297479629517, + "logps/chosen": -1.097212791442871, + "logps/rejected": -7.532125949859619, + "loss": 1.1187, + "odds_ratio_loss": 0.21446946263313293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10972128063440323, + "rewards/margins": 0.6434913873672485, + "rewards/rejected": -0.753212571144104, + "sft_loss": 1.097212791442871, + "step": 8600 + }, + { + "epoch": 0.67, + "grad_norm": 8.268609046936035, + "learning_rate": 2.4982226730953315e-06, + "logits/chosen": -1.3083078861236572, + "logits/rejected": -1.256940484046936, + "logps/chosen": -1.0806376934051514, + "logps/rejected": -7.487344264984131, + "loss": 1.0854, + "odds_ratio_loss": 0.047921765595674515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10806377232074738, + "rewards/margins": 0.6406707167625427, + "rewards/rejected": -0.7487344741821289, + "sft_loss": 1.0806376934051514, + "step": 8605 + }, + { + "epoch": 0.67, + "grad_norm": 85.88379669189453, + "learning_rate": 2.4928932230879076e-06, + "logits/chosen": -1.2781380414962769, + "logits/rejected": -1.4369587898254395, + "logps/chosen": -1.1298456192016602, + "logps/rejected": -16.079452514648438, + "loss": 1.1302, + "odds_ratio_loss": 0.003542351070791483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11298457533121109, + "rewards/margins": 1.4949607849121094, + "rewards/rejected": -1.607945203781128, + "sft_loss": 1.1298456192016602, + "step": 8610 + }, + { + "epoch": 0.67, + "grad_norm": 9.954533576965332, + "learning_rate": 2.4875675754262265e-06, + "logits/chosen": -1.3310059309005737, + "logits/rejected": -1.3483989238739014, + "logps/chosen": -0.8747609257698059, + "logps/rejected": -7.9532470703125, + "loss": 0.8769, + "odds_ratio_loss": 0.02184412255883217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08747608959674835, + "rewards/margins": 0.7078485488891602, + "rewards/rejected": -0.7953246831893921, + "sft_loss": 0.8747609257698059, + "step": 8615 + }, + { + "epoch": 0.67, + "grad_norm": 11.167495727539062, + "learning_rate": 2.4822457381873055e-06, + "logits/chosen": -1.3983945846557617, + "logits/rejected": -1.107145071029663, + "logps/chosen": -1.239545226097107, + "logps/rejected": -6.608633518218994, + "loss": 1.2722, + "odds_ratio_loss": 0.3261207342147827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12395453453063965, + "rewards/margins": 0.5369088053703308, + "rewards/rejected": -0.6608633399009705, + "sft_loss": 1.239545226097107, + "step": 8620 + }, + { + "epoch": 0.67, + "grad_norm": 48.37714767456055, + "learning_rate": 2.476927719442391e-06, + "logits/chosen": -1.4697296619415283, + "logits/rejected": -1.104361891746521, + "logps/chosen": -1.1724387407302856, + "logps/rejected": -6.618524074554443, + "loss": 1.1824, + "odds_ratio_loss": 0.10003963857889175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11724387109279633, + "rewards/margins": 0.5446085333824158, + "rewards/rejected": -0.6618523597717285, + "sft_loss": 1.1724387407302856, + "step": 8625 + }, + { + "epoch": 0.67, + "grad_norm": 13.710233688354492, + "learning_rate": 2.471613527256932e-06, + "logits/chosen": -1.3458722829818726, + "logits/rejected": -1.153869867324829, + "logps/chosen": -0.9881760478019714, + "logps/rejected": -10.99592113494873, + "loss": 0.9948, + "odds_ratio_loss": 0.06657255440950394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0988176092505455, + "rewards/margins": 1.000774621963501, + "rewards/rejected": -1.0995922088623047, + "sft_loss": 0.9881760478019714, + "step": 8630 + }, + { + "epoch": 0.67, + "grad_norm": 5.124767780303955, + "learning_rate": 2.4663031696905732e-06, + "logits/chosen": -1.375286340713501, + "logits/rejected": -1.0435411930084229, + "logps/chosen": -0.9623756408691406, + "logps/rejected": -4.443991661071777, + "loss": 0.9678, + "odds_ratio_loss": 0.053923167288303375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09623756259679794, + "rewards/margins": 0.34816163778305054, + "rewards/rejected": -0.4443992078304291, + "sft_loss": 0.9623756408691406, + "step": 8635 + }, + { + "epoch": 0.67, + "grad_norm": 5.593282699584961, + "learning_rate": 2.4609966547971505e-06, + "logits/chosen": -1.276353359222412, + "logits/rejected": -0.8955556154251099, + "logps/chosen": -0.9168095588684082, + "logps/rejected": -4.759482383728027, + "loss": 0.941, + "odds_ratio_loss": 0.24143771827220917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09168095886707306, + "rewards/margins": 0.38426730036735535, + "rewards/rejected": -0.4759482443332672, + "sft_loss": 0.9168095588684082, + "step": 8640 + }, + { + "epoch": 0.67, + "grad_norm": 32.27127456665039, + "learning_rate": 2.4556939906246644e-06, + "logits/chosen": -1.1802663803100586, + "logits/rejected": -1.1810654401779175, + "logps/chosen": -1.0008405447006226, + "logps/rejected": -7.588818550109863, + "loss": 1.0307, + "odds_ratio_loss": 0.2982523441314697, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10008406639099121, + "rewards/margins": 0.6587977409362793, + "rewards/rejected": -0.7588818669319153, + "sft_loss": 1.0008405447006226, + "step": 8645 + }, + { + "epoch": 0.67, + "grad_norm": 4.079829216003418, + "learning_rate": 2.4503951852152803e-06, + "logits/chosen": -1.3074984550476074, + "logits/rejected": -0.7468923330307007, + "logps/chosen": -0.9436966180801392, + "logps/rejected": -6.366265296936035, + "loss": 0.9684, + "odds_ratio_loss": 0.24740377068519592, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09436966478824615, + "rewards/margins": 0.5422569513320923, + "rewards/rejected": -0.6366265416145325, + "sft_loss": 0.9436966180801392, + "step": 8650 + }, + { + "epoch": 0.67, + "grad_norm": 5.481407642364502, + "learning_rate": 2.4451002466053075e-06, + "logits/chosen": -1.199569821357727, + "logits/rejected": -1.1957504749298096, + "logps/chosen": -0.8332787752151489, + "logps/rejected": -4.7142720222473145, + "loss": 0.9124, + "odds_ratio_loss": 0.7910371422767639, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08332787454128265, + "rewards/margins": 0.38809934258461, + "rewards/rejected": -0.47142720222473145, + "sft_loss": 0.8332787752151489, + "step": 8655 + }, + { + "epoch": 0.67, + "grad_norm": 6.620190620422363, + "learning_rate": 2.4398091828251935e-06, + "logits/chosen": -0.940380871295929, + "logits/rejected": -1.2794045209884644, + "logps/chosen": -1.1835598945617676, + "logps/rejected": -8.489274024963379, + "loss": 1.1934, + "odds_ratio_loss": 0.0984053835272789, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11835597455501556, + "rewards/margins": 0.7305713891983032, + "rewards/rejected": -0.84892737865448, + "sft_loss": 1.1835598945617676, + "step": 8660 + }, + { + "epoch": 0.67, + "grad_norm": 6.176886081695557, + "learning_rate": 2.4345220018995086e-06, + "logits/chosen": -1.3382463455200195, + "logits/rejected": -0.9954336881637573, + "logps/chosen": -1.0478094816207886, + "logps/rejected": -17.834476470947266, + "loss": 1.0495, + "odds_ratio_loss": 0.01731877587735653, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10478094965219498, + "rewards/margins": 1.6786667108535767, + "rewards/rejected": -1.7834476232528687, + "sft_loss": 1.0478094816207886, + "step": 8665 + }, + { + "epoch": 0.67, + "grad_norm": 4.765834331512451, + "learning_rate": 2.429238711846932e-06, + "logits/chosen": -1.1585235595703125, + "logits/rejected": -0.7135952711105347, + "logps/chosen": -1.0281130075454712, + "logps/rejected": -11.070489883422852, + "loss": 1.0394, + "odds_ratio_loss": 0.11250078678131104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.102811299264431, + "rewards/margins": 1.0042376518249512, + "rewards/rejected": -1.1070489883422852, + "sft_loss": 1.0281130075454712, + "step": 8670 + }, + { + "epoch": 0.67, + "grad_norm": 5.145506858825684, + "learning_rate": 2.4239593206802465e-06, + "logits/chosen": -1.3943018913269043, + "logits/rejected": -0.9713207483291626, + "logps/chosen": -0.9120500683784485, + "logps/rejected": -5.5587615966796875, + "loss": 0.9227, + "odds_ratio_loss": 0.10694025456905365, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09120501577854156, + "rewards/margins": 0.46467119455337524, + "rewards/rejected": -0.5558761358261108, + "sft_loss": 0.9120500683784485, + "step": 8675 + }, + { + "epoch": 0.68, + "grad_norm": 4.959620952606201, + "learning_rate": 2.418683836406318e-06, + "logits/chosen": -1.3747514486312866, + "logits/rejected": -0.5954295992851257, + "logps/chosen": -1.0089941024780273, + "logps/rejected": -6.331844329833984, + "loss": 1.0323, + "odds_ratio_loss": 0.23257341980934143, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10089939832687378, + "rewards/margins": 0.5322850942611694, + "rewards/rejected": -0.6331844329833984, + "sft_loss": 1.0089941024780273, + "step": 8680 + }, + { + "epoch": 0.68, + "grad_norm": 9.242082595825195, + "learning_rate": 2.4134122670260875e-06, + "logits/chosen": -1.2145757675170898, + "logits/rejected": -1.2564046382904053, + "logps/chosen": -1.1589300632476807, + "logps/rejected": -8.16043472290039, + "loss": 1.1596, + "odds_ratio_loss": 0.007167732808738947, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11589299142360687, + "rewards/margins": 0.7001504898071289, + "rewards/rejected": -0.8160433769226074, + "sft_loss": 1.1589300632476807, + "step": 8685 + }, + { + "epoch": 0.68, + "grad_norm": 9.718241691589355, + "learning_rate": 2.408144620534561e-06, + "logits/chosen": -1.3903659582138062, + "logits/rejected": -1.2220784425735474, + "logps/chosen": -0.8666499853134155, + "logps/rejected": -6.302515506744385, + "loss": 0.8757, + "odds_ratio_loss": 0.09074664860963821, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08666499704122543, + "rewards/margins": 0.5435865521430969, + "rewards/rejected": -0.6302515268325806, + "sft_loss": 0.8666499853134155, + "step": 8690 + }, + { + "epoch": 0.68, + "grad_norm": 21.57361602783203, + "learning_rate": 2.4028809049207922e-06, + "logits/chosen": -1.3529293537139893, + "logits/rejected": -0.9282048940658569, + "logps/chosen": -1.0757924318313599, + "logps/rejected": -5.857820987701416, + "loss": 1.0973, + "odds_ratio_loss": 0.21466533839702606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10757924616336823, + "rewards/margins": 0.4782029092311859, + "rewards/rejected": -0.5857821106910706, + "sft_loss": 1.0757924318313599, + "step": 8695 + }, + { + "epoch": 0.68, + "grad_norm": 24.09572410583496, + "learning_rate": 2.3976211281678723e-06, + "logits/chosen": -1.4947898387908936, + "logits/rejected": -0.8406974673271179, + "logps/chosen": -0.9563905596733093, + "logps/rejected": -6.372321605682373, + "loss": 0.9644, + "odds_ratio_loss": 0.07989028841257095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09563905000686646, + "rewards/margins": 0.5415931940078735, + "rewards/rejected": -0.63723224401474, + "sft_loss": 0.9563905596733093, + "step": 8700 + }, + { + "epoch": 0.68, + "grad_norm": 25.895950317382812, + "learning_rate": 2.392365298252925e-06, + "logits/chosen": -1.3481794595718384, + "logits/rejected": -1.4586479663848877, + "logps/chosen": -0.8709031343460083, + "logps/rejected": -6.724035739898682, + "loss": 0.9243, + "odds_ratio_loss": 0.5336446762084961, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08709030598402023, + "rewards/margins": 0.5853133201599121, + "rewards/rejected": -0.6724035739898682, + "sft_loss": 0.8709031343460083, + "step": 8705 + }, + { + "epoch": 0.68, + "grad_norm": 8.630340576171875, + "learning_rate": 2.3871134231470806e-06, + "logits/chosen": -1.1768683195114136, + "logits/rejected": -1.250208854675293, + "logps/chosen": -0.9202073216438293, + "logps/rejected": -4.344022750854492, + "loss": 0.9729, + "odds_ratio_loss": 0.526607871055603, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09202073514461517, + "rewards/margins": 0.3423815965652466, + "rewards/rejected": -0.43440231680870056, + "sft_loss": 0.9202073216438293, + "step": 8710 + }, + { + "epoch": 0.68, + "grad_norm": 10.498140335083008, + "learning_rate": 2.3818655108154747e-06, + "logits/chosen": -1.1865965127944946, + "logits/rejected": -0.629920244216919, + "logps/chosen": -1.229215383529663, + "logps/rejected": -3.8668923377990723, + "loss": 1.2554, + "odds_ratio_loss": 0.2619324326515198, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12292154133319855, + "rewards/margins": 0.26376765966415405, + "rewards/rejected": -0.3866892457008362, + "sft_loss": 1.229215383529663, + "step": 8715 + }, + { + "epoch": 0.68, + "grad_norm": 4.093234062194824, + "learning_rate": 2.3766215692172335e-06, + "logits/chosen": -1.387241244316101, + "logits/rejected": -0.8403748273849487, + "logps/chosen": -1.0737946033477783, + "logps/rejected": -8.066632270812988, + "loss": 1.0756, + "odds_ratio_loss": 0.01804409921169281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10737945884466171, + "rewards/margins": 0.6992837190628052, + "rewards/rejected": -0.8066631555557251, + "sft_loss": 1.0737946033477783, + "step": 8720 + }, + { + "epoch": 0.68, + "grad_norm": 7.077796459197998, + "learning_rate": 2.3713816063054594e-06, + "logits/chosen": -1.3643040657043457, + "logits/rejected": -1.0773613452911377, + "logps/chosen": -0.9288724064826965, + "logps/rejected": -14.269126892089844, + "loss": 0.9366, + "odds_ratio_loss": 0.07765939831733704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09288725256919861, + "rewards/margins": 1.334025263786316, + "rewards/rejected": -1.426912546157837, + "sft_loss": 0.9288724064826965, + "step": 8725 + }, + { + "epoch": 0.68, + "grad_norm": 139.87179565429688, + "learning_rate": 2.3661456300272218e-06, + "logits/chosen": -1.4828407764434814, + "logits/rejected": -1.1542161703109741, + "logps/chosen": -0.9767447710037231, + "logps/rejected": -13.312211990356445, + "loss": 0.983, + "odds_ratio_loss": 0.06288562715053558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09767447412014008, + "rewards/margins": 1.2335467338562012, + "rewards/rejected": -1.3312212228775024, + "sft_loss": 0.9767447710037231, + "step": 8730 + }, + { + "epoch": 0.68, + "grad_norm": 5.913445949554443, + "learning_rate": 2.3609136483235417e-06, + "logits/chosen": -1.3908110857009888, + "logits/rejected": -0.7219542264938354, + "logps/chosen": -1.0922479629516602, + "logps/rejected": -7.108962059020996, + "loss": 1.0973, + "odds_ratio_loss": 0.050926219671964645, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10922479629516602, + "rewards/margins": 0.6016713976860046, + "rewards/rejected": -0.7108962535858154, + "sft_loss": 1.0922479629516602, + "step": 8735 + }, + { + "epoch": 0.68, + "grad_norm": 7.034379959106445, + "learning_rate": 2.3556856691293874e-06, + "logits/chosen": -1.3366495370864868, + "logits/rejected": -0.9458214640617371, + "logps/chosen": -1.1878178119659424, + "logps/rejected": -5.579440116882324, + "loss": 1.2406, + "odds_ratio_loss": 0.5279297232627869, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11878176778554916, + "rewards/margins": 0.4391621947288513, + "rewards/rejected": -0.5579439401626587, + "sft_loss": 1.1878178119659424, + "step": 8740 + }, + { + "epoch": 0.68, + "grad_norm": 164.0315704345703, + "learning_rate": 2.3504617003736505e-06, + "logits/chosen": -1.2090203762054443, + "logits/rejected": -1.0629816055297852, + "logps/chosen": -1.0493745803833008, + "logps/rejected": -7.64117431640625, + "loss": 1.0686, + "odds_ratio_loss": 0.19206681847572327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10493745654821396, + "rewards/margins": 0.6591799259185791, + "rewards/rejected": -0.764117419719696, + "sft_loss": 1.0493745803833008, + "step": 8745 + }, + { + "epoch": 0.68, + "grad_norm": 777.0653686523438, + "learning_rate": 2.345241749979142e-06, + "logits/chosen": -1.2011092901229858, + "logits/rejected": -1.496852159500122, + "logps/chosen": -1.4050729274749756, + "logps/rejected": -9.406866073608398, + "loss": 1.4124, + "odds_ratio_loss": 0.07344251871109009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.140507310628891, + "rewards/margins": 0.8001793026924133, + "rewards/rejected": -0.9406865835189819, + "sft_loss": 1.4050729274749756, + "step": 8750 + }, + { + "epoch": 0.68, + "grad_norm": 5.332313537597656, + "learning_rate": 2.3400258258625824e-06, + "logits/chosen": -1.2466320991516113, + "logits/rejected": -1.1018311977386475, + "logps/chosen": -0.7403644323348999, + "logps/rejected": -3.1287522315979004, + "loss": 0.7526, + "odds_ratio_loss": 0.12221725285053253, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07403644919395447, + "rewards/margins": 0.23883876204490662, + "rewards/rejected": -0.3128752112388611, + "sft_loss": 0.7403644323348999, + "step": 8755 + }, + { + "epoch": 0.68, + "grad_norm": 171.93978881835938, + "learning_rate": 2.3348139359345818e-06, + "logits/chosen": -1.4692953824996948, + "logits/rejected": -1.001387357711792, + "logps/chosen": -1.5447783470153809, + "logps/rejected": -5.410799026489258, + "loss": 1.552, + "odds_ratio_loss": 0.07230857759714127, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15447784960269928, + "rewards/margins": 0.3866020143032074, + "rewards/rejected": -0.5410798788070679, + "sft_loss": 1.5447783470153809, + "step": 8760 + }, + { + "epoch": 0.68, + "grad_norm": 20.867794036865234, + "learning_rate": 2.3296060880996324e-06, + "logits/chosen": -1.1623890399932861, + "logits/rejected": -0.919702410697937, + "logps/chosen": -1.1486616134643555, + "logps/rejected": -2.9376235008239746, + "loss": 1.1773, + "odds_ratio_loss": 0.28643161058425903, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11486617475748062, + "rewards/margins": 0.17889617383480072, + "rewards/rejected": -0.29376235604286194, + "sft_loss": 1.1486616134643555, + "step": 8765 + }, + { + "epoch": 0.68, + "grad_norm": 11.270215034484863, + "learning_rate": 2.324402290256097e-06, + "logits/chosen": -1.356595754623413, + "logits/rejected": -1.6129558086395264, + "logps/chosen": -0.7033634185791016, + "logps/rejected": -10.820283889770508, + "loss": 0.7142, + "odds_ratio_loss": 0.10823347419500351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07033634185791016, + "rewards/margins": 1.0116920471191406, + "rewards/rejected": -1.0820282697677612, + "sft_loss": 0.7033634185791016, + "step": 8770 + }, + { + "epoch": 0.68, + "grad_norm": 5.982351303100586, + "learning_rate": 2.319202550296195e-06, + "logits/chosen": -1.2674534320831299, + "logits/rejected": -1.3874826431274414, + "logps/chosen": -1.2085932493209839, + "logps/rejected": -13.73901081085205, + "loss": 1.2104, + "odds_ratio_loss": 0.018456827849149704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12085933983325958, + "rewards/margins": 1.2530416250228882, + "rewards/rejected": -1.373901128768921, + "sft_loss": 1.2085932493209839, + "step": 8775 + }, + { + "epoch": 0.68, + "grad_norm": 5.8140764236450195, + "learning_rate": 2.3140068761059936e-06, + "logits/chosen": -1.3922148942947388, + "logits/rejected": -1.0940783023834229, + "logps/chosen": -1.1989779472351074, + "logps/rejected": -6.598939418792725, + "loss": 1.2196, + "odds_ratio_loss": 0.20643094182014465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11989779770374298, + "rewards/margins": 0.5399961471557617, + "rewards/rejected": -0.6598939299583435, + "sft_loss": 1.1989779472351074, + "step": 8780 + }, + { + "epoch": 0.68, + "grad_norm": 20.456661224365234, + "learning_rate": 2.3088152755653893e-06, + "logits/chosen": -1.314902901649475, + "logits/rejected": -1.0060148239135742, + "logps/chosen": -0.9692791104316711, + "logps/rejected": -5.441655158996582, + "loss": 1.0225, + "odds_ratio_loss": 0.5325320959091187, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09692790359258652, + "rewards/margins": 0.44723764061927795, + "rewards/rejected": -0.5441655516624451, + "sft_loss": 0.9692791104316711, + "step": 8785 + }, + { + "epoch": 0.68, + "grad_norm": 5.951380729675293, + "learning_rate": 2.3036277565481076e-06, + "logits/chosen": -1.3810487985610962, + "logits/rejected": -0.8749133348464966, + "logps/chosen": -0.6693980097770691, + "logps/rejected": -2.945880174636841, + "loss": 0.6842, + "odds_ratio_loss": 0.14806067943572998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0669398084282875, + "rewards/margins": 0.22764822840690613, + "rewards/rejected": -0.29458802938461304, + "sft_loss": 0.6693980097770691, + "step": 8790 + }, + { + "epoch": 0.68, + "grad_norm": 74.74230194091797, + "learning_rate": 2.2984443269216777e-06, + "logits/chosen": -1.3578306436538696, + "logits/rejected": -1.1013226509094238, + "logps/chosen": -1.1087204217910767, + "logps/rejected": -6.908092498779297, + "loss": 1.1202, + "odds_ratio_loss": 0.11455819755792618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11087203025817871, + "rewards/margins": 0.5799371600151062, + "rewards/rejected": -0.6908092498779297, + "sft_loss": 1.1087204217910767, + "step": 8795 + }, + { + "epoch": 0.68, + "grad_norm": 126.49470520019531, + "learning_rate": 2.293264994547427e-06, + "logits/chosen": -1.4132869243621826, + "logits/rejected": -0.7511752843856812, + "logps/chosen": -0.8112546801567078, + "logps/rejected": -4.038820266723633, + "loss": 0.8352, + "odds_ratio_loss": 0.23952248692512512, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08112547546625137, + "rewards/margins": 0.3227565884590149, + "rewards/rejected": -0.4038820266723633, + "sft_loss": 0.8112546801567078, + "step": 8800 + }, + { + "epoch": 0.68, + "grad_norm": 4.296744346618652, + "learning_rate": 2.288089767280474e-06, + "logits/chosen": -1.481168270111084, + "logits/rejected": -1.0833094120025635, + "logps/chosen": -0.8455594182014465, + "logps/rejected": -7.945650577545166, + "loss": 0.8557, + "odds_ratio_loss": 0.1012900322675705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08455593883991241, + "rewards/margins": 0.7100090384483337, + "rewards/rejected": -0.794564962387085, + "sft_loss": 0.8455594182014465, + "step": 8805 + }, + { + "epoch": 0.69, + "grad_norm": 12.861416816711426, + "learning_rate": 2.282918652969707e-06, + "logits/chosen": -1.3677926063537598, + "logits/rejected": -1.070143222808838, + "logps/chosen": -0.8709946870803833, + "logps/rejected": -3.9313464164733887, + "loss": 0.8799, + "odds_ratio_loss": 0.08856067806482315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08709947764873505, + "rewards/margins": 0.3060351610183716, + "rewards/rejected": -0.3931346535682678, + "sft_loss": 0.8709946870803833, + "step": 8810 + }, + { + "epoch": 0.69, + "grad_norm": 10.0030517578125, + "learning_rate": 2.2777516594577753e-06, + "logits/chosen": -1.2824790477752686, + "logits/rejected": -1.475572109222412, + "logps/chosen": -0.984046459197998, + "logps/rejected": -8.88023853302002, + "loss": 0.9905, + "odds_ratio_loss": 0.06427445262670517, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09840463846921921, + "rewards/margins": 0.7896192669868469, + "rewards/rejected": -0.8880239725112915, + "sft_loss": 0.984046459197998, + "step": 8815 + }, + { + "epoch": 0.69, + "grad_norm": 4.735327243804932, + "learning_rate": 2.2725887945810835e-06, + "logits/chosen": -1.349716305732727, + "logits/rejected": -1.353406548500061, + "logps/chosen": -0.7635191679000854, + "logps/rejected": -12.640535354614258, + "loss": 0.813, + "odds_ratio_loss": 0.49513259530067444, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07635192573070526, + "rewards/margins": 1.1877018213272095, + "rewards/rejected": -1.2640537023544312, + "sft_loss": 0.7635191679000854, + "step": 8820 + }, + { + "epoch": 0.69, + "grad_norm": 12.46005916595459, + "learning_rate": 2.2674300661697705e-06, + "logits/chosen": -1.369740605354309, + "logits/rejected": -1.3559238910675049, + "logps/chosen": -0.9420153498649597, + "logps/rejected": -14.380511283874512, + "loss": 0.9612, + "odds_ratio_loss": 0.19223158061504364, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09420153498649597, + "rewards/margins": 1.343849778175354, + "rewards/rejected": -1.4380512237548828, + "sft_loss": 0.9420153498649597, + "step": 8825 + }, + { + "epoch": 0.69, + "grad_norm": 6.363523960113525, + "learning_rate": 2.2622754820477033e-06, + "logits/chosen": -1.3766988515853882, + "logits/rejected": -0.830722451210022, + "logps/chosen": -1.0896638631820679, + "logps/rejected": -12.302000045776367, + "loss": 1.1067, + "odds_ratio_loss": 0.17026808857917786, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1089663952589035, + "rewards/margins": 1.1212337017059326, + "rewards/rejected": -1.2302000522613525, + "sft_loss": 1.0896638631820679, + "step": 8830 + }, + { + "epoch": 0.69, + "grad_norm": 10.626047134399414, + "learning_rate": 2.257125050032462e-06, + "logits/chosen": -1.3120410442352295, + "logits/rejected": -1.2341954708099365, + "logps/chosen": -0.9303919076919556, + "logps/rejected": -3.4059882164001465, + "loss": 0.9528, + "odds_ratio_loss": 0.22447124123573303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09303919970989227, + "rewards/margins": 0.24755963683128357, + "rewards/rejected": -0.34059882164001465, + "sft_loss": 0.9303919076919556, + "step": 8835 + }, + { + "epoch": 0.69, + "grad_norm": 100.3499984741211, + "learning_rate": 2.2519787779353312e-06, + "logits/chosen": -1.3813316822052002, + "logits/rejected": -0.935640811920166, + "logps/chosen": -0.9915526509284973, + "logps/rejected": -6.646824836730957, + "loss": 1.0069, + "odds_ratio_loss": 0.15396466851234436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09915526211261749, + "rewards/margins": 0.5655272006988525, + "rewards/rejected": -0.6646824479103088, + "sft_loss": 0.9915526509284973, + "step": 8840 + }, + { + "epoch": 0.69, + "grad_norm": 3.4930734634399414, + "learning_rate": 2.246836673561286e-06, + "logits/chosen": -1.2621219158172607, + "logits/rejected": -0.7246573567390442, + "logps/chosen": -1.005814552307129, + "logps/rejected": -3.994333267211914, + "loss": 1.0149, + "odds_ratio_loss": 0.09052891284227371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10058145225048065, + "rewards/margins": 0.2988519072532654, + "rewards/rejected": -0.3994333744049072, + "sft_loss": 1.005814552307129, + "step": 8845 + }, + { + "epoch": 0.69, + "grad_norm": 6.091088771820068, + "learning_rate": 2.2416987447089795e-06, + "logits/chosen": -1.11075758934021, + "logits/rejected": -1.1599006652832031, + "logps/chosen": -0.9157182574272156, + "logps/rejected": -6.711939811706543, + "loss": 0.9192, + "odds_ratio_loss": 0.03470756858587265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09157182276248932, + "rewards/margins": 0.5796222686767578, + "rewards/rejected": -0.6711940765380859, + "sft_loss": 0.9157182574272156, + "step": 8850 + }, + { + "epoch": 0.69, + "grad_norm": 4.585426330566406, + "learning_rate": 2.236564999170735e-06, + "logits/chosen": -1.3810508251190186, + "logits/rejected": -0.529486358165741, + "logps/chosen": -0.9336601495742798, + "logps/rejected": -4.21042013168335, + "loss": 0.9444, + "odds_ratio_loss": 0.10773054510354996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09336601197719574, + "rewards/margins": 0.327675998210907, + "rewards/rejected": -0.4210420250892639, + "sft_loss": 0.9336601495742798, + "step": 8855 + }, + { + "epoch": 0.69, + "grad_norm": 5.172513961791992, + "learning_rate": 2.231435444732529e-06, + "logits/chosen": -1.4077708721160889, + "logits/rejected": -1.3687193393707275, + "logps/chosen": -1.0155298709869385, + "logps/rejected": -10.070394515991211, + "loss": 1.0235, + "odds_ratio_loss": 0.08000832796096802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10155300050973892, + "rewards/margins": 0.9054864048957825, + "rewards/rejected": -1.0070393085479736, + "sft_loss": 1.0155298709869385, + "step": 8860 + }, + { + "epoch": 0.69, + "grad_norm": 5.20650577545166, + "learning_rate": 2.2263100891739804e-06, + "logits/chosen": -1.3737702369689941, + "logits/rejected": -1.1111339330673218, + "logps/chosen": -1.0363662242889404, + "logps/rejected": -10.14550495147705, + "loss": 1.0399, + "odds_ratio_loss": 0.03571737930178642, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10363663733005524, + "rewards/margins": 0.9109139442443848, + "rewards/rejected": -1.0145504474639893, + "sft_loss": 1.0363662242889404, + "step": 8865 + }, + { + "epoch": 0.69, + "grad_norm": 7.640948295593262, + "learning_rate": 2.2211889402683444e-06, + "logits/chosen": -1.306793212890625, + "logits/rejected": -0.9922159910202026, + "logps/chosen": -0.8645086288452148, + "logps/rejected": -10.478594779968262, + "loss": 0.8654, + "odds_ratio_loss": 0.009273124858736992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08645085990428925, + "rewards/margins": 0.9614086151123047, + "rewards/rejected": -1.0478594303131104, + "sft_loss": 0.8645086288452148, + "step": 8870 + }, + { + "epoch": 0.69, + "grad_norm": 5.960412502288818, + "learning_rate": 2.216072005782492e-06, + "logits/chosen": -1.4007996320724487, + "logits/rejected": -0.977988600730896, + "logps/chosen": -1.1421080827713013, + "logps/rejected": -8.920265197753906, + "loss": 1.1509, + "odds_ratio_loss": 0.08776558190584183, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11421080678701401, + "rewards/margins": 0.7778158187866211, + "rewards/rejected": -0.8920267224311829, + "sft_loss": 1.1421080827713013, + "step": 8875 + }, + { + "epoch": 0.69, + "grad_norm": 9.963485717773438, + "learning_rate": 2.2109592934769042e-06, + "logits/chosen": -1.4444516897201538, + "logits/rejected": -1.264888048171997, + "logps/chosen": -1.100772500038147, + "logps/rejected": -5.076528072357178, + "loss": 1.1213, + "odds_ratio_loss": 0.2054096907377243, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11007723957300186, + "rewards/margins": 0.3975755572319031, + "rewards/rejected": -0.5076528787612915, + "sft_loss": 1.100772500038147, + "step": 8880 + }, + { + "epoch": 0.69, + "grad_norm": 22.328811645507812, + "learning_rate": 2.205850811105658e-06, + "logits/chosen": -1.1906664371490479, + "logits/rejected": -1.420731782913208, + "logps/chosen": -0.8635552525520325, + "logps/rejected": -5.215235710144043, + "loss": 0.8849, + "odds_ratio_loss": 0.21342246234416962, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0863555297255516, + "rewards/margins": 0.4351680874824524, + "rewards/rejected": -0.5215235352516174, + "sft_loss": 0.8635552525520325, + "step": 8885 + }, + { + "epoch": 0.69, + "grad_norm": 13.49941349029541, + "learning_rate": 2.2007465664164163e-06, + "logits/chosen": -1.1897201538085938, + "logits/rejected": -1.3385847806930542, + "logps/chosen": -1.1155879497528076, + "logps/rejected": -10.237648010253906, + "loss": 1.1251, + "odds_ratio_loss": 0.0953986719250679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11155879497528076, + "rewards/margins": 0.9122061729431152, + "rewards/rejected": -1.0237648487091064, + "sft_loss": 1.1155879497528076, + "step": 8890 + }, + { + "epoch": 0.69, + "grad_norm": 5.225167274475098, + "learning_rate": 2.1956465671504117e-06, + "logits/chosen": -1.446629285812378, + "logits/rejected": -1.0897634029388428, + "logps/chosen": -0.8796972036361694, + "logps/rejected": -6.697667598724365, + "loss": 0.8802, + "odds_ratio_loss": 0.005169983953237534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08796972036361694, + "rewards/margins": 0.5817970037460327, + "rewards/rejected": -0.6697667837142944, + "sft_loss": 0.8796972036361694, + "step": 8895 + }, + { + "epoch": 0.69, + "grad_norm": 39.218021392822266, + "learning_rate": 2.190550821042444e-06, + "logits/chosen": -1.362776279449463, + "logits/rejected": -1.3119524717330933, + "logps/chosen": -1.0069937705993652, + "logps/rejected": -8.32177448272705, + "loss": 1.0326, + "odds_ratio_loss": 0.25601357221603394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10069938004016876, + "rewards/margins": 0.7314780950546265, + "rewards/rejected": -0.8321775197982788, + "sft_loss": 1.0069937705993652, + "step": 8900 + }, + { + "epoch": 0.69, + "grad_norm": 4.985468864440918, + "learning_rate": 2.185459335820858e-06, + "logits/chosen": -1.263514757156372, + "logits/rejected": -0.6906660795211792, + "logps/chosen": -0.7955794334411621, + "logps/rejected": -3.6059112548828125, + "loss": 0.8298, + "odds_ratio_loss": 0.34243783354759216, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07955794036388397, + "rewards/margins": 0.2810332179069519, + "rewards/rejected": -0.3605911433696747, + "sft_loss": 0.7955794334411621, + "step": 8905 + }, + { + "epoch": 0.69, + "grad_norm": 6.7755513191223145, + "learning_rate": 2.1803721192075376e-06, + "logits/chosen": -1.2453514337539673, + "logits/rejected": -0.7368025779724121, + "logps/chosen": -0.8245415687561035, + "logps/rejected": -1.625554084777832, + "loss": 0.8633, + "odds_ratio_loss": 0.38740354776382446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08245415985584259, + "rewards/margins": 0.08010125160217285, + "rewards/rejected": -0.16255542635917664, + "sft_loss": 0.8245415687561035, + "step": 8910 + }, + { + "epoch": 0.69, + "grad_norm": 8.046930313110352, + "learning_rate": 2.1752891789178903e-06, + "logits/chosen": -1.2851436138153076, + "logits/rejected": -0.8310701251029968, + "logps/chosen": -0.9399498105049133, + "logps/rejected": -4.056199073791504, + "loss": 0.9708, + "odds_ratio_loss": 0.30810680985450745, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09399497509002686, + "rewards/margins": 0.3116249144077301, + "rewards/rejected": -0.40561985969543457, + "sft_loss": 0.9399498105049133, + "step": 8915 + }, + { + "epoch": 0.69, + "grad_norm": 45.22992706298828, + "learning_rate": 2.170210522660844e-06, + "logits/chosen": -1.1178120374679565, + "logits/rejected": -1.3127976655960083, + "logps/chosen": -0.791592538356781, + "logps/rejected": -4.469288349151611, + "loss": 0.8065, + "odds_ratio_loss": 0.14901237189769745, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07915925979614258, + "rewards/margins": 0.36776959896087646, + "rewards/rejected": -0.44692888855934143, + "sft_loss": 0.791592538356781, + "step": 8920 + }, + { + "epoch": 0.69, + "grad_norm": 8.589193344116211, + "learning_rate": 2.1651361581388244e-06, + "logits/chosen": -1.2633674144744873, + "logits/rejected": -0.8884477615356445, + "logps/chosen": -1.0612653493881226, + "logps/rejected": -6.128218173980713, + "loss": 1.0918, + "odds_ratio_loss": 0.3054296672344208, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10612654685974121, + "rewards/margins": 0.5066952705383301, + "rewards/rejected": -0.6128217577934265, + "sft_loss": 1.0612653493881226, + "step": 8925 + }, + { + "epoch": 0.69, + "grad_norm": 22.513050079345703, + "learning_rate": 2.1600660930477473e-06, + "logits/chosen": -1.3976125717163086, + "logits/rejected": -1.2376105785369873, + "logps/chosen": -0.8673698306083679, + "logps/rejected": -4.726605415344238, + "loss": 0.8781, + "odds_ratio_loss": 0.10777624696493149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08673697710037231, + "rewards/margins": 0.3859235644340515, + "rewards/rejected": -0.4726606011390686, + "sft_loss": 0.8673698306083679, + "step": 8930 + }, + { + "epoch": 0.7, + "grad_norm": 34.55830383300781, + "learning_rate": 2.1550003350770145e-06, + "logits/chosen": -1.4735945463180542, + "logits/rejected": -1.2802393436431885, + "logps/chosen": -1.6333625316619873, + "logps/rejected": -5.451124668121338, + "loss": 1.6782, + "odds_ratio_loss": 0.44808006286621094, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.16333624720573425, + "rewards/margins": 0.38177618384361267, + "rewards/rejected": -0.5451124906539917, + "sft_loss": 1.6333625316619873, + "step": 8935 + }, + { + "epoch": 0.7, + "grad_norm": 7.063345909118652, + "learning_rate": 2.1499388919094878e-06, + "logits/chosen": -1.357527494430542, + "logits/rejected": -0.8941570520401001, + "logps/chosen": -0.860805332660675, + "logps/rejected": -3.371488094329834, + "loss": 0.8828, + "odds_ratio_loss": 0.219833642244339, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08608053624629974, + "rewards/margins": 0.2510682940483093, + "rewards/rejected": -0.3371488153934479, + "sft_loss": 0.860805332660675, + "step": 8940 + }, + { + "epoch": 0.7, + "grad_norm": 4.833588123321533, + "learning_rate": 2.14488177122149e-06, + "logits/chosen": -1.3718502521514893, + "logits/rejected": -1.3062679767608643, + "logps/chosen": -0.9539562463760376, + "logps/rejected": -13.6318998336792, + "loss": 0.9722, + "odds_ratio_loss": 0.1820879876613617, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09539561718702316, + "rewards/margins": 1.2677944898605347, + "rewards/rejected": -1.3631901741027832, + "sft_loss": 0.9539562463760376, + "step": 8945 + }, + { + "epoch": 0.7, + "grad_norm": 7.767849922180176, + "learning_rate": 2.139828980682786e-06, + "logits/chosen": -1.3066259622573853, + "logits/rejected": -1.2431743144989014, + "logps/chosen": -0.8903936147689819, + "logps/rejected": -7.409182548522949, + "loss": 0.893, + "odds_ratio_loss": 0.025705674663186073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08903936296701431, + "rewards/margins": 0.6518789529800415, + "rewards/rejected": -0.7409183382987976, + "sft_loss": 0.8903936147689819, + "step": 8950 + }, + { + "epoch": 0.7, + "grad_norm": 4.688467979431152, + "learning_rate": 2.1347805279565743e-06, + "logits/chosen": -1.2730211019515991, + "logits/rejected": -0.8708003163337708, + "logps/chosen": -1.1545875072479248, + "logps/rejected": -8.899227142333984, + "loss": 1.1654, + "odds_ratio_loss": 0.10816816240549088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11545874923467636, + "rewards/margins": 0.7744640111923218, + "rewards/rejected": -0.8899227976799011, + "sft_loss": 1.1545875072479248, + "step": 8955 + }, + { + "epoch": 0.7, + "grad_norm": 12.68004035949707, + "learning_rate": 2.1297364206994727e-06, + "logits/chosen": -1.3241649866104126, + "logits/rejected": -0.8673623204231262, + "logps/chosen": -0.922534167766571, + "logps/rejected": -2.2232301235198975, + "loss": 0.9554, + "odds_ratio_loss": 0.3290178179740906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0922534242272377, + "rewards/margins": 0.13006961345672607, + "rewards/rejected": -0.22232303023338318, + "sft_loss": 0.922534167766571, + "step": 8960 + }, + { + "epoch": 0.7, + "grad_norm": 5.558168411254883, + "learning_rate": 2.124696666561513e-06, + "logits/chosen": -1.3462955951690674, + "logits/rejected": -1.0334182977676392, + "logps/chosen": -0.9550608396530151, + "logps/rejected": -3.349431276321411, + "loss": 1.0117, + "odds_ratio_loss": 0.5658974647521973, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09550608694553375, + "rewards/margins": 0.2394370287656784, + "rewards/rejected": -0.33494311571121216, + "sft_loss": 0.9550608396530151, + "step": 8965 + }, + { + "epoch": 0.7, + "grad_norm": 5.458000659942627, + "learning_rate": 2.119661273186122e-06, + "logits/chosen": -1.2887673377990723, + "logits/rejected": -0.9980741739273071, + "logps/chosen": -0.8225724101066589, + "logps/rejected": -2.6496741771698, + "loss": 0.8715, + "odds_ratio_loss": 0.4890063405036926, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08225724846124649, + "rewards/margins": 0.182710200548172, + "rewards/rejected": -0.2649674713611603, + "sft_loss": 0.8225724101066589, + "step": 8970 + }, + { + "epoch": 0.7, + "grad_norm": 15.437626838684082, + "learning_rate": 2.114630248210112e-06, + "logits/chosen": -1.324136734008789, + "logits/rejected": -1.00652277469635, + "logps/chosen": -1.0153963565826416, + "logps/rejected": -6.181136608123779, + "loss": 1.0371, + "odds_ratio_loss": 0.21681568026542664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10153963416814804, + "rewards/margins": 0.5165740251541138, + "rewards/rejected": -0.6181136965751648, + "sft_loss": 1.0153963565826416, + "step": 8975 + }, + { + "epoch": 0.7, + "grad_norm": 14.838114738464355, + "learning_rate": 2.10960359926367e-06, + "logits/chosen": -1.1353790760040283, + "logits/rejected": -0.9303848147392273, + "logps/chosen": -0.9914374351501465, + "logps/rejected": -6.257478713989258, + "loss": 1.0012, + "odds_ratio_loss": 0.09774111211299896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09914375841617584, + "rewards/margins": 0.5266041159629822, + "rewards/rejected": -0.6257479190826416, + "sft_loss": 0.9914374351501465, + "step": 8980 + }, + { + "epoch": 0.7, + "grad_norm": 12.68740463256836, + "learning_rate": 2.1045813339703504e-06, + "logits/chosen": -1.332756757736206, + "logits/rejected": -1.1559302806854248, + "logps/chosen": -1.1947431564331055, + "logps/rejected": -14.672497749328613, + "loss": 1.1961, + "odds_ratio_loss": 0.01405693031847477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11947431415319443, + "rewards/margins": 1.3477754592895508, + "rewards/rejected": -1.4672497510910034, + "sft_loss": 1.1947431564331055, + "step": 8985 + }, + { + "epoch": 0.7, + "grad_norm": 12.600214004516602, + "learning_rate": 2.0995634599470543e-06, + "logits/chosen": -1.1652392148971558, + "logits/rejected": -1.0661487579345703, + "logps/chosen": -1.3858683109283447, + "logps/rejected": -3.2528934478759766, + "loss": 1.4568, + "odds_ratio_loss": 0.7093421220779419, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1385868340730667, + "rewards/margins": 0.18670251965522766, + "rewards/rejected": -0.3252893090248108, + "sft_loss": 1.3858683109283447, + "step": 8990 + }, + { + "epoch": 0.7, + "grad_norm": 13.642959594726562, + "learning_rate": 2.0945499848040245e-06, + "logits/chosen": -1.1991904973983765, + "logits/rejected": -1.5512006282806396, + "logps/chosen": -1.1417248249053955, + "logps/rejected": -16.12936019897461, + "loss": 1.1417, + "odds_ratio_loss": 0.0002171795058529824, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11417248100042343, + "rewards/margins": 1.4987636804580688, + "rewards/rejected": -1.61293625831604, + "sft_loss": 1.1417248249053955, + "step": 8995 + }, + { + "epoch": 0.7, + "grad_norm": 273.96759033203125, + "learning_rate": 2.0895409161448336e-06, + "logits/chosen": -1.0852572917938232, + "logits/rejected": -1.2790428400039673, + "logps/chosen": -1.4878782033920288, + "logps/rejected": -7.7668137550354, + "loss": 1.5419, + "odds_ratio_loss": 0.5402774214744568, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14878782629966736, + "rewards/margins": 0.6278935670852661, + "rewards/rejected": -0.7766814231872559, + "sft_loss": 1.4878782033920288, + "step": 9000 + }, + { + "epoch": 0.7, + "grad_norm": 4.770481109619141, + "learning_rate": 2.08453626156637e-06, + "logits/chosen": -1.2997276782989502, + "logits/rejected": -0.8713030815124512, + "logps/chosen": -0.8640148043632507, + "logps/rejected": -10.577802658081055, + "loss": 0.8718, + "odds_ratio_loss": 0.0777444839477539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08640148490667343, + "rewards/margins": 0.9713788032531738, + "rewards/rejected": -1.057780385017395, + "sft_loss": 0.8640148043632507, + "step": 9005 + }, + { + "epoch": 0.7, + "grad_norm": 23.215740203857422, + "learning_rate": 2.079536028658825e-06, + "logits/chosen": -1.1639597415924072, + "logits/rejected": -1.0849668979644775, + "logps/chosen": -0.9988861083984375, + "logps/rejected": -4.473105430603027, + "loss": 1.0022, + "odds_ratio_loss": 0.033389657735824585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09988861531019211, + "rewards/margins": 0.34742194414138794, + "rewards/rejected": -0.44731053709983826, + "sft_loss": 0.9988861083984375, + "step": 9010 + }, + { + "epoch": 0.7, + "grad_norm": 18.671756744384766, + "learning_rate": 2.074540225005691e-06, + "logits/chosen": -1.3484288454055786, + "logits/rejected": -0.8984023928642273, + "logps/chosen": -1.5796782970428467, + "logps/rejected": -6.037528991699219, + "loss": 1.5958, + "odds_ratio_loss": 0.16151759028434753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15796785056591034, + "rewards/margins": 0.4457850456237793, + "rewards/rejected": -0.6037529706954956, + "sft_loss": 1.5796782970428467, + "step": 9015 + }, + { + "epoch": 0.7, + "grad_norm": 6.736073970794678, + "learning_rate": 2.069548858183737e-06, + "logits/chosen": -1.3836395740509033, + "logits/rejected": -1.2181329727172852, + "logps/chosen": -0.72617107629776, + "logps/rejected": -9.326863288879395, + "loss": 0.7917, + "odds_ratio_loss": 0.654904842376709, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07261711359024048, + "rewards/margins": 0.8600692749023438, + "rewards/rejected": -0.932686448097229, + "sft_loss": 0.72617107629776, + "step": 9020 + }, + { + "epoch": 0.7, + "grad_norm": 5.815211772918701, + "learning_rate": 2.0645619357630037e-06, + "logits/chosen": -1.3405768871307373, + "logits/rejected": -1.0667366981506348, + "logps/chosen": -0.8995200395584106, + "logps/rejected": -10.469018936157227, + "loss": 0.8996, + "odds_ratio_loss": 0.0003613657027017325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0899519994854927, + "rewards/margins": 0.9569500088691711, + "rewards/rejected": -1.0469019412994385, + "sft_loss": 0.8995200395584106, + "step": 9025 + }, + { + "epoch": 0.7, + "grad_norm": 51.4861946105957, + "learning_rate": 2.059579465306791e-06, + "logits/chosen": -1.082763910293579, + "logits/rejected": -1.3052375316619873, + "logps/chosen": -0.7171798944473267, + "logps/rejected": -2.294360876083374, + "loss": 0.7315, + "odds_ratio_loss": 0.14310906827449799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0717179924249649, + "rewards/margins": 0.15771810710430145, + "rewards/rejected": -0.22943606972694397, + "sft_loss": 0.7171798944473267, + "step": 9030 + }, + { + "epoch": 0.7, + "grad_norm": 23.833728790283203, + "learning_rate": 2.0546014543716516e-06, + "logits/chosen": -1.336284875869751, + "logits/rejected": -1.3159081935882568, + "logps/chosen": -0.9601768255233765, + "logps/rejected": -4.6210761070251465, + "loss": 0.9913, + "odds_ratio_loss": 0.3113359808921814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09601768851280212, + "rewards/margins": 0.3660898804664612, + "rewards/rejected": -0.4621075689792633, + "sft_loss": 0.9601768255233765, + "step": 9035 + }, + { + "epoch": 0.7, + "grad_norm": 14.797272682189941, + "learning_rate": 2.0496279105073686e-06, + "logits/chosen": -1.3687362670898438, + "logits/rejected": -1.0789031982421875, + "logps/chosen": -0.9673604965209961, + "logps/rejected": -6.7939581871032715, + "loss": 0.9952, + "odds_ratio_loss": 0.27864471077919006, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09673605114221573, + "rewards/margins": 0.5826598405838013, + "rewards/rejected": -0.6793957948684692, + "sft_loss": 0.9673604965209961, + "step": 9040 + }, + { + "epoch": 0.7, + "grad_norm": 7.334042549133301, + "learning_rate": 2.0446588412569514e-06, + "logits/chosen": -1.1946535110473633, + "logits/rejected": -0.973946750164032, + "logps/chosen": -0.8864052891731262, + "logps/rejected": -4.6504411697387695, + "loss": 0.8992, + "odds_ratio_loss": 0.12813030183315277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08864052593708038, + "rewards/margins": 0.3764035701751709, + "rewards/rejected": -0.4650441110134125, + "sft_loss": 0.8864052891731262, + "step": 9045 + }, + { + "epoch": 0.7, + "grad_norm": 6.007212162017822, + "learning_rate": 2.0396942541566277e-06, + "logits/chosen": -1.289546012878418, + "logits/rejected": -1.0946879386901855, + "logps/chosen": -0.9183686971664429, + "logps/rejected": -5.516299724578857, + "loss": 0.936, + "odds_ratio_loss": 0.1759006530046463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09183688461780548, + "rewards/margins": 0.4597931504249573, + "rewards/rejected": -0.5516299605369568, + "sft_loss": 0.9183686971664429, + "step": 9050 + }, + { + "epoch": 0.7, + "grad_norm": 49.407833099365234, + "learning_rate": 2.034734156735823e-06, + "logits/chosen": -1.0109045505523682, + "logits/rejected": -1.3671051263809204, + "logps/chosen": -0.9036981463432312, + "logps/rejected": -11.408597946166992, + "loss": 0.9159, + "odds_ratio_loss": 0.12224564701318741, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0903698205947876, + "rewards/margins": 1.0504900217056274, + "rewards/rejected": -1.140859842300415, + "sft_loss": 0.9036981463432312, + "step": 9055 + }, + { + "epoch": 0.7, + "grad_norm": 50.97902297973633, + "learning_rate": 2.029778556517154e-06, + "logits/chosen": -1.2944746017456055, + "logits/rejected": -1.1573295593261719, + "logps/chosen": -0.816990077495575, + "logps/rejected": -16.013568878173828, + "loss": 0.8343, + "odds_ratio_loss": 0.17356497049331665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08169901371002197, + "rewards/margins": 1.519658088684082, + "rewards/rejected": -1.601357102394104, + "sft_loss": 0.816990077495575, + "step": 9060 + }, + { + "epoch": 0.71, + "grad_norm": 102.18575286865234, + "learning_rate": 2.0248274610164185e-06, + "logits/chosen": -1.3450043201446533, + "logits/rejected": -1.1237186193466187, + "logps/chosen": -1.0184475183486938, + "logps/rejected": -6.712457180023193, + "loss": 1.0302, + "odds_ratio_loss": 0.11714156717061996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10184475034475327, + "rewards/margins": 0.5694010853767395, + "rewards/rejected": -0.6712457537651062, + "sft_loss": 1.0184475183486938, + "step": 9065 + }, + { + "epoch": 0.71, + "grad_norm": 38.02252960205078, + "learning_rate": 2.019880877742581e-06, + "logits/chosen": -1.4429337978363037, + "logits/rejected": -1.2393356561660767, + "logps/chosen": -5.945528507232666, + "logps/rejected": -8.805086135864258, + "loss": 6.0357, + "odds_ratio_loss": 0.9014400243759155, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5945528149604797, + "rewards/margins": 0.2859557271003723, + "rewards/rejected": -0.880508542060852, + "sft_loss": 5.945528507232666, + "step": 9070 + }, + { + "epoch": 0.71, + "grad_norm": 70.94361877441406, + "learning_rate": 2.014938814197761e-06, + "logits/chosen": -1.3768596649169922, + "logits/rejected": -1.2406704425811768, + "logps/chosen": -1.871372938156128, + "logps/rejected": -3.2459359169006348, + "loss": 1.9491, + "odds_ratio_loss": 0.7770655751228333, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18713729083538055, + "rewards/margins": 0.1374562680721283, + "rewards/rejected": -0.32459357380867004, + "sft_loss": 1.871372938156128, + "step": 9075 + }, + { + "epoch": 0.71, + "grad_norm": 19.992162704467773, + "learning_rate": 2.0100012778772294e-06, + "logits/chosen": -1.3903619050979614, + "logits/rejected": -0.9029962420463562, + "logps/chosen": -2.460082769393921, + "logps/rejected": -4.92894983291626, + "loss": 2.5327, + "odds_ratio_loss": 0.7256797552108765, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.24600830674171448, + "rewards/margins": 0.24688668549060822, + "rewards/rejected": -0.4928949475288391, + "sft_loss": 2.460082769393921, + "step": 9080 + }, + { + "epoch": 0.71, + "grad_norm": 27.510286331176758, + "learning_rate": 2.0050682762693846e-06, + "logits/chosen": -1.3608497381210327, + "logits/rejected": -1.1397944688796997, + "logps/chosen": -0.7833740711212158, + "logps/rejected": -4.637581825256348, + "loss": 0.797, + "odds_ratio_loss": 0.13609935343265533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0783374086022377, + "rewards/margins": 0.3854207694530487, + "rewards/rejected": -0.4637581706047058, + "sft_loss": 0.7833740711212158, + "step": 9085 + }, + { + "epoch": 0.71, + "grad_norm": 28.414520263671875, + "learning_rate": 2.0001398168557508e-06, + "logits/chosen": -1.2943460941314697, + "logits/rejected": -0.8901378512382507, + "logps/chosen": -0.6564239263534546, + "logps/rejected": -3.8656997680664062, + "loss": 0.677, + "odds_ratio_loss": 0.20536451041698456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06564239412546158, + "rewards/margins": 0.32092761993408203, + "rewards/rejected": -0.3865699768066406, + "sft_loss": 0.6564239263534546, + "step": 9090 + }, + { + "epoch": 0.71, + "grad_norm": 5.669079780578613, + "learning_rate": 1.9952159071109594e-06, + "logits/chosen": -1.4045063257217407, + "logits/rejected": -0.9172590970993042, + "logps/chosen": -1.0156147480010986, + "logps/rejected": -3.5632667541503906, + "loss": 1.0359, + "odds_ratio_loss": 0.202461838722229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10156148672103882, + "rewards/margins": 0.25476521253585815, + "rewards/rejected": -0.356326699256897, + "sft_loss": 1.0156147480010986, + "step": 9095 + }, + { + "epoch": 0.71, + "grad_norm": 10.984932899475098, + "learning_rate": 1.990296554502749e-06, + "logits/chosen": -1.3105428218841553, + "logits/rejected": -0.997005820274353, + "logps/chosen": -1.2994569540023804, + "logps/rejected": -4.665297985076904, + "loss": 1.3316, + "odds_ratio_loss": 0.32166650891304016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12994569540023804, + "rewards/margins": 0.3365841209888458, + "rewards/rejected": -0.4665297865867615, + "sft_loss": 1.2994569540023804, + "step": 9100 + }, + { + "epoch": 0.71, + "grad_norm": 7.292508125305176, + "learning_rate": 1.9853817664919413e-06, + "logits/chosen": -1.3598629236221313, + "logits/rejected": -1.0822525024414062, + "logps/chosen": -0.8946825861930847, + "logps/rejected": -3.746201753616333, + "loss": 0.9321, + "odds_ratio_loss": 0.37457841634750366, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08946826308965683, + "rewards/margins": 0.2851519286632538, + "rewards/rejected": -0.3746201992034912, + "sft_loss": 0.8946825861930847, + "step": 9105 + }, + { + "epoch": 0.71, + "grad_norm": 9.447525024414062, + "learning_rate": 1.9804715505324346e-06, + "logits/chosen": -1.172864556312561, + "logits/rejected": -0.8160622715950012, + "logps/chosen": -1.010909914970398, + "logps/rejected": -2.0625815391540527, + "loss": 1.0415, + "odds_ratio_loss": 0.3055305480957031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10109099000692368, + "rewards/margins": 0.10516715049743652, + "rewards/rejected": -0.2062581479549408, + "sft_loss": 1.010909914970398, + "step": 9110 + }, + { + "epoch": 0.71, + "grad_norm": 6.003201007843018, + "learning_rate": 1.9755659140711965e-06, + "logits/chosen": -1.2514169216156006, + "logits/rejected": -1.43091881275177, + "logps/chosen": -0.5724098682403564, + "logps/rejected": -5.860909461975098, + "loss": 0.5743, + "odds_ratio_loss": 0.01881580427289009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.057240985333919525, + "rewards/margins": 0.5288499593734741, + "rewards/rejected": -0.5860909819602966, + "sft_loss": 0.5724098682403564, + "step": 9115 + }, + { + "epoch": 0.71, + "grad_norm": 4.640727519989014, + "learning_rate": 1.9706648645482464e-06, + "logits/chosen": -1.2182533740997314, + "logits/rejected": -1.101128339767456, + "logps/chosen": -0.9864501953125, + "logps/rejected": -10.166420936584473, + "loss": 0.996, + "odds_ratio_loss": 0.09508304297924042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09864500910043716, + "rewards/margins": 0.9179970622062683, + "rewards/rejected": -1.0166422128677368, + "sft_loss": 0.9864501953125, + "step": 9120 + }, + { + "epoch": 0.71, + "grad_norm": 13.459922790527344, + "learning_rate": 1.965768409396647e-06, + "logits/chosen": -1.087660551071167, + "logits/rejected": -1.2694013118743896, + "logps/chosen": -0.7529363632202148, + "logps/rejected": -5.429129600524902, + "loss": 0.7717, + "odds_ratio_loss": 0.18774135410785675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0752936378121376, + "rewards/margins": 0.4676193594932556, + "rewards/rejected": -0.5429129600524902, + "sft_loss": 0.7529363632202148, + "step": 9125 + }, + { + "epoch": 0.71, + "grad_norm": 5.044445037841797, + "learning_rate": 1.9608765560424976e-06, + "logits/chosen": -1.3213609457015991, + "logits/rejected": -1.0660345554351807, + "logps/chosen": -0.9869930148124695, + "logps/rejected": -9.335617065429688, + "loss": 0.9935, + "odds_ratio_loss": 0.06551004201173782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09869930893182755, + "rewards/margins": 0.8348624110221863, + "rewards/rejected": -0.9335616827011108, + "sft_loss": 0.9869930148124695, + "step": 9130 + }, + { + "epoch": 0.71, + "grad_norm": 8.95915412902832, + "learning_rate": 1.9559893119049127e-06, + "logits/chosen": -1.3892710208892822, + "logits/rejected": -1.130847692489624, + "logps/chosen": -0.8897703289985657, + "logps/rejected": -4.048529624938965, + "loss": 0.9171, + "odds_ratio_loss": 0.27287232875823975, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08897703886032104, + "rewards/margins": 0.31587594747543335, + "rewards/rejected": -0.404852956533432, + "sft_loss": 0.8897703289985657, + "step": 9135 + }, + { + "epoch": 0.71, + "grad_norm": 6.433979511260986, + "learning_rate": 1.9511066843960175e-06, + "logits/chosen": -1.3489278554916382, + "logits/rejected": -1.1770883798599243, + "logps/chosen": -1.082330584526062, + "logps/rejected": -4.711759567260742, + "loss": 1.109, + "odds_ratio_loss": 0.26716113090515137, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10823307186365128, + "rewards/margins": 0.3629428744316101, + "rewards/rejected": -0.47117599844932556, + "sft_loss": 1.082330584526062, + "step": 9140 + }, + { + "epoch": 0.71, + "grad_norm": 5.469911575317383, + "learning_rate": 1.9462286809209395e-06, + "logits/chosen": -1.4053490161895752, + "logits/rejected": -0.8818165063858032, + "logps/chosen": -0.7211654782295227, + "logps/rejected": -5.684488296508789, + "loss": 0.7272, + "odds_ratio_loss": 0.06042628735303879, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07211655378341675, + "rewards/margins": 0.49633222818374634, + "rewards/rejected": -0.5684488415718079, + "sft_loss": 0.7211654782295227, + "step": 9145 + }, + { + "epoch": 0.71, + "grad_norm": 6.692239761352539, + "learning_rate": 1.9413553088777894e-06, + "logits/chosen": -1.279656171798706, + "logits/rejected": -1.232208013534546, + "logps/chosen": -1.0628973245620728, + "logps/rejected": -9.918838500976562, + "loss": 1.0781, + "odds_ratio_loss": 0.1518601030111313, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10628972202539444, + "rewards/margins": 0.8855941891670227, + "rewards/rejected": -0.9918839335441589, + "sft_loss": 1.0628973245620728, + "step": 9150 + }, + { + "epoch": 0.71, + "grad_norm": 4.951191425323486, + "learning_rate": 1.9364865756576534e-06, + "logits/chosen": -1.3498833179473877, + "logits/rejected": -0.9882432222366333, + "logps/chosen": -1.00874662399292, + "logps/rejected": -3.104807138442993, + "loss": 1.0643, + "odds_ratio_loss": 0.5553382039070129, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10087466239929199, + "rewards/margins": 0.2096060812473297, + "rewards/rejected": -0.3104807436466217, + "sft_loss": 1.00874662399292, + "step": 9155 + }, + { + "epoch": 0.71, + "grad_norm": 7.204696178436279, + "learning_rate": 1.931622488644583e-06, + "logits/chosen": -1.3502788543701172, + "logits/rejected": -1.2425469160079956, + "logps/chosen": -0.7986493110656738, + "logps/rejected": -8.240682601928711, + "loss": 0.7989, + "odds_ratio_loss": 0.002235189313068986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07986493408679962, + "rewards/margins": 0.7442033886909485, + "rewards/rejected": -0.8240682482719421, + "sft_loss": 0.7986493110656738, + "step": 9160 + }, + { + "epoch": 0.71, + "grad_norm": 63.30450439453125, + "learning_rate": 1.9267630552155862e-06, + "logits/chosen": -1.3233058452606201, + "logits/rejected": -1.1473569869995117, + "logps/chosen": -1.1643285751342773, + "logps/rejected": -2.8467297554016113, + "loss": 1.2127, + "odds_ratio_loss": 0.4839521050453186, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11643286049365997, + "rewards/margins": 0.16824014484882355, + "rewards/rejected": -0.2846730053424835, + "sft_loss": 1.1643285751342773, + "step": 9165 + }, + { + "epoch": 0.71, + "grad_norm": 13.165739059448242, + "learning_rate": 1.92190828274061e-06, + "logits/chosen": -1.3316316604614258, + "logits/rejected": -0.3999803960323334, + "logps/chosen": -1.0170505046844482, + "logps/rejected": -3.586103916168213, + "loss": 1.0439, + "odds_ratio_loss": 0.2684716582298279, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10170505195856094, + "rewards/margins": 0.25690528750419617, + "rewards/rejected": -0.3586103618144989, + "sft_loss": 1.0170505046844482, + "step": 9170 + }, + { + "epoch": 0.71, + "grad_norm": 10.03995132446289, + "learning_rate": 1.917058178582532e-06, + "logits/chosen": -1.3869171142578125, + "logits/rejected": -0.897848904132843, + "logps/chosen": -1.0685797929763794, + "logps/rejected": -3.428459882736206, + "loss": 1.0797, + "odds_ratio_loss": 0.11104003340005875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10685797780752182, + "rewards/margins": 0.235988050699234, + "rewards/rejected": -0.34284600615501404, + "sft_loss": 1.0685797929763794, + "step": 9175 + }, + { + "epoch": 0.71, + "grad_norm": 7.961609840393066, + "learning_rate": 1.9122127500971525e-06, + "logits/chosen": -1.301819086074829, + "logits/rejected": -1.0404784679412842, + "logps/chosen": -0.975425124168396, + "logps/rejected": -4.277166843414307, + "loss": 1.0074, + "odds_ratio_loss": 0.31970852613449097, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09754252433776855, + "rewards/margins": 0.33017414808273315, + "rewards/rejected": -0.4277166724205017, + "sft_loss": 0.975425124168396, + "step": 9180 + }, + { + "epoch": 0.71, + "grad_norm": 97.14888763427734, + "learning_rate": 1.9073720046331777e-06, + "logits/chosen": -1.392052412033081, + "logits/rejected": -1.0715065002441406, + "logps/chosen": -1.0489897727966309, + "logps/rejected": -2.846344470977783, + "loss": 1.0661, + "odds_ratio_loss": 0.17116869986057281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10489897429943085, + "rewards/margins": 0.179735466837883, + "rewards/rejected": -0.28463444113731384, + "sft_loss": 1.0489897727966309, + "step": 9185 + }, + { + "epoch": 0.71, + "grad_norm": 26.383729934692383, + "learning_rate": 1.902535949532212e-06, + "logits/chosen": -1.3624117374420166, + "logits/rejected": -1.1035476922988892, + "logps/chosen": -0.8702263832092285, + "logps/rejected": -3.861776828765869, + "loss": 0.8804, + "odds_ratio_loss": 0.101753830909729, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08702263981103897, + "rewards/margins": 0.29915502667427063, + "rewards/rejected": -0.3861776888370514, + "sft_loss": 0.8702263832092285, + "step": 9190 + }, + { + "epoch": 0.72, + "grad_norm": 9.27212142944336, + "learning_rate": 1.8977045921287496e-06, + "logits/chosen": -1.3226594924926758, + "logits/rejected": -1.1544954776763916, + "logps/chosen": -0.8614295125007629, + "logps/rejected": -10.17005443572998, + "loss": 0.8677, + "odds_ratio_loss": 0.062495239078998566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08614294975996017, + "rewards/margins": 0.930862545967102, + "rewards/rejected": -1.0170055627822876, + "sft_loss": 0.8614295125007629, + "step": 9195 + }, + { + "epoch": 0.72, + "grad_norm": 4.527229309082031, + "learning_rate": 1.8928779397501561e-06, + "logits/chosen": -1.3080815076828003, + "logits/rejected": -0.7346660494804382, + "logps/chosen": -0.6970736980438232, + "logps/rejected": -8.263237953186035, + "loss": 0.7016, + "odds_ratio_loss": 0.045706018805503845, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06970737129449844, + "rewards/margins": 0.7566162943840027, + "rewards/rejected": -0.8263236880302429, + "sft_loss": 0.6970736980438232, + "step": 9200 + }, + { + "epoch": 0.72, + "grad_norm": 5.699766635894775, + "learning_rate": 1.888055999716661e-06, + "logits/chosen": -1.2307069301605225, + "logits/rejected": -1.1355819702148438, + "logps/chosen": -0.8993587493896484, + "logps/rejected": -11.566434860229492, + "loss": 0.8996, + "odds_ratio_loss": 0.0026048864237964153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08993588387966156, + "rewards/margins": 1.0667074918746948, + "rewards/rejected": -1.1566433906555176, + "sft_loss": 0.8993587493896484, + "step": 9205 + }, + { + "epoch": 0.72, + "grad_norm": 152.52505493164062, + "learning_rate": 1.883238779341352e-06, + "logits/chosen": -1.3559463024139404, + "logits/rejected": -1.3009026050567627, + "logps/chosen": -0.942069411277771, + "logps/rejected": -12.025169372558594, + "loss": 0.9462, + "odds_ratio_loss": 0.0411582887172699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09420694410800934, + "rewards/margins": 1.1083099842071533, + "rewards/rejected": -1.2025169134140015, + "sft_loss": 0.942069411277771, + "step": 9210 + }, + { + "epoch": 0.72, + "grad_norm": 7.183080673217773, + "learning_rate": 1.8784262859301534e-06, + "logits/chosen": -1.365206003189087, + "logits/rejected": -1.2650926113128662, + "logps/chosen": -1.0181998014450073, + "logps/rejected": -8.810084342956543, + "loss": 1.0332, + "odds_ratio_loss": 0.14968711137771606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1018199697136879, + "rewards/margins": 0.7791884541511536, + "rewards/rejected": -0.8810084462165833, + "sft_loss": 1.0181998014450073, + "step": 9215 + }, + { + "epoch": 0.72, + "grad_norm": 5.462626934051514, + "learning_rate": 1.8736185267818224e-06, + "logits/chosen": -1.4750196933746338, + "logits/rejected": -1.0687038898468018, + "logps/chosen": -0.9219584465026855, + "logps/rejected": -5.657142162322998, + "loss": 0.9329, + "odds_ratio_loss": 0.10907478630542755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09219583868980408, + "rewards/margins": 0.47351837158203125, + "rewards/rejected": -0.5657142400741577, + "sft_loss": 0.9219584465026855, + "step": 9220 + }, + { + "epoch": 0.72, + "grad_norm": 9.681036949157715, + "learning_rate": 1.8688155091879361e-06, + "logits/chosen": -1.2853978872299194, + "logits/rejected": -1.139392614364624, + "logps/chosen": -0.8742135167121887, + "logps/rejected": -7.485783576965332, + "loss": 0.8791, + "odds_ratio_loss": 0.04870065301656723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08742136508226395, + "rewards/margins": 0.6611570119857788, + "rewards/rejected": -0.7485784292221069, + "sft_loss": 0.8742135167121887, + "step": 9225 + }, + { + "epoch": 0.72, + "grad_norm": 319.069580078125, + "learning_rate": 1.8640172404328816e-06, + "logits/chosen": -1.2906922101974487, + "logits/rejected": -1.2166502475738525, + "logps/chosen": -1.2384088039398193, + "logps/rejected": -4.230016231536865, + "loss": 1.2526, + "odds_ratio_loss": 0.14192232489585876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12384088337421417, + "rewards/margins": 0.29916077852249146, + "rewards/rejected": -0.4230016767978668, + "sft_loss": 1.2384088039398193, + "step": 9230 + }, + { + "epoch": 0.72, + "grad_norm": 4.312925338745117, + "learning_rate": 1.8592237277938413e-06, + "logits/chosen": -1.2975214719772339, + "logits/rejected": -0.49361056089401245, + "logps/chosen": -0.8198236227035522, + "logps/rejected": -9.956624984741211, + "loss": 0.8327, + "odds_ratio_loss": 0.12906351685523987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08198236674070358, + "rewards/margins": 0.9136801958084106, + "rewards/rejected": -0.9956625699996948, + "sft_loss": 0.8198236227035522, + "step": 9235 + }, + { + "epoch": 0.72, + "grad_norm": 16.511611938476562, + "learning_rate": 1.8544349785407844e-06, + "logits/chosen": -1.1626781225204468, + "logits/rejected": -0.8125447034835815, + "logps/chosen": -0.6874169111251831, + "logps/rejected": -2.541872501373291, + "loss": 0.7038, + "odds_ratio_loss": 0.1637047976255417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06874169409275055, + "rewards/margins": 0.18544557690620422, + "rewards/rejected": -0.2541872560977936, + "sft_loss": 0.6874169111251831, + "step": 9240 + }, + { + "epoch": 0.72, + "grad_norm": 6.639622211456299, + "learning_rate": 1.8496509999364609e-06, + "logits/chosen": -1.3168201446533203, + "logits/rejected": -0.7880635857582092, + "logps/chosen": -0.7180159687995911, + "logps/rejected": -5.2311906814575195, + "loss": 0.7216, + "odds_ratio_loss": 0.0356980636715889, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07180158793926239, + "rewards/margins": 0.4513174593448639, + "rewards/rejected": -0.5231190323829651, + "sft_loss": 0.7180159687995911, + "step": 9245 + }, + { + "epoch": 0.72, + "grad_norm": 5.950743198394775, + "learning_rate": 1.8448717992363802e-06, + "logits/chosen": -1.481801152229309, + "logits/rejected": -1.1455045938491821, + "logps/chosen": -1.1666754484176636, + "logps/rejected": -9.948326110839844, + "loss": 1.1821, + "odds_ratio_loss": 0.15418633818626404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11666754633188248, + "rewards/margins": 0.8781651258468628, + "rewards/rejected": -0.9948326349258423, + "sft_loss": 1.1666754484176636, + "step": 9250 + }, + { + "epoch": 0.72, + "grad_norm": 5.432945251464844, + "learning_rate": 1.8400973836888048e-06, + "logits/chosen": -1.3602441549301147, + "logits/rejected": -1.0623838901519775, + "logps/chosen": -1.0072778463363647, + "logps/rejected": -8.661463737487793, + "loss": 1.0476, + "odds_ratio_loss": 0.4036317765712738, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10072778165340424, + "rewards/margins": 0.7654186487197876, + "rewards/rejected": -0.8661463856697083, + "sft_loss": 1.0072778463363647, + "step": 9255 + }, + { + "epoch": 0.72, + "grad_norm": 5.445363521575928, + "learning_rate": 1.8353277605347458e-06, + "logits/chosen": -1.3271220922470093, + "logits/rejected": -0.8071807026863098, + "logps/chosen": -0.8467211723327637, + "logps/rejected": -3.2516613006591797, + "loss": 0.8858, + "odds_ratio_loss": 0.3908676505088806, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08467211574316025, + "rewards/margins": 0.240494042634964, + "rewards/rejected": -0.32516616582870483, + "sft_loss": 0.8467211723327637, + "step": 9260 + }, + { + "epoch": 0.72, + "grad_norm": 49.54649353027344, + "learning_rate": 1.8305629370079403e-06, + "logits/chosen": -1.394164800643921, + "logits/rejected": -1.3578661680221558, + "logps/chosen": -1.0298556089401245, + "logps/rejected": -4.586734294891357, + "loss": 1.0392, + "odds_ratio_loss": 0.09359271079301834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10298557579517365, + "rewards/margins": 0.3556878864765167, + "rewards/rejected": -0.45867347717285156, + "sft_loss": 1.0298556089401245, + "step": 9265 + }, + { + "epoch": 0.72, + "grad_norm": 55.202701568603516, + "learning_rate": 1.8258029203348482e-06, + "logits/chosen": -1.2330883741378784, + "logits/rejected": -0.87456876039505, + "logps/chosen": -1.030839443206787, + "logps/rejected": -6.096185684204102, + "loss": 1.0388, + "odds_ratio_loss": 0.0793967917561531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10308394581079483, + "rewards/margins": 0.5065346360206604, + "rewards/rejected": -0.6096185445785522, + "sft_loss": 1.030839443206787, + "step": 9270 + }, + { + "epoch": 0.72, + "grad_norm": 25.153141021728516, + "learning_rate": 1.821047717734637e-06, + "logits/chosen": -1.3587119579315186, + "logits/rejected": -1.0884228944778442, + "logps/chosen": -1.9555097818374634, + "logps/rejected": -8.748655319213867, + "loss": 2.0443, + "odds_ratio_loss": 0.8876272439956665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.19555097818374634, + "rewards/margins": 0.6793144345283508, + "rewards/rejected": -0.8748654127120972, + "sft_loss": 1.9555097818374634, + "step": 9275 + }, + { + "epoch": 0.72, + "grad_norm": 6.199098110198975, + "learning_rate": 1.8162973364191794e-06, + "logits/chosen": -1.3529561758041382, + "logits/rejected": -0.5642414689064026, + "logps/chosen": -1.07791006565094, + "logps/rejected": -3.2078804969787598, + "loss": 1.1049, + "odds_ratio_loss": 0.27012649178504944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1077909916639328, + "rewards/margins": 0.21299704909324646, + "rewards/rejected": -0.32078805565834045, + "sft_loss": 1.07791006565094, + "step": 9280 + }, + { + "epoch": 0.72, + "grad_norm": 5.515949249267578, + "learning_rate": 1.8115517835930303e-06, + "logits/chosen": -1.309288740158081, + "logits/rejected": -1.2958306074142456, + "logps/chosen": -1.3788987398147583, + "logps/rejected": -6.528385162353516, + "loss": 1.3981, + "odds_ratio_loss": 0.1917024850845337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13788987696170807, + "rewards/margins": 0.5149486660957336, + "rewards/rejected": -0.6528385877609253, + "sft_loss": 1.3788987398147583, + "step": 9285 + }, + { + "epoch": 0.72, + "grad_norm": 22.566404342651367, + "learning_rate": 1.8068110664534217e-06, + "logits/chosen": -1.1723864078521729, + "logits/rejected": -1.353244423866272, + "logps/chosen": -1.0381544828414917, + "logps/rejected": -10.768172264099121, + "loss": 1.0382, + "odds_ratio_loss": 0.0005095123779028654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10381545126438141, + "rewards/margins": 0.9730018377304077, + "rewards/rejected": -1.076817274093628, + "sft_loss": 1.0381544828414917, + "step": 9290 + }, + { + "epoch": 0.72, + "grad_norm": 5.992241382598877, + "learning_rate": 1.802075192190254e-06, + "logits/chosen": -1.377455472946167, + "logits/rejected": -1.128418207168579, + "logps/chosen": -1.0655977725982666, + "logps/rejected": -7.172788143157959, + "loss": 1.0682, + "odds_ratio_loss": 0.02605503238737583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10655979067087173, + "rewards/margins": 0.6107190251350403, + "rewards/rejected": -0.7172788381576538, + "sft_loss": 1.0655977725982666, + "step": 9295 + }, + { + "epoch": 0.72, + "grad_norm": 7.843822956085205, + "learning_rate": 1.797344167986082e-06, + "logits/chosen": -1.3239879608154297, + "logits/rejected": -0.9022638201713562, + "logps/chosen": -1.0198582410812378, + "logps/rejected": -10.66782283782959, + "loss": 1.0446, + "odds_ratio_loss": 0.24729104340076447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10198582708835602, + "rewards/margins": 0.9647965431213379, + "rewards/rejected": -1.0667822360992432, + "sft_loss": 1.0198582410812378, + "step": 9300 + }, + { + "epoch": 0.72, + "grad_norm": 20.95368766784668, + "learning_rate": 1.7926180010161027e-06, + "logits/chosen": -1.2632579803466797, + "logits/rejected": -1.2453078031539917, + "logps/chosen": -0.8721585273742676, + "logps/rejected": -2.734140396118164, + "loss": 0.8955, + "odds_ratio_loss": 0.23379027843475342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.087215855717659, + "rewards/margins": 0.1861981898546219, + "rewards/rejected": -0.2734140455722809, + "sft_loss": 0.8721585273742676, + "step": 9305 + }, + { + "epoch": 0.72, + "grad_norm": 10.482601165771484, + "learning_rate": 1.7878966984481515e-06, + "logits/chosen": -1.3100192546844482, + "logits/rejected": -1.3720757961273193, + "logps/chosen": -1.2211915254592896, + "logps/rejected": -4.563811302185059, + "loss": 1.2738, + "odds_ratio_loss": 0.5262596011161804, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12211916595697403, + "rewards/margins": 0.334261953830719, + "rewards/rejected": -0.4563811421394348, + "sft_loss": 1.2211915254592896, + "step": 9310 + }, + { + "epoch": 0.72, + "grad_norm": 36.49144744873047, + "learning_rate": 1.7831802674426813e-06, + "logits/chosen": -1.2298425436019897, + "logits/rejected": -1.1529333591461182, + "logps/chosen": -0.874362587928772, + "logps/rejected": -10.53254222869873, + "loss": 0.8903, + "odds_ratio_loss": 0.15956975519657135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0874362587928772, + "rewards/margins": 0.9658180475234985, + "rewards/rejected": -1.053254246711731, + "sft_loss": 0.874362587928772, + "step": 9315 + }, + { + "epoch": 0.73, + "grad_norm": 6.323707103729248, + "learning_rate": 1.7784687151527574e-06, + "logits/chosen": -1.464393138885498, + "logits/rejected": -1.2067110538482666, + "logps/chosen": -1.0483952760696411, + "logps/rejected": -7.121232032775879, + "loss": 1.0594, + "odds_ratio_loss": 0.10971790552139282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10483952611684799, + "rewards/margins": 0.6072835922241211, + "rewards/rejected": -0.7121232151985168, + "sft_loss": 1.0483952760696411, + "step": 9320 + }, + { + "epoch": 0.73, + "grad_norm": 6.304421424865723, + "learning_rate": 1.7737620487240504e-06, + "logits/chosen": -1.199683666229248, + "logits/rejected": -0.8703166842460632, + "logps/chosen": -1.2598625421524048, + "logps/rejected": -6.0696187019348145, + "loss": 1.3126, + "odds_ratio_loss": 0.5278078317642212, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.125986248254776, + "rewards/margins": 0.4809756875038147, + "rewards/rejected": -0.6069619059562683, + "sft_loss": 1.2598625421524048, + "step": 9325 + }, + { + "epoch": 0.73, + "grad_norm": 6.901340961456299, + "learning_rate": 1.7690602752948155e-06, + "logits/chosen": -1.3581178188323975, + "logits/rejected": -1.2259575128555298, + "logps/chosen": -0.7316224575042725, + "logps/rejected": -8.98783016204834, + "loss": 0.7541, + "odds_ratio_loss": 0.22492511570453644, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0731622502207756, + "rewards/margins": 0.8256209492683411, + "rewards/rejected": -0.8987830877304077, + "sft_loss": 0.7316224575042725, + "step": 9330 + }, + { + "epoch": 0.73, + "grad_norm": 5.403962135314941, + "learning_rate": 1.7643634019958894e-06, + "logits/chosen": -1.3089879751205444, + "logits/rejected": -0.7208669781684875, + "logps/chosen": -0.830630898475647, + "logps/rejected": -7.895847320556641, + "loss": 0.8413, + "odds_ratio_loss": 0.10705997794866562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08306309580802917, + "rewards/margins": 0.7065216302871704, + "rewards/rejected": -0.789584755897522, + "sft_loss": 0.830630898475647, + "step": 9335 + }, + { + "epoch": 0.73, + "grad_norm": 59.112098693847656, + "learning_rate": 1.7596714359506762e-06, + "logits/chosen": -1.31985604763031, + "logits/rejected": -1.316774606704712, + "logps/chosen": -0.9269720911979675, + "logps/rejected": -4.943132400512695, + "loss": 0.9619, + "odds_ratio_loss": 0.34920763969421387, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09269721806049347, + "rewards/margins": 0.40161600708961487, + "rewards/rejected": -0.4943132996559143, + "sft_loss": 0.9269720911979675, + "step": 9340 + }, + { + "epoch": 0.73, + "grad_norm": 6.590602397918701, + "learning_rate": 1.754984384275139e-06, + "logits/chosen": -1.1757590770721436, + "logits/rejected": -1.2967216968536377, + "logps/chosen": -0.9774863123893738, + "logps/rejected": -16.89197540283203, + "loss": 0.9868, + "odds_ratio_loss": 0.09362256526947021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09774863719940186, + "rewards/margins": 1.5914490222930908, + "rewards/rejected": -1.6891975402832031, + "sft_loss": 0.9774863123893738, + "step": 9345 + }, + { + "epoch": 0.73, + "grad_norm": 7.168046951293945, + "learning_rate": 1.750302254077786e-06, + "logits/chosen": -1.3897745609283447, + "logits/rejected": -1.1414748430252075, + "logps/chosen": -0.7206248044967651, + "logps/rejected": -6.595806121826172, + "loss": 0.7491, + "odds_ratio_loss": 0.28505057096481323, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07206248492002487, + "rewards/margins": 0.5875180959701538, + "rewards/rejected": -0.6595805883407593, + "sft_loss": 0.7206248044967651, + "step": 9350 + }, + { + "epoch": 0.73, + "grad_norm": 12.575242042541504, + "learning_rate": 1.7456250524596607e-06, + "logits/chosen": -1.248130202293396, + "logits/rejected": -1.2025890350341797, + "logps/chosen": -0.7890609502792358, + "logps/rejected": -9.191629409790039, + "loss": 0.7918, + "odds_ratio_loss": 0.02735903300344944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0789061039686203, + "rewards/margins": 0.8402568697929382, + "rewards/rejected": -0.919162929058075, + "sft_loss": 0.7890609502792358, + "step": 9355 + }, + { + "epoch": 0.73, + "grad_norm": 9.834772109985352, + "learning_rate": 1.7409527865143366e-06, + "logits/chosen": -1.3604110479354858, + "logits/rejected": -1.3642244338989258, + "logps/chosen": -0.42751726508140564, + "logps/rejected": -6.097275733947754, + "loss": 0.4296, + "odds_ratio_loss": 0.020784597843885422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0427517332136631, + "rewards/margins": 0.5669758915901184, + "rewards/rejected": -0.6097276210784912, + "sft_loss": 0.42751726508140564, + "step": 9360 + }, + { + "epoch": 0.73, + "grad_norm": 10.374699592590332, + "learning_rate": 1.7362854633278963e-06, + "logits/chosen": -1.3058464527130127, + "logits/rejected": -1.0281599760055542, + "logps/chosen": -0.5291213393211365, + "logps/rejected": -1.087648630142212, + "loss": 0.5644, + "odds_ratio_loss": 0.352711021900177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.052912138402462006, + "rewards/margins": 0.05585271865129471, + "rewards/rejected": -0.10876486450433731, + "sft_loss": 0.5291213393211365, + "step": 9365 + }, + { + "epoch": 0.73, + "grad_norm": 303.52349853515625, + "learning_rate": 1.7316230899789266e-06, + "logits/chosen": -1.299647569656372, + "logits/rejected": -1.0840200185775757, + "logps/chosen": -1.3607500791549683, + "logps/rejected": -9.152244567871094, + "loss": 1.4027, + "odds_ratio_loss": 0.41933393478393555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.13607501983642578, + "rewards/margins": 0.7791494131088257, + "rewards/rejected": -0.9152244329452515, + "sft_loss": 1.3607500791549683, + "step": 9370 + }, + { + "epoch": 0.73, + "grad_norm": 5.656765937805176, + "learning_rate": 1.726965673538512e-06, + "logits/chosen": -1.362069845199585, + "logits/rejected": -0.7855729460716248, + "logps/chosen": -1.309899091720581, + "logps/rejected": -9.314172744750977, + "loss": 1.316, + "odds_ratio_loss": 0.060855478048324585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1309899091720581, + "rewards/margins": 0.800427258014679, + "rewards/rejected": -0.9314171671867371, + "sft_loss": 1.309899091720581, + "step": 9375 + }, + { + "epoch": 0.73, + "grad_norm": 18.312532424926758, + "learning_rate": 1.7223132210702142e-06, + "logits/chosen": -1.274558663368225, + "logits/rejected": -1.370939016342163, + "logps/chosen": -0.8942365646362305, + "logps/rejected": -11.986459732055664, + "loss": 0.8956, + "odds_ratio_loss": 0.013301363214850426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08942366391420364, + "rewards/margins": 1.109222173690796, + "rewards/rejected": -1.198645830154419, + "sft_loss": 0.8942365646362305, + "step": 9380 + }, + { + "epoch": 0.73, + "grad_norm": 5.234900951385498, + "learning_rate": 1.7176657396300667e-06, + "logits/chosen": -1.3498860597610474, + "logits/rejected": -1.177386999130249, + "logps/chosen": -1.1110543012619019, + "logps/rejected": -11.639063835144043, + "loss": 1.1182, + "odds_ratio_loss": 0.07124121487140656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11110544204711914, + "rewards/margins": 1.0528010129928589, + "rewards/rejected": -1.163906455039978, + "sft_loss": 1.1110543012619019, + "step": 9385 + }, + { + "epoch": 0.73, + "grad_norm": 6.4449238777160645, + "learning_rate": 1.7130232362665672e-06, + "logits/chosen": -1.3539700508117676, + "logits/rejected": -1.365997076034546, + "logps/chosen": -1.0939593315124512, + "logps/rejected": -4.255034923553467, + "loss": 1.1402, + "odds_ratio_loss": 0.4621972143650055, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10939594358205795, + "rewards/margins": 0.31610754132270813, + "rewards/rejected": -0.4255034923553467, + "sft_loss": 1.0939593315124512, + "step": 9390 + }, + { + "epoch": 0.73, + "grad_norm": 49.2803840637207, + "learning_rate": 1.7083857180206613e-06, + "logits/chosen": -1.3713133335113525, + "logits/rejected": -1.0905721187591553, + "logps/chosen": -0.9607345461845398, + "logps/rejected": -8.983610153198242, + "loss": 0.9681, + "odds_ratio_loss": 0.07401247322559357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0960734561085701, + "rewards/margins": 0.8022874593734741, + "rewards/rejected": -0.8983610272407532, + "sft_loss": 0.9607345461845398, + "step": 9395 + }, + { + "epoch": 0.73, + "grad_norm": 10.100704193115234, + "learning_rate": 1.7037531919257338e-06, + "logits/chosen": -1.3139148950576782, + "logits/rejected": -1.182771921157837, + "logps/chosen": -1.1720882654190063, + "logps/rejected": -11.94184684753418, + "loss": 1.1866, + "odds_ratio_loss": 0.14487013220787048, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1172088235616684, + "rewards/margins": 1.0769760608673096, + "rewards/rejected": -1.1941848993301392, + "sft_loss": 1.1720882654190063, + "step": 9400 + }, + { + "epoch": 0.73, + "grad_norm": 10.951863288879395, + "learning_rate": 1.6991256650075983e-06, + "logits/chosen": -1.39067542552948, + "logits/rejected": -0.8745479583740234, + "logps/chosen": -1.161098599433899, + "logps/rejected": -8.327936172485352, + "loss": 1.1635, + "odds_ratio_loss": 0.023716315627098083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11610986292362213, + "rewards/margins": 0.7166837453842163, + "rewards/rejected": -0.8327935934066772, + "sft_loss": 1.161098599433899, + "step": 9405 + }, + { + "epoch": 0.73, + "grad_norm": 33.24994659423828, + "learning_rate": 1.6945031442844872e-06, + "logits/chosen": -1.328922986984253, + "logits/rejected": -1.135591745376587, + "logps/chosen": -0.8492316007614136, + "logps/rejected": -5.809294700622559, + "loss": 0.8688, + "odds_ratio_loss": 0.19584044814109802, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0849231630563736, + "rewards/margins": 0.4960063099861145, + "rewards/rejected": -0.5809294581413269, + "sft_loss": 0.8492316007614136, + "step": 9410 + }, + { + "epoch": 0.73, + "grad_norm": 31.55966567993164, + "learning_rate": 1.6898856367670397e-06, + "logits/chosen": -1.1461572647094727, + "logits/rejected": -1.0324511528015137, + "logps/chosen": -0.9476990699768066, + "logps/rejected": -4.202122211456299, + "loss": 0.971, + "odds_ratio_loss": 0.2331872433423996, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0947699099779129, + "rewards/margins": 0.3254423439502716, + "rewards/rejected": -0.4202122688293457, + "sft_loss": 0.9476990699768066, + "step": 9415 + }, + { + "epoch": 0.73, + "grad_norm": 4.240586280822754, + "learning_rate": 1.6852731494582913e-06, + "logits/chosen": -1.3576472997665405, + "logits/rejected": -0.860035240650177, + "logps/chosen": -1.1766688823699951, + "logps/rejected": -4.946648597717285, + "loss": 1.2052, + "odds_ratio_loss": 0.28534621000289917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11766688525676727, + "rewards/margins": 0.3769979476928711, + "rewards/rejected": -0.4946648180484772, + "sft_loss": 1.1766688823699951, + "step": 9420 + }, + { + "epoch": 0.73, + "grad_norm": 7.833755016326904, + "learning_rate": 1.6806656893536672e-06, + "logits/chosen": -1.2453978061676025, + "logits/rejected": -0.9850869178771973, + "logps/chosen": -0.8818928003311157, + "logps/rejected": -4.74990177154541, + "loss": 0.9018, + "odds_ratio_loss": 0.1990598738193512, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08818928152322769, + "rewards/margins": 0.3868009150028229, + "rewards/rejected": -0.47499018907546997, + "sft_loss": 0.8818928003311157, + "step": 9425 + }, + { + "epoch": 0.73, + "grad_norm": 127.3130874633789, + "learning_rate": 1.6760632634409647e-06, + "logits/chosen": -1.1609935760498047, + "logits/rejected": -0.7955946326255798, + "logps/chosen": -1.1204373836517334, + "logps/rejected": -4.585291862487793, + "loss": 1.1288, + "odds_ratio_loss": 0.08315370976924896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11204373836517334, + "rewards/margins": 0.3464854657649994, + "rewards/rejected": -0.45852917432785034, + "sft_loss": 1.1204373836517334, + "step": 9430 + }, + { + "epoch": 0.73, + "grad_norm": 8.43161678314209, + "learning_rate": 1.6714658787003445e-06, + "logits/chosen": -1.2553160190582275, + "logits/rejected": -1.1255378723144531, + "logps/chosen": -1.2484910488128662, + "logps/rejected": -3.9797306060791016, + "loss": 1.2893, + "odds_ratio_loss": 0.4080115258693695, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1248491033911705, + "rewards/margins": 0.27312394976615906, + "rewards/rejected": -0.39797303080558777, + "sft_loss": 1.2484910488128662, + "step": 9435 + }, + { + "epoch": 0.73, + "grad_norm": 7.703319549560547, + "learning_rate": 1.6668735421043287e-06, + "logits/chosen": -1.2995141744613647, + "logits/rejected": -1.1087191104888916, + "logps/chosen": -1.626448392868042, + "logps/rejected": -11.731626510620117, + "loss": 1.638, + "odds_ratio_loss": 0.11596866697072983, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16264484822750092, + "rewards/margins": 1.0105178356170654, + "rewards/rejected": -1.173162817955017, + "sft_loss": 1.626448392868042, + "step": 9440 + }, + { + "epoch": 0.73, + "grad_norm": 86.04496002197266, + "learning_rate": 1.662286260617776e-06, + "logits/chosen": -1.345144271850586, + "logits/rejected": -1.0741063356399536, + "logps/chosen": -0.8128703832626343, + "logps/rejected": -9.815523147583008, + "loss": 0.8431, + "odds_ratio_loss": 0.30230069160461426, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08128704130649567, + "rewards/margins": 0.900265097618103, + "rewards/rejected": -0.9815523028373718, + "sft_loss": 0.8128703832626343, + "step": 9445 + }, + { + "epoch": 0.74, + "grad_norm": 7.98433256149292, + "learning_rate": 1.6577040411978817e-06, + "logits/chosen": -1.4675757884979248, + "logits/rejected": -1.2412128448486328, + "logps/chosen": -0.8189373016357422, + "logps/rejected": -7.274649143218994, + "loss": 0.8294, + "odds_ratio_loss": 0.10454756021499634, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08189372718334198, + "rewards/margins": 0.645571231842041, + "rewards/rejected": -0.7274649143218994, + "sft_loss": 0.8189373016357422, + "step": 9450 + }, + { + "epoch": 0.74, + "grad_norm": 7.043887138366699, + "learning_rate": 1.653126890794164e-06, + "logits/chosen": -1.4140623807907104, + "logits/rejected": -1.0005137920379639, + "logps/chosen": -0.7331684231758118, + "logps/rejected": -2.0889861583709717, + "loss": 0.7617, + "odds_ratio_loss": 0.2852046489715576, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07331683486700058, + "rewards/margins": 0.13558180630207062, + "rewards/rejected": -0.2088986337184906, + "sft_loss": 0.7331684231758118, + "step": 9455 + }, + { + "epoch": 0.74, + "grad_norm": 4.3914384841918945, + "learning_rate": 1.6485548163484511e-06, + "logits/chosen": -1.293859601020813, + "logits/rejected": -0.8690057992935181, + "logps/chosen": -0.7792251706123352, + "logps/rejected": -3.0021920204162598, + "loss": 0.8029, + "odds_ratio_loss": 0.23707649111747742, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0779225155711174, + "rewards/margins": 0.22229667007923126, + "rewards/rejected": -0.30021920800209045, + "sft_loss": 0.7792251706123352, + "step": 9460 + }, + { + "epoch": 0.74, + "grad_norm": 10.480997085571289, + "learning_rate": 1.643987824794876e-06, + "logits/chosen": -1.3863446712493896, + "logits/rejected": -1.0339863300323486, + "logps/chosen": -0.9150098562240601, + "logps/rejected": -6.108335018157959, + "loss": 0.921, + "odds_ratio_loss": 0.0596047043800354, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09150098264217377, + "rewards/margins": 0.5193325281143188, + "rewards/rejected": -0.6108335256576538, + "sft_loss": 0.9150098562240601, + "step": 9465 + }, + { + "epoch": 0.74, + "grad_norm": 6.933332920074463, + "learning_rate": 1.639425923059858e-06, + "logits/chosen": -1.2239768505096436, + "logits/rejected": -1.0536987781524658, + "logps/chosen": -1.0001404285430908, + "logps/rejected": -8.236177444458008, + "loss": 1.0256, + "odds_ratio_loss": 0.2543320059776306, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10001404583454132, + "rewards/margins": 0.7236037850379944, + "rewards/rejected": -0.8236177563667297, + "sft_loss": 1.0001404285430908, + "step": 9470 + }, + { + "epoch": 0.74, + "grad_norm": 7.526812553405762, + "learning_rate": 1.634869118062105e-06, + "logits/chosen": -1.223048210144043, + "logits/rejected": -1.3066279888153076, + "logps/chosen": -1.0369764566421509, + "logps/rejected": -10.110676765441895, + "loss": 1.0437, + "odds_ratio_loss": 0.06689582765102386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10369764268398285, + "rewards/margins": 0.9073699712753296, + "rewards/rejected": -1.0110676288604736, + "sft_loss": 1.0369764566421509, + "step": 9475 + }, + { + "epoch": 0.74, + "grad_norm": 10.411308288574219, + "learning_rate": 1.630317416712588e-06, + "logits/chosen": -0.7705143094062805, + "logits/rejected": -1.2210378646850586, + "logps/chosen": -0.8669899702072144, + "logps/rejected": -8.831012725830078, + "loss": 0.8733, + "odds_ratio_loss": 0.06326936930418015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0866990014910698, + "rewards/margins": 0.7964022755622864, + "rewards/rejected": -0.8831012845039368, + "sft_loss": 0.8669899702072144, + "step": 9480 + }, + { + "epoch": 0.74, + "grad_norm": 752.5221557617188, + "learning_rate": 1.6257708259145388e-06, + "logits/chosen": -1.4490292072296143, + "logits/rejected": -1.3708478212356567, + "logps/chosen": -2.4949989318847656, + "logps/rejected": -7.886415958404541, + "loss": 2.5058, + "odds_ratio_loss": 0.1081923395395279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2494998723268509, + "rewards/margins": 0.5391417145729065, + "rewards/rejected": -0.7886415719985962, + "sft_loss": 2.4949989318847656, + "step": 9485 + }, + { + "epoch": 0.74, + "grad_norm": 6.341780662536621, + "learning_rate": 1.621229352563442e-06, + "logits/chosen": -1.4050637483596802, + "logits/rejected": -0.7608194351196289, + "logps/chosen": -0.8335064053535461, + "logps/rejected": -9.597498893737793, + "loss": 0.8375, + "odds_ratio_loss": 0.03985415771603584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08335064351558685, + "rewards/margins": 0.8763993382453918, + "rewards/rejected": -0.9597498774528503, + "sft_loss": 0.8335064053535461, + "step": 9490 + }, + { + "epoch": 0.74, + "grad_norm": 21.7724552154541, + "learning_rate": 1.616693003547018e-06, + "logits/chosen": -1.3081724643707275, + "logits/rejected": -1.0145162343978882, + "logps/chosen": -0.870733380317688, + "logps/rejected": -7.0630693435668945, + "loss": 0.8841, + "odds_ratio_loss": 0.13338907063007355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08707333356142044, + "rewards/margins": 0.6192336082458496, + "rewards/rejected": -0.7063069343566895, + "sft_loss": 0.870733380317688, + "step": 9495 + }, + { + "epoch": 0.74, + "grad_norm": 12.79421615600586, + "learning_rate": 1.6121617857452138e-06, + "logits/chosen": -1.3996565341949463, + "logits/rejected": -1.309962511062622, + "logps/chosen": -0.990923285484314, + "logps/rejected": -1.5980056524276733, + "loss": 1.0392, + "odds_ratio_loss": 0.4825132489204407, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09909233450889587, + "rewards/margins": 0.06070823594927788, + "rewards/rejected": -0.15980055928230286, + "sft_loss": 0.990923285484314, + "step": 9500 + }, + { + "epoch": 0.74, + "grad_norm": 88.10269927978516, + "learning_rate": 1.6076357060301995e-06, + "logits/chosen": -1.284991979598999, + "logits/rejected": -1.2980594635009766, + "logps/chosen": -0.7582489848136902, + "logps/rejected": -4.3230133056640625, + "loss": 0.7667, + "odds_ratio_loss": 0.08428351581096649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07582490146160126, + "rewards/margins": 0.35647648572921753, + "rewards/rejected": -0.4323013722896576, + "sft_loss": 0.7582489848136902, + "step": 9505 + }, + { + "epoch": 0.74, + "grad_norm": 6.083242893218994, + "learning_rate": 1.6031147712663487e-06, + "logits/chosen": -1.2613264322280884, + "logits/rejected": -1.400712490081787, + "logps/chosen": -1.0994956493377686, + "logps/rejected": -8.263212203979492, + "loss": 1.1013, + "odds_ratio_loss": 0.01845996081829071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10994956642389297, + "rewards/margins": 0.7163716554641724, + "rewards/rejected": -0.8263211250305176, + "sft_loss": 1.0994956493377686, + "step": 9510 + }, + { + "epoch": 0.74, + "grad_norm": 6.732654094696045, + "learning_rate": 1.5985989883102343e-06, + "logits/chosen": -1.3507130146026611, + "logits/rejected": -1.2158911228179932, + "logps/chosen": -1.0793886184692383, + "logps/rejected": -7.2026190757751465, + "loss": 1.0963, + "odds_ratio_loss": 0.16960716247558594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10793887078762054, + "rewards/margins": 0.6123231053352356, + "rewards/rejected": -0.7202619314193726, + "sft_loss": 1.0793886184692383, + "step": 9515 + }, + { + "epoch": 0.74, + "grad_norm": 7.203071594238281, + "learning_rate": 1.5940883640106091e-06, + "logits/chosen": -1.3774728775024414, + "logits/rejected": -1.1365160942077637, + "logps/chosen": -0.8118964433670044, + "logps/rejected": -4.104200839996338, + "loss": 0.842, + "odds_ratio_loss": 0.30122292041778564, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08118964731693268, + "rewards/margins": 0.3292304277420044, + "rewards/rejected": -0.4104200303554535, + "sft_loss": 0.8118964433670044, + "step": 9520 + }, + { + "epoch": 0.74, + "grad_norm": 24.832763671875, + "learning_rate": 1.5895829052084132e-06, + "logits/chosen": -1.3976446390151978, + "logits/rejected": -1.1410542726516724, + "logps/chosen": -1.0454027652740479, + "logps/rejected": -4.984982490539551, + "loss": 1.0612, + "odds_ratio_loss": 0.15799330174922943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10454028844833374, + "rewards/margins": 0.3939579427242279, + "rewards/rejected": -0.49849826097488403, + "sft_loss": 1.0454027652740479, + "step": 9525 + }, + { + "epoch": 0.74, + "grad_norm": 23.104721069335938, + "learning_rate": 1.5850826187367452e-06, + "logits/chosen": -1.245971918106079, + "logits/rejected": -1.221071720123291, + "logps/chosen": -0.7174273729324341, + "logps/rejected": -4.505877494812012, + "loss": 0.73, + "odds_ratio_loss": 0.12609794735908508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07174274325370789, + "rewards/margins": 0.3788450062274933, + "rewards/rejected": -0.45058774948120117, + "sft_loss": 0.7174273729324341, + "step": 9530 + }, + { + "epoch": 0.74, + "grad_norm": 6.791812896728516, + "learning_rate": 1.5805875114208586e-06, + "logits/chosen": -1.4716187715530396, + "logits/rejected": -0.8964160680770874, + "logps/chosen": -0.8372504115104675, + "logps/rejected": -9.657258987426758, + "loss": 0.8378, + "odds_ratio_loss": 0.005763564258813858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08372504264116287, + "rewards/margins": 0.8820008039474487, + "rewards/rejected": -0.9657258987426758, + "sft_loss": 0.8372504115104675, + "step": 9535 + }, + { + "epoch": 0.74, + "grad_norm": 8.337750434875488, + "learning_rate": 1.5760975900781582e-06, + "logits/chosen": -1.3692501783370972, + "logits/rejected": -1.2153772115707397, + "logps/chosen": -0.7844537496566772, + "logps/rejected": -6.409913063049316, + "loss": 0.7908, + "odds_ratio_loss": 0.06374012678861618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07844537496566772, + "rewards/margins": 0.562545895576477, + "rewards/rejected": -0.6409912705421448, + "sft_loss": 0.7844537496566772, + "step": 9540 + }, + { + "epoch": 0.74, + "grad_norm": 9.413233757019043, + "learning_rate": 1.5716128615181786e-06, + "logits/chosen": -1.4353245496749878, + "logits/rejected": -1.217750072479248, + "logps/chosen": -1.0690791606903076, + "logps/rejected": -5.4481353759765625, + "loss": 1.0838, + "odds_ratio_loss": 0.1469026356935501, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.106907919049263, + "rewards/margins": 0.43790560960769653, + "rewards/rejected": -0.5448135137557983, + "sft_loss": 1.0690791606903076, + "step": 9545 + }, + { + "epoch": 0.74, + "grad_norm": 5.308843612670898, + "learning_rate": 1.5671333325425775e-06, + "logits/chosen": -1.2684317827224731, + "logits/rejected": -1.0575990676879883, + "logps/chosen": -0.9879388809204102, + "logps/rejected": -9.218892097473145, + "loss": 1.0006, + "odds_ratio_loss": 0.12698756158351898, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0987938940525055, + "rewards/margins": 0.8230952024459839, + "rewards/rejected": -0.9218891859054565, + "sft_loss": 0.9879388809204102, + "step": 9550 + }, + { + "epoch": 0.74, + "grad_norm": 18.182369232177734, + "learning_rate": 1.5626590099451329e-06, + "logits/chosen": -1.2446739673614502, + "logits/rejected": -1.1840369701385498, + "logps/chosen": -1.1093944311141968, + "logps/rejected": -9.49000358581543, + "loss": 1.1481, + "odds_ratio_loss": 0.3874703347682953, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11093944311141968, + "rewards/margins": 0.8380608558654785, + "rewards/rejected": -0.9490002393722534, + "sft_loss": 1.1093944311141968, + "step": 9555 + }, + { + "epoch": 0.74, + "grad_norm": 41.64733123779297, + "learning_rate": 1.5581899005117212e-06, + "logits/chosen": -1.0101242065429688, + "logits/rejected": -1.3759095668792725, + "logps/chosen": -0.942404568195343, + "logps/rejected": -7.333226680755615, + "loss": 0.9562, + "odds_ratio_loss": 0.13792428374290466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0942404493689537, + "rewards/margins": 0.6390821933746338, + "rewards/rejected": -0.7333226203918457, + "sft_loss": 0.942404568195343, + "step": 9560 + }, + { + "epoch": 0.74, + "grad_norm": 93.02265930175781, + "learning_rate": 1.553726011020315e-06, + "logits/chosen": -1.3022782802581787, + "logits/rejected": -0.844427227973938, + "logps/chosen": -0.8994660377502441, + "logps/rejected": -2.6767678260803223, + "loss": 0.9391, + "odds_ratio_loss": 0.39630061388015747, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08994659781455994, + "rewards/margins": 0.17773017287254333, + "rewards/rejected": -0.26767677068710327, + "sft_loss": 0.8994660377502441, + "step": 9565 + }, + { + "epoch": 0.74, + "grad_norm": 7.572153091430664, + "learning_rate": 1.5492673482409693e-06, + "logits/chosen": -1.386723518371582, + "logits/rejected": -1.2708041667938232, + "logps/chosen": -0.9327977895736694, + "logps/rejected": -8.727251052856445, + "loss": 0.9405, + "odds_ratio_loss": 0.07732054591178894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09327977895736694, + "rewards/margins": 0.7794452905654907, + "rewards/rejected": -0.8727251291275024, + "sft_loss": 0.9327977895736694, + "step": 9570 + }, + { + "epoch": 0.74, + "grad_norm": 6.863697052001953, + "learning_rate": 1.5448139189358114e-06, + "logits/chosen": -1.4256162643432617, + "logits/rejected": -0.9826396703720093, + "logps/chosen": -0.9701846837997437, + "logps/rejected": -4.691870212554932, + "loss": 0.9802, + "odds_ratio_loss": 0.1004917174577713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09701846539974213, + "rewards/margins": 0.37216854095458984, + "rewards/rejected": -0.4691869616508484, + "sft_loss": 0.9701846837997437, + "step": 9575 + }, + { + "epoch": 0.75, + "grad_norm": 22.540124893188477, + "learning_rate": 1.5403657298590335e-06, + "logits/chosen": -1.4844796657562256, + "logits/rejected": -1.193983793258667, + "logps/chosen": -0.8730993270874023, + "logps/rejected": -3.906749725341797, + "loss": 0.9163, + "odds_ratio_loss": 0.4321001470088959, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08730993419885635, + "rewards/margins": 0.3033650517463684, + "rewards/rejected": -0.39067500829696655, + "sft_loss": 0.8730993270874023, + "step": 9580 + }, + { + "epoch": 0.75, + "grad_norm": 56.619239807128906, + "learning_rate": 1.5359227877568766e-06, + "logits/chosen": -1.3769557476043701, + "logits/rejected": -0.7135136127471924, + "logps/chosen": -1.2803035974502563, + "logps/rejected": -6.4957756996154785, + "loss": 1.3857, + "odds_ratio_loss": 1.0542762279510498, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12803034484386444, + "rewards/margins": 0.5215471982955933, + "rewards/rejected": -0.6495774984359741, + "sft_loss": 1.2803035974502563, + "step": 9585 + }, + { + "epoch": 0.75, + "grad_norm": 8.903173446655273, + "learning_rate": 1.53148509936763e-06, + "logits/chosen": -1.2418177127838135, + "logits/rejected": -1.2458902597427368, + "logps/chosen": -0.7455726861953735, + "logps/rejected": -8.08189582824707, + "loss": 0.7642, + "odds_ratio_loss": 0.1859380453824997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07455727458000183, + "rewards/margins": 0.7336322069168091, + "rewards/rejected": -0.8081895112991333, + "sft_loss": 0.7455726861953735, + "step": 9590 + }, + { + "epoch": 0.75, + "grad_norm": 13.163384437561035, + "learning_rate": 1.5270526714216106e-06, + "logits/chosen": -1.321645975112915, + "logits/rejected": -1.3027857542037964, + "logps/chosen": -1.1771001815795898, + "logps/rejected": -24.939193725585938, + "loss": 1.1771, + "odds_ratio_loss": 0.00014036455831956118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11771001666784286, + "rewards/margins": 2.376209259033203, + "rewards/rejected": -2.4939193725585938, + "sft_loss": 1.1771001815795898, + "step": 9595 + }, + { + "epoch": 0.75, + "grad_norm": 22.36627960205078, + "learning_rate": 1.5226255106411553e-06, + "logits/chosen": -1.3507254123687744, + "logits/rejected": -1.065592646598816, + "logps/chosen": -0.9857769012451172, + "logps/rejected": -4.691664695739746, + "loss": 0.9978, + "odds_ratio_loss": 0.12072012573480606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09857769310474396, + "rewards/margins": 0.37058883905410767, + "rewards/rejected": -0.46916651725769043, + "sft_loss": 0.9857769012451172, + "step": 9600 + }, + { + "epoch": 0.75, + "grad_norm": 10.55109691619873, + "learning_rate": 1.51820362374062e-06, + "logits/chosen": -1.4908877611160278, + "logits/rejected": -1.7430016994476318, + "logps/chosen": -0.9867733120918274, + "logps/rejected": -8.725939750671387, + "loss": 0.9872, + "odds_ratio_loss": 0.0038646336179226637, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09867732971906662, + "rewards/margins": 0.7739167213439941, + "rewards/rejected": -0.8725940585136414, + "sft_loss": 0.9867733120918274, + "step": 9605 + }, + { + "epoch": 0.75, + "grad_norm": 12.481772422790527, + "learning_rate": 1.5137870174263547e-06, + "logits/chosen": -1.3180853128433228, + "logits/rejected": -1.3651639223098755, + "logps/chosen": -0.7774752974510193, + "logps/rejected": -2.9693262577056885, + "loss": 0.795, + "odds_ratio_loss": 0.17541441321372986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07774752378463745, + "rewards/margins": 0.21918511390686035, + "rewards/rejected": -0.2969326376914978, + "sft_loss": 0.7774752974510193, + "step": 9610 + }, + { + "epoch": 0.75, + "grad_norm": 25.910600662231445, + "learning_rate": 1.5093756983967035e-06, + "logits/chosen": -1.425577163696289, + "logits/rejected": -1.1789932250976562, + "logps/chosen": -1.0428050756454468, + "logps/rejected": -3.9875640869140625, + "loss": 1.0858, + "odds_ratio_loss": 0.42989516258239746, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1042805090546608, + "rewards/margins": 0.2944759130477905, + "rewards/rejected": -0.3987564146518707, + "sft_loss": 1.0428050756454468, + "step": 9615 + }, + { + "epoch": 0.75, + "grad_norm": 11.152374267578125, + "learning_rate": 1.5049696733419938e-06, + "logits/chosen": -1.2368385791778564, + "logits/rejected": -0.9455921053886414, + "logps/chosen": -1.049968957901001, + "logps/rejected": -3.453242778778076, + "loss": 1.0725, + "odds_ratio_loss": 0.22488944232463837, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10499688237905502, + "rewards/margins": 0.2403274029493332, + "rewards/rejected": -0.34532430768013, + "sft_loss": 1.049968957901001, + "step": 9620 + }, + { + "epoch": 0.75, + "grad_norm": 7.991733551025391, + "learning_rate": 1.5005689489445208e-06, + "logits/chosen": -1.086672067642212, + "logits/rejected": -1.2593472003936768, + "logps/chosen": -1.281997561454773, + "logps/rejected": -5.90949010848999, + "loss": 1.2998, + "odds_ratio_loss": 0.17808976769447327, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1281997561454773, + "rewards/margins": 0.46274930238723755, + "rewards/rejected": -0.5909490585327148, + "sft_loss": 1.281997561454773, + "step": 9625 + }, + { + "epoch": 0.75, + "grad_norm": 18.611215591430664, + "learning_rate": 1.4961735318785415e-06, + "logits/chosen": -1.2058672904968262, + "logits/rejected": -1.2779263257980347, + "logps/chosen": -1.0617318153381348, + "logps/rejected": -3.314312696456909, + "loss": 1.1067, + "odds_ratio_loss": 0.4496592879295349, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10617318004369736, + "rewards/margins": 0.22525811195373535, + "rewards/rejected": -0.3314313292503357, + "sft_loss": 1.0617318153381348, + "step": 9630 + }, + { + "epoch": 0.75, + "grad_norm": 4.113557815551758, + "learning_rate": 1.4917834288102646e-06, + "logits/chosen": -1.3224985599517822, + "logits/rejected": -0.7819467782974243, + "logps/chosen": -1.22843337059021, + "logps/rejected": -7.48949670791626, + "loss": 1.2421, + "odds_ratio_loss": 0.13632090389728546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12284334748983383, + "rewards/margins": 0.6261062622070312, + "rewards/rejected": -0.7489496469497681, + "sft_loss": 1.22843337059021, + "step": 9635 + }, + { + "epoch": 0.75, + "grad_norm": 9.185248374938965, + "learning_rate": 1.4873986463978386e-06, + "logits/chosen": -1.2555792331695557, + "logits/rejected": -1.1100329160690308, + "logps/chosen": -1.0447484254837036, + "logps/rejected": -4.491857528686523, + "loss": 1.0741, + "odds_ratio_loss": 0.29385143518447876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10447484254837036, + "rewards/margins": 0.34471091628074646, + "rewards/rejected": -0.4491857588291168, + "sft_loss": 1.0447484254837036, + "step": 9640 + }, + { + "epoch": 0.75, + "grad_norm": 5.883439540863037, + "learning_rate": 1.4830191912913421e-06, + "logits/chosen": -1.2748113870620728, + "logits/rejected": -1.396545648574829, + "logps/chosen": -1.0344129800796509, + "logps/rejected": -6.886133670806885, + "loss": 1.0502, + "odds_ratio_loss": 0.1582815945148468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10344129800796509, + "rewards/margins": 0.5851720571517944, + "rewards/rejected": -0.6886133551597595, + "sft_loss": 1.0344129800796509, + "step": 9645 + }, + { + "epoch": 0.75, + "grad_norm": 23.18375587463379, + "learning_rate": 1.4786450701327742e-06, + "logits/chosen": -1.3845680952072144, + "logits/rejected": -0.9784570932388306, + "logps/chosen": -0.9990909695625305, + "logps/rejected": -5.884314060211182, + "loss": 1.0163, + "odds_ratio_loss": 0.1719394028186798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09990908950567245, + "rewards/margins": 0.4885222911834717, + "rewards/rejected": -0.5884313583374023, + "sft_loss": 0.9990909695625305, + "step": 9650 + }, + { + "epoch": 0.75, + "grad_norm": 7.935603141784668, + "learning_rate": 1.4742762895560476e-06, + "logits/chosen": -1.3818585872650146, + "logits/rejected": -1.1174324750900269, + "logps/chosen": -0.7408406138420105, + "logps/rejected": -3.53485107421875, + "loss": 0.7542, + "odds_ratio_loss": 0.13402345776557922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07408406585454941, + "rewards/margins": 0.279401034116745, + "rewards/rejected": -0.353485107421875, + "sft_loss": 0.7408406138420105, + "step": 9655 + }, + { + "epoch": 0.75, + "grad_norm": 9.305075645446777, + "learning_rate": 1.4699128561869708e-06, + "logits/chosen": -1.294163703918457, + "logits/rejected": -0.9440513849258423, + "logps/chosen": -1.1079423427581787, + "logps/rejected": -6.321938991546631, + "loss": 1.1293, + "odds_ratio_loss": 0.21366176009178162, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11079423129558563, + "rewards/margins": 0.5213996171951294, + "rewards/rejected": -0.632193922996521, + "sft_loss": 1.1079423427581787, + "step": 9660 + }, + { + "epoch": 0.75, + "grad_norm": 23.341026306152344, + "learning_rate": 1.4655547766432437e-06, + "logits/chosen": -1.4045528173446655, + "logits/rejected": -1.3434903621673584, + "logps/chosen": -1.1345020532608032, + "logps/rejected": -7.883443355560303, + "loss": 1.1512, + "odds_ratio_loss": 0.16651661694049835, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11345021426677704, + "rewards/margins": 0.6748942136764526, + "rewards/rejected": -0.7883443236351013, + "sft_loss": 1.1345020532608032, + "step": 9665 + }, + { + "epoch": 0.75, + "grad_norm": 9.361612319946289, + "learning_rate": 1.4612020575344499e-06, + "logits/chosen": -1.3855892419815063, + "logits/rejected": -0.5439692139625549, + "logps/chosen": -0.8594552874565125, + "logps/rejected": -5.760911464691162, + "loss": 0.8727, + "odds_ratio_loss": 0.13233062624931335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08594552427530289, + "rewards/margins": 0.49014562368392944, + "rewards/rejected": -0.5760911703109741, + "sft_loss": 0.8594552874565125, + "step": 9670 + }, + { + "epoch": 0.75, + "grad_norm": 39.15553665161133, + "learning_rate": 1.4568547054620392e-06, + "logits/chosen": -1.3476572036743164, + "logits/rejected": -0.8295741081237793, + "logps/chosen": -1.328378438949585, + "logps/rejected": -2.901322841644287, + "loss": 1.3614, + "odds_ratio_loss": 0.32974696159362793, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13283784687519073, + "rewards/margins": 0.15729445219039917, + "rewards/rejected": -0.2901323139667511, + "sft_loss": 1.328378438949585, + "step": 9675 + }, + { + "epoch": 0.75, + "grad_norm": 8.7423677444458, + "learning_rate": 1.452512727019323e-06, + "logits/chosen": -1.3339862823486328, + "logits/rejected": -0.887398898601532, + "logps/chosen": -0.948280930519104, + "logps/rejected": -6.027867317199707, + "loss": 0.9603, + "odds_ratio_loss": 0.12048999965190887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09482809156179428, + "rewards/margins": 0.5079585909843445, + "rewards/rejected": -0.6027867197990417, + "sft_loss": 0.948280930519104, + "step": 9680 + }, + { + "epoch": 0.75, + "grad_norm": 25.774412155151367, + "learning_rate": 1.4481761287914625e-06, + "logits/chosen": -1.239070177078247, + "logits/rejected": -1.229046106338501, + "logps/chosen": -1.0665532350540161, + "logps/rejected": -4.786002159118652, + "loss": 1.0929, + "odds_ratio_loss": 0.2636137902736664, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1066553145647049, + "rewards/margins": 0.3719449043273926, + "rewards/rejected": -0.4786002039909363, + "sft_loss": 1.0665532350540161, + "step": 9685 + }, + { + "epoch": 0.75, + "grad_norm": 26.393348693847656, + "learning_rate": 1.4438449173554597e-06, + "logits/chosen": -1.2464635372161865, + "logits/rejected": -1.4154313802719116, + "logps/chosen": -1.031361699104309, + "logps/rejected": -4.2127685546875, + "loss": 1.0615, + "odds_ratio_loss": 0.30091702938079834, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10313616693019867, + "rewards/margins": 0.3181407153606415, + "rewards/rejected": -0.42127689719200134, + "sft_loss": 1.031361699104309, + "step": 9690 + }, + { + "epoch": 0.75, + "grad_norm": 6.652644157409668, + "learning_rate": 1.4395190992801456e-06, + "logits/chosen": -1.371250033378601, + "logits/rejected": -0.7872442603111267, + "logps/chosen": -1.035787582397461, + "logps/rejected": -13.284795761108398, + "loss": 1.0411, + "odds_ratio_loss": 0.05353992059826851, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10357876867055893, + "rewards/margins": 1.2249009609222412, + "rewards/rejected": -1.3284796476364136, + "sft_loss": 1.035787582397461, + "step": 9695 + }, + { + "epoch": 0.75, + "grad_norm": 9.1168794631958, + "learning_rate": 1.4351986811261753e-06, + "logits/chosen": -1.2818351984024048, + "logits/rejected": -1.6169687509536743, + "logps/chosen": -0.7868584990501404, + "logps/rejected": -10.137491226196289, + "loss": 0.788, + "odds_ratio_loss": 0.01179027371108532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07868585735559464, + "rewards/margins": 0.9350631833076477, + "rewards/rejected": -1.0137490034103394, + "sft_loss": 0.7868584990501404, + "step": 9700 + }, + { + "epoch": 0.75, + "grad_norm": 9.317264556884766, + "learning_rate": 1.43088366944601e-06, + "logits/chosen": -1.3212807178497314, + "logits/rejected": -1.1130956411361694, + "logps/chosen": -0.832508385181427, + "logps/rejected": -10.450431823730469, + "loss": 0.8588, + "odds_ratio_loss": 0.2629583179950714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08325083553791046, + "rewards/margins": 0.9617922902107239, + "rewards/rejected": -1.0450432300567627, + "sft_loss": 0.832508385181427, + "step": 9705 + }, + { + "epoch": 0.76, + "grad_norm": 9.495530128479004, + "learning_rate": 1.4265740707839127e-06, + "logits/chosen": -1.3654181957244873, + "logits/rejected": -1.3107173442840576, + "logps/chosen": -1.4224342107772827, + "logps/rejected": -12.950488090515137, + "loss": 1.4268, + "odds_ratio_loss": 0.04388565570116043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.142243430018425, + "rewards/margins": 1.1528054475784302, + "rewards/rejected": -1.2950489521026611, + "sft_loss": 1.4224342107772827, + "step": 9710 + }, + { + "epoch": 0.76, + "grad_norm": 8.009198188781738, + "learning_rate": 1.4222698916759347e-06, + "logits/chosen": -1.3352587223052979, + "logits/rejected": -1.2217845916748047, + "logps/chosen": -0.9053753018379211, + "logps/rejected": -6.1464009284973145, + "loss": 0.9398, + "odds_ratio_loss": 0.3441086411476135, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09053752571344376, + "rewards/margins": 0.5241025686264038, + "rewards/rejected": -0.6146401166915894, + "sft_loss": 0.9053753018379211, + "step": 9715 + }, + { + "epoch": 0.76, + "grad_norm": 46.444496154785156, + "learning_rate": 1.4179711386499145e-06, + "logits/chosen": -1.1560405492782593, + "logits/rejected": -1.4345853328704834, + "logps/chosen": -1.78522527217865, + "logps/rejected": -6.893980503082275, + "loss": 1.8423, + "odds_ratio_loss": 0.5703558921813965, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17852254211902618, + "rewards/margins": 0.5108754634857178, + "rewards/rejected": -0.689397931098938, + "sft_loss": 1.78522527217865, + "step": 9720 + }, + { + "epoch": 0.76, + "grad_norm": 13.907815933227539, + "learning_rate": 1.413677818225454e-06, + "logits/chosen": -1.3137757778167725, + "logits/rejected": -1.4432036876678467, + "logps/chosen": -0.8441891670227051, + "logps/rejected": -4.315835952758789, + "loss": 0.8707, + "odds_ratio_loss": 0.2653736472129822, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08441893011331558, + "rewards/margins": 0.34716469049453735, + "rewards/rejected": -0.43158358335494995, + "sft_loss": 0.8441891670227051, + "step": 9725 + }, + { + "epoch": 0.76, + "grad_norm": 60.72998046875, + "learning_rate": 1.409389936913918e-06, + "logits/chosen": -1.379272699356079, + "logits/rejected": -1.3644345998764038, + "logps/chosen": -1.0330593585968018, + "logps/rejected": -6.766173362731934, + "loss": 1.0723, + "odds_ratio_loss": 0.39229878783226013, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10330593585968018, + "rewards/margins": 0.573311448097229, + "rewards/rejected": -0.6766173839569092, + "sft_loss": 1.0330593585968018, + "step": 9730 + }, + { + "epoch": 0.76, + "grad_norm": 9.962238311767578, + "learning_rate": 1.4051075012184262e-06, + "logits/chosen": -1.0592668056488037, + "logits/rejected": -1.1845660209655762, + "logps/chosen": -1.181274175643921, + "logps/rejected": -4.246553897857666, + "loss": 1.2449, + "odds_ratio_loss": 0.6367109417915344, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11812742054462433, + "rewards/margins": 0.30652791261672974, + "rewards/rejected": -0.42465537786483765, + "sft_loss": 1.181274175643921, + "step": 9735 + }, + { + "epoch": 0.76, + "grad_norm": 5.543389797210693, + "learning_rate": 1.4008305176338337e-06, + "logits/chosen": -1.1186319589614868, + "logits/rejected": -0.8160092234611511, + "logps/chosen": -0.6612199544906616, + "logps/rejected": -6.092648506164551, + "loss": 0.6635, + "odds_ratio_loss": 0.02256493642926216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06612200289964676, + "rewards/margins": 0.5431429147720337, + "rewards/rejected": -0.6092648506164551, + "sft_loss": 0.6612199544906616, + "step": 9740 + }, + { + "epoch": 0.76, + "grad_norm": 7.234911918640137, + "learning_rate": 1.39655899264673e-06, + "logits/chosen": -1.37042236328125, + "logits/rejected": -0.9824494123458862, + "logps/chosen": -0.8050609827041626, + "logps/rejected": -4.500408172607422, + "loss": 0.8205, + "odds_ratio_loss": 0.1539815068244934, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0805060938000679, + "rewards/margins": 0.36953476071357727, + "rewards/rejected": -0.45004087686538696, + "sft_loss": 0.8050609827041626, + "step": 9745 + }, + { + "epoch": 0.76, + "grad_norm": 15.264321327209473, + "learning_rate": 1.3922929327354245e-06, + "logits/chosen": -1.4009451866149902, + "logits/rejected": -1.0183489322662354, + "logps/chosen": -1.019995927810669, + "logps/rejected": -4.73303747177124, + "loss": 1.0445, + "odds_ratio_loss": 0.24479708075523376, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10199960321187973, + "rewards/margins": 0.37130412459373474, + "rewards/rejected": -0.47330373525619507, + "sft_loss": 1.019995927810669, + "step": 9750 + }, + { + "epoch": 0.76, + "grad_norm": 30.697858810424805, + "learning_rate": 1.388032344369939e-06, + "logits/chosen": -1.3964107036590576, + "logits/rejected": -1.2231998443603516, + "logps/chosen": -0.7747622728347778, + "logps/rejected": -4.604029655456543, + "loss": 0.7883, + "odds_ratio_loss": 0.13550665974617004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07747622579336166, + "rewards/margins": 0.3829267621040344, + "rewards/rejected": -0.4604029655456543, + "sft_loss": 0.7747622728347778, + "step": 9755 + }, + { + "epoch": 0.76, + "grad_norm": 100.62313842773438, + "learning_rate": 1.3837772340119959e-06, + "logits/chosen": -1.1030269861221313, + "logits/rejected": -0.8924986124038696, + "logps/chosen": -1.0514342784881592, + "logps/rejected": -3.5023257732391357, + "loss": 1.1183, + "odds_ratio_loss": 0.6688185930252075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10514342784881592, + "rewards/margins": 0.24508914351463318, + "rewards/rejected": -0.3502325713634491, + "sft_loss": 1.0514342784881592, + "step": 9760 + }, + { + "epoch": 0.76, + "grad_norm": 6.77316427230835, + "learning_rate": 1.37952760811501e-06, + "logits/chosen": -1.3329914808273315, + "logits/rejected": -1.0536625385284424, + "logps/chosen": -1.2876416444778442, + "logps/rejected": -7.932206153869629, + "loss": 1.289, + "odds_ratio_loss": 0.013573619537055492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12876416742801666, + "rewards/margins": 0.6644564270973206, + "rewards/rejected": -0.793220579624176, + "sft_loss": 1.2876416444778442, + "step": 9765 + }, + { + "epoch": 0.76, + "grad_norm": 20.56844139099121, + "learning_rate": 1.375283473124081e-06, + "logits/chosen": -1.3427093029022217, + "logits/rejected": -1.0028154850006104, + "logps/chosen": -1.0161627531051636, + "logps/rejected": -6.620965480804443, + "loss": 1.0465, + "odds_ratio_loss": 0.30296653509140015, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1016162857413292, + "rewards/margins": 0.5604802966117859, + "rewards/rejected": -0.6620966196060181, + "sft_loss": 1.0161627531051636, + "step": 9770 + }, + { + "epoch": 0.76, + "grad_norm": 6.547877311706543, + "learning_rate": 1.371044835475977e-06, + "logits/chosen": -1.4146158695220947, + "logits/rejected": -0.9940034747123718, + "logps/chosen": -0.9952043294906616, + "logps/rejected": -5.0639848709106445, + "loss": 1.0029, + "odds_ratio_loss": 0.07706589251756668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09952044486999512, + "rewards/margins": 0.4068779945373535, + "rewards/rejected": -0.5063984990119934, + "sft_loss": 0.9952043294906616, + "step": 9775 + }, + { + "epoch": 0.76, + "grad_norm": 52.96457290649414, + "learning_rate": 1.3668117015991284e-06, + "logits/chosen": -1.3061707019805908, + "logits/rejected": -1.382652997970581, + "logps/chosen": -1.1862385272979736, + "logps/rejected": -5.084000587463379, + "loss": 1.2115, + "odds_ratio_loss": 0.2521643042564392, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11862385272979736, + "rewards/margins": 0.38977622985839844, + "rewards/rejected": -0.5084000825881958, + "sft_loss": 1.1862385272979736, + "step": 9780 + }, + { + "epoch": 0.76, + "grad_norm": 11.038567543029785, + "learning_rate": 1.3625840779136235e-06, + "logits/chosen": -1.4083950519561768, + "logits/rejected": -1.0077704191207886, + "logps/chosen": -0.9693604707717896, + "logps/rejected": -7.7081298828125, + "loss": 0.9708, + "odds_ratio_loss": 0.014776247553527355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09693604707717896, + "rewards/margins": 0.673876941204071, + "rewards/rejected": -0.7708130478858948, + "sft_loss": 0.9693604707717896, + "step": 9785 + }, + { + "epoch": 0.76, + "grad_norm": 4.925965785980225, + "learning_rate": 1.358361970831188e-06, + "logits/chosen": -1.3355958461761475, + "logits/rejected": -0.8679085969924927, + "logps/chosen": -0.9071223139762878, + "logps/rejected": -7.300946235656738, + "loss": 0.9184, + "odds_ratio_loss": 0.11292095482349396, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09071223437786102, + "rewards/margins": 0.6393824219703674, + "rewards/rejected": -0.7300946116447449, + "sft_loss": 0.9071223139762878, + "step": 9790 + }, + { + "epoch": 0.76, + "grad_norm": 14.827608108520508, + "learning_rate": 1.3541453867551851e-06, + "logits/chosen": -1.3884003162384033, + "logits/rejected": -1.0902457237243652, + "logps/chosen": -1.028895378112793, + "logps/rejected": -4.301610946655273, + "loss": 1.0612, + "odds_ratio_loss": 0.32345858216285706, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1028895378112793, + "rewards/margins": 0.32727161049842834, + "rewards/rejected": -0.43016114830970764, + "sft_loss": 1.028895378112793, + "step": 9795 + }, + { + "epoch": 0.76, + "grad_norm": 327.2066650390625, + "learning_rate": 1.3499343320805986e-06, + "logits/chosen": -1.3086704015731812, + "logits/rejected": -0.8594743013381958, + "logps/chosen": -1.4803388118743896, + "logps/rejected": -8.974512100219727, + "loss": 1.4996, + "odds_ratio_loss": 0.1922084391117096, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14803388714790344, + "rewards/margins": 0.7494173049926758, + "rewards/rejected": -0.8974512219429016, + "sft_loss": 1.4803388118743896, + "step": 9800 + }, + { + "epoch": 0.76, + "grad_norm": 3.762913227081299, + "learning_rate": 1.3457288131940276e-06, + "logits/chosen": -1.0716235637664795, + "logits/rejected": -1.4478861093521118, + "logps/chosen": -0.8366058468818665, + "logps/rejected": -10.235267639160156, + "loss": 0.8395, + "odds_ratio_loss": 0.029129063710570335, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08366059511899948, + "rewards/margins": 0.9398662447929382, + "rewards/rejected": -1.0235267877578735, + "sft_loss": 0.8366058468818665, + "step": 9805 + }, + { + "epoch": 0.76, + "grad_norm": 32.673152923583984, + "learning_rate": 1.3415288364736746e-06, + "logits/chosen": -1.438460350036621, + "logits/rejected": -1.3678359985351562, + "logps/chosen": -0.791472315788269, + "logps/rejected": -2.52308988571167, + "loss": 0.8313, + "odds_ratio_loss": 0.39869898557662964, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07914721965789795, + "rewards/margins": 0.17316177487373352, + "rewards/rejected": -0.25230899453163147, + "sft_loss": 0.791472315788269, + "step": 9810 + }, + { + "epoch": 0.76, + "grad_norm": 6.212176322937012, + "learning_rate": 1.3373344082893403e-06, + "logits/chosen": -1.3597334623336792, + "logits/rejected": -1.017281413078308, + "logps/chosen": -0.8786466717720032, + "logps/rejected": -3.898221969604492, + "loss": 0.9047, + "odds_ratio_loss": 0.2601260840892792, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08786466717720032, + "rewards/margins": 0.30195754766464233, + "rewards/rejected": -0.38982218503952026, + "sft_loss": 0.8786466717720032, + "step": 9815 + }, + { + "epoch": 0.76, + "grad_norm": 63.43842315673828, + "learning_rate": 1.3331455350024059e-06, + "logits/chosen": -1.2316055297851562, + "logits/rejected": -0.9201291799545288, + "logps/chosen": -1.0224696397781372, + "logps/rejected": -5.860587120056152, + "loss": 1.053, + "odds_ratio_loss": 0.3048619031906128, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1022469624876976, + "rewards/margins": 0.48381170630455017, + "rewards/rejected": -0.5860587358474731, + "sft_loss": 1.0224696397781372, + "step": 9820 + }, + { + "epoch": 0.76, + "grad_norm": 6.886044979095459, + "learning_rate": 1.3289622229658294e-06, + "logits/chosen": -1.3224356174468994, + "logits/rejected": -1.299999713897705, + "logps/chosen": -0.9050876498222351, + "logps/rejected": -7.530638694763184, + "loss": 0.9165, + "odds_ratio_loss": 0.11380696296691895, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09050877392292023, + "rewards/margins": 0.6625550985336304, + "rewards/rejected": -0.7530638575553894, + "sft_loss": 0.9050876498222351, + "step": 9825 + }, + { + "epoch": 0.76, + "grad_norm": 27.36313819885254, + "learning_rate": 1.3247844785241336e-06, + "logits/chosen": -1.294485330581665, + "logits/rejected": -0.9901436567306519, + "logps/chosen": -1.0604060888290405, + "logps/rejected": -6.207077980041504, + "loss": 1.1032, + "odds_ratio_loss": 0.4282000958919525, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10604061186313629, + "rewards/margins": 0.5146671533584595, + "rewards/rejected": -0.6207078099250793, + "sft_loss": 1.0604060888290405, + "step": 9830 + }, + { + "epoch": 0.77, + "grad_norm": 4.699453353881836, + "learning_rate": 1.320612308013401e-06, + "logits/chosen": -1.3086684942245483, + "logits/rejected": -0.826088547706604, + "logps/chosen": -1.4098154306411743, + "logps/rejected": -5.700179576873779, + "loss": 1.441, + "odds_ratio_loss": 0.3116340637207031, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1409815400838852, + "rewards/margins": 0.4290364682674408, + "rewards/rejected": -0.5700180530548096, + "sft_loss": 1.4098154306411743, + "step": 9835 + }, + { + "epoch": 0.77, + "grad_norm": 5.1790289878845215, + "learning_rate": 1.3164457177612566e-06, + "logits/chosen": -1.2479835748672485, + "logits/rejected": -0.7511214017868042, + "logps/chosen": -0.8636773824691772, + "logps/rejected": -13.867403030395508, + "loss": 0.8773, + "odds_ratio_loss": 0.13587191700935364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08636773377656937, + "rewards/margins": 1.3003724813461304, + "rewards/rejected": -1.3867400884628296, + "sft_loss": 0.8636773824691772, + "step": 9840 + }, + { + "epoch": 0.77, + "grad_norm": 18.41057014465332, + "learning_rate": 1.3122847140868617e-06, + "logits/chosen": -1.3606140613555908, + "logits/rejected": -0.8298721313476562, + "logps/chosen": -0.8453086018562317, + "logps/rejected": -5.990819454193115, + "loss": 0.8473, + "odds_ratio_loss": 0.019589338451623917, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08453086018562317, + "rewards/margins": 0.5145511031150818, + "rewards/rejected": -0.5990819334983826, + "sft_loss": 0.8453086018562317, + "step": 9845 + }, + { + "epoch": 0.77, + "grad_norm": 8.951942443847656, + "learning_rate": 1.3081293033009107e-06, + "logits/chosen": -1.218518614768982, + "logits/rejected": -1.2447441816329956, + "logps/chosen": -1.276637077331543, + "logps/rejected": -4.123940944671631, + "loss": 1.2844, + "odds_ratio_loss": 0.0777917206287384, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12766370177268982, + "rewards/margins": 0.28473037481307983, + "rewards/rejected": -0.41239410638809204, + "sft_loss": 1.276637077331543, + "step": 9850 + }, + { + "epoch": 0.77, + "grad_norm": 8.449200630187988, + "learning_rate": 1.3039794917056087e-06, + "logits/chosen": -1.3235777616500854, + "logits/rejected": -0.7842531800270081, + "logps/chosen": -0.9570550918579102, + "logps/rejected": -5.944942474365234, + "loss": 0.9629, + "odds_ratio_loss": 0.058135222643613815, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09570551663637161, + "rewards/margins": 0.4987887442111969, + "rewards/rejected": -0.5944942831993103, + "sft_loss": 0.9570550918579102, + "step": 9855 + }, + { + "epoch": 0.77, + "grad_norm": 23.442842483520508, + "learning_rate": 1.2998352855946728e-06, + "logits/chosen": -1.3478174209594727, + "logits/rejected": -0.9334796071052551, + "logps/chosen": -0.742106556892395, + "logps/rejected": -5.195427894592285, + "loss": 0.7551, + "odds_ratio_loss": 0.1301083117723465, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07421065866947174, + "rewards/margins": 0.4453321397304535, + "rewards/rejected": -0.5195428133010864, + "sft_loss": 0.742106556892395, + "step": 9860 + }, + { + "epoch": 0.77, + "grad_norm": 12.772910118103027, + "learning_rate": 1.2956966912533176e-06, + "logits/chosen": -1.140475869178772, + "logits/rejected": -1.1725482940673828, + "logps/chosen": -1.062133550643921, + "logps/rejected": -5.455204010009766, + "loss": 1.1273, + "odds_ratio_loss": 0.6513864398002625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10621335357427597, + "rewards/margins": 0.4393070638179779, + "rewards/rejected": -0.5455204248428345, + "sft_loss": 1.062133550643921, + "step": 9865 + }, + { + "epoch": 0.77, + "grad_norm": 18.566396713256836, + "learning_rate": 1.2915637149582466e-06, + "logits/chosen": -1.3273341655731201, + "logits/rejected": -0.6339842081069946, + "logps/chosen": -1.1666196584701538, + "logps/rejected": -5.155702114105225, + "loss": 1.1715, + "odds_ratio_loss": 0.04836392030119896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11666196584701538, + "rewards/margins": 0.39890819787979126, + "rewards/rejected": -0.5155701637268066, + "sft_loss": 1.1666196584701538, + "step": 9870 + }, + { + "epoch": 0.77, + "grad_norm": 27.102888107299805, + "learning_rate": 1.2874363629776422e-06, + "logits/chosen": -1.2952888011932373, + "logits/rejected": -1.339074730873108, + "logps/chosen": -0.6745896339416504, + "logps/rejected": -4.707167148590088, + "loss": 0.6864, + "odds_ratio_loss": 0.11814513057470322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06745896488428116, + "rewards/margins": 0.4032577574253082, + "rewards/rejected": -0.4707167148590088, + "sft_loss": 0.6745896339416504, + "step": 9875 + }, + { + "epoch": 0.77, + "grad_norm": 15.510187149047852, + "learning_rate": 1.28331464157116e-06, + "logits/chosen": -1.2012255191802979, + "logits/rejected": -1.3116233348846436, + "logps/chosen": -1.244822382926941, + "logps/rejected": -5.981281280517578, + "loss": 1.2535, + "odds_ratio_loss": 0.08675719797611237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12448225170373917, + "rewards/margins": 0.4736458659172058, + "rewards/rejected": -0.598128080368042, + "sft_loss": 1.244822382926941, + "step": 9880 + }, + { + "epoch": 0.77, + "grad_norm": 20.459211349487305, + "learning_rate": 1.2791985569899124e-06, + "logits/chosen": -1.1367995738983154, + "logits/rejected": -1.5840421915054321, + "logps/chosen": -1.248212218284607, + "logps/rejected": -12.186239242553711, + "loss": 1.2493, + "odds_ratio_loss": 0.011205102317035198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12482122331857681, + "rewards/margins": 1.0938026905059814, + "rewards/rejected": -1.2186239957809448, + "sft_loss": 1.248212218284607, + "step": 9885 + }, + { + "epoch": 0.77, + "grad_norm": 4.763285160064697, + "learning_rate": 1.275088115476465e-06, + "logits/chosen": -1.2230708599090576, + "logits/rejected": -1.3235952854156494, + "logps/chosen": -0.8943685293197632, + "logps/rejected": -6.485651969909668, + "loss": 0.8969, + "odds_ratio_loss": 0.025648515671491623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08943686634302139, + "rewards/margins": 0.5591284036636353, + "rewards/rejected": -0.6485652327537537, + "sft_loss": 0.8943685293197632, + "step": 9890 + }, + { + "epoch": 0.77, + "grad_norm": 49.72977828979492, + "learning_rate": 1.2709833232648216e-06, + "logits/chosen": -1.3660045862197876, + "logits/rejected": -1.3602436780929565, + "logps/chosen": -1.9368641376495361, + "logps/rejected": -15.42323112487793, + "loss": 1.9435, + "odds_ratio_loss": 0.06626741588115692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19368639588356018, + "rewards/margins": 1.3486367464065552, + "rewards/rejected": -1.542323112487793, + "sft_loss": 1.9368641376495361, + "step": 9895 + }, + { + "epoch": 0.77, + "grad_norm": 45.193084716796875, + "learning_rate": 1.2668841865804248e-06, + "logits/chosen": -1.2795097827911377, + "logits/rejected": -1.0634839534759521, + "logps/chosen": -0.7293432354927063, + "logps/rejected": -10.748265266418457, + "loss": 0.7317, + "odds_ratio_loss": 0.0239731278270483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07293432205915451, + "rewards/margins": 1.0018922090530396, + "rewards/rejected": -1.0748264789581299, + "sft_loss": 0.7293432354927063, + "step": 9900 + }, + { + "epoch": 0.77, + "grad_norm": 15.02663803100586, + "learning_rate": 1.2627907116401338e-06, + "logits/chosen": -1.3065115213394165, + "logits/rejected": -0.8887646794319153, + "logps/chosen": -1.0517328977584839, + "logps/rejected": -7.9593825340271, + "loss": 1.067, + "odds_ratio_loss": 0.15316042304039001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10517330467700958, + "rewards/margins": 0.6907650232315063, + "rewards/rejected": -0.7959383726119995, + "sft_loss": 1.0517328977584839, + "step": 9905 + }, + { + "epoch": 0.77, + "grad_norm": 13.276571273803711, + "learning_rate": 1.258702904652223e-06, + "logits/chosen": -1.419241189956665, + "logits/rejected": -1.1413755416870117, + "logps/chosen": -0.8657780885696411, + "logps/rejected": -8.248200416564941, + "loss": 0.8702, + "odds_ratio_loss": 0.04419126361608505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08657781034708023, + "rewards/margins": 0.7382422089576721, + "rewards/rejected": -0.8248200416564941, + "sft_loss": 0.8657780885696411, + "step": 9910 + }, + { + "epoch": 0.77, + "grad_norm": 6.825826168060303, + "learning_rate": 1.2546207718163717e-06, + "logits/chosen": -1.387042760848999, + "logits/rejected": -0.8412164449691772, + "logps/chosen": -1.1644020080566406, + "logps/rejected": -4.54971981048584, + "loss": 1.2089, + "odds_ratio_loss": 0.44534358382225037, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11644019931554794, + "rewards/margins": 0.3385317921638489, + "rewards/rejected": -0.45497196912765503, + "sft_loss": 1.1644020080566406, + "step": 9915 + }, + { + "epoch": 0.77, + "grad_norm": 6.1360907554626465, + "learning_rate": 1.2505443193236512e-06, + "logits/chosen": -1.4001635313034058, + "logits/rejected": -1.162745714187622, + "logps/chosen": -1.2425587177276611, + "logps/rejected": -6.215406894683838, + "loss": 1.3028, + "odds_ratio_loss": 0.6023607850074768, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12425585836172104, + "rewards/margins": 0.49728482961654663, + "rewards/rejected": -0.6215406656265259, + "sft_loss": 1.2425587177276611, + "step": 9920 + }, + { + "epoch": 0.77, + "grad_norm": 6.499194622039795, + "learning_rate": 1.246473553356518e-06, + "logits/chosen": -1.255040168762207, + "logits/rejected": -1.2155425548553467, + "logps/chosen": -1.216691017150879, + "logps/rejected": -19.482616424560547, + "loss": 1.2167, + "odds_ratio_loss": 3.4083663194905967e-05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12166911363601685, + "rewards/margins": 1.8265924453735352, + "rewards/rejected": -1.9482614994049072, + "sft_loss": 1.216691017150879, + "step": 9925 + }, + { + "epoch": 0.77, + "grad_norm": 13.034653663635254, + "learning_rate": 1.2424084800888093e-06, + "logits/chosen": -1.297929048538208, + "logits/rejected": -0.9741071462631226, + "logps/chosen": -0.9298331141471863, + "logps/rejected": -5.6627936363220215, + "loss": 0.9421, + "odds_ratio_loss": 0.1228838711977005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09298331290483475, + "rewards/margins": 0.4732961058616638, + "rewards/rejected": -0.5662793517112732, + "sft_loss": 0.9298331141471863, + "step": 9930 + }, + { + "epoch": 0.77, + "grad_norm": 50.6912727355957, + "learning_rate": 1.2383491056857234e-06, + "logits/chosen": -1.28719162940979, + "logits/rejected": -0.7962328195571899, + "logps/chosen": -1.011508584022522, + "logps/rejected": -5.1037397384643555, + "loss": 1.0171, + "odds_ratio_loss": 0.05606383830308914, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10115084797143936, + "rewards/margins": 0.40922316908836365, + "rewards/rejected": -0.5103740096092224, + "sft_loss": 1.011508584022522, + "step": 9935 + }, + { + "epoch": 0.77, + "grad_norm": 15.886419296264648, + "learning_rate": 1.2342954363038146e-06, + "logits/chosen": -1.2996234893798828, + "logits/rejected": -1.1742438077926636, + "logps/chosen": -0.9803162813186646, + "logps/rejected": -10.773435592651367, + "loss": 0.9813, + "odds_ratio_loss": 0.00980658270418644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09803163260221481, + "rewards/margins": 0.9793119430541992, + "rewards/rejected": -1.0773435831069946, + "sft_loss": 0.9803162813186646, + "step": 9940 + }, + { + "epoch": 0.77, + "grad_norm": 10.295068740844727, + "learning_rate": 1.2302474780909901e-06, + "logits/chosen": -1.5229541063308716, + "logits/rejected": -1.3465713262557983, + "logps/chosen": -0.9372552633285522, + "logps/rejected": -10.703648567199707, + "loss": 0.9475, + "odds_ratio_loss": 0.10272009670734406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0937255322933197, + "rewards/margins": 0.9766393899917603, + "rewards/rejected": -1.0703647136688232, + "sft_loss": 0.9372552633285522, + "step": 9945 + }, + { + "epoch": 0.77, + "grad_norm": 5.413816928863525, + "learning_rate": 1.2262052371864924e-06, + "logits/chosen": -1.3776428699493408, + "logits/rejected": -0.8330678939819336, + "logps/chosen": -0.7698219418525696, + "logps/rejected": -6.723541259765625, + "loss": 0.773, + "odds_ratio_loss": 0.032238125801086426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07698218524456024, + "rewards/margins": 0.5953719615936279, + "rewards/rejected": -0.6723541021347046, + "sft_loss": 0.7698219418525696, + "step": 9950 + }, + { + "epoch": 0.77, + "grad_norm": 27.832286834716797, + "learning_rate": 1.2221687197208914e-06, + "logits/chosen": -1.1848442554473877, + "logits/rejected": -1.1238027811050415, + "logps/chosen": -0.8974050283432007, + "logps/rejected": -1.6427028179168701, + "loss": 0.9332, + "odds_ratio_loss": 0.3581832945346832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08974049985408783, + "rewards/margins": 0.07452978938817978, + "rewards/rejected": -0.1642702966928482, + "sft_loss": 0.8974050283432007, + "step": 9955 + }, + { + "epoch": 0.77, + "grad_norm": 15.635947227478027, + "learning_rate": 1.218137931816078e-06, + "logits/chosen": -1.4109541177749634, + "logits/rejected": -1.072914958000183, + "logps/chosen": -0.7615107297897339, + "logps/rejected": -7.309935092926025, + "loss": 0.7773, + "odds_ratio_loss": 0.15754522383213043, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07615108042955399, + "rewards/margins": 0.6548423767089844, + "rewards/rejected": -0.7309934496879578, + "sft_loss": 0.7615107297897339, + "step": 9960 + }, + { + "epoch": 0.78, + "grad_norm": 5.612651348114014, + "learning_rate": 1.2141128795852563e-06, + "logits/chosen": -1.4222948551177979, + "logits/rejected": -1.231484293937683, + "logps/chosen": -0.7754429578781128, + "logps/rejected": -10.327539443969727, + "loss": 0.7877, + "odds_ratio_loss": 0.12259682267904282, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07754429429769516, + "rewards/margins": 0.9552095532417297, + "rewards/rejected": -1.032753825187683, + "sft_loss": 0.7754429578781128, + "step": 9965 + }, + { + "epoch": 0.78, + "grad_norm": 5.235637187957764, + "learning_rate": 1.210093569132928e-06, + "logits/chosen": -1.2951066493988037, + "logits/rejected": -0.5975306630134583, + "logps/chosen": -0.9870067834854126, + "logps/rejected": -8.021510124206543, + "loss": 1.0151, + "odds_ratio_loss": 0.281358927488327, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09870067983865738, + "rewards/margins": 0.7034503221511841, + "rewards/rejected": -0.8021510243415833, + "sft_loss": 0.9870067834854126, + "step": 9970 + }, + { + "epoch": 0.78, + "grad_norm": 36.680965423583984, + "learning_rate": 1.2060800065548867e-06, + "logits/chosen": -1.3343515396118164, + "logits/rejected": -1.5229475498199463, + "logps/chosen": -0.8048030734062195, + "logps/rejected": -8.215921401977539, + "loss": 0.8068, + "odds_ratio_loss": 0.01964385434985161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08048031479120255, + "rewards/margins": 0.7411118745803833, + "rewards/rejected": -0.8215921521186829, + "sft_loss": 0.8048030734062195, + "step": 9975 + }, + { + "epoch": 0.78, + "grad_norm": 24.134246826171875, + "learning_rate": 1.2020721979382111e-06, + "logits/chosen": -1.3110750913619995, + "logits/rejected": -1.1490697860717773, + "logps/chosen": -1.087066888809204, + "logps/rejected": -4.987780570983887, + "loss": 1.1119, + "odds_ratio_loss": 0.24879589676856995, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10870669037103653, + "rewards/margins": 0.39007139205932617, + "rewards/rejected": -0.4987780451774597, + "sft_loss": 1.087066888809204, + "step": 9980 + }, + { + "epoch": 0.78, + "grad_norm": 8.332462310791016, + "learning_rate": 1.1980701493612507e-06, + "logits/chosen": -1.330665111541748, + "logits/rejected": -0.6037562489509583, + "logps/chosen": -0.8439868688583374, + "logps/rejected": -10.712756156921387, + "loss": 0.8541, + "odds_ratio_loss": 0.10095206648111343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08439868688583374, + "rewards/margins": 0.9868769645690918, + "rewards/rejected": -1.0712755918502808, + "sft_loss": 0.8439868688583374, + "step": 9985 + }, + { + "epoch": 0.78, + "grad_norm": 21.133403778076172, + "learning_rate": 1.1940738668936187e-06, + "logits/chosen": -1.2949966192245483, + "logits/rejected": -0.9187256097793579, + "logps/chosen": -1.0978871583938599, + "logps/rejected": -7.6196160316467285, + "loss": 1.1242, + "odds_ratio_loss": 0.26295655965805054, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10978871583938599, + "rewards/margins": 0.652172863483429, + "rewards/rejected": -0.7619615793228149, + "sft_loss": 1.0978871583938599, + "step": 9990 + }, + { + "epoch": 0.78, + "grad_norm": 9.91594123840332, + "learning_rate": 1.1900833565961888e-06, + "logits/chosen": -1.3776895999908447, + "logits/rejected": -0.8226616978645325, + "logps/chosen": -1.0611966848373413, + "logps/rejected": -5.255466938018799, + "loss": 1.0877, + "odds_ratio_loss": 0.26494020223617554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10611967742443085, + "rewards/margins": 0.4194270670413971, + "rewards/rejected": -0.5255467891693115, + "sft_loss": 1.0611966848373413, + "step": 9995 + }, + { + "epoch": 0.78, + "grad_norm": 11.703951835632324, + "learning_rate": 1.1860986245210742e-06, + "logits/chosen": -1.4472962617874146, + "logits/rejected": -1.218927264213562, + "logps/chosen": -0.932456374168396, + "logps/rejected": -5.315445423126221, + "loss": 0.9578, + "odds_ratio_loss": 0.25324195623397827, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09324564039707184, + "rewards/margins": 0.43829888105392456, + "rewards/rejected": -0.53154456615448, + "sft_loss": 0.932456374168396, + "step": 10000 + }, + { + "epoch": 0.78, + "grad_norm": 7.178709506988525, + "learning_rate": 1.1821196767116272e-06, + "logits/chosen": -1.3587113618850708, + "logits/rejected": -1.2883832454681396, + "logps/chosen": -1.508227825164795, + "logps/rejected": -12.98760986328125, + "loss": 1.5083, + "odds_ratio_loss": 0.0008086395682767034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15082278847694397, + "rewards/margins": 1.1479381322860718, + "rewards/rejected": -1.2987611293792725, + "sft_loss": 1.508227825164795, + "step": 10005 + }, + { + "epoch": 0.78, + "grad_norm": 15.75374984741211, + "learning_rate": 1.1781465192024266e-06, + "logits/chosen": -1.378434419631958, + "logits/rejected": -1.251649260520935, + "logps/chosen": -0.8725560307502747, + "logps/rejected": -8.043733596801758, + "loss": 0.8776, + "odds_ratio_loss": 0.05054037645459175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08725561201572418, + "rewards/margins": 0.7171179056167603, + "rewards/rejected": -0.8043734431266785, + "sft_loss": 0.8725560307502747, + "step": 10010 + }, + { + "epoch": 0.78, + "grad_norm": 493.7987060546875, + "learning_rate": 1.1741791580192718e-06, + "logits/chosen": -1.4295212030410767, + "logits/rejected": -1.1595797538757324, + "logps/chosen": -1.3225281238555908, + "logps/rejected": -3.852818727493286, + "loss": 1.3473, + "odds_ratio_loss": 0.24733905494213104, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13225281238555908, + "rewards/margins": 0.2530290484428406, + "rewards/rejected": -0.38528189063072205, + "sft_loss": 1.3225281238555908, + "step": 10015 + }, + { + "epoch": 0.78, + "grad_norm": 5.6853814125061035, + "learning_rate": 1.1702175991791693e-06, + "logits/chosen": -1.3617594242095947, + "logits/rejected": -1.0348384380340576, + "logps/chosen": -0.7079185247421265, + "logps/rejected": -8.361276626586914, + "loss": 0.7145, + "odds_ratio_loss": 0.06607773154973984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07079185545444489, + "rewards/margins": 0.765335738658905, + "rewards/rejected": -0.8361276388168335, + "sft_loss": 0.7079185247421265, + "step": 10020 + }, + { + "epoch": 0.78, + "grad_norm": 18.461105346679688, + "learning_rate": 1.166261848690326e-06, + "logits/chosen": -0.9604610204696655, + "logits/rejected": -1.4066376686096191, + "logps/chosen": -0.6313884854316711, + "logps/rejected": -10.4922513961792, + "loss": 0.6316, + "odds_ratio_loss": 0.0021018588449805975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06313885003328323, + "rewards/margins": 0.9860862493515015, + "rewards/rejected": -1.049225091934204, + "sft_loss": 0.6313884854316711, + "step": 10025 + }, + { + "epoch": 0.78, + "grad_norm": 37.55699920654297, + "learning_rate": 1.1623119125521394e-06, + "logits/chosen": -1.3622324466705322, + "logits/rejected": -1.175256609916687, + "logps/chosen": -1.1930874586105347, + "logps/rejected": -10.830957412719727, + "loss": 1.1946, + "odds_ratio_loss": 0.015017673373222351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11930874735116959, + "rewards/margins": 0.9637869596481323, + "rewards/rejected": -1.0830957889556885, + "sft_loss": 1.1930874586105347, + "step": 10030 + }, + { + "epoch": 0.78, + "grad_norm": 25.23794174194336, + "learning_rate": 1.1583677967551888e-06, + "logits/chosen": -1.375732421875, + "logits/rejected": -1.0804195404052734, + "logps/chosen": -1.0573394298553467, + "logps/rejected": -5.16109561920166, + "loss": 1.0634, + "odds_ratio_loss": 0.060222726315259933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1057339459657669, + "rewards/margins": 0.41037559509277344, + "rewards/rejected": -0.5161095261573792, + "sft_loss": 1.0573394298553467, + "step": 10035 + }, + { + "epoch": 0.78, + "grad_norm": 35.429866790771484, + "learning_rate": 1.154429507281226e-06, + "logits/chosen": -1.3516714572906494, + "logits/rejected": -1.1342250108718872, + "logps/chosen": -1.106170892715454, + "logps/rejected": -4.331687927246094, + "loss": 1.1394, + "odds_ratio_loss": 0.33246272802352905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11061708629131317, + "rewards/margins": 0.3225516974925995, + "rewards/rejected": -0.43316879868507385, + "sft_loss": 1.106170892715454, + "step": 10040 + }, + { + "epoch": 0.78, + "grad_norm": 241.47119140625, + "learning_rate": 1.1504970501031692e-06, + "logits/chosen": -1.3227497339248657, + "logits/rejected": -1.1068919897079468, + "logps/chosen": -1.4671138525009155, + "logps/rejected": -7.351625919342041, + "loss": 1.487, + "odds_ratio_loss": 0.19894203543663025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14671137928962708, + "rewards/margins": 0.5884512662887573, + "rewards/rejected": -0.735162615776062, + "sft_loss": 1.4671138525009155, + "step": 10045 + }, + { + "epoch": 0.78, + "grad_norm": 29.387950897216797, + "learning_rate": 1.1465704311850883e-06, + "logits/chosen": -1.4164648056030273, + "logits/rejected": -1.2835915088653564, + "logps/chosen": -0.9282344579696655, + "logps/rejected": -5.288893222808838, + "loss": 0.9465, + "odds_ratio_loss": 0.18245641887187958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09282345324754715, + "rewards/margins": 0.4360658526420593, + "rewards/rejected": -0.5288892984390259, + "sft_loss": 0.9282344579696655, + "step": 10050 + }, + { + "epoch": 0.78, + "grad_norm": 6.3994903564453125, + "learning_rate": 1.1426496564821976e-06, + "logits/chosen": -1.3634178638458252, + "logits/rejected": -0.6244993805885315, + "logps/chosen": -0.9844557642936707, + "logps/rejected": -5.556734085083008, + "loss": 1.0037, + "odds_ratio_loss": 0.19292011857032776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0984455794095993, + "rewards/margins": 0.457227885723114, + "rewards/rejected": -0.5556734800338745, + "sft_loss": 0.9844557642936707, + "step": 10055 + }, + { + "epoch": 0.78, + "grad_norm": 13.108819961547852, + "learning_rate": 1.138734731940852e-06, + "logits/chosen": -1.5008776187896729, + "logits/rejected": -1.1382453441619873, + "logps/chosen": -0.7119458317756653, + "logps/rejected": -8.66386604309082, + "loss": 0.714, + "odds_ratio_loss": 0.02025299333035946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.071194589138031, + "rewards/margins": 0.7951920628547668, + "rewards/rejected": -0.8663867115974426, + "sft_loss": 0.7119458317756653, + "step": 10060 + }, + { + "epoch": 0.78, + "grad_norm": 7.2096076011657715, + "learning_rate": 1.1348256634985311e-06, + "logits/chosen": -1.3829349279403687, + "logits/rejected": -1.0344932079315186, + "logps/chosen": -0.9017425775527954, + "logps/rejected": -3.416379928588867, + "loss": 0.9245, + "odds_ratio_loss": 0.22740647196769714, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09017425775527954, + "rewards/margins": 0.25146371126174927, + "rewards/rejected": -0.3416379988193512, + "sft_loss": 0.9017425775527954, + "step": 10065 + }, + { + "epoch": 0.78, + "grad_norm": 14.399897575378418, + "learning_rate": 1.1309224570838335e-06, + "logits/chosen": -1.0961267948150635, + "logits/rejected": -0.7845171093940735, + "logps/chosen": -0.9008132815361023, + "logps/rejected": -8.158656120300293, + "loss": 0.9079, + "odds_ratio_loss": 0.0713641345500946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09008133411407471, + "rewards/margins": 0.7257843017578125, + "rewards/rejected": -0.8158656358718872, + "sft_loss": 0.9008132815361023, + "step": 10070 + }, + { + "epoch": 0.78, + "grad_norm": 5.566354751586914, + "learning_rate": 1.1270251186164649e-06, + "logits/chosen": -1.2573572397232056, + "logits/rejected": -1.0662510395050049, + "logps/chosen": -0.6841157674789429, + "logps/rejected": -8.32490348815918, + "loss": 0.6947, + "odds_ratio_loss": 0.10547639429569244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06841157376766205, + "rewards/margins": 0.7640787363052368, + "rewards/rejected": -0.8324903249740601, + "sft_loss": 0.6841157674789429, + "step": 10075 + }, + { + "epoch": 0.78, + "grad_norm": 43.7618293762207, + "learning_rate": 1.1231336540072379e-06, + "logits/chosen": -1.2308496236801147, + "logits/rejected": -1.3328073024749756, + "logps/chosen": -0.7990642786026001, + "logps/rejected": -6.931474208831787, + "loss": 0.825, + "odds_ratio_loss": 0.2591875493526459, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07990642637014389, + "rewards/margins": 0.6132410168647766, + "rewards/rejected": -0.6931473612785339, + "sft_loss": 0.7990642786026001, + "step": 10080 + }, + { + "epoch": 0.78, + "grad_norm": 19.308300018310547, + "learning_rate": 1.1192480691580504e-06, + "logits/chosen": -1.3502826690673828, + "logits/rejected": -1.0207679271697998, + "logps/chosen": -1.1335891485214233, + "logps/rejected": -11.743545532226562, + "loss": 1.1475, + "odds_ratio_loss": 0.13940632343292236, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11335892975330353, + "rewards/margins": 1.0609955787658691, + "rewards/rejected": -1.1743545532226562, + "sft_loss": 1.1335891485214233, + "step": 10085 + }, + { + "epoch": 0.78, + "grad_norm": 5.187989711761475, + "learning_rate": 1.1153683699618856e-06, + "logits/chosen": -1.367068886756897, + "logits/rejected": -1.1009019613265991, + "logps/chosen": -1.4418981075286865, + "logps/rejected": -7.871354579925537, + "loss": 1.4518, + "odds_ratio_loss": 0.09873579442501068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14418981969356537, + "rewards/margins": 0.6429456472396851, + "rewards/rejected": -0.7871354222297668, + "sft_loss": 1.4418981075286865, + "step": 10090 + }, + { + "epoch": 0.79, + "grad_norm": 5.7330827713012695, + "learning_rate": 1.1114945623027995e-06, + "logits/chosen": -1.1983908414840698, + "logits/rejected": -0.9033523797988892, + "logps/chosen": -0.9997088313102722, + "logps/rejected": -4.750739097595215, + "loss": 1.014, + "odds_ratio_loss": 0.14291249215602875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09997088462114334, + "rewards/margins": 0.37510305643081665, + "rewards/rejected": -0.4750739634037018, + "sft_loss": 0.9997088313102722, + "step": 10095 + }, + { + "epoch": 0.79, + "grad_norm": 9.244364738464355, + "learning_rate": 1.1076266520559136e-06, + "logits/chosen": -1.4069405794143677, + "logits/rejected": -0.9098762273788452, + "logps/chosen": -0.9030729532241821, + "logps/rejected": -6.667893409729004, + "loss": 0.9154, + "odds_ratio_loss": 0.12360890209674835, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09030728787183762, + "rewards/margins": 0.5764819979667664, + "rewards/rejected": -0.6667893528938293, + "sft_loss": 0.9030729532241821, + "step": 10100 + }, + { + "epoch": 0.79, + "grad_norm": 12.363853454589844, + "learning_rate": 1.103764645087404e-06, + "logits/chosen": -1.4370672702789307, + "logits/rejected": -0.8995648622512817, + "logps/chosen": -0.7316851615905762, + "logps/rejected": -6.912254333496094, + "loss": 0.7401, + "odds_ratio_loss": 0.08367065340280533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07316852360963821, + "rewards/margins": 0.6180568933486938, + "rewards/rejected": -0.6912254095077515, + "sft_loss": 0.7316851615905762, + "step": 10105 + }, + { + "epoch": 0.79, + "grad_norm": 15.516923904418945, + "learning_rate": 1.0999085472544962e-06, + "logits/chosen": -1.2609612941741943, + "logits/rejected": -1.231152892112732, + "logps/chosen": -0.884219765663147, + "logps/rejected": -5.064759254455566, + "loss": 0.9159, + "odds_ratio_loss": 0.3166887164115906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08842197060585022, + "rewards/margins": 0.41805392503738403, + "rewards/rejected": -0.5064759254455566, + "sft_loss": 0.884219765663147, + "step": 10110 + }, + { + "epoch": 0.79, + "grad_norm": 3.7335870265960693, + "learning_rate": 1.0960583644054517e-06, + "logits/chosen": -1.3824520111083984, + "logits/rejected": -0.9480381011962891, + "logps/chosen": -0.9319052696228027, + "logps/rejected": -9.209867477416992, + "loss": 0.9413, + "odds_ratio_loss": 0.09413303434848785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09319053590297699, + "rewards/margins": 0.8277961611747742, + "rewards/rejected": -0.9209867715835571, + "sft_loss": 0.9319052696228027, + "step": 10115 + }, + { + "epoch": 0.79, + "grad_norm": 5.4312567710876465, + "learning_rate": 1.0922141023795601e-06, + "logits/chosen": -1.3167814016342163, + "logits/rejected": -0.8333790898323059, + "logps/chosen": -0.9882022142410278, + "logps/rejected": -9.491655349731445, + "loss": 1.0609, + "odds_ratio_loss": 0.7272026538848877, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09882022440433502, + "rewards/margins": 0.8503454327583313, + "rewards/rejected": -0.9491655230522156, + "sft_loss": 0.9882022142410278, + "step": 10120 + }, + { + "epoch": 0.79, + "grad_norm": 20.70174789428711, + "learning_rate": 1.0883757670071355e-06, + "logits/chosen": -1.2962617874145508, + "logits/rejected": -1.2887189388275146, + "logps/chosen": -0.810468852519989, + "logps/rejected": -11.951204299926758, + "loss": 0.8113, + "odds_ratio_loss": 0.008780966512858868, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08104689419269562, + "rewards/margins": 1.1140735149383545, + "rewards/rejected": -1.1951204538345337, + "sft_loss": 0.810468852519989, + "step": 10125 + }, + { + "epoch": 0.79, + "grad_norm": 5.827946186065674, + "learning_rate": 1.0845433641094988e-06, + "logits/chosen": -1.345948576927185, + "logits/rejected": -1.1523609161376953, + "logps/chosen": -0.773142397403717, + "logps/rejected": -8.959914207458496, + "loss": 0.7836, + "odds_ratio_loss": 0.10436198860406876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07731424272060394, + "rewards/margins": 0.8186771273612976, + "rewards/rejected": -0.8959914445877075, + "sft_loss": 0.773142397403717, + "step": 10130 + }, + { + "epoch": 0.79, + "grad_norm": 38.608524322509766, + "learning_rate": 1.0807168994989764e-06, + "logits/chosen": -1.3648837804794312, + "logits/rejected": -1.3312628269195557, + "logps/chosen": -0.994695782661438, + "logps/rejected": -11.334436416625977, + "loss": 1.0198, + "odds_ratio_loss": 0.25126659870147705, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09946957975625992, + "rewards/margins": 1.033974051475525, + "rewards/rejected": -1.1334435939788818, + "sft_loss": 0.994695782661438, + "step": 10135 + }, + { + "epoch": 0.79, + "grad_norm": 137.97048950195312, + "learning_rate": 1.0768963789788878e-06, + "logits/chosen": -1.290567398071289, + "logits/rejected": -0.9734798669815063, + "logps/chosen": -1.486511468887329, + "logps/rejected": -13.96313762664795, + "loss": 1.4877, + "odds_ratio_loss": 0.012327526696026325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1486511528491974, + "rewards/margins": 1.2476626634597778, + "rewards/rejected": -1.3963139057159424, + "sft_loss": 1.486511468887329, + "step": 10140 + }, + { + "epoch": 0.79, + "grad_norm": 4.473769664764404, + "learning_rate": 1.0730818083435369e-06, + "logits/chosen": -1.4110952615737915, + "logits/rejected": -0.8735581636428833, + "logps/chosen": -1.2078263759613037, + "logps/rejected": -12.34583854675293, + "loss": 1.2174, + "odds_ratio_loss": 0.09589570015668869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12078263610601425, + "rewards/margins": 1.1138012409210205, + "rewards/rejected": -1.2345839738845825, + "sft_loss": 1.2078263759613037, + "step": 10145 + }, + { + "epoch": 0.79, + "grad_norm": 300.1793212890625, + "learning_rate": 1.0692731933782046e-06, + "logits/chosen": -1.379407286643982, + "logits/rejected": -1.7840709686279297, + "logps/chosen": -1.0849478244781494, + "logps/rejected": -12.135259628295898, + "loss": 1.0985, + "odds_ratio_loss": 0.13510160148143768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10849478095769882, + "rewards/margins": 1.1050312519073486, + "rewards/rejected": -1.2135260105133057, + "sft_loss": 1.0849478244781494, + "step": 10150 + }, + { + "epoch": 0.79, + "grad_norm": 8.062394142150879, + "learning_rate": 1.0654705398591374e-06, + "logits/chosen": -1.3931306600570679, + "logits/rejected": -1.3168301582336426, + "logps/chosen": -0.9894862174987793, + "logps/rejected": -9.375204086303711, + "loss": 0.9929, + "odds_ratio_loss": 0.0342748686671257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09894861280918121, + "rewards/margins": 0.8385717272758484, + "rewards/rejected": -0.9375203847885132, + "sft_loss": 0.9894862174987793, + "step": 10155 + }, + { + "epoch": 0.79, + "grad_norm": 12.118971824645996, + "learning_rate": 1.0616738535535458e-06, + "logits/chosen": -1.1479527950286865, + "logits/rejected": -1.2802600860595703, + "logps/chosen": -1.2164061069488525, + "logps/rejected": -4.142562389373779, + "loss": 1.2229, + "odds_ratio_loss": 0.06503340601921082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12164060771465302, + "rewards/margins": 0.29261571168899536, + "rewards/rejected": -0.4142562747001648, + "sft_loss": 1.2164061069488525, + "step": 10160 + }, + { + "epoch": 0.79, + "grad_norm": 27.138954162597656, + "learning_rate": 1.0578831402195843e-06, + "logits/chosen": -1.1534265279769897, + "logits/rejected": -0.9197772741317749, + "logps/chosen": -1.3329823017120361, + "logps/rejected": -8.715575218200684, + "loss": 1.3445, + "odds_ratio_loss": 0.1155182346701622, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13329823315143585, + "rewards/margins": 0.7382593750953674, + "rewards/rejected": -0.8715575337409973, + "sft_loss": 1.3329823017120361, + "step": 10165 + }, + { + "epoch": 0.79, + "grad_norm": 6.607047080993652, + "learning_rate": 1.0540984056063503e-06, + "logits/chosen": -1.4123284816741943, + "logits/rejected": -1.1352747678756714, + "logps/chosen": -1.1117833852767944, + "logps/rejected": -6.28924036026001, + "loss": 1.1148, + "odds_ratio_loss": 0.030421212315559387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11117835342884064, + "rewards/margins": 0.5177456736564636, + "rewards/rejected": -0.6289240121841431, + "sft_loss": 1.1117833852767944, + "step": 10170 + }, + { + "epoch": 0.79, + "grad_norm": 4.502261161804199, + "learning_rate": 1.0503196554538764e-06, + "logits/chosen": -1.390334129333496, + "logits/rejected": -0.9321194887161255, + "logps/chosen": -0.9069440960884094, + "logps/rejected": -8.51655387878418, + "loss": 0.9154, + "odds_ratio_loss": 0.08483470976352692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09069441258907318, + "rewards/margins": 0.76096111536026, + "rewards/rejected": -0.8516554832458496, + "sft_loss": 0.9069440960884094, + "step": 10175 + }, + { + "epoch": 0.79, + "grad_norm": 13.18431568145752, + "learning_rate": 1.0465468954931157e-06, + "logits/chosen": -1.3280055522918701, + "logits/rejected": -1.405785322189331, + "logps/chosen": -0.9179110527038574, + "logps/rejected": -7.704813480377197, + "loss": 0.9182, + "odds_ratio_loss": 0.0026637546252459288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09179110825061798, + "rewards/margins": 0.6786901950836182, + "rewards/rejected": -0.7704813480377197, + "sft_loss": 0.9179110527038574, + "step": 10180 + }, + { + "epoch": 0.79, + "grad_norm": 5.563874244689941, + "learning_rate": 1.0427801314459375e-06, + "logits/chosen": -1.3925745487213135, + "logits/rejected": -0.7530353665351868, + "logps/chosen": -1.2764919996261597, + "logps/rejected": -3.7269034385681152, + "loss": 1.2981, + "odds_ratio_loss": 0.21560311317443848, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1276492029428482, + "rewards/margins": 0.245041161775589, + "rewards/rejected": -0.3726903796195984, + "sft_loss": 1.2764919996261597, + "step": 10185 + }, + { + "epoch": 0.79, + "grad_norm": 7.671252250671387, + "learning_rate": 1.0390193690251187e-06, + "logits/chosen": -1.3380212783813477, + "logits/rejected": -0.934618353843689, + "logps/chosen": -1.2179018259048462, + "logps/rejected": -7.096657752990723, + "loss": 1.2647, + "odds_ratio_loss": 0.4683496356010437, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12179018557071686, + "rewards/margins": 0.587875485420227, + "rewards/rejected": -0.7096656560897827, + "sft_loss": 1.2179018259048462, + "step": 10190 + }, + { + "epoch": 0.79, + "grad_norm": 17.141416549682617, + "learning_rate": 1.0352646139343325e-06, + "logits/chosen": -1.3692445755004883, + "logits/rejected": -1.280906319618225, + "logps/chosen": -0.9419307708740234, + "logps/rejected": -8.827807426452637, + "loss": 0.9597, + "odds_ratio_loss": 0.17793647944927216, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09419308602809906, + "rewards/margins": 0.7885876893997192, + "rewards/rejected": -0.8827807307243347, + "sft_loss": 0.9419307708740234, + "step": 10195 + }, + { + "epoch": 0.79, + "grad_norm": 100.93508911132812, + "learning_rate": 1.0315158718681417e-06, + "logits/chosen": -1.163964867591858, + "logits/rejected": -1.1963056325912476, + "logps/chosen": -1.1907155513763428, + "logps/rejected": -3.5567939281463623, + "loss": 1.2565, + "odds_ratio_loss": 0.657985270023346, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11907155811786652, + "rewards/margins": 0.23660783469676971, + "rewards/rejected": -0.3556794226169586, + "sft_loss": 1.1907155513763428, + "step": 10200 + }, + { + "epoch": 0.79, + "grad_norm": 8.81802749633789, + "learning_rate": 1.0277731485119903e-06, + "logits/chosen": -1.4088776111602783, + "logits/rejected": -1.4885118007659912, + "logps/chosen": -0.9907560348510742, + "logps/rejected": -11.762723922729492, + "loss": 0.9908, + "odds_ratio_loss": 0.0005440299864858389, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09907560795545578, + "rewards/margins": 1.0771968364715576, + "rewards/rejected": -1.1762722730636597, + "sft_loss": 0.9907560348510742, + "step": 10205 + }, + { + "epoch": 0.79, + "grad_norm": 9.657306671142578, + "learning_rate": 1.0240364495421918e-06, + "logits/chosen": -1.350857138633728, + "logits/rejected": -1.3532549142837524, + "logps/chosen": -1.101911187171936, + "logps/rejected": -7.677412986755371, + "loss": 1.1276, + "odds_ratio_loss": 0.2564184069633484, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11019112914800644, + "rewards/margins": 0.6575502157211304, + "rewards/rejected": -0.767741322517395, + "sft_loss": 1.101911187171936, + "step": 10210 + }, + { + "epoch": 0.79, + "grad_norm": 10.653190612792969, + "learning_rate": 1.0203057806259264e-06, + "logits/chosen": -1.3104242086410522, + "logits/rejected": -1.1077172756195068, + "logps/chosen": -0.6702635884284973, + "logps/rejected": -2.555192708969116, + "loss": 0.6963, + "odds_ratio_loss": 0.25997036695480347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.06702636182308197, + "rewards/margins": 0.18849292397499084, + "rewards/rejected": -0.255519300699234, + "sft_loss": 0.6702635884284973, + "step": 10215 + }, + { + "epoch": 0.8, + "grad_norm": 85.22171783447266, + "learning_rate": 1.0165811474212244e-06, + "logits/chosen": -1.3750183582305908, + "logits/rejected": -1.0446151494979858, + "logps/chosen": -1.0259478092193604, + "logps/rejected": -4.301419258117676, + "loss": 1.0495, + "odds_ratio_loss": 0.2350333034992218, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1025947779417038, + "rewards/margins": 0.32754719257354736, + "rewards/rejected": -0.43014198541641235, + "sft_loss": 1.0259478092193604, + "step": 10220 + }, + { + "epoch": 0.8, + "grad_norm": 89.82392883300781, + "learning_rate": 1.0128625555769682e-06, + "logits/chosen": -1.25468909740448, + "logits/rejected": -1.4248607158660889, + "logps/chosen": -1.087563157081604, + "logps/rejected": -6.856781005859375, + "loss": 1.1295, + "odds_ratio_loss": 0.41986173391342163, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10875630378723145, + "rewards/margins": 0.5769217610359192, + "rewards/rejected": -0.6856781244277954, + "sft_loss": 1.087563157081604, + "step": 10225 + }, + { + "epoch": 0.8, + "grad_norm": 22.732999801635742, + "learning_rate": 1.0091500107328734e-06, + "logits/chosen": -1.1168807744979858, + "logits/rejected": -1.1486178636550903, + "logps/chosen": -0.9088427424430847, + "logps/rejected": -10.493136405944824, + "loss": 0.917, + "odds_ratio_loss": 0.08161326497793198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09088428318500519, + "rewards/margins": 0.9584293365478516, + "rewards/rejected": -1.0493135452270508, + "sft_loss": 0.9088427424430847, + "step": 10230 + }, + { + "epoch": 0.8, + "grad_norm": 244.6911163330078, + "learning_rate": 1.0054435185194845e-06, + "logits/chosen": -1.324135184288025, + "logits/rejected": -1.189325213432312, + "logps/chosen": -1.0762240886688232, + "logps/rejected": -7.894415378570557, + "loss": 1.0774, + "odds_ratio_loss": 0.012069101445376873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1076224222779274, + "rewards/margins": 0.6818190813064575, + "rewards/rejected": -0.7894414663314819, + "sft_loss": 1.0762240886688232, + "step": 10235 + }, + { + "epoch": 0.8, + "grad_norm": 9.024249076843262, + "learning_rate": 1.0017430845581688e-06, + "logits/chosen": -1.2113773822784424, + "logits/rejected": -1.0507694482803345, + "logps/chosen": -2.6475517749786377, + "logps/rejected": -8.984169006347656, + "loss": 2.7514, + "odds_ratio_loss": 1.038090467453003, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2647551894187927, + "rewards/margins": 0.6336617469787598, + "rewards/rejected": -0.8984168171882629, + "sft_loss": 2.6475517749786377, + "step": 10240 + }, + { + "epoch": 0.8, + "grad_norm": 16.64531707763672, + "learning_rate": 9.980487144611045e-07, + "logits/chosen": -1.3650559186935425, + "logits/rejected": -1.6216930150985718, + "logps/chosen": -1.3417785167694092, + "logps/rejected": -16.168201446533203, + "loss": 1.3418, + "odds_ratio_loss": 0.00013019000471103936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13417786359786987, + "rewards/margins": 1.482642412185669, + "rewards/rejected": -1.6168200969696045, + "sft_loss": 1.3417785167694092, + "step": 10245 + }, + { + "epoch": 0.8, + "grad_norm": 6.388486385345459, + "learning_rate": 9.943604138312725e-07, + "logits/chosen": -1.3851463794708252, + "logits/rejected": -0.7398214340209961, + "logps/chosen": -1.0597431659698486, + "logps/rejected": -8.64242172241211, + "loss": 1.1022, + "odds_ratio_loss": 0.4248722195625305, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10597433149814606, + "rewards/margins": 0.7582677602767944, + "rewards/rejected": -0.8642421960830688, + "sft_loss": 1.0597431659698486, + "step": 10250 + }, + { + "epoch": 0.8, + "grad_norm": 13.157958984375, + "learning_rate": 9.906781882624483e-07, + "logits/chosen": -1.1372522115707397, + "logits/rejected": -1.2602009773254395, + "logps/chosen": -1.0265510082244873, + "logps/rejected": -6.398252487182617, + "loss": 1.0279, + "odds_ratio_loss": 0.013952387496829033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10265511274337769, + "rewards/margins": 0.5371701121330261, + "rewards/rejected": -0.6398252248764038, + "sft_loss": 1.0265510082244873, + "step": 10255 + }, + { + "epoch": 0.8, + "grad_norm": 4.697621822357178, + "learning_rate": 9.870020433391947e-07, + "logits/chosen": -1.2178454399108887, + "logits/rejected": -1.1374971866607666, + "logps/chosen": -0.9389911890029907, + "logps/rejected": -5.738818168640137, + "loss": 0.9426, + "odds_ratio_loss": 0.0358780138194561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09389911592006683, + "rewards/margins": 0.47998276352882385, + "rewards/rejected": -0.5738819241523743, + "sft_loss": 0.9389911890029907, + "step": 10260 + }, + { + "epoch": 0.8, + "grad_norm": 124.55999755859375, + "learning_rate": 9.833319846368527e-07, + "logits/chosen": -1.4724111557006836, + "logits/rejected": -1.449961543083191, + "logps/chosen": -1.1817361116409302, + "logps/rejected": -12.227134704589844, + "loss": 1.2398, + "odds_ratio_loss": 0.5806252956390381, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11817361414432526, + "rewards/margins": 1.1045401096343994, + "rewards/rejected": -1.2227134704589844, + "sft_loss": 1.1817361116409302, + "step": 10265 + }, + { + "epoch": 0.8, + "grad_norm": 19.355998992919922, + "learning_rate": 9.796680177215307e-07, + "logits/chosen": -1.2841970920562744, + "logits/rejected": -1.0065765380859375, + "logps/chosen": -1.2662365436553955, + "logps/rejected": -8.185649871826172, + "loss": 1.2759, + "odds_ratio_loss": 0.09626002609729767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12662364542484283, + "rewards/margins": 0.6919413805007935, + "rewards/rejected": -0.8185650110244751, + "sft_loss": 1.2662365436553955, + "step": 10270 + }, + { + "epoch": 0.8, + "grad_norm": 9.266251564025879, + "learning_rate": 9.76010148150102e-07, + "logits/chosen": -1.270081877708435, + "logits/rejected": -1.4700524806976318, + "logps/chosen": -0.9803020358085632, + "logps/rejected": -11.615490913391113, + "loss": 0.9894, + "odds_ratio_loss": 0.09096785634756088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.098030224442482, + "rewards/margins": 1.0635188817977905, + "rewards/rejected": -1.1615490913391113, + "sft_loss": 0.9803020358085632, + "step": 10275 + }, + { + "epoch": 0.8, + "grad_norm": 12.899735450744629, + "learning_rate": 9.723583814701904e-07, + "logits/chosen": -1.3414711952209473, + "logits/rejected": -1.2441836595535278, + "logps/chosen": -1.0551906824111938, + "logps/rejected": -6.592919826507568, + "loss": 1.0625, + "odds_ratio_loss": 0.07285496592521667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10551907122135162, + "rewards/margins": 0.5537729263305664, + "rewards/rejected": -0.6592920422554016, + "sft_loss": 1.0551906824111938, + "step": 10280 + }, + { + "epoch": 0.8, + "grad_norm": 4.892205238342285, + "learning_rate": 9.687127232201604e-07, + "logits/chosen": -1.311057209968567, + "logits/rejected": -1.251116156578064, + "logps/chosen": -0.7263120412826538, + "logps/rejected": -5.205277442932129, + "loss": 0.7392, + "odds_ratio_loss": 0.1287280023097992, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07263121008872986, + "rewards/margins": 0.4478965401649475, + "rewards/rejected": -0.520527720451355, + "sft_loss": 0.7263120412826538, + "step": 10285 + }, + { + "epoch": 0.8, + "grad_norm": 6.589762210845947, + "learning_rate": 9.650731789291191e-07, + "logits/chosen": -1.22185218334198, + "logits/rejected": -1.0001389980316162, + "logps/chosen": -0.8899606466293335, + "logps/rejected": -4.561327934265137, + "loss": 0.9094, + "odds_ratio_loss": 0.19471469521522522, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08899606764316559, + "rewards/margins": 0.36713671684265137, + "rewards/rejected": -0.4561327397823334, + "sft_loss": 0.8899606466293335, + "step": 10290 + }, + { + "epoch": 0.8, + "grad_norm": 19.73723793029785, + "learning_rate": 9.614397541168963e-07, + "logits/chosen": -1.2932441234588623, + "logits/rejected": -1.2435917854309082, + "logps/chosen": -0.8818877935409546, + "logps/rejected": -2.412689685821533, + "loss": 0.9012, + "odds_ratio_loss": 0.192880317568779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0881887823343277, + "rewards/margins": 0.15308019518852234, + "rewards/rejected": -0.24126896262168884, + "sft_loss": 0.8818877935409546, + "step": 10295 + }, + { + "epoch": 0.8, + "grad_norm": 6.862296104431152, + "learning_rate": 9.57812454294041e-07, + "logits/chosen": -1.2940229177474976, + "logits/rejected": -1.143575668334961, + "logps/chosen": -1.2678958177566528, + "logps/rejected": -11.522639274597168, + "loss": 1.3019, + "odds_ratio_loss": 0.33973243832588196, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12678958475589752, + "rewards/margins": 1.025474190711975, + "rewards/rejected": -1.152263879776001, + "sft_loss": 1.2678958177566528, + "step": 10300 + }, + { + "epoch": 0.8, + "grad_norm": 15.191943168640137, + "learning_rate": 9.541912849618157e-07, + "logits/chosen": -1.375228762626648, + "logits/rejected": -1.450534462928772, + "logps/chosen": -1.5081673860549927, + "logps/rejected": -9.468241691589355, + "loss": 1.531, + "odds_ratio_loss": 0.22854717075824738, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15081675350666046, + "rewards/margins": 0.796007513999939, + "rewards/rejected": -0.9468242526054382, + "sft_loss": 1.5081673860549927, + "step": 10305 + }, + { + "epoch": 0.8, + "grad_norm": 1.882200837135315, + "learning_rate": 9.50576251612183e-07, + "logits/chosen": -1.235004186630249, + "logits/rejected": -1.764304757118225, + "logps/chosen": -0.7166417241096497, + "logps/rejected": -11.786436080932617, + "loss": 0.7311, + "odds_ratio_loss": 0.14428307116031647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07166416943073273, + "rewards/margins": 1.1069793701171875, + "rewards/rejected": -1.1786434650421143, + "sft_loss": 0.7166417241096497, + "step": 10310 + }, + { + "epoch": 0.8, + "grad_norm": 4.966947078704834, + "learning_rate": 9.469673597277995e-07, + "logits/chosen": -1.2578141689300537, + "logits/rejected": -1.0698421001434326, + "logps/chosen": -0.5962234735488892, + "logps/rejected": -6.369223594665527, + "loss": 0.6073, + "odds_ratio_loss": 0.11079008877277374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.059622347354888916, + "rewards/margins": 0.5773000121116638, + "rewards/rejected": -0.6369223594665527, + "sft_loss": 0.5962234735488892, + "step": 10315 + }, + { + "epoch": 0.8, + "grad_norm": 25.989683151245117, + "learning_rate": 9.43364614782008e-07, + "logits/chosen": -1.3131908178329468, + "logits/rejected": -0.9322491884231567, + "logps/chosen": -1.1517027616500854, + "logps/rejected": -7.070420742034912, + "loss": 1.1769, + "odds_ratio_loss": 0.2521767020225525, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11517028510570526, + "rewards/margins": 0.5918717384338379, + "rewards/rejected": -0.7070420980453491, + "sft_loss": 1.1517027616500854, + "step": 10320 + }, + { + "epoch": 0.8, + "grad_norm": 11.165669441223145, + "learning_rate": 9.397680222388289e-07, + "logits/chosen": -1.2683213949203491, + "logits/rejected": -1.3640797138214111, + "logps/chosen": -0.9720544815063477, + "logps/rejected": -7.684467315673828, + "loss": 0.9952, + "odds_ratio_loss": 0.2314900904893875, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09720546007156372, + "rewards/margins": 0.671241283416748, + "rewards/rejected": -0.7684467434883118, + "sft_loss": 0.9720544815063477, + "step": 10325 + }, + { + "epoch": 0.8, + "grad_norm": 3.5104997158050537, + "learning_rate": 9.361775875529511e-07, + "logits/chosen": -1.3714743852615356, + "logits/rejected": -1.004473090171814, + "logps/chosen": -0.8311120271682739, + "logps/rejected": -7.721318244934082, + "loss": 0.8415, + "odds_ratio_loss": 0.10402430593967438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08311120420694351, + "rewards/margins": 0.689020574092865, + "rewards/rejected": -0.7721318006515503, + "sft_loss": 0.8311120271682739, + "step": 10330 + }, + { + "epoch": 0.8, + "grad_norm": 12.460982322692871, + "learning_rate": 9.325933161697237e-07, + "logits/chosen": -1.2947807312011719, + "logits/rejected": -1.1248445510864258, + "logps/chosen": -0.7291210293769836, + "logps/rejected": -6.313011646270752, + "loss": 0.732, + "odds_ratio_loss": 0.028608087450265884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07291209697723389, + "rewards/margins": 0.5583890676498413, + "rewards/rejected": -0.6313011646270752, + "sft_loss": 0.7291210293769836, + "step": 10335 + }, + { + "epoch": 0.8, + "grad_norm": 8.893537521362305, + "learning_rate": 9.290152135251513e-07, + "logits/chosen": -1.374703049659729, + "logits/rejected": -1.0444432497024536, + "logps/chosen": -1.0620887279510498, + "logps/rejected": -10.834625244140625, + "loss": 1.0637, + "odds_ratio_loss": 0.016157137230038643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10620886087417603, + "rewards/margins": 0.9772537350654602, + "rewards/rejected": -1.0834624767303467, + "sft_loss": 1.0620887279510498, + "step": 10340 + }, + { + "epoch": 0.8, + "grad_norm": 44.764617919921875, + "learning_rate": 9.2544328504588e-07, + "logits/chosen": -1.250694990158081, + "logits/rejected": -1.616758942604065, + "logps/chosen": -0.9803198575973511, + "logps/rejected": -8.175067901611328, + "loss": 1.0013, + "odds_ratio_loss": 0.20934104919433594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09803198277950287, + "rewards/margins": 0.7194747924804688, + "rewards/rejected": -0.8175067901611328, + "sft_loss": 0.9803198575973511, + "step": 10345 + }, + { + "epoch": 0.81, + "grad_norm": 77.59735870361328, + "learning_rate": 9.218775361491916e-07, + "logits/chosen": -1.3645678758621216, + "logits/rejected": -1.6805179119110107, + "logps/chosen": -1.9639705419540405, + "logps/rejected": -10.83612060546875, + "loss": 2.062, + "odds_ratio_loss": 0.9798789024353027, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19639703631401062, + "rewards/margins": 0.8872150182723999, + "rewards/rejected": -1.083612084388733, + "sft_loss": 1.9639705419540405, + "step": 10350 + }, + { + "epoch": 0.81, + "grad_norm": 31.32939338684082, + "learning_rate": 9.183179722429997e-07, + "logits/chosen": -1.4463050365447998, + "logits/rejected": -1.1317588090896606, + "logps/chosen": -1.1434721946716309, + "logps/rejected": -7.519809722900391, + "loss": 1.1464, + "odds_ratio_loss": 0.029724955558776855, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11434721946716309, + "rewards/margins": 0.6376338005065918, + "rewards/rejected": -0.7519810199737549, + "sft_loss": 1.1434721946716309, + "step": 10355 + }, + { + "epoch": 0.81, + "grad_norm": 4.483221530914307, + "learning_rate": 9.14764598725833e-07, + "logits/chosen": -1.2005016803741455, + "logits/rejected": -1.0444400310516357, + "logps/chosen": -1.065643072128296, + "logps/rejected": -8.766390800476074, + "loss": 1.0659, + "odds_ratio_loss": 0.002804366173222661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10656432807445526, + "rewards/margins": 0.7700749039649963, + "rewards/rejected": -0.876639187335968, + "sft_loss": 1.065643072128296, + "step": 10360 + }, + { + "epoch": 0.81, + "grad_norm": 6.713079929351807, + "learning_rate": 9.112174209868341e-07, + "logits/chosen": -1.287014126777649, + "logits/rejected": -1.178283452987671, + "logps/chosen": -0.884039044380188, + "logps/rejected": -5.378254413604736, + "loss": 0.9081, + "odds_ratio_loss": 0.2402028739452362, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08840389549732208, + "rewards/margins": 0.44942155480384827, + "rewards/rejected": -0.5378254652023315, + "sft_loss": 0.884039044380188, + "step": 10365 + }, + { + "epoch": 0.81, + "grad_norm": 7.158381938934326, + "learning_rate": 9.07676444405749e-07, + "logits/chosen": -1.3042536973953247, + "logits/rejected": -1.2625794410705566, + "logps/chosen": -0.8462556004524231, + "logps/rejected": -7.293604373931885, + "loss": 0.8499, + "odds_ratio_loss": 0.03648758679628372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08462555706501007, + "rewards/margins": 0.6447348594665527, + "rewards/rejected": -0.7293604016304016, + "sft_loss": 0.8462556004524231, + "step": 10370 + }, + { + "epoch": 0.81, + "grad_norm": 121.68801879882812, + "learning_rate": 9.041416743529168e-07, + "logits/chosen": -1.4717400074005127, + "logits/rejected": -1.0101182460784912, + "logps/chosen": -0.9590436816215515, + "logps/rejected": -4.982767581939697, + "loss": 0.9694, + "odds_ratio_loss": 0.10307572036981583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09590436518192291, + "rewards/margins": 0.4023723602294922, + "rewards/rejected": -0.4982767701148987, + "sft_loss": 0.9590436816215515, + "step": 10375 + }, + { + "epoch": 0.81, + "grad_norm": 5.824394702911377, + "learning_rate": 9.006131161892662e-07, + "logits/chosen": -1.3937454223632812, + "logits/rejected": -1.1171293258666992, + "logps/chosen": -0.864578366279602, + "logps/rejected": -5.059448719024658, + "loss": 0.9235, + "odds_ratio_loss": 0.58892422914505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08645783364772797, + "rewards/margins": 0.4194870591163635, + "rewards/rejected": -0.5059448480606079, + "sft_loss": 0.864578366279602, + "step": 10380 + }, + { + "epoch": 0.81, + "grad_norm": 77.57972717285156, + "learning_rate": 8.970907752663021e-07, + "logits/chosen": -1.0274670124053955, + "logits/rejected": -1.433475136756897, + "logps/chosen": -1.136293888092041, + "logps/rejected": -11.400399208068848, + "loss": 1.1385, + "odds_ratio_loss": 0.021981341764330864, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11362940073013306, + "rewards/margins": 1.026410460472107, + "rewards/rejected": -1.1400400400161743, + "sft_loss": 1.136293888092041, + "step": 10385 + }, + { + "epoch": 0.81, + "grad_norm": 8.37480354309082, + "learning_rate": 8.935746569261045e-07, + "logits/chosen": -1.3487589359283447, + "logits/rejected": -1.5885874032974243, + "logps/chosen": -1.007812261581421, + "logps/rejected": -17.21298599243164, + "loss": 1.0078, + "odds_ratio_loss": 0.0001933839957928285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10078122466802597, + "rewards/margins": 1.620517373085022, + "rewards/rejected": -1.7212985754013062, + "sft_loss": 1.007812261581421, + "step": 10390 + }, + { + "epoch": 0.81, + "grad_norm": 134.8335418701172, + "learning_rate": 8.900647665013112e-07, + "logits/chosen": -1.3777470588684082, + "logits/rejected": -1.3657658100128174, + "logps/chosen": -1.1525741815567017, + "logps/rejected": -7.184072017669678, + "loss": 1.1571, + "odds_ratio_loss": 0.045270055532455444, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11525741964578629, + "rewards/margins": 0.6031497716903687, + "rewards/rejected": -0.718407154083252, + "sft_loss": 1.1525741815567017, + "step": 10395 + }, + { + "epoch": 0.81, + "grad_norm": 9.768054008483887, + "learning_rate": 8.865611093151161e-07, + "logits/chosen": -1.4446382522583008, + "logits/rejected": -1.2962652444839478, + "logps/chosen": -0.9836952090263367, + "logps/rejected": -13.642313957214355, + "loss": 0.9951, + "odds_ratio_loss": 0.11401750147342682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09836951643228531, + "rewards/margins": 1.2658618688583374, + "rewards/rejected": -1.3642313480377197, + "sft_loss": 0.9836952090263367, + "step": 10400 + }, + { + "epoch": 0.81, + "grad_norm": 9.077279090881348, + "learning_rate": 8.830636906812628e-07, + "logits/chosen": -1.2882994413375854, + "logits/rejected": -1.3112610578536987, + "logps/chosen": -0.8100179433822632, + "logps/rejected": -6.998376369476318, + "loss": 0.8153, + "odds_ratio_loss": 0.0527278408408165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08100180327892303, + "rewards/margins": 0.6188358664512634, + "rewards/rejected": -0.6998376846313477, + "sft_loss": 0.8100179433822632, + "step": 10405 + }, + { + "epoch": 0.81, + "grad_norm": 11.235574722290039, + "learning_rate": 8.795725159040286e-07, + "logits/chosen": -1.18384850025177, + "logits/rejected": -1.3355462551116943, + "logps/chosen": -0.8854089975357056, + "logps/rejected": -9.00184154510498, + "loss": 0.8944, + "odds_ratio_loss": 0.09034241735935211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08854089677333832, + "rewards/margins": 0.8116433024406433, + "rewards/rejected": -0.900184154510498, + "sft_loss": 0.8854089975357056, + "step": 10410 + }, + { + "epoch": 0.81, + "grad_norm": 6.1802496910095215, + "learning_rate": 8.760875902782235e-07, + "logits/chosen": -1.1961528062820435, + "logits/rejected": -1.2074816226959229, + "logps/chosen": -1.2426180839538574, + "logps/rejected": -10.89729118347168, + "loss": 1.2475, + "odds_ratio_loss": 0.048336226493120193, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12426181882619858, + "rewards/margins": 0.9654672741889954, + "rewards/rejected": -1.0897290706634521, + "sft_loss": 1.2426180839538574, + "step": 10415 + }, + { + "epoch": 0.81, + "grad_norm": 26.631731033325195, + "learning_rate": 8.726089190891807e-07, + "logits/chosen": -1.357290506362915, + "logits/rejected": -1.6016134023666382, + "logps/chosen": -1.1133967638015747, + "logps/rejected": -4.950104236602783, + "loss": 1.1918, + "odds_ratio_loss": 0.7840924263000488, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11133966594934464, + "rewards/margins": 0.38367074728012085, + "rewards/rejected": -0.4950104355812073, + "sft_loss": 1.1133967638015747, + "step": 10420 + }, + { + "epoch": 0.81, + "grad_norm": 10.378015518188477, + "learning_rate": 8.691365076127461e-07, + "logits/chosen": -1.4252339601516724, + "logits/rejected": -1.4014484882354736, + "logps/chosen": -0.8036090135574341, + "logps/rejected": -9.63952922821045, + "loss": 0.8036, + "odds_ratio_loss": 0.00032946295686997473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08036090433597565, + "rewards/margins": 0.8835920095443726, + "rewards/rejected": -0.963952898979187, + "sft_loss": 0.8036090135574341, + "step": 10425 + }, + { + "epoch": 0.81, + "grad_norm": 10.65987777709961, + "learning_rate": 8.656703611152728e-07, + "logits/chosen": -1.2951816320419312, + "logits/rejected": -1.3539683818817139, + "logps/chosen": -0.9659156799316406, + "logps/rejected": -15.016809463500977, + "loss": 0.9659, + "odds_ratio_loss": 4.964599429513328e-05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09659156203269958, + "rewards/margins": 1.4050893783569336, + "rewards/rejected": -1.5016809701919556, + "sft_loss": 0.9659156799316406, + "step": 10430 + }, + { + "epoch": 0.81, + "grad_norm": 10.26109504699707, + "learning_rate": 8.622104848536117e-07, + "logits/chosen": -1.2839380502700806, + "logits/rejected": -0.8194013833999634, + "logps/chosen": -0.883658766746521, + "logps/rejected": -5.024937152862549, + "loss": 0.8999, + "odds_ratio_loss": 0.16276155412197113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08836588263511658, + "rewards/margins": 0.41412782669067383, + "rewards/rejected": -0.5024937391281128, + "sft_loss": 0.883658766746521, + "step": 10435 + }, + { + "epoch": 0.81, + "grad_norm": 7.266590595245361, + "learning_rate": 8.587568840751043e-07, + "logits/chosen": -1.2334734201431274, + "logits/rejected": -1.316857099533081, + "logps/chosen": -1.055189847946167, + "logps/rejected": -8.211328506469727, + "loss": 1.0711, + "odds_ratio_loss": 0.1588059365749359, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10551897436380386, + "rewards/margins": 0.7156140208244324, + "rewards/rejected": -0.8211329579353333, + "sft_loss": 1.055189847946167, + "step": 10440 + }, + { + "epoch": 0.81, + "grad_norm": 25.591766357421875, + "learning_rate": 8.553095640175751e-07, + "logits/chosen": -1.3331174850463867, + "logits/rejected": -0.950304388999939, + "logps/chosen": -1.043084979057312, + "logps/rejected": -7.412625789642334, + "loss": 1.0696, + "odds_ratio_loss": 0.26476946473121643, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10430850088596344, + "rewards/margins": 0.636954128742218, + "rewards/rejected": -0.7412625551223755, + "sft_loss": 1.043084979057312, + "step": 10445 + }, + { + "epoch": 0.81, + "grad_norm": 41.862178802490234, + "learning_rate": 8.518685299093216e-07, + "logits/chosen": -1.2142364978790283, + "logits/rejected": -1.0974957942962646, + "logps/chosen": -0.8024848699569702, + "logps/rejected": -10.103257179260254, + "loss": 0.8044, + "odds_ratio_loss": 0.019633423537015915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08024848997592926, + "rewards/margins": 0.9300772547721863, + "rewards/rejected": -1.0103256702423096, + "sft_loss": 0.8024848699569702, + "step": 10450 + }, + { + "epoch": 0.81, + "grad_norm": 6.318872928619385, + "learning_rate": 8.484337869691106e-07, + "logits/chosen": -1.2735618352890015, + "logits/rejected": -1.0776972770690918, + "logps/chosen": -1.2491729259490967, + "logps/rejected": -3.0578088760375977, + "loss": 1.286, + "odds_ratio_loss": 0.3680502772331238, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12491729110479355, + "rewards/margins": 0.18086357414722443, + "rewards/rejected": -0.3057808578014374, + "sft_loss": 1.2491729259490967, + "step": 10455 + }, + { + "epoch": 0.81, + "grad_norm": 8.459473609924316, + "learning_rate": 8.450053404061654e-07, + "logits/chosen": -1.4461396932601929, + "logits/rejected": -0.8473329544067383, + "logps/chosen": -1.0210599899291992, + "logps/rejected": -7.846470832824707, + "loss": 1.0504, + "odds_ratio_loss": 0.2929363548755646, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1021059975028038, + "rewards/margins": 0.6825410723686218, + "rewards/rejected": -0.7846471071243286, + "sft_loss": 1.0210599899291992, + "step": 10460 + }, + { + "epoch": 0.81, + "grad_norm": 4.896653175354004, + "learning_rate": 8.415831954201587e-07, + "logits/chosen": -1.2814228534698486, + "logits/rejected": -0.8261783719062805, + "logps/chosen": -0.9416384696960449, + "logps/rejected": -12.27811336517334, + "loss": 0.9628, + "odds_ratio_loss": 0.21157538890838623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09416385740041733, + "rewards/margins": 1.1336476802825928, + "rewards/rejected": -1.2278114557266235, + "sft_loss": 0.9416384696960449, + "step": 10465 + }, + { + "epoch": 0.81, + "grad_norm": 7.460947036743164, + "learning_rate": 8.3816735720121e-07, + "logits/chosen": -1.5645605325698853, + "logits/rejected": -1.104880928993225, + "logps/chosen": -1.0965392589569092, + "logps/rejected": -6.61223840713501, + "loss": 1.1432, + "odds_ratio_loss": 0.4668787121772766, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10965392738580704, + "rewards/margins": 0.5515699982643127, + "rewards/rejected": -0.6612239480018616, + "sft_loss": 1.0965392589569092, + "step": 10470 + }, + { + "epoch": 0.81, + "grad_norm": 17.06568145751953, + "learning_rate": 8.347578309298715e-07, + "logits/chosen": -1.293723702430725, + "logits/rejected": -1.6184883117675781, + "logps/chosen": -0.9423452615737915, + "logps/rejected": -11.385828971862793, + "loss": 0.9426, + "odds_ratio_loss": 0.0028821465093642473, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09423451125621796, + "rewards/margins": 1.0443484783172607, + "rewards/rejected": -1.1385829448699951, + "sft_loss": 0.9423452615737915, + "step": 10475 + }, + { + "epoch": 0.82, + "grad_norm": 10.77719497680664, + "learning_rate": 8.313546217771224e-07, + "logits/chosen": -1.4554004669189453, + "logits/rejected": -1.0656113624572754, + "logps/chosen": -0.8173044323921204, + "logps/rejected": -1.7137502431869507, + "loss": 0.8712, + "odds_ratio_loss": 0.53852379322052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0817304402589798, + "rewards/margins": 0.08964459598064423, + "rewards/rejected": -0.17137503623962402, + "sft_loss": 0.8173044323921204, + "step": 10480 + }, + { + "epoch": 0.82, + "grad_norm": 5.053610801696777, + "learning_rate": 8.27957734904361e-07, + "logits/chosen": -1.4066091775894165, + "logits/rejected": -0.704928994178772, + "logps/chosen": -0.9944049715995789, + "logps/rejected": -6.750539302825928, + "loss": 1.0029, + "odds_ratio_loss": 0.08509130030870438, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09944050014019012, + "rewards/margins": 0.5756133794784546, + "rewards/rejected": -0.6750538945198059, + "sft_loss": 0.9944049715995789, + "step": 10485 + }, + { + "epoch": 0.82, + "grad_norm": 5.587077617645264, + "learning_rate": 8.245671754633977e-07, + "logits/chosen": -1.446058988571167, + "logits/rejected": -1.1737011671066284, + "logps/chosen": -0.727880597114563, + "logps/rejected": -6.857645511627197, + "loss": 0.7469, + "odds_ratio_loss": 0.19013085961341858, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0727880671620369, + "rewards/margins": 0.6129764914512634, + "rewards/rejected": -0.6857645511627197, + "sft_loss": 0.727880597114563, + "step": 10490 + }, + { + "epoch": 0.82, + "grad_norm": 6.152067184448242, + "learning_rate": 8.211829485964462e-07, + "logits/chosen": -1.3971049785614014, + "logits/rejected": -1.288309097290039, + "logps/chosen": -1.004244089126587, + "logps/rejected": -6.761834621429443, + "loss": 1.0273, + "odds_ratio_loss": 0.23082295060157776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10042442381381989, + "rewards/margins": 0.5757590532302856, + "rewards/rejected": -0.6761834621429443, + "sft_loss": 1.004244089126587, + "step": 10495 + }, + { + "epoch": 0.82, + "grad_norm": 79.3951416015625, + "learning_rate": 8.178050594361153e-07, + "logits/chosen": -1.2680330276489258, + "logits/rejected": -1.05678391456604, + "logps/chosen": -0.8385862112045288, + "logps/rejected": -4.127747535705566, + "loss": 0.8477, + "odds_ratio_loss": 0.09067679941654205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08385862410068512, + "rewards/margins": 0.32891613245010376, + "rewards/rejected": -0.4127747416496277, + "sft_loss": 0.8385862112045288, + "step": 10500 + }, + { + "epoch": 0.82, + "grad_norm": 5.112764835357666, + "learning_rate": 8.144335131054054e-07, + "logits/chosen": -1.1656320095062256, + "logits/rejected": -1.1373666524887085, + "logps/chosen": -1.0548193454742432, + "logps/rejected": -10.11992359161377, + "loss": 1.0553, + "odds_ratio_loss": 0.004356134682893753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10548193752765656, + "rewards/margins": 0.9065103530883789, + "rewards/rejected": -1.011992335319519, + "sft_loss": 1.0548193454742432, + "step": 10505 + }, + { + "epoch": 0.82, + "grad_norm": 38.178993225097656, + "learning_rate": 8.110683147176929e-07, + "logits/chosen": -1.3449701070785522, + "logits/rejected": -1.1206490993499756, + "logps/chosen": -0.8311041593551636, + "logps/rejected": -4.470694541931152, + "loss": 0.8728, + "odds_ratio_loss": 0.4171561300754547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08311041444540024, + "rewards/margins": 0.36395907402038574, + "rewards/rejected": -0.4470694959163666, + "sft_loss": 0.8311041593551636, + "step": 10510 + }, + { + "epoch": 0.82, + "grad_norm": 7.466664791107178, + "learning_rate": 8.077094693767274e-07, + "logits/chosen": -1.4264633655548096, + "logits/rejected": -0.9952713251113892, + "logps/chosen": -0.6681760549545288, + "logps/rejected": -4.286257743835449, + "loss": 0.6758, + "odds_ratio_loss": 0.07590831816196442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06681760400533676, + "rewards/margins": 0.3618081510066986, + "rewards/rejected": -0.42862576246261597, + "sft_loss": 0.6681760549545288, + "step": 10515 + }, + { + "epoch": 0.82, + "grad_norm": 6.742861270904541, + "learning_rate": 8.043569821766267e-07, + "logits/chosen": -1.3641449213027954, + "logits/rejected": -0.7682236433029175, + "logps/chosen": -0.9841006994247437, + "logps/rejected": -9.281465530395508, + "loss": 0.9896, + "odds_ratio_loss": 0.055423516780138016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09841008484363556, + "rewards/margins": 0.8297365307807922, + "rewards/rejected": -0.9281465411186218, + "sft_loss": 0.9841006994247437, + "step": 10520 + }, + { + "epoch": 0.82, + "grad_norm": 45.302364349365234, + "learning_rate": 8.010108582018622e-07, + "logits/chosen": -1.3702471256256104, + "logits/rejected": -1.1153770685195923, + "logps/chosen": -1.0117090940475464, + "logps/rejected": -2.893731117248535, + "loss": 1.0545, + "odds_ratio_loss": 0.42815542221069336, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10117091238498688, + "rewards/margins": 0.18820220232009888, + "rewards/rejected": -0.28937309980392456, + "sft_loss": 1.0117090940475464, + "step": 10525 + }, + { + "epoch": 0.82, + "grad_norm": 8.680965423583984, + "learning_rate": 7.976711025272549e-07, + "logits/chosen": -1.3835210800170898, + "logits/rejected": -1.3382296562194824, + "logps/chosen": -0.7052081227302551, + "logps/rejected": -10.351499557495117, + "loss": 0.7243, + "odds_ratio_loss": 0.19136479496955872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0705208107829094, + "rewards/margins": 0.9646291732788086, + "rewards/rejected": -1.0351499319076538, + "sft_loss": 0.7052081227302551, + "step": 10530 + }, + { + "epoch": 0.82, + "grad_norm": 37.77129364013672, + "learning_rate": 7.943377202179697e-07, + "logits/chosen": -1.252140998840332, + "logits/rejected": -0.9507595300674438, + "logps/chosen": -0.8741302490234375, + "logps/rejected": -8.739225387573242, + "loss": 0.8753, + "odds_ratio_loss": 0.01146181020885706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08741302788257599, + "rewards/margins": 0.7865095734596252, + "rewards/rejected": -0.8739225268363953, + "sft_loss": 0.8741302490234375, + "step": 10535 + }, + { + "epoch": 0.82, + "grad_norm": 9.636751174926758, + "learning_rate": 7.910107163295034e-07, + "logits/chosen": -1.291235327720642, + "logits/rejected": -1.383276343345642, + "logps/chosen": -1.13475501537323, + "logps/rejected": -11.56714916229248, + "loss": 1.1404, + "odds_ratio_loss": 0.056367408484220505, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.113475501537323, + "rewards/margins": 1.0432393550872803, + "rewards/rejected": -1.1567147970199585, + "sft_loss": 1.13475501537323, + "step": 10540 + }, + { + "epoch": 0.82, + "grad_norm": 13.887544631958008, + "learning_rate": 7.876900959076806e-07, + "logits/chosen": -1.290006160736084, + "logits/rejected": -0.8440510034561157, + "logps/chosen": -0.8624340295791626, + "logps/rejected": -2.5278334617614746, + "loss": 0.8844, + "odds_ratio_loss": 0.21950320899486542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0862434059381485, + "rewards/margins": 0.1665399670600891, + "rewards/rejected": -0.2527833580970764, + "sft_loss": 0.8624340295791626, + "step": 10545 + }, + { + "epoch": 0.82, + "grad_norm": 37.36802673339844, + "learning_rate": 7.843758639886423e-07, + "logits/chosen": -1.547479510307312, + "logits/rejected": -1.271514654159546, + "logps/chosen": -1.0148048400878906, + "logps/rejected": -2.815554141998291, + "loss": 1.0743, + "odds_ratio_loss": 0.5949819087982178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10148048400878906, + "rewards/margins": 0.18007493019104004, + "rewards/rejected": -0.2815554141998291, + "sft_loss": 1.0148048400878906, + "step": 10550 + }, + { + "epoch": 0.82, + "grad_norm": 3.9318273067474365, + "learning_rate": 7.810680255988428e-07, + "logits/chosen": -1.4175784587860107, + "logits/rejected": -1.2164119482040405, + "logps/chosen": -0.9271456003189087, + "logps/rejected": -18.189111709594727, + "loss": 0.9413, + "odds_ratio_loss": 0.1411227434873581, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09271456301212311, + "rewards/margins": 1.7261966466903687, + "rewards/rejected": -1.8189113140106201, + "sft_loss": 0.9271456003189087, + "step": 10555 + }, + { + "epoch": 0.82, + "grad_norm": 24.3516845703125, + "learning_rate": 7.777665857550392e-07, + "logits/chosen": -1.471502661705017, + "logits/rejected": -1.1980304718017578, + "logps/chosen": -0.7790975570678711, + "logps/rejected": -4.073418140411377, + "loss": 0.8185, + "odds_ratio_loss": 0.39446350932121277, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07790975272655487, + "rewards/margins": 0.32943207025527954, + "rewards/rejected": -0.4073418080806732, + "sft_loss": 0.7790975570678711, + "step": 10560 + }, + { + "epoch": 0.82, + "grad_norm": 33.29905700683594, + "learning_rate": 7.74471549464283e-07, + "logits/chosen": -1.2413263320922852, + "logits/rejected": -1.1940476894378662, + "logps/chosen": -0.9201310873031616, + "logps/rejected": -9.094204902648926, + "loss": 0.9233, + "odds_ratio_loss": 0.031404972076416016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09201310575008392, + "rewards/margins": 0.8174074292182922, + "rewards/rejected": -0.909420371055603, + "sft_loss": 0.9201310873031616, + "step": 10565 + }, + { + "epoch": 0.82, + "grad_norm": 62.213478088378906, + "learning_rate": 7.711829217239169e-07, + "logits/chosen": -1.107883095741272, + "logits/rejected": -1.278570532798767, + "logps/chosen": -1.1935635805130005, + "logps/rejected": -7.609461307525635, + "loss": 1.214, + "odds_ratio_loss": 0.2044333517551422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11935635656118393, + "rewards/margins": 0.6415897607803345, + "rewards/rejected": -0.7609461545944214, + "sft_loss": 1.1935635805130005, + "step": 10570 + }, + { + "epoch": 0.82, + "grad_norm": 21.677597045898438, + "learning_rate": 7.679007075215616e-07, + "logits/chosen": -1.4407821893692017, + "logits/rejected": -0.8825246095657349, + "logps/chosen": -1.1419273614883423, + "logps/rejected": -4.68762731552124, + "loss": 1.154, + "odds_ratio_loss": 0.12092401832342148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11419273912906647, + "rewards/margins": 0.3545700013637543, + "rewards/rejected": -0.46876272559165955, + "sft_loss": 1.1419273614883423, + "step": 10575 + }, + { + "epoch": 0.82, + "grad_norm": 60.53403854370117, + "learning_rate": 7.646249118351106e-07, + "logits/chosen": -1.1642895936965942, + "logits/rejected": -1.0220810174942017, + "logps/chosen": -0.8388074636459351, + "logps/rejected": -7.616601467132568, + "loss": 0.8497, + "odds_ratio_loss": 0.1086864247918129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08388075977563858, + "rewards/margins": 0.6777793765068054, + "rewards/rejected": -0.7616601586341858, + "sft_loss": 0.8388074636459351, + "step": 10580 + }, + { + "epoch": 0.82, + "grad_norm": 8.826997756958008, + "learning_rate": 7.61355539632726e-07, + "logits/chosen": -1.378531575202942, + "logits/rejected": -0.9921766519546509, + "logps/chosen": -0.8884096145629883, + "logps/rejected": -5.2243757247924805, + "loss": 0.9135, + "odds_ratio_loss": 0.25051718950271606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08884096145629883, + "rewards/margins": 0.433596670627594, + "rewards/rejected": -0.522437572479248, + "sft_loss": 0.8884096145629883, + "step": 10585 + }, + { + "epoch": 0.82, + "grad_norm": 12.614011764526367, + "learning_rate": 7.580925958728247e-07, + "logits/chosen": -1.5484821796417236, + "logits/rejected": -1.3846181631088257, + "logps/chosen": -0.9231731295585632, + "logps/rejected": -14.208297729492188, + "loss": 0.9368, + "odds_ratio_loss": 0.13594523072242737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09231732785701752, + "rewards/margins": 1.32851243019104, + "rewards/rejected": -1.4208297729492188, + "sft_loss": 0.9231731295585632, + "step": 10590 + }, + { + "epoch": 0.82, + "grad_norm": 21.662193298339844, + "learning_rate": 7.548360855040754e-07, + "logits/chosen": -1.4308409690856934, + "logits/rejected": -0.9334679841995239, + "logps/chosen": -0.8962264060974121, + "logps/rejected": -10.339228630065918, + "loss": 0.8963, + "odds_ratio_loss": 0.0004231159982737154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0896226316690445, + "rewards/margins": 0.9443003535270691, + "rewards/rejected": -1.0339229106903076, + "sft_loss": 0.8962264060974121, + "step": 10595 + }, + { + "epoch": 0.82, + "grad_norm": 7.128372669219971, + "learning_rate": 7.515860134653897e-07, + "logits/chosen": -1.347833275794983, + "logits/rejected": -1.1420245170593262, + "logps/chosen": -0.8281259536743164, + "logps/rejected": -7.346166133880615, + "loss": 0.8299, + "odds_ratio_loss": 0.017481762915849686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0828125923871994, + "rewards/margins": 0.6518040895462036, + "rewards/rejected": -0.7346166372299194, + "sft_loss": 0.8281259536743164, + "step": 10600 + }, + { + "epoch": 0.82, + "grad_norm": 14.502819061279297, + "learning_rate": 7.483423846859133e-07, + "logits/chosen": -1.3140825033187866, + "logits/rejected": -1.5204674005508423, + "logps/chosen": -0.798182487487793, + "logps/rejected": -5.050746917724609, + "loss": 0.8016, + "odds_ratio_loss": 0.03423731029033661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0798182487487793, + "rewards/margins": 0.4252564311027527, + "rewards/rejected": -0.505074679851532, + "sft_loss": 0.798182487487793, + "step": 10605 + }, + { + "epoch": 0.83, + "grad_norm": 4.6367716789245605, + "learning_rate": 7.451052040850221e-07, + "logits/chosen": -1.4010194540023804, + "logits/rejected": -0.8769040107727051, + "logps/chosen": -1.209674596786499, + "logps/rejected": -6.710930824279785, + "loss": 1.2768, + "odds_ratio_loss": 0.670904815196991, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12096747010946274, + "rewards/margins": 0.5501256585121155, + "rewards/rejected": -0.6710931062698364, + "sft_loss": 1.209674596786499, + "step": 10610 + }, + { + "epoch": 0.83, + "grad_norm": 26.443431854248047, + "learning_rate": 7.418744765723118e-07, + "logits/chosen": -1.3006012439727783, + "logits/rejected": -1.4145643711090088, + "logps/chosen": -0.8388000726699829, + "logps/rejected": -5.752911567687988, + "loss": 0.8537, + "odds_ratio_loss": 0.14872387051582336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08388000726699829, + "rewards/margins": 0.49141111969947815, + "rewards/rejected": -0.5752911567687988, + "sft_loss": 0.8388000726699829, + "step": 10615 + }, + { + "epoch": 0.83, + "grad_norm": 22.965103149414062, + "learning_rate": 7.386502070475904e-07, + "logits/chosen": -1.3692954778671265, + "logits/rejected": -0.9632579684257507, + "logps/chosen": -1.0018607378005981, + "logps/rejected": -7.105264186859131, + "loss": 1.0268, + "odds_ratio_loss": 0.24981114268302917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1001860722899437, + "rewards/margins": 0.610340416431427, + "rewards/rejected": -0.7105264663696289, + "sft_loss": 1.0018607378005981, + "step": 10620 + }, + { + "epoch": 0.83, + "grad_norm": 9.114238739013672, + "learning_rate": 7.354324004008723e-07, + "logits/chosen": -1.4396076202392578, + "logits/rejected": -0.8397786021232605, + "logps/chosen": -0.9166440963745117, + "logps/rejected": -2.550844669342041, + "loss": 0.9506, + "odds_ratio_loss": 0.3398217558860779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09166441112756729, + "rewards/margins": 0.16342003643512726, + "rewards/rejected": -0.25508445501327515, + "sft_loss": 0.9166440963745117, + "step": 10625 + }, + { + "epoch": 0.83, + "grad_norm": 21.649497985839844, + "learning_rate": 7.322210615123688e-07, + "logits/chosen": -1.4119873046875, + "logits/rejected": -1.0898716449737549, + "logps/chosen": -0.959582507610321, + "logps/rejected": -5.670458793640137, + "loss": 1.0098, + "odds_ratio_loss": 0.5025702714920044, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09595826268196106, + "rewards/margins": 0.4710876941680908, + "rewards/rejected": -0.5670458674430847, + "sft_loss": 0.959582507610321, + "step": 10630 + }, + { + "epoch": 0.83, + "grad_norm": 15.845293045043945, + "learning_rate": 7.290161952524843e-07, + "logits/chosen": -1.4333869218826294, + "logits/rejected": -1.5032893419265747, + "logps/chosen": -0.8169578313827515, + "logps/rejected": -10.404666900634766, + "loss": 0.8321, + "odds_ratio_loss": 0.15138432383537292, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0816957876086235, + "rewards/margins": 0.9587709307670593, + "rewards/rejected": -1.0404666662216187, + "sft_loss": 0.8169578313827515, + "step": 10635 + }, + { + "epoch": 0.83, + "grad_norm": 6.184009552001953, + "learning_rate": 7.258178064818056e-07, + "logits/chosen": -1.3951168060302734, + "logits/rejected": -1.4423763751983643, + "logps/chosen": -4.069607734680176, + "logps/rejected": -13.736379623413086, + "loss": 4.1026, + "odds_ratio_loss": 0.3303770422935486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.40696078538894653, + "rewards/margins": 0.9666773080825806, + "rewards/rejected": -1.3736379146575928, + "sft_loss": 4.069607734680176, + "step": 10640 + }, + { + "epoch": 0.83, + "grad_norm": 6.67935037612915, + "learning_rate": 7.226259000510932e-07, + "logits/chosen": -1.2222254276275635, + "logits/rejected": -1.2913544178009033, + "logps/chosen": -1.056786298751831, + "logps/rejected": -5.585196018218994, + "loss": 1.07, + "odds_ratio_loss": 0.1319592297077179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10567863285541534, + "rewards/margins": 0.4528409540653229, + "rewards/rejected": -0.5585195422172546, + "sft_loss": 1.056786298751831, + "step": 10645 + }, + { + "epoch": 0.83, + "grad_norm": 69.80462646484375, + "learning_rate": 7.194404808012811e-07, + "logits/chosen": -1.450503945350647, + "logits/rejected": -0.8002532124519348, + "logps/chosen": -1.2184422016143799, + "logps/rejected": -6.215879440307617, + "loss": 1.2687, + "odds_ratio_loss": 0.5026370882987976, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12184421718120575, + "rewards/margins": 0.4997437596321106, + "rewards/rejected": -0.6215879321098328, + "sft_loss": 1.2184422016143799, + "step": 10650 + }, + { + "epoch": 0.83, + "grad_norm": 14.467700004577637, + "learning_rate": 7.162615535634609e-07, + "logits/chosen": -1.3063253164291382, + "logits/rejected": -1.1163030862808228, + "logps/chosen": -1.0223617553710938, + "logps/rejected": -2.57954740524292, + "loss": 1.04, + "odds_ratio_loss": 0.17633824050426483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10223618894815445, + "rewards/margins": 0.15571856498718262, + "rewards/rejected": -0.25795474648475647, + "sft_loss": 1.0223617553710938, + "step": 10655 + }, + { + "epoch": 0.83, + "grad_norm": 6.114673137664795, + "learning_rate": 7.130891231588794e-07, + "logits/chosen": -1.2796623706817627, + "logits/rejected": -0.8105155229568481, + "logps/chosen": -1.2943050861358643, + "logps/rejected": -2.6557037830352783, + "loss": 1.3144, + "odds_ratio_loss": 0.20114263892173767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12943051755428314, + "rewards/margins": 0.1361398547887802, + "rewards/rejected": -0.26557037234306335, + "sft_loss": 1.2943050861358643, + "step": 10660 + }, + { + "epoch": 0.83, + "grad_norm": 24.761369705200195, + "learning_rate": 7.099231943989299e-07, + "logits/chosen": -1.2193286418914795, + "logits/rejected": -1.3471779823303223, + "logps/chosen": -0.9623664617538452, + "logps/rejected": -6.8237504959106445, + "loss": 0.98, + "odds_ratio_loss": 0.17595073580741882, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09623664617538452, + "rewards/margins": 0.5861383676528931, + "rewards/rejected": -0.6823750734329224, + "sft_loss": 0.9623664617538452, + "step": 10665 + }, + { + "epoch": 0.83, + "grad_norm": 15.483434677124023, + "learning_rate": 7.067637720851451e-07, + "logits/chosen": -1.322331428527832, + "logits/rejected": -1.5640705823898315, + "logps/chosen": -0.6776281595230103, + "logps/rejected": -8.613502502441406, + "loss": 0.698, + "odds_ratio_loss": 0.2034587413072586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0677628144621849, + "rewards/margins": 0.793587327003479, + "rewards/rejected": -0.8613502383232117, + "sft_loss": 0.6776281595230103, + "step": 10670 + }, + { + "epoch": 0.83, + "grad_norm": 14.496145248413086, + "learning_rate": 7.036108610091896e-07, + "logits/chosen": -1.404266595840454, + "logits/rejected": -1.0386518239974976, + "logps/chosen": -1.0749711990356445, + "logps/rejected": -3.9387428760528564, + "loss": 1.1257, + "odds_ratio_loss": 0.5070328712463379, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.10749713331460953, + "rewards/margins": 0.2863771915435791, + "rewards/rejected": -0.39387431740760803, + "sft_loss": 1.0749711990356445, + "step": 10675 + }, + { + "epoch": 0.83, + "grad_norm": 316.1288757324219, + "learning_rate": 7.004644659528559e-07, + "logits/chosen": -0.9859359860420227, + "logits/rejected": -1.2559010982513428, + "logps/chosen": -0.8468478322029114, + "logps/rejected": -9.959203720092773, + "loss": 0.855, + "odds_ratio_loss": 0.08120620250701904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08468478918075562, + "rewards/margins": 0.9112356901168823, + "rewards/rejected": -0.9959205389022827, + "sft_loss": 0.8468478322029114, + "step": 10680 + }, + { + "epoch": 0.83, + "grad_norm": 9.3912353515625, + "learning_rate": 6.973245916880494e-07, + "logits/chosen": -1.4307042360305786, + "logits/rejected": -1.6631122827529907, + "logps/chosen": -1.0856106281280518, + "logps/rejected": -15.808072090148926, + "loss": 1.0857, + "odds_ratio_loss": 0.000607538444455713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10856107622385025, + "rewards/margins": 1.4722459316253662, + "rewards/rejected": -1.580807089805603, + "sft_loss": 1.0856106281280518, + "step": 10685 + }, + { + "epoch": 0.83, + "grad_norm": 48.749019622802734, + "learning_rate": 6.941912429767883e-07, + "logits/chosen": -1.369105577468872, + "logits/rejected": -1.0214558839797974, + "logps/chosen": -0.8400972485542297, + "logps/rejected": -6.254521369934082, + "loss": 0.8957, + "odds_ratio_loss": 0.5555503964424133, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08400972187519073, + "rewards/margins": 0.5414424538612366, + "rewards/rejected": -0.6254521012306213, + "sft_loss": 0.8400972485542297, + "step": 10690 + }, + { + "epoch": 0.83, + "grad_norm": 11.4762544631958, + "learning_rate": 6.910644245711933e-07, + "logits/chosen": -1.3264143466949463, + "logits/rejected": -1.117030382156372, + "logps/chosen": -1.1049288511276245, + "logps/rejected": -12.542996406555176, + "loss": 1.1266, + "odds_ratio_loss": 0.2170540988445282, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11049288511276245, + "rewards/margins": 1.1438066959381104, + "rewards/rejected": -1.2542996406555176, + "sft_loss": 1.1049288511276245, + "step": 10695 + }, + { + "epoch": 0.83, + "grad_norm": 16.640090942382812, + "learning_rate": 6.879441412134829e-07, + "logits/chosen": -1.3166162967681885, + "logits/rejected": -1.4665424823760986, + "logps/chosen": -1.2805297374725342, + "logps/rejected": -14.314722061157227, + "loss": 1.2831, + "odds_ratio_loss": 0.025562694296240807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1280529797077179, + "rewards/margins": 1.3034193515777588, + "rewards/rejected": -1.4314724206924438, + "sft_loss": 1.2805297374725342, + "step": 10700 + }, + { + "epoch": 0.83, + "grad_norm": 4.9655632972717285, + "learning_rate": 6.848303976359627e-07, + "logits/chosen": -1.2870609760284424, + "logits/rejected": -0.8258674740791321, + "logps/chosen": -1.148829460144043, + "logps/rejected": -12.691110610961914, + "loss": 1.1623, + "odds_ratio_loss": 0.1348555088043213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1148829460144043, + "rewards/margins": 1.1542279720306396, + "rewards/rejected": -1.269110918045044, + "sft_loss": 1.148829460144043, + "step": 10705 + }, + { + "epoch": 0.83, + "grad_norm": 4.1600847244262695, + "learning_rate": 6.8172319856102e-07, + "logits/chosen": -1.3312809467315674, + "logits/rejected": -1.062102198600769, + "logps/chosen": -0.932320237159729, + "logps/rejected": -5.111158847808838, + "loss": 0.9464, + "odds_ratio_loss": 0.14102238416671753, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09323202073574066, + "rewards/margins": 0.41788387298583984, + "rewards/rejected": -0.5111159086227417, + "sft_loss": 0.932320237159729, + "step": 10710 + }, + { + "epoch": 0.83, + "grad_norm": 22.662845611572266, + "learning_rate": 6.786225487011161e-07, + "logits/chosen": -1.292526125907898, + "logits/rejected": -0.7720105648040771, + "logps/chosen": -0.8446162343025208, + "logps/rejected": -7.427974700927734, + "loss": 0.8559, + "odds_ratio_loss": 0.11322052776813507, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08446161448955536, + "rewards/margins": 0.65833580493927, + "rewards/rejected": -0.7427974939346313, + "sft_loss": 0.8446162343025208, + "step": 10715 + }, + { + "epoch": 0.83, + "grad_norm": 27.110597610473633, + "learning_rate": 6.755284527587808e-07, + "logits/chosen": -1.2771958112716675, + "logits/rejected": -1.2682626247406006, + "logps/chosen": -0.8162559270858765, + "logps/rejected": -3.1453375816345215, + "loss": 0.8354, + "odds_ratio_loss": 0.19193263351917267, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08162559568881989, + "rewards/margins": 0.23290812969207764, + "rewards/rejected": -0.3145337700843811, + "sft_loss": 0.8162559270858765, + "step": 10720 + }, + { + "epoch": 0.83, + "grad_norm": 17.56296730041504, + "learning_rate": 6.724409154266015e-07, + "logits/chosen": -1.3354074954986572, + "logits/rejected": -1.6221174001693726, + "logps/chosen": -0.7041760683059692, + "logps/rejected": -5.976828098297119, + "loss": 0.7242, + "odds_ratio_loss": 0.19975678622722626, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07041759788990021, + "rewards/margins": 0.5272652506828308, + "rewards/rejected": -0.5976828336715698, + "sft_loss": 0.7041760683059692, + "step": 10725 + }, + { + "epoch": 0.83, + "grad_norm": 23.91651725769043, + "learning_rate": 6.693599413872237e-07, + "logits/chosen": -1.4807329177856445, + "logits/rejected": -1.0074899196624756, + "logps/chosen": -0.9828524589538574, + "logps/rejected": -3.0085151195526123, + "loss": 0.9976, + "odds_ratio_loss": 0.14699013531208038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0982852429151535, + "rewards/margins": 0.2025662660598755, + "rewards/rejected": -0.3008515238761902, + "sft_loss": 0.9828524589538574, + "step": 10730 + }, + { + "epoch": 0.84, + "grad_norm": 15.844127655029297, + "learning_rate": 6.662855353133347e-07, + "logits/chosen": -1.4604413509368896, + "logits/rejected": -1.3962243795394897, + "logps/chosen": -1.2593581676483154, + "logps/rejected": -2.065171003341675, + "loss": 1.3106, + "odds_ratio_loss": 0.5126217603683472, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12593582272529602, + "rewards/margins": 0.08058128505945206, + "rewards/rejected": -0.20651713013648987, + "sft_loss": 1.2593581676483154, + "step": 10735 + }, + { + "epoch": 0.84, + "grad_norm": 8.233352661132812, + "learning_rate": 6.632177018676605e-07, + "logits/chosen": -1.3083667755126953, + "logits/rejected": -1.0775474309921265, + "logps/chosen": -1.0371109247207642, + "logps/rejected": -3.8532378673553467, + "loss": 1.0483, + "odds_ratio_loss": 0.11168348789215088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1037110835313797, + "rewards/margins": 0.28161272406578064, + "rewards/rejected": -0.38532382249832153, + "sft_loss": 1.0371109247207642, + "step": 10740 + }, + { + "epoch": 0.84, + "grad_norm": 4.413276195526123, + "learning_rate": 6.601564457029597e-07, + "logits/chosen": -1.3865123987197876, + "logits/rejected": -1.3351116180419922, + "logps/chosen": -0.9906963109970093, + "logps/rejected": -7.515824794769287, + "loss": 1.036, + "odds_ratio_loss": 0.4525395333766937, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09906964004039764, + "rewards/margins": 0.6525128483772278, + "rewards/rejected": -0.7515825033187866, + "sft_loss": 0.9906963109970093, + "step": 10745 + }, + { + "epoch": 0.84, + "grad_norm": 5.07670259475708, + "learning_rate": 6.571017714620187e-07, + "logits/chosen": -1.4065673351287842, + "logits/rejected": -0.9796167612075806, + "logps/chosen": -0.8060011863708496, + "logps/rejected": -3.166170597076416, + "loss": 0.8262, + "odds_ratio_loss": 0.20244893431663513, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08060012757778168, + "rewards/margins": 0.23601694405078888, + "rewards/rejected": -0.31661707162857056, + "sft_loss": 0.8060011863708496, + "step": 10750 + }, + { + "epoch": 0.84, + "grad_norm": 219.17034912109375, + "learning_rate": 6.540536837776367e-07, + "logits/chosen": -1.2626028060913086, + "logits/rejected": -1.4447038173675537, + "logps/chosen": -0.995640754699707, + "logps/rejected": -6.835412502288818, + "loss": 1.008, + "odds_ratio_loss": 0.12361223995685577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0995640829205513, + "rewards/margins": 0.583977222442627, + "rewards/rejected": -0.6835412979125977, + "sft_loss": 0.995640754699707, + "step": 10755 + }, + { + "epoch": 0.84, + "grad_norm": 6.518074989318848, + "learning_rate": 6.510121872726249e-07, + "logits/chosen": -1.3091720342636108, + "logits/rejected": -0.8189018368721008, + "logps/chosen": -1.0482852458953857, + "logps/rejected": -8.858360290527344, + "loss": 1.0521, + "odds_ratio_loss": 0.03843696787953377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10482852160930634, + "rewards/margins": 0.7810075283050537, + "rewards/rejected": -0.8858360052108765, + "sft_loss": 1.0482852458953857, + "step": 10760 + }, + { + "epoch": 0.84, + "grad_norm": 26.186777114868164, + "learning_rate": 6.479772865598016e-07, + "logits/chosen": -1.4154765605926514, + "logits/rejected": -1.3204116821289062, + "logps/chosen": -1.1116999387741089, + "logps/rejected": -7.335646629333496, + "loss": 1.1134, + "odds_ratio_loss": 0.01742752455174923, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11117000877857208, + "rewards/margins": 0.6223946809768677, + "rewards/rejected": -0.7335646748542786, + "sft_loss": 1.1116999387741089, + "step": 10765 + }, + { + "epoch": 0.84, + "grad_norm": 8.995668411254883, + "learning_rate": 6.449489862419772e-07, + "logits/chosen": -1.2886964082717896, + "logits/rejected": -1.0613796710968018, + "logps/chosen": -1.0305818319320679, + "logps/rejected": -4.333034515380859, + "loss": 1.1145, + "odds_ratio_loss": 0.8395366668701172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10305819660425186, + "rewards/margins": 0.33024531602859497, + "rewards/rejected": -0.43330350518226624, + "sft_loss": 1.0305818319320679, + "step": 10770 + }, + { + "epoch": 0.84, + "grad_norm": 20.093276977539062, + "learning_rate": 6.419272909119539e-07, + "logits/chosen": -1.175619125366211, + "logits/rejected": -1.290102243423462, + "logps/chosen": -1.2042921781539917, + "logps/rejected": -7.513049125671387, + "loss": 1.2206, + "odds_ratio_loss": 0.1633186638355255, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12042923271656036, + "rewards/margins": 0.6308757066726685, + "rewards/rejected": -0.7513049244880676, + "sft_loss": 1.2042921781539917, + "step": 10775 + }, + { + "epoch": 0.84, + "grad_norm": 38.16044235229492, + "learning_rate": 6.38912205152517e-07, + "logits/chosen": -1.269337773323059, + "logits/rejected": -0.9223520159721375, + "logps/chosen": -0.87409907579422, + "logps/rejected": -5.193105697631836, + "loss": 0.8866, + "odds_ratio_loss": 0.1254083812236786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08740990608930588, + "rewards/margins": 0.43190065026283264, + "rewards/rejected": -0.5193105936050415, + "sft_loss": 0.87409907579422, + "step": 10780 + }, + { + "epoch": 0.84, + "grad_norm": 4.398747444152832, + "learning_rate": 6.35903733536426e-07, + "logits/chosen": -1.3488149642944336, + "logits/rejected": -0.831713080406189, + "logps/chosen": -0.7239896655082703, + "logps/rejected": -5.531470775604248, + "loss": 0.7265, + "odds_ratio_loss": 0.0255146324634552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07239897549152374, + "rewards/margins": 0.48074811697006226, + "rewards/rejected": -0.5531471371650696, + "sft_loss": 0.7239896655082703, + "step": 10785 + }, + { + "epoch": 0.84, + "grad_norm": 18.752788543701172, + "learning_rate": 6.329018806264092e-07, + "logits/chosen": -1.303001046180725, + "logits/rejected": -1.189507246017456, + "logps/chosen": -0.9213204383850098, + "logps/rejected": -6.262149810791016, + "loss": 0.932, + "odds_ratio_loss": 0.1070183515548706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09213204681873322, + "rewards/margins": 0.5340828895568848, + "rewards/rejected": -0.6262149810791016, + "sft_loss": 0.9213204383850098, + "step": 10790 + }, + { + "epoch": 0.84, + "grad_norm": 6.965780735015869, + "learning_rate": 6.299066509751595e-07, + "logits/chosen": -1.4093542098999023, + "logits/rejected": -1.1180508136749268, + "logps/chosen": -1.06251060962677, + "logps/rejected": -10.950661659240723, + "loss": 1.0841, + "odds_ratio_loss": 0.21567395329475403, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.106251060962677, + "rewards/margins": 0.9888151288032532, + "rewards/rejected": -1.0950661897659302, + "sft_loss": 1.06251060962677, + "step": 10795 + }, + { + "epoch": 0.84, + "grad_norm": 4.436639308929443, + "learning_rate": 6.26918049125323e-07, + "logits/chosen": -1.2421128749847412, + "logits/rejected": -1.2047935724258423, + "logps/chosen": -1.0232081413269043, + "logps/rejected": -10.297616004943848, + "loss": 1.0383, + "odds_ratio_loss": 0.15066194534301758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10232081264257431, + "rewards/margins": 0.9274408221244812, + "rewards/rejected": -1.029761552810669, + "sft_loss": 1.0232081413269043, + "step": 10800 + }, + { + "epoch": 0.84, + "grad_norm": 6.202176570892334, + "learning_rate": 6.239360796094923e-07, + "logits/chosen": -1.4031856060028076, + "logits/rejected": -1.1629369258880615, + "logps/chosen": -0.8468238711357117, + "logps/rejected": -2.1136059761047363, + "loss": 0.8931, + "odds_ratio_loss": 0.4629918932914734, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0846823900938034, + "rewards/margins": 0.1266781985759735, + "rewards/rejected": -0.2113606035709381, + "sft_loss": 0.8468238711357117, + "step": 10805 + }, + { + "epoch": 0.84, + "grad_norm": 10.24286937713623, + "learning_rate": 6.209607469502032e-07, + "logits/chosen": -1.394173264503479, + "logits/rejected": -1.002078652381897, + "logps/chosen": -0.8187114596366882, + "logps/rejected": -9.146513938903809, + "loss": 0.8499, + "odds_ratio_loss": 0.312236487865448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0818711444735527, + "rewards/margins": 0.8327803611755371, + "rewards/rejected": -0.9146515130996704, + "sft_loss": 0.8187114596366882, + "step": 10810 + }, + { + "epoch": 0.84, + "grad_norm": 104.1180191040039, + "learning_rate": 6.179920556599267e-07, + "logits/chosen": -1.2264460325241089, + "logits/rejected": -1.3998425006866455, + "logps/chosen": -1.2771530151367188, + "logps/rejected": -10.008929252624512, + "loss": 1.3034, + "odds_ratio_loss": 0.2629183828830719, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1277153044939041, + "rewards/margins": 0.8731776475906372, + "rewards/rejected": -1.0008928775787354, + "sft_loss": 1.2771530151367188, + "step": 10815 + }, + { + "epoch": 0.84, + "grad_norm": 6.116608619689941, + "learning_rate": 6.150300102410589e-07, + "logits/chosen": -1.199758768081665, + "logits/rejected": -1.174690842628479, + "logps/chosen": -1.4607865810394287, + "logps/rejected": -6.701357364654541, + "loss": 1.4728, + "odds_ratio_loss": 0.12056130170822144, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1460786759853363, + "rewards/margins": 0.5240570306777954, + "rewards/rejected": -0.6701357364654541, + "sft_loss": 1.4607865810394287, + "step": 10820 + }, + { + "epoch": 0.84, + "grad_norm": 13.097474098205566, + "learning_rate": 6.120746151859186e-07, + "logits/chosen": -1.4636832475662231, + "logits/rejected": -1.1174657344818115, + "logps/chosen": -0.9332340955734253, + "logps/rejected": -3.634037494659424, + "loss": 0.9573, + "odds_ratio_loss": 0.24098041653633118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09332340210676193, + "rewards/margins": 0.2700802981853485, + "rewards/rejected": -0.36340370774269104, + "sft_loss": 0.9332340955734253, + "step": 10825 + }, + { + "epoch": 0.84, + "grad_norm": 4.853806972503662, + "learning_rate": 6.091258749767365e-07, + "logits/chosen": -1.1578795909881592, + "logits/rejected": -1.2788320779800415, + "logps/chosen": -0.7345417737960815, + "logps/rejected": -15.559396743774414, + "loss": 0.7417, + "odds_ratio_loss": 0.07208183407783508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07345417886972427, + "rewards/margins": 1.4824855327606201, + "rewards/rejected": -1.5559396743774414, + "sft_loss": 0.7345417737960815, + "step": 10830 + }, + { + "epoch": 0.84, + "grad_norm": 8.429411888122559, + "learning_rate": 6.061837940856524e-07, + "logits/chosen": -1.3931564092636108, + "logits/rejected": -0.8877042531967163, + "logps/chosen": -1.1764130592346191, + "logps/rejected": -5.6686506271362305, + "loss": 1.2116, + "odds_ratio_loss": 0.35157907009124756, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11764131486415863, + "rewards/margins": 0.44922375679016113, + "rewards/rejected": -0.566865086555481, + "sft_loss": 1.1764130592346191, + "step": 10835 + }, + { + "epoch": 0.84, + "grad_norm": 12.441805839538574, + "learning_rate": 6.032483769747044e-07, + "logits/chosen": -1.273625135421753, + "logits/rejected": -1.008589506149292, + "logps/chosen": -0.9708470106124878, + "logps/rejected": -8.170413970947266, + "loss": 0.9988, + "odds_ratio_loss": 0.279986172914505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09708471596240997, + "rewards/margins": 0.7199567556381226, + "rewards/rejected": -0.8170413970947266, + "sft_loss": 0.9708470106124878, + "step": 10840 + }, + { + "epoch": 0.84, + "grad_norm": 5.836912155151367, + "learning_rate": 6.003196280958268e-07, + "logits/chosen": -1.370110273361206, + "logits/rejected": -0.9305510520935059, + "logps/chosen": -0.7876640558242798, + "logps/rejected": -2.4059929847717285, + "loss": 0.8388, + "odds_ratio_loss": 0.5111384391784668, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07876641303300858, + "rewards/margins": 0.16183289885520935, + "rewards/rejected": -0.24059931933879852, + "sft_loss": 0.7876640558242798, + "step": 10845 + }, + { + "epoch": 0.84, + "grad_norm": 5.9041948318481445, + "learning_rate": 5.973975518908381e-07, + "logits/chosen": -1.2838035821914673, + "logits/rejected": -0.8337491750717163, + "logps/chosen": -0.9224063754081726, + "logps/rejected": -6.886902809143066, + "loss": 0.937, + "odds_ratio_loss": 0.14572349190711975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09224063903093338, + "rewards/margins": 0.5964496731758118, + "rewards/rejected": -0.6886903047561646, + "sft_loss": 0.9224063754081726, + "step": 10850 + }, + { + "epoch": 0.84, + "grad_norm": 6.783423900604248, + "learning_rate": 5.94482152791438e-07, + "logits/chosen": -1.383037805557251, + "logits/rejected": -1.1398518085479736, + "logps/chosen": -1.2712384462356567, + "logps/rejected": -8.102948188781738, + "loss": 1.283, + "odds_ratio_loss": 0.11787731945514679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1271238625049591, + "rewards/margins": 0.6831710338592529, + "rewards/rejected": -0.8102949261665344, + "sft_loss": 1.2712384462356567, + "step": 10855 + }, + { + "epoch": 0.84, + "grad_norm": 20.92612648010254, + "learning_rate": 5.915734352191998e-07, + "logits/chosen": -1.3900169134140015, + "logits/rejected": -1.3074524402618408, + "logps/chosen": -0.7815436720848083, + "logps/rejected": -8.142717361450195, + "loss": 0.7898, + "odds_ratio_loss": 0.08238000422716141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07815437018871307, + "rewards/margins": 0.736117422580719, + "rewards/rejected": -0.8142718076705933, + "sft_loss": 0.7815436720848083, + "step": 10860 + }, + { + "epoch": 0.85, + "grad_norm": 4.04435396194458, + "learning_rate": 5.886714035855629e-07, + "logits/chosen": -1.300445318222046, + "logits/rejected": -0.8074096441268921, + "logps/chosen": -0.7334375381469727, + "logps/rejected": -8.39263916015625, + "loss": 0.7431, + "odds_ratio_loss": 0.09648707509040833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07334376126527786, + "rewards/margins": 0.7659201622009277, + "rewards/rejected": -0.839263916015625, + "sft_loss": 0.7334375381469727, + "step": 10865 + }, + { + "epoch": 0.85, + "grad_norm": 5.290650844573975, + "learning_rate": 5.857760622918263e-07, + "logits/chosen": -1.2743334770202637, + "logits/rejected": -0.46047696471214294, + "logps/chosen": -0.8884002566337585, + "logps/rejected": -4.124817848205566, + "loss": 0.9124, + "odds_ratio_loss": 0.23984280228614807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08884003013372421, + "rewards/margins": 0.32364171743392944, + "rewards/rejected": -0.41248178482055664, + "sft_loss": 0.8884002566337585, + "step": 10870 + }, + { + "epoch": 0.85, + "grad_norm": 8.239819526672363, + "learning_rate": 5.828874157291425e-07, + "logits/chosen": -1.1602166891098022, + "logits/rejected": -1.1731626987457275, + "logps/chosen": -0.8925191164016724, + "logps/rejected": -5.278420448303223, + "loss": 0.9066, + "odds_ratio_loss": 0.14100593328475952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08925192058086395, + "rewards/margins": 0.4385901093482971, + "rewards/rejected": -0.5278420448303223, + "sft_loss": 0.8925191164016724, + "step": 10875 + }, + { + "epoch": 0.85, + "grad_norm": 8.573440551757812, + "learning_rate": 5.800054682785117e-07, + "logits/chosen": -1.296282410621643, + "logits/rejected": -0.8988644480705261, + "logps/chosen": -0.9847795367240906, + "logps/rejected": -5.011839389801025, + "loss": 1.0078, + "odds_ratio_loss": 0.22980418801307678, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09847795963287354, + "rewards/margins": 0.40270599722862244, + "rewards/rejected": -0.5011839866638184, + "sft_loss": 0.9847795367240906, + "step": 10880 + }, + { + "epoch": 0.85, + "grad_norm": 23.42486572265625, + "learning_rate": 5.771302243107729e-07, + "logits/chosen": -1.478144884109497, + "logits/rejected": -1.2906591892242432, + "logps/chosen": -0.8750435709953308, + "logps/rejected": -2.1842732429504395, + "loss": 0.9128, + "odds_ratio_loss": 0.377769410610199, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08750435709953308, + "rewards/margins": 0.13092297315597534, + "rewards/rejected": -0.21842734515666962, + "sft_loss": 0.8750435709953308, + "step": 10885 + }, + { + "epoch": 0.85, + "grad_norm": 8.602510452270508, + "learning_rate": 5.742616881865981e-07, + "logits/chosen": -1.3302183151245117, + "logits/rejected": -0.9966436624526978, + "logps/chosen": -1.0099607706069946, + "logps/rejected": -7.133604526519775, + "loss": 1.0168, + "odds_ratio_loss": 0.06856737285852432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10099606215953827, + "rewards/margins": 0.6123644113540649, + "rewards/rejected": -0.7133604288101196, + "sft_loss": 1.0099607706069946, + "step": 10890 + }, + { + "epoch": 0.85, + "grad_norm": 11.181099891662598, + "learning_rate": 5.713998642564872e-07, + "logits/chosen": -1.4327882528305054, + "logits/rejected": -0.9655070304870605, + "logps/chosen": -0.8675923347473145, + "logps/rejected": -2.817263126373291, + "loss": 0.8961, + "odds_ratio_loss": 0.285393625497818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08675923198461533, + "rewards/margins": 0.19496707618236542, + "rewards/rejected": -0.28172630071640015, + "sft_loss": 0.8675923347473145, + "step": 10895 + }, + { + "epoch": 0.85, + "grad_norm": 7.003217697143555, + "learning_rate": 5.685447568607589e-07, + "logits/chosen": -1.1521581411361694, + "logits/rejected": -1.2851454019546509, + "logps/chosen": -0.9078611135482788, + "logps/rejected": -8.256436347961426, + "loss": 0.9093, + "odds_ratio_loss": 0.014228816144168377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09078611433506012, + "rewards/margins": 0.7348575592041016, + "rewards/rejected": -0.8256436586380005, + "sft_loss": 0.9078611135482788, + "step": 10900 + }, + { + "epoch": 0.85, + "grad_norm": 293.4844055175781, + "learning_rate": 5.656963703295454e-07, + "logits/chosen": -1.4213796854019165, + "logits/rejected": -0.9011304974555969, + "logps/chosen": -1.1318405866622925, + "logps/rejected": -9.844264030456543, + "loss": 1.1469, + "odds_ratio_loss": 0.1509471833705902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11318407207727432, + "rewards/margins": 0.871242344379425, + "rewards/rejected": -0.9844264984130859, + "sft_loss": 1.1318405866622925, + "step": 10905 + }, + { + "epoch": 0.85, + "grad_norm": 7.686473846435547, + "learning_rate": 5.628547089827885e-07, + "logits/chosen": -1.1069039106369019, + "logits/rejected": -1.015209674835205, + "logps/chosen": -1.0385732650756836, + "logps/rejected": -5.666954517364502, + "loss": 1.0521, + "odds_ratio_loss": 0.1351829469203949, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10385732352733612, + "rewards/margins": 0.4628380835056305, + "rewards/rejected": -0.5666954517364502, + "sft_loss": 1.0385732650756836, + "step": 10910 + }, + { + "epoch": 0.85, + "grad_norm": 20.32103729248047, + "learning_rate": 5.600197771302274e-07, + "logits/chosen": -1.2023155689239502, + "logits/rejected": -0.7322795987129211, + "logps/chosen": -0.8296264410018921, + "logps/rejected": -1.8694965839385986, + "loss": 0.857, + "odds_ratio_loss": 0.2740650773048401, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08296265453100204, + "rewards/margins": 0.10398700088262558, + "rewards/rejected": -0.18694964051246643, + "sft_loss": 0.8296264410018921, + "step": 10915 + }, + { + "epoch": 0.85, + "grad_norm": 13.354987144470215, + "learning_rate": 5.571915790713944e-07, + "logits/chosen": -1.400775671005249, + "logits/rejected": -1.0198614597320557, + "logps/chosen": -0.9796463251113892, + "logps/rejected": -3.354994535446167, + "loss": 1.0359, + "odds_ratio_loss": 0.5625422596931458, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09796462953090668, + "rewards/margins": 0.23753483593463898, + "rewards/rejected": -0.33549946546554565, + "sft_loss": 0.9796463251113892, + "step": 10920 + }, + { + "epoch": 0.85, + "grad_norm": 6.634857177734375, + "learning_rate": 5.543701190956146e-07, + "logits/chosen": -1.442639708518982, + "logits/rejected": -1.0874769687652588, + "logps/chosen": -1.7671304941177368, + "logps/rejected": -5.139246940612793, + "loss": 1.8229, + "odds_ratio_loss": 0.55790114402771, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17671306431293488, + "rewards/margins": 0.33721163868904114, + "rewards/rejected": -0.5139247179031372, + "sft_loss": 1.7671304941177368, + "step": 10925 + }, + { + "epoch": 0.85, + "grad_norm": 13.042349815368652, + "learning_rate": 5.515554014819879e-07, + "logits/chosen": -1.3886566162109375, + "logits/rejected": -0.9926946759223938, + "logps/chosen": -0.8931914567947388, + "logps/rejected": -2.3885300159454346, + "loss": 0.9148, + "odds_ratio_loss": 0.21583838760852814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08931914716959, + "rewards/margins": 0.14953383803367615, + "rewards/rejected": -0.23885297775268555, + "sft_loss": 0.8931914567947388, + "step": 10930 + }, + { + "epoch": 0.85, + "grad_norm": 11.30074405670166, + "learning_rate": 5.487474304993912e-07, + "logits/chosen": -1.3836137056350708, + "logits/rejected": -1.1608842611312866, + "logps/chosen": -1.0646214485168457, + "logps/rejected": -4.637225151062012, + "loss": 1.0853, + "odds_ratio_loss": 0.20709529519081116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10646214336156845, + "rewards/margins": 0.35726040601730347, + "rewards/rejected": -0.4637225270271301, + "sft_loss": 1.0646214485168457, + "step": 10935 + }, + { + "epoch": 0.85, + "grad_norm": 36.60148620605469, + "learning_rate": 5.459462104064695e-07, + "logits/chosen": -1.4806654453277588, + "logits/rejected": -1.1578395366668701, + "logps/chosen": -1.047041654586792, + "logps/rejected": -3.035254955291748, + "loss": 1.0983, + "odds_ratio_loss": 0.5123556852340698, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10470416396856308, + "rewards/margins": 0.19882136583328247, + "rewards/rejected": -0.30352550745010376, + "sft_loss": 1.047041654586792, + "step": 10940 + }, + { + "epoch": 0.85, + "grad_norm": 17.35590171813965, + "learning_rate": 5.431517454516282e-07, + "logits/chosen": -1.360613465309143, + "logits/rejected": -1.1922509670257568, + "logps/chosen": -0.706468403339386, + "logps/rejected": -5.037663459777832, + "loss": 0.7226, + "odds_ratio_loss": 0.161125048995018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07064683735370636, + "rewards/margins": 0.433119535446167, + "rewards/rejected": -0.5037663578987122, + "sft_loss": 0.706468403339386, + "step": 10945 + }, + { + "epoch": 0.85, + "grad_norm": 14.495502471923828, + "learning_rate": 5.403640398730286e-07, + "logits/chosen": -1.2188652753829956, + "logits/rejected": -1.199636697769165, + "logps/chosen": -1.0238749980926514, + "logps/rejected": -5.018639087677002, + "loss": 1.0654, + "odds_ratio_loss": 0.4154825806617737, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10238751024007797, + "rewards/margins": 0.3994763493537903, + "rewards/rejected": -0.5018638372421265, + "sft_loss": 1.0238749980926514, + "step": 10950 + }, + { + "epoch": 0.85, + "grad_norm": 13.935104370117188, + "learning_rate": 5.375830978985791e-07, + "logits/chosen": -1.3460266590118408, + "logits/rejected": -1.4006439447402954, + "logps/chosen": -1.05868399143219, + "logps/rejected": -16.683874130249023, + "loss": 1.0589, + "odds_ratio_loss": 0.0025215111672878265, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10586841404438019, + "rewards/margins": 1.5625190734863281, + "rewards/rejected": -1.6683876514434814, + "sft_loss": 1.05868399143219, + "step": 10955 + }, + { + "epoch": 0.85, + "grad_norm": 7.0826239585876465, + "learning_rate": 5.34808923745933e-07, + "logits/chosen": -1.3722484111785889, + "logits/rejected": -1.1384642124176025, + "logps/chosen": -1.3418304920196533, + "logps/rejected": -8.644782066345215, + "loss": 1.3495, + "odds_ratio_loss": 0.07701762765645981, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13418304920196533, + "rewards/margins": 0.7302952408790588, + "rewards/rejected": -0.8644782304763794, + "sft_loss": 1.3418304920196533, + "step": 10960 + }, + { + "epoch": 0.85, + "grad_norm": 5.142551898956299, + "learning_rate": 5.320415216224767e-07, + "logits/chosen": -1.3383108377456665, + "logits/rejected": -0.87104332447052, + "logps/chosen": -0.9291396141052246, + "logps/rejected": -6.74056339263916, + "loss": 0.9505, + "odds_ratio_loss": 0.21329009532928467, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09291397035121918, + "rewards/margins": 0.5811423063278198, + "rewards/rejected": -0.674056351184845, + "sft_loss": 0.9291396141052246, + "step": 10965 + }, + { + "epoch": 0.85, + "grad_norm": 6.258543491363525, + "learning_rate": 5.292808957253265e-07, + "logits/chosen": -1.3454580307006836, + "logits/rejected": -1.1424548625946045, + "logps/chosen": -0.8718706965446472, + "logps/rejected": -6.058313846588135, + "loss": 0.878, + "odds_ratio_loss": 0.06122884154319763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08718707412481308, + "rewards/margins": 0.5186443328857422, + "rewards/rejected": -0.6058313846588135, + "sft_loss": 0.8718706965446472, + "step": 10970 + }, + { + "epoch": 0.85, + "grad_norm": 29.74590301513672, + "learning_rate": 5.265270502413228e-07, + "logits/chosen": -1.266903281211853, + "logits/rejected": -1.1316707134246826, + "logps/chosen": -0.8083049058914185, + "logps/rejected": -6.998124122619629, + "loss": 0.8188, + "odds_ratio_loss": 0.10462143272161484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08083049952983856, + "rewards/margins": 0.6189819574356079, + "rewards/rejected": -0.6998124122619629, + "sft_loss": 0.8083049058914185, + "step": 10975 + }, + { + "epoch": 0.85, + "grad_norm": 10.037148475646973, + "learning_rate": 5.237799893470219e-07, + "logits/chosen": -1.4532562494277954, + "logits/rejected": -0.8590501546859741, + "logps/chosen": -1.0203161239624023, + "logps/rejected": -7.895735263824463, + "loss": 1.0291, + "odds_ratio_loss": 0.08833594620227814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10203162580728531, + "rewards/margins": 0.6875419020652771, + "rewards/rejected": -0.7895735502243042, + "sft_loss": 1.0203161239624023, + "step": 10980 + }, + { + "epoch": 0.85, + "grad_norm": 73.04912567138672, + "learning_rate": 5.210397172086906e-07, + "logits/chosen": -1.3931853771209717, + "logits/rejected": -1.0637333393096924, + "logps/chosen": -1.1848360300064087, + "logps/rejected": -6.328829765319824, + "loss": 1.1876, + "odds_ratio_loss": 0.027847150340676308, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11848358809947968, + "rewards/margins": 0.5143994092941284, + "rewards/rejected": -0.6328829526901245, + "sft_loss": 1.1848360300064087, + "step": 10985 + }, + { + "epoch": 0.85, + "grad_norm": 16.441225051879883, + "learning_rate": 5.183062379822978e-07, + "logits/chosen": -1.3455500602722168, + "logits/rejected": -1.5203089714050293, + "logps/chosen": -0.9536484479904175, + "logps/rejected": -9.948080062866211, + "loss": 0.954, + "odds_ratio_loss": 0.003847536165267229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09536485373973846, + "rewards/margins": 0.8994432687759399, + "rewards/rejected": -0.9948080778121948, + "sft_loss": 0.9536484479904175, + "step": 10990 + }, + { + "epoch": 0.86, + "grad_norm": 6.653332233428955, + "learning_rate": 5.155795558135141e-07, + "logits/chosen": -1.4233778715133667, + "logits/rejected": -0.9469043612480164, + "logps/chosen": -1.0065068006515503, + "logps/rejected": -5.959763050079346, + "loss": 1.009, + "odds_ratio_loss": 0.024972127750515938, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10065068304538727, + "rewards/margins": 0.49532565474510193, + "rewards/rejected": -0.5959763526916504, + "sft_loss": 1.0065068006515503, + "step": 10995 + }, + { + "epoch": 0.86, + "grad_norm": 173.84759521484375, + "learning_rate": 5.128596748376979e-07, + "logits/chosen": -1.4137773513793945, + "logits/rejected": -0.8344324231147766, + "logps/chosen": -0.9149678945541382, + "logps/rejected": -7.3621320724487305, + "loss": 0.9347, + "odds_ratio_loss": 0.1972806602716446, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09149680286645889, + "rewards/margins": 0.6447164416313171, + "rewards/rejected": -0.7362133264541626, + "sft_loss": 0.9149678945541382, + "step": 11000 + }, + { + "epoch": 0.86, + "grad_norm": 5.335231304168701, + "learning_rate": 5.101465991798948e-07, + "logits/chosen": -1.3805840015411377, + "logits/rejected": -1.3037879467010498, + "logps/chosen": -1.1943247318267822, + "logps/rejected": -7.626928806304932, + "loss": 1.2359, + "odds_ratio_loss": 0.41593390703201294, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1194324716925621, + "rewards/margins": 0.6432604193687439, + "rewards/rejected": -0.7626928091049194, + "sft_loss": 1.1943247318267822, + "step": 11005 + }, + { + "epoch": 0.86, + "grad_norm": 9.778425216674805, + "learning_rate": 5.074403329548277e-07, + "logits/chosen": -1.3550758361816406, + "logits/rejected": -0.9329360723495483, + "logps/chosen": -1.3633568286895752, + "logps/rejected": -3.0032997131347656, + "loss": 1.3902, + "odds_ratio_loss": 0.2689247131347656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13633567094802856, + "rewards/margins": 0.16399429738521576, + "rewards/rejected": -0.3003299832344055, + "sft_loss": 1.3633568286895752, + "step": 11010 + }, + { + "epoch": 0.86, + "grad_norm": 5.133905410766602, + "learning_rate": 5.047408802668935e-07, + "logits/chosen": -1.2931458950042725, + "logits/rejected": -1.0412628650665283, + "logps/chosen": -1.0921659469604492, + "logps/rejected": -8.542851448059082, + "loss": 1.116, + "odds_ratio_loss": 0.2381249964237213, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10921660810709, + "rewards/margins": 0.7450685501098633, + "rewards/rejected": -0.8542851209640503, + "sft_loss": 1.0921659469604492, + "step": 11015 + }, + { + "epoch": 0.86, + "grad_norm": 19.411243438720703, + "learning_rate": 5.020482452101539e-07, + "logits/chosen": -1.3749643564224243, + "logits/rejected": -0.825789749622345, + "logps/chosen": -0.9931381940841675, + "logps/rejected": -13.348337173461914, + "loss": 1.0066, + "odds_ratio_loss": 0.13428032398223877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09931383281946182, + "rewards/margins": 1.2355200052261353, + "rewards/rejected": -1.3348338603973389, + "sft_loss": 0.9931381940841675, + "step": 11020 + }, + { + "epoch": 0.86, + "grad_norm": 10.71738052368164, + "learning_rate": 4.993624318683332e-07, + "logits/chosen": -1.4427053928375244, + "logits/rejected": -0.8330078125, + "logps/chosen": -1.0185954570770264, + "logps/rejected": -4.659827709197998, + "loss": 1.0898, + "odds_ratio_loss": 0.7123147249221802, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10185954719781876, + "rewards/margins": 0.3641231954097748, + "rewards/rejected": -0.4659827649593353, + "sft_loss": 1.0185954570770264, + "step": 11025 + }, + { + "epoch": 0.86, + "grad_norm": 30.706079483032227, + "learning_rate": 4.966834443148078e-07, + "logits/chosen": -1.3195933103561401, + "logits/rejected": -0.9074063301086426, + "logps/chosen": -1.1589607000350952, + "logps/rejected": -3.9544196128845215, + "loss": 1.1997, + "odds_ratio_loss": 0.4074534475803375, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1158960685133934, + "rewards/margins": 0.2795459032058716, + "rewards/rejected": -0.3954419791698456, + "sft_loss": 1.1589607000350952, + "step": 11030 + }, + { + "epoch": 0.86, + "grad_norm": 58.909297943115234, + "learning_rate": 4.940112866126018e-07, + "logits/chosen": -1.4745378494262695, + "logits/rejected": -1.5084199905395508, + "logps/chosen": -1.2225674390792847, + "logps/rejected": -6.94110631942749, + "loss": 1.2467, + "odds_ratio_loss": 0.24120387434959412, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12225675582885742, + "rewards/margins": 0.5718539357185364, + "rewards/rejected": -0.6941106915473938, + "sft_loss": 1.2225674390792847, + "step": 11035 + }, + { + "epoch": 0.86, + "grad_norm": 4.487308025360107, + "learning_rate": 4.913459628143829e-07, + "logits/chosen": -1.391526222229004, + "logits/rejected": -0.8932541608810425, + "logps/chosen": -1.1362192630767822, + "logps/rejected": -8.21275806427002, + "loss": 1.1451, + "odds_ratio_loss": 0.0883936733007431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11362192779779434, + "rewards/margins": 0.7076539397239685, + "rewards/rejected": -0.8212758302688599, + "sft_loss": 1.1362192630767822, + "step": 11040 + }, + { + "epoch": 0.86, + "grad_norm": 23.82204246520996, + "learning_rate": 4.886874769624528e-07, + "logits/chosen": -1.4121615886688232, + "logits/rejected": -0.8175728917121887, + "logps/chosen": -0.7233660221099854, + "logps/rejected": -3.2058944702148438, + "loss": 0.7563, + "odds_ratio_loss": 0.32946401834487915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0723365992307663, + "rewards/margins": 0.24825282394886017, + "rewards/rejected": -0.32058948278427124, + "sft_loss": 0.7233660221099854, + "step": 11045 + }, + { + "epoch": 0.86, + "grad_norm": 7.111661911010742, + "learning_rate": 4.860358330887421e-07, + "logits/chosen": -1.4883744716644287, + "logits/rejected": -1.2798588275909424, + "logps/chosen": -1.2233302593231201, + "logps/rejected": -7.7917656898498535, + "loss": 1.2635, + "odds_ratio_loss": 0.4016781449317932, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12233302742242813, + "rewards/margins": 0.6568435430526733, + "rewards/rejected": -0.7791765928268433, + "sft_loss": 1.2233302593231201, + "step": 11050 + }, + { + "epoch": 0.86, + "grad_norm": 18.886579513549805, + "learning_rate": 4.833910352148057e-07, + "logits/chosen": -1.3979682922363281, + "logits/rejected": -1.0197150707244873, + "logps/chosen": -1.0084316730499268, + "logps/rejected": -6.580392360687256, + "loss": 1.0177, + "odds_ratio_loss": 0.09293156862258911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10084317624568939, + "rewards/margins": 0.5571960806846619, + "rewards/rejected": -0.6580392718315125, + "sft_loss": 1.0084316730499268, + "step": 11055 + }, + { + "epoch": 0.86, + "grad_norm": 401.8313293457031, + "learning_rate": 4.807530873518157e-07, + "logits/chosen": -1.48651921749115, + "logits/rejected": -1.1749073266983032, + "logps/chosen": -1.508371353149414, + "logps/rejected": -16.79873275756836, + "loss": 1.5167, + "odds_ratio_loss": 0.0837855190038681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15083713829517365, + "rewards/margins": 1.5290361642837524, + "rewards/rejected": -1.6798732280731201, + "sft_loss": 1.508371353149414, + "step": 11060 + }, + { + "epoch": 0.86, + "grad_norm": 10.496895790100098, + "learning_rate": 4.781219935005548e-07, + "logits/chosen": -1.273923635482788, + "logits/rejected": -1.082719087600708, + "logps/chosen": -0.9612051844596863, + "logps/rejected": -9.734685897827148, + "loss": 0.963, + "odds_ratio_loss": 0.017861105501651764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09612051397562027, + "rewards/margins": 0.8773480653762817, + "rewards/rejected": -0.9734686017036438, + "sft_loss": 0.9612051844596863, + "step": 11065 + }, + { + "epoch": 0.86, + "grad_norm": 79.49195098876953, + "learning_rate": 4.754977576514097e-07, + "logits/chosen": -1.2368603944778442, + "logits/rejected": -0.8288145065307617, + "logps/chosen": -1.0270787477493286, + "logps/rejected": -4.39780855178833, + "loss": 1.0337, + "odds_ratio_loss": 0.06579282134771347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1027078777551651, + "rewards/margins": 0.3370729982852936, + "rewards/rejected": -0.4397808611392975, + "sft_loss": 1.0270787477493286, + "step": 11070 + }, + { + "epoch": 0.86, + "grad_norm": 11.613289833068848, + "learning_rate": 4.7288038378436876e-07, + "logits/chosen": -1.3377621173858643, + "logits/rejected": -0.9755362272262573, + "logps/chosen": -1.0771583318710327, + "logps/rejected": -3.616690158843994, + "loss": 1.1031, + "odds_ratio_loss": 0.2592793107032776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10771583020687103, + "rewards/margins": 0.25395315885543823, + "rewards/rejected": -0.36166900396347046, + "sft_loss": 1.0771583318710327, + "step": 11075 + }, + { + "epoch": 0.86, + "grad_norm": 28.060232162475586, + "learning_rate": 4.702698758690116e-07, + "logits/chosen": -1.3476572036743164, + "logits/rejected": -1.550072431564331, + "logps/chosen": -0.9806027412414551, + "logps/rejected": -4.352713108062744, + "loss": 0.9851, + "odds_ratio_loss": 0.04496455565094948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09806027263402939, + "rewards/margins": 0.3372109830379486, + "rewards/rejected": -0.4352712631225586, + "sft_loss": 0.9806027412414551, + "step": 11080 + }, + { + "epoch": 0.86, + "grad_norm": 242.48379516601562, + "learning_rate": 4.676662378645042e-07, + "logits/chosen": -1.3592426776885986, + "logits/rejected": -1.5002645254135132, + "logps/chosen": -1.485435962677002, + "logps/rejected": -9.078450202941895, + "loss": 1.5257, + "odds_ratio_loss": 0.4027434289455414, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1485435962677002, + "rewards/margins": 0.759301483631134, + "rewards/rejected": -0.9078450202941895, + "sft_loss": 1.485435962677002, + "step": 11085 + }, + { + "epoch": 0.86, + "grad_norm": 23.109437942504883, + "learning_rate": 4.650694737195949e-07, + "logits/chosen": -1.4202067852020264, + "logits/rejected": -1.1966689825057983, + "logps/chosen": -0.6969403028488159, + "logps/rejected": -7.00976037979126, + "loss": 0.7074, + "odds_ratio_loss": 0.10413110256195068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06969402730464935, + "rewards/margins": 0.6312819719314575, + "rewards/rejected": -0.7009760141372681, + "sft_loss": 0.6969403028488159, + "step": 11090 + }, + { + "epoch": 0.86, + "grad_norm": 5.188475131988525, + "learning_rate": 4.6247958737260623e-07, + "logits/chosen": -1.4141209125518799, + "logits/rejected": -1.0835773944854736, + "logps/chosen": -0.8121021389961243, + "logps/rejected": -14.08741283416748, + "loss": 0.8122, + "odds_ratio_loss": 0.0008768331026658416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08121021836996078, + "rewards/margins": 1.327531099319458, + "rewards/rejected": -1.4087413549423218, + "sft_loss": 0.8121021389961243, + "step": 11095 + }, + { + "epoch": 0.86, + "grad_norm": 6.953471660614014, + "learning_rate": 4.598965827514279e-07, + "logits/chosen": -1.4587968587875366, + "logits/rejected": -1.3610684871673584, + "logps/chosen": -0.9946497082710266, + "logps/rejected": -9.220718383789062, + "loss": 1.01, + "odds_ratio_loss": 0.15365691483020782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09946496784687042, + "rewards/margins": 0.8226070404052734, + "rewards/rejected": -0.9220719337463379, + "sft_loss": 0.9946497082710266, + "step": 11100 + }, + { + "epoch": 0.86, + "grad_norm": 2.0856804847717285, + "learning_rate": 4.573204637735174e-07, + "logits/chosen": -1.212342619895935, + "logits/rejected": -0.8433161973953247, + "logps/chosen": -0.6405612826347351, + "logps/rejected": -5.117362022399902, + "loss": 0.6488, + "odds_ratio_loss": 0.08268775045871735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06405612826347351, + "rewards/margins": 0.44768014550209045, + "rewards/rejected": -0.511736273765564, + "sft_loss": 0.6405612826347351, + "step": 11105 + }, + { + "epoch": 0.86, + "grad_norm": 6.809403896331787, + "learning_rate": 4.547512343458843e-07, + "logits/chosen": -1.3145198822021484, + "logits/rejected": -0.7401739954948425, + "logps/chosen": -0.9998018145561218, + "logps/rejected": -3.8390610218048096, + "loss": 1.0124, + "odds_ratio_loss": 0.1258481740951538, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0999801829457283, + "rewards/margins": 0.2839259207248688, + "rewards/rejected": -0.3839060664176941, + "sft_loss": 0.9998018145561218, + "step": 11110 + }, + { + "epoch": 0.86, + "grad_norm": 6.046141147613525, + "learning_rate": 4.5218889836509185e-07, + "logits/chosen": -1.3540902137756348, + "logits/rejected": -1.4832431077957153, + "logps/chosen": -0.7950665950775146, + "logps/rejected": -10.756556510925293, + "loss": 0.7951, + "odds_ratio_loss": 0.0003343525168020278, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07950666546821594, + "rewards/margins": 0.9961490631103516, + "rewards/rejected": -1.0756556987762451, + "sft_loss": 0.7950665950775146, + "step": 11115 + }, + { + "epoch": 0.87, + "grad_norm": 5.530214309692383, + "learning_rate": 4.4963345971724747e-07, + "logits/chosen": -1.4007446765899658, + "logits/rejected": -0.9551407098770142, + "logps/chosen": -0.9888604879379272, + "logps/rejected": -4.050307273864746, + "loss": 1.0175, + "odds_ratio_loss": 0.28623491525650024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09888605773448944, + "rewards/margins": 0.3061446249485016, + "rewards/rejected": -0.40503066778182983, + "sft_loss": 0.9888604879379272, + "step": 11120 + }, + { + "epoch": 0.87, + "grad_norm": 7.572410583496094, + "learning_rate": 4.4708492227799824e-07, + "logits/chosen": -1.2860050201416016, + "logits/rejected": -1.1336227655410767, + "logps/chosen": -0.8657090067863464, + "logps/rejected": -9.67644214630127, + "loss": 0.8677, + "odds_ratio_loss": 0.02012895792722702, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08657090365886688, + "rewards/margins": 0.8810732960700989, + "rewards/rejected": -0.967644214630127, + "sft_loss": 0.8657090067863464, + "step": 11125 + }, + { + "epoch": 0.87, + "grad_norm": 9.17696475982666, + "learning_rate": 4.4454328991252517e-07, + "logits/chosen": -1.2129541635513306, + "logits/rejected": -1.0243322849273682, + "logps/chosen": -0.9325492978096008, + "logps/rejected": -6.651510715484619, + "loss": 0.9378, + "odds_ratio_loss": 0.05253799632191658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0932549387216568, + "rewards/margins": 0.5718960762023926, + "rewards/rejected": -0.6651510000228882, + "sft_loss": 0.9325492978096008, + "step": 11130 + }, + { + "epoch": 0.87, + "grad_norm": 6.805229663848877, + "learning_rate": 4.4200856647553527e-07, + "logits/chosen": -1.3854291439056396, + "logits/rejected": -1.1689527034759521, + "logps/chosen": -1.0263032913208008, + "logps/rejected": -4.803722858428955, + "loss": 1.0321, + "odds_ratio_loss": 0.057866036891937256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10263033211231232, + "rewards/margins": 0.3777419924736023, + "rewards/rejected": -0.48037227988243103, + "sft_loss": 1.0263032913208008, + "step": 11135 + }, + { + "epoch": 0.87, + "grad_norm": 6.040757179260254, + "learning_rate": 4.394807558112607e-07, + "logits/chosen": -1.3923509120941162, + "logits/rejected": -1.3340778350830078, + "logps/chosen": -0.826248824596405, + "logps/rejected": -9.858784675598145, + "loss": 0.873, + "odds_ratio_loss": 0.46788063645362854, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0826248899102211, + "rewards/margins": 0.9032536745071411, + "rewards/rejected": -0.985878586769104, + "sft_loss": 0.826248824596405, + "step": 11140 + }, + { + "epoch": 0.87, + "grad_norm": 6.630565643310547, + "learning_rate": 4.3695986175344596e-07, + "logits/chosen": -1.365642786026001, + "logits/rejected": -1.1266989707946777, + "logps/chosen": -0.995932400226593, + "logps/rejected": -5.503636837005615, + "loss": 1.0326, + "odds_ratio_loss": 0.3664228916168213, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09959324449300766, + "rewards/margins": 0.4507705271244049, + "rewards/rejected": -0.5503637194633484, + "sft_loss": 0.995932400226593, + "step": 11145 + }, + { + "epoch": 0.87, + "grad_norm": 14.042057037353516, + "learning_rate": 4.344458881253455e-07, + "logits/chosen": -1.4202449321746826, + "logits/rejected": -0.9090083837509155, + "logps/chosen": -1.0853300094604492, + "logps/rejected": -6.483142852783203, + "loss": 1.1, + "odds_ratio_loss": 0.1471116840839386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10853300988674164, + "rewards/margins": 0.5397812724113464, + "rewards/rejected": -0.6483142375946045, + "sft_loss": 1.0853300094604492, + "step": 11150 + }, + { + "epoch": 0.87, + "grad_norm": 83.15284729003906, + "learning_rate": 4.319388387397228e-07, + "logits/chosen": -1.2567228078842163, + "logits/rejected": -0.9255503416061401, + "logps/chosen": -0.9807448387145996, + "logps/rejected": -11.109251022338867, + "loss": 0.9967, + "odds_ratio_loss": 0.15947206318378448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09807448089122772, + "rewards/margins": 1.0128507614135742, + "rewards/rejected": -1.110925316810608, + "sft_loss": 0.9807448387145996, + "step": 11155 + }, + { + "epoch": 0.87, + "grad_norm": 13.364371299743652, + "learning_rate": 4.29438717398834e-07, + "logits/chosen": -1.3048971891403198, + "logits/rejected": -1.3854659795761108, + "logps/chosen": -0.9401968717575073, + "logps/rejected": -8.30219554901123, + "loss": 0.9664, + "odds_ratio_loss": 0.26237112283706665, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09401968866586685, + "rewards/margins": 0.7361998558044434, + "rewards/rejected": -0.830219566822052, + "sft_loss": 0.9401968717575073, + "step": 11160 + }, + { + "epoch": 0.87, + "grad_norm": 17.632877349853516, + "learning_rate": 4.2694552789443177e-07, + "logits/chosen": -1.3702666759490967, + "logits/rejected": -0.9406827688217163, + "logps/chosen": -0.9656890630722046, + "logps/rejected": -2.2116432189941406, + "loss": 1.1027, + "odds_ratio_loss": 1.3702127933502197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09656891971826553, + "rewards/margins": 0.12459540367126465, + "rewards/rejected": -0.22116431593894958, + "sft_loss": 0.9656890630722046, + "step": 11165 + }, + { + "epoch": 0.87, + "grad_norm": 7.403985977172852, + "learning_rate": 4.244592740077541e-07, + "logits/chosen": -1.410797119140625, + "logits/rejected": -1.1965923309326172, + "logps/chosen": -0.9599423408508301, + "logps/rejected": -9.67354679107666, + "loss": 0.9907, + "odds_ratio_loss": 0.307929128408432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09599423408508301, + "rewards/margins": 0.8713604211807251, + "rewards/rejected": -0.9673545956611633, + "sft_loss": 0.9599423408508301, + "step": 11170 + }, + { + "epoch": 0.87, + "grad_norm": 14.449532508850098, + "learning_rate": 4.2197995950952084e-07, + "logits/chosen": -1.4222707748413086, + "logits/rejected": -1.011182427406311, + "logps/chosen": -0.7854120135307312, + "logps/rejected": -1.555874228477478, + "loss": 0.8625, + "odds_ratio_loss": 0.7705218195915222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07854120433330536, + "rewards/margins": 0.0770462304353714, + "rewards/rejected": -0.15558741986751556, + "sft_loss": 0.7854120135307312, + "step": 11175 + }, + { + "epoch": 0.87, + "grad_norm": 23.771411895751953, + "learning_rate": 4.1950758815992645e-07, + "logits/chosen": -1.3169772624969482, + "logits/rejected": -0.9580669403076172, + "logps/chosen": -0.9869791269302368, + "logps/rejected": -5.212032318115234, + "loss": 1.0013, + "odds_ratio_loss": 0.1432759016752243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09869791567325592, + "rewards/margins": 0.42250528931617737, + "rewards/rejected": -0.5212032198905945, + "sft_loss": 0.9869791269302368, + "step": 11180 + }, + { + "epoch": 0.87, + "grad_norm": 132.80825805664062, + "learning_rate": 4.170421637086364e-07, + "logits/chosen": -1.4070345163345337, + "logits/rejected": -1.1488429307937622, + "logps/chosen": -1.3363525867462158, + "logps/rejected": -8.6935453414917, + "loss": 1.3374, + "odds_ratio_loss": 0.010287756100296974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1336352527141571, + "rewards/margins": 0.7357192635536194, + "rewards/rejected": -0.8693544268608093, + "sft_loss": 1.3363525867462158, + "step": 11185 + }, + { + "epoch": 0.87, + "grad_norm": 5.368658065795898, + "learning_rate": 4.145836898947808e-07, + "logits/chosen": -1.4635095596313477, + "logits/rejected": -1.2153332233428955, + "logps/chosen": -0.8983847498893738, + "logps/rejected": -9.342020034790039, + "loss": 0.9048, + "odds_ratio_loss": 0.06378494203090668, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08983847498893738, + "rewards/margins": 0.8443635106086731, + "rewards/rejected": -0.9342020153999329, + "sft_loss": 0.8983847498893738, + "step": 11190 + }, + { + "epoch": 0.87, + "grad_norm": 413.733642578125, + "learning_rate": 4.121321704469461e-07, + "logits/chosen": -1.367821455001831, + "logits/rejected": -0.9976612329483032, + "logps/chosen": -1.4773330688476562, + "logps/rejected": -15.2203950881958, + "loss": 1.4774, + "odds_ratio_loss": 0.00020272521942388266, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14773330092430115, + "rewards/margins": 1.3743062019348145, + "rewards/rejected": -1.522039532661438, + "sft_loss": 1.4773330688476562, + "step": 11195 + }, + { + "epoch": 0.87, + "grad_norm": 12.227411270141602, + "learning_rate": 4.0968760908317304e-07, + "logits/chosen": -1.2860291004180908, + "logits/rejected": -1.047076940536499, + "logps/chosen": -1.0616978406906128, + "logps/rejected": -4.308385372161865, + "loss": 1.0905, + "odds_ratio_loss": 0.2877461612224579, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10616978257894516, + "rewards/margins": 0.3246687948703766, + "rewards/rejected": -0.43083858489990234, + "sft_loss": 1.0616978406906128, + "step": 11200 + }, + { + "epoch": 0.87, + "grad_norm": 100.4394302368164, + "learning_rate": 4.0725000951094994e-07, + "logits/chosen": -1.2629293203353882, + "logits/rejected": -1.1446837186813354, + "logps/chosen": -0.8070009350776672, + "logps/rejected": -5.544220447540283, + "loss": 0.8434, + "odds_ratio_loss": 0.3644154667854309, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0807000994682312, + "rewards/margins": 0.4737219214439392, + "rewards/rejected": -0.5544220209121704, + "sft_loss": 0.8070009350776672, + "step": 11205 + }, + { + "epoch": 0.87, + "grad_norm": 23.128013610839844, + "learning_rate": 4.0481937542720615e-07, + "logits/chosen": -1.3773269653320312, + "logits/rejected": -0.9969242215156555, + "logps/chosen": -0.6857340931892395, + "logps/rejected": -3.768573760986328, + "loss": 0.6966, + "odds_ratio_loss": 0.10868772119283676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06857341527938843, + "rewards/margins": 0.3082839548587799, + "rewards/rejected": -0.37685737013816833, + "sft_loss": 0.6857340931892395, + "step": 11210 + }, + { + "epoch": 0.87, + "grad_norm": 23.43471908569336, + "learning_rate": 4.023957105183052e-07, + "logits/chosen": -1.2603623867034912, + "logits/rejected": -1.3566380739212036, + "logps/chosen": -1.3014967441558838, + "logps/rejected": -4.620263576507568, + "loss": 1.3311, + "odds_ratio_loss": 0.29601508378982544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13014967739582062, + "rewards/margins": 0.3318766951560974, + "rewards/rejected": -0.4620264172554016, + "sft_loss": 1.3014967441558838, + "step": 11215 + }, + { + "epoch": 0.87, + "grad_norm": 323.7796630859375, + "learning_rate": 3.999790184600449e-07, + "logits/chosen": -1.2544281482696533, + "logits/rejected": -1.3168442249298096, + "logps/chosen": -1.0642973184585571, + "logps/rejected": -5.73541259765625, + "loss": 1.075, + "odds_ratio_loss": 0.10698781907558441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10642973333597183, + "rewards/margins": 0.4671115279197693, + "rewards/rejected": -0.5735412836074829, + "sft_loss": 1.0642973184585571, + "step": 11220 + }, + { + "epoch": 0.87, + "grad_norm": 10.882088661193848, + "learning_rate": 3.975693029176447e-07, + "logits/chosen": -1.3870043754577637, + "logits/rejected": -1.2248777151107788, + "logps/chosen": -1.020098328590393, + "logps/rejected": -6.03187370300293, + "loss": 1.059, + "odds_ratio_loss": 0.3891030550003052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1020098328590393, + "rewards/margins": 0.5011776089668274, + "rewards/rejected": -0.6031874418258667, + "sft_loss": 1.020098328590393, + "step": 11225 + }, + { + "epoch": 0.87, + "grad_norm": 142.18104553222656, + "learning_rate": 3.951665675457433e-07, + "logits/chosen": -1.3323876857757568, + "logits/rejected": -1.2044421434402466, + "logps/chosen": -1.1101775169372559, + "logps/rejected": -9.553766250610352, + "loss": 1.1284, + "odds_ratio_loss": 0.1818692684173584, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11101774871349335, + "rewards/margins": 0.8443588018417358, + "rewards/rejected": -0.9553766250610352, + "sft_loss": 1.1101775169372559, + "step": 11230 + }, + { + "epoch": 0.87, + "grad_norm": 13.318952560424805, + "learning_rate": 3.9277081598839526e-07, + "logits/chosen": -1.2420969009399414, + "logits/rejected": -1.1654160022735596, + "logps/chosen": -1.0236912965774536, + "logps/rejected": -6.747973442077637, + "loss": 1.0563, + "odds_ratio_loss": 0.32635438442230225, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10236912965774536, + "rewards/margins": 0.5724282264709473, + "rewards/rejected": -0.6747973561286926, + "sft_loss": 1.0236912965774536, + "step": 11235 + }, + { + "epoch": 0.87, + "grad_norm": 9.043617248535156, + "learning_rate": 3.903820518790613e-07, + "logits/chosen": -1.3946565389633179, + "logits/rejected": -1.2541202306747437, + "logps/chosen": -1.0738626718521118, + "logps/rejected": -10.492671012878418, + "loss": 1.0827, + "odds_ratio_loss": 0.0884605199098587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1073862686753273, + "rewards/margins": 0.9418808221817017, + "rewards/rejected": -1.0492671728134155, + "sft_loss": 1.0738626718521118, + "step": 11240 + }, + { + "epoch": 0.87, + "grad_norm": 17.10003089904785, + "learning_rate": 3.8800027884060564e-07, + "logits/chosen": -1.3552266359329224, + "logits/rejected": -1.5189950466156006, + "logps/chosen": -1.181077480316162, + "logps/rejected": -6.34781551361084, + "loss": 1.201, + "odds_ratio_loss": 0.19888582825660706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11810775101184845, + "rewards/margins": 0.5166738629341125, + "rewards/rejected": -0.6347816586494446, + "sft_loss": 1.181077480316162, + "step": 11245 + }, + { + "epoch": 0.88, + "grad_norm": 9.954345703125, + "learning_rate": 3.8562550048528823e-07, + "logits/chosen": -1.1790051460266113, + "logits/rejected": -1.1716651916503906, + "logps/chosen": -1.0941330194473267, + "logps/rejected": -10.212206840515137, + "loss": 1.0994, + "odds_ratio_loss": 0.05296441912651062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10941328853368759, + "rewards/margins": 0.9118073582649231, + "rewards/rejected": -1.0212206840515137, + "sft_loss": 1.0941330194473267, + "step": 11250 + }, + { + "epoch": 0.88, + "grad_norm": 17.24154281616211, + "learning_rate": 3.832577204147642e-07, + "logits/chosen": -1.3043853044509888, + "logits/rejected": -1.6192779541015625, + "logps/chosen": -0.9539308547973633, + "logps/rejected": -6.1093950271606445, + "loss": 0.9641, + "odds_ratio_loss": 0.10161396116018295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09539308398962021, + "rewards/margins": 0.5155463814735413, + "rewards/rejected": -0.6109394431114197, + "sft_loss": 0.9539308547973633, + "step": 11255 + }, + { + "epoch": 0.88, + "grad_norm": 5.719317436218262, + "learning_rate": 3.8089694222007144e-07, + "logits/chosen": -1.3731346130371094, + "logits/rejected": -0.7326057553291321, + "logps/chosen": -0.9768118858337402, + "logps/rejected": -4.670573711395264, + "loss": 1.0153, + "odds_ratio_loss": 0.38526564836502075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0976811945438385, + "rewards/margins": 0.36937612295150757, + "rewards/rejected": -0.46705737709999084, + "sft_loss": 0.9768118858337402, + "step": 11260 + }, + { + "epoch": 0.88, + "grad_norm": 6.651812553405762, + "learning_rate": 3.785431694816294e-07, + "logits/chosen": -1.2109453678131104, + "logits/rejected": -1.645216941833496, + "logps/chosen": -1.0555239915847778, + "logps/rejected": -10.367178916931152, + "loss": 1.072, + "odds_ratio_loss": 0.16514301300048828, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10555239766836166, + "rewards/margins": 0.9311655163764954, + "rewards/rejected": -1.0367180109024048, + "sft_loss": 1.0555239915847778, + "step": 11265 + }, + { + "epoch": 0.88, + "grad_norm": 13.967916488647461, + "learning_rate": 3.761964057692341e-07, + "logits/chosen": -1.1737172603607178, + "logits/rejected": -1.1151050329208374, + "logps/chosen": -0.8013264536857605, + "logps/rejected": -5.700438976287842, + "loss": 0.809, + "odds_ratio_loss": 0.07676169276237488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08013264834880829, + "rewards/margins": 0.4899112582206726, + "rewards/rejected": -0.5700439214706421, + "sft_loss": 0.8013264536857605, + "step": 11270 + }, + { + "epoch": 0.88, + "grad_norm": 38.32868957519531, + "learning_rate": 3.738566546420513e-07, + "logits/chosen": -1.3379786014556885, + "logits/rejected": -1.4890538454055786, + "logps/chosen": -0.9517307281494141, + "logps/rejected": -5.213587284088135, + "loss": 0.973, + "odds_ratio_loss": 0.2128792256116867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09517307579517365, + "rewards/margins": 0.426185667514801, + "rewards/rejected": -0.5213587284088135, + "sft_loss": 0.9517307281494141, + "step": 11275 + }, + { + "epoch": 0.88, + "grad_norm": 6.2815141677856445, + "learning_rate": 3.7152391964860924e-07, + "logits/chosen": -1.3357675075531006, + "logits/rejected": -1.0225775241851807, + "logps/chosen": -0.9496587514877319, + "logps/rejected": -7.482272148132324, + "loss": 0.9504, + "odds_ratio_loss": 0.007433583028614521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0949658751487732, + "rewards/margins": 0.6532613635063171, + "rewards/rejected": -0.7482272386550903, + "sft_loss": 0.9496587514877319, + "step": 11280 + }, + { + "epoch": 0.88, + "grad_norm": 6.459371566772461, + "learning_rate": 3.691982043267972e-07, + "logits/chosen": -1.4276762008666992, + "logits/rejected": -0.9678457975387573, + "logps/chosen": -0.990170955657959, + "logps/rejected": -2.987020969390869, + "loss": 1.0233, + "odds_ratio_loss": 0.33171772956848145, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09901710599660873, + "rewards/margins": 0.1996849775314331, + "rewards/rejected": -0.29870206117630005, + "sft_loss": 0.990170955657959, + "step": 11285 + }, + { + "epoch": 0.88, + "grad_norm": 10.674115180969238, + "learning_rate": 3.668795122038582e-07, + "logits/chosen": -1.3349639177322388, + "logits/rejected": -1.1982815265655518, + "logps/chosen": -0.7671822905540466, + "logps/rejected": -4.252364158630371, + "loss": 0.7922, + "odds_ratio_loss": 0.2504265308380127, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07671823352575302, + "rewards/margins": 0.3485181927680969, + "rewards/rejected": -0.42523640394210815, + "sft_loss": 0.7671822905540466, + "step": 11290 + }, + { + "epoch": 0.88, + "grad_norm": 4.621835708618164, + "learning_rate": 3.6456784679638256e-07, + "logits/chosen": -1.3116695880889893, + "logits/rejected": -0.8658286333084106, + "logps/chosen": -0.89985591173172, + "logps/rejected": -6.814452171325684, + "loss": 0.9004, + "odds_ratio_loss": 0.005296620540320873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08998559415340424, + "rewards/margins": 0.5914596319198608, + "rewards/rejected": -0.6814452409744263, + "sft_loss": 0.89985591173172, + "step": 11295 + }, + { + "epoch": 0.88, + "grad_norm": 7.848481178283691, + "learning_rate": 3.6226321161030367e-07, + "logits/chosen": -1.4205824136734009, + "logits/rejected": -1.0507280826568604, + "logps/chosen": -0.968035876750946, + "logps/rejected": -3.0656139850616455, + "loss": 1.0092, + "odds_ratio_loss": 0.41196268796920776, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09680359065532684, + "rewards/margins": 0.20975780487060547, + "rewards/rejected": -0.3065614104270935, + "sft_loss": 0.968035876750946, + "step": 11300 + }, + { + "epoch": 0.88, + "grad_norm": 8.965922355651855, + "learning_rate": 3.599656101408955e-07, + "logits/chosen": -1.1956322193145752, + "logits/rejected": -1.2085590362548828, + "logps/chosen": -0.8699887990951538, + "logps/rejected": -7.0282883644104, + "loss": 0.8702, + "odds_ratio_loss": 0.0024738397914916277, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08699888736009598, + "rewards/margins": 0.6158300042152405, + "rewards/rejected": -0.7028288841247559, + "sft_loss": 0.8699887990951538, + "step": 11305 + }, + { + "epoch": 0.88, + "grad_norm": 27.91067123413086, + "learning_rate": 3.5767504587276124e-07, + "logits/chosen": -1.1915757656097412, + "logits/rejected": -1.0515540838241577, + "logps/chosen": -0.9764666557312012, + "logps/rejected": -9.82097053527832, + "loss": 0.9774, + "odds_ratio_loss": 0.009811131283640862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09764666110277176, + "rewards/margins": 0.8844503164291382, + "rewards/rejected": -0.9820969700813293, + "sft_loss": 0.9764666557312012, + "step": 11310 + }, + { + "epoch": 0.88, + "grad_norm": 16.605636596679688, + "learning_rate": 3.5539152227983155e-07, + "logits/chosen": -1.1500060558319092, + "logits/rejected": -1.403136968612671, + "logps/chosen": -1.2175300121307373, + "logps/rejected": -5.237689971923828, + "loss": 1.2674, + "odds_ratio_loss": 0.4989433288574219, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1217530146241188, + "rewards/margins": 0.4020160138607025, + "rewards/rejected": -0.5237690210342407, + "sft_loss": 1.2175300121307373, + "step": 11315 + }, + { + "epoch": 0.88, + "grad_norm": 12.524778366088867, + "learning_rate": 3.531150428253616e-07, + "logits/chosen": -1.311918020248413, + "logits/rejected": -0.9865154027938843, + "logps/chosen": -1.1745331287384033, + "logps/rejected": -7.8709306716918945, + "loss": 1.1877, + "odds_ratio_loss": 0.13171400129795074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11745331436395645, + "rewards/margins": 0.6696397066116333, + "rewards/rejected": -0.7870929837226868, + "sft_loss": 1.1745331287384033, + "step": 11320 + }, + { + "epoch": 0.88, + "grad_norm": 10.650303840637207, + "learning_rate": 3.508456109619207e-07, + "logits/chosen": -1.5033023357391357, + "logits/rejected": -1.136890172958374, + "logps/chosen": -1.2400705814361572, + "logps/rejected": -3.5096442699432373, + "loss": 1.2745, + "odds_ratio_loss": 0.3439212441444397, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12400706857442856, + "rewards/margins": 0.22695736587047577, + "rewards/rejected": -0.35096442699432373, + "sft_loss": 1.2400705814361572, + "step": 11325 + }, + { + "epoch": 0.88, + "grad_norm": 414.8785400390625, + "learning_rate": 3.485832301313896e-07, + "logits/chosen": -1.3364065885543823, + "logits/rejected": -0.9892686605453491, + "logps/chosen": -1.139243721961975, + "logps/rejected": -7.371476173400879, + "loss": 1.1449, + "odds_ratio_loss": 0.056998781859874725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11392436176538467, + "rewards/margins": 0.6232232451438904, + "rewards/rejected": -0.7371476292610168, + "sft_loss": 1.139243721961975, + "step": 11330 + }, + { + "epoch": 0.88, + "grad_norm": 5.380289554595947, + "learning_rate": 3.463279037649575e-07, + "logits/chosen": -1.3991533517837524, + "logits/rejected": -0.8975147008895874, + "logps/chosen": -0.9783090353012085, + "logps/rejected": -5.975537300109863, + "loss": 1.0223, + "odds_ratio_loss": 0.4397381842136383, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09783090651035309, + "rewards/margins": 0.49972280859947205, + "rewards/rejected": -0.5975537300109863, + "sft_loss": 0.9783090353012085, + "step": 11335 + }, + { + "epoch": 0.88, + "grad_norm": 4.831082820892334, + "learning_rate": 3.440796352831133e-07, + "logits/chosen": -1.316606879234314, + "logits/rejected": -1.2849032878875732, + "logps/chosen": -0.7927159070968628, + "logps/rejected": -8.25066089630127, + "loss": 0.8168, + "odds_ratio_loss": 0.24042615294456482, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.079271599650383, + "rewards/margins": 0.745794415473938, + "rewards/rejected": -0.8250659704208374, + "sft_loss": 0.7927159070968628, + "step": 11340 + }, + { + "epoch": 0.88, + "grad_norm": 24.93393325805664, + "learning_rate": 3.4183842809563993e-07, + "logits/chosen": -1.4244930744171143, + "logits/rejected": -1.3379783630371094, + "logps/chosen": -1.170127272605896, + "logps/rejected": -3.386500120162964, + "loss": 1.1928, + "odds_ratio_loss": 0.2265014946460724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11701272428035736, + "rewards/margins": 0.2216372936964035, + "rewards/rejected": -0.33865001797676086, + "sft_loss": 1.170127272605896, + "step": 11345 + }, + { + "epoch": 0.88, + "grad_norm": 11.719467163085938, + "learning_rate": 3.396042856016141e-07, + "logits/chosen": -1.2225192785263062, + "logits/rejected": -1.03545081615448, + "logps/chosen": -0.8588349223136902, + "logps/rejected": -6.034967422485352, + "loss": 0.8724, + "odds_ratio_loss": 0.13602833449840546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08588350564241409, + "rewards/margins": 0.5176132321357727, + "rewards/rejected": -0.6034967303276062, + "sft_loss": 0.8588349223136902, + "step": 11350 + }, + { + "epoch": 0.88, + "grad_norm": 5.902450084686279, + "learning_rate": 3.3737721118939637e-07, + "logits/chosen": -1.2625263929367065, + "logits/rejected": -1.2296942472457886, + "logps/chosen": -1.0912928581237793, + "logps/rejected": -8.46910572052002, + "loss": 1.0968, + "odds_ratio_loss": 0.054971031844615936, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10912929475307465, + "rewards/margins": 0.7377813458442688, + "rewards/rejected": -0.8469105958938599, + "sft_loss": 1.0912928581237793, + "step": 11355 + }, + { + "epoch": 0.88, + "grad_norm": 29.052635192871094, + "learning_rate": 3.351572082366267e-07, + "logits/chosen": -1.0655205249786377, + "logits/rejected": -1.3246183395385742, + "logps/chosen": -1.7015535831451416, + "logps/rejected": -3.9158992767333984, + "loss": 1.7642, + "odds_ratio_loss": 0.6268733739852905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1701553612947464, + "rewards/margins": 0.2214345484972, + "rewards/rejected": -0.3915899395942688, + "sft_loss": 1.7015535831451416, + "step": 11360 + }, + { + "epoch": 0.88, + "grad_norm": 7.776186943054199, + "learning_rate": 3.329442801102223e-07, + "logits/chosen": -1.2450320720672607, + "logits/rejected": -1.2915815114974976, + "logps/chosen": -1.362554907798767, + "logps/rejected": -8.15864086151123, + "loss": 1.3835, + "odds_ratio_loss": 0.20922088623046875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13625548779964447, + "rewards/margins": 0.6796085238456726, + "rewards/rejected": -0.815864086151123, + "sft_loss": 1.362554907798767, + "step": 11365 + }, + { + "epoch": 0.88, + "grad_norm": 102.14395904541016, + "learning_rate": 3.3073843016636964e-07, + "logits/chosen": -1.1582549810409546, + "logits/rejected": -0.6190916299819946, + "logps/chosen": -1.317775011062622, + "logps/rejected": -6.590193271636963, + "loss": 1.3365, + "odds_ratio_loss": 0.18727946281433105, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13177749514579773, + "rewards/margins": 0.5272418260574341, + "rewards/rejected": -0.6590193510055542, + "sft_loss": 1.317775011062622, + "step": 11370 + }, + { + "epoch": 0.88, + "grad_norm": 918.7132568359375, + "learning_rate": 3.285396617505204e-07, + "logits/chosen": -1.2746855020523071, + "logits/rejected": -1.293368935585022, + "logps/chosen": -1.331923007965088, + "logps/rejected": -6.492204189300537, + "loss": 1.3342, + "odds_ratio_loss": 0.023071136325597763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1331923007965088, + "rewards/margins": 0.516028106212616, + "rewards/rejected": -0.6492204070091248, + "sft_loss": 1.331923007965088, + "step": 11375 + }, + { + "epoch": 0.89, + "grad_norm": 10.767485618591309, + "learning_rate": 3.263479781973855e-07, + "logits/chosen": -1.3824785947799683, + "logits/rejected": -1.3801668882369995, + "logps/chosen": -0.8377906680107117, + "logps/rejected": -6.220038414001465, + "loss": 0.8751, + "odds_ratio_loss": 0.37284550070762634, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08377906680107117, + "rewards/margins": 0.5382248163223267, + "rewards/rejected": -0.6220038533210754, + "sft_loss": 0.8377906680107117, + "step": 11380 + }, + { + "epoch": 0.89, + "grad_norm": 6.239452838897705, + "learning_rate": 3.2416338283093207e-07, + "logits/chosen": -1.3986380100250244, + "logits/rejected": -0.7916896343231201, + "logps/chosen": -1.043060541152954, + "logps/rejected": -12.193835258483887, + "loss": 1.0588, + "odds_ratio_loss": 0.15780356526374817, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10430605709552765, + "rewards/margins": 1.1150774955749512, + "rewards/rejected": -1.2193834781646729, + "sft_loss": 1.043060541152954, + "step": 11385 + }, + { + "epoch": 0.89, + "grad_norm": 6.802426338195801, + "learning_rate": 3.2198587896437593e-07, + "logits/chosen": -1.327599287033081, + "logits/rejected": -0.993168830871582, + "logps/chosen": -0.9170148968696594, + "logps/rejected": -2.109870433807373, + "loss": 0.9439, + "odds_ratio_loss": 0.2683650553226471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09170148521661758, + "rewards/margins": 0.11928558349609375, + "rewards/rejected": -0.21098704636096954, + "sft_loss": 0.9170148968696594, + "step": 11390 + }, + { + "epoch": 0.89, + "grad_norm": 13.555157661437988, + "learning_rate": 3.198154699001782e-07, + "logits/chosen": -0.9407708048820496, + "logits/rejected": -1.7492096424102783, + "logps/chosen": -0.8097691535949707, + "logps/rejected": -7.6386284828186035, + "loss": 0.8157, + "odds_ratio_loss": 0.0593709759414196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08097691833972931, + "rewards/margins": 0.6828858256340027, + "rewards/rejected": -0.7638627886772156, + "sft_loss": 0.8097691535949707, + "step": 11395 + }, + { + "epoch": 0.89, + "grad_norm": 187.50027465820312, + "learning_rate": 3.1765215893003967e-07, + "logits/chosen": -1.0976942777633667, + "logits/rejected": -1.2680959701538086, + "logps/chosen": -1.2602325677871704, + "logps/rejected": -6.308053970336914, + "loss": 1.3144, + "odds_ratio_loss": 0.5419572591781616, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12602324783802032, + "rewards/margins": 0.5047820806503296, + "rewards/rejected": -0.6308053731918335, + "sft_loss": 1.2602325677871704, + "step": 11400 + }, + { + "epoch": 0.89, + "grad_norm": 35.68206787109375, + "learning_rate": 3.154959493348958e-07, + "logits/chosen": -1.3055843114852905, + "logits/rejected": -1.199350118637085, + "logps/chosen": -0.7830213308334351, + "logps/rejected": -4.249791145324707, + "loss": 0.8357, + "odds_ratio_loss": 0.526390016078949, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07830213010311127, + "rewards/margins": 0.3466770648956299, + "rewards/rejected": -0.42497915029525757, + "sft_loss": 0.7830213308334351, + "step": 11405 + }, + { + "epoch": 0.89, + "grad_norm": 12.375039100646973, + "learning_rate": 3.133468443849125e-07, + "logits/chosen": -1.3636467456817627, + "logits/rejected": -0.858871340751648, + "logps/chosen": -0.8683420419692993, + "logps/rejected": -3.3262736797332764, + "loss": 0.8901, + "odds_ratio_loss": 0.21752576529979706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08683420717716217, + "rewards/margins": 0.2457931786775589, + "rewards/rejected": -0.33262738585472107, + "sft_loss": 0.8683420419692993, + "step": 11410 + }, + { + "epoch": 0.89, + "grad_norm": 7.882063865661621, + "learning_rate": 3.1120484733948073e-07, + "logits/chosen": -1.2236177921295166, + "logits/rejected": -1.404359221458435, + "logps/chosen": -0.7861829996109009, + "logps/rejected": -7.9908766746521, + "loss": 0.7957, + "odds_ratio_loss": 0.0947074443101883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07861830294132233, + "rewards/margins": 0.7204692959785461, + "rewards/rejected": -0.799087643623352, + "sft_loss": 0.7861829996109009, + "step": 11415 + }, + { + "epoch": 0.89, + "grad_norm": 6.249751091003418, + "learning_rate": 3.090699614472109e-07, + "logits/chosen": -1.2637525796890259, + "logits/rejected": -0.9572528600692749, + "logps/chosen": -0.8270605206489563, + "logps/rejected": -4.8529839515686035, + "loss": 0.8522, + "odds_ratio_loss": 0.2513573169708252, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08270604908466339, + "rewards/margins": 0.40259236097335815, + "rewards/rejected": -0.48529839515686035, + "sft_loss": 0.8270605206489563, + "step": 11420 + }, + { + "epoch": 0.89, + "grad_norm": 26.97721290588379, + "learning_rate": 3.0694218994592793e-07, + "logits/chosen": -1.4073753356933594, + "logits/rejected": -0.9085363149642944, + "logps/chosen": -1.1127631664276123, + "logps/rejected": -8.869595527648926, + "loss": 1.1132, + "odds_ratio_loss": 0.0041475556790828705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11127632856369019, + "rewards/margins": 0.7756832242012024, + "rewards/rejected": -0.8869596719741821, + "sft_loss": 1.1127631664276123, + "step": 11425 + }, + { + "epoch": 0.89, + "grad_norm": 6.019398212432861, + "learning_rate": 3.0482153606266716e-07, + "logits/chosen": -1.211147427558899, + "logits/rejected": -1.127651333808899, + "logps/chosen": -1.1163737773895264, + "logps/rejected": -7.680233001708984, + "loss": 1.1226, + "odds_ratio_loss": 0.06235046312212944, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11163737624883652, + "rewards/margins": 0.6563860177993774, + "rewards/rejected": -0.7680233120918274, + "sft_loss": 1.1163737773895264, + "step": 11430 + }, + { + "epoch": 0.89, + "grad_norm": 5.228885650634766, + "learning_rate": 3.027080030136703e-07, + "logits/chosen": -1.296543836593628, + "logits/rejected": -1.1578459739685059, + "logps/chosen": -0.8815022706985474, + "logps/rejected": -9.213228225708008, + "loss": 0.8854, + "odds_ratio_loss": 0.038652561604976654, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08815022557973862, + "rewards/margins": 0.833172619342804, + "rewards/rejected": -0.9213228225708008, + "sft_loss": 0.8815022706985474, + "step": 11435 + }, + { + "epoch": 0.89, + "grad_norm": 9.065093040466309, + "learning_rate": 3.0060159400437883e-07, + "logits/chosen": -1.3202693462371826, + "logits/rejected": -1.380171537399292, + "logps/chosen": -0.9314519762992859, + "logps/rejected": -5.277139663696289, + "loss": 0.9535, + "odds_ratio_loss": 0.2203933447599411, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09314519912004471, + "rewards/margins": 0.4345687925815582, + "rewards/rejected": -0.5277139544487, + "sft_loss": 0.9314519762992859, + "step": 11440 + }, + { + "epoch": 0.89, + "grad_norm": 13.972500801086426, + "learning_rate": 2.985023122294278e-07, + "logits/chosen": -1.3101141452789307, + "logits/rejected": -1.179225206375122, + "logps/chosen": -1.0576988458633423, + "logps/rejected": -7.225750923156738, + "loss": 1.068, + "odds_ratio_loss": 0.1029224544763565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10576988756656647, + "rewards/margins": 0.6168051958084106, + "rewards/rejected": -0.7225750684738159, + "sft_loss": 1.0576988458633423, + "step": 11445 + }, + { + "epoch": 0.89, + "grad_norm": 50.68267059326172, + "learning_rate": 2.9641016087264716e-07, + "logits/chosen": -1.3971431255340576, + "logits/rejected": -0.978039562702179, + "logps/chosen": -0.9783371090888977, + "logps/rejected": -14.972844123840332, + "loss": 0.9813, + "odds_ratio_loss": 0.03002532199025154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09783372282981873, + "rewards/margins": 1.3994505405426025, + "rewards/rejected": -1.4972842931747437, + "sft_loss": 0.9783371090888977, + "step": 11450 + }, + { + "epoch": 0.89, + "grad_norm": 13.629805564880371, + "learning_rate": 2.943251431070476e-07, + "logits/chosen": -1.184397578239441, + "logits/rejected": -1.0515797138214111, + "logps/chosen": -0.9648865461349487, + "logps/rejected": -4.7971930503845215, + "loss": 0.9836, + "odds_ratio_loss": 0.18691152334213257, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09648866951465607, + "rewards/margins": 0.38323062658309937, + "rewards/rejected": -0.4797193109989166, + "sft_loss": 0.9648865461349487, + "step": 11455 + }, + { + "epoch": 0.89, + "grad_norm": 7.995943546295166, + "learning_rate": 2.9224726209482524e-07, + "logits/chosen": -1.3726367950439453, + "logits/rejected": -1.2525684833526611, + "logps/chosen": -0.8402360677719116, + "logps/rejected": -7.144128322601318, + "loss": 0.8457, + "odds_ratio_loss": 0.05475843697786331, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0840236097574234, + "rewards/margins": 0.6303892731666565, + "rewards/rejected": -0.7144128680229187, + "sft_loss": 0.8402360677719116, + "step": 11460 + }, + { + "epoch": 0.89, + "grad_norm": 7.930908679962158, + "learning_rate": 2.901765209873486e-07, + "logits/chosen": -1.2770615816116333, + "logits/rejected": -1.3665797710418701, + "logps/chosen": -0.9703947901725769, + "logps/rejected": -7.10650634765625, + "loss": 0.9743, + "odds_ratio_loss": 0.03855733200907707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09703948348760605, + "rewards/margins": 0.6136112213134766, + "rewards/rejected": -0.7106507420539856, + "sft_loss": 0.9703947901725769, + "step": 11465 + }, + { + "epoch": 0.89, + "grad_norm": 8.73951244354248, + "learning_rate": 2.881129229251611e-07, + "logits/chosen": -1.2204939126968384, + "logits/rejected": -0.9099424481391907, + "logps/chosen": -1.0175669193267822, + "logps/rejected": -9.779008865356445, + "loss": 1.0212, + "odds_ratio_loss": 0.03633153438568115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10175669193267822, + "rewards/margins": 0.8761442303657532, + "rewards/rejected": -0.9779008626937866, + "sft_loss": 1.0175669193267822, + "step": 11470 + }, + { + "epoch": 0.89, + "grad_norm": 8.854645729064941, + "learning_rate": 2.860564710379693e-07, + "logits/chosen": -1.334038496017456, + "logits/rejected": -1.1563862562179565, + "logps/chosen": -1.0889811515808105, + "logps/rejected": -5.698180675506592, + "loss": 1.1267, + "odds_ratio_loss": 0.37702035903930664, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1088981181383133, + "rewards/margins": 0.46091994643211365, + "rewards/rejected": -0.5698180794715881, + "sft_loss": 1.0889811515808105, + "step": 11475 + }, + { + "epoch": 0.89, + "grad_norm": 4.634920120239258, + "learning_rate": 2.840071684446455e-07, + "logits/chosen": -1.3449212312698364, + "logits/rejected": -0.7937895059585571, + "logps/chosen": -1.0932824611663818, + "logps/rejected": -9.422958374023438, + "loss": 1.0967, + "odds_ratio_loss": 0.034093089401721954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1093282550573349, + "rewards/margins": 0.8329676389694214, + "rewards/rejected": -0.9422958493232727, + "sft_loss": 1.0932824611663818, + "step": 11480 + }, + { + "epoch": 0.89, + "grad_norm": 301.2113037109375, + "learning_rate": 2.8196501825321686e-07, + "logits/chosen": -1.271220088005066, + "logits/rejected": -0.934947669506073, + "logps/chosen": -0.6947768330574036, + "logps/rejected": -4.528558731079102, + "loss": 0.6972, + "odds_ratio_loss": 0.023960549384355545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06947768479585648, + "rewards/margins": 0.38337820768356323, + "rewards/rejected": -0.4528558850288391, + "sft_loss": 0.6947768330574036, + "step": 11485 + }, + { + "epoch": 0.89, + "grad_norm": 43.53444290161133, + "learning_rate": 2.799300235608626e-07, + "logits/chosen": -1.4846608638763428, + "logits/rejected": -1.35506010055542, + "logps/chosen": -0.753076434135437, + "logps/rejected": -8.12567138671875, + "loss": 0.7927, + "odds_ratio_loss": 0.3962857127189636, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07530764490365982, + "rewards/margins": 0.737259566783905, + "rewards/rejected": -0.8125671148300171, + "sft_loss": 0.753076434135437, + "step": 11490 + }, + { + "epoch": 0.89, + "grad_norm": 7.579301357269287, + "learning_rate": 2.779021874539106e-07, + "logits/chosen": -1.3991349935531616, + "logits/rejected": -1.2568550109863281, + "logps/chosen": -1.0853722095489502, + "logps/rejected": -5.782435417175293, + "loss": 1.13, + "odds_ratio_loss": 0.44617921113967896, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1085372194647789, + "rewards/margins": 0.46970629692077637, + "rewards/rejected": -0.5782435536384583, + "sft_loss": 1.0853722095489502, + "step": 11495 + }, + { + "epoch": 0.89, + "grad_norm": 4.319366455078125, + "learning_rate": 2.758815130078329e-07, + "logits/chosen": -1.2385519742965698, + "logits/rejected": -1.0318421125411987, + "logps/chosen": -0.9813858270645142, + "logps/rejected": -8.16090202331543, + "loss": 0.9943, + "odds_ratio_loss": 0.1292736828327179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09813857823610306, + "rewards/margins": 0.7179515361785889, + "rewards/rejected": -0.8160901069641113, + "sft_loss": 0.9813858270645142, + "step": 11500 + }, + { + "epoch": 0.89, + "grad_norm": 24.704221725463867, + "learning_rate": 2.7386800328723815e-07, + "logits/chosen": -1.3197424411773682, + "logits/rejected": -1.1460809707641602, + "logps/chosen": -0.9466512799263, + "logps/rejected": -11.379068374633789, + "loss": 0.9551, + "odds_ratio_loss": 0.08447456359863281, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09466513246297836, + "rewards/margins": 1.0432417392730713, + "rewards/rejected": -1.1379069089889526, + "sft_loss": 0.9466512799263, + "step": 11505 + }, + { + "epoch": 0.9, + "grad_norm": 24.183088302612305, + "learning_rate": 2.7186166134586964e-07, + "logits/chosen": -1.2981321811676025, + "logits/rejected": -0.7681884765625, + "logps/chosen": -1.0729738473892212, + "logps/rejected": -5.5854268074035645, + "loss": 1.0831, + "odds_ratio_loss": 0.10156464576721191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.107297383248806, + "rewards/margins": 0.4512453079223633, + "rewards/rejected": -0.5585426688194275, + "sft_loss": 1.0729738473892212, + "step": 11510 + }, + { + "epoch": 0.9, + "grad_norm": 14.686610221862793, + "learning_rate": 2.698624902265995e-07, + "logits/chosen": -1.3769340515136719, + "logits/rejected": -1.1684379577636719, + "logps/chosen": -0.8651742935180664, + "logps/rejected": -5.289219856262207, + "loss": 0.8924, + "odds_ratio_loss": 0.27234646677970886, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08651743084192276, + "rewards/margins": 0.442404568195343, + "rewards/rejected": -0.5289219617843628, + "sft_loss": 0.8651742935180664, + "step": 11515 + }, + { + "epoch": 0.9, + "grad_norm": 16.206825256347656, + "learning_rate": 2.6787049296142455e-07, + "logits/chosen": -1.2861360311508179, + "logits/rejected": -1.3285562992095947, + "logps/chosen": -0.5772618055343628, + "logps/rejected": -5.894922256469727, + "loss": 0.5908, + "odds_ratio_loss": 0.1356671303510666, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0577261820435524, + "rewards/margins": 0.5317661166191101, + "rewards/rejected": -0.5894922614097595, + "sft_loss": 0.5772618055343628, + "step": 11520 + }, + { + "epoch": 0.9, + "grad_norm": 23.26406478881836, + "learning_rate": 2.658856725714609e-07, + "logits/chosen": -1.4277960062026978, + "logits/rejected": -1.2865487337112427, + "logps/chosen": -0.6814032793045044, + "logps/rejected": -5.886070251464844, + "loss": 0.6937, + "odds_ratio_loss": 0.12267593294382095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06814032047986984, + "rewards/margins": 0.520466685295105, + "rewards/rejected": -0.5886070132255554, + "sft_loss": 0.6814032793045044, + "step": 11525 + }, + { + "epoch": 0.9, + "grad_norm": 28.9005069732666, + "learning_rate": 2.639080320669424e-07, + "logits/chosen": -1.3326749801635742, + "logits/rejected": -1.0623148679733276, + "logps/chosen": -0.8971265554428101, + "logps/rejected": -6.110349655151367, + "loss": 0.9131, + "odds_ratio_loss": 0.15930424630641937, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08971264958381653, + "rewards/margins": 0.5213223695755005, + "rewards/rejected": -0.6110349893569946, + "sft_loss": 0.8971265554428101, + "step": 11530 + }, + { + "epoch": 0.9, + "grad_norm": 176.95278930664062, + "learning_rate": 2.619375744472102e-07, + "logits/chosen": -1.2410593032836914, + "logits/rejected": -0.9277191162109375, + "logps/chosen": -1.0788233280181885, + "logps/rejected": -5.1116623878479, + "loss": 1.1172, + "odds_ratio_loss": 0.38364213705062866, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10788233578205109, + "rewards/margins": 0.4032839238643646, + "rewards/rejected": -0.5111662149429321, + "sft_loss": 1.0788233280181885, + "step": 11535 + }, + { + "epoch": 0.9, + "grad_norm": 12.5931978225708, + "learning_rate": 2.599743027007151e-07, + "logits/chosen": -1.495727777481079, + "logits/rejected": -0.9932398796081543, + "logps/chosen": -2.0280823707580566, + "logps/rejected": -5.652215957641602, + "loss": 2.0938, + "odds_ratio_loss": 0.6574202179908752, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.20280826091766357, + "rewards/margins": 0.36241334676742554, + "rewards/rejected": -0.5652216672897339, + "sft_loss": 2.0280823707580566, + "step": 11540 + }, + { + "epoch": 0.9, + "grad_norm": 6.531797885894775, + "learning_rate": 2.5801821980500574e-07, + "logits/chosen": -1.5165177583694458, + "logits/rejected": -1.4731080532073975, + "logps/chosen": -0.9339659810066223, + "logps/rejected": -9.524686813354492, + "loss": 0.9376, + "odds_ratio_loss": 0.03651661053299904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09339660406112671, + "rewards/margins": 0.8590720891952515, + "rewards/rejected": -0.952468752861023, + "sft_loss": 0.9339659810066223, + "step": 11545 + }, + { + "epoch": 0.9, + "grad_norm": 8.201682090759277, + "learning_rate": 2.560693287267324e-07, + "logits/chosen": -1.4987179040908813, + "logits/rejected": -1.4307103157043457, + "logps/chosen": -0.5705666542053223, + "logps/rejected": -10.991838455200195, + "loss": 0.5793, + "odds_ratio_loss": 0.08719013631343842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05705666542053223, + "rewards/margins": 1.0421271324157715, + "rewards/rejected": -1.0991837978363037, + "sft_loss": 0.5705666542053223, + "step": 11550 + }, + { + "epoch": 0.9, + "grad_norm": 9.921911239624023, + "learning_rate": 2.5412763242163463e-07, + "logits/chosen": -1.3099538087844849, + "logits/rejected": -1.3377288579940796, + "logps/chosen": -1.147143006324768, + "logps/rejected": -6.625881195068359, + "loss": 1.1616, + "odds_ratio_loss": 0.14414840936660767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11471428722143173, + "rewards/margins": 0.547873854637146, + "rewards/rejected": -0.6625881195068359, + "sft_loss": 1.147143006324768, + "step": 11555 + }, + { + "epoch": 0.9, + "grad_norm": 13.260001182556152, + "learning_rate": 2.5219313383454124e-07, + "logits/chosen": -1.3728363513946533, + "logits/rejected": -0.907455325126648, + "logps/chosen": -1.068935751914978, + "logps/rejected": -2.6968321800231934, + "loss": 1.1036, + "odds_ratio_loss": 0.3463651239871979, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10689357668161392, + "rewards/margins": 0.16278965771198273, + "rewards/rejected": -0.26968321204185486, + "sft_loss": 1.068935751914978, + "step": 11560 + }, + { + "epoch": 0.9, + "grad_norm": 5.01524543762207, + "learning_rate": 2.5026583589936646e-07, + "logits/chosen": -1.2656458616256714, + "logits/rejected": -1.123988389968872, + "logps/chosen": -1.0185540914535522, + "logps/rejected": -7.923954010009766, + "loss": 1.0207, + "odds_ratio_loss": 0.021134402602910995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10185541957616806, + "rewards/margins": 0.6905400156974792, + "rewards/rejected": -0.7923954725265503, + "sft_loss": 1.0185540914535522, + "step": 11565 + }, + { + "epoch": 0.9, + "grad_norm": 27.029821395874023, + "learning_rate": 2.483457415391005e-07, + "logits/chosen": -1.4600824117660522, + "logits/rejected": -1.0570296049118042, + "logps/chosen": -0.9342508316040039, + "logps/rejected": -8.031071662902832, + "loss": 0.973, + "odds_ratio_loss": 0.3876782953739166, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09342508018016815, + "rewards/margins": 0.7096821069717407, + "rewards/rejected": -0.8031071424484253, + "sft_loss": 0.9342508316040039, + "step": 11570 + }, + { + "epoch": 0.9, + "grad_norm": 5.063310146331787, + "learning_rate": 2.464328536658117e-07, + "logits/chosen": -1.3322535753250122, + "logits/rejected": -1.235365390777588, + "logps/chosen": -0.9622739553451538, + "logps/rejected": -11.337159156799316, + "loss": 0.9671, + "odds_ratio_loss": 0.047843582928180695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09622739255428314, + "rewards/margins": 1.0374884605407715, + "rewards/rejected": -1.1337158679962158, + "sft_loss": 0.9622739553451538, + "step": 11575 + }, + { + "epoch": 0.9, + "grad_norm": 16.952817916870117, + "learning_rate": 2.445271751806366e-07, + "logits/chosen": -1.4535599946975708, + "logits/rejected": -0.9216348528862, + "logps/chosen": -0.8693429827690125, + "logps/rejected": -5.077768802642822, + "loss": 1.0057, + "odds_ratio_loss": 1.363244652748108, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08693429827690125, + "rewards/margins": 0.42084255814552307, + "rewards/rejected": -0.5077768564224243, + "sft_loss": 0.8693429827690125, + "step": 11580 + }, + { + "epoch": 0.9, + "grad_norm": 14.601397514343262, + "learning_rate": 2.426287089737783e-07, + "logits/chosen": -1.3307750225067139, + "logits/rejected": -0.9158192873001099, + "logps/chosen": -1.0141366720199585, + "logps/rejected": -10.266256332397461, + "loss": 1.0207, + "odds_ratio_loss": 0.06609180569648743, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10141368210315704, + "rewards/margins": 0.925212025642395, + "rewards/rejected": -1.0266257524490356, + "sft_loss": 1.0141366720199585, + "step": 11585 + }, + { + "epoch": 0.9, + "grad_norm": 16.586641311645508, + "learning_rate": 2.40737457924502e-07, + "logits/chosen": -1.478084683418274, + "logits/rejected": -1.2017533779144287, + "logps/chosen": -1.712651014328003, + "logps/rejected": -3.850841999053955, + "loss": 1.7535, + "odds_ratio_loss": 0.4086402952671051, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1712651252746582, + "rewards/margins": 0.21381907165050507, + "rewards/rejected": -0.3850841820240021, + "sft_loss": 1.712651014328003, + "step": 11590 + }, + { + "epoch": 0.9, + "grad_norm": 6.445002555847168, + "learning_rate": 2.3885342490113096e-07, + "logits/chosen": -1.1681509017944336, + "logits/rejected": -0.9963008761405945, + "logps/chosen": -1.4345595836639404, + "logps/rejected": -1.9659076929092407, + "loss": 1.5061, + "odds_ratio_loss": 0.7152605056762695, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14345595240592957, + "rewards/margins": 0.05313481017947197, + "rewards/rejected": -0.19659076631069183, + "sft_loss": 1.4345595836639404, + "step": 11595 + }, + { + "epoch": 0.9, + "grad_norm": 7.320664405822754, + "learning_rate": 2.3697661276103956e-07, + "logits/chosen": -0.9666527509689331, + "logits/rejected": -1.5034465789794922, + "logps/chosen": -0.8804365992546082, + "logps/rejected": -4.796812534332275, + "loss": 0.8856, + "odds_ratio_loss": 0.05147816985845566, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08804366737604141, + "rewards/margins": 0.39163756370544434, + "rewards/rejected": -0.47968125343322754, + "sft_loss": 0.8804365992546082, + "step": 11600 + }, + { + "epoch": 0.9, + "grad_norm": 16.727750778198242, + "learning_rate": 2.3510702435065202e-07, + "logits/chosen": -1.3985929489135742, + "logits/rejected": -0.8663476705551147, + "logps/chosen": -0.91816246509552, + "logps/rejected": -6.086346626281738, + "loss": 0.9357, + "odds_ratio_loss": 0.17579717934131622, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.091816246509552, + "rewards/margins": 0.5168184041976929, + "rewards/rejected": -0.6086345911026001, + "sft_loss": 0.91816246509552, + "step": 11605 + }, + { + "epoch": 0.9, + "grad_norm": 7.229618072509766, + "learning_rate": 2.3324466250543577e-07, + "logits/chosen": -1.2521380186080933, + "logits/rejected": -1.2696040868759155, + "logps/chosen": -1.0962046384811401, + "logps/rejected": -8.138738632202148, + "loss": 1.1084, + "odds_ratio_loss": 0.1215173751115799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10962046682834625, + "rewards/margins": 0.7042534947395325, + "rewards/rejected": -0.8138739466667175, + "sft_loss": 1.0962046384811401, + "step": 11610 + }, + { + "epoch": 0.9, + "grad_norm": 25.551485061645508, + "learning_rate": 2.3138953004990027e-07, + "logits/chosen": -1.4428867101669312, + "logits/rejected": -1.410568356513977, + "logps/chosen": -0.6762143969535828, + "logps/rejected": -4.25193977355957, + "loss": 0.6972, + "odds_ratio_loss": 0.21020916104316711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06762143224477768, + "rewards/margins": 0.3575725555419922, + "rewards/rejected": -0.4251939654350281, + "sft_loss": 0.6762143969535828, + "step": 11615 + }, + { + "epoch": 0.9, + "grad_norm": 10.37377643585205, + "learning_rate": 2.2954162979758886e-07, + "logits/chosen": -1.334389567375183, + "logits/rejected": -0.9677041172981262, + "logps/chosen": -1.1673810482025146, + "logps/rejected": -3.838226318359375, + "loss": 1.1857, + "odds_ratio_loss": 0.18330082297325134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11673810333013535, + "rewards/margins": 0.267084538936615, + "rewards/rejected": -0.38382261991500854, + "sft_loss": 1.1673810482025146, + "step": 11620 + }, + { + "epoch": 0.9, + "grad_norm": 9.882246971130371, + "learning_rate": 2.2770096455107692e-07, + "logits/chosen": -1.306133508682251, + "logits/rejected": -1.1123729944229126, + "logps/chosen": -1.3784221410751343, + "logps/rejected": -5.239972114562988, + "loss": 1.3836, + "odds_ratio_loss": 0.051394373178482056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13784220814704895, + "rewards/margins": 0.38615497946739197, + "rewards/rejected": -0.5239971876144409, + "sft_loss": 1.3784221410751343, + "step": 11625 + }, + { + "epoch": 0.9, + "grad_norm": 96.8947982788086, + "learning_rate": 2.2586753710196697e-07, + "logits/chosen": -1.4276647567749023, + "logits/rejected": -0.7415833473205566, + "logps/chosen": -2.1991066932678223, + "logps/rejected": -8.009876251220703, + "loss": 2.2116, + "odds_ratio_loss": 0.1253366321325302, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21991066634655, + "rewards/margins": 0.581076979637146, + "rewards/rejected": -0.8009876012802124, + "sft_loss": 2.1991066932678223, + "step": 11630 + }, + { + "epoch": 0.91, + "grad_norm": 12.267698287963867, + "learning_rate": 2.2404135023088415e-07, + "logits/chosen": -1.3456906080245972, + "logits/rejected": -1.3257023096084595, + "logps/chosen": -0.8446849584579468, + "logps/rejected": -7.658097743988037, + "loss": 0.8484, + "odds_ratio_loss": 0.03717377409338951, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08446849882602692, + "rewards/margins": 0.6813413500785828, + "rewards/rejected": -0.7658098340034485, + "sft_loss": 0.8446849584579468, + "step": 11635 + }, + { + "epoch": 0.91, + "grad_norm": 20.532861709594727, + "learning_rate": 2.2222240670747297e-07, + "logits/chosen": -1.3424028158187866, + "logits/rejected": -1.1350343227386475, + "logps/chosen": -0.8087455630302429, + "logps/rejected": -5.615707874298096, + "loss": 0.8261, + "odds_ratio_loss": 0.17381128668785095, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08087456226348877, + "rewards/margins": 0.4806962013244629, + "rewards/rejected": -0.5615707635879517, + "sft_loss": 0.8087455630302429, + "step": 11640 + }, + { + "epoch": 0.91, + "grad_norm": 21.549776077270508, + "learning_rate": 2.2041070929039233e-07, + "logits/chosen": -1.2935678958892822, + "logits/rejected": -0.947970986366272, + "logps/chosen": -0.965768039226532, + "logps/rejected": -8.586902618408203, + "loss": 0.9884, + "odds_ratio_loss": 0.22596082091331482, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09657682478427887, + "rewards/margins": 0.7621134519577026, + "rewards/rejected": -0.8586903810501099, + "sft_loss": 0.965768039226532, + "step": 11645 + }, + { + "epoch": 0.91, + "grad_norm": 7.439232349395752, + "learning_rate": 2.186062607273115e-07, + "logits/chosen": -1.5062514543533325, + "logits/rejected": -1.2926065921783447, + "logps/chosen": -0.9130525588989258, + "logps/rejected": -7.870516777038574, + "loss": 0.9295, + "odds_ratio_loss": 0.16428081691265106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09130525588989258, + "rewards/margins": 0.6957464218139648, + "rewards/rejected": -0.7870516777038574, + "sft_loss": 0.9130525588989258, + "step": 11650 + }, + { + "epoch": 0.91, + "grad_norm": 6.630489349365234, + "learning_rate": 2.1680906375490529e-07, + "logits/chosen": -1.2369283437728882, + "logits/rejected": -1.2240400314331055, + "logps/chosen": -1.8176237344741821, + "logps/rejected": -5.021418571472168, + "loss": 1.8622, + "odds_ratio_loss": 0.4459160268306732, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18176238238811493, + "rewards/margins": 0.3203795254230499, + "rewards/rejected": -0.5021418333053589, + "sft_loss": 1.8176237344741821, + "step": 11655 + }, + { + "epoch": 0.91, + "grad_norm": 10.349519729614258, + "learning_rate": 2.150191210988517e-07, + "logits/chosen": -1.407854437828064, + "logits/rejected": -1.0259262323379517, + "logps/chosen": -1.0435562133789062, + "logps/rejected": -6.068276405334473, + "loss": 1.0683, + "odds_ratio_loss": 0.24705128371715546, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10435561835765839, + "rewards/margins": 0.5024720430374146, + "rewards/rejected": -0.6068276166915894, + "sft_loss": 1.0435562133789062, + "step": 11660 + }, + { + "epoch": 0.91, + "grad_norm": 7.212589263916016, + "learning_rate": 2.1323643547382645e-07, + "logits/chosen": -1.3311125040054321, + "logits/rejected": -1.3817427158355713, + "logps/chosen": -1.0456585884094238, + "logps/rejected": -6.869471073150635, + "loss": 1.0677, + "odds_ratio_loss": 0.22031307220458984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10456587374210358, + "rewards/margins": 0.5823811888694763, + "rewards/rejected": -0.6869471073150635, + "sft_loss": 1.0456585884094238, + "step": 11665 + }, + { + "epoch": 0.91, + "grad_norm": 8.162760734558105, + "learning_rate": 2.1146100958349736e-07, + "logits/chosen": -1.2604087591171265, + "logits/rejected": -1.1009435653686523, + "logps/chosen": -0.7238745093345642, + "logps/rejected": -2.706557035446167, + "loss": 0.7602, + "odds_ratio_loss": 0.3634505569934845, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0723874568939209, + "rewards/margins": 0.19826826453208923, + "rewards/rejected": -0.27065569162368774, + "sft_loss": 0.7238745093345642, + "step": 11670 + }, + { + "epoch": 0.91, + "grad_norm": 36.05037307739258, + "learning_rate": 2.096928461205233e-07, + "logits/chosen": -1.3813269138336182, + "logits/rejected": -0.827099621295929, + "logps/chosen": -1.0514827966690063, + "logps/rejected": -4.236260414123535, + "loss": 1.0595, + "odds_ratio_loss": 0.07986799627542496, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10514827817678452, + "rewards/margins": 0.3184778392314911, + "rewards/rejected": -0.4236261248588562, + "sft_loss": 1.0514827966690063, + "step": 11675 + }, + { + "epoch": 0.91, + "grad_norm": 9.704886436462402, + "learning_rate": 2.0793194776655034e-07, + "logits/chosen": -1.2689664363861084, + "logits/rejected": -1.1581445932388306, + "logps/chosen": -0.964913010597229, + "logps/rejected": -5.2943010330200195, + "loss": 0.9863, + "odds_ratio_loss": 0.21359559893608093, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09649130702018738, + "rewards/margins": 0.432938814163208, + "rewards/rejected": -0.5294301509857178, + "sft_loss": 0.964913010597229, + "step": 11680 + }, + { + "epoch": 0.91, + "grad_norm": 26.077938079833984, + "learning_rate": 2.0617831719220273e-07, + "logits/chosen": -1.2547130584716797, + "logits/rejected": -1.3344730138778687, + "logps/chosen": -1.2342766523361206, + "logps/rejected": -7.203073024749756, + "loss": 1.2524, + "odds_ratio_loss": 0.1816709190607071, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12342766672372818, + "rewards/margins": 0.5968796610832214, + "rewards/rejected": -0.7203073501586914, + "sft_loss": 1.2342766523361206, + "step": 11685 + }, + { + "epoch": 0.91, + "grad_norm": 6.128324031829834, + "learning_rate": 2.0443195705708464e-07, + "logits/chosen": -1.4026262760162354, + "logits/rejected": -1.2854896783828735, + "logps/chosen": -1.093899130821228, + "logps/rejected": -7.937254905700684, + "loss": 1.1075, + "odds_ratio_loss": 0.13566581904888153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10938992351293564, + "rewards/margins": 0.6843355298042297, + "rewards/rejected": -0.7937254905700684, + "sft_loss": 1.093899130821228, + "step": 11690 + }, + { + "epoch": 0.91, + "grad_norm": 8.042830467224121, + "learning_rate": 2.0269287000977244e-07, + "logits/chosen": -1.3604000806808472, + "logits/rejected": -0.6384872198104858, + "logps/chosen": -0.9876667857170105, + "logps/rejected": -2.6367809772491455, + "loss": 1.0193, + "odds_ratio_loss": 0.31643930077552795, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09876667708158493, + "rewards/margins": 0.1649114191532135, + "rewards/rejected": -0.26367810368537903, + "sft_loss": 0.9876667857170105, + "step": 11695 + }, + { + "epoch": 0.91, + "grad_norm": 11.282980918884277, + "learning_rate": 2.00961058687813e-07, + "logits/chosen": -1.3280357122421265, + "logits/rejected": -1.0343616008758545, + "logps/chosen": -1.097975492477417, + "logps/rejected": -5.217041015625, + "loss": 1.1192, + "odds_ratio_loss": 0.21179921925067902, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10979755222797394, + "rewards/margins": 0.41190654039382935, + "rewards/rejected": -0.5217040777206421, + "sft_loss": 1.097975492477417, + "step": 11700 + }, + { + "epoch": 0.91, + "grad_norm": 29.81282615661621, + "learning_rate": 1.99236525717717e-07, + "logits/chosen": -1.4915571212768555, + "logits/rejected": -1.2468748092651367, + "logps/chosen": -0.6215002536773682, + "logps/rejected": -9.786310195922852, + "loss": 0.6416, + "odds_ratio_loss": 0.2014237642288208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0621500238776207, + "rewards/margins": 0.9164810180664062, + "rewards/rejected": -0.9786310195922852, + "sft_loss": 0.6215002536773682, + "step": 11705 + }, + { + "epoch": 0.91, + "grad_norm": 7.58364725112915, + "learning_rate": 1.975192737149595e-07, + "logits/chosen": -1.4052196741104126, + "logits/rejected": -1.3716819286346436, + "logps/chosen": -0.9670282602310181, + "logps/rejected": -9.287376403808594, + "loss": 0.9827, + "odds_ratio_loss": 0.15622581541538239, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09670282900333405, + "rewards/margins": 0.8320348858833313, + "rewards/rejected": -0.9287376403808594, + "sft_loss": 0.9670282602310181, + "step": 11710 + }, + { + "epoch": 0.91, + "grad_norm": 13.548325538635254, + "learning_rate": 1.9580930528396936e-07, + "logits/chosen": -1.393760323524475, + "logits/rejected": -1.1954675912857056, + "logps/chosen": -0.9646106958389282, + "logps/rejected": -8.41152572631836, + "loss": 0.9693, + "odds_ratio_loss": 0.046464212238788605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09646106511354446, + "rewards/margins": 0.7446915507316589, + "rewards/rejected": -0.8411526679992676, + "sft_loss": 0.9646106958389282, + "step": 11715 + }, + { + "epoch": 0.91, + "grad_norm": 21.84276008605957, + "learning_rate": 1.9410662301813155e-07, + "logits/chosen": -1.3173141479492188, + "logits/rejected": -0.8565092086791992, + "logps/chosen": -0.9831641912460327, + "logps/rejected": -4.476618766784668, + "loss": 1.0125, + "odds_ratio_loss": 0.2932348847389221, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09831643104553223, + "rewards/margins": 0.34934547543525696, + "rewards/rejected": -0.4476618766784668, + "sft_loss": 0.9831641912460327, + "step": 11720 + }, + { + "epoch": 0.91, + "grad_norm": 9.901228904724121, + "learning_rate": 1.924112294997804e-07, + "logits/chosen": -1.4346562623977661, + "logits/rejected": -1.0577850341796875, + "logps/chosen": -0.8075051307678223, + "logps/rejected": -7.850630283355713, + "loss": 0.8338, + "odds_ratio_loss": 0.2629486918449402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08075051009654999, + "rewards/margins": 0.7043125629425049, + "rewards/rejected": -0.7850630879402161, + "sft_loss": 0.8075051307678223, + "step": 11725 + }, + { + "epoch": 0.91, + "grad_norm": 6.227826118469238, + "learning_rate": 1.9072312730019471e-07, + "logits/chosen": -1.2854779958724976, + "logits/rejected": -1.1545909643173218, + "logps/chosen": -1.0335874557495117, + "logps/rejected": -4.049464702606201, + "loss": 1.041, + "odds_ratio_loss": 0.07380737364292145, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10335874557495117, + "rewards/margins": 0.3015877902507782, + "rewards/rejected": -0.404946506023407, + "sft_loss": 1.0335874557495117, + "step": 11730 + }, + { + "epoch": 0.91, + "grad_norm": 5.330779552459717, + "learning_rate": 1.8904231897959646e-07, + "logits/chosen": -1.3925060033798218, + "logits/rejected": -1.1478973627090454, + "logps/chosen": -0.9691879153251648, + "logps/rejected": -6.747610569000244, + "loss": 0.97, + "odds_ratio_loss": 0.007857967168092728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09691879153251648, + "rewards/margins": 0.5778422355651855, + "rewards/rejected": -0.6747610569000244, + "sft_loss": 0.9691879153251648, + "step": 11735 + }, + { + "epoch": 0.91, + "grad_norm": 3.6288249492645264, + "learning_rate": 1.8736880708714434e-07, + "logits/chosen": -1.3915761709213257, + "logits/rejected": -1.0295811891555786, + "logps/chosen": -1.1151198148727417, + "logps/rejected": -9.233034133911133, + "loss": 1.1344, + "odds_ratio_loss": 0.19316698610782623, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11151199042797089, + "rewards/margins": 0.8117914199829102, + "rewards/rejected": -0.9233034253120422, + "sft_loss": 1.1151198148727417, + "step": 11740 + }, + { + "epoch": 0.91, + "grad_norm": 34.54227066040039, + "learning_rate": 1.857025941609314e-07, + "logits/chosen": -1.496463418006897, + "logits/rejected": -1.3544741868972778, + "logps/chosen": -0.9142772555351257, + "logps/rejected": -13.582223892211914, + "loss": 0.9159, + "odds_ratio_loss": 0.016292279586195946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09142772853374481, + "rewards/margins": 1.2667948007583618, + "rewards/rejected": -1.358222484588623, + "sft_loss": 0.9142772555351257, + "step": 11745 + }, + { + "epoch": 0.91, + "grad_norm": 12.939098358154297, + "learning_rate": 1.840436827279818e-07, + "logits/chosen": -1.2878293991088867, + "logits/rejected": -1.8434340953826904, + "logps/chosen": -0.953102707862854, + "logps/rejected": -16.294424057006836, + "loss": 0.9532, + "odds_ratio_loss": 0.0005960009293630719, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0953102558851242, + "rewards/margins": 1.5341323614120483, + "rewards/rejected": -1.629442572593689, + "sft_loss": 0.953102707862854, + "step": 11750 + }, + { + "epoch": 0.91, + "grad_norm": 65.51850128173828, + "learning_rate": 1.823920753042441e-07, + "logits/chosen": -1.3374836444854736, + "logits/rejected": -0.9872108697891235, + "logps/chosen": -1.0601967573165894, + "logps/rejected": -6.977286338806152, + "loss": 1.0673, + "odds_ratio_loss": 0.07123270630836487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10601967573165894, + "rewards/margins": 0.5917090177536011, + "rewards/rejected": -0.6977287530899048, + "sft_loss": 1.0601967573165894, + "step": 11755 + }, + { + "epoch": 0.91, + "grad_norm": 16.67684555053711, + "learning_rate": 1.8074777439459234e-07, + "logits/chosen": -1.4826557636260986, + "logits/rejected": -1.2680238485336304, + "logps/chosen": -1.2555855512619019, + "logps/rejected": -12.583868026733398, + "loss": 1.2584, + "odds_ratio_loss": 0.028480147942900658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12555855512619019, + "rewards/margins": 1.1328282356262207, + "rewards/rejected": -1.2583868503570557, + "sft_loss": 1.2555855512619019, + "step": 11760 + }, + { + "epoch": 0.92, + "grad_norm": 11.102761268615723, + "learning_rate": 1.7911078249281676e-07, + "logits/chosen": -1.3503711223602295, + "logits/rejected": -1.167409896850586, + "logps/chosen": -0.8589099645614624, + "logps/rejected": -7.199994087219238, + "loss": 0.8662, + "odds_ratio_loss": 0.07283125072717667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.085890993475914, + "rewards/margins": 0.6341084241867065, + "rewards/rejected": -0.7199994325637817, + "sft_loss": 0.8589099645614624, + "step": 11765 + }, + { + "epoch": 0.92, + "grad_norm": 7.360010147094727, + "learning_rate": 1.7748110208162306e-07, + "logits/chosen": -1.4154503345489502, + "logits/rejected": -1.0518730878829956, + "logps/chosen": -1.017219066619873, + "logps/rejected": -5.722550392150879, + "loss": 1.0484, + "odds_ratio_loss": 0.311401903629303, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10172190517187119, + "rewards/margins": 0.4705330729484558, + "rewards/rejected": -0.57225501537323, + "sft_loss": 1.017219066619873, + "step": 11770 + }, + { + "epoch": 0.92, + "grad_norm": 15.793622970581055, + "learning_rate": 1.7585873563262911e-07, + "logits/chosen": -1.444927453994751, + "logits/rejected": -1.3852720260620117, + "logps/chosen": -1.205545425415039, + "logps/rejected": -3.7909297943115234, + "loss": 1.2489, + "odds_ratio_loss": 0.43350714445114136, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12055455148220062, + "rewards/margins": 0.2585384249687195, + "rewards/rejected": -0.3790929615497589, + "sft_loss": 1.205545425415039, + "step": 11775 + }, + { + "epoch": 0.92, + "grad_norm": 14.353672981262207, + "learning_rate": 1.7424368560635952e-07, + "logits/chosen": -1.3821165561676025, + "logits/rejected": -0.9330111742019653, + "logps/chosen": -0.8571823239326477, + "logps/rejected": -6.622941017150879, + "loss": 0.8578, + "odds_ratio_loss": 0.006473424378782511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08571822941303253, + "rewards/margins": 0.5765758752822876, + "rewards/rejected": -0.6622940897941589, + "sft_loss": 0.8571823239326477, + "step": 11780 + }, + { + "epoch": 0.92, + "grad_norm": 6.85159158706665, + "learning_rate": 1.7263595445224267e-07, + "logits/chosen": -1.3977632522583008, + "logits/rejected": -0.8465273976325989, + "logps/chosen": -0.9446234703063965, + "logps/rejected": -6.264396667480469, + "loss": 0.9558, + "odds_ratio_loss": 0.11224206537008286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09446235001087189, + "rewards/margins": 0.5319773554801941, + "rewards/rejected": -0.6264396905899048, + "sft_loss": 0.9446234703063965, + "step": 11785 + }, + { + "epoch": 0.92, + "grad_norm": 9.203495979309082, + "learning_rate": 1.710355446086065e-07, + "logits/chosen": -1.3460767269134521, + "logits/rejected": -1.2761690616607666, + "logps/chosen": -0.73606938123703, + "logps/rejected": -4.091338157653809, + "loss": 0.7445, + "odds_ratio_loss": 0.08461825549602509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0736069455742836, + "rewards/margins": 0.33552688360214233, + "rewards/rejected": -0.40913382172584534, + "sft_loss": 0.73606938123703, + "step": 11790 + }, + { + "epoch": 0.92, + "grad_norm": 22.562210083007812, + "learning_rate": 1.694424585026766e-07, + "logits/chosen": -1.2790594100952148, + "logits/rejected": -1.4777812957763672, + "logps/chosen": -0.8364423513412476, + "logps/rejected": -8.052239418029785, + "loss": 0.8528, + "odds_ratio_loss": 0.16366323828697205, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08364423364400864, + "rewards/margins": 0.7215796709060669, + "rewards/rejected": -0.8052239418029785, + "sft_loss": 0.8364423513412476, + "step": 11795 + }, + { + "epoch": 0.92, + "grad_norm": 5.110595703125, + "learning_rate": 1.6785669855056973e-07, + "logits/chosen": -1.2552369832992554, + "logits/rejected": -1.2118430137634277, + "logps/chosen": -0.8571056127548218, + "logps/rejected": -8.143725395202637, + "loss": 0.8688, + "odds_ratio_loss": 0.11663065105676651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0857105553150177, + "rewards/margins": 0.7286620736122131, + "rewards/rejected": -0.8143726587295532, + "sft_loss": 0.8571056127548218, + "step": 11800 + }, + { + "epoch": 0.92, + "grad_norm": 0.8979963064193726, + "learning_rate": 1.6627826715729266e-07, + "logits/chosen": -1.3609285354614258, + "logits/rejected": -1.329521894454956, + "logps/chosen": -0.5881875157356262, + "logps/rejected": -6.507943630218506, + "loss": 0.5898, + "odds_ratio_loss": 0.015659797936677933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.058818746358156204, + "rewards/margins": 0.5919756889343262, + "rewards/rejected": -0.6507944464683533, + "sft_loss": 0.5881875157356262, + "step": 11805 + }, + { + "epoch": 0.92, + "grad_norm": 8.27506160736084, + "learning_rate": 1.6470716671673603e-07, + "logits/chosen": -1.2430107593536377, + "logits/rejected": -0.9508243799209595, + "logps/chosen": -0.7933021783828735, + "logps/rejected": -6.344875812530518, + "loss": 0.7937, + "odds_ratio_loss": 0.003900631098076701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07933022081851959, + "rewards/margins": 0.5551573038101196, + "rewards/rejected": -0.6344875693321228, + "sft_loss": 0.7933021783828735, + "step": 11810 + }, + { + "epoch": 0.92, + "grad_norm": 62.29372024536133, + "learning_rate": 1.6314339961167435e-07, + "logits/chosen": -1.3524234294891357, + "logits/rejected": -1.283733606338501, + "logps/chosen": -0.9083568453788757, + "logps/rejected": -6.802338600158691, + "loss": 0.9266, + "odds_ratio_loss": 0.18251758813858032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09083569049835205, + "rewards/margins": 0.589398205280304, + "rewards/rejected": -0.6802338361740112, + "sft_loss": 0.9083568453788757, + "step": 11815 + }, + { + "epoch": 0.92, + "grad_norm": 7.945556640625, + "learning_rate": 1.6158696821375776e-07, + "logits/chosen": -1.3798269033432007, + "logits/rejected": -0.9315776824951172, + "logps/chosen": -1.0558592081069946, + "logps/rejected": -4.040333271026611, + "loss": 1.1, + "odds_ratio_loss": 0.44100436568260193, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10558591037988663, + "rewards/margins": 0.29844745993614197, + "rewards/rejected": -0.404033362865448, + "sft_loss": 1.0558592081069946, + "step": 11820 + }, + { + "epoch": 0.92, + "grad_norm": 7.206352710723877, + "learning_rate": 1.6003787488351298e-07, + "logits/chosen": -1.2845613956451416, + "logits/rejected": -1.0599045753479004, + "logps/chosen": -1.1459189653396606, + "logps/rejected": -7.448056697845459, + "loss": 1.1656, + "odds_ratio_loss": 0.1967560052871704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11459188163280487, + "rewards/margins": 0.6302137970924377, + "rewards/rejected": -0.7448056936264038, + "sft_loss": 1.1459189653396606, + "step": 11825 + }, + { + "epoch": 0.92, + "grad_norm": 9.853588104248047, + "learning_rate": 1.584961219703368e-07, + "logits/chosen": -1.4143199920654297, + "logits/rejected": -1.3166183233261108, + "logps/chosen": -0.7544041872024536, + "logps/rejected": -4.371842384338379, + "loss": 0.7706, + "odds_ratio_loss": 0.1623033583164215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0754404217004776, + "rewards/margins": 0.3617438077926636, + "rewards/rejected": -0.43718424439430237, + "sft_loss": 0.7544041872024536, + "step": 11830 + }, + { + "epoch": 0.92, + "grad_norm": 31.727481842041016, + "learning_rate": 1.569617118124922e-07, + "logits/chosen": -1.4243693351745605, + "logits/rejected": -1.200060248374939, + "logps/chosen": -1.0224988460540771, + "logps/rejected": -3.4999337196350098, + "loss": 1.1675, + "odds_ratio_loss": 1.4497023820877075, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1022498831152916, + "rewards/margins": 0.24774348735809326, + "rewards/rejected": -0.34999337792396545, + "sft_loss": 1.0224988460540771, + "step": 11835 + }, + { + "epoch": 0.92, + "grad_norm": 9.608902931213379, + "learning_rate": 1.5543464673710816e-07, + "logits/chosen": -1.4377899169921875, + "logits/rejected": -0.9992468953132629, + "logps/chosen": -0.9196946024894714, + "logps/rejected": -5.83154821395874, + "loss": 0.9301, + "odds_ratio_loss": 0.10370359569787979, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09196946769952774, + "rewards/margins": 0.49118536710739136, + "rewards/rejected": -0.5831547975540161, + "sft_loss": 0.9196946024894714, + "step": 11840 + }, + { + "epoch": 0.92, + "grad_norm": 7.982198715209961, + "learning_rate": 1.5391492906017268e-07, + "logits/chosen": -1.3677937984466553, + "logits/rejected": -1.2269057035446167, + "logps/chosen": -0.8073514103889465, + "logps/rejected": -7.5669379234313965, + "loss": 0.8105, + "odds_ratio_loss": 0.03112083114683628, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08073513954877853, + "rewards/margins": 0.6759586334228516, + "rewards/rejected": -0.7566937208175659, + "sft_loss": 0.8073514103889465, + "step": 11845 + }, + { + "epoch": 0.92, + "grad_norm": 18.15665626525879, + "learning_rate": 1.5240256108652986e-07, + "logits/chosen": -1.446323037147522, + "logits/rejected": -1.472093939781189, + "logps/chosen": -1.1066490411758423, + "logps/rejected": -9.583824157714844, + "loss": 1.1195, + "odds_ratio_loss": 0.12807539105415344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11066490411758423, + "rewards/margins": 0.8477176427841187, + "rewards/rejected": -0.9583824872970581, + "sft_loss": 1.1066490411758423, + "step": 11850 + }, + { + "epoch": 0.92, + "grad_norm": 6.199967384338379, + "learning_rate": 1.5089754510987875e-07, + "logits/chosen": -1.482166051864624, + "logits/rejected": -1.2701305150985718, + "logps/chosen": -1.2697360515594482, + "logps/rejected": -9.479783058166504, + "loss": 1.295, + "odds_ratio_loss": 0.2527654767036438, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12697359919548035, + "rewards/margins": 0.8210046887397766, + "rewards/rejected": -0.9479783177375793, + "sft_loss": 1.2697360515594482, + "step": 11855 + }, + { + "epoch": 0.92, + "grad_norm": 205.06369018554688, + "learning_rate": 1.493998834127658e-07, + "logits/chosen": -1.33724844455719, + "logits/rejected": -1.0690397024154663, + "logps/chosen": -1.4901643991470337, + "logps/rejected": -6.617243766784668, + "loss": 1.5093, + "odds_ratio_loss": 0.19102993607521057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14901643991470337, + "rewards/margins": 0.5127079486846924, + "rewards/rejected": -0.6617244482040405, + "sft_loss": 1.4901643991470337, + "step": 11860 + }, + { + "epoch": 0.92, + "grad_norm": 5.183155536651611, + "learning_rate": 1.4790957826658624e-07, + "logits/chosen": -1.3647806644439697, + "logits/rejected": -0.6501811742782593, + "logps/chosen": -1.1898683309555054, + "logps/rejected": -7.271353244781494, + "loss": 1.2328, + "odds_ratio_loss": 0.4295937418937683, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11898684501647949, + "rewards/margins": 0.608148455619812, + "rewards/rejected": -0.7271353006362915, + "sft_loss": 1.1898683309555054, + "step": 11865 + }, + { + "epoch": 0.92, + "grad_norm": 7.028154373168945, + "learning_rate": 1.4642663193157602e-07, + "logits/chosen": -1.2401490211486816, + "logits/rejected": -1.5572097301483154, + "logps/chosen": -0.8862310647964478, + "logps/rejected": -8.077818870544434, + "loss": 0.8873, + "odds_ratio_loss": 0.010762016288936138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08862310647964478, + "rewards/margins": 0.7191587686538696, + "rewards/rejected": -0.8077818751335144, + "sft_loss": 0.8862310647964478, + "step": 11870 + }, + { + "epoch": 0.92, + "grad_norm": 13.864395141601562, + "learning_rate": 1.449510466568127e-07, + "logits/chosen": -1.3671717643737793, + "logits/rejected": -1.4441094398498535, + "logps/chosen": -0.9278122782707214, + "logps/rejected": -5.194039344787598, + "loss": 0.9475, + "odds_ratio_loss": 0.19683992862701416, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09278122335672379, + "rewards/margins": 0.4266226887702942, + "rewards/rejected": -0.519403874874115, + "sft_loss": 0.9278122782707214, + "step": 11875 + }, + { + "epoch": 0.92, + "grad_norm": 12.953536033630371, + "learning_rate": 1.434828246802078e-07, + "logits/chosen": -1.3513822555541992, + "logits/rejected": -1.2330682277679443, + "logps/chosen": -0.8415525555610657, + "logps/rejected": -3.681126832962036, + "loss": 0.8629, + "odds_ratio_loss": 0.21303188800811768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08415525406599045, + "rewards/margins": 0.28395745158195496, + "rewards/rejected": -0.368112713098526, + "sft_loss": 0.8415525555610657, + "step": 11880 + }, + { + "epoch": 0.92, + "grad_norm": 29.238855361938477, + "learning_rate": 1.420219682285062e-07, + "logits/chosen": -1.2675743103027344, + "logits/rejected": -1.1817070245742798, + "logps/chosen": -1.044460654258728, + "logps/rejected": -2.4189651012420654, + "loss": 1.0879, + "odds_ratio_loss": 0.4343862533569336, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10444607585668564, + "rewards/margins": 0.1374504566192627, + "rewards/rejected": -0.24189653992652893, + "sft_loss": 1.044460654258728, + "step": 11885 + }, + { + "epoch": 0.92, + "grad_norm": 5.407149314880371, + "learning_rate": 1.4056847951728404e-07, + "logits/chosen": -1.2687222957611084, + "logits/rejected": -1.027491569519043, + "logps/chosen": -0.9911456108093262, + "logps/rejected": -4.098135948181152, + "loss": 1.0073, + "odds_ratio_loss": 0.16173891723155975, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0991145670413971, + "rewards/margins": 0.31069907546043396, + "rewards/rejected": -0.40981364250183105, + "sft_loss": 0.9911456108093262, + "step": 11890 + }, + { + "epoch": 0.93, + "grad_norm": 9.799278259277344, + "learning_rate": 1.3912236075093955e-07, + "logits/chosen": -1.4086112976074219, + "logits/rejected": -0.8215053677558899, + "logps/chosen": -1.0313136577606201, + "logps/rejected": -4.323803901672363, + "loss": 1.0426, + "odds_ratio_loss": 0.1132875308394432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10313136875629425, + "rewards/margins": 0.3292490243911743, + "rewards/rejected": -0.4323803782463074, + "sft_loss": 1.0313136577606201, + "step": 11895 + }, + { + "epoch": 0.93, + "grad_norm": 5.346977233886719, + "learning_rate": 1.376836141226956e-07, + "logits/chosen": -1.3889329433441162, + "logits/rejected": -0.7270419001579285, + "logps/chosen": -1.3125561475753784, + "logps/rejected": -11.502326011657715, + "loss": 1.3134, + "odds_ratio_loss": 0.008127482607960701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1312556117773056, + "rewards/margins": 1.0189769268035889, + "rewards/rejected": -1.1502325534820557, + "sft_loss": 1.3125561475753784, + "step": 11900 + }, + { + "epoch": 0.93, + "grad_norm": 7.43550968170166, + "learning_rate": 1.3625224181459507e-07, + "logits/chosen": -1.271698236465454, + "logits/rejected": -1.1763298511505127, + "logps/chosen": -1.30789053440094, + "logps/rejected": -4.681346893310547, + "loss": 1.3176, + "odds_ratio_loss": 0.09740128368139267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13078904151916504, + "rewards/margins": 0.3373456597328186, + "rewards/rejected": -0.46813470125198364, + "sft_loss": 1.30789053440094, + "step": 11905 + }, + { + "epoch": 0.93, + "grad_norm": 6.522105693817139, + "learning_rate": 1.3482824599749534e-07, + "logits/chosen": -1.300004005432129, + "logits/rejected": -1.1260687112808228, + "logps/chosen": -0.82489013671875, + "logps/rejected": -7.8272504806518555, + "loss": 0.8343, + "odds_ratio_loss": 0.0938338190317154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0824890211224556, + "rewards/margins": 0.7002360820770264, + "rewards/rejected": -0.7827251553535461, + "sft_loss": 0.82489013671875, + "step": 11910 + }, + { + "epoch": 0.93, + "grad_norm": 25.62769889831543, + "learning_rate": 1.3341162883106662e-07, + "logits/chosen": -1.531531572341919, + "logits/rejected": -1.0012396574020386, + "logps/chosen": -1.1209322214126587, + "logps/rejected": -6.971850395202637, + "loss": 1.1345, + "odds_ratio_loss": 0.13544395565986633, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1120932325720787, + "rewards/margins": 0.5850918292999268, + "rewards/rejected": -0.6971850395202637, + "sft_loss": 1.1209322214126587, + "step": 11915 + }, + { + "epoch": 0.93, + "grad_norm": 991.5822143554688, + "learning_rate": 1.320023924637892e-07, + "logits/chosen": -1.2943589687347412, + "logits/rejected": -1.1523224115371704, + "logps/chosen": -1.4653940200805664, + "logps/rejected": -9.920454978942871, + "loss": 1.467, + "odds_ratio_loss": 0.016162164509296417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14653940498828888, + "rewards/margins": 0.8455060720443726, + "rewards/rejected": -0.9920455813407898, + "sft_loss": 1.4653940200805664, + "step": 11920 + }, + { + "epoch": 0.93, + "grad_norm": 20.709386825561523, + "learning_rate": 1.3060053903294846e-07, + "logits/chosen": -1.2096668481826782, + "logits/rejected": -0.8821192979812622, + "logps/chosen": -0.8613446950912476, + "logps/rejected": -7.154592990875244, + "loss": 0.8665, + "odds_ratio_loss": 0.05132795497775078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08613447099924088, + "rewards/margins": 0.6293249130249023, + "rewards/rejected": -0.7154593467712402, + "sft_loss": 0.8613446950912476, + "step": 11925 + }, + { + "epoch": 0.93, + "grad_norm": 5.921295642852783, + "learning_rate": 1.2920607066463365e-07, + "logits/chosen": -1.434833288192749, + "logits/rejected": -1.0353233814239502, + "logps/chosen": -1.550927758216858, + "logps/rejected": -9.426887512207031, + "loss": 1.5513, + "odds_ratio_loss": 0.00328804855234921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15509279072284698, + "rewards/margins": 0.7875960469245911, + "rewards/rejected": -0.9426888227462769, + "sft_loss": 1.550927758216858, + "step": 11930 + }, + { + "epoch": 0.93, + "grad_norm": 393.1823425292969, + "learning_rate": 1.2781898947373195e-07, + "logits/chosen": -1.4327385425567627, + "logits/rejected": -1.2880324125289917, + "logps/chosen": -1.134372353553772, + "logps/rejected": -8.846219062805176, + "loss": 1.1626, + "odds_ratio_loss": 0.2817860245704651, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1134372353553772, + "rewards/margins": 0.7711846232414246, + "rewards/rejected": -0.8846219182014465, + "sft_loss": 1.134372353553772, + "step": 11935 + }, + { + "epoch": 0.93, + "grad_norm": 3.882423162460327, + "learning_rate": 1.2643929756392947e-07, + "logits/chosen": -1.3471843004226685, + "logits/rejected": -0.8193763494491577, + "logps/chosen": -1.3053816556930542, + "logps/rejected": -6.060335636138916, + "loss": 1.3227, + "odds_ratio_loss": 0.17290352284908295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1305381804704666, + "rewards/margins": 0.4754953980445862, + "rewards/rejected": -0.6060335636138916, + "sft_loss": 1.3053816556930542, + "step": 11940 + }, + { + "epoch": 0.93, + "grad_norm": 80.9352798461914, + "learning_rate": 1.2506699702770354e-07, + "logits/chosen": -1.4275892972946167, + "logits/rejected": -0.9729791879653931, + "logps/chosen": -1.1546369791030884, + "logps/rejected": -10.712817192077637, + "loss": 1.1667, + "odds_ratio_loss": 0.12075658142566681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11546371132135391, + "rewards/margins": 0.9558179974555969, + "rewards/rejected": -1.0712816715240479, + "sft_loss": 1.1546369791030884, + "step": 11945 + }, + { + "epoch": 0.93, + "grad_norm": 9.213315963745117, + "learning_rate": 1.2370208994632205e-07, + "logits/chosen": -1.321332573890686, + "logits/rejected": -0.8984807729721069, + "logps/chosen": -0.9224896430969238, + "logps/rejected": -4.597050666809082, + "loss": 0.9979, + "odds_ratio_loss": 0.754401683807373, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.09224896878004074, + "rewards/margins": 0.3674561083316803, + "rewards/rejected": -0.45970505475997925, + "sft_loss": 0.9224896430969238, + "step": 11950 + }, + { + "epoch": 0.93, + "grad_norm": 644.9072875976562, + "learning_rate": 1.2234457838984028e-07, + "logits/chosen": -1.2838728427886963, + "logits/rejected": -1.2300163507461548, + "logps/chosen": -1.6868120431900024, + "logps/rejected": -8.237083435058594, + "loss": 1.6892, + "odds_ratio_loss": 0.02369215339422226, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16868120431900024, + "rewards/margins": 0.6550270915031433, + "rewards/rejected": -0.8237083554267883, + "sft_loss": 1.6868120431900024, + "step": 11955 + }, + { + "epoch": 0.93, + "grad_norm": 32.0411376953125, + "learning_rate": 1.2099446441709628e-07, + "logits/chosen": -1.110761284828186, + "logits/rejected": -1.6298236846923828, + "logps/chosen": -0.7495521306991577, + "logps/rejected": -11.462425231933594, + "loss": 0.7499, + "odds_ratio_loss": 0.0036002404522150755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07495521008968353, + "rewards/margins": 1.0712873935699463, + "rewards/rejected": -1.1462424993515015, + "sft_loss": 0.7495521306991577, + "step": 11960 + }, + { + "epoch": 0.93, + "grad_norm": 5.815232276916504, + "learning_rate": 1.1965175007571052e-07, + "logits/chosen": -1.33306086063385, + "logits/rejected": -1.0084116458892822, + "logps/chosen": -1.0075197219848633, + "logps/rejected": -7.959680080413818, + "loss": 1.0209, + "odds_ratio_loss": 0.133327916264534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10075198113918304, + "rewards/margins": 0.6952160596847534, + "rewards/rejected": -0.7959679365158081, + "sft_loss": 1.0075197219848633, + "step": 11965 + }, + { + "epoch": 0.93, + "grad_norm": 8.307555198669434, + "learning_rate": 1.1831643740207844e-07, + "logits/chosen": -1.2692220211029053, + "logits/rejected": -0.9586740732192993, + "logps/chosen": -0.9765459895133972, + "logps/rejected": -3.8916027545928955, + "loss": 0.9861, + "odds_ratio_loss": 0.09511784464120865, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09765460342168808, + "rewards/margins": 0.2915056645870209, + "rewards/rejected": -0.38916027545928955, + "sft_loss": 0.9765459895133972, + "step": 11970 + }, + { + "epoch": 0.93, + "grad_norm": 26.505313873291016, + "learning_rate": 1.1698852842137176e-07, + "logits/chosen": -1.3430640697479248, + "logits/rejected": -1.318647027015686, + "logps/chosen": -0.8846755027770996, + "logps/rejected": -11.83702278137207, + "loss": 0.9585, + "odds_ratio_loss": 0.7379862666130066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0884675532579422, + "rewards/margins": 1.095234751701355, + "rewards/rejected": -1.1837023496627808, + "sft_loss": 0.8846755027770996, + "step": 11975 + }, + { + "epoch": 0.93, + "grad_norm": 8.90914535522461, + "learning_rate": 1.1566802514753284e-07, + "logits/chosen": -1.4078346490859985, + "logits/rejected": -1.1965572834014893, + "logps/chosen": -0.8551615476608276, + "logps/rejected": -5.059370994567871, + "loss": 0.8636, + "odds_ratio_loss": 0.08399681746959686, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08551616221666336, + "rewards/margins": 0.42042097449302673, + "rewards/rejected": -0.5059371590614319, + "sft_loss": 0.8551615476608276, + "step": 11980 + }, + { + "epoch": 0.93, + "grad_norm": 8.81097412109375, + "learning_rate": 1.1435492958327243e-07, + "logits/chosen": -1.3314521312713623, + "logits/rejected": -1.2199690341949463, + "logps/chosen": -0.9615247845649719, + "logps/rejected": -7.449499607086182, + "loss": 0.9763, + "odds_ratio_loss": 0.1473933905363083, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09615248441696167, + "rewards/margins": 0.6487975120544434, + "rewards/rejected": -0.744949996471405, + "sft_loss": 0.9615247845649719, + "step": 11985 + }, + { + "epoch": 0.93, + "grad_norm": 5.545321941375732, + "learning_rate": 1.1304924372006754e-07, + "logits/chosen": -1.4224355220794678, + "logits/rejected": -0.9253866076469421, + "logps/chosen": -1.4553496837615967, + "logps/rejected": -7.184508323669434, + "loss": 1.4743, + "odds_ratio_loss": 0.18964393436908722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1455349624156952, + "rewards/margins": 0.5729159116744995, + "rewards/rejected": -0.7184508442878723, + "sft_loss": 1.4553496837615967, + "step": 11990 + }, + { + "epoch": 0.93, + "grad_norm": 4.908974647521973, + "learning_rate": 1.1175096953815578e-07, + "logits/chosen": -1.211637020111084, + "logits/rejected": -1.2191622257232666, + "logps/chosen": -1.2892272472381592, + "logps/rejected": -13.501127243041992, + "loss": 1.337, + "odds_ratio_loss": 0.4774898588657379, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1289227306842804, + "rewards/margins": 1.221190094947815, + "rewards/rejected": -1.350112795829773, + "sft_loss": 1.2892272472381592, + "step": 11995 + }, + { + "epoch": 0.93, + "grad_norm": 12.566999435424805, + "learning_rate": 1.1046010900653492e-07, + "logits/chosen": -1.3955318927764893, + "logits/rejected": -0.9571416974067688, + "logps/chosen": -0.9366915822029114, + "logps/rejected": -7.75982141494751, + "loss": 0.9803, + "odds_ratio_loss": 0.43617886304855347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09366915374994278, + "rewards/margins": 0.6823130249977112, + "rewards/rejected": -0.775982141494751, + "sft_loss": 0.9366915822029114, + "step": 12000 + }, + { + "epoch": 0.93, + "grad_norm": 35.92390441894531, + "learning_rate": 1.0917666408295891e-07, + "logits/chosen": -1.4041101932525635, + "logits/rejected": -1.0649272203445435, + "logps/chosen": -0.9326528310775757, + "logps/rejected": -2.555864095687866, + "loss": 0.9705, + "odds_ratio_loss": 0.37830302119255066, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09326529502868652, + "rewards/margins": 0.16232113540172577, + "rewards/rejected": -0.2555864453315735, + "sft_loss": 0.9326528310775757, + "step": 12005 + }, + { + "epoch": 0.93, + "grad_norm": 7.222818374633789, + "learning_rate": 1.0790063671393514e-07, + "logits/chosen": -1.2876554727554321, + "logits/rejected": -1.0530130863189697, + "logps/chosen": -0.8531826138496399, + "logps/rejected": -6.352944850921631, + "loss": 0.8805, + "odds_ratio_loss": 0.2727716565132141, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08531826734542847, + "rewards/margins": 0.5499762296676636, + "rewards/rejected": -0.6352945566177368, + "sft_loss": 0.8531826138496399, + "step": 12010 + }, + { + "epoch": 0.93, + "grad_norm": 78.87431335449219, + "learning_rate": 1.0663202883472056e-07, + "logits/chosen": -1.2207971811294556, + "logits/rejected": -1.0271674394607544, + "logps/chosen": -0.8058179616928101, + "logps/rejected": -8.2181396484375, + "loss": 0.8072, + "odds_ratio_loss": 0.013833925127983093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08058180660009384, + "rewards/margins": 0.74123215675354, + "rewards/rejected": -0.8218139410018921, + "sft_loss": 0.8058179616928101, + "step": 12015 + }, + { + "epoch": 0.94, + "grad_norm": 5.772078990936279, + "learning_rate": 1.0537084236932116e-07, + "logits/chosen": -1.1963145732879639, + "logits/rejected": -0.8109409213066101, + "logps/chosen": -0.7392154335975647, + "logps/rejected": -6.433381080627441, + "loss": 0.7564, + "odds_ratio_loss": 0.171526238322258, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07392154633998871, + "rewards/margins": 0.5694166421890259, + "rewards/rejected": -0.6433380842208862, + "sft_loss": 0.7392154335975647, + "step": 12020 + }, + { + "epoch": 0.94, + "grad_norm": 28.807552337646484, + "learning_rate": 1.041170792304852e-07, + "logits/chosen": -1.1929874420166016, + "logits/rejected": -1.4887889623641968, + "logps/chosen": -1.1478734016418457, + "logps/rejected": -8.757756233215332, + "loss": 1.1484, + "odds_ratio_loss": 0.0055931126698851585, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11478734016418457, + "rewards/margins": 0.7609882354736328, + "rewards/rejected": -0.8757756352424622, + "sft_loss": 1.1478734016418457, + "step": 12025 + }, + { + "epoch": 0.94, + "grad_norm": 6.5143585205078125, + "learning_rate": 1.0287074131970387e-07, + "logits/chosen": -1.3817237615585327, + "logits/rejected": -1.1475965976715088, + "logps/chosen": -1.1112757921218872, + "logps/rejected": -5.863193035125732, + "loss": 1.118, + "odds_ratio_loss": 0.06756018102169037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1111275777220726, + "rewards/margins": 0.47519174218177795, + "rewards/rejected": -0.5863193273544312, + "sft_loss": 1.1112757921218872, + "step": 12030 + }, + { + "epoch": 0.94, + "grad_norm": 4.472811698913574, + "learning_rate": 1.0163183052720793e-07, + "logits/chosen": -1.0868616104125977, + "logits/rejected": -1.0630944967269897, + "logps/chosen": -0.9792786836624146, + "logps/rejected": -5.954896450042725, + "loss": 1.0019, + "odds_ratio_loss": 0.22617638111114502, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09792786091566086, + "rewards/margins": 0.4975617825984955, + "rewards/rejected": -0.5954896807670593, + "sft_loss": 0.9792786836624146, + "step": 12035 + }, + { + "epoch": 0.94, + "grad_norm": 15.370599746704102, + "learning_rate": 1.0040034873196158e-07, + "logits/chosen": -1.3447648286819458, + "logits/rejected": -1.0325714349746704, + "logps/chosen": -0.956787109375, + "logps/rejected": -2.4064972400665283, + "loss": 0.9877, + "odds_ratio_loss": 0.3093990385532379, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09567871689796448, + "rewards/margins": 0.14497099816799164, + "rewards/rejected": -0.2406497299671173, + "sft_loss": 0.956787109375, + "step": 12040 + }, + { + "epoch": 0.94, + "grad_norm": 9.366061210632324, + "learning_rate": 9.91762978016636e-08, + "logits/chosen": -1.3803406953811646, + "logits/rejected": -0.5240996479988098, + "logps/chosen": -1.0151456594467163, + "logps/rejected": -7.575808525085449, + "loss": 1.0166, + "odds_ratio_loss": 0.014451740309596062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10151456296443939, + "rewards/margins": 0.6560662984848022, + "rewards/rejected": -0.7575808763504028, + "sft_loss": 1.0151456594467163, + "step": 12045 + }, + { + "epoch": 0.94, + "grad_norm": 6.190619945526123, + "learning_rate": 9.795967959274233e-08, + "logits/chosen": -1.1493083238601685, + "logits/rejected": -1.1575744152069092, + "logps/chosen": -0.8835114240646362, + "logps/rejected": -9.733977317810059, + "loss": 0.898, + "odds_ratio_loss": 0.1443968117237091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08835114538669586, + "rewards/margins": 0.8850466012954712, + "rewards/rejected": -0.9733978509902954, + "sft_loss": 0.8835114240646362, + "step": 12050 + }, + { + "epoch": 0.94, + "grad_norm": 5.491006374359131, + "learning_rate": 9.675049595035512e-08, + "logits/chosen": -1.180174708366394, + "logits/rejected": -0.6385637521743774, + "logps/chosen": -1.3686999082565308, + "logps/rejected": -13.49418830871582, + "loss": 1.3689, + "odds_ratio_loss": 0.0024213658180087805, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13686999678611755, + "rewards/margins": 1.2125487327575684, + "rewards/rejected": -1.3494187593460083, + "sft_loss": 1.3686999082565308, + "step": 12055 + }, + { + "epoch": 0.94, + "grad_norm": 10.283293724060059, + "learning_rate": 9.554874870838116e-08, + "logits/chosen": -1.3981066942214966, + "logits/rejected": -1.540945291519165, + "logps/chosen": -0.962783932685852, + "logps/rejected": -6.7717413902282715, + "loss": 0.9941, + "odds_ratio_loss": 0.31289738416671753, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09627839177846909, + "rewards/margins": 0.5808957815170288, + "rewards/rejected": -0.6771741509437561, + "sft_loss": 0.962783932685852, + "step": 12060 + }, + { + "epoch": 0.94, + "grad_norm": 17.973472595214844, + "learning_rate": 9.435443968942304e-08, + "logits/chosen": -1.2509690523147583, + "logits/rejected": -0.672872006893158, + "logps/chosen": -1.6948953866958618, + "logps/rejected": -6.384387969970703, + "loss": 1.7539, + "odds_ratio_loss": 0.5902327299118042, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1694895476102829, + "rewards/margins": 0.46894925832748413, + "rewards/rejected": -0.6384388208389282, + "sft_loss": 1.6948953866958618, + "step": 12065 + }, + { + "epoch": 0.94, + "grad_norm": 46.34778594970703, + "learning_rate": 9.316757070480242e-08, + "logits/chosen": -1.2243728637695312, + "logits/rejected": -0.6550508737564087, + "logps/chosen": -1.2963842153549194, + "logps/rejected": -4.396199703216553, + "loss": 1.3026, + "odds_ratio_loss": 0.0616978295147419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1296384334564209, + "rewards/margins": 0.3099815547466278, + "rewards/rejected": -0.4396200180053711, + "sft_loss": 1.2963842153549194, + "step": 12070 + }, + { + "epoch": 0.94, + "grad_norm": 14.527528762817383, + "learning_rate": 9.198814355455666e-08, + "logits/chosen": -1.4315307140350342, + "logits/rejected": -1.2783674001693726, + "logps/chosen": -0.5937285423278809, + "logps/rejected": -5.994349002838135, + "loss": 0.6022, + "odds_ratio_loss": 0.08455837517976761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.059372853487730026, + "rewards/margins": 0.5400620698928833, + "rewards/rejected": -0.5994349718093872, + "sft_loss": 0.5937285423278809, + "step": 12075 + }, + { + "epoch": 0.94, + "grad_norm": 14.392807006835938, + "learning_rate": 9.08161600274371e-08, + "logits/chosen": -1.3758456707000732, + "logits/rejected": -0.9627407789230347, + "logps/chosen": -1.0425665378570557, + "logps/rejected": -3.557486057281494, + "loss": 1.1027, + "odds_ratio_loss": 0.6015771627426147, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10425666719675064, + "rewards/margins": 0.2514919340610504, + "rewards/rejected": -0.35574859380722046, + "sft_loss": 1.0425665378570557, + "step": 12080 + }, + { + "epoch": 0.94, + "grad_norm": 13.73567008972168, + "learning_rate": 8.965162190090415e-08, + "logits/chosen": -1.4193708896636963, + "logits/rejected": -0.9948325157165527, + "logps/chosen": -1.0957233905792236, + "logps/rejected": -4.032468795776367, + "loss": 1.1318, + "odds_ratio_loss": 0.361260324716568, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10957233607769012, + "rewards/margins": 0.293674498796463, + "rewards/rejected": -0.4032468795776367, + "sft_loss": 1.0957233905792236, + "step": 12085 + }, + { + "epoch": 0.94, + "grad_norm": 221.73533630371094, + "learning_rate": 8.849453094112947e-08, + "logits/chosen": -1.3586000204086304, + "logits/rejected": -0.698357343673706, + "logps/chosen": -1.1419107913970947, + "logps/rejected": -3.3975303173065186, + "loss": 1.1667, + "odds_ratio_loss": 0.24787549674510956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11419107764959335, + "rewards/margins": 0.2255619764328003, + "rewards/rejected": -0.33975309133529663, + "sft_loss": 1.1419107913970947, + "step": 12090 + }, + { + "epoch": 0.94, + "grad_norm": 8.304841995239258, + "learning_rate": 8.734488890298765e-08, + "logits/chosen": -1.3388170003890991, + "logits/rejected": -0.8920143842697144, + "logps/chosen": -0.9899235963821411, + "logps/rejected": -5.051036834716797, + "loss": 1.0105, + "odds_ratio_loss": 0.20586714148521423, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09899236261844635, + "rewards/margins": 0.40611138939857483, + "rewards/rejected": -0.5051037073135376, + "sft_loss": 0.9899235963821411, + "step": 12095 + }, + { + "epoch": 0.94, + "grad_norm": 61.597381591796875, + "learning_rate": 8.620269753005617e-08, + "logits/chosen": -1.508922815322876, + "logits/rejected": -0.8960739374160767, + "logps/chosen": -1.2426116466522217, + "logps/rejected": -8.366430282592773, + "loss": 1.2661, + "odds_ratio_loss": 0.23516330122947693, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12426116317510605, + "rewards/margins": 0.7123818397521973, + "rewards/rejected": -0.8366430401802063, + "sft_loss": 1.2426116466522217, + "step": 12100 + }, + { + "epoch": 0.94, + "grad_norm": 14.718682289123535, + "learning_rate": 8.506795855461381e-08, + "logits/chosen": -1.3858963251113892, + "logits/rejected": -1.4522991180419922, + "logps/chosen": -1.1231282949447632, + "logps/rejected": -5.925871849060059, + "loss": 1.1419, + "odds_ratio_loss": 0.18807430565357208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11231283843517303, + "rewards/margins": 0.4802742898464203, + "rewards/rejected": -0.5925871729850769, + "sft_loss": 1.1231282949447632, + "step": 12105 + }, + { + "epoch": 0.94, + "grad_norm": 27.117656707763672, + "learning_rate": 8.394067369763725e-08, + "logits/chosen": -1.331491470336914, + "logits/rejected": -1.3383785486221313, + "logps/chosen": -0.893126368522644, + "logps/rejected": -4.682895183563232, + "loss": 0.908, + "odds_ratio_loss": 0.14834722876548767, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08931264281272888, + "rewards/margins": 0.37897688150405884, + "rewards/rejected": -0.4682895541191101, + "sft_loss": 0.893126368522644, + "step": 12110 + }, + { + "epoch": 0.94, + "grad_norm": 10.441815376281738, + "learning_rate": 8.282084466879503e-08, + "logits/chosen": -1.4479217529296875, + "logits/rejected": -1.075995683670044, + "logps/chosen": -0.9278053045272827, + "logps/rejected": -6.618409633636475, + "loss": 0.9594, + "odds_ratio_loss": 0.3158452808856964, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09278053045272827, + "rewards/margins": 0.5690604448318481, + "rewards/rejected": -0.6618409156799316, + "sft_loss": 0.9278053045272827, + "step": 12115 + }, + { + "epoch": 0.94, + "grad_norm": 11.735318183898926, + "learning_rate": 8.170847316645247e-08, + "logits/chosen": -1.4747486114501953, + "logits/rejected": -1.2501027584075928, + "logps/chosen": -0.8782304525375366, + "logps/rejected": -5.13653039932251, + "loss": 0.8838, + "odds_ratio_loss": 0.05616645887494087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0878230482339859, + "rewards/margins": 0.4258299767971039, + "rewards/rejected": -0.5136530995368958, + "sft_loss": 0.8782304525375366, + "step": 12120 + }, + { + "epoch": 0.94, + "grad_norm": 11.163629531860352, + "learning_rate": 8.060356087766063e-08, + "logits/chosen": -1.396041989326477, + "logits/rejected": -1.07887601852417, + "logps/chosen": -1.0224263668060303, + "logps/rejected": -2.5870959758758545, + "loss": 1.062, + "odds_ratio_loss": 0.3955123722553253, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10224263370037079, + "rewards/margins": 0.1564669907093048, + "rewards/rejected": -0.2587096095085144, + "sft_loss": 1.0224263668060303, + "step": 12125 + }, + { + "epoch": 0.94, + "grad_norm": 9.195205688476562, + "learning_rate": 7.950610947815907e-08, + "logits/chosen": -1.3456220626831055, + "logits/rejected": -0.5792462229728699, + "logps/chosen": -0.9115538597106934, + "logps/rejected": -1.8077294826507568, + "loss": 0.9513, + "odds_ratio_loss": 0.39775362610816956, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09115538746118546, + "rewards/margins": 0.08961758762598038, + "rewards/rejected": -0.18077296018600464, + "sft_loss": 0.9115538597106934, + "step": 12130 + }, + { + "epoch": 0.94, + "grad_norm": 194.4700469970703, + "learning_rate": 7.841612063237303e-08, + "logits/chosen": -1.3239562511444092, + "logits/rejected": -1.120084285736084, + "logps/chosen": -0.8774365186691284, + "logps/rejected": -9.447786331176758, + "loss": 0.9098, + "odds_ratio_loss": 0.3234691321849823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08774365484714508, + "rewards/margins": 0.8570348620414734, + "rewards/rejected": -0.944778561592102, + "sft_loss": 0.8774365186691284, + "step": 12135 + }, + { + "epoch": 0.94, + "grad_norm": 9.07659912109375, + "learning_rate": 7.733359599340906e-08, + "logits/chosen": -1.4422779083251953, + "logits/rejected": -1.4027847051620483, + "logps/chosen": -1.2413756847381592, + "logps/rejected": -4.607026100158691, + "loss": 1.2533, + "odds_ratio_loss": 0.1192028746008873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12413756549358368, + "rewards/margins": 0.3365650475025177, + "rewards/rejected": -0.4607025980949402, + "sft_loss": 1.2413756847381592, + "step": 12140 + }, + { + "epoch": 0.94, + "grad_norm": 158.15708923339844, + "learning_rate": 7.625853720305276e-08, + "logits/chosen": -1.3283551931381226, + "logits/rejected": -1.3912734985351562, + "logps/chosen": -0.695563018321991, + "logps/rejected": -8.136407852172852, + "loss": 0.6959, + "odds_ratio_loss": 0.0031926899682730436, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06955631077289581, + "rewards/margins": 0.7440845370292664, + "rewards/rejected": -0.8136407732963562, + "sft_loss": 0.695563018321991, + "step": 12145 + }, + { + "epoch": 0.95, + "grad_norm": 10.927029609680176, + "learning_rate": 7.519094589176711e-08, + "logits/chosen": -1.3552180528640747, + "logits/rejected": -0.7659333944320679, + "logps/chosen": -0.9943283200263977, + "logps/rejected": -6.943314552307129, + "loss": 0.999, + "odds_ratio_loss": 0.04680733382701874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09943283349275589, + "rewards/margins": 0.5948985815048218, + "rewards/rejected": -0.6943314671516418, + "sft_loss": 0.9943283200263977, + "step": 12150 + }, + { + "epoch": 0.95, + "grad_norm": 12.59920883178711, + "learning_rate": 7.41308236786903e-08, + "logits/chosen": -1.3633660078048706, + "logits/rejected": -1.2992112636566162, + "logps/chosen": -1.0420236587524414, + "logps/rejected": -10.714728355407715, + "loss": 1.0427, + "odds_ratio_loss": 0.006580235902220011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10420236736536026, + "rewards/margins": 0.9672704935073853, + "rewards/rejected": -1.0714728832244873, + "sft_loss": 1.0420236587524414, + "step": 12155 + }, + { + "epoch": 0.95, + "grad_norm": 4.964155673980713, + "learning_rate": 7.307817217163226e-08, + "logits/chosen": -1.3251888751983643, + "logits/rejected": -0.5417923927307129, + "logps/chosen": -0.9761824607849121, + "logps/rejected": -8.093366622924805, + "loss": 0.9885, + "odds_ratio_loss": 0.12305097281932831, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09761824458837509, + "rewards/margins": 0.7117183804512024, + "rewards/rejected": -0.8093365430831909, + "sft_loss": 0.9761824607849121, + "step": 12160 + }, + { + "epoch": 0.95, + "grad_norm": 104.98523712158203, + "learning_rate": 7.203299296707156e-08, + "logits/chosen": -1.3231998682022095, + "logits/rejected": -1.129874348640442, + "logps/chosen": -1.044856071472168, + "logps/rejected": -5.618155479431152, + "loss": 1.0523, + "odds_ratio_loss": 0.07447656989097595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10448561608791351, + "rewards/margins": 0.45732998847961426, + "rewards/rejected": -0.5618155598640442, + "sft_loss": 1.044856071472168, + "step": 12165 + }, + { + "epoch": 0.95, + "grad_norm": 5.787536144256592, + "learning_rate": 7.099528765015684e-08, + "logits/chosen": -1.4294707775115967, + "logits/rejected": -1.1909528970718384, + "logps/chosen": -1.0964525938034058, + "logps/rejected": -2.062704563140869, + "loss": 1.1325, + "odds_ratio_loss": 0.360903263092041, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10964526236057281, + "rewards/margins": 0.09662520885467529, + "rewards/rejected": -0.20627045631408691, + "sft_loss": 1.0964525938034058, + "step": 12170 + }, + { + "epoch": 0.95, + "grad_norm": 10.634527206420898, + "learning_rate": 6.996505779469976e-08, + "logits/chosen": -1.3393810987472534, + "logits/rejected": -1.0690622329711914, + "logps/chosen": -0.9409956932067871, + "logps/rejected": -6.221067905426025, + "loss": 0.9422, + "odds_ratio_loss": 0.011605637148022652, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09409955888986588, + "rewards/margins": 0.5280072689056396, + "rewards/rejected": -0.6221068501472473, + "sft_loss": 0.9409956932067871, + "step": 12175 + }, + { + "epoch": 0.95, + "grad_norm": 6.429897785186768, + "learning_rate": 6.894230496317322e-08, + "logits/chosen": -1.4484320878982544, + "logits/rejected": -0.8284847140312195, + "logps/chosen": -1.3208070993423462, + "logps/rejected": -7.3359880447387695, + "loss": 1.335, + "odds_ratio_loss": 0.1418505609035492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13208071887493134, + "rewards/margins": 0.6015180349349976, + "rewards/rejected": -0.7335987687110901, + "sft_loss": 1.3208070993423462, + "step": 12180 + }, + { + "epoch": 0.95, + "grad_norm": 7.727414131164551, + "learning_rate": 6.792703070671258e-08, + "logits/chosen": -1.229412317276001, + "logits/rejected": -1.1320011615753174, + "logps/chosen": -0.8664442896842957, + "logps/rejected": -10.342384338378906, + "loss": 0.8741, + "odds_ratio_loss": 0.07685581594705582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08664443343877792, + "rewards/margins": 0.9475939869880676, + "rewards/rejected": -1.0342384576797485, + "sft_loss": 0.8664442896842957, + "step": 12185 + }, + { + "epoch": 0.95, + "grad_norm": 8.296463012695312, + "learning_rate": 6.691923656511112e-08, + "logits/chosen": -1.295880913734436, + "logits/rejected": -1.4863216876983643, + "logps/chosen": -1.0527490377426147, + "logps/rejected": -10.10430908203125, + "loss": 1.0625, + "odds_ratio_loss": 0.09733770787715912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10527490079402924, + "rewards/margins": 0.9051558375358582, + "rewards/rejected": -1.0104308128356934, + "sft_loss": 1.0527490377426147, + "step": 12190 + }, + { + "epoch": 0.95, + "grad_norm": 395.551513671875, + "learning_rate": 6.591892406681511e-08, + "logits/chosen": -1.233620285987854, + "logits/rejected": -1.1905428171157837, + "logps/chosen": -1.7282718420028687, + "logps/rejected": -2.440072536468506, + "loss": 1.7949, + "odds_ratio_loss": 0.6666890978813171, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.17282718420028687, + "rewards/margins": 0.07118009030818939, + "rewards/rejected": -0.24400727450847626, + "sft_loss": 1.7282718420028687, + "step": 12195 + }, + { + "epoch": 0.95, + "grad_norm": 6.149363040924072, + "learning_rate": 6.492609472892653e-08, + "logits/chosen": -1.2824854850769043, + "logits/rejected": -1.3274608850479126, + "logps/chosen": -1.017652153968811, + "logps/rejected": -10.595731735229492, + "loss": 1.0283, + "odds_ratio_loss": 0.10598143190145493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1017652153968811, + "rewards/margins": 0.9578080177307129, + "rewards/rejected": -1.0595732927322388, + "sft_loss": 1.017652153968811, + "step": 12200 + }, + { + "epoch": 0.95, + "grad_norm": 8.832098007202148, + "learning_rate": 6.394075005719647e-08, + "logits/chosen": -1.3609025478363037, + "logits/rejected": -1.0700979232788086, + "logps/chosen": -0.8298861384391785, + "logps/rejected": -4.105466365814209, + "loss": 0.8455, + "odds_ratio_loss": 0.1559799760580063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08298861235380173, + "rewards/margins": 0.3275580406188965, + "rewards/rejected": -0.4105466902256012, + "sft_loss": 0.8298861384391785, + "step": 12205 + }, + { + "epoch": 0.95, + "grad_norm": 7.1576247215271, + "learning_rate": 6.296289154602508e-08, + "logits/chosen": -1.4908638000488281, + "logits/rejected": -1.1165975332260132, + "logps/chosen": -0.8523378372192383, + "logps/rejected": -3.153052568435669, + "loss": 0.8723, + "odds_ratio_loss": 0.19982339441776276, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08523379266262054, + "rewards/margins": 0.23007145524024963, + "rewards/rejected": -0.315305233001709, + "sft_loss": 0.8523378372192383, + "step": 12210 + }, + { + "epoch": 0.95, + "grad_norm": 5.327239990234375, + "learning_rate": 6.199252067845995e-08, + "logits/chosen": -1.408676266670227, + "logits/rejected": -1.2627909183502197, + "logps/chosen": -1.008329153060913, + "logps/rejected": -7.339837074279785, + "loss": 1.0153, + "odds_ratio_loss": 0.06966123729944229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10083291679620743, + "rewards/margins": 0.6331508159637451, + "rewards/rejected": -0.7339836955070496, + "sft_loss": 1.008329153060913, + "step": 12215 + }, + { + "epoch": 0.95, + "grad_norm": 331.38897705078125, + "learning_rate": 6.102963892619107e-08, + "logits/chosen": -1.4222207069396973, + "logits/rejected": -1.1397409439086914, + "logps/chosen": -1.2288148403167725, + "logps/rejected": -9.242072105407715, + "loss": 1.2508, + "odds_ratio_loss": 0.21948783099651337, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12288150936365128, + "rewards/margins": 0.801325798034668, + "rewards/rejected": -0.924207329750061, + "sft_loss": 1.2288148403167725, + "step": 12220 + }, + { + "epoch": 0.95, + "grad_norm": 16.810434341430664, + "learning_rate": 6.007424774955029e-08, + "logits/chosen": -1.2084866762161255, + "logits/rejected": -1.2693732976913452, + "logps/chosen": -0.8313711285591125, + "logps/rejected": -3.6703248023986816, + "loss": 0.8445, + "odds_ratio_loss": 0.13097386062145233, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08313710987567902, + "rewards/margins": 0.2838953733444214, + "rewards/rejected": -0.3670324683189392, + "sft_loss": 0.8313711285591125, + "step": 12225 + }, + { + "epoch": 0.95, + "grad_norm": 26.042701721191406, + "learning_rate": 5.912634859751021e-08, + "logits/chosen": -1.2757648229599, + "logits/rejected": -1.3488729000091553, + "logps/chosen": -0.9625293016433716, + "logps/rejected": -2.616014003753662, + "loss": 1.0387, + "odds_ratio_loss": 0.7618128657341003, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0962529331445694, + "rewards/margins": 0.16534848511219025, + "rewards/rejected": -0.26160138845443726, + "sft_loss": 0.9625293016433716, + "step": 12230 + }, + { + "epoch": 0.95, + "grad_norm": 9.009185791015625, + "learning_rate": 5.818594290768087e-08, + "logits/chosen": -1.4181190729141235, + "logits/rejected": -1.1298072338104248, + "logps/chosen": -0.9780190587043762, + "logps/rejected": -5.396805286407471, + "loss": 1.0051, + "odds_ratio_loss": 0.2709425091743469, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09780190140008926, + "rewards/margins": 0.44187861680984497, + "rewards/rejected": -0.5396804809570312, + "sft_loss": 0.9780190587043762, + "step": 12235 + }, + { + "epoch": 0.95, + "grad_norm": 33.64002990722656, + "learning_rate": 5.725303210630584e-08, + "logits/chosen": -1.25346839427948, + "logits/rejected": -0.8549555540084839, + "logps/chosen": -0.9498344659805298, + "logps/rejected": -3.4202792644500732, + "loss": 0.9662, + "odds_ratio_loss": 0.16371150314807892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09498345106840134, + "rewards/margins": 0.24704448878765106, + "rewards/rejected": -0.3420279622077942, + "sft_loss": 0.9498344659805298, + "step": 12240 + }, + { + "epoch": 0.95, + "grad_norm": 15.04161548614502, + "learning_rate": 5.632761760826333e-08, + "logits/chosen": -1.1823722124099731, + "logits/rejected": -1.113013505935669, + "logps/chosen": -0.8765622973442078, + "logps/rejected": -2.5976319313049316, + "loss": 0.9287, + "odds_ratio_loss": 0.5214719772338867, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.08765622973442078, + "rewards/margins": 0.17210696637630463, + "rewards/rejected": -0.2597631812095642, + "sft_loss": 0.8765622973442078, + "step": 12245 + }, + { + "epoch": 0.95, + "grad_norm": 6.058566570281982, + "learning_rate": 5.540970081706176e-08, + "logits/chosen": -1.280879259109497, + "logits/rejected": -0.4485422968864441, + "logps/chosen": -1.225573182106018, + "logps/rejected": -9.1913480758667, + "loss": 1.2274, + "odds_ratio_loss": 0.01814914494752884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12255732715129852, + "rewards/margins": 0.796577513217926, + "rewards/rejected": -0.919134795665741, + "sft_loss": 1.225573182106018, + "step": 12250 + }, + { + "epoch": 0.95, + "grad_norm": 31.721729278564453, + "learning_rate": 5.449928312483865e-08, + "logits/chosen": -1.1671860218048096, + "logits/rejected": -1.2594536542892456, + "logps/chosen": -1.1395418643951416, + "logps/rejected": -4.042827129364014, + "loss": 1.1664, + "odds_ratio_loss": 0.26880699396133423, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11395418643951416, + "rewards/margins": 0.2903285324573517, + "rewards/rejected": -0.40428265929222107, + "sft_loss": 1.1395418643951416, + "step": 12255 + }, + { + "epoch": 0.95, + "grad_norm": 618.9825439453125, + "learning_rate": 5.359636591235784e-08, + "logits/chosen": -1.311244249343872, + "logits/rejected": -1.2541598081588745, + "logps/chosen": -1.1040689945220947, + "logps/rejected": -5.051757335662842, + "loss": 1.1462, + "odds_ratio_loss": 0.42150840163230896, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11040691286325455, + "rewards/margins": 0.3947688341140747, + "rewards/rejected": -0.505175769329071, + "sft_loss": 1.1040689945220947, + "step": 12260 + }, + { + "epoch": 0.95, + "grad_norm": 18.62577247619629, + "learning_rate": 5.270095054900781e-08, + "logits/chosen": -1.414301872253418, + "logits/rejected": -1.191956639289856, + "logps/chosen": -1.1366630792617798, + "logps/rejected": -9.936189651489258, + "loss": 1.1743, + "odds_ratio_loss": 0.37666797637939453, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11366631090641022, + "rewards/margins": 0.8799527287483215, + "rewards/rejected": -0.9936189651489258, + "sft_loss": 1.1366630792617798, + "step": 12265 + }, + { + "epoch": 0.95, + "grad_norm": 8.314948081970215, + "learning_rate": 5.1813038392800056e-08, + "logits/chosen": -1.1787521839141846, + "logits/rejected": -1.5429284572601318, + "logps/chosen": -1.432403802871704, + "logps/rejected": -10.608168601989746, + "loss": 1.4325, + "odds_ratio_loss": 0.0005790928844362497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14324037730693817, + "rewards/margins": 0.9175764918327332, + "rewards/rejected": -1.060817003250122, + "sft_loss": 1.432403802871704, + "step": 12270 + }, + { + "epoch": 0.95, + "grad_norm": 10.948424339294434, + "learning_rate": 5.0932630790366256e-08, + "logits/chosen": -1.2694766521453857, + "logits/rejected": -1.3637231588363647, + "logps/chosen": -0.8769696950912476, + "logps/rejected": -7.397462368011475, + "loss": 0.8855, + "odds_ratio_loss": 0.08508679270744324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08769698441028595, + "rewards/margins": 0.6520493030548096, + "rewards/rejected": -0.7397462725639343, + "sft_loss": 0.8769696950912476, + "step": 12275 + }, + { + "epoch": 0.96, + "grad_norm": 31.869291305541992, + "learning_rate": 5.0059729076955e-08, + "logits/chosen": -1.192575216293335, + "logits/rejected": -1.6547155380249023, + "logps/chosen": -1.0308337211608887, + "logps/rejected": -12.254032135009766, + "loss": 1.0309, + "odds_ratio_loss": 0.0003266182611696422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10308338701725006, + "rewards/margins": 1.1223198175430298, + "rewards/rejected": -1.2254031896591187, + "sft_loss": 1.0308337211608887, + "step": 12280 + }, + { + "epoch": 0.96, + "grad_norm": 24.861595153808594, + "learning_rate": 4.919433457643452e-08, + "logits/chosen": -1.0728105306625366, + "logits/rejected": -1.2247686386108398, + "logps/chosen": -1.1914570331573486, + "logps/rejected": -6.046353340148926, + "loss": 1.2243, + "odds_ratio_loss": 0.32865655422210693, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1191457062959671, + "rewards/margins": 0.4854896664619446, + "rewards/rejected": -0.6046353578567505, + "sft_loss": 1.1914570331573486, + "step": 12285 + }, + { + "epoch": 0.96, + "grad_norm": 33.83651351928711, + "learning_rate": 4.8336448601283835e-08, + "logits/chosen": -1.2199666500091553, + "logits/rejected": -1.2750102281570435, + "logps/chosen": -1.2813136577606201, + "logps/rejected": -5.206510066986084, + "loss": 1.3284, + "odds_ratio_loss": 0.47062787413597107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12813135981559753, + "rewards/margins": 0.39251965284347534, + "rewards/rejected": -0.5206509828567505, + "sft_loss": 1.2813136577606201, + "step": 12290 + }, + { + "epoch": 0.96, + "grad_norm": 6.180647373199463, + "learning_rate": 4.748607245259606e-08, + "logits/chosen": -1.3814548254013062, + "logits/rejected": -0.8651509284973145, + "logps/chosen": -0.760599672794342, + "logps/rejected": -11.416925430297852, + "loss": 0.7704, + "odds_ratio_loss": 0.09805931150913239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0760599672794342, + "rewards/margins": 1.0656325817108154, + "rewards/rejected": -1.1416925191879272, + "sft_loss": 0.760599672794342, + "step": 12295 + }, + { + "epoch": 0.96, + "grad_norm": 208.45420837402344, + "learning_rate": 4.664320742007622e-08, + "logits/chosen": -1.432703971862793, + "logits/rejected": -1.4251139163970947, + "logps/chosen": -1.8889224529266357, + "logps/rejected": -6.2691545486450195, + "loss": 1.9051, + "odds_ratio_loss": 0.16187646985054016, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18889224529266357, + "rewards/margins": 0.438023179769516, + "rewards/rejected": -0.6269153356552124, + "sft_loss": 1.8889224529266357, + "step": 12300 + }, + { + "epoch": 0.96, + "grad_norm": 8.938104629516602, + "learning_rate": 4.580785478203453e-08, + "logits/chosen": -1.2809088230133057, + "logits/rejected": -1.1958422660827637, + "logps/chosen": -1.5119210481643677, + "logps/rejected": -7.5544843673706055, + "loss": 1.5902, + "odds_ratio_loss": 0.7823742628097534, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1511920988559723, + "rewards/margins": 0.6042563915252686, + "rewards/rejected": -0.7554485201835632, + "sft_loss": 1.5119210481643677, + "step": 12305 + }, + { + "epoch": 0.96, + "grad_norm": 28.944644927978516, + "learning_rate": 4.49800158053898e-08, + "logits/chosen": -1.3874715566635132, + "logits/rejected": -1.027039885520935, + "logps/chosen": -1.0698702335357666, + "logps/rejected": -11.652139663696289, + "loss": 1.0706, + "odds_ratio_loss": 0.007088521029800177, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10698702186346054, + "rewards/margins": 1.0582268238067627, + "rewards/rejected": -1.165213942527771, + "sft_loss": 1.0698702335357666, + "step": 12310 + }, + { + "epoch": 0.96, + "grad_norm": 14.06658935546875, + "learning_rate": 4.4159691745664925e-08, + "logits/chosen": -1.4516162872314453, + "logits/rejected": -1.3610146045684814, + "logps/chosen": -1.4354077577590942, + "logps/rejected": -4.483978271484375, + "loss": 1.4797, + "odds_ratio_loss": 0.4433773458003998, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.14354076981544495, + "rewards/margins": 0.304857075214386, + "rewards/rejected": -0.44839781522750854, + "sft_loss": 1.4354077577590942, + "step": 12315 + }, + { + "epoch": 0.96, + "grad_norm": 13.312085151672363, + "learning_rate": 4.3346883846985265e-08, + "logits/chosen": -1.4321238994598389, + "logits/rejected": -0.7923226356506348, + "logps/chosen": -1.0463868379592896, + "logps/rejected": -9.889020919799805, + "loss": 1.0495, + "odds_ratio_loss": 0.03073304519057274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10463868081569672, + "rewards/margins": 0.8842633962631226, + "rewards/rejected": -0.9889020919799805, + "sft_loss": 1.0463868379592896, + "step": 12320 + }, + { + "epoch": 0.96, + "grad_norm": 123.09896087646484, + "learning_rate": 4.254159334207752e-08, + "logits/chosen": -1.0274879932403564, + "logits/rejected": -1.069379448890686, + "logps/chosen": -1.4011691808700562, + "logps/rejected": -12.469643592834473, + "loss": 1.4012, + "odds_ratio_loss": 0.0007433110731653869, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14011691510677338, + "rewards/margins": 1.1068474054336548, + "rewards/rejected": -1.2469643354415894, + "sft_loss": 1.4011691808700562, + "step": 12325 + }, + { + "epoch": 0.96, + "grad_norm": 6.3734283447265625, + "learning_rate": 4.174382145226696e-08, + "logits/chosen": -1.2176523208618164, + "logits/rejected": -1.2900688648223877, + "logps/chosen": -0.5494667887687683, + "logps/rejected": -6.461459159851074, + "loss": 0.554, + "odds_ratio_loss": 0.04508071392774582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05494668334722519, + "rewards/margins": 0.5911992788314819, + "rewards/rejected": -0.6461459398269653, + "sft_loss": 0.5494667887687683, + "step": 12330 + }, + { + "epoch": 0.96, + "grad_norm": 19.052757263183594, + "learning_rate": 4.09535693874763e-08, + "logits/chosen": -1.4111024141311646, + "logits/rejected": -1.15664803981781, + "logps/chosen": -0.7811176180839539, + "logps/rejected": -4.984771728515625, + "loss": 0.7908, + "odds_ratio_loss": 0.09680463373661041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07811176776885986, + "rewards/margins": 0.4203653931617737, + "rewards/rejected": -0.49847716093063354, + "sft_loss": 0.7811176180839539, + "step": 12335 + }, + { + "epoch": 0.96, + "grad_norm": 23.22859001159668, + "learning_rate": 4.017083834622237e-08, + "logits/chosen": -1.2132599353790283, + "logits/rejected": -1.0425660610198975, + "logps/chosen": -0.9321387410163879, + "logps/rejected": -3.740152359008789, + "loss": 0.9503, + "odds_ratio_loss": 0.18153466284275055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09321387857198715, + "rewards/margins": 0.28080135583877563, + "rewards/rejected": -0.3740152418613434, + "sft_loss": 0.9321387410163879, + "step": 12340 + }, + { + "epoch": 0.96, + "grad_norm": 12.329089164733887, + "learning_rate": 3.9395629515616154e-08, + "logits/chosen": -1.3570067882537842, + "logits/rejected": -1.0711150169372559, + "logps/chosen": -0.8609020113945007, + "logps/rejected": -13.920463562011719, + "loss": 0.8757, + "odds_ratio_loss": 0.14800772070884705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08609020709991455, + "rewards/margins": 1.3059561252593994, + "rewards/rejected": -1.392046332359314, + "sft_loss": 0.8609020113945007, + "step": 12345 + }, + { + "epoch": 0.96, + "grad_norm": 18.560041427612305, + "learning_rate": 3.862794407136106e-08, + "logits/chosen": -1.2778164148330688, + "logits/rejected": -1.038604736328125, + "logps/chosen": -1.2508759498596191, + "logps/rejected": -8.479180335998535, + "loss": 1.2699, + "odds_ratio_loss": 0.18985147774219513, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12508758902549744, + "rewards/margins": 0.7228304147720337, + "rewards/rejected": -0.8479180335998535, + "sft_loss": 1.2508759498596191, + "step": 12350 + }, + { + "epoch": 0.96, + "grad_norm": 61.77004623413086, + "learning_rate": 3.786778317774964e-08, + "logits/chosen": -1.0532138347625732, + "logits/rejected": -1.1754436492919922, + "logps/chosen": -0.764786422252655, + "logps/rejected": -3.444678544998169, + "loss": 0.7887, + "odds_ratio_loss": 0.23928098380565643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07647864520549774, + "rewards/margins": 0.26798921823501587, + "rewards/rejected": -0.3444678485393524, + "sft_loss": 0.764786422252655, + "step": 12355 + }, + { + "epoch": 0.96, + "grad_norm": 21.275634765625, + "learning_rate": 3.711514798766081e-08, + "logits/chosen": -1.2819563150405884, + "logits/rejected": -0.8122344017028809, + "logps/chosen": -1.0787274837493896, + "logps/rejected": -2.4202353954315186, + "loss": 1.1184, + "odds_ratio_loss": 0.39671677350997925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10787274688482285, + "rewards/margins": 0.13415075838565826, + "rewards/rejected": -0.2420235425233841, + "sft_loss": 1.0787274837493896, + "step": 12360 + }, + { + "epoch": 0.96, + "grad_norm": 62.028438568115234, + "learning_rate": 3.6370039642563134e-08, + "logits/chosen": -1.3025977611541748, + "logits/rejected": -1.6483367681503296, + "logps/chosen": -0.8876152038574219, + "logps/rejected": -7.4527907371521, + "loss": 0.9039, + "odds_ratio_loss": 0.16312487423419952, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08876152336597443, + "rewards/margins": 0.6565175652503967, + "rewards/rejected": -0.7452791333198547, + "sft_loss": 0.8876152038574219, + "step": 12365 + }, + { + "epoch": 0.96, + "grad_norm": 18.61726951599121, + "learning_rate": 3.563245927250714e-08, + "logits/chosen": -1.4143457412719727, + "logits/rejected": -1.4389795064926147, + "logps/chosen": -0.9123845100402832, + "logps/rejected": -9.375925064086914, + "loss": 0.9339, + "odds_ratio_loss": 0.21503356099128723, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09123845398426056, + "rewards/margins": 0.8463540077209473, + "rewards/rejected": -0.9375923871994019, + "sft_loss": 0.9123845100402832, + "step": 12370 + }, + { + "epoch": 0.96, + "grad_norm": 13.771241188049316, + "learning_rate": 3.490240799612743e-08, + "logits/chosen": -1.2913004159927368, + "logits/rejected": -1.2106720209121704, + "logps/chosen": -1.0773588418960571, + "logps/rejected": -5.933541297912598, + "loss": 1.0871, + "odds_ratio_loss": 0.09785932302474976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10773588716983795, + "rewards/margins": 0.4856182634830475, + "rewards/rejected": -0.5933541059494019, + "sft_loss": 1.0773588418960571, + "step": 12375 + }, + { + "epoch": 0.96, + "grad_norm": 165.80172729492188, + "learning_rate": 3.417988692063945e-08, + "logits/chosen": -1.4670937061309814, + "logits/rejected": -1.0459048748016357, + "logps/chosen": -0.7823060750961304, + "logps/rejected": -2.592039108276367, + "loss": 0.8162, + "odds_ratio_loss": 0.3391726016998291, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07823061943054199, + "rewards/margins": 0.18097327649593353, + "rewards/rejected": -0.25920388102531433, + "sft_loss": 0.7823060750961304, + "step": 12380 + }, + { + "epoch": 0.96, + "grad_norm": 12.715102195739746, + "learning_rate": 3.346489714183831e-08, + "logits/chosen": -1.2532730102539062, + "logits/rejected": -0.949592113494873, + "logps/chosen": -0.8318389654159546, + "logps/rejected": -4.871085166931152, + "loss": 0.8441, + "odds_ratio_loss": 0.12211360782384872, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0831838995218277, + "rewards/margins": 0.4039246141910553, + "rewards/rejected": -0.4871085286140442, + "sft_loss": 0.8318389654159546, + "step": 12385 + }, + { + "epoch": 0.96, + "grad_norm": 271.7589111328125, + "learning_rate": 3.275743974409606e-08, + "logits/chosen": -1.3090143203735352, + "logits/rejected": -1.1848065853118896, + "logps/chosen": -0.8043609857559204, + "logps/rejected": -11.306020736694336, + "loss": 0.8107, + "odds_ratio_loss": 0.06342881172895432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0804360955953598, + "rewards/margins": 1.050166130065918, + "rewards/rejected": -1.1306021213531494, + "sft_loss": 0.8043609857559204, + "step": 12390 + }, + { + "epoch": 0.96, + "grad_norm": 80.97168731689453, + "learning_rate": 3.20575158003622e-08, + "logits/chosen": -1.466761827468872, + "logits/rejected": -0.7243001461029053, + "logps/chosen": -1.1298373937606812, + "logps/rejected": -4.683624744415283, + "loss": 1.1529, + "odds_ratio_loss": 0.23061838746070862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11298374831676483, + "rewards/margins": 0.3553787171840668, + "rewards/rejected": -0.468362420797348, + "sft_loss": 1.1298373937606812, + "step": 12395 + }, + { + "epoch": 0.96, + "grad_norm": 45.660179138183594, + "learning_rate": 3.1365126372159824e-08, + "logits/chosen": -1.3778924942016602, + "logits/rejected": -1.3838534355163574, + "logps/chosen": -0.791498601436615, + "logps/rejected": -5.726881980895996, + "loss": 0.8549, + "odds_ratio_loss": 0.6341503262519836, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07914986461400986, + "rewards/margins": 0.4935383200645447, + "rewards/rejected": -0.5726882219314575, + "sft_loss": 0.791498601436615, + "step": 12400 + }, + { + "epoch": 0.96, + "grad_norm": 19.550127029418945, + "learning_rate": 3.068027250958616e-08, + "logits/chosen": -1.4337947368621826, + "logits/rejected": -1.0507450103759766, + "logps/chosen": -0.8702392578125, + "logps/rejected": -4.506911277770996, + "loss": 0.9357, + "odds_ratio_loss": 0.6542928814888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08702392876148224, + "rewards/margins": 0.36366719007492065, + "rewards/rejected": -0.4506911337375641, + "sft_loss": 0.8702392578125, + "step": 12405 + }, + { + "epoch": 0.97, + "grad_norm": 33.817935943603516, + "learning_rate": 3.0002955251308696e-08, + "logits/chosen": -1.412219762802124, + "logits/rejected": -1.1537320613861084, + "logps/chosen": -0.9598878026008606, + "logps/rejected": -1.8418006896972656, + "loss": 1.0055, + "odds_ratio_loss": 0.4561440348625183, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09598878771066666, + "rewards/margins": 0.08819130063056946, + "rewards/rejected": -0.18418008089065552, + "sft_loss": 0.9598878026008606, + "step": 12410 + }, + { + "epoch": 0.97, + "grad_norm": 17.341407775878906, + "learning_rate": 2.9333175624565168e-08, + "logits/chosen": -1.4794400930404663, + "logits/rejected": -1.7852928638458252, + "logps/chosen": -0.536201000213623, + "logps/rejected": -7.498734951019287, + "loss": 0.5399, + "odds_ratio_loss": 0.037034954875707626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053620100021362305, + "rewards/margins": 0.6962534189224243, + "rewards/rejected": -0.7498735189437866, + "sft_loss": 0.536201000213623, + "step": 12415 + }, + { + "epoch": 0.97, + "grad_norm": 3.3973031044006348, + "learning_rate": 2.8670934645160797e-08, + "logits/chosen": -1.4210028648376465, + "logits/rejected": -1.1338955163955688, + "logps/chosen": -1.047786831855774, + "logps/rejected": -5.595246315002441, + "loss": 1.0528, + "odds_ratio_loss": 0.04966248571872711, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10477868467569351, + "rewards/margins": 0.45474591851234436, + "rewards/rejected": -0.5595245957374573, + "sft_loss": 1.047786831855774, + "step": 12420 + }, + { + "epoch": 0.97, + "grad_norm": 5.257033824920654, + "learning_rate": 2.8016233317468834e-08, + "logits/chosen": -1.3157680034637451, + "logits/rejected": -0.8841627240180969, + "logps/chosen": -0.8723659515380859, + "logps/rejected": -5.532425880432129, + "loss": 0.8923, + "odds_ratio_loss": 0.1988508403301239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08723659813404083, + "rewards/margins": 0.46600595116615295, + "rewards/rejected": -0.553242564201355, + "sft_loss": 0.8723659515380859, + "step": 12425 + }, + { + "epoch": 0.97, + "grad_norm": 66.54361724853516, + "learning_rate": 2.7369072634426673e-08, + "logits/chosen": -1.225722074508667, + "logits/rejected": -1.364844560623169, + "logps/chosen": -1.2202316522598267, + "logps/rejected": -5.396240711212158, + "loss": 1.2492, + "odds_ratio_loss": 0.2900741696357727, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12202316522598267, + "rewards/margins": 0.4176008701324463, + "rewards/rejected": -0.5396240949630737, + "sft_loss": 1.2202316522598267, + "step": 12430 + }, + { + "epoch": 0.97, + "grad_norm": 7.29395055770874, + "learning_rate": 2.672945357753587e-08, + "logits/chosen": -1.2899141311645508, + "logits/rejected": -1.6844837665557861, + "logps/chosen": -0.9342865943908691, + "logps/rejected": -15.596992492675781, + "loss": 0.9355, + "odds_ratio_loss": 0.012498864904046059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09342865645885468, + "rewards/margins": 1.4662706851959229, + "rewards/rejected": -1.559699296951294, + "sft_loss": 0.9342865943908691, + "step": 12435 + }, + { + "epoch": 0.97, + "grad_norm": 8.378362655639648, + "learning_rate": 2.6097377116859335e-08, + "logits/chosen": -1.3625600337982178, + "logits/rejected": -1.1308603286743164, + "logps/chosen": -1.2010387182235718, + "logps/rejected": -9.78742504119873, + "loss": 1.2424, + "odds_ratio_loss": 0.4139278829097748, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1201038733124733, + "rewards/margins": 0.8586385846138, + "rewards/rejected": -0.9787424802780151, + "sft_loss": 1.2010387182235718, + "step": 12440 + }, + { + "epoch": 0.97, + "grad_norm": 12.528746604919434, + "learning_rate": 2.547284421102192e-08, + "logits/chosen": -1.3684858083724976, + "logits/rejected": -0.7313657999038696, + "logps/chosen": -1.1039519309997559, + "logps/rejected": -5.332610607147217, + "loss": 1.1141, + "odds_ratio_loss": 0.10197708755731583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11039519309997559, + "rewards/margins": 0.4228658676147461, + "rewards/rejected": -0.5332610607147217, + "sft_loss": 1.1039519309997559, + "step": 12445 + }, + { + "epoch": 0.97, + "grad_norm": 104.84793090820312, + "learning_rate": 2.4855855807206508e-08, + "logits/chosen": -1.4106018543243408, + "logits/rejected": -1.2714979648590088, + "logps/chosen": -1.1574375629425049, + "logps/rejected": -4.188596725463867, + "loss": 1.1862, + "odds_ratio_loss": 0.28758615255355835, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11574375629425049, + "rewards/margins": 0.3031159043312073, + "rewards/rejected": -0.41885966062545776, + "sft_loss": 1.1574375629425049, + "step": 12450 + }, + { + "epoch": 0.97, + "grad_norm": 17.179094314575195, + "learning_rate": 2.4246412841155144e-08, + "logits/chosen": -1.2052602767944336, + "logits/rejected": -1.1856670379638672, + "logps/chosen": -1.4439436197280884, + "logps/rejected": -5.836577892303467, + "loss": 1.4997, + "odds_ratio_loss": 0.5576989650726318, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14439436793327332, + "rewards/margins": 0.4392634332180023, + "rewards/rejected": -0.5836578607559204, + "sft_loss": 1.4439436197280884, + "step": 12455 + }, + { + "epoch": 0.97, + "grad_norm": 20.688180923461914, + "learning_rate": 2.3644516237164572e-08, + "logits/chosen": -1.226061224937439, + "logits/rejected": -1.5945135354995728, + "logps/chosen": -0.8379164934158325, + "logps/rejected": -7.608580112457275, + "loss": 0.8551, + "odds_ratio_loss": 0.17142853140830994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08379164338111877, + "rewards/margins": 0.677066445350647, + "rewards/rejected": -0.7608579993247986, + "sft_loss": 0.8379164934158325, + "step": 12460 + }, + { + "epoch": 0.97, + "grad_norm": 11.189611434936523, + "learning_rate": 2.305016690808848e-08, + "logits/chosen": -1.3278387784957886, + "logits/rejected": -1.1817538738250732, + "logps/chosen": -0.8746849298477173, + "logps/rejected": -3.890235185623169, + "loss": 0.9015, + "odds_ratio_loss": 0.26853257417678833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08746849000453949, + "rewards/margins": 0.3015550374984741, + "rewards/rejected": -0.3890235424041748, + "sft_loss": 0.8746849298477173, + "step": 12465 + }, + { + "epoch": 0.97, + "grad_norm": 28.143857955932617, + "learning_rate": 2.2463365755331924e-08, + "logits/chosen": -1.3462903499603271, + "logits/rejected": -0.7961365580558777, + "logps/chosen": -1.0623729228973389, + "logps/rejected": -5.807260513305664, + "loss": 1.0717, + "odds_ratio_loss": 0.09347637742757797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10623729228973389, + "rewards/margins": 0.4744887351989746, + "rewards/rejected": -0.5807260870933533, + "sft_loss": 1.0623729228973389, + "step": 12470 + }, + { + "epoch": 0.97, + "grad_norm": 5.191647052764893, + "learning_rate": 2.1884113668853567e-08, + "logits/chosen": -1.3941482305526733, + "logits/rejected": -0.8888921737670898, + "logps/chosen": -1.0999834537506104, + "logps/rejected": -5.673184871673584, + "loss": 1.1023, + "odds_ratio_loss": 0.02334499917924404, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10999833047389984, + "rewards/margins": 0.4573201537132263, + "rewards/rejected": -0.5673185586929321, + "sft_loss": 1.0999834537506104, + "step": 12475 + }, + { + "epoch": 0.97, + "grad_norm": 8.231232643127441, + "learning_rate": 2.1312411527164012e-08, + "logits/chosen": -1.2910339832305908, + "logits/rejected": -1.4623374938964844, + "logps/chosen": -0.9894932508468628, + "logps/rejected": -9.08910083770752, + "loss": 0.9928, + "odds_ratio_loss": 0.033546727150678635, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09894932806491852, + "rewards/margins": 0.8099607229232788, + "rewards/rejected": -0.9089100956916809, + "sft_loss": 0.9894932508468628, + "step": 12480 + }, + { + "epoch": 0.97, + "grad_norm": 4.641633033752441, + "learning_rate": 2.0748260197320234e-08, + "logits/chosen": -1.130858063697815, + "logits/rejected": -1.4460694789886475, + "logps/chosen": -0.6807447671890259, + "logps/rejected": -5.400501251220703, + "loss": 0.6967, + "odds_ratio_loss": 0.15972675383090973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06807447969913483, + "rewards/margins": 0.471975713968277, + "rewards/rejected": -0.5400501489639282, + "sft_loss": 0.6807447671890259, + "step": 12485 + }, + { + "epoch": 0.97, + "grad_norm": 5.488857269287109, + "learning_rate": 2.0191660534931158e-08, + "logits/chosen": -1.1198769807815552, + "logits/rejected": -1.112151861190796, + "logps/chosen": -0.9378183484077454, + "logps/rejected": -1.6302454471588135, + "loss": 0.9778, + "odds_ratio_loss": 0.3997967541217804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09378183633089066, + "rewards/margins": 0.0692427009344101, + "rewards/rejected": -0.16302451491355896, + "sft_loss": 0.9378183484077454, + "step": 12490 + }, + { + "epoch": 0.97, + "grad_norm": 15.16747760772705, + "learning_rate": 1.9642613384149302e-08, + "logits/chosen": -1.2848796844482422, + "logits/rejected": -1.223474383354187, + "logps/chosen": -1.5398253202438354, + "logps/rejected": -5.566136360168457, + "loss": 1.555, + "odds_ratio_loss": 0.1515004187822342, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15398254990577698, + "rewards/margins": 0.40263113379478455, + "rewards/rejected": -0.5566136240959167, + "sft_loss": 1.5398253202438354, + "step": 12495 + }, + { + "epoch": 0.97, + "grad_norm": 19.638164520263672, + "learning_rate": 1.910111957767524e-08, + "logits/chosen": -1.4474903345108032, + "logits/rejected": -1.3258614540100098, + "logps/chosen": -0.6606062054634094, + "logps/rejected": -5.556430816650391, + "loss": 0.6655, + "odds_ratio_loss": 0.04882761836051941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0660606175661087, + "rewards/margins": 0.48958244919776917, + "rewards/rejected": -0.5556430816650391, + "sft_loss": 0.6606062054634094, + "step": 12500 + }, + { + "epoch": 0.97, + "grad_norm": 5.311905384063721, + "learning_rate": 1.856717993675261e-08, + "logits/chosen": -1.2535361051559448, + "logits/rejected": -1.0046923160552979, + "logps/chosen": -0.9040173292160034, + "logps/rejected": -11.26391315460205, + "loss": 0.9184, + "odds_ratio_loss": 0.14369972050189972, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09040174633264542, + "rewards/margins": 1.03598952293396, + "rewards/rejected": -1.1263912916183472, + "sft_loss": 0.9040173292160034, + "step": 12505 + }, + { + "epoch": 0.97, + "grad_norm": 10.375931739807129, + "learning_rate": 1.8040795271169753e-08, + "logits/chosen": -1.3679031133651733, + "logits/rejected": -0.9070374369621277, + "logps/chosen": -1.0435655117034912, + "logps/rejected": -5.098996639251709, + "loss": 1.0562, + "odds_ratio_loss": 0.1265898197889328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1043565645813942, + "rewards/margins": 0.40554314851760864, + "rewards/rejected": -0.5098997354507446, + "sft_loss": 1.0435655117034912, + "step": 12510 + }, + { + "epoch": 0.97, + "grad_norm": 4.680568695068359, + "learning_rate": 1.752196637925474e-08, + "logits/chosen": -1.3530833721160889, + "logits/rejected": -1.097294569015503, + "logps/chosen": -0.7488092184066772, + "logps/rejected": -8.338506698608398, + "loss": 0.7525, + "odds_ratio_loss": 0.0365881472826004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0748809203505516, + "rewards/margins": 0.7589698433876038, + "rewards/rejected": -0.8338507413864136, + "sft_loss": 0.7488092184066772, + "step": 12515 + }, + { + "epoch": 0.97, + "grad_norm": 6.964818477630615, + "learning_rate": 1.7010694047877585e-08, + "logits/chosen": -1.293218970298767, + "logits/rejected": -0.6837180256843567, + "logps/chosen": -1.0612825155258179, + "logps/rejected": -6.177232265472412, + "loss": 1.0979, + "odds_ratio_loss": 0.36641591787338257, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1061282530426979, + "rewards/margins": 0.5115949511528015, + "rewards/rejected": -0.6177231669425964, + "sft_loss": 1.0612825155258179, + "step": 12520 + }, + { + "epoch": 0.97, + "grad_norm": 15.219218254089355, + "learning_rate": 1.650697905244747e-08, + "logits/chosen": -1.2578482627868652, + "logits/rejected": -1.422055959701538, + "logps/chosen": -0.8639196157455444, + "logps/rejected": -6.498141288757324, + "loss": 0.8792, + "odds_ratio_loss": 0.15299847722053528, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08639196306467056, + "rewards/margins": 0.5634222030639648, + "rewards/rejected": -0.6498141288757324, + "sft_loss": 0.8639196157455444, + "step": 12525 + }, + { + "epoch": 0.97, + "grad_norm": 4.948149681091309, + "learning_rate": 1.6010822156913297e-08, + "logits/chosen": -1.411399483680725, + "logits/rejected": -0.9298642873764038, + "logps/chosen": -1.153228521347046, + "logps/rejected": -7.6135687828063965, + "loss": 1.1708, + "odds_ratio_loss": 0.17584821581840515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11532286554574966, + "rewards/margins": 0.6460340619087219, + "rewards/rejected": -0.7613569498062134, + "sft_loss": 1.153228521347046, + "step": 12530 + }, + { + "epoch": 0.98, + "grad_norm": 15.22192668914795, + "learning_rate": 1.55222241137587e-08, + "logits/chosen": -1.3863394260406494, + "logits/rejected": -1.158898115158081, + "logps/chosen": -1.0210561752319336, + "logps/rejected": -6.618653774261475, + "loss": 1.0234, + "odds_ratio_loss": 0.023359347134828568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10210561752319336, + "rewards/margins": 0.559759795665741, + "rewards/rejected": -0.6618653535842896, + "sft_loss": 1.0210561752319336, + "step": 12535 + }, + { + "epoch": 0.98, + "grad_norm": 16.71721076965332, + "learning_rate": 1.5041185664005365e-08, + "logits/chosen": -1.3702729940414429, + "logits/rejected": -1.153377890586853, + "logps/chosen": -0.9755480885505676, + "logps/rejected": -5.225513458251953, + "loss": 0.9837, + "odds_ratio_loss": 0.08182965219020844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09755481034517288, + "rewards/margins": 0.424996554851532, + "rewards/rejected": -0.5225513577461243, + "sft_loss": 0.9755480885505676, + "step": 12540 + }, + { + "epoch": 0.98, + "grad_norm": 97.78411865234375, + "learning_rate": 1.4567707537209153e-08, + "logits/chosen": -1.2511688470840454, + "logits/rejected": -1.3687814474105835, + "logps/chosen": -0.8645130395889282, + "logps/rejected": -6.271538257598877, + "loss": 0.8895, + "odds_ratio_loss": 0.2500821650028229, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08645130693912506, + "rewards/margins": 0.5407025218009949, + "rewards/rejected": -0.6271538138389587, + "sft_loss": 0.8645130395889282, + "step": 12545 + }, + { + "epoch": 0.98, + "grad_norm": 183.43133544921875, + "learning_rate": 1.410179045145954e-08, + "logits/chosen": -1.3008298873901367, + "logits/rejected": -1.3373351097106934, + "logps/chosen": -1.174229383468628, + "logps/rejected": -5.585914134979248, + "loss": 1.1844, + "odds_ratio_loss": 0.10214630514383316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11742293834686279, + "rewards/margins": 0.44116848707199097, + "rewards/rejected": -0.5585914850234985, + "sft_loss": 1.174229383468628, + "step": 12550 + }, + { + "epoch": 0.98, + "grad_norm": 5.683286666870117, + "learning_rate": 1.3643435113379067e-08, + "logits/chosen": -1.3199999332427979, + "logits/rejected": -0.7393957376480103, + "logps/chosen": -0.8424164652824402, + "logps/rejected": -11.82227611541748, + "loss": 0.8517, + "odds_ratio_loss": 0.09303996711969376, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08424165099859238, + "rewards/margins": 1.0979859828948975, + "rewards/rejected": -1.182227611541748, + "sft_loss": 0.8424164652824402, + "step": 12555 + }, + { + "epoch": 0.98, + "grad_norm": 5.736454010009766, + "learning_rate": 1.3192642218121666e-08, + "logits/chosen": -1.4242204427719116, + "logits/rejected": -1.298656702041626, + "logps/chosen": -0.6155864000320435, + "logps/rejected": -9.867898941040039, + "loss": 0.621, + "odds_ratio_loss": 0.0539126992225647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.061558641493320465, + "rewards/margins": 0.9252313375473022, + "rewards/rejected": -0.986789882183075, + "sft_loss": 0.6155864000320435, + "step": 12560 + }, + { + "epoch": 0.98, + "grad_norm": 17.89364242553711, + "learning_rate": 1.2749412449372111e-08, + "logits/chosen": -1.3865082263946533, + "logits/rejected": -1.0207258462905884, + "logps/chosen": -0.852857768535614, + "logps/rejected": -5.261617183685303, + "loss": 0.9491, + "odds_ratio_loss": 0.9626294374465942, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.08528578281402588, + "rewards/margins": 0.4408760070800781, + "rewards/rejected": -0.526161789894104, + "sft_loss": 0.852857768535614, + "step": 12565 + }, + { + "epoch": 0.98, + "grad_norm": 27.80198097229004, + "learning_rate": 1.2313746479344358e-08, + "logits/chosen": -1.2341769933700562, + "logits/rejected": -1.2669892311096191, + "logps/chosen": -1.1279561519622803, + "logps/rejected": -8.634511947631836, + "loss": 1.1319, + "odds_ratio_loss": 0.03956901282072067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11279561370611191, + "rewards/margins": 0.7506555318832397, + "rewards/rejected": -0.8634511828422546, + "sft_loss": 1.1279561519622803, + "step": 12570 + }, + { + "epoch": 0.98, + "grad_norm": 9.365230560302734, + "learning_rate": 1.188564496878153e-08, + "logits/chosen": -1.3226608037948608, + "logits/rejected": -0.809215247631073, + "logps/chosen": -1.0910683870315552, + "logps/rejected": -3.9763412475585938, + "loss": 1.1081, + "odds_ratio_loss": 0.1704411506652832, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10910683870315552, + "rewards/margins": 0.2885272800922394, + "rewards/rejected": -0.3976341187953949, + "sft_loss": 1.0910683870315552, + "step": 12575 + }, + { + "epoch": 0.98, + "grad_norm": 6.049681186676025, + "learning_rate": 1.1465108566953708e-08, + "logits/chosen": -1.1451175212860107, + "logits/rejected": -1.6655000448226929, + "logps/chosen": -1.2132195234298706, + "logps/rejected": -15.503191947937012, + "loss": 1.2136, + "odds_ratio_loss": 0.0037333047948777676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12132195383310318, + "rewards/margins": 1.428997278213501, + "rewards/rejected": -1.5503193140029907, + "sft_loss": 1.2132195234298706, + "step": 12580 + }, + { + "epoch": 0.98, + "grad_norm": 7.754752159118652, + "learning_rate": 1.1052137911657934e-08, + "logits/chosen": -1.3875739574432373, + "logits/rejected": -0.9447689056396484, + "logps/chosen": -0.9363592267036438, + "logps/rejected": -4.744017601013184, + "loss": 0.998, + "odds_ratio_loss": 0.61687171459198, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0936359241604805, + "rewards/margins": 0.3807658553123474, + "rewards/rejected": -0.4744018018245697, + "sft_loss": 0.9363592267036438, + "step": 12585 + }, + { + "epoch": 0.98, + "grad_norm": 10.098403930664062, + "learning_rate": 1.0646733629216533e-08, + "logits/chosen": -1.3224453926086426, + "logits/rejected": -1.1574100255966187, + "logps/chosen": -1.0401006937026978, + "logps/rejected": -7.8945112228393555, + "loss": 1.0577, + "odds_ratio_loss": 0.17600440979003906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10401006788015366, + "rewards/margins": 0.6854410171508789, + "rewards/rejected": -0.7894511222839355, + "sft_loss": 1.0401006937026978, + "step": 12590 + }, + { + "epoch": 0.98, + "grad_norm": 5.171893119812012, + "learning_rate": 1.0248896334476565e-08, + "logits/chosen": -1.3351364135742188, + "logits/rejected": -1.343653917312622, + "logps/chosen": -0.8911256790161133, + "logps/rejected": -6.915536403656006, + "loss": 0.893, + "odds_ratio_loss": 0.0184454545378685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08911257237195969, + "rewards/margins": 0.602441132068634, + "rewards/rejected": -0.6915537118911743, + "sft_loss": 0.8911256790161133, + "step": 12595 + }, + { + "epoch": 0.98, + "grad_norm": 4.757512092590332, + "learning_rate": 9.858626630808722e-09, + "logits/chosen": -1.536574125289917, + "logits/rejected": -1.043745756149292, + "logps/chosen": -1.218515396118164, + "logps/rejected": -4.443150520324707, + "loss": 1.248, + "odds_ratio_loss": 0.2948893904685974, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12185152620077133, + "rewards/margins": 0.3224635720252991, + "rewards/rejected": -0.444315105676651, + "sft_loss": 1.218515396118164, + "step": 12600 + }, + { + "epoch": 0.98, + "grad_norm": 6.375965595245361, + "learning_rate": 9.475925110106753e-09, + "logits/chosen": -1.3484649658203125, + "logits/rejected": -0.6500853300094604, + "logps/chosen": -1.0133591890335083, + "logps/rejected": -5.055027961730957, + "loss": 1.0315, + "odds_ratio_loss": 0.181620255112648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10133592039346695, + "rewards/margins": 0.4041668474674225, + "rewards/rejected": -0.5055028200149536, + "sft_loss": 1.0133591890335083, + "step": 12605 + }, + { + "epoch": 0.98, + "grad_norm": 14.441625595092773, + "learning_rate": 9.100792352785826e-09, + "logits/chosen": -1.3859080076217651, + "logits/rejected": -1.3279926776885986, + "logps/chosen": -1.1327625513076782, + "logps/rejected": -9.13421630859375, + "loss": 1.1406, + "odds_ratio_loss": 0.07846628874540329, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11327625811100006, + "rewards/margins": 0.8001454472541809, + "rewards/rejected": -0.9134217500686646, + "sft_loss": 1.1327625513076782, + "step": 12610 + }, + { + "epoch": 0.98, + "grad_norm": 79.08858489990234, + "learning_rate": 8.7332289277825e-09, + "logits/chosen": -1.3899104595184326, + "logits/rejected": -1.2215381860733032, + "logps/chosen": -1.0673812627792358, + "logps/rejected": -6.102982044219971, + "loss": 1.0804, + "odds_ratio_loss": 0.13017579913139343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10673811286687851, + "rewards/margins": 0.5035600662231445, + "rewards/rejected": -0.610298216342926, + "sft_loss": 1.0673812627792358, + "step": 12615 + }, + { + "epoch": 0.98, + "grad_norm": 5.262150287628174, + "learning_rate": 8.373235392553636e-09, + "logits/chosen": -1.3551607131958008, + "logits/rejected": -0.987968921661377, + "logps/chosen": -1.1806728839874268, + "logps/rejected": -8.183259963989258, + "loss": 1.2099, + "odds_ratio_loss": 0.292438268661499, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11806728690862656, + "rewards/margins": 0.7002586722373962, + "rewards/rejected": -0.8183259963989258, + "sft_loss": 1.1806728839874268, + "step": 12620 + }, + { + "epoch": 0.98, + "grad_norm": 9.846895217895508, + "learning_rate": 8.02081229307472e-09, + "logits/chosen": -1.3026286363601685, + "logits/rejected": -0.7004513144493103, + "logps/chosen": -0.8980560302734375, + "logps/rejected": -7.850504398345947, + "loss": 0.9003, + "odds_ratio_loss": 0.022868018597364426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08980560302734375, + "rewards/margins": 0.6952449083328247, + "rewards/rejected": -0.7850505113601685, + "sft_loss": 0.8980560302734375, + "step": 12625 + }, + { + "epoch": 0.98, + "grad_norm": 9.495035171508789, + "learning_rate": 7.675960163840424e-09, + "logits/chosen": -1.2690961360931396, + "logits/rejected": -1.247907042503357, + "logps/chosen": -1.029133915901184, + "logps/rejected": -13.74725341796875, + "loss": 1.0295, + "odds_ratio_loss": 0.0034622892271727324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10291339457035065, + "rewards/margins": 1.271812081336975, + "rewards/rejected": -1.3747254610061646, + "sft_loss": 1.029133915901184, + "step": 12630 + }, + { + "epoch": 0.98, + "grad_norm": 12.372653007507324, + "learning_rate": 7.33867952786238e-09, + "logits/chosen": -1.395240068435669, + "logits/rejected": -0.935256838798523, + "logps/chosen": -1.8347313404083252, + "logps/rejected": -2.4637718200683594, + "loss": 1.9064, + "odds_ratio_loss": 0.7163336277008057, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1834731101989746, + "rewards/margins": 0.06290406733751297, + "rewards/rejected": -0.24637719988822937, + "sft_loss": 1.8347313404083252, + "step": 12635 + }, + { + "epoch": 0.98, + "grad_norm": 5.233946323394775, + "learning_rate": 7.008970896670298e-09, + "logits/chosen": -1.3666355609893799, + "logits/rejected": -0.8786395788192749, + "logps/chosen": -1.2400833368301392, + "logps/rejected": -15.411317825317383, + "loss": 1.2401, + "odds_ratio_loss": 0.0001732175296638161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12400834262371063, + "rewards/margins": 1.4171233177185059, + "rewards/rejected": -1.5411317348480225, + "sft_loss": 1.2400833368301392, + "step": 12640 + }, + { + "epoch": 0.98, + "grad_norm": 18.897790908813477, + "learning_rate": 6.686834770308626e-09, + "logits/chosen": -1.421450138092041, + "logits/rejected": -1.2205473184585571, + "logps/chosen": -0.8802854418754578, + "logps/rejected": -4.678915023803711, + "loss": 0.9042, + "odds_ratio_loss": 0.23880383372306824, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.08802854269742966, + "rewards/margins": 0.3798629939556122, + "rewards/rejected": -0.46789151430130005, + "sft_loss": 0.8802854418754578, + "step": 12645 + }, + { + "epoch": 0.98, + "grad_norm": 7.433767795562744, + "learning_rate": 6.372271637337668e-09, + "logits/chosen": -1.4291326999664307, + "logits/rejected": -0.8617070317268372, + "logps/chosen": -0.9746553301811218, + "logps/rejected": -4.9322404861450195, + "loss": 1.0212, + "odds_ratio_loss": 0.464984655380249, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09746553003787994, + "rewards/margins": 0.39575856924057007, + "rewards/rejected": -0.49322405457496643, + "sft_loss": 0.9746553301811218, + "step": 12650 + }, + { + "epoch": 0.98, + "grad_norm": 6.346502304077148, + "learning_rate": 6.065281974832471e-09, + "logits/chosen": -1.351488709449768, + "logits/rejected": -1.3281567096710205, + "logps/chosen": -0.6839212775230408, + "logps/rejected": -6.7796502113342285, + "loss": 0.685, + "odds_ratio_loss": 0.010401845909655094, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06839212775230408, + "rewards/margins": 0.6095728278160095, + "rewards/rejected": -0.6779649257659912, + "sft_loss": 0.6839212775230408, + "step": 12655 + }, + { + "epoch": 0.98, + "grad_norm": 46.9267692565918, + "learning_rate": 5.765866248381713e-09, + "logits/chosen": -1.465703010559082, + "logits/rejected": -1.3380768299102783, + "logps/chosen": -1.0040488243103027, + "logps/rejected": -4.003195762634277, + "loss": 1.0089, + "odds_ratio_loss": 0.048082135617733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10040488094091415, + "rewards/margins": 0.2999146580696106, + "rewards/rejected": -0.40031957626342773, + "sft_loss": 1.0040488243103027, + "step": 12660 + }, + { + "epoch": 0.99, + "grad_norm": 16.431318283081055, + "learning_rate": 5.474024912087151e-09, + "logits/chosen": -1.4906480312347412, + "logits/rejected": -0.8143842816352844, + "logps/chosen": -0.7565047144889832, + "logps/rejected": -2.544574022293091, + "loss": 0.876, + "odds_ratio_loss": 1.194966197013855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07565047591924667, + "rewards/margins": 0.17880688607692719, + "rewards/rejected": -0.25445738434791565, + "sft_loss": 0.7565047144889832, + "step": 12665 + }, + { + "epoch": 0.99, + "grad_norm": 10.983074188232422, + "learning_rate": 5.189758408564172e-09, + "logits/chosen": -1.4593555927276611, + "logits/rejected": -0.6856383681297302, + "logps/chosen": -0.977257251739502, + "logps/rejected": -6.1070733070373535, + "loss": 1.0098, + "odds_ratio_loss": 0.32559916377067566, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09772571921348572, + "rewards/margins": 0.5129815936088562, + "rewards/rejected": -0.6107073426246643, + "sft_loss": 0.977257251739502, + "step": 12670 + }, + { + "epoch": 0.99, + "grad_norm": 9.633749961853027, + "learning_rate": 4.913067168937913e-09, + "logits/chosen": -1.3621352910995483, + "logits/rejected": -1.0428334474563599, + "logps/chosen": -0.9668213725090027, + "logps/rejected": -9.47398853302002, + "loss": 0.9695, + "odds_ratio_loss": 0.02649303898215294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09668214619159698, + "rewards/margins": 0.850716769695282, + "rewards/rejected": -0.947398841381073, + "sft_loss": 0.9668213725090027, + "step": 12675 + }, + { + "epoch": 0.99, + "grad_norm": 6.018378257751465, + "learning_rate": 4.643951612846587e-09, + "logits/chosen": -1.4125245809555054, + "logits/rejected": -0.9760688543319702, + "logps/chosen": -1.1332448720932007, + "logps/rejected": -8.689168930053711, + "loss": 1.1377, + "odds_ratio_loss": 0.04467242211103439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11332448571920395, + "rewards/margins": 0.7555924654006958, + "rewards/rejected": -0.8689168691635132, + "sft_loss": 1.1332448720932007, + "step": 12680 + }, + { + "epoch": 0.99, + "grad_norm": 9.102438926696777, + "learning_rate": 4.382412148437598e-09, + "logits/chosen": -1.4320725202560425, + "logits/rejected": -0.9469528198242188, + "logps/chosen": -0.852150559425354, + "logps/rejected": -6.599529266357422, + "loss": 0.8543, + "odds_ratio_loss": 0.021591413766145706, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08521505445241928, + "rewards/margins": 0.5747378468513489, + "rewards/rejected": -0.6599529385566711, + "sft_loss": 0.852150559425354, + "step": 12685 + }, + { + "epoch": 0.99, + "grad_norm": 102.65242767333984, + "learning_rate": 4.1284491723686536e-09, + "logits/chosen": -1.406354308128357, + "logits/rejected": -0.9787223935127258, + "logps/chosen": -1.1560405492782593, + "logps/rejected": -7.624680519104004, + "loss": 1.1582, + "odds_ratio_loss": 0.021217485889792442, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11560405790805817, + "rewards/margins": 0.6468639969825745, + "rewards/rejected": -0.7624679803848267, + "sft_loss": 1.1560405492782593, + "step": 12690 + }, + { + "epoch": 0.99, + "grad_norm": 15.876679420471191, + "learning_rate": 3.882063069807762e-09, + "logits/chosen": -1.4680473804473877, + "logits/rejected": -1.2769049406051636, + "logps/chosen": -0.6791022419929504, + "logps/rejected": -4.149896621704102, + "loss": 0.6847, + "odds_ratio_loss": 0.05620497465133667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06791022419929504, + "rewards/margins": 0.34707945585250854, + "rewards/rejected": -0.4149896502494812, + "sft_loss": 0.6791022419929504, + "step": 12695 + }, + { + "epoch": 0.99, + "grad_norm": 9.385997772216797, + "learning_rate": 3.643254214429348e-09, + "logits/chosen": -1.3954238891601562, + "logits/rejected": -1.044518232345581, + "logps/chosen": -0.9343746900558472, + "logps/rejected": -6.008805751800537, + "loss": 0.9483, + "odds_ratio_loss": 0.13876792788505554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09343747794628143, + "rewards/margins": 0.5074431300163269, + "rewards/rejected": -0.6008806228637695, + "sft_loss": 0.9343746900558472, + "step": 12700 + }, + { + "epoch": 0.99, + "grad_norm": 13.203768730163574, + "learning_rate": 3.4120229684181384e-09, + "logits/chosen": -1.3236767053604126, + "logits/rejected": -0.831290602684021, + "logps/chosen": -0.8365150690078735, + "logps/rejected": -3.617612838745117, + "loss": 0.8571, + "odds_ratio_loss": 0.20549385249614716, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08365149796009064, + "rewards/margins": 0.2781098484992981, + "rewards/rejected": -0.36176127195358276, + "sft_loss": 0.8365150690078735, + "step": 12705 + }, + { + "epoch": 0.99, + "grad_norm": 11.924532890319824, + "learning_rate": 3.188369682466386e-09, + "logits/chosen": -1.4278162717819214, + "logits/rejected": -1.0526517629623413, + "logps/chosen": -1.2153667211532593, + "logps/rejected": -5.400894641876221, + "loss": 1.2309, + "odds_ratio_loss": 0.15487821400165558, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12153668701648712, + "rewards/margins": 0.41855278611183167, + "rewards/rejected": -0.54008948802948, + "sft_loss": 1.2153667211532593, + "step": 12710 + }, + { + "epoch": 0.99, + "grad_norm": 8.210156440734863, + "learning_rate": 2.9722946957710943e-09, + "logits/chosen": -1.2760294675827026, + "logits/rejected": -0.657126247882843, + "logps/chosen": -1.1431710720062256, + "logps/rejected": -6.945553779602051, + "loss": 1.1633, + "odds_ratio_loss": 0.20087885856628418, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.11431711912155151, + "rewards/margins": 0.5802382230758667, + "rewards/rejected": -0.6945552825927734, + "sft_loss": 1.1431710720062256, + "step": 12715 + }, + { + "epoch": 0.99, + "grad_norm": 8.233572006225586, + "learning_rate": 2.763798336039014e-09, + "logits/chosen": -1.2289212942123413, + "logits/rejected": -1.1867921352386475, + "logps/chosen": -0.7025913000106812, + "logps/rejected": -5.2682671546936035, + "loss": 0.711, + "odds_ratio_loss": 0.08458174765110016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07025913894176483, + "rewards/margins": 0.45656758546829224, + "rewards/rejected": -0.5268267393112183, + "sft_loss": 0.7025913000106812, + "step": 12720 + }, + { + "epoch": 0.99, + "grad_norm": 6.575437545776367, + "learning_rate": 2.562880919479982e-09, + "logits/chosen": -1.336478590965271, + "logits/rejected": -1.242974042892456, + "logps/chosen": -0.9741020202636719, + "logps/rejected": -3.7598636150360107, + "loss": 1.0127, + "odds_ratio_loss": 0.3857436180114746, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09741020947694778, + "rewards/margins": 0.27857616543769836, + "rewards/rejected": -0.37598639726638794, + "sft_loss": 0.9741020202636719, + "step": 12725 + }, + { + "epoch": 0.99, + "grad_norm": 7.393919467926025, + "learning_rate": 2.36954275081136e-09, + "logits/chosen": -1.261437177658081, + "logits/rejected": -1.0915019512176514, + "logps/chosen": -0.8143970370292664, + "logps/rejected": -1.4711533784866333, + "loss": 0.8482, + "odds_ratio_loss": 0.3381730616092682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08143971115350723, + "rewards/margins": 0.06567564606666565, + "rewards/rejected": -0.14711534976959229, + "sft_loss": 0.8143970370292664, + "step": 12730 + }, + { + "epoch": 0.99, + "grad_norm": 4.636721611022949, + "learning_rate": 2.1837841232552616e-09, + "logits/chosen": -1.4686650037765503, + "logits/rejected": -1.1045680046081543, + "logps/chosen": -1.0473394393920898, + "logps/rejected": -7.3457441329956055, + "loss": 1.0561, + "odds_ratio_loss": 0.08712659776210785, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10473394393920898, + "rewards/margins": 0.6298404932022095, + "rewards/rejected": -0.7345744371414185, + "sft_loss": 1.0473394393920898, + "step": 12735 + }, + { + "epoch": 0.99, + "grad_norm": 16.038543701171875, + "learning_rate": 2.0056053185379954e-09, + "logits/chosen": -1.3290282487869263, + "logits/rejected": -0.8544355630874634, + "logps/chosen": -1.1234945058822632, + "logps/rejected": -5.49116325378418, + "loss": 1.1377, + "odds_ratio_loss": 0.1421329826116562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11234945058822632, + "rewards/margins": 0.4367668628692627, + "rewards/rejected": -0.549116313457489, + "sft_loss": 1.1234945058822632, + "step": 12740 + }, + { + "epoch": 0.99, + "grad_norm": 6.807204246520996, + "learning_rate": 1.8350066068906213e-09, + "logits/chosen": -1.2573984861373901, + "logits/rejected": -0.9827863574028015, + "logps/chosen": -0.9654420614242554, + "logps/rejected": -21.103622436523438, + "loss": 0.9654, + "odds_ratio_loss": 6.937227590242401e-05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09654419869184494, + "rewards/margins": 2.0138182640075684, + "rewards/rejected": -2.1103625297546387, + "sft_loss": 0.9654420614242554, + "step": 12745 + }, + { + "epoch": 0.99, + "grad_norm": 8.133843421936035, + "learning_rate": 1.6719882470467297e-09, + "logits/chosen": -1.4547202587127686, + "logits/rejected": -1.3134218454360962, + "logps/chosen": -1.2230987548828125, + "logps/rejected": -8.175860404968262, + "loss": 1.2578, + "odds_ratio_loss": 0.34715619683265686, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12230987846851349, + "rewards/margins": 0.695276141166687, + "rewards/rejected": -0.8175861239433289, + "sft_loss": 1.2230987548828125, + "step": 12750 + }, + { + "epoch": 0.99, + "grad_norm": 22.136442184448242, + "learning_rate": 1.5165504862457713e-09, + "logits/chosen": -1.3403151035308838, + "logits/rejected": -0.894049346446991, + "logps/chosen": -1.0238540172576904, + "logps/rejected": -2.9002327919006348, + "loss": 1.0599, + "odds_ratio_loss": 0.36095088720321655, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10238540172576904, + "rewards/margins": 0.18763785064220428, + "rewards/rejected": -0.2900232672691345, + "sft_loss": 1.0238540172576904, + "step": 12755 + }, + { + "epoch": 0.99, + "grad_norm": 9.113795280456543, + "learning_rate": 1.3686935602280627e-09, + "logits/chosen": -1.173903226852417, + "logits/rejected": -1.0834901332855225, + "logps/chosen": -1.094962239265442, + "logps/rejected": -6.69094181060791, + "loss": 1.0959, + "odds_ratio_loss": 0.009845694527029991, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10949622094631195, + "rewards/margins": 0.559597909450531, + "rewards/rejected": -0.6690941452980042, + "sft_loss": 1.094962239265442, + "step": 12760 + }, + { + "epoch": 0.99, + "grad_norm": 9.03683853149414, + "learning_rate": 1.2284176932375601e-09, + "logits/chosen": -1.5034894943237305, + "logits/rejected": -1.175966501235962, + "logps/chosen": -0.694218635559082, + "logps/rejected": -5.125706672668457, + "loss": 0.7015, + "odds_ratio_loss": 0.0728861540555954, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06942186504602432, + "rewards/margins": 0.44314879179000854, + "rewards/rejected": -0.5125706791877747, + "sft_loss": 0.694218635559082, + "step": 12765 + }, + { + "epoch": 0.99, + "grad_norm": 6.062671184539795, + "learning_rate": 1.0957230980201961e-09, + "logits/chosen": -1.3591234683990479, + "logits/rejected": -0.9866539835929871, + "logps/chosen": -0.8312653303146362, + "logps/rejected": -9.86632251739502, + "loss": 0.8442, + "odds_ratio_loss": 0.129344180226326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08312653750181198, + "rewards/margins": 0.9035056829452515, + "rewards/rejected": -0.986632227897644, + "sft_loss": 0.8312653303146362, + "step": 12770 + }, + { + "epoch": 0.99, + "grad_norm": 5.2164225578308105, + "learning_rate": 9.706099758244325e-10, + "logits/chosen": -1.3940870761871338, + "logits/rejected": -0.8618147969245911, + "logps/chosen": -1.2448606491088867, + "logps/rejected": -6.871438503265381, + "loss": 1.3133, + "odds_ratio_loss": 0.6842848062515259, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.12448606640100479, + "rewards/margins": 0.5626577734947205, + "rewards/rejected": -0.6871439218521118, + "sft_loss": 1.2448606491088867, + "step": 12775 + }, + { + "epoch": 0.99, + "grad_norm": 3.143425226211548, + "learning_rate": 8.530785164001521e-10, + "logits/chosen": -1.4610567092895508, + "logits/rejected": -1.3560374975204468, + "logps/chosen": -1.2909905910491943, + "logps/rejected": -9.755168914794922, + "loss": 1.3604, + "odds_ratio_loss": 0.6936588883399963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1290990561246872, + "rewards/margins": 0.8464177846908569, + "rewards/rejected": -0.9755169153213501, + "sft_loss": 1.2909905910491943, + "step": 12780 + }, + { + "epoch": 0.99, + "grad_norm": 162.7854461669922, + "learning_rate": 7.431288979986572e-10, + "logits/chosen": -1.3986408710479736, + "logits/rejected": -1.0258430242538452, + "logps/chosen": -1.276390552520752, + "logps/rejected": -8.889037132263184, + "loss": 1.2909, + "odds_ratio_loss": 0.1447000950574875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1276390552520752, + "rewards/margins": 0.7612647414207458, + "rewards/rejected": -0.888903796672821, + "sft_loss": 1.276390552520752, + "step": 12785 + }, + { + "epoch": 0.99, + "grad_norm": 5.574721336364746, + "learning_rate": 6.407612873726709e-10, + "logits/chosen": -1.325803518295288, + "logits/rejected": -1.1525557041168213, + "logps/chosen": -0.9420539140701294, + "logps/rejected": -5.841198444366455, + "loss": 0.9465, + "odds_ratio_loss": 0.044614277780056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09420539438724518, + "rewards/margins": 0.4899144768714905, + "rewards/rejected": -0.5841198563575745, + "sft_loss": 0.9420539140701294, + "step": 12790 + }, + { + "epoch": 1.0, + "grad_norm": 6.682177543640137, + "learning_rate": 5.459758397757808e-10, + "logits/chosen": -1.3960912227630615, + "logits/rejected": -0.8625108599662781, + "logps/chosen": -0.9628890752792358, + "logps/rejected": -6.6562066078186035, + "loss": 1.0017, + "odds_ratio_loss": 0.3881911039352417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.09628890454769135, + "rewards/margins": 0.5693317651748657, + "rewards/rejected": -0.6656206846237183, + "sft_loss": 0.9628890752792358, + "step": 12795 + }, + { + "epoch": 1.0, + "grad_norm": 4.425645351409912, + "learning_rate": 4.5877269896132946e-10, + "logits/chosen": -1.4052798748016357, + "logits/rejected": -0.8202239274978638, + "logps/chosen": -1.0506846904754639, + "logps/rejected": -6.205660820007324, + "loss": 1.0597, + "odds_ratio_loss": 0.09048835933208466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10506846755743027, + "rewards/margins": 0.5154975652694702, + "rewards/rejected": -0.6205660700798035, + "sft_loss": 1.0506846904754639, + "step": 12800 + }, + { + "epoch": 1.0, + "grad_norm": 6.5811614990234375, + "learning_rate": 3.791519971851898e-10, + "logits/chosen": -1.3399574756622314, + "logits/rejected": -1.0786248445510864, + "logps/chosen": -1.1442134380340576, + "logps/rejected": -7.063590049743652, + "loss": 1.1558, + "odds_ratio_loss": 0.11629381030797958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11442134529352188, + "rewards/margins": 0.5919376611709595, + "rewards/rejected": -0.7063590288162231, + "sft_loss": 1.1442134380340576, + "step": 12805 + }, + { + "epoch": 1.0, + "grad_norm": 4.901683807373047, + "learning_rate": 3.071138552013242e-10, + "logits/chosen": -1.3089653253555298, + "logits/rejected": -0.7390233278274536, + "logps/chosen": -0.903420090675354, + "logps/rejected": -5.574804306030273, + "loss": 0.9193, + "odds_ratio_loss": 0.15836270153522491, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09034201502799988, + "rewards/margins": 0.4671383798122406, + "rewards/rejected": -0.5574804544448853, + "sft_loss": 0.903420090675354, + "step": 12810 + }, + { + "epoch": 1.0, + "grad_norm": 11.829771041870117, + "learning_rate": 2.426583822651152e-10, + "logits/chosen": -1.369470238685608, + "logits/rejected": -1.4664558172225952, + "logps/chosen": -0.9986074566841125, + "logps/rejected": -8.506292343139648, + "loss": 1.0125, + "odds_ratio_loss": 0.13853073120117188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09986075758934021, + "rewards/margins": 0.7507684826850891, + "rewards/rejected": -0.8506291508674622, + "sft_loss": 0.9986074566841125, + "step": 12815 + }, + { + "epoch": 1.0, + "grad_norm": 10.773012161254883, + "learning_rate": 1.8578567613114496e-10, + "logits/chosen": -1.3900362253189087, + "logits/rejected": -1.2793715000152588, + "logps/chosen": -0.8403071165084839, + "logps/rejected": -7.705872535705566, + "loss": 0.8411, + "odds_ratio_loss": 0.0077665760181844234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08403071761131287, + "rewards/margins": 0.6865564584732056, + "rewards/rejected": -0.7705872058868408, + "sft_loss": 0.8403071165084839, + "step": 12820 + }, + { + "epoch": 1.0, + "grad_norm": 6.064254283905029, + "learning_rate": 1.3649582305486076e-10, + "logits/chosen": -1.2912412881851196, + "logits/rejected": -0.9192991256713867, + "logps/chosen": -1.165502905845642, + "logps/rejected": -9.035804748535156, + "loss": 1.1662, + "odds_ratio_loss": 0.0065170153975486755, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11655030399560928, + "rewards/margins": 0.7870301008224487, + "rewards/rejected": -0.9035804867744446, + "sft_loss": 1.165502905845642, + "step": 12825 + }, + { + "epoch": 1.0, + "grad_norm": 14.42724609375, + "learning_rate": 9.478889778979927e-11, + "logits/chosen": -1.3533092737197876, + "logits/rejected": -1.2197463512420654, + "logps/chosen": -1.057451605796814, + "logps/rejected": -6.306844234466553, + "loss": 1.0679, + "odds_ratio_loss": 0.10441489517688751, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10574515908956528, + "rewards/margins": 0.5249393582344055, + "rewards/rejected": -0.6306844353675842, + "sft_loss": 1.057451605796814, + "step": 12830 + }, + { + "epoch": 1.0, + "grad_norm": 16.650434494018555, + "learning_rate": 6.066496358980712e-11, + "logits/chosen": -1.327243447303772, + "logits/rejected": -0.8279932141304016, + "logps/chosen": -0.8257233500480652, + "logps/rejected": -4.297021865844727, + "loss": 0.9023, + "odds_ratio_loss": 0.7656275033950806, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0825723335146904, + "rewards/margins": 0.34712985157966614, + "rewards/rejected": -0.42970219254493713, + "sft_loss": 0.8257233500480652, + "step": 12835 + }, + { + "epoch": 1.0, + "grad_norm": 219.4456024169922, + "learning_rate": 3.412407220904079e-11, + "logits/chosen": -1.287316918373108, + "logits/rejected": -1.2117503881454468, + "logps/chosen": -1.8838069438934326, + "logps/rejected": -7.961875915527344, + "loss": 1.9317, + "odds_ratio_loss": 0.47844308614730835, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18838071823120117, + "rewards/margins": 0.6078068614006042, + "rewards/rejected": -0.7961876392364502, + "sft_loss": 1.8838069438934326, + "step": 12840 + }, + { + "epoch": 1.0, + "grad_norm": 6.5753984451293945, + "learning_rate": 1.5166263899191182e-11, + "logits/chosen": -1.3301522731781006, + "logits/rejected": -1.3508708477020264, + "logps/chosen": -0.8103002309799194, + "logps/rejected": -11.35803508758545, + "loss": 0.8209, + "odds_ratio_loss": 0.10593117773532867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08103003352880478, + "rewards/margins": 1.0547735691070557, + "rewards/rejected": -1.135803461074829, + "sft_loss": 0.8103002309799194, + "step": 12845 + }, + { + "epoch": 1.0, + "grad_norm": 5.922697067260742, + "learning_rate": 3.7915674122590565e-12, + "logits/chosen": -1.4197337627410889, + "logits/rejected": -1.1119892597198486, + "logps/chosen": -0.8883382678031921, + "logps/rejected": -6.8627800941467285, + "loss": 0.9022, + "odds_ratio_loss": 0.1383415162563324, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08883383125066757, + "rewards/margins": 0.5974441766738892, + "rewards/rejected": -0.6862779855728149, + "sft_loss": 0.8883382678031921, + "step": 12850 + }, + { + "epoch": 1.0, + "grad_norm": 9.195427894592285, + "learning_rate": 0.0, + "logits/chosen": -1.2286322116851807, + "logits/rejected": -1.5984668731689453, + "logps/chosen": -1.4172906875610352, + "logps/rejected": -7.929937839508057, + "loss": 1.4268, + "odds_ratio_loss": 0.09547214210033417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14172907173633575, + "rewards/margins": 0.6512646675109863, + "rewards/rejected": -0.7929937243461609, + "sft_loss": 1.4172906875610352, + "step": 12855 + }, + { + "epoch": 1.0, + "step": 12855, + "total_flos": 2.8081461696357335e+18, + "train_loss": 1.0734075052920875, + "train_runtime": 27866.917, + "train_samples_per_second": 0.461, + "train_steps_per_second": 0.461 + } + ], + "logging_steps": 5, + "max_steps": 12855, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 250, + "total_flos": 2.8081461696357335e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}