{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 500, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.319148936170213e-08, "logits/chosen": 0.4053989052772522, "logits/rejected": 0.1312936246395111, "logps/chosen": -434.00537109375, "logps/rejected": -516.5983276367188, "loss": 0.1853, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": 0.15297521650791168, "logits/rejected": 0.29175662994384766, "logps/chosen": -365.80181884765625, "logps/rejected": -353.0853271484375, "loss": 0.2099, "rewards/accuracies": 0.25, "rewards/chosen": -0.0007080123177729547, "rewards/margins": -5.8396861277287826e-05, "rewards/rejected": -0.0006496154237538576, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.11968117952346802, "logits/rejected": 0.2041483372449875, "logps/chosen": -340.0993347167969, "logps/rejected": -348.33087158203125, "loss": 0.2094, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -0.000655159296002239, "rewards/margins": -8.313418220495805e-05, "rewards/rejected": -0.0005720251356251538, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": 0.2551038861274719, "logits/rejected": 0.25183868408203125, "logps/chosen": -383.1521301269531, "logps/rejected": -364.0672302246094, "loss": 0.2012, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.0005650260718539357, "rewards/margins": 0.00010353984544053674, "rewards/rejected": -0.0006685658590868115, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.1547292321920395, "logits/rejected": 0.27106207609176636, "logps/chosen": -401.61614990234375, "logps/rejected": -385.8863220214844, "loss": 0.2099, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.0006045111804269254, "rewards/margins": -5.9384223277447745e-05, "rewards/rejected": -0.0005451269680634141, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": 0.23326897621154785, "logits/rejected": 0.27433687448501587, "logps/chosen": -441.8401794433594, "logps/rejected": -432.41485595703125, "loss": 0.2047, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.000930719543248415, "rewards/margins": 0.000368706532754004, "rewards/rejected": -0.0012994259595870972, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.17064206302165985, "logits/rejected": 0.3185887336730957, "logps/chosen": -410.41473388671875, "logps/rejected": -414.3666076660156, "loss": 0.2182, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.001497046323493123, "rewards/margins": 0.00016530933498870581, "rewards/rejected": -0.0016623556148260832, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": 0.12393184751272202, "logits/rejected": 0.2235107123851776, "logps/chosen": -354.70562744140625, "logps/rejected": -356.94586181640625, "loss": 0.2086, "rewards/accuracies": 0.5, "rewards/chosen": -5.924403740209527e-05, "rewards/margins": 0.0009310436435043812, "rewards/rejected": -0.000990287633612752, "step": 70 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.14353762567043304, "logits/rejected": 0.2516772449016571, "logps/chosen": -392.6264343261719, "logps/rejected": -380.66351318359375, "loss": 0.208, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0002285484952153638, "rewards/margins": 0.001034508110024035, "rewards/rejected": -0.0008059596875682473, "step": 80 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": 0.24103212356567383, "logits/rejected": 0.1776101142168045, "logps/chosen": -393.3184509277344, "logps/rejected": -416.2762145996094, "loss": 0.1992, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0003849788336083293, "rewards/margins": 0.0017982361605390906, "rewards/rejected": -0.0021832147613167763, "step": 90 }, { "epoch": 0.11, "learning_rate": 4.999375059004058e-06, "logits/chosen": 0.16443544626235962, "logits/rejected": 0.17112873494625092, "logps/chosen": -416.6537170410156, "logps/rejected": -411.6963806152344, "loss": 0.2064, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.0005408526631072164, "rewards/margins": 0.0026028361171483994, "rewards/rejected": -0.002061983570456505, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.9955571065548795e-06, "logits/chosen": 0.2384149730205536, "logits/rejected": 0.1614537537097931, "logps/chosen": -406.7789306640625, "logps/rejected": -391.0703430175781, "loss": 0.2008, "rewards/accuracies": 0.5, "rewards/chosen": 0.012324010953307152, "rewards/margins": 0.0033186424989253283, "rewards/rejected": 0.009005369618535042, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9882736864879e-06, "logits/chosen": 0.08936997503042221, "logits/rejected": 0.25732293725013733, "logps/chosen": -397.0160827636719, "logps/rejected": -431.9867248535156, "loss": 0.2064, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01773521490395069, "rewards/margins": 0.007638473063707352, "rewards/rejected": 0.01009674184024334, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.977534912960124e-06, "logits/chosen": 0.14923642575740814, "logits/rejected": 0.27579236030578613, "logps/chosen": -407.21258544921875, "logps/rejected": -401.8697204589844, "loss": 0.2048, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.005354008637368679, "rewards/margins": 0.00832393579185009, "rewards/rejected": -0.0029699269216507673, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.963355698422092e-06, "logits/chosen": 0.13965365290641785, "logits/rejected": 0.20428553223609924, "logps/chosen": -396.0818786621094, "logps/rejected": -384.4440612792969, "loss": 0.2016, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.011929613538086414, "rewards/margins": 0.00782632827758789, "rewards/rejected": -0.01975594088435173, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.945755732909625e-06, "logits/chosen": 0.0017524458235129714, "logits/rejected": 0.048104483634233475, "logps/chosen": -403.7103576660156, "logps/rejected": -421.3060607910156, "loss": 0.1918, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.020950669422745705, "rewards/margins": 0.01481578964740038, "rewards/rejected": -0.03576646000146866, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.924759456701167e-06, "logits/chosen": 0.050279758870601654, "logits/rejected": 0.12556883692741394, "logps/chosen": -467.9580993652344, "logps/rejected": -487.71844482421875, "loss": 0.1868, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.044767118990421295, "rewards/margins": 0.03162100166082382, "rewards/rejected": -0.07638812065124512, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.900396026378671e-06, "logits/chosen": -0.020991306751966476, "logits/rejected": 0.15817420184612274, "logps/chosen": -522.8843383789062, "logps/rejected": -518.1360473632812, "loss": 0.2105, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.08218502998352051, "rewards/margins": 0.02223752811551094, "rewards/rejected": -0.10442256927490234, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.872699274339169e-06, "logits/chosen": 0.08929436653852463, "logits/rejected": 0.09290768206119537, "logps/chosen": -470.04296875, "logps/rejected": -501.46661376953125, "loss": 0.1886, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08902844041585922, "rewards/margins": 0.03190689533948898, "rewards/rejected": -0.12093535810709, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.8417076618132434e-06, "logits/chosen": 0.017135417088866234, "logits/rejected": 0.09486501663923264, "logps/chosen": -600.1754760742188, "logps/rejected": -609.9652709960938, "loss": 0.1887, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11600615829229355, "rewards/margins": 0.041202057152986526, "rewards/rejected": -0.15720821917057037, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.807464225455655e-06, "logits/chosen": -0.032647065818309784, "logits/rejected": 0.09240031987428665, "logps/chosen": -527.2655029296875, "logps/rejected": -595.9906005859375, "loss": 0.1949, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1231132298707962, "rewards/margins": 0.051538724452257156, "rewards/rejected": -0.17465195059776306, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.770016517582283e-06, "logits/chosen": 0.03914088383316994, "logits/rejected": 0.028707262128591537, "logps/chosen": -524.7379760742188, "logps/rejected": -570.7955322265625, "loss": 0.1902, "rewards/accuracies": 0.53125, "rewards/chosen": -0.12313251197338104, "rewards/margins": 0.04237721115350723, "rewards/rejected": -0.16550973057746887, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.7294165401363616e-06, "logits/chosen": 0.010909264907240868, "logits/rejected": -0.024190250784158707, "logps/chosen": -549.97607421875, "logps/rejected": -590.9778442382812, "loss": 0.1843, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14806947112083435, "rewards/margins": 0.04638643562793732, "rewards/rejected": -0.19445592164993286, "step": 220 }, { "epoch": 0.25, "learning_rate": 4.68572067247573e-06, "logits/chosen": -0.018162641674280167, "logits/rejected": 0.000972352921962738, "logps/chosen": -549.0392456054688, "logps/rejected": -598.3811645507812, "loss": 0.2025, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1491001546382904, "rewards/margins": 0.04847537726163864, "rewards/rejected": -0.19757553935050964, "step": 230 }, { "epoch": 0.26, "learning_rate": 4.638989593081364e-06, "logits/chosen": -0.12062356621026993, "logits/rejected": 0.04868536815047264, "logps/chosen": -484.58685302734375, "logps/rejected": -516.2865600585938, "loss": 0.1908, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1070113405585289, "rewards/margins": 0.028431424871087074, "rewards/rejected": -0.13544276356697083, "step": 240 }, { "epoch": 0.27, "learning_rate": 4.5892881952959015e-06, "logits/chosen": -0.041638366878032684, "logits/rejected": 0.0221172496676445, "logps/chosen": -507.4730529785156, "logps/rejected": -527.9345703125, "loss": 0.2052, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09331385791301727, "rewards/margins": 0.03461749479174614, "rewards/rejected": -0.1279313564300537, "step": 250 }, { "epoch": 0.28, "learning_rate": 4.536685497209182e-06, "logits/chosen": -0.05273251608014107, "logits/rejected": -0.022044766694307327, "logps/chosen": -538.548583984375, "logps/rejected": -590.6500854492188, "loss": 0.182, "rewards/accuracies": 0.5, "rewards/chosen": -0.10730306804180145, "rewards/margins": 0.039316385984420776, "rewards/rejected": -0.14661946892738342, "step": 260 }, { "epoch": 0.29, "learning_rate": 4.481254545815943e-06, "logits/chosen": -0.12279339134693146, "logits/rejected": -0.079288050532341, "logps/chosen": -560.711181640625, "logps/rejected": -635.7985229492188, "loss": 0.1845, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.12682856619358063, "rewards/margins": 0.0557611808180809, "rewards/rejected": -0.18258973956108093, "step": 270 }, { "epoch": 0.3, "learning_rate": 4.42307231557875e-06, "logits/chosen": -0.12978403270244598, "logits/rejected": -0.05718718096613884, "logps/chosen": -534.7046508789062, "logps/rejected": -573.9546508789062, "loss": 0.1872, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.12655052542686462, "rewards/margins": 0.05336705967783928, "rewards/rejected": -0.1799176186323166, "step": 280 }, { "epoch": 0.31, "learning_rate": 4.3622196015370305e-06, "logits/chosen": -0.13656684756278992, "logits/rejected": -0.07923261821269989, "logps/chosen": -537.6559448242188, "logps/rejected": -627.1619873046875, "loss": 0.1952, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13470812141895294, "rewards/margins": 0.05948293209075928, "rewards/rejected": -0.1941910684108734, "step": 290 }, { "epoch": 0.32, "learning_rate": 4.298780907110648e-06, "logits/chosen": -0.11429516226053238, "logits/rejected": -0.12869636714458466, "logps/chosen": -543.2788696289062, "logps/rejected": -558.6578369140625, "loss": 0.1847, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.12002478539943695, "rewards/margins": 0.050586897879838943, "rewards/rejected": -0.1706116795539856, "step": 300 }, { "epoch": 0.33, "learning_rate": 4.23284432675381e-06, "logits/chosen": -0.18651030957698822, "logits/rejected": -0.052459727972745895, "logps/chosen": -461.1847229003906, "logps/rejected": -506.2823181152344, "loss": 0.1928, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.08158674091100693, "rewards/margins": 0.045484792441129684, "rewards/rejected": -0.1270715296268463, "step": 310 }, { "epoch": 0.34, "learning_rate": 4.164501423622277e-06, "logits/chosen": -0.1330818384885788, "logits/rejected": -0.09265539795160294, "logps/chosen": -483.45599365234375, "logps/rejected": -515.7194213867188, "loss": 0.1797, "rewards/accuracies": 0.53125, "rewards/chosen": -0.10115663707256317, "rewards/margins": 0.04430658370256424, "rewards/rejected": -0.14546321332454681, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.0938471024237355e-06, "logits/chosen": -0.11196194589138031, "logits/rejected": -0.09686783701181412, "logps/chosen": -557.8707885742188, "logps/rejected": -571.39794921875, "loss": 0.1958, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.12075225263834, "rewards/margins": 0.04514995589852333, "rewards/rejected": -0.16590221226215363, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.020979477627907e-06, "logits/chosen": -0.08174435794353485, "logits/rejected": -0.06923134624958038, "logps/chosen": -531.8570556640625, "logps/rejected": -605.0074462890625, "loss": 0.1889, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11230266094207764, "rewards/margins": 0.06913084536790848, "rewards/rejected": -0.18143349885940552, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.9459997372194105e-06, "logits/chosen": -0.16061343252658844, "logits/rejected": -0.027816006913781166, "logps/chosen": -534.1594848632812, "logps/rejected": -573.8477783203125, "loss": 0.1926, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0970136970281601, "rewards/margins": 0.045039448887109756, "rewards/rejected": -0.14205312728881836, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.869012002182573e-06, "logits/chosen": -0.24527081847190857, "logits/rejected": -0.1484527587890625, "logps/chosen": -544.539306640625, "logps/rejected": -564.9341430664062, "loss": 0.1859, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.09417351335287094, "rewards/margins": 0.056557249277830124, "rewards/rejected": -0.15073075890541077, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.7901231819133104e-06, "logits/chosen": -0.1722763478755951, "logits/rejected": -0.17130622267723083, "logps/chosen": -496.27313232421875, "logps/rejected": -557.1398315429688, "loss": 0.1877, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0965786799788475, "rewards/margins": 0.057069409638643265, "rewards/rejected": -0.15364809334278107, "step": 370 }, { "epoch": 0.41, "learning_rate": 3.709442825758875e-06, "logits/chosen": -0.286950945854187, "logits/rejected": -0.12660877406597137, "logps/chosen": -487.8304138183594, "logps/rejected": -506.80267333984375, "loss": 0.1784, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.08378596603870392, "rewards/margins": 0.04325443506240845, "rewards/rejected": -0.12704041600227356, "step": 380 }, { "epoch": 0.42, "learning_rate": 3.6270829708916113e-06, "logits/chosen": -0.2721463441848755, "logits/rejected": -0.19791728258132935, "logps/chosen": -525.049560546875, "logps/rejected": -564.6629028320312, "loss": 0.1924, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1035178154706955, "rewards/margins": 0.03107512556016445, "rewards/rejected": -0.1345929503440857, "step": 390 }, { "epoch": 0.43, "learning_rate": 3.543157986727991e-06, "logits/chosen": -0.17590856552124023, "logits/rejected": -0.16738948225975037, "logps/chosen": -520.5001831054688, "logps/rejected": -564.5961303710938, "loss": 0.1854, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09734812378883362, "rewards/margins": 0.0474289208650589, "rewards/rejected": -0.14477702975273132, "step": 400 }, { "epoch": 0.44, "learning_rate": 3.4577844161089614e-06, "logits/chosen": -0.17745746672153473, "logits/rejected": -0.18353696167469025, "logps/chosen": -508.34637451171875, "logps/rejected": -577.1897583007812, "loss": 0.1804, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0865376815199852, "rewards/margins": 0.049374908208847046, "rewards/rejected": -0.13591258227825165, "step": 410 }, { "epoch": 0.45, "learning_rate": 3.3710808134621577e-06, "logits/chosen": -0.17098669707775116, "logits/rejected": -0.13703958690166473, "logps/chosen": -539.40087890625, "logps/rejected": -593.8014526367188, "loss": 0.1851, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09274087101221085, "rewards/margins": 0.05765017122030258, "rewards/rejected": -0.15039105713367462, "step": 420 }, { "epoch": 0.46, "learning_rate": 3.2831675801707126e-06, "logits/chosen": -0.20213007926940918, "logits/rejected": -0.20745894312858582, "logps/chosen": -453.65478515625, "logps/rejected": -497.0008850097656, "loss": 0.1824, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08567062765359879, "rewards/margins": 0.04529280215501785, "rewards/rejected": -0.13096341490745544, "step": 430 }, { "epoch": 0.47, "learning_rate": 3.194166797377289e-06, "logits/chosen": -0.21449732780456543, "logits/rejected": -0.19523288309574127, "logps/chosen": -547.9935302734375, "logps/rejected": -572.2437744140625, "loss": 0.1901, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09648506343364716, "rewards/margins": 0.03453432396054268, "rewards/rejected": -0.13101938366889954, "step": 440 }, { "epoch": 0.48, "learning_rate": 3.104202056455501e-06, "logits/chosen": -0.22678379714488983, "logits/rejected": -0.18668214976787567, "logps/chosen": -519.3316650390625, "logps/rejected": -561.0910034179688, "loss": 0.1896, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09367823600769043, "rewards/margins": 0.058413583785295486, "rewards/rejected": -0.15209180116653442, "step": 450 }, { "epoch": 0.49, "learning_rate": 3.013398287384144e-06, "logits/chosen": -0.20922398567199707, "logits/rejected": -0.15190599858760834, "logps/chosen": -554.7764892578125, "logps/rejected": -584.9015502929688, "loss": 0.1777, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11632993072271347, "rewards/margins": 0.048789944499731064, "rewards/rejected": -0.16511985659599304, "step": 460 }, { "epoch": 0.5, "learning_rate": 2.9218815852625717e-06, "logits/chosen": -0.2042142152786255, "logits/rejected": -0.19644713401794434, "logps/chosen": -522.6699829101562, "logps/rejected": -589.4488525390625, "loss": 0.189, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.10839296877384186, "rewards/margins": 0.06792866438627243, "rewards/rejected": -0.17632164061069489, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.829779035208113e-06, "logits/chosen": -0.29581087827682495, "logits/rejected": -0.17288121581077576, "logps/chosen": -492.73297119140625, "logps/rejected": -565.6483764648438, "loss": 0.1819, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10467328131198883, "rewards/margins": 0.0681912824511528, "rewards/rejected": -0.17286454141139984, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.737218535878705e-06, "logits/chosen": -0.1768864393234253, "logits/rejected": -0.19145308434963226, "logps/chosen": -481.3701171875, "logps/rejected": -552.0697021484375, "loss": 0.1861, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10367073863744736, "rewards/margins": 0.0636112317442894, "rewards/rejected": -0.16728197038173676, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.64432862186579e-06, "logits/chosen": -0.25040799379348755, "logits/rejected": -0.2705633044242859, "logps/chosen": -473.15777587890625, "logps/rejected": -521.9263916015625, "loss": 0.1845, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.10329292714595795, "rewards/margins": 0.04386230558156967, "rewards/rejected": -0.1471552550792694, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.551238285204126e-06, "logits/chosen": -0.22839005291461945, "logits/rejected": -0.18522998690605164, "logps/chosen": -562.2581176757812, "logps/rejected": -602.7523193359375, "loss": 0.1852, "rewards/accuracies": 0.5, "rewards/chosen": -0.11718226969242096, "rewards/margins": 0.054385870695114136, "rewards/rejected": -0.1715681403875351, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.4580767962463688e-06, "logits/chosen": -0.28231528401374817, "logits/rejected": -0.1746218502521515, "logps/chosen": -508.0462951660156, "logps/rejected": -521.466552734375, "loss": 0.1887, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.0966549962759018, "rewards/margins": 0.04605900123715401, "rewards/rejected": -0.1427139937877655, "step": 520 }, { "epoch": 0.57, "learning_rate": 2.3649735241511546e-06, "logits/chosen": -0.14483687281608582, "logits/rejected": -0.18159925937652588, "logps/chosen": -519.6622314453125, "logps/rejected": -554.4771728515625, "loss": 0.1881, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10754968971014023, "rewards/margins": 0.043608419597148895, "rewards/rejected": -0.15115809440612793, "step": 530 }, { "epoch": 0.58, "learning_rate": 2.2720577572339914e-06, "logits/chosen": -0.27724790573120117, "logits/rejected": -0.18303519487380981, "logps/chosen": -502.09747314453125, "logps/rejected": -529.7732543945312, "loss": 0.1902, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.0977000966668129, "rewards/margins": 0.04301925003528595, "rewards/rejected": -0.14071933925151825, "step": 540 }, { "epoch": 0.59, "learning_rate": 2.1794585234303995e-06, "logits/chosen": -0.2885403633117676, "logits/rejected": -0.16289584338665009, "logps/chosen": -519.3963012695312, "logps/rejected": -553.4032592773438, "loss": 0.1871, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10111876577138901, "rewards/margins": 0.040404774248600006, "rewards/rejected": -0.1415235549211502, "step": 550 }, { "epoch": 0.6, "learning_rate": 2.0873044111206407e-06, "logits/chosen": -0.23527821898460388, "logits/rejected": -0.2247372567653656, "logps/chosen": -481.41552734375, "logps/rejected": -552.4132080078125, "loss": 0.2026, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10331599414348602, "rewards/margins": 0.040959432721138, "rewards/rejected": -0.14427544176578522, "step": 560 }, { "epoch": 0.61, "learning_rate": 1.9957233905648293e-06, "logits/chosen": -0.28348255157470703, "logits/rejected": -0.26194503903388977, "logps/chosen": -467.77740478515625, "logps/rejected": -507.6676330566406, "loss": 0.1887, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10089198499917984, "rewards/margins": 0.0405060276389122, "rewards/rejected": -0.14139802753925323, "step": 570 }, { "epoch": 0.62, "learning_rate": 1.904842636196402e-06, "logits/chosen": -0.22403912246227264, "logits/rejected": -0.19076624512672424, "logps/chosen": -500.50982666015625, "logps/rejected": -544.9527587890625, "loss": 0.1793, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09000807255506516, "rewards/margins": 0.05757290869951248, "rewards/rejected": -0.14758098125457764, "step": 580 }, { "epoch": 0.63, "learning_rate": 1.814788350020726e-06, "logits/chosen": -0.25425633788108826, "logits/rejected": -0.13311608135700226, "logps/chosen": -523.557373046875, "logps/rejected": -576.8714599609375, "loss": 0.1667, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09282426536083221, "rewards/margins": 0.06447537243366241, "rewards/rejected": -0.15729963779449463, "step": 590 }, { "epoch": 0.64, "learning_rate": 1.725685586364051e-06, "logits/chosen": -0.25314217805862427, "logits/rejected": -0.2236749678850174, "logps/chosen": -442.9320373535156, "logps/rejected": -521.5167236328125, "loss": 0.1816, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.08528304100036621, "rewards/margins": 0.05453087016940117, "rewards/rejected": -0.13981391489505768, "step": 600 }, { "epoch": 0.65, "learning_rate": 1.6376580782162172e-06, "logits/chosen": -0.2589682936668396, "logits/rejected": -0.2686694264411926, "logps/chosen": -501.1578674316406, "logps/rejected": -545.9219970703125, "loss": 0.1949, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09304684400558472, "rewards/margins": 0.06486930698156357, "rewards/rejected": -0.1579161435365677, "step": 610 }, { "epoch": 0.66, "learning_rate": 1.550828065408227e-06, "logits/chosen": -0.15998974442481995, "logits/rejected": -0.26897841691970825, "logps/chosen": -483.9093322753906, "logps/rejected": -574.64990234375, "loss": 0.1784, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0897846445441246, "rewards/margins": 0.059298910200595856, "rewards/rejected": -0.14908355474472046, "step": 620 }, { "epoch": 0.67, "learning_rate": 1.4653161248633053e-06, "logits/chosen": -0.30697402358055115, "logits/rejected": -0.2922336459159851, "logps/chosen": -446.08642578125, "logps/rejected": -485.38311767578125, "loss": 0.1835, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.07366035133600235, "rewards/margins": 0.05683215707540512, "rewards/rejected": -0.13049249351024628, "step": 630 }, { "epoch": 0.68, "learning_rate": 1.381241003157162e-06, "logits/chosen": -0.27867692708969116, "logits/rejected": -0.23723456263542175, "logps/chosen": -472.146240234375, "logps/rejected": -522.3912963867188, "loss": 0.1886, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.08290112018585205, "rewards/margins": 0.06648631393909454, "rewards/rejected": -0.1493874490261078, "step": 640 }, { "epoch": 0.69, "learning_rate": 1.298719451619979e-06, "logits/chosen": -0.27338069677352905, "logits/rejected": -0.0849432423710823, "logps/chosen": -505.39404296875, "logps/rejected": -564.4884643554688, "loss": 0.1769, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0819827988743782, "rewards/margins": 0.07497727125883102, "rewards/rejected": -0.15696007013320923, "step": 650 }, { "epoch": 0.7, "learning_rate": 1.2178660642091036e-06, "logits/chosen": -0.31306496262550354, "logits/rejected": -0.15988986194133759, "logps/chosen": -536.2052001953125, "logps/rejected": -565.6595458984375, "loss": 0.1917, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10776883363723755, "rewards/margins": 0.04764767736196518, "rewards/rejected": -0.15541651844978333, "step": 660 }, { "epoch": 0.71, "learning_rate": 1.1387931183775821e-06, "logits/chosen": -0.1312873661518097, "logits/rejected": -0.1946374773979187, "logps/chosen": -489.32037353515625, "logps/rejected": -532.713623046875, "loss": 0.1923, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10007087886333466, "rewards/margins": 0.0466584786772728, "rewards/rejected": -0.14672937989234924, "step": 670 }, { "epoch": 0.73, "learning_rate": 1.061610419159532e-06, "logits/chosen": -0.18406830728054047, "logits/rejected": -0.18264801800251007, "logps/chosen": -455.0455627441406, "logps/rejected": -483.98748779296875, "loss": 0.187, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.08491896092891693, "rewards/margins": 0.042202599346637726, "rewards/rejected": -0.12712153792381287, "step": 680 }, { "epoch": 0.74, "learning_rate": 9.864251466888364e-07, "logits/chosen": -0.2591504454612732, "logits/rejected": -0.1554795801639557, "logps/chosen": -488.55328369140625, "logps/rejected": -532.9073486328125, "loss": 0.1807, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.09263734519481659, "rewards/margins": 0.04989578202366829, "rewards/rejected": -0.1425331085920334, "step": 690 }, { "epoch": 0.75, "learning_rate": 9.133417073629288e-07, "logits/chosen": -0.28501999378204346, "logits/rejected": -0.23185932636260986, "logps/chosen": -485.5430603027344, "logps/rejected": -541.1561889648438, "loss": 0.1604, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09013941138982773, "rewards/margins": 0.051737189292907715, "rewards/rejected": -0.14187659323215485, "step": 700 }, { "epoch": 0.76, "learning_rate": 8.424615888583332e-07, "logits/chosen": -0.25448185205459595, "logits/rejected": -0.13845598697662354, "logps/chosen": -504.44085693359375, "logps/rejected": -557.7171630859375, "loss": 0.1875, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09454745799303055, "rewards/margins": 0.0519348680973053, "rewards/rejected": -0.14648231863975525, "step": 710 }, { "epoch": 0.77, "learning_rate": 7.738832191993092e-07, "logits/chosen": -0.20559599995613098, "logits/rejected": -0.1910923421382904, "logps/chosen": -518.5410766601562, "logps/rejected": -559.4114379882812, "loss": 0.1781, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09824297577142715, "rewards/margins": 0.051730893552303314, "rewards/rejected": -0.14997386932373047, "step": 720 }, { "epoch": 0.78, "learning_rate": 7.077018300752917e-07, "logits/chosen": -0.20453593134880066, "logits/rejected": -0.22350621223449707, "logps/chosen": -517.8855590820312, "logps/rejected": -554.9312744140625, "loss": 0.173, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0931503102183342, "rewards/margins": 0.059033893048763275, "rewards/rejected": -0.15218421816825867, "step": 730 }, { "epoch": 0.79, "learning_rate": 6.440093245969342e-07, "logits/chosen": -0.24915683269500732, "logits/rejected": -0.14556431770324707, "logps/chosen": -458.88006591796875, "logps/rejected": -483.50872802734375, "loss": 0.1791, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0776047632098198, "rewards/margins": 0.055497486144304276, "rewards/rejected": -0.13310226798057556, "step": 740 }, { "epoch": 0.8, "learning_rate": 5.828941496744075e-07, "logits/chosen": -0.22538790106773376, "logits/rejected": -0.16318151354789734, "logps/chosen": -516.1492309570312, "logps/rejected": -586.5814208984375, "loss": 0.1829, "rewards/accuracies": 0.5625, "rewards/chosen": -0.09301020205020905, "rewards/margins": 0.06428654491901398, "rewards/rejected": -0.15729674696922302, "step": 750 }, { "epoch": 0.81, "learning_rate": 5.244411731951671e-07, "logits/chosen": -0.2806158661842346, "logits/rejected": -0.066395103931427, "logps/chosen": -479.8614807128906, "logps/rejected": -503.1717224121094, "loss": 0.1733, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.09291915595531464, "rewards/margins": 0.04909076914191246, "rewards/rejected": -0.1420099288225174, "step": 760 }, { "epoch": 0.82, "learning_rate": 4.6873156617173594e-07, "logits/chosen": -0.325612872838974, "logits/rejected": -0.22937624156475067, "logps/chosen": -526.00341796875, "logps/rejected": -547.8123168945312, "loss": 0.1746, "rewards/accuracies": 0.5, "rewards/chosen": -0.09224705398082733, "rewards/margins": 0.054325349628925323, "rewards/rejected": -0.14657239615917206, "step": 770 }, { "epoch": 0.83, "learning_rate": 4.1584269002318653e-07, "logits/chosen": -0.2622816264629364, "logits/rejected": -0.15280409157276154, "logps/chosen": -533.4638061523438, "logps/rejected": -595.7601318359375, "loss": 0.1815, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09688106924295425, "rewards/margins": 0.05785801261663437, "rewards/rejected": -0.15473909676074982, "step": 780 }, { "epoch": 0.84, "learning_rate": 3.658479891468258e-07, "logits/chosen": -0.18345573544502258, "logits/rejected": -0.16111025214195251, "logps/chosen": -487.77886962890625, "logps/rejected": -538.51171875, "loss": 0.184, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08573255687952042, "rewards/margins": 0.0516083724796772, "rewards/rejected": -0.13734093308448792, "step": 790 }, { "epoch": 0.85, "learning_rate": 3.18816888929272e-07, "logits/chosen": -0.2816532254219055, "logits/rejected": -0.15862765908241272, "logps/chosen": -512.4444580078125, "logps/rejected": -536.9342651367188, "loss": 0.1844, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.09420600533485413, "rewards/margins": 0.05614888668060303, "rewards/rejected": -0.15035490691661835, "step": 800 }, { "epoch": 0.86, "learning_rate": 2.748146993385484e-07, "logits/chosen": -0.21904349327087402, "logits/rejected": -0.258176326751709, "logps/chosen": -549.0933837890625, "logps/rejected": -618.8422241210938, "loss": 0.18, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10313485562801361, "rewards/margins": 0.07112576067447662, "rewards/rejected": -0.17426061630249023, "step": 810 }, { "epoch": 0.87, "learning_rate": 2.3390252423108077e-07, "logits/chosen": -0.3128640353679657, "logits/rejected": -0.10742131620645523, "logps/chosen": -525.4251098632812, "logps/rejected": -553.0595092773438, "loss": 0.1734, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08687300980091095, "rewards/margins": 0.06989692151546478, "rewards/rejected": -0.15676993131637573, "step": 820 }, { "epoch": 0.89, "learning_rate": 1.961371764995243e-07, "logits/chosen": -0.1870919167995453, "logits/rejected": -0.18910066783428192, "logps/chosen": -514.9647827148438, "logps/rejected": -548.2384643554688, "loss": 0.182, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0961606353521347, "rewards/margins": 0.05352962762117386, "rewards/rejected": -0.14969027042388916, "step": 830 }, { "epoch": 0.9, "learning_rate": 1.61571099179261e-07, "logits/chosen": -0.31641727685928345, "logits/rejected": -0.21929411590099335, "logps/chosen": -452.8780212402344, "logps/rejected": -497.7808532714844, "loss": 0.1887, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08406294882297516, "rewards/margins": 0.06199796125292778, "rewards/rejected": -0.14606089890003204, "step": 840 }, { "epoch": 0.91, "learning_rate": 1.3025229262312367e-07, "logits/chosen": -0.2824193239212036, "logits/rejected": -0.2219020426273346, "logps/chosen": -516.4136962890625, "logps/rejected": -553.1406860351562, "loss": 0.1761, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09720293432474136, "rewards/margins": 0.046975888311862946, "rewards/rejected": -0.1441788375377655, "step": 850 }, { "epoch": 0.92, "learning_rate": 1.0222424784546853e-07, "logits/chosen": -0.09483526647090912, "logits/rejected": -0.24578902125358582, "logps/chosen": -496.60028076171875, "logps/rejected": -558.0816650390625, "loss": 0.175, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0964212641119957, "rewards/margins": 0.050969939678907394, "rewards/rejected": -0.14739122986793518, "step": 860 }, { "epoch": 0.93, "learning_rate": 7.752588612816553e-08, "logits/chosen": -0.20135729014873505, "logits/rejected": -0.25128036737442017, "logps/chosen": -545.1806640625, "logps/rejected": -568.894775390625, "loss": 0.1815, "rewards/accuracies": 0.5, "rewards/chosen": -0.09586036950349808, "rewards/margins": 0.04870045185089111, "rewards/rejected": -0.1445608288049698, "step": 870 }, { "epoch": 0.94, "learning_rate": 5.619150497236991e-08, "logits/chosen": -0.2596682012081146, "logits/rejected": -0.22516381740570068, "logps/chosen": -498.10028076171875, "logps/rejected": -564.5962524414062, "loss": 0.1708, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09488777071237564, "rewards/margins": 0.06974340975284576, "rewards/rejected": -0.164631187915802, "step": 880 }, { "epoch": 0.95, "learning_rate": 3.825073047112743e-08, "logits/chosen": -0.2944473624229431, "logits/rejected": -0.2125546932220459, "logps/chosen": -502.841552734375, "logps/rejected": -546.7924194335938, "loss": 0.1826, "rewards/accuracies": 0.5, "rewards/chosen": -0.08710362762212753, "rewards/margins": 0.059937745332717896, "rewards/rejected": -0.14704139530658722, "step": 890 }, { "epoch": 0.96, "learning_rate": 2.372847616895685e-08, "logits/chosen": -0.2098701447248459, "logits/rejected": -0.2635635733604431, "logps/chosen": -481.09478759765625, "logps/rejected": -569.34326171875, "loss": 0.1763, "rewards/accuracies": 0.5, "rewards/chosen": -0.09881605207920074, "rewards/margins": 0.06372065842151642, "rewards/rejected": -0.16253669559955597, "step": 900 }, { "epoch": 0.97, "learning_rate": 1.264490846553279e-08, "logits/chosen": -0.20950980484485626, "logits/rejected": -0.2304944545030594, "logps/chosen": -508.41241455078125, "logps/rejected": -574.2259521484375, "loss": 0.1727, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0780680924654007, "rewards/margins": 0.06727245450019836, "rewards/rejected": -0.14534054696559906, "step": 910 }, { "epoch": 0.98, "learning_rate": 5.015418611516165e-09, "logits/chosen": -0.2829793095588684, "logits/rejected": -0.2987596392631531, "logps/chosen": -475.51031494140625, "logps/rejected": -521.2598876953125, "loss": 0.1865, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09187036752700806, "rewards/margins": 0.06315977871417999, "rewards/rejected": -0.15503014624118805, "step": 920 }, { "epoch": 0.99, "learning_rate": 8.506013354186993e-10, "logits/chosen": -0.23163847625255585, "logits/rejected": -0.24427077174186707, "logps/chosen": -477.51507568359375, "logps/rejected": -542.298828125, "loss": 0.1848, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0930924192070961, "rewards/margins": 0.06024498865008354, "rewards/rejected": -0.15333738923072815, "step": 930 }, { "epoch": 1.0, "step": 937, "total_flos": 0.0, "train_loss": 0.1881250925163322, "train_runtime": 7837.4153, "train_samples_per_second": 3.828, "train_steps_per_second": 0.12 } ], "logging_steps": 10, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }