{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.6595744680851065e-08, "logits/chosen": 0.4583740830421448, "logits/rejected": 0.45381295680999756, "logps/chosen": -403.16717529296875, "logps/rejected": -354.3865051269531, "loss": 0.1853, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.6595744680851066e-07, "logits/chosen": 0.18831893801689148, "logits/rejected": 0.16829822957515717, "logps/chosen": -401.88055419921875, "logps/rejected": -396.11865234375, "loss": 0.2155, "rewards/accuracies": 0.2638888955116272, "rewards/chosen": -0.000744127610232681, "rewards/margins": -8.914418140193447e-05, "rewards/rejected": -0.0006549834506586194, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.319148936170213e-07, "logits/chosen": 0.1669415831565857, "logits/rejected": 0.27017873525619507, "logps/chosen": -453.92071533203125, "logps/rejected": -443.1700744628906, "loss": 0.2137, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.0007682474097236991, "rewards/margins": -1.9114029782940634e-05, "rewards/rejected": -0.0007491334108635783, "step": 20 }, { "epoch": 0.02, "learning_rate": 7.97872340425532e-07, "logits/chosen": 0.1750134825706482, "logits/rejected": 0.2582840919494629, "logps/chosen": -361.32965087890625, "logps/rejected": -338.42205810546875, "loss": 0.2148, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.0005828866851516068, "rewards/margins": 4.7791574615985155e-05, "rewards/rejected": -0.000630678201559931, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0638297872340427e-06, "logits/chosen": 0.1780398190021515, "logits/rejected": 0.23933863639831543, "logps/chosen": -418.8761291503906, "logps/rejected": -419.4820251464844, "loss": 0.205, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.0007866934174671769, "rewards/margins": -4.094879113836214e-05, "rewards/rejected": -0.0007457446190528572, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3297872340425533e-06, "logits/chosen": 0.21062365174293518, "logits/rejected": 0.3023103177547455, "logps/chosen": -369.01318359375, "logps/rejected": -368.9207458496094, "loss": 0.2261, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.0007744937902316451, "rewards/margins": -3.84082886739634e-05, "rewards/rejected": -0.0007360855233855546, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.595744680851064e-06, "logits/chosen": 0.21257944405078888, "logits/rejected": 0.23676128685474396, "logps/chosen": -401.72259521484375, "logps/rejected": -401.12982177734375, "loss": 0.21, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0010942494263872504, "rewards/margins": 2.2952935978537425e-05, "rewards/rejected": -0.0011172023368999362, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8617021276595745e-06, "logits/chosen": 0.22530443966388702, "logits/rejected": 0.24157127737998962, "logps/chosen": -421.9361877441406, "logps/rejected": -450.1436462402344, "loss": 0.2112, "rewards/accuracies": 0.375, "rewards/chosen": -0.0015197412576526403, "rewards/margins": 2.4099141228361987e-05, "rewards/rejected": -0.0015438406262546778, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.1276595744680853e-06, "logits/chosen": 0.23238015174865723, "logits/rejected": 0.2601611614227295, "logps/chosen": -418.73187255859375, "logps/rejected": -398.03265380859375, "loss": 0.205, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0016283972654491663, "rewards/margins": -1.6413705452578142e-05, "rewards/rejected": -0.0016119834035634995, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.393617021276596e-06, "logits/chosen": 0.17837639153003693, "logits/rejected": 0.24620842933654785, "logps/chosen": -392.7635192871094, "logps/rejected": -385.9835205078125, "loss": 0.2062, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.0008109696209430695, "rewards/margins": 8.667097426950932e-05, "rewards/rejected": -0.0008976406534202397, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.6595744680851065e-06, "logits/chosen": 0.22456176578998566, "logits/rejected": 0.2180750072002411, "logps/chosen": -411.5848693847656, "logps/rejected": -392.6322326660156, "loss": 0.2234, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.00039298724732361734, "rewards/margins": 0.00011892013571923599, "rewards/rejected": -0.0005119073903188109, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.9255319148936174e-06, "logits/chosen": 0.19836413860321045, "logits/rejected": 0.3192578852176666, "logps/chosen": -410.68499755859375, "logps/rejected": -386.6772155761719, "loss": 0.2086, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0007202932611107826, "rewards/margins": 0.00010950298747047782, "rewards/rejected": -0.0008297963067889214, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.191489361702128e-06, "logits/chosen": 0.1606016904115677, "logits/rejected": 0.28400760889053345, "logps/chosen": -401.4821472167969, "logps/rejected": -373.6825256347656, "loss": 0.2184, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0019728371407836676, "rewards/margins": 0.0002594580873847008, "rewards/rejected": -0.0022322952281683683, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.457446808510639e-06, "logits/chosen": 0.22675403952598572, "logits/rejected": 0.2259739637374878, "logps/chosen": -432.01788330078125, "logps/rejected": -415.99237060546875, "loss": 0.2144, "rewards/accuracies": 0.5, "rewards/chosen": -0.0032550902105867863, "rewards/margins": 0.000516787578817457, "rewards/rejected": -0.0037718776147812605, "step": 130 }, { "epoch": 0.07, "learning_rate": 3.723404255319149e-06, "logits/chosen": 0.13569238781929016, "logits/rejected": 0.23111942410469055, "logps/chosen": -416.4497985839844, "logps/rejected": -402.7656555175781, "loss": 0.202, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.006250211503356695, "rewards/margins": 0.0006005663308314979, "rewards/rejected": -0.006850778125226498, "step": 140 }, { "epoch": 0.08, "learning_rate": 3.98936170212766e-06, "logits/chosen": 0.15672233700752258, "logits/rejected": 0.2764233350753784, "logps/chosen": -441.35137939453125, "logps/rejected": -409.206787109375, "loss": 0.2094, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.008352077566087246, "rewards/margins": 0.000567329756449908, "rewards/rejected": -0.008919407613575459, "step": 150 }, { "epoch": 0.09, "learning_rate": 4.255319148936171e-06, "logits/chosen": 0.17575426399707794, "logits/rejected": 0.3381495475769043, "logps/chosen": -468.0437927246094, "logps/rejected": -437.7367248535156, "loss": 0.2069, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.011829295195639133, "rewards/margins": 0.0006891209632158279, "rewards/rejected": -0.012518415227532387, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.521276595744681e-06, "logits/chosen": 0.2566456198692322, "logits/rejected": 0.21506217122077942, "logps/chosen": -443.35540771484375, "logps/rejected": -438.29876708984375, "loss": 0.224, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.013920286670327187, "rewards/margins": 0.0008589512435719371, "rewards/rejected": -0.014779238030314445, "step": 170 }, { "epoch": 0.1, "learning_rate": 4.787234042553192e-06, "logits/chosen": 0.1587153822183609, "logits/rejected": 0.346591055393219, "logps/chosen": -423.898681640625, "logps/rejected": -416.0426330566406, "loss": 0.2208, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.01828654855489731, "rewards/margins": 0.0014178925193846226, "rewards/rejected": -0.019704440608620644, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.999982660399688e-06, "logits/chosen": 0.19637706875801086, "logits/rejected": 0.13249197602272034, "logps/chosen": -401.75360107421875, "logps/rejected": -391.2793884277344, "loss": 0.2078, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.023105399683117867, "rewards/margins": 0.0009698948706500232, "rewards/rejected": -0.02407529577612877, "step": 190 }, { "epoch": 0.11, "learning_rate": 4.99937579964398e-06, "logits/chosen": 0.12021654844284058, "logits/rejected": 0.22755944728851318, "logps/chosen": -438.9696350097656, "logps/rejected": -436.67559814453125, "loss": 0.212, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0280294306576252, "rewards/margins": 0.001816231175325811, "rewards/rejected": -0.02984566055238247, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.9979021993870645e-06, "logits/chosen": 0.17782853543758392, "logits/rejected": 0.1260356307029724, "logps/chosen": -428.07281494140625, "logps/rejected": -423.8680725097656, "loss": 0.2116, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03266894817352295, "rewards/margins": 0.0038566484581679106, "rewards/rejected": -0.03652559593319893, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.995562370647553e-06, "logits/chosen": 0.15141043066978455, "logits/rejected": 0.21045760810375214, "logps/chosen": -441.1923828125, "logps/rejected": -440.4317321777344, "loss": 0.2056, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.03683360293507576, "rewards/margins": 0.00403643399477005, "rewards/rejected": -0.04087003692984581, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.992357124836838e-06, "logits/chosen": 0.1319437474012375, "logits/rejected": 0.16590853035449982, "logps/chosen": -463.49951171875, "logps/rejected": -463.0955505371094, "loss": 0.2099, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.04471432790160179, "rewards/margins": 0.004302737768739462, "rewards/rejected": -0.04901706799864769, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.9882875734777044e-06, "logits/chosen": 0.10973749309778214, "logits/rejected": 0.14370934665203094, "logps/chosen": -432.8614807128906, "logps/rejected": -412.8258361816406, "loss": 0.2081, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.048478107899427414, "rewards/margins": 0.0035480097867548466, "rewards/rejected": -0.052026115357875824, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.983355127818882e-06, "logits/chosen": 0.14486494660377502, "logits/rejected": 0.13507609069347382, "logps/chosen": -425.30743408203125, "logps/rejected": -446.130615234375, "loss": 0.1997, "rewards/accuracies": 0.46875, "rewards/chosen": -0.05807062238454819, "rewards/margins": 0.0065963054075837135, "rewards/rejected": -0.06466692686080933, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.977561498345639e-06, "logits/chosen": 0.18454475700855255, "logits/rejected": 0.16011472046375275, "logps/chosen": -497.3888244628906, "logps/rejected": -488.9383850097656, "loss": 0.1955, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.07542432844638824, "rewards/margins": 0.005581502337008715, "rewards/rejected": -0.08100582659244537, "step": 260 }, { "epoch": 0.14, "learning_rate": 4.970908694186624e-06, "logits/chosen": 0.04981505125761032, "logits/rejected": 0.11929504573345184, "logps/chosen": -475.73687744140625, "logps/rejected": -471.32421875, "loss": 0.2008, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.07895161211490631, "rewards/margins": 0.011724306270480156, "rewards/rejected": -0.09067590534687042, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.9633990224171305e-06, "logits/chosen": 0.09068510681390762, "logits/rejected": 0.17252135276794434, "logps/chosen": -545.042724609375, "logps/rejected": -578.0839233398438, "loss": 0.189, "rewards/accuracies": 0.5625, "rewards/chosen": -0.10266657918691635, "rewards/margins": 0.02503531612455845, "rewards/rejected": -0.12770189344882965, "step": 280 }, { "epoch": 0.15, "learning_rate": 4.955035087259046e-06, "logits/chosen": -0.0076313503086566925, "logits/rejected": 0.0826013833284378, "logps/chosen": -509.309326171875, "logps/rejected": -542.4077758789062, "loss": 0.1898, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12836064398288727, "rewards/margins": 0.03418760746717453, "rewards/rejected": -0.1625482589006424, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.945819789177756e-06, "logits/chosen": -0.011387845501303673, "logits/rejected": 0.058963656425476074, "logps/chosen": -550.4841918945312, "logps/rejected": -566.2930297851562, "loss": 0.1994, "rewards/accuracies": 0.5, "rewards/chosen": -0.1419098824262619, "rewards/margins": 0.03765057772397995, "rewards/rejected": -0.17956045269966125, "step": 300 }, { "epoch": 0.17, "learning_rate": 4.935756323876306e-06, "logits/chosen": 0.030229881405830383, "logits/rejected": 0.00785739440470934, "logps/chosen": -546.796875, "logps/rejected": -585.1295776367188, "loss": 0.1988, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.15364839136600494, "rewards/margins": 0.04681529477238655, "rewards/rejected": -0.2004636526107788, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.924848181187199e-06, "logits/chosen": 0.09916579723358154, "logits/rejected": 0.03708130493760109, "logps/chosen": -534.3087158203125, "logps/rejected": -577.957275390625, "loss": 0.176, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13466373085975647, "rewards/margins": 0.046590112149715424, "rewards/rejected": -0.1812538504600525, "step": 320 }, { "epoch": 0.18, "learning_rate": 4.913099143862173e-06, "logits/chosen": 0.09245137125253677, "logits/rejected": -0.01894455775618553, "logps/chosen": -570.8424682617188, "logps/rejected": -633.2811279296875, "loss": 0.1982, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1720808893442154, "rewards/margins": 0.05095798522233963, "rewards/rejected": -0.22303888201713562, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.900513286260416e-06, "logits/chosen": 0.06517922878265381, "logits/rejected": 0.05662886053323746, "logps/chosen": -556.5380859375, "logps/rejected": -572.91943359375, "loss": 0.1967, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1459980458021164, "rewards/margins": 0.031131967902183533, "rewards/rejected": -0.17713001370429993, "step": 340 }, { "epoch": 0.19, "learning_rate": 4.887094972935645e-06, "logits/chosen": 0.053707320243120193, "logits/rejected": 0.12972167134284973, "logps/chosen": -587.4563598632812, "logps/rejected": -619.8084716796875, "loss": 0.1789, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14950796961784363, "rewards/margins": 0.027994930744171143, "rewards/rejected": -0.17750290036201477, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.87284885712256e-06, "logits/chosen": 0.05275138095021248, "logits/rejected": 0.13531816005706787, "logps/chosen": -553.1101684570312, "logps/rejected": -604.1668701171875, "loss": 0.1926, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13555410504341125, "rewards/margins": 0.0414896085858345, "rewards/rejected": -0.17704370617866516, "step": 360 }, { "epoch": 0.2, "learning_rate": 4.857779879123181e-06, "logits/chosen": 0.10553497076034546, "logits/rejected": 0.027261802926659584, "logps/chosen": -506.9090881347656, "logps/rejected": -567.632568359375, "loss": 0.1952, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.12961818277835846, "rewards/margins": 0.0423763282597065, "rewards/rejected": -0.17199452221393585, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.841893264593643e-06, "logits/chosen": -0.021337047219276428, "logits/rejected": 0.10264651477336884, "logps/chosen": -569.1201171875, "logps/rejected": -573.4818115234375, "loss": 0.1775, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.15105991065502167, "rewards/margins": 0.04063944146037102, "rewards/rejected": -0.19169935584068298, "step": 380 }, { "epoch": 0.21, "learning_rate": 4.825194522732023e-06, "logits/chosen": -0.007825059816241264, "logits/rejected": -0.015095917508006096, "logps/chosen": -517.6556396484375, "logps/rejected": -564.65673828125, "loss": 0.1934, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.15630927681922913, "rewards/margins": 0.03676559031009674, "rewards/rejected": -0.19307485222816467, "step": 390 }, { "epoch": 0.21, "learning_rate": 4.807689444367853e-06, "logits/chosen": 0.03950309008359909, "logits/rejected": 0.025178443640470505, "logps/chosen": -613.1845703125, "logps/rejected": -667.6192016601562, "loss": 0.1839, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.16962917149066925, "rewards/margins": 0.061695653945207596, "rewards/rejected": -0.23132482171058655, "step": 400 }, { "epoch": 0.22, "learning_rate": 4.78938409995396e-06, "logits/chosen": -0.019574914127588272, "logits/rejected": 0.10981860011816025, "logps/chosen": -587.1323852539062, "logps/rejected": -607.53955078125, "loss": 0.1968, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.1764066517353058, "rewards/margins": 0.03810610622167587, "rewards/rejected": -0.21451278030872345, "step": 410 }, { "epoch": 0.22, "learning_rate": 4.770284837461342e-06, "logits/chosen": -0.016084378585219383, "logits/rejected": 0.11264481395483017, "logps/chosen": -570.8071899414062, "logps/rejected": -621.8716430664062, "loss": 0.2007, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.16467757523059845, "rewards/margins": 0.039321959018707275, "rewards/rejected": -0.20399951934814453, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.7503982801778015e-06, "logits/chosen": 0.05447696894407272, "logits/rejected": 0.03223523125052452, "logps/chosen": -529.8160400390625, "logps/rejected": -550.2005004882812, "loss": 0.1846, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13065846264362335, "rewards/margins": 0.04571721702814102, "rewards/rejected": -0.17637568712234497, "step": 430 }, { "epoch": 0.23, "learning_rate": 4.729731324411104e-06, "logits/chosen": -0.021515950560569763, "logits/rejected": 0.03140898048877716, "logps/chosen": -578.0147094726562, "logps/rejected": -585.7742919921875, "loss": 0.1984, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.15668722987174988, "rewards/margins": 0.035644322633743286, "rewards/rejected": -0.19233153760433197, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.7082911370974645e-06, "logits/chosen": -0.012143002822995186, "logits/rejected": -0.0082742003723979, "logps/chosen": -486.759521484375, "logps/rejected": -521.0432739257812, "loss": 0.199, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.127678781747818, "rewards/margins": 0.03519933670759201, "rewards/rejected": -0.1628781259059906, "step": 450 }, { "epoch": 0.25, "learning_rate": 4.68608515331618e-06, "logits/chosen": -0.023242251947522163, "logits/rejected": 0.003186366055160761, "logps/chosen": -485.2528381347656, "logps/rejected": -513.8176879882812, "loss": 0.1925, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.11917861551046371, "rewards/margins": 0.047842614352703094, "rewards/rejected": -0.167021244764328, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.663121073711269e-06, "logits/chosen": 0.01416336465626955, "logits/rejected": -0.06039174646139145, "logps/chosen": -549.2015380859375, "logps/rejected": -583.758544921875, "loss": 0.1822, "rewards/accuracies": 0.5, "rewards/chosen": -0.14279165863990784, "rewards/margins": 0.038277558982372284, "rewards/rejected": -0.18106922507286072, "step": 470 }, { "epoch": 0.26, "learning_rate": 4.63940686182103e-06, "logits/chosen": 0.01400854904204607, "logits/rejected": 0.10263659805059433, "logps/chosen": -515.437255859375, "logps/rejected": -564.8402099609375, "loss": 0.1976, "rewards/accuracies": 0.5, "rewards/chosen": -0.13872714340686798, "rewards/margins": 0.03997962549328804, "rewards/rejected": -0.17870678007602692, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.614950741316425e-06, "logits/chosen": -0.06926377862691879, "logits/rejected": 0.03542783483862877, "logps/chosen": -540.4765625, "logps/rejected": -549.0443115234375, "loss": 0.1897, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13047516345977783, "rewards/margins": 0.03688601404428482, "rewards/rejected": -0.16736117005348206, "step": 490 }, { "epoch": 0.27, "learning_rate": 4.589761193149254e-06, "logits/chosen": -0.055144570767879486, "logits/rejected": 0.00845063291490078, "logps/chosen": -540.0427856445312, "logps/rejected": -555.8956298828125, "loss": 0.1863, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14786241948604584, "rewards/margins": 0.03734371438622475, "rewards/rejected": -0.18520613014698029, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.563846952611112e-06, "logits/chosen": -0.06796270608901978, "logits/rejected": -0.021189894527196884, "logps/chosen": -538.6000366210938, "logps/rejected": -609.37353515625, "loss": 0.1812, "rewards/accuracies": 0.53125, "rewards/chosen": -0.14661245048046112, "rewards/margins": 0.0674336701631546, "rewards/rejected": -0.2140461504459381, "step": 510 }, { "epoch": 0.28, "learning_rate": 4.537217006304141e-06, "logits/chosen": -0.13910801708698273, "logits/rejected": 0.04259239882230759, "logps/chosen": -555.0911865234375, "logps/rejected": -552.6492919921875, "loss": 0.1901, "rewards/accuracies": 0.46875, "rewards/chosen": -0.15548577904701233, "rewards/margins": 0.03300771862268448, "rewards/rejected": -0.18849347531795502, "step": 520 }, { "epoch": 0.28, "learning_rate": 4.50988058902464e-06, "logits/chosen": -0.047577690333127975, "logits/rejected": 0.034965887665748596, "logps/chosen": -664.0265502929688, "logps/rejected": -692.3294677734375, "loss": 0.1941, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1734737604856491, "rewards/margins": 0.04430503025650978, "rewards/rejected": -0.21777880191802979, "step": 530 }, { "epoch": 0.29, "learning_rate": 4.481847180560593e-06, "logits/chosen": -0.013295474462211132, "logits/rejected": -0.07179677486419678, "logps/chosen": -540.1502685546875, "logps/rejected": -567.1649169921875, "loss": 0.1972, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1351390928030014, "rewards/margins": 0.03879848122596741, "rewards/rejected": -0.1739375740289688, "step": 540 }, { "epoch": 0.29, "learning_rate": 4.453126502404253e-06, "logits/chosen": -0.10417531430721283, "logits/rejected": 0.025454232469201088, "logps/chosen": -571.4408569335938, "logps/rejected": -581.1018676757812, "loss": 0.1785, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13031336665153503, "rewards/margins": 0.04332467168569565, "rewards/rejected": -0.17363804578781128, "step": 550 }, { "epoch": 0.3, "learning_rate": 4.423728514380892e-06, "logits/chosen": -0.09106893092393875, "logits/rejected": 0.05246344953775406, "logps/chosen": -590.4605712890625, "logps/rejected": -646.8682861328125, "loss": 0.1797, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.16008590161800385, "rewards/margins": 0.03608149290084839, "rewards/rejected": -0.19616740942001343, "step": 560 }, { "epoch": 0.3, "learning_rate": 4.393663411194918e-06, "logits/chosen": -0.00037176310434006155, "logits/rejected": -0.05788201093673706, "logps/chosen": -603.2279663085938, "logps/rejected": -637.8117065429688, "loss": 0.2042, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.15948781371116638, "rewards/margins": 0.039536003023386, "rewards/rejected": -0.19902381300926208, "step": 570 }, { "epoch": 0.31, "learning_rate": 4.362941618894523e-06, "logits/chosen": -0.06197139620780945, "logits/rejected": -0.04511360824108124, "logps/chosen": -502.40704345703125, "logps/rejected": -549.5247802734375, "loss": 0.1904, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1260320395231247, "rewards/margins": 0.04073121398687363, "rewards/rejected": -0.1667632758617401, "step": 580 }, { "epoch": 0.31, "learning_rate": 4.331573791256116e-06, "logits/chosen": -0.03825578838586807, "logits/rejected": 0.00044789613457396626, "logps/chosen": -474.03973388671875, "logps/rejected": -531.3597412109375, "loss": 0.1772, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.11952260881662369, "rewards/margins": 0.05453835800290108, "rewards/rejected": -0.17406097054481506, "step": 590 }, { "epoch": 0.32, "learning_rate": 4.299570806089786e-06, "logits/chosen": -0.06856271624565125, "logits/rejected": -0.03950439766049385, "logps/chosen": -530.919189453125, "logps/rejected": -575.9044189453125, "loss": 0.1848, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1305655688047409, "rewards/margins": 0.04401278868317604, "rewards/rejected": -0.17457836866378784, "step": 600 }, { "epoch": 0.33, "learning_rate": 4.266943761467057e-06, "logits/chosen": -0.008585452102124691, "logits/rejected": -0.08859982341527939, "logps/chosen": -561.964599609375, "logps/rejected": -607.6642456054688, "loss": 0.1815, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13443836569786072, "rewards/margins": 0.061653982847929, "rewards/rejected": -0.19609235227108002, "step": 610 }, { "epoch": 0.33, "learning_rate": 4.233703971872287e-06, "logits/chosen": -0.11785644292831421, "logits/rejected": 0.07683457434177399, "logps/chosen": -580.4694213867188, "logps/rejected": -604.2586669921875, "loss": 0.1988, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.14866627752780914, "rewards/margins": 0.03873666003346443, "rewards/rejected": -0.18740293383598328, "step": 620 }, { "epoch": 0.34, "learning_rate": 4.1998629642789925e-06, "logits/chosen": -0.08321277797222137, "logits/rejected": -0.059583961963653564, "logps/chosen": -520.8612060546875, "logps/rejected": -594.0638427734375, "loss": 0.1774, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1341467797756195, "rewards/margins": 0.05918589234352112, "rewards/rejected": -0.19333268702030182, "step": 630 }, { "epoch": 0.34, "learning_rate": 4.165432474152505e-06, "logits/chosen": -0.09531132876873016, "logits/rejected": -0.0270084198564291, "logps/chosen": -528.1355590820312, "logps/rejected": -581.7833251953125, "loss": 0.1745, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13981810212135315, "rewards/margins": 0.07118718326091766, "rewards/rejected": -0.211005300283432, "step": 640 }, { "epoch": 0.35, "learning_rate": 4.130424441380308e-06, "logits/chosen": -0.10320959240198135, "logits/rejected": -0.019093522801995277, "logps/chosen": -571.685791015625, "logps/rejected": -591.3698120117188, "loss": 0.2015, "rewards/accuracies": 0.46875, "rewards/chosen": -0.16250738501548767, "rewards/margins": 0.04821362346410751, "rewards/rejected": -0.21072101593017578, "step": 650 }, { "epoch": 0.35, "learning_rate": 4.09485100613151e-06, "logits/chosen": -0.11241753399372101, "logits/rejected": -0.032273612916469574, "logps/chosen": -624.3687133789062, "logps/rejected": -671.1177368164062, "loss": 0.1973, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.17460055649280548, "rewards/margins": 0.044025253504514694, "rewards/rejected": -0.21862581372261047, "step": 660 }, { "epoch": 0.36, "learning_rate": 4.058724504646834e-06, "logits/chosen": -0.029143745079636574, "logits/rejected": -0.060622453689575195, "logps/chosen": -519.5960083007812, "logps/rejected": -589.2883911132812, "loss": 0.1632, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1383863389492035, "rewards/margins": 0.062351007014513016, "rewards/rejected": -0.2007373571395874, "step": 670 }, { "epoch": 0.36, "learning_rate": 4.022057464960632e-06, "logits/chosen": -0.06448385864496231, "logits/rejected": -0.04329472780227661, "logps/chosen": -536.9004516601562, "logps/rejected": -563.5870361328125, "loss": 0.1866, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14495059847831726, "rewards/margins": 0.03540947288274765, "rewards/rejected": -0.1803600788116455, "step": 680 }, { "epoch": 0.37, "learning_rate": 3.984862602556383e-06, "logits/chosen": -0.004617185331881046, "logits/rejected": -0.01678011380136013, "logps/chosen": -555.3660278320312, "logps/rejected": -594.7876586914062, "loss": 0.182, "rewards/accuracies": 0.5, "rewards/chosen": -0.13744883239269257, "rewards/margins": 0.038119856268167496, "rewards/rejected": -0.17556868493556976, "step": 690 }, { "epoch": 0.37, "learning_rate": 3.947152815957187e-06, "logits/chosen": -0.11041183769702911, "logits/rejected": -0.09442894160747528, "logps/chosen": -475.54583740234375, "logps/rejected": -500.1151428222656, "loss": 0.1817, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.11575014889240265, "rewards/margins": 0.033661358058452606, "rewards/rejected": -0.14941151440143585, "step": 700 }, { "epoch": 0.38, "learning_rate": 3.908941182252785e-06, "logits/chosen": -0.10789042711257935, "logits/rejected": -0.014411434531211853, "logps/chosen": -543.808837890625, "logps/rejected": -581.8538818359375, "loss": 0.1886, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1326008141040802, "rewards/margins": 0.03887839615345001, "rewards/rejected": -0.17147919535636902, "step": 710 }, { "epoch": 0.38, "learning_rate": 3.8702409525646535e-06, "logits/chosen": -0.10680530220270157, "logits/rejected": -0.07425309717655182, "logps/chosen": -522.1688842773438, "logps/rejected": -554.4418334960938, "loss": 0.1864, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12628528475761414, "rewards/margins": 0.04535987228155136, "rewards/rejected": -0.1716451495885849, "step": 720 }, { "epoch": 0.39, "learning_rate": 3.8310655474507495e-06, "logits/chosen": -0.10503558814525604, "logits/rejected": -0.013972464017570019, "logps/chosen": -550.2520751953125, "logps/rejected": -571.8939208984375, "loss": 0.1934, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14120154082775116, "rewards/margins": 0.0461118221282959, "rewards/rejected": -0.18731336295604706, "step": 730 }, { "epoch": 0.39, "learning_rate": 3.7914285522515002e-06, "logits/chosen": -0.05753170698881149, "logits/rejected": -0.05392669886350632, "logps/chosen": -513.2532348632812, "logps/rejected": -580.1729736328125, "loss": 0.186, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13195370137691498, "rewards/margins": 0.04442184790968895, "rewards/rejected": -0.17637556791305542, "step": 740 }, { "epoch": 0.4, "learning_rate": 3.751343712378639e-06, "logits/chosen": -0.03736535459756851, "logits/rejected": -0.001959963236004114, "logps/chosen": -531.0137939453125, "logps/rejected": -591.2086181640625, "loss": 0.1881, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1330515593290329, "rewards/margins": 0.05102803185582161, "rewards/rejected": -0.1840795874595642, "step": 750 }, { "epoch": 0.41, "learning_rate": 3.710824928548546e-06, "logits/chosen": -0.10402516275644302, "logits/rejected": -0.04912823066115379, "logps/chosen": -558.085205078125, "logps/rejected": -588.9461669921875, "loss": 0.2009, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13824841380119324, "rewards/margins": 0.03640920668840408, "rewards/rejected": -0.17465761303901672, "step": 760 }, { "epoch": 0.41, "learning_rate": 3.6698862519617225e-06, "logits/chosen": -0.06400999426841736, "logits/rejected": -0.06958254426717758, "logps/chosen": -541.27099609375, "logps/rejected": -621.9313354492188, "loss": 0.1797, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12520474195480347, "rewards/margins": 0.06592214107513428, "rewards/rejected": -0.19112688302993774, "step": 770 }, { "epoch": 0.42, "learning_rate": 3.6285418794300793e-06, "logits/chosen": -0.13884581625461578, "logits/rejected": -0.11223921924829483, "logps/chosen": -552.3884887695312, "logps/rejected": -589.5878295898438, "loss": 0.1734, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.14118912816047668, "rewards/margins": 0.051578063517808914, "rewards/rejected": -0.1927671879529953, "step": 780 }, { "epoch": 0.42, "learning_rate": 3.5868061484537365e-06, "logits/chosen": -0.07228229939937592, "logits/rejected": 0.038021642714738846, "logps/chosen": -565.1091918945312, "logps/rejected": -591.867431640625, "loss": 0.2045, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.1505979746580124, "rewards/margins": 0.04035702347755432, "rewards/rejected": -0.1909550130367279, "step": 790 }, { "epoch": 0.43, "learning_rate": 3.5446935322490285e-06, "logits/chosen": -0.04605743661522865, "logits/rejected": 0.011277568526566029, "logps/chosen": -562.3070678710938, "logps/rejected": -597.06201171875, "loss": 0.1925, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14476799964904785, "rewards/margins": 0.04444306343793869, "rewards/rejected": -0.18921108543872833, "step": 800 }, { "epoch": 0.43, "learning_rate": 3.502218634729447e-06, "logits/chosen": -0.06661860644817352, "logits/rejected": -0.13232673704624176, "logps/chosen": -506.79315185546875, "logps/rejected": -562.1184692382812, "loss": 0.1726, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.12148015201091766, "rewards/margins": 0.05448601767420769, "rewards/rejected": -0.17596617341041565, "step": 810 }, { "epoch": 0.44, "learning_rate": 3.459396185441265e-06, "logits/chosen": -0.13019916415214539, "logits/rejected": -0.08353747427463531, "logps/chosen": -481.433837890625, "logps/rejected": -490.4483947753906, "loss": 0.1862, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11900192499160767, "rewards/margins": 0.037384189665317535, "rewards/rejected": -0.1563861072063446, "step": 820 }, { "epoch": 0.44, "learning_rate": 3.4162410344555834e-06, "logits/chosen": -0.0683845803141594, "logits/rejected": -0.03088710829615593, "logps/chosen": -492.447021484375, "logps/rejected": -564.9397583007812, "loss": 0.1862, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13119229674339294, "rewards/margins": 0.05578438565135002, "rewards/rejected": -0.18697668612003326, "step": 830 }, { "epoch": 0.45, "learning_rate": 3.3727681472185937e-06, "logits/chosen": -0.09018312394618988, "logits/rejected": -0.10359557718038559, "logps/chosen": -537.9823608398438, "logps/rejected": -594.79443359375, "loss": 0.1849, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13429854810237885, "rewards/margins": 0.06120805814862251, "rewards/rejected": -0.19550660252571106, "step": 840 }, { "epoch": 0.45, "learning_rate": 3.3289925993618217e-06, "logits/chosen": -0.18850287795066833, "logits/rejected": -0.04869599640369415, "logps/chosen": -496.01007080078125, "logps/rejected": -560.2242431640625, "loss": 0.1825, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.12265129387378693, "rewards/margins": 0.05625399202108383, "rewards/rejected": -0.17890527844429016, "step": 850 }, { "epoch": 0.46, "learning_rate": 3.2849295714741643e-06, "logits/chosen": -0.19866463541984558, "logits/rejected": -0.004881598986685276, "logps/chosen": -532.1676635742188, "logps/rejected": -571.507568359375, "loss": 0.1791, "rewards/accuracies": 0.5, "rewards/chosen": -0.12184770405292511, "rewards/margins": 0.05913316085934639, "rewards/rejected": -0.18098084628582, "step": 860 }, { "epoch": 0.46, "learning_rate": 3.2405943438375287e-06, "logits/chosen": -0.2018626630306244, "logits/rejected": -0.10594689846038818, "logps/chosen": -488.4137268066406, "logps/rejected": -556.1409912109375, "loss": 0.1846, "rewards/accuracies": 0.46875, "rewards/chosen": -0.12942823767662048, "rewards/margins": 0.05433051660656929, "rewards/rejected": -0.18375876545906067, "step": 870 }, { "epoch": 0.47, "learning_rate": 3.1960022911279036e-06, "logits/chosen": -0.16193589568138123, "logits/rejected": -0.2137628048658371, "logps/chosen": -484.3509826660156, "logps/rejected": -553.5465698242188, "loss": 0.1964, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.12599757313728333, "rewards/margins": 0.049567125737667084, "rewards/rejected": -0.1755646914243698, "step": 880 }, { "epoch": 0.47, "learning_rate": 3.1511688770836844e-06, "logits/chosen": -0.01477061491459608, "logits/rejected": -0.20513947308063507, "logps/chosen": -559.4446411132812, "logps/rejected": -626.2364501953125, "loss": 0.171, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13351976871490479, "rewards/margins": 0.07253735512495041, "rewards/rejected": -0.2060571163892746, "step": 890 }, { "epoch": 0.48, "learning_rate": 3.1061096491431307e-06, "logits/chosen": -0.23158374428749084, "logits/rejected": -0.15197685360908508, "logps/chosen": -511.8590393066406, "logps/rejected": -550.5593872070312, "loss": 0.2039, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.11726844310760498, "rewards/margins": 0.04829251766204834, "rewards/rejected": -0.16556094586849213, "step": 900 }, { "epoch": 0.49, "learning_rate": 3.0608402330527796e-06, "logits/chosen": -0.17909975349903107, "logits/rejected": -0.11886115372180939, "logps/chosen": -516.4205932617188, "logps/rejected": -559.4550170898438, "loss": 0.1824, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1072378158569336, "rewards/margins": 0.04554293677210808, "rewards/rejected": -0.15278074145317078, "step": 910 }, { "epoch": 0.49, "learning_rate": 3.0153763274487176e-06, "logits/chosen": -0.14633020758628845, "logits/rejected": -0.08009175956249237, "logps/chosen": -497.86724853515625, "logps/rejected": -534.5050659179688, "loss": 0.1581, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09393012523651123, "rewards/margins": 0.05776657536625862, "rewards/rejected": -0.15169671177864075, "step": 920 }, { "epoch": 0.5, "learning_rate": 2.9697336984125683e-06, "logits/chosen": -0.17566119134426117, "logits/rejected": -0.08242344111204147, "logps/chosen": -552.6607055664062, "logps/rejected": -587.2052612304688, "loss": 0.1719, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09468172490596771, "rewards/margins": 0.05334918573498726, "rewards/rejected": -0.14803092181682587, "step": 930 }, { "epoch": 0.5, "learning_rate": 2.923928174004094e-06, "logits/chosen": -0.05346371978521347, "logits/rejected": -0.1932278424501419, "logps/chosen": -535.7907104492188, "logps/rejected": -599.4446411132812, "loss": 0.1681, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09776908159255981, "rewards/margins": 0.05439938232302666, "rewards/rejected": -0.15216846764087677, "step": 940 }, { "epoch": 0.51, "learning_rate": 2.8779756387723036e-06, "logits/chosen": -0.17646734416484833, "logits/rejected": -0.10943827778100967, "logps/chosen": -508.94232177734375, "logps/rejected": -568.2824096679688, "loss": 0.1889, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10447195917367935, "rewards/margins": 0.05140892416238785, "rewards/rejected": -0.1558808833360672, "step": 950 }, { "epoch": 0.51, "learning_rate": 2.831892028246968e-06, "logits/chosen": -0.08494242280721664, "logits/rejected": -0.18166685104370117, "logps/chosen": -509.8265686035156, "logps/rejected": -575.4097900390625, "loss": 0.1796, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09938603639602661, "rewards/margins": 0.06175467371940613, "rewards/rejected": -0.16114071011543274, "step": 960 }, { "epoch": 0.52, "learning_rate": 2.7856933234124617e-06, "logits/chosen": -0.11445019394159317, "logits/rejected": -0.12723931670188904, "logps/chosen": -484.7312927246094, "logps/rejected": -564.54345703125, "loss": 0.1861, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.10803971439599991, "rewards/margins": 0.055267833173274994, "rewards/rejected": -0.1633075475692749, "step": 970 }, { "epoch": 0.52, "learning_rate": 2.7393955451658387e-06, "logits/chosen": -0.14208866655826569, "logits/rejected": -0.020831745117902756, "logps/chosen": -528.3738403320312, "logps/rejected": -538.9185180664062, "loss": 0.1867, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.11705558001995087, "rewards/margins": 0.04653087258338928, "rewards/rejected": -0.16358645260334015, "step": 980 }, { "epoch": 0.53, "learning_rate": 2.6930147487610667e-06, "logits/chosen": -0.21168622374534607, "logits/rejected": -0.10795080661773682, "logps/chosen": -551.8548583984375, "logps/rejected": -576.697998046875, "loss": 0.1936, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12270279228687286, "rewards/margins": 0.041682593524456024, "rewards/rejected": -0.1643853634595871, "step": 990 }, { "epoch": 0.53, "learning_rate": 2.6465670182413487e-06, "logits/chosen": -0.08504590392112732, "logits/rejected": -0.11864916980266571, "logps/chosen": -531.6979370117188, "logps/rejected": -553.0827026367188, "loss": 0.1765, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.107094407081604, "rewards/margins": 0.04743362218141556, "rewards/rejected": -0.15452802181243896, "step": 1000 }, { "epoch": 0.54, "learning_rate": 2.6000684608614594e-06, "logits/chosen": -0.08871813118457794, "logits/rejected": -0.20158669352531433, "logps/chosen": -532.0164794921875, "logps/rejected": -609.779052734375, "loss": 0.1869, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.10916718095541, "rewards/margins": 0.06110434979200363, "rewards/rejected": -0.17027154564857483, "step": 1010 }, { "epoch": 0.54, "learning_rate": 2.5535352015020338e-06, "logits/chosen": -0.13045457005500793, "logits/rejected": -0.15182146430015564, "logps/chosen": -511.3404846191406, "logps/rejected": -542.553466796875, "loss": 0.1791, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.10936908423900604, "rewards/margins": 0.04454661160707474, "rewards/rejected": -0.15391568839550018, "step": 1020 }, { "epoch": 0.55, "learning_rate": 2.506983377077741e-06, "logits/chosen": -0.10655238479375839, "logits/rejected": -0.07288862764835358, "logps/chosen": -545.8250732421875, "logps/rejected": -581.5811767578125, "loss": 0.1716, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11866410076618195, "rewards/margins": 0.06786342710256577, "rewards/rejected": -0.18652752041816711, "step": 1030 }, { "epoch": 0.55, "learning_rate": 2.460429130941289e-06, "logits/chosen": -0.18012507259845734, "logits/rejected": -0.11784622818231583, "logps/chosen": -466.735107421875, "logps/rejected": -501.16302490234375, "loss": 0.1912, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10379817336797714, "rewards/margins": 0.051670271903276443, "rewards/rejected": -0.15546846389770508, "step": 1040 }, { "epoch": 0.56, "learning_rate": 2.413888607285192e-06, "logits/chosen": -0.20981307327747345, "logits/rejected": -0.1565089374780655, "logps/chosen": -503.9241638183594, "logps/rejected": -531.1654052734375, "loss": 0.2048, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.11265435069799423, "rewards/margins": 0.036974333226680756, "rewards/rejected": -0.14962869882583618, "step": 1050 }, { "epoch": 0.57, "learning_rate": 2.367377945543249e-06, "logits/chosen": -0.17128372192382812, "logits/rejected": -0.1380850225687027, "logps/chosen": -493.03936767578125, "logps/rejected": -539.0289306640625, "loss": 0.194, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10857043415307999, "rewards/margins": 0.04022621735930443, "rewards/rejected": -0.14879664778709412, "step": 1060 }, { "epoch": 0.57, "learning_rate": 2.320913274793676e-06, "logits/chosen": -0.14254216849803925, "logits/rejected": -0.13331882655620575, "logps/chosen": -505.95428466796875, "logps/rejected": -575.7135009765625, "loss": 0.1834, "rewards/accuracies": 0.5, "rewards/chosen": -0.11139512062072754, "rewards/margins": 0.05586816743016243, "rewards/rejected": -0.16726329922676086, "step": 1070 }, { "epoch": 0.58, "learning_rate": 2.27451070816582e-06, "logits/chosen": -0.1611151099205017, "logits/rejected": -0.06889496743679047, "logps/chosen": -582.8988037109375, "logps/rejected": -629.1710205078125, "loss": 0.1726, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11593208461999893, "rewards/margins": 0.06972763687372208, "rewards/rejected": -0.1856597363948822, "step": 1080 }, { "epoch": 0.58, "learning_rate": 2.228186337252414e-06, "logits/chosen": -0.22832027077674866, "logits/rejected": -0.03726217523217201, "logps/chosen": -508.06768798828125, "logps/rejected": -514.2000732421875, "loss": 0.1795, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10370014607906342, "rewards/margins": 0.04398902878165245, "rewards/rejected": -0.14768919348716736, "step": 1090 }, { "epoch": 0.59, "learning_rate": 2.1819562265292946e-06, "logits/chosen": -0.20967726409435272, "logits/rejected": -0.1568584442138672, "logps/chosen": -568.8887329101562, "logps/rejected": -614.3547973632812, "loss": 0.1752, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10962442308664322, "rewards/margins": 0.060089100152254105, "rewards/rejected": -0.16971352696418762, "step": 1100 }, { "epoch": 0.59, "learning_rate": 2.1358364077845236e-06, "logits/chosen": -0.23702092468738556, "logits/rejected": -0.054711341857910156, "logps/chosen": -569.4314575195312, "logps/rejected": -610.4983520507812, "loss": 0.1861, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13101813197135925, "rewards/margins": 0.043327562510967255, "rewards/rejected": -0.1743457019329071, "step": 1110 }, { "epoch": 0.6, "learning_rate": 2.089842874558849e-06, "logits/chosen": -0.13204485177993774, "logits/rejected": -0.18103663623332977, "logps/chosen": -501.7928771972656, "logps/rejected": -548.9622192382812, "loss": 0.1908, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.10908225923776627, "rewards/margins": 0.034237340092659, "rewards/rejected": -0.14331960678100586, "step": 1120 }, { "epoch": 0.6, "learning_rate": 2.0439915765994242e-06, "logits/chosen": -0.178396537899971, "logits/rejected": -0.2268516570329666, "logps/chosen": -512.7816772460938, "logps/rejected": -568.0106811523438, "loss": 0.1771, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.11387573182582855, "rewards/margins": 0.05218449980020523, "rewards/rejected": -0.16606023907661438, "step": 1130 }, { "epoch": 0.61, "learning_rate": 1.9982984143287186e-06, "logits/chosen": -0.18940022587776184, "logits/rejected": -0.2560541033744812, "logps/chosen": -531.294677734375, "logps/rejected": -585.197509765625, "loss": 0.1857, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.10909552872180939, "rewards/margins": 0.05390959978103638, "rewards/rejected": -0.16300514340400696, "step": 1140 }, { "epoch": 0.61, "learning_rate": 1.95277923333053e-06, "logits/chosen": -0.17567074298858643, "logits/rejected": -0.08641554415225983, "logps/chosen": -553.5035400390625, "logps/rejected": -660.8178100585938, "loss": 0.1567, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10895760357379913, "rewards/margins": 0.08049053698778152, "rewards/rejected": -0.18944814801216125, "step": 1150 }, { "epoch": 0.62, "learning_rate": 1.9074498188550156e-06, "logits/chosen": -0.23474335670471191, "logits/rejected": -0.10942939668893814, "logps/chosen": -514.3484497070312, "logps/rejected": -560.9384765625, "loss": 0.1795, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10143520683050156, "rewards/margins": 0.06253394484519958, "rewards/rejected": -0.16396915912628174, "step": 1160 }, { "epoch": 0.62, "learning_rate": 1.862325890344643e-06, "logits/chosen": -0.20588059723377228, "logits/rejected": -0.11454887688159943, "logps/chosen": -492.59356689453125, "logps/rejected": -502.0108947753906, "loss": 0.1959, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11700236797332764, "rewards/margins": 0.039218030869960785, "rewards/rejected": -0.15622040629386902, "step": 1170 }, { "epoch": 0.63, "learning_rate": 1.817423095982972e-06, "logits/chosen": -0.1135745421051979, "logits/rejected": -0.21998748183250427, "logps/chosen": -511.43975830078125, "logps/rejected": -591.8009033203125, "loss": 0.1752, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.11385758221149445, "rewards/margins": 0.062401823699474335, "rewards/rejected": -0.17625939846038818, "step": 1180 }, { "epoch": 0.63, "learning_rate": 1.7727570072681293e-06, "logits/chosen": -0.1687012016773224, "logits/rejected": -0.09966389834880829, "logps/chosen": -462.24664306640625, "logps/rejected": -541.049072265625, "loss": 0.1691, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11714209616184235, "rewards/margins": 0.0646623969078064, "rewards/rejected": -0.18180450797080994, "step": 1190 }, { "epoch": 0.64, "learning_rate": 1.7283431136128961e-06, "logits/chosen": -0.246677964925766, "logits/rejected": -0.1048688292503357, "logps/chosen": -603.5682983398438, "logps/rejected": -627.1761474609375, "loss": 0.1809, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.14629192650318146, "rewards/margins": 0.04966907575726509, "rewards/rejected": -0.19596099853515625, "step": 1200 }, { "epoch": 0.65, "learning_rate": 1.6841968169732478e-06, "logits/chosen": -0.19738665223121643, "logits/rejected": -0.19479918479919434, "logps/chosen": -532.4442138671875, "logps/rejected": -585.4338989257812, "loss": 0.182, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12195596843957901, "rewards/margins": 0.06082000583410263, "rewards/rejected": -0.18277597427368164, "step": 1210 }, { "epoch": 0.65, "learning_rate": 1.6403334265072284e-06, "logits/chosen": -0.25693798065185547, "logits/rejected": -0.07481320202350616, "logps/chosen": -533.8644409179688, "logps/rejected": -563.401123046875, "loss": 0.197, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.12102751433849335, "rewards/margins": 0.05077686160802841, "rewards/rejected": -0.17180435359477997, "step": 1220 }, { "epoch": 0.66, "learning_rate": 1.5967681532660066e-06, "logits/chosen": -0.2309873402118683, "logits/rejected": -0.10137152671813965, "logps/chosen": -551.7896728515625, "logps/rejected": -624.3748779296875, "loss": 0.19, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.11541406065225601, "rewards/margins": 0.06133885309100151, "rewards/rejected": -0.17675292491912842, "step": 1230 }, { "epoch": 0.66, "learning_rate": 1.5535161049189463e-06, "logits/chosen": -0.2246209681034088, "logits/rejected": -0.13239261507987976, "logps/chosen": -519.9109497070312, "logps/rejected": -560.1650390625, "loss": 0.1923, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1011136993765831, "rewards/margins": 0.05040975660085678, "rewards/rejected": -0.15152345597743988, "step": 1240 }, { "epoch": 0.67, "learning_rate": 1.5105922805145356e-06, "logits/chosen": -0.16264840960502625, "logits/rejected": -0.17924869060516357, "logps/chosen": -568.4891967773438, "logps/rejected": -606.84375, "loss": 0.1893, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.11496226489543915, "rewards/margins": 0.05937162786722183, "rewards/rejected": -0.17433388531208038, "step": 1250 }, { "epoch": 0.67, "learning_rate": 1.4680115652789823e-06, "logits/chosen": -0.23676082491874695, "logits/rejected": -0.10287532955408096, "logps/chosen": -539.798095703125, "logps/rejected": -559.6898193359375, "loss": 0.1844, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.103573277592659, "rewards/margins": 0.044529445469379425, "rewards/rejected": -0.14810271561145782, "step": 1260 }, { "epoch": 0.68, "learning_rate": 1.4257887254542767e-06, "logits/chosen": -0.16643217206001282, "logits/rejected": -0.20663738250732422, "logps/chosen": -477.3910217285156, "logps/rejected": -546.283203125, "loss": 0.1722, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09414063394069672, "rewards/margins": 0.0609886460006237, "rewards/rejected": -0.15512928366661072, "step": 1270 }, { "epoch": 0.68, "learning_rate": 1.3839384031775227e-06, "logits/chosen": -0.2298511266708374, "logits/rejected": -0.11243085563182831, "logps/chosen": -501.55517578125, "logps/rejected": -560.3333740234375, "loss": 0.2001, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.10863594710826874, "rewards/margins": 0.04755181074142456, "rewards/rejected": -0.1561877578496933, "step": 1280 }, { "epoch": 0.69, "learning_rate": 1.342475111403298e-06, "logits/chosen": -0.24995306134223938, "logits/rejected": -0.09091716259717941, "logps/chosen": -546.91455078125, "logps/rejected": -573.63134765625, "loss": 0.1758, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09987141191959381, "rewards/margins": 0.051563430577516556, "rewards/rejected": -0.15143482387065887, "step": 1290 }, { "epoch": 0.69, "learning_rate": 1.3014132288708209e-06, "logits/chosen": -0.20659741759300232, "logits/rejected": -0.1793586015701294, "logps/chosen": -461.78741455078125, "logps/rejected": -546.7547607421875, "loss": 0.1772, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.09007732570171356, "rewards/margins": 0.06504637002944946, "rewards/rejected": -0.15512368083000183, "step": 1300 }, { "epoch": 0.7, "learning_rate": 1.2607669951176549e-06, "logits/chosen": -0.16400612890720367, "logits/rejected": -0.10733946412801743, "logps/chosen": -477.58673095703125, "logps/rejected": -483.0821838378906, "loss": 0.1908, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.09252551943063736, "rewards/margins": 0.03763090446591377, "rewards/rejected": -0.13015642762184143, "step": 1310 }, { "epoch": 0.7, "learning_rate": 1.2205505055416891e-06, "logits/chosen": -0.23552432656288147, "logits/rejected": -0.05100318789482117, "logps/chosen": -508.714111328125, "logps/rejected": -520.7734375, "loss": 0.1903, "rewards/accuracies": 0.40625, "rewards/chosen": -0.10599436610937119, "rewards/margins": 0.04144153743982315, "rewards/rejected": -0.14743590354919434, "step": 1320 }, { "epoch": 0.71, "learning_rate": 1.1807777065131002e-06, "logits/chosen": -0.23125293850898743, "logits/rejected": -0.17557989060878754, "logps/chosen": -544.9722900390625, "logps/rejected": -586.7325439453125, "loss": 0.1788, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10128601640462875, "rewards/margins": 0.07126955687999725, "rewards/rejected": -0.1725555956363678, "step": 1330 }, { "epoch": 0.71, "learning_rate": 1.1414623905380012e-06, "logits/chosen": -0.23312774300575256, "logits/rejected": -0.14277367293834686, "logps/chosen": -485.92535400390625, "logps/rejected": -506.35205078125, "loss": 0.194, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.09930580854415894, "rewards/margins": 0.03314018249511719, "rewards/rejected": -0.13244597613811493, "step": 1340 }, { "epoch": 0.72, "learning_rate": 1.1026181914754388e-06, "logits/chosen": -0.18930380046367645, "logits/rejected": -0.14925286173820496, "logps/chosen": -466.244140625, "logps/rejected": -539.2670288085938, "loss": 0.1836, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09682577848434448, "rewards/margins": 0.0532786026597023, "rewards/rejected": -0.1501043736934662, "step": 1350 }, { "epoch": 0.73, "learning_rate": 1.0642585798094136e-06, "logits/chosen": -0.19082944095134735, "logits/rejected": -0.18094971776008606, "logps/chosen": -499.90362548828125, "logps/rejected": -542.33251953125, "loss": 0.1693, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.09551337361335754, "rewards/margins": 0.051950085908174515, "rewards/rejected": -0.14746347069740295, "step": 1360 }, { "epoch": 0.73, "learning_rate": 1.0263968579775522e-06, "logits/chosen": -0.19364750385284424, "logits/rejected": -0.22844457626342773, "logps/chosen": -481.7339782714844, "logps/rejected": -537.8257446289062, "loss": 0.173, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10904572159051895, "rewards/margins": 0.05478460714221001, "rewards/rejected": -0.16383032500743866, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.89046155758058e-07, "logits/chosen": -0.1437099277973175, "logits/rejected": -0.1075567975640297, "logps/chosen": -463.90985107421875, "logps/rejected": -566.3432006835938, "loss": 0.1896, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09277530014514923, "rewards/margins": 0.0687888115644455, "rewards/rejected": -0.16156412661075592, "step": 1380 }, { "epoch": 0.74, "learning_rate": 9.52219425716534e-07, "logits/chosen": -0.242875337600708, "logits/rejected": -0.16699561476707458, "logps/chosen": -552.685302734375, "logps/rejected": -590.0584106445312, "loss": 0.1848, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.11167445033788681, "rewards/margins": 0.04991639405488968, "rewards/rejected": -0.1615908443927765, "step": 1390 }, { "epoch": 0.75, "learning_rate": 9.15929438714262e-07, "logits/chosen": -0.11828355491161346, "logits/rejected": -0.20383377373218536, "logps/chosen": -546.3382568359375, "logps/rejected": -630.2655029296875, "loss": 0.1791, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.10707902908325195, "rewards/margins": 0.07424376159906387, "rewards/rejected": -0.18132279813289642, "step": 1400 }, { "epoch": 0.75, "learning_rate": 8.801887794794911e-07, "logits/chosen": -0.14532892405986786, "logits/rejected": -0.23003335297107697, "logps/chosen": -478.4186096191406, "logps/rejected": -552.7274169921875, "loss": 0.1775, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09584512561559677, "rewards/margins": 0.06631821393966675, "rewards/rejected": -0.16216334700584412, "step": 1410 }, { "epoch": 0.76, "learning_rate": 8.450098422432787e-07, "logits/chosen": -0.20046038925647736, "logits/rejected": -0.15967608988285065, "logps/chosen": -544.1749267578125, "logps/rejected": -586.116943359375, "loss": 0.1792, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.11548880487680435, "rewards/margins": 0.05985084921121597, "rewards/rejected": -0.17533965408802032, "step": 1420 }, { "epoch": 0.76, "learning_rate": 8.104048264413858e-07, "logits/chosen": -0.19410791993141174, "logits/rejected": -0.23477724194526672, "logps/chosen": -451.70941162109375, "logps/rejected": -480.71881103515625, "loss": 0.1821, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09721177071332932, "rewards/margins": 0.04050077125430107, "rewards/rejected": -0.1377125382423401, "step": 1430 }, { "epoch": 0.77, "learning_rate": 7.763857324837321e-07, "logits/chosen": -0.1642087996006012, "logits/rejected": -0.230143740773201, "logps/chosen": -522.2006225585938, "logps/rejected": -582.2677612304688, "loss": 0.1954, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.11323000490665436, "rewards/margins": 0.06264480203390121, "rewards/rejected": -0.17587482929229736, "step": 1440 }, { "epoch": 0.77, "learning_rate": 7.429643575928605e-07, "logits/chosen": -0.2773049473762512, "logits/rejected": -0.1615469753742218, "logps/chosen": -547.8629760742188, "logps/rejected": -587.5028076171875, "loss": 0.1817, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.11454708874225616, "rewards/margins": 0.0604601725935936, "rewards/rejected": -0.17500726878643036, "step": 1450 }, { "epoch": 0.78, "learning_rate": 7.101522917128709e-07, "logits/chosen": -0.16052532196044922, "logits/rejected": -0.25322675704956055, "logps/chosen": -515.0181884765625, "logps/rejected": -600.516357421875, "loss": 0.1766, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12128403037786484, "rewards/margins": 0.06581829488277435, "rewards/rejected": -0.1871023178100586, "step": 1460 }, { "epoch": 0.78, "learning_rate": 6.779609134902312e-07, "logits/chosen": -0.2754663825035095, "logits/rejected": -0.19558298587799072, "logps/chosen": -479.9947204589844, "logps/rejected": -522.5654296875, "loss": 0.1816, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.10307195037603378, "rewards/margins": 0.05991575866937637, "rewards/rejected": -0.16298770904541016, "step": 1470 }, { "epoch": 0.79, "learning_rate": 6.464013863278629e-07, "logits/chosen": -0.3147231340408325, "logits/rejected": -0.12713433802127838, "logps/chosen": -576.8265380859375, "logps/rejected": -582.334716796875, "loss": 0.2017, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13038143515586853, "rewards/margins": 0.05221244692802429, "rewards/rejected": -0.18259385228157043, "step": 1480 }, { "epoch": 0.79, "learning_rate": 6.154846545138696e-07, "logits/chosen": -0.20450320839881897, "logits/rejected": -0.2868250906467438, "logps/chosen": -535.9012451171875, "logps/rejected": -593.1867065429688, "loss": 0.1831, "rewards/accuracies": 0.5, "rewards/chosen": -0.11613848060369492, "rewards/margins": 0.05925974249839783, "rewards/rejected": -0.17539823055267334, "step": 1490 }, { "epoch": 0.8, "learning_rate": 5.852214394262515e-07, "logits/chosen": -0.20174920558929443, "logits/rejected": -0.17505855858325958, "logps/chosen": -544.5908813476562, "logps/rejected": -570.7057495117188, "loss": 0.1812, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.12965530157089233, "rewards/margins": 0.03963715583086014, "rewards/rejected": -0.16929244995117188, "step": 1500 }, { "epoch": 0.81, "learning_rate": 5.556222358149191e-07, "logits/chosen": -0.17067115008831024, "logits/rejected": -0.23377446830272675, "logps/chosen": -544.231201171875, "logps/rejected": -599.497802734375, "loss": 0.1781, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.11236411333084106, "rewards/margins": 0.05633332207798958, "rewards/rejected": -0.16869743168354034, "step": 1510 }, { "epoch": 0.81, "learning_rate": 5.266973081622992e-07, "logits/chosen": -0.2045322209596634, "logits/rejected": -0.19451120495796204, "logps/chosen": -543.3025512695312, "logps/rejected": -558.47119140625, "loss": 0.1912, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.12296583503484726, "rewards/margins": 0.03358592838048935, "rewards/rejected": -0.1565517634153366, "step": 1520 }, { "epoch": 0.82, "learning_rate": 4.984566871237942e-07, "logits/chosen": -0.23366038501262665, "logits/rejected": -0.2533060610294342, "logps/chosen": -494.28082275390625, "logps/rejected": -536.08251953125, "loss": 0.1975, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.11815425008535385, "rewards/margins": 0.03765954449772835, "rewards/rejected": -0.1558137983083725, "step": 1530 }, { "epoch": 0.82, "learning_rate": 4.709101660493251e-07, "logits/chosen": -0.25420263409614563, "logits/rejected": -0.10746750980615616, "logps/chosen": -496.64349365234375, "logps/rejected": -555.2488403320312, "loss": 0.1853, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10732255131006241, "rewards/margins": 0.05255774408578873, "rewards/rejected": -0.15988029539585114, "step": 1540 }, { "epoch": 0.83, "learning_rate": 4.440672975871743e-07, "logits/chosen": -0.230036661028862, "logits/rejected": -0.16146495938301086, "logps/chosen": -498.93365478515625, "logps/rejected": -519.977783203125, "loss": 0.179, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.09960173070430756, "rewards/margins": 0.04401581361889839, "rewards/rejected": -0.14361754059791565, "step": 1550 }, { "epoch": 0.83, "learning_rate": 4.1793739037129134e-07, "logits/chosen": -0.14786578714847565, "logits/rejected": -0.1643018275499344, "logps/chosen": -516.0054931640625, "logps/rejected": -543.8809814453125, "loss": 0.1974, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.10642917454242706, "rewards/margins": 0.05054362863302231, "rewards/rejected": -0.15697282552719116, "step": 1560 }, { "epoch": 0.84, "learning_rate": 3.9252950579322405e-07, "logits/chosen": -0.26063111424446106, "logits/rejected": -0.20259733498096466, "logps/chosen": -477.3902893066406, "logps/rejected": -544.326416015625, "loss": 0.1805, "rewards/accuracies": 0.5, "rewards/chosen": -0.11126448959112167, "rewards/margins": 0.053704213351011276, "rewards/rejected": -0.16496869921684265, "step": 1570 }, { "epoch": 0.84, "learning_rate": 3.6785245485978864e-07, "logits/chosen": -0.19165876507759094, "logits/rejected": -0.2015565186738968, "logps/chosen": -514.6396484375, "logps/rejected": -582.6300659179688, "loss": 0.1798, "rewards/accuracies": 0.5, "rewards/chosen": -0.11115659773349762, "rewards/margins": 0.05640328675508499, "rewards/rejected": -0.1675598919391632, "step": 1580 }, { "epoch": 0.85, "learning_rate": 3.43914795137566e-07, "logits/chosen": -0.24871298670768738, "logits/rejected": -0.2329408824443817, "logps/chosen": -527.1490478515625, "logps/rejected": -558.3999633789062, "loss": 0.1913, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.10850969702005386, "rewards/margins": 0.04025677964091301, "rewards/rejected": -0.14876648783683777, "step": 1590 }, { "epoch": 0.85, "learning_rate": 3.207248277852901e-07, "logits/chosen": -0.1298540085554123, "logits/rejected": -0.22532661259174347, "logps/chosen": -549.50390625, "logps/rejected": -626.0857543945312, "loss": 0.1837, "rewards/accuracies": 0.5, "rewards/chosen": -0.13459596037864685, "rewards/margins": 0.053374581038951874, "rewards/rejected": -0.18797054886817932, "step": 1600 }, { "epoch": 0.86, "learning_rate": 2.9829059467515074e-07, "logits/chosen": -0.1630309522151947, "logits/rejected": -0.2934538722038269, "logps/chosen": -513.16943359375, "logps/rejected": -542.3517456054688, "loss": 0.1856, "rewards/accuracies": 0.5, "rewards/chosen": -0.1167718768119812, "rewards/margins": 0.04849160462617874, "rewards/rejected": -0.16526347398757935, "step": 1610 }, { "epoch": 0.86, "learning_rate": 2.766198756040153e-07, "logits/chosen": -0.24708867073059082, "logits/rejected": -0.21414759755134583, "logps/chosen": -569.2642822265625, "logps/rejected": -592.0335693359375, "loss": 0.1832, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1170465499162674, "rewards/margins": 0.04982801154255867, "rewards/rejected": -0.16687455773353577, "step": 1620 }, { "epoch": 0.87, "learning_rate": 2.5572018559553155e-07, "logits/chosen": -0.2323312759399414, "logits/rejected": -0.0636955201625824, "logps/chosen": -530.3974609375, "logps/rejected": -546.2402954101562, "loss": 0.196, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10811781883239746, "rewards/margins": 0.05536898970603943, "rewards/rejected": -0.1634868085384369, "step": 1630 }, { "epoch": 0.87, "learning_rate": 2.3559877229404864e-07, "logits/chosen": -0.2858187258243561, "logits/rejected": -0.16766619682312012, "logps/chosen": -497.91876220703125, "logps/rejected": -530.9742431640625, "loss": 0.1815, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09212108701467514, "rewards/margins": 0.06050460413098335, "rewards/rejected": -0.1526256948709488, "step": 1640 }, { "epoch": 0.88, "learning_rate": 2.1626261345126576e-07, "logits/chosen": -0.18948575854301453, "logits/rejected": -0.2522360682487488, "logps/chosen": -498.93951416015625, "logps/rejected": -548.31787109375, "loss": 0.1875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.10647443681955338, "rewards/margins": 0.06186903268098831, "rewards/rejected": -0.1683434695005417, "step": 1650 }, { "epoch": 0.89, "learning_rate": 1.9771841450646505e-07, "logits/chosen": -0.20894873142242432, "logits/rejected": -0.07307926565408707, "logps/chosen": -496.451416015625, "logps/rejected": -536.22119140625, "loss": 0.1744, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.10921342670917511, "rewards/margins": 0.05700286105275154, "rewards/rejected": -0.16621626913547516, "step": 1660 }, { "epoch": 0.89, "learning_rate": 1.7997260626118758e-07, "logits/chosen": -0.2548653483390808, "logits/rejected": -0.2179504930973053, "logps/chosen": -520.787841796875, "logps/rejected": -576.6990966796875, "loss": 0.1716, "rewards/accuracies": 0.5, "rewards/chosen": -0.10316022485494614, "rewards/margins": 0.06640519946813583, "rewards/rejected": -0.16956540942192078, "step": 1670 }, { "epoch": 0.9, "learning_rate": 1.6303134264914365e-07, "logits/chosen": -0.14907491207122803, "logits/rejected": -0.18132783472537994, "logps/chosen": -493.564697265625, "logps/rejected": -568.8427734375, "loss": 0.1922, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.11788590997457504, "rewards/margins": 0.05090482905507088, "rewards/rejected": -0.16879074275493622, "step": 1680 }, { "epoch": 0.9, "learning_rate": 1.469004986021355e-07, "logits/chosen": -0.2242707461118698, "logits/rejected": -0.16642411053180695, "logps/chosen": -527.6796875, "logps/rejected": -568.4700927734375, "loss": 0.1721, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.10909800231456757, "rewards/margins": 0.05576147884130478, "rewards/rejected": -0.16485948860645294, "step": 1690 }, { "epoch": 0.91, "learning_rate": 1.315856680127367e-07, "logits/chosen": -0.1916467398405075, "logits/rejected": -0.21406717598438263, "logps/chosen": -564.77880859375, "logps/rejected": -602.3505859375, "loss": 0.1906, "rewards/accuracies": 0.5, "rewards/chosen": -0.12277360260486603, "rewards/margins": 0.04569784551858902, "rewards/rejected": -0.16847144067287445, "step": 1700 }, { "epoch": 0.91, "learning_rate": 1.1709216179442817e-07, "logits/chosen": -0.20755720138549805, "logits/rejected": -0.1797691434621811, "logps/chosen": -478.31634521484375, "logps/rejected": -570.5222778320312, "loss": 0.1803, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10136885941028595, "rewards/margins": 0.07469947636127472, "rewards/rejected": -0.17606833577156067, "step": 1710 }, { "epoch": 0.92, "learning_rate": 1.0342500603986421e-07, "logits/chosen": -0.15231382846832275, "logits/rejected": -0.20254746079444885, "logps/chosen": -493.1875915527344, "logps/rejected": -561.9268188476562, "loss": 0.1827, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.10837433487176895, "rewards/margins": 0.05267762392759323, "rewards/rejected": -0.16105195879936218, "step": 1720 }, { "epoch": 0.92, "learning_rate": 9.058894027791643e-08, "logits/chosen": -0.33545562624931335, "logits/rejected": -0.14177361130714417, "logps/chosen": -489.1234436035156, "logps/rejected": -541.3236694335938, "loss": 0.1817, "rewards/accuracies": 0.46875, "rewards/chosen": -0.10399062931537628, "rewards/margins": 0.052609704434871674, "rewards/rejected": -0.15660032629966736, "step": 1730 }, { "epoch": 0.93, "learning_rate": 7.858841583008592e-08, "logits/chosen": -0.18932399153709412, "logits/rejected": -0.16892239451408386, "logps/chosen": -535.3873291015625, "logps/rejected": -541.3253173828125, "loss": 0.1843, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11377620697021484, "rewards/margins": 0.04663427174091339, "rewards/rejected": -0.16041049361228943, "step": 1740 }, { "epoch": 0.93, "learning_rate": 6.742759426686313e-08, "logits/chosen": -0.171333447098732, "logits/rejected": -0.1328922063112259, "logps/chosen": -491.8202209472656, "logps/rejected": -543.0697021484375, "loss": 0.1924, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1006278246641159, "rewards/margins": 0.05380718782544136, "rewards/rejected": -0.15443500876426697, "step": 1750 }, { "epoch": 0.94, "learning_rate": 5.7110345964571104e-08, "logits/chosen": -0.24088236689567566, "logits/rejected": -0.15874245762825012, "logps/chosen": -508.15496826171875, "logps/rejected": -554.2071533203125, "loss": 0.1761, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.10783319175243378, "rewards/margins": 0.05231726914644241, "rewards/rejected": -0.1601504534482956, "step": 1760 }, { "epoch": 0.94, "learning_rate": 4.764024876318357e-08, "logits/chosen": -0.35891515016555786, "logits/rejected": -0.10517171770334244, "logps/chosen": -558.290771484375, "logps/rejected": -577.0843505859375, "loss": 0.1852, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1132727861404419, "rewards/margins": 0.04631791263818741, "rewards/rejected": -0.1595907062292099, "step": 1770 }, { "epoch": 0.95, "learning_rate": 3.902058672559633e-08, "logits/chosen": -0.25900477170944214, "logits/rejected": -0.2353241741657257, "logps/chosen": -511.06231689453125, "logps/rejected": -571.1373291015625, "loss": 0.181, "rewards/accuracies": 0.5, "rewards/chosen": -0.1033167615532875, "rewards/margins": 0.05643168091773987, "rewards/rejected": -0.15974844992160797, "step": 1780 }, { "epoch": 0.95, "learning_rate": 3.125434899876933e-08, "logits/chosen": -0.25551778078079224, "logits/rejected": -0.1563359797000885, "logps/chosen": -518.1477661132812, "logps/rejected": -544.9768676757812, "loss": 0.1796, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1060783639550209, "rewards/margins": 0.061492882668972015, "rewards/rejected": -0.16757124662399292, "step": 1790 }, { "epoch": 0.96, "learning_rate": 2.4344228777145873e-08, "logits/chosen": -0.306395947933197, "logits/rejected": -0.20862647891044617, "logps/chosen": -527.55908203125, "logps/rejected": -568.0324096679688, "loss": 0.1903, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.11323089897632599, "rewards/margins": 0.051735300570726395, "rewards/rejected": -0.16496619582176208, "step": 1800 }, { "epoch": 0.97, "learning_rate": 1.829262236869772e-08, "logits/chosen": -0.2206902951002121, "logits/rejected": -0.22108717262744904, "logps/chosen": -513.4564208984375, "logps/rejected": -591.93701171875, "loss": 0.1774, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10885292291641235, "rewards/margins": 0.06501305848360062, "rewards/rejected": -0.17386598885059357, "step": 1810 }, { "epoch": 0.97, "learning_rate": 1.3101628363929586e-08, "logits/chosen": -0.11251676082611084, "logits/rejected": -0.19971203804016113, "logps/chosen": -483.9027404785156, "logps/rejected": -568.6507568359375, "loss": 0.1638, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0930183082818985, "rewards/margins": 0.07240499556064606, "rewards/rejected": -0.16542330384254456, "step": 1820 }, { "epoch": 0.98, "learning_rate": 8.773046908123195e-09, "logits/chosen": -0.10743804275989532, "logits/rejected": -0.14361344277858734, "logps/chosen": -578.63671875, "logps/rejected": -649.34423828125, "loss": 0.1712, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12350692600011826, "rewards/margins": 0.06949726492166519, "rewards/rejected": -0.19300420582294464, "step": 1830 }, { "epoch": 0.98, "learning_rate": 5.308379077080817e-09, "logits/chosen": -0.2110254317522049, "logits/rejected": -0.14409136772155762, "logps/chosen": -484.0826110839844, "logps/rejected": -534.74755859375, "loss": 0.1695, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.09997762739658356, "rewards/margins": 0.05725119262933731, "rewards/rejected": -0.15722879767417908, "step": 1840 }, { "epoch": 0.99, "learning_rate": 2.7088263565760996e-09, "logits/chosen": -0.2690781056880951, "logits/rejected": -0.10777749866247177, "logps/chosen": -531.328125, "logps/rejected": -558.0218505859375, "loss": 0.192, "rewards/accuracies": 0.4375, "rewards/chosen": -0.11898232996463776, "rewards/margins": 0.04323791339993477, "rewards/rejected": -0.16222023963928223, "step": 1850 }, { "epoch": 0.99, "learning_rate": 9.752902257023633e-10, "logits/chosen": -0.2458486258983612, "logits/rejected": -0.14588430523872375, "logps/chosen": -481.7171936035156, "logps/rejected": -523.7376708984375, "loss": 0.1744, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.10408926010131836, "rewards/margins": 0.047747764736413956, "rewards/rejected": -0.15183702111244202, "step": 1860 }, { "epoch": 1.0, "learning_rate": 1.083718442532189e-10, "logits/chosen": -0.2206140011548996, "logits/rejected": -0.13354091346263885, "logps/chosen": -543.6251220703125, "logps/rejected": -563.88330078125, "loss": 0.1818, "rewards/accuracies": 0.5, "rewards/chosen": -0.11700856685638428, "rewards/margins": 0.05272556096315384, "rewards/rejected": -0.1697341352701187, "step": 1870 }, { "epoch": 1.0, "step": 1875, "total_flos": 0.0, "train_loss": 0.016774978733062745, "train_runtime": 1063.735, "train_samples_per_second": 28.203, "train_steps_per_second": 1.763 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }