{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998972954467648, "eval_steps": 100, "global_step": 6570, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.5662100456621e-10, "logits/chosen": -0.5863479375839233, "logits/rejected": -0.6061025261878967, "logps/chosen": -79.05304718017578, "logps/rejected": -63.445465087890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 4.5662100456621e-09, "logits/chosen": -0.42927077412605286, "logits/rejected": -0.4148653745651245, "logps/chosen": -84.7120590209961, "logps/rejected": -67.09528350830078, "loss": 0.6975, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.036662034690380096, "rewards/margins": -0.027731113135814667, "rewards/rejected": -0.00893092155456543, "step": 10 }, { "epoch": 0.01, "learning_rate": 9.1324200913242e-09, "logits/chosen": -0.3915753960609436, "logits/rejected": -0.43573087453842163, "logps/chosen": -89.85740661621094, "logps/rejected": -68.4244155883789, "loss": 0.6922, "rewards/accuracies": 0.5625, "rewards/chosen": 0.019763624295592308, "rewards/margins": 0.016261756420135498, "rewards/rejected": 0.0035018683411180973, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.36986301369863e-08, "logits/chosen": -0.40983158349990845, "logits/rejected": -0.3968455195426941, "logps/chosen": -85.82371520996094, "logps/rejected": -68.32268524169922, "loss": 0.6878, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.008524882607161999, "rewards/margins": 0.002812635852023959, "rewards/rejected": 0.005712246987968683, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.82648401826484e-08, "logits/chosen": -0.43874412775039673, "logits/rejected": -0.4789341390132904, "logps/chosen": -88.45948791503906, "logps/rejected": -67.00940704345703, "loss": 0.689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02156217023730278, "rewards/margins": 0.013487410731613636, "rewards/rejected": 0.008074760437011719, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.28310502283105e-08, "logits/chosen": -0.41604742407798767, "logits/rejected": -0.48207980394363403, "logps/chosen": -86.55135345458984, "logps/rejected": -65.5015640258789, "loss": 0.6802, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0028099894989281893, "rewards/margins": 0.012911021709442139, "rewards/rejected": -0.010101032443344593, "step": 50 }, { "epoch": 0.03, "learning_rate": 2.73972602739726e-08, "logits/chosen": -0.508940577507019, "logits/rejected": -0.46223893761634827, "logps/chosen": -90.01718139648438, "logps/rejected": -69.65950012207031, "loss": 0.6651, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04631998389959335, "rewards/margins": 0.0902792438864708, "rewards/rejected": -0.04395925998687744, "step": 60 }, { "epoch": 0.03, "learning_rate": 3.19634703196347e-08, "logits/chosen": -0.3910054564476013, "logits/rejected": -0.4126061797142029, "logps/chosen": -87.1267318725586, "logps/rejected": -67.88300323486328, "loss": 0.6491, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.03842337056994438, "rewards/margins": 0.10330170392990112, "rewards/rejected": -0.06487832963466644, "step": 70 }, { "epoch": 0.04, "learning_rate": 3.65296803652968e-08, "logits/chosen": -0.3637865483760834, "logits/rejected": -0.4041527211666107, "logps/chosen": -92.4252700805664, "logps/rejected": -68.47924041748047, "loss": 0.6122, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.08547542989253998, "rewards/margins": 0.18988534808158875, "rewards/rejected": -0.10440991073846817, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.10958904109589e-08, "logits/chosen": -0.3265071511268616, "logits/rejected": -0.3582015633583069, "logps/chosen": -79.43329620361328, "logps/rejected": -63.17741012573242, "loss": 0.5792, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.08616051822900772, "rewards/margins": 0.2579037547111511, "rewards/rejected": -0.171743243932724, "step": 90 }, { "epoch": 0.05, "learning_rate": 4.5662100456621e-08, "logits/chosen": -0.4225758910179138, "logits/rejected": -0.4386584162712097, "logps/chosen": -85.61375427246094, "logps/rejected": -66.69580078125, "loss": 0.5515, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.09259802103042603, "rewards/margins": 0.32498809695243835, "rewards/rejected": -0.23239007592201233, "step": 100 }, { "epoch": 0.05, "eval_logits/chosen": -0.39840757846832275, "eval_logits/rejected": -0.4061564803123474, "eval_logps/chosen": -85.365966796875, "eval_logps/rejected": -65.37533569335938, "eval_loss": 0.5419811010360718, "eval_rewards/accuracies": 0.910614550113678, "eval_rewards/chosen": 0.08594708889722824, "eval_rewards/margins": 0.3429102897644043, "eval_rewards/rejected": -0.25696322321891785, "eval_runtime": 81.2496, "eval_samples_per_second": 35.225, "eval_steps_per_second": 2.203, "step": 100 }, { "epoch": 0.05, "learning_rate": 5.02283105022831e-08, "logits/chosen": -0.46739107370376587, "logits/rejected": -0.44236382842063904, "logps/chosen": -79.65380096435547, "logps/rejected": -62.80085372924805, "loss": 0.5316, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06431801617145538, "rewards/margins": 0.35616832971572876, "rewards/rejected": -0.29185032844543457, "step": 110 }, { "epoch": 0.05, "learning_rate": 5.47945205479452e-08, "logits/chosen": -0.40276527404785156, "logits/rejected": -0.4527131915092468, "logps/chosen": -89.0259780883789, "logps/rejected": -69.21381378173828, "loss": 0.4844, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.15527208149433136, "rewards/margins": 0.5430252552032471, "rewards/rejected": -0.3877531886100769, "step": 120 }, { "epoch": 0.06, "learning_rate": 5.93607305936073e-08, "logits/chosen": -0.34187182784080505, "logits/rejected": -0.3704484701156616, "logps/chosen": -90.00749206542969, "logps/rejected": -65.06084442138672, "loss": 0.4142, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19979539513587952, "rewards/margins": 0.7884315848350525, "rewards/rejected": -0.5886362791061401, "step": 130 }, { "epoch": 0.06, "learning_rate": 6.39269406392694e-08, "logits/chosen": -0.4649466872215271, "logits/rejected": -0.4326017498970032, "logps/chosen": -86.98945617675781, "logps/rejected": -68.6103515625, "loss": 0.3701, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17062470316886902, "rewards/margins": 0.8517505526542664, "rewards/rejected": -0.681125819683075, "step": 140 }, { "epoch": 0.07, "learning_rate": 6.84931506849315e-08, "logits/chosen": -0.45548853278160095, "logits/rejected": -0.4255827069282532, "logps/chosen": -84.7039794921875, "logps/rejected": -66.87324523925781, "loss": 0.3414, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2619437873363495, "rewards/margins": 0.9896998405456543, "rewards/rejected": -0.7277560830116272, "step": 150 }, { "epoch": 0.07, "learning_rate": 7.30593607305936e-08, "logits/chosen": -0.3764224648475647, "logits/rejected": -0.4269483685493469, "logps/chosen": -89.51905822753906, "logps/rejected": -70.71208953857422, "loss": 0.3168, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.203881174325943, "rewards/margins": 1.1547460556030273, "rewards/rejected": -0.9508647918701172, "step": 160 }, { "epoch": 0.08, "learning_rate": 7.76255707762557e-08, "logits/chosen": -0.31334739923477173, "logits/rejected": -0.373046338558197, "logps/chosen": -81.72102355957031, "logps/rejected": -69.88837432861328, "loss": 0.3234, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17189902067184448, "rewards/margins": 1.1217617988586426, "rewards/rejected": -0.9498627781867981, "step": 170 }, { "epoch": 0.08, "learning_rate": 8.21917808219178e-08, "logits/chosen": -0.4377995431423187, "logits/rejected": -0.4540349841117859, "logps/chosen": -91.31315612792969, "logps/rejected": -67.61857604980469, "loss": 0.3018, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3559116721153259, "rewards/margins": 1.3531014919281006, "rewards/rejected": -0.9971898198127747, "step": 180 }, { "epoch": 0.09, "learning_rate": 8.67579908675799e-08, "logits/chosen": -0.5179687738418579, "logits/rejected": -0.5117358565330505, "logps/chosen": -84.22309875488281, "logps/rejected": -69.97932434082031, "loss": 0.2677, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3408846855163574, "rewards/margins": 1.411771297454834, "rewards/rejected": -1.0708866119384766, "step": 190 }, { "epoch": 0.09, "learning_rate": 9.1324200913242e-08, "logits/chosen": -0.40042734146118164, "logits/rejected": -0.4178311228752136, "logps/chosen": -85.72370910644531, "logps/rejected": -68.88258361816406, "loss": 0.2448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2792873680591583, "rewards/margins": 1.6100257635116577, "rewards/rejected": -1.3307384252548218, "step": 200 }, { "epoch": 0.09, "eval_logits/chosen": -0.3937744200229645, "eval_logits/rejected": -0.39809945225715637, "eval_logps/chosen": -84.8699722290039, "eval_logps/rejected": -67.29297637939453, "eval_loss": 0.23899392783641815, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.33393988013267517, "eval_rewards/margins": 1.5497231483459473, "eval_rewards/rejected": -1.2157832384109497, "eval_runtime": 99.6106, "eval_samples_per_second": 28.732, "eval_steps_per_second": 1.797, "step": 200 }, { "epoch": 0.1, "learning_rate": 9.58904109589041e-08, "logits/chosen": -0.4026223123073578, "logits/rejected": -0.4446091651916504, "logps/chosen": -79.4525375366211, "logps/rejected": -65.70721435546875, "loss": 0.2066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4364851415157318, "rewards/margins": 1.7379001379013062, "rewards/rejected": -1.3014150857925415, "step": 210 }, { "epoch": 0.1, "learning_rate": 1.004566210045662e-07, "logits/chosen": -0.3757795989513397, "logits/rejected": -0.40395841002464294, "logps/chosen": -80.05278015136719, "logps/rejected": -67.86895751953125, "loss": 0.1816, "rewards/accuracies": 1.0, "rewards/chosen": 0.3973820209503174, "rewards/margins": 1.9494373798370361, "rewards/rejected": -1.5520555973052979, "step": 220 }, { "epoch": 0.1, "learning_rate": 1.050228310502283e-07, "logits/chosen": -0.3210656940937042, "logits/rejected": -0.3856700658798218, "logps/chosen": -87.6169662475586, "logps/rejected": -71.29808044433594, "loss": 0.1496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3461844027042389, "rewards/margins": 2.317335605621338, "rewards/rejected": -1.9711511135101318, "step": 230 }, { "epoch": 0.11, "learning_rate": 1.095890410958904e-07, "logits/chosen": -0.4276762902736664, "logits/rejected": -0.45350733399391174, "logps/chosen": -88.98173522949219, "logps/rejected": -66.96391296386719, "loss": 0.1329, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5452815294265747, "rewards/margins": 2.694061517715454, "rewards/rejected": -2.148780107498169, "step": 240 }, { "epoch": 0.11, "learning_rate": 1.141552511415525e-07, "logits/chosen": -0.4410382807254791, "logits/rejected": -0.4512443542480469, "logps/chosen": -88.98338317871094, "logps/rejected": -74.33209991455078, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 0.3832108974456787, "rewards/margins": 2.685126304626465, "rewards/rejected": -2.301915168762207, "step": 250 }, { "epoch": 0.12, "learning_rate": 1.187214611872146e-07, "logits/chosen": -0.4957438111305237, "logits/rejected": -0.4765089154243469, "logps/chosen": -86.44584655761719, "logps/rejected": -72.58143615722656, "loss": 0.1242, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5778161883354187, "rewards/margins": 2.74499249458313, "rewards/rejected": -2.1671762466430664, "step": 260 }, { "epoch": 0.12, "learning_rate": 1.232876712328767e-07, "logits/chosen": -0.4042617380619049, "logits/rejected": -0.44205984473228455, "logps/chosen": -87.01408386230469, "logps/rejected": -71.92070007324219, "loss": 0.1136, "rewards/accuracies": 1.0, "rewards/chosen": 0.5387381911277771, "rewards/margins": 2.790922164916992, "rewards/rejected": -2.2521841526031494, "step": 270 }, { "epoch": 0.13, "learning_rate": 1.278538812785388e-07, "logits/chosen": -0.4417573809623718, "logits/rejected": -0.4520273804664612, "logps/chosen": -88.83131408691406, "logps/rejected": -77.17863464355469, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 0.42587852478027344, "rewards/margins": 3.2474589347839355, "rewards/rejected": -2.8215808868408203, "step": 280 }, { "epoch": 0.13, "learning_rate": 1.324200913242009e-07, "logits/chosen": -0.4425846040248871, "logits/rejected": -0.5012689828872681, "logps/chosen": -90.25948333740234, "logps/rejected": -76.37370300292969, "loss": 0.1002, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.533167839050293, "rewards/margins": 3.3074212074279785, "rewards/rejected": -2.7742531299591064, "step": 290 }, { "epoch": 0.14, "learning_rate": 1.36986301369863e-07, "logits/chosen": -0.3799470365047455, "logits/rejected": -0.44799962639808655, "logps/chosen": -80.90581512451172, "logps/rejected": -70.25505065917969, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 0.6411818265914917, "rewards/margins": 3.449615478515625, "rewards/rejected": -2.808433771133423, "step": 300 }, { "epoch": 0.14, "eval_logits/chosen": -0.4048069417476654, "eval_logits/rejected": -0.40349695086479187, "eval_logps/chosen": -84.36418914794922, "eval_logps/rejected": -70.42156219482422, "eval_loss": 0.0937228724360466, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.5868332386016846, "eval_rewards/margins": 3.3669118881225586, "eval_rewards/rejected": -2.780078411102295, "eval_runtime": 95.1355, "eval_samples_per_second": 30.083, "eval_steps_per_second": 1.882, "step": 300 }, { "epoch": 0.14, "learning_rate": 1.415525114155251e-07, "logits/chosen": -0.39280638098716736, "logits/rejected": -0.3995700180530548, "logps/chosen": -84.13384246826172, "logps/rejected": -70.02226257324219, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 0.6554551124572754, "rewards/margins": 3.7848308086395264, "rewards/rejected": -3.12937593460083, "step": 310 }, { "epoch": 0.15, "learning_rate": 1.461187214611872e-07, "logits/chosen": -0.4027988314628601, "logits/rejected": -0.43616342544555664, "logps/chosen": -90.87525939941406, "logps/rejected": -73.50352478027344, "loss": 0.0821, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.41767749190330505, "rewards/margins": 3.4736123085021973, "rewards/rejected": -3.0559346675872803, "step": 320 }, { "epoch": 0.15, "learning_rate": 1.506849315068493e-07, "logits/chosen": -0.40688735246658325, "logits/rejected": -0.42999106645584106, "logps/chosen": -81.19529724121094, "logps/rejected": -72.04861450195312, "loss": 0.0833, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9484050869941711, "rewards/margins": 3.989964246749878, "rewards/rejected": -3.0415592193603516, "step": 330 }, { "epoch": 0.16, "learning_rate": 1.552511415525114e-07, "logits/chosen": -0.3533805012702942, "logits/rejected": -0.3621350824832916, "logps/chosen": -82.51054382324219, "logps/rejected": -69.80853271484375, "loss": 0.0881, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.532248854637146, "rewards/margins": 3.6017870903015137, "rewards/rejected": -3.0695383548736572, "step": 340 }, { "epoch": 0.16, "learning_rate": 1.598173515981735e-07, "logits/chosen": -0.3965502381324768, "logits/rejected": -0.42704907059669495, "logps/chosen": -79.4199447631836, "logps/rejected": -74.82305145263672, "loss": 0.0793, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6175357103347778, "rewards/margins": 4.187249660491943, "rewards/rejected": -3.569714307785034, "step": 350 }, { "epoch": 0.16, "learning_rate": 1.643835616438356e-07, "logits/chosen": -0.40762096643447876, "logits/rejected": -0.41946473717689514, "logps/chosen": -82.0754623413086, "logps/rejected": -73.5984878540039, "loss": 0.0705, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5647357106208801, "rewards/margins": 3.6981327533721924, "rewards/rejected": -3.133397102355957, "step": 360 }, { "epoch": 0.17, "learning_rate": 1.689497716894977e-07, "logits/chosen": -0.42972856760025024, "logits/rejected": -0.44596537947654724, "logps/chosen": -86.44332122802734, "logps/rejected": -69.35662841796875, "loss": 0.0753, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8704233169555664, "rewards/margins": 3.9212913513183594, "rewards/rejected": -3.050868511199951, "step": 370 }, { "epoch": 0.17, "learning_rate": 1.735159817351598e-07, "logits/chosen": -0.39177441596984863, "logits/rejected": -0.4080818295478821, "logps/chosen": -86.29408264160156, "logps/rejected": -72.41487884521484, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 0.6717110872268677, "rewards/margins": 4.34299373626709, "rewards/rejected": -3.6712818145751953, "step": 380 }, { "epoch": 0.18, "learning_rate": 1.780821917808219e-07, "logits/chosen": -0.40621963143348694, "logits/rejected": -0.43334826827049255, "logps/chosen": -83.55781555175781, "logps/rejected": -73.66390991210938, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": 0.5702416896820068, "rewards/margins": 4.263453006744385, "rewards/rejected": -3.693211317062378, "step": 390 }, { "epoch": 0.18, "learning_rate": 1.82648401826484e-07, "logits/chosen": -0.4090951979160309, "logits/rejected": -0.48223596811294556, "logps/chosen": -83.9447250366211, "logps/rejected": -78.8793716430664, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 1.0351506471633911, "rewards/margins": 5.023129463195801, "rewards/rejected": -3.9879791736602783, "step": 400 }, { "epoch": 0.18, "eval_logits/chosen": -0.4163001775741577, "eval_logits/rejected": -0.40926501154899597, "eval_logps/chosen": -83.8791275024414, "eval_logps/rejected": -72.57203674316406, "eval_loss": 0.053430840373039246, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.8293697237968445, "eval_rewards/margins": 4.6846842765808105, "eval_rewards/rejected": -3.855314254760742, "eval_runtime": 67.7104, "eval_samples_per_second": 42.268, "eval_steps_per_second": 2.644, "step": 400 }, { "epoch": 0.19, "learning_rate": 1.872146118721461e-07, "logits/chosen": -0.3794030547142029, "logits/rejected": -0.44113850593566895, "logps/chosen": -93.76518249511719, "logps/rejected": -75.81021881103516, "loss": 0.0508, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9566559791564941, "rewards/margins": 4.865769863128662, "rewards/rejected": -3.909113645553589, "step": 410 }, { "epoch": 0.19, "learning_rate": 1.917808219178082e-07, "logits/chosen": -0.44859427213668823, "logits/rejected": -0.4528217911720276, "logps/chosen": -81.76443481445312, "logps/rejected": -75.37516784667969, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 1.0190056562423706, "rewards/margins": 4.9655022621154785, "rewards/rejected": -3.9464964866638184, "step": 420 }, { "epoch": 0.2, "learning_rate": 1.963470319634703e-07, "logits/chosen": -0.44366198778152466, "logits/rejected": -0.435803085565567, "logps/chosen": -80.83895111083984, "logps/rejected": -75.4388427734375, "loss": 0.04, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6601322293281555, "rewards/margins": 4.995715141296387, "rewards/rejected": -4.335582733154297, "step": 430 }, { "epoch": 0.2, "learning_rate": 2.009132420091324e-07, "logits/chosen": -0.4242371618747711, "logits/rejected": -0.44815540313720703, "logps/chosen": -78.79200744628906, "logps/rejected": -74.7320785522461, "loss": 0.0372, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9704369306564331, "rewards/margins": 5.741919994354248, "rewards/rejected": -4.771483421325684, "step": 440 }, { "epoch": 0.21, "learning_rate": 2.054794520547945e-07, "logits/chosen": -0.37376198172569275, "logits/rejected": -0.41443461179733276, "logps/chosen": -87.1550064086914, "logps/rejected": -76.47234344482422, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 1.1105682849884033, "rewards/margins": 6.222236156463623, "rewards/rejected": -5.111668586730957, "step": 450 }, { "epoch": 0.21, "learning_rate": 2.100456621004566e-07, "logits/chosen": -0.46613430976867676, "logits/rejected": -0.45502910017967224, "logps/chosen": -88.49858093261719, "logps/rejected": -83.72077941894531, "loss": 0.0368, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8836356997489929, "rewards/margins": 5.70847225189209, "rewards/rejected": -4.824836730957031, "step": 460 }, { "epoch": 0.21, "learning_rate": 2.146118721461187e-07, "logits/chosen": -0.49693307280540466, "logits/rejected": -0.49518340826034546, "logps/chosen": -84.83604431152344, "logps/rejected": -76.25226593017578, "loss": 0.0301, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8985893130302429, "rewards/margins": 5.911735534667969, "rewards/rejected": -5.01314640045166, "step": 470 }, { "epoch": 0.22, "learning_rate": 2.191780821917808e-07, "logits/chosen": -0.5251117944717407, "logits/rejected": -0.4963545799255371, "logps/chosen": -87.3931884765625, "logps/rejected": -76.91529846191406, "loss": 0.0288, "rewards/accuracies": 1.0, "rewards/chosen": 1.006616473197937, "rewards/margins": 6.471321105957031, "rewards/rejected": -5.4647040367126465, "step": 480 }, { "epoch": 0.22, "learning_rate": 2.237442922374429e-07, "logits/chosen": -0.4567469656467438, "logits/rejected": -0.44530144333839417, "logps/chosen": -86.60816955566406, "logps/rejected": -80.15440368652344, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": 1.0026960372924805, "rewards/margins": 6.509522438049316, "rewards/rejected": -5.506826877593994, "step": 490 }, { "epoch": 0.23, "learning_rate": 2.28310502283105e-07, "logits/chosen": -0.44169288873672485, "logits/rejected": -0.4075329899787903, "logps/chosen": -85.48939514160156, "logps/rejected": -78.98982238769531, "loss": 0.0261, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6804320216178894, "rewards/margins": 6.771535396575928, "rewards/rejected": -6.091103553771973, "step": 500 }, { "epoch": 0.23, "eval_logits/chosen": -0.4554254114627838, "eval_logits/rejected": -0.43644845485687256, "eval_logps/chosen": -83.64783477783203, "eval_logps/rejected": -77.03020477294922, "eval_loss": 0.026485837996006012, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.9450124502182007, "eval_rewards/margins": 7.029412269592285, "eval_rewards/rejected": -6.084399700164795, "eval_runtime": 68.3909, "eval_samples_per_second": 41.848, "eval_steps_per_second": 2.617, "step": 500 }, { "epoch": 0.23, "learning_rate": 2.328767123287671e-07, "logits/chosen": -0.48590287566185, "logits/rejected": -0.4845882058143616, "logps/chosen": -79.791748046875, "logps/rejected": -75.46503448486328, "loss": 0.0256, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8771770596504211, "rewards/margins": 7.567771911621094, "rewards/rejected": -6.690594673156738, "step": 510 }, { "epoch": 0.24, "learning_rate": 2.374429223744292e-07, "logits/chosen": -0.49845123291015625, "logits/rejected": -0.5058192610740662, "logps/chosen": -88.39710998535156, "logps/rejected": -82.1724624633789, "loss": 0.021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8767349123954773, "rewards/margins": 7.156858921051025, "rewards/rejected": -6.280123710632324, "step": 520 }, { "epoch": 0.24, "learning_rate": 2.420091324200913e-07, "logits/chosen": -0.5552406311035156, "logits/rejected": -0.5379841923713684, "logps/chosen": -90.11488342285156, "logps/rejected": -83.9296646118164, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 1.1575672626495361, "rewards/margins": 8.527839660644531, "rewards/rejected": -7.370272636413574, "step": 530 }, { "epoch": 0.25, "learning_rate": 2.465753424657534e-07, "logits/chosen": -0.4260433614253998, "logits/rejected": -0.42235612869262695, "logps/chosen": -86.18304443359375, "logps/rejected": -83.03050994873047, "loss": 0.0169, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.153515100479126, "rewards/margins": 8.45301628112793, "rewards/rejected": -7.299500942230225, "step": 540 }, { "epoch": 0.25, "learning_rate": 2.511415525114155e-07, "logits/chosen": -0.5101253986358643, "logits/rejected": -0.5125061869621277, "logps/chosen": -85.40342712402344, "logps/rejected": -77.66134643554688, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": 0.4489992558956146, "rewards/margins": 7.528244972229004, "rewards/rejected": -7.079245567321777, "step": 550 }, { "epoch": 0.26, "learning_rate": 2.557077625570776e-07, "logits/chosen": -0.44849318265914917, "logits/rejected": -0.5008470416069031, "logps/chosen": -88.95763397216797, "logps/rejected": -82.1678237915039, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 1.1119191646575928, "rewards/margins": 8.655721664428711, "rewards/rejected": -7.543802738189697, "step": 560 }, { "epoch": 0.26, "learning_rate": 2.602739726027397e-07, "logits/chosen": -0.47934848070144653, "logits/rejected": -0.5097138285636902, "logps/chosen": -91.19532012939453, "logps/rejected": -81.44822692871094, "loss": 0.0211, "rewards/accuracies": 1.0, "rewards/chosen": 1.1905813217163086, "rewards/margins": 8.602886199951172, "rewards/rejected": -7.412304878234863, "step": 570 }, { "epoch": 0.26, "learning_rate": 2.648401826484018e-07, "logits/chosen": -0.5011765360832214, "logits/rejected": -0.4872073233127594, "logps/chosen": -85.41288757324219, "logps/rejected": -81.96855163574219, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.8738569021224976, "rewards/margins": 9.319847106933594, "rewards/rejected": -8.445989608764648, "step": 580 }, { "epoch": 0.27, "learning_rate": 2.694063926940639e-07, "logits/chosen": -0.44528061151504517, "logits/rejected": -0.5134448409080505, "logps/chosen": -85.46125793457031, "logps/rejected": -85.14677429199219, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 1.2151055335998535, "rewards/margins": 10.124042510986328, "rewards/rejected": -8.908937454223633, "step": 590 }, { "epoch": 0.27, "learning_rate": 2.73972602739726e-07, "logits/chosen": -0.5176888108253479, "logits/rejected": -0.4848707318305969, "logps/chosen": -82.91043090820312, "logps/rejected": -85.11193084716797, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 1.2624995708465576, "rewards/margins": 10.12977123260498, "rewards/rejected": -8.86727237701416, "step": 600 }, { "epoch": 0.27, "eval_logits/chosen": -0.4719269871711731, "eval_logits/rejected": -0.44449782371520996, "eval_logps/chosen": -83.74675750732422, "eval_logps/rejected": -82.89984130859375, "eval_loss": 0.01547443587332964, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.8955463171005249, "eval_rewards/margins": 9.914765357971191, "eval_rewards/rejected": -9.019220352172852, "eval_runtime": 80.4475, "eval_samples_per_second": 35.576, "eval_steps_per_second": 2.225, "step": 600 }, { "epoch": 0.28, "learning_rate": 2.785388127853881e-07, "logits/chosen": -0.5298361778259277, "logits/rejected": -0.49106723070144653, "logps/chosen": -85.64407348632812, "logps/rejected": -83.2470703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 0.8313077092170715, "rewards/margins": 9.659244537353516, "rewards/rejected": -8.827936172485352, "step": 610 }, { "epoch": 0.28, "learning_rate": 2.831050228310502e-07, "logits/chosen": -0.5390816926956177, "logits/rejected": -0.5547928214073181, "logps/chosen": -83.07210540771484, "logps/rejected": -86.65785217285156, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": 0.5047268867492676, "rewards/margins": 10.445852279663086, "rewards/rejected": -9.941125869750977, "step": 620 }, { "epoch": 0.29, "learning_rate": 2.876712328767123e-07, "logits/chosen": -0.5076915621757507, "logits/rejected": -0.4994226396083832, "logps/chosen": -88.59370422363281, "logps/rejected": -89.94892883300781, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.8393543362617493, "rewards/margins": 10.692634582519531, "rewards/rejected": -9.853279113769531, "step": 630 }, { "epoch": 0.29, "learning_rate": 2.922374429223744e-07, "logits/chosen": -0.530367374420166, "logits/rejected": -0.521192193031311, "logps/chosen": -85.75426483154297, "logps/rejected": -85.36978149414062, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 0.40226420760154724, "rewards/margins": 9.481393814086914, "rewards/rejected": -9.079129219055176, "step": 640 }, { "epoch": 0.3, "learning_rate": 2.968036529680365e-07, "logits/chosen": -0.49263906478881836, "logits/rejected": -0.5358297824859619, "logps/chosen": -86.92278289794922, "logps/rejected": -84.6957015991211, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 1.3719327449798584, "rewards/margins": 12.041804313659668, "rewards/rejected": -10.66987133026123, "step": 650 }, { "epoch": 0.3, "learning_rate": 2.998477929984779e-07, "logits/chosen": -0.5359422564506531, "logits/rejected": -0.493899405002594, "logps/chosen": -83.14149475097656, "logps/rejected": -88.8863296508789, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 0.620421290397644, "rewards/margins": 12.346444129943848, "rewards/rejected": -11.726022720336914, "step": 660 }, { "epoch": 0.31, "learning_rate": 2.993404363267377e-07, "logits/chosen": -0.48512953519821167, "logits/rejected": -0.5147516131401062, "logps/chosen": -88.53128814697266, "logps/rejected": -90.33158111572266, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 0.5566505193710327, "rewards/margins": 12.281156539916992, "rewards/rejected": -11.724506378173828, "step": 670 }, { "epoch": 0.31, "learning_rate": 2.9883307965499743e-07, "logits/chosen": -0.5035023093223572, "logits/rejected": -0.47232455015182495, "logps/chosen": -88.98485565185547, "logps/rejected": -94.34496307373047, "loss": 0.0109, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7806042432785034, "rewards/margins": 12.724248886108398, "rewards/rejected": -11.943646430969238, "step": 680 }, { "epoch": 0.31, "learning_rate": 2.983257229832572e-07, "logits/chosen": -0.46104001998901367, "logits/rejected": -0.5387547016143799, "logps/chosen": -87.83756256103516, "logps/rejected": -85.52259826660156, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 1.1090915203094482, "rewards/margins": 13.257928848266602, "rewards/rejected": -12.148837089538574, "step": 690 }, { "epoch": 0.32, "learning_rate": 2.9781836631151696e-07, "logits/chosen": -0.4819292426109314, "logits/rejected": -0.473407506942749, "logps/chosen": -86.3482894897461, "logps/rejected": -93.20619201660156, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7059062719345093, "rewards/margins": 12.420666694641113, "rewards/rejected": -11.714760780334473, "step": 700 }, { "epoch": 0.32, "eval_logits/chosen": -0.49807193875312805, "eval_logits/rejected": -0.4617076814174652, "eval_logps/chosen": -84.06249237060547, "eval_logps/rejected": -89.08558654785156, "eval_loss": 0.011189465411007404, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.7376802563667297, "eval_rewards/margins": 12.849767684936523, "eval_rewards/rejected": -12.112089157104492, "eval_runtime": 87.676, "eval_samples_per_second": 32.643, "eval_steps_per_second": 2.042, "step": 700 }, { "epoch": 0.32, "learning_rate": 2.9731100963977676e-07, "logits/chosen": -0.47739043831825256, "logits/rejected": -0.47871193289756775, "logps/chosen": -85.58582305908203, "logps/rejected": -92.14688873291016, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.8560394048690796, "rewards/margins": 12.940500259399414, "rewards/rejected": -12.08445930480957, "step": 710 }, { "epoch": 0.33, "learning_rate": 2.968036529680365e-07, "logits/chosen": -0.5277897119522095, "logits/rejected": -0.5494237542152405, "logps/chosen": -84.10975646972656, "logps/rejected": -94.46439361572266, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 1.170245885848999, "rewards/margins": 14.76359748840332, "rewards/rejected": -13.593350410461426, "step": 720 }, { "epoch": 0.33, "learning_rate": 2.962962962962963e-07, "logits/chosen": -0.5545412302017212, "logits/rejected": -0.4997970461845398, "logps/chosen": -85.01225280761719, "logps/rejected": -90.4923324584961, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 1.1222950220108032, "rewards/margins": 14.033930778503418, "rewards/rejected": -12.911636352539062, "step": 730 }, { "epoch": 0.34, "learning_rate": 2.9578893962455603e-07, "logits/chosen": -0.47376394271850586, "logits/rejected": -0.4383009374141693, "logps/chosen": -82.76625061035156, "logps/rejected": -88.45748138427734, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 0.7124737501144409, "rewards/margins": 13.04736614227295, "rewards/rejected": -12.334892272949219, "step": 740 }, { "epoch": 0.34, "learning_rate": 2.952815829528158e-07, "logits/chosen": -0.5440901517868042, "logits/rejected": -0.5601977109909058, "logps/chosen": -83.02261352539062, "logps/rejected": -93.058349609375, "loss": 0.0084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9380642771720886, "rewards/margins": 14.189043045043945, "rewards/rejected": -13.250978469848633, "step": 750 }, { "epoch": 0.35, "learning_rate": 2.9477422628107556e-07, "logits/chosen": -0.6197845339775085, "logits/rejected": -0.5904224514961243, "logps/chosen": -86.34260559082031, "logps/rejected": -93.52967834472656, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9487737417221069, "rewards/margins": 13.869544982910156, "rewards/rejected": -12.92077350616455, "step": 760 }, { "epoch": 0.35, "learning_rate": 2.9426686960933536e-07, "logits/chosen": -0.5594059228897095, "logits/rejected": -0.4972243905067444, "logps/chosen": -92.50401306152344, "logps/rejected": -92.55013275146484, "loss": 0.0083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2346494495868683, "rewards/margins": 12.848955154418945, "rewards/rejected": -12.61430549621582, "step": 770 }, { "epoch": 0.36, "learning_rate": 2.937595129375951e-07, "logits/chosen": -0.48909980058670044, "logits/rejected": -0.5088886022567749, "logps/chosen": -85.69803619384766, "logps/rejected": -91.68397521972656, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.01169496774673462, "rewards/margins": 13.43089771270752, "rewards/rejected": -13.442593574523926, "step": 780 }, { "epoch": 0.36, "learning_rate": 2.932521562658549e-07, "logits/chosen": -0.5919016599655151, "logits/rejected": -0.5654195547103882, "logps/chosen": -85.5023422241211, "logps/rejected": -95.96412658691406, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8735700845718384, "rewards/margins": 15.024134635925293, "rewards/rejected": -14.15056324005127, "step": 790 }, { "epoch": 0.37, "learning_rate": 2.9274479959411463e-07, "logits/chosen": -0.570320725440979, "logits/rejected": -0.5494597554206848, "logps/chosen": -84.90104675292969, "logps/rejected": -96.68202209472656, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.5182453393936157, "rewards/margins": 14.31817626953125, "rewards/rejected": -13.79992961883545, "step": 800 }, { "epoch": 0.37, "eval_logits/chosen": -0.5433102250099182, "eval_logits/rejected": -0.4990071952342987, "eval_logps/chosen": -84.43854522705078, "eval_logps/rejected": -93.17755126953125, "eval_loss": 0.009528527967631817, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.5496511459350586, "eval_rewards/margins": 14.707724571228027, "eval_rewards/rejected": -14.158075332641602, "eval_runtime": 90.1102, "eval_samples_per_second": 31.761, "eval_steps_per_second": 1.986, "step": 800 }, { "epoch": 0.37, "learning_rate": 2.922374429223744e-07, "logits/chosen": -0.5402038097381592, "logits/rejected": -0.5096747279167175, "logps/chosen": -85.3931655883789, "logps/rejected": -97.1419677734375, "loss": 0.0151, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4692815840244293, "rewards/margins": 15.11835765838623, "rewards/rejected": -14.649076461791992, "step": 810 }, { "epoch": 0.37, "learning_rate": 2.9173008625063416e-07, "logits/chosen": -0.5429534912109375, "logits/rejected": -0.528504490852356, "logps/chosen": -87.19554138183594, "logps/rejected": -96.91336822509766, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.8628616333007812, "rewards/margins": 16.042449951171875, "rewards/rejected": -15.179588317871094, "step": 820 }, { "epoch": 0.38, "learning_rate": 2.9122272957889396e-07, "logits/chosen": -0.49800190329551697, "logits/rejected": -0.5036158561706543, "logps/chosen": -85.76387023925781, "logps/rejected": -96.38945770263672, "loss": 0.0123, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1412431001663208, "rewards/margins": 14.274526596069336, "rewards/rejected": -13.133282661437988, "step": 830 }, { "epoch": 0.38, "learning_rate": 2.907153729071537e-07, "logits/chosen": -0.5477937459945679, "logits/rejected": -0.5407083630561829, "logps/chosen": -89.34082794189453, "logps/rejected": -94.71735382080078, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.8216226696968079, "rewards/margins": 15.216272354125977, "rewards/rejected": -14.394651412963867, "step": 840 }, { "epoch": 0.39, "learning_rate": 2.902080162354135e-07, "logits/chosen": -0.5074556469917297, "logits/rejected": -0.4516450762748718, "logps/chosen": -77.45787048339844, "logps/rejected": -95.98930358886719, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09650325775146484, "rewards/margins": 14.629781723022461, "rewards/rejected": -14.726284980773926, "step": 850 }, { "epoch": 0.39, "learning_rate": 2.8970065956367323e-07, "logits/chosen": -0.5568779706954956, "logits/rejected": -0.5409306287765503, "logps/chosen": -85.82357025146484, "logps/rejected": -97.19561767578125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.310552179813385, "rewards/margins": 15.414616584777832, "rewards/rejected": -15.10406494140625, "step": 860 }, { "epoch": 0.4, "learning_rate": 2.89193302891933e-07, "logits/chosen": -0.550707221031189, "logits/rejected": -0.5625060796737671, "logps/chosen": -84.54888153076172, "logps/rejected": -101.38866424560547, "loss": 0.0099, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05552394315600395, "rewards/margins": 16.369537353515625, "rewards/rejected": -16.31401252746582, "step": 870 }, { "epoch": 0.4, "learning_rate": 2.8868594622019276e-07, "logits/chosen": -0.5476843118667603, "logits/rejected": -0.5468933582305908, "logps/chosen": -90.43180847167969, "logps/rejected": -96.9254379272461, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3115522861480713, "rewards/margins": 15.831393241882324, "rewards/rejected": -15.519842147827148, "step": 880 }, { "epoch": 0.41, "learning_rate": 2.8817858954845256e-07, "logits/chosen": -0.5169821977615356, "logits/rejected": -0.5348768830299377, "logps/chosen": -85.87166595458984, "logps/rejected": -104.33921813964844, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.8090624809265137, "rewards/margins": 17.356067657470703, "rewards/rejected": -16.547006607055664, "step": 890 }, { "epoch": 0.41, "learning_rate": 2.876712328767123e-07, "logits/chosen": -0.5918963551521301, "logits/rejected": -0.5073711276054382, "logps/chosen": -83.7479019165039, "logps/rejected": -100.97853088378906, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.8310455083847046, "rewards/margins": 17.748876571655273, "rewards/rejected": -16.917831420898438, "step": 900 }, { "epoch": 0.41, "eval_logits/chosen": -0.5699242353439331, "eval_logits/rejected": -0.5203292965888977, "eval_logps/chosen": -84.38159942626953, "eval_logps/rejected": -97.32843017578125, "eval_loss": 0.009458034299314022, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.5781266093254089, "eval_rewards/margins": 16.811635971069336, "eval_rewards/rejected": -16.23350715637207, "eval_runtime": 70.2365, "eval_samples_per_second": 40.748, "eval_steps_per_second": 2.549, "step": 900 }, { "epoch": 0.42, "learning_rate": 2.871638762049721e-07, "logits/chosen": -0.5940302610397339, "logits/rejected": -0.5538831353187561, "logps/chosen": -89.7022705078125, "logps/rejected": -102.83609771728516, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.3309146761894226, "rewards/margins": 16.62120819091797, "rewards/rejected": -16.290292739868164, "step": 910 }, { "epoch": 0.42, "learning_rate": 2.8665651953323183e-07, "logits/chosen": -0.5756375789642334, "logits/rejected": -0.5457393527030945, "logps/chosen": -85.42708587646484, "logps/rejected": -99.51619720458984, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 1.2837529182434082, "rewards/margins": 18.219730377197266, "rewards/rejected": -16.935976028442383, "step": 920 }, { "epoch": 0.42, "learning_rate": 2.861491628614916e-07, "logits/chosen": -0.6441935300827026, "logits/rejected": -0.5929199457168579, "logps/chosen": -83.81559753417969, "logps/rejected": -97.58605194091797, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 0.1634002923965454, "rewards/margins": 15.892837524414062, "rewards/rejected": -15.729436874389648, "step": 930 }, { "epoch": 0.43, "learning_rate": 2.8564180618975136e-07, "logits/chosen": -0.5742901563644409, "logits/rejected": -0.6205036640167236, "logps/chosen": -87.34207916259766, "logps/rejected": -104.62998962402344, "loss": 0.0075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.230440616607666, "rewards/margins": 18.693153381347656, "rewards/rejected": -17.462711334228516, "step": 940 }, { "epoch": 0.43, "learning_rate": 2.8513444951801116e-07, "logits/chosen": -0.5989577770233154, "logits/rejected": -0.5516340732574463, "logps/chosen": -85.1662826538086, "logps/rejected": -105.4076156616211, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.42705821990966797, "rewards/margins": 18.612205505371094, "rewards/rejected": -18.185152053833008, "step": 950 }, { "epoch": 0.44, "learning_rate": 2.846270928462709e-07, "logits/chosen": -0.5289547443389893, "logits/rejected": -0.5138489007949829, "logps/chosen": -86.29643249511719, "logps/rejected": -101.24544525146484, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.5123685598373413, "rewards/margins": 17.13689613342285, "rewards/rejected": -16.62452507019043, "step": 960 }, { "epoch": 0.44, "learning_rate": 2.841197361745307e-07, "logits/chosen": -0.581844687461853, "logits/rejected": -0.5707448720932007, "logps/chosen": -85.06533813476562, "logps/rejected": -99.39533996582031, "loss": 0.0082, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0915330648422241, "rewards/margins": 17.751794815063477, "rewards/rejected": -16.660261154174805, "step": 970 }, { "epoch": 0.45, "learning_rate": 2.8361237950279043e-07, "logits/chosen": -0.5148975253105164, "logits/rejected": -0.5400117635726929, "logps/chosen": -84.89034271240234, "logps/rejected": -101.74394226074219, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 0.6962271332740784, "rewards/margins": 17.362585067749023, "rewards/rejected": -16.666357040405273, "step": 980 }, { "epoch": 0.45, "learning_rate": 2.831050228310502e-07, "logits/chosen": -0.5739647150039673, "logits/rejected": -0.589260995388031, "logps/chosen": -85.27232360839844, "logps/rejected": -99.5882339477539, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 1.102745771408081, "rewards/margins": 16.655506134033203, "rewards/rejected": -15.552759170532227, "step": 990 }, { "epoch": 0.46, "learning_rate": 2.8259766615930996e-07, "logits/chosen": -0.6199604272842407, "logits/rejected": -0.6186779737472534, "logps/chosen": -86.85110473632812, "logps/rejected": -94.86585998535156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.344161868095398, "rewards/margins": 17.192333221435547, "rewards/rejected": -15.848173141479492, "step": 1000 }, { "epoch": 0.46, "eval_logits/chosen": -0.6026656627655029, "eval_logits/rejected": -0.5541569590568542, "eval_logps/chosen": -83.98019409179688, "eval_logps/rejected": -97.6655044555664, "eval_loss": 0.008850914426147938, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.7788311243057251, "eval_rewards/margins": 17.180883407592773, "eval_rewards/rejected": -16.402050018310547, "eval_runtime": 66.2953, "eval_samples_per_second": 43.171, "eval_steps_per_second": 2.7, "step": 1000 }, { "epoch": 0.46, "learning_rate": 2.8209030948756976e-07, "logits/chosen": -0.6393527984619141, "logits/rejected": -0.6539614796638489, "logps/chosen": -83.76344299316406, "logps/rejected": -102.68571472167969, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 1.3425472974777222, "rewards/margins": 17.38565444946289, "rewards/rejected": -16.043106079101562, "step": 1010 }, { "epoch": 0.47, "learning_rate": 2.815829528158295e-07, "logits/chosen": -0.5339282155036926, "logits/rejected": -0.6024419665336609, "logps/chosen": -83.54170989990234, "logps/rejected": -99.44310760498047, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.9438585042953491, "rewards/margins": 17.18117904663086, "rewards/rejected": -16.237319946289062, "step": 1020 }, { "epoch": 0.47, "learning_rate": 2.810755961440893e-07, "logits/chosen": -0.6185084581375122, "logits/rejected": -0.6042757034301758, "logps/chosen": -85.75657653808594, "logps/rejected": -104.95362854003906, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 0.2947128415107727, "rewards/margins": 18.044797897338867, "rewards/rejected": -17.750083923339844, "step": 1030 }, { "epoch": 0.47, "learning_rate": 2.8056823947234903e-07, "logits/chosen": -0.5133255124092102, "logits/rejected": -0.5455678701400757, "logps/chosen": -88.9016342163086, "logps/rejected": -97.69975280761719, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.311894178390503, "rewards/margins": 17.26865005493164, "rewards/rejected": -15.956756591796875, "step": 1040 }, { "epoch": 0.48, "learning_rate": 2.800608828006088e-07, "logits/chosen": -0.617928683757782, "logits/rejected": -0.5641010403633118, "logps/chosen": -88.8299560546875, "logps/rejected": -103.35233306884766, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.06934154033660889, "rewards/margins": 17.858707427978516, "rewards/rejected": -17.928049087524414, "step": 1050 }, { "epoch": 0.48, "learning_rate": 2.7955352612886856e-07, "logits/chosen": -0.5771080255508423, "logits/rejected": -0.5437840223312378, "logps/chosen": -77.94991302490234, "logps/rejected": -101.56156158447266, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2994147837162018, "rewards/margins": 18.4749813079834, "rewards/rejected": -18.774398803710938, "step": 1060 }, { "epoch": 0.49, "learning_rate": 2.7904616945712836e-07, "logits/chosen": -0.5053738355636597, "logits/rejected": -0.5292837619781494, "logps/chosen": -82.59956359863281, "logps/rejected": -103.01961517333984, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.9886738657951355, "rewards/margins": 18.72883415222168, "rewards/rejected": -17.740161895751953, "step": 1070 }, { "epoch": 0.49, "learning_rate": 2.785388127853881e-07, "logits/chosen": -0.539372444152832, "logits/rejected": -0.5728116631507874, "logps/chosen": -85.69373321533203, "logps/rejected": -101.7072525024414, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.8183579444885254, "rewards/margins": 18.746421813964844, "rewards/rejected": -17.928064346313477, "step": 1080 }, { "epoch": 0.5, "learning_rate": 2.780314561136479e-07, "logits/chosen": -0.5220463871955872, "logits/rejected": -0.481070339679718, "logps/chosen": -87.22162628173828, "logps/rejected": -101.13359069824219, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.474172443151474, "rewards/margins": 17.313884735107422, "rewards/rejected": -16.839710235595703, "step": 1090 }, { "epoch": 0.5, "learning_rate": 2.7752409944190763e-07, "logits/chosen": -0.5939083099365234, "logits/rejected": -0.5974006652832031, "logps/chosen": -85.4839096069336, "logps/rejected": -100.13436889648438, "loss": 0.0127, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7489646673202515, "rewards/margins": 18.251670837402344, "rewards/rejected": -17.502704620361328, "step": 1100 }, { "epoch": 0.5, "eval_logits/chosen": -0.5881746411323547, "eval_logits/rejected": -0.5407174825668335, "eval_logps/chosen": -83.98693084716797, "eval_logps/rejected": -98.03213500976562, "eval_loss": 0.008042249828577042, "eval_rewards/accuracies": 0.9972066879272461, "eval_rewards/chosen": 0.7754639983177185, "eval_rewards/margins": 17.360830307006836, "eval_rewards/rejected": -16.585365295410156, "eval_runtime": 74.5333, "eval_samples_per_second": 38.399, "eval_steps_per_second": 2.402, "step": 1100 }, { "epoch": 0.51, "learning_rate": 2.770167427701674e-07, "logits/chosen": -0.5169168710708618, "logits/rejected": -0.5307328104972839, "logps/chosen": -89.0078353881836, "logps/rejected": -102.8739242553711, "loss": 0.004, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0199966430664062, "rewards/margins": 17.293201446533203, "rewards/rejected": -16.273204803466797, "step": 1110 }, { "epoch": 0.51, "learning_rate": 2.7650938609842716e-07, "logits/chosen": -0.5890191793441772, "logits/rejected": -0.6054331660270691, "logps/chosen": -86.77067565917969, "logps/rejected": -93.67768859863281, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.5434656143188477, "rewards/margins": 16.876192092895508, "rewards/rejected": -15.332727432250977, "step": 1120 }, { "epoch": 0.52, "learning_rate": 2.7600202942668696e-07, "logits/chosen": -0.6245452165603638, "logits/rejected": -0.5807594656944275, "logps/chosen": -90.57156372070312, "logps/rejected": -96.71611022949219, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.8988291025161743, "rewards/margins": 17.20783233642578, "rewards/rejected": -15.309002876281738, "step": 1130 }, { "epoch": 0.52, "learning_rate": 2.754946727549467e-07, "logits/chosen": -0.6571340560913086, "logits/rejected": -0.598118782043457, "logps/chosen": -88.4867172241211, "logps/rejected": -102.10794830322266, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.4076375961303711, "rewards/margins": 16.26376724243164, "rewards/rejected": -15.856130599975586, "step": 1140 }, { "epoch": 0.52, "learning_rate": 2.749873160832065e-07, "logits/chosen": -0.5951014161109924, "logits/rejected": -0.6142104864120483, "logps/chosen": -81.71449279785156, "logps/rejected": -98.87643432617188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.3448036909103394, "rewards/margins": 17.912372589111328, "rewards/rejected": -16.567569732666016, "step": 1150 }, { "epoch": 0.53, "learning_rate": 2.7447995941146623e-07, "logits/chosen": -0.6196326613426208, "logits/rejected": -0.6616156697273254, "logps/chosen": -86.88075256347656, "logps/rejected": -103.3810043334961, "loss": 0.0113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1018636226654053, "rewards/margins": 17.62788963317871, "rewards/rejected": -16.526025772094727, "step": 1160 }, { "epoch": 0.53, "learning_rate": 2.73972602739726e-07, "logits/chosen": -0.5963867902755737, "logits/rejected": -0.6104931235313416, "logps/chosen": -86.03263854980469, "logps/rejected": -103.81394958496094, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3870103359222412, "rewards/margins": 18.407033920288086, "rewards/rejected": -17.020023345947266, "step": 1170 }, { "epoch": 0.54, "learning_rate": 2.7346524606798576e-07, "logits/chosen": -0.6480125784873962, "logits/rejected": -0.5914435386657715, "logps/chosen": -87.74053955078125, "logps/rejected": -103.57038879394531, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.2278248071670532, "rewards/margins": 18.94321060180664, "rewards/rejected": -17.71538543701172, "step": 1180 }, { "epoch": 0.54, "learning_rate": 2.7295788939624556e-07, "logits/chosen": -0.6207834482192993, "logits/rejected": -0.6019139289855957, "logps/chosen": -84.23980712890625, "logps/rejected": -98.18376159667969, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 2.0491936206817627, "rewards/margins": 18.798131942749023, "rewards/rejected": -16.748937606811523, "step": 1190 }, { "epoch": 0.55, "learning_rate": 2.724505327245053e-07, "logits/chosen": -0.6990079879760742, "logits/rejected": -0.623802661895752, "logps/chosen": -87.16651916503906, "logps/rejected": -104.5052719116211, "loss": 0.0192, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10929219424724579, "rewards/margins": 17.677276611328125, "rewards/rejected": -17.567981719970703, "step": 1200 }, { "epoch": 0.55, "eval_logits/chosen": -0.6292704343795776, "eval_logits/rejected": -0.575247585773468, "eval_logps/chosen": -83.80381774902344, "eval_logps/rejected": -98.02921295166016, "eval_loss": 0.008193709887564182, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.8670160174369812, "eval_rewards/margins": 17.450916290283203, "eval_rewards/rejected": -16.583900451660156, "eval_runtime": 67.8403, "eval_samples_per_second": 42.187, "eval_steps_per_second": 2.639, "step": 1200 }, { "epoch": 0.55, "learning_rate": 2.719431760527651e-07, "logits/chosen": -0.6629089117050171, "logits/rejected": -0.620673656463623, "logps/chosen": -83.75190734863281, "logps/rejected": -97.05430603027344, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.7773175835609436, "rewards/margins": 15.876919746398926, "rewards/rejected": -15.099603652954102, "step": 1210 }, { "epoch": 0.56, "learning_rate": 2.7143581938102483e-07, "logits/chosen": -0.6742457151412964, "logits/rejected": -0.6161606311798096, "logps/chosen": -82.72785949707031, "logps/rejected": -97.83805084228516, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 1.0242589712142944, "rewards/margins": 17.519359588623047, "rewards/rejected": -16.495100021362305, "step": 1220 }, { "epoch": 0.56, "learning_rate": 2.709284627092846e-07, "logits/chosen": -0.602118968963623, "logits/rejected": -0.6233198046684265, "logps/chosen": -81.18174743652344, "logps/rejected": -100.21574401855469, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.5538928508758545, "rewards/margins": 17.575532913208008, "rewards/rejected": -17.02164077758789, "step": 1230 }, { "epoch": 0.57, "learning_rate": 2.7042110603754436e-07, "logits/chosen": -0.650370180606842, "logits/rejected": -0.5805081129074097, "logps/chosen": -86.96976470947266, "logps/rejected": -99.80048370361328, "loss": 0.0062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1351078748703003, "rewards/margins": 17.74318504333496, "rewards/rejected": -16.60807991027832, "step": 1240 }, { "epoch": 0.57, "learning_rate": 2.6991374936580416e-07, "logits/chosen": -0.5949207544326782, "logits/rejected": -0.5505542755126953, "logps/chosen": -86.36726379394531, "logps/rejected": -104.3932113647461, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.521218478679657, "rewards/margins": 18.444801330566406, "rewards/rejected": -17.923583984375, "step": 1250 }, { "epoch": 0.58, "learning_rate": 2.694063926940639e-07, "logits/chosen": -0.5906392335891724, "logits/rejected": -0.6108217239379883, "logps/chosen": -86.92762756347656, "logps/rejected": -99.90259552001953, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.7010552287101746, "rewards/margins": 17.54641342163086, "rewards/rejected": -16.845355987548828, "step": 1260 }, { "epoch": 0.58, "learning_rate": 2.688990360223237e-07, "logits/chosen": -0.6362488865852356, "logits/rejected": -0.5491201877593994, "logps/chosen": -85.17521667480469, "logps/rejected": -108.06062316894531, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.39043551683425903, "rewards/margins": 18.07403564453125, "rewards/rejected": -18.464473724365234, "step": 1270 }, { "epoch": 0.58, "learning_rate": 2.6839167935058343e-07, "logits/chosen": -0.7829685211181641, "logits/rejected": -0.7587999105453491, "logps/chosen": -83.90362548828125, "logps/rejected": -101.64833068847656, "loss": 0.0111, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.357697069644928, "rewards/margins": 18.58173942565918, "rewards/rejected": -18.224040985107422, "step": 1280 }, { "epoch": 0.59, "learning_rate": 2.678843226788432e-07, "logits/chosen": -0.6578508615493774, "logits/rejected": -0.5751517415046692, "logps/chosen": -88.73038482666016, "logps/rejected": -105.878173828125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.28621432185173035, "rewards/margins": 19.108448028564453, "rewards/rejected": -18.82223129272461, "step": 1290 }, { "epoch": 0.59, "learning_rate": 2.6737696600710296e-07, "logits/chosen": -0.7079693078994751, "logits/rejected": -0.7003791332244873, "logps/chosen": -79.51958465576172, "logps/rejected": -102.92280578613281, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1290922164916992, "rewards/margins": 19.95863914489746, "rewards/rejected": -18.829547882080078, "step": 1300 }, { "epoch": 0.59, "eval_logits/chosen": -0.6428735852241516, "eval_logits/rejected": -0.5829644203186035, "eval_logps/chosen": -84.09004974365234, "eval_logps/rejected": -102.14546966552734, "eval_loss": 0.008307097479701042, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.7239024639129639, "eval_rewards/margins": 19.365936279296875, "eval_rewards/rejected": -18.642032623291016, "eval_runtime": 62.3684, "eval_samples_per_second": 45.889, "eval_steps_per_second": 2.87, "step": 1300 }, { "epoch": 0.6, "learning_rate": 2.6686960933536276e-07, "logits/chosen": -0.7085214853286743, "logits/rejected": -0.6773207783699036, "logps/chosen": -83.67912292480469, "logps/rejected": -105.48329162597656, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.8614299893379211, "rewards/margins": 19.49595832824707, "rewards/rejected": -18.634525299072266, "step": 1310 }, { "epoch": 0.6, "learning_rate": 2.663622526636225e-07, "logits/chosen": -0.5678974390029907, "logits/rejected": -0.559326171875, "logps/chosen": -83.47301483154297, "logps/rejected": -107.2491226196289, "loss": 0.01, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6393694281578064, "rewards/margins": 19.674095153808594, "rewards/rejected": -19.034725189208984, "step": 1320 }, { "epoch": 0.61, "learning_rate": 2.658548959918823e-07, "logits/chosen": -0.6403359174728394, "logits/rejected": -0.6420946717262268, "logps/chosen": -87.22882843017578, "logps/rejected": -104.50102233886719, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 1.6024986505508423, "rewards/margins": 20.13058090209961, "rewards/rejected": -18.528079986572266, "step": 1330 }, { "epoch": 0.61, "learning_rate": 2.6534753932014203e-07, "logits/chosen": -0.624729335308075, "logits/rejected": -0.5556301474571228, "logps/chosen": -84.000244140625, "logps/rejected": -100.9580078125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.8587640523910522, "rewards/margins": 19.506357192993164, "rewards/rejected": -18.647592544555664, "step": 1340 }, { "epoch": 0.62, "learning_rate": 2.648401826484018e-07, "logits/chosen": -0.7058721780776978, "logits/rejected": -0.6835563778877258, "logps/chosen": -81.8812484741211, "logps/rejected": -110.7301025390625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.5004382133483887, "rewards/margins": 20.48194122314453, "rewards/rejected": -19.981502532958984, "step": 1350 }, { "epoch": 0.62, "learning_rate": 2.6433282597666156e-07, "logits/chosen": -0.5713628530502319, "logits/rejected": -0.5688737630844116, "logps/chosen": -88.24539184570312, "logps/rejected": -110.30496978759766, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.33614498376846313, "rewards/margins": 19.93739128112793, "rewards/rejected": -19.601245880126953, "step": 1360 }, { "epoch": 0.63, "learning_rate": 2.6382546930492135e-07, "logits/chosen": -0.6647303104400635, "logits/rejected": -0.6416600942611694, "logps/chosen": -84.4307632446289, "logps/rejected": -104.19242095947266, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5064027905464172, "rewards/margins": 19.400102615356445, "rewards/rejected": -18.893699645996094, "step": 1370 }, { "epoch": 0.63, "learning_rate": 2.633181126331811e-07, "logits/chosen": -0.6855611801147461, "logits/rejected": -0.665108323097229, "logps/chosen": -82.07523345947266, "logps/rejected": -102.21691131591797, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5878819227218628, "rewards/margins": 18.011062622070312, "rewards/rejected": -17.42317771911621, "step": 1380 }, { "epoch": 0.63, "learning_rate": 2.628107559614409e-07, "logits/chosen": -0.7162337303161621, "logits/rejected": -0.6620529294013977, "logps/chosen": -84.50788879394531, "logps/rejected": -106.1350326538086, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 1.0131562948226929, "rewards/margins": 19.31102180480957, "rewards/rejected": -18.297863006591797, "step": 1390 }, { "epoch": 0.64, "learning_rate": 2.6230339928970063e-07, "logits/chosen": -0.7488337159156799, "logits/rejected": -0.6330257654190063, "logps/chosen": -86.39143371582031, "logps/rejected": -103.77738189697266, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.3392806351184845, "rewards/margins": 18.962505340576172, "rewards/rejected": -18.62322425842285, "step": 1400 }, { "epoch": 0.64, "eval_logits/chosen": -0.6586980819702148, "eval_logits/rejected": -0.6006718873977661, "eval_logps/chosen": -84.00611877441406, "eval_logps/rejected": -102.19830322265625, "eval_loss": 0.007832423783838749, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.7658681273460388, "eval_rewards/margins": 19.43431854248047, "eval_rewards/rejected": -18.6684513092041, "eval_runtime": 72.5157, "eval_samples_per_second": 39.467, "eval_steps_per_second": 2.468, "step": 1400 }, { "epoch": 0.64, "learning_rate": 2.617960426179604e-07, "logits/chosen": -0.6780227422714233, "logits/rejected": -0.6405975222587585, "logps/chosen": -88.10275268554688, "logps/rejected": -106.83785247802734, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.2879269123077393, "rewards/margins": 21.27725601196289, "rewards/rejected": -19.989328384399414, "step": 1410 }, { "epoch": 0.65, "learning_rate": 2.6128868594622016e-07, "logits/chosen": -0.6182512044906616, "logits/rejected": -0.6471190452575684, "logps/chosen": -84.40079498291016, "logps/rejected": -100.61917877197266, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8890922665596008, "rewards/margins": 18.377668380737305, "rewards/rejected": -17.488576889038086, "step": 1420 }, { "epoch": 0.65, "learning_rate": 2.6078132927447995e-07, "logits/chosen": -0.6950105428695679, "logits/rejected": -0.6299742460250854, "logps/chosen": -84.123046875, "logps/rejected": -106.0107421875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 1.2167333364486694, "rewards/margins": 20.657690048217773, "rewards/rejected": -19.44095802307129, "step": 1430 }, { "epoch": 0.66, "learning_rate": 2.602739726027397e-07, "logits/chosen": -0.6647752523422241, "logits/rejected": -0.6143220663070679, "logps/chosen": -82.63214874267578, "logps/rejected": -105.60970306396484, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.9395632743835449, "rewards/margins": 20.366649627685547, "rewards/rejected": -19.427085876464844, "step": 1440 }, { "epoch": 0.66, "learning_rate": 2.597666159309995e-07, "logits/chosen": -0.6042054295539856, "logits/rejected": -0.5673761367797852, "logps/chosen": -83.49287414550781, "logps/rejected": -106.82151794433594, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 0.18224623799324036, "rewards/margins": 18.836721420288086, "rewards/rejected": -18.654476165771484, "step": 1450 }, { "epoch": 0.67, "learning_rate": 2.5925925925925923e-07, "logits/chosen": -0.6551016569137573, "logits/rejected": -0.6009622812271118, "logps/chosen": -86.40557861328125, "logps/rejected": -108.91459655761719, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.11953596770763397, "rewards/margins": 18.390857696533203, "rewards/rejected": -18.510395050048828, "step": 1460 }, { "epoch": 0.67, "learning_rate": 2.58751902587519e-07, "logits/chosen": -0.5883369445800781, "logits/rejected": -0.6131059527397156, "logps/chosen": -91.0303955078125, "logps/rejected": -105.77290344238281, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7381661534309387, "rewards/margins": 19.129926681518555, "rewards/rejected": -18.391761779785156, "step": 1470 }, { "epoch": 0.68, "learning_rate": 2.5824454591577876e-07, "logits/chosen": -0.6731956005096436, "logits/rejected": -0.623029351234436, "logps/chosen": -85.10891723632812, "logps/rejected": -102.04473876953125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 1.1607452630996704, "rewards/margins": 20.150548934936523, "rewards/rejected": -18.989805221557617, "step": 1480 }, { "epoch": 0.68, "learning_rate": 2.5773718924403855e-07, "logits/chosen": -0.592241644859314, "logits/rejected": -0.5942645072937012, "logps/chosen": -85.31133270263672, "logps/rejected": -105.8062973022461, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.626050591468811, "rewards/margins": 19.598217010498047, "rewards/rejected": -18.972166061401367, "step": 1490 }, { "epoch": 0.68, "learning_rate": 2.572298325722983e-07, "logits/chosen": -0.6044291257858276, "logits/rejected": -0.5564228892326355, "logps/chosen": -85.6323471069336, "logps/rejected": -105.26658630371094, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.7599972486495972, "rewards/margins": 20.052276611328125, "rewards/rejected": -19.292278289794922, "step": 1500 }, { "epoch": 0.68, "eval_logits/chosen": -0.654085099697113, "eval_logits/rejected": -0.5967754125595093, "eval_logps/chosen": -84.53275299072266, "eval_logps/rejected": -104.18524932861328, "eval_loss": 0.007872804999351501, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.5025545954704285, "eval_rewards/margins": 20.164472579956055, "eval_rewards/rejected": -19.66192054748535, "eval_runtime": 70.7679, "eval_samples_per_second": 40.442, "eval_steps_per_second": 2.529, "step": 1500 }, { "epoch": 0.69, "learning_rate": 2.567224759005581e-07, "logits/chosen": -0.640856146812439, "logits/rejected": -0.601913571357727, "logps/chosen": -80.25975799560547, "logps/rejected": -98.95476531982422, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.07107143104076385, "rewards/margins": 18.289194107055664, "rewards/rejected": -18.218120574951172, "step": 1510 }, { "epoch": 0.69, "learning_rate": 2.5621511922881783e-07, "logits/chosen": -0.6410808563232422, "logits/rejected": -0.6079914569854736, "logps/chosen": -86.399658203125, "logps/rejected": -102.8433609008789, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9533981084823608, "rewards/margins": 21.374330520629883, "rewards/rejected": -19.42093276977539, "step": 1520 }, { "epoch": 0.7, "learning_rate": 2.557077625570776e-07, "logits/chosen": -0.6295315027236938, "logits/rejected": -0.6216704249382019, "logps/chosen": -84.7662353515625, "logps/rejected": -106.0814437866211, "loss": 0.0067, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7130889892578125, "rewards/margins": 17.965023040771484, "rewards/rejected": -17.251934051513672, "step": 1530 }, { "epoch": 0.7, "learning_rate": 2.5520040588533736e-07, "logits/chosen": -0.6840890049934387, "logits/rejected": -0.6534979939460754, "logps/chosen": -87.48678588867188, "logps/rejected": -104.89375305175781, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.9938493967056274, "rewards/margins": 20.103492736816406, "rewards/rejected": -19.109643936157227, "step": 1540 }, { "epoch": 0.71, "learning_rate": 2.5469304921359715e-07, "logits/chosen": -0.7252383828163147, "logits/rejected": -0.6697880029678345, "logps/chosen": -83.24116516113281, "logps/rejected": -105.6595687866211, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 1.4993698596954346, "rewards/margins": 19.923023223876953, "rewards/rejected": -18.42365074157715, "step": 1550 }, { "epoch": 0.71, "learning_rate": 2.541856925418569e-07, "logits/chosen": -0.6816428899765015, "logits/rejected": -0.6332991123199463, "logps/chosen": -83.70225524902344, "logps/rejected": -103.3266830444336, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 0.8504802584648132, "rewards/margins": 19.315753936767578, "rewards/rejected": -18.465274810791016, "step": 1560 }, { "epoch": 0.72, "learning_rate": 2.536783358701167e-07, "logits/chosen": -0.691390872001648, "logits/rejected": -0.6756331324577332, "logps/chosen": -82.35096740722656, "logps/rejected": -102.11773681640625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.2216899394989014, "rewards/margins": 20.077028274536133, "rewards/rejected": -18.85533905029297, "step": 1570 }, { "epoch": 0.72, "learning_rate": 2.5317097919837643e-07, "logits/chosen": -0.7393444180488586, "logits/rejected": -0.6760915517807007, "logps/chosen": -82.11813354492188, "logps/rejected": -105.13177490234375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 1.5508477687835693, "rewards/margins": 20.428369522094727, "rewards/rejected": -18.87752342224121, "step": 1580 }, { "epoch": 0.73, "learning_rate": 2.526636225266362e-07, "logits/chosen": -0.6938213109970093, "logits/rejected": -0.6899979710578918, "logps/chosen": -85.61151885986328, "logps/rejected": -108.23921203613281, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.1316661834716797, "rewards/margins": 20.614761352539062, "rewards/rejected": -19.483095169067383, "step": 1590 }, { "epoch": 0.73, "learning_rate": 2.5215626585489596e-07, "logits/chosen": -0.686176598072052, "logits/rejected": -0.6618992686271667, "logps/chosen": -80.52513885498047, "logps/rejected": -104.21512603759766, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.4206916391849518, "rewards/margins": 19.16167449951172, "rewards/rejected": -18.740983963012695, "step": 1600 }, { "epoch": 0.73, "eval_logits/chosen": -0.7077122926712036, "eval_logits/rejected": -0.6482263207435608, "eval_logps/chosen": -84.09796142578125, "eval_logps/rejected": -103.77226257324219, "eval_loss": 0.006887392140924931, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.7199439406394958, "eval_rewards/margins": 20.175371170043945, "eval_rewards/rejected": -19.455429077148438, "eval_runtime": 79.6738, "eval_samples_per_second": 35.921, "eval_steps_per_second": 2.247, "step": 1600 }, { "epoch": 0.73, "learning_rate": 2.5164890918315575e-07, "logits/chosen": -0.8257538080215454, "logits/rejected": -0.7568483948707581, "logps/chosen": -84.69251251220703, "logps/rejected": -105.01495361328125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.7993510961532593, "rewards/margins": 20.178592681884766, "rewards/rejected": -19.379241943359375, "step": 1610 }, { "epoch": 0.74, "learning_rate": 2.511415525114155e-07, "logits/chosen": -0.7131239771842957, "logits/rejected": -0.6775761246681213, "logps/chosen": -85.8831787109375, "logps/rejected": -106.05537414550781, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.2889799177646637, "rewards/margins": 18.521915435791016, "rewards/rejected": -18.810894012451172, "step": 1620 }, { "epoch": 0.74, "learning_rate": 2.506341958396753e-07, "logits/chosen": -0.6207653880119324, "logits/rejected": -0.6060599088668823, "logps/chosen": -85.2615966796875, "logps/rejected": -105.60963439941406, "loss": 0.0068, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7492761015892029, "rewards/margins": 21.529277801513672, "rewards/rejected": -20.78000259399414, "step": 1630 }, { "epoch": 0.75, "learning_rate": 2.5012683916793503e-07, "logits/chosen": -0.621131420135498, "logits/rejected": -0.6042443513870239, "logps/chosen": -90.21701049804688, "logps/rejected": -105.9647445678711, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 1.9219605922698975, "rewards/margins": 21.281051635742188, "rewards/rejected": -19.359088897705078, "step": 1640 }, { "epoch": 0.75, "learning_rate": 2.496194824961948e-07, "logits/chosen": -0.713904619216919, "logits/rejected": -0.6409584283828735, "logps/chosen": -84.72190856933594, "logps/rejected": -105.33198547363281, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 1.556586742401123, "rewards/margins": 21.801387786865234, "rewards/rejected": -20.244800567626953, "step": 1650 }, { "epoch": 0.76, "learning_rate": 2.4911212582445456e-07, "logits/chosen": -0.6484477519989014, "logits/rejected": -0.6336522102355957, "logps/chosen": -83.5719985961914, "logps/rejected": -104.5160903930664, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.5944575071334839, "rewards/margins": 20.492382049560547, "rewards/rejected": -18.897924423217773, "step": 1660 }, { "epoch": 0.76, "learning_rate": 2.4860476915271435e-07, "logits/chosen": -0.7745485901832581, "logits/rejected": -0.7227998971939087, "logps/chosen": -82.86568450927734, "logps/rejected": -106.18217468261719, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": 1.017204999923706, "rewards/margins": 21.1662540435791, "rewards/rejected": -20.1490478515625, "step": 1670 }, { "epoch": 0.77, "learning_rate": 2.480974124809741e-07, "logits/chosen": -0.7258914709091187, "logits/rejected": -0.6682701110839844, "logps/chosen": -90.38239288330078, "logps/rejected": -111.46282958984375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.219024896621704, "rewards/margins": 21.0469913482666, "rewards/rejected": -19.82796859741211, "step": 1680 }, { "epoch": 0.77, "learning_rate": 2.475900558092339e-07, "logits/chosen": -0.7296860814094543, "logits/rejected": -0.734585165977478, "logps/chosen": -86.05227661132812, "logps/rejected": -102.233154296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.8629347085952759, "rewards/margins": 19.730310440063477, "rewards/rejected": -18.867374420166016, "step": 1690 }, { "epoch": 0.78, "learning_rate": 2.4708269913749363e-07, "logits/chosen": -0.7404943704605103, "logits/rejected": -0.6497074961662292, "logps/chosen": -82.794677734375, "logps/rejected": -106.5738754272461, "loss": 0.0038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8510491251945496, "rewards/margins": 19.946802139282227, "rewards/rejected": -19.095752716064453, "step": 1700 }, { "epoch": 0.78, "eval_logits/chosen": -0.6825948357582092, "eval_logits/rejected": -0.6247321367263794, "eval_logps/chosen": -83.7663803100586, "eval_logps/rejected": -104.18630981445312, "eval_loss": 0.0068258135579526424, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.8857384324073792, "eval_rewards/margins": 20.548185348510742, "eval_rewards/rejected": -19.662446975708008, "eval_runtime": 70.3883, "eval_samples_per_second": 40.66, "eval_steps_per_second": 2.543, "step": 1700 }, { "epoch": 0.78, "learning_rate": 2.465753424657534e-07, "logits/chosen": -0.6451160311698914, "logits/rejected": -0.7110374569892883, "logps/chosen": -81.88134765625, "logps/rejected": -104.2155990600586, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6395124197006226, "rewards/margins": 20.917659759521484, "rewards/rejected": -19.278146743774414, "step": 1710 }, { "epoch": 0.79, "learning_rate": 2.4606798579401316e-07, "logits/chosen": -0.6008567810058594, "logits/rejected": -0.5818469524383545, "logps/chosen": -84.36399841308594, "logps/rejected": -108.04344177246094, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.9072425961494446, "rewards/margins": 19.935123443603516, "rewards/rejected": -19.027881622314453, "step": 1720 }, { "epoch": 0.79, "learning_rate": 2.4556062912227295e-07, "logits/chosen": -0.6780723333358765, "logits/rejected": -0.667972981929779, "logps/chosen": -86.9156265258789, "logps/rejected": -105.91011810302734, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.872097373008728, "rewards/margins": 20.565614700317383, "rewards/rejected": -18.693517684936523, "step": 1730 }, { "epoch": 0.79, "learning_rate": 2.450532724505327e-07, "logits/chosen": -0.6924620866775513, "logits/rejected": -0.6552094221115112, "logps/chosen": -83.34413146972656, "logps/rejected": -103.43766784667969, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4335517883300781, "rewards/margins": 20.293771743774414, "rewards/rejected": -18.860218048095703, "step": 1740 }, { "epoch": 0.8, "learning_rate": 2.445459157787925e-07, "logits/chosen": -0.7454475164413452, "logits/rejected": -0.7243833541870117, "logps/chosen": -88.2259292602539, "logps/rejected": -105.4636459350586, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 1.127286672592163, "rewards/margins": 18.987035751342773, "rewards/rejected": -17.859750747680664, "step": 1750 }, { "epoch": 0.8, "learning_rate": 2.4403855910705223e-07, "logits/chosen": -0.7101667523384094, "logits/rejected": -0.7183898091316223, "logps/chosen": -81.78260803222656, "logps/rejected": -104.47676086425781, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.3130123019218445, "rewards/margins": 18.950632095336914, "rewards/rejected": -18.637617111206055, "step": 1760 }, { "epoch": 0.81, "learning_rate": 2.43531202435312e-07, "logits/chosen": -0.6741453409194946, "logits/rejected": -0.6272802352905273, "logps/chosen": -90.71461486816406, "logps/rejected": -106.30870056152344, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.5790307521820068, "rewards/margins": 20.211219787597656, "rewards/rejected": -18.632186889648438, "step": 1770 }, { "epoch": 0.81, "learning_rate": 2.4302384576357176e-07, "logits/chosen": -0.7256354093551636, "logits/rejected": -0.6887451410293579, "logps/chosen": -87.96391296386719, "logps/rejected": -116.27462005615234, "loss": 0.0014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.098422646522522, "rewards/margins": 22.23649787902832, "rewards/rejected": -21.138076782226562, "step": 1780 }, { "epoch": 0.82, "learning_rate": 2.4251648909183155e-07, "logits/chosen": -0.723190188407898, "logits/rejected": -0.687700629234314, "logps/chosen": -85.64554595947266, "logps/rejected": -108.02901458740234, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.214348793029785, "rewards/margins": 21.39786720275879, "rewards/rejected": -19.183521270751953, "step": 1790 }, { "epoch": 0.82, "learning_rate": 2.420091324200913e-07, "logits/chosen": -0.7096911668777466, "logits/rejected": -0.6674980521202087, "logps/chosen": -84.20054626464844, "logps/rejected": -107.4981460571289, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 0.2375263273715973, "rewards/margins": 19.647363662719727, "rewards/rejected": -19.409835815429688, "step": 1800 }, { "epoch": 0.82, "eval_logits/chosen": -0.6538522243499756, "eval_logits/rejected": -0.5991846919059753, "eval_logps/chosen": -83.35757446289062, "eval_logps/rejected": -103.30670928955078, "eval_loss": 0.006914378609508276, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 1.0901424884796143, "eval_rewards/margins": 20.312789916992188, "eval_rewards/rejected": -19.22264862060547, "eval_runtime": 89.8281, "eval_samples_per_second": 31.861, "eval_steps_per_second": 1.993, "step": 1800 }, { "epoch": 0.83, "learning_rate": 2.415017757483511e-07, "logits/chosen": -0.6579132080078125, "logits/rejected": -0.6017246246337891, "logps/chosen": -91.52220916748047, "logps/rejected": -106.00288391113281, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.6669795513153076, "rewards/margins": 20.964008331298828, "rewards/rejected": -19.297027587890625, "step": 1810 }, { "epoch": 0.83, "learning_rate": 2.409944190766108e-07, "logits/chosen": -0.7107955813407898, "logits/rejected": -0.6973943710327148, "logps/chosen": -82.12738037109375, "logps/rejected": -107.88604736328125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.30672594904899597, "rewards/margins": 20.473596572875977, "rewards/rejected": -20.1668701171875, "step": 1820 }, { "epoch": 0.84, "learning_rate": 2.404870624048706e-07, "logits/chosen": -0.668049156665802, "logits/rejected": -0.6822776198387146, "logps/chosen": -80.28944396972656, "logps/rejected": -101.49888610839844, "loss": 0.002, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1876381635665894, "rewards/margins": 20.42356300354004, "rewards/rejected": -19.235923767089844, "step": 1830 }, { "epoch": 0.84, "learning_rate": 2.3997970573313036e-07, "logits/chosen": -0.6816455125808716, "logits/rejected": -0.6640302538871765, "logps/chosen": -82.68707275390625, "logps/rejected": -102.19058990478516, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3355101346969604, "rewards/margins": 21.309734344482422, "rewards/rejected": -19.974224090576172, "step": 1840 }, { "epoch": 0.84, "learning_rate": 2.3947234906139015e-07, "logits/chosen": -0.7045928239822388, "logits/rejected": -0.7241901159286499, "logps/chosen": -85.01335144042969, "logps/rejected": -108.6558609008789, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.4674608707427979, "rewards/margins": 22.616294860839844, "rewards/rejected": -21.148834228515625, "step": 1850 }, { "epoch": 0.85, "learning_rate": 2.389649923896499e-07, "logits/chosen": -0.6199553608894348, "logits/rejected": -0.6317640542984009, "logps/chosen": -85.36334228515625, "logps/rejected": -110.83219146728516, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 1.9138463735580444, "rewards/margins": 22.010051727294922, "rewards/rejected": -20.096206665039062, "step": 1860 }, { "epoch": 0.85, "learning_rate": 2.384576357179097e-07, "logits/chosen": -0.6020737290382385, "logits/rejected": -0.6157525777816772, "logps/chosen": -89.1829605102539, "logps/rejected": -111.08561706542969, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.7350738048553467, "rewards/margins": 22.97161102294922, "rewards/rejected": -21.23653793334961, "step": 1870 }, { "epoch": 0.86, "learning_rate": 2.3795027904616943e-07, "logits/chosen": -0.7352172136306763, "logits/rejected": -0.6482657194137573, "logps/chosen": -89.54198455810547, "logps/rejected": -111.14498138427734, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.4496486783027649, "rewards/margins": 19.777273178100586, "rewards/rejected": -20.22692108154297, "step": 1880 }, { "epoch": 0.86, "learning_rate": 2.374429223744292e-07, "logits/chosen": -0.6808261275291443, "logits/rejected": -0.6242018938064575, "logps/chosen": -81.02848815917969, "logps/rejected": -103.51603698730469, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.4517748355865479, "rewards/margins": 22.18962287902832, "rewards/rejected": -20.73784637451172, "step": 1890 }, { "epoch": 0.87, "learning_rate": 2.3693556570268896e-07, "logits/chosen": -0.7782861590385437, "logits/rejected": -0.7513757944107056, "logps/chosen": -83.20822143554688, "logps/rejected": -109.80049133300781, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 1.010401725769043, "rewards/margins": 20.9800968170166, "rewards/rejected": -19.969696044921875, "step": 1900 }, { "epoch": 0.87, "eval_logits/chosen": -0.680737316608429, "eval_logits/rejected": -0.6232161521911621, "eval_logps/chosen": -83.85591888427734, "eval_logps/rejected": -106.48957824707031, "eval_loss": 0.006986773107200861, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.8409678936004639, "eval_rewards/margins": 21.655057907104492, "eval_rewards/rejected": -20.8140926361084, "eval_runtime": 76.156, "eval_samples_per_second": 37.581, "eval_steps_per_second": 2.35, "step": 1900 }, { "epoch": 0.87, "learning_rate": 2.3642820903094873e-07, "logits/chosen": -0.7015836834907532, "logits/rejected": -0.5991064310073853, "logps/chosen": -81.31778717041016, "logps/rejected": -109.6204833984375, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0844857692718506, "rewards/margins": 22.631372451782227, "rewards/rejected": -21.546886444091797, "step": 1910 }, { "epoch": 0.88, "learning_rate": 2.359208523592085e-07, "logits/chosen": -0.7016021609306335, "logits/rejected": -0.6770969033241272, "logps/chosen": -84.92622375488281, "logps/rejected": -106.5382080078125, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.17899462580680847, "rewards/margins": 20.394420623779297, "rewards/rejected": -20.21542739868164, "step": 1920 }, { "epoch": 0.88, "learning_rate": 2.3541349568746826e-07, "logits/chosen": -0.7778208255767822, "logits/rejected": -0.698055624961853, "logps/chosen": -86.36054992675781, "logps/rejected": -105.30062103271484, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.3135640621185303, "rewards/margins": 20.72547721862793, "rewards/rejected": -19.411914825439453, "step": 1930 }, { "epoch": 0.89, "learning_rate": 2.3490613901572803e-07, "logits/chosen": -0.6952825784683228, "logits/rejected": -0.656592071056366, "logps/chosen": -81.763671875, "logps/rejected": -106.61894226074219, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.49342602491378784, "rewards/margins": 20.16998863220215, "rewards/rejected": -19.676563262939453, "step": 1940 }, { "epoch": 0.89, "learning_rate": 2.343987823439878e-07, "logits/chosen": -0.6903911828994751, "logits/rejected": -0.6286421418190002, "logps/chosen": -79.79450225830078, "logps/rejected": -106.4630126953125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 2.1111063957214355, "rewards/margins": 22.001352310180664, "rewards/rejected": -19.890243530273438, "step": 1950 }, { "epoch": 0.89, "learning_rate": 2.3389142567224756e-07, "logits/chosen": -0.7639296650886536, "logits/rejected": -0.7568944096565247, "logps/chosen": -90.86905670166016, "logps/rejected": -113.7149658203125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.12427709251642227, "rewards/margins": 20.833873748779297, "rewards/rejected": -20.7095947265625, "step": 1960 }, { "epoch": 0.9, "learning_rate": 2.3338406900050733e-07, "logits/chosen": -0.7693318128585815, "logits/rejected": -0.6867147088050842, "logps/chosen": -84.40339660644531, "logps/rejected": -104.76747131347656, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.991206169128418, "rewards/margins": 22.942188262939453, "rewards/rejected": -20.95098304748535, "step": 1970 }, { "epoch": 0.9, "learning_rate": 2.328767123287671e-07, "logits/chosen": -0.7538214325904846, "logits/rejected": -0.7061902284622192, "logps/chosen": -84.06021118164062, "logps/rejected": -112.34172058105469, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.09169814735651016, "rewards/margins": 22.0313720703125, "rewards/rejected": -22.123071670532227, "step": 1980 }, { "epoch": 0.91, "learning_rate": 2.3236935565702686e-07, "logits/chosen": -0.7687441110610962, "logits/rejected": -0.6658346056938171, "logps/chosen": -82.9585952758789, "logps/rejected": -106.9176025390625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.7672392725944519, "rewards/margins": 20.316686630249023, "rewards/rejected": -19.549448013305664, "step": 1990 }, { "epoch": 0.91, "learning_rate": 2.3186199898528663e-07, "logits/chosen": -0.7342469096183777, "logits/rejected": -0.7138758897781372, "logps/chosen": -82.50282287597656, "logps/rejected": -102.0538101196289, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 1.334281325340271, "rewards/margins": 19.689542770385742, "rewards/rejected": -18.355262756347656, "step": 2000 }, { "epoch": 0.91, "eval_logits/chosen": -0.7237182855606079, "eval_logits/rejected": -0.6640624403953552, "eval_logps/chosen": -83.15184020996094, "eval_logps/rejected": -103.29666900634766, "eval_loss": 0.005852441303431988, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.1930080652236938, "eval_rewards/margins": 20.410642623901367, "eval_rewards/rejected": -19.217632293701172, "eval_runtime": 81.642, "eval_samples_per_second": 35.055, "eval_steps_per_second": 2.192, "step": 2000 }, { "epoch": 0.92, "learning_rate": 2.313546423135464e-07, "logits/chosen": -0.7072040438652039, "logits/rejected": -0.6981512904167175, "logps/chosen": -89.46893310546875, "logps/rejected": -106.85356140136719, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 1.0839965343475342, "rewards/margins": 20.12752342224121, "rewards/rejected": -19.043527603149414, "step": 2010 }, { "epoch": 0.92, "learning_rate": 2.3084728564180616e-07, "logits/chosen": -0.6174991726875305, "logits/rejected": -0.5943428874015808, "logps/chosen": -81.58921813964844, "logps/rejected": -108.2657241821289, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 1.291887879371643, "rewards/margins": 19.339893341064453, "rewards/rejected": -18.04800796508789, "step": 2020 }, { "epoch": 0.93, "learning_rate": 2.3033992897006593e-07, "logits/chosen": -0.7860113978385925, "logits/rejected": -0.7448928356170654, "logps/chosen": -84.71741485595703, "logps/rejected": -102.28780364990234, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 1.6886265277862549, "rewards/margins": 21.161090850830078, "rewards/rejected": -19.472463607788086, "step": 2030 }, { "epoch": 0.93, "learning_rate": 2.298325722983257e-07, "logits/chosen": -0.6835452318191528, "logits/rejected": -0.6044243574142456, "logps/chosen": -86.92694091796875, "logps/rejected": -108.12068176269531, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.0500653982162476, "rewards/margins": 20.692108154296875, "rewards/rejected": -19.642040252685547, "step": 2040 }, { "epoch": 0.94, "learning_rate": 2.2932521562658546e-07, "logits/chosen": -0.682311475276947, "logits/rejected": -0.6454225778579712, "logps/chosen": -84.58203887939453, "logps/rejected": -105.5318832397461, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.446061134338379, "rewards/margins": 20.76397132873535, "rewards/rejected": -19.317909240722656, "step": 2050 }, { "epoch": 0.94, "learning_rate": 2.2881785895484523e-07, "logits/chosen": -0.7142086625099182, "logits/rejected": -0.7194957733154297, "logps/chosen": -86.15142822265625, "logps/rejected": -106.82261657714844, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 1.6515350341796875, "rewards/margins": 21.164995193481445, "rewards/rejected": -19.513460159301758, "step": 2060 }, { "epoch": 0.94, "learning_rate": 2.28310502283105e-07, "logits/chosen": -0.6685755848884583, "logits/rejected": -0.648827075958252, "logps/chosen": -77.6914291381836, "logps/rejected": -105.21824645996094, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 2.0622940063476562, "rewards/margins": 20.59681510925293, "rewards/rejected": -18.534521102905273, "step": 2070 }, { "epoch": 0.95, "learning_rate": 2.2780314561136476e-07, "logits/chosen": -0.7563666105270386, "logits/rejected": -0.655853271484375, "logps/chosen": -86.95856475830078, "logps/rejected": -102.80877685546875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.8138997554779053, "rewards/margins": 20.457874298095703, "rewards/rejected": -18.643972396850586, "step": 2080 }, { "epoch": 0.95, "learning_rate": 2.2729578893962453e-07, "logits/chosen": -0.7362481951713562, "logits/rejected": -0.7294582724571228, "logps/chosen": -84.70967102050781, "logps/rejected": -106.26744079589844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.107060670852661, "rewards/margins": 21.18497657775879, "rewards/rejected": -19.077917098999023, "step": 2090 }, { "epoch": 0.96, "learning_rate": 2.267884322678843e-07, "logits/chosen": -0.7865076065063477, "logits/rejected": -0.6870723962783813, "logps/chosen": -83.01496887207031, "logps/rejected": -101.82795715332031, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.8749852180480957, "rewards/margins": 19.56221580505371, "rewards/rejected": -18.687232971191406, "step": 2100 }, { "epoch": 0.96, "eval_logits/chosen": -0.6922763586044312, "eval_logits/rejected": -0.6358173489570618, "eval_logps/chosen": -82.23265075683594, "eval_logps/rejected": -101.94881439208984, "eval_loss": 0.005928453989326954, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.6526042222976685, "eval_rewards/margins": 20.196311950683594, "eval_rewards/rejected": -18.5437068939209, "eval_runtime": 73.4438, "eval_samples_per_second": 38.969, "eval_steps_per_second": 2.437, "step": 2100 }, { "epoch": 0.96, "learning_rate": 2.2628107559614406e-07, "logits/chosen": -0.7227433323860168, "logits/rejected": -0.6691815257072449, "logps/chosen": -80.89137268066406, "logps/rejected": -108.2369613647461, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 1.246288537979126, "rewards/margins": 21.08827781677246, "rewards/rejected": -19.841989517211914, "step": 2110 }, { "epoch": 0.97, "learning_rate": 2.2577371892440383e-07, "logits/chosen": -0.6783403754234314, "logits/rejected": -0.6632364988327026, "logps/chosen": -86.68379211425781, "logps/rejected": -105.72774505615234, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 2.1012396812438965, "rewards/margins": 21.37411117553711, "rewards/rejected": -19.272871017456055, "step": 2120 }, { "epoch": 0.97, "learning_rate": 2.252663622526636e-07, "logits/chosen": -0.6918590664863586, "logits/rejected": -0.7273339033126831, "logps/chosen": -87.85774993896484, "logps/rejected": -106.26072692871094, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 1.5966049432754517, "rewards/margins": 20.83112907409668, "rewards/rejected": -19.23452377319336, "step": 2130 }, { "epoch": 0.98, "learning_rate": 2.2475900558092336e-07, "logits/chosen": -0.7496614456176758, "logits/rejected": -0.7174783945083618, "logps/chosen": -81.19893646240234, "logps/rejected": -107.23223876953125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.5595566034317017, "rewards/margins": 21.59109115600586, "rewards/rejected": -20.031536102294922, "step": 2140 }, { "epoch": 0.98, "learning_rate": 2.2425164890918313e-07, "logits/chosen": -0.695356011390686, "logits/rejected": -0.6898726224899292, "logps/chosen": -88.45246887207031, "logps/rejected": -107.5653076171875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.30954909324646, "rewards/margins": 21.355260848999023, "rewards/rejected": -19.045711517333984, "step": 2150 }, { "epoch": 0.99, "learning_rate": 2.237442922374429e-07, "logits/chosen": -0.6493713855743408, "logits/rejected": -0.6603960394859314, "logps/chosen": -83.4217300415039, "logps/rejected": -100.755859375, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.42486119270324707, "rewards/margins": 18.281320571899414, "rewards/rejected": -17.85645866394043, "step": 2160 }, { "epoch": 0.99, "learning_rate": 2.2323693556570266e-07, "logits/chosen": -0.7269707322120667, "logits/rejected": -0.6796764135360718, "logps/chosen": -83.48429870605469, "logps/rejected": -108.75740814208984, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 2.3360190391540527, "rewards/margins": 21.05870819091797, "rewards/rejected": -18.72269058227539, "step": 2170 }, { "epoch": 1.0, "learning_rate": 2.2272957889396242e-07, "logits/chosen": -0.6619648933410645, "logits/rejected": -0.6784309148788452, "logps/chosen": -84.5233383178711, "logps/rejected": -104.6445541381836, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.9546858072280884, "rewards/margins": 19.96062660217285, "rewards/rejected": -19.005939483642578, "step": 2180 }, { "epoch": 1.0, "learning_rate": 2.222222222222222e-07, "logits/chosen": -0.7001414895057678, "logits/rejected": -0.700007438659668, "logps/chosen": -88.71408081054688, "logps/rejected": -104.5293960571289, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 1.5879231691360474, "rewards/margins": 20.470571517944336, "rewards/rejected": -18.882648468017578, "step": 2190 }, { "epoch": 1.0, "learning_rate": 2.2171486555048196e-07, "logits/chosen": -0.751279354095459, "logits/rejected": -0.7181065082550049, "logps/chosen": -85.7071762084961, "logps/rejected": -111.83939361572266, "loss": 0.0024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.351722240447998, "rewards/margins": 22.148082733154297, "rewards/rejected": -19.796356201171875, "step": 2200 }, { "epoch": 1.0, "eval_logits/chosen": -0.7133587598800659, "eval_logits/rejected": -0.6583219170570374, "eval_logps/chosen": -83.3022689819336, "eval_logps/rejected": -102.72750854492188, "eval_loss": 0.005814776755869389, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.1177955865859985, "eval_rewards/margins": 20.05084991455078, "eval_rewards/rejected": -18.933055877685547, "eval_runtime": 73.4127, "eval_samples_per_second": 38.985, "eval_steps_per_second": 2.438, "step": 2200 }, { "epoch": 1.01, "learning_rate": 2.2120750887874172e-07, "logits/chosen": -0.7339301109313965, "logits/rejected": -0.7166422605514526, "logps/chosen": -86.40919494628906, "logps/rejected": -111.6192855834961, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.697614073753357, "rewards/margins": 21.687803268432617, "rewards/rejected": -19.990190505981445, "step": 2210 }, { "epoch": 1.01, "learning_rate": 2.207001522070015e-07, "logits/chosen": -0.6801533699035645, "logits/rejected": -0.6576797366142273, "logps/chosen": -89.45710754394531, "logps/rejected": -109.4631576538086, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 1.813921332359314, "rewards/margins": 21.14259147644043, "rewards/rejected": -19.32866859436035, "step": 2220 }, { "epoch": 1.02, "learning_rate": 2.2019279553526126e-07, "logits/chosen": -0.688944935798645, "logits/rejected": -0.6999958753585815, "logps/chosen": -93.91636657714844, "logps/rejected": -105.1122817993164, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.1669729948043823, "rewards/margins": 19.678924560546875, "rewards/rejected": -18.511951446533203, "step": 2230 }, { "epoch": 1.02, "learning_rate": 2.1968543886352102e-07, "logits/chosen": -0.7800687551498413, "logits/rejected": -0.7249246835708618, "logps/chosen": -81.42112731933594, "logps/rejected": -106.3897476196289, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.4921230673789978, "rewards/margins": 19.78441619873047, "rewards/rejected": -19.292295455932617, "step": 2240 }, { "epoch": 1.03, "learning_rate": 2.191780821917808e-07, "logits/chosen": -0.7485274076461792, "logits/rejected": -0.7021461725234985, "logps/chosen": -83.02845764160156, "logps/rejected": -108.2826156616211, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.2401946783065796, "rewards/margins": 20.906063079833984, "rewards/rejected": -19.66586685180664, "step": 2250 }, { "epoch": 1.03, "learning_rate": 2.1867072552004056e-07, "logits/chosen": -0.8463417887687683, "logits/rejected": -0.8135510683059692, "logps/chosen": -84.13587951660156, "logps/rejected": -109.97297668457031, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8690627813339233, "rewards/margins": 22.567180633544922, "rewards/rejected": -20.6981201171875, "step": 2260 }, { "epoch": 1.04, "learning_rate": 2.1816336884830032e-07, "logits/chosen": -0.6881771087646484, "logits/rejected": -0.7314122915267944, "logps/chosen": -79.51889038085938, "logps/rejected": -106.68363952636719, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.6152458190917969, "rewards/margins": 21.590240478515625, "rewards/rejected": -19.97499656677246, "step": 2270 }, { "epoch": 1.04, "learning_rate": 2.176560121765601e-07, "logits/chosen": -0.8350343704223633, "logits/rejected": -0.7969235181808472, "logps/chosen": -84.02780151367188, "logps/rejected": -108.79520416259766, "loss": 0.0071, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4147300720214844, "rewards/margins": 20.76464080810547, "rewards/rejected": -19.349910736083984, "step": 2280 }, { "epoch": 1.05, "learning_rate": 2.1714865550481986e-07, "logits/chosen": -0.7332528829574585, "logits/rejected": -0.7286131978034973, "logps/chosen": -83.72573852539062, "logps/rejected": -106.83250427246094, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.2933398485183716, "rewards/margins": 21.30584144592285, "rewards/rejected": -20.012500762939453, "step": 2290 }, { "epoch": 1.05, "learning_rate": 2.1664129883307962e-07, "logits/chosen": -0.7267037630081177, "logits/rejected": -0.704474925994873, "logps/chosen": -83.55988311767578, "logps/rejected": -106.81196594238281, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.4146488308906555, "rewards/margins": 20.529951095581055, "rewards/rejected": -20.115304946899414, "step": 2300 }, { "epoch": 1.05, "eval_logits/chosen": -0.746123731136322, "eval_logits/rejected": -0.6872346997261047, "eval_logps/chosen": -83.3327865600586, "eval_logps/rejected": -103.95221710205078, "eval_loss": 0.005804476328194141, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.1025363206863403, "eval_rewards/margins": 20.6479434967041, "eval_rewards/rejected": -19.54541015625, "eval_runtime": 77.2413, "eval_samples_per_second": 37.053, "eval_steps_per_second": 2.317, "step": 2300 }, { "epoch": 1.05, "learning_rate": 2.161339421613394e-07, "logits/chosen": -0.7835151553153992, "logits/rejected": -0.7966066598892212, "logps/chosen": -82.83807373046875, "logps/rejected": -111.46928405761719, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.7062146067619324, "rewards/margins": 20.392162322998047, "rewards/rejected": -19.68594741821289, "step": 2310 }, { "epoch": 1.06, "learning_rate": 2.1562658548959916e-07, "logits/chosen": -0.8584432601928711, "logits/rejected": -0.7813988924026489, "logps/chosen": -86.52935791015625, "logps/rejected": -106.8839340209961, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.7992275357246399, "rewards/margins": 22.510517120361328, "rewards/rejected": -21.71129035949707, "step": 2320 }, { "epoch": 1.06, "learning_rate": 2.1511922881785892e-07, "logits/chosen": -0.6985601186752319, "logits/rejected": -0.6839465498924255, "logps/chosen": -85.86921691894531, "logps/rejected": -106.46211242675781, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.41637343168258667, "rewards/margins": 19.63858985900879, "rewards/rejected": -19.22221565246582, "step": 2330 }, { "epoch": 1.07, "learning_rate": 2.146118721461187e-07, "logits/chosen": -0.7708380818367004, "logits/rejected": -0.7677600979804993, "logps/chosen": -84.77110290527344, "logps/rejected": -111.69419860839844, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.7214634418487549, "rewards/margins": 22.269315719604492, "rewards/rejected": -20.5478515625, "step": 2340 }, { "epoch": 1.07, "learning_rate": 2.1410451547437846e-07, "logits/chosen": -0.8053115606307983, "logits/rejected": -0.8239187002182007, "logps/chosen": -85.80523681640625, "logps/rejected": -108.161865234375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 2.04756236076355, "rewards/margins": 22.632070541381836, "rewards/rejected": -20.58450698852539, "step": 2350 }, { "epoch": 1.08, "learning_rate": 2.1359715880263822e-07, "logits/chosen": -0.7752918004989624, "logits/rejected": -0.7559579610824585, "logps/chosen": -83.23637390136719, "logps/rejected": -108.0810546875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.097753882408142, "rewards/margins": 21.340835571289062, "rewards/rejected": -20.24308204650879, "step": 2360 }, { "epoch": 1.08, "learning_rate": 2.13089802130898e-07, "logits/chosen": -0.755330502986908, "logits/rejected": -0.6864339709281921, "logps/chosen": -83.15589904785156, "logps/rejected": -113.9013442993164, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.9833482503890991, "rewards/margins": 23.426158905029297, "rewards/rejected": -22.44281005859375, "step": 2370 }, { "epoch": 1.09, "learning_rate": 2.1258244545915776e-07, "logits/chosen": -0.718736469745636, "logits/rejected": -0.6083402037620544, "logps/chosen": -82.97991943359375, "logps/rejected": -103.54791259765625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.28329333662986755, "rewards/margins": 20.23536491394043, "rewards/rejected": -19.952070236206055, "step": 2380 }, { "epoch": 1.09, "learning_rate": 2.1207508878741752e-07, "logits/chosen": -0.7845762968063354, "logits/rejected": -0.7489625215530396, "logps/chosen": -86.88125610351562, "logps/rejected": -109.0585708618164, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.3073398172855377, "rewards/margins": 21.779104232788086, "rewards/rejected": -21.471765518188477, "step": 2390 }, { "epoch": 1.1, "learning_rate": 2.115677321156773e-07, "logits/chosen": -0.7549251317977905, "logits/rejected": -0.725717306137085, "logps/chosen": -84.1806640625, "logps/rejected": -109.55061340332031, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6283788681030273, "rewards/margins": 21.849435806274414, "rewards/rejected": -21.221057891845703, "step": 2400 }, { "epoch": 1.1, "eval_logits/chosen": -0.7693464159965515, "eval_logits/rejected": -0.7055094838142395, "eval_logps/chosen": -83.79874420166016, "eval_logps/rejected": -107.29126739501953, "eval_loss": 0.006137872580438852, "eval_rewards/accuracies": 0.9972066879272461, "eval_rewards/chosen": 0.8695586919784546, "eval_rewards/margins": 22.08449363708496, "eval_rewards/rejected": -21.214935302734375, "eval_runtime": 76.2893, "eval_samples_per_second": 37.515, "eval_steps_per_second": 2.346, "step": 2400 }, { "epoch": 1.1, "learning_rate": 2.1106037544393706e-07, "logits/chosen": -0.7792016267776489, "logits/rejected": -0.7324903011322021, "logps/chosen": -85.0523681640625, "logps/rejected": -111.51036071777344, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 2.077381134033203, "rewards/margins": 23.63772201538086, "rewards/rejected": -21.56034278869629, "step": 2410 }, { "epoch": 1.1, "learning_rate": 2.1055301877219682e-07, "logits/chosen": -0.8506320714950562, "logits/rejected": -0.7847840189933777, "logps/chosen": -85.75637817382812, "logps/rejected": -113.4081802368164, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.6911964416503906, "rewards/margins": 23.912567138671875, "rewards/rejected": -22.221370697021484, "step": 2420 }, { "epoch": 1.11, "learning_rate": 2.100456621004566e-07, "logits/chosen": -0.736041247844696, "logits/rejected": -0.6666306257247925, "logps/chosen": -80.7561264038086, "logps/rejected": -109.1169662475586, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.6556488275527954, "rewards/margins": 22.82249641418457, "rewards/rejected": -21.166845321655273, "step": 2430 }, { "epoch": 1.11, "learning_rate": 2.0953830542871636e-07, "logits/chosen": -0.8349838256835938, "logits/rejected": -0.7791039347648621, "logps/chosen": -82.36167907714844, "logps/rejected": -110.2313461303711, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.5611153841018677, "rewards/margins": 21.64533805847168, "rewards/rejected": -21.084224700927734, "step": 2440 }, { "epoch": 1.12, "learning_rate": 2.0903094875697612e-07, "logits/chosen": -0.7713768482208252, "logits/rejected": -0.7219734191894531, "logps/chosen": -92.53242492675781, "logps/rejected": -115.09944915771484, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.6493558883666992, "rewards/margins": 23.381576538085938, "rewards/rejected": -22.732219696044922, "step": 2450 }, { "epoch": 1.12, "learning_rate": 2.085235920852359e-07, "logits/chosen": -0.8122873306274414, "logits/rejected": -0.7389894127845764, "logps/chosen": -81.98970031738281, "logps/rejected": -114.59257507324219, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.9474372863769531, "rewards/margins": 22.69391632080078, "rewards/rejected": -21.746479034423828, "step": 2460 }, { "epoch": 1.13, "learning_rate": 2.0801623541349566e-07, "logits/chosen": -0.8570648431777954, "logits/rejected": -0.8347585797309875, "logps/chosen": -78.87316131591797, "logps/rejected": -104.390380859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.652663230895996, "rewards/margins": 22.120464324951172, "rewards/rejected": -20.46780014038086, "step": 2470 }, { "epoch": 1.13, "learning_rate": 2.0750887874175542e-07, "logits/chosen": -0.7832027673721313, "logits/rejected": -0.7585029602050781, "logps/chosen": -84.72008514404297, "logps/rejected": -117.10993957519531, "loss": 0.0017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6026067137718201, "rewards/margins": 23.96122932434082, "rewards/rejected": -23.35862159729004, "step": 2480 }, { "epoch": 1.14, "learning_rate": 2.070015220700152e-07, "logits/chosen": -0.7349112629890442, "logits/rejected": -0.7553213834762573, "logps/chosen": -86.74533081054688, "logps/rejected": -115.26509094238281, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.6511772274971008, "rewards/margins": 23.22552490234375, "rewards/rejected": -22.57434844970703, "step": 2490 }, { "epoch": 1.14, "learning_rate": 2.0649416539827496e-07, "logits/chosen": -0.7575095891952515, "logits/rejected": -0.7819565534591675, "logps/chosen": -84.1733169555664, "logps/rejected": -108.44877624511719, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19732484221458435, "rewards/margins": 20.539228439331055, "rewards/rejected": -20.341901779174805, "step": 2500 }, { "epoch": 1.14, "eval_logits/chosen": -0.7653970718383789, "eval_logits/rejected": -0.7059395909309387, "eval_logps/chosen": -83.49778747558594, "eval_logps/rejected": -105.02425384521484, "eval_loss": 0.005491924937814474, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.020034670829773, "eval_rewards/margins": 21.101457595825195, "eval_rewards/rejected": -20.081422805786133, "eval_runtime": 97.1385, "eval_samples_per_second": 29.463, "eval_steps_per_second": 1.843, "step": 2500 }, { "epoch": 1.15, "learning_rate": 2.0598680872653472e-07, "logits/chosen": -0.764125645160675, "logits/rejected": -0.7234150171279907, "logps/chosen": -88.8193588256836, "logps/rejected": -110.6650390625, "loss": 0.0038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7087750434875488, "rewards/margins": 21.741344451904297, "rewards/rejected": -21.032567977905273, "step": 2510 }, { "epoch": 1.15, "learning_rate": 2.054794520547945e-07, "logits/chosen": -0.6850422024726868, "logits/rejected": -0.6487849950790405, "logps/chosen": -82.67296600341797, "logps/rejected": -106.21736145019531, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 1.7742611169815063, "rewards/margins": 22.46800422668457, "rewards/rejected": -20.693742752075195, "step": 2520 }, { "epoch": 1.15, "learning_rate": 2.0497209538305426e-07, "logits/chosen": -0.8037935495376587, "logits/rejected": -0.736015260219574, "logps/chosen": -86.53871154785156, "logps/rejected": -108.95832824707031, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.6968075037002563, "rewards/margins": 21.14533805847168, "rewards/rejected": -20.448530197143555, "step": 2530 }, { "epoch": 1.16, "learning_rate": 2.0446473871131402e-07, "logits/chosen": -0.7924987077713013, "logits/rejected": -0.7409130334854126, "logps/chosen": -90.44608306884766, "logps/rejected": -119.0773696899414, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.6108601689338684, "rewards/margins": 22.708263397216797, "rewards/rejected": -22.09740447998047, "step": 2540 }, { "epoch": 1.16, "learning_rate": 2.039573820395738e-07, "logits/chosen": -0.7455700635910034, "logits/rejected": -0.7140682935714722, "logps/chosen": -78.17546081542969, "logps/rejected": -109.40152740478516, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.9739960432052612, "rewards/margins": 22.592451095581055, "rewards/rejected": -21.61845588684082, "step": 2550 }, { "epoch": 1.17, "learning_rate": 2.0345002536783356e-07, "logits/chosen": -0.7703838348388672, "logits/rejected": -0.7294633388519287, "logps/chosen": -81.02081298828125, "logps/rejected": -107.09619140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.9101536273956299, "rewards/margins": 22.840641021728516, "rewards/rejected": -20.930484771728516, "step": 2560 }, { "epoch": 1.17, "learning_rate": 2.0294266869609332e-07, "logits/chosen": -0.8617580533027649, "logits/rejected": -0.8065959811210632, "logps/chosen": -87.26881408691406, "logps/rejected": -106.7549057006836, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 1.3661178350448608, "rewards/margins": 22.15833282470703, "rewards/rejected": -20.79221534729004, "step": 2570 }, { "epoch": 1.18, "learning_rate": 2.024353120243531e-07, "logits/chosen": -0.7026188969612122, "logits/rejected": -0.6509179472923279, "logps/chosen": -77.97960662841797, "logps/rejected": -104.57243347167969, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.5850582122802734, "rewards/margins": 20.327341079711914, "rewards/rejected": -18.74228286743164, "step": 2580 }, { "epoch": 1.18, "learning_rate": 2.0192795535261286e-07, "logits/chosen": -0.7325498461723328, "logits/rejected": -0.7174168825149536, "logps/chosen": -82.26387023925781, "logps/rejected": -109.1794204711914, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 1.1741113662719727, "rewards/margins": 22.20342445373535, "rewards/rejected": -21.029314041137695, "step": 2590 }, { "epoch": 1.19, "learning_rate": 2.0142059868087262e-07, "logits/chosen": -0.7314542531967163, "logits/rejected": -0.7128167152404785, "logps/chosen": -86.39726257324219, "logps/rejected": -107.54801177978516, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.596429467201233, "rewards/margins": 22.180591583251953, "rewards/rejected": -20.584165573120117, "step": 2600 }, { "epoch": 1.19, "eval_logits/chosen": -0.7385216355323792, "eval_logits/rejected": -0.680923342704773, "eval_logps/chosen": -83.2193603515625, "eval_logps/rejected": -106.13276672363281, "eval_loss": 0.005839827004820108, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.159246802330017, "eval_rewards/margins": 21.79492950439453, "eval_rewards/rejected": -20.63568115234375, "eval_runtime": 68.9725, "eval_samples_per_second": 41.495, "eval_steps_per_second": 2.595, "step": 2600 }, { "epoch": 1.19, "learning_rate": 2.009132420091324e-07, "logits/chosen": -0.6961277723312378, "logits/rejected": -0.6608497500419617, "logps/chosen": -80.58322143554688, "logps/rejected": -107.66817474365234, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.9859172701835632, "rewards/margins": 22.4183349609375, "rewards/rejected": -21.432416915893555, "step": 2610 }, { "epoch": 1.2, "learning_rate": 2.0040588533739216e-07, "logits/chosen": -0.7351371049880981, "logits/rejected": -0.7426373958587646, "logps/chosen": -86.33208465576172, "logps/rejected": -109.19034576416016, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.45506373047828674, "rewards/margins": 21.83483123779297, "rewards/rejected": -21.379764556884766, "step": 2620 }, { "epoch": 1.2, "learning_rate": 1.9989852866565192e-07, "logits/chosen": -0.6917943358421326, "logits/rejected": -0.6587172150611877, "logps/chosen": -83.50242614746094, "logps/rejected": -111.31648254394531, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.758619487285614, "rewards/margins": 21.25997543334961, "rewards/rejected": -20.501354217529297, "step": 2630 }, { "epoch": 1.21, "learning_rate": 1.993911719939117e-07, "logits/chosen": -0.6337357759475708, "logits/rejected": -0.6760739088058472, "logps/chosen": -86.26484680175781, "logps/rejected": -103.41255950927734, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 1.7138054370880127, "rewards/margins": 20.66269302368164, "rewards/rejected": -18.948888778686523, "step": 2640 }, { "epoch": 1.21, "learning_rate": 1.9888381532217146e-07, "logits/chosen": -0.7049747705459595, "logits/rejected": -0.6962708234786987, "logps/chosen": -85.10768127441406, "logps/rejected": -109.0610580444336, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.5559046268463135, "rewards/margins": 20.93661880493164, "rewards/rejected": -20.38071632385254, "step": 2650 }, { "epoch": 1.21, "learning_rate": 1.9837645865043122e-07, "logits/chosen": -0.7089205980300903, "logits/rejected": -0.7450774908065796, "logps/chosen": -84.3514175415039, "logps/rejected": -107.42315673828125, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2602952718734741, "rewards/margins": 20.727048873901367, "rewards/rejected": -19.466754913330078, "step": 2660 }, { "epoch": 1.22, "learning_rate": 1.97869101978691e-07, "logits/chosen": -0.6836045980453491, "logits/rejected": -0.6517876386642456, "logps/chosen": -84.9683837890625, "logps/rejected": -107.4233169555664, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.006893515586853, "rewards/margins": 20.917407989501953, "rewards/rejected": -19.9105167388916, "step": 2670 }, { "epoch": 1.22, "learning_rate": 1.9736174530695076e-07, "logits/chosen": -0.7533547282218933, "logits/rejected": -0.7005944848060608, "logps/chosen": -85.52412414550781, "logps/rejected": -108.1337890625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.1605770587921143, "rewards/margins": 21.576522827148438, "rewards/rejected": -20.41594886779785, "step": 2680 }, { "epoch": 1.23, "learning_rate": 1.9685438863521052e-07, "logits/chosen": -0.7099670767784119, "logits/rejected": -0.7033403515815735, "logps/chosen": -82.56312561035156, "logps/rejected": -107.25730895996094, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 2.1814966201782227, "rewards/margins": 21.729625701904297, "rewards/rejected": -19.54813003540039, "step": 2690 }, { "epoch": 1.23, "learning_rate": 1.963470319634703e-07, "logits/chosen": -0.8139055371284485, "logits/rejected": -0.7495929598808289, "logps/chosen": -82.12207794189453, "logps/rejected": -106.3428955078125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.3602625131607056, "rewards/margins": 22.45661163330078, "rewards/rejected": -21.09634780883789, "step": 2700 }, { "epoch": 1.23, "eval_logits/chosen": -0.7374448180198669, "eval_logits/rejected": -0.6823889017105103, "eval_logps/chosen": -83.8608627319336, "eval_logps/rejected": -105.3611068725586, "eval_loss": 0.00569253321737051, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.838493824005127, "eval_rewards/margins": 21.088348388671875, "eval_rewards/rejected": -20.249855041503906, "eval_runtime": 83.4775, "eval_samples_per_second": 34.285, "eval_steps_per_second": 2.144, "step": 2700 }, { "epoch": 1.24, "learning_rate": 1.9583967529173006e-07, "logits/chosen": -0.7564720511436462, "logits/rejected": -0.6812716126441956, "logps/chosen": -87.40708923339844, "logps/rejected": -108.55073547363281, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.8356910943984985, "rewards/margins": 21.843751907348633, "rewards/rejected": -21.008060455322266, "step": 2710 }, { "epoch": 1.24, "learning_rate": 1.9533231861998982e-07, "logits/chosen": -0.8517030477523804, "logits/rejected": -0.8198519945144653, "logps/chosen": -86.82032775878906, "logps/rejected": -115.4446792602539, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.7068185806274414, "rewards/margins": 21.509122848510742, "rewards/rejected": -20.80230140686035, "step": 2720 }, { "epoch": 1.25, "learning_rate": 1.948249619482496e-07, "logits/chosen": -0.6536229848861694, "logits/rejected": -0.5762327909469604, "logps/chosen": -86.3205795288086, "logps/rejected": -107.751953125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.5272104144096375, "rewards/margins": 21.346994400024414, "rewards/rejected": -20.819782257080078, "step": 2730 }, { "epoch": 1.25, "learning_rate": 1.9431760527650936e-07, "logits/chosen": -0.8203104734420776, "logits/rejected": -0.7407748103141785, "logps/chosen": -84.98686981201172, "logps/rejected": -113.11669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.3415039777755737, "rewards/margins": 23.172767639160156, "rewards/rejected": -21.83126449584961, "step": 2740 }, { "epoch": 1.26, "learning_rate": 1.9381024860476912e-07, "logits/chosen": -0.8179903030395508, "logits/rejected": -0.7713826894760132, "logps/chosen": -90.3552017211914, "logps/rejected": -110.89227294921875, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.8493086099624634, "rewards/margins": 22.596853256225586, "rewards/rejected": -21.747547149658203, "step": 2750 }, { "epoch": 1.26, "learning_rate": 1.933028919330289e-07, "logits/chosen": -0.78132164478302, "logits/rejected": -0.7138301134109497, "logps/chosen": -84.56668853759766, "logps/rejected": -112.89170837402344, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.28099095821380615, "rewards/margins": 22.480010986328125, "rewards/rejected": -22.199020385742188, "step": 2760 }, { "epoch": 1.26, "learning_rate": 1.9279553526128866e-07, "logits/chosen": -0.8231332898139954, "logits/rejected": -0.7319883704185486, "logps/chosen": -83.89222717285156, "logps/rejected": -110.40364074707031, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.8967598080635071, "rewards/margins": 23.46830177307129, "rewards/rejected": -22.5715389251709, "step": 2770 }, { "epoch": 1.27, "learning_rate": 1.9228817858954842e-07, "logits/chosen": -0.8091581463813782, "logits/rejected": -0.719267725944519, "logps/chosen": -83.92425537109375, "logps/rejected": -114.30082702636719, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.229701280593872, "rewards/margins": 24.159175872802734, "rewards/rejected": -22.929473876953125, "step": 2780 }, { "epoch": 1.27, "learning_rate": 1.917808219178082e-07, "logits/chosen": -0.8809703588485718, "logits/rejected": -0.8047698140144348, "logps/chosen": -86.08138275146484, "logps/rejected": -109.77156066894531, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.8736478686332703, "rewards/margins": 22.648921966552734, "rewards/rejected": -21.77527618408203, "step": 2790 }, { "epoch": 1.28, "learning_rate": 1.9127346524606796e-07, "logits/chosen": -0.8428120613098145, "logits/rejected": -0.815168023109436, "logps/chosen": -82.51287841796875, "logps/rejected": -108.46163177490234, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.5113711357116699, "rewards/margins": 21.765836715698242, "rewards/rejected": -21.254467010498047, "step": 2800 }, { "epoch": 1.28, "eval_logits/chosen": -0.7845156192779541, "eval_logits/rejected": -0.7218629717826843, "eval_logps/chosen": -84.17462158203125, "eval_logps/rejected": -106.3823471069336, "eval_loss": 0.005751576274633408, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.6816177368164062, "eval_rewards/margins": 21.442089080810547, "eval_rewards/rejected": -20.760467529296875, "eval_runtime": 66.7253, "eval_samples_per_second": 42.892, "eval_steps_per_second": 2.683, "step": 2800 }, { "epoch": 1.28, "learning_rate": 1.9076610857432772e-07, "logits/chosen": -0.8195465207099915, "logits/rejected": -0.7400294542312622, "logps/chosen": -86.13768768310547, "logps/rejected": -108.82917785644531, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1283977031707764, "rewards/margins": 22.65591049194336, "rewards/rejected": -21.527515411376953, "step": 2810 }, { "epoch": 1.29, "learning_rate": 1.902587519025875e-07, "logits/chosen": -0.7770295739173889, "logits/rejected": -0.7523963451385498, "logps/chosen": -88.90458679199219, "logps/rejected": -110.6171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.35731858015060425, "rewards/margins": 22.745914459228516, "rewards/rejected": -22.388593673706055, "step": 2820 }, { "epoch": 1.29, "learning_rate": 1.8975139523084726e-07, "logits/chosen": -0.7430048584938049, "logits/rejected": -0.695781946182251, "logps/chosen": -85.51004028320312, "logps/rejected": -113.87823486328125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.0342652797698975, "rewards/margins": 22.87997817993164, "rewards/rejected": -21.845712661743164, "step": 2830 }, { "epoch": 1.3, "learning_rate": 1.8924403855910702e-07, "logits/chosen": -0.8346872329711914, "logits/rejected": -0.8123058080673218, "logps/chosen": -78.13063049316406, "logps/rejected": -108.62190246582031, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.306009441614151, "rewards/margins": 21.68273162841797, "rewards/rejected": -21.376720428466797, "step": 2840 }, { "epoch": 1.3, "learning_rate": 1.887366818873668e-07, "logits/chosen": -0.8324508666992188, "logits/rejected": -0.7995740175247192, "logps/chosen": -85.37599182128906, "logps/rejected": -107.89019775390625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.1990930736064911, "rewards/margins": 21.476375579833984, "rewards/rejected": -21.277286529541016, "step": 2850 }, { "epoch": 1.31, "learning_rate": 1.8822932521562656e-07, "logits/chosen": -0.7088965177536011, "logits/rejected": -0.7114429473876953, "logps/chosen": -86.03219604492188, "logps/rejected": -108.50044250488281, "loss": 0.0078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2706319093704224, "rewards/margins": 21.439594268798828, "rewards/rejected": -20.168964385986328, "step": 2860 }, { "epoch": 1.31, "learning_rate": 1.8772196854388632e-07, "logits/chosen": -0.8044069409370422, "logits/rejected": -0.7494329214096069, "logps/chosen": -88.49215698242188, "logps/rejected": -112.16410064697266, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.07424769550561905, "rewards/margins": 21.921253204345703, "rewards/rejected": -21.995498657226562, "step": 2870 }, { "epoch": 1.31, "learning_rate": 1.872146118721461e-07, "logits/chosen": -0.8148896098136902, "logits/rejected": -0.7668309211730957, "logps/chosen": -92.65351104736328, "logps/rejected": -109.47776794433594, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5657011270523071, "rewards/margins": 21.924177169799805, "rewards/rejected": -21.358476638793945, "step": 2880 }, { "epoch": 1.32, "learning_rate": 1.8670725520040586e-07, "logits/chosen": -0.8484834432601929, "logits/rejected": -0.8214718103408813, "logps/chosen": -87.38011169433594, "logps/rejected": -112.77238464355469, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.251098871231079, "rewards/margins": 23.54327392578125, "rewards/rejected": -22.292173385620117, "step": 2890 }, { "epoch": 1.32, "learning_rate": 1.8619989852866562e-07, "logits/chosen": -0.8180379867553711, "logits/rejected": -0.7856913805007935, "logps/chosen": -85.47797393798828, "logps/rejected": -108.2070541381836, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.7616327404975891, "rewards/margins": 22.59707260131836, "rewards/rejected": -21.835439682006836, "step": 2900 }, { "epoch": 1.32, "eval_logits/chosen": -0.8321563005447388, "eval_logits/rejected": -0.762521505355835, "eval_logps/chosen": -84.00428009033203, "eval_logps/rejected": -108.98328399658203, "eval_loss": 0.005883732810616493, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.7667914032936096, "eval_rewards/margins": 22.827733993530273, "eval_rewards/rejected": -22.060945510864258, "eval_runtime": 73.6418, "eval_samples_per_second": 38.864, "eval_steps_per_second": 2.431, "step": 2900 }, { "epoch": 1.33, "learning_rate": 1.856925418569254e-07, "logits/chosen": -0.7657192945480347, "logits/rejected": -0.8023189306259155, "logps/chosen": -87.12007141113281, "logps/rejected": -110.1858901977539, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.8483368158340454, "rewards/margins": 23.544336318969727, "rewards/rejected": -21.696001052856445, "step": 2910 }, { "epoch": 1.33, "learning_rate": 1.8518518518518516e-07, "logits/chosen": -0.8202277421951294, "logits/rejected": -0.8277060389518738, "logps/chosen": -81.93797302246094, "logps/rejected": -109.71229553222656, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1504634618759155, "rewards/margins": 23.43653106689453, "rewards/rejected": -22.286067962646484, "step": 2920 }, { "epoch": 1.34, "learning_rate": 1.8467782851344492e-07, "logits/chosen": -0.8164095878601074, "logits/rejected": -0.797565758228302, "logps/chosen": -83.46281433105469, "logps/rejected": -110.8356704711914, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3928399384021759, "rewards/margins": 23.051225662231445, "rewards/rejected": -22.65838623046875, "step": 2930 }, { "epoch": 1.34, "learning_rate": 1.841704718417047e-07, "logits/chosen": -0.8441513180732727, "logits/rejected": -0.794573962688446, "logps/chosen": -80.64539337158203, "logps/rejected": -109.9163589477539, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.5498849749565125, "rewards/margins": 22.937286376953125, "rewards/rejected": -22.387401580810547, "step": 2940 }, { "epoch": 1.35, "learning_rate": 1.8366311516996446e-07, "logits/chosen": -0.8687325716018677, "logits/rejected": -0.8416921496391296, "logps/chosen": -83.46965789794922, "logps/rejected": -108.26016998291016, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4809293746948242, "rewards/margins": 22.91249656677246, "rewards/rejected": -21.431570053100586, "step": 2950 }, { "epoch": 1.35, "learning_rate": 1.8315575849822422e-07, "logits/chosen": -0.7985904812812805, "logits/rejected": -0.7669427990913391, "logps/chosen": -82.79601287841797, "logps/rejected": -108.02484130859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.0780209302902222, "rewards/margins": 21.9554443359375, "rewards/rejected": -20.877426147460938, "step": 2960 }, { "epoch": 1.36, "learning_rate": 1.82648401826484e-07, "logits/chosen": -0.838662326335907, "logits/rejected": -0.7950839996337891, "logps/chosen": -80.59087371826172, "logps/rejected": -113.2815933227539, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.22922952473163605, "rewards/margins": 22.965641021728516, "rewards/rejected": -22.736412048339844, "step": 2970 }, { "epoch": 1.36, "learning_rate": 1.8214104515474375e-07, "logits/chosen": -0.8701616525650024, "logits/rejected": -0.809054970741272, "logps/chosen": -84.26366424560547, "logps/rejected": -112.36161804199219, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.2285851538181305, "rewards/margins": 24.036575317382812, "rewards/rejected": -23.80799102783203, "step": 2980 }, { "epoch": 1.36, "learning_rate": 1.8163368848300352e-07, "logits/chosen": -0.9309707880020142, "logits/rejected": -0.8480417132377625, "logps/chosen": -77.65840911865234, "logps/rejected": -114.49421691894531, "loss": 0.0115, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9348442554473877, "rewards/margins": 24.756481170654297, "rewards/rejected": -22.821638107299805, "step": 2990 }, { "epoch": 1.37, "learning_rate": 1.811263318112633e-07, "logits/chosen": -0.8576655387878418, "logits/rejected": -0.7783030867576599, "logps/chosen": -85.56487274169922, "logps/rejected": -112.32801818847656, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.1698871850967407, "rewards/margins": 22.855905532836914, "rewards/rejected": -21.686016082763672, "step": 3000 }, { "epoch": 1.37, "eval_logits/chosen": -0.8208735585212708, "eval_logits/rejected": -0.7533836960792542, "eval_logps/chosen": -82.93585968017578, "eval_logps/rejected": -106.85352325439453, "eval_loss": 0.005454268306493759, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.3009955883026123, "eval_rewards/margins": 22.297056198120117, "eval_rewards/rejected": -20.99605941772461, "eval_runtime": 72.0521, "eval_samples_per_second": 39.721, "eval_steps_per_second": 2.484, "step": 3000 }, { "epoch": 1.37, "learning_rate": 1.8061897513952305e-07, "logits/chosen": -0.8196511268615723, "logits/rejected": -0.6741745471954346, "logps/chosen": -83.01131439208984, "logps/rejected": -107.64549255371094, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 2.772728443145752, "rewards/margins": 23.33978843688965, "rewards/rejected": -20.567058563232422, "step": 3010 }, { "epoch": 1.38, "learning_rate": 1.8011161846778282e-07, "logits/chosen": -0.8143720626831055, "logits/rejected": -0.751374363899231, "logps/chosen": -82.5423583984375, "logps/rejected": -108.41703033447266, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0407277345657349, "rewards/margins": 21.009965896606445, "rewards/rejected": -19.96923828125, "step": 3020 }, { "epoch": 1.38, "learning_rate": 1.796042617960426e-07, "logits/chosen": -0.7924383878707886, "logits/rejected": -0.7827506065368652, "logps/chosen": -94.8802719116211, "logps/rejected": -110.94437408447266, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6922892332077026, "rewards/margins": 20.605281829833984, "rewards/rejected": -19.91299057006836, "step": 3030 }, { "epoch": 1.39, "learning_rate": 1.7909690512430235e-07, "logits/chosen": -0.7978582382202148, "logits/rejected": -0.7540158629417419, "logps/chosen": -81.8050308227539, "logps/rejected": -104.96846008300781, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.273224353790283, "rewards/margins": 22.64246368408203, "rewards/rejected": -20.36924171447754, "step": 3040 }, { "epoch": 1.39, "learning_rate": 1.7858954845256212e-07, "logits/chosen": -0.8617739677429199, "logits/rejected": -0.7627667784690857, "logps/chosen": -83.94235229492188, "logps/rejected": -106.07057189941406, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 1.619549036026001, "rewards/margins": 21.559040069580078, "rewards/rejected": -19.939491271972656, "step": 3050 }, { "epoch": 1.4, "learning_rate": 1.780821917808219e-07, "logits/chosen": -0.8220621347427368, "logits/rejected": -0.810819149017334, "logps/chosen": -87.0011978149414, "logps/rejected": -108.83686828613281, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 1.5197734832763672, "rewards/margins": 21.68050193786621, "rewards/rejected": -20.160728454589844, "step": 3060 }, { "epoch": 1.4, "learning_rate": 1.7757483510908165e-07, "logits/chosen": -0.7937394976615906, "logits/rejected": -0.7784138917922974, "logps/chosen": -80.34947204589844, "logps/rejected": -111.46089172363281, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.9023909568786621, "rewards/margins": 21.696002960205078, "rewards/rejected": -20.793611526489258, "step": 3070 }, { "epoch": 1.41, "learning_rate": 1.7706747843734142e-07, "logits/chosen": -0.8737959861755371, "logits/rejected": -0.8506304621696472, "logps/chosen": -80.43476867675781, "logps/rejected": -105.3740005493164, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.0261263847351074, "rewards/margins": 21.9659423828125, "rewards/rejected": -20.93981170654297, "step": 3080 }, { "epoch": 1.41, "learning_rate": 1.765601217656012e-07, "logits/chosen": -0.8673890233039856, "logits/rejected": -0.8221572041511536, "logps/chosen": -83.61185455322266, "logps/rejected": -108.5359878540039, "loss": 0.0057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3645645380020142, "rewards/margins": 21.998119354248047, "rewards/rejected": -20.633556365966797, "step": 3090 }, { "epoch": 1.42, "learning_rate": 1.7605276509386095e-07, "logits/chosen": -0.7997790575027466, "logits/rejected": -0.7952042818069458, "logps/chosen": -79.34935760498047, "logps/rejected": -107.53935241699219, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.1273956298828125, "rewards/margins": 22.18309211730957, "rewards/rejected": -21.055694580078125, "step": 3100 }, { "epoch": 1.42, "eval_logits/chosen": -0.8161947727203369, "eval_logits/rejected": -0.7520949840545654, "eval_logps/chosen": -82.9143295288086, "eval_logps/rejected": -105.88829040527344, "eval_loss": 0.005368279293179512, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.3117655515670776, "eval_rewards/margins": 21.82520866394043, "eval_rewards/rejected": -20.513439178466797, "eval_runtime": 78.788, "eval_samples_per_second": 36.325, "eval_steps_per_second": 2.272, "step": 3100 }, { "epoch": 1.42, "learning_rate": 1.7554540842212072e-07, "logits/chosen": -0.8044342994689941, "logits/rejected": -0.7702390551567078, "logps/chosen": -79.634765625, "logps/rejected": -106.61177062988281, "loss": 0.0043, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8688969612121582, "rewards/margins": 21.35724639892578, "rewards/rejected": -20.48834991455078, "step": 3110 }, { "epoch": 1.42, "learning_rate": 1.750380517503805e-07, "logits/chosen": -0.8742705583572388, "logits/rejected": -0.8354657888412476, "logps/chosen": -81.12736511230469, "logps/rejected": -104.99991607666016, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 1.1102584600448608, "rewards/margins": 20.783863067626953, "rewards/rejected": -19.673603057861328, "step": 3120 }, { "epoch": 1.43, "learning_rate": 1.7453069507864025e-07, "logits/chosen": -0.8716095089912415, "logits/rejected": -0.7494352459907532, "logps/chosen": -81.30690002441406, "logps/rejected": -105.8340835571289, "loss": 0.0079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8371508717536926, "rewards/margins": 22.083576202392578, "rewards/rejected": -21.246429443359375, "step": 3130 }, { "epoch": 1.43, "learning_rate": 1.7402333840690002e-07, "logits/chosen": -0.7608228921890259, "logits/rejected": -0.7334285378456116, "logps/chosen": -91.09227752685547, "logps/rejected": -112.9427261352539, "loss": 0.0017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4231182932853699, "rewards/margins": 21.993289947509766, "rewards/rejected": -21.570171356201172, "step": 3140 }, { "epoch": 1.44, "learning_rate": 1.735159817351598e-07, "logits/chosen": -0.7460505962371826, "logits/rejected": -0.7124925851821899, "logps/chosen": -79.06074523925781, "logps/rejected": -111.85310363769531, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 2.2974584102630615, "rewards/margins": 23.934118270874023, "rewards/rejected": -21.636661529541016, "step": 3150 }, { "epoch": 1.44, "learning_rate": 1.7300862506341955e-07, "logits/chosen": -0.8754836916923523, "logits/rejected": -0.8337736129760742, "logps/chosen": -84.03656005859375, "logps/rejected": -110.4716796875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 1.2922741174697876, "rewards/margins": 23.63736343383789, "rewards/rejected": -22.345090866088867, "step": 3160 }, { "epoch": 1.45, "learning_rate": 1.7250126839167932e-07, "logits/chosen": -0.9508827328681946, "logits/rejected": -0.83515465259552, "logps/chosen": -85.82405090332031, "logps/rejected": -107.03657531738281, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.8837907910346985, "rewards/margins": 21.791425704956055, "rewards/rejected": -20.907634735107422, "step": 3170 }, { "epoch": 1.45, "learning_rate": 1.719939117199391e-07, "logits/chosen": -0.7886821031570435, "logits/rejected": -0.7693257927894592, "logps/chosen": -83.99065399169922, "logps/rejected": -107.45686340332031, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 2.2205920219421387, "rewards/margins": 23.225391387939453, "rewards/rejected": -21.00480079650879, "step": 3180 }, { "epoch": 1.46, "learning_rate": 1.7148655504819885e-07, "logits/chosen": -0.7591974139213562, "logits/rejected": -0.7449969053268433, "logps/chosen": -82.21507263183594, "logps/rejected": -108.7207260131836, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.1014723777770996, "rewards/margins": 21.82181739807129, "rewards/rejected": -20.720340728759766, "step": 3190 }, { "epoch": 1.46, "learning_rate": 1.7097919837645862e-07, "logits/chosen": -0.8855707049369812, "logits/rejected": -0.8614555597305298, "logps/chosen": -81.53754425048828, "logps/rejected": -109.7943344116211, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4457625150680542, "rewards/margins": 23.156801223754883, "rewards/rejected": -21.711040496826172, "step": 3200 }, { "epoch": 1.46, "eval_logits/chosen": -0.829251229763031, "eval_logits/rejected": -0.761223554611206, "eval_logps/chosen": -82.8708267211914, "eval_logps/rejected": -106.64373779296875, "eval_loss": 0.0055304598063230515, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 1.3335167169570923, "eval_rewards/margins": 22.224679946899414, "eval_rewards/rejected": -20.891162872314453, "eval_runtime": 81.9215, "eval_samples_per_second": 34.936, "eval_steps_per_second": 2.185, "step": 3200 }, { "epoch": 1.47, "learning_rate": 1.704718417047184e-07, "logits/chosen": -0.8796641230583191, "logits/rejected": -0.8663586378097534, "logps/chosen": -85.18115234375, "logps/rejected": -115.62249755859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.1032954454421997, "rewards/margins": 23.180606842041016, "rewards/rejected": -22.07731056213379, "step": 3210 }, { "epoch": 1.47, "learning_rate": 1.6996448503297815e-07, "logits/chosen": -0.8402705192565918, "logits/rejected": -0.7693440318107605, "logps/chosen": -87.40254974365234, "logps/rejected": -112.21661376953125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.9603824615478516, "rewards/margins": 22.17110252380371, "rewards/rejected": -21.21072006225586, "step": 3220 }, { "epoch": 1.47, "learning_rate": 1.6945712836123792e-07, "logits/chosen": -0.8487188220024109, "logits/rejected": -0.7834162712097168, "logps/chosen": -84.28276062011719, "logps/rejected": -107.58403015136719, "loss": 0.0069, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3142402172088623, "rewards/margins": 21.98873519897461, "rewards/rejected": -20.67449378967285, "step": 3230 }, { "epoch": 1.48, "learning_rate": 1.689497716894977e-07, "logits/chosen": -0.793751060962677, "logits/rejected": -0.7844887971878052, "logps/chosen": -79.22299194335938, "logps/rejected": -104.5686264038086, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1418697834014893, "rewards/margins": 21.396854400634766, "rewards/rejected": -20.254985809326172, "step": 3240 }, { "epoch": 1.48, "learning_rate": 1.6844241501775745e-07, "logits/chosen": -0.8498477935791016, "logits/rejected": -0.8198568224906921, "logps/chosen": -87.35657501220703, "logps/rejected": -111.27471923828125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 1.1994593143463135, "rewards/margins": 22.922657012939453, "rewards/rejected": -21.723196029663086, "step": 3250 }, { "epoch": 1.49, "learning_rate": 1.6793505834601722e-07, "logits/chosen": -0.9263782501220703, "logits/rejected": -0.8542073965072632, "logps/chosen": -76.23494720458984, "logps/rejected": -113.0631103515625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.5995099544525146, "rewards/margins": 25.61696434020996, "rewards/rejected": -24.017454147338867, "step": 3260 }, { "epoch": 1.49, "learning_rate": 1.67427701674277e-07, "logits/chosen": -0.8342376947402954, "logits/rejected": -0.756497859954834, "logps/chosen": -81.8563232421875, "logps/rejected": -108.46012115478516, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.6572699546813965, "rewards/margins": 23.10988426208496, "rewards/rejected": -22.45261573791504, "step": 3270 }, { "epoch": 1.5, "learning_rate": 1.6692034500253675e-07, "logits/chosen": -0.8144375681877136, "logits/rejected": -0.7653582096099854, "logps/chosen": -81.36412048339844, "logps/rejected": -115.94920349121094, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.2872655391693115, "rewards/margins": 24.55162239074707, "rewards/rejected": -23.264354705810547, "step": 3280 }, { "epoch": 1.5, "learning_rate": 1.6641298833079652e-07, "logits/chosen": -0.8921886682510376, "logits/rejected": -0.8617362976074219, "logps/chosen": -86.64197540283203, "logps/rejected": -110.98530578613281, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.1070516109466553, "rewards/margins": 23.492164611816406, "rewards/rejected": -22.385112762451172, "step": 3290 }, { "epoch": 1.51, "learning_rate": 1.659056316590563e-07, "logits/chosen": -0.871414303779602, "logits/rejected": -0.7999138236045837, "logps/chosen": -82.70069885253906, "logps/rejected": -109.51627349853516, "loss": 0.0021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.31220346689224243, "rewards/margins": 23.60110855102539, "rewards/rejected": -23.28890609741211, "step": 3300 }, { "epoch": 1.51, "eval_logits/chosen": -0.8566647171974182, "eval_logits/rejected": -0.7842747569084167, "eval_logps/chosen": -83.30318450927734, "eval_logps/rejected": -109.43997955322266, "eval_loss": 0.005761295091360807, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.117339015007019, "eval_rewards/margins": 23.406618118286133, "eval_rewards/rejected": -22.28927993774414, "eval_runtime": 69.4914, "eval_samples_per_second": 41.185, "eval_steps_per_second": 2.576, "step": 3300 }, { "epoch": 1.51, "learning_rate": 1.6539827498731605e-07, "logits/chosen": -0.8968285322189331, "logits/rejected": -0.8614455461502075, "logps/chosen": -85.897216796875, "logps/rejected": -110.06578063964844, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.6581555604934692, "rewards/margins": 23.074901580810547, "rewards/rejected": -22.416744232177734, "step": 3310 }, { "epoch": 1.52, "learning_rate": 1.6489091831557582e-07, "logits/chosen": -0.803012490272522, "logits/rejected": -0.7980927228927612, "logps/chosen": -82.73001861572266, "logps/rejected": -111.27703857421875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.6886202692985535, "rewards/margins": 23.198246002197266, "rewards/rejected": -22.509626388549805, "step": 3320 }, { "epoch": 1.52, "learning_rate": 1.643835616438356e-07, "logits/chosen": -0.8772886395454407, "logits/rejected": -0.8491265177726746, "logps/chosen": -85.08233642578125, "logps/rejected": -115.56292724609375, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.838422954082489, "rewards/margins": 24.703176498413086, "rewards/rejected": -23.864755630493164, "step": 3330 }, { "epoch": 1.52, "learning_rate": 1.6387620497209535e-07, "logits/chosen": -0.7757368683815002, "logits/rejected": -0.7173784971237183, "logps/chosen": -82.10543060302734, "logps/rejected": -111.90571594238281, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.7612966895103455, "rewards/margins": 23.784420013427734, "rewards/rejected": -23.023122787475586, "step": 3340 }, { "epoch": 1.53, "learning_rate": 1.6336884830035512e-07, "logits/chosen": -0.9086600542068481, "logits/rejected": -0.8849117159843445, "logps/chosen": -83.78245544433594, "logps/rejected": -115.2596206665039, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.6915544271469116, "rewards/margins": 23.850078582763672, "rewards/rejected": -23.158523559570312, "step": 3350 }, { "epoch": 1.53, "learning_rate": 1.6286149162861489e-07, "logits/chosen": -0.9009075164794922, "logits/rejected": -0.8365023732185364, "logps/chosen": -87.08536529541016, "logps/rejected": -118.95816802978516, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.35949334502220154, "rewards/margins": 24.53894805908203, "rewards/rejected": -24.179454803466797, "step": 3360 }, { "epoch": 1.54, "learning_rate": 1.6235413495687465e-07, "logits/chosen": -0.7893753051757812, "logits/rejected": -0.700376033782959, "logps/chosen": -83.50393676757812, "logps/rejected": -110.16971588134766, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.2407291829586029, "rewards/margins": 23.86244773864746, "rewards/rejected": -23.621719360351562, "step": 3370 }, { "epoch": 1.54, "learning_rate": 1.6184677828513442e-07, "logits/chosen": -0.8861139416694641, "logits/rejected": -0.8305456042289734, "logps/chosen": -85.4578628540039, "logps/rejected": -115.61383056640625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.0657522901892662, "rewards/margins": 24.47097396850586, "rewards/rejected": -24.40522003173828, "step": 3380 }, { "epoch": 1.55, "learning_rate": 1.613394216133942e-07, "logits/chosen": -0.9259065389633179, "logits/rejected": -0.8677760362625122, "logps/chosen": -81.14451599121094, "logps/rejected": -114.08638763427734, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1236779689788818, "rewards/margins": 25.24271583557129, "rewards/rejected": -24.119037628173828, "step": 3390 }, { "epoch": 1.55, "learning_rate": 1.6083206494165398e-07, "logits/chosen": -0.9464787244796753, "logits/rejected": -0.9007658958435059, "logps/chosen": -81.28665924072266, "logps/rejected": -113.91754150390625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.8432676196098328, "rewards/margins": 25.56879997253418, "rewards/rejected": -24.725528717041016, "step": 3400 }, { "epoch": 1.55, "eval_logits/chosen": -0.8611359000205994, "eval_logits/rejected": -0.7883932590484619, "eval_logps/chosen": -83.68134307861328, "eval_logps/rejected": -112.33548736572266, "eval_loss": 0.006425461731851101, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.928252637386322, "eval_rewards/margins": 24.665298461914062, "eval_rewards/rejected": -23.737045288085938, "eval_runtime": 72.9825, "eval_samples_per_second": 39.215, "eval_steps_per_second": 2.453, "step": 3400 }, { "epoch": 1.56, "learning_rate": 1.6032470826991375e-07, "logits/chosen": -0.849983811378479, "logits/rejected": -0.7683964371681213, "logps/chosen": -83.12248229980469, "logps/rejected": -114.29350280761719, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 2.212885618209839, "rewards/margins": 25.064769744873047, "rewards/rejected": -22.851879119873047, "step": 3410 }, { "epoch": 1.56, "learning_rate": 1.598173515981735e-07, "logits/chosen": -0.8216031789779663, "logits/rejected": -0.7773474454879761, "logps/chosen": -88.26131439208984, "logps/rejected": -109.80609130859375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 2.0204696655273438, "rewards/margins": 24.57611083984375, "rewards/rejected": -22.555639266967773, "step": 3420 }, { "epoch": 1.57, "learning_rate": 1.5930999492643328e-07, "logits/chosen": -0.9190078973770142, "logits/rejected": -0.8715893030166626, "logps/chosen": -84.56954956054688, "logps/rejected": -113.73841857910156, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.3089227676391602, "rewards/margins": 25.795129776000977, "rewards/rejected": -24.486209869384766, "step": 3430 }, { "epoch": 1.57, "learning_rate": 1.5880263825469305e-07, "logits/chosen": -0.9268477559089661, "logits/rejected": -0.8103355169296265, "logps/chosen": -83.41505432128906, "logps/rejected": -116.24625396728516, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.2660026550292969, "rewards/margins": 26.462406158447266, "rewards/rejected": -25.19640350341797, "step": 3440 }, { "epoch": 1.57, "learning_rate": 1.582952815829528e-07, "logits/chosen": -0.7966245412826538, "logits/rejected": -0.7210028767585754, "logps/chosen": -85.24122619628906, "logps/rejected": -118.64140319824219, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4916000366210938, "rewards/margins": 25.66861915588379, "rewards/rejected": -24.177021026611328, "step": 3450 }, { "epoch": 1.58, "learning_rate": 1.5778792491121258e-07, "logits/chosen": -0.9547828435897827, "logits/rejected": -0.9107531309127808, "logps/chosen": -84.8668441772461, "logps/rejected": -114.7540283203125, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2354485988616943, "rewards/margins": 26.283843994140625, "rewards/rejected": -25.048397064208984, "step": 3460 }, { "epoch": 1.58, "learning_rate": 1.5728056823947235e-07, "logits/chosen": -0.9525339007377625, "logits/rejected": -0.8235961198806763, "logps/chosen": -83.3685531616211, "logps/rejected": -113.2531509399414, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6874784231185913, "rewards/margins": 25.723840713500977, "rewards/rejected": -24.036357879638672, "step": 3470 }, { "epoch": 1.59, "learning_rate": 1.567732115677321e-07, "logits/chosen": -0.8695586919784546, "logits/rejected": -0.7993710041046143, "logps/chosen": -87.9699935913086, "logps/rejected": -113.51700592041016, "loss": 0.0063, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6940809488296509, "rewards/margins": 24.42647933959961, "rewards/rejected": -23.732397079467773, "step": 3480 }, { "epoch": 1.59, "learning_rate": 1.5626585489599188e-07, "logits/chosen": -0.8067137598991394, "logits/rejected": -0.7603528499603271, "logps/chosen": -89.75978088378906, "logps/rejected": -124.76887512207031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7332732677459717, "rewards/margins": 27.42246437072754, "rewards/rejected": -25.689189910888672, "step": 3490 }, { "epoch": 1.6, "learning_rate": 1.5575849822425165e-07, "logits/chosen": -0.9085081815719604, "logits/rejected": -0.8620240092277527, "logps/chosen": -82.04761505126953, "logps/rejected": -117.4197006225586, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.5324010848999023, "rewards/margins": 25.046289443969727, "rewards/rejected": -24.513887405395508, "step": 3500 }, { "epoch": 1.6, "eval_logits/chosen": -0.875246524810791, "eval_logits/rejected": -0.7964990139007568, "eval_logps/chosen": -83.70887756347656, "eval_logps/rejected": -114.64440155029297, "eval_loss": 0.00680342735722661, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9144880175590515, "eval_rewards/margins": 25.805988311767578, "eval_rewards/rejected": -24.891504287719727, "eval_runtime": 99.4145, "eval_samples_per_second": 28.789, "eval_steps_per_second": 1.801, "step": 3500 }, { "epoch": 1.6, "learning_rate": 1.552511415525114e-07, "logits/chosen": -0.9048668146133423, "logits/rejected": -0.882061779499054, "logps/chosen": -85.49907684326172, "logps/rejected": -117.0702133178711, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.2163362205028534, "rewards/margins": 25.046667098999023, "rewards/rejected": -25.263004302978516, "step": 3510 }, { "epoch": 1.61, "learning_rate": 1.5474378488077118e-07, "logits/chosen": -0.9017683863639832, "logits/rejected": -0.8488256335258484, "logps/chosen": -91.98550415039062, "logps/rejected": -121.7437973022461, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.6241726875305176, "rewards/margins": 25.137426376342773, "rewards/rejected": -24.513254165649414, "step": 3520 }, { "epoch": 1.61, "learning_rate": 1.5423642820903095e-07, "logits/chosen": -0.8245092630386353, "logits/rejected": -0.791895866394043, "logps/chosen": -83.31465148925781, "logps/rejected": -118.2453842163086, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7763601541519165, "rewards/margins": 26.058420181274414, "rewards/rejected": -24.282062530517578, "step": 3530 }, { "epoch": 1.62, "learning_rate": 1.537290715372907e-07, "logits/chosen": -0.953994870185852, "logits/rejected": -0.8880467414855957, "logps/chosen": -86.09355163574219, "logps/rejected": -112.99943542480469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.6044597625732422, "rewards/margins": 25.4656982421875, "rewards/rejected": -23.861236572265625, "step": 3540 }, { "epoch": 1.62, "learning_rate": 1.5322171486555048e-07, "logits/chosen": -0.9053792953491211, "logits/rejected": -0.838627815246582, "logps/chosen": -90.17684936523438, "logps/rejected": -120.52122497558594, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 2.3521316051483154, "rewards/margins": 28.084697723388672, "rewards/rejected": -25.73256492614746, "step": 3550 }, { "epoch": 1.63, "learning_rate": 1.5271435819381025e-07, "logits/chosen": -0.9697454571723938, "logits/rejected": -0.8927943110466003, "logps/chosen": -86.13640594482422, "logps/rejected": -116.8769302368164, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8283834457397461, "rewards/margins": 25.841808319091797, "rewards/rejected": -25.013423919677734, "step": 3560 }, { "epoch": 1.63, "learning_rate": 1.5220700152207e-07, "logits/chosen": -0.8806090354919434, "logits/rejected": -0.8488904237747192, "logps/chosen": -90.3905029296875, "logps/rejected": -119.23170471191406, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9313351511955261, "rewards/margins": 25.077367782592773, "rewards/rejected": -24.146032333374023, "step": 3570 }, { "epoch": 1.63, "learning_rate": 1.5169964485032978e-07, "logits/chosen": -0.9907437562942505, "logits/rejected": -0.9438600540161133, "logps/chosen": -86.11767578125, "logps/rejected": -126.03318786621094, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.6631255745887756, "rewards/margins": 26.9681396484375, "rewards/rejected": -26.30501365661621, "step": 3580 }, { "epoch": 1.64, "learning_rate": 1.5119228817858955e-07, "logits/chosen": -0.9619858860969543, "logits/rejected": -0.8756266832351685, "logps/chosen": -82.71698760986328, "logps/rejected": -119.3367691040039, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 1.6457278728485107, "rewards/margins": 26.794042587280273, "rewards/rejected": -25.148311614990234, "step": 3590 }, { "epoch": 1.64, "learning_rate": 1.506849315068493e-07, "logits/chosen": -0.8155440092086792, "logits/rejected": -0.7831107378005981, "logps/chosen": -85.99327087402344, "logps/rejected": -113.37886047363281, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 1.6102584600448608, "rewards/margins": 24.855575561523438, "rewards/rejected": -23.245319366455078, "step": 3600 }, { "epoch": 1.64, "eval_logits/chosen": -0.8784149885177612, "eval_logits/rejected": -0.8029141426086426, "eval_logps/chosen": -83.55583190917969, "eval_logps/rejected": -113.30615997314453, "eval_loss": 0.006304467096924782, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9910123348236084, "eval_rewards/margins": 25.21339225769043, "eval_rewards/rejected": -24.22237777709961, "eval_runtime": 67.7796, "eval_samples_per_second": 42.225, "eval_steps_per_second": 2.641, "step": 3600 }, { "epoch": 1.65, "learning_rate": 1.5017757483510908e-07, "logits/chosen": -0.8234073519706726, "logits/rejected": -0.8046862483024597, "logps/chosen": -82.49821472167969, "logps/rejected": -117.84230041503906, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.6039770841598511, "rewards/margins": 24.839069366455078, "rewards/rejected": -24.235092163085938, "step": 3610 }, { "epoch": 1.65, "learning_rate": 1.4967021816336885e-07, "logits/chosen": -0.8637280464172363, "logits/rejected": -0.8385387659072876, "logps/chosen": -80.1966323852539, "logps/rejected": -112.7997817993164, "loss": 0.0037, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6764703989028931, "rewards/margins": 25.432544708251953, "rewards/rejected": -24.756074905395508, "step": 3620 }, { "epoch": 1.66, "learning_rate": 1.491628614916286e-07, "logits/chosen": -0.9332625269889832, "logits/rejected": -0.8725907206535339, "logps/chosen": -85.28267669677734, "logps/rejected": -120.61589050292969, "loss": 0.006, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.753173828125, "rewards/margins": 25.59710121154785, "rewards/rejected": -24.84392547607422, "step": 3630 }, { "epoch": 1.66, "learning_rate": 1.4865550481988838e-07, "logits/chosen": -0.8927936553955078, "logits/rejected": -0.8743786811828613, "logps/chosen": -77.79920959472656, "logps/rejected": -112.90252685546875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.799877941608429, "rewards/margins": 24.70957374572754, "rewards/rejected": -23.90969467163086, "step": 3640 }, { "epoch": 1.67, "learning_rate": 1.4814814814814815e-07, "logits/chosen": -1.0039594173431396, "logits/rejected": -0.8558320999145508, "logps/chosen": -81.33214569091797, "logps/rejected": -109.74530029296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.07261955738067627, "rewards/margins": 23.961139678955078, "rewards/rejected": -23.888517379760742, "step": 3650 }, { "epoch": 1.67, "learning_rate": 1.476407914764079e-07, "logits/chosen": -0.9197986721992493, "logits/rejected": -0.8760434985160828, "logps/chosen": -89.37728118896484, "logps/rejected": -118.18058776855469, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.11399469524621964, "rewards/margins": 24.953617095947266, "rewards/rejected": -25.06761360168457, "step": 3660 }, { "epoch": 1.68, "learning_rate": 1.4713343480466768e-07, "logits/chosen": -0.8881243467330933, "logits/rejected": -0.824755847454071, "logps/chosen": -85.07624816894531, "logps/rejected": -117.46966552734375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3701117038726807, "rewards/margins": 26.875885009765625, "rewards/rejected": -25.505775451660156, "step": 3670 }, { "epoch": 1.68, "learning_rate": 1.4662607813292745e-07, "logits/chosen": -0.9610374569892883, "logits/rejected": -0.894203782081604, "logps/chosen": -87.59607696533203, "logps/rejected": -114.97966003417969, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.8766714930534363, "rewards/margins": 25.969364166259766, "rewards/rejected": -25.092693328857422, "step": 3680 }, { "epoch": 1.68, "learning_rate": 1.461187214611872e-07, "logits/chosen": -0.8563788533210754, "logits/rejected": -0.8630868196487427, "logps/chosen": -88.29237365722656, "logps/rejected": -118.44969177246094, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 0.7286855578422546, "rewards/margins": 26.557199478149414, "rewards/rejected": -25.82851219177246, "step": 3690 }, { "epoch": 1.69, "learning_rate": 1.4561136478944698e-07, "logits/chosen": -0.8998085260391235, "logits/rejected": -0.8663345575332642, "logps/chosen": -84.99273681640625, "logps/rejected": -122.40765380859375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 1.7066619396209717, "rewards/margins": 27.883495330810547, "rewards/rejected": -26.176837921142578, "step": 3700 }, { "epoch": 1.69, "eval_logits/chosen": -0.9141598343849182, "eval_logits/rejected": -0.8334181904792786, "eval_logps/chosen": -84.20160675048828, "eval_logps/rejected": -115.98404693603516, "eval_loss": 0.006939805578440428, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.6681216359138489, "eval_rewards/margins": 26.229442596435547, "eval_rewards/rejected": -25.561321258544922, "eval_runtime": 63.3927, "eval_samples_per_second": 45.147, "eval_steps_per_second": 2.824, "step": 3700 }, { "epoch": 1.69, "learning_rate": 1.4510400811770675e-07, "logits/chosen": -0.9376422166824341, "logits/rejected": -0.8984735608100891, "logps/chosen": -88.30086517333984, "logps/rejected": -119.71275329589844, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.101771593093872, "rewards/margins": 26.530920028686523, "rewards/rejected": -25.429149627685547, "step": 3710 }, { "epoch": 1.7, "learning_rate": 1.445966514459665e-07, "logits/chosen": -0.9134553670883179, "logits/rejected": -0.8494185209274292, "logps/chosen": -84.12943267822266, "logps/rejected": -121.0509033203125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.8017365336418152, "rewards/margins": 27.020517349243164, "rewards/rejected": -26.218780517578125, "step": 3720 }, { "epoch": 1.7, "learning_rate": 1.4408929477422628e-07, "logits/chosen": -0.9076977968215942, "logits/rejected": -0.8381573557853699, "logps/chosen": -88.46109008789062, "logps/rejected": -120.51727294921875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.41265106201171875, "rewards/margins": 25.993633270263672, "rewards/rejected": -25.580984115600586, "step": 3730 }, { "epoch": 1.71, "learning_rate": 1.4358193810248604e-07, "logits/chosen": -0.9492243528366089, "logits/rejected": -0.9089414477348328, "logps/chosen": -81.46710205078125, "logps/rejected": -116.9715576171875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.21577057242393494, "rewards/margins": 26.476455688476562, "rewards/rejected": -26.26068687438965, "step": 3740 }, { "epoch": 1.71, "learning_rate": 1.430745814307458e-07, "logits/chosen": -0.9978575706481934, "logits/rejected": -0.9076077342033386, "logps/chosen": -88.85905456542969, "logps/rejected": -119.39497375488281, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.9890228509902954, "rewards/margins": 26.75762367248535, "rewards/rejected": -25.768600463867188, "step": 3750 }, { "epoch": 1.72, "learning_rate": 1.4256722475900558e-07, "logits/chosen": -0.8967350721359253, "logits/rejected": -0.821501612663269, "logps/chosen": -81.39857482910156, "logps/rejected": -114.3400650024414, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.15209658443927765, "rewards/margins": 25.085054397583008, "rewards/rejected": -24.93295669555664, "step": 3760 }, { "epoch": 1.72, "learning_rate": 1.4205986808726534e-07, "logits/chosen": -0.9509645700454712, "logits/rejected": -0.8750587701797485, "logps/chosen": -84.42094421386719, "logps/rejected": -130.0999755859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.0962730646133423, "rewards/margins": 28.398696899414062, "rewards/rejected": -27.302425384521484, "step": 3770 }, { "epoch": 1.73, "learning_rate": 1.415525114155251e-07, "logits/chosen": -0.9023697972297668, "logits/rejected": -0.9625568389892578, "logps/chosen": -90.93963623046875, "logps/rejected": -120.6771469116211, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.317933440208435, "rewards/margins": 27.938594818115234, "rewards/rejected": -26.62066650390625, "step": 3780 }, { "epoch": 1.73, "learning_rate": 1.4104515474378488e-07, "logits/chosen": -0.8431658744812012, "logits/rejected": -0.8656299710273743, "logps/chosen": -84.60992431640625, "logps/rejected": -117.33194732666016, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19338905811309814, "rewards/margins": 25.269033432006836, "rewards/rejected": -25.075647354125977, "step": 3790 }, { "epoch": 1.73, "learning_rate": 1.4053779807204464e-07, "logits/chosen": -0.9680309295654297, "logits/rejected": -0.9091488718986511, "logps/chosen": -87.53936004638672, "logps/rejected": -113.69287109375, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5028787851333618, "rewards/margins": 25.9219913482666, "rewards/rejected": -24.419111251831055, "step": 3800 }, { "epoch": 1.73, "eval_logits/chosen": -0.9016574621200562, "eval_logits/rejected": -0.8256176114082336, "eval_logps/chosen": -83.64620971679688, "eval_logps/rejected": -114.05230712890625, "eval_loss": 0.006408516317605972, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.9458239078521729, "eval_rewards/margins": 25.541276931762695, "eval_rewards/rejected": -24.595455169677734, "eval_runtime": 66.5039, "eval_samples_per_second": 43.035, "eval_steps_per_second": 2.692, "step": 3800 }, { "epoch": 1.74, "learning_rate": 1.400304414003044e-07, "logits/chosen": -0.9558550119400024, "logits/rejected": -0.8679558634757996, "logps/chosen": -82.83837890625, "logps/rejected": -114.90482330322266, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.4327557682991028, "rewards/margins": 25.373950958251953, "rewards/rejected": -24.941192626953125, "step": 3810 }, { "epoch": 1.74, "learning_rate": 1.3952308472856418e-07, "logits/chosen": -0.8897542953491211, "logits/rejected": -0.8644717931747437, "logps/chosen": -88.68601989746094, "logps/rejected": -116.8119888305664, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.15823300182819366, "rewards/margins": 25.815906524658203, "rewards/rejected": -25.974136352539062, "step": 3820 }, { "epoch": 1.75, "learning_rate": 1.3901572805682394e-07, "logits/chosen": -0.8622309565544128, "logits/rejected": -0.8281264305114746, "logps/chosen": -89.28263092041016, "logps/rejected": -122.96022033691406, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.5797398090362549, "rewards/margins": 27.29312515258789, "rewards/rejected": -25.7133846282959, "step": 3830 }, { "epoch": 1.75, "learning_rate": 1.385083713850837e-07, "logits/chosen": -0.8999799489974976, "logits/rejected": -0.8458458185195923, "logps/chosen": -86.58378601074219, "logps/rejected": -115.96832275390625, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 1.054229497909546, "rewards/margins": 25.937957763671875, "rewards/rejected": -24.883729934692383, "step": 3840 }, { "epoch": 1.76, "learning_rate": 1.3800101471334348e-07, "logits/chosen": -0.8850622177124023, "logits/rejected": -0.8627168536186218, "logps/chosen": -84.57807922363281, "logps/rejected": -112.39884948730469, "loss": 0.0057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08325471729040146, "rewards/margins": 24.015729904174805, "rewards/rejected": -24.098987579345703, "step": 3850 }, { "epoch": 1.76, "learning_rate": 1.3749365804160324e-07, "logits/chosen": -0.8272354006767273, "logits/rejected": -0.7512696981430054, "logps/chosen": -84.18182373046875, "logps/rejected": -114.89219665527344, "loss": 0.0035, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7288559675216675, "rewards/margins": 25.41353988647461, "rewards/rejected": -24.684682846069336, "step": 3860 }, { "epoch": 1.77, "learning_rate": 1.36986301369863e-07, "logits/chosen": -0.7898057699203491, "logits/rejected": -0.8020504713058472, "logps/chosen": -83.73123168945312, "logps/rejected": -118.38655090332031, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.9885632991790771, "rewards/margins": 28.37349510192871, "rewards/rejected": -26.384933471679688, "step": 3870 }, { "epoch": 1.77, "learning_rate": 1.3647894469812278e-07, "logits/chosen": -0.8614355325698853, "logits/rejected": -0.8076695203781128, "logps/chosen": -83.43045043945312, "logps/rejected": -116.3105697631836, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9053701162338257, "rewards/margins": 26.96212387084961, "rewards/rejected": -25.0567569732666, "step": 3880 }, { "epoch": 1.78, "learning_rate": 1.3597158802638254e-07, "logits/chosen": -0.8757370710372925, "logits/rejected": -0.858729362487793, "logps/chosen": -85.84932708740234, "logps/rejected": -123.9087905883789, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7146657705307007, "rewards/margins": 28.85630226135254, "rewards/rejected": -27.141637802124023, "step": 3890 }, { "epoch": 1.78, "learning_rate": 1.354642313546423e-07, "logits/chosen": -0.8456228971481323, "logits/rejected": -0.786232054233551, "logps/chosen": -80.6098403930664, "logps/rejected": -113.07496643066406, "loss": 0.0041, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9995010495185852, "rewards/margins": 25.73309326171875, "rewards/rejected": -24.733591079711914, "step": 3900 }, { "epoch": 1.78, "eval_logits/chosen": -0.8836355209350586, "eval_logits/rejected": -0.8069778680801392, "eval_logps/chosen": -83.1183853149414, "eval_logps/rejected": -114.56892395019531, "eval_loss": 0.0067343455739319324, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.209733486175537, "eval_rewards/margins": 26.063493728637695, "eval_rewards/rejected": -24.853761672973633, "eval_runtime": 76.8714, "eval_samples_per_second": 37.231, "eval_steps_per_second": 2.329, "step": 3900 }, { "epoch": 1.78, "learning_rate": 1.3495687468290208e-07, "logits/chosen": -0.9082708358764648, "logits/rejected": -0.8267936706542969, "logps/chosen": -87.33113861083984, "logps/rejected": -115.68280029296875, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5674334764480591, "rewards/margins": 25.797527313232422, "rewards/rejected": -25.2300968170166, "step": 3910 }, { "epoch": 1.79, "learning_rate": 1.3444951801116184e-07, "logits/chosen": -0.8897479176521301, "logits/rejected": -0.8622371554374695, "logps/chosen": -87.61221313476562, "logps/rejected": -119.82197570800781, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.505795955657959, "rewards/margins": 26.55779457092285, "rewards/rejected": -26.051998138427734, "step": 3920 }, { "epoch": 1.79, "learning_rate": 1.339421613394216e-07, "logits/chosen": -0.9741457104682922, "logits/rejected": -0.8766289949417114, "logps/chosen": -87.72877502441406, "logps/rejected": -121.0351333618164, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 1.0175672769546509, "rewards/margins": 27.11517333984375, "rewards/rejected": -26.097606658935547, "step": 3930 }, { "epoch": 1.8, "learning_rate": 1.3343480466768138e-07, "logits/chosen": -0.9145620465278625, "logits/rejected": -0.8341091275215149, "logps/chosen": -82.79689025878906, "logps/rejected": -113.12298583984375, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.01173012237995863, "rewards/margins": 24.762723922729492, "rewards/rejected": -24.77445411682129, "step": 3940 }, { "epoch": 1.8, "learning_rate": 1.3292744799594114e-07, "logits/chosen": -0.8479018211364746, "logits/rejected": -0.8104110956192017, "logps/chosen": -86.39948272705078, "logps/rejected": -116.41688537597656, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.121065139770508, "rewards/margins": 26.785120010375977, "rewards/rejected": -24.664052963256836, "step": 3950 }, { "epoch": 1.81, "learning_rate": 1.324200913242009e-07, "logits/chosen": -0.8203024864196777, "logits/rejected": -0.7483721971511841, "logps/chosen": -82.69932556152344, "logps/rejected": -118.39974212646484, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.7221908569335938, "rewards/margins": 27.958471298217773, "rewards/rejected": -26.236278533935547, "step": 3960 }, { "epoch": 1.81, "learning_rate": 1.3191273465246068e-07, "logits/chosen": -0.9186090230941772, "logits/rejected": -0.8178736567497253, "logps/chosen": -87.22188568115234, "logps/rejected": -119.90480041503906, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6769983768463135, "rewards/margins": 27.869892120361328, "rewards/rejected": -25.192890167236328, "step": 3970 }, { "epoch": 1.82, "learning_rate": 1.3140537798072044e-07, "logits/chosen": -0.8046373128890991, "logits/rejected": -0.7908083200454712, "logps/chosen": -83.47384643554688, "logps/rejected": -109.45014953613281, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 2.221688747406006, "rewards/margins": 25.10355567932129, "rewards/rejected": -22.881866455078125, "step": 3980 }, { "epoch": 1.82, "learning_rate": 1.308980213089802e-07, "logits/chosen": -0.8663067817687988, "logits/rejected": -0.8282343745231628, "logps/chosen": -82.12123107910156, "logps/rejected": -113.12467956542969, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 2.3237030506134033, "rewards/margins": 24.891172409057617, "rewards/rejected": -22.56747055053711, "step": 3990 }, { "epoch": 1.83, "learning_rate": 1.3039066463723998e-07, "logits/chosen": -0.8992531895637512, "logits/rejected": -0.8389276266098022, "logps/chosen": -84.60519409179688, "logps/rejected": -114.64949798583984, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.048742413520813, "rewards/margins": 26.16281509399414, "rewards/rejected": -25.114072799682617, "step": 4000 }, { "epoch": 1.83, "eval_logits/chosen": -0.8783338069915771, "eval_logits/rejected": -0.8036257028579712, "eval_logps/chosen": -82.68094635009766, "eval_logps/rejected": -113.65660858154297, "eval_loss": 0.0062260739505290985, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.4284573793411255, "eval_rewards/margins": 25.826065063476562, "eval_rewards/rejected": -24.397605895996094, "eval_runtime": 63.7559, "eval_samples_per_second": 44.89, "eval_steps_per_second": 2.808, "step": 4000 }, { "epoch": 1.83, "learning_rate": 1.2988330796549974e-07, "logits/chosen": -0.894163966178894, "logits/rejected": -0.8875846862792969, "logps/chosen": -86.4601821899414, "logps/rejected": -115.12332916259766, "loss": 0.0065, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.6220238208770752, "rewards/margins": 25.81390953063965, "rewards/rejected": -24.19188690185547, "step": 4010 }, { "epoch": 1.83, "learning_rate": 1.293759512937595e-07, "logits/chosen": -0.8875330686569214, "logits/rejected": -0.8312565684318542, "logps/chosen": -88.39109802246094, "logps/rejected": -119.79731750488281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.05904974788427353, "rewards/margins": 24.40655517578125, "rewards/rejected": -24.46560287475586, "step": 4020 }, { "epoch": 1.84, "learning_rate": 1.2886859462201928e-07, "logits/chosen": -0.9583679437637329, "logits/rejected": -0.8589159846305847, "logps/chosen": -87.69874572753906, "logps/rejected": -114.6513900756836, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.5733067989349365, "rewards/margins": 25.790552139282227, "rewards/rejected": -24.217248916625977, "step": 4030 }, { "epoch": 1.84, "learning_rate": 1.2836123795027904e-07, "logits/chosen": -0.8350761532783508, "logits/rejected": -0.8028010129928589, "logps/chosen": -81.8393783569336, "logps/rejected": -115.5448989868164, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1797157526016235, "rewards/margins": 26.374853134155273, "rewards/rejected": -25.195138931274414, "step": 4040 }, { "epoch": 1.85, "learning_rate": 1.278538812785388e-07, "logits/chosen": -0.879655659198761, "logits/rejected": -0.8751519918441772, "logps/chosen": -90.0999526977539, "logps/rejected": -118.6465072631836, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.742595911026001, "rewards/margins": 27.128515243530273, "rewards/rejected": -25.385921478271484, "step": 4050 }, { "epoch": 1.85, "learning_rate": 1.2734652460679858e-07, "logits/chosen": -0.9097345471382141, "logits/rejected": -0.8761495351791382, "logps/chosen": -86.26795959472656, "logps/rejected": -118.676025390625, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 1.8561265468597412, "rewards/margins": 26.486553192138672, "rewards/rejected": -24.63042640686035, "step": 4060 }, { "epoch": 1.86, "learning_rate": 1.2683916793505834e-07, "logits/chosen": -0.8175376653671265, "logits/rejected": -0.7447512149810791, "logps/chosen": -80.61178588867188, "logps/rejected": -117.44854736328125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 2.017381429672241, "rewards/margins": 26.004596710205078, "rewards/rejected": -23.98721694946289, "step": 4070 }, { "epoch": 1.86, "learning_rate": 1.263318112633181e-07, "logits/chosen": -0.8578559756278992, "logits/rejected": -0.7857792973518372, "logps/chosen": -87.51000213623047, "logps/rejected": -114.0421142578125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.8245757818222046, "rewards/margins": 24.864219665527344, "rewards/rejected": -24.03964614868164, "step": 4080 }, { "epoch": 1.87, "learning_rate": 1.2582445459157788e-07, "logits/chosen": -0.8620840907096863, "logits/rejected": -0.7266338467597961, "logps/chosen": -80.26228332519531, "logps/rejected": -112.8070068359375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 1.7275581359863281, "rewards/margins": 25.295616149902344, "rewards/rejected": -23.568058013916016, "step": 4090 }, { "epoch": 1.87, "learning_rate": 1.2531709791983764e-07, "logits/chosen": -0.9016240835189819, "logits/rejected": -0.8646566271781921, "logps/chosen": -89.81653594970703, "logps/rejected": -118.0747299194336, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 1.3270467519760132, "rewards/margins": 25.991525650024414, "rewards/rejected": -24.664478302001953, "step": 4100 }, { "epoch": 1.87, "eval_logits/chosen": -0.8751200437545776, "eval_logits/rejected": -0.7990096211433411, "eval_logps/chosen": -82.98982238769531, "eval_logps/rejected": -113.60397338867188, "eval_loss": 0.006390063092112541, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 1.2740201950073242, "eval_rewards/margins": 25.645313262939453, "eval_rewards/rejected": -24.371292114257812, "eval_runtime": 93.3131, "eval_samples_per_second": 30.671, "eval_steps_per_second": 1.918, "step": 4100 }, { "epoch": 1.88, "learning_rate": 1.248097412480974e-07, "logits/chosen": -1.012479543685913, "logits/rejected": -0.8951042294502258, "logps/chosen": -89.95454406738281, "logps/rejected": -120.26151275634766, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.4968180656433105, "rewards/margins": 27.859487533569336, "rewards/rejected": -25.362668991088867, "step": 4110 }, { "epoch": 1.88, "learning_rate": 1.2430238457635718e-07, "logits/chosen": -0.8006552457809448, "logits/rejected": -0.7851511240005493, "logps/chosen": -86.0716552734375, "logps/rejected": -115.94569396972656, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 1.7579014301300049, "rewards/margins": 25.864261627197266, "rewards/rejected": -24.106359481811523, "step": 4120 }, { "epoch": 1.89, "learning_rate": 1.2379502790461694e-07, "logits/chosen": -0.8675606846809387, "logits/rejected": -0.8367154002189636, "logps/chosen": -83.247802734375, "logps/rejected": -115.09774017333984, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.06647715717554092, "rewards/margins": 24.646516799926758, "rewards/rejected": -24.580041885375977, "step": 4130 }, { "epoch": 1.89, "learning_rate": 1.232876712328767e-07, "logits/chosen": -0.8167160749435425, "logits/rejected": -0.80632483959198, "logps/chosen": -88.31963348388672, "logps/rejected": -118.49552917480469, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 1.2290937900543213, "rewards/margins": 26.213436126708984, "rewards/rejected": -24.984344482421875, "step": 4140 }, { "epoch": 1.89, "learning_rate": 1.2278031456113648e-07, "logits/chosen": -0.9386017918586731, "logits/rejected": -0.8482101559638977, "logps/chosen": -89.70311737060547, "logps/rejected": -117.4471206665039, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7219051122665405, "rewards/margins": 25.45486831665039, "rewards/rejected": -23.732961654663086, "step": 4150 }, { "epoch": 1.9, "learning_rate": 1.2227295788939624e-07, "logits/chosen": -0.8697861433029175, "logits/rejected": -0.7494013905525208, "logps/chosen": -85.14886474609375, "logps/rejected": -122.1308364868164, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3808021545410156, "rewards/margins": 27.002914428710938, "rewards/rejected": -25.622112274169922, "step": 4160 }, { "epoch": 1.9, "learning_rate": 1.21765601217656e-07, "logits/chosen": -0.8890932202339172, "logits/rejected": -0.864509105682373, "logps/chosen": -80.30211639404297, "logps/rejected": -120.65213775634766, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.640447974205017, "rewards/margins": 26.37517738342285, "rewards/rejected": -24.734729766845703, "step": 4170 }, { "epoch": 1.91, "learning_rate": 1.2125824454591578e-07, "logits/chosen": -0.8559309840202332, "logits/rejected": -0.8733586072921753, "logps/chosen": -90.32381439208984, "logps/rejected": -114.02840423583984, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 1.5059894323349, "rewards/margins": 25.90911865234375, "rewards/rejected": -24.40313148498535, "step": 4180 }, { "epoch": 1.91, "learning_rate": 1.2075088787417554e-07, "logits/chosen": -0.7873523235321045, "logits/rejected": -0.8265848159790039, "logps/chosen": -84.38890075683594, "logps/rejected": -116.04801940917969, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 1.9120607376098633, "rewards/margins": 26.35770034790039, "rewards/rejected": -24.445636749267578, "step": 4190 }, { "epoch": 1.92, "learning_rate": 1.202435312024353e-07, "logits/chosen": -0.8989976644515991, "logits/rejected": -0.8107091188430786, "logps/chosen": -82.97294616699219, "logps/rejected": -113.08280944824219, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.4709131717681885, "rewards/margins": 23.579673767089844, "rewards/rejected": -23.108760833740234, "step": 4200 }, { "epoch": 1.92, "eval_logits/chosen": -0.8626413941383362, "eval_logits/rejected": -0.7886302471160889, "eval_logps/chosen": -83.84310150146484, "eval_logps/rejected": -112.74535369873047, "eval_loss": 0.0062231095507740974, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.847379744052887, "eval_rewards/margins": 24.78935432434082, "eval_rewards/rejected": -23.941972732543945, "eval_runtime": 80.9092, "eval_samples_per_second": 35.373, "eval_steps_per_second": 2.212, "step": 4200 }, { "epoch": 1.92, "learning_rate": 1.1973617453069508e-07, "logits/chosen": -0.8819769024848938, "logits/rejected": -0.8112877607345581, "logps/chosen": -83.00496673583984, "logps/rejected": -114.4212875366211, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.11636953055858612, "rewards/margins": 24.707599639892578, "rewards/rejected": -24.8239688873291, "step": 4210 }, { "epoch": 1.93, "learning_rate": 1.1922881785895484e-07, "logits/chosen": -0.8796902894973755, "logits/rejected": -0.8617744445800781, "logps/chosen": -88.17607116699219, "logps/rejected": -122.41082763671875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.7744562029838562, "rewards/margins": 25.8326473236084, "rewards/rejected": -25.058191299438477, "step": 4220 }, { "epoch": 1.93, "learning_rate": 1.187214611872146e-07, "logits/chosen": -0.9677571058273315, "logits/rejected": -0.8670059442520142, "logps/chosen": -92.79700469970703, "logps/rejected": -123.5301513671875, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.26532265543937683, "rewards/margins": 25.150630950927734, "rewards/rejected": -25.415952682495117, "step": 4230 }, { "epoch": 1.94, "learning_rate": 1.1821410451547436e-07, "logits/chosen": -0.8243002891540527, "logits/rejected": -0.7622770071029663, "logps/chosen": -82.1230697631836, "logps/rejected": -116.01241302490234, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 1.8827035427093506, "rewards/margins": 26.27325439453125, "rewards/rejected": -24.390552520751953, "step": 4240 }, { "epoch": 1.94, "learning_rate": 1.1770674784373413e-07, "logits/chosen": -0.8179371953010559, "logits/rejected": -0.7521147131919861, "logps/chosen": -79.74634552001953, "logps/rejected": -114.4630126953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.15621128678321838, "rewards/margins": 24.31221580505371, "rewards/rejected": -24.156003952026367, "step": 4250 }, { "epoch": 1.94, "learning_rate": 1.171993911719939e-07, "logits/chosen": -0.9867182970046997, "logits/rejected": -0.8832413554191589, "logps/chosen": -85.32367706298828, "logps/rejected": -119.32315826416016, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.011179482564330101, "rewards/margins": 25.743982315063477, "rewards/rejected": -25.75516128540039, "step": 4260 }, { "epoch": 1.95, "learning_rate": 1.1669203450025366e-07, "logits/chosen": -0.7901066541671753, "logits/rejected": -0.7551292181015015, "logps/chosen": -79.58982849121094, "logps/rejected": -114.67451477050781, "loss": 0.0068, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5268216133117676, "rewards/margins": 23.880836486816406, "rewards/rejected": -23.354015350341797, "step": 4270 }, { "epoch": 1.95, "learning_rate": 1.1618467782851343e-07, "logits/chosen": -0.8936311602592468, "logits/rejected": -0.8211982846260071, "logps/chosen": -82.46665954589844, "logps/rejected": -119.68986511230469, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.9696955680847168, "rewards/margins": 25.855365753173828, "rewards/rejected": -24.88566780090332, "step": 4280 }, { "epoch": 1.96, "learning_rate": 1.156773211567732e-07, "logits/chosen": -0.8227362632751465, "logits/rejected": -0.7342718839645386, "logps/chosen": -83.89360809326172, "logps/rejected": -117.38250732421875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.834515929222107, "rewards/margins": 27.02130126953125, "rewards/rejected": -25.186786651611328, "step": 4290 }, { "epoch": 1.96, "learning_rate": 1.1516996448503296e-07, "logits/chosen": -0.9188385009765625, "logits/rejected": -0.8470960855484009, "logps/chosen": -83.9849624633789, "logps/rejected": -120.4980239868164, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.40226873755455017, "rewards/margins": 26.013219833374023, "rewards/rejected": -25.610950469970703, "step": 4300 }, { "epoch": 1.96, "eval_logits/chosen": -0.8774282932281494, "eval_logits/rejected": -0.8064822554588318, "eval_logps/chosen": -83.99234008789062, "eval_logps/rejected": -113.08182525634766, "eval_loss": 0.006075400393456221, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.7727564573287964, "eval_rewards/margins": 24.882965087890625, "eval_rewards/rejected": -24.110210418701172, "eval_runtime": 77.6038, "eval_samples_per_second": 36.88, "eval_steps_per_second": 2.307, "step": 4300 }, { "epoch": 1.97, "learning_rate": 1.1466260781329273e-07, "logits/chosen": -0.8826113939285278, "logits/rejected": -0.7901960611343384, "logps/chosen": -88.16101837158203, "logps/rejected": -117.04609680175781, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6930155158042908, "rewards/margins": 25.547672271728516, "rewards/rejected": -24.854656219482422, "step": 4310 }, { "epoch": 1.97, "learning_rate": 1.141552511415525e-07, "logits/chosen": -0.8641761541366577, "logits/rejected": -0.8362469673156738, "logps/chosen": -88.54991149902344, "logps/rejected": -116.129150390625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.7447670102119446, "rewards/margins": 25.9591007232666, "rewards/rejected": -25.214338302612305, "step": 4320 }, { "epoch": 1.98, "learning_rate": 1.1364789446981226e-07, "logits/chosen": -0.8864119648933411, "logits/rejected": -0.8219255208969116, "logps/chosen": -90.05606842041016, "logps/rejected": -119.87153625488281, "loss": 0.0062, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8769302368164062, "rewards/margins": 26.760021209716797, "rewards/rejected": -25.88309669494629, "step": 4330 }, { "epoch": 1.98, "learning_rate": 1.1314053779807203e-07, "logits/chosen": -0.9568690061569214, "logits/rejected": -0.9261384010314941, "logps/chosen": -85.25032043457031, "logps/rejected": -116.95954895019531, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.3308374285697937, "rewards/margins": 25.455656051635742, "rewards/rejected": -25.12481689453125, "step": 4340 }, { "epoch": 1.99, "learning_rate": 1.126331811263318e-07, "logits/chosen": -0.9497167468070984, "logits/rejected": -0.8359603881835938, "logps/chosen": -82.88484191894531, "logps/rejected": -113.48707580566406, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.22573499381542206, "rewards/margins": 24.359251022338867, "rewards/rejected": -24.58498764038086, "step": 4350 }, { "epoch": 1.99, "learning_rate": 1.1212582445459156e-07, "logits/chosen": -0.8375424146652222, "logits/rejected": -0.8085098266601562, "logps/chosen": -91.62086486816406, "logps/rejected": -118.6721420288086, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.29354745149612427, "rewards/margins": 26.114770889282227, "rewards/rejected": -25.82122230529785, "step": 4360 }, { "epoch": 1.99, "learning_rate": 1.1161846778285133e-07, "logits/chosen": -0.862596869468689, "logits/rejected": -0.7966786623001099, "logps/chosen": -80.2033462524414, "logps/rejected": -115.92503356933594, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 0.1859007626771927, "rewards/margins": 23.71189308166504, "rewards/rejected": -23.525989532470703, "step": 4370 }, { "epoch": 2.0, "learning_rate": 1.111111111111111e-07, "logits/chosen": -0.8235847353935242, "logits/rejected": -0.7749193906784058, "logps/chosen": -90.13212585449219, "logps/rejected": -112.7907485961914, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.0780595541000366, "rewards/margins": 24.11054229736328, "rewards/rejected": -23.032482147216797, "step": 4380 }, { "epoch": 2.0, "learning_rate": 1.1060375443937086e-07, "logits/chosen": -0.873267650604248, "logits/rejected": -0.872418999671936, "logps/chosen": -84.37730407714844, "logps/rejected": -116.15653991699219, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.26867276430130005, "rewards/margins": 23.833995819091797, "rewards/rejected": -23.56532096862793, "step": 4390 }, { "epoch": 2.01, "learning_rate": 1.1009639776763063e-07, "logits/chosen": -0.9008662104606628, "logits/rejected": -0.8445860743522644, "logps/chosen": -85.77739715576172, "logps/rejected": -113.36775207519531, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9516502618789673, "rewards/margins": 25.20037078857422, "rewards/rejected": -24.248720169067383, "step": 4400 }, { "epoch": 2.01, "eval_logits/chosen": -0.8856587409973145, "eval_logits/rejected": -0.8144620656967163, "eval_logps/chosen": -84.20708465576172, "eval_logps/rejected": -112.38359069824219, "eval_loss": 0.005585566163063049, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.6653829216957092, "eval_rewards/margins": 24.426471710205078, "eval_rewards/rejected": -23.761089324951172, "eval_runtime": 81.7268, "eval_samples_per_second": 35.019, "eval_steps_per_second": 2.19, "step": 4400 }, { "epoch": 2.01, "learning_rate": 1.095890410958904e-07, "logits/chosen": -0.8619349598884583, "logits/rejected": -0.8154422640800476, "logps/chosen": -88.06494140625, "logps/rejected": -120.3101577758789, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 1.4764477014541626, "rewards/margins": 26.710468292236328, "rewards/rejected": -25.234024047851562, "step": 4410 }, { "epoch": 2.02, "learning_rate": 1.0908168442415016e-07, "logits/chosen": -0.8404685854911804, "logits/rejected": -0.8038052320480347, "logps/chosen": -81.84117126464844, "logps/rejected": -112.4288558959961, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.12754690647125244, "rewards/margins": 23.65199851989746, "rewards/rejected": -23.524450302124023, "step": 4420 }, { "epoch": 2.02, "learning_rate": 1.0857432775240993e-07, "logits/chosen": -0.7906190156936646, "logits/rejected": -0.7676808834075928, "logps/chosen": -80.90745544433594, "logps/rejected": -120.09671783447266, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 1.1424059867858887, "rewards/margins": 26.4530086517334, "rewards/rejected": -25.310606002807617, "step": 4430 }, { "epoch": 2.03, "learning_rate": 1.080669710806697e-07, "logits/chosen": -0.9605631828308105, "logits/rejected": -0.8633155822753906, "logps/chosen": -84.76126861572266, "logps/rejected": -118.72808837890625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.9969682693481445, "rewards/margins": 26.093795776367188, "rewards/rejected": -25.09682846069336, "step": 4440 }, { "epoch": 2.03, "learning_rate": 1.0755961440892946e-07, "logits/chosen": -0.953199028968811, "logits/rejected": -0.9226964712142944, "logps/chosen": -82.70755767822266, "logps/rejected": -116.70185852050781, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.997397243976593, "rewards/margins": 25.517671585083008, "rewards/rejected": -24.520275115966797, "step": 4450 }, { "epoch": 2.04, "learning_rate": 1.0705225773718923e-07, "logits/chosen": -0.9112932085990906, "logits/rejected": -0.8612509965896606, "logps/chosen": -86.12630462646484, "logps/rejected": -113.80561828613281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.07253873348236084, "rewards/margins": 24.135400772094727, "rewards/rejected": -24.062862396240234, "step": 4460 }, { "epoch": 2.04, "learning_rate": 1.06544901065449e-07, "logits/chosen": -0.9248711466789246, "logits/rejected": -0.8723545074462891, "logps/chosen": -87.74769592285156, "logps/rejected": -118.7971420288086, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.7695275545120239, "rewards/margins": 24.69394302368164, "rewards/rejected": -23.924413681030273, "step": 4470 }, { "epoch": 2.04, "learning_rate": 1.0603754439370876e-07, "logits/chosen": -0.9148474931716919, "logits/rejected": -0.8270168304443359, "logps/chosen": -85.63670349121094, "logps/rejected": -120.21187591552734, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.07334025204181671, "rewards/margins": 25.913070678710938, "rewards/rejected": -25.986413955688477, "step": 4480 }, { "epoch": 2.05, "learning_rate": 1.0553018772196853e-07, "logits/chosen": -0.9342101216316223, "logits/rejected": -0.8520076870918274, "logps/chosen": -89.24897766113281, "logps/rejected": -117.5027847290039, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7600328922271729, "rewards/margins": 24.749935150146484, "rewards/rejected": -23.98990249633789, "step": 4490 }, { "epoch": 2.05, "learning_rate": 1.050228310502283e-07, "logits/chosen": -0.8792055249214172, "logits/rejected": -0.8664016723632812, "logps/chosen": -83.908447265625, "logps/rejected": -117.2926025390625, "loss": 0.0032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2741992473602295, "rewards/margins": 24.095046997070312, "rewards/rejected": -25.369247436523438, "step": 4500 }, { "epoch": 2.05, "eval_logits/chosen": -0.8906596899032593, "eval_logits/rejected": -0.818504810333252, "eval_logps/chosen": -84.84129333496094, "eval_logps/rejected": -113.85082244873047, "eval_loss": 0.0056776185519993305, "eval_rewards/accuracies": 0.994413435459137, "eval_rewards/chosen": 0.3482798635959625, "eval_rewards/margins": 24.842987060546875, "eval_rewards/rejected": -24.49471092224121, "eval_runtime": 81.446, "eval_samples_per_second": 35.14, "eval_steps_per_second": 2.198, "step": 4500 }, { "epoch": 2.06, "learning_rate": 1.0451547437848806e-07, "logits/chosen": -0.9115379452705383, "logits/rejected": -0.8717126846313477, "logps/chosen": -88.75419616699219, "logps/rejected": -116.99250793457031, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 1.256993055343628, "rewards/margins": 25.51570701599121, "rewards/rejected": -24.258716583251953, "step": 4510 }, { "epoch": 2.06, "learning_rate": 1.0400811770674783e-07, "logits/chosen": -0.8679983019828796, "logits/rejected": -0.8133772611618042, "logps/chosen": -78.07077026367188, "logps/rejected": -116.81890869140625, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4886135160923004, "rewards/margins": 26.63811683654785, "rewards/rejected": -26.149499893188477, "step": 4520 }, { "epoch": 2.07, "learning_rate": 1.035007610350076e-07, "logits/chosen": -0.8347498178482056, "logits/rejected": -0.8103248476982117, "logps/chosen": -89.81623840332031, "logps/rejected": -112.1605224609375, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13603441417217255, "rewards/margins": 24.102638244628906, "rewards/rejected": -23.966602325439453, "step": 4530 }, { "epoch": 2.07, "learning_rate": 1.0299340436326736e-07, "logits/chosen": -0.8579212427139282, "logits/rejected": -0.8020459413528442, "logps/chosen": -83.22380828857422, "logps/rejected": -116.16816711425781, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8977780342102051, "rewards/margins": 25.821319580078125, "rewards/rejected": -24.92354393005371, "step": 4540 }, { "epoch": 2.08, "learning_rate": 1.0248604769152713e-07, "logits/chosen": -0.9014474153518677, "logits/rejected": -0.8092902898788452, "logps/chosen": -82.77027893066406, "logps/rejected": -115.4444580078125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.5497044920921326, "rewards/margins": 24.866573333740234, "rewards/rejected": -24.31686782836914, "step": 4550 }, { "epoch": 2.08, "learning_rate": 1.019786910197869e-07, "logits/chosen": -0.9946414828300476, "logits/rejected": -0.9202834367752075, "logps/chosen": -87.10456848144531, "logps/rejected": -117.59464263916016, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5109010934829712, "rewards/margins": 23.4146671295166, "rewards/rejected": -24.925569534301758, "step": 4560 }, { "epoch": 2.09, "learning_rate": 1.0147133434804666e-07, "logits/chosen": -0.9581912755966187, "logits/rejected": -0.8749968409538269, "logps/chosen": -83.8426742553711, "logps/rejected": -112.90596771240234, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.59953773021698, "rewards/margins": 25.36787986755371, "rewards/rejected": -24.768339157104492, "step": 4570 }, { "epoch": 2.09, "learning_rate": 1.0096397767630643e-07, "logits/chosen": -0.8620211482048035, "logits/rejected": -0.8351920247077942, "logps/chosen": -90.25982666015625, "logps/rejected": -122.8171157836914, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.406160593032837, "rewards/margins": 26.347885131835938, "rewards/rejected": -24.94172477722168, "step": 4580 }, { "epoch": 2.1, "learning_rate": 1.004566210045662e-07, "logits/chosen": -0.9509794116020203, "logits/rejected": -0.8784133791923523, "logps/chosen": -81.00958251953125, "logps/rejected": -119.47309875488281, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.7654523253440857, "rewards/margins": 25.69040298461914, "rewards/rejected": -24.92494773864746, "step": 4590 }, { "epoch": 2.1, "learning_rate": 9.994926433282596e-08, "logits/chosen": -0.946574866771698, "logits/rejected": -0.8596547842025757, "logps/chosen": -90.84710693359375, "logps/rejected": -121.0890884399414, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.3096452057361603, "rewards/margins": 25.466899871826172, "rewards/rejected": -25.776546478271484, "step": 4600 }, { "epoch": 2.1, "eval_logits/chosen": -0.8970758318901062, "eval_logits/rejected": -0.8229092359542847, "eval_logps/chosen": -85.00439453125, "eval_logps/rejected": -114.76090240478516, "eval_loss": 0.005778728984296322, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": 0.2667306661605835, "eval_rewards/margins": 25.21647834777832, "eval_rewards/rejected": -24.94974708557129, "eval_runtime": 156.627, "eval_samples_per_second": 18.273, "eval_steps_per_second": 1.143, "step": 4600 }, { "epoch": 2.1, "learning_rate": 9.944190766108573e-08, "logits/chosen": -0.9343441128730774, "logits/rejected": -0.817380428314209, "logps/chosen": -86.71347045898438, "logps/rejected": -118.91255187988281, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.04746241495013237, "rewards/margins": 25.993091583251953, "rewards/rejected": -25.94562339782715, "step": 4610 }, { "epoch": 2.11, "learning_rate": 9.89345509893455e-08, "logits/chosen": -0.8125195503234863, "logits/rejected": -0.796918511390686, "logps/chosen": -89.77796173095703, "logps/rejected": -122.3956298828125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.11807354539632797, "rewards/margins": 27.092082977294922, "rewards/rejected": -26.974010467529297, "step": 4620 }, { "epoch": 2.11, "learning_rate": 9.842719431760526e-08, "logits/chosen": -0.7921231389045715, "logits/rejected": -0.8076340556144714, "logps/chosen": -84.28463745117188, "logps/rejected": -119.71067810058594, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.5065456628799438, "rewards/margins": 26.92000389099121, "rewards/rejected": -26.4134578704834, "step": 4630 }, { "epoch": 2.12, "learning_rate": 9.791983764586503e-08, "logits/chosen": -0.9721126556396484, "logits/rejected": -0.8867252469062805, "logps/chosen": -86.30948638916016, "logps/rejected": -119.10693359375, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4696705937385559, "rewards/margins": 25.70583152770996, "rewards/rejected": -26.175506591796875, "step": 4640 }, { "epoch": 2.12, "learning_rate": 9.74124809741248e-08, "logits/chosen": -0.9281400442123413, "logits/rejected": -0.8973511457443237, "logps/chosen": -81.73219299316406, "logps/rejected": -115.91802978515625, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.17622852325439453, "rewards/margins": 24.91793441772461, "rewards/rejected": -25.094161987304688, "step": 4650 }, { "epoch": 2.13, "learning_rate": 9.690512430238456e-08, "logits/chosen": -0.8222247958183289, "logits/rejected": -0.7661574482917786, "logps/chosen": -82.31886291503906, "logps/rejected": -117.75882720947266, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.8082590103149414, "rewards/margins": 24.505247116088867, "rewards/rejected": -25.313507080078125, "step": 4660 }, { "epoch": 2.13, "learning_rate": 9.639776763064433e-08, "logits/chosen": -0.9574063420295715, "logits/rejected": -0.939258873462677, "logps/chosen": -83.35095977783203, "logps/rejected": -116.93514251708984, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.16359582543373108, "rewards/margins": 26.779830932617188, "rewards/rejected": -26.616235733032227, "step": 4670 }, { "epoch": 2.14, "learning_rate": 9.58904109589041e-08, "logits/chosen": -1.0326330661773682, "logits/rejected": -0.9830595254898071, "logps/chosen": -87.02751159667969, "logps/rejected": -117.13568115234375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.08589287102222443, "rewards/margins": 26.108123779296875, "rewards/rejected": -26.022228240966797, "step": 4680 }, { "epoch": 2.14, "learning_rate": 9.538305428716386e-08, "logits/chosen": -0.8359645009040833, "logits/rejected": -0.8177057504653931, "logps/chosen": -82.79866027832031, "logps/rejected": -110.24278259277344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.42063361406326294, "rewards/margins": 23.922367095947266, "rewards/rejected": -24.343002319335938, "step": 4690 }, { "epoch": 2.15, "learning_rate": 9.487569761542363e-08, "logits/chosen": -0.9532375335693359, "logits/rejected": -0.9111067056655884, "logps/chosen": -94.54585266113281, "logps/rejected": -121.60087585449219, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.5539652705192566, "rewards/margins": 26.92850112915039, "rewards/rejected": -26.374536514282227, "step": 4700 }, { "epoch": 2.15, "eval_logits/chosen": -0.915102481842041, "eval_logits/rejected": -0.8374292254447937, "eval_logps/chosen": -85.5395278930664, "eval_logps/rejected": -116.08502960205078, "eval_loss": 0.006045613903552294, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.0008371875155717134, "eval_rewards/margins": 25.610980987548828, "eval_rewards/rejected": -25.611818313598633, "eval_runtime": 70.6998, "eval_samples_per_second": 40.481, "eval_steps_per_second": 2.532, "step": 4700 }, { "epoch": 2.15, "learning_rate": 9.43683409436834e-08, "logits/chosen": -0.9409311413764954, "logits/rejected": -0.8758177757263184, "logps/chosen": -82.46241760253906, "logps/rejected": -121.64759826660156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.5624381303787231, "rewards/margins": 26.8708438873291, "rewards/rejected": -26.30840492248535, "step": 4710 }, { "epoch": 2.15, "learning_rate": 9.386098427194316e-08, "logits/chosen": -0.8292143940925598, "logits/rejected": -0.8223336935043335, "logps/chosen": -86.91860961914062, "logps/rejected": -119.90704345703125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.29642102122306824, "rewards/margins": 25.774639129638672, "rewards/rejected": -25.47821617126465, "step": 4720 }, { "epoch": 2.16, "learning_rate": 9.335362760020293e-08, "logits/chosen": -0.9198992848396301, "logits/rejected": -0.9353822469711304, "logps/chosen": -85.23753356933594, "logps/rejected": -115.43070983886719, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8973407745361328, "rewards/margins": 26.307861328125, "rewards/rejected": -25.410518646240234, "step": 4730 }, { "epoch": 2.16, "learning_rate": 9.28462709284627e-08, "logits/chosen": -0.9610759019851685, "logits/rejected": -0.9153316617012024, "logps/chosen": -87.15513610839844, "logps/rejected": -119.3092041015625, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5247424840927124, "rewards/margins": 27.221759796142578, "rewards/rejected": -27.746501922607422, "step": 4740 }, { "epoch": 2.17, "learning_rate": 9.233891425672246e-08, "logits/chosen": -0.9629012942314148, "logits/rejected": -0.8682750463485718, "logps/chosen": -86.7682876586914, "logps/rejected": -120.1332778930664, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.6402833461761475, "rewards/margins": 28.504135131835938, "rewards/rejected": -26.863849639892578, "step": 4750 }, { "epoch": 2.17, "learning_rate": 9.183155758498223e-08, "logits/chosen": -1.0006027221679688, "logits/rejected": -0.9207250475883484, "logps/chosen": -87.22566986083984, "logps/rejected": -121.52197265625, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9196405410766602, "rewards/margins": 25.359289169311523, "rewards/rejected": -26.2789306640625, "step": 4760 }, { "epoch": 2.18, "learning_rate": 9.1324200913242e-08, "logits/chosen": -0.9114225506782532, "logits/rejected": -0.8728778958320618, "logps/chosen": -89.46612548828125, "logps/rejected": -119.34295654296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.09139951318502426, "rewards/margins": 25.818500518798828, "rewards/rejected": -25.909902572631836, "step": 4770 }, { "epoch": 2.18, "learning_rate": 9.081684424150176e-08, "logits/chosen": -0.9377741813659668, "logits/rejected": -0.8587690591812134, "logps/chosen": -88.31532287597656, "logps/rejected": -124.38812255859375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.027195418253540993, "rewards/margins": 27.07535743713379, "rewards/rejected": -27.04816246032715, "step": 4780 }, { "epoch": 2.19, "learning_rate": 9.030948756976153e-08, "logits/chosen": -0.9471622705459595, "logits/rejected": -0.9017325639724731, "logps/chosen": -85.16099548339844, "logps/rejected": -116.66732025146484, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.5448721647262573, "rewards/margins": 26.83917236328125, "rewards/rejected": -26.294300079345703, "step": 4790 }, { "epoch": 2.19, "learning_rate": 8.98021308980213e-08, "logits/chosen": -0.9159450531005859, "logits/rejected": -0.8532840609550476, "logps/chosen": -82.51966857910156, "logps/rejected": -119.74662017822266, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6007218360900879, "rewards/margins": 27.183069229125977, "rewards/rejected": -26.58234214782715, "step": 4800 }, { "epoch": 2.19, "eval_logits/chosen": -0.9306203126907349, "eval_logits/rejected": -0.8540387153625488, "eval_logps/chosen": -85.61259460449219, "eval_logps/rejected": -116.9902572631836, "eval_loss": 0.006084715481847525, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.037370555102825165, "eval_rewards/margins": 26.027050018310547, "eval_rewards/rejected": -26.064422607421875, "eval_runtime": 88.42, "eval_samples_per_second": 32.368, "eval_steps_per_second": 2.024, "step": 4800 }, { "epoch": 2.2, "learning_rate": 8.929477422628106e-08, "logits/chosen": -0.9910691380500793, "logits/rejected": -0.873990535736084, "logps/chosen": -87.20209503173828, "logps/rejected": -126.45884704589844, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0020253420807421207, "rewards/margins": 25.715951919555664, "rewards/rejected": -25.713924407958984, "step": 4810 }, { "epoch": 2.2, "learning_rate": 8.878741755454083e-08, "logits/chosen": -0.8924468755722046, "logits/rejected": -0.8882986307144165, "logps/chosen": -90.6511459350586, "logps/rejected": -123.3057632446289, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.5701011419296265, "rewards/margins": 26.548742294311523, "rewards/rejected": -27.118844985961914, "step": 4820 }, { "epoch": 2.2, "learning_rate": 8.82800608828006e-08, "logits/chosen": -1.0024341344833374, "logits/rejected": -0.8866281509399414, "logps/chosen": -85.85223388671875, "logps/rejected": -120.0212173461914, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.5763718485832214, "rewards/margins": 27.8824520111084, "rewards/rejected": -27.306081771850586, "step": 4830 }, { "epoch": 2.21, "learning_rate": 8.777270421106036e-08, "logits/chosen": -0.9681693315505981, "logits/rejected": -0.8785957098007202, "logps/chosen": -84.08697509765625, "logps/rejected": -120.38557434082031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.167352557182312, "rewards/margins": 24.95884132385254, "rewards/rejected": -26.12619400024414, "step": 4840 }, { "epoch": 2.21, "learning_rate": 8.726534753932013e-08, "logits/chosen": -0.9239298701286316, "logits/rejected": -0.8562732934951782, "logps/chosen": -83.76175689697266, "logps/rejected": -119.7491226196289, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.1900556981563568, "rewards/margins": 27.33782386779785, "rewards/rejected": -27.14776611328125, "step": 4850 }, { "epoch": 2.22, "learning_rate": 8.67579908675799e-08, "logits/chosen": -0.8515946269035339, "logits/rejected": -0.842880129814148, "logps/chosen": -82.41915893554688, "logps/rejected": -116.04329681396484, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.8233505487442017, "rewards/margins": 26.450525283813477, "rewards/rejected": -25.62717628479004, "step": 4860 }, { "epoch": 2.22, "learning_rate": 8.625063419583966e-08, "logits/chosen": -0.9757340550422668, "logits/rejected": -0.8796631097793579, "logps/chosen": -84.7564697265625, "logps/rejected": -121.77120208740234, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8205292820930481, "rewards/margins": 25.63332748413086, "rewards/rejected": -26.453853607177734, "step": 4870 }, { "epoch": 2.23, "learning_rate": 8.574327752409943e-08, "logits/chosen": -0.9308716058731079, "logits/rejected": -0.8484630584716797, "logps/chosen": -83.25608825683594, "logps/rejected": -121.8863754272461, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.3879397511482239, "rewards/margins": 26.951122283935547, "rewards/rejected": -27.339065551757812, "step": 4880 }, { "epoch": 2.23, "learning_rate": 8.52359208523592e-08, "logits/chosen": -0.9290486574172974, "logits/rejected": -0.8468266725540161, "logps/chosen": -86.26325225830078, "logps/rejected": -124.77742004394531, "loss": 0.0032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.48676443099975586, "rewards/margins": 26.87295150756836, "rewards/rejected": -26.386188507080078, "step": 4890 }, { "epoch": 2.24, "learning_rate": 8.472856418061896e-08, "logits/chosen": -0.9023244976997375, "logits/rejected": -0.8335026502609253, "logps/chosen": -85.95402526855469, "logps/rejected": -118.09017181396484, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 1.000464677810669, "rewards/margins": 27.742565155029297, "rewards/rejected": -26.74209976196289, "step": 4900 }, { "epoch": 2.24, "eval_logits/chosen": -0.9493011236190796, "eval_logits/rejected": -0.8702885508537292, "eval_logps/chosen": -85.37418365478516, "eval_logps/rejected": -117.78074645996094, "eval_loss": 0.006325908936560154, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": 0.08183945715427399, "eval_rewards/margins": 26.54151725769043, "eval_rewards/rejected": -26.459674835205078, "eval_runtime": 104.8312, "eval_samples_per_second": 27.301, "eval_steps_per_second": 1.708, "step": 4900 }, { "epoch": 2.24, "learning_rate": 8.422120750887873e-08, "logits/chosen": -1.0275952816009521, "logits/rejected": -0.9630632400512695, "logps/chosen": -86.69776153564453, "logps/rejected": -120.90191650390625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.8101398348808289, "rewards/margins": 27.59271812438965, "rewards/rejected": -26.782577514648438, "step": 4910 }, { "epoch": 2.25, "learning_rate": 8.37138508371385e-08, "logits/chosen": -0.9919659495353699, "logits/rejected": -0.9070954322814941, "logps/chosen": -85.95883178710938, "logps/rejected": -121.2116928100586, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.606831431388855, "rewards/margins": 27.421112060546875, "rewards/rejected": -26.814281463623047, "step": 4920 }, { "epoch": 2.25, "learning_rate": 8.320649416539826e-08, "logits/chosen": -0.8715358972549438, "logits/rejected": -0.8914009928703308, "logps/chosen": -86.6871109008789, "logps/rejected": -123.45024108886719, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.37944138050079346, "rewards/margins": 27.759761810302734, "rewards/rejected": -28.13920021057129, "step": 4930 }, { "epoch": 2.25, "learning_rate": 8.269913749365803e-08, "logits/chosen": -0.9209533929824829, "logits/rejected": -0.821499228477478, "logps/chosen": -83.86023712158203, "logps/rejected": -120.2563705444336, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.7821536660194397, "rewards/margins": 27.337902069091797, "rewards/rejected": -26.555749893188477, "step": 4940 }, { "epoch": 2.26, "learning_rate": 8.21917808219178e-08, "logits/chosen": -0.9386633038520813, "logits/rejected": -0.8382610082626343, "logps/chosen": -88.39375305175781, "logps/rejected": -119.31612396240234, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.15015359222888947, "rewards/margins": 26.648828506469727, "rewards/rejected": -26.498676300048828, "step": 4950 }, { "epoch": 2.26, "learning_rate": 8.168442415017756e-08, "logits/chosen": -0.9892728924751282, "logits/rejected": -0.9248817563056946, "logps/chosen": -92.55259704589844, "logps/rejected": -121.92408752441406, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.1945768594741821, "rewards/margins": 26.516002655029297, "rewards/rejected": -27.710580825805664, "step": 4960 }, { "epoch": 2.27, "learning_rate": 8.117706747843733e-08, "logits/chosen": -0.9463468790054321, "logits/rejected": -0.9060198664665222, "logps/chosen": -87.85506439208984, "logps/rejected": -123.16998291015625, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.14104899764060974, "rewards/margins": 28.042089462280273, "rewards/rejected": -28.18313980102539, "step": 4970 }, { "epoch": 2.27, "learning_rate": 8.06697108066971e-08, "logits/chosen": -0.9373579025268555, "logits/rejected": -0.9004872441291809, "logps/chosen": -88.71186828613281, "logps/rejected": -121.61962890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.5207719802856445, "rewards/margins": 28.3806095123291, "rewards/rejected": -28.901378631591797, "step": 4980 }, { "epoch": 2.28, "learning_rate": 8.016235413495687e-08, "logits/chosen": -0.8973749279975891, "logits/rejected": -0.8311864137649536, "logps/chosen": -80.42097473144531, "logps/rejected": -121.57218933105469, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.5322467088699341, "rewards/margins": 28.41021156311035, "rewards/rejected": -27.877965927124023, "step": 4990 }, { "epoch": 2.28, "learning_rate": 7.965499746321664e-08, "logits/chosen": -0.8929327130317688, "logits/rejected": -0.8712663650512695, "logps/chosen": -92.48535919189453, "logps/rejected": -123.72513580322266, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.005576419644057751, "rewards/margins": 27.1002254486084, "rewards/rejected": -27.105804443359375, "step": 5000 }, { "epoch": 2.28, "eval_logits/chosen": -0.9634361267089844, "eval_logits/rejected": -0.8802065849304199, "eval_logps/chosen": -87.11389923095703, "eval_logps/rejected": -121.00086975097656, "eval_loss": 0.007718712091445923, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.7880215048789978, "eval_rewards/margins": 27.281707763671875, "eval_rewards/rejected": -28.069734573364258, "eval_runtime": 68.3881, "eval_samples_per_second": 41.849, "eval_steps_per_second": 2.617, "step": 5000 }, { "epoch": 2.29, "learning_rate": 7.91476407914764e-08, "logits/chosen": -0.9871411323547363, "logits/rejected": -0.9252876043319702, "logps/chosen": -87.0884017944336, "logps/rejected": -124.8663330078125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8773247003555298, "rewards/margins": 27.905818939208984, "rewards/rejected": -28.78314208984375, "step": 5010 }, { "epoch": 2.29, "learning_rate": 7.864028411973617e-08, "logits/chosen": -0.9661632776260376, "logits/rejected": -0.860427975654602, "logps/chosen": -92.53157043457031, "logps/rejected": -125.1065444946289, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.514369547367096, "rewards/margins": 28.825185775756836, "rewards/rejected": -29.339553833007812, "step": 5020 }, { "epoch": 2.3, "learning_rate": 7.813292744799594e-08, "logits/chosen": -0.9192354083061218, "logits/rejected": -0.9115994572639465, "logps/chosen": -86.20381164550781, "logps/rejected": -121.10685729980469, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.15855053067207336, "rewards/margins": 27.75839614868164, "rewards/rejected": -27.916950225830078, "step": 5030 }, { "epoch": 2.3, "learning_rate": 7.76255707762557e-08, "logits/chosen": -1.0042352676391602, "logits/rejected": -0.8900748491287231, "logps/chosen": -87.24882507324219, "logps/rejected": -120.3760757446289, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.2955986261367798, "rewards/margins": 25.776174545288086, "rewards/rejected": -27.0717716217041, "step": 5040 }, { "epoch": 2.31, "learning_rate": 7.711821410451547e-08, "logits/chosen": -1.0438252687454224, "logits/rejected": -0.9618898630142212, "logps/chosen": -83.92408752441406, "logps/rejected": -118.453125, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.12917456030845642, "rewards/margins": 28.10638999938965, "rewards/rejected": -27.97721290588379, "step": 5050 }, { "epoch": 2.31, "learning_rate": 7.661085743277524e-08, "logits/chosen": -0.9532014727592468, "logits/rejected": -0.9269828796386719, "logps/chosen": -93.73705291748047, "logps/rejected": -126.61479187011719, "loss": 0.002, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03241534158587456, "rewards/margins": 27.516796112060547, "rewards/rejected": -27.549213409423828, "step": 5060 }, { "epoch": 2.31, "learning_rate": 7.6103500761035e-08, "logits/chosen": -0.9407272338867188, "logits/rejected": -0.8259226679801941, "logps/chosen": -87.27238464355469, "logps/rejected": -123.35365295410156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.054013729095459, "rewards/margins": 26.920440673828125, "rewards/rejected": -28.974451065063477, "step": 5070 }, { "epoch": 2.32, "learning_rate": 7.559614408929477e-08, "logits/chosen": -0.919296145439148, "logits/rejected": -0.8606408834457397, "logps/chosen": -86.35128784179688, "logps/rejected": -119.40046691894531, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 1.054381251335144, "rewards/margins": 28.73299217224121, "rewards/rejected": -27.678613662719727, "step": 5080 }, { "epoch": 2.32, "learning_rate": 7.508878741755454e-08, "logits/chosen": -0.8425869941711426, "logits/rejected": -0.8566237688064575, "logps/chosen": -85.87548065185547, "logps/rejected": -125.34675598144531, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.167006254196167, "rewards/margins": 27.098682403564453, "rewards/rejected": -28.26568603515625, "step": 5090 }, { "epoch": 2.33, "learning_rate": 7.45814307458143e-08, "logits/chosen": -1.0067375898361206, "logits/rejected": -0.9085214734077454, "logps/chosen": -88.01753234863281, "logps/rejected": -123.4695816040039, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.424287885427475, "rewards/margins": 29.699504852294922, "rewards/rejected": -29.275217056274414, "step": 5100 }, { "epoch": 2.33, "eval_logits/chosen": -0.9718750715255737, "eval_logits/rejected": -0.8896563053131104, "eval_logps/chosen": -86.07792663574219, "eval_logps/rejected": -120.29959106445312, "eval_loss": 0.0068391538225114346, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.2700366973876953, "eval_rewards/margins": 27.449060440063477, "eval_rewards/rejected": -27.719093322753906, "eval_runtime": 68.349, "eval_samples_per_second": 41.873, "eval_steps_per_second": 2.619, "step": 5100 }, { "epoch": 2.33, "learning_rate": 7.407407407407407e-08, "logits/chosen": -1.0170899629592896, "logits/rejected": -0.9646891355514526, "logps/chosen": -86.5973129272461, "logps/rejected": -122.39430236816406, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2534463405609131, "rewards/margins": 27.92856788635254, "rewards/rejected": -28.1820125579834, "step": 5110 }, { "epoch": 2.34, "learning_rate": 7.356671740233384e-08, "logits/chosen": -0.9090906381607056, "logits/rejected": -0.8181482553482056, "logps/chosen": -82.51972961425781, "logps/rejected": -122.16377258300781, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.48268836736679077, "rewards/margins": 27.902740478515625, "rewards/rejected": -28.38543128967285, "step": 5120 }, { "epoch": 2.34, "learning_rate": 7.30593607305936e-08, "logits/chosen": -1.0290285348892212, "logits/rejected": -0.9375460743904114, "logps/chosen": -89.16535949707031, "logps/rejected": -121.2294921875, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7084894180297852, "rewards/margins": 26.591333389282227, "rewards/rejected": -28.299823760986328, "step": 5130 }, { "epoch": 2.35, "learning_rate": 7.255200405885337e-08, "logits/chosen": -0.9907518625259399, "logits/rejected": -0.9397494196891785, "logps/chosen": -90.25212860107422, "logps/rejected": -126.6907730102539, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.36234050989151, "rewards/margins": 28.96002769470215, "rewards/rejected": -29.322368621826172, "step": 5140 }, { "epoch": 2.35, "learning_rate": 7.204464738711314e-08, "logits/chosen": -1.0006976127624512, "logits/rejected": -0.9534046053886414, "logps/chosen": -83.64295959472656, "logps/rejected": -121.9207763671875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5914263725280762, "rewards/margins": 27.62277603149414, "rewards/rejected": -28.214202880859375, "step": 5150 }, { "epoch": 2.36, "learning_rate": 7.15372907153729e-08, "logits/chosen": -0.9170929789543152, "logits/rejected": -0.8608806729316711, "logps/chosen": -84.09690856933594, "logps/rejected": -122.79866790771484, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.16179148852825165, "rewards/margins": 27.293569564819336, "rewards/rejected": -27.45536231994629, "step": 5160 }, { "epoch": 2.36, "learning_rate": 7.102993404363267e-08, "logits/chosen": -1.0261433124542236, "logits/rejected": -0.9830166697502136, "logps/chosen": -84.33302307128906, "logps/rejected": -124.41776275634766, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.2603864073753357, "rewards/margins": 29.121349334716797, "rewards/rejected": -29.38173484802246, "step": 5170 }, { "epoch": 2.36, "learning_rate": 7.052257737189244e-08, "logits/chosen": -0.8641002774238586, "logits/rejected": -0.832613468170166, "logps/chosen": -85.8521499633789, "logps/rejected": -117.49824523925781, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9484038352966309, "rewards/margins": 25.647912979125977, "rewards/rejected": -26.5963191986084, "step": 5180 }, { "epoch": 2.37, "learning_rate": 7.00152207001522e-08, "logits/chosen": -0.9944854974746704, "logits/rejected": -0.974925696849823, "logps/chosen": -87.15865325927734, "logps/rejected": -124.09013366699219, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.5379887819290161, "rewards/margins": 27.764774322509766, "rewards/rejected": -28.302759170532227, "step": 5190 }, { "epoch": 2.37, "learning_rate": 6.950786402841197e-08, "logits/chosen": -0.9933064579963684, "logits/rejected": -0.9870017170906067, "logps/chosen": -95.2901611328125, "logps/rejected": -129.78933715820312, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5724942684173584, "rewards/margins": 27.332691192626953, "rewards/rejected": -28.905187606811523, "step": 5200 }, { "epoch": 2.37, "eval_logits/chosen": -0.9753385782241821, "eval_logits/rejected": -0.892514169216156, "eval_logps/chosen": -86.38349151611328, "eval_logps/rejected": -121.61949157714844, "eval_loss": 0.0071233040653169155, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.4228147268295288, "eval_rewards/margins": 27.95623016357422, "eval_rewards/rejected": -28.379043579101562, "eval_runtime": 76.5385, "eval_samples_per_second": 37.393, "eval_steps_per_second": 2.339, "step": 5200 }, { "epoch": 2.38, "learning_rate": 6.900050735667174e-08, "logits/chosen": -1.0079680681228638, "logits/rejected": -0.9470082521438599, "logps/chosen": -84.72894287109375, "logps/rejected": -123.27486419677734, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.9842392802238464, "rewards/margins": 27.015186309814453, "rewards/rejected": -27.99942398071289, "step": 5210 }, { "epoch": 2.38, "learning_rate": 6.84931506849315e-08, "logits/chosen": -0.9415313601493835, "logits/rejected": -0.8743622899055481, "logps/chosen": -83.14261627197266, "logps/rejected": -127.46656799316406, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.49354609847068787, "rewards/margins": 28.221790313720703, "rewards/rejected": -28.715335845947266, "step": 5220 }, { "epoch": 2.39, "learning_rate": 6.798579401319127e-08, "logits/chosen": -1.0896893739700317, "logits/rejected": -0.984167218208313, "logps/chosen": -91.80816650390625, "logps/rejected": -124.5762710571289, "loss": 0.0045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7726724147796631, "rewards/margins": 28.63726806640625, "rewards/rejected": -29.409942626953125, "step": 5230 }, { "epoch": 2.39, "learning_rate": 6.747843734145104e-08, "logits/chosen": -0.9839617013931274, "logits/rejected": -0.8807669878005981, "logps/chosen": -86.82026672363281, "logps/rejected": -121.82684326171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.491886705160141, "rewards/margins": 28.546483993530273, "rewards/rejected": -28.054601669311523, "step": 5240 }, { "epoch": 2.4, "learning_rate": 6.69710806697108e-08, "logits/chosen": -1.038967490196228, "logits/rejected": -0.9306316375732422, "logps/chosen": -84.9123306274414, "logps/rejected": -122.56534576416016, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8146915435791016, "rewards/margins": 27.039257049560547, "rewards/rejected": -27.85394859313965, "step": 5250 }, { "epoch": 2.4, "learning_rate": 6.646372399797057e-08, "logits/chosen": -0.9870797991752625, "logits/rejected": -0.9295756220817566, "logps/chosen": -82.35897064208984, "logps/rejected": -122.3917236328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.4263637065887451, "rewards/margins": 27.225475311279297, "rewards/rejected": -27.651844024658203, "step": 5260 }, { "epoch": 2.41, "learning_rate": 6.595636732623034e-08, "logits/chosen": -1.0013396739959717, "logits/rejected": -0.953113853931427, "logps/chosen": -88.39720916748047, "logps/rejected": -129.01861572265625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.2705535888671875, "rewards/margins": 28.5063533782959, "rewards/rejected": -29.776906967163086, "step": 5270 }, { "epoch": 2.41, "learning_rate": 6.54490106544901e-08, "logits/chosen": -0.9829657673835754, "logits/rejected": -0.9145382046699524, "logps/chosen": -90.40482330322266, "logps/rejected": -127.89979553222656, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.44692808389663696, "rewards/margins": 31.026226043701172, "rewards/rejected": -31.473154067993164, "step": 5280 }, { "epoch": 2.41, "learning_rate": 6.494165398274987e-08, "logits/chosen": -0.8731921315193176, "logits/rejected": -0.8698946237564087, "logps/chosen": -91.5156478881836, "logps/rejected": -127.5548095703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9198547601699829, "rewards/margins": 27.880355834960938, "rewards/rejected": -28.800212860107422, "step": 5290 }, { "epoch": 2.42, "learning_rate": 6.443429731100964e-08, "logits/chosen": -1.0517842769622803, "logits/rejected": -0.9802875518798828, "logps/chosen": -81.30607604980469, "logps/rejected": -124.15089416503906, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6957739591598511, "rewards/margins": 28.380374908447266, "rewards/rejected": -29.07615089416504, "step": 5300 }, { "epoch": 2.42, "eval_logits/chosen": -0.9960015416145325, "eval_logits/rejected": -0.9123116731643677, "eval_logps/chosen": -86.72779846191406, "eval_logps/rejected": -123.22882843017578, "eval_loss": 0.007154433988034725, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.5949668884277344, "eval_rewards/margins": 28.588741302490234, "eval_rewards/rejected": -29.18370819091797, "eval_runtime": 64.2723, "eval_samples_per_second": 44.529, "eval_steps_per_second": 2.785, "step": 5300 }, { "epoch": 2.42, "learning_rate": 6.39269406392694e-08, "logits/chosen": -1.006734013557434, "logits/rejected": -0.9903522729873657, "logps/chosen": -84.85193634033203, "logps/rejected": -127.9534683227539, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.42257681488990784, "rewards/margins": 30.12015151977539, "rewards/rejected": -29.697574615478516, "step": 5310 }, { "epoch": 2.43, "learning_rate": 6.341958396752917e-08, "logits/chosen": -1.0764577388763428, "logits/rejected": -0.9376031160354614, "logps/chosen": -84.22555541992188, "logps/rejected": -122.31805419921875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4156250059604645, "rewards/margins": 28.341150283813477, "rewards/rejected": -28.756771087646484, "step": 5320 }, { "epoch": 2.43, "learning_rate": 6.291222729578894e-08, "logits/chosen": -1.078168511390686, "logits/rejected": -0.9943448305130005, "logps/chosen": -89.96900939941406, "logps/rejected": -118.6367416381836, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3117859959602356, "rewards/margins": 28.337322235107422, "rewards/rejected": -28.64910888671875, "step": 5330 }, { "epoch": 2.44, "learning_rate": 6.24048706240487e-08, "logits/chosen": -1.069582462310791, "logits/rejected": -1.0039303302764893, "logps/chosen": -88.01235961914062, "logps/rejected": -129.82473754882812, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.2434835433959961, "rewards/margins": 30.145584106445312, "rewards/rejected": -30.389068603515625, "step": 5340 }, { "epoch": 2.44, "learning_rate": 6.189751395230847e-08, "logits/chosen": -1.067857027053833, "logits/rejected": -0.9720760583877563, "logps/chosen": -93.93586730957031, "logps/rejected": -128.0084991455078, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2376720905303955, "rewards/margins": 28.47385025024414, "rewards/rejected": -29.71152114868164, "step": 5350 }, { "epoch": 2.45, "learning_rate": 6.139015728056824e-08, "logits/chosen": -1.0381324291229248, "logits/rejected": -0.977461040019989, "logps/chosen": -85.44249725341797, "logps/rejected": -123.37590026855469, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.276159405708313, "rewards/margins": 29.17499351501465, "rewards/rejected": -28.898834228515625, "step": 5360 }, { "epoch": 2.45, "learning_rate": 6.0882800608828e-08, "logits/chosen": -0.9749631881713867, "logits/rejected": -0.9035015106201172, "logps/chosen": -92.03914642333984, "logps/rejected": -125.73573303222656, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.13078829646110535, "rewards/margins": 29.687381744384766, "rewards/rejected": -29.81817054748535, "step": 5370 }, { "epoch": 2.46, "learning_rate": 6.037544393708777e-08, "logits/chosen": -0.8980385065078735, "logits/rejected": -0.8745632171630859, "logps/chosen": -85.739501953125, "logps/rejected": -128.15335083007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5254358053207397, "rewards/margins": 28.72063636779785, "rewards/rejected": -29.24607276916504, "step": 5380 }, { "epoch": 2.46, "learning_rate": 5.986808726534754e-08, "logits/chosen": -0.9306937456130981, "logits/rejected": -0.949223518371582, "logps/chosen": -89.4453353881836, "logps/rejected": -123.77262878417969, "loss": 0.006, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9093526601791382, "rewards/margins": 28.101343154907227, "rewards/rejected": -29.010696411132812, "step": 5390 }, { "epoch": 2.46, "learning_rate": 5.93607305936073e-08, "logits/chosen": -0.9719539880752563, "logits/rejected": -0.9495238065719604, "logps/chosen": -82.39630126953125, "logps/rejected": -122.8931884765625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.2512289583683014, "rewards/margins": 29.647945404052734, "rewards/rejected": -29.8991756439209, "step": 5400 }, { "epoch": 2.46, "eval_logits/chosen": -1.0086694955825806, "eval_logits/rejected": -0.9222978949546814, "eval_logps/chosen": -87.1098403930664, "eval_logps/rejected": -123.73928833007812, "eval_loss": 0.007262189406901598, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.7859910726547241, "eval_rewards/margins": 28.65294647216797, "eval_rewards/rejected": -29.438940048217773, "eval_runtime": 70.8839, "eval_samples_per_second": 40.376, "eval_steps_per_second": 2.525, "step": 5400 }, { "epoch": 2.47, "learning_rate": 5.8853373921867065e-08, "logits/chosen": -1.0013418197631836, "logits/rejected": -0.898902416229248, "logps/chosen": -92.24568176269531, "logps/rejected": -132.75596618652344, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.6157811880111694, "rewards/margins": 29.352039337158203, "rewards/rejected": -29.967823028564453, "step": 5410 }, { "epoch": 2.47, "learning_rate": 5.834601725012683e-08, "logits/chosen": -1.0467584133148193, "logits/rejected": -0.9782527685165405, "logps/chosen": -89.4131851196289, "logps/rejected": -125.35511779785156, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.5491294860839844, "rewards/margins": 28.110363006591797, "rewards/rejected": -28.65949058532715, "step": 5420 }, { "epoch": 2.48, "learning_rate": 5.78386605783866e-08, "logits/chosen": -0.9084771871566772, "logits/rejected": -0.8927844762802124, "logps/chosen": -88.92491149902344, "logps/rejected": -129.6659393310547, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6389572620391846, "rewards/margins": 29.241901397705078, "rewards/rejected": -30.880855560302734, "step": 5430 }, { "epoch": 2.48, "learning_rate": 5.7331303906646365e-08, "logits/chosen": -0.9773572683334351, "logits/rejected": -0.9706518054008484, "logps/chosen": -86.19645690917969, "logps/rejected": -129.49354553222656, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.565061092376709, "rewards/margins": 29.09912109375, "rewards/rejected": -29.6641788482666, "step": 5440 }, { "epoch": 2.49, "learning_rate": 5.682394723490613e-08, "logits/chosen": -1.1273038387298584, "logits/rejected": -1.061490535736084, "logps/chosen": -93.4212875366211, "logps/rejected": -130.5277862548828, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.44857341051101685, "rewards/margins": 30.196619033813477, "rewards/rejected": -30.645191192626953, "step": 5450 }, { "epoch": 2.49, "learning_rate": 5.63165905631659e-08, "logits/chosen": -1.04457688331604, "logits/rejected": -1.0130252838134766, "logps/chosen": -90.43280029296875, "logps/rejected": -132.5623779296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.1586554050445557, "rewards/margins": 30.627910614013672, "rewards/rejected": -31.78656578063965, "step": 5460 }, { "epoch": 2.5, "learning_rate": 5.5809233891425665e-08, "logits/chosen": -0.991267204284668, "logits/rejected": -0.9441797137260437, "logps/chosen": -89.6689453125, "logps/rejected": -127.03794860839844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.5401039123535156, "rewards/margins": 28.44040870666504, "rewards/rejected": -29.980514526367188, "step": 5470 }, { "epoch": 2.5, "learning_rate": 5.530187721968543e-08, "logits/chosen": -1.008504033088684, "logits/rejected": -1.0064032077789307, "logps/chosen": -89.63392639160156, "logps/rejected": -129.8910675048828, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.20265252888202667, "rewards/margins": 29.824050903320312, "rewards/rejected": -30.026702880859375, "step": 5480 }, { "epoch": 2.51, "learning_rate": 5.47945205479452e-08, "logits/chosen": -0.978340744972229, "logits/rejected": -0.8980448842048645, "logps/chosen": -86.2132797241211, "logps/rejected": -132.22434997558594, "loss": 0.0033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3651527166366577, "rewards/margins": 29.39707374572754, "rewards/rejected": -30.76222801208496, "step": 5490 }, { "epoch": 2.51, "learning_rate": 5.4287163876204964e-08, "logits/chosen": -1.056391716003418, "logits/rejected": -0.9532869458198547, "logps/chosen": -89.96862030029297, "logps/rejected": -128.9607696533203, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5641896724700928, "rewards/margins": 29.623580932617188, "rewards/rejected": -30.18777084350586, "step": 5500 }, { "epoch": 2.51, "eval_logits/chosen": -1.0164090394973755, "eval_logits/rejected": -0.9298503994941711, "eval_logps/chosen": -87.34950256347656, "eval_logps/rejected": -124.33818817138672, "eval_loss": 0.007332602050155401, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.9058244824409485, "eval_rewards/margins": 28.832571029663086, "eval_rewards/rejected": -29.73839569091797, "eval_runtime": 85.391, "eval_samples_per_second": 33.516, "eval_steps_per_second": 2.096, "step": 5500 }, { "epoch": 2.52, "learning_rate": 5.377980720446473e-08, "logits/chosen": -0.9833132028579712, "logits/rejected": -0.9164519309997559, "logps/chosen": -87.18998718261719, "logps/rejected": -127.6279525756836, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.17603938281536102, "rewards/margins": 31.583871841430664, "rewards/rejected": -31.4078311920166, "step": 5510 }, { "epoch": 2.52, "learning_rate": 5.32724505327245e-08, "logits/chosen": -1.0401796102523804, "logits/rejected": -0.9769385457038879, "logps/chosen": -86.3476333618164, "logps/rejected": -122.81013488769531, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.039313793182373, "rewards/margins": 27.221426010131836, "rewards/rejected": -28.2607364654541, "step": 5520 }, { "epoch": 2.52, "learning_rate": 5.2765093860984264e-08, "logits/chosen": -0.9912775754928589, "logits/rejected": -0.9012781977653503, "logps/chosen": -85.58564758300781, "logps/rejected": -123.6320571899414, "loss": 0.0034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.3589529991149902, "rewards/margins": 26.08465003967285, "rewards/rejected": -28.443607330322266, "step": 5530 }, { "epoch": 2.53, "learning_rate": 5.225773718924403e-08, "logits/chosen": -0.9532960653305054, "logits/rejected": -0.9478763341903687, "logps/chosen": -83.55834197998047, "logps/rejected": -121.30717468261719, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.43583202362060547, "rewards/margins": 27.649715423583984, "rewards/rejected": -28.085546493530273, "step": 5540 }, { "epoch": 2.53, "learning_rate": 5.17503805175038e-08, "logits/chosen": -1.0004901885986328, "logits/rejected": -0.9230673909187317, "logps/chosen": -87.94818878173828, "logps/rejected": -127.7512435913086, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.18584835529327393, "rewards/margins": 30.41741371154785, "rewards/rejected": -30.603260040283203, "step": 5550 }, { "epoch": 2.54, "learning_rate": 5.1243023845763564e-08, "logits/chosen": -1.0533897876739502, "logits/rejected": -1.0084376335144043, "logps/chosen": -81.72035217285156, "logps/rejected": -127.9518051147461, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.35208114981651306, "rewards/margins": 30.244159698486328, "rewards/rejected": -30.596240997314453, "step": 5560 }, { "epoch": 2.54, "learning_rate": 5.073566717402333e-08, "logits/chosen": -1.0520977973937988, "logits/rejected": -0.986443817615509, "logps/chosen": -84.8553695678711, "logps/rejected": -126.15667724609375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.4004618525505066, "rewards/margins": 29.2807559967041, "rewards/rejected": -29.68121910095215, "step": 5570 }, { "epoch": 2.55, "learning_rate": 5.02283105022831e-08, "logits/chosen": -0.9998930096626282, "logits/rejected": -1.0011494159698486, "logps/chosen": -93.25764465332031, "logps/rejected": -127.81201171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.173553228378296, "rewards/margins": 28.28702163696289, "rewards/rejected": -29.4605712890625, "step": 5580 }, { "epoch": 2.55, "learning_rate": 4.9720953830542864e-08, "logits/chosen": -0.9547233581542969, "logits/rejected": -0.9245736002922058, "logps/chosen": -91.9867935180664, "logps/rejected": -126.07781982421875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.9789550304412842, "rewards/margins": 28.35225486755371, "rewards/rejected": -30.331212997436523, "step": 5590 }, { "epoch": 2.56, "learning_rate": 4.921359715880263e-08, "logits/chosen": -0.9986736178398132, "logits/rejected": -0.972088634967804, "logps/chosen": -82.64098358154297, "logps/rejected": -122.1820297241211, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.37886843085289, "rewards/margins": 28.860065460205078, "rewards/rejected": -28.4811954498291, "step": 5600 }, { "epoch": 2.56, "eval_logits/chosen": -1.0180690288543701, "eval_logits/rejected": -0.9321611523628235, "eval_logps/chosen": -87.17377471923828, "eval_logps/rejected": -123.89130401611328, "eval_loss": 0.006997702177613974, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.8179590106010437, "eval_rewards/margins": 28.69698715209961, "eval_rewards/rejected": -29.514951705932617, "eval_runtime": 80.0253, "eval_samples_per_second": 35.764, "eval_steps_per_second": 2.237, "step": 5600 }, { "epoch": 2.56, "learning_rate": 4.87062404870624e-08, "logits/chosen": -1.0039026737213135, "logits/rejected": -0.936947226524353, "logps/chosen": -86.39691925048828, "logps/rejected": -124.20637512207031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.3275587558746338, "rewards/margins": 28.06853675842285, "rewards/rejected": -29.396093368530273, "step": 5610 }, { "epoch": 2.57, "learning_rate": 4.8198883815322164e-08, "logits/chosen": -1.0270158052444458, "logits/rejected": -0.9902611970901489, "logps/chosen": -85.97245025634766, "logps/rejected": -129.32151794433594, "loss": 0.0066, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1571362018585205, "rewards/margins": 29.470067977905273, "rewards/rejected": -30.6272029876709, "step": 5620 }, { "epoch": 2.57, "learning_rate": 4.769152714358193e-08, "logits/chosen": -0.9748164415359497, "logits/rejected": -0.8806400299072266, "logps/chosen": -84.25230407714844, "logps/rejected": -128.8730926513672, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.3607916831970215, "rewards/margins": 29.30045509338379, "rewards/rejected": -30.661245346069336, "step": 5630 }, { "epoch": 2.57, "learning_rate": 4.71841704718417e-08, "logits/chosen": -0.9845125079154968, "logits/rejected": -0.8729864358901978, "logps/chosen": -85.73805236816406, "logps/rejected": -128.51284790039062, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4684017896652222, "rewards/margins": 28.368051528930664, "rewards/rejected": -29.83645248413086, "step": 5640 }, { "epoch": 2.58, "learning_rate": 4.6676813800101464e-08, "logits/chosen": -1.0719449520111084, "logits/rejected": -0.9801136255264282, "logps/chosen": -90.75955200195312, "logps/rejected": -126.10191345214844, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.1611497402191162, "rewards/margins": 28.947484970092773, "rewards/rejected": -29.1086368560791, "step": 5650 }, { "epoch": 2.58, "learning_rate": 4.616945712836123e-08, "logits/chosen": -0.9902482032775879, "logits/rejected": -0.9737680554389954, "logps/chosen": -88.47955322265625, "logps/rejected": -125.43502044677734, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.6917212605476379, "rewards/margins": 28.979278564453125, "rewards/rejected": -29.671005249023438, "step": 5660 }, { "epoch": 2.59, "learning_rate": 4.5662100456621e-08, "logits/chosen": -1.062365174293518, "logits/rejected": -0.9917305111885071, "logps/chosen": -85.7381820678711, "logps/rejected": -129.78860473632812, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.7698432207107544, "rewards/margins": 29.835586547851562, "rewards/rejected": -30.605432510375977, "step": 5670 }, { "epoch": 2.59, "learning_rate": 4.5154743784880764e-08, "logits/chosen": -0.9991733431816101, "logits/rejected": -0.9434337615966797, "logps/chosen": -79.69161224365234, "logps/rejected": -126.32232666015625, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.819229245185852, "rewards/margins": 28.9149112701416, "rewards/rejected": -29.734134674072266, "step": 5680 }, { "epoch": 2.6, "learning_rate": 4.464738711314053e-08, "logits/chosen": -0.9527062177658081, "logits/rejected": -0.9070295095443726, "logps/chosen": -95.05113983154297, "logps/rejected": -129.95516967773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3489150106906891, "rewards/margins": 29.364145278930664, "rewards/rejected": -29.713058471679688, "step": 5690 }, { "epoch": 2.6, "learning_rate": 4.41400304414003e-08, "logits/chosen": -1.0130369663238525, "logits/rejected": -0.9433294534683228, "logps/chosen": -94.64827728271484, "logps/rejected": -125.37615966796875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.025121713057160378, "rewards/margins": 29.1879940032959, "rewards/rejected": -29.21311378479004, "step": 5700 }, { "epoch": 2.6, "eval_logits/chosen": -1.0215510129928589, "eval_logits/rejected": -0.9332442283630371, "eval_logps/chosen": -86.77206420898438, "eval_logps/rejected": -123.67755126953125, "eval_loss": 0.007124757394194603, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.6171061992645264, "eval_rewards/margins": 28.790964126586914, "eval_rewards/rejected": -29.408065795898438, "eval_runtime": 62.4779, "eval_samples_per_second": 45.808, "eval_steps_per_second": 2.865, "step": 5700 }, { "epoch": 2.61, "learning_rate": 4.3632673769660064e-08, "logits/chosen": -1.0324370861053467, "logits/rejected": -0.9537965059280396, "logps/chosen": -88.25526428222656, "logps/rejected": -127.1012191772461, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.0396380424499512, "rewards/margins": 29.129968643188477, "rewards/rejected": -30.169607162475586, "step": 5710 }, { "epoch": 2.61, "learning_rate": 4.312531709791983e-08, "logits/chosen": -1.0366867780685425, "logits/rejected": -0.9741948246955872, "logps/chosen": -87.47926330566406, "logps/rejected": -128.46270751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.316776990890503, "rewards/margins": 28.57059097290039, "rewards/rejected": -29.887371063232422, "step": 5720 }, { "epoch": 2.62, "learning_rate": 4.26179604261796e-08, "logits/chosen": -1.007524013519287, "logits/rejected": -0.9985057711601257, "logps/chosen": -85.72447204589844, "logps/rejected": -124.79058837890625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.7779345512390137, "rewards/margins": 28.543148040771484, "rewards/rejected": -29.32108497619629, "step": 5730 }, { "epoch": 2.62, "learning_rate": 4.2110603754439363e-08, "logits/chosen": -1.085681676864624, "logits/rejected": -1.0228497982025146, "logps/chosen": -85.09532165527344, "logps/rejected": -127.75041198730469, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.867802619934082, "rewards/margins": 29.682071685791016, "rewards/rejected": -30.54987144470215, "step": 5740 }, { "epoch": 2.62, "learning_rate": 4.160324708269913e-08, "logits/chosen": -1.042829155921936, "logits/rejected": -0.9523305892944336, "logps/chosen": -86.57728576660156, "logps/rejected": -131.00987243652344, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.23421511054039001, "rewards/margins": 29.884082794189453, "rewards/rejected": -29.64986801147461, "step": 5750 }, { "epoch": 2.63, "learning_rate": 4.10958904109589e-08, "logits/chosen": -1.0182812213897705, "logits/rejected": -0.9332420229911804, "logps/chosen": -78.36054992675781, "logps/rejected": -124.4658432006836, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.4626803994178772, "rewards/margins": 28.748886108398438, "rewards/rejected": -28.286205291748047, "step": 5760 }, { "epoch": 2.63, "learning_rate": 4.0588533739218663e-08, "logits/chosen": -0.9937347173690796, "logits/rejected": -0.9194048643112183, "logps/chosen": -79.75376892089844, "logps/rejected": -121.32899475097656, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.4256053864955902, "rewards/margins": 30.777950286865234, "rewards/rejected": -30.35235023498535, "step": 5770 }, { "epoch": 2.64, "learning_rate": 4.0081177067478437e-08, "logits/chosen": -0.9398347735404968, "logits/rejected": -0.9001420736312866, "logps/chosen": -88.94413757324219, "logps/rejected": -129.3289794921875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.4428245425224304, "rewards/margins": 29.961593627929688, "rewards/rejected": -30.4044189453125, "step": 5780 }, { "epoch": 2.64, "learning_rate": 3.95738203957382e-08, "logits/chosen": -1.0133349895477295, "logits/rejected": -0.9825404286384583, "logps/chosen": -85.88642120361328, "logps/rejected": -125.39090728759766, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0771163702011108, "rewards/margins": 29.060375213623047, "rewards/rejected": -30.137493133544922, "step": 5790 }, { "epoch": 2.65, "learning_rate": 3.906646372399797e-08, "logits/chosen": -0.9934064745903015, "logits/rejected": -0.8903753161430359, "logps/chosen": -89.25447082519531, "logps/rejected": -130.34597778320312, "loss": 0.0054, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3697949647903442, "rewards/margins": 29.40945816040039, "rewards/rejected": -30.779254913330078, "step": 5800 }, { "epoch": 2.65, "eval_logits/chosen": -1.0273491144180298, "eval_logits/rejected": -0.9405400156974792, "eval_logps/chosen": -86.86610412597656, "eval_logps/rejected": -124.399169921875, "eval_loss": 0.007130247540771961, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.6641231775283813, "eval_rewards/margins": 29.104759216308594, "eval_rewards/rejected": -29.768884658813477, "eval_runtime": 62.9523, "eval_samples_per_second": 45.463, "eval_steps_per_second": 2.843, "step": 5800 }, { "epoch": 2.65, "learning_rate": 3.8559107052257736e-08, "logits/chosen": -1.1017916202545166, "logits/rejected": -1.0160459280014038, "logps/chosen": -83.09812927246094, "logps/rejected": -123.5434799194336, "loss": 0.0033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3102543354034424, "rewards/margins": 28.675018310546875, "rewards/rejected": -29.985271453857422, "step": 5810 }, { "epoch": 2.66, "learning_rate": 3.80517503805175e-08, "logits/chosen": -1.0480397939682007, "logits/rejected": -0.9980312585830688, "logps/chosen": -86.56658935546875, "logps/rejected": -126.96297454833984, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2892673015594482, "rewards/margins": 28.608530044555664, "rewards/rejected": -29.89779281616211, "step": 5820 }, { "epoch": 2.66, "learning_rate": 3.754439370877727e-08, "logits/chosen": -1.0648337602615356, "logits/rejected": -1.0143353939056396, "logps/chosen": -91.41023254394531, "logps/rejected": -133.6913299560547, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9324867129325867, "rewards/margins": 30.4281005859375, "rewards/rejected": -31.3605899810791, "step": 5830 }, { "epoch": 2.67, "learning_rate": 3.7037037037037036e-08, "logits/chosen": -1.0235826969146729, "logits/rejected": -0.9797852635383606, "logps/chosen": -84.83919525146484, "logps/rejected": -126.71681213378906, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.3446992635726929, "rewards/margins": 29.284387588500977, "rewards/rejected": -30.629085540771484, "step": 5840 }, { "epoch": 2.67, "learning_rate": 3.65296803652968e-08, "logits/chosen": -1.0963352918624878, "logits/rejected": -1.0587340593338013, "logps/chosen": -88.24998474121094, "logps/rejected": -126.11114501953125, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -1.344663381576538, "rewards/margins": 27.544784545898438, "rewards/rejected": -28.889450073242188, "step": 5850 }, { "epoch": 2.67, "learning_rate": 3.602232369355657e-08, "logits/chosen": -1.034896969795227, "logits/rejected": -0.949033260345459, "logps/chosen": -86.25189208984375, "logps/rejected": -129.3499298095703, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.39351123571395874, "rewards/margins": 31.2337589263916, "rewards/rejected": -30.84025001525879, "step": 5860 }, { "epoch": 2.68, "learning_rate": 3.5514967021816336e-08, "logits/chosen": -1.0709892511367798, "logits/rejected": -1.0646283626556396, "logps/chosen": -88.88494873046875, "logps/rejected": -126.898193359375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.5309356451034546, "rewards/margins": 29.396169662475586, "rewards/rejected": -29.927108764648438, "step": 5870 }, { "epoch": 2.68, "learning_rate": 3.50076103500761e-08, "logits/chosen": -1.041032314300537, "logits/rejected": -0.947468101978302, "logps/chosen": -85.87740325927734, "logps/rejected": -125.021240234375, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.40017738938331604, "rewards/margins": 28.514108657836914, "rewards/rejected": -28.91428565979004, "step": 5880 }, { "epoch": 2.69, "learning_rate": 3.450025367833587e-08, "logits/chosen": -0.9914347529411316, "logits/rejected": -0.9079867601394653, "logps/chosen": -90.79552459716797, "logps/rejected": -127.93312072753906, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.5817571878433228, "rewards/margins": 28.376272201538086, "rewards/rejected": -28.95802879333496, "step": 5890 }, { "epoch": 2.69, "learning_rate": 3.3992897006595636e-08, "logits/chosen": -1.082043170928955, "logits/rejected": -1.057128667831421, "logps/chosen": -88.99037170410156, "logps/rejected": -129.789794921875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.6108391284942627, "rewards/margins": 28.299758911132812, "rewards/rejected": -29.910594940185547, "step": 5900 }, { "epoch": 2.69, "eval_logits/chosen": -1.031474232673645, "eval_logits/rejected": -0.9436381459236145, "eval_logps/chosen": -87.154296875, "eval_logps/rejected": -124.52259826660156, "eval_loss": 0.006726478226482868, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.808218240737915, "eval_rewards/margins": 29.022377014160156, "eval_rewards/rejected": -29.830596923828125, "eval_runtime": 65.8919, "eval_samples_per_second": 43.435, "eval_steps_per_second": 2.717, "step": 5900 }, { "epoch": 2.7, "learning_rate": 3.34855403348554e-08, "logits/chosen": -1.058807134628296, "logits/rejected": -1.002246379852295, "logps/chosen": -94.60523986816406, "logps/rejected": -126.1142349243164, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.3425036668777466, "rewards/margins": 27.65673828125, "rewards/rejected": -28.999242782592773, "step": 5910 }, { "epoch": 2.7, "learning_rate": 3.297818366311517e-08, "logits/chosen": -1.0659196376800537, "logits/rejected": -1.02195143699646, "logps/chosen": -83.831787109375, "logps/rejected": -127.63771057128906, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.325688362121582, "rewards/margins": 28.6981258392334, "rewards/rejected": -30.023815155029297, "step": 5920 }, { "epoch": 2.71, "learning_rate": 3.2470826991374936e-08, "logits/chosen": -1.0907622575759888, "logits/rejected": -1.0057451725006104, "logps/chosen": -89.2253646850586, "logps/rejected": -127.18135833740234, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8062092065811157, "rewards/margins": 28.803585052490234, "rewards/rejected": -29.609792709350586, "step": 5930 }, { "epoch": 2.71, "learning_rate": 3.19634703196347e-08, "logits/chosen": -1.1025313138961792, "logits/rejected": -1.0713751316070557, "logps/chosen": -91.10670471191406, "logps/rejected": -132.5358428955078, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.7590230703353882, "rewards/margins": 29.9611873626709, "rewards/rejected": -31.72021484375, "step": 5940 }, { "epoch": 2.72, "learning_rate": 3.145611364789447e-08, "logits/chosen": -0.9489334225654602, "logits/rejected": -0.9489434957504272, "logps/chosen": -92.52510070800781, "logps/rejected": -127.75334167480469, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.572442889213562, "rewards/margins": 29.219165802001953, "rewards/rejected": -29.791606903076172, "step": 5950 }, { "epoch": 2.72, "learning_rate": 3.0948756976154236e-08, "logits/chosen": -1.0602383613586426, "logits/rejected": -0.9741102457046509, "logps/chosen": -84.59852600097656, "logps/rejected": -125.7697525024414, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.35125207901000977, "rewards/margins": 29.04140281677246, "rewards/rejected": -29.392658233642578, "step": 5960 }, { "epoch": 2.73, "learning_rate": 3.0441400304414e-08, "logits/chosen": -1.0793386697769165, "logits/rejected": -1.0352563858032227, "logps/chosen": -85.41413116455078, "logps/rejected": -125.89903259277344, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.781488299369812, "rewards/margins": 28.587890625, "rewards/rejected": -29.3693790435791, "step": 5970 }, { "epoch": 2.73, "learning_rate": 2.993404363267377e-08, "logits/chosen": -1.0787484645843506, "logits/rejected": -1.0108954906463623, "logps/chosen": -82.58202362060547, "logps/rejected": -125.72383117675781, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0184836387634277, "rewards/margins": 28.39313316345215, "rewards/rejected": -29.4116153717041, "step": 5980 }, { "epoch": 2.73, "learning_rate": 2.9426686960933532e-08, "logits/chosen": -0.9952011108398438, "logits/rejected": -0.8851292729377747, "logps/chosen": -93.87915802001953, "logps/rejected": -124.56547546386719, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.4423131048679352, "rewards/margins": 28.472192764282227, "rewards/rejected": -28.914505004882812, "step": 5990 }, { "epoch": 2.74, "learning_rate": 2.89193302891933e-08, "logits/chosen": -1.178444504737854, "logits/rejected": -1.1157991886138916, "logps/chosen": -88.11283874511719, "logps/rejected": -130.8851318359375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.23296864330768585, "rewards/margins": 30.756484985351562, "rewards/rejected": -30.989452362060547, "step": 6000 }, { "epoch": 2.74, "eval_logits/chosen": -1.0278129577636719, "eval_logits/rejected": -0.9400880932807922, "eval_logps/chosen": -87.00244140625, "eval_logps/rejected": -124.30833435058594, "eval_loss": 0.006807922385632992, "eval_rewards/accuracies": 0.9888268113136292, "eval_rewards/chosen": -0.732290506362915, "eval_rewards/margins": 28.99117088317871, "eval_rewards/rejected": -29.723461151123047, "eval_runtime": 61.3283, "eval_samples_per_second": 46.667, "eval_steps_per_second": 2.919, "step": 6000 }, { "epoch": 2.74, "learning_rate": 2.8411973617453066e-08, "logits/chosen": -0.9492026567459106, "logits/rejected": -0.9152344465255737, "logps/chosen": -86.2878189086914, "logps/rejected": -126.0578842163086, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.4441192150115967, "rewards/margins": 29.64728355407715, "rewards/rejected": -30.091400146484375, "step": 6010 }, { "epoch": 2.75, "learning_rate": 2.7904616945712832e-08, "logits/chosen": -1.0655428171157837, "logits/rejected": -1.0025355815887451, "logps/chosen": -87.4747085571289, "logps/rejected": -122.6070327758789, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.8353004455566406, "rewards/margins": 28.315927505493164, "rewards/rejected": -29.151226043701172, "step": 6020 }, { "epoch": 2.75, "learning_rate": 2.73972602739726e-08, "logits/chosen": -1.0561573505401611, "logits/rejected": -0.9622231721878052, "logps/chosen": -87.29557037353516, "logps/rejected": -127.05860900878906, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.450141429901123, "rewards/margins": 29.205745697021484, "rewards/rejected": -30.6558895111084, "step": 6030 }, { "epoch": 2.76, "learning_rate": 2.6889903602232366e-08, "logits/chosen": -1.0109049081802368, "logits/rejected": -0.9498085975646973, "logps/chosen": -87.83575439453125, "logps/rejected": -128.5558624267578, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.4376589357852936, "rewards/margins": 31.382431030273438, "rewards/rejected": -30.944774627685547, "step": 6040 }, { "epoch": 2.76, "learning_rate": 2.6382546930492132e-08, "logits/chosen": -1.184468150138855, "logits/rejected": -1.082187294960022, "logps/chosen": -88.23623657226562, "logps/rejected": -130.09461975097656, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.7751424312591553, "rewards/margins": 28.917348861694336, "rewards/rejected": -29.692489624023438, "step": 6050 }, { "epoch": 2.77, "learning_rate": 2.58751902587519e-08, "logits/chosen": -1.104385495185852, "logits/rejected": -1.02403724193573, "logps/chosen": -83.91561126708984, "logps/rejected": -121.2802734375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.0201902873814106, "rewards/margins": 27.695789337158203, "rewards/rejected": -27.71598243713379, "step": 6060 }, { "epoch": 2.77, "learning_rate": 2.5367833587011665e-08, "logits/chosen": -1.1031575202941895, "logits/rejected": -0.990433394908905, "logps/chosen": -95.4950942993164, "logps/rejected": -129.18492126464844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.37409108877182007, "rewards/margins": 29.132125854492188, "rewards/rejected": -29.506216049194336, "step": 6070 }, { "epoch": 2.78, "learning_rate": 2.4860476915271432e-08, "logits/chosen": -1.0707147121429443, "logits/rejected": -1.012056589126587, "logps/chosen": -89.7245101928711, "logps/rejected": -122.63059997558594, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.4195477068424225, "rewards/margins": 29.034103393554688, "rewards/rejected": -28.614553451538086, "step": 6080 }, { "epoch": 2.78, "learning_rate": 2.43531202435312e-08, "logits/chosen": -1.071775197982788, "logits/rejected": -1.0009758472442627, "logps/chosen": -80.49525451660156, "logps/rejected": -124.15095520019531, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.16672919690608978, "rewards/margins": 28.871688842773438, "rewards/rejected": -28.7049617767334, "step": 6090 }, { "epoch": 2.78, "learning_rate": 2.3845763571790965e-08, "logits/chosen": -1.0576212406158447, "logits/rejected": -1.0005748271942139, "logps/chosen": -91.83822631835938, "logps/rejected": -124.02302551269531, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.2521924078464508, "rewards/margins": 28.721765518188477, "rewards/rejected": -28.973957061767578, "step": 6100 }, { "epoch": 2.78, "eval_logits/chosen": -1.0250327587127686, "eval_logits/rejected": -0.9389449954032898, "eval_logps/chosen": -86.68120574951172, "eval_logps/rejected": -123.78533172607422, "eval_loss": 0.006523421499878168, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.5716744661331177, "eval_rewards/margins": 28.890287399291992, "eval_rewards/rejected": -29.461963653564453, "eval_runtime": 84.1574, "eval_samples_per_second": 34.008, "eval_steps_per_second": 2.127, "step": 6100 }, { "epoch": 2.79, "learning_rate": 2.3338406900050732e-08, "logits/chosen": -0.916157066822052, "logits/rejected": -0.8838459253311157, "logps/chosen": -86.09705352783203, "logps/rejected": -124.25962829589844, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.23373942077159882, "rewards/margins": 29.301513671875, "rewards/rejected": -29.535253524780273, "step": 6110 }, { "epoch": 2.79, "learning_rate": 2.28310502283105e-08, "logits/chosen": -1.0872166156768799, "logits/rejected": -0.9581939578056335, "logps/chosen": -87.7667236328125, "logps/rejected": -127.71507263183594, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.22645695507526398, "rewards/margins": 30.267444610595703, "rewards/rejected": -30.04098892211914, "step": 6120 }, { "epoch": 2.8, "learning_rate": 2.2323693556570265e-08, "logits/chosen": -1.0600719451904297, "logits/rejected": -1.0231385231018066, "logps/chosen": -82.3358383178711, "logps/rejected": -123.05574035644531, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.8706923723220825, "rewards/margins": 29.04482650756836, "rewards/rejected": -29.9155216217041, "step": 6130 }, { "epoch": 2.8, "learning_rate": 2.1816336884830032e-08, "logits/chosen": -1.002144694328308, "logits/rejected": -0.9809429049491882, "logps/chosen": -82.87908935546875, "logps/rejected": -122.70130920410156, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.017511676996946335, "rewards/margins": 28.458423614501953, "rewards/rejected": -28.4409122467041, "step": 6140 }, { "epoch": 2.81, "learning_rate": 2.13089802130898e-08, "logits/chosen": -1.0385632514953613, "logits/rejected": -0.9566332697868347, "logps/chosen": -86.96278381347656, "logps/rejected": -128.3975372314453, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.32612985372543335, "rewards/margins": 31.055572509765625, "rewards/rejected": -31.381702423095703, "step": 6150 }, { "epoch": 2.81, "learning_rate": 2.0801623541349565e-08, "logits/chosen": -1.0180633068084717, "logits/rejected": -0.9315141439437866, "logps/chosen": -89.90997314453125, "logps/rejected": -129.4881134033203, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5671930313110352, "rewards/margins": 30.148788452148438, "rewards/rejected": -30.71598243713379, "step": 6160 }, { "epoch": 2.82, "learning_rate": 2.0294266869609332e-08, "logits/chosen": -1.0712960958480835, "logits/rejected": -0.9703865051269531, "logps/chosen": -89.47264862060547, "logps/rejected": -127.07987213134766, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.344783902168274, "rewards/margins": 28.681127548217773, "rewards/rejected": -30.02591323852539, "step": 6170 }, { "epoch": 2.82, "learning_rate": 1.97869101978691e-08, "logits/chosen": -1.0805513858795166, "logits/rejected": -0.9932042956352234, "logps/chosen": -93.64996337890625, "logps/rejected": -127.0558090209961, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.9845650792121887, "rewards/margins": 29.586376190185547, "rewards/rejected": -30.570941925048828, "step": 6180 }, { "epoch": 2.83, "learning_rate": 1.9279553526128868e-08, "logits/chosen": -1.0248457193374634, "logits/rejected": -0.9871330261230469, "logps/chosen": -80.39697265625, "logps/rejected": -125.86643981933594, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4571932852268219, "rewards/margins": 30.0575008392334, "rewards/rejected": -30.514690399169922, "step": 6190 }, { "epoch": 2.83, "learning_rate": 1.8772196854388635e-08, "logits/chosen": -0.9889580011367798, "logits/rejected": -0.9497294425964355, "logps/chosen": -85.86564636230469, "logps/rejected": -126.31917572021484, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.4988800287246704, "rewards/margins": 28.99808120727539, "rewards/rejected": -30.496959686279297, "step": 6200 }, { "epoch": 2.83, "eval_logits/chosen": -1.0244492292404175, "eval_logits/rejected": -0.9368904232978821, "eval_logps/chosen": -86.6476821899414, "eval_logps/rejected": -123.90021514892578, "eval_loss": 0.006625541485846043, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.5549145340919495, "eval_rewards/margins": 28.964487075805664, "eval_rewards/rejected": -29.5194034576416, "eval_runtime": 66.1256, "eval_samples_per_second": 43.281, "eval_steps_per_second": 2.707, "step": 6200 }, { "epoch": 2.83, "learning_rate": 1.82648401826484e-08, "logits/chosen": -1.0492604970932007, "logits/rejected": -1.0105106830596924, "logps/chosen": -85.85210418701172, "logps/rejected": -128.20448303222656, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -1.4999401569366455, "rewards/margins": 29.045568466186523, "rewards/rejected": -30.54551124572754, "step": 6210 }, { "epoch": 2.84, "learning_rate": 1.7757483510908168e-08, "logits/chosen": -0.9619634747505188, "logits/rejected": -0.9743566513061523, "logps/chosen": -86.10426330566406, "logps/rejected": -130.9729766845703, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.49150973558425903, "rewards/margins": 30.758865356445312, "rewards/rejected": -31.250377655029297, "step": 6220 }, { "epoch": 2.84, "learning_rate": 1.7250126839167935e-08, "logits/chosen": -1.0379936695098877, "logits/rejected": -1.0153452157974243, "logps/chosen": -81.86763000488281, "logps/rejected": -120.93293762207031, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.818130373954773, "rewards/margins": 28.285436630249023, "rewards/rejected": -29.103565216064453, "step": 6230 }, { "epoch": 2.85, "learning_rate": 1.67427701674277e-08, "logits/chosen": -1.0076709985733032, "logits/rejected": -0.9663689732551575, "logps/chosen": -85.59688568115234, "logps/rejected": -125.4538345336914, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.017603278160095215, "rewards/margins": 30.528635025024414, "rewards/rejected": -30.546234130859375, "step": 6240 }, { "epoch": 2.85, "learning_rate": 1.6235413495687468e-08, "logits/chosen": -0.9639765024185181, "logits/rejected": -0.9728061556816101, "logps/chosen": -87.17933654785156, "logps/rejected": -126.87109375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.7064870595932007, "rewards/margins": 28.171640396118164, "rewards/rejected": -28.878128051757812, "step": 6250 }, { "epoch": 2.86, "learning_rate": 1.5728056823947235e-08, "logits/chosen": -0.9334337115287781, "logits/rejected": -0.8631385564804077, "logps/chosen": -83.15779876708984, "logps/rejected": -122.5430908203125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3862573504447937, "rewards/margins": 28.38237953186035, "rewards/rejected": -28.76863670349121, "step": 6260 }, { "epoch": 2.86, "learning_rate": 1.5220700152207e-08, "logits/chosen": -1.1374794244766235, "logits/rejected": -1.0338224172592163, "logps/chosen": -86.21543884277344, "logps/rejected": -129.5160369873047, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8715160489082336, "rewards/margins": 30.266857147216797, "rewards/rejected": -31.138376235961914, "step": 6270 }, { "epoch": 2.87, "learning_rate": 1.4713343480466766e-08, "logits/chosen": -1.0501601696014404, "logits/rejected": -1.0120677947998047, "logps/chosen": -93.43696594238281, "logps/rejected": -127.74129486083984, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.8736766576766968, "rewards/margins": 27.128524780273438, "rewards/rejected": -29.0022029876709, "step": 6280 }, { "epoch": 2.87, "learning_rate": 1.4205986808726533e-08, "logits/chosen": -0.9635306596755981, "logits/rejected": -0.9733031392097473, "logps/chosen": -86.1040267944336, "logps/rejected": -128.07809448242188, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.6703550219535828, "rewards/margins": 28.874740600585938, "rewards/rejected": -29.545095443725586, "step": 6290 }, { "epoch": 2.88, "learning_rate": 1.36986301369863e-08, "logits/chosen": -1.0834633111953735, "logits/rejected": -1.0034466981887817, "logps/chosen": -82.86973571777344, "logps/rejected": -129.51971435546875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.29943275451660156, "rewards/margins": 30.45029067993164, "rewards/rejected": -30.749719619750977, "step": 6300 }, { "epoch": 2.88, "eval_logits/chosen": -1.0220108032226562, "eval_logits/rejected": -0.9361612200737, "eval_logps/chosen": -86.4330062866211, "eval_logps/rejected": -123.60254669189453, "eval_loss": 0.006502318661659956, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.44757336378097534, "eval_rewards/margins": 28.922998428344727, "eval_rewards/rejected": -29.370569229125977, "eval_runtime": 64.6819, "eval_samples_per_second": 44.247, "eval_steps_per_second": 2.767, "step": 6300 }, { "epoch": 2.88, "learning_rate": 1.3191273465246066e-08, "logits/chosen": -1.0733602046966553, "logits/rejected": -1.0072834491729736, "logps/chosen": -86.40800476074219, "logps/rejected": -119.3967514038086, "loss": 0.0022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4470507502555847, "rewards/margins": 27.95053482055664, "rewards/rejected": -28.397586822509766, "step": 6310 }, { "epoch": 2.88, "learning_rate": 1.2683916793505833e-08, "logits/chosen": -1.060889482498169, "logits/rejected": -1.0003650188446045, "logps/chosen": -89.27617645263672, "logps/rejected": -126.93253326416016, "loss": 0.0036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6875644326210022, "rewards/margins": 28.4171142578125, "rewards/rejected": -29.10468101501465, "step": 6320 }, { "epoch": 2.89, "learning_rate": 1.21765601217656e-08, "logits/chosen": -0.9854904413223267, "logits/rejected": -0.9091927409172058, "logps/chosen": -85.25365447998047, "logps/rejected": -128.0735321044922, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.08507101237773895, "rewards/margins": 31.455326080322266, "rewards/rejected": -31.370258331298828, "step": 6330 }, { "epoch": 2.89, "learning_rate": 1.1669203450025366e-08, "logits/chosen": -1.0819941759109497, "logits/rejected": -1.0338475704193115, "logps/chosen": -90.92166900634766, "logps/rejected": -130.71810913085938, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.01656208001077175, "rewards/margins": 29.559118270874023, "rewards/rejected": -29.575679779052734, "step": 6340 }, { "epoch": 2.9, "learning_rate": 1.1161846778285133e-08, "logits/chosen": -1.0656547546386719, "logits/rejected": -1.037102222442627, "logps/chosen": -86.26807403564453, "logps/rejected": -124.87815856933594, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5916534662246704, "rewards/margins": 29.280879974365234, "rewards/rejected": -29.87253189086914, "step": 6350 }, { "epoch": 2.9, "learning_rate": 1.06544901065449e-08, "logits/chosen": -1.075899362564087, "logits/rejected": -0.9635077714920044, "logps/chosen": -85.9807357788086, "logps/rejected": -128.5865478515625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.5440212488174438, "rewards/margins": 30.536121368408203, "rewards/rejected": -31.08014488220215, "step": 6360 }, { "epoch": 2.91, "learning_rate": 1.0147133434804666e-08, "logits/chosen": -0.891313374042511, "logits/rejected": -0.8621894121170044, "logps/chosen": -88.86170196533203, "logps/rejected": -127.87739562988281, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.10790462791919708, "rewards/margins": 28.5927677154541, "rewards/rejected": -28.700672149658203, "step": 6370 }, { "epoch": 2.91, "learning_rate": 9.639776763064434e-09, "logits/chosen": -0.9837745428085327, "logits/rejected": -0.9746279716491699, "logps/chosen": -83.23785400390625, "logps/rejected": -121.79039001464844, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7958210110664368, "rewards/margins": 28.028661727905273, "rewards/rejected": -28.82448387145996, "step": 6380 }, { "epoch": 2.92, "learning_rate": 9.1324200913242e-09, "logits/chosen": -0.9626250267028809, "logits/rejected": -0.9271273612976074, "logps/chosen": -86.17671203613281, "logps/rejected": -121.7667236328125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.6893644332885742, "rewards/margins": 28.271896362304688, "rewards/rejected": -28.961261749267578, "step": 6390 }, { "epoch": 2.92, "learning_rate": 8.625063419583967e-09, "logits/chosen": -1.0479789972305298, "logits/rejected": -0.9923169016838074, "logps/chosen": -86.66059112548828, "logps/rejected": -129.21859741210938, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6069229245185852, "rewards/margins": 29.50791358947754, "rewards/rejected": -30.114837646484375, "step": 6400 }, { "epoch": 2.92, "eval_logits/chosen": -1.0255866050720215, "eval_logits/rejected": -0.939497709274292, "eval_logps/chosen": -86.53129577636719, "eval_logps/rejected": -123.90242767333984, "eval_loss": 0.006593942176550627, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.4967171549797058, "eval_rewards/margins": 29.0238037109375, "eval_rewards/rejected": -29.520519256591797, "eval_runtime": 75.6633, "eval_samples_per_second": 37.825, "eval_steps_per_second": 2.366, "step": 6400 }, { "epoch": 2.93, "learning_rate": 8.117706747843734e-09, "logits/chosen": -0.9005545377731323, "logits/rejected": -0.8726035952568054, "logps/chosen": -88.71814727783203, "logps/rejected": -127.66644287109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.0038230419158936, "rewards/margins": 28.876317977905273, "rewards/rejected": -29.880138397216797, "step": 6410 }, { "epoch": 2.93, "learning_rate": 7.6103500761035e-09, "logits/chosen": -1.091305136680603, "logits/rejected": -1.033553123474121, "logps/chosen": -85.39485931396484, "logps/rejected": -127.29072570800781, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.2828121781349182, "rewards/margins": 29.81418228149414, "rewards/rejected": -30.096996307373047, "step": 6420 }, { "epoch": 2.94, "learning_rate": 7.1029934043632664e-09, "logits/chosen": -1.0751222372055054, "logits/rejected": -0.9772630929946899, "logps/chosen": -84.76583862304688, "logps/rejected": -130.1298065185547, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.2620498836040497, "rewards/margins": 30.05381202697754, "rewards/rejected": -30.315860748291016, "step": 6430 }, { "epoch": 2.94, "learning_rate": 6.595636732623033e-09, "logits/chosen": -1.0256164073944092, "logits/rejected": -0.9431262016296387, "logps/chosen": -87.59001922607422, "logps/rejected": -128.95947265625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.13797995448112488, "rewards/margins": 29.917160034179688, "rewards/rejected": -30.05513572692871, "step": 6440 }, { "epoch": 2.94, "learning_rate": 6.0882800608828e-09, "logits/chosen": -1.0374404191970825, "logits/rejected": -0.9506736993789673, "logps/chosen": -80.81979370117188, "logps/rejected": -125.56309509277344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.40483197569847107, "rewards/margins": 29.041240692138672, "rewards/rejected": -29.44607162475586, "step": 6450 }, { "epoch": 2.95, "learning_rate": 5.580923389142566e-09, "logits/chosen": -0.869223415851593, "logits/rejected": -0.8867238163948059, "logps/chosen": -88.09699249267578, "logps/rejected": -126.71589660644531, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8291648626327515, "rewards/margins": 29.939502716064453, "rewards/rejected": -29.110332489013672, "step": 6460 }, { "epoch": 2.95, "learning_rate": 5.073566717402333e-09, "logits/chosen": -1.019307255744934, "logits/rejected": -1.0058377981185913, "logps/chosen": -98.96701049804688, "logps/rejected": -124.1956558227539, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8037646412849426, "rewards/margins": 28.398019790649414, "rewards/rejected": -29.2017879486084, "step": 6470 }, { "epoch": 2.96, "learning_rate": 4.5662100456621e-09, "logits/chosen": -0.96232670545578, "logits/rejected": -0.9179836511611938, "logps/chosen": -86.69127655029297, "logps/rejected": -123.1005859375, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.9831452369689941, "rewards/margins": 29.9154109954834, "rewards/rejected": -28.932266235351562, "step": 6480 }, { "epoch": 2.96, "learning_rate": 4.058853373921867e-09, "logits/chosen": -1.0222089290618896, "logits/rejected": -0.8847758173942566, "logps/chosen": -89.50880432128906, "logps/rejected": -125.39402770996094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.06352037191390991, "rewards/margins": 30.433147430419922, "rewards/rejected": -30.49666404724121, "step": 6490 }, { "epoch": 2.97, "learning_rate": 3.5514967021816332e-09, "logits/chosen": -1.0513429641723633, "logits/rejected": -0.9981459379196167, "logps/chosen": -88.95649719238281, "logps/rejected": -126.06563568115234, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7696861028671265, "rewards/margins": 28.481319427490234, "rewards/rejected": -29.251007080078125, "step": 6500 }, { "epoch": 2.97, "eval_logits/chosen": -1.0240223407745361, "eval_logits/rejected": -0.9381014108657837, "eval_logps/chosen": -86.47101593017578, "eval_logps/rejected": -123.78819274902344, "eval_loss": 0.006529896054416895, "eval_rewards/accuracies": 0.9916201233863831, "eval_rewards/chosen": -0.46657702326774597, "eval_rewards/margins": 28.996816635131836, "eval_rewards/rejected": -29.463396072387695, "eval_runtime": 78.27, "eval_samples_per_second": 36.566, "eval_steps_per_second": 2.287, "step": 6500 }, { "epoch": 2.97, "learning_rate": 3.0441400304414e-09, "logits/chosen": -1.083836317062378, "logits/rejected": -1.0287898778915405, "logps/chosen": -90.65104675292969, "logps/rejected": -129.10536193847656, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.5575191974639893, "rewards/margins": 29.243595123291016, "rewards/rejected": -30.80111312866211, "step": 6510 }, { "epoch": 2.98, "learning_rate": 2.5367833587011665e-09, "logits/chosen": -1.1479265689849854, "logits/rejected": -1.0269317626953125, "logps/chosen": -92.42093658447266, "logps/rejected": -131.41064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.4860418438911438, "rewards/margins": 30.6885986328125, "rewards/rejected": -31.174636840820312, "step": 6520 }, { "epoch": 2.98, "learning_rate": 2.0294266869609335e-09, "logits/chosen": -0.9470345377922058, "logits/rejected": -0.8561986684799194, "logps/chosen": -85.6261978149414, "logps/rejected": -127.69779968261719, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3518216609954834, "rewards/margins": 28.512975692749023, "rewards/rejected": -29.864795684814453, "step": 6530 }, { "epoch": 2.99, "learning_rate": 1.5220700152207e-09, "logits/chosen": -0.9782537221908569, "logits/rejected": -0.9774686694145203, "logps/chosen": -83.68141174316406, "logps/rejected": -120.14680480957031, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.29628103971481323, "rewards/margins": 27.92130470275879, "rewards/rejected": -28.21758460998535, "step": 6540 }, { "epoch": 2.99, "learning_rate": 1.0147133434804667e-09, "logits/chosen": -1.084174394607544, "logits/rejected": -0.9952249526977539, "logps/chosen": -89.36202239990234, "logps/rejected": -128.4408721923828, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.7218122482299805, "rewards/margins": 28.947765350341797, "rewards/rejected": -29.669580459594727, "step": 6550 }, { "epoch": 2.99, "learning_rate": 5.073566717402334e-10, "logits/chosen": -1.0538431406021118, "logits/rejected": -0.9888811111450195, "logps/chosen": -87.28624725341797, "logps/rejected": -125.61628723144531, "loss": 0.0011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9487142562866211, "rewards/margins": 29.355825424194336, "rewards/rejected": -30.304540634155273, "step": 6560 }, { "epoch": 3.0, "learning_rate": 0.0, "logits/chosen": -1.054466962814331, "logits/rejected": -0.9624788165092468, "logps/chosen": -85.47309875488281, "logps/rejected": -125.8909683227539, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.8512237668037415, "rewards/margins": 28.772281646728516, "rewards/rejected": -29.623498916625977, "step": 6570 }, { "epoch": 3.0, "step": 6570, "total_flos": 0.0, "train_loss": 0.022793083270943315, "train_runtime": 24920.2972, "train_samples_per_second": 16.878, "train_steps_per_second": 0.264 } ], "logging_steps": 10, "max_steps": 6570, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }