diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10280 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998972954467648, + "eval_steps": 100, + "global_step": 6570, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.5662100456621e-10, + "logits/chosen": -0.5863479375839233, + "logits/rejected": -0.6061025261878967, + "logps/chosen": -79.05304718017578, + "logps/rejected": -63.445465087890625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 4.5662100456621e-09, + "logits/chosen": -0.42927077412605286, + "logits/rejected": -0.4148653745651245, + "logps/chosen": -84.7120590209961, + "logps/rejected": -67.09528350830078, + "loss": 0.6975, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -0.036662034690380096, + "rewards/margins": -0.027731113135814667, + "rewards/rejected": -0.00893092155456543, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 9.1324200913242e-09, + "logits/chosen": -0.3915753960609436, + "logits/rejected": -0.43573087453842163, + "logps/chosen": -89.85740661621094, + "logps/rejected": -68.4244155883789, + "loss": 0.6922, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.019763624295592308, + "rewards/margins": 0.016261756420135498, + "rewards/rejected": 0.0035018683411180973, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 1.36986301369863e-08, + "logits/chosen": -0.40983158349990845, + "logits/rejected": -0.3968455195426941, + "logps/chosen": -85.82371520996094, + "logps/rejected": -68.32268524169922, + "loss": 0.6878, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.008524882607161999, + "rewards/margins": 0.002812635852023959, + "rewards/rejected": 0.005712246987968683, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 1.82648401826484e-08, + "logits/chosen": -0.43874412775039673, + "logits/rejected": -0.4789341390132904, + "logps/chosen": -88.45948791503906, + "logps/rejected": -67.00940704345703, + "loss": 0.689, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02156217023730278, + "rewards/margins": 0.013487410731613636, + "rewards/rejected": 0.008074760437011719, + "step": 40 + }, + { + "epoch": 0.02, + "learning_rate": 2.28310502283105e-08, + "logits/chosen": -0.41604742407798767, + "logits/rejected": -0.48207980394363403, + "logps/chosen": -86.55135345458984, + "logps/rejected": -65.5015640258789, + "loss": 0.6802, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0028099894989281893, + "rewards/margins": 0.012911021709442139, + "rewards/rejected": -0.010101032443344593, + "step": 50 + }, + { + "epoch": 0.03, + "learning_rate": 2.73972602739726e-08, + "logits/chosen": -0.508940577507019, + "logits/rejected": -0.46223893761634827, + "logps/chosen": -90.01718139648438, + "logps/rejected": -69.65950012207031, + "loss": 0.6651, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.04631998389959335, + "rewards/margins": 0.0902792438864708, + "rewards/rejected": -0.04395925998687744, + "step": 60 + }, + { + "epoch": 0.03, + "learning_rate": 3.19634703196347e-08, + "logits/chosen": -0.3910054564476013, + "logits/rejected": -0.4126061797142029, + "logps/chosen": -87.1267318725586, + "logps/rejected": -67.88300323486328, + "loss": 0.6491, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.03842337056994438, + "rewards/margins": 0.10330170392990112, + "rewards/rejected": -0.06487832963466644, + "step": 70 + }, + { + "epoch": 0.04, + "learning_rate": 3.65296803652968e-08, + "logits/chosen": -0.3637865483760834, + "logits/rejected": -0.4041527211666107, + "logps/chosen": -92.4252700805664, + "logps/rejected": -68.47924041748047, + "loss": 0.6122, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.08547542989253998, + "rewards/margins": 0.18988534808158875, + "rewards/rejected": -0.10440991073846817, + "step": 80 + }, + { + "epoch": 0.04, + "learning_rate": 4.10958904109589e-08, + "logits/chosen": -0.3265071511268616, + "logits/rejected": -0.3582015633583069, + "logps/chosen": -79.43329620361328, + "logps/rejected": -63.17741012573242, + "loss": 0.5792, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.08616051822900772, + "rewards/margins": 0.2579037547111511, + "rewards/rejected": -0.171743243932724, + "step": 90 + }, + { + "epoch": 0.05, + "learning_rate": 4.5662100456621e-08, + "logits/chosen": -0.4225758910179138, + "logits/rejected": -0.4386584162712097, + "logps/chosen": -85.61375427246094, + "logps/rejected": -66.69580078125, + "loss": 0.5515, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.09259802103042603, + "rewards/margins": 0.32498809695243835, + "rewards/rejected": -0.23239007592201233, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -0.39840757846832275, + "eval_logits/rejected": -0.4061564803123474, + "eval_logps/chosen": -85.365966796875, + "eval_logps/rejected": -65.37533569335938, + "eval_loss": 0.5419811010360718, + "eval_rewards/accuracies": 0.910614550113678, + "eval_rewards/chosen": 0.08594708889722824, + "eval_rewards/margins": 0.3429102897644043, + "eval_rewards/rejected": -0.25696322321891785, + "eval_runtime": 81.2496, + "eval_samples_per_second": 35.225, + "eval_steps_per_second": 2.203, + "step": 100 + }, + { + "epoch": 0.05, + "learning_rate": 5.02283105022831e-08, + "logits/chosen": -0.46739107370376587, + "logits/rejected": -0.44236382842063904, + "logps/chosen": -79.65380096435547, + "logps/rejected": -62.80085372924805, + "loss": 0.5316, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.06431801617145538, + "rewards/margins": 0.35616832971572876, + "rewards/rejected": -0.29185032844543457, + "step": 110 + }, + { + "epoch": 0.05, + "learning_rate": 5.47945205479452e-08, + "logits/chosen": -0.40276527404785156, + "logits/rejected": -0.4527131915092468, + "logps/chosen": -89.0259780883789, + "logps/rejected": -69.21381378173828, + "loss": 0.4844, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.15527208149433136, + "rewards/margins": 0.5430252552032471, + "rewards/rejected": -0.3877531886100769, + "step": 120 + }, + { + "epoch": 0.06, + "learning_rate": 5.93607305936073e-08, + "logits/chosen": -0.34187182784080505, + "logits/rejected": -0.3704484701156616, + "logps/chosen": -90.00749206542969, + "logps/rejected": -65.06084442138672, + "loss": 0.4142, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.19979539513587952, + "rewards/margins": 0.7884315848350525, + "rewards/rejected": -0.5886362791061401, + "step": 130 + }, + { + "epoch": 0.06, + "learning_rate": 6.39269406392694e-08, + "logits/chosen": -0.4649466872215271, + "logits/rejected": -0.4326017498970032, + "logps/chosen": -86.98945617675781, + "logps/rejected": -68.6103515625, + "loss": 0.3701, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.17062470316886902, + "rewards/margins": 0.8517505526542664, + "rewards/rejected": -0.681125819683075, + "step": 140 + }, + { + "epoch": 0.07, + "learning_rate": 6.84931506849315e-08, + "logits/chosen": -0.45548853278160095, + "logits/rejected": -0.4255827069282532, + "logps/chosen": -84.7039794921875, + "logps/rejected": -66.87324523925781, + "loss": 0.3414, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.2619437873363495, + "rewards/margins": 0.9896998405456543, + "rewards/rejected": -0.7277560830116272, + "step": 150 + }, + { + "epoch": 0.07, + "learning_rate": 7.30593607305936e-08, + "logits/chosen": -0.3764224648475647, + "logits/rejected": -0.4269483685493469, + "logps/chosen": -89.51905822753906, + "logps/rejected": -70.71208953857422, + "loss": 0.3168, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.203881174325943, + "rewards/margins": 1.1547460556030273, + "rewards/rejected": -0.9508647918701172, + "step": 160 + }, + { + "epoch": 0.08, + "learning_rate": 7.76255707762557e-08, + "logits/chosen": -0.31334739923477173, + "logits/rejected": -0.373046338558197, + "logps/chosen": -81.72102355957031, + "logps/rejected": -69.88837432861328, + "loss": 0.3234, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.17189902067184448, + "rewards/margins": 1.1217617988586426, + "rewards/rejected": -0.9498627781867981, + "step": 170 + }, + { + "epoch": 0.08, + "learning_rate": 8.21917808219178e-08, + "logits/chosen": -0.4377995431423187, + "logits/rejected": -0.4540349841117859, + "logps/chosen": -91.31315612792969, + "logps/rejected": -67.61857604980469, + "loss": 0.3018, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.3559116721153259, + "rewards/margins": 1.3531014919281006, + "rewards/rejected": -0.9971898198127747, + "step": 180 + }, + { + "epoch": 0.09, + "learning_rate": 8.67579908675799e-08, + "logits/chosen": -0.5179687738418579, + "logits/rejected": -0.5117358565330505, + "logps/chosen": -84.22309875488281, + "logps/rejected": -69.97932434082031, + "loss": 0.2677, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3408846855163574, + "rewards/margins": 1.411771297454834, + "rewards/rejected": -1.0708866119384766, + "step": 190 + }, + { + "epoch": 0.09, + "learning_rate": 9.1324200913242e-08, + "logits/chosen": -0.40042734146118164, + "logits/rejected": -0.4178311228752136, + "logps/chosen": -85.72370910644531, + "logps/rejected": -68.88258361816406, + "loss": 0.2448, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2792873680591583, + "rewards/margins": 1.6100257635116577, + "rewards/rejected": -1.3307384252548218, + "step": 200 + }, + { + "epoch": 0.09, + "eval_logits/chosen": -0.3937744200229645, + "eval_logits/rejected": -0.39809945225715637, + "eval_logps/chosen": -84.8699722290039, + "eval_logps/rejected": -67.29297637939453, + "eval_loss": 0.23899392783641815, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.33393988013267517, + "eval_rewards/margins": 1.5497231483459473, + "eval_rewards/rejected": -1.2157832384109497, + "eval_runtime": 99.6106, + "eval_samples_per_second": 28.732, + "eval_steps_per_second": 1.797, + "step": 200 + }, + { + "epoch": 0.1, + "learning_rate": 9.58904109589041e-08, + "logits/chosen": -0.4026223123073578, + "logits/rejected": -0.4446091651916504, + "logps/chosen": -79.4525375366211, + "logps/rejected": -65.70721435546875, + "loss": 0.2066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4364851415157318, + "rewards/margins": 1.7379001379013062, + "rewards/rejected": -1.3014150857925415, + "step": 210 + }, + { + "epoch": 0.1, + "learning_rate": 1.004566210045662e-07, + "logits/chosen": -0.3757795989513397, + "logits/rejected": -0.40395841002464294, + "logps/chosen": -80.05278015136719, + "logps/rejected": -67.86895751953125, + "loss": 0.1816, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3973820209503174, + "rewards/margins": 1.9494373798370361, + "rewards/rejected": -1.5520555973052979, + "step": 220 + }, + { + "epoch": 0.1, + "learning_rate": 1.050228310502283e-07, + "logits/chosen": -0.3210656940937042, + "logits/rejected": -0.3856700658798218, + "logps/chosen": -87.6169662475586, + "logps/rejected": -71.29808044433594, + "loss": 0.1496, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3461844027042389, + "rewards/margins": 2.317335605621338, + "rewards/rejected": -1.9711511135101318, + "step": 230 + }, + { + "epoch": 0.11, + "learning_rate": 1.095890410958904e-07, + "logits/chosen": -0.4276762902736664, + "logits/rejected": -0.45350733399391174, + "logps/chosen": -88.98173522949219, + "logps/rejected": -66.96391296386719, + "loss": 0.1329, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5452815294265747, + "rewards/margins": 2.694061517715454, + "rewards/rejected": -2.148780107498169, + "step": 240 + }, + { + "epoch": 0.11, + "learning_rate": 1.141552511415525e-07, + "logits/chosen": -0.4410382807254791, + "logits/rejected": -0.4512443542480469, + "logps/chosen": -88.98338317871094, + "logps/rejected": -74.33209991455078, + "loss": 0.1268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3832108974456787, + "rewards/margins": 2.685126304626465, + "rewards/rejected": -2.301915168762207, + "step": 250 + }, + { + "epoch": 0.12, + "learning_rate": 1.187214611872146e-07, + "logits/chosen": -0.4957438111305237, + "logits/rejected": -0.4765089154243469, + "logps/chosen": -86.44584655761719, + "logps/rejected": -72.58143615722656, + "loss": 0.1242, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5778161883354187, + "rewards/margins": 2.74499249458313, + "rewards/rejected": -2.1671762466430664, + "step": 260 + }, + { + "epoch": 0.12, + "learning_rate": 1.232876712328767e-07, + "logits/chosen": -0.4042617380619049, + "logits/rejected": -0.44205984473228455, + "logps/chosen": -87.01408386230469, + "logps/rejected": -71.92070007324219, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5387381911277771, + "rewards/margins": 2.790922164916992, + "rewards/rejected": -2.2521841526031494, + "step": 270 + }, + { + "epoch": 0.13, + "learning_rate": 1.278538812785388e-07, + "logits/chosen": -0.4417573809623718, + "logits/rejected": -0.4520273804664612, + "logps/chosen": -88.83131408691406, + "logps/rejected": -77.17863464355469, + "loss": 0.097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42587852478027344, + "rewards/margins": 3.2474589347839355, + "rewards/rejected": -2.8215808868408203, + "step": 280 + }, + { + "epoch": 0.13, + "learning_rate": 1.324200913242009e-07, + "logits/chosen": -0.4425846040248871, + "logits/rejected": -0.5012689828872681, + "logps/chosen": -90.25948333740234, + "logps/rejected": -76.37370300292969, + "loss": 0.1002, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.533167839050293, + "rewards/margins": 3.3074212074279785, + "rewards/rejected": -2.7742531299591064, + "step": 290 + }, + { + "epoch": 0.14, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -0.3799470365047455, + "logits/rejected": -0.44799962639808655, + "logps/chosen": -80.90581512451172, + "logps/rejected": -70.25505065917969, + "loss": 0.0948, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6411818265914917, + "rewards/margins": 3.449615478515625, + "rewards/rejected": -2.808433771133423, + "step": 300 + }, + { + "epoch": 0.14, + "eval_logits/chosen": -0.4048069417476654, + "eval_logits/rejected": -0.40349695086479187, + "eval_logps/chosen": -84.36418914794922, + "eval_logps/rejected": -70.42156219482422, + "eval_loss": 0.0937228724360466, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.5868332386016846, + "eval_rewards/margins": 3.3669118881225586, + "eval_rewards/rejected": -2.780078411102295, + "eval_runtime": 95.1355, + "eval_samples_per_second": 30.083, + "eval_steps_per_second": 1.882, + "step": 300 + }, + { + "epoch": 0.14, + "learning_rate": 1.415525114155251e-07, + "logits/chosen": -0.39280638098716736, + "logits/rejected": -0.3995700180530548, + "logps/chosen": -84.13384246826172, + "logps/rejected": -70.02226257324219, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6554551124572754, + "rewards/margins": 3.7848308086395264, + "rewards/rejected": -3.12937593460083, + "step": 310 + }, + { + "epoch": 0.15, + "learning_rate": 1.461187214611872e-07, + "logits/chosen": -0.4027988314628601, + "logits/rejected": -0.43616342544555664, + "logps/chosen": -90.87525939941406, + "logps/rejected": -73.50352478027344, + "loss": 0.0821, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.41767749190330505, + "rewards/margins": 3.4736123085021973, + "rewards/rejected": -3.0559346675872803, + "step": 320 + }, + { + "epoch": 0.15, + "learning_rate": 1.506849315068493e-07, + "logits/chosen": -0.40688735246658325, + "logits/rejected": -0.42999106645584106, + "logps/chosen": -81.19529724121094, + "logps/rejected": -72.04861450195312, + "loss": 0.0833, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9484050869941711, + "rewards/margins": 3.989964246749878, + "rewards/rejected": -3.0415592193603516, + "step": 330 + }, + { + "epoch": 0.16, + "learning_rate": 1.552511415525114e-07, + "logits/chosen": -0.3533805012702942, + "logits/rejected": -0.3621350824832916, + "logps/chosen": -82.51054382324219, + "logps/rejected": -69.80853271484375, + "loss": 0.0881, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.532248854637146, + "rewards/margins": 3.6017870903015137, + "rewards/rejected": -3.0695383548736572, + "step": 340 + }, + { + "epoch": 0.16, + "learning_rate": 1.598173515981735e-07, + "logits/chosen": -0.3965502381324768, + "logits/rejected": -0.42704907059669495, + "logps/chosen": -79.4199447631836, + "logps/rejected": -74.82305145263672, + "loss": 0.0793, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6175357103347778, + "rewards/margins": 4.187249660491943, + "rewards/rejected": -3.569714307785034, + "step": 350 + }, + { + "epoch": 0.16, + "learning_rate": 1.643835616438356e-07, + "logits/chosen": -0.40762096643447876, + "logits/rejected": -0.41946473717689514, + "logps/chosen": -82.0754623413086, + "logps/rejected": -73.5984878540039, + "loss": 0.0705, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5647357106208801, + "rewards/margins": 3.6981327533721924, + "rewards/rejected": -3.133397102355957, + "step": 360 + }, + { + "epoch": 0.17, + "learning_rate": 1.689497716894977e-07, + "logits/chosen": -0.42972856760025024, + "logits/rejected": -0.44596537947654724, + "logps/chosen": -86.44332122802734, + "logps/rejected": -69.35662841796875, + "loss": 0.0753, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8704233169555664, + "rewards/margins": 3.9212913513183594, + "rewards/rejected": -3.050868511199951, + "step": 370 + }, + { + "epoch": 0.17, + "learning_rate": 1.735159817351598e-07, + "logits/chosen": -0.39177441596984863, + "logits/rejected": -0.4080818295478821, + "logps/chosen": -86.29408264160156, + "logps/rejected": -72.41487884521484, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6717110872268677, + "rewards/margins": 4.34299373626709, + "rewards/rejected": -3.6712818145751953, + "step": 380 + }, + { + "epoch": 0.18, + "learning_rate": 1.780821917808219e-07, + "logits/chosen": -0.40621963143348694, + "logits/rejected": -0.43334826827049255, + "logps/chosen": -83.55781555175781, + "logps/rejected": -73.66390991210938, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5702416896820068, + "rewards/margins": 4.263453006744385, + "rewards/rejected": -3.693211317062378, + "step": 390 + }, + { + "epoch": 0.18, + "learning_rate": 1.82648401826484e-07, + "logits/chosen": -0.4090951979160309, + "logits/rejected": -0.48223596811294556, + "logps/chosen": -83.9447250366211, + "logps/rejected": -78.8793716430664, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0351506471633911, + "rewards/margins": 5.023129463195801, + "rewards/rejected": -3.9879791736602783, + "step": 400 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -0.4163001775741577, + "eval_logits/rejected": -0.40926501154899597, + "eval_logps/chosen": -83.8791275024414, + "eval_logps/rejected": -72.57203674316406, + "eval_loss": 0.053430840373039246, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.8293697237968445, + "eval_rewards/margins": 4.6846842765808105, + "eval_rewards/rejected": -3.855314254760742, + "eval_runtime": 67.7104, + "eval_samples_per_second": 42.268, + "eval_steps_per_second": 2.644, + "step": 400 + }, + { + "epoch": 0.19, + "learning_rate": 1.872146118721461e-07, + "logits/chosen": -0.3794030547142029, + "logits/rejected": -0.44113850593566895, + "logps/chosen": -93.76518249511719, + "logps/rejected": -75.81021881103516, + "loss": 0.0508, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9566559791564941, + "rewards/margins": 4.865769863128662, + "rewards/rejected": -3.909113645553589, + "step": 410 + }, + { + "epoch": 0.19, + "learning_rate": 1.917808219178082e-07, + "logits/chosen": -0.44859427213668823, + "logits/rejected": -0.4528217911720276, + "logps/chosen": -81.76443481445312, + "logps/rejected": -75.37516784667969, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0190056562423706, + "rewards/margins": 4.9655022621154785, + "rewards/rejected": -3.9464964866638184, + "step": 420 + }, + { + "epoch": 0.2, + "learning_rate": 1.963470319634703e-07, + "logits/chosen": -0.44366198778152466, + "logits/rejected": -0.435803085565567, + "logps/chosen": -80.83895111083984, + "logps/rejected": -75.4388427734375, + "loss": 0.04, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6601322293281555, + "rewards/margins": 4.995715141296387, + "rewards/rejected": -4.335582733154297, + "step": 430 + }, + { + "epoch": 0.2, + "learning_rate": 2.009132420091324e-07, + "logits/chosen": -0.4242371618747711, + "logits/rejected": -0.44815540313720703, + "logps/chosen": -78.79200744628906, + "logps/rejected": -74.7320785522461, + "loss": 0.0372, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9704369306564331, + "rewards/margins": 5.741919994354248, + "rewards/rejected": -4.771483421325684, + "step": 440 + }, + { + "epoch": 0.21, + "learning_rate": 2.054794520547945e-07, + "logits/chosen": -0.37376198172569275, + "logits/rejected": -0.41443461179733276, + "logps/chosen": -87.1550064086914, + "logps/rejected": -76.47234344482422, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1105682849884033, + "rewards/margins": 6.222236156463623, + "rewards/rejected": -5.111668586730957, + "step": 450 + }, + { + "epoch": 0.21, + "learning_rate": 2.100456621004566e-07, + "logits/chosen": -0.46613430976867676, + "logits/rejected": -0.45502910017967224, + "logps/chosen": -88.49858093261719, + "logps/rejected": -83.72077941894531, + "loss": 0.0368, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8836356997489929, + "rewards/margins": 5.70847225189209, + "rewards/rejected": -4.824836730957031, + "step": 460 + }, + { + "epoch": 0.21, + "learning_rate": 2.146118721461187e-07, + "logits/chosen": -0.49693307280540466, + "logits/rejected": -0.49518340826034546, + "logps/chosen": -84.83604431152344, + "logps/rejected": -76.25226593017578, + "loss": 0.0301, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8985893130302429, + "rewards/margins": 5.911735534667969, + "rewards/rejected": -5.01314640045166, + "step": 470 + }, + { + "epoch": 0.22, + "learning_rate": 2.191780821917808e-07, + "logits/chosen": -0.5251117944717407, + "logits/rejected": -0.4963545799255371, + "logps/chosen": -87.3931884765625, + "logps/rejected": -76.91529846191406, + "loss": 0.0288, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.006616473197937, + "rewards/margins": 6.471321105957031, + "rewards/rejected": -5.4647040367126465, + "step": 480 + }, + { + "epoch": 0.22, + "learning_rate": 2.237442922374429e-07, + "logits/chosen": -0.4567469656467438, + "logits/rejected": -0.44530144333839417, + "logps/chosen": -86.60816955566406, + "logps/rejected": -80.15440368652344, + "loss": 0.0259, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0026960372924805, + "rewards/margins": 6.509522438049316, + "rewards/rejected": -5.506826877593994, + "step": 490 + }, + { + "epoch": 0.23, + "learning_rate": 2.28310502283105e-07, + "logits/chosen": -0.44169288873672485, + "logits/rejected": -0.4075329899787903, + "logps/chosen": -85.48939514160156, + "logps/rejected": -78.98982238769531, + "loss": 0.0261, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6804320216178894, + "rewards/margins": 6.771535396575928, + "rewards/rejected": -6.091103553771973, + "step": 500 + }, + { + "epoch": 0.23, + "eval_logits/chosen": -0.4554254114627838, + "eval_logits/rejected": -0.43644845485687256, + "eval_logps/chosen": -83.64783477783203, + "eval_logps/rejected": -77.03020477294922, + "eval_loss": 0.026485837996006012, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.9450124502182007, + "eval_rewards/margins": 7.029412269592285, + "eval_rewards/rejected": -6.084399700164795, + "eval_runtime": 68.3909, + "eval_samples_per_second": 41.848, + "eval_steps_per_second": 2.617, + "step": 500 + }, + { + "epoch": 0.23, + "learning_rate": 2.328767123287671e-07, + "logits/chosen": -0.48590287566185, + "logits/rejected": -0.4845882058143616, + "logps/chosen": -79.791748046875, + "logps/rejected": -75.46503448486328, + "loss": 0.0256, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8771770596504211, + "rewards/margins": 7.567771911621094, + "rewards/rejected": -6.690594673156738, + "step": 510 + }, + { + "epoch": 0.24, + "learning_rate": 2.374429223744292e-07, + "logits/chosen": -0.49845123291015625, + "logits/rejected": -0.5058192610740662, + "logps/chosen": -88.39710998535156, + "logps/rejected": -82.1724624633789, + "loss": 0.021, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8767349123954773, + "rewards/margins": 7.156858921051025, + "rewards/rejected": -6.280123710632324, + "step": 520 + }, + { + "epoch": 0.24, + "learning_rate": 2.420091324200913e-07, + "logits/chosen": -0.5552406311035156, + "logits/rejected": -0.5379841923713684, + "logps/chosen": -90.11488342285156, + "logps/rejected": -83.9296646118164, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1575672626495361, + "rewards/margins": 8.527839660644531, + "rewards/rejected": -7.370272636413574, + "step": 530 + }, + { + "epoch": 0.25, + "learning_rate": 2.465753424657534e-07, + "logits/chosen": -0.4260433614253998, + "logits/rejected": -0.42235612869262695, + "logps/chosen": -86.18304443359375, + "logps/rejected": -83.03050994873047, + "loss": 0.0169, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.153515100479126, + "rewards/margins": 8.45301628112793, + "rewards/rejected": -7.299500942230225, + "step": 540 + }, + { + "epoch": 0.25, + "learning_rate": 2.511415525114155e-07, + "logits/chosen": -0.5101253986358643, + "logits/rejected": -0.5125061869621277, + "logps/chosen": -85.40342712402344, + "logps/rejected": -77.66134643554688, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4489992558956146, + "rewards/margins": 7.528244972229004, + "rewards/rejected": -7.079245567321777, + "step": 550 + }, + { + "epoch": 0.26, + "learning_rate": 2.557077625570776e-07, + "logits/chosen": -0.44849318265914917, + "logits/rejected": -0.5008470416069031, + "logps/chosen": -88.95763397216797, + "logps/rejected": -82.1678237915039, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1119191646575928, + "rewards/margins": 8.655721664428711, + "rewards/rejected": -7.543802738189697, + "step": 560 + }, + { + "epoch": 0.26, + "learning_rate": 2.602739726027397e-07, + "logits/chosen": -0.47934848070144653, + "logits/rejected": -0.5097138285636902, + "logps/chosen": -91.19532012939453, + "logps/rejected": -81.44822692871094, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1905813217163086, + "rewards/margins": 8.602886199951172, + "rewards/rejected": -7.412304878234863, + "step": 570 + }, + { + "epoch": 0.26, + "learning_rate": 2.648401826484018e-07, + "logits/chosen": -0.5011765360832214, + "logits/rejected": -0.4872073233127594, + "logps/chosen": -85.41288757324219, + "logps/rejected": -81.96855163574219, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8738569021224976, + "rewards/margins": 9.319847106933594, + "rewards/rejected": -8.445989608764648, + "step": 580 + }, + { + "epoch": 0.27, + "learning_rate": 2.694063926940639e-07, + "logits/chosen": -0.44528061151504517, + "logits/rejected": -0.5134448409080505, + "logps/chosen": -85.46125793457031, + "logps/rejected": -85.14677429199219, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2151055335998535, + "rewards/margins": 10.124042510986328, + "rewards/rejected": -8.908937454223633, + "step": 590 + }, + { + "epoch": 0.27, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -0.5176888108253479, + "logits/rejected": -0.4848707318305969, + "logps/chosen": -82.91043090820312, + "logps/rejected": -85.11193084716797, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2624995708465576, + "rewards/margins": 10.12977123260498, + "rewards/rejected": -8.86727237701416, + "step": 600 + }, + { + "epoch": 0.27, + "eval_logits/chosen": -0.4719269871711731, + "eval_logits/rejected": -0.44449782371520996, + "eval_logps/chosen": -83.74675750732422, + "eval_logps/rejected": -82.89984130859375, + "eval_loss": 0.01547443587332964, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.8955463171005249, + "eval_rewards/margins": 9.914765357971191, + "eval_rewards/rejected": -9.019220352172852, + "eval_runtime": 80.4475, + "eval_samples_per_second": 35.576, + "eval_steps_per_second": 2.225, + "step": 600 + }, + { + "epoch": 0.28, + "learning_rate": 2.785388127853881e-07, + "logits/chosen": -0.5298361778259277, + "logits/rejected": -0.49106723070144653, + "logps/chosen": -85.64407348632812, + "logps/rejected": -83.2470703125, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8313077092170715, + "rewards/margins": 9.659244537353516, + "rewards/rejected": -8.827936172485352, + "step": 610 + }, + { + "epoch": 0.28, + "learning_rate": 2.831050228310502e-07, + "logits/chosen": -0.5390816926956177, + "logits/rejected": -0.5547928214073181, + "logps/chosen": -83.07210540771484, + "logps/rejected": -86.65785217285156, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5047268867492676, + "rewards/margins": 10.445852279663086, + "rewards/rejected": -9.941125869750977, + "step": 620 + }, + { + "epoch": 0.29, + "learning_rate": 2.876712328767123e-07, + "logits/chosen": -0.5076915621757507, + "logits/rejected": -0.4994226396083832, + "logps/chosen": -88.59370422363281, + "logps/rejected": -89.94892883300781, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8393543362617493, + "rewards/margins": 10.692634582519531, + "rewards/rejected": -9.853279113769531, + "step": 630 + }, + { + "epoch": 0.29, + "learning_rate": 2.922374429223744e-07, + "logits/chosen": -0.530367374420166, + "logits/rejected": -0.521192193031311, + "logps/chosen": -85.75426483154297, + "logps/rejected": -85.36978149414062, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40226420760154724, + "rewards/margins": 9.481393814086914, + "rewards/rejected": -9.079129219055176, + "step": 640 + }, + { + "epoch": 0.3, + "learning_rate": 2.968036529680365e-07, + "logits/chosen": -0.49263906478881836, + "logits/rejected": -0.5358297824859619, + "logps/chosen": -86.92278289794922, + "logps/rejected": -84.6957015991211, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3719327449798584, + "rewards/margins": 12.041804313659668, + "rewards/rejected": -10.66987133026123, + "step": 650 + }, + { + "epoch": 0.3, + "learning_rate": 2.998477929984779e-07, + "logits/chosen": -0.5359422564506531, + "logits/rejected": -0.493899405002594, + "logps/chosen": -83.14149475097656, + "logps/rejected": -88.8863296508789, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.620421290397644, + "rewards/margins": 12.346444129943848, + "rewards/rejected": -11.726022720336914, + "step": 660 + }, + { + "epoch": 0.31, + "learning_rate": 2.993404363267377e-07, + "logits/chosen": -0.48512953519821167, + "logits/rejected": -0.5147516131401062, + "logps/chosen": -88.53128814697266, + "logps/rejected": -90.33158111572266, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5566505193710327, + "rewards/margins": 12.281156539916992, + "rewards/rejected": -11.724506378173828, + "step": 670 + }, + { + "epoch": 0.31, + "learning_rate": 2.9883307965499743e-07, + "logits/chosen": -0.5035023093223572, + "logits/rejected": -0.47232455015182495, + "logps/chosen": -88.98485565185547, + "logps/rejected": -94.34496307373047, + "loss": 0.0109, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7806042432785034, + "rewards/margins": 12.724248886108398, + "rewards/rejected": -11.943646430969238, + "step": 680 + }, + { + "epoch": 0.31, + "learning_rate": 2.983257229832572e-07, + "logits/chosen": -0.46104001998901367, + "logits/rejected": -0.5387547016143799, + "logps/chosen": -87.83756256103516, + "logps/rejected": -85.52259826660156, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1090915203094482, + "rewards/margins": 13.257928848266602, + "rewards/rejected": -12.148837089538574, + "step": 690 + }, + { + "epoch": 0.32, + "learning_rate": 2.9781836631151696e-07, + "logits/chosen": -0.4819292426109314, + "logits/rejected": -0.473407506942749, + "logps/chosen": -86.3482894897461, + "logps/rejected": -93.20619201660156, + "loss": 0.0069, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7059062719345093, + "rewards/margins": 12.420666694641113, + "rewards/rejected": -11.714760780334473, + "step": 700 + }, + { + "epoch": 0.32, + "eval_logits/chosen": -0.49807193875312805, + "eval_logits/rejected": -0.4617076814174652, + "eval_logps/chosen": -84.06249237060547, + "eval_logps/rejected": -89.08558654785156, + "eval_loss": 0.011189465411007404, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.7376802563667297, + "eval_rewards/margins": 12.849767684936523, + "eval_rewards/rejected": -12.112089157104492, + "eval_runtime": 87.676, + "eval_samples_per_second": 32.643, + "eval_steps_per_second": 2.042, + "step": 700 + }, + { + "epoch": 0.32, + "learning_rate": 2.9731100963977676e-07, + "logits/chosen": -0.47739043831825256, + "logits/rejected": -0.47871193289756775, + "logps/chosen": -85.58582305908203, + "logps/rejected": -92.14688873291016, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8560394048690796, + "rewards/margins": 12.940500259399414, + "rewards/rejected": -12.08445930480957, + "step": 710 + }, + { + "epoch": 0.33, + "learning_rate": 2.968036529680365e-07, + "logits/chosen": -0.5277897119522095, + "logits/rejected": -0.5494237542152405, + "logps/chosen": -84.10975646972656, + "logps/rejected": -94.46439361572266, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.170245885848999, + "rewards/margins": 14.76359748840332, + "rewards/rejected": -13.593350410461426, + "step": 720 + }, + { + "epoch": 0.33, + "learning_rate": 2.962962962962963e-07, + "logits/chosen": -0.5545412302017212, + "logits/rejected": -0.4997970461845398, + "logps/chosen": -85.01225280761719, + "logps/rejected": -90.4923324584961, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1222950220108032, + "rewards/margins": 14.033930778503418, + "rewards/rejected": -12.911636352539062, + "step": 730 + }, + { + "epoch": 0.34, + "learning_rate": 2.9578893962455603e-07, + "logits/chosen": -0.47376394271850586, + "logits/rejected": -0.4383009374141693, + "logps/chosen": -82.76625061035156, + "logps/rejected": -88.45748138427734, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7124737501144409, + "rewards/margins": 13.04736614227295, + "rewards/rejected": -12.334892272949219, + "step": 740 + }, + { + "epoch": 0.34, + "learning_rate": 2.952815829528158e-07, + "logits/chosen": -0.5440901517868042, + "logits/rejected": -0.5601977109909058, + "logps/chosen": -83.02261352539062, + "logps/rejected": -93.058349609375, + "loss": 0.0084, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9380642771720886, + "rewards/margins": 14.189043045043945, + "rewards/rejected": -13.250978469848633, + "step": 750 + }, + { + "epoch": 0.35, + "learning_rate": 2.9477422628107556e-07, + "logits/chosen": -0.6197845339775085, + "logits/rejected": -0.5904224514961243, + "logps/chosen": -86.34260559082031, + "logps/rejected": -93.52967834472656, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9487737417221069, + "rewards/margins": 13.869544982910156, + "rewards/rejected": -12.92077350616455, + "step": 760 + }, + { + "epoch": 0.35, + "learning_rate": 2.9426686960933536e-07, + "logits/chosen": -0.5594059228897095, + "logits/rejected": -0.4972243905067444, + "logps/chosen": -92.50401306152344, + "logps/rejected": -92.55013275146484, + "loss": 0.0083, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2346494495868683, + "rewards/margins": 12.848955154418945, + "rewards/rejected": -12.61430549621582, + "step": 770 + }, + { + "epoch": 0.36, + "learning_rate": 2.937595129375951e-07, + "logits/chosen": -0.48909980058670044, + "logits/rejected": -0.5088886022567749, + "logps/chosen": -85.69803619384766, + "logps/rejected": -91.68397521972656, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.01169496774673462, + "rewards/margins": 13.43089771270752, + "rewards/rejected": -13.442593574523926, + "step": 780 + }, + { + "epoch": 0.36, + "learning_rate": 2.932521562658549e-07, + "logits/chosen": -0.5919016599655151, + "logits/rejected": -0.5654195547103882, + "logps/chosen": -85.5023422241211, + "logps/rejected": -95.96412658691406, + "loss": 0.0058, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8735700845718384, + "rewards/margins": 15.024134635925293, + "rewards/rejected": -14.15056324005127, + "step": 790 + }, + { + "epoch": 0.37, + "learning_rate": 2.9274479959411463e-07, + "logits/chosen": -0.570320725440979, + "logits/rejected": -0.5494597554206848, + "logps/chosen": -84.90104675292969, + "logps/rejected": -96.68202209472656, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5182453393936157, + "rewards/margins": 14.31817626953125, + "rewards/rejected": -13.79992961883545, + "step": 800 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -0.5433102250099182, + "eval_logits/rejected": -0.4990071952342987, + "eval_logps/chosen": -84.43854522705078, + "eval_logps/rejected": -93.17755126953125, + "eval_loss": 0.009528527967631817, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.5496511459350586, + "eval_rewards/margins": 14.707724571228027, + "eval_rewards/rejected": -14.158075332641602, + "eval_runtime": 90.1102, + "eval_samples_per_second": 31.761, + "eval_steps_per_second": 1.986, + "step": 800 + }, + { + "epoch": 0.37, + "learning_rate": 2.922374429223744e-07, + "logits/chosen": -0.5402038097381592, + "logits/rejected": -0.5096747279167175, + "logps/chosen": -85.3931655883789, + "logps/rejected": -97.1419677734375, + "loss": 0.0151, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4692815840244293, + "rewards/margins": 15.11835765838623, + "rewards/rejected": -14.649076461791992, + "step": 810 + }, + { + "epoch": 0.37, + "learning_rate": 2.9173008625063416e-07, + "logits/chosen": -0.5429534912109375, + "logits/rejected": -0.528504490852356, + "logps/chosen": -87.19554138183594, + "logps/rejected": -96.91336822509766, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8628616333007812, + "rewards/margins": 16.042449951171875, + "rewards/rejected": -15.179588317871094, + "step": 820 + }, + { + "epoch": 0.38, + "learning_rate": 2.9122272957889396e-07, + "logits/chosen": -0.49800190329551697, + "logits/rejected": -0.5036158561706543, + "logps/chosen": -85.76387023925781, + "logps/rejected": -96.38945770263672, + "loss": 0.0123, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1412431001663208, + "rewards/margins": 14.274526596069336, + "rewards/rejected": -13.133282661437988, + "step": 830 + }, + { + "epoch": 0.38, + "learning_rate": 2.907153729071537e-07, + "logits/chosen": -0.5477937459945679, + "logits/rejected": -0.5407083630561829, + "logps/chosen": -89.34082794189453, + "logps/rejected": -94.71735382080078, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8216226696968079, + "rewards/margins": 15.216272354125977, + "rewards/rejected": -14.394651412963867, + "step": 840 + }, + { + "epoch": 0.39, + "learning_rate": 2.902080162354135e-07, + "logits/chosen": -0.5074556469917297, + "logits/rejected": -0.4516450762748718, + "logps/chosen": -77.45787048339844, + "logps/rejected": -95.98930358886719, + "loss": 0.0105, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.09650325775146484, + "rewards/margins": 14.629781723022461, + "rewards/rejected": -14.726284980773926, + "step": 850 + }, + { + "epoch": 0.39, + "learning_rate": 2.8970065956367323e-07, + "logits/chosen": -0.5568779706954956, + "logits/rejected": -0.5409306287765503, + "logps/chosen": -85.82357025146484, + "logps/rejected": -97.19561767578125, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.310552179813385, + "rewards/margins": 15.414616584777832, + "rewards/rejected": -15.10406494140625, + "step": 860 + }, + { + "epoch": 0.4, + "learning_rate": 2.89193302891933e-07, + "logits/chosen": -0.550707221031189, + "logits/rejected": -0.5625060796737671, + "logps/chosen": -84.54888153076172, + "logps/rejected": -101.38866424560547, + "loss": 0.0099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.05552394315600395, + "rewards/margins": 16.369537353515625, + "rewards/rejected": -16.31401252746582, + "step": 870 + }, + { + "epoch": 0.4, + "learning_rate": 2.8868594622019276e-07, + "logits/chosen": -0.5476843118667603, + "logits/rejected": -0.5468933582305908, + "logps/chosen": -90.43180847167969, + "logps/rejected": -96.9254379272461, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3115522861480713, + "rewards/margins": 15.831393241882324, + "rewards/rejected": -15.519842147827148, + "step": 880 + }, + { + "epoch": 0.41, + "learning_rate": 2.8817858954845256e-07, + "logits/chosen": -0.5169821977615356, + "logits/rejected": -0.5348768830299377, + "logps/chosen": -85.87166595458984, + "logps/rejected": -104.33921813964844, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8090624809265137, + "rewards/margins": 17.356067657470703, + "rewards/rejected": -16.547006607055664, + "step": 890 + }, + { + "epoch": 0.41, + "learning_rate": 2.876712328767123e-07, + "logits/chosen": -0.5918963551521301, + "logits/rejected": -0.5073711276054382, + "logps/chosen": -83.7479019165039, + "logps/rejected": -100.97853088378906, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8310455083847046, + "rewards/margins": 17.748876571655273, + "rewards/rejected": -16.917831420898438, + "step": 900 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -0.5699242353439331, + "eval_logits/rejected": -0.5203292965888977, + "eval_logps/chosen": -84.38159942626953, + "eval_logps/rejected": -97.32843017578125, + "eval_loss": 0.009458034299314022, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.5781266093254089, + "eval_rewards/margins": 16.811635971069336, + "eval_rewards/rejected": -16.23350715637207, + "eval_runtime": 70.2365, + "eval_samples_per_second": 40.748, + "eval_steps_per_second": 2.549, + "step": 900 + }, + { + "epoch": 0.42, + "learning_rate": 2.871638762049721e-07, + "logits/chosen": -0.5940302610397339, + "logits/rejected": -0.5538831353187561, + "logps/chosen": -89.7022705078125, + "logps/rejected": -102.83609771728516, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3309146761894226, + "rewards/margins": 16.62120819091797, + "rewards/rejected": -16.290292739868164, + "step": 910 + }, + { + "epoch": 0.42, + "learning_rate": 2.8665651953323183e-07, + "logits/chosen": -0.5756375789642334, + "logits/rejected": -0.5457393527030945, + "logps/chosen": -85.42708587646484, + "logps/rejected": -99.51619720458984, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2837529182434082, + "rewards/margins": 18.219730377197266, + "rewards/rejected": -16.935976028442383, + "step": 920 + }, + { + "epoch": 0.42, + "learning_rate": 2.861491628614916e-07, + "logits/chosen": -0.6441935300827026, + "logits/rejected": -0.5929199457168579, + "logps/chosen": -83.81559753417969, + "logps/rejected": -97.58605194091797, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1634002923965454, + "rewards/margins": 15.892837524414062, + "rewards/rejected": -15.729436874389648, + "step": 930 + }, + { + "epoch": 0.43, + "learning_rate": 2.8564180618975136e-07, + "logits/chosen": -0.5742901563644409, + "logits/rejected": -0.6205036640167236, + "logps/chosen": -87.34207916259766, + "logps/rejected": -104.62998962402344, + "loss": 0.0075, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.230440616607666, + "rewards/margins": 18.693153381347656, + "rewards/rejected": -17.462711334228516, + "step": 940 + }, + { + "epoch": 0.43, + "learning_rate": 2.8513444951801116e-07, + "logits/chosen": -0.5989577770233154, + "logits/rejected": -0.5516340732574463, + "logps/chosen": -85.1662826538086, + "logps/rejected": -105.4076156616211, + "loss": 0.009, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.42705821990966797, + "rewards/margins": 18.612205505371094, + "rewards/rejected": -18.185152053833008, + "step": 950 + }, + { + "epoch": 0.44, + "learning_rate": 2.846270928462709e-07, + "logits/chosen": -0.5289547443389893, + "logits/rejected": -0.5138489007949829, + "logps/chosen": -86.29643249511719, + "logps/rejected": -101.24544525146484, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5123685598373413, + "rewards/margins": 17.13689613342285, + "rewards/rejected": -16.62452507019043, + "step": 960 + }, + { + "epoch": 0.44, + "learning_rate": 2.841197361745307e-07, + "logits/chosen": -0.581844687461853, + "logits/rejected": -0.5707448720932007, + "logps/chosen": -85.06533813476562, + "logps/rejected": -99.39533996582031, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0915330648422241, + "rewards/margins": 17.751794815063477, + "rewards/rejected": -16.660261154174805, + "step": 970 + }, + { + "epoch": 0.45, + "learning_rate": 2.8361237950279043e-07, + "logits/chosen": -0.5148975253105164, + "logits/rejected": -0.5400117635726929, + "logps/chosen": -84.89034271240234, + "logps/rejected": -101.74394226074219, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6962271332740784, + "rewards/margins": 17.362585067749023, + "rewards/rejected": -16.666357040405273, + "step": 980 + }, + { + "epoch": 0.45, + "learning_rate": 2.831050228310502e-07, + "logits/chosen": -0.5739647150039673, + "logits/rejected": -0.589260995388031, + "logps/chosen": -85.27232360839844, + "logps/rejected": -99.5882339477539, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.102745771408081, + "rewards/margins": 16.655506134033203, + "rewards/rejected": -15.552759170532227, + "step": 990 + }, + { + "epoch": 0.46, + "learning_rate": 2.8259766615930996e-07, + "logits/chosen": -0.6199604272842407, + "logits/rejected": -0.6186779737472534, + "logps/chosen": -86.85110473632812, + "logps/rejected": -94.86585998535156, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.344161868095398, + "rewards/margins": 17.192333221435547, + "rewards/rejected": -15.848173141479492, + "step": 1000 + }, + { + "epoch": 0.46, + "eval_logits/chosen": -0.6026656627655029, + "eval_logits/rejected": -0.5541569590568542, + "eval_logps/chosen": -83.98019409179688, + "eval_logps/rejected": -97.6655044555664, + "eval_loss": 0.008850914426147938, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.7788311243057251, + "eval_rewards/margins": 17.180883407592773, + "eval_rewards/rejected": -16.402050018310547, + "eval_runtime": 66.2953, + "eval_samples_per_second": 43.171, + "eval_steps_per_second": 2.7, + "step": 1000 + }, + { + "epoch": 0.46, + "learning_rate": 2.8209030948756976e-07, + "logits/chosen": -0.6393527984619141, + "logits/rejected": -0.6539614796638489, + "logps/chosen": -83.76344299316406, + "logps/rejected": -102.68571472167969, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3425472974777222, + "rewards/margins": 17.38565444946289, + "rewards/rejected": -16.043106079101562, + "step": 1010 + }, + { + "epoch": 0.47, + "learning_rate": 2.815829528158295e-07, + "logits/chosen": -0.5339282155036926, + "logits/rejected": -0.6024419665336609, + "logps/chosen": -83.54170989990234, + "logps/rejected": -99.44310760498047, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9438585042953491, + "rewards/margins": 17.18117904663086, + "rewards/rejected": -16.237319946289062, + "step": 1020 + }, + { + "epoch": 0.47, + "learning_rate": 2.810755961440893e-07, + "logits/chosen": -0.6185084581375122, + "logits/rejected": -0.6042757034301758, + "logps/chosen": -85.75657653808594, + "logps/rejected": -104.95362854003906, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2947128415107727, + "rewards/margins": 18.044797897338867, + "rewards/rejected": -17.750083923339844, + "step": 1030 + }, + { + "epoch": 0.47, + "learning_rate": 2.8056823947234903e-07, + "logits/chosen": -0.5133255124092102, + "logits/rejected": -0.5455678701400757, + "logps/chosen": -88.9016342163086, + "logps/rejected": -97.69975280761719, + "loss": 0.0069, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.311894178390503, + "rewards/margins": 17.26865005493164, + "rewards/rejected": -15.956756591796875, + "step": 1040 + }, + { + "epoch": 0.48, + "learning_rate": 2.800608828006088e-07, + "logits/chosen": -0.617928683757782, + "logits/rejected": -0.5641010403633118, + "logps/chosen": -88.8299560546875, + "logps/rejected": -103.35233306884766, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06934154033660889, + "rewards/margins": 17.858707427978516, + "rewards/rejected": -17.928049087524414, + "step": 1050 + }, + { + "epoch": 0.48, + "learning_rate": 2.7955352612886856e-07, + "logits/chosen": -0.5771080255508423, + "logits/rejected": -0.5437840223312378, + "logps/chosen": -77.94991302490234, + "logps/rejected": -101.56156158447266, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2994147837162018, + "rewards/margins": 18.4749813079834, + "rewards/rejected": -18.774398803710938, + "step": 1060 + }, + { + "epoch": 0.49, + "learning_rate": 2.7904616945712836e-07, + "logits/chosen": -0.5053738355636597, + "logits/rejected": -0.5292837619781494, + "logps/chosen": -82.59956359863281, + "logps/rejected": -103.01961517333984, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9886738657951355, + "rewards/margins": 18.72883415222168, + "rewards/rejected": -17.740161895751953, + "step": 1070 + }, + { + "epoch": 0.49, + "learning_rate": 2.785388127853881e-07, + "logits/chosen": -0.539372444152832, + "logits/rejected": -0.5728116631507874, + "logps/chosen": -85.69373321533203, + "logps/rejected": -101.7072525024414, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8183579444885254, + "rewards/margins": 18.746421813964844, + "rewards/rejected": -17.928064346313477, + "step": 1080 + }, + { + "epoch": 0.5, + "learning_rate": 2.780314561136479e-07, + "logits/chosen": -0.5220463871955872, + "logits/rejected": -0.481070339679718, + "logps/chosen": -87.22162628173828, + "logps/rejected": -101.13359069824219, + "loss": 0.0049, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.474172443151474, + "rewards/margins": 17.313884735107422, + "rewards/rejected": -16.839710235595703, + "step": 1090 + }, + { + "epoch": 0.5, + "learning_rate": 2.7752409944190763e-07, + "logits/chosen": -0.5939083099365234, + "logits/rejected": -0.5974006652832031, + "logps/chosen": -85.4839096069336, + "logps/rejected": -100.13436889648438, + "loss": 0.0127, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7489646673202515, + "rewards/margins": 18.251670837402344, + "rewards/rejected": -17.502704620361328, + "step": 1100 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -0.5881746411323547, + "eval_logits/rejected": -0.5407174825668335, + "eval_logps/chosen": -83.98693084716797, + "eval_logps/rejected": -98.03213500976562, + "eval_loss": 0.008042249828577042, + "eval_rewards/accuracies": 0.9972066879272461, + "eval_rewards/chosen": 0.7754639983177185, + "eval_rewards/margins": 17.360830307006836, + "eval_rewards/rejected": -16.585365295410156, + "eval_runtime": 74.5333, + "eval_samples_per_second": 38.399, + "eval_steps_per_second": 2.402, + "step": 1100 + }, + { + "epoch": 0.51, + "learning_rate": 2.770167427701674e-07, + "logits/chosen": -0.5169168710708618, + "logits/rejected": -0.5307328104972839, + "logps/chosen": -89.0078353881836, + "logps/rejected": -102.8739242553711, + "loss": 0.004, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0199966430664062, + "rewards/margins": 17.293201446533203, + "rewards/rejected": -16.273204803466797, + "step": 1110 + }, + { + "epoch": 0.51, + "learning_rate": 2.7650938609842716e-07, + "logits/chosen": -0.5890191793441772, + "logits/rejected": -0.6054331660270691, + "logps/chosen": -86.77067565917969, + "logps/rejected": -93.67768859863281, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5434656143188477, + "rewards/margins": 16.876192092895508, + "rewards/rejected": -15.332727432250977, + "step": 1120 + }, + { + "epoch": 0.52, + "learning_rate": 2.7600202942668696e-07, + "logits/chosen": -0.6245452165603638, + "logits/rejected": -0.5807594656944275, + "logps/chosen": -90.57156372070312, + "logps/rejected": -96.71611022949219, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8988291025161743, + "rewards/margins": 17.20783233642578, + "rewards/rejected": -15.309002876281738, + "step": 1130 + }, + { + "epoch": 0.52, + "learning_rate": 2.754946727549467e-07, + "logits/chosen": -0.6571340560913086, + "logits/rejected": -0.598118782043457, + "logps/chosen": -88.4867172241211, + "logps/rejected": -102.10794830322266, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4076375961303711, + "rewards/margins": 16.26376724243164, + "rewards/rejected": -15.856130599975586, + "step": 1140 + }, + { + "epoch": 0.52, + "learning_rate": 2.749873160832065e-07, + "logits/chosen": -0.5951014161109924, + "logits/rejected": -0.6142104864120483, + "logps/chosen": -81.71449279785156, + "logps/rejected": -98.87643432617188, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3448036909103394, + "rewards/margins": 17.912372589111328, + "rewards/rejected": -16.567569732666016, + "step": 1150 + }, + { + "epoch": 0.53, + "learning_rate": 2.7447995941146623e-07, + "logits/chosen": -0.6196326613426208, + "logits/rejected": -0.6616156697273254, + "logps/chosen": -86.88075256347656, + "logps/rejected": -103.3810043334961, + "loss": 0.0113, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1018636226654053, + "rewards/margins": 17.62788963317871, + "rewards/rejected": -16.526025772094727, + "step": 1160 + }, + { + "epoch": 0.53, + "learning_rate": 2.73972602739726e-07, + "logits/chosen": -0.5963867902755737, + "logits/rejected": -0.6104931235313416, + "logps/chosen": -86.03263854980469, + "logps/rejected": -103.81394958496094, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3870103359222412, + "rewards/margins": 18.407033920288086, + "rewards/rejected": -17.020023345947266, + "step": 1170 + }, + { + "epoch": 0.54, + "learning_rate": 2.7346524606798576e-07, + "logits/chosen": -0.6480125784873962, + "logits/rejected": -0.5914435386657715, + "logps/chosen": -87.74053955078125, + "logps/rejected": -103.57038879394531, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2278248071670532, + "rewards/margins": 18.94321060180664, + "rewards/rejected": -17.71538543701172, + "step": 1180 + }, + { + "epoch": 0.54, + "learning_rate": 2.7295788939624556e-07, + "logits/chosen": -0.6207834482192993, + "logits/rejected": -0.6019139289855957, + "logps/chosen": -84.23980712890625, + "logps/rejected": -98.18376159667969, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0491936206817627, + "rewards/margins": 18.798131942749023, + "rewards/rejected": -16.748937606811523, + "step": 1190 + }, + { + "epoch": 0.55, + "learning_rate": 2.724505327245053e-07, + "logits/chosen": -0.6990079879760742, + "logits/rejected": -0.623802661895752, + "logps/chosen": -87.16651916503906, + "logps/rejected": -104.5052719116211, + "loss": 0.0192, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.10929219424724579, + "rewards/margins": 17.677276611328125, + "rewards/rejected": -17.567981719970703, + "step": 1200 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -0.6292704343795776, + "eval_logits/rejected": -0.575247585773468, + "eval_logps/chosen": -83.80381774902344, + "eval_logps/rejected": -98.02921295166016, + "eval_loss": 0.008193709887564182, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.8670160174369812, + "eval_rewards/margins": 17.450916290283203, + "eval_rewards/rejected": -16.583900451660156, + "eval_runtime": 67.8403, + "eval_samples_per_second": 42.187, + "eval_steps_per_second": 2.639, + "step": 1200 + }, + { + "epoch": 0.55, + "learning_rate": 2.719431760527651e-07, + "logits/chosen": -0.6629089117050171, + "logits/rejected": -0.620673656463623, + "logps/chosen": -83.75190734863281, + "logps/rejected": -97.05430603027344, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7773175835609436, + "rewards/margins": 15.876919746398926, + "rewards/rejected": -15.099603652954102, + "step": 1210 + }, + { + "epoch": 0.56, + "learning_rate": 2.7143581938102483e-07, + "logits/chosen": -0.6742457151412964, + "logits/rejected": -0.6161606311798096, + "logps/chosen": -82.72785949707031, + "logps/rejected": -97.83805084228516, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0242589712142944, + "rewards/margins": 17.519359588623047, + "rewards/rejected": -16.495100021362305, + "step": 1220 + }, + { + "epoch": 0.56, + "learning_rate": 2.709284627092846e-07, + "logits/chosen": -0.602118968963623, + "logits/rejected": -0.6233198046684265, + "logps/chosen": -81.18174743652344, + "logps/rejected": -100.21574401855469, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5538928508758545, + "rewards/margins": 17.575532913208008, + "rewards/rejected": -17.02164077758789, + "step": 1230 + }, + { + "epoch": 0.57, + "learning_rate": 2.7042110603754436e-07, + "logits/chosen": -0.650370180606842, + "logits/rejected": -0.5805081129074097, + "logps/chosen": -86.96976470947266, + "logps/rejected": -99.80048370361328, + "loss": 0.0062, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1351078748703003, + "rewards/margins": 17.74318504333496, + "rewards/rejected": -16.60807991027832, + "step": 1240 + }, + { + "epoch": 0.57, + "learning_rate": 2.6991374936580416e-07, + "logits/chosen": -0.5949207544326782, + "logits/rejected": -0.5505542755126953, + "logps/chosen": -86.36726379394531, + "logps/rejected": -104.3932113647461, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.521218478679657, + "rewards/margins": 18.444801330566406, + "rewards/rejected": -17.923583984375, + "step": 1250 + }, + { + "epoch": 0.58, + "learning_rate": 2.694063926940639e-07, + "logits/chosen": -0.5906392335891724, + "logits/rejected": -0.6108217239379883, + "logps/chosen": -86.92762756347656, + "logps/rejected": -99.90259552001953, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7010552287101746, + "rewards/margins": 17.54641342163086, + "rewards/rejected": -16.845355987548828, + "step": 1260 + }, + { + "epoch": 0.58, + "learning_rate": 2.688990360223237e-07, + "logits/chosen": -0.6362488865852356, + "logits/rejected": -0.5491201877593994, + "logps/chosen": -85.17521667480469, + "logps/rejected": -108.06062316894531, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39043551683425903, + "rewards/margins": 18.07403564453125, + "rewards/rejected": -18.464473724365234, + "step": 1270 + }, + { + "epoch": 0.58, + "learning_rate": 2.6839167935058343e-07, + "logits/chosen": -0.7829685211181641, + "logits/rejected": -0.7587999105453491, + "logps/chosen": -83.90362548828125, + "logps/rejected": -101.64833068847656, + "loss": 0.0111, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.357697069644928, + "rewards/margins": 18.58173942565918, + "rewards/rejected": -18.224040985107422, + "step": 1280 + }, + { + "epoch": 0.59, + "learning_rate": 2.678843226788432e-07, + "logits/chosen": -0.6578508615493774, + "logits/rejected": -0.5751517415046692, + "logps/chosen": -88.73038482666016, + "logps/rejected": -105.878173828125, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28621432185173035, + "rewards/margins": 19.108448028564453, + "rewards/rejected": -18.82223129272461, + "step": 1290 + }, + { + "epoch": 0.59, + "learning_rate": 2.6737696600710296e-07, + "logits/chosen": -0.7079693078994751, + "logits/rejected": -0.7003791332244873, + "logps/chosen": -79.51958465576172, + "logps/rejected": -102.92280578613281, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1290922164916992, + "rewards/margins": 19.95863914489746, + "rewards/rejected": -18.829547882080078, + "step": 1300 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -0.6428735852241516, + "eval_logits/rejected": -0.5829644203186035, + "eval_logps/chosen": -84.09004974365234, + "eval_logps/rejected": -102.14546966552734, + "eval_loss": 0.008307097479701042, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.7239024639129639, + "eval_rewards/margins": 19.365936279296875, + "eval_rewards/rejected": -18.642032623291016, + "eval_runtime": 62.3684, + "eval_samples_per_second": 45.889, + "eval_steps_per_second": 2.87, + "step": 1300 + }, + { + "epoch": 0.6, + "learning_rate": 2.6686960933536276e-07, + "logits/chosen": -0.7085214853286743, + "logits/rejected": -0.6773207783699036, + "logps/chosen": -83.67912292480469, + "logps/rejected": -105.48329162597656, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8614299893379211, + "rewards/margins": 19.49595832824707, + "rewards/rejected": -18.634525299072266, + "step": 1310 + }, + { + "epoch": 0.6, + "learning_rate": 2.663622526636225e-07, + "logits/chosen": -0.5678974390029907, + "logits/rejected": -0.559326171875, + "logps/chosen": -83.47301483154297, + "logps/rejected": -107.2491226196289, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6393694281578064, + "rewards/margins": 19.674095153808594, + "rewards/rejected": -19.034725189208984, + "step": 1320 + }, + { + "epoch": 0.61, + "learning_rate": 2.658548959918823e-07, + "logits/chosen": -0.6403359174728394, + "logits/rejected": -0.6420946717262268, + "logps/chosen": -87.22882843017578, + "logps/rejected": -104.50102233886719, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6024986505508423, + "rewards/margins": 20.13058090209961, + "rewards/rejected": -18.528079986572266, + "step": 1330 + }, + { + "epoch": 0.61, + "learning_rate": 2.6534753932014203e-07, + "logits/chosen": -0.624729335308075, + "logits/rejected": -0.5556301474571228, + "logps/chosen": -84.000244140625, + "logps/rejected": -100.9580078125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8587640523910522, + "rewards/margins": 19.506357192993164, + "rewards/rejected": -18.647592544555664, + "step": 1340 + }, + { + "epoch": 0.62, + "learning_rate": 2.648401826484018e-07, + "logits/chosen": -0.7058721780776978, + "logits/rejected": -0.6835563778877258, + "logps/chosen": -81.8812484741211, + "logps/rejected": -110.7301025390625, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5004382133483887, + "rewards/margins": 20.48194122314453, + "rewards/rejected": -19.981502532958984, + "step": 1350 + }, + { + "epoch": 0.62, + "learning_rate": 2.6433282597666156e-07, + "logits/chosen": -0.5713628530502319, + "logits/rejected": -0.5688737630844116, + "logps/chosen": -88.24539184570312, + "logps/rejected": -110.30496978759766, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33614498376846313, + "rewards/margins": 19.93739128112793, + "rewards/rejected": -19.601245880126953, + "step": 1360 + }, + { + "epoch": 0.63, + "learning_rate": 2.6382546930492135e-07, + "logits/chosen": -0.6647303104400635, + "logits/rejected": -0.6416600942611694, + "logps/chosen": -84.4307632446289, + "logps/rejected": -104.19242095947266, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5064027905464172, + "rewards/margins": 19.400102615356445, + "rewards/rejected": -18.893699645996094, + "step": 1370 + }, + { + "epoch": 0.63, + "learning_rate": 2.633181126331811e-07, + "logits/chosen": -0.6855611801147461, + "logits/rejected": -0.665108323097229, + "logps/chosen": -82.07523345947266, + "logps/rejected": -102.21691131591797, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5878819227218628, + "rewards/margins": 18.011062622070312, + "rewards/rejected": -17.42317771911621, + "step": 1380 + }, + { + "epoch": 0.63, + "learning_rate": 2.628107559614409e-07, + "logits/chosen": -0.7162337303161621, + "logits/rejected": -0.6620529294013977, + "logps/chosen": -84.50788879394531, + "logps/rejected": -106.1350326538086, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0131562948226929, + "rewards/margins": 19.31102180480957, + "rewards/rejected": -18.297863006591797, + "step": 1390 + }, + { + "epoch": 0.64, + "learning_rate": 2.6230339928970063e-07, + "logits/chosen": -0.7488337159156799, + "logits/rejected": -0.6330257654190063, + "logps/chosen": -86.39143371582031, + "logps/rejected": -103.77738189697266, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3392806351184845, + "rewards/margins": 18.962505340576172, + "rewards/rejected": -18.62322425842285, + "step": 1400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -0.6586980819702148, + "eval_logits/rejected": -0.6006718873977661, + "eval_logps/chosen": -84.00611877441406, + "eval_logps/rejected": -102.19830322265625, + "eval_loss": 0.007832423783838749, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.7658681273460388, + "eval_rewards/margins": 19.43431854248047, + "eval_rewards/rejected": -18.6684513092041, + "eval_runtime": 72.5157, + "eval_samples_per_second": 39.467, + "eval_steps_per_second": 2.468, + "step": 1400 + }, + { + "epoch": 0.64, + "learning_rate": 2.617960426179604e-07, + "logits/chosen": -0.6780227422714233, + "logits/rejected": -0.6405975222587585, + "logps/chosen": -88.10275268554688, + "logps/rejected": -106.83785247802734, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2879269123077393, + "rewards/margins": 21.27725601196289, + "rewards/rejected": -19.989328384399414, + "step": 1410 + }, + { + "epoch": 0.65, + "learning_rate": 2.6128868594622016e-07, + "logits/chosen": -0.6182512044906616, + "logits/rejected": -0.6471190452575684, + "logps/chosen": -84.40079498291016, + "logps/rejected": -100.61917877197266, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8890922665596008, + "rewards/margins": 18.377668380737305, + "rewards/rejected": -17.488576889038086, + "step": 1420 + }, + { + "epoch": 0.65, + "learning_rate": 2.6078132927447995e-07, + "logits/chosen": -0.6950105428695679, + "logits/rejected": -0.6299742460250854, + "logps/chosen": -84.123046875, + "logps/rejected": -106.0107421875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2167333364486694, + "rewards/margins": 20.657690048217773, + "rewards/rejected": -19.44095802307129, + "step": 1430 + }, + { + "epoch": 0.66, + "learning_rate": 2.602739726027397e-07, + "logits/chosen": -0.6647752523422241, + "logits/rejected": -0.6143220663070679, + "logps/chosen": -82.63214874267578, + "logps/rejected": -105.60970306396484, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9395632743835449, + "rewards/margins": 20.366649627685547, + "rewards/rejected": -19.427085876464844, + "step": 1440 + }, + { + "epoch": 0.66, + "learning_rate": 2.597666159309995e-07, + "logits/chosen": -0.6042054295539856, + "logits/rejected": -0.5673761367797852, + "logps/chosen": -83.49287414550781, + "logps/rejected": -106.82151794433594, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18224623799324036, + "rewards/margins": 18.836721420288086, + "rewards/rejected": -18.654476165771484, + "step": 1450 + }, + { + "epoch": 0.67, + "learning_rate": 2.5925925925925923e-07, + "logits/chosen": -0.6551016569137573, + "logits/rejected": -0.6009622812271118, + "logps/chosen": -86.40557861328125, + "logps/rejected": -108.91459655761719, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11953596770763397, + "rewards/margins": 18.390857696533203, + "rewards/rejected": -18.510395050048828, + "step": 1460 + }, + { + "epoch": 0.67, + "learning_rate": 2.58751902587519e-07, + "logits/chosen": -0.5883369445800781, + "logits/rejected": -0.6131059527397156, + "logps/chosen": -91.0303955078125, + "logps/rejected": -105.77290344238281, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7381661534309387, + "rewards/margins": 19.129926681518555, + "rewards/rejected": -18.391761779785156, + "step": 1470 + }, + { + "epoch": 0.68, + "learning_rate": 2.5824454591577876e-07, + "logits/chosen": -0.6731956005096436, + "logits/rejected": -0.623029351234436, + "logps/chosen": -85.10891723632812, + "logps/rejected": -102.04473876953125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1607452630996704, + "rewards/margins": 20.150548934936523, + "rewards/rejected": -18.989805221557617, + "step": 1480 + }, + { + "epoch": 0.68, + "learning_rate": 2.5773718924403855e-07, + "logits/chosen": -0.592241644859314, + "logits/rejected": -0.5942645072937012, + "logps/chosen": -85.31133270263672, + "logps/rejected": -105.8062973022461, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.626050591468811, + "rewards/margins": 19.598217010498047, + "rewards/rejected": -18.972166061401367, + "step": 1490 + }, + { + "epoch": 0.68, + "learning_rate": 2.572298325722983e-07, + "logits/chosen": -0.6044291257858276, + "logits/rejected": -0.5564228892326355, + "logps/chosen": -85.6323471069336, + "logps/rejected": -105.26658630371094, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7599972486495972, + "rewards/margins": 20.052276611328125, + "rewards/rejected": -19.292278289794922, + "step": 1500 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -0.654085099697113, + "eval_logits/rejected": -0.5967754125595093, + "eval_logps/chosen": -84.53275299072266, + "eval_logps/rejected": -104.18524932861328, + "eval_loss": 0.007872804999351501, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.5025545954704285, + "eval_rewards/margins": 20.164472579956055, + "eval_rewards/rejected": -19.66192054748535, + "eval_runtime": 70.7679, + "eval_samples_per_second": 40.442, + "eval_steps_per_second": 2.529, + "step": 1500 + }, + { + "epoch": 0.69, + "learning_rate": 2.567224759005581e-07, + "logits/chosen": -0.640856146812439, + "logits/rejected": -0.601913571357727, + "logps/chosen": -80.25975799560547, + "logps/rejected": -98.95476531982422, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07107143104076385, + "rewards/margins": 18.289194107055664, + "rewards/rejected": -18.218120574951172, + "step": 1510 + }, + { + "epoch": 0.69, + "learning_rate": 2.5621511922881783e-07, + "logits/chosen": -0.6410808563232422, + "logits/rejected": -0.6079914569854736, + "logps/chosen": -86.399658203125, + "logps/rejected": -102.8433609008789, + "loss": 0.005, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9533981084823608, + "rewards/margins": 21.374330520629883, + "rewards/rejected": -19.42093276977539, + "step": 1520 + }, + { + "epoch": 0.7, + "learning_rate": 2.557077625570776e-07, + "logits/chosen": -0.6295315027236938, + "logits/rejected": -0.6216704249382019, + "logps/chosen": -84.7662353515625, + "logps/rejected": -106.0814437866211, + "loss": 0.0067, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7130889892578125, + "rewards/margins": 17.965023040771484, + "rewards/rejected": -17.251934051513672, + "step": 1530 + }, + { + "epoch": 0.7, + "learning_rate": 2.5520040588533736e-07, + "logits/chosen": -0.6840890049934387, + "logits/rejected": -0.6534979939460754, + "logps/chosen": -87.48678588867188, + "logps/rejected": -104.89375305175781, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9938493967056274, + "rewards/margins": 20.103492736816406, + "rewards/rejected": -19.109643936157227, + "step": 1540 + }, + { + "epoch": 0.71, + "learning_rate": 2.5469304921359715e-07, + "logits/chosen": -0.7252383828163147, + "logits/rejected": -0.6697880029678345, + "logps/chosen": -83.24116516113281, + "logps/rejected": -105.6595687866211, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4993698596954346, + "rewards/margins": 19.923023223876953, + "rewards/rejected": -18.42365074157715, + "step": 1550 + }, + { + "epoch": 0.71, + "learning_rate": 2.541856925418569e-07, + "logits/chosen": -0.6816428899765015, + "logits/rejected": -0.6332991123199463, + "logps/chosen": -83.70225524902344, + "logps/rejected": -103.3266830444336, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8504802584648132, + "rewards/margins": 19.315753936767578, + "rewards/rejected": -18.465274810791016, + "step": 1560 + }, + { + "epoch": 0.72, + "learning_rate": 2.536783358701167e-07, + "logits/chosen": -0.691390872001648, + "logits/rejected": -0.6756331324577332, + "logps/chosen": -82.35096740722656, + "logps/rejected": -102.11773681640625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2216899394989014, + "rewards/margins": 20.077028274536133, + "rewards/rejected": -18.85533905029297, + "step": 1570 + }, + { + "epoch": 0.72, + "learning_rate": 2.5317097919837643e-07, + "logits/chosen": -0.7393444180488586, + "logits/rejected": -0.6760915517807007, + "logps/chosen": -82.11813354492188, + "logps/rejected": -105.13177490234375, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5508477687835693, + "rewards/margins": 20.428369522094727, + "rewards/rejected": -18.87752342224121, + "step": 1580 + }, + { + "epoch": 0.73, + "learning_rate": 2.526636225266362e-07, + "logits/chosen": -0.6938213109970093, + "logits/rejected": -0.6899979710578918, + "logps/chosen": -85.61151885986328, + "logps/rejected": -108.23921203613281, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1316661834716797, + "rewards/margins": 20.614761352539062, + "rewards/rejected": -19.483095169067383, + "step": 1590 + }, + { + "epoch": 0.73, + "learning_rate": 2.5215626585489596e-07, + "logits/chosen": -0.686176598072052, + "logits/rejected": -0.6618992686271667, + "logps/chosen": -80.52513885498047, + "logps/rejected": -104.21512603759766, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4206916391849518, + "rewards/margins": 19.16167449951172, + "rewards/rejected": -18.740983963012695, + "step": 1600 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -0.7077122926712036, + "eval_logits/rejected": -0.6482263207435608, + "eval_logps/chosen": -84.09796142578125, + "eval_logps/rejected": -103.77226257324219, + "eval_loss": 0.006887392140924931, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.7199439406394958, + "eval_rewards/margins": 20.175371170043945, + "eval_rewards/rejected": -19.455429077148438, + "eval_runtime": 79.6738, + "eval_samples_per_second": 35.921, + "eval_steps_per_second": 2.247, + "step": 1600 + }, + { + "epoch": 0.73, + "learning_rate": 2.5164890918315575e-07, + "logits/chosen": -0.8257538080215454, + "logits/rejected": -0.7568483948707581, + "logps/chosen": -84.69251251220703, + "logps/rejected": -105.01495361328125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7993510961532593, + "rewards/margins": 20.178592681884766, + "rewards/rejected": -19.379241943359375, + "step": 1610 + }, + { + "epoch": 0.74, + "learning_rate": 2.511415525114155e-07, + "logits/chosen": -0.7131239771842957, + "logits/rejected": -0.6775761246681213, + "logps/chosen": -85.8831787109375, + "logps/rejected": -106.05537414550781, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2889799177646637, + "rewards/margins": 18.521915435791016, + "rewards/rejected": -18.810894012451172, + "step": 1620 + }, + { + "epoch": 0.74, + "learning_rate": 2.506341958396753e-07, + "logits/chosen": -0.6207653880119324, + "logits/rejected": -0.6060599088668823, + "logps/chosen": -85.2615966796875, + "logps/rejected": -105.60963439941406, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7492761015892029, + "rewards/margins": 21.529277801513672, + "rewards/rejected": -20.78000259399414, + "step": 1630 + }, + { + "epoch": 0.75, + "learning_rate": 2.5012683916793503e-07, + "logits/chosen": -0.621131420135498, + "logits/rejected": -0.6042443513870239, + "logps/chosen": -90.21701049804688, + "logps/rejected": -105.9647445678711, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9219605922698975, + "rewards/margins": 21.281051635742188, + "rewards/rejected": -19.359088897705078, + "step": 1640 + }, + { + "epoch": 0.75, + "learning_rate": 2.496194824961948e-07, + "logits/chosen": -0.713904619216919, + "logits/rejected": -0.6409584283828735, + "logps/chosen": -84.72190856933594, + "logps/rejected": -105.33198547363281, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.556586742401123, + "rewards/margins": 21.801387786865234, + "rewards/rejected": -20.244800567626953, + "step": 1650 + }, + { + "epoch": 0.76, + "learning_rate": 2.4911212582445456e-07, + "logits/chosen": -0.6484477519989014, + "logits/rejected": -0.6336522102355957, + "logps/chosen": -83.5719985961914, + "logps/rejected": -104.5160903930664, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5944575071334839, + "rewards/margins": 20.492382049560547, + "rewards/rejected": -18.897924423217773, + "step": 1660 + }, + { + "epoch": 0.76, + "learning_rate": 2.4860476915271435e-07, + "logits/chosen": -0.7745485901832581, + "logits/rejected": -0.7227998971939087, + "logps/chosen": -82.86568450927734, + "logps/rejected": -106.18217468261719, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.017204999923706, + "rewards/margins": 21.1662540435791, + "rewards/rejected": -20.1490478515625, + "step": 1670 + }, + { + "epoch": 0.77, + "learning_rate": 2.480974124809741e-07, + "logits/chosen": -0.7258914709091187, + "logits/rejected": -0.6682701110839844, + "logps/chosen": -90.38239288330078, + "logps/rejected": -111.46282958984375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.219024896621704, + "rewards/margins": 21.0469913482666, + "rewards/rejected": -19.82796859741211, + "step": 1680 + }, + { + "epoch": 0.77, + "learning_rate": 2.475900558092339e-07, + "logits/chosen": -0.7296860814094543, + "logits/rejected": -0.734585165977478, + "logps/chosen": -86.05227661132812, + "logps/rejected": -102.233154296875, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8629347085952759, + "rewards/margins": 19.730310440063477, + "rewards/rejected": -18.867374420166016, + "step": 1690 + }, + { + "epoch": 0.78, + "learning_rate": 2.4708269913749363e-07, + "logits/chosen": -0.7404943704605103, + "logits/rejected": -0.6497074961662292, + "logps/chosen": -82.794677734375, + "logps/rejected": -106.5738754272461, + "loss": 0.0038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8510491251945496, + "rewards/margins": 19.946802139282227, + "rewards/rejected": -19.095752716064453, + "step": 1700 + }, + { + "epoch": 0.78, + "eval_logits/chosen": -0.6825948357582092, + "eval_logits/rejected": -0.6247321367263794, + "eval_logps/chosen": -83.7663803100586, + "eval_logps/rejected": -104.18630981445312, + "eval_loss": 0.0068258135579526424, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.8857384324073792, + "eval_rewards/margins": 20.548185348510742, + "eval_rewards/rejected": -19.662446975708008, + "eval_runtime": 70.3883, + "eval_samples_per_second": 40.66, + "eval_steps_per_second": 2.543, + "step": 1700 + }, + { + "epoch": 0.78, + "learning_rate": 2.465753424657534e-07, + "logits/chosen": -0.6451160311698914, + "logits/rejected": -0.7110374569892883, + "logps/chosen": -81.88134765625, + "logps/rejected": -104.2155990600586, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6395124197006226, + "rewards/margins": 20.917659759521484, + "rewards/rejected": -19.278146743774414, + "step": 1710 + }, + { + "epoch": 0.79, + "learning_rate": 2.4606798579401316e-07, + "logits/chosen": -0.6008567810058594, + "logits/rejected": -0.5818469524383545, + "logps/chosen": -84.36399841308594, + "logps/rejected": -108.04344177246094, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9072425961494446, + "rewards/margins": 19.935123443603516, + "rewards/rejected": -19.027881622314453, + "step": 1720 + }, + { + "epoch": 0.79, + "learning_rate": 2.4556062912227295e-07, + "logits/chosen": -0.6780723333358765, + "logits/rejected": -0.667972981929779, + "logps/chosen": -86.9156265258789, + "logps/rejected": -105.91011810302734, + "loss": 0.005, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.872097373008728, + "rewards/margins": 20.565614700317383, + "rewards/rejected": -18.693517684936523, + "step": 1730 + }, + { + "epoch": 0.79, + "learning_rate": 2.450532724505327e-07, + "logits/chosen": -0.6924620866775513, + "logits/rejected": -0.6552094221115112, + "logps/chosen": -83.34413146972656, + "logps/rejected": -103.43766784667969, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4335517883300781, + "rewards/margins": 20.293771743774414, + "rewards/rejected": -18.860218048095703, + "step": 1740 + }, + { + "epoch": 0.8, + "learning_rate": 2.445459157787925e-07, + "logits/chosen": -0.7454475164413452, + "logits/rejected": -0.7243833541870117, + "logps/chosen": -88.2259292602539, + "logps/rejected": -105.4636459350586, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.127286672592163, + "rewards/margins": 18.987035751342773, + "rewards/rejected": -17.859750747680664, + "step": 1750 + }, + { + "epoch": 0.8, + "learning_rate": 2.4403855910705223e-07, + "logits/chosen": -0.7101667523384094, + "logits/rejected": -0.7183898091316223, + "logps/chosen": -81.78260803222656, + "logps/rejected": -104.47676086425781, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3130123019218445, + "rewards/margins": 18.950632095336914, + "rewards/rejected": -18.637617111206055, + "step": 1760 + }, + { + "epoch": 0.81, + "learning_rate": 2.43531202435312e-07, + "logits/chosen": -0.6741453409194946, + "logits/rejected": -0.6272802352905273, + "logps/chosen": -90.71461486816406, + "logps/rejected": -106.30870056152344, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5790307521820068, + "rewards/margins": 20.211219787597656, + "rewards/rejected": -18.632186889648438, + "step": 1770 + }, + { + "epoch": 0.81, + "learning_rate": 2.4302384576357176e-07, + "logits/chosen": -0.7256354093551636, + "logits/rejected": -0.6887451410293579, + "logps/chosen": -87.96391296386719, + "logps/rejected": -116.27462005615234, + "loss": 0.0014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.098422646522522, + "rewards/margins": 22.23649787902832, + "rewards/rejected": -21.138076782226562, + "step": 1780 + }, + { + "epoch": 0.82, + "learning_rate": 2.4251648909183155e-07, + "logits/chosen": -0.723190188407898, + "logits/rejected": -0.687700629234314, + "logps/chosen": -85.64554595947266, + "logps/rejected": -108.02901458740234, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.214348793029785, + "rewards/margins": 21.39786720275879, + "rewards/rejected": -19.183521270751953, + "step": 1790 + }, + { + "epoch": 0.82, + "learning_rate": 2.420091324200913e-07, + "logits/chosen": -0.7096911668777466, + "logits/rejected": -0.6674980521202087, + "logps/chosen": -84.20054626464844, + "logps/rejected": -107.4981460571289, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2375263273715973, + "rewards/margins": 19.647363662719727, + "rewards/rejected": -19.409835815429688, + "step": 1800 + }, + { + "epoch": 0.82, + "eval_logits/chosen": -0.6538522243499756, + "eval_logits/rejected": -0.5991846919059753, + "eval_logps/chosen": -83.35757446289062, + "eval_logps/rejected": -103.30670928955078, + "eval_loss": 0.006914378609508276, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 1.0901424884796143, + "eval_rewards/margins": 20.312789916992188, + "eval_rewards/rejected": -19.22264862060547, + "eval_runtime": 89.8281, + "eval_samples_per_second": 31.861, + "eval_steps_per_second": 1.993, + "step": 1800 + }, + { + "epoch": 0.83, + "learning_rate": 2.415017757483511e-07, + "logits/chosen": -0.6579132080078125, + "logits/rejected": -0.6017246246337891, + "logps/chosen": -91.52220916748047, + "logps/rejected": -106.00288391113281, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6669795513153076, + "rewards/margins": 20.964008331298828, + "rewards/rejected": -19.297027587890625, + "step": 1810 + }, + { + "epoch": 0.83, + "learning_rate": 2.409944190766108e-07, + "logits/chosen": -0.7107955813407898, + "logits/rejected": -0.6973943710327148, + "logps/chosen": -82.12738037109375, + "logps/rejected": -107.88604736328125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30672594904899597, + "rewards/margins": 20.473596572875977, + "rewards/rejected": -20.1668701171875, + "step": 1820 + }, + { + "epoch": 0.84, + "learning_rate": 2.404870624048706e-07, + "logits/chosen": -0.668049156665802, + "logits/rejected": -0.6822776198387146, + "logps/chosen": -80.28944396972656, + "logps/rejected": -101.49888610839844, + "loss": 0.002, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1876381635665894, + "rewards/margins": 20.42356300354004, + "rewards/rejected": -19.235923767089844, + "step": 1830 + }, + { + "epoch": 0.84, + "learning_rate": 2.3997970573313036e-07, + "logits/chosen": -0.6816455125808716, + "logits/rejected": -0.6640302538871765, + "logps/chosen": -82.68707275390625, + "logps/rejected": -102.19058990478516, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3355101346969604, + "rewards/margins": 21.309734344482422, + "rewards/rejected": -19.974224090576172, + "step": 1840 + }, + { + "epoch": 0.84, + "learning_rate": 2.3947234906139015e-07, + "logits/chosen": -0.7045928239822388, + "logits/rejected": -0.7241901159286499, + "logps/chosen": -85.01335144042969, + "logps/rejected": -108.6558609008789, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4674608707427979, + "rewards/margins": 22.616294860839844, + "rewards/rejected": -21.148834228515625, + "step": 1850 + }, + { + "epoch": 0.85, + "learning_rate": 2.389649923896499e-07, + "logits/chosen": -0.6199553608894348, + "logits/rejected": -0.6317640542984009, + "logps/chosen": -85.36334228515625, + "logps/rejected": -110.83219146728516, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9138463735580444, + "rewards/margins": 22.010051727294922, + "rewards/rejected": -20.096206665039062, + "step": 1860 + }, + { + "epoch": 0.85, + "learning_rate": 2.384576357179097e-07, + "logits/chosen": -0.6020737290382385, + "logits/rejected": -0.6157525777816772, + "logps/chosen": -89.1829605102539, + "logps/rejected": -111.08561706542969, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7350738048553467, + "rewards/margins": 22.97161102294922, + "rewards/rejected": -21.23653793334961, + "step": 1870 + }, + { + "epoch": 0.86, + "learning_rate": 2.3795027904616943e-07, + "logits/chosen": -0.7352172136306763, + "logits/rejected": -0.6482657194137573, + "logps/chosen": -89.54198455810547, + "logps/rejected": -111.14498138427734, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4496486783027649, + "rewards/margins": 19.777273178100586, + "rewards/rejected": -20.22692108154297, + "step": 1880 + }, + { + "epoch": 0.86, + "learning_rate": 2.374429223744292e-07, + "logits/chosen": -0.6808261275291443, + "logits/rejected": -0.6242018938064575, + "logps/chosen": -81.02848815917969, + "logps/rejected": -103.51603698730469, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4517748355865479, + "rewards/margins": 22.18962287902832, + "rewards/rejected": -20.73784637451172, + "step": 1890 + }, + { + "epoch": 0.87, + "learning_rate": 2.3693556570268896e-07, + "logits/chosen": -0.7782861590385437, + "logits/rejected": -0.7513757944107056, + "logps/chosen": -83.20822143554688, + "logps/rejected": -109.80049133300781, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.010401725769043, + "rewards/margins": 20.9800968170166, + "rewards/rejected": -19.969696044921875, + "step": 1900 + }, + { + "epoch": 0.87, + "eval_logits/chosen": -0.680737316608429, + "eval_logits/rejected": -0.6232161521911621, + "eval_logps/chosen": -83.85591888427734, + "eval_logps/rejected": -106.48957824707031, + "eval_loss": 0.006986773107200861, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.8409678936004639, + "eval_rewards/margins": 21.655057907104492, + "eval_rewards/rejected": -20.8140926361084, + "eval_runtime": 76.156, + "eval_samples_per_second": 37.581, + "eval_steps_per_second": 2.35, + "step": 1900 + }, + { + "epoch": 0.87, + "learning_rate": 2.3642820903094873e-07, + "logits/chosen": -0.7015836834907532, + "logits/rejected": -0.5991064310073853, + "logps/chosen": -81.31778717041016, + "logps/rejected": -109.6204833984375, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0844857692718506, + "rewards/margins": 22.631372451782227, + "rewards/rejected": -21.546886444091797, + "step": 1910 + }, + { + "epoch": 0.88, + "learning_rate": 2.359208523592085e-07, + "logits/chosen": -0.7016021609306335, + "logits/rejected": -0.6770969033241272, + "logps/chosen": -84.92622375488281, + "logps/rejected": -106.5382080078125, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.17899462580680847, + "rewards/margins": 20.394420623779297, + "rewards/rejected": -20.21542739868164, + "step": 1920 + }, + { + "epoch": 0.88, + "learning_rate": 2.3541349568746826e-07, + "logits/chosen": -0.7778208255767822, + "logits/rejected": -0.698055624961853, + "logps/chosen": -86.36054992675781, + "logps/rejected": -105.30062103271484, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3135640621185303, + "rewards/margins": 20.72547721862793, + "rewards/rejected": -19.411914825439453, + "step": 1930 + }, + { + "epoch": 0.89, + "learning_rate": 2.3490613901572803e-07, + "logits/chosen": -0.6952825784683228, + "logits/rejected": -0.656592071056366, + "logps/chosen": -81.763671875, + "logps/rejected": -106.61894226074219, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49342602491378784, + "rewards/margins": 20.16998863220215, + "rewards/rejected": -19.676563262939453, + "step": 1940 + }, + { + "epoch": 0.89, + "learning_rate": 2.343987823439878e-07, + "logits/chosen": -0.6903911828994751, + "logits/rejected": -0.6286421418190002, + "logps/chosen": -79.79450225830078, + "logps/rejected": -106.4630126953125, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1111063957214355, + "rewards/margins": 22.001352310180664, + "rewards/rejected": -19.890243530273438, + "step": 1950 + }, + { + "epoch": 0.89, + "learning_rate": 2.3389142567224756e-07, + "logits/chosen": -0.7639296650886536, + "logits/rejected": -0.7568944096565247, + "logps/chosen": -90.86905670166016, + "logps/rejected": -113.7149658203125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12427709251642227, + "rewards/margins": 20.833873748779297, + "rewards/rejected": -20.7095947265625, + "step": 1960 + }, + { + "epoch": 0.9, + "learning_rate": 2.3338406900050733e-07, + "logits/chosen": -0.7693318128585815, + "logits/rejected": -0.6867147088050842, + "logps/chosen": -84.40339660644531, + "logps/rejected": -104.76747131347656, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.991206169128418, + "rewards/margins": 22.942188262939453, + "rewards/rejected": -20.95098304748535, + "step": 1970 + }, + { + "epoch": 0.9, + "learning_rate": 2.328767123287671e-07, + "logits/chosen": -0.7538214325904846, + "logits/rejected": -0.7061902284622192, + "logps/chosen": -84.06021118164062, + "logps/rejected": -112.34172058105469, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09169814735651016, + "rewards/margins": 22.0313720703125, + "rewards/rejected": -22.123071670532227, + "step": 1980 + }, + { + "epoch": 0.91, + "learning_rate": 2.3236935565702686e-07, + "logits/chosen": -0.7687441110610962, + "logits/rejected": -0.6658346056938171, + "logps/chosen": -82.9585952758789, + "logps/rejected": -106.9176025390625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7672392725944519, + "rewards/margins": 20.316686630249023, + "rewards/rejected": -19.549448013305664, + "step": 1990 + }, + { + "epoch": 0.91, + "learning_rate": 2.3186199898528663e-07, + "logits/chosen": -0.7342469096183777, + "logits/rejected": -0.7138758897781372, + "logps/chosen": -82.50282287597656, + "logps/rejected": -102.0538101196289, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.334281325340271, + "rewards/margins": 19.689542770385742, + "rewards/rejected": -18.355262756347656, + "step": 2000 + }, + { + "epoch": 0.91, + "eval_logits/chosen": -0.7237182855606079, + "eval_logits/rejected": -0.6640624403953552, + "eval_logps/chosen": -83.15184020996094, + "eval_logps/rejected": -103.29666900634766, + "eval_loss": 0.005852441303431988, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.1930080652236938, + "eval_rewards/margins": 20.410642623901367, + "eval_rewards/rejected": -19.217632293701172, + "eval_runtime": 81.642, + "eval_samples_per_second": 35.055, + "eval_steps_per_second": 2.192, + "step": 2000 + }, + { + "epoch": 0.92, + "learning_rate": 2.313546423135464e-07, + "logits/chosen": -0.7072040438652039, + "logits/rejected": -0.6981512904167175, + "logps/chosen": -89.46893310546875, + "logps/rejected": -106.85356140136719, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0839965343475342, + "rewards/margins": 20.12752342224121, + "rewards/rejected": -19.043527603149414, + "step": 2010 + }, + { + "epoch": 0.92, + "learning_rate": 2.3084728564180616e-07, + "logits/chosen": -0.6174991726875305, + "logits/rejected": -0.5943428874015808, + "logps/chosen": -81.58921813964844, + "logps/rejected": -108.2657241821289, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.291887879371643, + "rewards/margins": 19.339893341064453, + "rewards/rejected": -18.04800796508789, + "step": 2020 + }, + { + "epoch": 0.93, + "learning_rate": 2.3033992897006593e-07, + "logits/chosen": -0.7860113978385925, + "logits/rejected": -0.7448928356170654, + "logps/chosen": -84.71741485595703, + "logps/rejected": -102.28780364990234, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6886265277862549, + "rewards/margins": 21.161090850830078, + "rewards/rejected": -19.472463607788086, + "step": 2030 + }, + { + "epoch": 0.93, + "learning_rate": 2.298325722983257e-07, + "logits/chosen": -0.6835452318191528, + "logits/rejected": -0.6044243574142456, + "logps/chosen": -86.92694091796875, + "logps/rejected": -108.12068176269531, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0500653982162476, + "rewards/margins": 20.692108154296875, + "rewards/rejected": -19.642040252685547, + "step": 2040 + }, + { + "epoch": 0.94, + "learning_rate": 2.2932521562658546e-07, + "logits/chosen": -0.682311475276947, + "logits/rejected": -0.6454225778579712, + "logps/chosen": -84.58203887939453, + "logps/rejected": -105.5318832397461, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.446061134338379, + "rewards/margins": 20.76397132873535, + "rewards/rejected": -19.317909240722656, + "step": 2050 + }, + { + "epoch": 0.94, + "learning_rate": 2.2881785895484523e-07, + "logits/chosen": -0.7142086625099182, + "logits/rejected": -0.7194957733154297, + "logps/chosen": -86.15142822265625, + "logps/rejected": -106.82261657714844, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6515350341796875, + "rewards/margins": 21.164995193481445, + "rewards/rejected": -19.513460159301758, + "step": 2060 + }, + { + "epoch": 0.94, + "learning_rate": 2.28310502283105e-07, + "logits/chosen": -0.6685755848884583, + "logits/rejected": -0.648827075958252, + "logps/chosen": -77.6914291381836, + "logps/rejected": -105.21824645996094, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0622940063476562, + "rewards/margins": 20.59681510925293, + "rewards/rejected": -18.534521102905273, + "step": 2070 + }, + { + "epoch": 0.95, + "learning_rate": 2.2780314561136476e-07, + "logits/chosen": -0.7563666105270386, + "logits/rejected": -0.655853271484375, + "logps/chosen": -86.95856475830078, + "logps/rejected": -102.80877685546875, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8138997554779053, + "rewards/margins": 20.457874298095703, + "rewards/rejected": -18.643972396850586, + "step": 2080 + }, + { + "epoch": 0.95, + "learning_rate": 2.2729578893962453e-07, + "logits/chosen": -0.7362481951713562, + "logits/rejected": -0.7294582724571228, + "logps/chosen": -84.70967102050781, + "logps/rejected": -106.26744079589844, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.107060670852661, + "rewards/margins": 21.18497657775879, + "rewards/rejected": -19.077917098999023, + "step": 2090 + }, + { + "epoch": 0.96, + "learning_rate": 2.267884322678843e-07, + "logits/chosen": -0.7865076065063477, + "logits/rejected": -0.6870723962783813, + "logps/chosen": -83.01496887207031, + "logps/rejected": -101.82795715332031, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8749852180480957, + "rewards/margins": 19.56221580505371, + "rewards/rejected": -18.687232971191406, + "step": 2100 + }, + { + "epoch": 0.96, + "eval_logits/chosen": -0.6922763586044312, + "eval_logits/rejected": -0.6358173489570618, + "eval_logps/chosen": -82.23265075683594, + "eval_logps/rejected": -101.94881439208984, + "eval_loss": 0.005928453989326954, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.6526042222976685, + "eval_rewards/margins": 20.196311950683594, + "eval_rewards/rejected": -18.5437068939209, + "eval_runtime": 73.4438, + "eval_samples_per_second": 38.969, + "eval_steps_per_second": 2.437, + "step": 2100 + }, + { + "epoch": 0.96, + "learning_rate": 2.2628107559614406e-07, + "logits/chosen": -0.7227433323860168, + "logits/rejected": -0.6691815257072449, + "logps/chosen": -80.89137268066406, + "logps/rejected": -108.2369613647461, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.246288537979126, + "rewards/margins": 21.08827781677246, + "rewards/rejected": -19.841989517211914, + "step": 2110 + }, + { + "epoch": 0.97, + "learning_rate": 2.2577371892440383e-07, + "logits/chosen": -0.6783403754234314, + "logits/rejected": -0.6632364988327026, + "logps/chosen": -86.68379211425781, + "logps/rejected": -105.72774505615234, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1012396812438965, + "rewards/margins": 21.37411117553711, + "rewards/rejected": -19.272871017456055, + "step": 2120 + }, + { + "epoch": 0.97, + "learning_rate": 2.252663622526636e-07, + "logits/chosen": -0.6918590664863586, + "logits/rejected": -0.7273339033126831, + "logps/chosen": -87.85774993896484, + "logps/rejected": -106.26072692871094, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5966049432754517, + "rewards/margins": 20.83112907409668, + "rewards/rejected": -19.23452377319336, + "step": 2130 + }, + { + "epoch": 0.98, + "learning_rate": 2.2475900558092336e-07, + "logits/chosen": -0.7496614456176758, + "logits/rejected": -0.7174783945083618, + "logps/chosen": -81.19893646240234, + "logps/rejected": -107.23223876953125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5595566034317017, + "rewards/margins": 21.59109115600586, + "rewards/rejected": -20.031536102294922, + "step": 2140 + }, + { + "epoch": 0.98, + "learning_rate": 2.2425164890918313e-07, + "logits/chosen": -0.695356011390686, + "logits/rejected": -0.6898726224899292, + "logps/chosen": -88.45246887207031, + "logps/rejected": -107.5653076171875, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.30954909324646, + "rewards/margins": 21.355260848999023, + "rewards/rejected": -19.045711517333984, + "step": 2150 + }, + { + "epoch": 0.99, + "learning_rate": 2.237442922374429e-07, + "logits/chosen": -0.6493713855743408, + "logits/rejected": -0.6603960394859314, + "logps/chosen": -83.4217300415039, + "logps/rejected": -100.755859375, + "loss": 0.0031, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.42486119270324707, + "rewards/margins": 18.281320571899414, + "rewards/rejected": -17.85645866394043, + "step": 2160 + }, + { + "epoch": 0.99, + "learning_rate": 2.2323693556570266e-07, + "logits/chosen": -0.7269707322120667, + "logits/rejected": -0.6796764135360718, + "logps/chosen": -83.48429870605469, + "logps/rejected": -108.75740814208984, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3360190391540527, + "rewards/margins": 21.05870819091797, + "rewards/rejected": -18.72269058227539, + "step": 2170 + }, + { + "epoch": 1.0, + "learning_rate": 2.2272957889396242e-07, + "logits/chosen": -0.6619648933410645, + "logits/rejected": -0.6784309148788452, + "logps/chosen": -84.5233383178711, + "logps/rejected": -104.6445541381836, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9546858072280884, + "rewards/margins": 19.96062660217285, + "rewards/rejected": -19.005939483642578, + "step": 2180 + }, + { + "epoch": 1.0, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -0.7001414895057678, + "logits/rejected": -0.700007438659668, + "logps/chosen": -88.71408081054688, + "logps/rejected": -104.5293960571289, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5879231691360474, + "rewards/margins": 20.470571517944336, + "rewards/rejected": -18.882648468017578, + "step": 2190 + }, + { + "epoch": 1.0, + "learning_rate": 2.2171486555048196e-07, + "logits/chosen": -0.751279354095459, + "logits/rejected": -0.7181065082550049, + "logps/chosen": -85.7071762084961, + "logps/rejected": -111.83939361572266, + "loss": 0.0024, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.351722240447998, + "rewards/margins": 22.148082733154297, + "rewards/rejected": -19.796356201171875, + "step": 2200 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -0.7133587598800659, + "eval_logits/rejected": -0.6583219170570374, + "eval_logps/chosen": -83.3022689819336, + "eval_logps/rejected": -102.72750854492188, + "eval_loss": 0.005814776755869389, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.1177955865859985, + "eval_rewards/margins": 20.05084991455078, + "eval_rewards/rejected": -18.933055877685547, + "eval_runtime": 73.4127, + "eval_samples_per_second": 38.985, + "eval_steps_per_second": 2.438, + "step": 2200 + }, + { + "epoch": 1.01, + "learning_rate": 2.2120750887874172e-07, + "logits/chosen": -0.7339301109313965, + "logits/rejected": -0.7166422605514526, + "logps/chosen": -86.40919494628906, + "logps/rejected": -111.6192855834961, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.697614073753357, + "rewards/margins": 21.687803268432617, + "rewards/rejected": -19.990190505981445, + "step": 2210 + }, + { + "epoch": 1.01, + "learning_rate": 2.207001522070015e-07, + "logits/chosen": -0.6801533699035645, + "logits/rejected": -0.6576797366142273, + "logps/chosen": -89.45710754394531, + "logps/rejected": -109.4631576538086, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.813921332359314, + "rewards/margins": 21.14259147644043, + "rewards/rejected": -19.32866859436035, + "step": 2220 + }, + { + "epoch": 1.02, + "learning_rate": 2.2019279553526126e-07, + "logits/chosen": -0.688944935798645, + "logits/rejected": -0.6999958753585815, + "logps/chosen": -93.91636657714844, + "logps/rejected": -105.1122817993164, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1669729948043823, + "rewards/margins": 19.678924560546875, + "rewards/rejected": -18.511951446533203, + "step": 2230 + }, + { + "epoch": 1.02, + "learning_rate": 2.1968543886352102e-07, + "logits/chosen": -0.7800687551498413, + "logits/rejected": -0.7249246835708618, + "logps/chosen": -81.42112731933594, + "logps/rejected": -106.3897476196289, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4921230673789978, + "rewards/margins": 19.78441619873047, + "rewards/rejected": -19.292295455932617, + "step": 2240 + }, + { + "epoch": 1.03, + "learning_rate": 2.191780821917808e-07, + "logits/chosen": -0.7485274076461792, + "logits/rejected": -0.7021461725234985, + "logps/chosen": -83.02845764160156, + "logps/rejected": -108.2826156616211, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2401946783065796, + "rewards/margins": 20.906063079833984, + "rewards/rejected": -19.66586685180664, + "step": 2250 + }, + { + "epoch": 1.03, + "learning_rate": 2.1867072552004056e-07, + "logits/chosen": -0.8463417887687683, + "logits/rejected": -0.8135510683059692, + "logps/chosen": -84.13587951660156, + "logps/rejected": -109.97297668457031, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8690627813339233, + "rewards/margins": 22.567180633544922, + "rewards/rejected": -20.6981201171875, + "step": 2260 + }, + { + "epoch": 1.04, + "learning_rate": 2.1816336884830032e-07, + "logits/chosen": -0.6881771087646484, + "logits/rejected": -0.7314122915267944, + "logps/chosen": -79.51889038085938, + "logps/rejected": -106.68363952636719, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6152458190917969, + "rewards/margins": 21.590240478515625, + "rewards/rejected": -19.97499656677246, + "step": 2270 + }, + { + "epoch": 1.04, + "learning_rate": 2.176560121765601e-07, + "logits/chosen": -0.8350343704223633, + "logits/rejected": -0.7969235181808472, + "logps/chosen": -84.02780151367188, + "logps/rejected": -108.79520416259766, + "loss": 0.0071, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4147300720214844, + "rewards/margins": 20.76464080810547, + "rewards/rejected": -19.349910736083984, + "step": 2280 + }, + { + "epoch": 1.05, + "learning_rate": 2.1714865550481986e-07, + "logits/chosen": -0.7332528829574585, + "logits/rejected": -0.7286131978034973, + "logps/chosen": -83.72573852539062, + "logps/rejected": -106.83250427246094, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2933398485183716, + "rewards/margins": 21.30584144592285, + "rewards/rejected": -20.012500762939453, + "step": 2290 + }, + { + "epoch": 1.05, + "learning_rate": 2.1664129883307962e-07, + "logits/chosen": -0.7267037630081177, + "logits/rejected": -0.704474925994873, + "logps/chosen": -83.55988311767578, + "logps/rejected": -106.81196594238281, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4146488308906555, + "rewards/margins": 20.529951095581055, + "rewards/rejected": -20.115304946899414, + "step": 2300 + }, + { + "epoch": 1.05, + "eval_logits/chosen": -0.746123731136322, + "eval_logits/rejected": -0.6872346997261047, + "eval_logps/chosen": -83.3327865600586, + "eval_logps/rejected": -103.95221710205078, + "eval_loss": 0.005804476328194141, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.1025363206863403, + "eval_rewards/margins": 20.6479434967041, + "eval_rewards/rejected": -19.54541015625, + "eval_runtime": 77.2413, + "eval_samples_per_second": 37.053, + "eval_steps_per_second": 2.317, + "step": 2300 + }, + { + "epoch": 1.05, + "learning_rate": 2.161339421613394e-07, + "logits/chosen": -0.7835151553153992, + "logits/rejected": -0.7966066598892212, + "logps/chosen": -82.83807373046875, + "logps/rejected": -111.46928405761719, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7062146067619324, + "rewards/margins": 20.392162322998047, + "rewards/rejected": -19.68594741821289, + "step": 2310 + }, + { + "epoch": 1.06, + "learning_rate": 2.1562658548959916e-07, + "logits/chosen": -0.8584432601928711, + "logits/rejected": -0.7813988924026489, + "logps/chosen": -86.52935791015625, + "logps/rejected": -106.8839340209961, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7992275357246399, + "rewards/margins": 22.510517120361328, + "rewards/rejected": -21.71129035949707, + "step": 2320 + }, + { + "epoch": 1.06, + "learning_rate": 2.1511922881785892e-07, + "logits/chosen": -0.6985601186752319, + "logits/rejected": -0.6839465498924255, + "logps/chosen": -85.86921691894531, + "logps/rejected": -106.46211242675781, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41637343168258667, + "rewards/margins": 19.63858985900879, + "rewards/rejected": -19.22221565246582, + "step": 2330 + }, + { + "epoch": 1.07, + "learning_rate": 2.146118721461187e-07, + "logits/chosen": -0.7708380818367004, + "logits/rejected": -0.7677600979804993, + "logps/chosen": -84.77110290527344, + "logps/rejected": -111.69419860839844, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7214634418487549, + "rewards/margins": 22.269315719604492, + "rewards/rejected": -20.5478515625, + "step": 2340 + }, + { + "epoch": 1.07, + "learning_rate": 2.1410451547437846e-07, + "logits/chosen": -0.8053115606307983, + "logits/rejected": -0.8239187002182007, + "logps/chosen": -85.80523681640625, + "logps/rejected": -108.161865234375, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.04756236076355, + "rewards/margins": 22.632070541381836, + "rewards/rejected": -20.58450698852539, + "step": 2350 + }, + { + "epoch": 1.08, + "learning_rate": 2.1359715880263822e-07, + "logits/chosen": -0.7752918004989624, + "logits/rejected": -0.7559579610824585, + "logps/chosen": -83.23637390136719, + "logps/rejected": -108.0810546875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.097753882408142, + "rewards/margins": 21.340835571289062, + "rewards/rejected": -20.24308204650879, + "step": 2360 + }, + { + "epoch": 1.08, + "learning_rate": 2.13089802130898e-07, + "logits/chosen": -0.755330502986908, + "logits/rejected": -0.6864339709281921, + "logps/chosen": -83.15589904785156, + "logps/rejected": -113.9013442993164, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9833482503890991, + "rewards/margins": 23.426158905029297, + "rewards/rejected": -22.44281005859375, + "step": 2370 + }, + { + "epoch": 1.09, + "learning_rate": 2.1258244545915776e-07, + "logits/chosen": -0.718736469745636, + "logits/rejected": -0.6083402037620544, + "logps/chosen": -82.97991943359375, + "logps/rejected": -103.54791259765625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28329333662986755, + "rewards/margins": 20.23536491394043, + "rewards/rejected": -19.952070236206055, + "step": 2380 + }, + { + "epoch": 1.09, + "learning_rate": 2.1207508878741752e-07, + "logits/chosen": -0.7845762968063354, + "logits/rejected": -0.7489625215530396, + "logps/chosen": -86.88125610351562, + "logps/rejected": -109.0585708618164, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3073398172855377, + "rewards/margins": 21.779104232788086, + "rewards/rejected": -21.471765518188477, + "step": 2390 + }, + { + "epoch": 1.1, + "learning_rate": 2.115677321156773e-07, + "logits/chosen": -0.7549251317977905, + "logits/rejected": -0.725717306137085, + "logps/chosen": -84.1806640625, + "logps/rejected": -109.55061340332031, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6283788681030273, + "rewards/margins": 21.849435806274414, + "rewards/rejected": -21.221057891845703, + "step": 2400 + }, + { + "epoch": 1.1, + "eval_logits/chosen": -0.7693464159965515, + "eval_logits/rejected": -0.7055094838142395, + "eval_logps/chosen": -83.79874420166016, + "eval_logps/rejected": -107.29126739501953, + "eval_loss": 0.006137872580438852, + "eval_rewards/accuracies": 0.9972066879272461, + "eval_rewards/chosen": 0.8695586919784546, + "eval_rewards/margins": 22.08449363708496, + "eval_rewards/rejected": -21.214935302734375, + "eval_runtime": 76.2893, + "eval_samples_per_second": 37.515, + "eval_steps_per_second": 2.346, + "step": 2400 + }, + { + "epoch": 1.1, + "learning_rate": 2.1106037544393706e-07, + "logits/chosen": -0.7792016267776489, + "logits/rejected": -0.7324903011322021, + "logps/chosen": -85.0523681640625, + "logps/rejected": -111.51036071777344, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.077381134033203, + "rewards/margins": 23.63772201538086, + "rewards/rejected": -21.56034278869629, + "step": 2410 + }, + { + "epoch": 1.1, + "learning_rate": 2.1055301877219682e-07, + "logits/chosen": -0.8506320714950562, + "logits/rejected": -0.7847840189933777, + "logps/chosen": -85.75637817382812, + "logps/rejected": -113.4081802368164, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6911964416503906, + "rewards/margins": 23.912567138671875, + "rewards/rejected": -22.221370697021484, + "step": 2420 + }, + { + "epoch": 1.11, + "learning_rate": 2.100456621004566e-07, + "logits/chosen": -0.736041247844696, + "logits/rejected": -0.6666306257247925, + "logps/chosen": -80.7561264038086, + "logps/rejected": -109.1169662475586, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6556488275527954, + "rewards/margins": 22.82249641418457, + "rewards/rejected": -21.166845321655273, + "step": 2430 + }, + { + "epoch": 1.11, + "learning_rate": 2.0953830542871636e-07, + "logits/chosen": -0.8349838256835938, + "logits/rejected": -0.7791039347648621, + "logps/chosen": -82.36167907714844, + "logps/rejected": -110.2313461303711, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5611153841018677, + "rewards/margins": 21.64533805847168, + "rewards/rejected": -21.084224700927734, + "step": 2440 + }, + { + "epoch": 1.12, + "learning_rate": 2.0903094875697612e-07, + "logits/chosen": -0.7713768482208252, + "logits/rejected": -0.7219734191894531, + "logps/chosen": -92.53242492675781, + "logps/rejected": -115.09944915771484, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6493558883666992, + "rewards/margins": 23.381576538085938, + "rewards/rejected": -22.732219696044922, + "step": 2450 + }, + { + "epoch": 1.12, + "learning_rate": 2.085235920852359e-07, + "logits/chosen": -0.8122873306274414, + "logits/rejected": -0.7389894127845764, + "logps/chosen": -81.98970031738281, + "logps/rejected": -114.59257507324219, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9474372863769531, + "rewards/margins": 22.69391632080078, + "rewards/rejected": -21.746479034423828, + "step": 2460 + }, + { + "epoch": 1.13, + "learning_rate": 2.0801623541349566e-07, + "logits/chosen": -0.8570648431777954, + "logits/rejected": -0.8347585797309875, + "logps/chosen": -78.87316131591797, + "logps/rejected": -104.390380859375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.652663230895996, + "rewards/margins": 22.120464324951172, + "rewards/rejected": -20.46780014038086, + "step": 2470 + }, + { + "epoch": 1.13, + "learning_rate": 2.0750887874175542e-07, + "logits/chosen": -0.7832027673721313, + "logits/rejected": -0.7585029602050781, + "logps/chosen": -84.72008514404297, + "logps/rejected": -117.10993957519531, + "loss": 0.0017, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6026067137718201, + "rewards/margins": 23.96122932434082, + "rewards/rejected": -23.35862159729004, + "step": 2480 + }, + { + "epoch": 1.14, + "learning_rate": 2.070015220700152e-07, + "logits/chosen": -0.7349112629890442, + "logits/rejected": -0.7553213834762573, + "logps/chosen": -86.74533081054688, + "logps/rejected": -115.26509094238281, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6511772274971008, + "rewards/margins": 23.22552490234375, + "rewards/rejected": -22.57434844970703, + "step": 2490 + }, + { + "epoch": 1.14, + "learning_rate": 2.0649416539827496e-07, + "logits/chosen": -0.7575095891952515, + "logits/rejected": -0.7819565534591675, + "logps/chosen": -84.1733169555664, + "logps/rejected": -108.44877624511719, + "loss": 0.0076, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.19732484221458435, + "rewards/margins": 20.539228439331055, + "rewards/rejected": -20.341901779174805, + "step": 2500 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -0.7653970718383789, + "eval_logits/rejected": -0.7059395909309387, + "eval_logps/chosen": -83.49778747558594, + "eval_logps/rejected": -105.02425384521484, + "eval_loss": 0.005491924937814474, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.020034670829773, + "eval_rewards/margins": 21.101457595825195, + "eval_rewards/rejected": -20.081422805786133, + "eval_runtime": 97.1385, + "eval_samples_per_second": 29.463, + "eval_steps_per_second": 1.843, + "step": 2500 + }, + { + "epoch": 1.15, + "learning_rate": 2.0598680872653472e-07, + "logits/chosen": -0.764125645160675, + "logits/rejected": -0.7234150171279907, + "logps/chosen": -88.8193588256836, + "logps/rejected": -110.6650390625, + "loss": 0.0038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7087750434875488, + "rewards/margins": 21.741344451904297, + "rewards/rejected": -21.032567977905273, + "step": 2510 + }, + { + "epoch": 1.15, + "learning_rate": 2.054794520547945e-07, + "logits/chosen": -0.6850422024726868, + "logits/rejected": -0.6487849950790405, + "logps/chosen": -82.67296600341797, + "logps/rejected": -106.21736145019531, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7742611169815063, + "rewards/margins": 22.46800422668457, + "rewards/rejected": -20.693742752075195, + "step": 2520 + }, + { + "epoch": 1.15, + "learning_rate": 2.0497209538305426e-07, + "logits/chosen": -0.8037935495376587, + "logits/rejected": -0.736015260219574, + "logps/chosen": -86.53871154785156, + "logps/rejected": -108.95832824707031, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6968075037002563, + "rewards/margins": 21.14533805847168, + "rewards/rejected": -20.448530197143555, + "step": 2530 + }, + { + "epoch": 1.16, + "learning_rate": 2.0446473871131402e-07, + "logits/chosen": -0.7924987077713013, + "logits/rejected": -0.7409130334854126, + "logps/chosen": -90.44608306884766, + "logps/rejected": -119.0773696899414, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6108601689338684, + "rewards/margins": 22.708263397216797, + "rewards/rejected": -22.09740447998047, + "step": 2540 + }, + { + "epoch": 1.16, + "learning_rate": 2.039573820395738e-07, + "logits/chosen": -0.7455700635910034, + "logits/rejected": -0.7140682935714722, + "logps/chosen": -78.17546081542969, + "logps/rejected": -109.40152740478516, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9739960432052612, + "rewards/margins": 22.592451095581055, + "rewards/rejected": -21.61845588684082, + "step": 2550 + }, + { + "epoch": 1.17, + "learning_rate": 2.0345002536783356e-07, + "logits/chosen": -0.7703838348388672, + "logits/rejected": -0.7294633388519287, + "logps/chosen": -81.02081298828125, + "logps/rejected": -107.09619140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9101536273956299, + "rewards/margins": 22.840641021728516, + "rewards/rejected": -20.930484771728516, + "step": 2560 + }, + { + "epoch": 1.17, + "learning_rate": 2.0294266869609332e-07, + "logits/chosen": -0.8617580533027649, + "logits/rejected": -0.8065959811210632, + "logps/chosen": -87.26881408691406, + "logps/rejected": -106.7549057006836, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3661178350448608, + "rewards/margins": 22.15833282470703, + "rewards/rejected": -20.79221534729004, + "step": 2570 + }, + { + "epoch": 1.18, + "learning_rate": 2.024353120243531e-07, + "logits/chosen": -0.7026188969612122, + "logits/rejected": -0.6509179472923279, + "logps/chosen": -77.97960662841797, + "logps/rejected": -104.57243347167969, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5850582122802734, + "rewards/margins": 20.327341079711914, + "rewards/rejected": -18.74228286743164, + "step": 2580 + }, + { + "epoch": 1.18, + "learning_rate": 2.0192795535261286e-07, + "logits/chosen": -0.7325498461723328, + "logits/rejected": -0.7174168825149536, + "logps/chosen": -82.26387023925781, + "logps/rejected": -109.1794204711914, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1741113662719727, + "rewards/margins": 22.20342445373535, + "rewards/rejected": -21.029314041137695, + "step": 2590 + }, + { + "epoch": 1.19, + "learning_rate": 2.0142059868087262e-07, + "logits/chosen": -0.7314542531967163, + "logits/rejected": -0.7128167152404785, + "logps/chosen": -86.39726257324219, + "logps/rejected": -107.54801177978516, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.596429467201233, + "rewards/margins": 22.180591583251953, + "rewards/rejected": -20.584165573120117, + "step": 2600 + }, + { + "epoch": 1.19, + "eval_logits/chosen": -0.7385216355323792, + "eval_logits/rejected": -0.680923342704773, + "eval_logps/chosen": -83.2193603515625, + "eval_logps/rejected": -106.13276672363281, + "eval_loss": 0.005839827004820108, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.159246802330017, + "eval_rewards/margins": 21.79492950439453, + "eval_rewards/rejected": -20.63568115234375, + "eval_runtime": 68.9725, + "eval_samples_per_second": 41.495, + "eval_steps_per_second": 2.595, + "step": 2600 + }, + { + "epoch": 1.19, + "learning_rate": 2.009132420091324e-07, + "logits/chosen": -0.6961277723312378, + "logits/rejected": -0.6608497500419617, + "logps/chosen": -80.58322143554688, + "logps/rejected": -107.66817474365234, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9859172701835632, + "rewards/margins": 22.4183349609375, + "rewards/rejected": -21.432416915893555, + "step": 2610 + }, + { + "epoch": 1.2, + "learning_rate": 2.0040588533739216e-07, + "logits/chosen": -0.7351371049880981, + "logits/rejected": -0.7426373958587646, + "logps/chosen": -86.33208465576172, + "logps/rejected": -109.19034576416016, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45506373047828674, + "rewards/margins": 21.83483123779297, + "rewards/rejected": -21.379764556884766, + "step": 2620 + }, + { + "epoch": 1.2, + "learning_rate": 1.9989852866565192e-07, + "logits/chosen": -0.6917943358421326, + "logits/rejected": -0.6587172150611877, + "logps/chosen": -83.50242614746094, + "logps/rejected": -111.31648254394531, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.758619487285614, + "rewards/margins": 21.25997543334961, + "rewards/rejected": -20.501354217529297, + "step": 2630 + }, + { + "epoch": 1.21, + "learning_rate": 1.993911719939117e-07, + "logits/chosen": -0.6337357759475708, + "logits/rejected": -0.6760739088058472, + "logps/chosen": -86.26484680175781, + "logps/rejected": -103.41255950927734, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7138054370880127, + "rewards/margins": 20.66269302368164, + "rewards/rejected": -18.948888778686523, + "step": 2640 + }, + { + "epoch": 1.21, + "learning_rate": 1.9888381532217146e-07, + "logits/chosen": -0.7049747705459595, + "logits/rejected": -0.6962708234786987, + "logps/chosen": -85.10768127441406, + "logps/rejected": -109.0610580444336, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5559046268463135, + "rewards/margins": 20.93661880493164, + "rewards/rejected": -20.38071632385254, + "step": 2650 + }, + { + "epoch": 1.21, + "learning_rate": 1.9837645865043122e-07, + "logits/chosen": -0.7089205980300903, + "logits/rejected": -0.7450774908065796, + "logps/chosen": -84.3514175415039, + "logps/rejected": -107.42315673828125, + "loss": 0.0026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2602952718734741, + "rewards/margins": 20.727048873901367, + "rewards/rejected": -19.466754913330078, + "step": 2660 + }, + { + "epoch": 1.22, + "learning_rate": 1.97869101978691e-07, + "logits/chosen": -0.6836045980453491, + "logits/rejected": -0.6517876386642456, + "logps/chosen": -84.9683837890625, + "logps/rejected": -107.4233169555664, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.006893515586853, + "rewards/margins": 20.917407989501953, + "rewards/rejected": -19.9105167388916, + "step": 2670 + }, + { + "epoch": 1.22, + "learning_rate": 1.9736174530695076e-07, + "logits/chosen": -0.7533547282218933, + "logits/rejected": -0.7005944848060608, + "logps/chosen": -85.52412414550781, + "logps/rejected": -108.1337890625, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1605770587921143, + "rewards/margins": 21.576522827148438, + "rewards/rejected": -20.41594886779785, + "step": 2680 + }, + { + "epoch": 1.23, + "learning_rate": 1.9685438863521052e-07, + "logits/chosen": -0.7099670767784119, + "logits/rejected": -0.7033403515815735, + "logps/chosen": -82.56312561035156, + "logps/rejected": -107.25730895996094, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1814966201782227, + "rewards/margins": 21.729625701904297, + "rewards/rejected": -19.54813003540039, + "step": 2690 + }, + { + "epoch": 1.23, + "learning_rate": 1.963470319634703e-07, + "logits/chosen": -0.8139055371284485, + "logits/rejected": -0.7495929598808289, + "logps/chosen": -82.12207794189453, + "logps/rejected": -106.3428955078125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3602625131607056, + "rewards/margins": 22.45661163330078, + "rewards/rejected": -21.09634780883789, + "step": 2700 + }, + { + "epoch": 1.23, + "eval_logits/chosen": -0.7374448180198669, + "eval_logits/rejected": -0.6823889017105103, + "eval_logps/chosen": -83.8608627319336, + "eval_logps/rejected": -105.3611068725586, + "eval_loss": 0.00569253321737051, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.838493824005127, + "eval_rewards/margins": 21.088348388671875, + "eval_rewards/rejected": -20.249855041503906, + "eval_runtime": 83.4775, + "eval_samples_per_second": 34.285, + "eval_steps_per_second": 2.144, + "step": 2700 + }, + { + "epoch": 1.24, + "learning_rate": 1.9583967529173006e-07, + "logits/chosen": -0.7564720511436462, + "logits/rejected": -0.6812716126441956, + "logps/chosen": -87.40708923339844, + "logps/rejected": -108.55073547363281, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8356910943984985, + "rewards/margins": 21.843751907348633, + "rewards/rejected": -21.008060455322266, + "step": 2710 + }, + { + "epoch": 1.24, + "learning_rate": 1.9533231861998982e-07, + "logits/chosen": -0.8517030477523804, + "logits/rejected": -0.8198519945144653, + "logps/chosen": -86.82032775878906, + "logps/rejected": -115.4446792602539, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7068185806274414, + "rewards/margins": 21.509122848510742, + "rewards/rejected": -20.80230140686035, + "step": 2720 + }, + { + "epoch": 1.25, + "learning_rate": 1.948249619482496e-07, + "logits/chosen": -0.6536229848861694, + "logits/rejected": -0.5762327909469604, + "logps/chosen": -86.3205795288086, + "logps/rejected": -107.751953125, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5272104144096375, + "rewards/margins": 21.346994400024414, + "rewards/rejected": -20.819782257080078, + "step": 2730 + }, + { + "epoch": 1.25, + "learning_rate": 1.9431760527650936e-07, + "logits/chosen": -0.8203104734420776, + "logits/rejected": -0.7407748103141785, + "logps/chosen": -84.98686981201172, + "logps/rejected": -113.11669921875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3415039777755737, + "rewards/margins": 23.172767639160156, + "rewards/rejected": -21.83126449584961, + "step": 2740 + }, + { + "epoch": 1.26, + "learning_rate": 1.9381024860476912e-07, + "logits/chosen": -0.8179903030395508, + "logits/rejected": -0.7713826894760132, + "logps/chosen": -90.3552017211914, + "logps/rejected": -110.89227294921875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8493086099624634, + "rewards/margins": 22.596853256225586, + "rewards/rejected": -21.747547149658203, + "step": 2750 + }, + { + "epoch": 1.26, + "learning_rate": 1.933028919330289e-07, + "logits/chosen": -0.78132164478302, + "logits/rejected": -0.7138301134109497, + "logps/chosen": -84.56668853759766, + "logps/rejected": -112.89170837402344, + "loss": 0.0049, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.28099095821380615, + "rewards/margins": 22.480010986328125, + "rewards/rejected": -22.199020385742188, + "step": 2760 + }, + { + "epoch": 1.26, + "learning_rate": 1.9279553526128866e-07, + "logits/chosen": -0.8231332898139954, + "logits/rejected": -0.7319883704185486, + "logps/chosen": -83.89222717285156, + "logps/rejected": -110.40364074707031, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8967598080635071, + "rewards/margins": 23.46830177307129, + "rewards/rejected": -22.5715389251709, + "step": 2770 + }, + { + "epoch": 1.27, + "learning_rate": 1.9228817858954842e-07, + "logits/chosen": -0.8091581463813782, + "logits/rejected": -0.719267725944519, + "logps/chosen": -83.92425537109375, + "logps/rejected": -114.30082702636719, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.229701280593872, + "rewards/margins": 24.159175872802734, + "rewards/rejected": -22.929473876953125, + "step": 2780 + }, + { + "epoch": 1.27, + "learning_rate": 1.917808219178082e-07, + "logits/chosen": -0.8809703588485718, + "logits/rejected": -0.8047698140144348, + "logps/chosen": -86.08138275146484, + "logps/rejected": -109.77156066894531, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8736478686332703, + "rewards/margins": 22.648921966552734, + "rewards/rejected": -21.77527618408203, + "step": 2790 + }, + { + "epoch": 1.28, + "learning_rate": 1.9127346524606796e-07, + "logits/chosen": -0.8428120613098145, + "logits/rejected": -0.815168023109436, + "logps/chosen": -82.51287841796875, + "logps/rejected": -108.46163177490234, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5113711357116699, + "rewards/margins": 21.765836715698242, + "rewards/rejected": -21.254467010498047, + "step": 2800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": -0.7845156192779541, + "eval_logits/rejected": -0.7218629717826843, + "eval_logps/chosen": -84.17462158203125, + "eval_logps/rejected": -106.3823471069336, + "eval_loss": 0.005751576274633408, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.6816177368164062, + "eval_rewards/margins": 21.442089080810547, + "eval_rewards/rejected": -20.760467529296875, + "eval_runtime": 66.7253, + "eval_samples_per_second": 42.892, + "eval_steps_per_second": 2.683, + "step": 2800 + }, + { + "epoch": 1.28, + "learning_rate": 1.9076610857432772e-07, + "logits/chosen": -0.8195465207099915, + "logits/rejected": -0.7400294542312622, + "logps/chosen": -86.13768768310547, + "logps/rejected": -108.82917785644531, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1283977031707764, + "rewards/margins": 22.65591049194336, + "rewards/rejected": -21.527515411376953, + "step": 2810 + }, + { + "epoch": 1.29, + "learning_rate": 1.902587519025875e-07, + "logits/chosen": -0.7770295739173889, + "logits/rejected": -0.7523963451385498, + "logps/chosen": -88.90458679199219, + "logps/rejected": -110.6171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35731858015060425, + "rewards/margins": 22.745914459228516, + "rewards/rejected": -22.388593673706055, + "step": 2820 + }, + { + "epoch": 1.29, + "learning_rate": 1.8975139523084726e-07, + "logits/chosen": -0.7430048584938049, + "logits/rejected": -0.695781946182251, + "logps/chosen": -85.51004028320312, + "logps/rejected": -113.87823486328125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0342652797698975, + "rewards/margins": 22.87997817993164, + "rewards/rejected": -21.845712661743164, + "step": 2830 + }, + { + "epoch": 1.3, + "learning_rate": 1.8924403855910702e-07, + "logits/chosen": -0.8346872329711914, + "logits/rejected": -0.8123058080673218, + "logps/chosen": -78.13063049316406, + "logps/rejected": -108.62190246582031, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.306009441614151, + "rewards/margins": 21.68273162841797, + "rewards/rejected": -21.376720428466797, + "step": 2840 + }, + { + "epoch": 1.3, + "learning_rate": 1.887366818873668e-07, + "logits/chosen": -0.8324508666992188, + "logits/rejected": -0.7995740175247192, + "logps/chosen": -85.37599182128906, + "logps/rejected": -107.89019775390625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1990930736064911, + "rewards/margins": 21.476375579833984, + "rewards/rejected": -21.277286529541016, + "step": 2850 + }, + { + "epoch": 1.31, + "learning_rate": 1.8822932521562656e-07, + "logits/chosen": -0.7088965177536011, + "logits/rejected": -0.7114429473876953, + "logps/chosen": -86.03219604492188, + "logps/rejected": -108.50044250488281, + "loss": 0.0078, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2706319093704224, + "rewards/margins": 21.439594268798828, + "rewards/rejected": -20.168964385986328, + "step": 2860 + }, + { + "epoch": 1.31, + "learning_rate": 1.8772196854388632e-07, + "logits/chosen": -0.8044069409370422, + "logits/rejected": -0.7494329214096069, + "logps/chosen": -88.49215698242188, + "logps/rejected": -112.16410064697266, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07424769550561905, + "rewards/margins": 21.921253204345703, + "rewards/rejected": -21.995498657226562, + "step": 2870 + }, + { + "epoch": 1.31, + "learning_rate": 1.872146118721461e-07, + "logits/chosen": -0.8148896098136902, + "logits/rejected": -0.7668309211730957, + "logps/chosen": -92.65351104736328, + "logps/rejected": -109.47776794433594, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5657011270523071, + "rewards/margins": 21.924177169799805, + "rewards/rejected": -21.358476638793945, + "step": 2880 + }, + { + "epoch": 1.32, + "learning_rate": 1.8670725520040586e-07, + "logits/chosen": -0.8484834432601929, + "logits/rejected": -0.8214718103408813, + "logps/chosen": -87.38011169433594, + "logps/rejected": -112.77238464355469, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.251098871231079, + "rewards/margins": 23.54327392578125, + "rewards/rejected": -22.292173385620117, + "step": 2890 + }, + { + "epoch": 1.32, + "learning_rate": 1.8619989852866562e-07, + "logits/chosen": -0.8180379867553711, + "logits/rejected": -0.7856913805007935, + "logps/chosen": -85.47797393798828, + "logps/rejected": -108.2070541381836, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7616327404975891, + "rewards/margins": 22.59707260131836, + "rewards/rejected": -21.835439682006836, + "step": 2900 + }, + { + "epoch": 1.32, + "eval_logits/chosen": -0.8321563005447388, + "eval_logits/rejected": -0.762521505355835, + "eval_logps/chosen": -84.00428009033203, + "eval_logps/rejected": -108.98328399658203, + "eval_loss": 0.005883732810616493, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.7667914032936096, + "eval_rewards/margins": 22.827733993530273, + "eval_rewards/rejected": -22.060945510864258, + "eval_runtime": 73.6418, + "eval_samples_per_second": 38.864, + "eval_steps_per_second": 2.431, + "step": 2900 + }, + { + "epoch": 1.33, + "learning_rate": 1.856925418569254e-07, + "logits/chosen": -0.7657192945480347, + "logits/rejected": -0.8023189306259155, + "logps/chosen": -87.12007141113281, + "logps/rejected": -110.1858901977539, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8483368158340454, + "rewards/margins": 23.544336318969727, + "rewards/rejected": -21.696001052856445, + "step": 2910 + }, + { + "epoch": 1.33, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -0.8202277421951294, + "logits/rejected": -0.8277060389518738, + "logps/chosen": -81.93797302246094, + "logps/rejected": -109.71229553222656, + "loss": 0.0067, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1504634618759155, + "rewards/margins": 23.43653106689453, + "rewards/rejected": -22.286067962646484, + "step": 2920 + }, + { + "epoch": 1.34, + "learning_rate": 1.8467782851344492e-07, + "logits/chosen": -0.8164095878601074, + "logits/rejected": -0.797565758228302, + "logps/chosen": -83.46281433105469, + "logps/rejected": -110.8356704711914, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3928399384021759, + "rewards/margins": 23.051225662231445, + "rewards/rejected": -22.65838623046875, + "step": 2930 + }, + { + "epoch": 1.34, + "learning_rate": 1.841704718417047e-07, + "logits/chosen": -0.8441513180732727, + "logits/rejected": -0.794573962688446, + "logps/chosen": -80.64539337158203, + "logps/rejected": -109.9163589477539, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5498849749565125, + "rewards/margins": 22.937286376953125, + "rewards/rejected": -22.387401580810547, + "step": 2940 + }, + { + "epoch": 1.35, + "learning_rate": 1.8366311516996446e-07, + "logits/chosen": -0.8687325716018677, + "logits/rejected": -0.8416921496391296, + "logps/chosen": -83.46965789794922, + "logps/rejected": -108.26016998291016, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4809293746948242, + "rewards/margins": 22.91249656677246, + "rewards/rejected": -21.431570053100586, + "step": 2950 + }, + { + "epoch": 1.35, + "learning_rate": 1.8315575849822422e-07, + "logits/chosen": -0.7985904812812805, + "logits/rejected": -0.7669427990913391, + "logps/chosen": -82.79601287841797, + "logps/rejected": -108.02484130859375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0780209302902222, + "rewards/margins": 21.9554443359375, + "rewards/rejected": -20.877426147460938, + "step": 2960 + }, + { + "epoch": 1.36, + "learning_rate": 1.82648401826484e-07, + "logits/chosen": -0.838662326335907, + "logits/rejected": -0.7950839996337891, + "logps/chosen": -80.59087371826172, + "logps/rejected": -113.2815933227539, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22922952473163605, + "rewards/margins": 22.965641021728516, + "rewards/rejected": -22.736412048339844, + "step": 2970 + }, + { + "epoch": 1.36, + "learning_rate": 1.8214104515474375e-07, + "logits/chosen": -0.8701616525650024, + "logits/rejected": -0.809054970741272, + "logps/chosen": -84.26366424560547, + "logps/rejected": -112.36161804199219, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2285851538181305, + "rewards/margins": 24.036575317382812, + "rewards/rejected": -23.80799102783203, + "step": 2980 + }, + { + "epoch": 1.36, + "learning_rate": 1.8163368848300352e-07, + "logits/chosen": -0.9309707880020142, + "logits/rejected": -0.8480417132377625, + "logps/chosen": -77.65840911865234, + "logps/rejected": -114.49421691894531, + "loss": 0.0115, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9348442554473877, + "rewards/margins": 24.756481170654297, + "rewards/rejected": -22.821638107299805, + "step": 2990 + }, + { + "epoch": 1.37, + "learning_rate": 1.811263318112633e-07, + "logits/chosen": -0.8576655387878418, + "logits/rejected": -0.7783030867576599, + "logps/chosen": -85.56487274169922, + "logps/rejected": -112.32801818847656, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1698871850967407, + "rewards/margins": 22.855905532836914, + "rewards/rejected": -21.686016082763672, + "step": 3000 + }, + { + "epoch": 1.37, + "eval_logits/chosen": -0.8208735585212708, + "eval_logits/rejected": -0.7533836960792542, + "eval_logps/chosen": -82.93585968017578, + "eval_logps/rejected": -106.85352325439453, + "eval_loss": 0.005454268306493759, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.3009955883026123, + "eval_rewards/margins": 22.297056198120117, + "eval_rewards/rejected": -20.99605941772461, + "eval_runtime": 72.0521, + "eval_samples_per_second": 39.721, + "eval_steps_per_second": 2.484, + "step": 3000 + }, + { + "epoch": 1.37, + "learning_rate": 1.8061897513952305e-07, + "logits/chosen": -0.8196511268615723, + "logits/rejected": -0.6741745471954346, + "logps/chosen": -83.01131439208984, + "logps/rejected": -107.64549255371094, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.772728443145752, + "rewards/margins": 23.33978843688965, + "rewards/rejected": -20.567058563232422, + "step": 3010 + }, + { + "epoch": 1.38, + "learning_rate": 1.8011161846778282e-07, + "logits/chosen": -0.8143720626831055, + "logits/rejected": -0.751374363899231, + "logps/chosen": -82.5423583984375, + "logps/rejected": -108.41703033447266, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0407277345657349, + "rewards/margins": 21.009965896606445, + "rewards/rejected": -19.96923828125, + "step": 3020 + }, + { + "epoch": 1.38, + "learning_rate": 1.796042617960426e-07, + "logits/chosen": -0.7924383878707886, + "logits/rejected": -0.7827506065368652, + "logps/chosen": -94.8802719116211, + "logps/rejected": -110.94437408447266, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6922892332077026, + "rewards/margins": 20.605281829833984, + "rewards/rejected": -19.91299057006836, + "step": 3030 + }, + { + "epoch": 1.39, + "learning_rate": 1.7909690512430235e-07, + "logits/chosen": -0.7978582382202148, + "logits/rejected": -0.7540158629417419, + "logps/chosen": -81.8050308227539, + "logps/rejected": -104.96846008300781, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.273224353790283, + "rewards/margins": 22.64246368408203, + "rewards/rejected": -20.36924171447754, + "step": 3040 + }, + { + "epoch": 1.39, + "learning_rate": 1.7858954845256212e-07, + "logits/chosen": -0.8617739677429199, + "logits/rejected": -0.7627667784690857, + "logps/chosen": -83.94235229492188, + "logps/rejected": -106.07057189941406, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.619549036026001, + "rewards/margins": 21.559040069580078, + "rewards/rejected": -19.939491271972656, + "step": 3050 + }, + { + "epoch": 1.4, + "learning_rate": 1.780821917808219e-07, + "logits/chosen": -0.8220621347427368, + "logits/rejected": -0.810819149017334, + "logps/chosen": -87.0011978149414, + "logps/rejected": -108.83686828613281, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5197734832763672, + "rewards/margins": 21.68050193786621, + "rewards/rejected": -20.160728454589844, + "step": 3060 + }, + { + "epoch": 1.4, + "learning_rate": 1.7757483510908165e-07, + "logits/chosen": -0.7937394976615906, + "logits/rejected": -0.7784138917922974, + "logps/chosen": -80.34947204589844, + "logps/rejected": -111.46089172363281, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9023909568786621, + "rewards/margins": 21.696002960205078, + "rewards/rejected": -20.793611526489258, + "step": 3070 + }, + { + "epoch": 1.41, + "learning_rate": 1.7706747843734142e-07, + "logits/chosen": -0.8737959861755371, + "logits/rejected": -0.8506304621696472, + "logps/chosen": -80.43476867675781, + "logps/rejected": -105.3740005493164, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0261263847351074, + "rewards/margins": 21.9659423828125, + "rewards/rejected": -20.93981170654297, + "step": 3080 + }, + { + "epoch": 1.41, + "learning_rate": 1.765601217656012e-07, + "logits/chosen": -0.8673890233039856, + "logits/rejected": -0.8221572041511536, + "logps/chosen": -83.61185455322266, + "logps/rejected": -108.5359878540039, + "loss": 0.0057, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3645645380020142, + "rewards/margins": 21.998119354248047, + "rewards/rejected": -20.633556365966797, + "step": 3090 + }, + { + "epoch": 1.42, + "learning_rate": 1.7605276509386095e-07, + "logits/chosen": -0.7997790575027466, + "logits/rejected": -0.7952042818069458, + "logps/chosen": -79.34935760498047, + "logps/rejected": -107.53935241699219, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1273956298828125, + "rewards/margins": 22.18309211730957, + "rewards/rejected": -21.055694580078125, + "step": 3100 + }, + { + "epoch": 1.42, + "eval_logits/chosen": -0.8161947727203369, + "eval_logits/rejected": -0.7520949840545654, + "eval_logps/chosen": -82.9143295288086, + "eval_logps/rejected": -105.88829040527344, + "eval_loss": 0.005368279293179512, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.3117655515670776, + "eval_rewards/margins": 21.82520866394043, + "eval_rewards/rejected": -20.513439178466797, + "eval_runtime": 78.788, + "eval_samples_per_second": 36.325, + "eval_steps_per_second": 2.272, + "step": 3100 + }, + { + "epoch": 1.42, + "learning_rate": 1.7554540842212072e-07, + "logits/chosen": -0.8044342994689941, + "logits/rejected": -0.7702390551567078, + "logps/chosen": -79.634765625, + "logps/rejected": -106.61177062988281, + "loss": 0.0043, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8688969612121582, + "rewards/margins": 21.35724639892578, + "rewards/rejected": -20.48834991455078, + "step": 3110 + }, + { + "epoch": 1.42, + "learning_rate": 1.750380517503805e-07, + "logits/chosen": -0.8742705583572388, + "logits/rejected": -0.8354657888412476, + "logps/chosen": -81.12736511230469, + "logps/rejected": -104.99991607666016, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1102584600448608, + "rewards/margins": 20.783863067626953, + "rewards/rejected": -19.673603057861328, + "step": 3120 + }, + { + "epoch": 1.43, + "learning_rate": 1.7453069507864025e-07, + "logits/chosen": -0.8716095089912415, + "logits/rejected": -0.7494352459907532, + "logps/chosen": -81.30690002441406, + "logps/rejected": -105.8340835571289, + "loss": 0.0079, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8371508717536926, + "rewards/margins": 22.083576202392578, + "rewards/rejected": -21.246429443359375, + "step": 3130 + }, + { + "epoch": 1.43, + "learning_rate": 1.7402333840690002e-07, + "logits/chosen": -0.7608228921890259, + "logits/rejected": -0.7334285378456116, + "logps/chosen": -91.09227752685547, + "logps/rejected": -112.9427261352539, + "loss": 0.0017, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4231182932853699, + "rewards/margins": 21.993289947509766, + "rewards/rejected": -21.570171356201172, + "step": 3140 + }, + { + "epoch": 1.44, + "learning_rate": 1.735159817351598e-07, + "logits/chosen": -0.7460505962371826, + "logits/rejected": -0.7124925851821899, + "logps/chosen": -79.06074523925781, + "logps/rejected": -111.85310363769531, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2974584102630615, + "rewards/margins": 23.934118270874023, + "rewards/rejected": -21.636661529541016, + "step": 3150 + }, + { + "epoch": 1.44, + "learning_rate": 1.7300862506341955e-07, + "logits/chosen": -0.8754836916923523, + "logits/rejected": -0.8337736129760742, + "logps/chosen": -84.03656005859375, + "logps/rejected": -110.4716796875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2922741174697876, + "rewards/margins": 23.63736343383789, + "rewards/rejected": -22.345090866088867, + "step": 3160 + }, + { + "epoch": 1.45, + "learning_rate": 1.7250126839167932e-07, + "logits/chosen": -0.9508827328681946, + "logits/rejected": -0.83515465259552, + "logps/chosen": -85.82405090332031, + "logps/rejected": -107.03657531738281, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8837907910346985, + "rewards/margins": 21.791425704956055, + "rewards/rejected": -20.907634735107422, + "step": 3170 + }, + { + "epoch": 1.45, + "learning_rate": 1.719939117199391e-07, + "logits/chosen": -0.7886821031570435, + "logits/rejected": -0.7693257927894592, + "logps/chosen": -83.99065399169922, + "logps/rejected": -107.45686340332031, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2205920219421387, + "rewards/margins": 23.225391387939453, + "rewards/rejected": -21.00480079650879, + "step": 3180 + }, + { + "epoch": 1.46, + "learning_rate": 1.7148655504819885e-07, + "logits/chosen": -0.7591974139213562, + "logits/rejected": -0.7449969053268433, + "logps/chosen": -82.21507263183594, + "logps/rejected": -108.7207260131836, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1014723777770996, + "rewards/margins": 21.82181739807129, + "rewards/rejected": -20.720340728759766, + "step": 3190 + }, + { + "epoch": 1.46, + "learning_rate": 1.7097919837645862e-07, + "logits/chosen": -0.8855707049369812, + "logits/rejected": -0.8614555597305298, + "logps/chosen": -81.53754425048828, + "logps/rejected": -109.7943344116211, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4457625150680542, + "rewards/margins": 23.156801223754883, + "rewards/rejected": -21.711040496826172, + "step": 3200 + }, + { + "epoch": 1.46, + "eval_logits/chosen": -0.829251229763031, + "eval_logits/rejected": -0.761223554611206, + "eval_logps/chosen": -82.8708267211914, + "eval_logps/rejected": -106.64373779296875, + "eval_loss": 0.0055304598063230515, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 1.3335167169570923, + "eval_rewards/margins": 22.224679946899414, + "eval_rewards/rejected": -20.891162872314453, + "eval_runtime": 81.9215, + "eval_samples_per_second": 34.936, + "eval_steps_per_second": 2.185, + "step": 3200 + }, + { + "epoch": 1.47, + "learning_rate": 1.704718417047184e-07, + "logits/chosen": -0.8796641230583191, + "logits/rejected": -0.8663586378097534, + "logps/chosen": -85.18115234375, + "logps/rejected": -115.62249755859375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1032954454421997, + "rewards/margins": 23.180606842041016, + "rewards/rejected": -22.07731056213379, + "step": 3210 + }, + { + "epoch": 1.47, + "learning_rate": 1.6996448503297815e-07, + "logits/chosen": -0.8402705192565918, + "logits/rejected": -0.7693440318107605, + "logps/chosen": -87.40254974365234, + "logps/rejected": -112.21661376953125, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9603824615478516, + "rewards/margins": 22.17110252380371, + "rewards/rejected": -21.21072006225586, + "step": 3220 + }, + { + "epoch": 1.47, + "learning_rate": 1.6945712836123792e-07, + "logits/chosen": -0.8487188220024109, + "logits/rejected": -0.7834162712097168, + "logps/chosen": -84.28276062011719, + "logps/rejected": -107.58403015136719, + "loss": 0.0069, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3142402172088623, + "rewards/margins": 21.98873519897461, + "rewards/rejected": -20.67449378967285, + "step": 3230 + }, + { + "epoch": 1.48, + "learning_rate": 1.689497716894977e-07, + "logits/chosen": -0.793751060962677, + "logits/rejected": -0.7844887971878052, + "logps/chosen": -79.22299194335938, + "logps/rejected": -104.5686264038086, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1418697834014893, + "rewards/margins": 21.396854400634766, + "rewards/rejected": -20.254985809326172, + "step": 3240 + }, + { + "epoch": 1.48, + "learning_rate": 1.6844241501775745e-07, + "logits/chosen": -0.8498477935791016, + "logits/rejected": -0.8198568224906921, + "logps/chosen": -87.35657501220703, + "logps/rejected": -111.27471923828125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1994593143463135, + "rewards/margins": 22.922657012939453, + "rewards/rejected": -21.723196029663086, + "step": 3250 + }, + { + "epoch": 1.49, + "learning_rate": 1.6793505834601722e-07, + "logits/chosen": -0.9263782501220703, + "logits/rejected": -0.8542073965072632, + "logps/chosen": -76.23494720458984, + "logps/rejected": -113.0631103515625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5995099544525146, + "rewards/margins": 25.61696434020996, + "rewards/rejected": -24.017454147338867, + "step": 3260 + }, + { + "epoch": 1.49, + "learning_rate": 1.67427701674277e-07, + "logits/chosen": -0.8342376947402954, + "logits/rejected": -0.756497859954834, + "logps/chosen": -81.8563232421875, + "logps/rejected": -108.46012115478516, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6572699546813965, + "rewards/margins": 23.10988426208496, + "rewards/rejected": -22.45261573791504, + "step": 3270 + }, + { + "epoch": 1.5, + "learning_rate": 1.6692034500253675e-07, + "logits/chosen": -0.8144375681877136, + "logits/rejected": -0.7653582096099854, + "logps/chosen": -81.36412048339844, + "logps/rejected": -115.94920349121094, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2872655391693115, + "rewards/margins": 24.55162239074707, + "rewards/rejected": -23.264354705810547, + "step": 3280 + }, + { + "epoch": 1.5, + "learning_rate": 1.6641298833079652e-07, + "logits/chosen": -0.8921886682510376, + "logits/rejected": -0.8617362976074219, + "logps/chosen": -86.64197540283203, + "logps/rejected": -110.98530578613281, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1070516109466553, + "rewards/margins": 23.492164611816406, + "rewards/rejected": -22.385112762451172, + "step": 3290 + }, + { + "epoch": 1.51, + "learning_rate": 1.659056316590563e-07, + "logits/chosen": -0.871414303779602, + "logits/rejected": -0.7999138236045837, + "logps/chosen": -82.70069885253906, + "logps/rejected": -109.51627349853516, + "loss": 0.0021, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.31220346689224243, + "rewards/margins": 23.60110855102539, + "rewards/rejected": -23.28890609741211, + "step": 3300 + }, + { + "epoch": 1.51, + "eval_logits/chosen": -0.8566647171974182, + "eval_logits/rejected": -0.7842747569084167, + "eval_logps/chosen": -83.30318450927734, + "eval_logps/rejected": -109.43997955322266, + "eval_loss": 0.005761295091360807, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.117339015007019, + "eval_rewards/margins": 23.406618118286133, + "eval_rewards/rejected": -22.28927993774414, + "eval_runtime": 69.4914, + "eval_samples_per_second": 41.185, + "eval_steps_per_second": 2.576, + "step": 3300 + }, + { + "epoch": 1.51, + "learning_rate": 1.6539827498731605e-07, + "logits/chosen": -0.8968285322189331, + "logits/rejected": -0.8614455461502075, + "logps/chosen": -85.897216796875, + "logps/rejected": -110.06578063964844, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6581555604934692, + "rewards/margins": 23.074901580810547, + "rewards/rejected": -22.416744232177734, + "step": 3310 + }, + { + "epoch": 1.52, + "learning_rate": 1.6489091831557582e-07, + "logits/chosen": -0.803012490272522, + "logits/rejected": -0.7980927228927612, + "logps/chosen": -82.73001861572266, + "logps/rejected": -111.27703857421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6886202692985535, + "rewards/margins": 23.198246002197266, + "rewards/rejected": -22.509626388549805, + "step": 3320 + }, + { + "epoch": 1.52, + "learning_rate": 1.643835616438356e-07, + "logits/chosen": -0.8772886395454407, + "logits/rejected": -0.8491265177726746, + "logps/chosen": -85.08233642578125, + "logps/rejected": -115.56292724609375, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.838422954082489, + "rewards/margins": 24.703176498413086, + "rewards/rejected": -23.864755630493164, + "step": 3330 + }, + { + "epoch": 1.52, + "learning_rate": 1.6387620497209535e-07, + "logits/chosen": -0.7757368683815002, + "logits/rejected": -0.7173784971237183, + "logps/chosen": -82.10543060302734, + "logps/rejected": -111.90571594238281, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7612966895103455, + "rewards/margins": 23.784420013427734, + "rewards/rejected": -23.023122787475586, + "step": 3340 + }, + { + "epoch": 1.53, + "learning_rate": 1.6336884830035512e-07, + "logits/chosen": -0.9086600542068481, + "logits/rejected": -0.8849117159843445, + "logps/chosen": -83.78245544433594, + "logps/rejected": -115.2596206665039, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6915544271469116, + "rewards/margins": 23.850078582763672, + "rewards/rejected": -23.158523559570312, + "step": 3350 + }, + { + "epoch": 1.53, + "learning_rate": 1.6286149162861489e-07, + "logits/chosen": -0.9009075164794922, + "logits/rejected": -0.8365023732185364, + "logps/chosen": -87.08536529541016, + "logps/rejected": -118.95816802978516, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35949334502220154, + "rewards/margins": 24.53894805908203, + "rewards/rejected": -24.179454803466797, + "step": 3360 + }, + { + "epoch": 1.54, + "learning_rate": 1.6235413495687465e-07, + "logits/chosen": -0.7893753051757812, + "logits/rejected": -0.700376033782959, + "logps/chosen": -83.50393676757812, + "logps/rejected": -110.16971588134766, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2407291829586029, + "rewards/margins": 23.86244773864746, + "rewards/rejected": -23.621719360351562, + "step": 3370 + }, + { + "epoch": 1.54, + "learning_rate": 1.6184677828513442e-07, + "logits/chosen": -0.8861139416694641, + "logits/rejected": -0.8305456042289734, + "logps/chosen": -85.4578628540039, + "logps/rejected": -115.61383056640625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0657522901892662, + "rewards/margins": 24.47097396850586, + "rewards/rejected": -24.40522003173828, + "step": 3380 + }, + { + "epoch": 1.55, + "learning_rate": 1.613394216133942e-07, + "logits/chosen": -0.9259065389633179, + "logits/rejected": -0.8677760362625122, + "logps/chosen": -81.14451599121094, + "logps/rejected": -114.08638763427734, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1236779689788818, + "rewards/margins": 25.24271583557129, + "rewards/rejected": -24.119037628173828, + "step": 3390 + }, + { + "epoch": 1.55, + "learning_rate": 1.6083206494165398e-07, + "logits/chosen": -0.9464787244796753, + "logits/rejected": -0.9007658958435059, + "logps/chosen": -81.28665924072266, + "logps/rejected": -113.91754150390625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8432676196098328, + "rewards/margins": 25.56879997253418, + "rewards/rejected": -24.725528717041016, + "step": 3400 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -0.8611359000205994, + "eval_logits/rejected": -0.7883932590484619, + "eval_logps/chosen": -83.68134307861328, + "eval_logps/rejected": -112.33548736572266, + "eval_loss": 0.006425461731851101, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.928252637386322, + "eval_rewards/margins": 24.665298461914062, + "eval_rewards/rejected": -23.737045288085938, + "eval_runtime": 72.9825, + "eval_samples_per_second": 39.215, + "eval_steps_per_second": 2.453, + "step": 3400 + }, + { + "epoch": 1.56, + "learning_rate": 1.6032470826991375e-07, + "logits/chosen": -0.849983811378479, + "logits/rejected": -0.7683964371681213, + "logps/chosen": -83.12248229980469, + "logps/rejected": -114.29350280761719, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.212885618209839, + "rewards/margins": 25.064769744873047, + "rewards/rejected": -22.851879119873047, + "step": 3410 + }, + { + "epoch": 1.56, + "learning_rate": 1.598173515981735e-07, + "logits/chosen": -0.8216031789779663, + "logits/rejected": -0.7773474454879761, + "logps/chosen": -88.26131439208984, + "logps/rejected": -109.80609130859375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0204696655273438, + "rewards/margins": 24.57611083984375, + "rewards/rejected": -22.555639266967773, + "step": 3420 + }, + { + "epoch": 1.57, + "learning_rate": 1.5930999492643328e-07, + "logits/chosen": -0.9190078973770142, + "logits/rejected": -0.8715893030166626, + "logps/chosen": -84.56954956054688, + "logps/rejected": -113.73841857910156, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3089227676391602, + "rewards/margins": 25.795129776000977, + "rewards/rejected": -24.486209869384766, + "step": 3430 + }, + { + "epoch": 1.57, + "learning_rate": 1.5880263825469305e-07, + "logits/chosen": -0.9268477559089661, + "logits/rejected": -0.8103355169296265, + "logps/chosen": -83.41505432128906, + "logps/rejected": -116.24625396728516, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2660026550292969, + "rewards/margins": 26.462406158447266, + "rewards/rejected": -25.19640350341797, + "step": 3440 + }, + { + "epoch": 1.57, + "learning_rate": 1.582952815829528e-07, + "logits/chosen": -0.7966245412826538, + "logits/rejected": -0.7210028767585754, + "logps/chosen": -85.24122619628906, + "logps/rejected": -118.64140319824219, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4916000366210938, + "rewards/margins": 25.66861915588379, + "rewards/rejected": -24.177021026611328, + "step": 3450 + }, + { + "epoch": 1.58, + "learning_rate": 1.5778792491121258e-07, + "logits/chosen": -0.9547828435897827, + "logits/rejected": -0.9107531309127808, + "logps/chosen": -84.8668441772461, + "logps/rejected": -114.7540283203125, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2354485988616943, + "rewards/margins": 26.283843994140625, + "rewards/rejected": -25.048397064208984, + "step": 3460 + }, + { + "epoch": 1.58, + "learning_rate": 1.5728056823947235e-07, + "logits/chosen": -0.9525339007377625, + "logits/rejected": -0.8235961198806763, + "logps/chosen": -83.3685531616211, + "logps/rejected": -113.2531509399414, + "loss": 0.0046, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6874784231185913, + "rewards/margins": 25.723840713500977, + "rewards/rejected": -24.036357879638672, + "step": 3470 + }, + { + "epoch": 1.59, + "learning_rate": 1.567732115677321e-07, + "logits/chosen": -0.8695586919784546, + "logits/rejected": -0.7993710041046143, + "logps/chosen": -87.9699935913086, + "logps/rejected": -113.51700592041016, + "loss": 0.0063, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6940809488296509, + "rewards/margins": 24.42647933959961, + "rewards/rejected": -23.732397079467773, + "step": 3480 + }, + { + "epoch": 1.59, + "learning_rate": 1.5626585489599188e-07, + "logits/chosen": -0.8067137598991394, + "logits/rejected": -0.7603528499603271, + "logps/chosen": -89.75978088378906, + "logps/rejected": -124.76887512207031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7332732677459717, + "rewards/margins": 27.42246437072754, + "rewards/rejected": -25.689189910888672, + "step": 3490 + }, + { + "epoch": 1.6, + "learning_rate": 1.5575849822425165e-07, + "logits/chosen": -0.9085081815719604, + "logits/rejected": -0.8620240092277527, + "logps/chosen": -82.04761505126953, + "logps/rejected": -117.4197006225586, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5324010848999023, + "rewards/margins": 25.046289443969727, + "rewards/rejected": -24.513887405395508, + "step": 3500 + }, + { + "epoch": 1.6, + "eval_logits/chosen": -0.875246524810791, + "eval_logits/rejected": -0.7964990139007568, + "eval_logps/chosen": -83.70887756347656, + "eval_logps/rejected": -114.64440155029297, + "eval_loss": 0.00680342735722661, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9144880175590515, + "eval_rewards/margins": 25.805988311767578, + "eval_rewards/rejected": -24.891504287719727, + "eval_runtime": 99.4145, + "eval_samples_per_second": 28.789, + "eval_steps_per_second": 1.801, + "step": 3500 + }, + { + "epoch": 1.6, + "learning_rate": 1.552511415525114e-07, + "logits/chosen": -0.9048668146133423, + "logits/rejected": -0.882061779499054, + "logps/chosen": -85.49907684326172, + "logps/rejected": -117.0702133178711, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2163362205028534, + "rewards/margins": 25.046667098999023, + "rewards/rejected": -25.263004302978516, + "step": 3510 + }, + { + "epoch": 1.61, + "learning_rate": 1.5474378488077118e-07, + "logits/chosen": -0.9017683863639832, + "logits/rejected": -0.8488256335258484, + "logps/chosen": -91.98550415039062, + "logps/rejected": -121.7437973022461, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6241726875305176, + "rewards/margins": 25.137426376342773, + "rewards/rejected": -24.513254165649414, + "step": 3520 + }, + { + "epoch": 1.61, + "learning_rate": 1.5423642820903095e-07, + "logits/chosen": -0.8245092630386353, + "logits/rejected": -0.791895866394043, + "logps/chosen": -83.31465148925781, + "logps/rejected": -118.2453842163086, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7763601541519165, + "rewards/margins": 26.058420181274414, + "rewards/rejected": -24.282062530517578, + "step": 3530 + }, + { + "epoch": 1.62, + "learning_rate": 1.537290715372907e-07, + "logits/chosen": -0.953994870185852, + "logits/rejected": -0.8880467414855957, + "logps/chosen": -86.09355163574219, + "logps/rejected": -112.99943542480469, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6044597625732422, + "rewards/margins": 25.4656982421875, + "rewards/rejected": -23.861236572265625, + "step": 3540 + }, + { + "epoch": 1.62, + "learning_rate": 1.5322171486555048e-07, + "logits/chosen": -0.9053792953491211, + "logits/rejected": -0.838627815246582, + "logps/chosen": -90.17684936523438, + "logps/rejected": -120.52122497558594, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3521316051483154, + "rewards/margins": 28.084697723388672, + "rewards/rejected": -25.73256492614746, + "step": 3550 + }, + { + "epoch": 1.63, + "learning_rate": 1.5271435819381025e-07, + "logits/chosen": -0.9697454571723938, + "logits/rejected": -0.8927943110466003, + "logps/chosen": -86.13640594482422, + "logps/rejected": -116.8769302368164, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8283834457397461, + "rewards/margins": 25.841808319091797, + "rewards/rejected": -25.013423919677734, + "step": 3560 + }, + { + "epoch": 1.63, + "learning_rate": 1.5220700152207e-07, + "logits/chosen": -0.8806090354919434, + "logits/rejected": -0.8488904237747192, + "logps/chosen": -90.3905029296875, + "logps/rejected": -119.23170471191406, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9313351511955261, + "rewards/margins": 25.077367782592773, + "rewards/rejected": -24.146032333374023, + "step": 3570 + }, + { + "epoch": 1.63, + "learning_rate": 1.5169964485032978e-07, + "logits/chosen": -0.9907437562942505, + "logits/rejected": -0.9438600540161133, + "logps/chosen": -86.11767578125, + "logps/rejected": -126.03318786621094, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6631255745887756, + "rewards/margins": 26.9681396484375, + "rewards/rejected": -26.30501365661621, + "step": 3580 + }, + { + "epoch": 1.64, + "learning_rate": 1.5119228817858955e-07, + "logits/chosen": -0.9619858860969543, + "logits/rejected": -0.8756266832351685, + "logps/chosen": -82.71698760986328, + "logps/rejected": -119.3367691040039, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6457278728485107, + "rewards/margins": 26.794042587280273, + "rewards/rejected": -25.148311614990234, + "step": 3590 + }, + { + "epoch": 1.64, + "learning_rate": 1.506849315068493e-07, + "logits/chosen": -0.8155440092086792, + "logits/rejected": -0.7831107378005981, + "logps/chosen": -85.99327087402344, + "logps/rejected": -113.37886047363281, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6102584600448608, + "rewards/margins": 24.855575561523438, + "rewards/rejected": -23.245319366455078, + "step": 3600 + }, + { + "epoch": 1.64, + "eval_logits/chosen": -0.8784149885177612, + "eval_logits/rejected": -0.8029141426086426, + "eval_logps/chosen": -83.55583190917969, + "eval_logps/rejected": -113.30615997314453, + "eval_loss": 0.006304467096924782, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9910123348236084, + "eval_rewards/margins": 25.21339225769043, + "eval_rewards/rejected": -24.22237777709961, + "eval_runtime": 67.7796, + "eval_samples_per_second": 42.225, + "eval_steps_per_second": 2.641, + "step": 3600 + }, + { + "epoch": 1.65, + "learning_rate": 1.5017757483510908e-07, + "logits/chosen": -0.8234073519706726, + "logits/rejected": -0.8046862483024597, + "logps/chosen": -82.49821472167969, + "logps/rejected": -117.84230041503906, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6039770841598511, + "rewards/margins": 24.839069366455078, + "rewards/rejected": -24.235092163085938, + "step": 3610 + }, + { + "epoch": 1.65, + "learning_rate": 1.4967021816336885e-07, + "logits/chosen": -0.8637280464172363, + "logits/rejected": -0.8385387659072876, + "logps/chosen": -80.1966323852539, + "logps/rejected": -112.7997817993164, + "loss": 0.0037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6764703989028931, + "rewards/margins": 25.432544708251953, + "rewards/rejected": -24.756074905395508, + "step": 3620 + }, + { + "epoch": 1.66, + "learning_rate": 1.491628614916286e-07, + "logits/chosen": -0.9332625269889832, + "logits/rejected": -0.8725907206535339, + "logps/chosen": -85.28267669677734, + "logps/rejected": -120.61589050292969, + "loss": 0.006, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.753173828125, + "rewards/margins": 25.59710121154785, + "rewards/rejected": -24.84392547607422, + "step": 3630 + }, + { + "epoch": 1.66, + "learning_rate": 1.4865550481988838e-07, + "logits/chosen": -0.8927936553955078, + "logits/rejected": -0.8743786811828613, + "logps/chosen": -77.79920959472656, + "logps/rejected": -112.90252685546875, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.799877941608429, + "rewards/margins": 24.70957374572754, + "rewards/rejected": -23.90969467163086, + "step": 3640 + }, + { + "epoch": 1.67, + "learning_rate": 1.4814814814814815e-07, + "logits/chosen": -1.0039594173431396, + "logits/rejected": -0.8558320999145508, + "logps/chosen": -81.33214569091797, + "logps/rejected": -109.74530029296875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07261955738067627, + "rewards/margins": 23.961139678955078, + "rewards/rejected": -23.888517379760742, + "step": 3650 + }, + { + "epoch": 1.67, + "learning_rate": 1.476407914764079e-07, + "logits/chosen": -0.9197986721992493, + "logits/rejected": -0.8760434985160828, + "logps/chosen": -89.37728118896484, + "logps/rejected": -118.18058776855469, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11399469524621964, + "rewards/margins": 24.953617095947266, + "rewards/rejected": -25.06761360168457, + "step": 3660 + }, + { + "epoch": 1.68, + "learning_rate": 1.4713343480466768e-07, + "logits/chosen": -0.8881243467330933, + "logits/rejected": -0.824755847454071, + "logps/chosen": -85.07624816894531, + "logps/rejected": -117.46966552734375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3701117038726807, + "rewards/margins": 26.875885009765625, + "rewards/rejected": -25.505775451660156, + "step": 3670 + }, + { + "epoch": 1.68, + "learning_rate": 1.4662607813292745e-07, + "logits/chosen": -0.9610374569892883, + "logits/rejected": -0.894203782081604, + "logps/chosen": -87.59607696533203, + "logps/rejected": -114.97966003417969, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8766714930534363, + "rewards/margins": 25.969364166259766, + "rewards/rejected": -25.092693328857422, + "step": 3680 + }, + { + "epoch": 1.68, + "learning_rate": 1.461187214611872e-07, + "logits/chosen": -0.8563788533210754, + "logits/rejected": -0.8630868196487427, + "logps/chosen": -88.29237365722656, + "logps/rejected": -118.44969177246094, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7286855578422546, + "rewards/margins": 26.557199478149414, + "rewards/rejected": -25.82851219177246, + "step": 3690 + }, + { + "epoch": 1.69, + "learning_rate": 1.4561136478944698e-07, + "logits/chosen": -0.8998085260391235, + "logits/rejected": -0.8663345575332642, + "logps/chosen": -84.99273681640625, + "logps/rejected": -122.40765380859375, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7066619396209717, + "rewards/margins": 27.883495330810547, + "rewards/rejected": -26.176837921142578, + "step": 3700 + }, + { + "epoch": 1.69, + "eval_logits/chosen": -0.9141598343849182, + "eval_logits/rejected": -0.8334181904792786, + "eval_logps/chosen": -84.20160675048828, + "eval_logps/rejected": -115.98404693603516, + "eval_loss": 0.006939805578440428, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.6681216359138489, + "eval_rewards/margins": 26.229442596435547, + "eval_rewards/rejected": -25.561321258544922, + "eval_runtime": 63.3927, + "eval_samples_per_second": 45.147, + "eval_steps_per_second": 2.824, + "step": 3700 + }, + { + "epoch": 1.69, + "learning_rate": 1.4510400811770675e-07, + "logits/chosen": -0.9376422166824341, + "logits/rejected": -0.8984735608100891, + "logps/chosen": -88.30086517333984, + "logps/rejected": -119.71275329589844, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.101771593093872, + "rewards/margins": 26.530920028686523, + "rewards/rejected": -25.429149627685547, + "step": 3710 + }, + { + "epoch": 1.7, + "learning_rate": 1.445966514459665e-07, + "logits/chosen": -0.9134553670883179, + "logits/rejected": -0.8494185209274292, + "logps/chosen": -84.12943267822266, + "logps/rejected": -121.0509033203125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8017365336418152, + "rewards/margins": 27.020517349243164, + "rewards/rejected": -26.218780517578125, + "step": 3720 + }, + { + "epoch": 1.7, + "learning_rate": 1.4408929477422628e-07, + "logits/chosen": -0.9076977968215942, + "logits/rejected": -0.8381573557853699, + "logps/chosen": -88.46109008789062, + "logps/rejected": -120.51727294921875, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41265106201171875, + "rewards/margins": 25.993633270263672, + "rewards/rejected": -25.580984115600586, + "step": 3730 + }, + { + "epoch": 1.71, + "learning_rate": 1.4358193810248604e-07, + "logits/chosen": -0.9492243528366089, + "logits/rejected": -0.9089414477348328, + "logps/chosen": -81.46710205078125, + "logps/rejected": -116.9715576171875, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21577057242393494, + "rewards/margins": 26.476455688476562, + "rewards/rejected": -26.26068687438965, + "step": 3740 + }, + { + "epoch": 1.71, + "learning_rate": 1.430745814307458e-07, + "logits/chosen": -0.9978575706481934, + "logits/rejected": -0.9076077342033386, + "logps/chosen": -88.85905456542969, + "logps/rejected": -119.39497375488281, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9890228509902954, + "rewards/margins": 26.75762367248535, + "rewards/rejected": -25.768600463867188, + "step": 3750 + }, + { + "epoch": 1.72, + "learning_rate": 1.4256722475900558e-07, + "logits/chosen": -0.8967350721359253, + "logits/rejected": -0.821501612663269, + "logps/chosen": -81.39857482910156, + "logps/rejected": -114.3400650024414, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15209658443927765, + "rewards/margins": 25.085054397583008, + "rewards/rejected": -24.93295669555664, + "step": 3760 + }, + { + "epoch": 1.72, + "learning_rate": 1.4205986808726534e-07, + "logits/chosen": -0.9509645700454712, + "logits/rejected": -0.8750587701797485, + "logps/chosen": -84.42094421386719, + "logps/rejected": -130.0999755859375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0962730646133423, + "rewards/margins": 28.398696899414062, + "rewards/rejected": -27.302425384521484, + "step": 3770 + }, + { + "epoch": 1.73, + "learning_rate": 1.415525114155251e-07, + "logits/chosen": -0.9023697972297668, + "logits/rejected": -0.9625568389892578, + "logps/chosen": -90.93963623046875, + "logps/rejected": -120.6771469116211, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.317933440208435, + "rewards/margins": 27.938594818115234, + "rewards/rejected": -26.62066650390625, + "step": 3780 + }, + { + "epoch": 1.73, + "learning_rate": 1.4104515474378488e-07, + "logits/chosen": -0.8431658744812012, + "logits/rejected": -0.8656299710273743, + "logps/chosen": -84.60992431640625, + "logps/rejected": -117.33194732666016, + "loss": 0.0048, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.19338905811309814, + "rewards/margins": 25.269033432006836, + "rewards/rejected": -25.075647354125977, + "step": 3790 + }, + { + "epoch": 1.73, + "learning_rate": 1.4053779807204464e-07, + "logits/chosen": -0.9680309295654297, + "logits/rejected": -0.9091488718986511, + "logps/chosen": -87.53936004638672, + "logps/rejected": -113.69287109375, + "loss": 0.0063, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5028787851333618, + "rewards/margins": 25.9219913482666, + "rewards/rejected": -24.419111251831055, + "step": 3800 + }, + { + "epoch": 1.73, + "eval_logits/chosen": -0.9016574621200562, + "eval_logits/rejected": -0.8256176114082336, + "eval_logps/chosen": -83.64620971679688, + "eval_logps/rejected": -114.05230712890625, + "eval_loss": 0.006408516317605972, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.9458239078521729, + "eval_rewards/margins": 25.541276931762695, + "eval_rewards/rejected": -24.595455169677734, + "eval_runtime": 66.5039, + "eval_samples_per_second": 43.035, + "eval_steps_per_second": 2.692, + "step": 3800 + }, + { + "epoch": 1.74, + "learning_rate": 1.400304414003044e-07, + "logits/chosen": -0.9558550119400024, + "logits/rejected": -0.8679558634757996, + "logps/chosen": -82.83837890625, + "logps/rejected": -114.90482330322266, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4327557682991028, + "rewards/margins": 25.373950958251953, + "rewards/rejected": -24.941192626953125, + "step": 3810 + }, + { + "epoch": 1.74, + "learning_rate": 1.3952308472856418e-07, + "logits/chosen": -0.8897542953491211, + "logits/rejected": -0.8644717931747437, + "logps/chosen": -88.68601989746094, + "logps/rejected": -116.8119888305664, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15823300182819366, + "rewards/margins": 25.815906524658203, + "rewards/rejected": -25.974136352539062, + "step": 3820 + }, + { + "epoch": 1.75, + "learning_rate": 1.3901572805682394e-07, + "logits/chosen": -0.8622309565544128, + "logits/rejected": -0.8281264305114746, + "logps/chosen": -89.28263092041016, + "logps/rejected": -122.96022033691406, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5797398090362549, + "rewards/margins": 27.29312515258789, + "rewards/rejected": -25.7133846282959, + "step": 3830 + }, + { + "epoch": 1.75, + "learning_rate": 1.385083713850837e-07, + "logits/chosen": -0.8999799489974976, + "logits/rejected": -0.8458458185195923, + "logps/chosen": -86.58378601074219, + "logps/rejected": -115.96832275390625, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.054229497909546, + "rewards/margins": 25.937957763671875, + "rewards/rejected": -24.883729934692383, + "step": 3840 + }, + { + "epoch": 1.76, + "learning_rate": 1.3800101471334348e-07, + "logits/chosen": -0.8850622177124023, + "logits/rejected": -0.8627168536186218, + "logps/chosen": -84.57807922363281, + "logps/rejected": -112.39884948730469, + "loss": 0.0057, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.08325471729040146, + "rewards/margins": 24.015729904174805, + "rewards/rejected": -24.098987579345703, + "step": 3850 + }, + { + "epoch": 1.76, + "learning_rate": 1.3749365804160324e-07, + "logits/chosen": -0.8272354006767273, + "logits/rejected": -0.7512696981430054, + "logps/chosen": -84.18182373046875, + "logps/rejected": -114.89219665527344, + "loss": 0.0035, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7288559675216675, + "rewards/margins": 25.41353988647461, + "rewards/rejected": -24.684682846069336, + "step": 3860 + }, + { + "epoch": 1.77, + "learning_rate": 1.36986301369863e-07, + "logits/chosen": -0.7898057699203491, + "logits/rejected": -0.8020504713058472, + "logps/chosen": -83.73123168945312, + "logps/rejected": -118.38655090332031, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9885632991790771, + "rewards/margins": 28.37349510192871, + "rewards/rejected": -26.384933471679688, + "step": 3870 + }, + { + "epoch": 1.77, + "learning_rate": 1.3647894469812278e-07, + "logits/chosen": -0.8614355325698853, + "logits/rejected": -0.8076695203781128, + "logps/chosen": -83.43045043945312, + "logps/rejected": -116.3105697631836, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9053701162338257, + "rewards/margins": 26.96212387084961, + "rewards/rejected": -25.0567569732666, + "step": 3880 + }, + { + "epoch": 1.78, + "learning_rate": 1.3597158802638254e-07, + "logits/chosen": -0.8757370710372925, + "logits/rejected": -0.858729362487793, + "logps/chosen": -85.84932708740234, + "logps/rejected": -123.9087905883789, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7146657705307007, + "rewards/margins": 28.85630226135254, + "rewards/rejected": -27.141637802124023, + "step": 3890 + }, + { + "epoch": 1.78, + "learning_rate": 1.354642313546423e-07, + "logits/chosen": -0.8456228971481323, + "logits/rejected": -0.786232054233551, + "logps/chosen": -80.6098403930664, + "logps/rejected": -113.07496643066406, + "loss": 0.0041, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9995010495185852, + "rewards/margins": 25.73309326171875, + "rewards/rejected": -24.733591079711914, + "step": 3900 + }, + { + "epoch": 1.78, + "eval_logits/chosen": -0.8836355209350586, + "eval_logits/rejected": -0.8069778680801392, + "eval_logps/chosen": -83.1183853149414, + "eval_logps/rejected": -114.56892395019531, + "eval_loss": 0.0067343455739319324, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.209733486175537, + "eval_rewards/margins": 26.063493728637695, + "eval_rewards/rejected": -24.853761672973633, + "eval_runtime": 76.8714, + "eval_samples_per_second": 37.231, + "eval_steps_per_second": 2.329, + "step": 3900 + }, + { + "epoch": 1.78, + "learning_rate": 1.3495687468290208e-07, + "logits/chosen": -0.9082708358764648, + "logits/rejected": -0.8267936706542969, + "logps/chosen": -87.33113861083984, + "logps/rejected": -115.68280029296875, + "loss": 0.0013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5674334764480591, + "rewards/margins": 25.797527313232422, + "rewards/rejected": -25.2300968170166, + "step": 3910 + }, + { + "epoch": 1.79, + "learning_rate": 1.3444951801116184e-07, + "logits/chosen": -0.8897479176521301, + "logits/rejected": -0.8622371554374695, + "logps/chosen": -87.61221313476562, + "logps/rejected": -119.82197570800781, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.505795955657959, + "rewards/margins": 26.55779457092285, + "rewards/rejected": -26.051998138427734, + "step": 3920 + }, + { + "epoch": 1.79, + "learning_rate": 1.339421613394216e-07, + "logits/chosen": -0.9741457104682922, + "logits/rejected": -0.8766289949417114, + "logps/chosen": -87.72877502441406, + "logps/rejected": -121.0351333618164, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0175672769546509, + "rewards/margins": 27.11517333984375, + "rewards/rejected": -26.097606658935547, + "step": 3930 + }, + { + "epoch": 1.8, + "learning_rate": 1.3343480466768138e-07, + "logits/chosen": -0.9145620465278625, + "logits/rejected": -0.8341091275215149, + "logps/chosen": -82.79689025878906, + "logps/rejected": -113.12298583984375, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.01173012237995863, + "rewards/margins": 24.762723922729492, + "rewards/rejected": -24.77445411682129, + "step": 3940 + }, + { + "epoch": 1.8, + "learning_rate": 1.3292744799594114e-07, + "logits/chosen": -0.8479018211364746, + "logits/rejected": -0.8104110956192017, + "logps/chosen": -86.39948272705078, + "logps/rejected": -116.41688537597656, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.121065139770508, + "rewards/margins": 26.785120010375977, + "rewards/rejected": -24.664052963256836, + "step": 3950 + }, + { + "epoch": 1.81, + "learning_rate": 1.324200913242009e-07, + "logits/chosen": -0.8203024864196777, + "logits/rejected": -0.7483721971511841, + "logps/chosen": -82.69932556152344, + "logps/rejected": -118.39974212646484, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7221908569335938, + "rewards/margins": 27.958471298217773, + "rewards/rejected": -26.236278533935547, + "step": 3960 + }, + { + "epoch": 1.81, + "learning_rate": 1.3191273465246068e-07, + "logits/chosen": -0.9186090230941772, + "logits/rejected": -0.8178736567497253, + "logps/chosen": -87.22188568115234, + "logps/rejected": -119.90480041503906, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6769983768463135, + "rewards/margins": 27.869892120361328, + "rewards/rejected": -25.192890167236328, + "step": 3970 + }, + { + "epoch": 1.82, + "learning_rate": 1.3140537798072044e-07, + "logits/chosen": -0.8046373128890991, + "logits/rejected": -0.7908083200454712, + "logps/chosen": -83.47384643554688, + "logps/rejected": -109.45014953613281, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.221688747406006, + "rewards/margins": 25.10355567932129, + "rewards/rejected": -22.881866455078125, + "step": 3980 + }, + { + "epoch": 1.82, + "learning_rate": 1.308980213089802e-07, + "logits/chosen": -0.8663067817687988, + "logits/rejected": -0.8282343745231628, + "logps/chosen": -82.12123107910156, + "logps/rejected": -113.12467956542969, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3237030506134033, + "rewards/margins": 24.891172409057617, + "rewards/rejected": -22.56747055053711, + "step": 3990 + }, + { + "epoch": 1.83, + "learning_rate": 1.3039066463723998e-07, + "logits/chosen": -0.8992531895637512, + "logits/rejected": -0.8389276266098022, + "logps/chosen": -84.60519409179688, + "logps/rejected": -114.64949798583984, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.048742413520813, + "rewards/margins": 26.16281509399414, + "rewards/rejected": -25.114072799682617, + "step": 4000 + }, + { + "epoch": 1.83, + "eval_logits/chosen": -0.8783338069915771, + "eval_logits/rejected": -0.8036257028579712, + "eval_logps/chosen": -82.68094635009766, + "eval_logps/rejected": -113.65660858154297, + "eval_loss": 0.0062260739505290985, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.4284573793411255, + "eval_rewards/margins": 25.826065063476562, + "eval_rewards/rejected": -24.397605895996094, + "eval_runtime": 63.7559, + "eval_samples_per_second": 44.89, + "eval_steps_per_second": 2.808, + "step": 4000 + }, + { + "epoch": 1.83, + "learning_rate": 1.2988330796549974e-07, + "logits/chosen": -0.894163966178894, + "logits/rejected": -0.8875846862792969, + "logps/chosen": -86.4601821899414, + "logps/rejected": -115.12332916259766, + "loss": 0.0065, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6220238208770752, + "rewards/margins": 25.81390953063965, + "rewards/rejected": -24.19188690185547, + "step": 4010 + }, + { + "epoch": 1.83, + "learning_rate": 1.293759512937595e-07, + "logits/chosen": -0.8875330686569214, + "logits/rejected": -0.8312565684318542, + "logps/chosen": -88.39109802246094, + "logps/rejected": -119.79731750488281, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05904974788427353, + "rewards/margins": 24.40655517578125, + "rewards/rejected": -24.46560287475586, + "step": 4020 + }, + { + "epoch": 1.84, + "learning_rate": 1.2886859462201928e-07, + "logits/chosen": -0.9583679437637329, + "logits/rejected": -0.8589159846305847, + "logps/chosen": -87.69874572753906, + "logps/rejected": -114.6513900756836, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5733067989349365, + "rewards/margins": 25.790552139282227, + "rewards/rejected": -24.217248916625977, + "step": 4030 + }, + { + "epoch": 1.84, + "learning_rate": 1.2836123795027904e-07, + "logits/chosen": -0.8350761532783508, + "logits/rejected": -0.8028010129928589, + "logps/chosen": -81.8393783569336, + "logps/rejected": -115.5448989868164, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1797157526016235, + "rewards/margins": 26.374853134155273, + "rewards/rejected": -25.195138931274414, + "step": 4040 + }, + { + "epoch": 1.85, + "learning_rate": 1.278538812785388e-07, + "logits/chosen": -0.879655659198761, + "logits/rejected": -0.8751519918441772, + "logps/chosen": -90.0999526977539, + "logps/rejected": -118.6465072631836, + "loss": 0.0034, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.742595911026001, + "rewards/margins": 27.128515243530273, + "rewards/rejected": -25.385921478271484, + "step": 4050 + }, + { + "epoch": 1.85, + "learning_rate": 1.2734652460679858e-07, + "logits/chosen": -0.9097345471382141, + "logits/rejected": -0.8761495351791382, + "logps/chosen": -86.26795959472656, + "logps/rejected": -118.676025390625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8561265468597412, + "rewards/margins": 26.486553192138672, + "rewards/rejected": -24.63042640686035, + "step": 4060 + }, + { + "epoch": 1.86, + "learning_rate": 1.2683916793505834e-07, + "logits/chosen": -0.8175376653671265, + "logits/rejected": -0.7447512149810791, + "logps/chosen": -80.61178588867188, + "logps/rejected": -117.44854736328125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.017381429672241, + "rewards/margins": 26.004596710205078, + "rewards/rejected": -23.98721694946289, + "step": 4070 + }, + { + "epoch": 1.86, + "learning_rate": 1.263318112633181e-07, + "logits/chosen": -0.8578559756278992, + "logits/rejected": -0.7857792973518372, + "logps/chosen": -87.51000213623047, + "logps/rejected": -114.0421142578125, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8245757818222046, + "rewards/margins": 24.864219665527344, + "rewards/rejected": -24.03964614868164, + "step": 4080 + }, + { + "epoch": 1.87, + "learning_rate": 1.2582445459157788e-07, + "logits/chosen": -0.8620840907096863, + "logits/rejected": -0.7266338467597961, + "logps/chosen": -80.26228332519531, + "logps/rejected": -112.8070068359375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7275581359863281, + "rewards/margins": 25.295616149902344, + "rewards/rejected": -23.568058013916016, + "step": 4090 + }, + { + "epoch": 1.87, + "learning_rate": 1.2531709791983764e-07, + "logits/chosen": -0.9016240835189819, + "logits/rejected": -0.8646566271781921, + "logps/chosen": -89.81653594970703, + "logps/rejected": -118.0747299194336, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3270467519760132, + "rewards/margins": 25.991525650024414, + "rewards/rejected": -24.664478302001953, + "step": 4100 + }, + { + "epoch": 1.87, + "eval_logits/chosen": -0.8751200437545776, + "eval_logits/rejected": -0.7990096211433411, + "eval_logps/chosen": -82.98982238769531, + "eval_logps/rejected": -113.60397338867188, + "eval_loss": 0.006390063092112541, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 1.2740201950073242, + "eval_rewards/margins": 25.645313262939453, + "eval_rewards/rejected": -24.371292114257812, + "eval_runtime": 93.3131, + "eval_samples_per_second": 30.671, + "eval_steps_per_second": 1.918, + "step": 4100 + }, + { + "epoch": 1.88, + "learning_rate": 1.248097412480974e-07, + "logits/chosen": -1.012479543685913, + "logits/rejected": -0.8951042294502258, + "logps/chosen": -89.95454406738281, + "logps/rejected": -120.26151275634766, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4968180656433105, + "rewards/margins": 27.859487533569336, + "rewards/rejected": -25.362668991088867, + "step": 4110 + }, + { + "epoch": 1.88, + "learning_rate": 1.2430238457635718e-07, + "logits/chosen": -0.8006552457809448, + "logits/rejected": -0.7851511240005493, + "logps/chosen": -86.0716552734375, + "logps/rejected": -115.94569396972656, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7579014301300049, + "rewards/margins": 25.864261627197266, + "rewards/rejected": -24.106359481811523, + "step": 4120 + }, + { + "epoch": 1.89, + "learning_rate": 1.2379502790461694e-07, + "logits/chosen": -0.8675606846809387, + "logits/rejected": -0.8367154002189636, + "logps/chosen": -83.247802734375, + "logps/rejected": -115.09774017333984, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.06647715717554092, + "rewards/margins": 24.646516799926758, + "rewards/rejected": -24.580041885375977, + "step": 4130 + }, + { + "epoch": 1.89, + "learning_rate": 1.232876712328767e-07, + "logits/chosen": -0.8167160749435425, + "logits/rejected": -0.80632483959198, + "logps/chosen": -88.31963348388672, + "logps/rejected": -118.49552917480469, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2290937900543213, + "rewards/margins": 26.213436126708984, + "rewards/rejected": -24.984344482421875, + "step": 4140 + }, + { + "epoch": 1.89, + "learning_rate": 1.2278031456113648e-07, + "logits/chosen": -0.9386017918586731, + "logits/rejected": -0.8482101559638977, + "logps/chosen": -89.70311737060547, + "logps/rejected": -117.4471206665039, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7219051122665405, + "rewards/margins": 25.45486831665039, + "rewards/rejected": -23.732961654663086, + "step": 4150 + }, + { + "epoch": 1.9, + "learning_rate": 1.2227295788939624e-07, + "logits/chosen": -0.8697861433029175, + "logits/rejected": -0.7494013905525208, + "logps/chosen": -85.14886474609375, + "logps/rejected": -122.1308364868164, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3808021545410156, + "rewards/margins": 27.002914428710938, + "rewards/rejected": -25.622112274169922, + "step": 4160 + }, + { + "epoch": 1.9, + "learning_rate": 1.21765601217656e-07, + "logits/chosen": -0.8890932202339172, + "logits/rejected": -0.864509105682373, + "logps/chosen": -80.30211639404297, + "logps/rejected": -120.65213775634766, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.640447974205017, + "rewards/margins": 26.37517738342285, + "rewards/rejected": -24.734729766845703, + "step": 4170 + }, + { + "epoch": 1.91, + "learning_rate": 1.2125824454591578e-07, + "logits/chosen": -0.8559309840202332, + "logits/rejected": -0.8733586072921753, + "logps/chosen": -90.32381439208984, + "logps/rejected": -114.02840423583984, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5059894323349, + "rewards/margins": 25.90911865234375, + "rewards/rejected": -24.40313148498535, + "step": 4180 + }, + { + "epoch": 1.91, + "learning_rate": 1.2075088787417554e-07, + "logits/chosen": -0.7873523235321045, + "logits/rejected": -0.8265848159790039, + "logps/chosen": -84.38890075683594, + "logps/rejected": -116.04801940917969, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9120607376098633, + "rewards/margins": 26.35770034790039, + "rewards/rejected": -24.445636749267578, + "step": 4190 + }, + { + "epoch": 1.92, + "learning_rate": 1.202435312024353e-07, + "logits/chosen": -0.8989976644515991, + "logits/rejected": -0.8107091188430786, + "logps/chosen": -82.97294616699219, + "logps/rejected": -113.08280944824219, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4709131717681885, + "rewards/margins": 23.579673767089844, + "rewards/rejected": -23.108760833740234, + "step": 4200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": -0.8626413941383362, + "eval_logits/rejected": -0.7886302471160889, + "eval_logps/chosen": -83.84310150146484, + "eval_logps/rejected": -112.74535369873047, + "eval_loss": 0.0062231095507740974, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.847379744052887, + "eval_rewards/margins": 24.78935432434082, + "eval_rewards/rejected": -23.941972732543945, + "eval_runtime": 80.9092, + "eval_samples_per_second": 35.373, + "eval_steps_per_second": 2.212, + "step": 4200 + }, + { + "epoch": 1.92, + "learning_rate": 1.1973617453069508e-07, + "logits/chosen": -0.8819769024848938, + "logits/rejected": -0.8112877607345581, + "logps/chosen": -83.00496673583984, + "logps/rejected": -114.4212875366211, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.11636953055858612, + "rewards/margins": 24.707599639892578, + "rewards/rejected": -24.8239688873291, + "step": 4210 + }, + { + "epoch": 1.93, + "learning_rate": 1.1922881785895484e-07, + "logits/chosen": -0.8796902894973755, + "logits/rejected": -0.8617744445800781, + "logps/chosen": -88.17607116699219, + "logps/rejected": -122.41082763671875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7744562029838562, + "rewards/margins": 25.8326473236084, + "rewards/rejected": -25.058191299438477, + "step": 4220 + }, + { + "epoch": 1.93, + "learning_rate": 1.187214611872146e-07, + "logits/chosen": -0.9677571058273315, + "logits/rejected": -0.8670059442520142, + "logps/chosen": -92.79700469970703, + "logps/rejected": -123.5301513671875, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.26532265543937683, + "rewards/margins": 25.150630950927734, + "rewards/rejected": -25.415952682495117, + "step": 4230 + }, + { + "epoch": 1.94, + "learning_rate": 1.1821410451547436e-07, + "logits/chosen": -0.8243002891540527, + "logits/rejected": -0.7622770071029663, + "logps/chosen": -82.1230697631836, + "logps/rejected": -116.01241302490234, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8827035427093506, + "rewards/margins": 26.27325439453125, + "rewards/rejected": -24.390552520751953, + "step": 4240 + }, + { + "epoch": 1.94, + "learning_rate": 1.1770674784373413e-07, + "logits/chosen": -0.8179371953010559, + "logits/rejected": -0.7521147131919861, + "logps/chosen": -79.74634552001953, + "logps/rejected": -114.4630126953125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15621128678321838, + "rewards/margins": 24.31221580505371, + "rewards/rejected": -24.156003952026367, + "step": 4250 + }, + { + "epoch": 1.94, + "learning_rate": 1.171993911719939e-07, + "logits/chosen": -0.9867182970046997, + "logits/rejected": -0.8832413554191589, + "logps/chosen": -85.32367706298828, + "logps/rejected": -119.32315826416016, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.011179482564330101, + "rewards/margins": 25.743982315063477, + "rewards/rejected": -25.75516128540039, + "step": 4260 + }, + { + "epoch": 1.95, + "learning_rate": 1.1669203450025366e-07, + "logits/chosen": -0.7901066541671753, + "logits/rejected": -0.7551292181015015, + "logps/chosen": -79.58982849121094, + "logps/rejected": -114.67451477050781, + "loss": 0.0068, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5268216133117676, + "rewards/margins": 23.880836486816406, + "rewards/rejected": -23.354015350341797, + "step": 4270 + }, + { + "epoch": 1.95, + "learning_rate": 1.1618467782851343e-07, + "logits/chosen": -0.8936311602592468, + "logits/rejected": -0.8211982846260071, + "logps/chosen": -82.46665954589844, + "logps/rejected": -119.68986511230469, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9696955680847168, + "rewards/margins": 25.855365753173828, + "rewards/rejected": -24.88566780090332, + "step": 4280 + }, + { + "epoch": 1.96, + "learning_rate": 1.156773211567732e-07, + "logits/chosen": -0.8227362632751465, + "logits/rejected": -0.7342718839645386, + "logps/chosen": -83.89360809326172, + "logps/rejected": -117.38250732421875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.834515929222107, + "rewards/margins": 27.02130126953125, + "rewards/rejected": -25.186786651611328, + "step": 4290 + }, + { + "epoch": 1.96, + "learning_rate": 1.1516996448503296e-07, + "logits/chosen": -0.9188385009765625, + "logits/rejected": -0.8470960855484009, + "logps/chosen": -83.9849624633789, + "logps/rejected": -120.4980239868164, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40226873755455017, + "rewards/margins": 26.013219833374023, + "rewards/rejected": -25.610950469970703, + "step": 4300 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -0.8774282932281494, + "eval_logits/rejected": -0.8064822554588318, + "eval_logps/chosen": -83.99234008789062, + "eval_logps/rejected": -113.08182525634766, + "eval_loss": 0.006075400393456221, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.7727564573287964, + "eval_rewards/margins": 24.882965087890625, + "eval_rewards/rejected": -24.110210418701172, + "eval_runtime": 77.6038, + "eval_samples_per_second": 36.88, + "eval_steps_per_second": 2.307, + "step": 4300 + }, + { + "epoch": 1.97, + "learning_rate": 1.1466260781329273e-07, + "logits/chosen": -0.8826113939285278, + "logits/rejected": -0.7901960611343384, + "logps/chosen": -88.16101837158203, + "logps/rejected": -117.04609680175781, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6930155158042908, + "rewards/margins": 25.547672271728516, + "rewards/rejected": -24.854656219482422, + "step": 4310 + }, + { + "epoch": 1.97, + "learning_rate": 1.141552511415525e-07, + "logits/chosen": -0.8641761541366577, + "logits/rejected": -0.8362469673156738, + "logps/chosen": -88.54991149902344, + "logps/rejected": -116.129150390625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7447670102119446, + "rewards/margins": 25.9591007232666, + "rewards/rejected": -25.214338302612305, + "step": 4320 + }, + { + "epoch": 1.98, + "learning_rate": 1.1364789446981226e-07, + "logits/chosen": -0.8864119648933411, + "logits/rejected": -0.8219255208969116, + "logps/chosen": -90.05606842041016, + "logps/rejected": -119.87153625488281, + "loss": 0.0062, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8769302368164062, + "rewards/margins": 26.760021209716797, + "rewards/rejected": -25.88309669494629, + "step": 4330 + }, + { + "epoch": 1.98, + "learning_rate": 1.1314053779807203e-07, + "logits/chosen": -0.9568690061569214, + "logits/rejected": -0.9261384010314941, + "logps/chosen": -85.25032043457031, + "logps/rejected": -116.95954895019531, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3308374285697937, + "rewards/margins": 25.455656051635742, + "rewards/rejected": -25.12481689453125, + "step": 4340 + }, + { + "epoch": 1.99, + "learning_rate": 1.126331811263318e-07, + "logits/chosen": -0.9497167468070984, + "logits/rejected": -0.8359603881835938, + "logps/chosen": -82.88484191894531, + "logps/rejected": -113.48707580566406, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22573499381542206, + "rewards/margins": 24.359251022338867, + "rewards/rejected": -24.58498764038086, + "step": 4350 + }, + { + "epoch": 1.99, + "learning_rate": 1.1212582445459156e-07, + "logits/chosen": -0.8375424146652222, + "logits/rejected": -0.8085098266601562, + "logps/chosen": -91.62086486816406, + "logps/rejected": -118.6721420288086, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29354745149612427, + "rewards/margins": 26.114770889282227, + "rewards/rejected": -25.82122230529785, + "step": 4360 + }, + { + "epoch": 1.99, + "learning_rate": 1.1161846778285133e-07, + "logits/chosen": -0.862596869468689, + "logits/rejected": -0.7966786623001099, + "logps/chosen": -80.2033462524414, + "logps/rejected": -115.92503356933594, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1859007626771927, + "rewards/margins": 23.71189308166504, + "rewards/rejected": -23.525989532470703, + "step": 4370 + }, + { + "epoch": 2.0, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": -0.8235847353935242, + "logits/rejected": -0.7749193906784058, + "logps/chosen": -90.13212585449219, + "logps/rejected": -112.7907485961914, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0780595541000366, + "rewards/margins": 24.11054229736328, + "rewards/rejected": -23.032482147216797, + "step": 4380 + }, + { + "epoch": 2.0, + "learning_rate": 1.1060375443937086e-07, + "logits/chosen": -0.873267650604248, + "logits/rejected": -0.872418999671936, + "logps/chosen": -84.37730407714844, + "logps/rejected": -116.15653991699219, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.26867276430130005, + "rewards/margins": 23.833995819091797, + "rewards/rejected": -23.56532096862793, + "step": 4390 + }, + { + "epoch": 2.01, + "learning_rate": 1.1009639776763063e-07, + "logits/chosen": -0.9008662104606628, + "logits/rejected": -0.8445860743522644, + "logps/chosen": -85.77739715576172, + "logps/rejected": -113.36775207519531, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9516502618789673, + "rewards/margins": 25.20037078857422, + "rewards/rejected": -24.248720169067383, + "step": 4400 + }, + { + "epoch": 2.01, + "eval_logits/chosen": -0.8856587409973145, + "eval_logits/rejected": -0.8144620656967163, + "eval_logps/chosen": -84.20708465576172, + "eval_logps/rejected": -112.38359069824219, + "eval_loss": 0.005585566163063049, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.6653829216957092, + "eval_rewards/margins": 24.426471710205078, + "eval_rewards/rejected": -23.761089324951172, + "eval_runtime": 81.7268, + "eval_samples_per_second": 35.019, + "eval_steps_per_second": 2.19, + "step": 4400 + }, + { + "epoch": 2.01, + "learning_rate": 1.095890410958904e-07, + "logits/chosen": -0.8619349598884583, + "logits/rejected": -0.8154422640800476, + "logps/chosen": -88.06494140625, + "logps/rejected": -120.3101577758789, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4764477014541626, + "rewards/margins": 26.710468292236328, + "rewards/rejected": -25.234024047851562, + "step": 4410 + }, + { + "epoch": 2.02, + "learning_rate": 1.0908168442415016e-07, + "logits/chosen": -0.8404685854911804, + "logits/rejected": -0.8038052320480347, + "logps/chosen": -81.84117126464844, + "logps/rejected": -112.4288558959961, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12754690647125244, + "rewards/margins": 23.65199851989746, + "rewards/rejected": -23.524450302124023, + "step": 4420 + }, + { + "epoch": 2.02, + "learning_rate": 1.0857432775240993e-07, + "logits/chosen": -0.7906190156936646, + "logits/rejected": -0.7676808834075928, + "logps/chosen": -80.90745544433594, + "logps/rejected": -120.09671783447266, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1424059867858887, + "rewards/margins": 26.4530086517334, + "rewards/rejected": -25.310606002807617, + "step": 4430 + }, + { + "epoch": 2.03, + "learning_rate": 1.080669710806697e-07, + "logits/chosen": -0.9605631828308105, + "logits/rejected": -0.8633155822753906, + "logps/chosen": -84.76126861572266, + "logps/rejected": -118.72808837890625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9969682693481445, + "rewards/margins": 26.093795776367188, + "rewards/rejected": -25.09682846069336, + "step": 4440 + }, + { + "epoch": 2.03, + "learning_rate": 1.0755961440892946e-07, + "logits/chosen": -0.953199028968811, + "logits/rejected": -0.9226964712142944, + "logps/chosen": -82.70755767822266, + "logps/rejected": -116.70185852050781, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.997397243976593, + "rewards/margins": 25.517671585083008, + "rewards/rejected": -24.520275115966797, + "step": 4450 + }, + { + "epoch": 2.04, + "learning_rate": 1.0705225773718923e-07, + "logits/chosen": -0.9112932085990906, + "logits/rejected": -0.8612509965896606, + "logps/chosen": -86.12630462646484, + "logps/rejected": -113.80561828613281, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07253873348236084, + "rewards/margins": 24.135400772094727, + "rewards/rejected": -24.062862396240234, + "step": 4460 + }, + { + "epoch": 2.04, + "learning_rate": 1.06544901065449e-07, + "logits/chosen": -0.9248711466789246, + "logits/rejected": -0.8723545074462891, + "logps/chosen": -87.74769592285156, + "logps/rejected": -118.7971420288086, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7695275545120239, + "rewards/margins": 24.69394302368164, + "rewards/rejected": -23.924413681030273, + "step": 4470 + }, + { + "epoch": 2.04, + "learning_rate": 1.0603754439370876e-07, + "logits/chosen": -0.9148474931716919, + "logits/rejected": -0.8270168304443359, + "logps/chosen": -85.63670349121094, + "logps/rejected": -120.21187591552734, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07334025204181671, + "rewards/margins": 25.913070678710938, + "rewards/rejected": -25.986413955688477, + "step": 4480 + }, + { + "epoch": 2.05, + "learning_rate": 1.0553018772196853e-07, + "logits/chosen": -0.9342101216316223, + "logits/rejected": -0.8520076870918274, + "logps/chosen": -89.24897766113281, + "logps/rejected": -117.5027847290039, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7600328922271729, + "rewards/margins": 24.749935150146484, + "rewards/rejected": -23.98990249633789, + "step": 4490 + }, + { + "epoch": 2.05, + "learning_rate": 1.050228310502283e-07, + "logits/chosen": -0.8792055249214172, + "logits/rejected": -0.8664016723632812, + "logps/chosen": -83.908447265625, + "logps/rejected": -117.2926025390625, + "loss": 0.0032, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2741992473602295, + "rewards/margins": 24.095046997070312, + "rewards/rejected": -25.369247436523438, + "step": 4500 + }, + { + "epoch": 2.05, + "eval_logits/chosen": -0.8906596899032593, + "eval_logits/rejected": -0.818504810333252, + "eval_logps/chosen": -84.84129333496094, + "eval_logps/rejected": -113.85082244873047, + "eval_loss": 0.0056776185519993305, + "eval_rewards/accuracies": 0.994413435459137, + "eval_rewards/chosen": 0.3482798635959625, + "eval_rewards/margins": 24.842987060546875, + "eval_rewards/rejected": -24.49471092224121, + "eval_runtime": 81.446, + "eval_samples_per_second": 35.14, + "eval_steps_per_second": 2.198, + "step": 4500 + }, + { + "epoch": 2.06, + "learning_rate": 1.0451547437848806e-07, + "logits/chosen": -0.9115379452705383, + "logits/rejected": -0.8717126846313477, + "logps/chosen": -88.75419616699219, + "logps/rejected": -116.99250793457031, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.256993055343628, + "rewards/margins": 25.51570701599121, + "rewards/rejected": -24.258716583251953, + "step": 4510 + }, + { + "epoch": 2.06, + "learning_rate": 1.0400811770674783e-07, + "logits/chosen": -0.8679983019828796, + "logits/rejected": -0.8133772611618042, + "logps/chosen": -78.07077026367188, + "logps/rejected": -116.81890869140625, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4886135160923004, + "rewards/margins": 26.63811683654785, + "rewards/rejected": -26.149499893188477, + "step": 4520 + }, + { + "epoch": 2.07, + "learning_rate": 1.035007610350076e-07, + "logits/chosen": -0.8347498178482056, + "logits/rejected": -0.8103248476982117, + "logps/chosen": -89.81623840332031, + "logps/rejected": -112.1605224609375, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.13603441417217255, + "rewards/margins": 24.102638244628906, + "rewards/rejected": -23.966602325439453, + "step": 4530 + }, + { + "epoch": 2.07, + "learning_rate": 1.0299340436326736e-07, + "logits/chosen": -0.8579212427139282, + "logits/rejected": -0.8020459413528442, + "logps/chosen": -83.22380828857422, + "logps/rejected": -116.16816711425781, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8977780342102051, + "rewards/margins": 25.821319580078125, + "rewards/rejected": -24.92354393005371, + "step": 4540 + }, + { + "epoch": 2.08, + "learning_rate": 1.0248604769152713e-07, + "logits/chosen": -0.9014474153518677, + "logits/rejected": -0.8092902898788452, + "logps/chosen": -82.77027893066406, + "logps/rejected": -115.4444580078125, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5497044920921326, + "rewards/margins": 24.866573333740234, + "rewards/rejected": -24.31686782836914, + "step": 4550 + }, + { + "epoch": 2.08, + "learning_rate": 1.019786910197869e-07, + "logits/chosen": -0.9946414828300476, + "logits/rejected": -0.9202834367752075, + "logps/chosen": -87.10456848144531, + "logps/rejected": -117.59464263916016, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5109010934829712, + "rewards/margins": 23.4146671295166, + "rewards/rejected": -24.925569534301758, + "step": 4560 + }, + { + "epoch": 2.09, + "learning_rate": 1.0147133434804666e-07, + "logits/chosen": -0.9581912755966187, + "logits/rejected": -0.8749968409538269, + "logps/chosen": -83.8426742553711, + "logps/rejected": -112.90596771240234, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.59953773021698, + "rewards/margins": 25.36787986755371, + "rewards/rejected": -24.768339157104492, + "step": 4570 + }, + { + "epoch": 2.09, + "learning_rate": 1.0096397767630643e-07, + "logits/chosen": -0.8620211482048035, + "logits/rejected": -0.8351920247077942, + "logps/chosen": -90.25982666015625, + "logps/rejected": -122.8171157836914, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.406160593032837, + "rewards/margins": 26.347885131835938, + "rewards/rejected": -24.94172477722168, + "step": 4580 + }, + { + "epoch": 2.1, + "learning_rate": 1.004566210045662e-07, + "logits/chosen": -0.9509794116020203, + "logits/rejected": -0.8784133791923523, + "logps/chosen": -81.00958251953125, + "logps/rejected": -119.47309875488281, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7654523253440857, + "rewards/margins": 25.69040298461914, + "rewards/rejected": -24.92494773864746, + "step": 4590 + }, + { + "epoch": 2.1, + "learning_rate": 9.994926433282596e-08, + "logits/chosen": -0.946574866771698, + "logits/rejected": -0.8596547842025757, + "logps/chosen": -90.84710693359375, + "logps/rejected": -121.0890884399414, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3096452057361603, + "rewards/margins": 25.466899871826172, + "rewards/rejected": -25.776546478271484, + "step": 4600 + }, + { + "epoch": 2.1, + "eval_logits/chosen": -0.8970758318901062, + "eval_logits/rejected": -0.8229092359542847, + "eval_logps/chosen": -85.00439453125, + "eval_logps/rejected": -114.76090240478516, + "eval_loss": 0.005778728984296322, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": 0.2667306661605835, + "eval_rewards/margins": 25.21647834777832, + "eval_rewards/rejected": -24.94974708557129, + "eval_runtime": 156.627, + "eval_samples_per_second": 18.273, + "eval_steps_per_second": 1.143, + "step": 4600 + }, + { + "epoch": 2.1, + "learning_rate": 9.944190766108573e-08, + "logits/chosen": -0.9343441128730774, + "logits/rejected": -0.817380428314209, + "logps/chosen": -86.71347045898438, + "logps/rejected": -118.91255187988281, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04746241495013237, + "rewards/margins": 25.993091583251953, + "rewards/rejected": -25.94562339782715, + "step": 4610 + }, + { + "epoch": 2.11, + "learning_rate": 9.89345509893455e-08, + "logits/chosen": -0.8125195503234863, + "logits/rejected": -0.796918511390686, + "logps/chosen": -89.77796173095703, + "logps/rejected": -122.3956298828125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11807354539632797, + "rewards/margins": 27.092082977294922, + "rewards/rejected": -26.974010467529297, + "step": 4620 + }, + { + "epoch": 2.11, + "learning_rate": 9.842719431760526e-08, + "logits/chosen": -0.7921231389045715, + "logits/rejected": -0.8076340556144714, + "logps/chosen": -84.28463745117188, + "logps/rejected": -119.71067810058594, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5065456628799438, + "rewards/margins": 26.92000389099121, + "rewards/rejected": -26.4134578704834, + "step": 4630 + }, + { + "epoch": 2.12, + "learning_rate": 9.791983764586503e-08, + "logits/chosen": -0.9721126556396484, + "logits/rejected": -0.8867252469062805, + "logps/chosen": -86.30948638916016, + "logps/rejected": -119.10693359375, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4696705937385559, + "rewards/margins": 25.70583152770996, + "rewards/rejected": -26.175506591796875, + "step": 4640 + }, + { + "epoch": 2.12, + "learning_rate": 9.74124809741248e-08, + "logits/chosen": -0.9281400442123413, + "logits/rejected": -0.8973511457443237, + "logps/chosen": -81.73219299316406, + "logps/rejected": -115.91802978515625, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.17622852325439453, + "rewards/margins": 24.91793441772461, + "rewards/rejected": -25.094161987304688, + "step": 4650 + }, + { + "epoch": 2.13, + "learning_rate": 9.690512430238456e-08, + "logits/chosen": -0.8222247958183289, + "logits/rejected": -0.7661574482917786, + "logps/chosen": -82.31886291503906, + "logps/rejected": -117.75882720947266, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8082590103149414, + "rewards/margins": 24.505247116088867, + "rewards/rejected": -25.313507080078125, + "step": 4660 + }, + { + "epoch": 2.13, + "learning_rate": 9.639776763064433e-08, + "logits/chosen": -0.9574063420295715, + "logits/rejected": -0.939258873462677, + "logps/chosen": -83.35095977783203, + "logps/rejected": -116.93514251708984, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16359582543373108, + "rewards/margins": 26.779830932617188, + "rewards/rejected": -26.616235733032227, + "step": 4670 + }, + { + "epoch": 2.14, + "learning_rate": 9.58904109589041e-08, + "logits/chosen": -1.0326330661773682, + "logits/rejected": -0.9830595254898071, + "logps/chosen": -87.02751159667969, + "logps/rejected": -117.13568115234375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08589287102222443, + "rewards/margins": 26.108123779296875, + "rewards/rejected": -26.022228240966797, + "step": 4680 + }, + { + "epoch": 2.14, + "learning_rate": 9.538305428716386e-08, + "logits/chosen": -0.8359645009040833, + "logits/rejected": -0.8177057504653931, + "logps/chosen": -82.79866027832031, + "logps/rejected": -110.24278259277344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42063361406326294, + "rewards/margins": 23.922367095947266, + "rewards/rejected": -24.343002319335938, + "step": 4690 + }, + { + "epoch": 2.15, + "learning_rate": 9.487569761542363e-08, + "logits/chosen": -0.9532375335693359, + "logits/rejected": -0.9111067056655884, + "logps/chosen": -94.54585266113281, + "logps/rejected": -121.60087585449219, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5539652705192566, + "rewards/margins": 26.92850112915039, + "rewards/rejected": -26.374536514282227, + "step": 4700 + }, + { + "epoch": 2.15, + "eval_logits/chosen": -0.915102481842041, + "eval_logits/rejected": -0.8374292254447937, + "eval_logps/chosen": -85.5395278930664, + "eval_logps/rejected": -116.08502960205078, + "eval_loss": 0.006045613903552294, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.0008371875155717134, + "eval_rewards/margins": 25.610980987548828, + "eval_rewards/rejected": -25.611818313598633, + "eval_runtime": 70.6998, + "eval_samples_per_second": 40.481, + "eval_steps_per_second": 2.532, + "step": 4700 + }, + { + "epoch": 2.15, + "learning_rate": 9.43683409436834e-08, + "logits/chosen": -0.9409311413764954, + "logits/rejected": -0.8758177757263184, + "logps/chosen": -82.46241760253906, + "logps/rejected": -121.64759826660156, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5624381303787231, + "rewards/margins": 26.8708438873291, + "rewards/rejected": -26.30840492248535, + "step": 4710 + }, + { + "epoch": 2.15, + "learning_rate": 9.386098427194316e-08, + "logits/chosen": -0.8292143940925598, + "logits/rejected": -0.8223336935043335, + "logps/chosen": -86.91860961914062, + "logps/rejected": -119.90704345703125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29642102122306824, + "rewards/margins": 25.774639129638672, + "rewards/rejected": -25.47821617126465, + "step": 4720 + }, + { + "epoch": 2.16, + "learning_rate": 9.335362760020293e-08, + "logits/chosen": -0.9198992848396301, + "logits/rejected": -0.9353822469711304, + "logps/chosen": -85.23753356933594, + "logps/rejected": -115.43070983886719, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8973407745361328, + "rewards/margins": 26.307861328125, + "rewards/rejected": -25.410518646240234, + "step": 4730 + }, + { + "epoch": 2.16, + "learning_rate": 9.28462709284627e-08, + "logits/chosen": -0.9610759019851685, + "logits/rejected": -0.9153316617012024, + "logps/chosen": -87.15513610839844, + "logps/rejected": -119.3092041015625, + "loss": 0.0026, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5247424840927124, + "rewards/margins": 27.221759796142578, + "rewards/rejected": -27.746501922607422, + "step": 4740 + }, + { + "epoch": 2.17, + "learning_rate": 9.233891425672246e-08, + "logits/chosen": -0.9629012942314148, + "logits/rejected": -0.8682750463485718, + "logps/chosen": -86.7682876586914, + "logps/rejected": -120.1332778930664, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6402833461761475, + "rewards/margins": 28.504135131835938, + "rewards/rejected": -26.863849639892578, + "step": 4750 + }, + { + "epoch": 2.17, + "learning_rate": 9.183155758498223e-08, + "logits/chosen": -1.0006027221679688, + "logits/rejected": -0.9207250475883484, + "logps/chosen": -87.22566986083984, + "logps/rejected": -121.52197265625, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9196405410766602, + "rewards/margins": 25.359289169311523, + "rewards/rejected": -26.2789306640625, + "step": 4760 + }, + { + "epoch": 2.18, + "learning_rate": 9.1324200913242e-08, + "logits/chosen": -0.9114225506782532, + "logits/rejected": -0.8728778958320618, + "logps/chosen": -89.46612548828125, + "logps/rejected": -119.34295654296875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09139951318502426, + "rewards/margins": 25.818500518798828, + "rewards/rejected": -25.909902572631836, + "step": 4770 + }, + { + "epoch": 2.18, + "learning_rate": 9.081684424150176e-08, + "logits/chosen": -0.9377741813659668, + "logits/rejected": -0.8587690591812134, + "logps/chosen": -88.31532287597656, + "logps/rejected": -124.38812255859375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027195418253540993, + "rewards/margins": 27.07535743713379, + "rewards/rejected": -27.04816246032715, + "step": 4780 + }, + { + "epoch": 2.19, + "learning_rate": 9.030948756976153e-08, + "logits/chosen": -0.9471622705459595, + "logits/rejected": -0.9017325639724731, + "logps/chosen": -85.16099548339844, + "logps/rejected": -116.66732025146484, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5448721647262573, + "rewards/margins": 26.83917236328125, + "rewards/rejected": -26.294300079345703, + "step": 4790 + }, + { + "epoch": 2.19, + "learning_rate": 8.98021308980213e-08, + "logits/chosen": -0.9159450531005859, + "logits/rejected": -0.8532840609550476, + "logps/chosen": -82.51966857910156, + "logps/rejected": -119.74662017822266, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6007218360900879, + "rewards/margins": 27.183069229125977, + "rewards/rejected": -26.58234214782715, + "step": 4800 + }, + { + "epoch": 2.19, + "eval_logits/chosen": -0.9306203126907349, + "eval_logits/rejected": -0.8540387153625488, + "eval_logps/chosen": -85.61259460449219, + "eval_logps/rejected": -116.9902572631836, + "eval_loss": 0.006084715481847525, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.037370555102825165, + "eval_rewards/margins": 26.027050018310547, + "eval_rewards/rejected": -26.064422607421875, + "eval_runtime": 88.42, + "eval_samples_per_second": 32.368, + "eval_steps_per_second": 2.024, + "step": 4800 + }, + { + "epoch": 2.2, + "learning_rate": 8.929477422628106e-08, + "logits/chosen": -0.9910691380500793, + "logits/rejected": -0.873990535736084, + "logps/chosen": -87.20209503173828, + "logps/rejected": -126.45884704589844, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0020253420807421207, + "rewards/margins": 25.715951919555664, + "rewards/rejected": -25.713924407958984, + "step": 4810 + }, + { + "epoch": 2.2, + "learning_rate": 8.878741755454083e-08, + "logits/chosen": -0.8924468755722046, + "logits/rejected": -0.8882986307144165, + "logps/chosen": -90.6511459350586, + "logps/rejected": -123.3057632446289, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5701011419296265, + "rewards/margins": 26.548742294311523, + "rewards/rejected": -27.118844985961914, + "step": 4820 + }, + { + "epoch": 2.2, + "learning_rate": 8.82800608828006e-08, + "logits/chosen": -1.0024341344833374, + "logits/rejected": -0.8866281509399414, + "logps/chosen": -85.85223388671875, + "logps/rejected": -120.0212173461914, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5763718485832214, + "rewards/margins": 27.8824520111084, + "rewards/rejected": -27.306081771850586, + "step": 4830 + }, + { + "epoch": 2.21, + "learning_rate": 8.777270421106036e-08, + "logits/chosen": -0.9681693315505981, + "logits/rejected": -0.8785957098007202, + "logps/chosen": -84.08697509765625, + "logps/rejected": -120.38557434082031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.167352557182312, + "rewards/margins": 24.95884132385254, + "rewards/rejected": -26.12619400024414, + "step": 4840 + }, + { + "epoch": 2.21, + "learning_rate": 8.726534753932013e-08, + "logits/chosen": -0.9239298701286316, + "logits/rejected": -0.8562732934951782, + "logps/chosen": -83.76175689697266, + "logps/rejected": -119.7491226196289, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1900556981563568, + "rewards/margins": 27.33782386779785, + "rewards/rejected": -27.14776611328125, + "step": 4850 + }, + { + "epoch": 2.22, + "learning_rate": 8.67579908675799e-08, + "logits/chosen": -0.8515946269035339, + "logits/rejected": -0.842880129814148, + "logps/chosen": -82.41915893554688, + "logps/rejected": -116.04329681396484, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8233505487442017, + "rewards/margins": 26.450525283813477, + "rewards/rejected": -25.62717628479004, + "step": 4860 + }, + { + "epoch": 2.22, + "learning_rate": 8.625063419583966e-08, + "logits/chosen": -0.9757340550422668, + "logits/rejected": -0.8796631097793579, + "logps/chosen": -84.7564697265625, + "logps/rejected": -121.77120208740234, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8205292820930481, + "rewards/margins": 25.63332748413086, + "rewards/rejected": -26.453853607177734, + "step": 4870 + }, + { + "epoch": 2.23, + "learning_rate": 8.574327752409943e-08, + "logits/chosen": -0.9308716058731079, + "logits/rejected": -0.8484630584716797, + "logps/chosen": -83.25608825683594, + "logps/rejected": -121.8863754272461, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3879397511482239, + "rewards/margins": 26.951122283935547, + "rewards/rejected": -27.339065551757812, + "step": 4880 + }, + { + "epoch": 2.23, + "learning_rate": 8.52359208523592e-08, + "logits/chosen": -0.9290486574172974, + "logits/rejected": -0.8468266725540161, + "logps/chosen": -86.26325225830078, + "logps/rejected": -124.77742004394531, + "loss": 0.0032, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.48676443099975586, + "rewards/margins": 26.87295150756836, + "rewards/rejected": -26.386188507080078, + "step": 4890 + }, + { + "epoch": 2.24, + "learning_rate": 8.472856418061896e-08, + "logits/chosen": -0.9023244976997375, + "logits/rejected": -0.8335026502609253, + "logps/chosen": -85.95402526855469, + "logps/rejected": -118.09017181396484, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.000464677810669, + "rewards/margins": 27.742565155029297, + "rewards/rejected": -26.74209976196289, + "step": 4900 + }, + { + "epoch": 2.24, + "eval_logits/chosen": -0.9493011236190796, + "eval_logits/rejected": -0.8702885508537292, + "eval_logps/chosen": -85.37418365478516, + "eval_logps/rejected": -117.78074645996094, + "eval_loss": 0.006325908936560154, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": 0.08183945715427399, + "eval_rewards/margins": 26.54151725769043, + "eval_rewards/rejected": -26.459674835205078, + "eval_runtime": 104.8312, + "eval_samples_per_second": 27.301, + "eval_steps_per_second": 1.708, + "step": 4900 + }, + { + "epoch": 2.24, + "learning_rate": 8.422120750887873e-08, + "logits/chosen": -1.0275952816009521, + "logits/rejected": -0.9630632400512695, + "logps/chosen": -86.69776153564453, + "logps/rejected": -120.90191650390625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8101398348808289, + "rewards/margins": 27.59271812438965, + "rewards/rejected": -26.782577514648438, + "step": 4910 + }, + { + "epoch": 2.25, + "learning_rate": 8.37138508371385e-08, + "logits/chosen": -0.9919659495353699, + "logits/rejected": -0.9070954322814941, + "logps/chosen": -85.95883178710938, + "logps/rejected": -121.2116928100586, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.606831431388855, + "rewards/margins": 27.421112060546875, + "rewards/rejected": -26.814281463623047, + "step": 4920 + }, + { + "epoch": 2.25, + "learning_rate": 8.320649416539826e-08, + "logits/chosen": -0.8715358972549438, + "logits/rejected": -0.8914009928703308, + "logps/chosen": -86.6871109008789, + "logps/rejected": -123.45024108886719, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.37944138050079346, + "rewards/margins": 27.759761810302734, + "rewards/rejected": -28.13920021057129, + "step": 4930 + }, + { + "epoch": 2.25, + "learning_rate": 8.269913749365803e-08, + "logits/chosen": -0.9209533929824829, + "logits/rejected": -0.821499228477478, + "logps/chosen": -83.86023712158203, + "logps/rejected": -120.2563705444336, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7821536660194397, + "rewards/margins": 27.337902069091797, + "rewards/rejected": -26.555749893188477, + "step": 4940 + }, + { + "epoch": 2.26, + "learning_rate": 8.21917808219178e-08, + "logits/chosen": -0.9386633038520813, + "logits/rejected": -0.8382610082626343, + "logps/chosen": -88.39375305175781, + "logps/rejected": -119.31612396240234, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.15015359222888947, + "rewards/margins": 26.648828506469727, + "rewards/rejected": -26.498676300048828, + "step": 4950 + }, + { + "epoch": 2.26, + "learning_rate": 8.168442415017756e-08, + "logits/chosen": -0.9892728924751282, + "logits/rejected": -0.9248817563056946, + "logps/chosen": -92.55259704589844, + "logps/rejected": -121.92408752441406, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1945768594741821, + "rewards/margins": 26.516002655029297, + "rewards/rejected": -27.710580825805664, + "step": 4960 + }, + { + "epoch": 2.27, + "learning_rate": 8.117706747843733e-08, + "logits/chosen": -0.9463468790054321, + "logits/rejected": -0.9060198664665222, + "logps/chosen": -87.85506439208984, + "logps/rejected": -123.16998291015625, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.14104899764060974, + "rewards/margins": 28.042089462280273, + "rewards/rejected": -28.18313980102539, + "step": 4970 + }, + { + "epoch": 2.27, + "learning_rate": 8.06697108066971e-08, + "logits/chosen": -0.9373579025268555, + "logits/rejected": -0.9004872441291809, + "logps/chosen": -88.71186828613281, + "logps/rejected": -121.61962890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5207719802856445, + "rewards/margins": 28.3806095123291, + "rewards/rejected": -28.901378631591797, + "step": 4980 + }, + { + "epoch": 2.28, + "learning_rate": 8.016235413495687e-08, + "logits/chosen": -0.8973749279975891, + "logits/rejected": -0.8311864137649536, + "logps/chosen": -80.42097473144531, + "logps/rejected": -121.57218933105469, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5322467088699341, + "rewards/margins": 28.41021156311035, + "rewards/rejected": -27.877965927124023, + "step": 4990 + }, + { + "epoch": 2.28, + "learning_rate": 7.965499746321664e-08, + "logits/chosen": -0.8929327130317688, + "logits/rejected": -0.8712663650512695, + "logps/chosen": -92.48535919189453, + "logps/rejected": -123.72513580322266, + "loss": 0.0028, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.005576419644057751, + "rewards/margins": 27.1002254486084, + "rewards/rejected": -27.105804443359375, + "step": 5000 + }, + { + "epoch": 2.28, + "eval_logits/chosen": -0.9634361267089844, + "eval_logits/rejected": -0.8802065849304199, + "eval_logps/chosen": -87.11389923095703, + "eval_logps/rejected": -121.00086975097656, + "eval_loss": 0.007718712091445923, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.7880215048789978, + "eval_rewards/margins": 27.281707763671875, + "eval_rewards/rejected": -28.069734573364258, + "eval_runtime": 68.3881, + "eval_samples_per_second": 41.849, + "eval_steps_per_second": 2.617, + "step": 5000 + }, + { + "epoch": 2.29, + "learning_rate": 7.91476407914764e-08, + "logits/chosen": -0.9871411323547363, + "logits/rejected": -0.9252876043319702, + "logps/chosen": -87.0884017944336, + "logps/rejected": -124.8663330078125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8773247003555298, + "rewards/margins": 27.905818939208984, + "rewards/rejected": -28.78314208984375, + "step": 5010 + }, + { + "epoch": 2.29, + "learning_rate": 7.864028411973617e-08, + "logits/chosen": -0.9661632776260376, + "logits/rejected": -0.860427975654602, + "logps/chosen": -92.53157043457031, + "logps/rejected": -125.1065444946289, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.514369547367096, + "rewards/margins": 28.825185775756836, + "rewards/rejected": -29.339553833007812, + "step": 5020 + }, + { + "epoch": 2.3, + "learning_rate": 7.813292744799594e-08, + "logits/chosen": -0.9192354083061218, + "logits/rejected": -0.9115994572639465, + "logps/chosen": -86.20381164550781, + "logps/rejected": -121.10685729980469, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15855053067207336, + "rewards/margins": 27.75839614868164, + "rewards/rejected": -27.916950225830078, + "step": 5030 + }, + { + "epoch": 2.3, + "learning_rate": 7.76255707762557e-08, + "logits/chosen": -1.0042352676391602, + "logits/rejected": -0.8900748491287231, + "logps/chosen": -87.24882507324219, + "logps/rejected": -120.3760757446289, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2955986261367798, + "rewards/margins": 25.776174545288086, + "rewards/rejected": -27.0717716217041, + "step": 5040 + }, + { + "epoch": 2.31, + "learning_rate": 7.711821410451547e-08, + "logits/chosen": -1.0438252687454224, + "logits/rejected": -0.9618898630142212, + "logps/chosen": -83.92408752441406, + "logps/rejected": -118.453125, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.12917456030845642, + "rewards/margins": 28.10638999938965, + "rewards/rejected": -27.97721290588379, + "step": 5050 + }, + { + "epoch": 2.31, + "learning_rate": 7.661085743277524e-08, + "logits/chosen": -0.9532014727592468, + "logits/rejected": -0.9269828796386719, + "logps/chosen": -93.73705291748047, + "logps/rejected": -126.61479187011719, + "loss": 0.002, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.03241534158587456, + "rewards/margins": 27.516796112060547, + "rewards/rejected": -27.549213409423828, + "step": 5060 + }, + { + "epoch": 2.31, + "learning_rate": 7.6103500761035e-08, + "logits/chosen": -0.9407272338867188, + "logits/rejected": -0.8259226679801941, + "logps/chosen": -87.27238464355469, + "logps/rejected": -123.35365295410156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.054013729095459, + "rewards/margins": 26.920440673828125, + "rewards/rejected": -28.974451065063477, + "step": 5070 + }, + { + "epoch": 2.32, + "learning_rate": 7.559614408929477e-08, + "logits/chosen": -0.919296145439148, + "logits/rejected": -0.8606408834457397, + "logps/chosen": -86.35128784179688, + "logps/rejected": -119.40046691894531, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.054381251335144, + "rewards/margins": 28.73299217224121, + "rewards/rejected": -27.678613662719727, + "step": 5080 + }, + { + "epoch": 2.32, + "learning_rate": 7.508878741755454e-08, + "logits/chosen": -0.8425869941711426, + "logits/rejected": -0.8566237688064575, + "logps/chosen": -85.87548065185547, + "logps/rejected": -125.34675598144531, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.167006254196167, + "rewards/margins": 27.098682403564453, + "rewards/rejected": -28.26568603515625, + "step": 5090 + }, + { + "epoch": 2.33, + "learning_rate": 7.45814307458143e-08, + "logits/chosen": -1.0067375898361206, + "logits/rejected": -0.9085214734077454, + "logps/chosen": -88.01753234863281, + "logps/rejected": -123.4695816040039, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.424287885427475, + "rewards/margins": 29.699504852294922, + "rewards/rejected": -29.275217056274414, + "step": 5100 + }, + { + "epoch": 2.33, + "eval_logits/chosen": -0.9718750715255737, + "eval_logits/rejected": -0.8896563053131104, + "eval_logps/chosen": -86.07792663574219, + "eval_logps/rejected": -120.29959106445312, + "eval_loss": 0.0068391538225114346, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.2700366973876953, + "eval_rewards/margins": 27.449060440063477, + "eval_rewards/rejected": -27.719093322753906, + "eval_runtime": 68.349, + "eval_samples_per_second": 41.873, + "eval_steps_per_second": 2.619, + "step": 5100 + }, + { + "epoch": 2.33, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -1.0170899629592896, + "logits/rejected": -0.9646891355514526, + "logps/chosen": -86.5973129272461, + "logps/rejected": -122.39430236816406, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.2534463405609131, + "rewards/margins": 27.92856788635254, + "rewards/rejected": -28.1820125579834, + "step": 5110 + }, + { + "epoch": 2.34, + "learning_rate": 7.356671740233384e-08, + "logits/chosen": -0.9090906381607056, + "logits/rejected": -0.8181482553482056, + "logps/chosen": -82.51972961425781, + "logps/rejected": -122.16377258300781, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.48268836736679077, + "rewards/margins": 27.902740478515625, + "rewards/rejected": -28.38543128967285, + "step": 5120 + }, + { + "epoch": 2.34, + "learning_rate": 7.30593607305936e-08, + "logits/chosen": -1.0290285348892212, + "logits/rejected": -0.9375460743904114, + "logps/chosen": -89.16535949707031, + "logps/rejected": -121.2294921875, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.7084894180297852, + "rewards/margins": 26.591333389282227, + "rewards/rejected": -28.299823760986328, + "step": 5130 + }, + { + "epoch": 2.35, + "learning_rate": 7.255200405885337e-08, + "logits/chosen": -0.9907518625259399, + "logits/rejected": -0.9397494196891785, + "logps/chosen": -90.25212860107422, + "logps/rejected": -126.6907730102539, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36234050989151, + "rewards/margins": 28.96002769470215, + "rewards/rejected": -29.322368621826172, + "step": 5140 + }, + { + "epoch": 2.35, + "learning_rate": 7.204464738711314e-08, + "logits/chosen": -1.0006976127624512, + "logits/rejected": -0.9534046053886414, + "logps/chosen": -83.64295959472656, + "logps/rejected": -121.9207763671875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5914263725280762, + "rewards/margins": 27.62277603149414, + "rewards/rejected": -28.214202880859375, + "step": 5150 + }, + { + "epoch": 2.36, + "learning_rate": 7.15372907153729e-08, + "logits/chosen": -0.9170929789543152, + "logits/rejected": -0.8608806729316711, + "logps/chosen": -84.09690856933594, + "logps/rejected": -122.79866790771484, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16179148852825165, + "rewards/margins": 27.293569564819336, + "rewards/rejected": -27.45536231994629, + "step": 5160 + }, + { + "epoch": 2.36, + "learning_rate": 7.102993404363267e-08, + "logits/chosen": -1.0261433124542236, + "logits/rejected": -0.9830166697502136, + "logps/chosen": -84.33302307128906, + "logps/rejected": -124.41776275634766, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2603864073753357, + "rewards/margins": 29.121349334716797, + "rewards/rejected": -29.38173484802246, + "step": 5170 + }, + { + "epoch": 2.36, + "learning_rate": 7.052257737189244e-08, + "logits/chosen": -0.8641002774238586, + "logits/rejected": -0.832613468170166, + "logps/chosen": -85.8521499633789, + "logps/rejected": -117.49824523925781, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9484038352966309, + "rewards/margins": 25.647912979125977, + "rewards/rejected": -26.5963191986084, + "step": 5180 + }, + { + "epoch": 2.37, + "learning_rate": 7.00152207001522e-08, + "logits/chosen": -0.9944854974746704, + "logits/rejected": -0.974925696849823, + "logps/chosen": -87.15865325927734, + "logps/rejected": -124.09013366699219, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5379887819290161, + "rewards/margins": 27.764774322509766, + "rewards/rejected": -28.302759170532227, + "step": 5190 + }, + { + "epoch": 2.37, + "learning_rate": 6.950786402841197e-08, + "logits/chosen": -0.9933064579963684, + "logits/rejected": -0.9870017170906067, + "logps/chosen": -95.2901611328125, + "logps/rejected": -129.78933715820312, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.5724942684173584, + "rewards/margins": 27.332691192626953, + "rewards/rejected": -28.905187606811523, + "step": 5200 + }, + { + "epoch": 2.37, + "eval_logits/chosen": -0.9753385782241821, + "eval_logits/rejected": -0.892514169216156, + "eval_logps/chosen": -86.38349151611328, + "eval_logps/rejected": -121.61949157714844, + "eval_loss": 0.0071233040653169155, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.4228147268295288, + "eval_rewards/margins": 27.95623016357422, + "eval_rewards/rejected": -28.379043579101562, + "eval_runtime": 76.5385, + "eval_samples_per_second": 37.393, + "eval_steps_per_second": 2.339, + "step": 5200 + }, + { + "epoch": 2.38, + "learning_rate": 6.900050735667174e-08, + "logits/chosen": -1.0079680681228638, + "logits/rejected": -0.9470082521438599, + "logps/chosen": -84.72894287109375, + "logps/rejected": -123.27486419677734, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9842392802238464, + "rewards/margins": 27.015186309814453, + "rewards/rejected": -27.99942398071289, + "step": 5210 + }, + { + "epoch": 2.38, + "learning_rate": 6.84931506849315e-08, + "logits/chosen": -0.9415313601493835, + "logits/rejected": -0.8743622899055481, + "logps/chosen": -83.14261627197266, + "logps/rejected": -127.46656799316406, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49354609847068787, + "rewards/margins": 28.221790313720703, + "rewards/rejected": -28.715335845947266, + "step": 5220 + }, + { + "epoch": 2.39, + "learning_rate": 6.798579401319127e-08, + "logits/chosen": -1.0896893739700317, + "logits/rejected": -0.984167218208313, + "logps/chosen": -91.80816650390625, + "logps/rejected": -124.5762710571289, + "loss": 0.0045, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7726724147796631, + "rewards/margins": 28.63726806640625, + "rewards/rejected": -29.409942626953125, + "step": 5230 + }, + { + "epoch": 2.39, + "learning_rate": 6.747843734145104e-08, + "logits/chosen": -0.9839617013931274, + "logits/rejected": -0.8807669878005981, + "logps/chosen": -86.82026672363281, + "logps/rejected": -121.82684326171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.491886705160141, + "rewards/margins": 28.546483993530273, + "rewards/rejected": -28.054601669311523, + "step": 5240 + }, + { + "epoch": 2.4, + "learning_rate": 6.69710806697108e-08, + "logits/chosen": -1.038967490196228, + "logits/rejected": -0.9306316375732422, + "logps/chosen": -84.9123306274414, + "logps/rejected": -122.56534576416016, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.8146915435791016, + "rewards/margins": 27.039257049560547, + "rewards/rejected": -27.85394859313965, + "step": 5250 + }, + { + "epoch": 2.4, + "learning_rate": 6.646372399797057e-08, + "logits/chosen": -0.9870797991752625, + "logits/rejected": -0.9295756220817566, + "logps/chosen": -82.35897064208984, + "logps/rejected": -122.3917236328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4263637065887451, + "rewards/margins": 27.225475311279297, + "rewards/rejected": -27.651844024658203, + "step": 5260 + }, + { + "epoch": 2.41, + "learning_rate": 6.595636732623034e-08, + "logits/chosen": -1.0013396739959717, + "logits/rejected": -0.953113853931427, + "logps/chosen": -88.39720916748047, + "logps/rejected": -129.01861572265625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2705535888671875, + "rewards/margins": 28.5063533782959, + "rewards/rejected": -29.776906967163086, + "step": 5270 + }, + { + "epoch": 2.41, + "learning_rate": 6.54490106544901e-08, + "logits/chosen": -0.9829657673835754, + "logits/rejected": -0.9145382046699524, + "logps/chosen": -90.40482330322266, + "logps/rejected": -127.89979553222656, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44692808389663696, + "rewards/margins": 31.026226043701172, + "rewards/rejected": -31.473154067993164, + "step": 5280 + }, + { + "epoch": 2.41, + "learning_rate": 6.494165398274987e-08, + "logits/chosen": -0.8731921315193176, + "logits/rejected": -0.8698946237564087, + "logps/chosen": -91.5156478881836, + "logps/rejected": -127.5548095703125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9198547601699829, + "rewards/margins": 27.880355834960938, + "rewards/rejected": -28.800212860107422, + "step": 5290 + }, + { + "epoch": 2.42, + "learning_rate": 6.443429731100964e-08, + "logits/chosen": -1.0517842769622803, + "logits/rejected": -0.9802875518798828, + "logps/chosen": -81.30607604980469, + "logps/rejected": -124.15089416503906, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6957739591598511, + "rewards/margins": 28.380374908447266, + "rewards/rejected": -29.07615089416504, + "step": 5300 + }, + { + "epoch": 2.42, + "eval_logits/chosen": -0.9960015416145325, + "eval_logits/rejected": -0.9123116731643677, + "eval_logps/chosen": -86.72779846191406, + "eval_logps/rejected": -123.22882843017578, + "eval_loss": 0.007154433988034725, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.5949668884277344, + "eval_rewards/margins": 28.588741302490234, + "eval_rewards/rejected": -29.18370819091797, + "eval_runtime": 64.2723, + "eval_samples_per_second": 44.529, + "eval_steps_per_second": 2.785, + "step": 5300 + }, + { + "epoch": 2.42, + "learning_rate": 6.39269406392694e-08, + "logits/chosen": -1.006734013557434, + "logits/rejected": -0.9903522729873657, + "logps/chosen": -84.85193634033203, + "logps/rejected": -127.9534683227539, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42257681488990784, + "rewards/margins": 30.12015151977539, + "rewards/rejected": -29.697574615478516, + "step": 5310 + }, + { + "epoch": 2.43, + "learning_rate": 6.341958396752917e-08, + "logits/chosen": -1.0764577388763428, + "logits/rejected": -0.9376031160354614, + "logps/chosen": -84.22555541992188, + "logps/rejected": -122.31805419921875, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4156250059604645, + "rewards/margins": 28.341150283813477, + "rewards/rejected": -28.756771087646484, + "step": 5320 + }, + { + "epoch": 2.43, + "learning_rate": 6.291222729578894e-08, + "logits/chosen": -1.078168511390686, + "logits/rejected": -0.9943448305130005, + "logps/chosen": -89.96900939941406, + "logps/rejected": -118.6367416381836, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.3117859959602356, + "rewards/margins": 28.337322235107422, + "rewards/rejected": -28.64910888671875, + "step": 5330 + }, + { + "epoch": 2.44, + "learning_rate": 6.24048706240487e-08, + "logits/chosen": -1.069582462310791, + "logits/rejected": -1.0039303302764893, + "logps/chosen": -88.01235961914062, + "logps/rejected": -129.82473754882812, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2434835433959961, + "rewards/margins": 30.145584106445312, + "rewards/rejected": -30.389068603515625, + "step": 5340 + }, + { + "epoch": 2.44, + "learning_rate": 6.189751395230847e-08, + "logits/chosen": -1.067857027053833, + "logits/rejected": -0.9720760583877563, + "logps/chosen": -93.93586730957031, + "logps/rejected": -128.0084991455078, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2376720905303955, + "rewards/margins": 28.47385025024414, + "rewards/rejected": -29.71152114868164, + "step": 5350 + }, + { + "epoch": 2.45, + "learning_rate": 6.139015728056824e-08, + "logits/chosen": -1.0381324291229248, + "logits/rejected": -0.977461040019989, + "logps/chosen": -85.44249725341797, + "logps/rejected": -123.37590026855469, + "loss": 0.0023, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.276159405708313, + "rewards/margins": 29.17499351501465, + "rewards/rejected": -28.898834228515625, + "step": 5360 + }, + { + "epoch": 2.45, + "learning_rate": 6.0882800608828e-08, + "logits/chosen": -0.9749631881713867, + "logits/rejected": -0.9035015106201172, + "logps/chosen": -92.03914642333984, + "logps/rejected": -125.73573303222656, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13078829646110535, + "rewards/margins": 29.687381744384766, + "rewards/rejected": -29.81817054748535, + "step": 5370 + }, + { + "epoch": 2.46, + "learning_rate": 6.037544393708777e-08, + "logits/chosen": -0.8980385065078735, + "logits/rejected": -0.8745632171630859, + "logps/chosen": -85.739501953125, + "logps/rejected": -128.15335083007812, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5254358053207397, + "rewards/margins": 28.72063636779785, + "rewards/rejected": -29.24607276916504, + "step": 5380 + }, + { + "epoch": 2.46, + "learning_rate": 5.986808726534754e-08, + "logits/chosen": -0.9306937456130981, + "logits/rejected": -0.949223518371582, + "logps/chosen": -89.4453353881836, + "logps/rejected": -123.77262878417969, + "loss": 0.006, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.9093526601791382, + "rewards/margins": 28.101343154907227, + "rewards/rejected": -29.010696411132812, + "step": 5390 + }, + { + "epoch": 2.46, + "learning_rate": 5.93607305936073e-08, + "logits/chosen": -0.9719539880752563, + "logits/rejected": -0.9495238065719604, + "logps/chosen": -82.39630126953125, + "logps/rejected": -122.8931884765625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2512289583683014, + "rewards/margins": 29.647945404052734, + "rewards/rejected": -29.8991756439209, + "step": 5400 + }, + { + "epoch": 2.46, + "eval_logits/chosen": -1.0086694955825806, + "eval_logits/rejected": -0.9222978949546814, + "eval_logps/chosen": -87.1098403930664, + "eval_logps/rejected": -123.73928833007812, + "eval_loss": 0.007262189406901598, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.7859910726547241, + "eval_rewards/margins": 28.65294647216797, + "eval_rewards/rejected": -29.438940048217773, + "eval_runtime": 70.8839, + "eval_samples_per_second": 40.376, + "eval_steps_per_second": 2.525, + "step": 5400 + }, + { + "epoch": 2.47, + "learning_rate": 5.8853373921867065e-08, + "logits/chosen": -1.0013418197631836, + "logits/rejected": -0.898902416229248, + "logps/chosen": -92.24568176269531, + "logps/rejected": -132.75596618652344, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6157811880111694, + "rewards/margins": 29.352039337158203, + "rewards/rejected": -29.967823028564453, + "step": 5410 + }, + { + "epoch": 2.47, + "learning_rate": 5.834601725012683e-08, + "logits/chosen": -1.0467584133148193, + "logits/rejected": -0.9782527685165405, + "logps/chosen": -89.4131851196289, + "logps/rejected": -125.35511779785156, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5491294860839844, + "rewards/margins": 28.110363006591797, + "rewards/rejected": -28.65949058532715, + "step": 5420 + }, + { + "epoch": 2.48, + "learning_rate": 5.78386605783866e-08, + "logits/chosen": -0.9084771871566772, + "logits/rejected": -0.8927844762802124, + "logps/chosen": -88.92491149902344, + "logps/rejected": -129.6659393310547, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.6389572620391846, + "rewards/margins": 29.241901397705078, + "rewards/rejected": -30.880855560302734, + "step": 5430 + }, + { + "epoch": 2.48, + "learning_rate": 5.7331303906646365e-08, + "logits/chosen": -0.9773572683334351, + "logits/rejected": -0.9706518054008484, + "logps/chosen": -86.19645690917969, + "logps/rejected": -129.49354553222656, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.565061092376709, + "rewards/margins": 29.09912109375, + "rewards/rejected": -29.6641788482666, + "step": 5440 + }, + { + "epoch": 2.49, + "learning_rate": 5.682394723490613e-08, + "logits/chosen": -1.1273038387298584, + "logits/rejected": -1.061490535736084, + "logps/chosen": -93.4212875366211, + "logps/rejected": -130.5277862548828, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44857341051101685, + "rewards/margins": 30.196619033813477, + "rewards/rejected": -30.645191192626953, + "step": 5450 + }, + { + "epoch": 2.49, + "learning_rate": 5.63165905631659e-08, + "logits/chosen": -1.04457688331604, + "logits/rejected": -1.0130252838134766, + "logps/chosen": -90.43280029296875, + "logps/rejected": -132.5623779296875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1586554050445557, + "rewards/margins": 30.627910614013672, + "rewards/rejected": -31.78656578063965, + "step": 5460 + }, + { + "epoch": 2.5, + "learning_rate": 5.5809233891425665e-08, + "logits/chosen": -0.991267204284668, + "logits/rejected": -0.9441797137260437, + "logps/chosen": -89.6689453125, + "logps/rejected": -127.03794860839844, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5401039123535156, + "rewards/margins": 28.44040870666504, + "rewards/rejected": -29.980514526367188, + "step": 5470 + }, + { + "epoch": 2.5, + "learning_rate": 5.530187721968543e-08, + "logits/chosen": -1.008504033088684, + "logits/rejected": -1.0064032077789307, + "logps/chosen": -89.63392639160156, + "logps/rejected": -129.8910675048828, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20265252888202667, + "rewards/margins": 29.824050903320312, + "rewards/rejected": -30.026702880859375, + "step": 5480 + }, + { + "epoch": 2.51, + "learning_rate": 5.47945205479452e-08, + "logits/chosen": -0.978340744972229, + "logits/rejected": -0.8980448842048645, + "logps/chosen": -86.2132797241211, + "logps/rejected": -132.22434997558594, + "loss": 0.0033, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3651527166366577, + "rewards/margins": 29.39707374572754, + "rewards/rejected": -30.76222801208496, + "step": 5490 + }, + { + "epoch": 2.51, + "learning_rate": 5.4287163876204964e-08, + "logits/chosen": -1.056391716003418, + "logits/rejected": -0.9532869458198547, + "logps/chosen": -89.96862030029297, + "logps/rejected": -128.9607696533203, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5641896724700928, + "rewards/margins": 29.623580932617188, + "rewards/rejected": -30.18777084350586, + "step": 5500 + }, + { + "epoch": 2.51, + "eval_logits/chosen": -1.0164090394973755, + "eval_logits/rejected": -0.9298503994941711, + "eval_logps/chosen": -87.34950256347656, + "eval_logps/rejected": -124.33818817138672, + "eval_loss": 0.007332602050155401, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.9058244824409485, + "eval_rewards/margins": 28.832571029663086, + "eval_rewards/rejected": -29.73839569091797, + "eval_runtime": 85.391, + "eval_samples_per_second": 33.516, + "eval_steps_per_second": 2.096, + "step": 5500 + }, + { + "epoch": 2.52, + "learning_rate": 5.377980720446473e-08, + "logits/chosen": -0.9833132028579712, + "logits/rejected": -0.9164519309997559, + "logps/chosen": -87.18998718261719, + "logps/rejected": -127.6279525756836, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17603938281536102, + "rewards/margins": 31.583871841430664, + "rewards/rejected": -31.4078311920166, + "step": 5510 + }, + { + "epoch": 2.52, + "learning_rate": 5.32724505327245e-08, + "logits/chosen": -1.0401796102523804, + "logits/rejected": -0.9769385457038879, + "logps/chosen": -86.3476333618164, + "logps/rejected": -122.81013488769531, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.039313793182373, + "rewards/margins": 27.221426010131836, + "rewards/rejected": -28.2607364654541, + "step": 5520 + }, + { + "epoch": 2.52, + "learning_rate": 5.2765093860984264e-08, + "logits/chosen": -0.9912775754928589, + "logits/rejected": -0.9012781977653503, + "logps/chosen": -85.58564758300781, + "logps/rejected": -123.6320571899414, + "loss": 0.0034, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.3589529991149902, + "rewards/margins": 26.08465003967285, + "rewards/rejected": -28.443607330322266, + "step": 5530 + }, + { + "epoch": 2.53, + "learning_rate": 5.225773718924403e-08, + "logits/chosen": -0.9532960653305054, + "logits/rejected": -0.9478763341903687, + "logps/chosen": -83.55834197998047, + "logps/rejected": -121.30717468261719, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43583202362060547, + "rewards/margins": 27.649715423583984, + "rewards/rejected": -28.085546493530273, + "step": 5540 + }, + { + "epoch": 2.53, + "learning_rate": 5.17503805175038e-08, + "logits/chosen": -1.0004901885986328, + "logits/rejected": -0.9230673909187317, + "logps/chosen": -87.94818878173828, + "logps/rejected": -127.7512435913086, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18584835529327393, + "rewards/margins": 30.41741371154785, + "rewards/rejected": -30.603260040283203, + "step": 5550 + }, + { + "epoch": 2.54, + "learning_rate": 5.1243023845763564e-08, + "logits/chosen": -1.0533897876739502, + "logits/rejected": -1.0084376335144043, + "logps/chosen": -81.72035217285156, + "logps/rejected": -127.9518051147461, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35208114981651306, + "rewards/margins": 30.244159698486328, + "rewards/rejected": -30.596240997314453, + "step": 5560 + }, + { + "epoch": 2.54, + "learning_rate": 5.073566717402333e-08, + "logits/chosen": -1.0520977973937988, + "logits/rejected": -0.986443817615509, + "logps/chosen": -84.8553695678711, + "logps/rejected": -126.15667724609375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4004618525505066, + "rewards/margins": 29.2807559967041, + "rewards/rejected": -29.68121910095215, + "step": 5570 + }, + { + "epoch": 2.55, + "learning_rate": 5.02283105022831e-08, + "logits/chosen": -0.9998930096626282, + "logits/rejected": -1.0011494159698486, + "logps/chosen": -93.25764465332031, + "logps/rejected": -127.81201171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.173553228378296, + "rewards/margins": 28.28702163696289, + "rewards/rejected": -29.4605712890625, + "step": 5580 + }, + { + "epoch": 2.55, + "learning_rate": 4.9720953830542864e-08, + "logits/chosen": -0.9547233581542969, + "logits/rejected": -0.9245736002922058, + "logps/chosen": -91.9867935180664, + "logps/rejected": -126.07781982421875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9789550304412842, + "rewards/margins": 28.35225486755371, + "rewards/rejected": -30.331212997436523, + "step": 5590 + }, + { + "epoch": 2.56, + "learning_rate": 4.921359715880263e-08, + "logits/chosen": -0.9986736178398132, + "logits/rejected": -0.972088634967804, + "logps/chosen": -82.64098358154297, + "logps/rejected": -122.1820297241211, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37886843085289, + "rewards/margins": 28.860065460205078, + "rewards/rejected": -28.4811954498291, + "step": 5600 + }, + { + "epoch": 2.56, + "eval_logits/chosen": -1.0180690288543701, + "eval_logits/rejected": -0.9321611523628235, + "eval_logps/chosen": -87.17377471923828, + "eval_logps/rejected": -123.89130401611328, + "eval_loss": 0.006997702177613974, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.8179590106010437, + "eval_rewards/margins": 28.69698715209961, + "eval_rewards/rejected": -29.514951705932617, + "eval_runtime": 80.0253, + "eval_samples_per_second": 35.764, + "eval_steps_per_second": 2.237, + "step": 5600 + }, + { + "epoch": 2.56, + "learning_rate": 4.87062404870624e-08, + "logits/chosen": -1.0039026737213135, + "logits/rejected": -0.936947226524353, + "logps/chosen": -86.39691925048828, + "logps/rejected": -124.20637512207031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3275587558746338, + "rewards/margins": 28.06853675842285, + "rewards/rejected": -29.396093368530273, + "step": 5610 + }, + { + "epoch": 2.57, + "learning_rate": 4.8198883815322164e-08, + "logits/chosen": -1.0270158052444458, + "logits/rejected": -0.9902611970901489, + "logps/chosen": -85.97245025634766, + "logps/rejected": -129.32151794433594, + "loss": 0.0066, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.1571362018585205, + "rewards/margins": 29.470067977905273, + "rewards/rejected": -30.6272029876709, + "step": 5620 + }, + { + "epoch": 2.57, + "learning_rate": 4.769152714358193e-08, + "logits/chosen": -0.9748164415359497, + "logits/rejected": -0.8806400299072266, + "logps/chosen": -84.25230407714844, + "logps/rejected": -128.8730926513672, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3607916831970215, + "rewards/margins": 29.30045509338379, + "rewards/rejected": -30.661245346069336, + "step": 5630 + }, + { + "epoch": 2.57, + "learning_rate": 4.71841704718417e-08, + "logits/chosen": -0.9845125079154968, + "logits/rejected": -0.8729864358901978, + "logps/chosen": -85.73805236816406, + "logps/rejected": -128.51284790039062, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.4684017896652222, + "rewards/margins": 28.368051528930664, + "rewards/rejected": -29.83645248413086, + "step": 5640 + }, + { + "epoch": 2.58, + "learning_rate": 4.6676813800101464e-08, + "logits/chosen": -1.0719449520111084, + "logits/rejected": -0.9801136255264282, + "logps/chosen": -90.75955200195312, + "logps/rejected": -126.10191345214844, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1611497402191162, + "rewards/margins": 28.947484970092773, + "rewards/rejected": -29.1086368560791, + "step": 5650 + }, + { + "epoch": 2.58, + "learning_rate": 4.616945712836123e-08, + "logits/chosen": -0.9902482032775879, + "logits/rejected": -0.9737680554389954, + "logps/chosen": -88.47955322265625, + "logps/rejected": -125.43502044677734, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6917212605476379, + "rewards/margins": 28.979278564453125, + "rewards/rejected": -29.671005249023438, + "step": 5660 + }, + { + "epoch": 2.59, + "learning_rate": 4.5662100456621e-08, + "logits/chosen": -1.062365174293518, + "logits/rejected": -0.9917305111885071, + "logps/chosen": -85.7381820678711, + "logps/rejected": -129.78860473632812, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7698432207107544, + "rewards/margins": 29.835586547851562, + "rewards/rejected": -30.605432510375977, + "step": 5670 + }, + { + "epoch": 2.59, + "learning_rate": 4.5154743784880764e-08, + "logits/chosen": -0.9991733431816101, + "logits/rejected": -0.9434337615966797, + "logps/chosen": -79.69161224365234, + "logps/rejected": -126.32232666015625, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.819229245185852, + "rewards/margins": 28.9149112701416, + "rewards/rejected": -29.734134674072266, + "step": 5680 + }, + { + "epoch": 2.6, + "learning_rate": 4.464738711314053e-08, + "logits/chosen": -0.9527062177658081, + "logits/rejected": -0.9070295095443726, + "logps/chosen": -95.05113983154297, + "logps/rejected": -129.95516967773438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3489150106906891, + "rewards/margins": 29.364145278930664, + "rewards/rejected": -29.713058471679688, + "step": 5690 + }, + { + "epoch": 2.6, + "learning_rate": 4.41400304414003e-08, + "logits/chosen": -1.0130369663238525, + "logits/rejected": -0.9433294534683228, + "logps/chosen": -94.64827728271484, + "logps/rejected": -125.37615966796875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.025121713057160378, + "rewards/margins": 29.1879940032959, + "rewards/rejected": -29.21311378479004, + "step": 5700 + }, + { + "epoch": 2.6, + "eval_logits/chosen": -1.0215510129928589, + "eval_logits/rejected": -0.9332442283630371, + "eval_logps/chosen": -86.77206420898438, + "eval_logps/rejected": -123.67755126953125, + "eval_loss": 0.007124757394194603, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.6171061992645264, + "eval_rewards/margins": 28.790964126586914, + "eval_rewards/rejected": -29.408065795898438, + "eval_runtime": 62.4779, + "eval_samples_per_second": 45.808, + "eval_steps_per_second": 2.865, + "step": 5700 + }, + { + "epoch": 2.61, + "learning_rate": 4.3632673769660064e-08, + "logits/chosen": -1.0324370861053467, + "logits/rejected": -0.9537965059280396, + "logps/chosen": -88.25526428222656, + "logps/rejected": -127.1012191772461, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0396380424499512, + "rewards/margins": 29.129968643188477, + "rewards/rejected": -30.169607162475586, + "step": 5710 + }, + { + "epoch": 2.61, + "learning_rate": 4.312531709791983e-08, + "logits/chosen": -1.0366867780685425, + "logits/rejected": -0.9741948246955872, + "logps/chosen": -87.47926330566406, + "logps/rejected": -128.46270751953125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.316776990890503, + "rewards/margins": 28.57059097290039, + "rewards/rejected": -29.887371063232422, + "step": 5720 + }, + { + "epoch": 2.62, + "learning_rate": 4.26179604261796e-08, + "logits/chosen": -1.007524013519287, + "logits/rejected": -0.9985057711601257, + "logps/chosen": -85.72447204589844, + "logps/rejected": -124.79058837890625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7779345512390137, + "rewards/margins": 28.543148040771484, + "rewards/rejected": -29.32108497619629, + "step": 5730 + }, + { + "epoch": 2.62, + "learning_rate": 4.2110603754439363e-08, + "logits/chosen": -1.085681676864624, + "logits/rejected": -1.0228497982025146, + "logps/chosen": -85.09532165527344, + "logps/rejected": -127.75041198730469, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.867802619934082, + "rewards/margins": 29.682071685791016, + "rewards/rejected": -30.54987144470215, + "step": 5740 + }, + { + "epoch": 2.62, + "learning_rate": 4.160324708269913e-08, + "logits/chosen": -1.042829155921936, + "logits/rejected": -0.9523305892944336, + "logps/chosen": -86.57728576660156, + "logps/rejected": -131.00987243652344, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23421511054039001, + "rewards/margins": 29.884082794189453, + "rewards/rejected": -29.64986801147461, + "step": 5750 + }, + { + "epoch": 2.63, + "learning_rate": 4.10958904109589e-08, + "logits/chosen": -1.0182812213897705, + "logits/rejected": -0.9332420229911804, + "logps/chosen": -78.36054992675781, + "logps/rejected": -124.4658432006836, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4626803994178772, + "rewards/margins": 28.748886108398438, + "rewards/rejected": -28.286205291748047, + "step": 5760 + }, + { + "epoch": 2.63, + "learning_rate": 4.0588533739218663e-08, + "logits/chosen": -0.9937347173690796, + "logits/rejected": -0.9194048643112183, + "logps/chosen": -79.75376892089844, + "logps/rejected": -121.32899475097656, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4256053864955902, + "rewards/margins": 30.777950286865234, + "rewards/rejected": -30.35235023498535, + "step": 5770 + }, + { + "epoch": 2.64, + "learning_rate": 4.0081177067478437e-08, + "logits/chosen": -0.9398347735404968, + "logits/rejected": -0.9001420736312866, + "logps/chosen": -88.94413757324219, + "logps/rejected": -129.3289794921875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4428245425224304, + "rewards/margins": 29.961593627929688, + "rewards/rejected": -30.4044189453125, + "step": 5780 + }, + { + "epoch": 2.64, + "learning_rate": 3.95738203957382e-08, + "logits/chosen": -1.0133349895477295, + "logits/rejected": -0.9825404286384583, + "logps/chosen": -85.88642120361328, + "logps/rejected": -125.39090728759766, + "loss": 0.0055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0771163702011108, + "rewards/margins": 29.060375213623047, + "rewards/rejected": -30.137493133544922, + "step": 5790 + }, + { + "epoch": 2.65, + "learning_rate": 3.906646372399797e-08, + "logits/chosen": -0.9934064745903015, + "logits/rejected": -0.8903753161430359, + "logps/chosen": -89.25447082519531, + "logps/rejected": -130.34597778320312, + "loss": 0.0054, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3697949647903442, + "rewards/margins": 29.40945816040039, + "rewards/rejected": -30.779254913330078, + "step": 5800 + }, + { + "epoch": 2.65, + "eval_logits/chosen": -1.0273491144180298, + "eval_logits/rejected": -0.9405400156974792, + "eval_logps/chosen": -86.86610412597656, + "eval_logps/rejected": -124.399169921875, + "eval_loss": 0.007130247540771961, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.6641231775283813, + "eval_rewards/margins": 29.104759216308594, + "eval_rewards/rejected": -29.768884658813477, + "eval_runtime": 62.9523, + "eval_samples_per_second": 45.463, + "eval_steps_per_second": 2.843, + "step": 5800 + }, + { + "epoch": 2.65, + "learning_rate": 3.8559107052257736e-08, + "logits/chosen": -1.1017916202545166, + "logits/rejected": -1.0160459280014038, + "logps/chosen": -83.09812927246094, + "logps/rejected": -123.5434799194336, + "loss": 0.0033, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.3102543354034424, + "rewards/margins": 28.675018310546875, + "rewards/rejected": -29.985271453857422, + "step": 5810 + }, + { + "epoch": 2.66, + "learning_rate": 3.80517503805175e-08, + "logits/chosen": -1.0480397939682007, + "logits/rejected": -0.9980312585830688, + "logps/chosen": -86.56658935546875, + "logps/rejected": -126.96297454833984, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.2892673015594482, + "rewards/margins": 28.608530044555664, + "rewards/rejected": -29.89779281616211, + "step": 5820 + }, + { + "epoch": 2.66, + "learning_rate": 3.754439370877727e-08, + "logits/chosen": -1.0648337602615356, + "logits/rejected": -1.0143353939056396, + "logps/chosen": -91.41023254394531, + "logps/rejected": -133.6913299560547, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9324867129325867, + "rewards/margins": 30.4281005859375, + "rewards/rejected": -31.3605899810791, + "step": 5830 + }, + { + "epoch": 2.67, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -1.0235826969146729, + "logits/rejected": -0.9797852635383606, + "logps/chosen": -84.83919525146484, + "logps/rejected": -126.71681213378906, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3446992635726929, + "rewards/margins": 29.284387588500977, + "rewards/rejected": -30.629085540771484, + "step": 5840 + }, + { + "epoch": 2.67, + "learning_rate": 3.65296803652968e-08, + "logits/chosen": -1.0963352918624878, + "logits/rejected": -1.0587340593338013, + "logps/chosen": -88.24998474121094, + "logps/rejected": -126.11114501953125, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.344663381576538, + "rewards/margins": 27.544784545898438, + "rewards/rejected": -28.889450073242188, + "step": 5850 + }, + { + "epoch": 2.67, + "learning_rate": 3.602232369355657e-08, + "logits/chosen": -1.034896969795227, + "logits/rejected": -0.949033260345459, + "logps/chosen": -86.25189208984375, + "logps/rejected": -129.3499298095703, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39351123571395874, + "rewards/margins": 31.2337589263916, + "rewards/rejected": -30.84025001525879, + "step": 5860 + }, + { + "epoch": 2.68, + "learning_rate": 3.5514967021816336e-08, + "logits/chosen": -1.0709892511367798, + "logits/rejected": -1.0646283626556396, + "logps/chosen": -88.88494873046875, + "logps/rejected": -126.898193359375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5309356451034546, + "rewards/margins": 29.396169662475586, + "rewards/rejected": -29.927108764648438, + "step": 5870 + }, + { + "epoch": 2.68, + "learning_rate": 3.50076103500761e-08, + "logits/chosen": -1.041032314300537, + "logits/rejected": -0.947468101978302, + "logps/chosen": -85.87740325927734, + "logps/rejected": -125.021240234375, + "loss": 0.0049, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.40017738938331604, + "rewards/margins": 28.514108657836914, + "rewards/rejected": -28.91428565979004, + "step": 5880 + }, + { + "epoch": 2.69, + "learning_rate": 3.450025367833587e-08, + "logits/chosen": -0.9914347529411316, + "logits/rejected": -0.9079867601394653, + "logps/chosen": -90.79552459716797, + "logps/rejected": -127.93312072753906, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5817571878433228, + "rewards/margins": 28.376272201538086, + "rewards/rejected": -28.95802879333496, + "step": 5890 + }, + { + "epoch": 2.69, + "learning_rate": 3.3992897006595636e-08, + "logits/chosen": -1.082043170928955, + "logits/rejected": -1.057128667831421, + "logps/chosen": -88.99037170410156, + "logps/rejected": -129.789794921875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6108391284942627, + "rewards/margins": 28.299758911132812, + "rewards/rejected": -29.910594940185547, + "step": 5900 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -1.031474232673645, + "eval_logits/rejected": -0.9436381459236145, + "eval_logps/chosen": -87.154296875, + "eval_logps/rejected": -124.52259826660156, + "eval_loss": 0.006726478226482868, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.808218240737915, + "eval_rewards/margins": 29.022377014160156, + "eval_rewards/rejected": -29.830596923828125, + "eval_runtime": 65.8919, + "eval_samples_per_second": 43.435, + "eval_steps_per_second": 2.717, + "step": 5900 + }, + { + "epoch": 2.7, + "learning_rate": 3.34855403348554e-08, + "logits/chosen": -1.058807134628296, + "logits/rejected": -1.002246379852295, + "logps/chosen": -94.60523986816406, + "logps/rejected": -126.1142349243164, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3425036668777466, + "rewards/margins": 27.65673828125, + "rewards/rejected": -28.999242782592773, + "step": 5910 + }, + { + "epoch": 2.7, + "learning_rate": 3.297818366311517e-08, + "logits/chosen": -1.0659196376800537, + "logits/rejected": -1.02195143699646, + "logps/chosen": -83.831787109375, + "logps/rejected": -127.63771057128906, + "loss": 0.0043, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.325688362121582, + "rewards/margins": 28.6981258392334, + "rewards/rejected": -30.023815155029297, + "step": 5920 + }, + { + "epoch": 2.71, + "learning_rate": 3.2470826991374936e-08, + "logits/chosen": -1.0907622575759888, + "logits/rejected": -1.0057451725006104, + "logps/chosen": -89.2253646850586, + "logps/rejected": -127.18135833740234, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8062092065811157, + "rewards/margins": 28.803585052490234, + "rewards/rejected": -29.609792709350586, + "step": 5930 + }, + { + "epoch": 2.71, + "learning_rate": 3.19634703196347e-08, + "logits/chosen": -1.1025313138961792, + "logits/rejected": -1.0713751316070557, + "logps/chosen": -91.10670471191406, + "logps/rejected": -132.5358428955078, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7590230703353882, + "rewards/margins": 29.9611873626709, + "rewards/rejected": -31.72021484375, + "step": 5940 + }, + { + "epoch": 2.72, + "learning_rate": 3.145611364789447e-08, + "logits/chosen": -0.9489334225654602, + "logits/rejected": -0.9489434957504272, + "logps/chosen": -92.52510070800781, + "logps/rejected": -127.75334167480469, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.572442889213562, + "rewards/margins": 29.219165802001953, + "rewards/rejected": -29.791606903076172, + "step": 5950 + }, + { + "epoch": 2.72, + "learning_rate": 3.0948756976154236e-08, + "logits/chosen": -1.0602383613586426, + "logits/rejected": -0.9741102457046509, + "logps/chosen": -84.59852600097656, + "logps/rejected": -125.7697525024414, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35125207901000977, + "rewards/margins": 29.04140281677246, + "rewards/rejected": -29.392658233642578, + "step": 5960 + }, + { + "epoch": 2.73, + "learning_rate": 3.0441400304414e-08, + "logits/chosen": -1.0793386697769165, + "logits/rejected": -1.0352563858032227, + "logps/chosen": -85.41413116455078, + "logps/rejected": -125.89903259277344, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.781488299369812, + "rewards/margins": 28.587890625, + "rewards/rejected": -29.3693790435791, + "step": 5970 + }, + { + "epoch": 2.73, + "learning_rate": 2.993404363267377e-08, + "logits/chosen": -1.0787484645843506, + "logits/rejected": -1.0108954906463623, + "logps/chosen": -82.58202362060547, + "logps/rejected": -125.72383117675781, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.0184836387634277, + "rewards/margins": 28.39313316345215, + "rewards/rejected": -29.4116153717041, + "step": 5980 + }, + { + "epoch": 2.73, + "learning_rate": 2.9426686960933532e-08, + "logits/chosen": -0.9952011108398438, + "logits/rejected": -0.8851292729377747, + "logps/chosen": -93.87915802001953, + "logps/rejected": -124.56547546386719, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4423131048679352, + "rewards/margins": 28.472192764282227, + "rewards/rejected": -28.914505004882812, + "step": 5990 + }, + { + "epoch": 2.74, + "learning_rate": 2.89193302891933e-08, + "logits/chosen": -1.178444504737854, + "logits/rejected": -1.1157991886138916, + "logps/chosen": -88.11283874511719, + "logps/rejected": -130.8851318359375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23296864330768585, + "rewards/margins": 30.756484985351562, + "rewards/rejected": -30.989452362060547, + "step": 6000 + }, + { + "epoch": 2.74, + "eval_logits/chosen": -1.0278129577636719, + "eval_logits/rejected": -0.9400880932807922, + "eval_logps/chosen": -87.00244140625, + "eval_logps/rejected": -124.30833435058594, + "eval_loss": 0.006807922385632992, + "eval_rewards/accuracies": 0.9888268113136292, + "eval_rewards/chosen": -0.732290506362915, + "eval_rewards/margins": 28.99117088317871, + "eval_rewards/rejected": -29.723461151123047, + "eval_runtime": 61.3283, + "eval_samples_per_second": 46.667, + "eval_steps_per_second": 2.919, + "step": 6000 + }, + { + "epoch": 2.74, + "learning_rate": 2.8411973617453066e-08, + "logits/chosen": -0.9492026567459106, + "logits/rejected": -0.9152344465255737, + "logps/chosen": -86.2878189086914, + "logps/rejected": -126.0578842163086, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4441192150115967, + "rewards/margins": 29.64728355407715, + "rewards/rejected": -30.091400146484375, + "step": 6010 + }, + { + "epoch": 2.75, + "learning_rate": 2.7904616945712832e-08, + "logits/chosen": -1.0655428171157837, + "logits/rejected": -1.0025355815887451, + "logps/chosen": -87.4747085571289, + "logps/rejected": -122.6070327758789, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8353004455566406, + "rewards/margins": 28.315927505493164, + "rewards/rejected": -29.151226043701172, + "step": 6020 + }, + { + "epoch": 2.75, + "learning_rate": 2.73972602739726e-08, + "logits/chosen": -1.0561573505401611, + "logits/rejected": -0.9622231721878052, + "logps/chosen": -87.29557037353516, + "logps/rejected": -127.05860900878906, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.450141429901123, + "rewards/margins": 29.205745697021484, + "rewards/rejected": -30.6558895111084, + "step": 6030 + }, + { + "epoch": 2.76, + "learning_rate": 2.6889903602232366e-08, + "logits/chosen": -1.0109049081802368, + "logits/rejected": -0.9498085975646973, + "logps/chosen": -87.83575439453125, + "logps/rejected": -128.5558624267578, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4376589357852936, + "rewards/margins": 31.382431030273438, + "rewards/rejected": -30.944774627685547, + "step": 6040 + }, + { + "epoch": 2.76, + "learning_rate": 2.6382546930492132e-08, + "logits/chosen": -1.184468150138855, + "logits/rejected": -1.082187294960022, + "logps/chosen": -88.23623657226562, + "logps/rejected": -130.09461975097656, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7751424312591553, + "rewards/margins": 28.917348861694336, + "rewards/rejected": -29.692489624023438, + "step": 6050 + }, + { + "epoch": 2.77, + "learning_rate": 2.58751902587519e-08, + "logits/chosen": -1.104385495185852, + "logits/rejected": -1.02403724193573, + "logps/chosen": -83.91561126708984, + "logps/rejected": -121.2802734375, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0201902873814106, + "rewards/margins": 27.695789337158203, + "rewards/rejected": -27.71598243713379, + "step": 6060 + }, + { + "epoch": 2.77, + "learning_rate": 2.5367833587011665e-08, + "logits/chosen": -1.1031575202941895, + "logits/rejected": -0.990433394908905, + "logps/chosen": -95.4950942993164, + "logps/rejected": -129.18492126464844, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37409108877182007, + "rewards/margins": 29.132125854492188, + "rewards/rejected": -29.506216049194336, + "step": 6070 + }, + { + "epoch": 2.78, + "learning_rate": 2.4860476915271432e-08, + "logits/chosen": -1.0707147121429443, + "logits/rejected": -1.012056589126587, + "logps/chosen": -89.7245101928711, + "logps/rejected": -122.63059997558594, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4195477068424225, + "rewards/margins": 29.034103393554688, + "rewards/rejected": -28.614553451538086, + "step": 6080 + }, + { + "epoch": 2.78, + "learning_rate": 2.43531202435312e-08, + "logits/chosen": -1.071775197982788, + "logits/rejected": -1.0009758472442627, + "logps/chosen": -80.49525451660156, + "logps/rejected": -124.15095520019531, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16672919690608978, + "rewards/margins": 28.871688842773438, + "rewards/rejected": -28.7049617767334, + "step": 6090 + }, + { + "epoch": 2.78, + "learning_rate": 2.3845763571790965e-08, + "logits/chosen": -1.0576212406158447, + "logits/rejected": -1.0005748271942139, + "logps/chosen": -91.83822631835938, + "logps/rejected": -124.02302551269531, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2521924078464508, + "rewards/margins": 28.721765518188477, + "rewards/rejected": -28.973957061767578, + "step": 6100 + }, + { + "epoch": 2.78, + "eval_logits/chosen": -1.0250327587127686, + "eval_logits/rejected": -0.9389449954032898, + "eval_logps/chosen": -86.68120574951172, + "eval_logps/rejected": -123.78533172607422, + "eval_loss": 0.006523421499878168, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.5716744661331177, + "eval_rewards/margins": 28.890287399291992, + "eval_rewards/rejected": -29.461963653564453, + "eval_runtime": 84.1574, + "eval_samples_per_second": 34.008, + "eval_steps_per_second": 2.127, + "step": 6100 + }, + { + "epoch": 2.79, + "learning_rate": 2.3338406900050732e-08, + "logits/chosen": -0.916157066822052, + "logits/rejected": -0.8838459253311157, + "logps/chosen": -86.09705352783203, + "logps/rejected": -124.25962829589844, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.23373942077159882, + "rewards/margins": 29.301513671875, + "rewards/rejected": -29.535253524780273, + "step": 6110 + }, + { + "epoch": 2.79, + "learning_rate": 2.28310502283105e-08, + "logits/chosen": -1.0872166156768799, + "logits/rejected": -0.9581939578056335, + "logps/chosen": -87.7667236328125, + "logps/rejected": -127.71507263183594, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22645695507526398, + "rewards/margins": 30.267444610595703, + "rewards/rejected": -30.04098892211914, + "step": 6120 + }, + { + "epoch": 2.8, + "learning_rate": 2.2323693556570265e-08, + "logits/chosen": -1.0600719451904297, + "logits/rejected": -1.0231385231018066, + "logps/chosen": -82.3358383178711, + "logps/rejected": -123.05574035644531, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8706923723220825, + "rewards/margins": 29.04482650756836, + "rewards/rejected": -29.9155216217041, + "step": 6130 + }, + { + "epoch": 2.8, + "learning_rate": 2.1816336884830032e-08, + "logits/chosen": -1.002144694328308, + "logits/rejected": -0.9809429049491882, + "logps/chosen": -82.87908935546875, + "logps/rejected": -122.70130920410156, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.017511676996946335, + "rewards/margins": 28.458423614501953, + "rewards/rejected": -28.4409122467041, + "step": 6140 + }, + { + "epoch": 2.81, + "learning_rate": 2.13089802130898e-08, + "logits/chosen": -1.0385632514953613, + "logits/rejected": -0.9566332697868347, + "logps/chosen": -86.96278381347656, + "logps/rejected": -128.3975372314453, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32612985372543335, + "rewards/margins": 31.055572509765625, + "rewards/rejected": -31.381702423095703, + "step": 6150 + }, + { + "epoch": 2.81, + "learning_rate": 2.0801623541349565e-08, + "logits/chosen": -1.0180633068084717, + "logits/rejected": -0.9315141439437866, + "logps/chosen": -89.90997314453125, + "logps/rejected": -129.4881134033203, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5671930313110352, + "rewards/margins": 30.148788452148438, + "rewards/rejected": -30.71598243713379, + "step": 6160 + }, + { + "epoch": 2.82, + "learning_rate": 2.0294266869609332e-08, + "logits/chosen": -1.0712960958480835, + "logits/rejected": -0.9703865051269531, + "logps/chosen": -89.47264862060547, + "logps/rejected": -127.07987213134766, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.344783902168274, + "rewards/margins": 28.681127548217773, + "rewards/rejected": -30.02591323852539, + "step": 6170 + }, + { + "epoch": 2.82, + "learning_rate": 1.97869101978691e-08, + "logits/chosen": -1.0805513858795166, + "logits/rejected": -0.9932042956352234, + "logps/chosen": -93.64996337890625, + "logps/rejected": -127.0558090209961, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9845650792121887, + "rewards/margins": 29.586376190185547, + "rewards/rejected": -30.570941925048828, + "step": 6180 + }, + { + "epoch": 2.83, + "learning_rate": 1.9279553526128868e-08, + "logits/chosen": -1.0248457193374634, + "logits/rejected": -0.9871330261230469, + "logps/chosen": -80.39697265625, + "logps/rejected": -125.86643981933594, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4571932852268219, + "rewards/margins": 30.0575008392334, + "rewards/rejected": -30.514690399169922, + "step": 6190 + }, + { + "epoch": 2.83, + "learning_rate": 1.8772196854388635e-08, + "logits/chosen": -0.9889580011367798, + "logits/rejected": -0.9497294425964355, + "logps/chosen": -85.86564636230469, + "logps/rejected": -126.31917572021484, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4988800287246704, + "rewards/margins": 28.99808120727539, + "rewards/rejected": -30.496959686279297, + "step": 6200 + }, + { + "epoch": 2.83, + "eval_logits/chosen": -1.0244492292404175, + "eval_logits/rejected": -0.9368904232978821, + "eval_logps/chosen": -86.6476821899414, + "eval_logps/rejected": -123.90021514892578, + "eval_loss": 0.006625541485846043, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.5549145340919495, + "eval_rewards/margins": 28.964487075805664, + "eval_rewards/rejected": -29.5194034576416, + "eval_runtime": 66.1256, + "eval_samples_per_second": 43.281, + "eval_steps_per_second": 2.707, + "step": 6200 + }, + { + "epoch": 2.83, + "learning_rate": 1.82648401826484e-08, + "logits/chosen": -1.0492604970932007, + "logits/rejected": -1.0105106830596924, + "logps/chosen": -85.85210418701172, + "logps/rejected": -128.20448303222656, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4999401569366455, + "rewards/margins": 29.045568466186523, + "rewards/rejected": -30.54551124572754, + "step": 6210 + }, + { + "epoch": 2.84, + "learning_rate": 1.7757483510908168e-08, + "logits/chosen": -0.9619634747505188, + "logits/rejected": -0.9743566513061523, + "logps/chosen": -86.10426330566406, + "logps/rejected": -130.9729766845703, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.49150973558425903, + "rewards/margins": 30.758865356445312, + "rewards/rejected": -31.250377655029297, + "step": 6220 + }, + { + "epoch": 2.84, + "learning_rate": 1.7250126839167935e-08, + "logits/chosen": -1.0379936695098877, + "logits/rejected": -1.0153452157974243, + "logps/chosen": -81.86763000488281, + "logps/rejected": -120.93293762207031, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.818130373954773, + "rewards/margins": 28.285436630249023, + "rewards/rejected": -29.103565216064453, + "step": 6230 + }, + { + "epoch": 2.85, + "learning_rate": 1.67427701674277e-08, + "logits/chosen": -1.0076709985733032, + "logits/rejected": -0.9663689732551575, + "logps/chosen": -85.59688568115234, + "logps/rejected": -125.4538345336914, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017603278160095215, + "rewards/margins": 30.528635025024414, + "rewards/rejected": -30.546234130859375, + "step": 6240 + }, + { + "epoch": 2.85, + "learning_rate": 1.6235413495687468e-08, + "logits/chosen": -0.9639765024185181, + "logits/rejected": -0.9728061556816101, + "logps/chosen": -87.17933654785156, + "logps/rejected": -126.87109375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7064870595932007, + "rewards/margins": 28.171640396118164, + "rewards/rejected": -28.878128051757812, + "step": 6250 + }, + { + "epoch": 2.86, + "learning_rate": 1.5728056823947235e-08, + "logits/chosen": -0.9334337115287781, + "logits/rejected": -0.8631385564804077, + "logps/chosen": -83.15779876708984, + "logps/rejected": -122.5430908203125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3862573504447937, + "rewards/margins": 28.38237953186035, + "rewards/rejected": -28.76863670349121, + "step": 6260 + }, + { + "epoch": 2.86, + "learning_rate": 1.5220700152207e-08, + "logits/chosen": -1.1374794244766235, + "logits/rejected": -1.0338224172592163, + "logps/chosen": -86.21543884277344, + "logps/rejected": -129.5160369873047, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8715160489082336, + "rewards/margins": 30.266857147216797, + "rewards/rejected": -31.138376235961914, + "step": 6270 + }, + { + "epoch": 2.87, + "learning_rate": 1.4713343480466766e-08, + "logits/chosen": -1.0501601696014404, + "logits/rejected": -1.0120677947998047, + "logps/chosen": -93.43696594238281, + "logps/rejected": -127.74129486083984, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8736766576766968, + "rewards/margins": 27.128524780273438, + "rewards/rejected": -29.0022029876709, + "step": 6280 + }, + { + "epoch": 2.87, + "learning_rate": 1.4205986808726533e-08, + "logits/chosen": -0.9635306596755981, + "logits/rejected": -0.9733031392097473, + "logps/chosen": -86.1040267944336, + "logps/rejected": -128.07809448242188, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6703550219535828, + "rewards/margins": 28.874740600585938, + "rewards/rejected": -29.545095443725586, + "step": 6290 + }, + { + "epoch": 2.88, + "learning_rate": 1.36986301369863e-08, + "logits/chosen": -1.0834633111953735, + "logits/rejected": -1.0034466981887817, + "logps/chosen": -82.86973571777344, + "logps/rejected": -129.51971435546875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29943275451660156, + "rewards/margins": 30.45029067993164, + "rewards/rejected": -30.749719619750977, + "step": 6300 + }, + { + "epoch": 2.88, + "eval_logits/chosen": -1.0220108032226562, + "eval_logits/rejected": -0.9361612200737, + "eval_logps/chosen": -86.4330062866211, + "eval_logps/rejected": -123.60254669189453, + "eval_loss": 0.006502318661659956, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.44757336378097534, + "eval_rewards/margins": 28.922998428344727, + "eval_rewards/rejected": -29.370569229125977, + "eval_runtime": 64.6819, + "eval_samples_per_second": 44.247, + "eval_steps_per_second": 2.767, + "step": 6300 + }, + { + "epoch": 2.88, + "learning_rate": 1.3191273465246066e-08, + "logits/chosen": -1.0733602046966553, + "logits/rejected": -1.0072834491729736, + "logps/chosen": -86.40800476074219, + "logps/rejected": -119.3967514038086, + "loss": 0.0022, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.4470507502555847, + "rewards/margins": 27.95053482055664, + "rewards/rejected": -28.397586822509766, + "step": 6310 + }, + { + "epoch": 2.88, + "learning_rate": 1.2683916793505833e-08, + "logits/chosen": -1.060889482498169, + "logits/rejected": -1.0003650188446045, + "logps/chosen": -89.27617645263672, + "logps/rejected": -126.93253326416016, + "loss": 0.0036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6875644326210022, + "rewards/margins": 28.4171142578125, + "rewards/rejected": -29.10468101501465, + "step": 6320 + }, + { + "epoch": 2.89, + "learning_rate": 1.21765601217656e-08, + "logits/chosen": -0.9854904413223267, + "logits/rejected": -0.9091927409172058, + "logps/chosen": -85.25365447998047, + "logps/rejected": -128.0735321044922, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08507101237773895, + "rewards/margins": 31.455326080322266, + "rewards/rejected": -31.370258331298828, + "step": 6330 + }, + { + "epoch": 2.89, + "learning_rate": 1.1669203450025366e-08, + "logits/chosen": -1.0819941759109497, + "logits/rejected": -1.0338475704193115, + "logps/chosen": -90.92166900634766, + "logps/rejected": -130.71810913085938, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01656208001077175, + "rewards/margins": 29.559118270874023, + "rewards/rejected": -29.575679779052734, + "step": 6340 + }, + { + "epoch": 2.9, + "learning_rate": 1.1161846778285133e-08, + "logits/chosen": -1.0656547546386719, + "logits/rejected": -1.037102222442627, + "logps/chosen": -86.26807403564453, + "logps/rejected": -124.87815856933594, + "loss": 0.0025, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.5916534662246704, + "rewards/margins": 29.280879974365234, + "rewards/rejected": -29.87253189086914, + "step": 6350 + }, + { + "epoch": 2.9, + "learning_rate": 1.06544901065449e-08, + "logits/chosen": -1.075899362564087, + "logits/rejected": -0.9635077714920044, + "logps/chosen": -85.9807357788086, + "logps/rejected": -128.5865478515625, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5440212488174438, + "rewards/margins": 30.536121368408203, + "rewards/rejected": -31.08014488220215, + "step": 6360 + }, + { + "epoch": 2.91, + "learning_rate": 1.0147133434804666e-08, + "logits/chosen": -0.891313374042511, + "logits/rejected": -0.8621894121170044, + "logps/chosen": -88.86170196533203, + "logps/rejected": -127.87739562988281, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10790462791919708, + "rewards/margins": 28.5927677154541, + "rewards/rejected": -28.700672149658203, + "step": 6370 + }, + { + "epoch": 2.91, + "learning_rate": 9.639776763064434e-09, + "logits/chosen": -0.9837745428085327, + "logits/rejected": -0.9746279716491699, + "logps/chosen": -83.23785400390625, + "logps/rejected": -121.79039001464844, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7958210110664368, + "rewards/margins": 28.028661727905273, + "rewards/rejected": -28.82448387145996, + "step": 6380 + }, + { + "epoch": 2.92, + "learning_rate": 9.1324200913242e-09, + "logits/chosen": -0.9626250267028809, + "logits/rejected": -0.9271273612976074, + "logps/chosen": -86.17671203613281, + "logps/rejected": -121.7667236328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6893644332885742, + "rewards/margins": 28.271896362304688, + "rewards/rejected": -28.961261749267578, + "step": 6390 + }, + { + "epoch": 2.92, + "learning_rate": 8.625063419583967e-09, + "logits/chosen": -1.0479789972305298, + "logits/rejected": -0.9923169016838074, + "logps/chosen": -86.66059112548828, + "logps/rejected": -129.21859741210938, + "loss": 0.0035, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.6069229245185852, + "rewards/margins": 29.50791358947754, + "rewards/rejected": -30.114837646484375, + "step": 6400 + }, + { + "epoch": 2.92, + "eval_logits/chosen": -1.0255866050720215, + "eval_logits/rejected": -0.939497709274292, + "eval_logps/chosen": -86.53129577636719, + "eval_logps/rejected": -123.90242767333984, + "eval_loss": 0.006593942176550627, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.4967171549797058, + "eval_rewards/margins": 29.0238037109375, + "eval_rewards/rejected": -29.520519256591797, + "eval_runtime": 75.6633, + "eval_samples_per_second": 37.825, + "eval_steps_per_second": 2.366, + "step": 6400 + }, + { + "epoch": 2.93, + "learning_rate": 8.117706747843734e-09, + "logits/chosen": -0.9005545377731323, + "logits/rejected": -0.8726035952568054, + "logps/chosen": -88.71814727783203, + "logps/rejected": -127.66644287109375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0038230419158936, + "rewards/margins": 28.876317977905273, + "rewards/rejected": -29.880138397216797, + "step": 6410 + }, + { + "epoch": 2.93, + "learning_rate": 7.6103500761035e-09, + "logits/chosen": -1.091305136680603, + "logits/rejected": -1.033553123474121, + "logps/chosen": -85.39485931396484, + "logps/rejected": -127.29072570800781, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2828121781349182, + "rewards/margins": 29.81418228149414, + "rewards/rejected": -30.096996307373047, + "step": 6420 + }, + { + "epoch": 2.94, + "learning_rate": 7.1029934043632664e-09, + "logits/chosen": -1.0751222372055054, + "logits/rejected": -0.9772630929946899, + "logps/chosen": -84.76583862304688, + "logps/rejected": -130.1298065185547, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2620498836040497, + "rewards/margins": 30.05381202697754, + "rewards/rejected": -30.315860748291016, + "step": 6430 + }, + { + "epoch": 2.94, + "learning_rate": 6.595636732623033e-09, + "logits/chosen": -1.0256164073944092, + "logits/rejected": -0.9431262016296387, + "logps/chosen": -87.59001922607422, + "logps/rejected": -128.95947265625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13797995448112488, + "rewards/margins": 29.917160034179688, + "rewards/rejected": -30.05513572692871, + "step": 6440 + }, + { + "epoch": 2.94, + "learning_rate": 6.0882800608828e-09, + "logits/chosen": -1.0374404191970825, + "logits/rejected": -0.9506736993789673, + "logps/chosen": -80.81979370117188, + "logps/rejected": -125.56309509277344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40483197569847107, + "rewards/margins": 29.041240692138672, + "rewards/rejected": -29.44607162475586, + "step": 6450 + }, + { + "epoch": 2.95, + "learning_rate": 5.580923389142566e-09, + "logits/chosen": -0.869223415851593, + "logits/rejected": -0.8867238163948059, + "logps/chosen": -88.09699249267578, + "logps/rejected": -126.71589660644531, + "loss": 0.0033, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8291648626327515, + "rewards/margins": 29.939502716064453, + "rewards/rejected": -29.110332489013672, + "step": 6460 + }, + { + "epoch": 2.95, + "learning_rate": 5.073566717402333e-09, + "logits/chosen": -1.019307255744934, + "logits/rejected": -1.0058377981185913, + "logps/chosen": -98.96701049804688, + "logps/rejected": -124.1956558227539, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8037646412849426, + "rewards/margins": 28.398019790649414, + "rewards/rejected": -29.2017879486084, + "step": 6470 + }, + { + "epoch": 2.96, + "learning_rate": 4.5662100456621e-09, + "logits/chosen": -0.96232670545578, + "logits/rejected": -0.9179836511611938, + "logps/chosen": -86.69127655029297, + "logps/rejected": -123.1005859375, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9831452369689941, + "rewards/margins": 29.9154109954834, + "rewards/rejected": -28.932266235351562, + "step": 6480 + }, + { + "epoch": 2.96, + "learning_rate": 4.058853373921867e-09, + "logits/chosen": -1.0222089290618896, + "logits/rejected": -0.8847758173942566, + "logps/chosen": -89.50880432128906, + "logps/rejected": -125.39402770996094, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06352037191390991, + "rewards/margins": 30.433147430419922, + "rewards/rejected": -30.49666404724121, + "step": 6490 + }, + { + "epoch": 2.97, + "learning_rate": 3.5514967021816332e-09, + "logits/chosen": -1.0513429641723633, + "logits/rejected": -0.9981459379196167, + "logps/chosen": -88.95649719238281, + "logps/rejected": -126.06563568115234, + "loss": 0.0012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.7696861028671265, + "rewards/margins": 28.481319427490234, + "rewards/rejected": -29.251007080078125, + "step": 6500 + }, + { + "epoch": 2.97, + "eval_logits/chosen": -1.0240223407745361, + "eval_logits/rejected": -0.9381014108657837, + "eval_logps/chosen": -86.47101593017578, + "eval_logps/rejected": -123.78819274902344, + "eval_loss": 0.006529896054416895, + "eval_rewards/accuracies": 0.9916201233863831, + "eval_rewards/chosen": -0.46657702326774597, + "eval_rewards/margins": 28.996816635131836, + "eval_rewards/rejected": -29.463396072387695, + "eval_runtime": 78.27, + "eval_samples_per_second": 36.566, + "eval_steps_per_second": 2.287, + "step": 6500 + }, + { + "epoch": 2.97, + "learning_rate": 3.0441400304414e-09, + "logits/chosen": -1.083836317062378, + "logits/rejected": -1.0287898778915405, + "logps/chosen": -90.65104675292969, + "logps/rejected": -129.10536193847656, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5575191974639893, + "rewards/margins": 29.243595123291016, + "rewards/rejected": -30.80111312866211, + "step": 6510 + }, + { + "epoch": 2.98, + "learning_rate": 2.5367833587011665e-09, + "logits/chosen": -1.1479265689849854, + "logits/rejected": -1.0269317626953125, + "logps/chosen": -92.42093658447266, + "logps/rejected": -131.41064453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4860418438911438, + "rewards/margins": 30.6885986328125, + "rewards/rejected": -31.174636840820312, + "step": 6520 + }, + { + "epoch": 2.98, + "learning_rate": 2.0294266869609335e-09, + "logits/chosen": -0.9470345377922058, + "logits/rejected": -0.8561986684799194, + "logps/chosen": -85.6261978149414, + "logps/rejected": -127.69779968261719, + "loss": 0.0044, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -1.3518216609954834, + "rewards/margins": 28.512975692749023, + "rewards/rejected": -29.864795684814453, + "step": 6530 + }, + { + "epoch": 2.99, + "learning_rate": 1.5220700152207e-09, + "logits/chosen": -0.9782537221908569, + "logits/rejected": -0.9774686694145203, + "logps/chosen": -83.68141174316406, + "logps/rejected": -120.14680480957031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29628103971481323, + "rewards/margins": 27.92130470275879, + "rewards/rejected": -28.21758460998535, + "step": 6540 + }, + { + "epoch": 2.99, + "learning_rate": 1.0147133434804667e-09, + "logits/chosen": -1.084174394607544, + "logits/rejected": -0.9952249526977539, + "logps/chosen": -89.36202239990234, + "logps/rejected": -128.4408721923828, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7218122482299805, + "rewards/margins": 28.947765350341797, + "rewards/rejected": -29.669580459594727, + "step": 6550 + }, + { + "epoch": 2.99, + "learning_rate": 5.073566717402334e-10, + "logits/chosen": -1.0538431406021118, + "logits/rejected": -0.9888811111450195, + "logps/chosen": -87.28624725341797, + "logps/rejected": -125.61628723144531, + "loss": 0.0011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.9487142562866211, + "rewards/margins": 29.355825424194336, + "rewards/rejected": -30.304540634155273, + "step": 6560 + }, + { + "epoch": 3.0, + "learning_rate": 0.0, + "logits/chosen": -1.054466962814331, + "logits/rejected": -0.9624788165092468, + "logps/chosen": -85.47309875488281, + "logps/rejected": -125.8909683227539, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8512237668037415, + "rewards/margins": 28.772281646728516, + "rewards/rejected": -29.623498916625977, + "step": 6570 + }, + { + "epoch": 3.0, + "step": 6570, + "total_flos": 0.0, + "train_loss": 0.022793083270943315, + "train_runtime": 24920.2972, + "train_samples_per_second": 16.878, + "train_steps_per_second": 0.264 + } + ], + "logging_steps": 10, + "max_steps": 6570, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}