{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 8826, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.662514156285391e-10, "logits/chosen": -0.07598976045846939, "logits/rejected": -0.45198649168014526, "logps/chosen": -223.75332641601562, "logps/rejected": -732.1045532226562, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 5.66251415628539e-09, "logits/chosen": -0.385540246963501, "logits/rejected": -0.23824787139892578, "logps/chosen": -424.3954772949219, "logps/rejected": -505.50970458984375, "loss": 0.6922, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.005437952000647783, "rewards/margins": -0.004041292704641819, "rewards/rejected": -0.0013966606929898262, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.132502831257078e-08, "logits/chosen": -0.4422360062599182, "logits/rejected": -0.26954811811447144, "logps/chosen": -198.19171142578125, "logps/rejected": -425.90936279296875, "loss": 0.6934, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0048302048817276955, "rewards/margins": 0.0017131452914327383, "rewards/rejected": -0.006543349474668503, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.698754246885617e-08, "logits/chosen": -0.3186202645301819, "logits/rejected": -0.2842954695224762, "logps/chosen": -356.40655517578125, "logps/rejected": -411.4159240722656, "loss": 0.6925, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.006616205908358097, "rewards/margins": 0.012125561945140362, "rewards/rejected": -0.00550935510545969, "step": 30 }, { "epoch": 0.01, "learning_rate": 2.265005662514156e-08, "logits/chosen": -0.40261784195899963, "logits/rejected": -0.32133278250694275, "logps/chosen": -227.0263214111328, "logps/rejected": -441.896484375, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": 0.009186742827296257, "rewards/margins": 0.02110801264643669, "rewards/rejected": -0.011921269819140434, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.8312570781426952e-08, "logits/chosen": -0.5871148109436035, "logits/rejected": -0.13973578810691833, "logps/chosen": -156.10244750976562, "logps/rejected": -525.1514892578125, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": -0.004176813177764416, "rewards/margins": 0.00882786326110363, "rewards/rejected": -0.013004678301513195, "step": 50 }, { "epoch": 0.02, "learning_rate": 3.397508493771234e-08, "logits/chosen": -0.4208458960056305, "logits/rejected": -0.3361497223377228, "logps/chosen": -228.4647674560547, "logps/rejected": -296.693603515625, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": -0.004670066758990288, "rewards/margins": 0.027450567111372948, "rewards/rejected": -0.032120633870363235, "step": 60 }, { "epoch": 0.02, "learning_rate": 3.9637599093997736e-08, "logits/chosen": -0.5727721452713013, "logits/rejected": -0.25741398334503174, "logps/chosen": -225.5534210205078, "logps/rejected": -462.3497619628906, "loss": 0.6704, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008383071050047874, "rewards/margins": 0.05840033292770386, "rewards/rejected": -0.050017256289720535, "step": 70 }, { "epoch": 0.03, "learning_rate": 4.530011325028312e-08, "logits/chosen": -0.3984699547290802, "logits/rejected": -0.30298393964767456, "logps/chosen": -353.0005187988281, "logps/rejected": -372.1181640625, "loss": 0.6561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.009889362379908562, "rewards/margins": 0.0897064059972763, "rewards/rejected": -0.07981704920530319, "step": 80 }, { "epoch": 0.03, "learning_rate": 5.096262740656852e-08, "logits/chosen": -0.4686814248561859, "logits/rejected": -0.2592395544052124, "logps/chosen": -178.34634399414062, "logps/rejected": -240.1350860595703, "loss": 0.6415, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.028491392731666565, "rewards/margins": 0.11343902349472046, "rewards/rejected": -0.0849476233124733, "step": 90 }, { "epoch": 0.03, "learning_rate": 5.6625141562853904e-08, "logits/chosen": -0.5295109152793884, "logits/rejected": -0.31285038590431213, "logps/chosen": -157.41445922851562, "logps/rejected": -380.2042541503906, "loss": 0.6247, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02565709315240383, "rewards/margins": 0.15384149551391602, "rewards/rejected": -0.12818440794944763, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -0.5546308755874634, "eval_logits/rejected": -0.2809797525405884, "eval_logps/chosen": -217.99822998046875, "eval_logps/rejected": -416.314453125, "eval_loss": 0.6155002117156982, "eval_rewards/accuracies": 0.8577440977096558, "eval_rewards/chosen": 0.015758171677589417, "eval_rewards/margins": 0.16080892086029053, "eval_rewards/rejected": -0.1450507640838623, "eval_runtime": 536.0447, "eval_samples_per_second": 17.722, "eval_steps_per_second": 0.554, "step": 100 }, { "epoch": 0.04, "learning_rate": 6.22876557191393e-08, "logits/chosen": -0.5584183931350708, "logits/rejected": -0.22632427513599396, "logps/chosen": -146.4309844970703, "logps/rejected": -638.1456909179688, "loss": 0.6046, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.03242674469947815, "rewards/margins": 0.20094385743141174, "rewards/rejected": -0.1685170978307724, "step": 110 }, { "epoch": 0.04, "learning_rate": 6.795016987542468e-08, "logits/chosen": -0.4910295903682709, "logits/rejected": -0.32439225912094116, "logps/chosen": -163.6087646484375, "logps/rejected": -407.0900573730469, "loss": 0.571, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.06128523498773575, "rewards/margins": 0.29128292202949524, "rewards/rejected": -0.2299976795911789, "step": 120 }, { "epoch": 0.04, "learning_rate": 7.361268403171007e-08, "logits/chosen": -0.4110735356807709, "logits/rejected": -0.32707124948501587, "logps/chosen": -222.0219268798828, "logps/rejected": -431.539306640625, "loss": 0.5471, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0644715204834938, "rewards/margins": 0.3729821741580963, "rewards/rejected": -0.3085106611251831, "step": 130 }, { "epoch": 0.05, "learning_rate": 7.927519818799547e-08, "logits/chosen": -0.46890074014663696, "logits/rejected": -0.26065942645072937, "logps/chosen": -150.04249572753906, "logps/rejected": -479.42437744140625, "loss": 0.5274, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.05077038332819939, "rewards/margins": 0.3963143825531006, "rewards/rejected": -0.3455440104007721, "step": 140 }, { "epoch": 0.05, "learning_rate": 8.493771234428086e-08, "logits/chosen": -0.3876637816429138, "logits/rejected": -0.2349829375743866, "logps/chosen": -165.5437774658203, "logps/rejected": -409.16973876953125, "loss": 0.5163, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0761779397726059, "rewards/margins": 0.4856814444065094, "rewards/rejected": -0.4095034599304199, "step": 150 }, { "epoch": 0.05, "learning_rate": 9.060022650056625e-08, "logits/chosen": -0.2624950408935547, "logits/rejected": -0.24308566749095917, "logps/chosen": -177.4712677001953, "logps/rejected": -374.5631103515625, "loss": 0.4876, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.09907428920269012, "rewards/margins": 0.5338638424873352, "rewards/rejected": -0.4347895085811615, "step": 160 }, { "epoch": 0.06, "learning_rate": 9.626274065685163e-08, "logits/chosen": -0.5416995286941528, "logits/rejected": -0.30129164457321167, "logps/chosen": -264.92620849609375, "logps/rejected": -370.44610595703125, "loss": 0.4474, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.10987275838851929, "rewards/margins": 0.7102267146110535, "rewards/rejected": -0.6003538370132446, "step": 170 }, { "epoch": 0.06, "learning_rate": 1.0192525481313703e-07, "logits/chosen": -0.2831200957298279, "logits/rejected": -0.286939412355423, "logps/chosen": -295.0725402832031, "logps/rejected": -505.97198486328125, "loss": 0.4121, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.13076487183570862, "rewards/margins": 0.8437229990959167, "rewards/rejected": -0.7129581570625305, "step": 180 }, { "epoch": 0.06, "learning_rate": 1.0758776896942241e-07, "logits/chosen": -0.4120418131351471, "logits/rejected": -0.34052786231040955, "logps/chosen": -167.15274047851562, "logps/rejected": -423.9737243652344, "loss": 0.3645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.23381829261779785, "rewards/margins": 1.0079195499420166, "rewards/rejected": -0.7741013169288635, "step": 190 }, { "epoch": 0.07, "learning_rate": 1.1325028312570781e-07, "logits/chosen": -0.3162071108818054, "logits/rejected": -0.1932157278060913, "logps/chosen": -229.230224609375, "logps/rejected": -338.18365478515625, "loss": 0.3738, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.0931645929813385, "rewards/margins": 0.9847318530082703, "rewards/rejected": -0.8915673494338989, "step": 200 }, { "epoch": 0.07, "eval_logits/chosen": -0.5596946477890015, "eval_logits/rejected": -0.34637966752052307, "eval_logps/chosen": -216.45716857910156, "eval_logps/rejected": -423.8618469238281, "eval_loss": 0.3506593704223633, "eval_rewards/accuracies": 0.9183501601219177, "eval_rewards/chosen": 0.16986550390720367, "eval_rewards/margins": 1.0696542263031006, "eval_rewards/rejected": -0.8997886776924133, "eval_runtime": 533.8766, "eval_samples_per_second": 17.794, "eval_steps_per_second": 0.556, "step": 200 }, { "epoch": 0.07, "learning_rate": 1.189127972819932e-07, "logits/chosen": -0.46144527196884155, "logits/rejected": -0.34894102811813354, "logps/chosen": -163.28524780273438, "logps/rejected": -568.564453125, "loss": 0.3358, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.31559714674949646, "rewards/margins": 1.2628577947616577, "rewards/rejected": -0.9472605586051941, "step": 210 }, { "epoch": 0.07, "learning_rate": 1.245753114382786e-07, "logits/chosen": -0.5060401558876038, "logits/rejected": -0.34054842591285706, "logps/chosen": -269.3310241699219, "logps/rejected": -343.5320739746094, "loss": 0.3017, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.29406309127807617, "rewards/margins": 1.2682092189788818, "rewards/rejected": -0.9741460680961609, "step": 220 }, { "epoch": 0.08, "learning_rate": 1.3023782559456398e-07, "logits/chosen": -0.4501872956752777, "logits/rejected": -0.23859365284442902, "logps/chosen": -224.9252166748047, "logps/rejected": -449.39617919921875, "loss": 0.2953, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3369618058204651, "rewards/margins": 1.3815174102783203, "rewards/rejected": -1.0445555448532104, "step": 230 }, { "epoch": 0.08, "learning_rate": 1.3590033975084937e-07, "logits/chosen": -0.47853976488113403, "logits/rejected": -0.39643120765686035, "logps/chosen": -213.26168823242188, "logps/rejected": -554.62255859375, "loss": 0.3118, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2273525446653366, "rewards/margins": 1.4735194444656372, "rewards/rejected": -1.2461670637130737, "step": 240 }, { "epoch": 0.08, "learning_rate": 1.4156285390713476e-07, "logits/chosen": -0.47971493005752563, "logits/rejected": -0.3890048563480377, "logps/chosen": -158.33865356445312, "logps/rejected": -404.7588806152344, "loss": 0.2977, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.4578720033168793, "rewards/margins": 1.7195074558258057, "rewards/rejected": -1.261635184288025, "step": 250 }, { "epoch": 0.09, "learning_rate": 1.4722536806342014e-07, "logits/chosen": -0.43633347749710083, "logits/rejected": -0.3528065085411072, "logps/chosen": -227.5246124267578, "logps/rejected": -440.298828125, "loss": 0.2764, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3436822295188904, "rewards/margins": 1.7917091846466064, "rewards/rejected": -1.4480268955230713, "step": 260 }, { "epoch": 0.09, "learning_rate": 1.5288788221970556e-07, "logits/chosen": -0.5370690226554871, "logits/rejected": -0.30900686979293823, "logps/chosen": -203.85116577148438, "logps/rejected": -302.18316650390625, "loss": 0.2499, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.3745507597923279, "rewards/margins": 1.9111206531524658, "rewards/rejected": -1.5365700721740723, "step": 270 }, { "epoch": 0.1, "learning_rate": 1.5855039637599094e-07, "logits/chosen": -0.26729458570480347, "logits/rejected": -0.27476876974105835, "logps/chosen": -171.4789276123047, "logps/rejected": -297.64288330078125, "loss": 0.2523, "rewards/accuracies": 0.9375, "rewards/chosen": 0.38083142042160034, "rewards/margins": 2.2318005561828613, "rewards/rejected": -1.8509695529937744, "step": 280 }, { "epoch": 0.1, "learning_rate": 1.642129105322763e-07, "logits/chosen": -0.4671853482723236, "logits/rejected": -0.21942517161369324, "logps/chosen": -160.75570678710938, "logps/rejected": -419.7854919433594, "loss": 0.227, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.37282440066337585, "rewards/margins": 2.009082794189453, "rewards/rejected": -1.6362583637237549, "step": 290 }, { "epoch": 0.1, "learning_rate": 1.6987542468856172e-07, "logits/chosen": -0.39742714166641235, "logits/rejected": -0.3988519310951233, "logps/chosen": -169.22384643554688, "logps/rejected": -517.7080688476562, "loss": 0.2144, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.38003453612327576, "rewards/margins": 2.3613126277923584, "rewards/rejected": -1.9812781810760498, "step": 300 }, { "epoch": 0.1, "eval_logits/chosen": -0.5335479378700256, "eval_logits/rejected": -0.35367077589035034, "eval_logps/chosen": -214.609130859375, "eval_logps/rejected": -432.8870849609375, "eval_loss": 0.21516987681388855, "eval_rewards/accuracies": 0.939393937587738, "eval_rewards/chosen": 0.35466811060905457, "eval_rewards/margins": 2.156982898712158, "eval_rewards/rejected": -1.8023145198822021, "eval_runtime": 534.1881, "eval_samples_per_second": 17.784, "eval_steps_per_second": 0.556, "step": 300 }, { "epoch": 0.11, "learning_rate": 1.755379388448471e-07, "logits/chosen": -0.5349145531654358, "logits/rejected": -0.31476613879203796, "logps/chosen": -157.59751892089844, "logps/rejected": -413.187744140625, "loss": 0.2146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.36103734374046326, "rewards/margins": 2.2508745193481445, "rewards/rejected": -1.8898370265960693, "step": 310 }, { "epoch": 0.11, "learning_rate": 1.812004530011325e-07, "logits/chosen": -0.27551236748695374, "logits/rejected": -0.30738794803619385, "logps/chosen": -335.4036865234375, "logps/rejected": -335.6308288574219, "loss": 0.1904, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.43626269698143005, "rewards/margins": 2.476463794708252, "rewards/rejected": -2.04020094871521, "step": 320 }, { "epoch": 0.11, "learning_rate": 1.868629671574179e-07, "logits/chosen": -0.44193267822265625, "logits/rejected": -0.31210240721702576, "logps/chosen": -216.2269287109375, "logps/rejected": -514.0025024414062, "loss": 0.1795, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.48537230491638184, "rewards/margins": 2.89229154586792, "rewards/rejected": -2.406919479370117, "step": 330 }, { "epoch": 0.12, "learning_rate": 1.9252548131370327e-07, "logits/chosen": -0.3645581305027008, "logits/rejected": -0.2988941967487335, "logps/chosen": -152.3814697265625, "logps/rejected": -443.4131774902344, "loss": 0.1805, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5916491746902466, "rewards/margins": 2.96580171585083, "rewards/rejected": -2.374152421951294, "step": 340 }, { "epoch": 0.12, "learning_rate": 1.9818799546998865e-07, "logits/chosen": -0.47176966071128845, "logits/rejected": -0.24466374516487122, "logps/chosen": -197.41256713867188, "logps/rejected": -509.7435607910156, "loss": 0.1726, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5358818173408508, "rewards/margins": 2.975222110748291, "rewards/rejected": -2.439340114593506, "step": 350 }, { "epoch": 0.12, "learning_rate": 2.0385050962627407e-07, "logits/chosen": -0.35152795910835266, "logits/rejected": -0.24870070815086365, "logps/chosen": -148.6573486328125, "logps/rejected": -355.4652404785156, "loss": 0.1707, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.39122888445854187, "rewards/margins": 3.404369831085205, "rewards/rejected": -3.013140916824341, "step": 360 }, { "epoch": 0.13, "learning_rate": 2.0951302378255946e-07, "logits/chosen": -0.40495458245277405, "logits/rejected": -0.32589226961135864, "logps/chosen": -162.13912963867188, "logps/rejected": -668.97998046875, "loss": 0.1577, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40158432722091675, "rewards/margins": 3.105182647705078, "rewards/rejected": -2.7035984992980957, "step": 370 }, { "epoch": 0.13, "learning_rate": 2.1517553793884482e-07, "logits/chosen": -0.4868873953819275, "logits/rejected": -0.3262481093406677, "logps/chosen": -176.2294921875, "logps/rejected": -575.412353515625, "loss": 0.1707, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5964416265487671, "rewards/margins": 3.3108437061309814, "rewards/rejected": -2.714401960372925, "step": 380 }, { "epoch": 0.13, "learning_rate": 2.2083805209513023e-07, "logits/chosen": -0.2879076600074768, "logits/rejected": -0.22159016132354736, "logps/chosen": -232.27938842773438, "logps/rejected": -352.0542907714844, "loss": 0.1649, "rewards/accuracies": 0.9375, "rewards/chosen": 0.401996910572052, "rewards/margins": 3.0489699840545654, "rewards/rejected": -2.646973133087158, "step": 390 }, { "epoch": 0.14, "learning_rate": 2.2650056625141562e-07, "logits/chosen": -0.46850457787513733, "logits/rejected": -0.3705076277256012, "logps/chosen": -155.06314086914062, "logps/rejected": -544.2332153320312, "loss": 0.1567, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6337358951568604, "rewards/margins": 3.9728286266326904, "rewards/rejected": -3.33909273147583, "step": 400 }, { "epoch": 0.14, "eval_logits/chosen": -0.49473196268081665, "eval_logits/rejected": -0.33432310819625854, "eval_logps/chosen": -213.37452697753906, "eval_logps/rejected": -444.8309020996094, "eval_loss": 0.14582565426826477, "eval_rewards/accuracies": 0.9553872346878052, "eval_rewards/chosen": 0.47812968492507935, "eval_rewards/margins": 3.474830389022827, "eval_rewards/rejected": -2.9967007637023926, "eval_runtime": 534.1711, "eval_samples_per_second": 17.785, "eval_steps_per_second": 0.556, "step": 400 }, { "epoch": 0.14, "learning_rate": 2.32163080407701e-07, "logits/chosen": -0.6111544370651245, "logits/rejected": -0.21178212761878967, "logps/chosen": -155.8394775390625, "logps/rejected": -440.28302001953125, "loss": 0.1432, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5733445882797241, "rewards/margins": 3.280895233154297, "rewards/rejected": -2.707550525665283, "step": 410 }, { "epoch": 0.14, "learning_rate": 2.378255945639864e-07, "logits/chosen": -0.4618472456932068, "logits/rejected": -0.28504040837287903, "logps/chosen": -173.37403869628906, "logps/rejected": -604.4390869140625, "loss": 0.1498, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6987735033035278, "rewards/margins": 3.8992393016815186, "rewards/rejected": -3.2004661560058594, "step": 420 }, { "epoch": 0.15, "learning_rate": 2.434881087202718e-07, "logits/chosen": -0.5163825154304504, "logits/rejected": -0.208576962351799, "logps/chosen": -137.42552185058594, "logps/rejected": -537.9922485351562, "loss": 0.1189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7794636487960815, "rewards/margins": 3.7585597038269043, "rewards/rejected": -2.9790961742401123, "step": 430 }, { "epoch": 0.15, "learning_rate": 2.491506228765572e-07, "logits/chosen": -0.3670637011528015, "logits/rejected": -0.33582669496536255, "logps/chosen": -150.42408752441406, "logps/rejected": -393.2051696777344, "loss": 0.1238, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5653060674667358, "rewards/margins": 4.209753513336182, "rewards/rejected": -3.6444473266601562, "step": 440 }, { "epoch": 0.15, "learning_rate": 2.548131370328426e-07, "logits/chosen": -0.33617061376571655, "logits/rejected": -0.34240013360977173, "logps/chosen": -193.50643920898438, "logps/rejected": -340.6159973144531, "loss": 0.1525, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19498947262763977, "rewards/margins": 4.642898082733154, "rewards/rejected": -4.447909355163574, "step": 450 }, { "epoch": 0.16, "learning_rate": 2.6047565118912797e-07, "logits/chosen": -0.3010881841182709, "logits/rejected": -0.23528370261192322, "logps/chosen": -219.30038452148438, "logps/rejected": -372.89617919921875, "loss": 0.1187, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.49272117018699646, "rewards/margins": 4.173504829406738, "rewards/rejected": -3.680783748626709, "step": 460 }, { "epoch": 0.16, "learning_rate": 2.6613816534541335e-07, "logits/chosen": -0.22437143325805664, "logits/rejected": -0.24364586174488068, "logps/chosen": -290.31109619140625, "logps/rejected": -244.8562469482422, "loss": 0.1161, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.572048544883728, "rewards/margins": 4.629483222961426, "rewards/rejected": -4.057435035705566, "step": 470 }, { "epoch": 0.16, "learning_rate": 2.7180067950169874e-07, "logits/chosen": -0.48673558235168457, "logits/rejected": -0.15762242674827576, "logps/chosen": -153.439697265625, "logps/rejected": -538.0169067382812, "loss": 0.1069, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5159075856208801, "rewards/margins": 4.098017692565918, "rewards/rejected": -3.5821099281311035, "step": 480 }, { "epoch": 0.17, "learning_rate": 2.7746319365798413e-07, "logits/chosen": -0.4696560502052307, "logits/rejected": -0.2102680504322052, "logps/chosen": -153.96446228027344, "logps/rejected": -438.740478515625, "loss": 0.1124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6043912172317505, "rewards/margins": 4.742498397827148, "rewards/rejected": -4.1381072998046875, "step": 490 }, { "epoch": 0.17, "learning_rate": 2.831257078142695e-07, "logits/chosen": -0.2923361361026764, "logits/rejected": -0.25716161727905273, "logps/chosen": -275.92779541015625, "logps/rejected": -258.07257080078125, "loss": 0.1121, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5389481782913208, "rewards/margins": 4.387238025665283, "rewards/rejected": -3.8482894897460938, "step": 500 }, { "epoch": 0.17, "eval_logits/chosen": -0.4604286551475525, "eval_logits/rejected": -0.3020128309726715, "eval_logps/chosen": -212.82911682128906, "eval_logps/rejected": -456.2887878417969, "eval_loss": 0.12496975064277649, "eval_rewards/accuracies": 0.9688552021980286, "eval_rewards/chosen": 0.5326722264289856, "eval_rewards/margins": 4.675156593322754, "eval_rewards/rejected": -4.142484664916992, "eval_runtime": 533.9623, "eval_samples_per_second": 17.792, "eval_steps_per_second": 0.556, "step": 500 }, { "epoch": 0.17, "learning_rate": 2.887882219705549e-07, "logits/chosen": -0.10640861093997955, "logits/rejected": -0.2690298855304718, "logps/chosen": -255.36923217773438, "logps/rejected": -559.5323486328125, "loss": 0.1235, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6311744451522827, "rewards/margins": 4.715970516204834, "rewards/rejected": -4.084795951843262, "step": 510 }, { "epoch": 0.18, "learning_rate": 2.944507361268403e-07, "logits/chosen": -0.2766628563404083, "logits/rejected": -0.26869791746139526, "logps/chosen": -161.83465576171875, "logps/rejected": -482.41937255859375, "loss": 0.104, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4263280928134918, "rewards/margins": 4.6483354568481445, "rewards/rejected": -4.222007751464844, "step": 520 }, { "epoch": 0.18, "learning_rate": 3.001132502831257e-07, "logits/chosen": -0.45713695883750916, "logits/rejected": -0.31391245126724243, "logps/chosen": -160.49148559570312, "logps/rejected": -547.5196533203125, "loss": 0.117, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6843430399894714, "rewards/margins": 5.339869022369385, "rewards/rejected": -4.655526161193848, "step": 530 }, { "epoch": 0.18, "learning_rate": 3.057757644394111e-07, "logits/chosen": -0.3268749415874481, "logits/rejected": -0.2451901137828827, "logps/chosen": -152.80709838867188, "logps/rejected": -378.86688232421875, "loss": 0.0973, "rewards/accuracies": 1.0, "rewards/chosen": 0.7984550595283508, "rewards/margins": 5.9152631759643555, "rewards/rejected": -5.116808891296387, "step": 540 }, { "epoch": 0.19, "learning_rate": 3.114382785956965e-07, "logits/chosen": -0.4358833432197571, "logits/rejected": -0.1794118732213974, "logps/chosen": -162.8441162109375, "logps/rejected": -417.78143310546875, "loss": 0.0986, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6562263369560242, "rewards/margins": 4.775213718414307, "rewards/rejected": -4.118987083435059, "step": 550 }, { "epoch": 0.19, "learning_rate": 3.171007927519819e-07, "logits/chosen": -0.2012094259262085, "logits/rejected": -0.2053249329328537, "logps/chosen": -269.29473876953125, "logps/rejected": -383.81781005859375, "loss": 0.1381, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.37394028902053833, "rewards/margins": 4.884980201721191, "rewards/rejected": -4.511040687561035, "step": 560 }, { "epoch": 0.19, "learning_rate": 3.227633069082673e-07, "logits/chosen": -0.3572072386741638, "logits/rejected": -0.2750917375087738, "logps/chosen": -310.8538513183594, "logps/rejected": -299.1601867675781, "loss": 0.0881, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.5027602910995483, "rewards/margins": 5.539792060852051, "rewards/rejected": -5.037032127380371, "step": 570 }, { "epoch": 0.2, "learning_rate": 3.284258210645526e-07, "logits/chosen": -0.34530162811279297, "logits/rejected": -0.24350178241729736, "logps/chosen": -274.55889892578125, "logps/rejected": -433.11865234375, "loss": 0.1137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7184306979179382, "rewards/margins": 5.7707953453063965, "rewards/rejected": -5.052364826202393, "step": 580 }, { "epoch": 0.2, "learning_rate": 3.34088335220838e-07, "logits/chosen": -0.4375142455101013, "logits/rejected": -0.3137005865573883, "logps/chosen": -174.62481689453125, "logps/rejected": -340.85662841796875, "loss": 0.1175, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6618272066116333, "rewards/margins": 6.430197715759277, "rewards/rejected": -5.768371105194092, "step": 590 }, { "epoch": 0.2, "learning_rate": 3.3975084937712344e-07, "logits/chosen": -0.2826346457004547, "logits/rejected": -0.32238835096359253, "logps/chosen": -277.5734558105469, "logps/rejected": -598.1384887695312, "loss": 0.1003, "rewards/accuracies": 1.0, "rewards/chosen": 0.5033386945724487, "rewards/margins": 5.562026023864746, "rewards/rejected": -5.058687686920166, "step": 600 }, { "epoch": 0.2, "eval_logits/chosen": -0.46434447169303894, "eval_logits/rejected": -0.32271096110343933, "eval_logps/chosen": -214.4120635986328, "eval_logps/rejected": -469.2868347167969, "eval_loss": 0.09264827519655228, "eval_rewards/accuracies": 0.9696969985961914, "eval_rewards/chosen": 0.3743777275085449, "eval_rewards/margins": 5.816666603088379, "eval_rewards/rejected": -5.442288398742676, "eval_runtime": 534.4221, "eval_samples_per_second": 17.776, "eval_steps_per_second": 0.556, "step": 600 }, { "epoch": 0.21, "learning_rate": 3.454133635334088e-07, "logits/chosen": -0.19149485230445862, "logits/rejected": -0.27644044160842896, "logps/chosen": -210.48367309570312, "logps/rejected": -420.9043884277344, "loss": 0.1004, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1815412938594818, "rewards/margins": 6.021714210510254, "rewards/rejected": -5.84017276763916, "step": 610 }, { "epoch": 0.21, "learning_rate": 3.510758776896942e-07, "logits/chosen": -0.43977561593055725, "logits/rejected": -0.25845038890838623, "logps/chosen": -282.4232177734375, "logps/rejected": -518.679931640625, "loss": 0.1051, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23226213455200195, "rewards/margins": 5.043756008148193, "rewards/rejected": -4.811493873596191, "step": 620 }, { "epoch": 0.21, "learning_rate": 3.567383918459796e-07, "logits/chosen": -0.13735656440258026, "logits/rejected": -0.3313857913017273, "logps/chosen": -261.3982849121094, "logps/rejected": -258.78741455078125, "loss": 0.0984, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.39616602659225464, "rewards/margins": 6.307466983795166, "rewards/rejected": -5.9113006591796875, "step": 630 }, { "epoch": 0.22, "learning_rate": 3.62400906002265e-07, "logits/chosen": -0.2772436738014221, "logits/rejected": -0.45146340131759644, "logps/chosen": -224.22738647460938, "logps/rejected": -504.8958435058594, "loss": 0.0887, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6622092127799988, "rewards/margins": 6.898531436920166, "rewards/rejected": -6.236320972442627, "step": 640 }, { "epoch": 0.22, "learning_rate": 3.6806342015855037e-07, "logits/chosen": -0.09807883948087692, "logits/rejected": -0.27403968572616577, "logps/chosen": -212.4461669921875, "logps/rejected": -353.0113220214844, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 0.7303094267845154, "rewards/margins": 7.04489803314209, "rewards/rejected": -6.314589023590088, "step": 650 }, { "epoch": 0.22, "learning_rate": 3.737259343148358e-07, "logits/chosen": -0.37181025743484497, "logits/rejected": -0.24910902976989746, "logps/chosen": -168.32766723632812, "logps/rejected": -494.0104064941406, "loss": 0.0835, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4270142614841461, "rewards/margins": 6.579292297363281, "rewards/rejected": -6.152278423309326, "step": 660 }, { "epoch": 0.23, "learning_rate": 3.7938844847112115e-07, "logits/chosen": -0.1960335671901703, "logits/rejected": -0.38082757592201233, "logps/chosen": -201.34225463867188, "logps/rejected": -564.0484619140625, "loss": 0.3762, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2771373391151428, "rewards/margins": 6.862439155578613, "rewards/rejected": -6.585302829742432, "step": 670 }, { "epoch": 0.23, "learning_rate": 3.8505096262740653e-07, "logits/chosen": -0.2186754196882248, "logits/rejected": -0.2299749106168747, "logps/chosen": -166.9310302734375, "logps/rejected": -507.5357360839844, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 0.4309958517551422, "rewards/margins": 6.799513816833496, "rewards/rejected": -6.368517875671387, "step": 680 }, { "epoch": 0.23, "learning_rate": 3.907134767836919e-07, "logits/chosen": -0.3052781820297241, "logits/rejected": -0.3243364989757538, "logps/chosen": -248.62582397460938, "logps/rejected": -506.7079162597656, "loss": 0.108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4183991849422455, "rewards/margins": 6.785237789154053, "rewards/rejected": -6.366837501525879, "step": 690 }, { "epoch": 0.24, "learning_rate": 3.963759909399773e-07, "logits/chosen": -0.449028879404068, "logits/rejected": -0.26317834854125977, "logps/chosen": -199.8858642578125, "logps/rejected": -524.5985107421875, "loss": 0.0602, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6900192499160767, "rewards/margins": 7.143365383148193, "rewards/rejected": -6.453346252441406, "step": 700 }, { "epoch": 0.24, "eval_logits/chosen": -0.44600772857666016, "eval_logits/rejected": -0.31340694427490234, "eval_logps/chosen": -214.433837890625, "eval_logps/rejected": -481.19061279296875, "eval_loss": 0.0768735408782959, "eval_rewards/accuracies": 0.9739057421684265, "eval_rewards/chosen": 0.37219953536987305, "eval_rewards/margins": 7.004868030548096, "eval_rewards/rejected": -6.632668972015381, "eval_runtime": 535.6596, "eval_samples_per_second": 17.735, "eval_steps_per_second": 0.554, "step": 700 }, { "epoch": 0.24, "learning_rate": 4.0203850509626275e-07, "logits/chosen": -0.2162231206893921, "logits/rejected": -0.2548070251941681, "logps/chosen": -207.8511962890625, "logps/rejected": -362.37957763671875, "loss": 0.0891, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6352172493934631, "rewards/margins": 6.503669738769531, "rewards/rejected": -5.8684515953063965, "step": 710 }, { "epoch": 0.24, "learning_rate": 4.0770101925254814e-07, "logits/chosen": -0.28778719902038574, "logits/rejected": -0.20617246627807617, "logps/chosen": -178.68609619140625, "logps/rejected": -396.4261169433594, "loss": 0.0832, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3291332423686981, "rewards/margins": 6.630867958068848, "rewards/rejected": -6.301734447479248, "step": 720 }, { "epoch": 0.25, "learning_rate": 4.133635334088335e-07, "logits/chosen": -0.366641104221344, "logits/rejected": -0.2987636625766754, "logps/chosen": -238.81643676757812, "logps/rejected": -444.86602783203125, "loss": 0.0638, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6036492586135864, "rewards/margins": 6.993215084075928, "rewards/rejected": -6.389565467834473, "step": 730 }, { "epoch": 0.25, "learning_rate": 4.190260475651189e-07, "logits/chosen": -0.32933568954467773, "logits/rejected": -0.20656809210777283, "logps/chosen": -244.2524871826172, "logps/rejected": -394.734619140625, "loss": 0.0665, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.012406098656356335, "rewards/margins": 6.4013800621032715, "rewards/rejected": -6.388974189758301, "step": 740 }, { "epoch": 0.25, "learning_rate": 4.2468856172140424e-07, "logits/chosen": -0.4600093960762024, "logits/rejected": -0.14787457883358002, "logps/chosen": -151.00482177734375, "logps/rejected": -516.2240600585938, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": 0.5669361352920532, "rewards/margins": 7.541558265686035, "rewards/rejected": -6.974621772766113, "step": 750 }, { "epoch": 0.26, "learning_rate": 4.3035107587768963e-07, "logits/chosen": -0.30077478289604187, "logits/rejected": -0.3066253066062927, "logps/chosen": -165.01199340820312, "logps/rejected": -367.2582702636719, "loss": 0.0795, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24088886380195618, "rewards/margins": 7.266399383544922, "rewards/rejected": -7.025511741638184, "step": 760 }, { "epoch": 0.26, "learning_rate": 4.3601359003397507e-07, "logits/chosen": -0.20812579989433289, "logits/rejected": -0.25792360305786133, "logps/chosen": -281.2358093261719, "logps/rejected": -478.462646484375, "loss": 0.0615, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5319908857345581, "rewards/margins": 6.828255653381348, "rewards/rejected": -6.296264171600342, "step": 770 }, { "epoch": 0.27, "learning_rate": 4.4167610419026046e-07, "logits/chosen": -0.38004809617996216, "logits/rejected": -0.30805715918540955, "logps/chosen": -273.6496887207031, "logps/rejected": -588.7950439453125, "loss": 0.0697, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1273520290851593, "rewards/margins": 8.646838188171387, "rewards/rejected": -8.519486427307129, "step": 780 }, { "epoch": 0.27, "learning_rate": 4.4733861834654585e-07, "logits/chosen": -0.35582059621810913, "logits/rejected": -0.0884096622467041, "logps/chosen": -209.0087890625, "logps/rejected": -355.423583984375, "loss": 0.0646, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.37765970826148987, "rewards/margins": 8.372190475463867, "rewards/rejected": -7.99453067779541, "step": 790 }, { "epoch": 0.27, "learning_rate": 4.5300113250283123e-07, "logits/chosen": -0.12763236463069916, "logits/rejected": -0.2596771717071533, "logps/chosen": -225.40371704101562, "logps/rejected": -424.83282470703125, "loss": 0.0584, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.32599931955337524, "rewards/margins": 6.853158473968506, "rewards/rejected": -6.527158260345459, "step": 800 }, { "epoch": 0.27, "eval_logits/chosen": -0.4234791100025177, "eval_logits/rejected": -0.2856813669204712, "eval_logps/chosen": -214.1188507080078, "eval_logps/rejected": -491.4772644042969, "eval_loss": 0.06379322707653046, "eval_rewards/accuracies": 0.9806397557258606, "eval_rewards/chosen": 0.4036966860294342, "eval_rewards/margins": 8.065031051635742, "eval_rewards/rejected": -7.661334037780762, "eval_runtime": 534.8523, "eval_samples_per_second": 17.762, "eval_steps_per_second": 0.555, "step": 800 }, { "epoch": 0.28, "learning_rate": 4.586636466591166e-07, "logits/chosen": -0.2669990658760071, "logits/rejected": -0.2504034638404846, "logps/chosen": -225.17123413085938, "logps/rejected": -559.6758422851562, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 0.4328089654445648, "rewards/margins": 8.280193328857422, "rewards/rejected": -7.847384452819824, "step": 810 }, { "epoch": 0.28, "learning_rate": 4.64326160815402e-07, "logits/chosen": -0.3878735601902008, "logits/rejected": -0.23621630668640137, "logps/chosen": -318.3006591796875, "logps/rejected": -365.98663330078125, "loss": 0.0559, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3591436743736267, "rewards/margins": 8.013032913208008, "rewards/rejected": -7.653888702392578, "step": 820 }, { "epoch": 0.28, "learning_rate": 4.6998867497168745e-07, "logits/chosen": -0.36454930901527405, "logits/rejected": -0.19936661422252655, "logps/chosen": -141.88204956054688, "logps/rejected": -520.3352661132812, "loss": 0.0614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.43681102991104126, "rewards/margins": 7.6321611404418945, "rewards/rejected": -7.19534969329834, "step": 830 }, { "epoch": 0.29, "learning_rate": 4.756511891279728e-07, "logits/chosen": -0.30462154746055603, "logits/rejected": -0.2487163543701172, "logps/chosen": -217.2904052734375, "logps/rejected": -438.473388671875, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 0.4799513816833496, "rewards/margins": 8.887194633483887, "rewards/rejected": -8.407242774963379, "step": 840 }, { "epoch": 0.29, "learning_rate": 4.813137032842582e-07, "logits/chosen": -0.3396126925945282, "logits/rejected": -0.3047598600387573, "logps/chosen": -230.8419647216797, "logps/rejected": -662.5994873046875, "loss": 0.0581, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3894551396369934, "rewards/margins": 7.8841681480407715, "rewards/rejected": -7.494712829589844, "step": 850 }, { "epoch": 0.29, "learning_rate": 4.869762174405436e-07, "logits/chosen": -0.254059374332428, "logits/rejected": -0.19831958413124084, "logps/chosen": -206.7389373779297, "logps/rejected": -497.86572265625, "loss": 0.0492, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.42382168769836426, "rewards/margins": 8.420679092407227, "rewards/rejected": -7.996856689453125, "step": 860 }, { "epoch": 0.3, "learning_rate": 4.92638731596829e-07, "logits/chosen": -0.4207886755466461, "logits/rejected": -0.2620916962623596, "logps/chosen": -270.48333740234375, "logps/rejected": -546.7339477539062, "loss": 0.0496, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6875104308128357, "rewards/margins": 7.988639831542969, "rewards/rejected": -7.301129341125488, "step": 870 }, { "epoch": 0.3, "learning_rate": 4.983012457531144e-07, "logits/chosen": -0.42369580268859863, "logits/rejected": -0.2931245267391205, "logps/chosen": -141.2826690673828, "logps/rejected": -427.2748107910156, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 0.32747527956962585, "rewards/margins": 8.682317733764648, "rewards/rejected": -8.354843139648438, "step": 880 }, { "epoch": 0.3, "learning_rate": 4.995593604431575e-07, "logits/chosen": -0.2628129720687866, "logits/rejected": -0.3006640076637268, "logps/chosen": -214.7023162841797, "logps/rejected": -344.90130615234375, "loss": 0.0544, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3179890215396881, "rewards/margins": 7.346138000488281, "rewards/rejected": -7.028149604797363, "step": 890 }, { "epoch": 0.31, "learning_rate": 4.989298753619539e-07, "logits/chosen": -0.2285783588886261, "logits/rejected": -0.1876908242702484, "logps/chosen": -165.02896118164062, "logps/rejected": -429.82244873046875, "loss": 0.0555, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6583096385002136, "rewards/margins": 9.801607131958008, "rewards/rejected": -9.143298149108887, "step": 900 }, { "epoch": 0.31, "eval_logits/chosen": -0.42704111337661743, "eval_logits/rejected": -0.29144129157066345, "eval_logps/chosen": -213.87449645996094, "eval_logps/rejected": -497.19091796875, "eval_loss": 0.055654142051935196, "eval_rewards/accuracies": 0.9848484992980957, "eval_rewards/chosen": 0.4281308352947235, "eval_rewards/margins": 8.6608304977417, "eval_rewards/rejected": -8.232699394226074, "eval_runtime": 534.1249, "eval_samples_per_second": 17.786, "eval_steps_per_second": 0.556, "step": 900 }, { "epoch": 0.31, "learning_rate": 4.983003902807503e-07, "logits/chosen": -0.38017234206199646, "logits/rejected": -0.15586760640144348, "logps/chosen": -248.2068634033203, "logps/rejected": -337.6114196777344, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 0.473585307598114, "rewards/margins": 8.148943901062012, "rewards/rejected": -7.67535924911499, "step": 910 }, { "epoch": 0.31, "learning_rate": 4.976709051995467e-07, "logits/chosen": -0.4287186563014984, "logits/rejected": -0.29633355140686035, "logps/chosen": -159.91883850097656, "logps/rejected": -441.87530517578125, "loss": 0.0428, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1576082408428192, "rewards/margins": 9.318530082702637, "rewards/rejected": -9.160922050476074, "step": 920 }, { "epoch": 0.32, "learning_rate": 4.970414201183432e-07, "logits/chosen": -0.39709603786468506, "logits/rejected": -0.40310096740722656, "logps/chosen": -200.92210388183594, "logps/rejected": -528.8557739257812, "loss": 0.0444, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6206746101379395, "rewards/margins": 10.767847061157227, "rewards/rejected": -10.147174835205078, "step": 930 }, { "epoch": 0.32, "learning_rate": 4.964119350371396e-07, "logits/chosen": -0.37714919447898865, "logits/rejected": -0.29318445920944214, "logps/chosen": -157.73971557617188, "logps/rejected": -497.69580078125, "loss": 0.048, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17726925015449524, "rewards/margins": 9.912737846374512, "rewards/rejected": -9.735468864440918, "step": 940 }, { "epoch": 0.32, "learning_rate": 4.95782449955936e-07, "logits/chosen": -0.4353370666503906, "logits/rejected": -0.36344224214553833, "logps/chosen": -200.83201599121094, "logps/rejected": -433.704345703125, "loss": 0.0632, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2186012715101242, "rewards/margins": 9.237771987915039, "rewards/rejected": -9.019169807434082, "step": 950 }, { "epoch": 0.33, "learning_rate": 4.951529648747325e-07, "logits/chosen": -0.26210442185401917, "logits/rejected": -0.2983551621437073, "logps/chosen": -256.0863342285156, "logps/rejected": -482.82720947265625, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 0.72393399477005, "rewards/margins": 9.906654357910156, "rewards/rejected": -9.182720184326172, "step": 960 }, { "epoch": 0.33, "learning_rate": 4.945234797935289e-07, "logits/chosen": -0.4766682982444763, "logits/rejected": -0.3060424327850342, "logps/chosen": -147.65628051757812, "logps/rejected": -638.6217041015625, "loss": 0.0502, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7331063151359558, "rewards/margins": 9.940909385681152, "rewards/rejected": -9.207803726196289, "step": 970 }, { "epoch": 0.33, "learning_rate": 4.938939947123252e-07, "logits/chosen": -0.4401000142097473, "logits/rejected": -0.35368892550468445, "logps/chosen": -267.5409240722656, "logps/rejected": -607.5504760742188, "loss": 0.0598, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.45719894766807556, "rewards/margins": 10.04234504699707, "rewards/rejected": -9.585145950317383, "step": 980 }, { "epoch": 0.34, "learning_rate": 4.932645096311217e-07, "logits/chosen": -0.35212796926498413, "logits/rejected": -0.3259442448616028, "logps/chosen": -137.34100341796875, "logps/rejected": -587.4320068359375, "loss": 0.0397, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4701482653617859, "rewards/margins": 10.261445045471191, "rewards/rejected": -9.791296005249023, "step": 990 }, { "epoch": 0.34, "learning_rate": 4.926350245499181e-07, "logits/chosen": -0.41194573044776917, "logits/rejected": -0.33143311738967896, "logps/chosen": -220.0157928466797, "logps/rejected": -441.72650146484375, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 0.5271693468093872, "rewards/margins": 10.389043807983398, "rewards/rejected": -9.8618745803833, "step": 1000 }, { "epoch": 0.34, "eval_logits/chosen": -0.4739537239074707, "eval_logits/rejected": -0.34909966588020325, "eval_logps/chosen": -214.11021423339844, "eval_logps/rejected": -512.6325073242188, "eval_loss": 0.04723240062594414, "eval_rewards/accuracies": 0.9890572428703308, "eval_rewards/chosen": 0.40456104278564453, "eval_rewards/margins": 10.181422233581543, "eval_rewards/rejected": -9.776861190795898, "eval_runtime": 534.2965, "eval_samples_per_second": 17.78, "eval_steps_per_second": 0.556, "step": 1000 }, { "epoch": 0.34, "learning_rate": 4.920055394687146e-07, "logits/chosen": -0.2540286183357239, "logits/rejected": -0.1765807569026947, "logps/chosen": -236.892578125, "logps/rejected": -436.7119140625, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 0.40675920248031616, "rewards/margins": 8.76603889465332, "rewards/rejected": -8.359280586242676, "step": 1010 }, { "epoch": 0.35, "learning_rate": 4.91376054387511e-07, "logits/chosen": -0.48013514280319214, "logits/rejected": -0.20226307213306427, "logps/chosen": -139.52383422851562, "logps/rejected": -458.13336181640625, "loss": 0.0312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.47215309739112854, "rewards/margins": 9.695752143859863, "rewards/rejected": -9.223600387573242, "step": 1020 }, { "epoch": 0.35, "learning_rate": 4.907465693063074e-07, "logits/chosen": -0.43698740005493164, "logits/rejected": -0.334372341632843, "logps/chosen": -144.9458770751953, "logps/rejected": -446.1951599121094, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": 0.20389041304588318, "rewards/margins": 10.303365707397461, "rewards/rejected": -10.09947395324707, "step": 1030 }, { "epoch": 0.35, "learning_rate": 4.901170842251039e-07, "logits/chosen": -0.2944476008415222, "logits/rejected": -0.41581621766090393, "logps/chosen": -315.84539794921875, "logps/rejected": -602.2779541015625, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 0.03368399664759636, "rewards/margins": 8.769097328186035, "rewards/rejected": -8.735413551330566, "step": 1040 }, { "epoch": 0.36, "learning_rate": 4.894875991439003e-07, "logits/chosen": -0.3834468126296997, "logits/rejected": -0.40807825326919556, "logps/chosen": -298.45025634765625, "logps/rejected": -444.386474609375, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 0.2219455987215042, "rewards/margins": 10.802868843078613, "rewards/rejected": -10.580923080444336, "step": 1050 }, { "epoch": 0.36, "learning_rate": 4.888581140626966e-07, "logits/chosen": -0.4633941650390625, "logits/rejected": -0.33421996235847473, "logps/chosen": -219.28359985351562, "logps/rejected": -518.02880859375, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 0.21625420451164246, "rewards/margins": 11.886049270629883, "rewards/rejected": -11.669794082641602, "step": 1060 }, { "epoch": 0.36, "learning_rate": 4.882286289814931e-07, "logits/chosen": -0.4685365557670593, "logits/rejected": -0.29580843448638916, "logps/chosen": -260.9378662109375, "logps/rejected": -407.06671142578125, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 0.01862364448606968, "rewards/margins": 9.68779468536377, "rewards/rejected": -9.669171333312988, "step": 1070 }, { "epoch": 0.37, "learning_rate": 4.875991439002896e-07, "logits/chosen": -0.363955020904541, "logits/rejected": -0.36186718940734863, "logps/chosen": -220.4456024169922, "logps/rejected": -412.52459716796875, "loss": 0.0432, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0067754522897303104, "rewards/margins": 11.263066291809082, "rewards/rejected": -11.269842147827148, "step": 1080 }, { "epoch": 0.37, "learning_rate": 4.869696588190859e-07, "logits/chosen": -0.10451197624206543, "logits/rejected": -0.3102570176124573, "logps/chosen": -226.8278045654297, "logps/rejected": -317.9495849609375, "loss": 0.0332, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4600624144077301, "rewards/margins": 11.063528060913086, "rewards/rejected": -10.603464126586914, "step": 1090 }, { "epoch": 0.37, "learning_rate": 4.863401737378824e-07, "logits/chosen": -0.37099361419677734, "logits/rejected": -0.42316970229148865, "logps/chosen": -228.94638061523438, "logps/rejected": -532.2642822265625, "loss": 0.0673, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.45586976408958435, "rewards/margins": 11.565584182739258, "rewards/rejected": -11.10971450805664, "step": 1100 }, { "epoch": 0.37, "eval_logits/chosen": -0.49547380208969116, "eval_logits/rejected": -0.3771668076515198, "eval_logps/chosen": -214.8733367919922, "eval_logps/rejected": -525.1151733398438, "eval_loss": 0.038325611501932144, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 0.3282488286495209, "eval_rewards/margins": 11.353365898132324, "eval_rewards/rejected": -11.025116920471191, "eval_runtime": 534.2264, "eval_samples_per_second": 17.783, "eval_steps_per_second": 0.556, "step": 1100 }, { "epoch": 0.38, "learning_rate": 4.857106886566788e-07, "logits/chosen": -0.5866730213165283, "logits/rejected": -0.38727062940597534, "logps/chosen": -146.2091827392578, "logps/rejected": -601.9672241210938, "loss": 0.0324, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2949695289134979, "rewards/margins": 12.207448959350586, "rewards/rejected": -11.912480354309082, "step": 1110 }, { "epoch": 0.38, "learning_rate": 4.850812035754753e-07, "logits/chosen": -0.5000025629997253, "logits/rejected": -0.406272828578949, "logps/chosen": -168.3660125732422, "logps/rejected": -520.6743774414062, "loss": 0.0228, "rewards/accuracies": 1.0, "rewards/chosen": 0.1513797789812088, "rewards/margins": 12.158323287963867, "rewards/rejected": -12.006945610046387, "step": 1120 }, { "epoch": 0.38, "learning_rate": 4.844517184942716e-07, "logits/chosen": -0.29790449142456055, "logits/rejected": -0.2714352011680603, "logps/chosen": -246.36270141601562, "logps/rejected": -351.27581787109375, "loss": 0.0229, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07826076447963715, "rewards/margins": 11.058741569519043, "rewards/rejected": -11.137002944946289, "step": 1130 }, { "epoch": 0.39, "learning_rate": 4.838222334130681e-07, "logits/chosen": -0.428137868642807, "logits/rejected": -0.32988542318344116, "logps/chosen": -209.32778930664062, "logps/rejected": -507.0218811035156, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -0.1103137731552124, "rewards/margins": 14.365577697753906, "rewards/rejected": -14.475893020629883, "step": 1140 }, { "epoch": 0.39, "learning_rate": 4.831927483318645e-07, "logits/chosen": -0.4625795781612396, "logits/rejected": -0.2968235909938812, "logps/chosen": -231.2760467529297, "logps/rejected": -357.04925537109375, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 0.3540777564048767, "rewards/margins": 12.226369857788086, "rewards/rejected": -11.872291564941406, "step": 1150 }, { "epoch": 0.39, "learning_rate": 4.82563263250661e-07, "logits/chosen": -0.30490046739578247, "logits/rejected": -0.34345632791519165, "logps/chosen": -294.1733703613281, "logps/rejected": -462.56475830078125, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -0.10383695363998413, "rewards/margins": 12.347694396972656, "rewards/rejected": -12.451530456542969, "step": 1160 }, { "epoch": 0.4, "learning_rate": 4.819337781694573e-07, "logits/chosen": -0.23023287951946259, "logits/rejected": -0.3742315471172333, "logps/chosen": -165.6087188720703, "logps/rejected": -501.8934020996094, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": 0.4085695743560791, "rewards/margins": 10.90626335144043, "rewards/rejected": -10.497692108154297, "step": 1170 }, { "epoch": 0.4, "learning_rate": 4.813042930882538e-07, "logits/chosen": -0.2907789945602417, "logits/rejected": -0.3832870423793793, "logps/chosen": -211.4516143798828, "logps/rejected": -643.9139404296875, "loss": 0.0248, "rewards/accuracies": 1.0, "rewards/chosen": 0.19110803306102753, "rewards/margins": 10.796311378479004, "rewards/rejected": -10.605203628540039, "step": 1180 }, { "epoch": 0.4, "learning_rate": 4.806748080070503e-07, "logits/chosen": -0.3396281898021698, "logits/rejected": -0.31263408064842224, "logps/chosen": -246.7003173828125, "logps/rejected": -401.945068359375, "loss": 0.0391, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.22037991881370544, "rewards/margins": 11.237123489379883, "rewards/rejected": -11.016744613647461, "step": 1190 }, { "epoch": 0.41, "learning_rate": 4.800453229258466e-07, "logits/chosen": -0.3415890634059906, "logits/rejected": -0.29142314195632935, "logps/chosen": -302.6983337402344, "logps/rejected": -481.2146911621094, "loss": 0.031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.06409353017807007, "rewards/margins": 10.98745346069336, "rewards/rejected": -11.051546096801758, "step": 1200 }, { "epoch": 0.41, "eval_logits/chosen": -0.5142234563827515, "eval_logits/rejected": -0.4115402102470398, "eval_logps/chosen": -216.23255920410156, "eval_logps/rejected": -533.3251342773438, "eval_loss": 0.03254423290491104, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.19232694804668427, "eval_rewards/margins": 12.038444519042969, "eval_rewards/rejected": -11.84611701965332, "eval_runtime": 534.5505, "eval_samples_per_second": 17.772, "eval_steps_per_second": 0.556, "step": 1200 }, { "epoch": 0.41, "learning_rate": 4.79415837844643e-07, "logits/chosen": -0.3845829367637634, "logits/rejected": -0.26644858717918396, "logps/chosen": -220.68655395507812, "logps/rejected": -353.06231689453125, "loss": 0.0337, "rewards/accuracies": 1.0, "rewards/chosen": 0.051003001630306244, "rewards/margins": 13.137338638305664, "rewards/rejected": -13.086336135864258, "step": 1210 }, { "epoch": 0.41, "learning_rate": 4.787863527634395e-07, "logits/chosen": -0.3787842392921448, "logits/rejected": -0.36652541160583496, "logps/chosen": -184.9332733154297, "logps/rejected": -610.3102416992188, "loss": 0.0224, "rewards/accuracies": 1.0, "rewards/chosen": -0.10748779773712158, "rewards/margins": 14.117230415344238, "rewards/rejected": -14.224716186523438, "step": 1220 }, { "epoch": 0.42, "learning_rate": 4.781568676822359e-07, "logits/chosen": -0.4774065613746643, "logits/rejected": -0.45987778902053833, "logps/chosen": -157.17152404785156, "logps/rejected": -730.0006713867188, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": 0.134259432554245, "rewards/margins": 11.59187126159668, "rewards/rejected": -11.457612991333008, "step": 1230 }, { "epoch": 0.42, "learning_rate": 4.775273826010323e-07, "logits/chosen": -0.31929469108581543, "logits/rejected": -0.29284873604774475, "logps/chosen": -235.30178833007812, "logps/rejected": -572.0767822265625, "loss": 0.0205, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2410486936569214, "rewards/margins": 12.342641830444336, "rewards/rejected": -12.101593971252441, "step": 1240 }, { "epoch": 0.42, "learning_rate": 4.768978975198288e-07, "logits/chosen": -0.42426449060440063, "logits/rejected": -0.25280293822288513, "logps/chosen": -217.6312255859375, "logps/rejected": -699.5114135742188, "loss": 0.0217, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.030554641038179398, "rewards/margins": 10.491806030273438, "rewards/rejected": -10.461252212524414, "step": 1250 }, { "epoch": 0.43, "learning_rate": 4.762684124386252e-07, "logits/chosen": -0.3115352392196655, "logits/rejected": -0.37043672800064087, "logps/chosen": -233.858154296875, "logps/rejected": -679.0820922851562, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 0.3190721273422241, "rewards/margins": 13.668050765991211, "rewards/rejected": -13.348980903625488, "step": 1260 }, { "epoch": 0.43, "learning_rate": 4.756389273574216e-07, "logits/chosen": -0.38643261790275574, "logits/rejected": -0.34944480657577515, "logps/chosen": -258.94244384765625, "logps/rejected": -587.5523681640625, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 0.026724791154265404, "rewards/margins": 13.0403470993042, "rewards/rejected": -13.01362133026123, "step": 1270 }, { "epoch": 0.44, "learning_rate": 4.7500944227621803e-07, "logits/chosen": -0.37441331148147583, "logits/rejected": -0.3470761477947235, "logps/chosen": -176.2841339111328, "logps/rejected": -434.1368103027344, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 0.2381262332201004, "rewards/margins": 14.470191955566406, "rewards/rejected": -14.232065200805664, "step": 1280 }, { "epoch": 0.44, "learning_rate": 4.7437995719501445e-07, "logits/chosen": -0.5366531014442444, "logits/rejected": -0.3787044882774353, "logps/chosen": -165.64198303222656, "logps/rejected": -580.5164184570312, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -0.03785517439246178, "rewards/margins": 14.496394157409668, "rewards/rejected": -14.534250259399414, "step": 1290 }, { "epoch": 0.44, "learning_rate": 4.737504721138109e-07, "logits/chosen": -0.401449978351593, "logits/rejected": -0.38668739795684814, "logps/chosen": -217.4298858642578, "logps/rejected": -564.505859375, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -0.04260287806391716, "rewards/margins": 11.826305389404297, "rewards/rejected": -11.86890697479248, "step": 1300 }, { "epoch": 0.44, "eval_logits/chosen": -0.5149909257888794, "eval_logits/rejected": -0.42118585109710693, "eval_logps/chosen": -216.0964813232422, "eval_logps/rejected": -544.2893676757812, "eval_loss": 0.027452431619167328, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.2059345692396164, "eval_rewards/margins": 13.148478507995605, "eval_rewards/rejected": -12.942543029785156, "eval_runtime": 535.5489, "eval_samples_per_second": 17.739, "eval_steps_per_second": 0.555, "step": 1300 }, { "epoch": 0.45, "learning_rate": 4.7312098703260735e-07, "logits/chosen": -0.42848238348960876, "logits/rejected": -0.39036741852760315, "logps/chosen": -156.04502868652344, "logps/rejected": -435.7313537597656, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -0.051517270505428314, "rewards/margins": 11.822015762329102, "rewards/rejected": -11.873533248901367, "step": 1310 }, { "epoch": 0.45, "learning_rate": 4.724915019514038e-07, "logits/chosen": -0.42197251319885254, "logits/rejected": -0.38296177983283997, "logps/chosen": -213.52774047851562, "logps/rejected": -623.489990234375, "loss": 0.0316, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4594356417655945, "rewards/margins": 13.4677734375, "rewards/rejected": -13.008337020874023, "step": 1320 }, { "epoch": 0.45, "learning_rate": 4.7186201687020014e-07, "logits/chosen": -0.306245356798172, "logits/rejected": -0.4234016537666321, "logps/chosen": -220.03402709960938, "logps/rejected": -407.0655822753906, "loss": 0.0216, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19869334995746613, "rewards/margins": 13.811253547668457, "rewards/rejected": -13.612561225891113, "step": 1330 }, { "epoch": 0.46, "learning_rate": 4.7123253178899657e-07, "logits/chosen": -0.517323911190033, "logits/rejected": -0.3306100070476532, "logps/chosen": -139.122802734375, "logps/rejected": -497.0418395996094, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": 0.06931233406066895, "rewards/margins": 13.902134895324707, "rewards/rejected": -13.83282470703125, "step": 1340 }, { "epoch": 0.46, "learning_rate": 4.70603046707793e-07, "logits/chosen": -0.4616897702217102, "logits/rejected": -0.4748886227607727, "logps/chosen": -156.35684204101562, "logps/rejected": -647.2406005859375, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 0.14767716825008392, "rewards/margins": 15.818565368652344, "rewards/rejected": -15.67088794708252, "step": 1350 }, { "epoch": 0.46, "learning_rate": 4.699735616265894e-07, "logits/chosen": -0.5157699584960938, "logits/rejected": -0.4018593430519104, "logps/chosen": -213.1888885498047, "logps/rejected": -491.40728759765625, "loss": 0.0251, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2519443929195404, "rewards/margins": 12.250985145568848, "rewards/rejected": -11.999040603637695, "step": 1360 }, { "epoch": 0.47, "learning_rate": 4.693440765453859e-07, "logits/chosen": -0.40230321884155273, "logits/rejected": -0.3672182559967041, "logps/chosen": -193.61593627929688, "logps/rejected": -734.6617431640625, "loss": 0.1019, "rewards/accuracies": 1.0, "rewards/chosen": -0.2159326821565628, "rewards/margins": 11.286843299865723, "rewards/rejected": -11.502775192260742, "step": 1370 }, { "epoch": 0.47, "learning_rate": 4.687145914641823e-07, "logits/chosen": -0.3283630609512329, "logits/rejected": -0.5172003507614136, "logps/chosen": -177.76608276367188, "logps/rejected": -581.7149658203125, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": 0.18151381611824036, "rewards/margins": 11.299110412597656, "rewards/rejected": -11.117596626281738, "step": 1380 }, { "epoch": 0.47, "learning_rate": 4.6808510638297873e-07, "logits/chosen": -0.4023760259151459, "logits/rejected": -0.5026179552078247, "logps/chosen": -176.25888061523438, "logps/rejected": -561.175537109375, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -0.0883442759513855, "rewards/margins": 10.470526695251465, "rewards/rejected": -10.55887222290039, "step": 1390 }, { "epoch": 0.48, "learning_rate": 4.674556213017751e-07, "logits/chosen": -0.4534605145454407, "logits/rejected": -0.5221826434135437, "logps/chosen": -138.59791564941406, "logps/rejected": -509.44879150390625, "loss": 0.0143, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.076207235455513, "rewards/margins": 12.572468757629395, "rewards/rejected": -12.648675918579102, "step": 1400 }, { "epoch": 0.48, "eval_logits/chosen": -0.5538153052330017, "eval_logits/rejected": -0.5405449271202087, "eval_logps/chosen": -217.9757843017578, "eval_logps/rejected": -534.5559692382812, "eval_loss": 0.021453365683555603, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.018001805990934372, "eval_rewards/margins": 11.987199783325195, "eval_rewards/rejected": -11.969197273254395, "eval_runtime": 535.5088, "eval_samples_per_second": 17.74, "eval_steps_per_second": 0.555, "step": 1400 }, { "epoch": 0.48, "learning_rate": 4.668261362205715e-07, "logits/chosen": -0.48409414291381836, "logits/rejected": -0.5504492521286011, "logps/chosen": -228.62631225585938, "logps/rejected": -561.8209838867188, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -0.39908766746520996, "rewards/margins": 12.239927291870117, "rewards/rejected": -12.639015197753906, "step": 1410 }, { "epoch": 0.48, "learning_rate": 4.6619665113936795e-07, "logits/chosen": -0.3305067718029022, "logits/rejected": -0.5080638527870178, "logps/chosen": -236.81741333007812, "logps/rejected": -570.1957397460938, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.03078916110098362, "rewards/margins": 11.14117431640625, "rewards/rejected": -11.171964645385742, "step": 1420 }, { "epoch": 0.49, "learning_rate": 4.6556716605816437e-07, "logits/chosen": -0.4823029041290283, "logits/rejected": -0.5342048406600952, "logps/chosen": -211.98159790039062, "logps/rejected": -535.0460205078125, "loss": 0.0219, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.23290273547172546, "rewards/margins": 11.556316375732422, "rewards/rejected": -11.789219856262207, "step": 1430 }, { "epoch": 0.49, "learning_rate": 4.6493768097696085e-07, "logits/chosen": -0.48177576065063477, "logits/rejected": -0.5811325907707214, "logps/chosen": -300.88104248046875, "logps/rejected": -431.742431640625, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -0.37862709164619446, "rewards/margins": 11.830404281616211, "rewards/rejected": -12.20903205871582, "step": 1440 }, { "epoch": 0.49, "learning_rate": 4.6430819589575727e-07, "logits/chosen": -0.5101045370101929, "logits/rejected": -0.49074244499206543, "logps/chosen": -157.8261260986328, "logps/rejected": -402.17230224609375, "loss": 0.0157, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5451567769050598, "rewards/margins": 13.108447074890137, "rewards/rejected": -13.653602600097656, "step": 1450 }, { "epoch": 0.5, "learning_rate": 4.636787108145537e-07, "logits/chosen": -0.4317095875740051, "logits/rejected": -0.4959840774536133, "logps/chosen": -287.135986328125, "logps/rejected": -582.1851196289062, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -0.5175079107284546, "rewards/margins": 11.564324378967285, "rewards/rejected": -12.081830978393555, "step": 1460 }, { "epoch": 0.5, "learning_rate": 4.630492257333501e-07, "logits/chosen": -0.48333558440208435, "logits/rejected": -0.4016719460487366, "logps/chosen": -163.47991943359375, "logps/rejected": -477.482421875, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -0.46019357442855835, "rewards/margins": 12.971224784851074, "rewards/rejected": -13.431417465209961, "step": 1470 }, { "epoch": 0.5, "learning_rate": 4.624197406521465e-07, "logits/chosen": -0.3809479773044586, "logits/rejected": -0.4509502053260803, "logps/chosen": -165.10757446289062, "logps/rejected": -528.8228759765625, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -0.4563121795654297, "rewards/margins": 14.681309700012207, "rewards/rejected": -15.137621879577637, "step": 1480 }, { "epoch": 0.51, "learning_rate": 4.617902555709429e-07, "logits/chosen": -0.42876458168029785, "logits/rejected": -0.4098014831542969, "logps/chosen": -211.214111328125, "logps/rejected": -346.919677734375, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -0.3952040374279022, "rewards/margins": 12.737375259399414, "rewards/rejected": -13.13257884979248, "step": 1490 }, { "epoch": 0.51, "learning_rate": 4.611607704897394e-07, "logits/chosen": -0.4507642388343811, "logits/rejected": -0.4426211416721344, "logps/chosen": -281.70758056640625, "logps/rejected": -520.260986328125, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -0.12988807260990143, "rewards/margins": 12.681201934814453, "rewards/rejected": -12.811090469360352, "step": 1500 }, { "epoch": 0.51, "eval_logits/chosen": -0.5576140880584717, "eval_logits/rejected": -0.5291763544082642, "eval_logps/chosen": -221.6348876953125, "eval_logps/rejected": -555.7202758789062, "eval_loss": 0.0181296244263649, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.34790754318237305, "eval_rewards/margins": 13.737723350524902, "eval_rewards/rejected": -14.085630416870117, "eval_runtime": 535.4579, "eval_samples_per_second": 17.742, "eval_steps_per_second": 0.555, "step": 1500 }, { "epoch": 0.51, "learning_rate": 4.605312854085358e-07, "logits/chosen": -0.47074851393699646, "logits/rejected": -0.5725608468055725, "logps/chosen": -265.702880859375, "logps/rejected": -465.060546875, "loss": 0.0268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.64374840259552, "rewards/margins": 12.088809967041016, "rewards/rejected": -12.73255729675293, "step": 1510 }, { "epoch": 0.52, "learning_rate": 4.5990180032733223e-07, "logits/chosen": -0.3688223659992218, "logits/rejected": -0.5021462440490723, "logps/chosen": -228.2495574951172, "logps/rejected": -594.3338623046875, "loss": 0.0182, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.42208296060562134, "rewards/margins": 12.566203117370605, "rewards/rejected": -12.988286018371582, "step": 1520 }, { "epoch": 0.52, "learning_rate": 4.5927231524612865e-07, "logits/chosen": -0.33567458391189575, "logits/rejected": -0.42909178137779236, "logps/chosen": -211.79232788085938, "logps/rejected": -421.26422119140625, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.26462775468826294, "rewards/margins": 14.388033866882324, "rewards/rejected": -14.65266227722168, "step": 1530 }, { "epoch": 0.52, "learning_rate": 4.586428301649251e-07, "logits/chosen": -0.5522352457046509, "logits/rejected": -0.3932048976421356, "logps/chosen": -161.44699096679688, "logps/rejected": -641.7650756835938, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -0.07156230509281158, "rewards/margins": 10.797552108764648, "rewards/rejected": -10.86911392211914, "step": 1540 }, { "epoch": 0.53, "learning_rate": 4.5801334508372145e-07, "logits/chosen": -0.39685365557670593, "logits/rejected": -0.36107268929481506, "logps/chosen": -234.2136688232422, "logps/rejected": -450.7112731933594, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4352239668369293, "rewards/margins": 11.835386276245117, "rewards/rejected": -12.270610809326172, "step": 1550 }, { "epoch": 0.53, "learning_rate": 4.573838600025179e-07, "logits/chosen": -0.3981393873691559, "logits/rejected": -0.5295859575271606, "logps/chosen": -304.3006591796875, "logps/rejected": -612.9080810546875, "loss": 0.0158, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.560581386089325, "rewards/margins": 12.654718399047852, "rewards/rejected": -13.215298652648926, "step": 1560 }, { "epoch": 0.53, "learning_rate": 4.5675437492131434e-07, "logits/chosen": -0.4437607228755951, "logits/rejected": -0.43397217988967896, "logps/chosen": -197.89523315429688, "logps/rejected": -577.8406372070312, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -0.31313449144363403, "rewards/margins": 13.599003791809082, "rewards/rejected": -13.912139892578125, "step": 1570 }, { "epoch": 0.54, "learning_rate": 4.5612488984011077e-07, "logits/chosen": -0.33478471636772156, "logits/rejected": -0.5232293605804443, "logps/chosen": -228.79281616210938, "logps/rejected": -497.4456481933594, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -0.39348629117012024, "rewards/margins": 12.850903511047363, "rewards/rejected": -13.244390487670898, "step": 1580 }, { "epoch": 0.54, "learning_rate": 4.554954047589072e-07, "logits/chosen": -0.4264732003211975, "logits/rejected": -0.49747800827026367, "logps/chosen": -220.77267456054688, "logps/rejected": -486.4815368652344, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -0.3274540603160858, "rewards/margins": 15.427177429199219, "rewards/rejected": -15.754631042480469, "step": 1590 }, { "epoch": 0.54, "learning_rate": 4.548659196777036e-07, "logits/chosen": -0.30895158648490906, "logits/rejected": -0.4140236973762512, "logps/chosen": -339.98291015625, "logps/rejected": -451.32421875, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": 0.18101991713047028, "rewards/margins": 12.014985084533691, "rewards/rejected": -11.833965301513672, "step": 1600 }, { "epoch": 0.54, "eval_logits/chosen": -0.5256173610687256, "eval_logits/rejected": -0.4943406879901886, "eval_logps/chosen": -220.39419555664062, "eval_logps/rejected": -553.5293579101562, "eval_loss": 0.028572624549269676, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": -0.22383739054203033, "eval_rewards/margins": 13.642704010009766, "eval_rewards/rejected": -13.866541862487793, "eval_runtime": 534.3393, "eval_samples_per_second": 17.779, "eval_steps_per_second": 0.556, "step": 1600 }, { "epoch": 0.55, "learning_rate": 4.5423643459650003e-07, "logits/chosen": -0.4464387893676758, "logits/rejected": -0.4918655455112457, "logps/chosen": -235.3760986328125, "logps/rejected": -637.3816528320312, "loss": 0.0185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5616486668586731, "rewards/margins": 12.741270065307617, "rewards/rejected": -13.302919387817383, "step": 1610 }, { "epoch": 0.55, "learning_rate": 4.536069495152965e-07, "logits/chosen": -0.4907158315181732, "logits/rejected": -0.5197087526321411, "logps/chosen": -221.53311157226562, "logps/rejected": -601.5340576171875, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -0.34780317544937134, "rewards/margins": 14.549562454223633, "rewards/rejected": -14.897364616394043, "step": 1620 }, { "epoch": 0.55, "learning_rate": 4.529774644340929e-07, "logits/chosen": -0.3327166438102722, "logits/rejected": -0.5005335807800293, "logps/chosen": -235.9947509765625, "logps/rejected": -484.0238342285156, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -0.18066541850566864, "rewards/margins": 13.955121994018555, "rewards/rejected": -14.135787963867188, "step": 1630 }, { "epoch": 0.56, "learning_rate": 4.523479793528893e-07, "logits/chosen": -0.20170505344867706, "logits/rejected": -0.440112829208374, "logps/chosen": -222.8807830810547, "logps/rejected": -499.26422119140625, "loss": 0.0134, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.15153782069683075, "rewards/margins": 13.540555953979492, "rewards/rejected": -13.692094802856445, "step": 1640 }, { "epoch": 0.56, "learning_rate": 4.517184942716857e-07, "logits/chosen": -0.39964231848716736, "logits/rejected": -0.5179239511489868, "logps/chosen": -289.93548583984375, "logps/rejected": -451.51953125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.508069634437561, "rewards/margins": 13.689692497253418, "rewards/rejected": -14.197761535644531, "step": 1650 }, { "epoch": 0.56, "learning_rate": 4.5108900919048215e-07, "logits/chosen": -0.38229459524154663, "logits/rejected": -0.48286551237106323, "logps/chosen": -240.39456176757812, "logps/rejected": -519.87158203125, "loss": 0.032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5919996500015259, "rewards/margins": 15.353375434875488, "rewards/rejected": -15.94537353515625, "step": 1660 }, { "epoch": 0.57, "learning_rate": 4.5045952410927857e-07, "logits/chosen": -0.5003230571746826, "logits/rejected": -0.36770448088645935, "logps/chosen": -197.37435913085938, "logps/rejected": -553.3612060546875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -0.1250946819782257, "rewards/margins": 16.469846725463867, "rewards/rejected": -16.59494400024414, "step": 1670 }, { "epoch": 0.57, "learning_rate": 4.4983003902807505e-07, "logits/chosen": -0.40794816613197327, "logits/rejected": -0.4975048005580902, "logps/chosen": -291.07635498046875, "logps/rejected": -749.6055908203125, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -0.4300554394721985, "rewards/margins": 17.46886444091797, "rewards/rejected": -17.8989200592041, "step": 1680 }, { "epoch": 0.57, "learning_rate": 4.4920055394687147e-07, "logits/chosen": -0.3133481740951538, "logits/rejected": -0.45112448930740356, "logps/chosen": -180.49569702148438, "logps/rejected": -472.608642578125, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2760940492153168, "rewards/margins": 14.411653518676758, "rewards/rejected": -14.68774700164795, "step": 1690 }, { "epoch": 0.58, "learning_rate": 4.485710688656679e-07, "logits/chosen": -0.5371385216712952, "logits/rejected": -0.4235255718231201, "logps/chosen": -180.40396118164062, "logps/rejected": -550.0654907226562, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.35535120964050293, "rewards/margins": 16.792621612548828, "rewards/rejected": -17.14797019958496, "step": 1700 }, { "epoch": 0.58, "eval_logits/chosen": -0.521159291267395, "eval_logits/rejected": -0.4799301326274872, "eval_logps/chosen": -220.5081329345703, "eval_logps/rejected": -573.6668701171875, "eval_loss": 0.025109997019171715, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.23523074388504028, "eval_rewards/margins": 15.645064353942871, "eval_rewards/rejected": -15.880293846130371, "eval_runtime": 534.7054, "eval_samples_per_second": 17.767, "eval_steps_per_second": 0.555, "step": 1700 }, { "epoch": 0.58, "learning_rate": 4.4794158378446426e-07, "logits/chosen": -0.356571227312088, "logits/rejected": -0.3338431119918823, "logps/chosen": -292.8492431640625, "logps/rejected": -722.9315795898438, "loss": 0.0247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1379307508468628, "rewards/margins": 13.92692756652832, "rewards/rejected": -14.064857482910156, "step": 1710 }, { "epoch": 0.58, "learning_rate": 4.473120987032607e-07, "logits/chosen": -0.41846877336502075, "logits/rejected": -0.4334571957588196, "logps/chosen": -203.83697509765625, "logps/rejected": -894.732421875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.32279014587402344, "rewards/margins": 15.072772026062012, "rewards/rejected": -15.395563125610352, "step": 1720 }, { "epoch": 0.59, "learning_rate": 4.466826136220571e-07, "logits/chosen": -0.3925188183784485, "logits/rejected": -0.3883039951324463, "logps/chosen": -169.47227478027344, "logps/rejected": -483.35589599609375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.3539579510688782, "rewards/margins": 15.592570304870605, "rewards/rejected": -15.946528434753418, "step": 1730 }, { "epoch": 0.59, "learning_rate": 4.460531285408536e-07, "logits/chosen": -0.4470590651035309, "logits/rejected": -0.36041557788848877, "logps/chosen": -339.82098388671875, "logps/rejected": -495.77734375, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.913856029510498, "rewards/margins": 14.762941360473633, "rewards/rejected": -15.676797866821289, "step": 1740 }, { "epoch": 0.59, "learning_rate": 4.4542364345965e-07, "logits/chosen": -0.40618380904197693, "logits/rejected": -0.4064006209373474, "logps/chosen": -260.1714782714844, "logps/rejected": -505.072509765625, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.08550099283456802, "rewards/margins": 19.473079681396484, "rewards/rejected": -19.558578491210938, "step": 1750 }, { "epoch": 0.6, "learning_rate": 4.4479415837844643e-07, "logits/chosen": -0.3830642104148865, "logits/rejected": -0.4831882417201996, "logps/chosen": -284.1402282714844, "logps/rejected": -515.0564575195312, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.30723220109939575, "rewards/margins": 15.805920600891113, "rewards/rejected": -16.113155364990234, "step": 1760 }, { "epoch": 0.6, "learning_rate": 4.4416467329724285e-07, "logits/chosen": -0.4236987233161926, "logits/rejected": -0.4703877568244934, "logps/chosen": -156.44235229492188, "logps/rejected": -716.8899536132812, "loss": 0.0121, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.36839666962623596, "rewards/margins": 18.206823348999023, "rewards/rejected": -18.575220108032227, "step": 1770 }, { "epoch": 0.61, "learning_rate": 4.435351882160392e-07, "logits/chosen": -0.30011001229286194, "logits/rejected": -0.4376433491706848, "logps/chosen": -234.56884765625, "logps/rejected": -494.414794921875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -0.551509439945221, "rewards/margins": 14.84644889831543, "rewards/rejected": -15.397958755493164, "step": 1780 }, { "epoch": 0.61, "learning_rate": 4.4290570313483564e-07, "logits/chosen": -0.45115581154823303, "logits/rejected": -0.5156680345535278, "logps/chosen": -338.4355773925781, "logps/rejected": -524.8842163085938, "loss": 0.0219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8162508010864258, "rewards/margins": 17.6003360748291, "rewards/rejected": -18.41658592224121, "step": 1790 }, { "epoch": 0.61, "learning_rate": 4.422762180536321e-07, "logits/chosen": -0.6411077976226807, "logits/rejected": -0.4612352252006531, "logps/chosen": -168.07736206054688, "logps/rejected": -645.7598876953125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.7181661128997803, "rewards/margins": 16.065811157226562, "rewards/rejected": -16.783977508544922, "step": 1800 }, { "epoch": 0.61, "eval_logits/chosen": -0.5384660959243774, "eval_logits/rejected": -0.4975683093070984, "eval_logps/chosen": -219.97247314453125, "eval_logps/rejected": -582.1795043945312, "eval_loss": 0.016291543841362, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.18166583776474, "eval_rewards/margins": 16.549896240234375, "eval_rewards/rejected": -16.731563568115234, "eval_runtime": 533.8859, "eval_samples_per_second": 17.794, "eval_steps_per_second": 0.556, "step": 1800 }, { "epoch": 0.62, "learning_rate": 4.4164673297242854e-07, "logits/chosen": -0.4978792071342468, "logits/rejected": -0.4507046639919281, "logps/chosen": -278.0787353515625, "logps/rejected": -637.1492309570312, "loss": 0.011, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.10290347039699554, "rewards/margins": 17.37094497680664, "rewards/rejected": -17.47385025024414, "step": 1810 }, { "epoch": 0.62, "learning_rate": 4.4101724789122497e-07, "logits/chosen": -0.42331212759017944, "logits/rejected": -0.4892626404762268, "logps/chosen": -228.34347534179688, "logps/rejected": -789.174072265625, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3370633125305176, "rewards/margins": 14.508142471313477, "rewards/rejected": -14.845205307006836, "step": 1820 }, { "epoch": 0.62, "learning_rate": 4.403877628100214e-07, "logits/chosen": -0.4716408848762512, "logits/rejected": -0.41020697355270386, "logps/chosen": -225.7122039794922, "logps/rejected": -543.1890869140625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.08656591176986694, "rewards/margins": 14.987605094909668, "rewards/rejected": -14.901037216186523, "step": 1830 }, { "epoch": 0.63, "learning_rate": 4.397582777288178e-07, "logits/chosen": -0.4566265046596527, "logits/rejected": -0.47303399443626404, "logps/chosen": -286.5848083496094, "logps/rejected": -659.7454223632812, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.18428581953048706, "rewards/margins": 16.48398208618164, "rewards/rejected": -16.66826820373535, "step": 1840 }, { "epoch": 0.63, "learning_rate": 4.3912879264761423e-07, "logits/chosen": -0.3039599657058716, "logits/rejected": -0.3809065520763397, "logps/chosen": -230.4857940673828, "logps/rejected": -554.9456787109375, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -0.19270649552345276, "rewards/margins": 14.690716743469238, "rewards/rejected": -14.8834228515625, "step": 1850 }, { "epoch": 0.63, "learning_rate": 4.3849930756641066e-07, "logits/chosen": -0.4020947813987732, "logits/rejected": -0.4464609622955322, "logps/chosen": -220.7183837890625, "logps/rejected": -647.9937744140625, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": 0.10088600963354111, "rewards/margins": 15.413442611694336, "rewards/rejected": -15.3125581741333, "step": 1860 }, { "epoch": 0.64, "learning_rate": 4.378698224852071e-07, "logits/chosen": -0.40508905053138733, "logits/rejected": -0.39254289865493774, "logps/chosen": -211.35238647460938, "logps/rejected": -521.2523803710938, "loss": 0.0164, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.773373007774353, "rewards/margins": 15.660751342773438, "rewards/rejected": -16.43412208557129, "step": 1870 }, { "epoch": 0.64, "learning_rate": 4.372403374040035e-07, "logits/chosen": -0.6337490677833557, "logits/rejected": -0.4414336085319519, "logps/chosen": -162.35488891601562, "logps/rejected": -554.876953125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -0.3900473415851593, "rewards/margins": 16.4035701751709, "rewards/rejected": -16.793617248535156, "step": 1880 }, { "epoch": 0.64, "learning_rate": 4.366108523227999e-07, "logits/chosen": -0.44221729040145874, "logits/rejected": -0.49020832777023315, "logps/chosen": -235.53097534179688, "logps/rejected": -519.3707275390625, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -0.4810159206390381, "rewards/margins": 13.657504081726074, "rewards/rejected": -14.138521194458008, "step": 1890 }, { "epoch": 0.65, "learning_rate": 4.3598136724159635e-07, "logits/chosen": -0.5277594327926636, "logits/rejected": -0.5274850130081177, "logps/chosen": -284.9675598144531, "logps/rejected": -578.0961303710938, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -0.45527681708335876, "rewards/margins": 16.649520874023438, "rewards/rejected": -17.10479736328125, "step": 1900 }, { "epoch": 0.65, "eval_logits/chosen": -0.6174340844154358, "eval_logits/rejected": -0.5874400734901428, "eval_logps/chosen": -222.07264709472656, "eval_logps/rejected": -601.3036499023438, "eval_loss": 0.015866290777921677, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": -0.3916856646537781, "eval_rewards/margins": 18.252290725708008, "eval_rewards/rejected": -18.64397621154785, "eval_runtime": 534.2115, "eval_samples_per_second": 17.783, "eval_steps_per_second": 0.556, "step": 1900 }, { "epoch": 0.65, "learning_rate": 4.3535188216039277e-07, "logits/chosen": -0.4859447479248047, "logits/rejected": -0.5189141035079956, "logps/chosen": -211.63613891601562, "logps/rejected": -574.7998046875, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3297613561153412, "rewards/margins": 16.760494232177734, "rewards/rejected": -17.090255737304688, "step": 1910 }, { "epoch": 0.65, "learning_rate": 4.3472239707918925e-07, "logits/chosen": -0.5379757881164551, "logits/rejected": -0.4717877507209778, "logps/chosen": -161.94187927246094, "logps/rejected": -441.76898193359375, "loss": 0.0723, "rewards/accuracies": 1.0, "rewards/chosen": -0.10956624895334244, "rewards/margins": 15.853068351745605, "rewards/rejected": -15.96263599395752, "step": 1920 }, { "epoch": 0.66, "learning_rate": 4.3409291199798567e-07, "logits/chosen": -0.34473687410354614, "logits/rejected": -0.5547316074371338, "logps/chosen": -178.09384155273438, "logps/rejected": -666.2685546875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -0.13908074796199799, "rewards/margins": 14.079671859741211, "rewards/rejected": -14.21875286102295, "step": 1930 }, { "epoch": 0.66, "learning_rate": 4.3346342691678204e-07, "logits/chosen": -0.5969959497451782, "logits/rejected": -0.5415017604827881, "logps/chosen": -161.95187377929688, "logps/rejected": -745.8142700195312, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -0.3228221833705902, "rewards/margins": 16.691722869873047, "rewards/rejected": -17.014541625976562, "step": 1940 }, { "epoch": 0.66, "learning_rate": 4.3283394183557846e-07, "logits/chosen": -0.3593668043613434, "logits/rejected": -0.5113891959190369, "logps/chosen": -361.9584045410156, "logps/rejected": -516.3282470703125, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": 0.33454978466033936, "rewards/margins": 16.093481063842773, "rewards/rejected": -15.758931159973145, "step": 1950 }, { "epoch": 0.67, "learning_rate": 4.322044567543749e-07, "logits/chosen": -0.5007352232933044, "logits/rejected": -0.5392492413520813, "logps/chosen": -221.562744140625, "logps/rejected": -505.87481689453125, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -0.24075599014759064, "rewards/margins": 15.75054931640625, "rewards/rejected": -15.991304397583008, "step": 1960 }, { "epoch": 0.67, "learning_rate": 4.315749716731713e-07, "logits/chosen": -0.39577361941337585, "logits/rejected": -0.47207722067832947, "logps/chosen": -231.19070434570312, "logps/rejected": -697.5925903320312, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.1472932994365692, "rewards/margins": 11.822135925292969, "rewards/rejected": -11.969429016113281, "step": 1970 }, { "epoch": 0.67, "learning_rate": 4.309454865919678e-07, "logits/chosen": -0.46047163009643555, "logits/rejected": -0.4786960482597351, "logps/chosen": -236.82229614257812, "logps/rejected": -513.6739501953125, "loss": 0.016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07316909730434418, "rewards/margins": 13.219225883483887, "rewards/rejected": -13.292394638061523, "step": 1980 }, { "epoch": 0.68, "learning_rate": 4.303160015107642e-07, "logits/chosen": -0.31725165247917175, "logits/rejected": -0.5155214071273804, "logps/chosen": -280.30633544921875, "logps/rejected": -417.54364013671875, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.47681179642677307, "rewards/margins": 15.110737800598145, "rewards/rejected": -15.587549209594727, "step": 1990 }, { "epoch": 0.68, "learning_rate": 4.2968651642956063e-07, "logits/chosen": -0.4433720111846924, "logits/rejected": -0.5211519002914429, "logps/chosen": -233.377197265625, "logps/rejected": -481.90130615234375, "loss": 0.007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3188526928424835, "rewards/margins": 14.887802124023438, "rewards/rejected": -15.20665454864502, "step": 2000 }, { "epoch": 0.68, "eval_logits/chosen": -0.5702382326126099, "eval_logits/rejected": -0.555505096912384, "eval_logps/chosen": -219.3957061767578, "eval_logps/rejected": -580.1437377929688, "eval_loss": 0.010610525496304035, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.12398876994848251, "eval_rewards/margins": 16.403993606567383, "eval_rewards/rejected": -16.52798080444336, "eval_runtime": 535.1112, "eval_samples_per_second": 17.753, "eval_steps_per_second": 0.555, "step": 2000 }, { "epoch": 0.68, "learning_rate": 4.29057031348357e-07, "logits/chosen": -0.4271085858345032, "logits/rejected": -0.5719980001449585, "logps/chosen": -164.71615600585938, "logps/rejected": -569.5879516601562, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.15628762543201447, "rewards/margins": 17.174100875854492, "rewards/rejected": -17.33039093017578, "step": 2010 }, { "epoch": 0.69, "learning_rate": 4.284275462671534e-07, "logits/chosen": -0.5002657175064087, "logits/rejected": -0.5766940116882324, "logps/chosen": -163.771240234375, "logps/rejected": -633.3169555664062, "loss": 0.0178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16970457136631012, "rewards/margins": 16.446439743041992, "rewards/rejected": -16.2767333984375, "step": 2020 }, { "epoch": 0.69, "learning_rate": 4.2779806118594984e-07, "logits/chosen": -0.37288618087768555, "logits/rejected": -0.5131433606147766, "logps/chosen": -274.4900207519531, "logps/rejected": -459.62109375, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.36801427602767944, "rewards/margins": 14.247884750366211, "rewards/rejected": -14.615896224975586, "step": 2030 }, { "epoch": 0.69, "learning_rate": 4.271685761047463e-07, "logits/chosen": -0.640346884727478, "logits/rejected": -0.5140171051025391, "logps/chosen": -156.2543182373047, "logps/rejected": -580.4205322265625, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -0.1935465782880783, "rewards/margins": 20.5224666595459, "rewards/rejected": -20.716014862060547, "step": 2040 }, { "epoch": 0.7, "learning_rate": 4.2653909102354274e-07, "logits/chosen": -0.2987828552722931, "logits/rejected": -0.5199651122093201, "logps/chosen": -244.65542602539062, "logps/rejected": -529.4879150390625, "loss": 0.0085, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.36563822627067566, "rewards/margins": 16.621753692626953, "rewards/rejected": -16.98739242553711, "step": 2050 }, { "epoch": 0.7, "learning_rate": 4.2590960594233917e-07, "logits/chosen": -0.3394085466861725, "logits/rejected": -0.5272071957588196, "logps/chosen": -241.66183471679688, "logps/rejected": -520.6480102539062, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5247941613197327, "rewards/margins": 16.189863204956055, "rewards/rejected": -16.714656829833984, "step": 2060 }, { "epoch": 0.7, "learning_rate": 4.252801208611356e-07, "logits/chosen": -0.5160232186317444, "logits/rejected": -0.4796876311302185, "logps/chosen": -205.7699432373047, "logps/rejected": -524.12353515625, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -0.14371219277381897, "rewards/margins": 15.435323715209961, "rewards/rejected": -15.5790376663208, "step": 2070 }, { "epoch": 0.71, "learning_rate": 4.24650635779932e-07, "logits/chosen": -0.500092625617981, "logits/rejected": -0.586539089679718, "logps/chosen": -162.15028381347656, "logps/rejected": -493.12066650390625, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.8460739254951477, "rewards/margins": 22.033405303955078, "rewards/rejected": -22.87948226928711, "step": 2080 }, { "epoch": 0.71, "learning_rate": 4.240211506987284e-07, "logits/chosen": -0.4931337237358093, "logits/rejected": -0.6152265071868896, "logps/chosen": -253.0132293701172, "logps/rejected": -585.765625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -0.5827271342277527, "rewards/margins": 19.643821716308594, "rewards/rejected": -20.226551055908203, "step": 2090 }, { "epoch": 0.71, "learning_rate": 4.233916656175248e-07, "logits/chosen": -0.4528264105319977, "logits/rejected": -0.5413549542427063, "logps/chosen": -274.97259521484375, "logps/rejected": -781.49755859375, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.093484066426754, "rewards/margins": 16.851360321044922, "rewards/rejected": -16.94484519958496, "step": 2100 }, { "epoch": 0.71, "eval_logits/chosen": -0.5847861766815186, "eval_logits/rejected": -0.580173671245575, "eval_logps/chosen": -221.54400634765625, "eval_logps/rejected": -600.1015625, "eval_loss": 0.01669074036180973, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.3388203978538513, "eval_rewards/margins": 18.184938430786133, "eval_rewards/rejected": -18.523757934570312, "eval_runtime": 533.6278, "eval_samples_per_second": 17.803, "eval_steps_per_second": 0.557, "step": 2100 }, { "epoch": 0.72, "learning_rate": 4.227621805363213e-07, "logits/chosen": -0.5350021123886108, "logits/rejected": -0.4851594567298889, "logps/chosen": -185.82786560058594, "logps/rejected": -507.990966796875, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.5180062651634216, "rewards/margins": 17.037395477294922, "rewards/rejected": -17.555400848388672, "step": 2110 }, { "epoch": 0.72, "learning_rate": 4.221326954551177e-07, "logits/chosen": -0.5126780271530151, "logits/rejected": -0.5662415623664856, "logps/chosen": -164.44113159179688, "logps/rejected": -671.6915283203125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.3760198950767517, "rewards/margins": 17.08847427368164, "rewards/rejected": -17.464496612548828, "step": 2120 }, { "epoch": 0.72, "learning_rate": 4.215032103739141e-07, "logits/chosen": -0.49528607726097107, "logits/rejected": -0.5250134468078613, "logps/chosen": -229.97909545898438, "logps/rejected": -677.5662231445312, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -0.3877120614051819, "rewards/margins": 19.573028564453125, "rewards/rejected": -19.960737228393555, "step": 2130 }, { "epoch": 0.73, "learning_rate": 4.2087372529271055e-07, "logits/chosen": -0.3805707097053528, "logits/rejected": -0.42864030599594116, "logps/chosen": -181.89987182617188, "logps/rejected": -477.3799743652344, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.1290227323770523, "rewards/margins": 15.65631103515625, "rewards/rejected": -15.785333633422852, "step": 2140 }, { "epoch": 0.73, "learning_rate": 4.2024424021150697e-07, "logits/chosen": -0.4831499457359314, "logits/rejected": -0.5056699514389038, "logps/chosen": -168.68414306640625, "logps/rejected": -635.7274169921875, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.15924754738807678, "rewards/margins": 18.67966079711914, "rewards/rejected": -18.52041244506836, "step": 2150 }, { "epoch": 0.73, "learning_rate": 4.1961475513030334e-07, "logits/chosen": -0.3718964159488678, "logits/rejected": -0.5391818881034851, "logps/chosen": -287.40692138671875, "logps/rejected": -382.63336181640625, "loss": 0.0127, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5606808662414551, "rewards/margins": 16.70510482788086, "rewards/rejected": -17.26578712463379, "step": 2160 }, { "epoch": 0.74, "learning_rate": 4.189852700490998e-07, "logits/chosen": -0.3778756260871887, "logits/rejected": -0.5391207933425903, "logps/chosen": -228.2191162109375, "logps/rejected": -492.0967712402344, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.39583340287208557, "rewards/margins": 17.752397537231445, "rewards/rejected": -18.148231506347656, "step": 2170 }, { "epoch": 0.74, "learning_rate": 4.1835578496789624e-07, "logits/chosen": -0.4607987403869629, "logits/rejected": -0.5883350372314453, "logps/chosen": -236.50411987304688, "logps/rejected": -567.3048706054688, "loss": 0.0253, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6020129323005676, "rewards/margins": 16.953807830810547, "rewards/rejected": -17.55582046508789, "step": 2180 }, { "epoch": 0.74, "learning_rate": 4.1772629988669266e-07, "logits/chosen": -0.3868181109428406, "logits/rejected": -0.4645780920982361, "logps/chosen": -219.92626953125, "logps/rejected": -577.1318359375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.06665867567062378, "rewards/margins": 17.537878036499023, "rewards/rejected": -17.60453987121582, "step": 2190 }, { "epoch": 0.75, "learning_rate": 4.170968148054891e-07, "logits/chosen": -0.20005278289318085, "logits/rejected": -0.4736247658729553, "logps/chosen": -460.35455322265625, "logps/rejected": -539.3721923828125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.4357563853263855, "rewards/margins": 17.046459197998047, "rewards/rejected": -17.48221778869629, "step": 2200 }, { "epoch": 0.75, "eval_logits/chosen": -0.5517213344573975, "eval_logits/rejected": -0.530022919178009, "eval_logps/chosen": -216.2812042236328, "eval_logps/rejected": -579.7398071289062, "eval_loss": 0.016572650521993637, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.1874610036611557, "eval_rewards/margins": 16.675050735473633, "eval_rewards/rejected": -16.487590789794922, "eval_runtime": 533.7822, "eval_samples_per_second": 17.798, "eval_steps_per_second": 0.556, "step": 2200 }, { "epoch": 0.75, "learning_rate": 4.164673297242855e-07, "logits/chosen": -0.2259330004453659, "logits/rejected": -0.5296998023986816, "logps/chosen": -378.78363037109375, "logps/rejected": -577.098876953125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.38857564330101013, "rewards/margins": 14.37199592590332, "rewards/rejected": -14.760571479797363, "step": 2210 }, { "epoch": 0.75, "learning_rate": 4.1583784464308193e-07, "logits/chosen": -0.6568074822425842, "logits/rejected": -0.478780597448349, "logps/chosen": -151.4872589111328, "logps/rejected": -507.4267578125, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 0.030637968331575394, "rewards/margins": 17.354795455932617, "rewards/rejected": -17.32415771484375, "step": 2220 }, { "epoch": 0.76, "learning_rate": 4.152083595618784e-07, "logits/chosen": -0.397937148809433, "logits/rejected": -0.5080692768096924, "logps/chosen": -162.84046936035156, "logps/rejected": -479.2415466308594, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1433585286140442, "rewards/margins": 17.875303268432617, "rewards/rejected": -18.018661499023438, "step": 2230 }, { "epoch": 0.76, "learning_rate": 4.145788744806748e-07, "logits/chosen": -0.4435577988624573, "logits/rejected": -0.40778645873069763, "logps/chosen": -168.5457305908203, "logps/rejected": -562.525146484375, "loss": 0.0053, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09170367568731308, "rewards/margins": 16.61309051513672, "rewards/rejected": -16.704792022705078, "step": 2240 }, { "epoch": 0.76, "learning_rate": 4.139493893994712e-07, "logits/chosen": -0.40800437331199646, "logits/rejected": -0.48417598009109497, "logps/chosen": -177.51470947265625, "logps/rejected": -433.54705810546875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -0.18716073036193848, "rewards/margins": 16.943470001220703, "rewards/rejected": -17.130630493164062, "step": 2250 }, { "epoch": 0.77, "learning_rate": 4.133199043182676e-07, "logits/chosen": -0.5191536545753479, "logits/rejected": -0.557326078414917, "logps/chosen": -173.1439971923828, "logps/rejected": -596.7061767578125, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.29786989092826843, "rewards/margins": 18.530162811279297, "rewards/rejected": -18.828031539916992, "step": 2260 }, { "epoch": 0.77, "learning_rate": 4.1269041923706404e-07, "logits/chosen": -0.4793340563774109, "logits/rejected": -0.6266626119613647, "logps/chosen": -271.226318359375, "logps/rejected": -544.0018920898438, "loss": 0.0081, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4012511372566223, "rewards/margins": 19.623273849487305, "rewards/rejected": -20.024526596069336, "step": 2270 }, { "epoch": 0.77, "learning_rate": 4.1206093415586047e-07, "logits/chosen": -0.5044962167739868, "logits/rejected": -0.5382771492004395, "logps/chosen": -173.5367889404297, "logps/rejected": -544.4332275390625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.25055167078971863, "rewards/margins": 17.29902458190918, "rewards/rejected": -17.549575805664062, "step": 2280 }, { "epoch": 0.78, "learning_rate": 4.1143144907465694e-07, "logits/chosen": -0.47572222352027893, "logits/rejected": -0.5923356413841248, "logps/chosen": -177.20077514648438, "logps/rejected": -587.7260131835938, "loss": 0.0209, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.44551897048950195, "rewards/margins": 15.825716972351074, "rewards/rejected": -16.271236419677734, "step": 2290 }, { "epoch": 0.78, "learning_rate": 4.1080196399345336e-07, "logits/chosen": -0.45693492889404297, "logits/rejected": -0.5454440116882324, "logps/chosen": -220.4252471923828, "logps/rejected": -617.509033203125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.27038222551345825, "rewards/margins": 17.768993377685547, "rewards/rejected": -18.039377212524414, "step": 2300 }, { "epoch": 0.78, "eval_logits/chosen": -0.5932157039642334, "eval_logits/rejected": -0.5945234894752502, "eval_logps/chosen": -223.0087127685547, "eval_logps/rejected": -605.9404907226562, "eval_loss": 0.016650959849357605, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": -0.4852873384952545, "eval_rewards/margins": 18.622373580932617, "eval_rewards/rejected": -19.10765838623047, "eval_runtime": 533.608, "eval_samples_per_second": 17.803, "eval_steps_per_second": 0.557, "step": 2300 }, { "epoch": 0.79, "learning_rate": 4.101724789122498e-07, "logits/chosen": -0.4647675156593323, "logits/rejected": -0.5477542281150818, "logps/chosen": -308.8323974609375, "logps/rejected": -588.047607421875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -0.391190767288208, "rewards/margins": 17.440502166748047, "rewards/rejected": -17.83169174194336, "step": 2310 }, { "epoch": 0.79, "learning_rate": 4.0954299383104616e-07, "logits/chosen": -0.382946640253067, "logits/rejected": -0.5210340023040771, "logps/chosen": -207.72262573242188, "logps/rejected": -498.48028564453125, "loss": 0.0221, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.38736557960510254, "rewards/margins": 14.896451950073242, "rewards/rejected": -15.283819198608398, "step": 2320 }, { "epoch": 0.79, "learning_rate": 4.089135087498426e-07, "logits/chosen": -0.2918635904788971, "logits/rejected": -0.4840494692325592, "logps/chosen": -388.977783203125, "logps/rejected": -541.8551025390625, "loss": 0.0077, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.32416194677352905, "rewards/margins": 18.305639266967773, "rewards/rejected": -18.62980079650879, "step": 2330 }, { "epoch": 0.8, "learning_rate": 4.08284023668639e-07, "logits/chosen": -0.44612568616867065, "logits/rejected": -0.5513128042221069, "logps/chosen": -220.37155151367188, "logps/rejected": -864.4749755859375, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.05022967979311943, "rewards/margins": 17.33795928955078, "rewards/rejected": -17.388187408447266, "step": 2340 }, { "epoch": 0.8, "learning_rate": 4.076545385874355e-07, "logits/chosen": -0.5279570817947388, "logits/rejected": -0.5410766005516052, "logps/chosen": -192.09152221679688, "logps/rejected": -484.39776611328125, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.5911061763763428, "rewards/margins": 18.594173431396484, "rewards/rejected": -19.185279846191406, "step": 2350 }, { "epoch": 0.8, "learning_rate": 4.070250535062319e-07, "logits/chosen": -0.49954843521118164, "logits/rejected": -0.5590623617172241, "logps/chosen": -162.03797912597656, "logps/rejected": -510.49658203125, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05663837119936943, "rewards/margins": 19.239727020263672, "rewards/rejected": -19.183086395263672, "step": 2360 }, { "epoch": 0.81, "learning_rate": 4.063955684250283e-07, "logits/chosen": -0.43743905425071716, "logits/rejected": -0.5494459271430969, "logps/chosen": -214.9569091796875, "logps/rejected": -676.9227294921875, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5420457720756531, "rewards/margins": 16.931255340576172, "rewards/rejected": -17.47330093383789, "step": 2370 }, { "epoch": 0.81, "learning_rate": 4.0576608334382475e-07, "logits/chosen": -0.40094536542892456, "logits/rejected": -0.5281438827514648, "logps/chosen": -232.4851531982422, "logps/rejected": -516.7774047851562, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.133186936378479, "rewards/margins": 18.702699661254883, "rewards/rejected": -18.835887908935547, "step": 2380 }, { "epoch": 0.81, "learning_rate": 4.051365982626211e-07, "logits/chosen": -0.5618555545806885, "logits/rejected": -0.5724986791610718, "logps/chosen": -228.51608276367188, "logps/rejected": -681.4666748046875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.37716931104660034, "rewards/margins": 18.319225311279297, "rewards/rejected": -18.696395874023438, "step": 2390 }, { "epoch": 0.82, "learning_rate": 4.0450711318141754e-07, "logits/chosen": -0.6639199256896973, "logits/rejected": -0.44996851682662964, "logps/chosen": -177.58865356445312, "logps/rejected": -589.6808471679688, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.2697039842605591, "rewards/margins": 16.88338851928711, "rewards/rejected": -17.153091430664062, "step": 2400 }, { "epoch": 0.82, "eval_logits/chosen": -0.5695017576217651, "eval_logits/rejected": -0.5528296828269958, "eval_logps/chosen": -219.4221954345703, "eval_logps/rejected": -608.4083251953125, "eval_loss": 0.01481403224170208, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.1266351044178009, "eval_rewards/margins": 19.22780990600586, "eval_rewards/rejected": -19.354446411132812, "eval_runtime": 532.7641, "eval_samples_per_second": 17.832, "eval_steps_per_second": 0.557, "step": 2400 }, { "epoch": 0.82, "learning_rate": 4.03877628100214e-07, "logits/chosen": -0.4979880452156067, "logits/rejected": -0.5355257987976074, "logps/chosen": -173.719970703125, "logps/rejected": -508.90716552734375, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1737275868654251, "rewards/margins": 18.429990768432617, "rewards/rejected": -18.60371971130371, "step": 2410 }, { "epoch": 0.82, "learning_rate": 4.0324814301901044e-07, "logits/chosen": -0.4027596414089203, "logits/rejected": -0.5378649830818176, "logps/chosen": -258.9962158203125, "logps/rejected": -482.55316162109375, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.470703125, "rewards/margins": 15.79443073272705, "rewards/rejected": -16.265132904052734, "step": 2420 }, { "epoch": 0.83, "learning_rate": 4.0261865793780686e-07, "logits/chosen": -0.422789990901947, "logits/rejected": -0.49099642038345337, "logps/chosen": -228.1546173095703, "logps/rejected": -608.689697265625, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.39902716875076294, "rewards/margins": 16.9798583984375, "rewards/rejected": -17.378887176513672, "step": 2430 }, { "epoch": 0.83, "learning_rate": 4.019891728566033e-07, "logits/chosen": -0.41036224365234375, "logits/rejected": -0.5527801513671875, "logps/chosen": -229.97030639648438, "logps/rejected": -709.0821533203125, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.10456440597772598, "rewards/margins": 18.060453414916992, "rewards/rejected": -18.165019989013672, "step": 2440 }, { "epoch": 0.83, "learning_rate": 4.013596877753997e-07, "logits/chosen": -0.5690553784370422, "logits/rejected": -0.5003184080123901, "logps/chosen": -165.8610382080078, "logps/rejected": -659.7174072265625, "loss": 0.0083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6624375581741333, "rewards/margins": 19.461458206176758, "rewards/rejected": -20.12389373779297, "step": 2450 }, { "epoch": 0.84, "learning_rate": 4.0073020269419613e-07, "logits/chosen": -0.4980100691318512, "logits/rejected": -0.4979858994483948, "logps/chosen": -162.407958984375, "logps/rejected": -608.7325439453125, "loss": 0.0091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.38793838024139404, "rewards/margins": 18.551870346069336, "rewards/rejected": -18.939809799194336, "step": 2460 }, { "epoch": 0.84, "learning_rate": 4.0010071761299255e-07, "logits/chosen": -0.4160676598548889, "logits/rejected": -0.5220437049865723, "logps/chosen": -229.43124389648438, "logps/rejected": -573.2512817382812, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -0.30640894174575806, "rewards/margins": 17.892559051513672, "rewards/rejected": -18.198970794677734, "step": 2470 }, { "epoch": 0.84, "learning_rate": 3.99471232531789e-07, "logits/chosen": -0.4635310173034668, "logits/rejected": -0.5817512273788452, "logps/chosen": -267.98748779296875, "logps/rejected": -551.7185668945312, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6688281893730164, "rewards/margins": 17.886335372924805, "rewards/rejected": -18.555164337158203, "step": 2480 }, { "epoch": 0.85, "learning_rate": 3.988417474505854e-07, "logits/chosen": -0.36993610858917236, "logits/rejected": -0.5304928421974182, "logps/chosen": -208.63705444335938, "logps/rejected": -465.3739318847656, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.6690204739570618, "rewards/margins": 17.467199325561523, "rewards/rejected": -18.136219024658203, "step": 2490 }, { "epoch": 0.85, "learning_rate": 3.982122623693818e-07, "logits/chosen": -0.4395477771759033, "logits/rejected": -0.5985128283500671, "logps/chosen": -271.6374816894531, "logps/rejected": -499.2342834472656, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -0.5782634019851685, "rewards/margins": 20.47756576538086, "rewards/rejected": -21.055830001831055, "step": 2500 }, { "epoch": 0.85, "eval_logits/chosen": -0.6223477721214294, "eval_logits/rejected": -0.6317439079284668, "eval_logps/chosen": -224.68197631835938, "eval_logps/rejected": -625.253173828125, "eval_loss": 0.02767534740269184, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.6526150107383728, "eval_rewards/margins": 20.386310577392578, "eval_rewards/rejected": -21.038925170898438, "eval_runtime": 532.8632, "eval_samples_per_second": 17.828, "eval_steps_per_second": 0.557, "step": 2500 }, { "epoch": 0.85, "learning_rate": 3.9758277728817824e-07, "logits/chosen": -0.3795573115348816, "logits/rejected": -0.5880419015884399, "logps/chosen": -249.85400390625, "logps/rejected": -591.1525268554688, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -0.8366333246231079, "rewards/margins": 18.46705436706543, "rewards/rejected": -19.303688049316406, "step": 2510 }, { "epoch": 0.86, "learning_rate": 3.9695329220697467e-07, "logits/chosen": -0.5687035918235779, "logits/rejected": -0.6019984483718872, "logps/chosen": -164.0320587158203, "logps/rejected": -779.9796142578125, "loss": 0.016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9198579788208008, "rewards/margins": 18.67759132385254, "rewards/rejected": -19.597448348999023, "step": 2520 }, { "epoch": 0.86, "learning_rate": 3.9632380712577114e-07, "logits/chosen": -0.5434810519218445, "logits/rejected": -0.49595022201538086, "logps/chosen": -222.5576934814453, "logps/rejected": -573.5501708984375, "loss": 0.0071, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5597999691963196, "rewards/margins": 16.69998550415039, "rewards/rejected": -17.259784698486328, "step": 2530 }, { "epoch": 0.86, "learning_rate": 3.9569432204456756e-07, "logits/chosen": -0.4817582666873932, "logits/rejected": -0.5140596628189087, "logps/chosen": -170.65225219726562, "logps/rejected": -494.15142822265625, "loss": 0.0197, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8749632835388184, "rewards/margins": 17.499698638916016, "rewards/rejected": -18.374662399291992, "step": 2540 }, { "epoch": 0.87, "learning_rate": 3.9506483696336393e-07, "logits/chosen": -0.525627076625824, "logits/rejected": -0.670019268989563, "logps/chosen": -219.9454803466797, "logps/rejected": -649.2623901367188, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.9405611753463745, "rewards/margins": 23.287403106689453, "rewards/rejected": -24.227962493896484, "step": 2550 }, { "epoch": 0.87, "learning_rate": 3.9443535188216036e-07, "logits/chosen": -0.6082872152328491, "logits/rejected": -0.577377438545227, "logps/chosen": -169.99879455566406, "logps/rejected": -718.3258056640625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.8879270553588867, "rewards/margins": 22.769832611083984, "rewards/rejected": -23.657758712768555, "step": 2560 }, { "epoch": 0.87, "learning_rate": 3.938058668009568e-07, "logits/chosen": -0.4572966694831848, "logits/rejected": -0.6201387643814087, "logps/chosen": -247.07614135742188, "logps/rejected": -680.2173461914062, "loss": 0.0082, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4929053783416748, "rewards/margins": 21.74375343322754, "rewards/rejected": -23.236658096313477, "step": 2570 }, { "epoch": 0.88, "learning_rate": 3.931763817197532e-07, "logits/chosen": -0.34698471426963806, "logits/rejected": -0.5512791872024536, "logps/chosen": -220.0507354736328, "logps/rejected": -465.011474609375, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": -0.6938758492469788, "rewards/margins": 20.746440887451172, "rewards/rejected": -21.440319061279297, "step": 2580 }, { "epoch": 0.88, "learning_rate": 3.925468966385497e-07, "logits/chosen": -0.3836641013622284, "logits/rejected": -0.5139411687850952, "logps/chosen": -280.2154235839844, "logps/rejected": -453.62408447265625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.16283278167247772, "rewards/margins": 20.713857650756836, "rewards/rejected": -20.876689910888672, "step": 2590 }, { "epoch": 0.88, "learning_rate": 3.919174115573461e-07, "logits/chosen": -0.5401272773742676, "logits/rejected": -0.620324969291687, "logps/chosen": -166.4394989013672, "logps/rejected": -635.3299560546875, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -0.4331815838813782, "rewards/margins": 21.180192947387695, "rewards/rejected": -21.613374710083008, "step": 2600 }, { "epoch": 0.88, "eval_logits/chosen": -0.6147525310516357, "eval_logits/rejected": -0.6146714687347412, "eval_logps/chosen": -224.66250610351562, "eval_logps/rejected": -635.2158203125, "eval_loss": 0.015801647678017616, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.6506679058074951, "eval_rewards/margins": 21.38451385498047, "eval_rewards/rejected": -22.035186767578125, "eval_runtime": 533.5271, "eval_samples_per_second": 17.806, "eval_steps_per_second": 0.557, "step": 2600 }, { "epoch": 0.89, "learning_rate": 3.912879264761425e-07, "logits/chosen": -0.5585245490074158, "logits/rejected": -0.5902181267738342, "logps/chosen": -231.0035858154297, "logps/rejected": -645.4872436523438, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.40148717164993286, "rewards/margins": 18.81955337524414, "rewards/rejected": -19.221038818359375, "step": 2610 }, { "epoch": 0.89, "learning_rate": 3.906584413949389e-07, "logits/chosen": -0.4793068468570709, "logits/rejected": -0.6282252669334412, "logps/chosen": -311.48284912109375, "logps/rejected": -494.3038024902344, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.0386672019958496, "rewards/margins": 16.050941467285156, "rewards/rejected": -17.08960723876953, "step": 2620 }, { "epoch": 0.89, "learning_rate": 3.900289563137353e-07, "logits/chosen": -0.4950196146965027, "logits/rejected": -0.6424863338470459, "logps/chosen": -191.91009521484375, "logps/rejected": -663.8267211914062, "loss": 0.0064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6827810406684875, "rewards/margins": 22.277069091796875, "rewards/rejected": -22.959850311279297, "step": 2630 }, { "epoch": 0.9, "learning_rate": 3.8939947123253174e-07, "logits/chosen": -0.4640139937400818, "logits/rejected": -0.6179546117782593, "logps/chosen": -214.16943359375, "logps/rejected": -527.736328125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.092980146408081, "rewards/margins": 18.385467529296875, "rewards/rejected": -19.47844886779785, "step": 2640 }, { "epoch": 0.9, "learning_rate": 3.887699861513282e-07, "logits/chosen": -0.5161629915237427, "logits/rejected": -0.6108736395835876, "logps/chosen": -163.3085174560547, "logps/rejected": -492.26708984375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.9609187841415405, "rewards/margins": 22.117177963256836, "rewards/rejected": -23.07809829711914, "step": 2650 }, { "epoch": 0.9, "learning_rate": 3.8814050107012464e-07, "logits/chosen": -0.5504695177078247, "logits/rejected": -0.6493596434593201, "logps/chosen": -231.51113891601562, "logps/rejected": -506.42919921875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.9554810523986816, "rewards/margins": 23.63483428955078, "rewards/rejected": -24.590314865112305, "step": 2660 }, { "epoch": 0.91, "learning_rate": 3.8751101598892106e-07, "logits/chosen": -0.70225989818573, "logits/rejected": -0.574167013168335, "logps/chosen": -178.37973022460938, "logps/rejected": -746.3311767578125, "loss": 0.0032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8485128283500671, "rewards/margins": 22.192935943603516, "rewards/rejected": -23.04144859313965, "step": 2670 }, { "epoch": 0.91, "learning_rate": 3.868815309077175e-07, "logits/chosen": -0.5552138686180115, "logits/rejected": -0.4812951982021332, "logps/chosen": -231.4610595703125, "logps/rejected": -806.1622314453125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -0.7693663835525513, "rewards/margins": 24.059215545654297, "rewards/rejected": -24.828582763671875, "step": 2680 }, { "epoch": 0.91, "learning_rate": 3.862520458265139e-07, "logits/chosen": -0.5531286597251892, "logits/rejected": -0.6560055017471313, "logps/chosen": -264.0589294433594, "logps/rejected": -800.667724609375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.6953937411308289, "rewards/margins": 22.391925811767578, "rewards/rejected": -23.087322235107422, "step": 2690 }, { "epoch": 0.92, "learning_rate": 3.856225607453103e-07, "logits/chosen": -0.6973247528076172, "logits/rejected": -0.6188694834709167, "logps/chosen": -143.8126220703125, "logps/rejected": -604.6707763671875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.7418287396430969, "rewards/margins": 27.097768783569336, "rewards/rejected": -27.839599609375, "step": 2700 }, { "epoch": 0.92, "eval_logits/chosen": -0.6379349827766418, "eval_logits/rejected": -0.64005446434021, "eval_logps/chosen": -225.61129760742188, "eval_logps/rejected": -640.5007934570312, "eval_loss": 0.014779478311538696, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.7455464601516724, "eval_rewards/margins": 21.818138122558594, "eval_rewards/rejected": -22.563684463500977, "eval_runtime": 534.5041, "eval_samples_per_second": 17.773, "eval_steps_per_second": 0.556, "step": 2700 }, { "epoch": 0.92, "learning_rate": 3.8499307566410675e-07, "logits/chosen": -0.5456520318984985, "logits/rejected": -0.6197179555892944, "logps/chosen": -174.6804656982422, "logps/rejected": -635.2442016601562, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -1.146396279335022, "rewards/margins": 26.3745059967041, "rewards/rejected": -27.52090072631836, "step": 2710 }, { "epoch": 0.92, "learning_rate": 3.843635905829032e-07, "logits/chosen": -0.4810015559196472, "logits/rejected": -0.5615791082382202, "logps/chosen": -323.13885498046875, "logps/rejected": -728.2703857421875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.554740309715271, "rewards/margins": 22.214792251586914, "rewards/rejected": -22.769533157348633, "step": 2720 }, { "epoch": 0.93, "learning_rate": 3.837341055016996e-07, "logits/chosen": -0.5687593817710876, "logits/rejected": -0.6046972274780273, "logps/chosen": -173.86666870117188, "logps/rejected": -689.3641357421875, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -0.6893017888069153, "rewards/margins": 18.58002281188965, "rewards/rejected": -19.26932716369629, "step": 2730 }, { "epoch": 0.93, "learning_rate": 3.83104620420496e-07, "logits/chosen": -0.4767850339412689, "logits/rejected": -0.5928935408592224, "logps/chosen": -245.018310546875, "logps/rejected": -864.3717041015625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.9025334119796753, "rewards/margins": 19.125226974487305, "rewards/rejected": -20.027761459350586, "step": 2740 }, { "epoch": 0.93, "learning_rate": 3.8247513533929244e-07, "logits/chosen": -0.579264223575592, "logits/rejected": -0.596036970615387, "logps/chosen": -165.98851013183594, "logps/rejected": -730.7996215820312, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -0.46008139848709106, "rewards/margins": 20.51328468322754, "rewards/rejected": -20.973363876342773, "step": 2750 }, { "epoch": 0.94, "learning_rate": 3.8184565025808887e-07, "logits/chosen": -0.4826219975948334, "logits/rejected": -0.6288530230522156, "logps/chosen": -247.00991821289062, "logps/rejected": -663.8226318359375, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -0.5047249794006348, "rewards/margins": 19.6455078125, "rewards/rejected": -20.15023422241211, "step": 2760 }, { "epoch": 0.94, "learning_rate": 3.8121616517688534e-07, "logits/chosen": -0.631325364112854, "logits/rejected": -0.5907931327819824, "logps/chosen": -153.14556884765625, "logps/rejected": -653.4635620117188, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -0.2594422996044159, "rewards/margins": 22.305164337158203, "rewards/rejected": -22.564605712890625, "step": 2770 }, { "epoch": 0.94, "learning_rate": 3.805866800956817e-07, "logits/chosen": -0.5643798112869263, "logits/rejected": -0.5898051857948303, "logps/chosen": -169.017578125, "logps/rejected": -425.62091064453125, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -0.5756908655166626, "rewards/margins": 20.893468856811523, "rewards/rejected": -21.469160079956055, "step": 2780 }, { "epoch": 0.95, "learning_rate": 3.7995719501447813e-07, "logits/chosen": -0.5480406880378723, "logits/rejected": -0.5817909240722656, "logps/chosen": -181.90447998046875, "logps/rejected": -576.1828002929688, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.14275126159191132, "rewards/margins": 21.613452911376953, "rewards/rejected": -21.756202697753906, "step": 2790 }, { "epoch": 0.95, "learning_rate": 3.7932770993327456e-07, "logits/chosen": -0.48949193954467773, "logits/rejected": -0.6195014715194702, "logps/chosen": -181.42752075195312, "logps/rejected": -804.3336791992188, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -0.5842114686965942, "rewards/margins": 19.86795997619629, "rewards/rejected": -20.45216941833496, "step": 2800 }, { "epoch": 0.95, "eval_logits/chosen": -0.6337935924530029, "eval_logits/rejected": -0.6579821705818176, "eval_logps/chosen": -224.3348846435547, "eval_logps/rejected": -632.451171875, "eval_loss": 0.042935892939567566, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.6179047226905823, "eval_rewards/margins": 21.140817642211914, "eval_rewards/rejected": -21.758724212646484, "eval_runtime": 534.9004, "eval_samples_per_second": 17.76, "eval_steps_per_second": 0.555, "step": 2800 }, { "epoch": 0.96, "learning_rate": 3.78698224852071e-07, "logits/chosen": -0.5576199293136597, "logits/rejected": -0.655912458896637, "logps/chosen": -158.83779907226562, "logps/rejected": -478.88555908203125, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.6406091451644897, "rewards/margins": 21.203126907348633, "rewards/rejected": -21.843734741210938, "step": 2810 }, { "epoch": 0.96, "learning_rate": 3.780687397708674e-07, "logits/chosen": -0.5711307525634766, "logits/rejected": -0.5970210433006287, "logps/chosen": -183.6444549560547, "logps/rejected": -446.1632385253906, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.610636830329895, "rewards/margins": 17.090158462524414, "rewards/rejected": -17.700796127319336, "step": 2820 }, { "epoch": 0.96, "learning_rate": 3.774392546896638e-07, "logits/chosen": -0.4669331908226013, "logits/rejected": -0.5918110609054565, "logps/chosen": -242.68533325195312, "logps/rejected": -742.5610961914062, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -0.8708280324935913, "rewards/margins": 21.00680923461914, "rewards/rejected": -21.877635955810547, "step": 2830 }, { "epoch": 0.97, "learning_rate": 3.768097696084603e-07, "logits/chosen": -0.524229109287262, "logits/rejected": -0.5910091996192932, "logps/chosen": -211.4125518798828, "logps/rejected": -692.6990356445312, "loss": 0.0098, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.37801894545555115, "rewards/margins": 23.56831932067871, "rewards/rejected": -23.946338653564453, "step": 2840 }, { "epoch": 0.97, "learning_rate": 3.761802845272567e-07, "logits/chosen": -0.5863999128341675, "logits/rejected": -0.6406316161155701, "logps/chosen": -220.8341827392578, "logps/rejected": -583.6329345703125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -0.30830901861190796, "rewards/margins": 23.575008392333984, "rewards/rejected": -23.883319854736328, "step": 2850 }, { "epoch": 0.97, "learning_rate": 3.755507994460531e-07, "logits/chosen": -0.6528455018997192, "logits/rejected": -0.5631856918334961, "logps/chosen": -182.03982543945312, "logps/rejected": -534.8107299804688, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.9082008600234985, "rewards/margins": 19.31280517578125, "rewards/rejected": -20.221006393432617, "step": 2860 }, { "epoch": 0.98, "learning_rate": 3.749213143648495e-07, "logits/chosen": -0.5226881504058838, "logits/rejected": -0.6268125772476196, "logps/chosen": -293.3172607421875, "logps/rejected": -632.835693359375, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3401877284049988, "rewards/margins": 16.875564575195312, "rewards/rejected": -17.215749740600586, "step": 2870 }, { "epoch": 0.98, "learning_rate": 3.7429182928364594e-07, "logits/chosen": -0.6680216193199158, "logits/rejected": -0.5962798595428467, "logps/chosen": -225.3952178955078, "logps/rejected": -817.5527954101562, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.7935547828674316, "rewards/margins": 17.373668670654297, "rewards/rejected": -18.167226791381836, "step": 2880 }, { "epoch": 0.98, "learning_rate": 3.7366234420244236e-07, "logits/chosen": -0.3468959927558899, "logits/rejected": -0.6473889350891113, "logps/chosen": -358.51226806640625, "logps/rejected": -690.7733154296875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.6796993613243103, "rewards/margins": 20.785585403442383, "rewards/rejected": -21.465282440185547, "step": 2890 }, { "epoch": 0.99, "learning_rate": 3.7303285912123884e-07, "logits/chosen": -0.528616189956665, "logits/rejected": -0.6073054075241089, "logps/chosen": -176.47552490234375, "logps/rejected": -597.8533935546875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.43060389161109924, "rewards/margins": 22.78934097290039, "rewards/rejected": -23.21994972229004, "step": 2900 }, { "epoch": 0.99, "eval_logits/chosen": -0.6361685991287231, "eval_logits/rejected": -0.6473199129104614, "eval_logps/chosen": -221.24884033203125, "eval_logps/rejected": -628.4747924804688, "eval_loss": 0.0451551154255867, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.30930212140083313, "eval_rewards/margins": 21.05178451538086, "eval_rewards/rejected": -21.3610897064209, "eval_runtime": 536.0909, "eval_samples_per_second": 17.721, "eval_steps_per_second": 0.554, "step": 2900 }, { "epoch": 0.99, "learning_rate": 3.7240337404003526e-07, "logits/chosen": -0.45460978150367737, "logits/rejected": -0.6425790190696716, "logps/chosen": -178.80308532714844, "logps/rejected": -714.7589111328125, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -0.3009614646434784, "rewards/margins": 20.526132583618164, "rewards/rejected": -20.82709312438965, "step": 2910 }, { "epoch": 0.99, "learning_rate": 3.717738889588317e-07, "logits/chosen": -0.4810408651828766, "logits/rejected": -0.7286826372146606, "logps/chosen": -189.9222412109375, "logps/rejected": -756.8983764648438, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0682488679885864, "rewards/margins": 22.624393463134766, "rewards/rejected": -23.69264030456543, "step": 2920 }, { "epoch": 1.0, "learning_rate": 3.7114440387762805e-07, "logits/chosen": -0.509602427482605, "logits/rejected": -0.6578050851821899, "logps/chosen": -170.62266540527344, "logps/rejected": -454.8040466308594, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.7977365255355835, "rewards/margins": 19.150104522705078, "rewards/rejected": -19.94784164428711, "step": 2930 }, { "epoch": 1.0, "learning_rate": 3.705149187964245e-07, "logits/chosen": -0.41446733474731445, "logits/rejected": -0.6176181435585022, "logps/chosen": -245.0283203125, "logps/rejected": -563.2479248046875, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -0.7349718809127808, "rewards/margins": 19.123676300048828, "rewards/rejected": -19.8586483001709, "step": 2940 }, { "epoch": 1.0, "learning_rate": 3.698854337152209e-07, "logits/chosen": -0.3989887237548828, "logits/rejected": -0.5773409008979797, "logps/chosen": -160.10055541992188, "logps/rejected": -546.182373046875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.2707807421684265, "rewards/margins": 20.404319763183594, "rewards/rejected": -20.67510223388672, "step": 2950 }, { "epoch": 1.01, "learning_rate": 3.692559486340174e-07, "logits/chosen": -0.5812191367149353, "logits/rejected": -0.6470521688461304, "logps/chosen": -215.756103515625, "logps/rejected": -929.5314331054688, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -0.6215667128562927, "rewards/margins": 21.673276901245117, "rewards/rejected": -22.294841766357422, "step": 2960 }, { "epoch": 1.01, "learning_rate": 3.686264635528138e-07, "logits/chosen": -0.7138069868087769, "logits/rejected": -0.6058154106140137, "logps/chosen": -156.8502197265625, "logps/rejected": -378.6007995605469, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.6348203420639038, "rewards/margins": 20.122272491455078, "rewards/rejected": -20.757091522216797, "step": 2970 }, { "epoch": 1.01, "learning_rate": 3.679969784716102e-07, "logits/chosen": -0.6421266198158264, "logits/rejected": -0.6446320414543152, "logps/chosen": -155.9070281982422, "logps/rejected": -528.64306640625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -0.39291244745254517, "rewards/margins": 19.31147003173828, "rewards/rejected": -19.704381942749023, "step": 2980 }, { "epoch": 1.02, "learning_rate": 3.6736749339040664e-07, "logits/chosen": -0.5348082780838013, "logits/rejected": -0.6018053293228149, "logps/chosen": -231.8517608642578, "logps/rejected": -657.16650390625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.3462441563606262, "rewards/margins": 19.6461181640625, "rewards/rejected": -19.99236297607422, "step": 2990 }, { "epoch": 1.02, "learning_rate": 3.6673800830920307e-07, "logits/chosen": -0.40881386399269104, "logits/rejected": -0.5172764658927917, "logps/chosen": -212.2381134033203, "logps/rejected": -566.4102172851562, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.256114661693573, "rewards/margins": 16.861278533935547, "rewards/rejected": -17.117395401000977, "step": 3000 }, { "epoch": 1.02, "eval_logits/chosen": -0.6499646306037903, "eval_logits/rejected": -0.6812382936477661, "eval_logps/chosen": -222.4543914794922, "eval_logps/rejected": -621.048828125, "eval_loss": 0.039933547377586365, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.4298532009124756, "eval_rewards/margins": 20.188640594482422, "eval_rewards/rejected": -20.618494033813477, "eval_runtime": 535.9163, "eval_samples_per_second": 17.727, "eval_steps_per_second": 0.554, "step": 3000 }, { "epoch": 1.02, "learning_rate": 3.6610852322799943e-07, "logits/chosen": -0.6252007484436035, "logits/rejected": -0.577701985836029, "logps/chosen": -264.13543701171875, "logps/rejected": -608.8316650390625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.46024736762046814, "rewards/margins": 19.29156494140625, "rewards/rejected": -19.751811981201172, "step": 3010 }, { "epoch": 1.03, "learning_rate": 3.654790381467959e-07, "logits/chosen": -0.5669766664505005, "logits/rejected": -0.6807463765144348, "logps/chosen": -166.41195678710938, "logps/rejected": -441.8485412597656, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.6847031712532043, "rewards/margins": 19.156993865966797, "rewards/rejected": -19.841693878173828, "step": 3020 }, { "epoch": 1.03, "learning_rate": 3.6484955306559233e-07, "logits/chosen": -0.47833046317100525, "logits/rejected": -0.6191781163215637, "logps/chosen": -287.1867370605469, "logps/rejected": -503.5406188964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.392835795879364, "rewards/margins": 18.89917755126953, "rewards/rejected": -19.29201316833496, "step": 3030 }, { "epoch": 1.03, "learning_rate": 3.6422006798438876e-07, "logits/chosen": -0.5957221984863281, "logits/rejected": -0.6484453082084656, "logps/chosen": -205.3809356689453, "logps/rejected": -599.25634765625, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": -0.6915081739425659, "rewards/margins": 20.328153610229492, "rewards/rejected": -21.019662857055664, "step": 3040 }, { "epoch": 1.04, "learning_rate": 3.635905829031852e-07, "logits/chosen": -0.5771939158439636, "logits/rejected": -0.6921709775924683, "logps/chosen": -286.1935729980469, "logps/rejected": -489.37249755859375, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -0.38978680968284607, "rewards/margins": 20.511150360107422, "rewards/rejected": -20.900938034057617, "step": 3050 }, { "epoch": 1.04, "learning_rate": 3.629610978219816e-07, "logits/chosen": -0.5401151180267334, "logits/rejected": -0.6881545186042786, "logps/chosen": -183.5184783935547, "logps/rejected": -768.9088745117188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.7985284924507141, "rewards/margins": 19.598268508911133, "rewards/rejected": -20.396799087524414, "step": 3060 }, { "epoch": 1.04, "learning_rate": 3.62331612740778e-07, "logits/chosen": -0.5568596720695496, "logits/rejected": -0.6767106652259827, "logps/chosen": -284.1183776855469, "logps/rejected": -630.967041015625, "loss": 0.0039, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3974451720714569, "rewards/margins": 22.21986961364746, "rewards/rejected": -22.617313385009766, "step": 3070 }, { "epoch": 1.05, "learning_rate": 3.617021276595745e-07, "logits/chosen": -0.4026394784450531, "logits/rejected": -0.6144591569900513, "logps/chosen": -384.13861083984375, "logps/rejected": -526.8475952148438, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.8550363779067993, "rewards/margins": 18.697277069091797, "rewards/rejected": -19.552310943603516, "step": 3080 }, { "epoch": 1.05, "learning_rate": 3.6107264257837087e-07, "logits/chosen": -0.6986773610115051, "logits/rejected": -0.6541129350662231, "logps/chosen": -224.229248046875, "logps/rejected": -722.6557006835938, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.7487368583679199, "rewards/margins": 22.25094223022461, "rewards/rejected": -22.999679565429688, "step": 3090 }, { "epoch": 1.05, "learning_rate": 3.604431574971673e-07, "logits/chosen": -0.4346039295196533, "logits/rejected": -0.5835973620414734, "logps/chosen": -225.5347900390625, "logps/rejected": -592.0372924804688, "loss": 0.1239, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.30401450395584106, "rewards/margins": 16.633136749267578, "rewards/rejected": -16.937150955200195, "step": 3100 }, { "epoch": 1.05, "eval_logits/chosen": -0.6327572464942932, "eval_logits/rejected": -0.661234974861145, "eval_logps/chosen": -222.31198120117188, "eval_logps/rejected": -631.3914794921875, "eval_loss": 0.009817845188081264, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.4156162142753601, "eval_rewards/margins": 21.237140655517578, "eval_rewards/rejected": -21.652755737304688, "eval_runtime": 535.9325, "eval_samples_per_second": 17.726, "eval_steps_per_second": 0.554, "step": 3100 }, { "epoch": 1.06, "learning_rate": 3.598136724159637e-07, "logits/chosen": -0.4594550132751465, "logits/rejected": -0.6480199694633484, "logps/chosen": -274.86480712890625, "logps/rejected": -520.8260498046875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4669904112815857, "rewards/margins": 24.77928352355957, "rewards/rejected": -25.24627113342285, "step": 3110 }, { "epoch": 1.06, "learning_rate": 3.5918418733476014e-07, "logits/chosen": -0.5901859402656555, "logits/rejected": -0.6077349185943604, "logps/chosen": -282.0781555175781, "logps/rejected": -610.4339599609375, "loss": 0.0728, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07001875340938568, "rewards/margins": 22.44624900817871, "rewards/rejected": -22.516265869140625, "step": 3120 }, { "epoch": 1.06, "learning_rate": 3.5855470225355656e-07, "logits/chosen": -0.7106374502182007, "logits/rejected": -0.6897737979888916, "logps/chosen": -221.6403045654297, "logps/rejected": -618.2293701171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.5246185064315796, "rewards/margins": 20.13018798828125, "rewards/rejected": -20.654804229736328, "step": 3130 }, { "epoch": 1.07, "learning_rate": 3.5792521717235304e-07, "logits/chosen": -0.5421442985534668, "logits/rejected": -0.6039215326309204, "logps/chosen": -228.9363555908203, "logps/rejected": -517.756591796875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -0.7406402826309204, "rewards/margins": 19.58643341064453, "rewards/rejected": -20.327075958251953, "step": 3140 }, { "epoch": 1.07, "learning_rate": 3.5729573209114946e-07, "logits/chosen": -0.5481891632080078, "logits/rejected": -0.5641008019447327, "logps/chosen": -163.91046142578125, "logps/rejected": -580.9547119140625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.7606403827667236, "rewards/margins": 18.112018585205078, "rewards/rejected": -18.872661590576172, "step": 3150 }, { "epoch": 1.07, "learning_rate": 3.5666624700994583e-07, "logits/chosen": -0.5734115839004517, "logits/rejected": -0.6029922366142273, "logps/chosen": -166.51504516601562, "logps/rejected": -921.9576416015625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.18629872798919678, "rewards/margins": 23.86395835876465, "rewards/rejected": -24.050256729125977, "step": 3160 }, { "epoch": 1.08, "learning_rate": 3.5603676192874225e-07, "logits/chosen": -0.6830928325653076, "logits/rejected": -0.5797451138496399, "logps/chosen": -152.14706420898438, "logps/rejected": -492.1962890625, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -0.12412633001804352, "rewards/margins": 18.959190368652344, "rewards/rejected": -19.083316802978516, "step": 3170 }, { "epoch": 1.08, "learning_rate": 3.554072768475387e-07, "logits/chosen": -0.5152510404586792, "logits/rejected": -0.6227437853813171, "logps/chosen": -295.2653503417969, "logps/rejected": -576.8563232421875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.0023182511795312166, "rewards/margins": 19.405780792236328, "rewards/rejected": -19.408100128173828, "step": 3180 }, { "epoch": 1.08, "learning_rate": 3.547777917663351e-07, "logits/chosen": -0.5957705974578857, "logits/rejected": -0.5866124033927917, "logps/chosen": -212.3314208984375, "logps/rejected": -725.8005981445312, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5937275290489197, "rewards/margins": 23.575305938720703, "rewards/rejected": -24.169034957885742, "step": 3190 }, { "epoch": 1.09, "learning_rate": 3.5414830668513157e-07, "logits/chosen": -0.538429319858551, "logits/rejected": -0.5749589204788208, "logps/chosen": -153.5274200439453, "logps/rejected": -554.267578125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.2545836865901947, "rewards/margins": 24.084218978881836, "rewards/rejected": -24.338804244995117, "step": 3200 }, { "epoch": 1.09, "eval_logits/chosen": -0.6310383677482605, "eval_logits/rejected": -0.6459502577781677, "eval_logps/chosen": -222.9790802001953, "eval_logps/rejected": -656.2341918945312, "eval_loss": 0.004120314959436655, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.48232755064964294, "eval_rewards/margins": 23.654691696166992, "eval_rewards/rejected": -24.13701820373535, "eval_runtime": 535.1387, "eval_samples_per_second": 17.752, "eval_steps_per_second": 0.555, "step": 3200 }, { "epoch": 1.09, "learning_rate": 3.53518821603928e-07, "logits/chosen": -0.4803314805030823, "logits/rejected": -0.6297375559806824, "logps/chosen": -174.4329071044922, "logps/rejected": -577.9775390625, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.836286187171936, "rewards/margins": 25.589202880859375, "rewards/rejected": -26.425487518310547, "step": 3210 }, { "epoch": 1.09, "learning_rate": 3.528893365227244e-07, "logits/chosen": -0.48449355363845825, "logits/rejected": -0.6169676780700684, "logps/chosen": -310.72418212890625, "logps/rejected": -569.453125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.4080580174922943, "rewards/margins": 22.381479263305664, "rewards/rejected": -22.789535522460938, "step": 3220 }, { "epoch": 1.1, "learning_rate": 3.5225985144152084e-07, "logits/chosen": -0.6881300210952759, "logits/rejected": -0.6696790456771851, "logps/chosen": -164.5923614501953, "logps/rejected": -581.3643798828125, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -0.5378624796867371, "rewards/margins": 27.030481338500977, "rewards/rejected": -27.56833839416504, "step": 3230 }, { "epoch": 1.1, "learning_rate": 3.516303663603172e-07, "logits/chosen": -0.33399826288223267, "logits/rejected": -0.5630447268486023, "logps/chosen": -237.5459747314453, "logps/rejected": -616.1966552734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5542330741882324, "rewards/margins": 23.228235244750977, "rewards/rejected": -23.782466888427734, "step": 3240 }, { "epoch": 1.1, "learning_rate": 3.5100088127911363e-07, "logits/chosen": -0.5336470603942871, "logits/rejected": -0.5921178460121155, "logps/chosen": -278.57635498046875, "logps/rejected": -573.6995849609375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4158684313297272, "rewards/margins": 27.560577392578125, "rewards/rejected": -27.976444244384766, "step": 3250 }, { "epoch": 1.11, "learning_rate": 3.503713961979101e-07, "logits/chosen": -0.322685569524765, "logits/rejected": -0.6696431636810303, "logps/chosen": -230.83602905273438, "logps/rejected": -671.8842163085938, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.16155432164669037, "rewards/margins": 27.474172592163086, "rewards/rejected": -27.635726928710938, "step": 3260 }, { "epoch": 1.11, "learning_rate": 3.4974191111670653e-07, "logits/chosen": -0.5723060369491577, "logits/rejected": -0.6341259479522705, "logps/chosen": -229.0471649169922, "logps/rejected": -812.7017822265625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.0158292055130005, "rewards/margins": 23.835529327392578, "rewards/rejected": -24.851356506347656, "step": 3270 }, { "epoch": 1.11, "learning_rate": 3.4911242603550296e-07, "logits/chosen": -0.363954097032547, "logits/rejected": -0.682758629322052, "logps/chosen": -216.44656372070312, "logps/rejected": -543.26806640625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.6192915439605713, "rewards/margins": 21.245447158813477, "rewards/rejected": -21.864736557006836, "step": 3280 }, { "epoch": 1.12, "learning_rate": 3.484829409542994e-07, "logits/chosen": -0.5068201422691345, "logits/rejected": -0.569583535194397, "logps/chosen": -254.6343231201172, "logps/rejected": -633.4774169921875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.39383208751678467, "rewards/margins": 25.339698791503906, "rewards/rejected": -25.733531951904297, "step": 3290 }, { "epoch": 1.12, "learning_rate": 3.478534558730958e-07, "logits/chosen": -0.47961869835853577, "logits/rejected": -0.6686064004898071, "logps/chosen": -219.3195343017578, "logps/rejected": -582.8184814453125, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.7996060252189636, "rewards/margins": 24.826183319091797, "rewards/rejected": -25.625789642333984, "step": 3300 }, { "epoch": 1.12, "eval_logits/chosen": -0.6482299566268921, "eval_logits/rejected": -0.6622685194015503, "eval_logps/chosen": -224.40591430664062, "eval_logps/rejected": -669.3063354492188, "eval_loss": 0.0036542899906635284, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.6250095963478088, "eval_rewards/margins": 24.819223403930664, "eval_rewards/rejected": -25.44423484802246, "eval_runtime": 535.585, "eval_samples_per_second": 17.738, "eval_steps_per_second": 0.555, "step": 3300 }, { "epoch": 1.13, "learning_rate": 3.4722397079189217e-07, "logits/chosen": -0.5651262998580933, "logits/rejected": -0.6671077609062195, "logps/chosen": -180.79421997070312, "logps/rejected": -686.2698974609375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.5120385885238647, "rewards/margins": 24.743419647216797, "rewards/rejected": -25.255456924438477, "step": 3310 }, { "epoch": 1.13, "learning_rate": 3.4659448571068865e-07, "logits/chosen": -0.6452927589416504, "logits/rejected": -0.5964315533638, "logps/chosen": -331.1089172363281, "logps/rejected": -731.6763916015625, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -0.9424569010734558, "rewards/margins": 30.840368270874023, "rewards/rejected": -31.782827377319336, "step": 3320 }, { "epoch": 1.13, "learning_rate": 3.4596500062948507e-07, "logits/chosen": -0.5212825536727905, "logits/rejected": -0.5639342069625854, "logps/chosen": -247.92855834960938, "logps/rejected": -669.9676513671875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.4934036135673523, "rewards/margins": 24.6220760345459, "rewards/rejected": -25.115480422973633, "step": 3330 }, { "epoch": 1.14, "learning_rate": 3.453355155482815e-07, "logits/chosen": -0.5801979303359985, "logits/rejected": -0.6691958904266357, "logps/chosen": -210.4730682373047, "logps/rejected": -851.1979370117188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4683939814567566, "rewards/margins": 21.26134490966797, "rewards/rejected": -21.729740142822266, "step": 3340 }, { "epoch": 1.14, "learning_rate": 3.447060304670779e-07, "logits/chosen": -0.6566864848136902, "logits/rejected": -0.5926877856254578, "logps/chosen": -166.0283966064453, "logps/rejected": -639.298095703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.5761814713478088, "rewards/margins": 25.22756576538086, "rewards/rejected": -25.803747177124023, "step": 3350 }, { "epoch": 1.14, "learning_rate": 3.4407654538587434e-07, "logits/chosen": -0.4672514796257019, "logits/rejected": -0.7084232568740845, "logps/chosen": -248.3604278564453, "logps/rejected": -721.3619384765625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.6948641538619995, "rewards/margins": 22.243799209594727, "rewards/rejected": -22.938661575317383, "step": 3360 }, { "epoch": 1.15, "learning_rate": 3.4344706030467076e-07, "logits/chosen": -0.6167628765106201, "logits/rejected": -0.6096751093864441, "logps/chosen": -175.40591430664062, "logps/rejected": -808.9859619140625, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -0.4607621133327484, "rewards/margins": 24.451772689819336, "rewards/rejected": -24.91253662109375, "step": 3370 }, { "epoch": 1.15, "learning_rate": 3.4281757522346724e-07, "logits/chosen": -0.4866950511932373, "logits/rejected": -0.6506379246711731, "logps/chosen": -287.3866882324219, "logps/rejected": -554.6976318359375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2814773917198181, "rewards/margins": 22.312875747680664, "rewards/rejected": -22.5943546295166, "step": 3380 }, { "epoch": 1.15, "learning_rate": 3.421880901422636e-07, "logits/chosen": -0.5648037195205688, "logits/rejected": -0.6015470623970032, "logps/chosen": -222.2810821533203, "logps/rejected": -761.0306396484375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.29436182975769043, "rewards/margins": 22.276613235473633, "rewards/rejected": -22.570972442626953, "step": 3390 }, { "epoch": 1.16, "learning_rate": 3.4155860506106003e-07, "logits/chosen": -0.6589870452880859, "logits/rejected": -0.6446484327316284, "logps/chosen": -227.8944549560547, "logps/rejected": -671.156005859375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.28819558024406433, "rewards/margins": 23.054214477539062, "rewards/rejected": -23.342411041259766, "step": 3400 }, { "epoch": 1.16, "eval_logits/chosen": -0.6142415404319763, "eval_logits/rejected": -0.6331081390380859, "eval_logps/chosen": -220.03668212890625, "eval_logps/rejected": -650.5010375976562, "eval_loss": 0.003882015123963356, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.1880865842103958, "eval_rewards/margins": 23.375625610351562, "eval_rewards/rejected": -23.563709259033203, "eval_runtime": 536.4037, "eval_samples_per_second": 17.711, "eval_steps_per_second": 0.554, "step": 3400 }, { "epoch": 1.16, "learning_rate": 3.4092911997985645e-07, "logits/chosen": -0.5248831510543823, "logits/rejected": -0.6520252823829651, "logps/chosen": -221.42904663085938, "logps/rejected": -645.4061279296875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.35314497351646423, "rewards/margins": 21.675411224365234, "rewards/rejected": -22.0285587310791, "step": 3410 }, { "epoch": 1.16, "learning_rate": 3.402996348986529e-07, "logits/chosen": -0.44672784209251404, "logits/rejected": -0.5755100846290588, "logps/chosen": -290.2615661621094, "logps/rejected": -700.003662109375, "loss": 0.0082, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2534549832344055, "rewards/margins": 20.79842758178711, "rewards/rejected": -21.051881790161133, "step": 3420 }, { "epoch": 1.17, "learning_rate": 3.396701498174493e-07, "logits/chosen": -0.5202574133872986, "logits/rejected": -0.565936803817749, "logps/chosen": -235.45504760742188, "logps/rejected": -616.531494140625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.3813103139400482, "rewards/margins": 20.84848403930664, "rewards/rejected": -21.22979736328125, "step": 3430 }, { "epoch": 1.17, "learning_rate": 3.3904066473624577e-07, "logits/chosen": -0.5865752696990967, "logits/rejected": -0.6585182547569275, "logps/chosen": -167.64541625976562, "logps/rejected": -659.0266723632812, "loss": 0.006, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5113507509231567, "rewards/margins": 21.376943588256836, "rewards/rejected": -21.888296127319336, "step": 3440 }, { "epoch": 1.17, "learning_rate": 3.384111796550422e-07, "logits/chosen": -0.6253348588943481, "logits/rejected": -0.6733888387680054, "logps/chosen": -164.92002868652344, "logps/rejected": -702.3135986328125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.8295150995254517, "rewards/margins": 20.888399124145508, "rewards/rejected": -21.717914581298828, "step": 3450 }, { "epoch": 1.18, "learning_rate": 3.377816945738386e-07, "logits/chosen": -0.6449594497680664, "logits/rejected": -0.4859851002693176, "logps/chosen": -158.80001831054688, "logps/rejected": -544.9908447265625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.4347860813140869, "rewards/margins": 20.02499008178711, "rewards/rejected": -20.459775924682617, "step": 3460 }, { "epoch": 1.18, "learning_rate": 3.37152209492635e-07, "logits/chosen": -0.5075950026512146, "logits/rejected": -0.5371834635734558, "logps/chosen": -225.5575714111328, "logps/rejected": -664.6121826171875, "loss": 0.0065, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5992003679275513, "rewards/margins": 19.59102439880371, "rewards/rejected": -20.190223693847656, "step": 3470 }, { "epoch": 1.18, "learning_rate": 3.365227244114314e-07, "logits/chosen": -0.7203564047813416, "logits/rejected": -0.57438725233078, "logps/chosen": -149.720703125, "logps/rejected": -557.320068359375, "loss": 0.0021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8488510251045227, "rewards/margins": 20.745454788208008, "rewards/rejected": -21.59430503845215, "step": 3480 }, { "epoch": 1.19, "learning_rate": 3.3589323933022783e-07, "logits/chosen": -0.4867991507053375, "logits/rejected": -0.6257959604263306, "logps/chosen": -252.96484375, "logps/rejected": -663.0682373046875, "loss": 0.0036, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7242579460144043, "rewards/margins": 24.331472396850586, "rewards/rejected": -25.05573081970215, "step": 3490 }, { "epoch": 1.19, "learning_rate": 3.3526375424902426e-07, "logits/chosen": -0.5065933465957642, "logits/rejected": -0.6244116425514221, "logps/chosen": -234.55712890625, "logps/rejected": -620.8785400390625, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.5914728045463562, "rewards/margins": 24.563915252685547, "rewards/rejected": -25.15538787841797, "step": 3500 }, { "epoch": 1.19, "eval_logits/chosen": -0.6401923894882202, "eval_logits/rejected": -0.6644282937049866, "eval_logps/chosen": -221.40667724609375, "eval_logps/rejected": -655.4830322265625, "eval_loss": 0.00386338634416461, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.325082927942276, "eval_rewards/margins": 23.73682975769043, "eval_rewards/rejected": -24.061912536621094, "eval_runtime": 535.8378, "eval_samples_per_second": 17.729, "eval_steps_per_second": 0.554, "step": 3500 }, { "epoch": 1.19, "learning_rate": 3.3463426916782073e-07, "logits/chosen": -0.5657549500465393, "logits/rejected": -0.5529078245162964, "logps/chosen": -208.6049346923828, "logps/rejected": -543.8447875976562, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24216929078102112, "rewards/margins": 23.841793060302734, "rewards/rejected": -24.08396339416504, "step": 3510 }, { "epoch": 1.2, "learning_rate": 3.3400478408661716e-07, "logits/chosen": -0.4236617088317871, "logits/rejected": -0.7109118700027466, "logps/chosen": -176.38917541503906, "logps/rejected": -458.9212341308594, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.06428302824497223, "rewards/margins": 22.875444412231445, "rewards/rejected": -22.939725875854492, "step": 3520 }, { "epoch": 1.2, "learning_rate": 3.333752990054136e-07, "logits/chosen": -0.54302978515625, "logits/rejected": -0.6346898674964905, "logps/chosen": -221.1962127685547, "logps/rejected": -750.2716064453125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.03781633824110031, "rewards/margins": 25.192829132080078, "rewards/rejected": -25.230648040771484, "step": 3530 }, { "epoch": 1.2, "learning_rate": 3.3274581392420995e-07, "logits/chosen": -0.5955469608306885, "logits/rejected": -0.5674196481704712, "logps/chosen": -145.55331420898438, "logps/rejected": -537.1456298828125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.15676219761371613, "rewards/margins": 25.38955307006836, "rewards/rejected": -25.546316146850586, "step": 3540 }, { "epoch": 1.21, "learning_rate": 3.3211632884300637e-07, "logits/chosen": -0.3348035216331482, "logits/rejected": -0.64543217420578, "logps/chosen": -297.6194763183594, "logps/rejected": -644.1593017578125, "loss": 0.1028, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.23276686668396, "rewards/margins": 22.291088104248047, "rewards/rejected": -23.523853302001953, "step": 3550 }, { "epoch": 1.21, "learning_rate": 3.314868437618028e-07, "logits/chosen": -0.362000972032547, "logits/rejected": -0.6346918344497681, "logps/chosen": -304.65869140625, "logps/rejected": -675.5734252929688, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.049671344459056854, "rewards/margins": 27.12630271911621, "rewards/rejected": -27.17597007751465, "step": 3560 }, { "epoch": 1.21, "learning_rate": 3.3085735868059927e-07, "logits/chosen": -0.45971646904945374, "logits/rejected": -0.6427809596061707, "logps/chosen": -347.6662902832031, "logps/rejected": -543.4942626953125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.2513541579246521, "rewards/margins": 25.006072998046875, "rewards/rejected": -25.257427215576172, "step": 3570 }, { "epoch": 1.22, "learning_rate": 3.302278735993957e-07, "logits/chosen": -0.4451538920402527, "logits/rejected": -0.5961150527000427, "logps/chosen": -292.34716796875, "logps/rejected": -853.0267333984375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.35571613907814026, "rewards/margins": 25.34796714782715, "rewards/rejected": -25.703685760498047, "step": 3580 }, { "epoch": 1.22, "learning_rate": 3.295983885181921e-07, "logits/chosen": -0.5864584445953369, "logits/rejected": -0.5900254845619202, "logps/chosen": -341.0389099121094, "logps/rejected": -796.8211669921875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.6121021509170532, "rewards/margins": 24.576696395874023, "rewards/rejected": -25.188800811767578, "step": 3590 }, { "epoch": 1.22, "learning_rate": 3.2896890343698854e-07, "logits/chosen": -0.6598680019378662, "logits/rejected": -0.5812188386917114, "logps/chosen": -158.32850646972656, "logps/rejected": -617.4008178710938, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.26627498865127563, "rewards/margins": 22.500194549560547, "rewards/rejected": -22.766469955444336, "step": 3600 }, { "epoch": 1.22, "eval_logits/chosen": -0.6330205798149109, "eval_logits/rejected": -0.6421379446983337, "eval_logps/chosen": -222.4930877685547, "eval_logps/rejected": -682.8770141601562, "eval_loss": 0.003120874287560582, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.43372485041618347, "eval_rewards/margins": 26.36758804321289, "eval_rewards/rejected": -26.801311492919922, "eval_runtime": 536.6351, "eval_samples_per_second": 17.703, "eval_steps_per_second": 0.553, "step": 3600 }, { "epoch": 1.23, "learning_rate": 3.2833941835578496e-07, "logits/chosen": -0.4642692506313324, "logits/rejected": -0.5733259320259094, "logps/chosen": -189.3290252685547, "logps/rejected": -750.3478393554688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.03479035943746567, "rewards/margins": 28.482711791992188, "rewards/rejected": -28.447921752929688, "step": 3610 }, { "epoch": 1.23, "learning_rate": 3.2770993327458133e-07, "logits/chosen": -0.517525315284729, "logits/rejected": -0.6196783185005188, "logps/chosen": -227.9562225341797, "logps/rejected": -567.5996704101562, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.14828169345855713, "rewards/margins": 28.8952693939209, "rewards/rejected": -29.04355239868164, "step": 3620 }, { "epoch": 1.23, "learning_rate": 3.270804481933778e-07, "logits/chosen": -0.47844839096069336, "logits/rejected": -0.5736457109451294, "logps/chosen": -173.18704223632812, "logps/rejected": -674.2376098632812, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.23523283004760742, "rewards/margins": 23.103513717651367, "rewards/rejected": -23.338748931884766, "step": 3630 }, { "epoch": 1.24, "learning_rate": 3.2645096311217423e-07, "logits/chosen": -0.3818680942058563, "logits/rejected": -0.5898483991622925, "logps/chosen": -236.1413116455078, "logps/rejected": -498.2266540527344, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.644585907459259, "rewards/margins": 23.830223083496094, "rewards/rejected": -24.474807739257812, "step": 3640 }, { "epoch": 1.24, "learning_rate": 3.2582147803097065e-07, "logits/chosen": -0.6482292413711548, "logits/rejected": -0.5954722762107849, "logps/chosen": -206.35574340820312, "logps/rejected": -680.343994140625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.2702055871486664, "rewards/margins": 25.4415283203125, "rewards/rejected": -25.711734771728516, "step": 3650 }, { "epoch": 1.24, "learning_rate": 3.251919929497671e-07, "logits/chosen": -0.4297823905944824, "logits/rejected": -0.5527641177177429, "logps/chosen": -241.09756469726562, "logps/rejected": -594.6710205078125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.12809017300605774, "rewards/margins": 27.198318481445312, "rewards/rejected": -27.3264102935791, "step": 3660 }, { "epoch": 1.25, "learning_rate": 3.245625078685635e-07, "logits/chosen": -0.44234561920166016, "logits/rejected": -0.6569749116897583, "logps/chosen": -314.4349060058594, "logps/rejected": -787.964111328125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.2627989649772644, "rewards/margins": 23.093765258789062, "rewards/rejected": -23.3565616607666, "step": 3670 }, { "epoch": 1.25, "learning_rate": 3.239330227873599e-07, "logits/chosen": -0.5485345125198364, "logits/rejected": -0.5784450769424438, "logps/chosen": -203.53636169433594, "logps/rejected": -630.2029418945312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.0734846442937851, "rewards/margins": 28.616907119750977, "rewards/rejected": -28.69038963317871, "step": 3680 }, { "epoch": 1.25, "learning_rate": 3.233035377061564e-07, "logits/chosen": -0.4327804446220398, "logits/rejected": -0.5879336595535278, "logps/chosen": -227.88671875, "logps/rejected": -595.5135498046875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.565266489982605, "rewards/margins": 23.28707504272461, "rewards/rejected": -23.85234260559082, "step": 3690 }, { "epoch": 1.26, "learning_rate": 3.2267405262495277e-07, "logits/chosen": -0.37219077348709106, "logits/rejected": -0.6063266396522522, "logps/chosen": -203.16477966308594, "logps/rejected": -572.9649658203125, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.8236276507377625, "rewards/margins": 21.666545867919922, "rewards/rejected": -22.490171432495117, "step": 3700 }, { "epoch": 1.26, "eval_logits/chosen": -0.6161515712738037, "eval_logits/rejected": -0.6412346959114075, "eval_logps/chosen": -219.26235961914062, "eval_logps/rejected": -643.376708984375, "eval_loss": 0.002986146369948983, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.11065331101417542, "eval_rewards/margins": 22.74062728881836, "eval_rewards/rejected": -22.851282119750977, "eval_runtime": 536.9409, "eval_samples_per_second": 17.693, "eval_steps_per_second": 0.553, "step": 3700 }, { "epoch": 1.26, "learning_rate": 3.220445675437492e-07, "logits/chosen": -0.6146506667137146, "logits/rejected": -0.6390140056610107, "logps/chosen": -176.08621215820312, "logps/rejected": -766.9559326171875, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -0.6174939274787903, "rewards/margins": 25.378719329833984, "rewards/rejected": -25.996212005615234, "step": 3710 }, { "epoch": 1.26, "learning_rate": 3.214150824625456e-07, "logits/chosen": -0.4929020404815674, "logits/rejected": -0.5923459529876709, "logps/chosen": -171.75489807128906, "logps/rejected": -682.27490234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5955243706703186, "rewards/margins": 21.589096069335938, "rewards/rejected": -22.184619903564453, "step": 3720 }, { "epoch": 1.27, "learning_rate": 3.2078559738134203e-07, "logits/chosen": -0.4370489716529846, "logits/rejected": -0.6477999091148376, "logps/chosen": -298.0855407714844, "logps/rejected": -707.3984375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.958692729473114, "rewards/margins": 22.702205657958984, "rewards/rejected": -23.660900115966797, "step": 3730 }, { "epoch": 1.27, "learning_rate": 3.2015611230013846e-07, "logits/chosen": -0.5077528357505798, "logits/rejected": -0.6337962746620178, "logps/chosen": -165.85623168945312, "logps/rejected": -788.4411010742188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4020668864250183, "rewards/margins": 22.114505767822266, "rewards/rejected": -22.516572952270508, "step": 3740 }, { "epoch": 1.27, "learning_rate": 3.1952662721893493e-07, "logits/chosen": -0.5734524726867676, "logits/rejected": -0.5610033869743347, "logps/chosen": -153.70614624023438, "logps/rejected": -634.8050537109375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.6263917088508606, "rewards/margins": 19.17335319519043, "rewards/rejected": -19.79974365234375, "step": 3750 }, { "epoch": 1.28, "learning_rate": 3.1889714213773135e-07, "logits/chosen": -0.5345430374145508, "logits/rejected": -0.5969452261924744, "logps/chosen": -213.6829833984375, "logps/rejected": -652.2935791015625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.3156171441078186, "rewards/margins": 22.070911407470703, "rewards/rejected": -22.386524200439453, "step": 3760 }, { "epoch": 1.28, "learning_rate": 3.182676570565277e-07, "logits/chosen": -0.5811839699745178, "logits/rejected": -0.6938631534576416, "logps/chosen": -171.32847595214844, "logps/rejected": -651.0315551757812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.2145511656999588, "rewards/margins": 25.47879409790039, "rewards/rejected": -25.693347930908203, "step": 3770 }, { "epoch": 1.28, "learning_rate": 3.1763817197532415e-07, "logits/chosen": -0.48750075697898865, "logits/rejected": -0.6616155505180359, "logps/chosen": -229.34640502929688, "logps/rejected": -574.244873046875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.15986637771129608, "rewards/margins": 23.963699340820312, "rewards/rejected": -24.12356948852539, "step": 3780 }, { "epoch": 1.29, "learning_rate": 3.1700868689412057e-07, "logits/chosen": -0.5332116484642029, "logits/rejected": -0.6272880434989929, "logps/chosen": -293.86578369140625, "logps/rejected": -727.0882568359375, "loss": 0.002, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.07223048061132431, "rewards/margins": 24.346702575683594, "rewards/rejected": -24.274473190307617, "step": 3790 }, { "epoch": 1.29, "learning_rate": 3.16379201812917e-07, "logits/chosen": -0.6306854486465454, "logits/rejected": -0.6714102029800415, "logps/chosen": -155.32086181640625, "logps/rejected": -729.7457885742188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.45394381880760193, "rewards/margins": 22.420116424560547, "rewards/rejected": -22.874059677124023, "step": 3800 }, { "epoch": 1.29, "eval_logits/chosen": -0.6446669101715088, "eval_logits/rejected": -0.6750091314315796, "eval_logps/chosen": -222.48545837402344, "eval_logps/rejected": -662.1182250976562, "eval_loss": 0.002854662947356701, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.4329643249511719, "eval_rewards/margins": 24.292463302612305, "eval_rewards/rejected": -24.725431442260742, "eval_runtime": 536.7276, "eval_samples_per_second": 17.7, "eval_steps_per_second": 0.553, "step": 3800 }, { "epoch": 1.3, "learning_rate": 3.1574971673171347e-07, "logits/chosen": -0.521294355392456, "logits/rejected": -0.7188762426376343, "logps/chosen": -283.2747497558594, "logps/rejected": -536.244140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.7130476832389832, "rewards/margins": 24.792898178100586, "rewards/rejected": -25.50594711303711, "step": 3810 }, { "epoch": 1.3, "learning_rate": 3.151202316505099e-07, "logits/chosen": -0.7186405062675476, "logits/rejected": -0.6169338822364807, "logps/chosen": -165.6100311279297, "logps/rejected": -769.4568481445312, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.40955981612205505, "rewards/margins": 28.30446434020996, "rewards/rejected": -28.714025497436523, "step": 3820 }, { "epoch": 1.3, "learning_rate": 3.144907465693063e-07, "logits/chosen": -0.5698453783988953, "logits/rejected": -0.5662177205085754, "logps/chosen": -220.39749145507812, "logps/rejected": -648.4984130859375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.7218931913375854, "rewards/margins": 21.97603988647461, "rewards/rejected": -22.697935104370117, "step": 3830 }, { "epoch": 1.31, "learning_rate": 3.1386126148810274e-07, "logits/chosen": -0.6322427988052368, "logits/rejected": -0.5543380975723267, "logps/chosen": -304.21258544921875, "logps/rejected": -618.9811401367188, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.7710276246070862, "rewards/margins": 25.24374008178711, "rewards/rejected": -26.0147647857666, "step": 3840 }, { "epoch": 1.31, "learning_rate": 3.132317764068991e-07, "logits/chosen": -0.6310170292854309, "logits/rejected": -0.6458388566970825, "logps/chosen": -173.12887573242188, "logps/rejected": -765.6507568359375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.7812573909759521, "rewards/margins": 23.786352157592773, "rewards/rejected": -24.567607879638672, "step": 3850 }, { "epoch": 1.31, "learning_rate": 3.1260229132569553e-07, "logits/chosen": -0.38612300157546997, "logits/rejected": -0.6566651463508606, "logps/chosen": -302.1167297363281, "logps/rejected": -546.8599853515625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.2473093271255493, "rewards/margins": 25.067401885986328, "rewards/rejected": -26.314706802368164, "step": 3860 }, { "epoch": 1.32, "learning_rate": 3.11972806244492e-07, "logits/chosen": -0.491655170917511, "logits/rejected": -0.5763476490974426, "logps/chosen": -230.39578247070312, "logps/rejected": -649.1068115234375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.7097188234329224, "rewards/margins": 22.353116989135742, "rewards/rejected": -23.06283950805664, "step": 3870 }, { "epoch": 1.32, "learning_rate": 3.1134332116328843e-07, "logits/chosen": -0.6027604341506958, "logits/rejected": -0.6739881038665771, "logps/chosen": -199.03280639648438, "logps/rejected": -785.955810546875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.9770030975341797, "rewards/margins": 27.17582130432129, "rewards/rejected": -28.1528263092041, "step": 3880 }, { "epoch": 1.32, "learning_rate": 3.1071383608208485e-07, "logits/chosen": -0.49097442626953125, "logits/rejected": -0.6785427927970886, "logps/chosen": -255.62094116210938, "logps/rejected": -516.3653564453125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.819412350654602, "rewards/margins": 25.977575302124023, "rewards/rejected": -26.796985626220703, "step": 3890 }, { "epoch": 1.33, "learning_rate": 3.1008435100088127e-07, "logits/chosen": -0.5741170644760132, "logits/rejected": -0.6505104899406433, "logps/chosen": -174.44393920898438, "logps/rejected": -891.5671997070312, "loss": 0.004, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5431356430053711, "rewards/margins": 27.31049156188965, "rewards/rejected": -27.853626251220703, "step": 3900 }, { "epoch": 1.33, "eval_logits/chosen": -0.6319225430488586, "eval_logits/rejected": -0.661310076713562, "eval_logps/chosen": -223.41334533691406, "eval_logps/rejected": -671.2713623046875, "eval_loss": 0.002612130017951131, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.5257552266120911, "eval_rewards/margins": 25.114986419677734, "eval_rewards/rejected": -25.640737533569336, "eval_runtime": 535.7531, "eval_samples_per_second": 17.732, "eval_steps_per_second": 0.554, "step": 3900 }, { "epoch": 1.33, "learning_rate": 3.094548659196777e-07, "logits/chosen": -0.43399614095687866, "logits/rejected": -0.6454629898071289, "logps/chosen": -350.3164978027344, "logps/rejected": -784.4608154296875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.5058630108833313, "rewards/margins": 22.00267219543457, "rewards/rejected": -22.508533477783203, "step": 3910 }, { "epoch": 1.33, "learning_rate": 3.0882538083847407e-07, "logits/chosen": -0.46475347876548767, "logits/rejected": -0.618371307849884, "logps/chosen": -249.3139190673828, "logps/rejected": -562.9500122070312, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.5914496183395386, "rewards/margins": 23.814281463623047, "rewards/rejected": -24.405731201171875, "step": 3920 }, { "epoch": 1.34, "learning_rate": 3.0819589575727054e-07, "logits/chosen": -0.3204534649848938, "logits/rejected": -0.5393902063369751, "logps/chosen": -394.1038818359375, "logps/rejected": -506.79473876953125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.5112158060073853, "rewards/margins": 24.1527099609375, "rewards/rejected": -24.66392707824707, "step": 3930 }, { "epoch": 1.34, "learning_rate": 3.0756641067606696e-07, "logits/chosen": -0.4970259666442871, "logits/rejected": -0.566218912601471, "logps/chosen": -184.57876586914062, "logps/rejected": -622.09423828125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.6580025553703308, "rewards/margins": 23.495487213134766, "rewards/rejected": -24.15349006652832, "step": 3940 }, { "epoch": 1.34, "learning_rate": 3.069369255948634e-07, "logits/chosen": -0.6085567474365234, "logits/rejected": -0.5679286122322083, "logps/chosen": -163.68948364257812, "logps/rejected": -754.4090576171875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8379193544387817, "rewards/margins": 29.191431045532227, "rewards/rejected": -30.02935218811035, "step": 3950 }, { "epoch": 1.35, "learning_rate": 3.063074405136598e-07, "logits/chosen": -0.5503302812576294, "logits/rejected": -0.6631855368614197, "logps/chosen": -185.82235717773438, "logps/rejected": -658.0210571289062, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.39478808641433716, "rewards/margins": 27.724651336669922, "rewards/rejected": -28.11944007873535, "step": 3960 }, { "epoch": 1.35, "learning_rate": 3.0567795543245623e-07, "logits/chosen": -0.5989475846290588, "logits/rejected": -0.6769839525222778, "logps/chosen": -219.86819458007812, "logps/rejected": -737.9708251953125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.8018891215324402, "rewards/margins": 25.738483428955078, "rewards/rejected": -26.54037094116211, "step": 3970 }, { "epoch": 1.35, "learning_rate": 3.0504847035125266e-07, "logits/chosen": -0.3452851176261902, "logits/rejected": -0.5594893097877502, "logps/chosen": -265.7044677734375, "logps/rejected": -679.0172729492188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.7248401641845703, "rewards/margins": 25.964492797851562, "rewards/rejected": -26.689334869384766, "step": 3980 }, { "epoch": 1.36, "learning_rate": 3.0441898527004913e-07, "logits/chosen": -0.5120434761047363, "logits/rejected": -0.6677166819572449, "logps/chosen": -166.87823486328125, "logps/rejected": -655.3040771484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.7184686064720154, "rewards/margins": 27.84735107421875, "rewards/rejected": -28.565820693969727, "step": 3990 }, { "epoch": 1.36, "learning_rate": 3.037895001888455e-07, "logits/chosen": -0.5931070446968079, "logits/rejected": -0.6297544836997986, "logps/chosen": -231.5447540283203, "logps/rejected": -626.4212646484375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.744392991065979, "rewards/margins": 23.02704429626465, "rewards/rejected": -23.771434783935547, "step": 4000 }, { "epoch": 1.36, "eval_logits/chosen": -0.656895637512207, "eval_logits/rejected": -0.6795927882194519, "eval_logps/chosen": -226.74781799316406, "eval_logps/rejected": -689.2528076171875, "eval_loss": 0.0025382947642356157, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.8591986894607544, "eval_rewards/margins": 26.579687118530273, "eval_rewards/rejected": -27.438884735107422, "eval_runtime": 537.4459, "eval_samples_per_second": 17.676, "eval_steps_per_second": 0.553, "step": 4000 }, { "epoch": 1.36, "learning_rate": 3.031600151076419e-07, "logits/chosen": -0.5844267010688782, "logits/rejected": -0.6578593254089355, "logps/chosen": -186.2404327392578, "logps/rejected": -692.6897583007812, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.1119728088378906, "rewards/margins": 26.145328521728516, "rewards/rejected": -27.257299423217773, "step": 4010 }, { "epoch": 1.37, "learning_rate": 3.0253053002643835e-07, "logits/chosen": -0.5545540452003479, "logits/rejected": -0.6696325540542603, "logps/chosen": -181.87234497070312, "logps/rejected": -724.7491455078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.8007772564888, "rewards/margins": 28.568668365478516, "rewards/rejected": -29.36944580078125, "step": 4020 }, { "epoch": 1.37, "learning_rate": 3.0190104494523477e-07, "logits/chosen": -0.5091897249221802, "logits/rejected": -0.6420959234237671, "logps/chosen": -188.88986206054688, "logps/rejected": -816.1044921875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.0320355892181396, "rewards/margins": 24.440654754638672, "rewards/rejected": -25.47269058227539, "step": 4030 }, { "epoch": 1.37, "learning_rate": 3.012715598640312e-07, "logits/chosen": -0.39841216802597046, "logits/rejected": -0.6435903310775757, "logps/chosen": -306.08465576171875, "logps/rejected": -694.7786254882812, "loss": 0.0203, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8803471326828003, "rewards/margins": 28.514440536499023, "rewards/rejected": -29.394786834716797, "step": 4040 }, { "epoch": 1.38, "learning_rate": 3.0064207478282767e-07, "logits/chosen": -0.3683060109615326, "logits/rejected": -0.5906280279159546, "logps/chosen": -298.08465576171875, "logps/rejected": -669.1845703125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4363463521003723, "rewards/margins": 24.317974090576172, "rewards/rejected": -24.754322052001953, "step": 4050 }, { "epoch": 1.38, "learning_rate": 3.000125897016241e-07, "logits/chosen": -0.49200502038002014, "logits/rejected": -0.5944346189498901, "logps/chosen": -194.3395538330078, "logps/rejected": -668.40869140625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.9237950444221497, "rewards/margins": 23.78119659423828, "rewards/rejected": -24.70499038696289, "step": 4060 }, { "epoch": 1.38, "learning_rate": 2.993831046204205e-07, "logits/chosen": -0.47918087244033813, "logits/rejected": -0.5380889773368835, "logps/chosen": -319.29608154296875, "logps/rejected": -562.0574340820312, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5249345898628235, "rewards/margins": 22.418807983398438, "rewards/rejected": -22.943744659423828, "step": 4070 }, { "epoch": 1.39, "learning_rate": 2.987536195392169e-07, "logits/chosen": -0.38760632276535034, "logits/rejected": -0.5022004246711731, "logps/chosen": -246.1426239013672, "logps/rejected": -632.3997802734375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3322727680206299, "rewards/margins": 28.026973724365234, "rewards/rejected": -28.359249114990234, "step": 4080 }, { "epoch": 1.39, "learning_rate": 2.981241344580133e-07, "logits/chosen": -0.5103537440299988, "logits/rejected": -0.616550087928772, "logps/chosen": -236.5392303466797, "logps/rejected": -598.5130615234375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.6924244165420532, "rewards/margins": 23.426597595214844, "rewards/rejected": -24.119022369384766, "step": 4090 }, { "epoch": 1.39, "learning_rate": 2.9749464937680973e-07, "logits/chosen": -0.4686276912689209, "logits/rejected": -0.5994580984115601, "logps/chosen": -185.11636352539062, "logps/rejected": -439.9454650878906, "loss": 0.0293, "rewards/accuracies": 1.0, "rewards/chosen": -0.17654499411582947, "rewards/margins": 24.336116790771484, "rewards/rejected": -24.512664794921875, "step": 4100 }, { "epoch": 1.39, "eval_logits/chosen": -0.6340649724006653, "eval_logits/rejected": -0.6656588912010193, "eval_logps/chosen": -224.44210815429688, "eval_logps/rejected": -679.2517700195312, "eval_loss": 0.0031960448250174522, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.6286283135414124, "eval_rewards/margins": 25.810157775878906, "eval_rewards/rejected": -26.43878936767578, "eval_runtime": 537.8774, "eval_samples_per_second": 17.662, "eval_steps_per_second": 0.552, "step": 4100 }, { "epoch": 1.4, "learning_rate": 2.968651642956062e-07, "logits/chosen": -0.6588854789733887, "logits/rejected": -0.5852169394493103, "logps/chosen": -169.52755737304688, "logps/rejected": -652.9843139648438, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8196150660514832, "rewards/margins": 24.70818328857422, "rewards/rejected": -25.52779769897461, "step": 4110 }, { "epoch": 1.4, "learning_rate": 2.9623567921440263e-07, "logits/chosen": -0.5996174216270447, "logits/rejected": -0.5840147733688354, "logps/chosen": -169.6434783935547, "logps/rejected": -541.7594604492188, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.8091737627983093, "rewards/margins": 22.426799774169922, "rewards/rejected": -23.235973358154297, "step": 4120 }, { "epoch": 1.4, "learning_rate": 2.9560619413319905e-07, "logits/chosen": -0.48885440826416016, "logits/rejected": -0.6716369390487671, "logps/chosen": -384.6716613769531, "logps/rejected": -785.0818481445312, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0479235649108887, "rewards/margins": 25.14766502380371, "rewards/rejected": -26.195592880249023, "step": 4130 }, { "epoch": 1.41, "learning_rate": 2.9497670905199547e-07, "logits/chosen": -0.470248281955719, "logits/rejected": -0.6648123264312744, "logps/chosen": -371.95452880859375, "logps/rejected": -715.4351806640625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.5166934728622437, "rewards/margins": 26.738656997680664, "rewards/rejected": -27.25535011291504, "step": 4140 }, { "epoch": 1.41, "learning_rate": 2.9434722397079184e-07, "logits/chosen": -0.5142195224761963, "logits/rejected": -0.6857298612594604, "logps/chosen": -257.34539794921875, "logps/rejected": -850.5185546875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.7111415863037109, "rewards/margins": 24.79717254638672, "rewards/rejected": -25.508312225341797, "step": 4150 }, { "epoch": 1.41, "learning_rate": 2.9371773888958827e-07, "logits/chosen": -0.590836226940155, "logits/rejected": -0.5949567556381226, "logps/chosen": -168.7595672607422, "logps/rejected": -626.8141479492188, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.7229076027870178, "rewards/margins": 23.41318130493164, "rewards/rejected": -24.13608741760254, "step": 4160 }, { "epoch": 1.42, "learning_rate": 2.9308825380838474e-07, "logits/chosen": -0.44332990050315857, "logits/rejected": -0.6776119470596313, "logps/chosen": -246.9022216796875, "logps/rejected": -554.3592529296875, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.34087079763412476, "rewards/margins": 24.004058837890625, "rewards/rejected": -24.344928741455078, "step": 4170 }, { "epoch": 1.42, "learning_rate": 2.9245876872718116e-07, "logits/chosen": -0.5291085243225098, "logits/rejected": -0.6685595512390137, "logps/chosen": -222.9030303955078, "logps/rejected": -660.2169799804688, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.737917423248291, "rewards/margins": 24.741500854492188, "rewards/rejected": -25.479419708251953, "step": 4180 }, { "epoch": 1.42, "learning_rate": 2.918292836459776e-07, "logits/chosen": -0.5772517919540405, "logits/rejected": -0.6671938896179199, "logps/chosen": -199.12290954589844, "logps/rejected": -849.7820434570312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.20440325140953064, "rewards/margins": 25.1953125, "rewards/rejected": -25.399715423583984, "step": 4190 }, { "epoch": 1.43, "learning_rate": 2.91199798564774e-07, "logits/chosen": -0.6835727095603943, "logits/rejected": -0.632318377494812, "logps/chosen": -150.65061950683594, "logps/rejected": -624.3247680664062, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.7900353670120239, "rewards/margins": 22.915252685546875, "rewards/rejected": -23.70528793334961, "step": 4200 }, { "epoch": 1.43, "eval_logits/chosen": -0.6545882225036621, "eval_logits/rejected": -0.690679669380188, "eval_logps/chosen": -224.60446166992188, "eval_logps/rejected": -676.0199584960938, "eval_loss": 0.0025619491934776306, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.6448644995689392, "eval_rewards/margins": 25.470733642578125, "eval_rewards/rejected": -26.1155948638916, "eval_runtime": 537.8167, "eval_samples_per_second": 17.664, "eval_steps_per_second": 0.552, "step": 4200 }, { "epoch": 1.43, "learning_rate": 2.9057031348357043e-07, "logits/chosen": -0.7415747046470642, "logits/rejected": -0.7310870289802551, "logps/chosen": -170.92129516601562, "logps/rejected": -772.7601318359375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.7735823392868042, "rewards/margins": 24.12515640258789, "rewards/rejected": -24.898738861083984, "step": 4210 }, { "epoch": 1.43, "learning_rate": 2.8994082840236686e-07, "logits/chosen": -0.5389958024024963, "logits/rejected": -0.6840323209762573, "logps/chosen": -290.2601318359375, "logps/rejected": -697.9601440429688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.4220031201839447, "rewards/margins": 23.201587677001953, "rewards/rejected": -23.623592376708984, "step": 4220 }, { "epoch": 1.44, "learning_rate": 2.893113433211632e-07, "logits/chosen": -0.5756198167800903, "logits/rejected": -0.6462770104408264, "logps/chosen": -192.59133911132812, "logps/rejected": -560.5233764648438, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.3795054256916046, "rewards/margins": 23.460201263427734, "rewards/rejected": -23.839704513549805, "step": 4230 }, { "epoch": 1.44, "learning_rate": 2.886818582399597e-07, "logits/chosen": -0.3747271001338959, "logits/rejected": -0.5314438939094543, "logps/chosen": -242.44082641601562, "logps/rejected": -540.4208984375, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0198962688446045, "rewards/margins": 23.043073654174805, "rewards/rejected": -24.062973022460938, "step": 4240 }, { "epoch": 1.44, "learning_rate": 2.880523731587561e-07, "logits/chosen": -0.5141429901123047, "logits/rejected": -0.6259872913360596, "logps/chosen": -331.22149658203125, "logps/rejected": -949.6028442382812, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6469858288764954, "rewards/margins": 20.399362564086914, "rewards/rejected": -21.046348571777344, "step": 4250 }, { "epoch": 1.45, "learning_rate": 2.8742288807755255e-07, "logits/chosen": -0.6345170736312866, "logits/rejected": -0.6226304769515991, "logps/chosen": -217.5552978515625, "logps/rejected": -776.0902099609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.5271614789962769, "rewards/margins": 22.187824249267578, "rewards/rejected": -22.714982986450195, "step": 4260 }, { "epoch": 1.45, "learning_rate": 2.8679340299634897e-07, "logits/chosen": -0.6831735372543335, "logits/rejected": -0.529569149017334, "logps/chosen": -160.80715942382812, "logps/rejected": -929.51806640625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.026241052895784378, "rewards/margins": 21.611555099487305, "rewards/rejected": -21.63779640197754, "step": 4270 }, { "epoch": 1.45, "learning_rate": 2.861639179151454e-07, "logits/chosen": -0.5589373111724854, "logits/rejected": -0.6145527362823486, "logps/chosen": -183.11404418945312, "logps/rejected": -726.0515747070312, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.46428757905960083, "rewards/margins": 24.595508575439453, "rewards/rejected": -25.059795379638672, "step": 4280 }, { "epoch": 1.46, "learning_rate": 2.855344328339418e-07, "logits/chosen": -0.43564373254776, "logits/rejected": -0.5871229767799377, "logps/chosen": -173.70481872558594, "logps/rejected": -698.3441772460938, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.11621705442667007, "rewards/margins": 26.034286499023438, "rewards/rejected": -26.150503158569336, "step": 4290 }, { "epoch": 1.46, "learning_rate": 2.849049477527383e-07, "logits/chosen": -0.4630086421966553, "logits/rejected": -0.6238056421279907, "logps/chosen": -163.6204376220703, "logps/rejected": -508.8863830566406, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.002947696950286627, "rewards/margins": 24.191852569580078, "rewards/rejected": -24.194801330566406, "step": 4300 }, { "epoch": 1.46, "eval_logits/chosen": -0.6347739696502686, "eval_logits/rejected": -0.6703915596008301, "eval_logps/chosen": -222.2906951904297, "eval_logps/rejected": -668.6073608398438, "eval_loss": 0.002590919379144907, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.4134872257709503, "eval_rewards/margins": 24.960861206054688, "eval_rewards/rejected": -25.374347686767578, "eval_runtime": 536.2613, "eval_samples_per_second": 17.715, "eval_steps_per_second": 0.554, "step": 4300 }, { "epoch": 1.46, "learning_rate": 2.8427546267153466e-07, "logits/chosen": -0.5586211085319519, "logits/rejected": -0.6769331693649292, "logps/chosen": -180.0655517578125, "logps/rejected": -775.4181518554688, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -1.048835039138794, "rewards/margins": 25.666378021240234, "rewards/rejected": -26.7152156829834, "step": 4310 }, { "epoch": 1.47, "learning_rate": 2.836459775903311e-07, "logits/chosen": -0.5005068778991699, "logits/rejected": -0.60284423828125, "logps/chosen": -275.2718505859375, "logps/rejected": -575.8966674804688, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -0.24808374047279358, "rewards/margins": 23.865299224853516, "rewards/rejected": -24.11338233947754, "step": 4320 }, { "epoch": 1.47, "learning_rate": 2.830164925091275e-07, "logits/chosen": -0.511517345905304, "logits/rejected": -0.6544578671455383, "logps/chosen": -175.39649963378906, "logps/rejected": -609.6898193359375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.0018910408252850175, "rewards/margins": 25.612316131591797, "rewards/rejected": -25.610424041748047, "step": 4330 }, { "epoch": 1.48, "learning_rate": 2.8238700742792393e-07, "logits/chosen": -0.5089720487594604, "logits/rejected": -0.5166656970977783, "logps/chosen": -213.7110595703125, "logps/rejected": -509.2606506347656, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.05438070371747017, "rewards/margins": 24.6554012298584, "rewards/rejected": -24.601022720336914, "step": 4340 }, { "epoch": 1.48, "learning_rate": 2.8175752234672035e-07, "logits/chosen": -0.3232540488243103, "logits/rejected": -0.6403359174728394, "logps/chosen": -286.32061767578125, "logps/rejected": -603.0972290039062, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.12461726367473602, "rewards/margins": 22.360042572021484, "rewards/rejected": -22.484661102294922, "step": 4350 }, { "epoch": 1.48, "learning_rate": 2.8112803726551683e-07, "logits/chosen": -0.4277314245700836, "logits/rejected": -0.6289277672767639, "logps/chosen": -303.66033935546875, "logps/rejected": -414.69757080078125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.33236163854599, "rewards/margins": 18.898101806640625, "rewards/rejected": -19.230464935302734, "step": 4360 }, { "epoch": 1.49, "learning_rate": 2.8049855218431325e-07, "logits/chosen": -0.6008056402206421, "logits/rejected": -0.5654591917991638, "logps/chosen": -174.14370727539062, "logps/rejected": -596.6619262695312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.37650489807128906, "rewards/margins": 26.335922241210938, "rewards/rejected": -26.712427139282227, "step": 4370 }, { "epoch": 1.49, "learning_rate": 2.7986906710310967e-07, "logits/chosen": -0.4752674698829651, "logits/rejected": -0.5681442618370056, "logps/chosen": -246.0357666015625, "logps/rejected": -927.1434326171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.5988559126853943, "rewards/margins": 25.943933486938477, "rewards/rejected": -26.542781829833984, "step": 4380 }, { "epoch": 1.49, "learning_rate": 2.7923958202190604e-07, "logits/chosen": -0.5726215243339539, "logits/rejected": -0.6129893064498901, "logps/chosen": -169.2570037841797, "logps/rejected": -915.3245849609375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4199690818786621, "rewards/margins": 26.038936614990234, "rewards/rejected": -26.458908081054688, "step": 4390 }, { "epoch": 1.5, "learning_rate": 2.7861009694070247e-07, "logits/chosen": -0.40878409147262573, "logits/rejected": -0.5935572981834412, "logps/chosen": -255.1625518798828, "logps/rejected": -537.9759521484375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.41659966111183167, "rewards/margins": 24.56509017944336, "rewards/rejected": -24.981687545776367, "step": 4400 }, { "epoch": 1.5, "eval_logits/chosen": -0.6311615109443665, "eval_logits/rejected": -0.6669716238975525, "eval_logps/chosen": -219.8622589111328, "eval_logps/rejected": -668.9984130859375, "eval_loss": 0.002546438481658697, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.17064297199249268, "eval_rewards/margins": 25.24280548095703, "eval_rewards/rejected": -25.413450241088867, "eval_runtime": 535.6245, "eval_samples_per_second": 17.736, "eval_steps_per_second": 0.554, "step": 4400 }, { "epoch": 1.5, "learning_rate": 2.779806118594989e-07, "logits/chosen": -0.5838621258735657, "logits/rejected": -0.5980414152145386, "logps/chosen": -222.0771942138672, "logps/rejected": -773.77099609375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.20677189528942108, "rewards/margins": 26.759521484375, "rewards/rejected": -26.966297149658203, "step": 4410 }, { "epoch": 1.5, "learning_rate": 2.7735112677829536e-07, "logits/chosen": -0.635046124458313, "logits/rejected": -0.5527677536010742, "logps/chosen": -148.0593719482422, "logps/rejected": -637.4700927734375, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.007630092091858387, "rewards/margins": 24.21263313293457, "rewards/rejected": -24.20500373840332, "step": 4420 }, { "epoch": 1.51, "learning_rate": 2.767216416970918e-07, "logits/chosen": -0.6165615320205688, "logits/rejected": -0.6915376782417297, "logps/chosen": -157.70816040039062, "logps/rejected": -528.6312255859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.2142827808856964, "rewards/margins": 24.308422088623047, "rewards/rejected": -24.522705078125, "step": 4430 }, { "epoch": 1.51, "learning_rate": 2.760921566158882e-07, "logits/chosen": -0.7215418815612793, "logits/rejected": -0.6157564520835876, "logps/chosen": -210.8969268798828, "logps/rejected": -619.2723999023438, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.527764081954956, "rewards/margins": 20.16398048400879, "rewards/rejected": -20.69174575805664, "step": 4440 }, { "epoch": 1.51, "learning_rate": 2.7546267153468463e-07, "logits/chosen": -0.3962731957435608, "logits/rejected": -0.6107335090637207, "logps/chosen": -383.6881408691406, "logps/rejected": -598.4122924804688, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.19958773255348206, "rewards/margins": 21.047800064086914, "rewards/rejected": -21.24738883972168, "step": 4450 }, { "epoch": 1.52, "learning_rate": 2.74833186453481e-07, "logits/chosen": -0.4935482442378998, "logits/rejected": -0.708462119102478, "logps/chosen": -286.0418395996094, "logps/rejected": -639.6117553710938, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.13407167792320251, "rewards/margins": 22.698196411132812, "rewards/rejected": -22.8322696685791, "step": 4460 }, { "epoch": 1.52, "learning_rate": 2.742037013722774e-07, "logits/chosen": -0.43831080198287964, "logits/rejected": -0.5730674266815186, "logps/chosen": -233.43661499023438, "logps/rejected": -721.2703247070312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.7597863078117371, "rewards/margins": 27.582605361938477, "rewards/rejected": -28.342391967773438, "step": 4470 }, { "epoch": 1.52, "learning_rate": 2.735742162910739e-07, "logits/chosen": -0.38983944058418274, "logits/rejected": -0.6528708338737488, "logps/chosen": -258.25994873046875, "logps/rejected": -500.02655029296875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.5247379541397095, "rewards/margins": 21.049779891967773, "rewards/rejected": -21.57451629638672, "step": 4480 }, { "epoch": 1.53, "learning_rate": 2.729447312098703e-07, "logits/chosen": -0.586678147315979, "logits/rejected": -0.716739296913147, "logps/chosen": -326.28936767578125, "logps/rejected": -704.1714477539062, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.30428168177604675, "rewards/margins": 22.737951278686523, "rewards/rejected": -23.042234420776367, "step": 4490 }, { "epoch": 1.53, "learning_rate": 2.7231524612866675e-07, "logits/chosen": -0.6034249067306519, "logits/rejected": -0.5649521946907043, "logps/chosen": -177.36868286132812, "logps/rejected": -683.35107421875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.5476217269897461, "rewards/margins": 21.189088821411133, "rewards/rejected": -21.73670768737793, "step": 4500 }, { "epoch": 1.53, "eval_logits/chosen": -0.6345024108886719, "eval_logits/rejected": -0.6866307258605957, "eval_logps/chosen": -221.52401733398438, "eval_logps/rejected": -654.6318359375, "eval_loss": 0.0026447693817317486, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.33681926131248474, "eval_rewards/margins": 23.639976501464844, "eval_rewards/rejected": -23.97679901123047, "eval_runtime": 536.4856, "eval_samples_per_second": 17.708, "eval_steps_per_second": 0.554, "step": 4500 }, { "epoch": 1.53, "learning_rate": 2.7168576104746317e-07, "logits/chosen": -0.5709516406059265, "logits/rejected": -0.6648103594779968, "logps/chosen": -193.32252502441406, "logps/rejected": -728.570068359375, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.06060982495546341, "rewards/margins": 26.1737060546875, "rewards/rejected": -26.11309242248535, "step": 4510 }, { "epoch": 1.54, "learning_rate": 2.710562759662596e-07, "logits/chosen": -0.5821015238761902, "logits/rejected": -0.5740675330162048, "logps/chosen": -217.5243682861328, "logps/rejected": -705.5767822265625, "loss": 0.0021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.33417055010795593, "rewards/margins": 24.92624855041504, "rewards/rejected": -25.260417938232422, "step": 4520 }, { "epoch": 1.54, "learning_rate": 2.70426790885056e-07, "logits/chosen": -0.5329350829124451, "logits/rejected": -0.5961849093437195, "logps/chosen": -215.7123565673828, "logps/rejected": -867.8529052734375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.10672841221094131, "rewards/margins": 29.20607566833496, "rewards/rejected": -29.312808990478516, "step": 4530 }, { "epoch": 1.54, "learning_rate": 2.6979730580385244e-07, "logits/chosen": -0.3858904242515564, "logits/rejected": -0.623163104057312, "logps/chosen": -294.22210693359375, "logps/rejected": -680.1553955078125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.16081400215625763, "rewards/margins": 21.86477279663086, "rewards/rejected": -22.02558708190918, "step": 4540 }, { "epoch": 1.55, "learning_rate": 2.6916782072264886e-07, "logits/chosen": -0.5801902413368225, "logits/rejected": -0.6527605056762695, "logps/chosen": -215.2085723876953, "logps/rejected": -555.6490478515625, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.04508133977651596, "rewards/margins": 23.1293888092041, "rewards/rejected": -23.174470901489258, "step": 4550 }, { "epoch": 1.55, "learning_rate": 2.685383356414453e-07, "logits/chosen": -0.4929986596107483, "logits/rejected": -0.7096401453018188, "logps/chosen": -238.4801788330078, "logps/rejected": -630.2096557617188, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.41103416681289673, "rewards/margins": 29.2844295501709, "rewards/rejected": -29.69546890258789, "step": 4560 }, { "epoch": 1.55, "learning_rate": 2.679088505602417e-07, "logits/chosen": -0.5265017747879028, "logits/rejected": -0.666379451751709, "logps/chosen": -269.65869140625, "logps/rejected": -753.6040649414062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.24989740550518036, "rewards/margins": 26.072265625, "rewards/rejected": -26.322162628173828, "step": 4570 }, { "epoch": 1.56, "learning_rate": 2.6727936547903813e-07, "logits/chosen": -0.6154996156692505, "logits/rejected": -0.5532437562942505, "logps/chosen": -159.207275390625, "logps/rejected": -651.8311157226562, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.43689998984336853, "rewards/margins": 19.95778465270996, "rewards/rejected": -20.394685745239258, "step": 4580 }, { "epoch": 1.56, "learning_rate": 2.6664988039783455e-07, "logits/chosen": -0.5538499355316162, "logits/rejected": -0.6406517028808594, "logps/chosen": -163.55776977539062, "logps/rejected": -539.0841674804688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.2926756739616394, "rewards/margins": 24.484786987304688, "rewards/rejected": -24.777462005615234, "step": 4590 }, { "epoch": 1.56, "learning_rate": 2.66020395316631e-07, "logits/chosen": -0.49109354615211487, "logits/rejected": -0.6524702310562134, "logps/chosen": -175.52285766601562, "logps/rejected": -550.9501342773438, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.5230644345283508, "rewards/margins": 22.649118423461914, "rewards/rejected": -23.172183990478516, "step": 4600 }, { "epoch": 1.56, "eval_logits/chosen": -0.6252622604370117, "eval_logits/rejected": -0.6724523305892944, "eval_logps/chosen": -218.0094757080078, "eval_logps/rejected": -654.3194580078125, "eval_loss": 0.002482361625880003, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.014634787105023861, "eval_rewards/margins": 23.960182189941406, "eval_rewards/rejected": -23.945545196533203, "eval_runtime": 534.045, "eval_samples_per_second": 17.789, "eval_steps_per_second": 0.556, "step": 4600 }, { "epoch": 1.57, "learning_rate": 2.6539091023542745e-07, "logits/chosen": -0.5461439490318298, "logits/rejected": -0.6609812378883362, "logps/chosen": -156.5863494873047, "logps/rejected": -759.4094848632812, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.45168161392211914, "rewards/margins": 25.320323944091797, "rewards/rejected": -24.868640899658203, "step": 4610 }, { "epoch": 1.57, "learning_rate": 2.647614251542238e-07, "logits/chosen": -0.5062055587768555, "logits/rejected": -0.6309736967086792, "logps/chosen": -194.71963500976562, "logps/rejected": -747.3084106445312, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.2244301289319992, "rewards/margins": 22.145830154418945, "rewards/rejected": -22.370264053344727, "step": 4620 }, { "epoch": 1.57, "learning_rate": 2.6413194007302024e-07, "logits/chosen": -0.40734297037124634, "logits/rejected": -0.6297694444656372, "logps/chosen": -177.48300170898438, "logps/rejected": -650.8176879882812, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.35443010926246643, "rewards/margins": 21.21037483215332, "rewards/rejected": -21.56480598449707, "step": 4630 }, { "epoch": 1.58, "learning_rate": 2.6350245499181666e-07, "logits/chosen": -0.3818315863609314, "logits/rejected": -0.6109921336174011, "logps/chosen": -241.63693237304688, "logps/rejected": -669.118408203125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.17193210124969482, "rewards/margins": 23.999868392944336, "rewards/rejected": -23.82793617248535, "step": 4640 }, { "epoch": 1.58, "learning_rate": 2.628729699106131e-07, "logits/chosen": -0.5388150811195374, "logits/rejected": -0.5518852472305298, "logps/chosen": -170.616455078125, "logps/rejected": -676.6611328125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.055558573454618454, "rewards/margins": 26.960195541381836, "rewards/rejected": -27.0157527923584, "step": 4650 }, { "epoch": 1.58, "learning_rate": 2.6224348482940956e-07, "logits/chosen": -0.4318356513977051, "logits/rejected": -0.6526557803153992, "logps/chosen": -321.1387939453125, "logps/rejected": -772.7674560546875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -0.0408220998942852, "rewards/margins": 22.886890411376953, "rewards/rejected": -22.92771339416504, "step": 4660 }, { "epoch": 1.59, "learning_rate": 2.61613999748206e-07, "logits/chosen": -0.37418609857559204, "logits/rejected": -0.7174888849258423, "logps/chosen": -254.8004913330078, "logps/rejected": -495.41912841796875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.4459246098995209, "rewards/margins": 22.520755767822266, "rewards/rejected": -22.966678619384766, "step": 4670 }, { "epoch": 1.59, "learning_rate": 2.609845146670024e-07, "logits/chosen": -0.6140649318695068, "logits/rejected": -0.6468146443367004, "logps/chosen": -155.67242431640625, "logps/rejected": -506.542236328125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.17385892570018768, "rewards/margins": 22.87448501586914, "rewards/rejected": -23.048343658447266, "step": 4680 }, { "epoch": 1.59, "learning_rate": 2.603550295857988e-07, "logits/chosen": -0.541647732257843, "logits/rejected": -0.5863468647003174, "logps/chosen": -173.7339324951172, "logps/rejected": -660.7400512695312, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.40907183289527893, "rewards/margins": 20.492504119873047, "rewards/rejected": -20.90157699584961, "step": 4690 }, { "epoch": 1.6, "learning_rate": 2.597255445045952e-07, "logits/chosen": -0.44503307342529297, "logits/rejected": -0.6109490394592285, "logps/chosen": -290.66552734375, "logps/rejected": -542.789306640625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.508078932762146, "rewards/margins": 23.950260162353516, "rewards/rejected": -24.45833969116211, "step": 4700 }, { "epoch": 1.6, "eval_logits/chosen": -0.6168164610862732, "eval_logits/rejected": -0.6643640995025635, "eval_logps/chosen": -217.53953552246094, "eval_logps/rejected": -648.1558227539062, "eval_loss": 0.002414580900222063, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.061629027128219604, "eval_rewards/margins": 23.390819549560547, "eval_rewards/rejected": -23.329193115234375, "eval_runtime": 535.4173, "eval_samples_per_second": 17.743, "eval_steps_per_second": 0.555, "step": 4700 }, { "epoch": 1.6, "learning_rate": 2.590960594233916e-07, "logits/chosen": -0.4596787095069885, "logits/rejected": -0.49745646119117737, "logps/chosen": -194.5891876220703, "logps/rejected": -703.3238525390625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.022034084424376488, "rewards/margins": 24.173656463623047, "rewards/rejected": -24.195690155029297, "step": 4710 }, { "epoch": 1.6, "learning_rate": 2.584665743421881e-07, "logits/chosen": -0.5636164546012878, "logits/rejected": -0.6444137096405029, "logps/chosen": -161.633056640625, "logps/rejected": -520.211669921875, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.6545563340187073, "rewards/margins": 23.91883659362793, "rewards/rejected": -24.57339096069336, "step": 4720 }, { "epoch": 1.61, "learning_rate": 2.578370892609845e-07, "logits/chosen": -0.44867125153541565, "logits/rejected": -0.6580969095230103, "logps/chosen": -223.6890106201172, "logps/rejected": -535.4776611328125, "loss": 0.0015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.019272804260254, "rewards/margins": 22.231422424316406, "rewards/rejected": -23.25069808959961, "step": 4730 }, { "epoch": 1.61, "learning_rate": 2.5720760417978095e-07, "logits/chosen": -0.46369022130966187, "logits/rejected": -0.7146589159965515, "logps/chosen": -294.77264404296875, "logps/rejected": -630.6145629882812, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.615123450756073, "rewards/margins": 26.241802215576172, "rewards/rejected": -26.856924057006836, "step": 4740 }, { "epoch": 1.61, "learning_rate": 2.5657811909857737e-07, "logits/chosen": -0.36951541900634766, "logits/rejected": -0.6156561970710754, "logps/chosen": -249.8860321044922, "logps/rejected": -611.0787963867188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.1613318920135498, "rewards/margins": 26.555517196655273, "rewards/rejected": -26.716848373413086, "step": 4750 }, { "epoch": 1.62, "learning_rate": 2.559486340173738e-07, "logits/chosen": -0.621701180934906, "logits/rejected": -0.6765621900558472, "logps/chosen": -182.914306640625, "logps/rejected": -566.7771606445312, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.3908681571483612, "rewards/margins": 24.18806266784668, "rewards/rejected": -24.578929901123047, "step": 4760 }, { "epoch": 1.62, "learning_rate": 2.5531914893617016e-07, "logits/chosen": -0.569473147392273, "logits/rejected": -0.6695224642753601, "logps/chosen": -179.2150421142578, "logps/rejected": -865.4130859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8694178462028503, "rewards/margins": 25.207727432250977, "rewards/rejected": -26.077144622802734, "step": 4770 }, { "epoch": 1.62, "learning_rate": 2.5468966385496664e-07, "logits/chosen": -0.5482251644134521, "logits/rejected": -0.5988813638687134, "logps/chosen": -166.9648895263672, "logps/rejected": -532.469970703125, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.34848907589912415, "rewards/margins": 27.195964813232422, "rewards/rejected": -27.54445457458496, "step": 4780 }, { "epoch": 1.63, "learning_rate": 2.5406017877376306e-07, "logits/chosen": -0.48157110810279846, "logits/rejected": -0.6406437754631042, "logps/chosen": -204.14601135253906, "logps/rejected": -604.50244140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.43879812955856323, "rewards/margins": 25.164310455322266, "rewards/rejected": -25.60310935974121, "step": 4790 }, { "epoch": 1.63, "learning_rate": 2.534306936925595e-07, "logits/chosen": -0.4736208915710449, "logits/rejected": -0.7012395262718201, "logps/chosen": -219.4847869873047, "logps/rejected": -651.4237060546875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.4310928285121918, "rewards/margins": 27.208871841430664, "rewards/rejected": -27.639968872070312, "step": 4800 }, { "epoch": 1.63, "eval_logits/chosen": -0.6633859872817993, "eval_logits/rejected": -0.7160833477973938, "eval_logps/chosen": -223.2894287109375, "eval_logps/rejected": -683.9342651367188, "eval_loss": 0.002615395002067089, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.5133598446846008, "eval_rewards/margins": 26.3936767578125, "eval_rewards/rejected": -26.90703773498535, "eval_runtime": 536.4898, "eval_samples_per_second": 17.708, "eval_steps_per_second": 0.554, "step": 4800 }, { "epoch": 1.63, "learning_rate": 2.528012086113559e-07, "logits/chosen": -0.4839145541191101, "logits/rejected": -0.662912130355835, "logps/chosen": -187.75682067871094, "logps/rejected": -810.5582885742188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.45439451932907104, "rewards/margins": 24.701744079589844, "rewards/rejected": -25.156137466430664, "step": 4810 }, { "epoch": 1.64, "learning_rate": 2.5217172353015233e-07, "logits/chosen": -0.6128624081611633, "logits/rejected": -0.6661882996559143, "logps/chosen": -220.98428344726562, "logps/rejected": -730.9119873046875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.024734150618314743, "rewards/margins": 26.22871971130371, "rewards/rejected": -26.2039852142334, "step": 4820 }, { "epoch": 1.64, "learning_rate": 2.5154223844894875e-07, "logits/chosen": -0.5217616558074951, "logits/rejected": -0.6460791826248169, "logps/chosen": -240.1710205078125, "logps/rejected": -500.12127685546875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.4229447841644287, "rewards/margins": 22.270126342773438, "rewards/rejected": -22.693069458007812, "step": 4830 }, { "epoch": 1.65, "learning_rate": 2.509127533677452e-07, "logits/chosen": -0.499409019947052, "logits/rejected": -0.6701967120170593, "logps/chosen": -231.867919921875, "logps/rejected": -460.81292724609375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.19407083094120026, "rewards/margins": 25.11985969543457, "rewards/rejected": -24.92578887939453, "step": 4840 }, { "epoch": 1.65, "learning_rate": 2.502832682865416e-07, "logits/chosen": -0.5727235078811646, "logits/rejected": -0.6604186296463013, "logps/chosen": -230.05160522460938, "logps/rejected": -622.3294067382812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.133963480591774, "rewards/margins": 22.830669403076172, "rewards/rejected": -22.69670295715332, "step": 4850 }, { "epoch": 1.65, "learning_rate": 2.49653783205338e-07, "logits/chosen": -0.4505650997161865, "logits/rejected": -0.6381336450576782, "logps/chosen": -172.19723510742188, "logps/rejected": -593.9669799804688, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 0.01955774798989296, "rewards/margins": 23.25271224975586, "rewards/rejected": -23.233158111572266, "step": 4860 }, { "epoch": 1.66, "learning_rate": 2.4902429812413444e-07, "logits/chosen": -0.5260831117630005, "logits/rejected": -0.5900360345840454, "logps/chosen": -173.10012817382812, "logps/rejected": -640.6582641601562, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.057239532470703125, "rewards/margins": 24.58327865600586, "rewards/rejected": -24.640520095825195, "step": 4870 }, { "epoch": 1.66, "learning_rate": 2.4839481304293086e-07, "logits/chosen": -0.5393608808517456, "logits/rejected": -0.6698201298713684, "logps/chosen": -222.13864135742188, "logps/rejected": -660.695068359375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.2964478135108948, "rewards/margins": 20.80240249633789, "rewards/rejected": -21.098844528198242, "step": 4880 }, { "epoch": 1.66, "learning_rate": 2.477653279617273e-07, "logits/chosen": -0.4589292109012604, "logits/rejected": -0.706427276134491, "logps/chosen": -244.42788696289062, "logps/rejected": -887.2975463867188, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.39181023836135864, "rewards/margins": 26.30059242248535, "rewards/rejected": -25.90877914428711, "step": 4890 }, { "epoch": 1.67, "learning_rate": 2.471358428805237e-07, "logits/chosen": -0.42118844389915466, "logits/rejected": -0.5147238969802856, "logps/chosen": -246.2998809814453, "logps/rejected": -684.775146484375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.3254477381706238, "rewards/margins": 23.42202377319336, "rewards/rejected": -23.096576690673828, "step": 4900 }, { "epoch": 1.67, "eval_logits/chosen": -0.6037598252296448, "eval_logits/rejected": -0.6444448828697205, "eval_logps/chosen": -219.07179260253906, "eval_logps/rejected": -663.070068359375, "eval_loss": 0.002533489838242531, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.09159538894891739, "eval_rewards/margins": 24.729022979736328, "eval_rewards/rejected": -24.820621490478516, "eval_runtime": 535.3572, "eval_samples_per_second": 17.745, "eval_steps_per_second": 0.555, "step": 4900 }, { "epoch": 1.67, "learning_rate": 2.4650635779932013e-07, "logits/chosen": -0.5037122964859009, "logits/rejected": -0.639061450958252, "logps/chosen": -171.3362274169922, "logps/rejected": -563.0844116210938, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.40644949674606323, "rewards/margins": 25.411548614501953, "rewards/rejected": -25.8179988861084, "step": 4910 }, { "epoch": 1.67, "learning_rate": 2.4587687271811656e-07, "logits/chosen": -0.4762570261955261, "logits/rejected": -0.5796430110931396, "logps/chosen": -183.03929138183594, "logps/rejected": -764.0223388671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.2769398093223572, "rewards/margins": 24.206470489501953, "rewards/rejected": -24.483409881591797, "step": 4920 }, { "epoch": 1.68, "learning_rate": 2.45247387636913e-07, "logits/chosen": -0.5040691494941711, "logits/rejected": -0.614579975605011, "logps/chosen": -214.080810546875, "logps/rejected": -611.3472900390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.008473282679915428, "rewards/margins": 22.77727508544922, "rewards/rejected": -22.768802642822266, "step": 4930 }, { "epoch": 1.68, "learning_rate": 2.446179025557094e-07, "logits/chosen": -0.44418421387672424, "logits/rejected": -0.550711452960968, "logps/chosen": -344.0567932128906, "logps/rejected": -770.4573974609375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.273067444562912, "rewards/margins": 23.823225021362305, "rewards/rejected": -24.096294403076172, "step": 4940 }, { "epoch": 1.68, "learning_rate": 2.439884174745059e-07, "logits/chosen": -0.560650646686554, "logits/rejected": -0.5647963285446167, "logps/chosen": -227.96621704101562, "logps/rejected": -618.65087890625, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -0.6119434237480164, "rewards/margins": 27.354238510131836, "rewards/rejected": -27.966182708740234, "step": 4950 }, { "epoch": 1.69, "learning_rate": 2.4335893239330225e-07, "logits/chosen": -0.3907301127910614, "logits/rejected": -0.646602988243103, "logps/chosen": -273.32794189453125, "logps/rejected": -615.2568969726562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.3525446355342865, "rewards/margins": 21.902441024780273, "rewards/rejected": -22.254987716674805, "step": 4960 }, { "epoch": 1.69, "learning_rate": 2.4272944731209867e-07, "logits/chosen": -0.5153100490570068, "logits/rejected": -0.49774008989334106, "logps/chosen": -226.5325164794922, "logps/rejected": -630.181396484375, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -0.27155548334121704, "rewards/margins": 25.313961029052734, "rewards/rejected": -25.58551597595215, "step": 4970 }, { "epoch": 1.69, "learning_rate": 2.4209996223089514e-07, "logits/chosen": -0.4449167847633362, "logits/rejected": -0.5284848213195801, "logps/chosen": -220.9611358642578, "logps/rejected": -611.3411254882812, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.29146355390548706, "rewards/margins": 20.74224853515625, "rewards/rejected": -20.450786590576172, "step": 4980 }, { "epoch": 1.7, "learning_rate": 2.4147047714969157e-07, "logits/chosen": -0.538847804069519, "logits/rejected": -0.6162213087081909, "logps/chosen": -166.40518188476562, "logps/rejected": -649.2560424804688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.3283061385154724, "rewards/margins": 20.154369354248047, "rewards/rejected": -19.82606315612793, "step": 4990 }, { "epoch": 1.7, "learning_rate": 2.4084099206848794e-07, "logits/chosen": -0.46189770102500916, "logits/rejected": -0.5298742055892944, "logps/chosen": -158.20643615722656, "logps/rejected": -542.6203002929688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.33455023169517517, "rewards/margins": 21.550458908081055, "rewards/rejected": -21.215909957885742, "step": 5000 }, { "epoch": 1.7, "eval_logits/chosen": -0.5784969925880432, "eval_logits/rejected": -0.6169469952583313, "eval_logps/chosen": -216.5716094970703, "eval_logps/rejected": -653.2886962890625, "eval_loss": 0.0024540331214666367, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.15841947495937347, "eval_rewards/margins": 24.000890731811523, "eval_rewards/rejected": -23.842470169067383, "eval_runtime": 536.1484, "eval_samples_per_second": 17.719, "eval_steps_per_second": 0.554, "step": 5000 }, { "epoch": 1.7, "learning_rate": 2.402115069872844e-07, "logits/chosen": -0.37717562913894653, "logits/rejected": -0.6291487812995911, "logps/chosen": -189.11920166015625, "logps/rejected": -532.9532470703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.07610680162906647, "rewards/margins": 25.146631240844727, "rewards/rejected": -25.07052230834961, "step": 5010 }, { "epoch": 1.71, "learning_rate": 2.3958202190608084e-07, "logits/chosen": -0.5811060070991516, "logits/rejected": -0.5469228029251099, "logps/chosen": -229.98623657226562, "logps/rejected": -550.5834350585938, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.3476710915565491, "rewards/margins": 22.676952362060547, "rewards/rejected": -22.329280853271484, "step": 5020 }, { "epoch": 1.71, "learning_rate": 2.3895253682487726e-07, "logits/chosen": -0.2827227711677551, "logits/rejected": -0.5084789991378784, "logps/chosen": -174.36451721191406, "logps/rejected": -628.8973999023438, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.3764151930809021, "rewards/margins": 20.86031150817871, "rewards/rejected": -20.483898162841797, "step": 5030 }, { "epoch": 1.71, "learning_rate": 2.3832305174367368e-07, "logits/chosen": -0.4601469933986664, "logits/rejected": -0.6643530130386353, "logps/chosen": -222.8939971923828, "logps/rejected": -602.3601684570312, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.2566896975040436, "rewards/margins": 24.220136642456055, "rewards/rejected": -24.47682762145996, "step": 5040 }, { "epoch": 1.72, "learning_rate": 2.3769356666247008e-07, "logits/chosen": -0.5313233137130737, "logits/rejected": -0.6515535116195679, "logps/chosen": -214.2427215576172, "logps/rejected": -617.2474975585938, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.07487977296113968, "rewards/margins": 22.338911056518555, "rewards/rejected": -22.413789749145508, "step": 5050 }, { "epoch": 1.72, "learning_rate": 2.370640815812665e-07, "logits/chosen": -0.46589189767837524, "logits/rejected": -0.5898574590682983, "logps/chosen": -230.20791625976562, "logps/rejected": -637.32470703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.5147914886474609, "rewards/margins": 24.035917282104492, "rewards/rejected": -23.521127700805664, "step": 5060 }, { "epoch": 1.72, "learning_rate": 2.3643459650006295e-07, "logits/chosen": -0.4968477189540863, "logits/rejected": -0.724267840385437, "logps/chosen": -219.8869171142578, "logps/rejected": -431.3829040527344, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.2254493236541748, "rewards/margins": 20.19228744506836, "rewards/rejected": -19.966838836669922, "step": 5070 }, { "epoch": 1.73, "learning_rate": 2.3580511141885937e-07, "logits/chosen": -0.466185986995697, "logits/rejected": -0.6805993318557739, "logps/chosen": -212.73251342773438, "logps/rejected": -493.9432067871094, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.3529016971588135, "rewards/margins": 20.42238426208496, "rewards/rejected": -20.775287628173828, "step": 5080 }, { "epoch": 1.73, "learning_rate": 2.3517562633765577e-07, "logits/chosen": -0.3879837989807129, "logits/rejected": -0.6191308498382568, "logps/chosen": -162.71144104003906, "logps/rejected": -501.46234130859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.30180424451828003, "rewards/margins": 22.97334861755371, "rewards/rejected": -22.671546936035156, "step": 5090 }, { "epoch": 1.73, "learning_rate": 2.3454614125645222e-07, "logits/chosen": -0.4880734980106354, "logits/rejected": -0.600933849811554, "logps/chosen": -206.69479370117188, "logps/rejected": -671.4459838867188, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -0.002764034317806363, "rewards/margins": 21.817630767822266, "rewards/rejected": -21.820392608642578, "step": 5100 }, { "epoch": 1.73, "eval_logits/chosen": -0.5975618958473206, "eval_logits/rejected": -0.647698700428009, "eval_logps/chosen": -213.57521057128906, "eval_logps/rejected": -636.8297729492188, "eval_loss": 0.0025801321025937796, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.45806267857551575, "eval_rewards/margins": 22.654644012451172, "eval_rewards/rejected": -22.19658088684082, "eval_runtime": 536.2503, "eval_samples_per_second": 17.716, "eval_steps_per_second": 0.554, "step": 5100 }, { "epoch": 1.74, "learning_rate": 2.3391665617524864e-07, "logits/chosen": -0.5915528535842896, "logits/rejected": -0.5664933919906616, "logps/chosen": -267.34564208984375, "logps/rejected": -845.1912231445312, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": 0.7690844535827637, "rewards/margins": 22.692201614379883, "rewards/rejected": -21.923114776611328, "step": 5110 }, { "epoch": 1.74, "learning_rate": 2.3328717109404506e-07, "logits/chosen": -0.5003194212913513, "logits/rejected": -0.6175363659858704, "logps/chosen": -169.7011260986328, "logps/rejected": -550.2203979492188, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.624178409576416, "rewards/margins": 24.67953109741211, "rewards/rejected": -24.05535125732422, "step": 5120 }, { "epoch": 1.74, "learning_rate": 2.3265768601284149e-07, "logits/chosen": -0.4194776117801666, "logits/rejected": -0.5525631308555603, "logps/chosen": -296.91156005859375, "logps/rejected": -664.9329223632812, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.5841951966285706, "rewards/margins": 23.34842872619629, "rewards/rejected": -22.764232635498047, "step": 5130 }, { "epoch": 1.75, "learning_rate": 2.320282009316379e-07, "logits/chosen": -0.556465744972229, "logits/rejected": -0.6188619136810303, "logps/chosen": -285.1603698730469, "logps/rejected": -803.9190673828125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.6422853469848633, "rewards/margins": 23.42117691040039, "rewards/rejected": -22.77889060974121, "step": 5140 }, { "epoch": 1.75, "learning_rate": 2.3139871585043433e-07, "logits/chosen": -0.45192503929138184, "logits/rejected": -0.573442816734314, "logps/chosen": -221.6807098388672, "logps/rejected": -544.23779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.044663168489933014, "rewards/margins": 22.512176513671875, "rewards/rejected": -22.467514038085938, "step": 5150 }, { "epoch": 1.75, "learning_rate": 2.3076923076923078e-07, "logits/chosen": -0.41379469633102417, "logits/rejected": -0.6697598695755005, "logps/chosen": -327.4898986816406, "logps/rejected": -599.6917114257812, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.1948927640914917, "rewards/margins": 23.816383361816406, "rewards/rejected": -23.621490478515625, "step": 5160 }, { "epoch": 1.76, "learning_rate": 2.3013974568802718e-07, "logits/chosen": -0.5278249979019165, "logits/rejected": -0.6475222110748291, "logps/chosen": -225.9620361328125, "logps/rejected": -541.6181640625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.5749045610427856, "rewards/margins": 23.192306518554688, "rewards/rejected": -22.617403030395508, "step": 5170 }, { "epoch": 1.76, "learning_rate": 2.295102606068236e-07, "logits/chosen": -0.6105553507804871, "logits/rejected": -0.6448832750320435, "logps/chosen": -163.29331970214844, "logps/rejected": -846.7215576171875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.031028103083372116, "rewards/margins": 24.41720199584961, "rewards/rejected": -24.38617515563965, "step": 5180 }, { "epoch": 1.76, "learning_rate": 2.2888077552562005e-07, "logits/chosen": -0.6744899749755859, "logits/rejected": -0.5688742399215698, "logps/chosen": -164.93519592285156, "logps/rejected": -680.6533203125, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.3923385739326477, "rewards/margins": 22.96457290649414, "rewards/rejected": -22.572233200073242, "step": 5190 }, { "epoch": 1.77, "learning_rate": 2.2825129044441647e-07, "logits/chosen": -0.6232777833938599, "logits/rejected": -0.6612902879714966, "logps/chosen": -154.3406982421875, "logps/rejected": -575.6177978515625, "loss": 0.002, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.16955123841762543, "rewards/margins": 21.990304946899414, "rewards/rejected": -22.159852981567383, "step": 5200 }, { "epoch": 1.77, "eval_logits/chosen": -0.6312224268913269, "eval_logits/rejected": -0.677798867225647, "eval_logps/chosen": -216.4930877685547, "eval_logps/rejected": -652.6381225585938, "eval_loss": 0.002279468346387148, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.16627489030361176, "eval_rewards/margins": 23.94369125366211, "eval_rewards/rejected": -23.777416229248047, "eval_runtime": 538.1247, "eval_samples_per_second": 17.654, "eval_steps_per_second": 0.552, "step": 5200 }, { "epoch": 1.77, "learning_rate": 2.2762180536321287e-07, "logits/chosen": -0.5219216346740723, "logits/rejected": -0.6434003114700317, "logps/chosen": -167.46151733398438, "logps/rejected": -690.9929809570312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.32931405305862427, "rewards/margins": 24.46516227722168, "rewards/rejected": -24.135847091674805, "step": 5210 }, { "epoch": 1.77, "learning_rate": 2.2699232028200932e-07, "logits/chosen": -0.5748692750930786, "logits/rejected": -0.6937567591667175, "logps/chosen": -220.669677734375, "logps/rejected": -745.5933837890625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.25732454657554626, "rewards/margins": 21.676713943481445, "rewards/rejected": -21.419391632080078, "step": 5220 }, { "epoch": 1.78, "learning_rate": 2.2636283520080574e-07, "logits/chosen": -0.3657473623752594, "logits/rejected": -0.6521558165550232, "logps/chosen": -268.3288269042969, "logps/rejected": -576.5147094726562, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.26564258337020874, "rewards/margins": 22.481096267700195, "rewards/rejected": -22.74673843383789, "step": 5230 }, { "epoch": 1.78, "learning_rate": 2.2573335011960216e-07, "logits/chosen": -0.6439858078956604, "logits/rejected": -0.6543588042259216, "logps/chosen": -157.1187744140625, "logps/rejected": -764.0343017578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.41840094327926636, "rewards/margins": 24.57142448425293, "rewards/rejected": -24.153024673461914, "step": 5240 }, { "epoch": 1.78, "learning_rate": 2.2510386503839856e-07, "logits/chosen": -0.5713964700698853, "logits/rejected": -0.6649643778800964, "logps/chosen": -162.4407958984375, "logps/rejected": -666.3436279296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.07940883934497833, "rewards/margins": 23.65790367126465, "rewards/rejected": -23.5784969329834, "step": 5250 }, { "epoch": 1.79, "learning_rate": 2.24474379957195e-07, "logits/chosen": -0.45010191202163696, "logits/rejected": -0.7330666780471802, "logps/chosen": -322.3794250488281, "logps/rejected": -784.6651611328125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.11690457165241241, "rewards/margins": 26.09649085998535, "rewards/rejected": -25.979583740234375, "step": 5260 }, { "epoch": 1.79, "learning_rate": 2.2384489487599143e-07, "logits/chosen": -0.5306657552719116, "logits/rejected": -0.6162094473838806, "logps/chosen": -294.46875, "logps/rejected": -808.5828857421875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.11868568509817123, "rewards/margins": 25.195478439331055, "rewards/rejected": -25.076797485351562, "step": 5270 }, { "epoch": 1.79, "learning_rate": 2.2321540979478783e-07, "logits/chosen": -0.6402685046195984, "logits/rejected": -0.7436927556991577, "logps/chosen": -225.0797882080078, "logps/rejected": -709.9534301757812, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3568553626537323, "rewards/margins": 27.051034927368164, "rewards/rejected": -27.407888412475586, "step": 5280 }, { "epoch": 1.8, "learning_rate": 2.2258592471358428e-07, "logits/chosen": -0.541791558265686, "logits/rejected": -0.6506600975990295, "logps/chosen": -168.20590209960938, "logps/rejected": -710.8687744140625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.22546592354774475, "rewards/margins": 22.423633575439453, "rewards/rejected": -22.649099349975586, "step": 5290 }, { "epoch": 1.8, "learning_rate": 2.219564396323807e-07, "logits/chosen": -0.5548420548439026, "logits/rejected": -0.6564828157424927, "logps/chosen": -169.18524169921875, "logps/rejected": -813.7459716796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.1035889983177185, "rewards/margins": 26.424402236938477, "rewards/rejected": -26.52799415588379, "step": 5300 }, { "epoch": 1.8, "eval_logits/chosen": -0.6444500088691711, "eval_logits/rejected": -0.6906864047050476, "eval_logps/chosen": -217.2704620361328, "eval_logps/rejected": -659.503173828125, "eval_loss": 0.002146689221262932, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.08853628486394882, "eval_rewards/margins": 24.552459716796875, "eval_rewards/rejected": -24.46392250061035, "eval_runtime": 537.4308, "eval_samples_per_second": 17.677, "eval_steps_per_second": 0.553, "step": 5300 }, { "epoch": 1.8, "learning_rate": 2.2132695455117712e-07, "logits/chosen": -0.5261704325675964, "logits/rejected": -0.6446816325187683, "logps/chosen": -175.25343322753906, "logps/rejected": -529.8424072265625, "loss": 0.0017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.07984890043735504, "rewards/margins": 24.476131439208984, "rewards/rejected": -24.396284103393555, "step": 5310 }, { "epoch": 1.81, "learning_rate": 2.2069746946997355e-07, "logits/chosen": -0.7382365465164185, "logits/rejected": -0.6253567934036255, "logps/chosen": -146.17807006835938, "logps/rejected": -464.2119140625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.12434907257556915, "rewards/margins": 23.309185028076172, "rewards/rejected": -23.433536529541016, "step": 5320 }, { "epoch": 1.81, "learning_rate": 2.2006798438876997e-07, "logits/chosen": -0.4152161478996277, "logits/rejected": -0.6559327840805054, "logps/chosen": -301.18450927734375, "logps/rejected": -749.1504516601562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014094263315200806, "rewards/margins": 22.68332290649414, "rewards/rejected": -22.684734344482422, "step": 5330 }, { "epoch": 1.82, "learning_rate": 2.194384993075664e-07, "logits/chosen": -0.587774395942688, "logits/rejected": -0.6367352604866028, "logps/chosen": -157.99966430664062, "logps/rejected": -694.4251708984375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.1906069815158844, "rewards/margins": 26.0396785736084, "rewards/rejected": -25.849071502685547, "step": 5340 }, { "epoch": 1.82, "learning_rate": 2.1880901422636284e-07, "logits/chosen": -0.4906982481479645, "logits/rejected": -0.6710368394851685, "logps/chosen": -168.2609100341797, "logps/rejected": -590.3834228515625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.2531966269016266, "rewards/margins": 24.17781639099121, "rewards/rejected": -23.924619674682617, "step": 5350 }, { "epoch": 1.82, "learning_rate": 2.1817952914515924e-07, "logits/chosen": -0.5968886017799377, "logits/rejected": -0.6363939046859741, "logps/chosen": -165.75656127929688, "logps/rejected": -612.7036743164062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.031180500984191895, "rewards/margins": 21.324342727661133, "rewards/rejected": -21.293161392211914, "step": 5360 }, { "epoch": 1.83, "learning_rate": 2.1755004406395566e-07, "logits/chosen": -0.5899828672409058, "logits/rejected": -0.6916004419326782, "logps/chosen": -244.950927734375, "logps/rejected": -592.8594970703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.3582027852535248, "rewards/margins": 31.33123207092285, "rewards/rejected": -30.973031997680664, "step": 5370 }, { "epoch": 1.83, "learning_rate": 2.169205589827521e-07, "logits/chosen": -0.6637479066848755, "logits/rejected": -0.6977212429046631, "logps/chosen": -164.89901733398438, "logps/rejected": -666.614990234375, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -0.36202889680862427, "rewards/margins": 30.058273315429688, "rewards/rejected": -30.420303344726562, "step": 5380 }, { "epoch": 1.83, "learning_rate": 2.1629107390154853e-07, "logits/chosen": -0.5601236820220947, "logits/rejected": -0.6083390712738037, "logps/chosen": -169.22335815429688, "logps/rejected": -747.6775512695312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.1865139603614807, "rewards/margins": 27.021203994750977, "rewards/rejected": -27.207717895507812, "step": 5390 }, { "epoch": 1.84, "learning_rate": 2.1566158882034493e-07, "logits/chosen": -0.6054414510726929, "logits/rejected": -0.5846805572509766, "logps/chosen": -163.00924682617188, "logps/rejected": -496.41534423828125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.4530840516090393, "rewards/margins": 23.252452850341797, "rewards/rejected": -22.799367904663086, "step": 5400 }, { "epoch": 1.84, "eval_logits/chosen": -0.6177369356155396, "eval_logits/rejected": -0.667419970035553, "eval_logps/chosen": -214.89666748046875, "eval_logps/rejected": -653.016845703125, "eval_loss": 0.002047585090622306, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.32591402530670166, "eval_rewards/margins": 24.14120101928711, "eval_rewards/rejected": -23.815288543701172, "eval_runtime": 536.8459, "eval_samples_per_second": 17.696, "eval_steps_per_second": 0.553, "step": 5400 }, { "epoch": 1.84, "learning_rate": 2.1503210373914138e-07, "logits/chosen": -0.45282667875289917, "logits/rejected": -0.611941933631897, "logps/chosen": -219.09707641601562, "logps/rejected": -737.6278076171875, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6453992128372192, "rewards/margins": 20.995098114013672, "rewards/rejected": -20.34969711303711, "step": 5410 }, { "epoch": 1.84, "learning_rate": 2.144026186579378e-07, "logits/chosen": -0.50271075963974, "logits/rejected": -0.6788471937179565, "logps/chosen": -152.79452514648438, "logps/rejected": -799.66064453125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.056967489421367645, "rewards/margins": 26.688552856445312, "rewards/rejected": -26.6315860748291, "step": 5420 }, { "epoch": 1.85, "learning_rate": 2.1377313357673422e-07, "logits/chosen": -0.41423898935317993, "logits/rejected": -0.6023446917533875, "logps/chosen": -219.1536102294922, "logps/rejected": -657.9168701171875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": 0.08240961283445358, "rewards/margins": 20.660839080810547, "rewards/rejected": -20.578432083129883, "step": 5430 }, { "epoch": 1.85, "learning_rate": 2.1314364849553065e-07, "logits/chosen": -0.5515814423561096, "logits/rejected": -0.6339834332466125, "logps/chosen": -203.7603302001953, "logps/rejected": -564.93798828125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.28472867608070374, "rewards/margins": 22.325214385986328, "rewards/rejected": -22.609943389892578, "step": 5440 }, { "epoch": 1.85, "learning_rate": 2.1251416341432707e-07, "logits/chosen": -0.6348009705543518, "logits/rejected": -0.6726848483085632, "logps/chosen": -223.44857788085938, "logps/rejected": -738.9224243164062, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.6148960590362549, "rewards/margins": 24.82906150817871, "rewards/rejected": -24.214162826538086, "step": 5450 }, { "epoch": 1.86, "learning_rate": 2.118846783331235e-07, "logits/chosen": -0.511262059211731, "logits/rejected": -0.696782112121582, "logps/chosen": -257.5040283203125, "logps/rejected": -679.61474609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.36417466402053833, "rewards/margins": 26.876256942749023, "rewards/rejected": -26.5120849609375, "step": 5460 }, { "epoch": 1.86, "learning_rate": 2.1125519325191994e-07, "logits/chosen": -0.514334499835968, "logits/rejected": -0.6245428323745728, "logps/chosen": -180.42147827148438, "logps/rejected": -773.4385375976562, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.11412888765335083, "rewards/margins": 26.978740692138672, "rewards/rejected": -26.864612579345703, "step": 5470 }, { "epoch": 1.86, "learning_rate": 2.1062570817071634e-07, "logits/chosen": -0.4420148432254791, "logits/rejected": -0.5868907570838928, "logps/chosen": -271.01812744140625, "logps/rejected": -630.3073120117188, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.6297640800476074, "rewards/margins": 20.210132598876953, "rewards/rejected": -19.58036994934082, "step": 5480 }, { "epoch": 1.87, "learning_rate": 2.0999622308951276e-07, "logits/chosen": -0.5327980518341064, "logits/rejected": -0.6099623441696167, "logps/chosen": -211.6958770751953, "logps/rejected": -775.9808349609375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.4180804193019867, "rewards/margins": 29.96575355529785, "rewards/rejected": -30.383831024169922, "step": 5490 }, { "epoch": 1.87, "learning_rate": 2.093667380083092e-07, "logits/chosen": -0.6378130912780762, "logits/rejected": -0.6487399935722351, "logps/chosen": -172.21083068847656, "logps/rejected": -611.4880981445312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.23861508071422577, "rewards/margins": 24.568119049072266, "rewards/rejected": -24.806734085083008, "step": 5500 }, { "epoch": 1.87, "eval_logits/chosen": -0.6629668474197388, "eval_logits/rejected": -0.7239001989364624, "eval_logps/chosen": -217.609130859375, "eval_logps/rejected": -669.3798217773438, "eval_loss": 0.0026998009998351336, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.05466857925057411, "eval_rewards/margins": 25.50625991821289, "eval_rewards/rejected": -25.45159339904785, "eval_runtime": 537.7831, "eval_samples_per_second": 17.665, "eval_steps_per_second": 0.552, "step": 5500 }, { "epoch": 1.87, "learning_rate": 2.087372529271056e-07, "logits/chosen": -0.45547351241111755, "logits/rejected": -0.6782476902008057, "logps/chosen": -302.5398864746094, "logps/rejected": -754.4625854492188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.1866532266139984, "rewards/margins": 25.972097396850586, "rewards/rejected": -25.785442352294922, "step": 5510 }, { "epoch": 1.88, "learning_rate": 2.0810776784590203e-07, "logits/chosen": -0.4768661558628082, "logits/rejected": -0.6748565435409546, "logps/chosen": -183.79641723632812, "logps/rejected": -676.3335571289062, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.2570660710334778, "rewards/margins": 27.917861938476562, "rewards/rejected": -28.174922943115234, "step": 5520 }, { "epoch": 1.88, "learning_rate": 2.0747828276469848e-07, "logits/chosen": -0.6507681608200073, "logits/rejected": -0.7092069387435913, "logps/chosen": -274.5489196777344, "logps/rejected": -789.7574462890625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.3566311299800873, "rewards/margins": 23.843669891357422, "rewards/rejected": -24.200298309326172, "step": 5530 }, { "epoch": 1.88, "learning_rate": 2.068487976834949e-07, "logits/chosen": -0.550581693649292, "logits/rejected": -0.6649380922317505, "logps/chosen": -198.66046142578125, "logps/rejected": -662.0015869140625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.23536977171897888, "rewards/margins": 26.439926147460938, "rewards/rejected": -26.20455551147461, "step": 5540 }, { "epoch": 1.89, "learning_rate": 2.062193126022913e-07, "logits/chosen": -0.628831684589386, "logits/rejected": -0.6950745582580566, "logps/chosen": -162.88711547851562, "logps/rejected": -762.7994384765625, "loss": 0.0019, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6566280722618103, "rewards/margins": 26.300634384155273, "rewards/rejected": -26.957263946533203, "step": 5550 }, { "epoch": 1.89, "learning_rate": 2.0558982752108775e-07, "logits/chosen": -0.5900738835334778, "logits/rejected": -0.6592618823051453, "logps/chosen": -183.41152954101562, "logps/rejected": -793.13671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.18552155792713165, "rewards/margins": 28.076187133789062, "rewards/rejected": -28.261709213256836, "step": 5560 }, { "epoch": 1.89, "learning_rate": 2.0496034243988417e-07, "logits/chosen": -0.6325895190238953, "logits/rejected": -0.6428799033164978, "logps/chosen": -227.73788452148438, "logps/rejected": -772.1072387695312, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.42681169509887695, "rewards/margins": 27.67380714416504, "rewards/rejected": -28.10062026977539, "step": 5570 }, { "epoch": 1.9, "learning_rate": 2.043308573586806e-07, "logits/chosen": -0.5027201771736145, "logits/rejected": -0.6696706414222717, "logps/chosen": -291.6740417480469, "logps/rejected": -820.3040771484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.17119750380516052, "rewards/margins": 25.672754287719727, "rewards/rejected": -25.843952178955078, "step": 5580 }, { "epoch": 1.9, "learning_rate": 2.0370137227747701e-07, "logits/chosen": -0.5802045464515686, "logits/rejected": -0.6481605768203735, "logps/chosen": -234.7467803955078, "logps/rejected": -768.3636474609375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.07932128012180328, "rewards/margins": 28.208337783813477, "rewards/rejected": -28.287654876708984, "step": 5590 }, { "epoch": 1.9, "learning_rate": 2.0307188719627344e-07, "logits/chosen": -0.568697452545166, "logits/rejected": -0.569672703742981, "logps/chosen": -287.5411682128906, "logps/rejected": -576.031982421875, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.23261170089244843, "rewards/margins": 24.74388313293457, "rewards/rejected": -24.511272430419922, "step": 5600 }, { "epoch": 1.9, "eval_logits/chosen": -0.6718372702598572, "eval_logits/rejected": -0.7328038811683655, "eval_logps/chosen": -220.99679565429688, "eval_logps/rejected": -687.2796020507812, "eval_loss": 0.0026656328700482845, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.28409746289253235, "eval_rewards/margins": 26.957468032836914, "eval_rewards/rejected": -27.241565704345703, "eval_runtime": 537.4118, "eval_samples_per_second": 17.677, "eval_steps_per_second": 0.553, "step": 5600 }, { "epoch": 1.91, "learning_rate": 2.0244240211506986e-07, "logits/chosen": -0.5616599321365356, "logits/rejected": -0.6874145269393921, "logps/chosen": -191.07151794433594, "logps/rejected": -488.1980895996094, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -0.19599463045597076, "rewards/margins": 23.2009220123291, "rewards/rejected": -23.39691734313965, "step": 5610 }, { "epoch": 1.91, "learning_rate": 2.018129170338663e-07, "logits/chosen": -0.399535596370697, "logits/rejected": -0.68458092212677, "logps/chosen": -347.31890869140625, "logps/rejected": -610.0401611328125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.2523438334465027, "rewards/margins": 25.049060821533203, "rewards/rejected": -24.796716690063477, "step": 5620 }, { "epoch": 1.91, "learning_rate": 2.011834319526627e-07, "logits/chosen": -0.6280252933502197, "logits/rejected": -0.6226717233657837, "logps/chosen": -213.97665405273438, "logps/rejected": -617.8245239257812, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.08077476918697357, "rewards/margins": 23.92662239074707, "rewards/rejected": -23.84585189819336, "step": 5630 }, { "epoch": 1.92, "learning_rate": 2.0055394687145913e-07, "logits/chosen": -0.5722359418869019, "logits/rejected": -0.619611918926239, "logps/chosen": -230.1018829345703, "logps/rejected": -875.5279541015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.2532828450202942, "rewards/margins": 23.120716094970703, "rewards/rejected": -23.373998641967773, "step": 5640 }, { "epoch": 1.92, "learning_rate": 1.9992446179025558e-07, "logits/chosen": -0.6302633285522461, "logits/rejected": -0.5775930881500244, "logps/chosen": -173.723876953125, "logps/rejected": -689.3563232421875, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 0.08495856821537018, "rewards/margins": 21.160961151123047, "rewards/rejected": -21.07600212097168, "step": 5650 }, { "epoch": 1.92, "learning_rate": 1.99294976709052e-07, "logits/chosen": -0.45107150077819824, "logits/rejected": -0.6993687748908997, "logps/chosen": -177.66098022460938, "logps/rejected": -633.5307006835938, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.23196446895599365, "rewards/margins": 23.674489974975586, "rewards/rejected": -23.44252586364746, "step": 5660 }, { "epoch": 1.93, "learning_rate": 1.986654916278484e-07, "logits/chosen": -0.45468273758888245, "logits/rejected": -0.6589124798774719, "logps/chosen": -170.65170288085938, "logps/rejected": -611.7965698242188, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.027626842260360718, "rewards/margins": 19.21035385131836, "rewards/rejected": -19.182729721069336, "step": 5670 }, { "epoch": 1.93, "learning_rate": 1.9803600654664484e-07, "logits/chosen": -0.5859938859939575, "logits/rejected": -0.6326059103012085, "logps/chosen": -152.93905639648438, "logps/rejected": -884.7824096679688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.3700067102909088, "rewards/margins": 24.062143325805664, "rewards/rejected": -23.692138671875, "step": 5680 }, { "epoch": 1.93, "learning_rate": 1.9740652146544127e-07, "logits/chosen": -0.5430617332458496, "logits/rejected": -0.6132655739784241, "logps/chosen": -231.2560577392578, "logps/rejected": -659.8026123046875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.5374656915664673, "rewards/margins": 23.721202850341797, "rewards/rejected": -23.183734893798828, "step": 5690 }, { "epoch": 1.94, "learning_rate": 1.9677703638423766e-07, "logits/chosen": -0.5835504531860352, "logits/rejected": -0.6342555284500122, "logps/chosen": -160.8026885986328, "logps/rejected": -570.0697021484375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.10946623235940933, "rewards/margins": 24.44606590270996, "rewards/rejected": -24.555532455444336, "step": 5700 }, { "epoch": 1.94, "eval_logits/chosen": -0.6325913071632385, "eval_logits/rejected": -0.701815128326416, "eval_logps/chosen": -214.76190185546875, "eval_logps/rejected": -648.0685424804688, "eval_loss": 0.0030741621740162373, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.3393927216529846, "eval_rewards/margins": 23.659854888916016, "eval_rewards/rejected": -23.32046127319336, "eval_runtime": 537.0692, "eval_samples_per_second": 17.689, "eval_steps_per_second": 0.553, "step": 5700 }, { "epoch": 1.94, "learning_rate": 1.961475513030341e-07, "logits/chosen": -0.46250852942466736, "logits/rejected": -0.6201439499855042, "logps/chosen": -287.5912170410156, "logps/rejected": -691.2306518554688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.27676063776016235, "rewards/margins": 23.338558197021484, "rewards/rejected": -23.061796188354492, "step": 5710 }, { "epoch": 1.94, "learning_rate": 1.9551806622183054e-07, "logits/chosen": -0.5761824250221252, "logits/rejected": -0.6594797372817993, "logps/chosen": -159.11390686035156, "logps/rejected": -726.8004150390625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.44077903032302856, "rewards/margins": 25.894947052001953, "rewards/rejected": -25.454166412353516, "step": 5720 }, { "epoch": 1.95, "learning_rate": 1.9488858114062696e-07, "logits/chosen": -0.39834064245224, "logits/rejected": -0.6698485016822815, "logps/chosen": -178.869873046875, "logps/rejected": -532.64306640625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.5001816153526306, "rewards/margins": 24.53247833251953, "rewards/rejected": -24.03229522705078, "step": 5730 }, { "epoch": 1.95, "learning_rate": 1.9425909605942338e-07, "logits/chosen": -0.63575679063797, "logits/rejected": -0.7074697613716125, "logps/chosen": -351.12158203125, "logps/rejected": -635.7095947265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.0518425814807415, "rewards/margins": 20.849742889404297, "rewards/rejected": -20.79789924621582, "step": 5740 }, { "epoch": 1.95, "learning_rate": 1.936296109782198e-07, "logits/chosen": -0.4867176115512848, "logits/rejected": -0.7049714922904968, "logps/chosen": -216.97097778320312, "logps/rejected": -900.5638427734375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.19303762912750244, "rewards/margins": 27.606531143188477, "rewards/rejected": -27.41349220275879, "step": 5750 }, { "epoch": 1.96, "learning_rate": 1.9300012589701623e-07, "logits/chosen": -0.514417290687561, "logits/rejected": -0.587101399898529, "logps/chosen": -242.0309600830078, "logps/rejected": -775.11572265625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.59832364320755, "rewards/margins": 23.022903442382812, "rewards/rejected": -22.424579620361328, "step": 5760 }, { "epoch": 1.96, "learning_rate": 1.9237064081581268e-07, "logits/chosen": -0.5858964323997498, "logits/rejected": -0.7668770551681519, "logps/chosen": -178.90447998046875, "logps/rejected": -747.6087646484375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 0.26512300968170166, "rewards/margins": 26.923290252685547, "rewards/rejected": -26.658166885375977, "step": 5770 }, { "epoch": 1.96, "learning_rate": 1.9174115573460907e-07, "logits/chosen": -0.6105628609657288, "logits/rejected": -0.6622925996780396, "logps/chosen": -225.90737915039062, "logps/rejected": -724.9630737304688, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.005721080116927624, "rewards/margins": 26.04929542541504, "rewards/rejected": -26.05501365661621, "step": 5780 }, { "epoch": 1.97, "learning_rate": 1.911116706534055e-07, "logits/chosen": -0.3864423930644989, "logits/rejected": -0.6286384463310242, "logps/chosen": -308.58453369140625, "logps/rejected": -630.1354370117188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.024869751185178757, "rewards/margins": 21.885698318481445, "rewards/rejected": -21.910568237304688, "step": 5790 }, { "epoch": 1.97, "learning_rate": 1.9048218557220194e-07, "logits/chosen": -0.5544167757034302, "logits/rejected": -0.6372202634811401, "logps/chosen": -225.78909301757812, "logps/rejected": -514.2470703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.3964363634586334, "rewards/margins": 23.90316390991211, "rewards/rejected": -23.506725311279297, "step": 5800 }, { "epoch": 1.97, "eval_logits/chosen": -0.6247499585151672, "eval_logits/rejected": -0.6865373849868774, "eval_logps/chosen": -214.70004272460938, "eval_logps/rejected": -651.2528076171875, "eval_loss": 0.0021597386803478003, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.3455772399902344, "eval_rewards/margins": 23.98446273803711, "eval_rewards/rejected": -23.638887405395508, "eval_runtime": 537.7464, "eval_samples_per_second": 17.666, "eval_steps_per_second": 0.552, "step": 5800 }, { "epoch": 1.97, "learning_rate": 1.8985270049099837e-07, "logits/chosen": -0.4840589463710785, "logits/rejected": -0.5849170684814453, "logps/chosen": -178.59461975097656, "logps/rejected": -750.7891235351562, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.5190991163253784, "rewards/margins": 26.71392822265625, "rewards/rejected": -26.1948299407959, "step": 5810 }, { "epoch": 1.98, "learning_rate": 1.8922321540979476e-07, "logits/chosen": -0.48175400495529175, "logits/rejected": -0.6420316696166992, "logps/chosen": -298.7792053222656, "logps/rejected": -529.0895385742188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2196749746799469, "rewards/margins": 23.958251953125, "rewards/rejected": -24.17792510986328, "step": 5820 }, { "epoch": 1.98, "learning_rate": 1.885937303285912e-07, "logits/chosen": -0.6839223504066467, "logits/rejected": -0.6975377202033997, "logps/chosen": -151.0841064453125, "logps/rejected": -749.2647094726562, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.005560582969337702, "rewards/margins": 28.835119247436523, "rewards/rejected": -28.82956314086914, "step": 5830 }, { "epoch": 1.99, "learning_rate": 1.8796424524738764e-07, "logits/chosen": -0.6825852990150452, "logits/rejected": -0.6945943832397461, "logps/chosen": -217.22073364257812, "logps/rejected": -588.7703857421875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.15094093978405, "rewards/margins": 23.12962532043457, "rewards/rejected": -22.978687286376953, "step": 5840 }, { "epoch": 1.99, "learning_rate": 1.8733476016618406e-07, "logits/chosen": -0.5669941902160645, "logits/rejected": -0.6742810010910034, "logps/chosen": -215.43203735351562, "logps/rejected": -664.5511474609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.5456125736236572, "rewards/margins": 24.339807510375977, "rewards/rejected": -23.794193267822266, "step": 5850 }, { "epoch": 1.99, "learning_rate": 1.8670527508498048e-07, "logits/chosen": -0.6340761184692383, "logits/rejected": -0.7128755450248718, "logps/chosen": -164.09756469726562, "logps/rejected": -848.6427001953125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.08524082601070404, "rewards/margins": 25.800220489501953, "rewards/rejected": -25.885456085205078, "step": 5860 }, { "epoch": 2.0, "learning_rate": 1.860757900037769e-07, "logits/chosen": -0.7065409421920776, "logits/rejected": -0.6064544916152954, "logps/chosen": -152.60903930664062, "logps/rejected": -565.4688720703125, "loss": 0.0066, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.26471951603889465, "rewards/margins": 24.97665023803711, "rewards/rejected": -24.711933135986328, "step": 5870 }, { "epoch": 2.0, "learning_rate": 1.8544630492257333e-07, "logits/chosen": -0.582148551940918, "logits/rejected": -0.6608718633651733, "logps/chosen": -162.1506805419922, "logps/rejected": -692.4993896484375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.24029603600502014, "rewards/margins": 24.260900497436523, "rewards/rejected": -24.020605087280273, "step": 5880 }, { "epoch": 2.0, "learning_rate": 1.8481681984136978e-07, "logits/chosen": -0.5281001329421997, "logits/rejected": -0.6759302020072937, "logps/chosen": -174.66336059570312, "logps/rejected": -503.11102294921875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.17666848003864288, "rewards/margins": 20.63352394104004, "rewards/rejected": -20.810190200805664, "step": 5890 }, { "epoch": 2.01, "learning_rate": 1.8418733476016617e-07, "logits/chosen": -0.4554520547389984, "logits/rejected": -0.6409457325935364, "logps/chosen": -227.8982696533203, "logps/rejected": -565.0753784179688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.13611763715744019, "rewards/margins": 19.66375160217285, "rewards/rejected": -19.799869537353516, "step": 5900 }, { "epoch": 2.01, "eval_logits/chosen": -0.6544415950775146, "eval_logits/rejected": -0.7178645730018616, "eval_logps/chosen": -218.018798828125, "eval_logps/rejected": -666.2399291992188, "eval_loss": 0.0022169328294694424, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.013702023774385452, "eval_rewards/margins": 25.151302337646484, "eval_rewards/rejected": -25.13759994506836, "eval_runtime": 536.4648, "eval_samples_per_second": 17.709, "eval_steps_per_second": 0.554, "step": 5900 }, { "epoch": 2.01, "learning_rate": 1.835578496789626e-07, "logits/chosen": -0.5437734723091125, "logits/rejected": -0.675073504447937, "logps/chosen": -228.841552734375, "logps/rejected": -680.943359375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.37268131971359253, "rewards/margins": 22.960582733154297, "rewards/rejected": -22.587902069091797, "step": 5910 }, { "epoch": 2.01, "learning_rate": 1.8292836459775904e-07, "logits/chosen": -0.4654270112514496, "logits/rejected": -0.6589362025260925, "logps/chosen": -284.68975830078125, "logps/rejected": -813.96826171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.2695631980895996, "rewards/margins": 26.295618057250977, "rewards/rejected": -26.026050567626953, "step": 5920 }, { "epoch": 2.02, "learning_rate": 1.8229887951655544e-07, "logits/chosen": -0.5206368565559387, "logits/rejected": -0.667141854763031, "logps/chosen": -163.5179901123047, "logps/rejected": -644.8349609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.057512857019901276, "rewards/margins": 23.254634857177734, "rewards/rejected": -23.197120666503906, "step": 5930 }, { "epoch": 2.02, "learning_rate": 1.8166939443535186e-07, "logits/chosen": -0.541659414768219, "logits/rejected": -0.7091498374938965, "logps/chosen": -230.36392211914062, "logps/rejected": -470.43780517578125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.2950902581214905, "rewards/margins": 22.33197021484375, "rewards/rejected": -22.036880493164062, "step": 5940 }, { "epoch": 2.02, "learning_rate": 1.8103990935414829e-07, "logits/chosen": -0.6107196807861328, "logits/rejected": -0.6504772901535034, "logps/chosen": -164.32125854492188, "logps/rejected": -653.484375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.1681213229894638, "rewards/margins": 22.135379791259766, "rewards/rejected": -21.96725845336914, "step": 5950 }, { "epoch": 2.03, "learning_rate": 1.8041042427294474e-07, "logits/chosen": -0.4865199625492096, "logits/rejected": -0.7305706739425659, "logps/chosen": -230.762451171875, "logps/rejected": -453.242919921875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.11261632293462753, "rewards/margins": 26.062023162841797, "rewards/rejected": -26.174640655517578, "step": 5960 }, { "epoch": 2.03, "learning_rate": 1.7978093919174113e-07, "logits/chosen": -0.5239337682723999, "logits/rejected": -0.658258855342865, "logps/chosen": -234.88662719726562, "logps/rejected": -789.7289428710938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.43873825669288635, "rewards/margins": 25.40827751159668, "rewards/rejected": -25.84701919555664, "step": 5970 }, { "epoch": 2.03, "learning_rate": 1.7915145411053755e-07, "logits/chosen": -0.5251814126968384, "logits/rejected": -0.5946325063705444, "logps/chosen": -233.9482421875, "logps/rejected": -705.9213256835938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.021874550729990005, "rewards/margins": 23.363460540771484, "rewards/rejected": -23.38533592224121, "step": 5980 }, { "epoch": 2.04, "learning_rate": 1.78521969029334e-07, "logits/chosen": -0.46983757615089417, "logits/rejected": -0.6961642503738403, "logps/chosen": -229.78262329101562, "logps/rejected": -803.6685180664062, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.1049746721982956, "rewards/margins": 25.435291290283203, "rewards/rejected": -25.330318450927734, "step": 5990 }, { "epoch": 2.04, "learning_rate": 1.7789248394813043e-07, "logits/chosen": -0.5070152282714844, "logits/rejected": -0.749220609664917, "logps/chosen": -266.07366943359375, "logps/rejected": -549.6804809570312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.03183360770344734, "rewards/margins": 25.298259735107422, "rewards/rejected": -25.330089569091797, "step": 6000 }, { "epoch": 2.04, "eval_logits/chosen": -0.6558921933174133, "eval_logits/rejected": -0.7174502015113831, "eval_logps/chosen": -218.42868041992188, "eval_logps/rejected": -670.763427734375, "eval_loss": 0.0022152746096253395, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.02728571742773056, "eval_rewards/margins": 25.562652587890625, "eval_rewards/rejected": -25.58993911743164, "eval_runtime": 537.9808, "eval_samples_per_second": 17.659, "eval_steps_per_second": 0.552, "step": 6000 }, { "epoch": 2.04, "learning_rate": 1.7726299886692682e-07, "logits/chosen": -0.6135789752006531, "logits/rejected": -0.5704982876777649, "logps/chosen": -183.76214599609375, "logps/rejected": -583.9474487304688, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.20374616980552673, "rewards/margins": 22.394207000732422, "rewards/rejected": -22.59795570373535, "step": 6010 }, { "epoch": 2.05, "learning_rate": 1.7663351378572327e-07, "logits/chosen": -0.5839337110519409, "logits/rejected": -0.6292470693588257, "logps/chosen": -158.43331909179688, "logps/rejected": -842.7540283203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.23143868148326874, "rewards/margins": 25.1267147064209, "rewards/rejected": -25.358154296875, "step": 6020 }, { "epoch": 2.05, "learning_rate": 1.760040287045197e-07, "logits/chosen": -0.41864579916000366, "logits/rejected": -0.6075744032859802, "logps/chosen": -365.35247802734375, "logps/rejected": -609.415771484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.38590484857559204, "rewards/margins": 23.925281524658203, "rewards/rejected": -23.539377212524414, "step": 6030 }, { "epoch": 2.05, "learning_rate": 1.7537454362331612e-07, "logits/chosen": -0.4156855642795563, "logits/rejected": -0.6513810753822327, "logps/chosen": -229.06588745117188, "logps/rejected": -586.21728515625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.36991676688194275, "rewards/margins": 22.32361602783203, "rewards/rejected": -22.69352912902832, "step": 6040 }, { "epoch": 2.06, "learning_rate": 1.7474505854211254e-07, "logits/chosen": -0.42269977927207947, "logits/rejected": -0.7159660458564758, "logps/chosen": -347.6496276855469, "logps/rejected": -639.0559692382812, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.10356786102056503, "rewards/margins": 25.23113441467285, "rewards/rejected": -25.127567291259766, "step": 6050 }, { "epoch": 2.06, "learning_rate": 1.7411557346090896e-07, "logits/chosen": -0.412602961063385, "logits/rejected": -0.6124510765075684, "logps/chosen": -221.63320922851562, "logps/rejected": -587.4705810546875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.4315613806247711, "rewards/margins": 25.712718963623047, "rewards/rejected": -26.1442813873291, "step": 6060 }, { "epoch": 2.06, "learning_rate": 1.7348608837970539e-07, "logits/chosen": -0.5814113616943359, "logits/rejected": -0.6730402708053589, "logps/chosen": -210.236083984375, "logps/rejected": -685.8887939453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.013295704498887062, "rewards/margins": 24.535747528076172, "rewards/rejected": -24.549041748046875, "step": 6070 }, { "epoch": 2.07, "learning_rate": 1.7285660329850184e-07, "logits/chosen": -0.4980081617832184, "logits/rejected": -0.6739650964736938, "logps/chosen": -172.01962280273438, "logps/rejected": -667.1629028320312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.27424365282058716, "rewards/margins": 27.14284896850586, "rewards/rejected": -26.868606567382812, "step": 6080 }, { "epoch": 2.07, "learning_rate": 1.7222711821729823e-07, "logits/chosen": -0.49969738721847534, "logits/rejected": -0.6377890110015869, "logps/chosen": -153.3025665283203, "logps/rejected": -766.4616088867188, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.31461629271507263, "rewards/margins": 24.342601776123047, "rewards/rejected": -24.02798843383789, "step": 6090 }, { "epoch": 2.07, "learning_rate": 1.7159763313609465e-07, "logits/chosen": -0.48590078949928284, "logits/rejected": -0.6790332198143005, "logps/chosen": -234.4243927001953, "logps/rejected": -613.39892578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.03319666534662247, "rewards/margins": 26.913135528564453, "rewards/rejected": -26.87993812561035, "step": 6100 }, { "epoch": 2.07, "eval_logits/chosen": -0.6425073146820068, "eval_logits/rejected": -0.7034628987312317, "eval_logps/chosen": -218.66212463378906, "eval_logps/rejected": -677.8860473632812, "eval_loss": 0.0020853474270552397, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.050630684942007065, "eval_rewards/margins": 26.251577377319336, "eval_rewards/rejected": -26.302207946777344, "eval_runtime": 538.2863, "eval_samples_per_second": 17.649, "eval_steps_per_second": 0.552, "step": 6100 }, { "epoch": 2.08, "learning_rate": 1.709681480548911e-07, "logits/chosen": -0.42925944924354553, "logits/rejected": -0.7153592109680176, "logps/chosen": -160.71432495117188, "logps/rejected": -560.1774291992188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.19283409416675568, "rewards/margins": 25.37759017944336, "rewards/rejected": -25.57042694091797, "step": 6110 }, { "epoch": 2.08, "learning_rate": 1.7033866297368753e-07, "logits/chosen": -0.5378289818763733, "logits/rejected": -0.7092335820198059, "logps/chosen": -247.27053833007812, "logps/rejected": -717.9095458984375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.3249068260192871, "rewards/margins": 26.914926528930664, "rewards/rejected": -26.590017318725586, "step": 6120 }, { "epoch": 2.08, "learning_rate": 1.6970917789248392e-07, "logits/chosen": -0.5842850208282471, "logits/rejected": -0.7018479108810425, "logps/chosen": -206.2807159423828, "logps/rejected": -846.86962890625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.9223998188972473, "rewards/margins": 28.809829711914062, "rewards/rejected": -29.732229232788086, "step": 6130 }, { "epoch": 2.09, "learning_rate": 1.6907969281128037e-07, "logits/chosen": -0.4996110796928406, "logits/rejected": -0.6795738935470581, "logps/chosen": -294.62078857421875, "logps/rejected": -617.7664794921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.7143559455871582, "rewards/margins": 23.938108444213867, "rewards/rejected": -24.652463912963867, "step": 6140 }, { "epoch": 2.09, "learning_rate": 1.684502077300768e-07, "logits/chosen": -0.5792285799980164, "logits/rejected": -0.7532473206520081, "logps/chosen": -249.13198852539062, "logps/rejected": -646.7651977539062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.044773586094379425, "rewards/margins": 26.441192626953125, "rewards/rejected": -26.396419525146484, "step": 6150 }, { "epoch": 2.09, "learning_rate": 1.678207226488732e-07, "logits/chosen": -0.5775994062423706, "logits/rejected": -0.5771958827972412, "logps/chosen": -219.73291015625, "logps/rejected": -638.4561767578125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.008159863762557507, "rewards/margins": 28.191247940063477, "rewards/rejected": -28.19940757751465, "step": 6160 }, { "epoch": 2.1, "learning_rate": 1.6719123756766964e-07, "logits/chosen": -0.5162080526351929, "logits/rejected": -0.701938807964325, "logps/chosen": -241.8827667236328, "logps/rejected": -811.807861328125, "loss": 0.0014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.128142848610878, "rewards/margins": 29.095745086669922, "rewards/rejected": -28.967601776123047, "step": 6170 }, { "epoch": 2.1, "learning_rate": 1.6656175248646606e-07, "logits/chosen": -0.5336301326751709, "logits/rejected": -0.7696987390518188, "logps/chosen": -189.8751678466797, "logps/rejected": -659.97998046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.4306076467037201, "rewards/margins": 27.278335571289062, "rewards/rejected": -27.70894432067871, "step": 6180 }, { "epoch": 2.1, "learning_rate": 1.6593226740526249e-07, "logits/chosen": -0.6475902795791626, "logits/rejected": -0.6500539779663086, "logps/chosen": -149.91409301757812, "logps/rejected": -575.5028076171875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -0.07277966290712357, "rewards/margins": 29.569488525390625, "rewards/rejected": -29.64226722717285, "step": 6190 }, { "epoch": 2.11, "learning_rate": 1.653027823240589e-07, "logits/chosen": -0.6268297433853149, "logits/rejected": -0.5650442838668823, "logps/chosen": -172.05935668945312, "logps/rejected": -721.0572509765625, "loss": 0.0033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7015683650970459, "rewards/margins": 25.26753807067871, "rewards/rejected": -25.969106674194336, "step": 6200 }, { "epoch": 2.11, "eval_logits/chosen": -0.6406495571136475, "eval_logits/rejected": -0.6936402320861816, "eval_logps/chosen": -220.1328582763672, "eval_logps/rejected": -686.0946655273438, "eval_loss": 0.0019614899065345526, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.19770237803459167, "eval_rewards/margins": 26.925371170043945, "eval_rewards/rejected": -27.123069763183594, "eval_runtime": 537.5498, "eval_samples_per_second": 17.673, "eval_steps_per_second": 0.553, "step": 6200 }, { "epoch": 2.11, "learning_rate": 1.6467329724285533e-07, "logits/chosen": -0.6141053438186646, "logits/rejected": -0.6189634203910828, "logps/chosen": -174.28347778320312, "logps/rejected": -768.725341796875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.6213392019271851, "rewards/margins": 23.270679473876953, "rewards/rejected": -23.892017364501953, "step": 6210 }, { "epoch": 2.11, "learning_rate": 1.6404381216165175e-07, "logits/chosen": -0.4981359541416168, "logits/rejected": -0.5913408994674683, "logps/chosen": -164.35800170898438, "logps/rejected": -604.9082641601562, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.1555635780096054, "rewards/margins": 25.30427360534668, "rewards/rejected": -25.14870834350586, "step": 6220 }, { "epoch": 2.12, "learning_rate": 1.634143270804482e-07, "logits/chosen": -0.7545971274375916, "logits/rejected": -0.6080290079116821, "logps/chosen": -191.61959838867188, "logps/rejected": -623.01025390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4249326288700104, "rewards/margins": 25.230976104736328, "rewards/rejected": -25.65591049194336, "step": 6230 }, { "epoch": 2.12, "learning_rate": 1.627848419992446e-07, "logits/chosen": -0.3692498803138733, "logits/rejected": -0.7293838858604431, "logps/chosen": -282.7040100097656, "logps/rejected": -861.2039794921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.25410783290863037, "rewards/margins": 29.177047729492188, "rewards/rejected": -29.4311580657959, "step": 6240 }, { "epoch": 2.12, "learning_rate": 1.6215535691804102e-07, "logits/chosen": -0.3574233651161194, "logits/rejected": -0.5809080600738525, "logps/chosen": -216.2305145263672, "logps/rejected": -588.7322998046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.16224080324172974, "rewards/margins": 25.323400497436523, "rewards/rejected": -25.16115951538086, "step": 6250 }, { "epoch": 2.13, "learning_rate": 1.6152587183683747e-07, "logits/chosen": -0.6261405348777771, "logits/rejected": -0.7270271182060242, "logps/chosen": -166.69358825683594, "logps/rejected": -775.8590087890625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.5715898275375366, "rewards/margins": 27.227060317993164, "rewards/rejected": -27.79865074157715, "step": 6260 }, { "epoch": 2.13, "learning_rate": 1.608963867556339e-07, "logits/chosen": -0.6149144172668457, "logits/rejected": -0.6056113839149475, "logps/chosen": -169.19354248046875, "logps/rejected": -592.8151245117188, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.18188437819480896, "rewards/margins": 25.96160316467285, "rewards/rejected": -26.14348793029785, "step": 6270 }, { "epoch": 2.13, "learning_rate": 1.602669016744303e-07, "logits/chosen": -0.46818703413009644, "logits/rejected": -0.6658297181129456, "logps/chosen": -296.52691650390625, "logps/rejected": -943.2103271484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.12341107428073883, "rewards/margins": 26.478466033935547, "rewards/rejected": -26.355056762695312, "step": 6280 }, { "epoch": 2.14, "learning_rate": 1.5963741659322674e-07, "logits/chosen": -0.4485914707183838, "logits/rejected": -0.6884121298789978, "logps/chosen": -177.71981811523438, "logps/rejected": -737.725830078125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.49014657735824585, "rewards/margins": 26.073177337646484, "rewards/rejected": -25.583032608032227, "step": 6290 }, { "epoch": 2.14, "learning_rate": 1.5900793151202316e-07, "logits/chosen": -0.5381507873535156, "logits/rejected": -0.7172382473945618, "logps/chosen": -162.24301147460938, "logps/rejected": -690.0606689453125, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4562622010707855, "rewards/margins": 22.274988174438477, "rewards/rejected": -22.731250762939453, "step": 6300 }, { "epoch": 2.14, "eval_logits/chosen": -0.6297647356987, "eval_logits/rejected": -0.688800573348999, "eval_logps/chosen": -216.320068359375, "eval_logps/rejected": -667.3306274414062, "eval_loss": 0.0018095956183969975, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.18357443809509277, "eval_rewards/margins": 25.43025016784668, "eval_rewards/rejected": -25.246673583984375, "eval_runtime": 536.7549, "eval_samples_per_second": 17.699, "eval_steps_per_second": 0.553, "step": 6300 }, { "epoch": 2.14, "learning_rate": 1.5837844643081959e-07, "logits/chosen": -0.623970091342926, "logits/rejected": -0.6920520067214966, "logps/chosen": -143.3546142578125, "logps/rejected": -833.2921142578125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.2798297107219696, "rewards/margins": 26.251522064208984, "rewards/rejected": -26.53135108947754, "step": 6310 }, { "epoch": 2.15, "learning_rate": 1.57748961349616e-07, "logits/chosen": -0.6357483863830566, "logits/rejected": -0.6701850295066833, "logps/chosen": -220.94265747070312, "logps/rejected": -877.6910400390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.26667293906211853, "rewards/margins": 26.8902530670166, "rewards/rejected": -27.156925201416016, "step": 6320 }, { "epoch": 2.15, "learning_rate": 1.5711947626841243e-07, "logits/chosen": -0.41738444566726685, "logits/rejected": -0.6579784154891968, "logps/chosen": -328.72393798828125, "logps/rejected": -502.26141357421875, "loss": 0.0029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2040051519870758, "rewards/margins": 22.149948120117188, "rewards/rejected": -21.94594383239746, "step": 6330 }, { "epoch": 2.15, "learning_rate": 1.5648999118720885e-07, "logits/chosen": -0.42331504821777344, "logits/rejected": -0.7107141613960266, "logps/chosen": -235.77005004882812, "logps/rejected": -583.9476318359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.44223660230636597, "rewards/margins": 27.78466796875, "rewards/rejected": -27.342432022094727, "step": 6340 }, { "epoch": 2.16, "learning_rate": 1.558605061060053e-07, "logits/chosen": -0.5082443952560425, "logits/rejected": -0.6561731696128845, "logps/chosen": -172.74282836914062, "logps/rejected": -743.0877685546875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.20282702147960663, "rewards/margins": 30.887939453125, "rewards/rejected": -31.090768814086914, "step": 6350 }, { "epoch": 2.16, "learning_rate": 1.552310210248017e-07, "logits/chosen": -0.5118960738182068, "logits/rejected": -0.6457266211509705, "logps/chosen": -280.28741455078125, "logps/rejected": -834.8934326171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.12498612701892853, "rewards/margins": 27.234729766845703, "rewards/rejected": -27.35972023010254, "step": 6360 }, { "epoch": 2.17, "learning_rate": 1.5460153594359812e-07, "logits/chosen": -0.455077588558197, "logits/rejected": -0.649883508682251, "logps/chosen": -291.8680419921875, "logps/rejected": -758.7725219726562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.28148797154426575, "rewards/margins": 30.784814834594727, "rewards/rejected": -31.066299438476562, "step": 6370 }, { "epoch": 2.17, "learning_rate": 1.5397205086239457e-07, "logits/chosen": -0.49562233686447144, "logits/rejected": -0.5913329124450684, "logps/chosen": -224.6325225830078, "logps/rejected": -764.8369140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.2026785910129547, "rewards/margins": 26.89885902404785, "rewards/rejected": -26.696182250976562, "step": 6380 }, { "epoch": 2.17, "learning_rate": 1.5334256578119097e-07, "logits/chosen": -0.5282562971115112, "logits/rejected": -0.659848153591156, "logps/chosen": -217.62686157226562, "logps/rejected": -681.7618408203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.43532896041870117, "rewards/margins": 26.181346893310547, "rewards/rejected": -25.746021270751953, "step": 6390 }, { "epoch": 2.18, "learning_rate": 1.527130806999874e-07, "logits/chosen": -0.531845211982727, "logits/rejected": -0.6718863844871521, "logps/chosen": -292.8276672363281, "logps/rejected": -546.5496215820312, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.030241012573242188, "rewards/margins": 25.15178680419922, "rewards/rejected": -25.182025909423828, "step": 6400 }, { "epoch": 2.18, "eval_logits/chosen": -0.6493544578552246, "eval_logits/rejected": -0.7074719071388245, "eval_logps/chosen": -218.6021728515625, "eval_logps/rejected": -678.427001953125, "eval_loss": 0.0018496609991416335, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.04463376849889755, "eval_rewards/margins": 26.31167221069336, "eval_rewards/rejected": -26.356307983398438, "eval_runtime": 537.7493, "eval_samples_per_second": 17.666, "eval_steps_per_second": 0.552, "step": 6400 }, { "epoch": 2.18, "learning_rate": 1.5208359561878384e-07, "logits/chosen": -0.46855926513671875, "logits/rejected": -0.6641895174980164, "logps/chosen": -228.3707275390625, "logps/rejected": -612.8460083007812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.4865381121635437, "rewards/margins": 23.797468185424805, "rewards/rejected": -24.284006118774414, "step": 6410 }, { "epoch": 2.18, "learning_rate": 1.5145411053758026e-07, "logits/chosen": -0.41383475065231323, "logits/rejected": -0.7308405637741089, "logps/chosen": -221.3525848388672, "logps/rejected": -648.8824462890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.21205134689807892, "rewards/margins": 28.086254119873047, "rewards/rejected": -27.87420654296875, "step": 6420 }, { "epoch": 2.19, "learning_rate": 1.5082462545637666e-07, "logits/chosen": -0.7698472738265991, "logits/rejected": -0.686585545539856, "logps/chosen": -165.5222625732422, "logps/rejected": -585.4749755859375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4213569760322571, "rewards/margins": 30.937572479248047, "rewards/rejected": -31.35892677307129, "step": 6430 }, { "epoch": 2.19, "learning_rate": 1.501951403751731e-07, "logits/chosen": -0.5782292485237122, "logits/rejected": -0.6655310988426208, "logps/chosen": -169.94090270996094, "logps/rejected": -709.5994873046875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.0349823459982872, "rewards/margins": 28.64129066467285, "rewards/rejected": -28.676273345947266, "step": 6440 }, { "epoch": 2.19, "learning_rate": 1.4956565529396953e-07, "logits/chosen": -0.4286069869995117, "logits/rejected": -0.6850601434707642, "logps/chosen": -182.2263946533203, "logps/rejected": -665.3687744140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.13414470851421356, "rewards/margins": 29.12774658203125, "rewards/rejected": -29.261890411376953, "step": 6450 }, { "epoch": 2.2, "learning_rate": 1.4893617021276595e-07, "logits/chosen": -0.37892818450927734, "logits/rejected": -0.6622062921524048, "logps/chosen": -178.07223510742188, "logps/rejected": -754.40869140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.0123823881149292, "rewards/margins": 28.93340492248535, "rewards/rejected": -28.945789337158203, "step": 6460 }, { "epoch": 2.2, "learning_rate": 1.4830668513156238e-07, "logits/chosen": -0.6834964752197266, "logits/rejected": -0.6945078372955322, "logps/chosen": -159.0599822998047, "logps/rejected": -511.8021545410156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.056690432131290436, "rewards/margins": 30.34902000427246, "rewards/rejected": -30.405710220336914, "step": 6470 }, { "epoch": 2.2, "learning_rate": 1.476772000503588e-07, "logits/chosen": -0.5392309427261353, "logits/rejected": -0.6344277262687683, "logps/chosen": -157.17813110351562, "logps/rejected": -633.4981689453125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.22114773094654083, "rewards/margins": 29.1480655670166, "rewards/rejected": -29.369211196899414, "step": 6480 }, { "epoch": 2.21, "learning_rate": 1.4704771496915522e-07, "logits/chosen": -0.5029164552688599, "logits/rejected": -0.6213968992233276, "logps/chosen": -167.06344604492188, "logps/rejected": -609.4381713867188, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.44678792357444763, "rewards/margins": 28.66744041442871, "rewards/rejected": -29.11423110961914, "step": 6490 }, { "epoch": 2.21, "learning_rate": 1.4641822988795167e-07, "logits/chosen": -0.5306546092033386, "logits/rejected": -0.6301943063735962, "logps/chosen": -163.7183380126953, "logps/rejected": -664.6639404296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.006963974330574274, "rewards/margins": 23.55824851989746, "rewards/rejected": -23.56521224975586, "step": 6500 }, { "epoch": 2.21, "eval_logits/chosen": -0.6418337225914001, "eval_logits/rejected": -0.7006853818893433, "eval_logps/chosen": -219.1754913330078, "eval_logps/rejected": -685.2559814453125, "eval_loss": 0.0017885882407426834, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10196730494499207, "eval_rewards/margins": 26.937244415283203, "eval_rewards/rejected": -27.039207458496094, "eval_runtime": 537.6461, "eval_samples_per_second": 17.67, "eval_steps_per_second": 0.552, "step": 6500 }, { "epoch": 2.21, "learning_rate": 1.4578874480674807e-07, "logits/chosen": -0.5987303256988525, "logits/rejected": -0.6244300007820129, "logps/chosen": -263.18731689453125, "logps/rejected": -637.4671630859375, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2560492157936096, "rewards/margins": 27.76957130432129, "rewards/rejected": -28.025623321533203, "step": 6510 }, { "epoch": 2.22, "learning_rate": 1.451592597255445e-07, "logits/chosen": -0.4933759272098541, "logits/rejected": -0.6844289898872375, "logps/chosen": -280.93243408203125, "logps/rejected": -632.19873046875, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.1691657304763794, "rewards/margins": 26.130056381225586, "rewards/rejected": -26.29922103881836, "step": 6520 }, { "epoch": 2.22, "learning_rate": 1.4452977464434094e-07, "logits/chosen": -0.5048766136169434, "logits/rejected": -0.587834358215332, "logps/chosen": -209.7151641845703, "logps/rejected": -675.181396484375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.0880427211523056, "rewards/margins": 27.112197875976562, "rewards/rejected": -27.200241088867188, "step": 6530 }, { "epoch": 2.22, "learning_rate": 1.4390028956313736e-07, "logits/chosen": -0.6285936236381531, "logits/rejected": -0.6973856687545776, "logps/chosen": -166.12936401367188, "logps/rejected": -728.7384033203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.34099069237709045, "rewards/margins": 30.86602210998535, "rewards/rejected": -31.207012176513672, "step": 6540 }, { "epoch": 2.23, "learning_rate": 1.4327080448193376e-07, "logits/chosen": -0.5246552228927612, "logits/rejected": -0.6934639811515808, "logps/chosen": -342.62164306640625, "logps/rejected": -617.513671875, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6880171895027161, "rewards/margins": 27.57820701599121, "rewards/rejected": -28.266223907470703, "step": 6550 }, { "epoch": 2.23, "learning_rate": 1.426413194007302e-07, "logits/chosen": -0.5776757597923279, "logits/rejected": -0.711004376411438, "logps/chosen": -174.38572692871094, "logps/rejected": -654.0757446289062, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.15177665650844574, "rewards/margins": 26.984264373779297, "rewards/rejected": -27.13604164123535, "step": 6560 }, { "epoch": 2.23, "learning_rate": 1.4201183431952663e-07, "logits/chosen": -0.6357619166374207, "logits/rejected": -0.6987857222557068, "logps/chosen": -221.19583129882812, "logps/rejected": -869.0798950195312, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.30972060561180115, "rewards/margins": 26.325077056884766, "rewards/rejected": -26.634796142578125, "step": 6570 }, { "epoch": 2.24, "learning_rate": 1.4138234923832303e-07, "logits/chosen": -0.5622653961181641, "logits/rejected": -0.6996665596961975, "logps/chosen": -211.46823120117188, "logps/rejected": -661.4744873046875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.02768528088927269, "rewards/margins": 24.20840835571289, "rewards/rejected": -24.236093521118164, "step": 6580 }, { "epoch": 2.24, "learning_rate": 1.4075286415711948e-07, "logits/chosen": -0.5883240103721619, "logits/rejected": -0.7573795318603516, "logps/chosen": -233.20077514648438, "logps/rejected": -609.8150634765625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.31580036878585815, "rewards/margins": 26.86346435546875, "rewards/rejected": -27.179264068603516, "step": 6590 }, { "epoch": 2.24, "learning_rate": 1.401233790759159e-07, "logits/chosen": -0.5779234170913696, "logits/rejected": -0.5682854056358337, "logps/chosen": -270.2347717285156, "logps/rejected": -630.7273559570312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.12839487195014954, "rewards/margins": 22.732200622558594, "rewards/rejected": -22.603805541992188, "step": 6600 }, { "epoch": 2.24, "eval_logits/chosen": -0.640097975730896, "eval_logits/rejected": -0.7075762748718262, "eval_logps/chosen": -218.5896759033203, "eval_logps/rejected": -676.3707275390625, "eval_loss": 0.0017034454504027963, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.04338453710079193, "eval_rewards/margins": 26.10729217529297, "eval_rewards/rejected": -26.150678634643555, "eval_runtime": 538.053, "eval_samples_per_second": 17.656, "eval_steps_per_second": 0.552, "step": 6600 }, { "epoch": 2.25, "learning_rate": 1.3949389399471232e-07, "logits/chosen": -0.689568817615509, "logits/rejected": -0.711378812789917, "logps/chosen": -155.10543823242188, "logps/rejected": -708.8919677734375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3210011124610901, "rewards/margins": 29.23980140686035, "rewards/rejected": -29.560802459716797, "step": 6610 }, { "epoch": 2.25, "learning_rate": 1.3886440891350874e-07, "logits/chosen": -0.45669612288475037, "logits/rejected": -0.6613640785217285, "logps/chosen": -243.42160034179688, "logps/rejected": -572.498291015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.06045308709144592, "rewards/margins": 28.879322052001953, "rewards/rejected": -28.818866729736328, "step": 6620 }, { "epoch": 2.25, "learning_rate": 1.3823492383230517e-07, "logits/chosen": -0.6113361120223999, "logits/rejected": -0.7199699282646179, "logps/chosen": -172.14727783203125, "logps/rejected": -754.6735229492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.1902303397655487, "rewards/margins": 27.478836059570312, "rewards/rejected": -27.288604736328125, "step": 6630 }, { "epoch": 2.26, "learning_rate": 1.376054387511016e-07, "logits/chosen": -0.5342288613319397, "logits/rejected": -0.7267729043960571, "logps/chosen": -185.51580810546875, "logps/rejected": -660.5914306640625, "loss": 0.0013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.009353825822472572, "rewards/margins": 26.375370025634766, "rewards/rejected": -26.366016387939453, "step": 6640 }, { "epoch": 2.26, "learning_rate": 1.36975953669898e-07, "logits/chosen": -0.49513062834739685, "logits/rejected": -0.6360796093940735, "logps/chosen": -216.85458374023438, "logps/rejected": -610.0308837890625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.41457000374794006, "rewards/margins": 29.510705947875977, "rewards/rejected": -29.92527198791504, "step": 6650 }, { "epoch": 2.26, "learning_rate": 1.3634646858869444e-07, "logits/chosen": -0.5569483041763306, "logits/rejected": -0.5567039847373962, "logps/chosen": -181.1729736328125, "logps/rejected": -510.97906494140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.5326394438743591, "rewards/margins": 24.9097957611084, "rewards/rejected": -25.44243621826172, "step": 6660 }, { "epoch": 2.27, "learning_rate": 1.3571698350749086e-07, "logits/chosen": -0.4934006333351135, "logits/rejected": -0.6792032122612, "logps/chosen": -220.5104522705078, "logps/rejected": -658.4547119140625, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 0.3609904944896698, "rewards/margins": 26.22503662109375, "rewards/rejected": -25.86404800415039, "step": 6670 }, { "epoch": 2.27, "learning_rate": 1.3508749842628728e-07, "logits/chosen": -0.46117955446243286, "logits/rejected": -0.635035514831543, "logps/chosen": -206.8939971923828, "logps/rejected": -626.7083129882812, "loss": 0.0035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.37133121490478516, "rewards/margins": 21.502941131591797, "rewards/rejected": -21.13160514831543, "step": 6680 }, { "epoch": 2.27, "learning_rate": 1.3445801334508373e-07, "logits/chosen": -0.42536693811416626, "logits/rejected": -0.6008523106575012, "logps/chosen": -270.62933349609375, "logps/rejected": -460.9425354003906, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.33002784848213196, "rewards/margins": 25.790283203125, "rewards/rejected": -25.460254669189453, "step": 6690 }, { "epoch": 2.28, "learning_rate": 1.3382852826388013e-07, "logits/chosen": -0.48612767457962036, "logits/rejected": -0.5772387385368347, "logps/chosen": -169.74496459960938, "logps/rejected": -587.9669799804688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.09327367693185806, "rewards/margins": 23.94822120666504, "rewards/rejected": -23.85494613647461, "step": 6700 }, { "epoch": 2.28, "eval_logits/chosen": -0.6183910965919495, "eval_logits/rejected": -0.6911269426345825, "eval_logps/chosen": -216.66818237304688, "eval_logps/rejected": -669.5584716796875, "eval_loss": 0.0017534078797325492, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.14876383543014526, "eval_rewards/margins": 25.61821746826172, "eval_rewards/rejected": -25.469453811645508, "eval_runtime": 536.6243, "eval_samples_per_second": 17.703, "eval_steps_per_second": 0.553, "step": 6700 }, { "epoch": 2.28, "learning_rate": 1.3319904318267655e-07, "logits/chosen": -0.4250110983848572, "logits/rejected": -0.6291581392288208, "logps/chosen": -221.78384399414062, "logps/rejected": -694.1126098632812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.10289283096790314, "rewards/margins": 24.055532455444336, "rewards/rejected": -23.952640533447266, "step": 6710 }, { "epoch": 2.28, "learning_rate": 1.32569558101473e-07, "logits/chosen": -0.39104539155960083, "logits/rejected": -0.5716915726661682, "logps/chosen": -184.3653106689453, "logps/rejected": -640.5642700195312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.11355412006378174, "rewards/margins": 24.643274307250977, "rewards/rejected": -24.529720306396484, "step": 6720 }, { "epoch": 2.29, "learning_rate": 1.3194007302026942e-07, "logits/chosen": -0.44518598914146423, "logits/rejected": -0.6691451072692871, "logps/chosen": -195.61988830566406, "logps/rejected": -579.0264892578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.1804264336824417, "rewards/margins": 26.661136627197266, "rewards/rejected": -26.480712890625, "step": 6730 }, { "epoch": 2.29, "learning_rate": 1.3131058793906582e-07, "logits/chosen": -0.2583276629447937, "logits/rejected": -0.622469425201416, "logps/chosen": -281.13958740234375, "logps/rejected": -599.06591796875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.007663518190383911, "rewards/margins": 22.390735626220703, "rewards/rejected": -22.398395538330078, "step": 6740 }, { "epoch": 2.29, "learning_rate": 1.3068110285786227e-07, "logits/chosen": -0.4201125502586365, "logits/rejected": -0.6049125790596008, "logps/chosen": -219.01296997070312, "logps/rejected": -559.8234252929688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.06550945341587067, "rewards/margins": 22.978103637695312, "rewards/rejected": -23.043611526489258, "step": 6750 }, { "epoch": 2.3, "learning_rate": 1.300516177766587e-07, "logits/chosen": -0.5221636891365051, "logits/rejected": -0.6536280512809753, "logps/chosen": -210.0901336669922, "logps/rejected": -611.281494140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.033672451972961426, "rewards/margins": 26.174732208251953, "rewards/rejected": -26.20840072631836, "step": 6760 }, { "epoch": 2.3, "learning_rate": 1.294221326954551e-07, "logits/chosen": -0.6499303579330444, "logits/rejected": -0.6867426633834839, "logps/chosen": -224.24447631835938, "logps/rejected": -839.8427734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.05233278125524521, "rewards/margins": 28.673208236694336, "rewards/rejected": -28.725543975830078, "step": 6770 }, { "epoch": 2.3, "learning_rate": 1.2879264761425154e-07, "logits/chosen": -0.47797149419784546, "logits/rejected": -0.738763689994812, "logps/chosen": -174.3010711669922, "logps/rejected": -575.6129150390625, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -0.1753970980644226, "rewards/margins": 29.895254135131836, "rewards/rejected": -30.07065200805664, "step": 6780 }, { "epoch": 2.31, "learning_rate": 1.2816316253304796e-07, "logits/chosen": -0.5813684463500977, "logits/rejected": -0.7032259702682495, "logps/chosen": -147.48806762695312, "logps/rejected": -589.2916259765625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.004688471555709839, "rewards/margins": 26.240951538085938, "rewards/rejected": -26.23626136779785, "step": 6790 }, { "epoch": 2.31, "learning_rate": 1.2753367745184438e-07, "logits/chosen": -0.5931539535522461, "logits/rejected": -0.6605942845344543, "logps/chosen": -223.3124542236328, "logps/rejected": -492.8990173339844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.029732655733823776, "rewards/margins": 24.13622283935547, "rewards/rejected": -24.165956497192383, "step": 6800 }, { "epoch": 2.31, "eval_logits/chosen": -0.6530368328094482, "eval_logits/rejected": -0.723777174949646, "eval_logps/chosen": -218.91807556152344, "eval_logps/rejected": -682.6937866210938, "eval_loss": 0.001797627890482545, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.0762258991599083, "eval_rewards/margins": 26.706762313842773, "eval_rewards/rejected": -26.782987594604492, "eval_runtime": 538.3956, "eval_samples_per_second": 17.645, "eval_steps_per_second": 0.552, "step": 6800 }, { "epoch": 2.31, "learning_rate": 1.2690419237064083e-07, "logits/chosen": -0.5601319670677185, "logits/rejected": -0.6434171795845032, "logps/chosen": -335.9732971191406, "logps/rejected": -710.7401733398438, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.27675318717956543, "rewards/margins": 26.732568740844727, "rewards/rejected": -26.455814361572266, "step": 6810 }, { "epoch": 2.32, "learning_rate": 1.2627470728943723e-07, "logits/chosen": -0.511364758014679, "logits/rejected": -0.6747186183929443, "logps/chosen": -303.65545654296875, "logps/rejected": -754.218505859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.01856706105172634, "rewards/margins": 25.733816146850586, "rewards/rejected": -25.752384185791016, "step": 6820 }, { "epoch": 2.32, "learning_rate": 1.2564522220823365e-07, "logits/chosen": -0.5567011833190918, "logits/rejected": -0.6752985119819641, "logps/chosen": -228.1060028076172, "logps/rejected": -716.2889404296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.09442253410816193, "rewards/margins": 25.381549835205078, "rewards/rejected": -25.287128448486328, "step": 6830 }, { "epoch": 2.32, "learning_rate": 1.250157371270301e-07, "logits/chosen": -0.5739198923110962, "logits/rejected": -0.6498308777809143, "logps/chosen": -211.52542114257812, "logps/rejected": -602.9981079101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3370577394962311, "rewards/margins": 29.040271759033203, "rewards/rejected": -29.377328872680664, "step": 6840 }, { "epoch": 2.33, "learning_rate": 1.243862520458265e-07, "logits/chosen": -0.6446477174758911, "logits/rejected": -0.603238582611084, "logps/chosen": -221.7611541748047, "logps/rejected": -565.7470703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.257123738527298, "rewards/margins": 25.540542602539062, "rewards/rejected": -25.79766273498535, "step": 6850 }, { "epoch": 2.33, "learning_rate": 1.2375676696462294e-07, "logits/chosen": -0.5073419809341431, "logits/rejected": -0.6102813482284546, "logps/chosen": -253.61489868164062, "logps/rejected": -801.1676025390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3765743672847748, "rewards/margins": 27.17889976501465, "rewards/rejected": -27.55547523498535, "step": 6860 }, { "epoch": 2.34, "learning_rate": 1.2312728188341934e-07, "logits/chosen": -0.4444945454597473, "logits/rejected": -0.7624967098236084, "logps/chosen": -192.4506378173828, "logps/rejected": -719.5604858398438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.06323375552892685, "rewards/margins": 28.732025146484375, "rewards/rejected": -28.66878890991211, "step": 6870 }, { "epoch": 2.34, "learning_rate": 1.224977968022158e-07, "logits/chosen": -0.5497924089431763, "logits/rejected": -0.7342925667762756, "logps/chosen": -271.62689208984375, "logps/rejected": -598.141357421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.017152732238173485, "rewards/margins": 25.138439178466797, "rewards/rejected": -25.155593872070312, "step": 6880 }, { "epoch": 2.34, "learning_rate": 1.218683117210122e-07, "logits/chosen": -0.5712088942527771, "logits/rejected": -0.6434404253959656, "logps/chosen": -162.77113342285156, "logps/rejected": -756.5715942382812, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.44584354758262634, "rewards/margins": 27.843669891357422, "rewards/rejected": -27.3978271484375, "step": 6890 }, { "epoch": 2.35, "learning_rate": 1.2123882663980863e-07, "logits/chosen": -0.4178415834903717, "logits/rejected": -0.7087987065315247, "logps/chosen": -231.30691528320312, "logps/rejected": -761.0720825195312, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -0.06388819962739944, "rewards/margins": 22.948915481567383, "rewards/rejected": -23.01280403137207, "step": 6900 }, { "epoch": 2.35, "eval_logits/chosen": -0.6572023034095764, "eval_logits/rejected": -0.726726233959198, "eval_logps/chosen": -220.6759796142578, "eval_logps/rejected": -694.1253051757812, "eval_loss": 0.0017996432725340128, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.25201332569122314, "eval_rewards/margins": 27.674131393432617, "eval_rewards/rejected": -27.926143646240234, "eval_runtime": 538.155, "eval_samples_per_second": 17.653, "eval_steps_per_second": 0.552, "step": 6900 }, { "epoch": 2.35, "learning_rate": 1.2060934155860506e-07, "logits/chosen": -0.3412066102027893, "logits/rejected": -0.6076663732528687, "logps/chosen": -182.9947967529297, "logps/rejected": -611.9998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.007010996341705322, "rewards/margins": 26.10672950744629, "rewards/rejected": -26.11374282836914, "step": 6910 }, { "epoch": 2.35, "learning_rate": 1.1997985647740148e-07, "logits/chosen": -0.616433322429657, "logits/rejected": -0.6595634818077087, "logps/chosen": -175.9999237060547, "logps/rejected": -676.5719604492188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.1642877459526062, "rewards/margins": 26.681751251220703, "rewards/rejected": -26.84603500366211, "step": 6920 }, { "epoch": 2.36, "learning_rate": 1.193503713961979e-07, "logits/chosen": -0.5490292310714722, "logits/rejected": -0.5992618203163147, "logps/chosen": -327.3284606933594, "logps/rejected": -934.85107421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.22694988548755646, "rewards/margins": 28.76523780822754, "rewards/rejected": -28.992183685302734, "step": 6930 }, { "epoch": 2.36, "learning_rate": 1.1872088631499433e-07, "logits/chosen": -0.6869747638702393, "logits/rejected": -0.7118425965309143, "logps/chosen": -197.61351013183594, "logps/rejected": -723.5557250976562, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.7359576225280762, "rewards/margins": 27.1706485748291, "rewards/rejected": -27.906606674194336, "step": 6940 }, { "epoch": 2.36, "learning_rate": 1.1809140123379076e-07, "logits/chosen": -0.5461373925209045, "logits/rejected": -0.6793215870857239, "logps/chosen": -193.26084899902344, "logps/rejected": -710.9674072265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.35801762342453003, "rewards/margins": 24.206113815307617, "rewards/rejected": -23.848094940185547, "step": 6950 }, { "epoch": 2.37, "learning_rate": 1.1746191615258717e-07, "logits/chosen": -0.4858538508415222, "logits/rejected": -0.7356199622154236, "logps/chosen": -176.02066040039062, "logps/rejected": -733.82373046875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.0774877518415451, "rewards/margins": 28.668689727783203, "rewards/rejected": -28.746179580688477, "step": 6960 }, { "epoch": 2.37, "learning_rate": 1.1683243107138361e-07, "logits/chosen": -0.46077775955200195, "logits/rejected": -0.6410581469535828, "logps/chosen": -247.6884765625, "logps/rejected": -723.0765380859375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.13382968306541443, "rewards/margins": 34.847930908203125, "rewards/rejected": -34.98175811767578, "step": 6970 }, { "epoch": 2.37, "learning_rate": 1.1620294599018003e-07, "logits/chosen": -0.45337170362472534, "logits/rejected": -0.6588858366012573, "logps/chosen": -181.2332000732422, "logps/rejected": -578.5050659179688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.2828446924686432, "rewards/margins": 25.16775131225586, "rewards/rejected": -25.45059585571289, "step": 6980 }, { "epoch": 2.38, "learning_rate": 1.1557346090897645e-07, "logits/chosen": -0.5127782821655273, "logits/rejected": -0.6714586019515991, "logps/chosen": -162.29949951171875, "logps/rejected": -843.3024291992188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.11272507905960083, "rewards/margins": 27.532678604125977, "rewards/rejected": -27.645404815673828, "step": 6990 }, { "epoch": 2.38, "learning_rate": 1.1494397582777288e-07, "logits/chosen": -0.7157866954803467, "logits/rejected": -0.6473212838172913, "logps/chosen": -158.10824584960938, "logps/rejected": -698.0008544921875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.006679052021354437, "rewards/margins": 24.555662155151367, "rewards/rejected": -24.548982620239258, "step": 7000 }, { "epoch": 2.38, "eval_logits/chosen": -0.6516006588935852, "eval_logits/rejected": -0.7207273244857788, "eval_logps/chosen": -220.13502502441406, "eval_logps/rejected": -692.0079956054688, "eval_loss": 0.0017119839321821928, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.19791966676712036, "eval_rewards/margins": 27.516489028930664, "eval_rewards/rejected": -27.714406967163086, "eval_runtime": 537.597, "eval_samples_per_second": 17.671, "eval_steps_per_second": 0.552, "step": 7000 }, { "epoch": 2.38, "learning_rate": 1.1431449074656931e-07, "logits/chosen": -0.437359094619751, "logits/rejected": -0.6772294044494629, "logps/chosen": -348.9482727050781, "logps/rejected": -548.1334838867188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.32590141892433167, "rewards/margins": 26.076955795288086, "rewards/rejected": -26.402856826782227, "step": 7010 }, { "epoch": 2.39, "learning_rate": 1.1368500566536572e-07, "logits/chosen": -0.5497533082962036, "logits/rejected": -0.6477676033973694, "logps/chosen": -303.20721435546875, "logps/rejected": -734.6389770507812, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.04651997238397598, "rewards/margins": 25.361852645874023, "rewards/rejected": -25.315332412719727, "step": 7020 }, { "epoch": 2.39, "learning_rate": 1.1305552058416214e-07, "logits/chosen": -0.6499623656272888, "logits/rejected": -0.6606015563011169, "logps/chosen": -181.30856323242188, "logps/rejected": -745.7633056640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2717200815677643, "rewards/margins": 30.993331909179688, "rewards/rejected": -31.265050888061523, "step": 7030 }, { "epoch": 2.39, "learning_rate": 1.1242603550295858e-07, "logits/chosen": -0.7045333385467529, "logits/rejected": -0.7066220045089722, "logps/chosen": -150.49737548828125, "logps/rejected": -789.0337524414062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.25654280185699463, "rewards/margins": 29.887344360351562, "rewards/rejected": -30.143884658813477, "step": 7040 }, { "epoch": 2.4, "learning_rate": 1.1179655042175499e-07, "logits/chosen": -0.2939174473285675, "logits/rejected": -0.5746561884880066, "logps/chosen": -187.31414794921875, "logps/rejected": -543.1347045898438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3651852011680603, "rewards/margins": 24.24069595336914, "rewards/rejected": -24.605880737304688, "step": 7050 }, { "epoch": 2.4, "learning_rate": 1.1116706534055143e-07, "logits/chosen": -0.5358907580375671, "logits/rejected": -0.6342066526412964, "logps/chosen": -224.28402709960938, "logps/rejected": -652.5809936523438, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.1837461292743683, "rewards/margins": 28.551349639892578, "rewards/rejected": -28.367603302001953, "step": 7060 }, { "epoch": 2.4, "learning_rate": 1.1053758025934785e-07, "logits/chosen": -0.5950456857681274, "logits/rejected": -0.6314067840576172, "logps/chosen": -172.76904296875, "logps/rejected": -821.1598510742188, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.07030830532312393, "rewards/margins": 25.836727142333984, "rewards/rejected": -25.76641845703125, "step": 7070 }, { "epoch": 2.41, "learning_rate": 1.0990809517814427e-07, "logits/chosen": -0.4927343428134918, "logits/rejected": -0.6503714323043823, "logps/chosen": -226.74038696289062, "logps/rejected": -547.306884765625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.16591541469097137, "rewards/margins": 25.632543563842773, "rewards/rejected": -25.7984561920166, "step": 7080 }, { "epoch": 2.41, "learning_rate": 1.092786100969407e-07, "logits/chosen": -0.4103378653526306, "logits/rejected": -0.6752765774726868, "logps/chosen": -173.9786834716797, "logps/rejected": -643.1845703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.15380695462226868, "rewards/margins": 24.576541900634766, "rewards/rejected": -24.7303466796875, "step": 7090 }, { "epoch": 2.41, "learning_rate": 1.0864912501573713e-07, "logits/chosen": -0.5948134064674377, "logits/rejected": -0.6678715944290161, "logps/chosen": -169.1475067138672, "logps/rejected": -564.2574462890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.6517482995986938, "rewards/margins": 20.47730827331543, "rewards/rejected": -21.129056930541992, "step": 7100 }, { "epoch": 2.41, "eval_logits/chosen": -0.6448216438293457, "eval_logits/rejected": -0.7147423624992371, "eval_logps/chosen": -220.21861267089844, "eval_logps/rejected": -697.6947021484375, "eval_loss": 0.0017432052409276366, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.20628021657466888, "eval_rewards/margins": 28.076793670654297, "eval_rewards/rejected": -28.283071517944336, "eval_runtime": 536.7966, "eval_samples_per_second": 17.698, "eval_steps_per_second": 0.553, "step": 7100 }, { "epoch": 2.42, "learning_rate": 1.0801963993453354e-07, "logits/chosen": -0.653703510761261, "logits/rejected": -0.6906970143318176, "logps/chosen": -178.80882263183594, "logps/rejected": -539.1228637695312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.4405710697174072, "rewards/margins": 30.903457641601562, "rewards/rejected": -31.344024658203125, "step": 7110 }, { "epoch": 2.42, "learning_rate": 1.0739015485332998e-07, "logits/chosen": -0.5398464798927307, "logits/rejected": -0.654344916343689, "logps/chosen": -263.400390625, "logps/rejected": -747.3477783203125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.3858211636543274, "rewards/margins": 27.812902450561523, "rewards/rejected": -28.198720932006836, "step": 7120 }, { "epoch": 2.42, "learning_rate": 1.067606697721264e-07, "logits/chosen": -0.49422937631607056, "logits/rejected": -0.6983587741851807, "logps/chosen": -247.74008178710938, "logps/rejected": -630.2291870117188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.7747281789779663, "rewards/margins": 27.801965713500977, "rewards/rejected": -28.57669448852539, "step": 7130 }, { "epoch": 2.43, "learning_rate": 1.0613118469092282e-07, "logits/chosen": -0.6300886869430542, "logits/rejected": -0.6317524313926697, "logps/chosen": -224.0232696533203, "logps/rejected": -1019.0787353515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.30816859006881714, "rewards/margins": 27.921371459960938, "rewards/rejected": -28.229541778564453, "step": 7140 }, { "epoch": 2.43, "learning_rate": 1.0550169960971924e-07, "logits/chosen": -0.5358924865722656, "logits/rejected": -0.6669289469718933, "logps/chosen": -278.2479553222656, "logps/rejected": -798.9022827148438, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.3712834417819977, "rewards/margins": 26.408544540405273, "rewards/rejected": -26.779827117919922, "step": 7150 }, { "epoch": 2.43, "learning_rate": 1.0487221452851568e-07, "logits/chosen": -0.6543781757354736, "logits/rejected": -0.6193122863769531, "logps/chosen": -217.398681640625, "logps/rejected": -818.3699340820312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.1787976175546646, "rewards/margins": 25.384504318237305, "rewards/rejected": -25.56330108642578, "step": 7160 }, { "epoch": 2.44, "learning_rate": 1.0424272944731209e-07, "logits/chosen": -0.48397621512413025, "logits/rejected": -0.6773947477340698, "logps/chosen": -242.27975463867188, "logps/rejected": -802.727294921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.047008905559778214, "rewards/margins": 27.403148651123047, "rewards/rejected": -27.45016098022461, "step": 7170 }, { "epoch": 2.44, "learning_rate": 1.0361324436610853e-07, "logits/chosen": -0.5012516975402832, "logits/rejected": -0.7947234511375427, "logps/chosen": -238.30575561523438, "logps/rejected": -636.2086791992188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.4599526524543762, "rewards/margins": 32.01173400878906, "rewards/rejected": -32.47168731689453, "step": 7180 }, { "epoch": 2.44, "learning_rate": 1.0298375928490494e-07, "logits/chosen": -0.5069199800491333, "logits/rejected": -0.780190110206604, "logps/chosen": -167.947509765625, "logps/rejected": -746.2584228515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.20237283408641815, "rewards/margins": 30.78902244567871, "rewards/rejected": -30.991390228271484, "step": 7190 }, { "epoch": 2.45, "learning_rate": 1.0235427420370137e-07, "logits/chosen": -0.41463613510131836, "logits/rejected": -0.640296459197998, "logps/chosen": -196.25411987304688, "logps/rejected": -681.5792236328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.14613595604896545, "rewards/margins": 28.181034088134766, "rewards/rejected": -28.32716941833496, "step": 7200 }, { "epoch": 2.45, "eval_logits/chosen": -0.657196581363678, "eval_logits/rejected": -0.7290647625923157, "eval_logps/chosen": -220.57846069335938, "eval_logps/rejected": -700.2904663085938, "eval_loss": 0.001706042094156146, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.24226266145706177, "eval_rewards/margins": 28.300386428833008, "eval_rewards/rejected": -28.542646408081055, "eval_runtime": 536.9703, "eval_samples_per_second": 17.692, "eval_steps_per_second": 0.553, "step": 7200 }, { "epoch": 2.45, "learning_rate": 1.017247891224978e-07, "logits/chosen": -0.629804253578186, "logits/rejected": -0.6809019446372986, "logps/chosen": -163.8177947998047, "logps/rejected": -517.3888549804688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.009478640742599964, "rewards/margins": 28.14434242248535, "rewards/rejected": -28.153818130493164, "step": 7210 }, { "epoch": 2.45, "learning_rate": 1.0109530404129422e-07, "logits/chosen": -0.5950022339820862, "logits/rejected": -0.6600571870803833, "logps/chosen": -245.4164581298828, "logps/rejected": -592.1929321289062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3164582848548889, "rewards/margins": 27.767181396484375, "rewards/rejected": -28.08363914489746, "step": 7220 }, { "epoch": 2.46, "learning_rate": 1.0046581896009064e-07, "logits/chosen": -0.5363454818725586, "logits/rejected": -0.6686700582504272, "logps/chosen": -216.06494140625, "logps/rejected": -678.1289672851562, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.49499645829200745, "rewards/margins": 28.88030433654785, "rewards/rejected": -29.375301361083984, "step": 7230 }, { "epoch": 2.46, "learning_rate": 9.983633387888708e-08, "logits/chosen": -0.5734550952911377, "logits/rejected": -0.691001296043396, "logps/chosen": -221.1831512451172, "logps/rejected": -867.2220458984375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.006386172957718372, "rewards/margins": 27.88824462890625, "rewards/rejected": -27.89463233947754, "step": 7240 }, { "epoch": 2.46, "learning_rate": 9.920684879768348e-08, "logits/chosen": -0.49127036333084106, "logits/rejected": -0.6693507432937622, "logps/chosen": -269.4422912597656, "logps/rejected": -706.8968505859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.035617996007204056, "rewards/margins": 24.592323303222656, "rewards/rejected": -24.627941131591797, "step": 7250 }, { "epoch": 2.47, "learning_rate": 9.857736371647991e-08, "logits/chosen": -0.5138463377952576, "logits/rejected": -0.6969189047813416, "logps/chosen": -302.4375, "logps/rejected": -761.6140747070312, "loss": 0.0031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.27241450548171997, "rewards/margins": 28.822917938232422, "rewards/rejected": -28.550506591796875, "step": 7260 }, { "epoch": 2.47, "learning_rate": 9.794787863527634e-08, "logits/chosen": -0.5246730446815491, "logits/rejected": -0.6718152165412903, "logps/chosen": -283.49267578125, "logps/rejected": -500.39080810546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.3190411925315857, "rewards/margins": 27.798877716064453, "rewards/rejected": -28.117916107177734, "step": 7270 }, { "epoch": 2.47, "learning_rate": 9.731839355407275e-08, "logits/chosen": -0.5385629534721375, "logits/rejected": -0.6723285913467407, "logps/chosen": -216.0872039794922, "logps/rejected": -514.9222412109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.11485503613948822, "rewards/margins": 24.04905128479004, "rewards/rejected": -24.16390609741211, "step": 7280 }, { "epoch": 2.48, "learning_rate": 9.668890847286919e-08, "logits/chosen": -0.4545148015022278, "logits/rejected": -0.7045314311981201, "logps/chosen": -288.62396240234375, "logps/rejected": -570.2217407226562, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.2775440812110901, "rewards/margins": 27.69061279296875, "rewards/rejected": -27.41306495666504, "step": 7290 }, { "epoch": 2.48, "learning_rate": 9.605942339166561e-08, "logits/chosen": -0.7061318159103394, "logits/rejected": -0.7323290109634399, "logps/chosen": -209.32571411132812, "logps/rejected": -895.7783203125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.049312837421894073, "rewards/margins": 24.534969329833984, "rewards/rejected": -24.485652923583984, "step": 7300 }, { "epoch": 2.48, "eval_logits/chosen": -0.6486817598342896, "eval_logits/rejected": -0.7312887907028198, "eval_logps/chosen": -219.0937042236328, "eval_logps/rejected": -687.9479370117188, "eval_loss": 0.0016824412159621716, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.09378727525472641, "eval_rewards/margins": 27.214614868164062, "eval_rewards/rejected": -27.30840492248535, "eval_runtime": 536.247, "eval_samples_per_second": 17.716, "eval_steps_per_second": 0.554, "step": 7300 }, { "epoch": 2.48, "learning_rate": 9.542993831046203e-08, "logits/chosen": -0.48738735914230347, "logits/rejected": -0.7274879217147827, "logps/chosen": -223.1404571533203, "logps/rejected": -602.3787231445312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.41575080156326294, "rewards/margins": 24.99991226196289, "rewards/rejected": -25.415664672851562, "step": 7310 }, { "epoch": 2.49, "learning_rate": 9.480045322925846e-08, "logits/chosen": -0.5465003848075867, "logits/rejected": -0.6907869577407837, "logps/chosen": -229.36019897460938, "logps/rejected": -723.80419921875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3435986042022705, "rewards/margins": 28.192630767822266, "rewards/rejected": -28.53623390197754, "step": 7320 }, { "epoch": 2.49, "learning_rate": 9.41709681480549e-08, "logits/chosen": -0.4298132359981537, "logits/rejected": -0.7592583894729614, "logps/chosen": -160.5, "logps/rejected": -596.3023681640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.27029064297676086, "rewards/margins": 26.787353515625, "rewards/rejected": -26.51706314086914, "step": 7330 }, { "epoch": 2.49, "learning_rate": 9.35414830668513e-08, "logits/chosen": -0.6825663447380066, "logits/rejected": -0.6295133233070374, "logps/chosen": -150.56167602539062, "logps/rejected": -673.5885009765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.12202966213226318, "rewards/margins": 24.98833465576172, "rewards/rejected": -25.110363006591797, "step": 7340 }, { "epoch": 2.5, "learning_rate": 9.291199798564774e-08, "logits/chosen": -0.5743321180343628, "logits/rejected": -0.6710582971572876, "logps/chosen": -169.47586059570312, "logps/rejected": -689.9000244140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.04051043838262558, "rewards/margins": 26.72161293029785, "rewards/rejected": -26.681102752685547, "step": 7350 }, { "epoch": 2.5, "learning_rate": 9.228251290444416e-08, "logits/chosen": -0.6136351823806763, "logits/rejected": -0.7028884887695312, "logps/chosen": -210.3823699951172, "logps/rejected": -681.9471435546875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.016335979104042053, "rewards/margins": 25.259355545043945, "rewards/rejected": -25.275691986083984, "step": 7360 }, { "epoch": 2.51, "learning_rate": 9.165302782324058e-08, "logits/chosen": -0.5134451389312744, "logits/rejected": -0.7092984914779663, "logps/chosen": -171.38812255859375, "logps/rejected": -678.00634765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.15452107787132263, "rewards/margins": 28.658573150634766, "rewards/rejected": -28.50404930114746, "step": 7370 }, { "epoch": 2.51, "learning_rate": 9.102354274203701e-08, "logits/chosen": -0.5690580010414124, "logits/rejected": -0.743142306804657, "logps/chosen": -202.72811889648438, "logps/rejected": -687.6838989257812, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.42484569549560547, "rewards/margins": 27.62217140197754, "rewards/rejected": -28.04701805114746, "step": 7380 }, { "epoch": 2.51, "learning_rate": 9.039405766083344e-08, "logits/chosen": -0.4859371781349182, "logits/rejected": -0.6720676422119141, "logps/chosen": -188.89739990234375, "logps/rejected": -788.46923828125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.34767967462539673, "rewards/margins": 21.621891021728516, "rewards/rejected": -21.96957015991211, "step": 7390 }, { "epoch": 2.52, "learning_rate": 8.976457257962985e-08, "logits/chosen": -0.5447125434875488, "logits/rejected": -0.6852697134017944, "logps/chosen": -169.57017517089844, "logps/rejected": -721.5689697265625, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.019341368228197098, "rewards/margins": 26.690975189208984, "rewards/rejected": -26.6716365814209, "step": 7400 }, { "epoch": 2.52, "eval_logits/chosen": -0.6467255353927612, "eval_logits/rejected": -0.7289376258850098, "eval_logps/chosen": -218.75204467773438, "eval_logps/rejected": -688.5939331054688, "eval_loss": 0.0016218158416450024, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.05962177738547325, "eval_rewards/margins": 27.31337547302246, "eval_rewards/rejected": -27.372995376586914, "eval_runtime": 537.782, "eval_samples_per_second": 17.665, "eval_steps_per_second": 0.552, "step": 7400 }, { "epoch": 2.52, "learning_rate": 8.913508749842629e-08, "logits/chosen": -0.5609080195426941, "logits/rejected": -0.7022966146469116, "logps/chosen": -159.44430541992188, "logps/rejected": -658.2679443359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.24128977954387665, "rewards/margins": 25.813663482666016, "rewards/rejected": -26.05495262145996, "step": 7410 }, { "epoch": 2.52, "learning_rate": 8.850560241722271e-08, "logits/chosen": -0.42324042320251465, "logits/rejected": -0.6835426092147827, "logps/chosen": -234.31143188476562, "logps/rejected": -665.1029663085938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.37104207277297974, "rewards/margins": 24.27901840209961, "rewards/rejected": -24.650060653686523, "step": 7420 }, { "epoch": 2.53, "learning_rate": 8.787611733601913e-08, "logits/chosen": -0.33603090047836304, "logits/rejected": -0.630584180355072, "logps/chosen": -338.33587646484375, "logps/rejected": -541.4974975585938, "loss": 0.005, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19795839488506317, "rewards/margins": 29.466419219970703, "rewards/rejected": -29.268457412719727, "step": 7430 }, { "epoch": 2.53, "learning_rate": 8.724663225481556e-08, "logits/chosen": -0.5414437651634216, "logits/rejected": -0.8038471937179565, "logps/chosen": -208.8376007080078, "logps/rejected": -800.9869384765625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.10747363418340683, "rewards/margins": 28.398120880126953, "rewards/rejected": -28.2906436920166, "step": 7440 }, { "epoch": 2.53, "learning_rate": 8.6617147173612e-08, "logits/chosen": -0.49997109174728394, "logits/rejected": -0.646056592464447, "logps/chosen": -178.00575256347656, "logps/rejected": -815.0991821289062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.4502209722995758, "rewards/margins": 27.605091094970703, "rewards/rejected": -27.154870986938477, "step": 7450 }, { "epoch": 2.54, "learning_rate": 8.59876620924084e-08, "logits/chosen": -0.4798315465450287, "logits/rejected": -0.7562838196754456, "logps/chosen": -338.7757568359375, "logps/rejected": -724.3677978515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.16943266987800598, "rewards/margins": 27.60727882385254, "rewards/rejected": -27.437841415405273, "step": 7460 }, { "epoch": 2.54, "learning_rate": 8.535817701120483e-08, "logits/chosen": -0.5749155879020691, "logits/rejected": -0.7189256548881531, "logps/chosen": -178.41201782226562, "logps/rejected": -815.8389892578125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.2845305800437927, "rewards/margins": 24.132577896118164, "rewards/rejected": -23.848047256469727, "step": 7470 }, { "epoch": 2.54, "learning_rate": 8.472869193000126e-08, "logits/chosen": -0.3999585211277008, "logits/rejected": -0.7674452066421509, "logps/chosen": -374.2238464355469, "logps/rejected": -697.6196899414062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.0055720447562634945, "rewards/margins": 27.210113525390625, "rewards/rejected": -27.215686798095703, "step": 7480 }, { "epoch": 2.55, "learning_rate": 8.409920684879767e-08, "logits/chosen": -0.4942191243171692, "logits/rejected": -0.5995741486549377, "logps/chosen": -222.2894744873047, "logps/rejected": -689.0328979492188, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.2691688537597656, "rewards/margins": 28.452220916748047, "rewards/rejected": -28.183055877685547, "step": 7490 }, { "epoch": 2.55, "learning_rate": 8.346972176759411e-08, "logits/chosen": -0.5729402899742126, "logits/rejected": -0.7221068143844604, "logps/chosen": -323.5924377441406, "logps/rejected": -746.4066162109375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.2634117603302002, "rewards/margins": 27.469867706298828, "rewards/rejected": -27.733280181884766, "step": 7500 }, { "epoch": 2.55, "eval_logits/chosen": -0.6462146639823914, "eval_logits/rejected": -0.7270610332489014, "eval_logps/chosen": -218.0538787841797, "eval_logps/rejected": -688.309326171875, "eval_loss": 0.0015523574547842145, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.01019204966723919, "eval_rewards/margins": 27.354736328125, "eval_rewards/rejected": -27.344541549682617, "eval_runtime": 538.4256, "eval_samples_per_second": 17.644, "eval_steps_per_second": 0.552, "step": 7500 }, { "epoch": 2.55, "learning_rate": 8.284023668639053e-08, "logits/chosen": -0.39733731746673584, "logits/rejected": -0.6356142163276672, "logps/chosen": -315.47686767578125, "logps/rejected": -564.8656005859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.2564890384674072, "rewards/margins": 22.46954345703125, "rewards/rejected": -22.72603416442871, "step": 7510 }, { "epoch": 2.56, "learning_rate": 8.221075160518695e-08, "logits/chosen": -0.47270745038986206, "logits/rejected": -0.6476394534111023, "logps/chosen": -273.1866149902344, "logps/rejected": -633.6221923828125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.017966723069548607, "rewards/margins": 23.191078186035156, "rewards/rejected": -23.17310905456543, "step": 7520 }, { "epoch": 2.56, "learning_rate": 8.158126652398338e-08, "logits/chosen": -0.44426876306533813, "logits/rejected": -0.6859654188156128, "logps/chosen": -264.68658447265625, "logps/rejected": -552.93408203125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.5118619799613953, "rewards/margins": 27.2331600189209, "rewards/rejected": -26.721298217773438, "step": 7530 }, { "epoch": 2.56, "learning_rate": 8.09517814427798e-08, "logits/chosen": -0.6174875497817993, "logits/rejected": -0.7009795904159546, "logps/chosen": -153.3941650390625, "logps/rejected": -686.4968872070312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.22861552238464355, "rewards/margins": 29.68195152282715, "rewards/rejected": -29.910564422607422, "step": 7540 }, { "epoch": 2.57, "learning_rate": 8.032229636157622e-08, "logits/chosen": -0.5496028661727905, "logits/rejected": -0.6314720511436462, "logps/chosen": -162.07327270507812, "logps/rejected": -857.0486450195312, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.3610314726829529, "rewards/margins": 27.3369197845459, "rewards/rejected": -27.697956085205078, "step": 7550 }, { "epoch": 2.57, "learning_rate": 7.969281128037266e-08, "logits/chosen": -0.5667204856872559, "logits/rejected": -0.7573039531707764, "logps/chosen": -169.70143127441406, "logps/rejected": -644.6480712890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.483036607503891, "rewards/margins": 28.8654842376709, "rewards/rejected": -29.348522186279297, "step": 7560 }, { "epoch": 2.57, "learning_rate": 7.906332619916907e-08, "logits/chosen": -0.5994788408279419, "logits/rejected": -0.7126821279525757, "logps/chosen": -256.64337158203125, "logps/rejected": -716.5960693359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.4611395001411438, "rewards/margins": 28.469472885131836, "rewards/rejected": -28.930612564086914, "step": 7570 }, { "epoch": 2.58, "learning_rate": 7.84338411179655e-08, "logits/chosen": -0.4693906903266907, "logits/rejected": -0.7127438187599182, "logps/chosen": -268.37249755859375, "logps/rejected": -703.8270263671875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.1490761935710907, "rewards/margins": 28.704727172851562, "rewards/rejected": -28.85379981994629, "step": 7580 }, { "epoch": 2.58, "learning_rate": 7.780435603676193e-08, "logits/chosen": -0.6119828820228577, "logits/rejected": -0.6748452186584473, "logps/chosen": -159.760009765625, "logps/rejected": -704.10546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.03963739797472954, "rewards/margins": 31.105587005615234, "rewards/rejected": -31.065948486328125, "step": 7590 }, { "epoch": 2.58, "learning_rate": 7.717487095555835e-08, "logits/chosen": -0.519278347492218, "logits/rejected": -0.7276666164398193, "logps/chosen": -187.6893310546875, "logps/rejected": -766.3416748046875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.2591477930545807, "rewards/margins": 29.921703338623047, "rewards/rejected": -30.18085289001465, "step": 7600 }, { "epoch": 2.58, "eval_logits/chosen": -0.6603902578353882, "eval_logits/rejected": -0.7393137216567993, "eval_logps/chosen": -219.8516387939453, "eval_logps/rejected": -702.1956176757812, "eval_loss": 0.001573009300045669, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.16958260536193848, "eval_rewards/margins": 28.56357765197754, "eval_rewards/rejected": -28.73316192626953, "eval_runtime": 536.4183, "eval_samples_per_second": 17.71, "eval_steps_per_second": 0.554, "step": 7600 }, { "epoch": 2.59, "learning_rate": 7.654538587435477e-08, "logits/chosen": -0.6149822473526001, "logits/rejected": -0.694976806640625, "logps/chosen": -174.75323486328125, "logps/rejected": -662.6119995117188, "loss": 0.0012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1897585093975067, "rewards/margins": 28.88149070739746, "rewards/rejected": -29.07124900817871, "step": 7610 }, { "epoch": 2.59, "learning_rate": 7.591590079315121e-08, "logits/chosen": -0.5032398104667664, "logits/rejected": -0.7486822009086609, "logps/chosen": -223.6412353515625, "logps/rejected": -663.9522705078125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.08392763137817383, "rewards/margins": 25.41326141357422, "rewards/rejected": -25.497188568115234, "step": 7620 }, { "epoch": 2.59, "learning_rate": 7.528641571194762e-08, "logits/chosen": -0.580550491809845, "logits/rejected": -0.7419854998588562, "logps/chosen": -149.48255920410156, "logps/rejected": -819.7890625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.2869735360145569, "rewards/margins": 29.19332504272461, "rewards/rejected": -28.90635108947754, "step": 7630 }, { "epoch": 2.6, "learning_rate": 7.465693063074405e-08, "logits/chosen": -0.4693359434604645, "logits/rejected": -0.7724667191505432, "logps/chosen": -247.8316192626953, "logps/rejected": -712.2297973632812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.29097646474838257, "rewards/margins": 31.040634155273438, "rewards/rejected": -31.33160972595215, "step": 7640 }, { "epoch": 2.6, "learning_rate": 7.402744554954048e-08, "logits/chosen": -0.5847674012184143, "logits/rejected": -0.649927020072937, "logps/chosen": -209.9244842529297, "logps/rejected": -575.5289306640625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.20915384590625763, "rewards/margins": 24.106426239013672, "rewards/rejected": -24.315580368041992, "step": 7650 }, { "epoch": 2.6, "learning_rate": 7.33979604683369e-08, "logits/chosen": -0.4895502030849457, "logits/rejected": -0.5771899819374084, "logps/chosen": -235.2196044921875, "logps/rejected": -591.0115356445312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.13747075200080872, "rewards/margins": 23.27407455444336, "rewards/rejected": -23.1366024017334, "step": 7660 }, { "epoch": 2.61, "learning_rate": 7.276847538713332e-08, "logits/chosen": -0.5549731254577637, "logits/rejected": -0.6167377829551697, "logps/chosen": -246.3776397705078, "logps/rejected": -658.54052734375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.16213539242744446, "rewards/margins": 30.44855308532715, "rewards/rejected": -30.61069107055664, "step": 7670 }, { "epoch": 2.61, "learning_rate": 7.213899030592976e-08, "logits/chosen": -0.5069581270217896, "logits/rejected": -0.6556397676467896, "logps/chosen": -240.7689208984375, "logps/rejected": -553.93994140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.08895054459571838, "rewards/margins": 23.16933250427246, "rewards/rejected": -23.080387115478516, "step": 7680 }, { "epoch": 2.61, "learning_rate": 7.150950522472617e-08, "logits/chosen": -0.6793751120567322, "logits/rejected": -0.6234490871429443, "logps/chosen": -224.39932250976562, "logps/rejected": -601.79248046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.012466102838516235, "rewards/margins": 29.341012954711914, "rewards/rejected": -29.35347557067871, "step": 7690 }, { "epoch": 2.62, "learning_rate": 7.088002014352259e-08, "logits/chosen": -0.45252862572669983, "logits/rejected": -0.6574255228042603, "logps/chosen": -290.69476318359375, "logps/rejected": -649.1947021484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.2921113073825836, "rewards/margins": 26.49039649963379, "rewards/rejected": -26.198284149169922, "step": 7700 }, { "epoch": 2.62, "eval_logits/chosen": -0.6502103209495544, "eval_logits/rejected": -0.7263643145561218, "eval_logps/chosen": -219.2384490966797, "eval_logps/rejected": -697.8157958984375, "eval_loss": 0.001527833053842187, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10826155543327332, "eval_rewards/margins": 28.186931610107422, "eval_rewards/rejected": -28.295194625854492, "eval_runtime": 537.9274, "eval_samples_per_second": 17.66, "eval_steps_per_second": 0.552, "step": 7700 }, { "epoch": 2.62, "learning_rate": 7.025053506231903e-08, "logits/chosen": -0.5331140756607056, "logits/rejected": -0.6170163750648499, "logps/chosen": -201.58363342285156, "logps/rejected": -516.0665283203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.10678273439407349, "rewards/margins": 27.11932945251465, "rewards/rejected": -27.226110458374023, "step": 7710 }, { "epoch": 2.62, "learning_rate": 6.962104998111543e-08, "logits/chosen": -0.4092063903808594, "logits/rejected": -0.7719508409500122, "logps/chosen": -202.2513427734375, "logps/rejected": -606.2733154296875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.05823258310556412, "rewards/margins": 27.491344451904297, "rewards/rejected": -27.433115005493164, "step": 7720 }, { "epoch": 2.63, "learning_rate": 6.899156489991187e-08, "logits/chosen": -0.5583127737045288, "logits/rejected": -0.660963237285614, "logps/chosen": -215.2934112548828, "logps/rejected": -710.6063232421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.058464109897613525, "rewards/margins": 31.50775146484375, "rewards/rejected": -31.566213607788086, "step": 7730 }, { "epoch": 2.63, "learning_rate": 6.83620798187083e-08, "logits/chosen": -0.5221672058105469, "logits/rejected": -0.6099148988723755, "logps/chosen": -279.95513916015625, "logps/rejected": -631.7137451171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.10448112338781357, "rewards/margins": 30.799448013305664, "rewards/rejected": -30.694965362548828, "step": 7740 }, { "epoch": 2.63, "learning_rate": 6.773259473750472e-08, "logits/chosen": -0.6462365984916687, "logits/rejected": -0.7048633098602295, "logps/chosen": -152.09130859375, "logps/rejected": -636.9746704101562, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.2516084611415863, "rewards/margins": 27.048690795898438, "rewards/rejected": -26.797082901000977, "step": 7750 }, { "epoch": 2.64, "learning_rate": 6.710310965630114e-08, "logits/chosen": -0.41301918029785156, "logits/rejected": -0.6227253675460815, "logps/chosen": -390.49737548828125, "logps/rejected": -716.6018676757812, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.11996034532785416, "rewards/margins": 26.5164737701416, "rewards/rejected": -26.39651107788086, "step": 7760 }, { "epoch": 2.64, "learning_rate": 6.647362457509758e-08, "logits/chosen": -0.5801044702529907, "logits/rejected": -0.6831263899803162, "logps/chosen": -220.26693725585938, "logps/rejected": -583.1362915039062, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.10364434868097305, "rewards/margins": 27.01556968688965, "rewards/rejected": -27.119213104248047, "step": 7770 }, { "epoch": 2.64, "learning_rate": 6.584413949389398e-08, "logits/chosen": -0.44967302680015564, "logits/rejected": -0.7430446743965149, "logps/chosen": -299.3879699707031, "logps/rejected": -710.0242919921875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.10787667334079742, "rewards/margins": 24.56003761291504, "rewards/rejected": -24.66791343688965, "step": 7780 }, { "epoch": 2.65, "learning_rate": 6.521465441269042e-08, "logits/chosen": -0.5871652364730835, "logits/rejected": -0.7334288358688354, "logps/chosen": -273.8252258300781, "logps/rejected": -753.6806030273438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.0733601450920105, "rewards/margins": 25.05944061279297, "rewards/rejected": -24.986080169677734, "step": 7790 }, { "epoch": 2.65, "learning_rate": 6.458516933148684e-08, "logits/chosen": -0.551927924156189, "logits/rejected": -0.6943042278289795, "logps/chosen": -215.69668579101562, "logps/rejected": -773.6475830078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.13455446064472198, "rewards/margins": 30.49356460571289, "rewards/rejected": -30.62811851501465, "step": 7800 }, { "epoch": 2.65, "eval_logits/chosen": -0.6479499340057373, "eval_logits/rejected": -0.7246240377426147, "eval_logps/chosen": -219.0480194091797, "eval_logps/rejected": -697.8218994140625, "eval_loss": 0.0015299491351470351, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.08922182768583298, "eval_rewards/margins": 28.20657730102539, "eval_rewards/rejected": -28.295799255371094, "eval_runtime": 538.0739, "eval_samples_per_second": 17.656, "eval_steps_per_second": 0.552, "step": 7800 }, { "epoch": 2.65, "learning_rate": 6.395568425028327e-08, "logits/chosen": -0.5817424058914185, "logits/rejected": -0.704187273979187, "logps/chosen": -152.1357879638672, "logps/rejected": -838.125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.18766473233699799, "rewards/margins": 27.071924209594727, "rewards/rejected": -27.25958824157715, "step": 7810 }, { "epoch": 2.66, "learning_rate": 6.332619916907969e-08, "logits/chosen": -0.5099160075187683, "logits/rejected": -0.7478706240653992, "logps/chosen": -166.5770263671875, "logps/rejected": -722.854736328125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.11689897626638412, "rewards/margins": 29.843563079833984, "rewards/rejected": -29.960460662841797, "step": 7820 }, { "epoch": 2.66, "learning_rate": 6.269671408787612e-08, "logits/chosen": -0.5679564476013184, "logits/rejected": -0.6514891982078552, "logps/chosen": -194.82032775878906, "logps/rejected": -830.7722778320312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.081822969019413, "rewards/margins": 29.314083099365234, "rewards/rejected": -29.39590835571289, "step": 7830 }, { "epoch": 2.66, "learning_rate": 6.206722900667253e-08, "logits/chosen": -0.6603137254714966, "logits/rejected": -0.7334845066070557, "logps/chosen": -231.955810546875, "logps/rejected": -850.2542114257812, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.46990084648132324, "rewards/margins": 25.281177520751953, "rewards/rejected": -25.75107765197754, "step": 7840 }, { "epoch": 2.67, "learning_rate": 6.143774392546897e-08, "logits/chosen": -0.47100362181663513, "logits/rejected": -0.67029869556427, "logps/chosen": -299.054931640625, "logps/rejected": -569.7674560546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.3611525893211365, "rewards/margins": 26.11277198791504, "rewards/rejected": -25.751617431640625, "step": 7850 }, { "epoch": 2.67, "learning_rate": 6.080825884426539e-08, "logits/chosen": -0.503799557685852, "logits/rejected": -0.6653316020965576, "logps/chosen": -234.3039093017578, "logps/rejected": -629.9035034179688, "loss": 0.0023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0732225552201271, "rewards/margins": 29.378376007080078, "rewards/rejected": -29.305150985717773, "step": 7860 }, { "epoch": 2.68, "learning_rate": 6.017877376306182e-08, "logits/chosen": -0.6225731372833252, "logits/rejected": -0.7829440832138062, "logps/chosen": -241.2194061279297, "logps/rejected": -677.2831420898438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2662116587162018, "rewards/margins": 33.18977355957031, "rewards/rejected": -33.45598602294922, "step": 7870 }, { "epoch": 2.68, "learning_rate": 5.954928868185824e-08, "logits/chosen": -0.44252508878707886, "logits/rejected": -0.6326407790184021, "logps/chosen": -295.93450927734375, "logps/rejected": -890.9364013671875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.035824067890644073, "rewards/margins": 26.51907730102539, "rewards/rejected": -26.483251571655273, "step": 7880 }, { "epoch": 2.68, "learning_rate": 5.891980360065466e-08, "logits/chosen": -0.4499892294406891, "logits/rejected": -0.6948333978652954, "logps/chosen": -286.75653076171875, "logps/rejected": -501.9292907714844, "loss": 0.0025, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4740988612174988, "rewards/margins": 25.110280990600586, "rewards/rejected": -25.58437728881836, "step": 7890 }, { "epoch": 2.69, "learning_rate": 5.8290318519451084e-08, "logits/chosen": -0.6147579550743103, "logits/rejected": -0.627052903175354, "logps/chosen": -169.21881103515625, "logps/rejected": -562.7364501953125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.14973610639572144, "rewards/margins": 23.69455337524414, "rewards/rejected": -23.844287872314453, "step": 7900 }, { "epoch": 2.69, "eval_logits/chosen": -0.6446524262428284, "eval_logits/rejected": -0.7196046710014343, "eval_logps/chosen": -219.2213592529297, "eval_logps/rejected": -699.198974609375, "eval_loss": 0.001476667239330709, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10655219107866287, "eval_rewards/margins": 28.326953887939453, "eval_rewards/rejected": -28.433502197265625, "eval_runtime": 536.8174, "eval_samples_per_second": 17.697, "eval_steps_per_second": 0.553, "step": 7900 }, { "epoch": 2.69, "learning_rate": 5.7660833438247514e-08, "logits/chosen": -0.5976423025131226, "logits/rejected": -0.6819183230400085, "logps/chosen": -235.99484252929688, "logps/rejected": -699.0614013671875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.05282628536224365, "rewards/margins": 30.853435516357422, "rewards/rejected": -30.906261444091797, "step": 7910 }, { "epoch": 2.69, "learning_rate": 5.7031348357043937e-08, "logits/chosen": -0.5762828588485718, "logits/rejected": -0.7222844958305359, "logps/chosen": -193.56790161132812, "logps/rejected": -883.3673095703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.4399544596672058, "rewards/margins": 29.639135360717773, "rewards/rejected": -30.079092025756836, "step": 7920 }, { "epoch": 2.7, "learning_rate": 5.640186327584036e-08, "logits/chosen": -0.5836883783340454, "logits/rejected": -0.66538006067276, "logps/chosen": -196.8748779296875, "logps/rejected": -589.0851440429688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.6816442608833313, "rewards/margins": 27.82281494140625, "rewards/rejected": -28.50446128845215, "step": 7930 }, { "epoch": 2.7, "learning_rate": 5.577237819463679e-08, "logits/chosen": -0.5523054599761963, "logits/rejected": -0.7163228392601013, "logps/chosen": -219.42977905273438, "logps/rejected": -533.24560546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.41872626543045044, "rewards/margins": 30.73213768005371, "rewards/rejected": -31.150867462158203, "step": 7940 }, { "epoch": 2.7, "learning_rate": 5.514289311343321e-08, "logits/chosen": -0.4551618695259094, "logits/rejected": -0.7738016247749329, "logps/chosen": -274.9743957519531, "logps/rejected": -756.6781616210938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.2726861238479614, "rewards/margins": 29.430469512939453, "rewards/rejected": -29.157785415649414, "step": 7950 }, { "epoch": 2.71, "learning_rate": 5.4513408032229634e-08, "logits/chosen": -0.657189667224884, "logits/rejected": -0.7436483502388, "logps/chosen": -279.4065856933594, "logps/rejected": -852.6520385742188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.5011743307113647, "rewards/margins": 28.9963321685791, "rewards/rejected": -29.49750328063965, "step": 7960 }, { "epoch": 2.71, "learning_rate": 5.388392295102606e-08, "logits/chosen": -0.5472579002380371, "logits/rejected": -0.7184285521507263, "logps/chosen": -219.3739471435547, "logps/rejected": -525.0343017578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.07874573767185211, "rewards/margins": 26.083267211914062, "rewards/rejected": -26.0045223236084, "step": 7970 }, { "epoch": 2.71, "learning_rate": 5.3254437869822486e-08, "logits/chosen": -0.6013602018356323, "logits/rejected": -0.6921324133872986, "logps/chosen": -153.08262634277344, "logps/rejected": -626.1406860351562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.18267950415611267, "rewards/margins": 30.90728759765625, "rewards/rejected": -31.089969635009766, "step": 7980 }, { "epoch": 2.72, "learning_rate": 5.262495278861891e-08, "logits/chosen": -0.5066236257553101, "logits/rejected": -0.5826975107192993, "logps/chosen": -159.78359985351562, "logps/rejected": -486.80926513671875, "loss": 0.0028, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09906096756458282, "rewards/margins": 24.223087310791016, "rewards/rejected": -24.322145462036133, "step": 7990 }, { "epoch": 2.72, "learning_rate": 5.199546770741533e-08, "logits/chosen": -0.6460464000701904, "logits/rejected": -0.7278153896331787, "logps/chosen": -164.73648071289062, "logps/rejected": -704.9205322265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.22662131488323212, "rewards/margins": 31.245952606201172, "rewards/rejected": -31.472576141357422, "step": 8000 }, { "epoch": 2.72, "eval_logits/chosen": -0.651767373085022, "eval_logits/rejected": -0.7264298796653748, "eval_logps/chosen": -219.6089630126953, "eval_logps/rejected": -703.0481567382812, "eval_loss": 0.0014914135681465268, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.14531363546848297, "eval_rewards/margins": 28.673105239868164, "eval_rewards/rejected": -28.818418502807617, "eval_runtime": 537.046, "eval_samples_per_second": 17.689, "eval_steps_per_second": 0.553, "step": 8000 }, { "epoch": 2.72, "learning_rate": 5.136598262621176e-08, "logits/chosen": -0.7160107493400574, "logits/rejected": -0.6929118037223816, "logps/chosen": -147.16575622558594, "logps/rejected": -656.9649047851562, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.21582171320915222, "rewards/margins": 33.95365905761719, "rewards/rejected": -34.16948318481445, "step": 8010 }, { "epoch": 2.73, "learning_rate": 5.073649754500818e-08, "logits/chosen": -0.7157724499702454, "logits/rejected": -0.7097111940383911, "logps/chosen": -164.52850341796875, "logps/rejected": -581.3948974609375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.00738330464810133, "rewards/margins": 29.4835205078125, "rewards/rejected": -29.49090576171875, "step": 8020 }, { "epoch": 2.73, "learning_rate": 5.01070124638046e-08, "logits/chosen": -0.4424969553947449, "logits/rejected": -0.7745085954666138, "logps/chosen": -250.4312744140625, "logps/rejected": -723.536865234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.1889456808567047, "rewards/margins": 31.114315032958984, "rewards/rejected": -31.303258895874023, "step": 8030 }, { "epoch": 2.73, "learning_rate": 4.947752738260103e-08, "logits/chosen": -0.4692629873752594, "logits/rejected": -0.6507894992828369, "logps/chosen": -257.9976501464844, "logps/rejected": -791.3673706054688, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.12106170505285263, "rewards/margins": 30.219585418701172, "rewards/rejected": -30.340646743774414, "step": 8040 }, { "epoch": 2.74, "learning_rate": 4.884804230139745e-08, "logits/chosen": -0.6958299875259399, "logits/rejected": -0.6302472949028015, "logps/chosen": -157.91952514648438, "logps/rejected": -915.40869140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.22624190151691437, "rewards/margins": 27.237232208251953, "rewards/rejected": -27.010990142822266, "step": 8050 }, { "epoch": 2.74, "learning_rate": 4.8218557220193875e-08, "logits/chosen": -0.3464438021183014, "logits/rejected": -0.6550552248954773, "logps/chosen": -449.53167724609375, "logps/rejected": -725.5526123046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.512336015701294, "rewards/margins": 25.29047966003418, "rewards/rejected": -25.80281639099121, "step": 8060 }, { "epoch": 2.74, "learning_rate": 4.7589072138990305e-08, "logits/chosen": -0.525118350982666, "logits/rejected": -0.6289024949073792, "logps/chosen": -211.45703125, "logps/rejected": -588.2133178710938, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5345368981361389, "rewards/margins": 26.49302101135254, "rewards/rejected": -27.02756118774414, "step": 8070 }, { "epoch": 2.75, "learning_rate": 4.695958705778673e-08, "logits/chosen": -0.4847562909126282, "logits/rejected": -0.7526861429214478, "logps/chosen": -352.0751037597656, "logps/rejected": -471.7979431152344, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.6151416897773743, "rewards/margins": 27.161224365234375, "rewards/rejected": -27.7763671875, "step": 8080 }, { "epoch": 2.75, "learning_rate": 4.633010197658315e-08, "logits/chosen": -0.6102427244186401, "logits/rejected": -0.659328818321228, "logps/chosen": -242.03311157226562, "logps/rejected": -560.2989501953125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.1307191550731659, "rewards/margins": 24.094703674316406, "rewards/rejected": -24.22542381286621, "step": 8090 }, { "epoch": 2.75, "learning_rate": 4.570061689537958e-08, "logits/chosen": -0.40024805068969727, "logits/rejected": -0.68034827709198, "logps/chosen": -362.1919860839844, "logps/rejected": -649.44287109375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.10668216645717621, "rewards/margins": 25.649898529052734, "rewards/rejected": -25.756580352783203, "step": 8100 }, { "epoch": 2.75, "eval_logits/chosen": -0.6437954306602478, "eval_logits/rejected": -0.7189971208572388, "eval_logps/chosen": -219.21351623535156, "eval_logps/rejected": -701.4963989257812, "eval_loss": 0.0014567332109436393, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10577043890953064, "eval_rewards/margins": 28.557479858398438, "eval_rewards/rejected": -28.663249969482422, "eval_runtime": 539.131, "eval_samples_per_second": 17.621, "eval_steps_per_second": 0.551, "step": 8100 }, { "epoch": 2.76, "learning_rate": 4.5071131814176e-08, "logits/chosen": -0.6491408348083496, "logits/rejected": -0.6594247817993164, "logps/chosen": -279.30841064453125, "logps/rejected": -689.2876586914062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.048566412180662155, "rewards/margins": 24.962228775024414, "rewards/rejected": -25.010793685913086, "step": 8110 }, { "epoch": 2.76, "learning_rate": 4.4441646732972425e-08, "logits/chosen": -0.48415637016296387, "logits/rejected": -0.6947034597396851, "logps/chosen": -176.71469116210938, "logps/rejected": -920.9857177734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.32843881845474243, "rewards/margins": 32.09209442138672, "rewards/rejected": -31.76365089416504, "step": 8120 }, { "epoch": 2.76, "learning_rate": 4.3812161651768855e-08, "logits/chosen": -0.5864359736442566, "logits/rejected": -0.6255900859832764, "logps/chosen": -170.56509399414062, "logps/rejected": -718.5611572265625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.3721325993537903, "rewards/margins": 33.517921447753906, "rewards/rejected": -33.145790100097656, "step": 8130 }, { "epoch": 2.77, "learning_rate": 4.318267657056528e-08, "logits/chosen": -0.4358592629432678, "logits/rejected": -0.6944642663002014, "logps/chosen": -242.355712890625, "logps/rejected": -541.7839965820312, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.2944978177547455, "rewards/margins": 21.924625396728516, "rewards/rejected": -22.2191219329834, "step": 8140 }, { "epoch": 2.77, "learning_rate": 4.25531914893617e-08, "logits/chosen": -0.6250282526016235, "logits/rejected": -0.7409448623657227, "logps/chosen": -169.15548706054688, "logps/rejected": -762.7986450195312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.28079918026924133, "rewards/margins": 28.453786849975586, "rewards/rejected": -28.73458480834961, "step": 8150 }, { "epoch": 2.77, "learning_rate": 4.192370640815812e-08, "logits/chosen": -0.4243001341819763, "logits/rejected": -0.6819518208503723, "logps/chosen": -207.8750457763672, "logps/rejected": -576.170166015625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.4195918142795563, "rewards/margins": 26.446273803710938, "rewards/rejected": -26.86586570739746, "step": 8160 }, { "epoch": 2.78, "learning_rate": 4.129422132695455e-08, "logits/chosen": -0.5933112502098083, "logits/rejected": -0.552503228187561, "logps/chosen": -222.5304718017578, "logps/rejected": -608.9012451171875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.12023186683654785, "rewards/margins": 27.92329978942871, "rewards/rejected": -28.043533325195312, "step": 8170 }, { "epoch": 2.78, "learning_rate": 4.0664736245750975e-08, "logits/chosen": -0.5677968859672546, "logits/rejected": -0.6837521195411682, "logps/chosen": -160.3731231689453, "logps/rejected": -588.1012573242188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.28319409489631653, "rewards/margins": 29.483585357666016, "rewards/rejected": -29.200389862060547, "step": 8180 }, { "epoch": 2.78, "learning_rate": 4.00352511645474e-08, "logits/chosen": -0.4128655791282654, "logits/rejected": -0.6643985509872437, "logps/chosen": -241.3448486328125, "logps/rejected": -664.2467041015625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.06391315162181854, "rewards/margins": 26.5832576751709, "rewards/rejected": -26.647174835205078, "step": 8190 }, { "epoch": 2.79, "learning_rate": 3.940576608334383e-08, "logits/chosen": -0.5241934061050415, "logits/rejected": -0.6955603361129761, "logps/chosen": -250.243896484375, "logps/rejected": -675.3636474609375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.11171729862689972, "rewards/margins": 29.10164451599121, "rewards/rejected": -28.98992919921875, "step": 8200 }, { "epoch": 2.79, "eval_logits/chosen": -0.6488181352615356, "eval_logits/rejected": -0.7226865887641907, "eval_logps/chosen": -219.56236267089844, "eval_logps/rejected": -703.7290649414062, "eval_loss": 0.0014862061943858862, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.1406533569097519, "eval_rewards/margins": 28.74586296081543, "eval_rewards/rejected": -28.88651466369629, "eval_runtime": 537.8512, "eval_samples_per_second": 17.663, "eval_steps_per_second": 0.552, "step": 8200 }, { "epoch": 2.79, "learning_rate": 3.877628100214025e-08, "logits/chosen": -0.49394768476486206, "logits/rejected": -0.6807708740234375, "logps/chosen": -233.87179565429688, "logps/rejected": -699.4337158203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.025788113474845886, "rewards/margins": 28.2269287109375, "rewards/rejected": -28.252716064453125, "step": 8210 }, { "epoch": 2.79, "learning_rate": 3.814679592093667e-08, "logits/chosen": -0.5936201810836792, "logits/rejected": -0.7076500058174133, "logps/chosen": -269.7145080566406, "logps/rejected": -895.7030029296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.23743422329425812, "rewards/margins": 30.35300064086914, "rewards/rejected": -30.59043312072754, "step": 8220 }, { "epoch": 2.8, "learning_rate": 3.75173108397331e-08, "logits/chosen": -0.5975568890571594, "logits/rejected": -0.6831235885620117, "logps/chosen": -207.77133178710938, "logps/rejected": -819.8943481445312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.049438439309597015, "rewards/margins": 34.66115951538086, "rewards/rejected": -34.710601806640625, "step": 8230 }, { "epoch": 2.8, "learning_rate": 3.688782575852952e-08, "logits/chosen": -0.5707891583442688, "logits/rejected": -0.6673256158828735, "logps/chosen": -148.22561645507812, "logps/rejected": -522.9525756835938, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.42146244645118713, "rewards/margins": 23.92791175842285, "rewards/rejected": -24.349374771118164, "step": 8240 }, { "epoch": 2.8, "learning_rate": 3.625834067732594e-08, "logits/chosen": -0.4710753560066223, "logits/rejected": -0.647879958152771, "logps/chosen": -288.81536865234375, "logps/rejected": -709.2244262695312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.06588177382946014, "rewards/margins": 28.727365493774414, "rewards/rejected": -28.793249130249023, "step": 8250 }, { "epoch": 2.81, "learning_rate": 3.562885559612237e-08, "logits/chosen": -0.4954242706298828, "logits/rejected": -0.6366100311279297, "logps/chosen": -255.5436553955078, "logps/rejected": -765.2257690429688, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.5619696378707886, "rewards/margins": 27.733036041259766, "rewards/rejected": -27.171062469482422, "step": 8260 }, { "epoch": 2.81, "learning_rate": 3.499937051491879e-08, "logits/chosen": -0.633447527885437, "logits/rejected": -0.6889923810958862, "logps/chosen": -168.8153533935547, "logps/rejected": -679.3703002929688, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.5872045755386353, "rewards/margins": 33.28185272216797, "rewards/rejected": -33.86906051635742, "step": 8270 }, { "epoch": 2.81, "learning_rate": 3.4369885433715216e-08, "logits/chosen": -0.7627745866775513, "logits/rejected": -0.6821005940437317, "logps/chosen": -141.88217163085938, "logps/rejected": -514.2303466796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.02805689349770546, "rewards/margins": 25.854990005493164, "rewards/rejected": -25.883047103881836, "step": 8280 }, { "epoch": 2.82, "learning_rate": 3.3740400352511645e-08, "logits/chosen": -0.7097324728965759, "logits/rejected": -0.7636481523513794, "logps/chosen": -142.22415161132812, "logps/rejected": -909.1417236328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.384767085313797, "rewards/margins": 27.93222999572754, "rewards/rejected": -28.31699562072754, "step": 8290 }, { "epoch": 2.82, "learning_rate": 3.311091527130807e-08, "logits/chosen": -0.5623653531074524, "logits/rejected": -0.6749969124794006, "logps/chosen": -181.60995483398438, "logps/rejected": -509.1392517089844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2057819664478302, "rewards/margins": 25.134441375732422, "rewards/rejected": -25.340221405029297, "step": 8300 }, { "epoch": 2.82, "eval_logits/chosen": -0.6533653140068054, "eval_logits/rejected": -0.7272326350212097, "eval_logps/chosen": -219.68392944335938, "eval_logps/rejected": -704.0963134765625, "eval_loss": 0.0014126788591966033, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.15281014144420624, "eval_rewards/margins": 28.770429611206055, "eval_rewards/rejected": -28.923242568969727, "eval_runtime": 538.75, "eval_samples_per_second": 17.633, "eval_steps_per_second": 0.551, "step": 8300 }, { "epoch": 2.82, "learning_rate": 3.248143019010449e-08, "logits/chosen": -0.5943297147750854, "logits/rejected": -0.6544386148452759, "logps/chosen": -154.3319091796875, "logps/rejected": -760.77587890625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.44339531660079956, "rewards/margins": 26.949169158935547, "rewards/rejected": -27.392566680908203, "step": 8310 }, { "epoch": 2.83, "learning_rate": 3.1851945108900914e-08, "logits/chosen": -0.6508817076683044, "logits/rejected": -0.6619390249252319, "logps/chosen": -226.60861206054688, "logps/rejected": -778.4644775390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.43027907609939575, "rewards/margins": 28.432443618774414, "rewards/rejected": -28.862722396850586, "step": 8320 }, { "epoch": 2.83, "learning_rate": 3.122246002769734e-08, "logits/chosen": -0.6981472373008728, "logits/rejected": -0.6996028423309326, "logps/chosen": -158.33181762695312, "logps/rejected": -623.4542236328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.35416629910469055, "rewards/margins": 28.8267879486084, "rewards/rejected": -29.180950164794922, "step": 8330 }, { "epoch": 2.83, "learning_rate": 3.0592974946493766e-08, "logits/chosen": -0.5392864942550659, "logits/rejected": -0.8186851739883423, "logps/chosen": -205.0666046142578, "logps/rejected": -609.2022094726562, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.11573304980993271, "rewards/margins": 31.735177993774414, "rewards/rejected": -31.61944580078125, "step": 8340 }, { "epoch": 2.84, "learning_rate": 2.996348986529019e-08, "logits/chosen": -0.5038381814956665, "logits/rejected": -0.6672341823577881, "logps/chosen": -297.0650634765625, "logps/rejected": -577.3687744140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3614363670349121, "rewards/margins": 26.931310653686523, "rewards/rejected": -27.29274559020996, "step": 8350 }, { "epoch": 2.84, "learning_rate": 2.9334004784086618e-08, "logits/chosen": -0.506759762763977, "logits/rejected": -0.6892939209938049, "logps/chosen": -177.43885803222656, "logps/rejected": -718.6201171875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.19743594527244568, "rewards/margins": 27.469079971313477, "rewards/rejected": -27.666515350341797, "step": 8360 }, { "epoch": 2.85, "learning_rate": 2.870451970288304e-08, "logits/chosen": -0.42665451765060425, "logits/rejected": -0.6902209520339966, "logps/chosen": -295.36474609375, "logps/rejected": -763.1085205078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.2218431532382965, "rewards/margins": 27.496353149414062, "rewards/rejected": -27.71819496154785, "step": 8370 }, { "epoch": 2.85, "learning_rate": 2.8075034621679467e-08, "logits/chosen": -0.4955870509147644, "logits/rejected": -0.6362816095352173, "logps/chosen": -340.106201171875, "logps/rejected": -451.201904296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.40474003553390503, "rewards/margins": 23.4906005859375, "rewards/rejected": -23.085859298706055, "step": 8380 }, { "epoch": 2.85, "learning_rate": 2.744554954047589e-08, "logits/chosen": -0.5625302195549011, "logits/rejected": -0.7450130581855774, "logps/chosen": -162.1681365966797, "logps/rejected": -681.9586791992188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.2606451213359833, "rewards/margins": 28.19317054748535, "rewards/rejected": -28.453815460205078, "step": 8390 }, { "epoch": 2.86, "learning_rate": 2.6816064459272312e-08, "logits/chosen": -0.4746534824371338, "logits/rejected": -0.6074628829956055, "logps/chosen": -325.8948974609375, "logps/rejected": -747.2674560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.11365803331136703, "rewards/margins": 30.380046844482422, "rewards/rejected": -30.266387939453125, "step": 8400 }, { "epoch": 2.86, "eval_logits/chosen": -0.6491891145706177, "eval_logits/rejected": -0.7244004011154175, "eval_logps/chosen": -219.3522491455078, "eval_logps/rejected": -702.4371337890625, "eval_loss": 0.001346489298157394, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.11964266002178192, "eval_rewards/margins": 28.63768196105957, "eval_rewards/rejected": -28.757322311401367, "eval_runtime": 537.4644, "eval_samples_per_second": 17.676, "eval_steps_per_second": 0.553, "step": 8400 }, { "epoch": 2.86, "learning_rate": 2.618657937806874e-08, "logits/chosen": -0.6316782236099243, "logits/rejected": -0.7068917155265808, "logps/chosen": -228.46426391601562, "logps/rejected": -645.2169799804688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.012070173397660255, "rewards/margins": 26.425586700439453, "rewards/rejected": -26.413516998291016, "step": 8410 }, { "epoch": 2.86, "learning_rate": 2.555709429686516e-08, "logits/chosen": -0.5973232388496399, "logits/rejected": -0.7387608289718628, "logps/chosen": -296.6229553222656, "logps/rejected": -670.135009765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.18199920654296875, "rewards/margins": 26.12013816833496, "rewards/rejected": -25.938140869140625, "step": 8420 }, { "epoch": 2.87, "learning_rate": 2.4927609215661587e-08, "logits/chosen": -0.5798535943031311, "logits/rejected": -0.6555114984512329, "logps/chosen": -171.92840576171875, "logps/rejected": -535.505859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.5525417327880859, "rewards/margins": 28.26711654663086, "rewards/rejected": -28.819660186767578, "step": 8430 }, { "epoch": 2.87, "learning_rate": 2.4298124134458013e-08, "logits/chosen": -0.6877808570861816, "logits/rejected": -0.6417495608329773, "logps/chosen": -171.9190673828125, "logps/rejected": -763.720947265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.2979416847229004, "rewards/margins": 28.518457412719727, "rewards/rejected": -28.8164005279541, "step": 8440 }, { "epoch": 2.87, "learning_rate": 2.3668639053254436e-08, "logits/chosen": -0.4996717572212219, "logits/rejected": -0.6960971355438232, "logps/chosen": -167.75746154785156, "logps/rejected": -546.6564331054688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8915015459060669, "rewards/margins": 26.126300811767578, "rewards/rejected": -27.017803192138672, "step": 8450 }, { "epoch": 2.88, "learning_rate": 2.3039153972050862e-08, "logits/chosen": -0.6731133460998535, "logits/rejected": -0.7562659382820129, "logps/chosen": -222.4777374267578, "logps/rejected": -877.9293212890625, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.029060805216431618, "rewards/margins": 29.730199813842773, "rewards/rejected": -29.701141357421875, "step": 8460 }, { "epoch": 2.88, "learning_rate": 2.2409668890847285e-08, "logits/chosen": -0.5237163305282593, "logits/rejected": -0.6719953417778015, "logps/chosen": -165.78517150878906, "logps/rejected": -665.7965087890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.32926905155181885, "rewards/margins": 26.77447509765625, "rewards/rejected": -27.103744506835938, "step": 8470 }, { "epoch": 2.88, "learning_rate": 2.178018380964371e-08, "logits/chosen": -0.6638740301132202, "logits/rejected": -0.7178734540939331, "logps/chosen": -224.12789916992188, "logps/rejected": -746.2756958007812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.5556403398513794, "rewards/margins": 31.98124122619629, "rewards/rejected": -32.53688430786133, "step": 8480 }, { "epoch": 2.89, "learning_rate": 2.1150698728440137e-08, "logits/chosen": -0.6383403539657593, "logits/rejected": -0.7099062204360962, "logps/chosen": -207.8087615966797, "logps/rejected": -646.2130126953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.03606262430548668, "rewards/margins": 30.389179229736328, "rewards/rejected": -30.425243377685547, "step": 8490 }, { "epoch": 2.89, "learning_rate": 2.052121364723656e-08, "logits/chosen": -0.645229697227478, "logits/rejected": -0.6613473296165466, "logps/chosen": -161.66293334960938, "logps/rejected": -658.2742919921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3301275372505188, "rewards/margins": 27.462900161743164, "rewards/rejected": -27.793025970458984, "step": 8500 }, { "epoch": 2.89, "eval_logits/chosen": -0.6518352627754211, "eval_logits/rejected": -0.7276310324668884, "eval_logps/chosen": -219.69773864746094, "eval_logps/rejected": -704.3861083984375, "eval_loss": 0.0013085852842777967, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.15419186651706696, "eval_rewards/margins": 28.798019409179688, "eval_rewards/rejected": -28.952213287353516, "eval_runtime": 538.4303, "eval_samples_per_second": 17.644, "eval_steps_per_second": 0.552, "step": 8500 }, { "epoch": 2.89, "learning_rate": 1.9891728566032983e-08, "logits/chosen": -0.4789894223213196, "logits/rejected": -0.6566998958587646, "logps/chosen": -226.34912109375, "logps/rejected": -554.7811889648438, "loss": 0.0052, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6175805330276489, "rewards/margins": 29.08280372619629, "rewards/rejected": -29.700387954711914, "step": 8510 }, { "epoch": 2.9, "learning_rate": 1.926224348482941e-08, "logits/chosen": -0.5758837461471558, "logits/rejected": -0.7699673771858215, "logps/chosen": -277.26806640625, "logps/rejected": -668.8990478515625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.05915145203471184, "rewards/margins": 31.109533309936523, "rewards/rejected": -31.05038833618164, "step": 8520 }, { "epoch": 2.9, "learning_rate": 1.863275840362583e-08, "logits/chosen": -0.4830097258090973, "logits/rejected": -0.6517111659049988, "logps/chosen": -173.46011352539062, "logps/rejected": -833.8435668945312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3971610367298126, "rewards/margins": 30.616695404052734, "rewards/rejected": -31.01386070251465, "step": 8530 }, { "epoch": 2.9, "learning_rate": 1.8003273322422258e-08, "logits/chosen": -0.523481011390686, "logits/rejected": -0.5769650340080261, "logps/chosen": -170.96334838867188, "logps/rejected": -711.47021484375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.2416374683380127, "rewards/margins": 26.643234252929688, "rewards/rejected": -26.884876251220703, "step": 8540 }, { "epoch": 2.91, "learning_rate": 1.737378824121868e-08, "logits/chosen": -0.5567072629928589, "logits/rejected": -0.6633394956588745, "logps/chosen": -162.2719268798828, "logps/rejected": -695.4423828125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.5032995939254761, "rewards/margins": 29.034149169921875, "rewards/rejected": -29.53744888305664, "step": 8550 }, { "epoch": 2.91, "learning_rate": 1.6744303160015107e-08, "logits/chosen": -0.6423370838165283, "logits/rejected": -0.676962673664093, "logps/chosen": -224.5527801513672, "logps/rejected": -658.9525146484375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.20928898453712463, "rewards/margins": 26.864093780517578, "rewards/rejected": -27.073379516601562, "step": 8560 }, { "epoch": 2.91, "learning_rate": 1.6114818078811533e-08, "logits/chosen": -0.5528497695922852, "logits/rejected": -0.6786164045333862, "logps/chosen": -227.97909545898438, "logps/rejected": -797.9507446289062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.26104599237442017, "rewards/margins": 27.97920799255371, "rewards/rejected": -28.240253448486328, "step": 8570 }, { "epoch": 2.92, "learning_rate": 1.5485332997607955e-08, "logits/chosen": -0.5983594655990601, "logits/rejected": -0.6968386769294739, "logps/chosen": -164.4805908203125, "logps/rejected": -675.7778930664062, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.29642683267593384, "rewards/margins": 29.330509185791016, "rewards/rejected": -29.034082412719727, "step": 8580 }, { "epoch": 2.92, "learning_rate": 1.485584791640438e-08, "logits/chosen": -0.49407944083213806, "logits/rejected": -0.6636226773262024, "logps/chosen": -227.4563751220703, "logps/rejected": -679.5933227539062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.32022902369499207, "rewards/margins": 25.81024169921875, "rewards/rejected": -26.130468368530273, "step": 8590 }, { "epoch": 2.92, "learning_rate": 1.4226362835200804e-08, "logits/chosen": -0.5755038261413574, "logits/rejected": -0.7046786546707153, "logps/chosen": -165.39279174804688, "logps/rejected": -716.6422119140625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.07122356444597244, "rewards/margins": 26.726343154907227, "rewards/rejected": -26.797565460205078, "step": 8600 }, { "epoch": 2.92, "eval_logits/chosen": -0.6425817608833313, "eval_logits/rejected": -0.7180920243263245, "eval_logps/chosen": -219.04080200195312, "eval_logps/rejected": -700.9456176757812, "eval_loss": 0.0013144640251994133, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.08849843591451645, "eval_rewards/margins": 28.519668579101562, "eval_rewards/rejected": -28.60816764831543, "eval_runtime": 538.8627, "eval_samples_per_second": 17.63, "eval_steps_per_second": 0.551, "step": 8600 }, { "epoch": 2.93, "learning_rate": 1.3596877753997229e-08, "logits/chosen": -0.4481170177459717, "logits/rejected": -0.7226850986480713, "logps/chosen": -301.50238037109375, "logps/rejected": -563.1032104492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.07532156258821487, "rewards/margins": 29.302698135375977, "rewards/rejected": -29.378021240234375, "step": 8610 }, { "epoch": 2.93, "learning_rate": 1.2967392672793655e-08, "logits/chosen": -0.6746814846992493, "logits/rejected": -0.7281866073608398, "logps/chosen": -178.777099609375, "logps/rejected": -915.0427856445312, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.339714914560318, "rewards/margins": 25.291259765625, "rewards/rejected": -25.6309757232666, "step": 8620 }, { "epoch": 2.93, "learning_rate": 1.233790759159008e-08, "logits/chosen": -0.49744996428489685, "logits/rejected": -0.6918049454689026, "logps/chosen": -200.49749755859375, "logps/rejected": -651.6624755859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.43716326355934143, "rewards/margins": 26.582727432250977, "rewards/rejected": -27.019887924194336, "step": 8630 }, { "epoch": 2.94, "learning_rate": 1.1708422510386504e-08, "logits/chosen": -0.5775032043457031, "logits/rejected": -0.7401893734931946, "logps/chosen": -273.8330383300781, "logps/rejected": -678.9024658203125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.06406418234109879, "rewards/margins": 30.892353057861328, "rewards/rejected": -30.828289031982422, "step": 8640 }, { "epoch": 2.94, "learning_rate": 1.1078937429182926e-08, "logits/chosen": -0.48577064275741577, "logits/rejected": -0.6799474954605103, "logps/chosen": -343.5271911621094, "logps/rejected": -778.7496337890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.0006717324140481651, "rewards/margins": 26.450979232788086, "rewards/rejected": -26.451650619506836, "step": 8650 }, { "epoch": 2.94, "learning_rate": 1.0449452347979353e-08, "logits/chosen": -0.3917025923728943, "logits/rejected": -0.7163748741149902, "logps/chosen": -278.08843994140625, "logps/rejected": -666.14990234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.37205690145492554, "rewards/margins": 25.820392608642578, "rewards/rejected": -25.44833755493164, "step": 8660 }, { "epoch": 2.95, "learning_rate": 9.819967266775777e-09, "logits/chosen": -0.6154365539550781, "logits/rejected": -0.5894041061401367, "logps/chosen": -169.3626251220703, "logps/rejected": -760.666259765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.14306248724460602, "rewards/margins": 31.284704208374023, "rewards/rejected": -31.427764892578125, "step": 8670 }, { "epoch": 2.95, "learning_rate": 9.190482185572201e-09, "logits/chosen": -0.4520339071750641, "logits/rejected": -0.6658391952514648, "logps/chosen": -280.23980712890625, "logps/rejected": -859.0458984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012914479011669755, "rewards/margins": 30.23415756225586, "rewards/rejected": -30.232864379882812, "step": 8680 }, { "epoch": 2.95, "learning_rate": 8.560997104368626e-09, "logits/chosen": -0.5013027191162109, "logits/rejected": -0.6156641244888306, "logps/chosen": -207.96395874023438, "logps/rejected": -554.1871337890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.08347469568252563, "rewards/margins": 25.316104888916016, "rewards/rejected": -25.232629776000977, "step": 8690 }, { "epoch": 2.96, "learning_rate": 7.931512023165052e-09, "logits/chosen": -0.43391793966293335, "logits/rejected": -0.5986698269844055, "logps/chosen": -242.9003448486328, "logps/rejected": -632.6611328125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.6143509149551392, "rewards/margins": 23.731985092163086, "rewards/rejected": -24.34633445739746, "step": 8700 }, { "epoch": 2.96, "eval_logits/chosen": -0.6428853273391724, "eval_logits/rejected": -0.7189819812774658, "eval_logps/chosen": -219.0594482421875, "eval_logps/rejected": -700.7509765625, "eval_loss": 0.0013149393489584327, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.0903618261218071, "eval_rewards/margins": 28.498336791992188, "eval_rewards/rejected": -28.588699340820312, "eval_runtime": 539.0008, "eval_samples_per_second": 17.625, "eval_steps_per_second": 0.551, "step": 8700 }, { "epoch": 2.96, "learning_rate": 7.3020269419614755e-09, "logits/chosen": -0.5260372161865234, "logits/rejected": -0.7232301831245422, "logps/chosen": -169.69837951660156, "logps/rejected": -856.2943115234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.37946170568466187, "rewards/margins": 27.055599212646484, "rewards/rejected": -27.4350643157959, "step": 8710 }, { "epoch": 2.96, "learning_rate": 6.6725418607579e-09, "logits/chosen": -0.5516884922981262, "logits/rejected": -0.7399289011955261, "logps/chosen": -224.728515625, "logps/rejected": -777.5650634765625, "loss": 0.0015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2009330689907074, "rewards/margins": 30.44219970703125, "rewards/rejected": -30.64312744140625, "step": 8720 }, { "epoch": 2.97, "learning_rate": 6.043056779554324e-09, "logits/chosen": -0.7349728941917419, "logits/rejected": -0.6249482035636902, "logps/chosen": -167.81991577148438, "logps/rejected": -856.3955078125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.10368845611810684, "rewards/margins": 25.81661033630371, "rewards/rejected": -25.920297622680664, "step": 8730 }, { "epoch": 2.97, "learning_rate": 5.41357169835075e-09, "logits/chosen": -0.5010379552841187, "logits/rejected": -0.6831159591674805, "logps/chosen": -234.1759033203125, "logps/rejected": -689.0057373046875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.14692345261573792, "rewards/margins": 31.09639549255371, "rewards/rejected": -31.24332046508789, "step": 8740 }, { "epoch": 2.97, "learning_rate": 4.784086617147173e-09, "logits/chosen": -0.4989122748374939, "logits/rejected": -0.7627394199371338, "logps/chosen": -340.4763488769531, "logps/rejected": -569.5533447265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.04225959628820419, "rewards/margins": 27.484817504882812, "rewards/rejected": -27.442556381225586, "step": 8750 }, { "epoch": 2.98, "learning_rate": 4.1546015359435984e-09, "logits/chosen": -0.4738582968711853, "logits/rejected": -0.7103700637817383, "logps/chosen": -281.80133056640625, "logps/rejected": -672.8568725585938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.3794795572757721, "rewards/margins": 28.228496551513672, "rewards/rejected": -28.60797119140625, "step": 8760 }, { "epoch": 2.98, "learning_rate": 3.5251164547400225e-09, "logits/chosen": -0.4409480690956116, "logits/rejected": -0.6716551184654236, "logps/chosen": -276.3580322265625, "logps/rejected": -703.5185546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3186389207839966, "rewards/margins": 27.233783721923828, "rewards/rejected": -27.55242347717285, "step": 8770 }, { "epoch": 2.98, "learning_rate": 2.895631373536447e-09, "logits/chosen": -0.4486841559410095, "logits/rejected": -0.6549838781356812, "logps/chosen": -217.8758544921875, "logps/rejected": -521.5465087890625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.14246520400047302, "rewards/margins": 25.34515953063965, "rewards/rejected": -25.487625122070312, "step": 8780 }, { "epoch": 2.99, "learning_rate": 2.2661462923328713e-09, "logits/chosen": -0.5399130582809448, "logits/rejected": -0.6219548583030701, "logps/chosen": -219.319091796875, "logps/rejected": -694.158935546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.42632991075515747, "rewards/margins": 26.378992080688477, "rewards/rejected": -26.80531883239746, "step": 8790 }, { "epoch": 2.99, "learning_rate": 1.6366612111292962e-09, "logits/chosen": -0.5926957130432129, "logits/rejected": -0.6558809876441956, "logps/chosen": -162.33450317382812, "logps/rejected": -602.2572631835938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.7957462668418884, "rewards/margins": 29.334264755249023, "rewards/rejected": -30.130008697509766, "step": 8800 }, { "epoch": 2.99, "eval_logits/chosen": -0.6429576873779297, "eval_logits/rejected": -0.7193523645401001, "eval_logps/chosen": -219.0537872314453, "eval_logps/rejected": -700.7207641601562, "eval_loss": 0.0013058752520009875, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.08979782462120056, "eval_rewards/margins": 28.49588394165039, "eval_rewards/rejected": -28.585681915283203, "eval_runtime": 538.794, "eval_samples_per_second": 17.632, "eval_steps_per_second": 0.551, "step": 8800 }, { "epoch": 2.99, "learning_rate": 1.0071761299257208e-09, "logits/chosen": -0.5969603657722473, "logits/rejected": -0.6630970239639282, "logps/chosen": -181.35720825195312, "logps/rejected": -719.54052734375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.38600805401802063, "rewards/margins": 28.389751434326172, "rewards/rejected": -28.775760650634766, "step": 8810 }, { "epoch": 3.0, "learning_rate": 3.7769104872214527e-10, "logits/chosen": -0.5753879547119141, "logits/rejected": -0.6362535953521729, "logps/chosen": -173.07064819335938, "logps/rejected": -556.792724609375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.36847108602523804, "rewards/margins": 25.675556182861328, "rewards/rejected": -26.04402732849121, "step": 8820 }, { "epoch": 3.0, "step": 8826, "total_flos": 0.0, "train_loss": 0.029375145110450352, "train_runtime": 110770.2779, "train_samples_per_second": 5.099, "train_steps_per_second": 0.08 } ], "logging_steps": 10, "max_steps": 8826, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }