{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 7.936507936507937e-08, "logits/chosen": 0.629372239112854, "logits/rejected": 0.5832597017288208, "logps/chosen": -232.8815155029297, "logps/rejected": -282.9345703125, "loss": 250000.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 7.936507936507937e-07, "logits/chosen": 0.8544880747795105, "logits/rejected": 0.8044350147247314, "logps/chosen": -320.2265625, "logps/rejected": -265.1168518066406, "loss": 249944.1944, "rewards/accuracies": 0.3263888955116272, "rewards/chosen": -2.933910582214594e-05, "rewards/margins": -7.780968735460192e-05, "rewards/rejected": 4.847058153245598e-05, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.5873015873015873e-06, "logits/chosen": 0.8017373085021973, "logits/rejected": 0.9183248281478882, "logps/chosen": -312.8671875, "logps/rejected": -272.638671875, "loss": 249967.3, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.00025959216873161495, "rewards/margins": 1.0932203622360248e-05, "rewards/rejected": -0.0002705243823584169, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.380952380952381e-06, "logits/chosen": 0.7464914917945862, "logits/rejected": 0.8079172372817993, "logps/chosen": -311.29766845703125, "logps/rejected": -305.3917541503906, "loss": 249878.775, "rewards/accuracies": 0.53125, "rewards/chosen": -0.001269679982215166, "rewards/margins": 0.00016684313595760614, "rewards/rejected": -0.001436523045413196, "step": 30 }, { "epoch": 0.06, "learning_rate": 3.1746031746031746e-06, "logits/chosen": 0.7881044149398804, "logits/rejected": 0.754567563533783, "logps/chosen": -318.0370788574219, "logps/rejected": -303.98504638671875, "loss": 249494.675, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.003600001335144043, "rewards/margins": 0.0004992207395844162, "rewards/rejected": -0.004099222365766764, "step": 40 }, { "epoch": 0.08, "learning_rate": 3.968253968253968e-06, "logits/chosen": 0.7520231604576111, "logits/rejected": 0.8153010606765747, "logps/chosen": -326.61138916015625, "logps/rejected": -288.4171447753906, "loss": 248735.45, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0066975317895412445, "rewards/margins": 0.0012643480440601707, "rewards/rejected": -0.007961880415678024, "step": 50 }, { "epoch": 0.1, "learning_rate": 4.761904761904762e-06, "logits/chosen": 0.7390514612197876, "logits/rejected": 0.8621580004692078, "logps/chosen": -345.018310546875, "logps/rejected": -317.90765380859375, "loss": 247854.35, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.014267112128436565, "rewards/margins": 0.0019328411435708404, "rewards/rejected": -0.016199951991438866, "step": 60 }, { "epoch": 0.11, "learning_rate": 4.998086282661188e-06, "logits/chosen": 0.7415452599525452, "logits/rejected": 0.7531384229660034, "logps/chosen": -290.64874267578125, "logps/rejected": -292.1299743652344, "loss": 245170.275, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.024201344698667526, "rewards/margins": 0.00474986806511879, "rewards/rejected": -0.028951212763786316, "step": 70 }, { "epoch": 0.13, "learning_rate": 4.988720025682995e-06, "logits/chosen": 0.6506584882736206, "logits/rejected": 0.7040095925331116, "logps/chosen": -358.07318115234375, "logps/rejected": -349.37615966796875, "loss": 243755.15, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0412554033100605, "rewards/margins": 0.0057886759750545025, "rewards/rejected": -0.04704408347606659, "step": 80 }, { "epoch": 0.14, "learning_rate": 4.9715789537359126e-06, "logits/chosen": 0.5937181115150452, "logits/rejected": 0.673678994178772, "logps/chosen": -381.2982177734375, "logps/rejected": -360.83782958984375, "loss": 240048.85, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.053672052919864655, "rewards/margins": 0.01033872365951538, "rewards/rejected": -0.06401076912879944, "step": 90 }, { "epoch": 0.16, "learning_rate": 4.946716615897932e-06, "logits/chosen": 0.5546287298202515, "logits/rejected": 0.6209256052970886, "logps/chosen": -370.5368957519531, "logps/rejected": -350.92095947265625, "loss": 240275.15, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.06720001995563507, "rewards/margins": 0.011619504541158676, "rewards/rejected": -0.07881952822208405, "step": 100 }, { "epoch": 0.18, "learning_rate": 4.9142106826480114e-06, "logits/chosen": 0.5148851871490479, "logits/rejected": 0.5459151268005371, "logps/chosen": -349.8558044433594, "logps/rejected": -368.8512268066406, "loss": 241037.75, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07670430094003677, "rewards/margins": 0.00954846478998661, "rewards/rejected": -0.08625277131795883, "step": 110 }, { "epoch": 0.19, "learning_rate": 4.874162703221823e-06, "logits/chosen": 0.4573546350002289, "logits/rejected": 0.4861672818660736, "logps/chosen": -383.19659423828125, "logps/rejected": -384.38238525390625, "loss": 235026.15, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09026693552732468, "rewards/margins": 0.018047746270895004, "rewards/rejected": -0.10831467807292938, "step": 120 }, { "epoch": 0.21, "learning_rate": 4.826697788369752e-06, "logits/chosen": 0.30501729249954224, "logits/rejected": 0.4133021831512451, "logps/chosen": -433.96710205078125, "logps/rejected": -439.2042541503906, "loss": 228395.55, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11907751858234406, "rewards/margins": 0.028610652312636375, "rewards/rejected": -0.14768816530704498, "step": 130 }, { "epoch": 0.22, "learning_rate": 4.7719642195082224e-06, "logits/chosen": 0.24359706044197083, "logits/rejected": 0.23563845455646515, "logps/chosen": -455.366455078125, "logps/rejected": -487.98236083984375, "loss": 221256.2, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.15958473086357117, "rewards/margins": 0.035430677235126495, "rewards/rejected": -0.19501543045043945, "step": 140 }, { "epoch": 0.24, "learning_rate": 4.710132985485355e-06, "logits/chosen": 0.11840543895959854, "logits/rejected": 0.16418686509132385, "logps/chosen": -483.0514221191406, "logps/rejected": -533.7542114257812, "loss": 215126.425, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20697049796581268, "rewards/margins": 0.05109601095318794, "rewards/rejected": -0.2580665051937103, "step": 150 }, { "epoch": 0.26, "learning_rate": 4.641397248408122e-06, "logits/chosen": 0.01182345487177372, "logits/rejected": -0.050421517342329025, "logps/chosen": -665.0191040039062, "logps/rejected": -694.4324951171875, "loss": 214404.125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.33108025789260864, "rewards/margins": 0.04403197020292282, "rewards/rejected": -0.3751122057437897, "step": 160 }, { "epoch": 0.27, "learning_rate": 4.5659717401997655e-06, "logits/chosen": -0.1355719119310379, "logits/rejected": -0.06722792237997055, "logps/chosen": -665.4908447265625, "logps/rejected": -698.1319580078125, "loss": 224226.1, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.34724271297454834, "rewards/margins": 0.05173974484205246, "rewards/rejected": -0.398982435464859, "step": 170 }, { "epoch": 0.29, "learning_rate": 4.4840920917726425e-06, "logits/chosen": -0.11122943460941315, "logits/rejected": -0.0640818327665329, "logps/chosen": -615.5494384765625, "logps/rejected": -655.942626953125, "loss": 215075.25, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2923824191093445, "rewards/margins": 0.05855263024568558, "rewards/rejected": -0.35093507170677185, "step": 180 }, { "epoch": 0.3, "learning_rate": 4.396014096912182e-06, "logits/chosen": -0.15400852262973785, "logits/rejected": -0.1399744302034378, "logps/chosen": -680.15283203125, "logps/rejected": -732.2891845703125, "loss": 221556.075, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3797221779823303, "rewards/margins": 0.06087984889745712, "rewards/rejected": -0.44060200452804565, "step": 190 }, { "epoch": 0.32, "learning_rate": 4.302012913171584e-06, "logits/chosen": -0.08666789531707764, "logits/rejected": -0.09571169316768646, "logps/chosen": -724.3123779296875, "logps/rejected": -767.0538330078125, "loss": 221729.975, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.4401639401912689, "rewards/margins": 0.056829143315553665, "rewards/rejected": -0.4969930648803711, "step": 200 }, { "epoch": 0.34, "learning_rate": 4.202382202273702e-06, "logits/chosen": -0.09156730026006699, "logits/rejected": -0.0777561292052269, "logps/chosen": -761.5457763671875, "logps/rejected": -774.8024291992188, "loss": 221646.825, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.3868615925312042, "rewards/margins": 0.06258244067430496, "rewards/rejected": -0.4494439959526062, "step": 210 }, { "epoch": 0.35, "learning_rate": 4.097433212705492e-06, "logits/chosen": -0.07382290065288544, "logits/rejected": -0.02682422660291195, "logps/chosen": -640.3363037109375, "logps/rejected": -678.830810546875, "loss": 218399.6, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.336215078830719, "rewards/margins": 0.04152713343501091, "rewards/rejected": -0.377742201089859, "step": 220 }, { "epoch": 0.37, "learning_rate": 3.987493807371033e-06, "logits/chosen": -0.04611346125602722, "logits/rejected": -0.00773113826289773, "logps/chosen": -662.879638671875, "logps/rejected": -710.36767578125, "loss": 208091.175, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3531550467014313, "rewards/margins": 0.06838408857584, "rewards/rejected": -0.4215391278266907, "step": 230 }, { "epoch": 0.38, "learning_rate": 3.872907439340758e-06, "logits/chosen": -0.10264978557825089, "logits/rejected": -0.05322417616844177, "logps/chosen": -591.2621459960938, "logps/rejected": -644.683837890625, "loss": 213061.95, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3143499493598938, "rewards/margins": 0.06685332953929901, "rewards/rejected": -0.381203293800354, "step": 240 }, { "epoch": 0.4, "learning_rate": 3.75403207889666e-06, "logits/chosen": -0.07593805342912674, "logits/rejected": -0.07000058889389038, "logps/chosen": -812.5138549804688, "logps/rejected": -872.6647338867188, "loss": 206739.9, "rewards/accuracies": 0.625, "rewards/chosen": -0.4910140931606293, "rewards/margins": 0.07318232953548431, "rewards/rejected": -0.5641964673995972, "step": 250 }, { "epoch": 0.42, "learning_rate": 3.631239095225417e-06, "logits/chosen": -0.08646208047866821, "logits/rejected": -0.06198771670460701, "logps/chosen": -728.2857666015625, "logps/rejected": -782.2666625976562, "loss": 207880.075, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.44028884172439575, "rewards/margins": 0.06813923269510269, "rewards/rejected": -0.5084280967712402, "step": 260 }, { "epoch": 0.43, "learning_rate": 3.5049120962530608e-06, "logits/chosen": -0.05799086019396782, "logits/rejected": -0.08497434854507446, "logps/chosen": -652.5838012695312, "logps/rejected": -717.7830200195312, "loss": 207495.575, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.36064040660858154, "rewards/margins": 0.059676121920347214, "rewards/rejected": -0.42031654715538025, "step": 270 }, { "epoch": 0.45, "learning_rate": 3.3754457302455464e-06, "logits/chosen": -0.1196521669626236, "logits/rejected": -0.04309455305337906, "logps/chosen": -689.7659912109375, "logps/rejected": -743.6688232421875, "loss": 208742.325, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.3875437378883362, "rewards/margins": 0.06585486978292465, "rewards/rejected": -0.45339861512184143, "step": 280 }, { "epoch": 0.46, "learning_rate": 3.2432444529190714e-06, "logits/chosen": -0.07471663504838943, "logits/rejected": -0.08168856799602509, "logps/chosen": -667.86669921875, "logps/rejected": -755.6109619140625, "loss": 217160.775, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.3965543508529663, "rewards/margins": 0.07327079772949219, "rewards/rejected": -0.4698251783847809, "step": 290 }, { "epoch": 0.48, "learning_rate": 3.1087212639117057e-06, "logits/chosen": -0.11233433336019516, "logits/rejected": -0.040867146104574203, "logps/chosen": -745.700927734375, "logps/rejected": -783.9187622070312, "loss": 215084.8, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4703361392021179, "rewards/margins": 0.05205522105097771, "rewards/rejected": -0.5223913788795471, "step": 300 }, { "epoch": 0.5, "learning_rate": 2.9722964165636263e-06, "logits/chosen": -0.14317576587200165, "logits/rejected": -0.06158372014760971, "logps/chosen": -731.3981323242188, "logps/rejected": -796.32470703125, "loss": 220291.4, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.43216291069984436, "rewards/margins": 0.06123752519488335, "rewards/rejected": -0.4934004247188568, "step": 310 }, { "epoch": 0.51, "learning_rate": 2.8343961050366275e-06, "logits/chosen": -0.10771390050649643, "logits/rejected": -0.08905068039894104, "logps/chosen": -660.2057495117188, "logps/rejected": -736.7412109375, "loss": 217501.05, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3711238205432892, "rewards/margins": 0.06799530237913132, "rewards/rejected": -0.4391191601753235, "step": 320 }, { "epoch": 0.53, "learning_rate": 2.695451132874385e-06, "logits/chosen": -0.10368801653385162, "logits/rejected": -0.09771151840686798, "logps/chosen": -709.3264770507812, "logps/rejected": -710.8792724609375, "loss": 209546.6375, "rewards/accuracies": 0.625, "rewards/chosen": -0.40247923135757446, "rewards/margins": 0.06322751939296722, "rewards/rejected": -0.4657067656517029, "step": 330 }, { "epoch": 0.54, "learning_rate": 2.5558955671628964e-06, "logits/chosen": -0.15589869022369385, "logits/rejected": -0.10424575954675674, "logps/chosen": -843.1378784179688, "logps/rejected": -876.15966796875, "loss": 209775.175, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5010379552841187, "rewards/margins": 0.08095891773700714, "rewards/rejected": -0.5819969177246094, "step": 340 }, { "epoch": 0.56, "learning_rate": 2.4161653824955654e-06, "logits/chosen": -0.08000655472278595, "logits/rejected": -0.03247884660959244, "logps/chosen": -810.2706298828125, "logps/rejected": -836.5345458984375, "loss": 201790.65, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.49783754348754883, "rewards/margins": 0.0712912380695343, "rewards/rejected": -0.5691288113594055, "step": 350 }, { "epoch": 0.58, "learning_rate": 2.2766970989791697e-06, "logits/chosen": -0.05506904050707817, "logits/rejected": -0.07265397161245346, "logps/chosen": -816.2994384765625, "logps/rejected": -898.0321044921875, "loss": 212702.65, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.46471279859542847, "rewards/margins": 0.0864553228020668, "rewards/rejected": -0.5511681437492371, "step": 360 }, { "epoch": 0.59, "learning_rate": 2.1379264185356545e-06, "logits/chosen": -0.11596915870904922, "logits/rejected": -0.06984439492225647, "logps/chosen": -637.1615600585938, "logps/rejected": -715.4910278320312, "loss": 209158.575, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.38913828134536743, "rewards/margins": 0.06156201288104057, "rewards/rejected": -0.4507002830505371, "step": 370 }, { "epoch": 0.61, "learning_rate": 2.000286863759934e-06, "logits/chosen": -0.05925675481557846, "logits/rejected": 0.01651635393500328, "logps/chosen": -677.723876953125, "logps/rejected": -741.4754638671875, "loss": 203933.425, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.37697839736938477, "rewards/margins": 0.08702079951763153, "rewards/rejected": -0.4639992117881775, "step": 380 }, { "epoch": 0.62, "learning_rate": 1.8642084235859764e-06, "logits/chosen": -0.06466405093669891, "logits/rejected": -0.04278569668531418, "logps/chosen": -731.341796875, "logps/rejected": -771.2420043945312, "loss": 206554.3875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4274744391441345, "rewards/margins": 0.07628165185451508, "rewards/rejected": -0.5037561655044556, "step": 390 }, { "epoch": 0.64, "learning_rate": 1.7301162099921013e-06, "logits/chosen": 0.020595671609044075, "logits/rejected": 0.019321396946907043, "logps/chosen": -797.1294555664062, "logps/rejected": -833.5193481445312, "loss": 207531.1, "rewards/accuracies": 0.625, "rewards/chosen": -0.4648904800415039, "rewards/margins": 0.07909245789051056, "rewards/rejected": -0.543982982635498, "step": 400 }, { "epoch": 0.66, "learning_rate": 1.5984291299420117e-06, "logits/chosen": -0.05305539816617966, "logits/rejected": -0.04147926717996597, "logps/chosen": -756.192138671875, "logps/rejected": -858.1837158203125, "loss": 195018.3875, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4579537510871887, "rewards/margins": 0.09726598113775253, "rewards/rejected": -0.5552197694778442, "step": 410 }, { "epoch": 0.67, "learning_rate": 1.4695585767104092e-06, "logits/chosen": -0.05568443611264229, "logits/rejected": -0.0559534952044487, "logps/chosen": -772.1295166015625, "logps/rejected": -832.1310424804688, "loss": 226686.525, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.49827033281326294, "rewards/margins": 0.05179555341601372, "rewards/rejected": -0.5500659346580505, "step": 420 }, { "epoch": 0.69, "learning_rate": 1.3439071446815452e-06, "logits/chosen": -0.055512405931949615, "logits/rejected": -0.0024025961756706238, "logps/chosen": -757.4938354492188, "logps/rejected": -806.2801513671875, "loss": 212317.3, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.47724610567092896, "rewards/margins": 0.07205446809530258, "rewards/rejected": -0.5493005514144897, "step": 430 }, { "epoch": 0.7, "learning_rate": 1.2218673716356919e-06, "logits/chosen": -0.02327726036310196, "logits/rejected": 0.02376800775527954, "logps/chosen": -825.7927856445312, "logps/rejected": -850.76025390625, "loss": 218713.45, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4958096444606781, "rewards/margins": 0.06326460838317871, "rewards/rejected": -0.5590742826461792, "step": 440 }, { "epoch": 0.72, "learning_rate": 1.103820512452661e-06, "logits/chosen": -0.022990452125668526, "logits/rejected": 0.00906631350517273, "logps/chosen": -758.7638549804688, "logps/rejected": -824.9432373046875, "loss": 195052.875, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4434831738471985, "rewards/margins": 0.06830742210149765, "rewards/rejected": -0.5117905139923096, "step": 450 }, { "epoch": 0.74, "learning_rate": 9.901353480633468e-07, "logits/chosen": 0.03838720545172691, "logits/rejected": 0.12577348947525024, "logps/chosen": -793.5477294921875, "logps/rejected": -819.9153442382812, "loss": 202205.275, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.43852996826171875, "rewards/margins": 0.08077750355005264, "rewards/rejected": -0.5193074345588684, "step": 460 }, { "epoch": 0.75, "learning_rate": 8.811670333701544e-07, "logits/chosen": 0.013123716227710247, "logits/rejected": 0.05847090482711792, "logps/chosen": -703.4427490234375, "logps/rejected": -748.7205200195312, "loss": 211857.375, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.42077550292015076, "rewards/margins": 0.0480610728263855, "rewards/rejected": -0.46883660554885864, "step": 470 }, { "epoch": 0.77, "learning_rate": 7.772559877354341e-07, "logits/chosen": 0.002355717122554779, "logits/rejected": 0.03354542329907417, "logps/chosen": -751.0103759765625, "logps/rejected": -789.4757080078125, "loss": 213394.1, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.43958085775375366, "rewards/margins": 0.07620178908109665, "rewards/rejected": -0.5157827138900757, "step": 480 }, { "epoch": 0.78, "learning_rate": 6.787268315040604e-07, "logits/chosen": -0.012600034475326538, "logits/rejected": -0.028291597962379456, "logps/chosen": -682.4383544921875, "logps/rejected": -727.0133056640625, "loss": 206004.6875, "rewards/accuracies": 0.625, "rewards/chosen": -0.3844345510005951, "rewards/margins": 0.07255369424819946, "rewards/rejected": -0.45698824524879456, "step": 490 }, { "epoch": 0.8, "learning_rate": 5.858873718824829e-07, "logits/chosen": -0.04455758258700371, "logits/rejected": 0.01345035620033741, "logps/chosen": -720.21484375, "logps/rejected": -779.4735717773438, "loss": 206500.15, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4052000641822815, "rewards/margins": 0.0826614573597908, "rewards/rejected": -0.4878615438938141, "step": 500 }, { "epoch": 0.82, "learning_rate": 4.990276413423817e-07, "logits/chosen": 0.00015942193567752838, "logits/rejected": 0.04742427542805672, "logps/chosen": -742.9307861328125, "logps/rejected": -760.3399658203125, "loss": 208713.95, "rewards/accuracies": 0.59375, "rewards/chosen": -0.39830031991004944, "rewards/margins": 0.06554318964481354, "rewards/rejected": -0.4638434946537018, "step": 510 }, { "epoch": 0.83, "learning_rate": 4.184189915529796e-07, "logits/chosen": -0.04313235729932785, "logits/rejected": 0.02894475683569908, "logps/chosen": -675.0413818359375, "logps/rejected": -710.6438598632812, "loss": 211050.5, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.392715722322464, "rewards/margins": 0.061092592775821686, "rewards/rejected": -0.4538083076477051, "step": 520 }, { "epoch": 0.85, "learning_rate": 3.4431324567258176e-07, "logits/chosen": 0.020198270678520203, "logits/rejected": 0.03800346702337265, "logps/chosen": -676.2935791015625, "logps/rejected": -722.9488525390625, "loss": 205822.25, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.40016284584999084, "rewards/margins": 0.07710020244121552, "rewards/rejected": -0.47726306319236755, "step": 530 }, { "epoch": 0.86, "learning_rate": 2.769419116476052e-07, "logits/chosen": 0.0013688721228390932, "logits/rejected": 0.06662561744451523, "logps/chosen": -711.6402587890625, "logps/rejected": -769.0242309570312, "loss": 199808.325, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.41114863753318787, "rewards/margins": 0.08205543458461761, "rewards/rejected": -0.4932040572166443, "step": 540 }, { "epoch": 0.88, "learning_rate": 2.1651545897676512e-07, "logits/chosen": -0.0213983952999115, "logits/rejected": 0.08525023609399796, "logps/chosen": -686.88427734375, "logps/rejected": -734.6177978515625, "loss": 217214.225, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.4026912748813629, "rewards/margins": 0.046199485659599304, "rewards/rejected": -0.44889068603515625, "step": 550 }, { "epoch": 0.9, "learning_rate": 1.6322266119983222e-07, "logits/chosen": 0.02770215831696987, "logits/rejected": 0.04931509494781494, "logps/chosen": -752.9605712890625, "logps/rejected": -813.0621337890625, "loss": 210949.15, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4274672865867615, "rewards/margins": 0.08069118112325668, "rewards/rejected": -0.5081585049629211, "step": 560 }, { "epoch": 0.91, "learning_rate": 1.1723000616502167e-07, "logits/chosen": -0.00834731012582779, "logits/rejected": 0.06770430505275726, "logps/chosen": -734.9483032226562, "logps/rejected": -788.4010009765625, "loss": 202997.65, "rewards/accuracies": 0.625, "rewards/chosen": -0.4188470244407654, "rewards/margins": 0.07445165514945984, "rewards/rejected": -0.4932987093925476, "step": 570 }, { "epoch": 0.93, "learning_rate": 7.868117591737585e-08, "logits/chosen": -0.007739663124084473, "logits/rejected": 0.10286346822977066, "logps/chosen": -706.01806640625, "logps/rejected": -754.5486450195312, "loss": 203727.525, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.3930371403694153, "rewards/margins": 0.09636659920215607, "rewards/rejected": -0.48940372467041016, "step": 580 }, { "epoch": 0.94, "learning_rate": 4.769659783295383e-08, "logits/chosen": -0.022421469911932945, "logits/rejected": 0.03599115461111069, "logps/chosen": -727.1646118164062, "logps/rejected": -754.9591064453125, "loss": 202799.8625, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4033421576023102, "rewards/margins": 0.0642734095454216, "rewards/rejected": -0.4676155149936676, "step": 590 }, { "epoch": 0.96, "learning_rate": 2.4373068401120358e-08, "logits/chosen": 0.04254373908042908, "logits/rejected": 0.027004733681678772, "logps/chosen": -792.9937133789062, "logps/rejected": -832.1984252929688, "loss": 200382.225, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4243929386138916, "rewards/margins": 0.09623502194881439, "rewards/rejected": -0.5206279754638672, "step": 600 }, { "epoch": 0.98, "learning_rate": 8.78345083022425e-09, "logits/chosen": -0.03111700341105461, "logits/rejected": -0.02308904565870762, "logps/chosen": -739.8712158203125, "logps/rejected": -808.3377685546875, "loss": 222655.7, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4240250587463379, "rewards/margins": 0.08907955139875412, "rewards/rejected": -0.5131046175956726, "step": 610 }, { "epoch": 0.99, "learning_rate": 9.764474213677654e-10, "logits/chosen": 0.016595929861068726, "logits/rejected": 0.03964446485042572, "logps/chosen": -717.14599609375, "logps/rejected": -763.954833984375, "loss": 214648.5, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.413185179233551, "rewards/margins": 0.07379905879497528, "rewards/rejected": -0.4869841933250427, "step": 620 }, { "epoch": 1.0, "step": 625, "total_flos": 0.0, "train_loss": 108598.9409, "train_runtime": 4079.5354, "train_samples_per_second": 4.903, "train_steps_per_second": 0.153 } ], "logging_steps": 10, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }