{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.35546875, "learning_rate": 2.0576131687242803e-08, "logits/chosen": 0.24564924836158752, "logits/rejected": 1.0062695741653442, "logps/chosen": -229.83255004882812, "logps/rejected": -164.65399169921875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.384765625, "learning_rate": 2.05761316872428e-07, "logits/chosen": -0.0490909218788147, "logits/rejected": 0.6121826171875, "logps/chosen": -238.83880615234375, "logps/rejected": -207.5596923828125, "loss": 0.6931, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.00032901231315918267, "rewards/margins": 0.0006913852412253618, "rewards/margins_max": 0.002890574047341943, "rewards/margins_min": -0.0015078035648912191, "rewards/margins_std": 0.0031101228669285774, "rewards/rejected": -0.001020397525280714, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.443359375, "learning_rate": 4.11522633744856e-07, "logits/chosen": 0.05002685636281967, "logits/rejected": 0.6022137403488159, "logps/chosen": -255.0900115966797, "logps/rejected": -220.280517578125, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00017908953304868191, "rewards/margins": 0.0004872865101788193, "rewards/margins_max": 0.0039043165743350983, "rewards/margins_min": -0.0029297438450157642, "rewards/margins_std": 0.004832410719245672, "rewards/rejected": -0.0003081969916820526, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.36328125, "learning_rate": 6.17283950617284e-07, "logits/chosen": 0.07209397852420807, "logits/rejected": 0.5803325176239014, "logps/chosen": -241.93930053710938, "logps/rejected": -229.0738067626953, "loss": 0.6925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0010193719062954187, "rewards/margins": 0.001458501792512834, "rewards/margins_max": 0.0036475714296102524, "rewards/margins_min": -0.0007305679609999061, "rewards/margins_std": 0.0030958119314163923, "rewards/rejected": -0.0004391298571135849, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.40625, "learning_rate": 8.23045267489712e-07, "logits/chosen": 0.08637161552906036, "logits/rejected": 0.6608158946037292, "logps/chosen": -272.7409973144531, "logps/rejected": -232.7211151123047, "loss": 0.6918, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0017092287307605147, "rewards/margins": 0.002595087978988886, "rewards/margins_max": 0.0045972722582519054, "rewards/margins_min": 0.0005929030594415963, "rewards/margins_std": 0.0028315167874097824, "rewards/rejected": -0.0008858589571900666, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.39453125, "learning_rate": 1.02880658436214e-06, "logits/chosen": 0.039637185633182526, "logits/rejected": 0.42562946677207947, "logps/chosen": -248.4722137451172, "logps/rejected": -249.7132568359375, "loss": 0.6907, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.0025177341885864735, "rewards/margins": 0.004976312164217234, "rewards/margins_max": 0.008509628474712372, "rewards/margins_min": 0.0014429950388148427, "rewards/margins_std": 0.004996864590793848, "rewards/rejected": -0.002458578208461404, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.412109375, "learning_rate": 1.234567901234568e-06, "logits/chosen": 0.030338022857904434, "logits/rejected": 0.6016219854354858, "logps/chosen": -242.9213409423828, "logps/rejected": -205.34011840820312, "loss": 0.6897, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.002770364750176668, "rewards/margins": 0.006898685358464718, "rewards/margins_max": 0.01105786394327879, "rewards/margins_min": 0.002739507704973221, "rewards/margins_std": 0.005881965160369873, "rewards/rejected": -0.00412832060828805, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.43359375, "learning_rate": 1.440329218106996e-06, "logits/chosen": 0.12884962558746338, "logits/rejected": 0.6521704196929932, "logps/chosen": -233.1442108154297, "logps/rejected": -180.2538299560547, "loss": 0.6884, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.0031278387177735567, "rewards/margins": 0.008645228110253811, "rewards/margins_max": 0.012719206511974335, "rewards/margins_min": 0.004571248777210712, "rewards/margins_std": 0.005761477164924145, "rewards/rejected": -0.005517390090972185, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.44140625, "learning_rate": 1.646090534979424e-06, "logits/chosen": -0.02626526914536953, "logits/rejected": 0.4111458361148834, "logps/chosen": -235.10330200195312, "logps/rejected": -224.97488403320312, "loss": 0.686, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.006755024194717407, "rewards/margins": 0.014125975780189037, "rewards/margins_max": 0.020800447091460228, "rewards/margins_min": 0.007451505865901709, "rewards/margins_std": 0.009439127519726753, "rewards/rejected": -0.007370952516794205, "step": 80 }, { "epoch": 0.04, "grad_norm": 0.4921875, "learning_rate": 1.8518518518518519e-06, "logits/chosen": 0.26549288630485535, "logits/rejected": 0.6299537420272827, "logps/chosen": -205.5663604736328, "logps/rejected": -195.4409637451172, "loss": 0.6846, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00643587950617075, "rewards/margins": 0.016837503761053085, "rewards/margins_max": 0.02518610656261444, "rewards/margins_min": 0.008488905616104603, "rewards/margins_std": 0.011806704103946686, "rewards/rejected": -0.010401626117527485, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.427734375, "learning_rate": 2.05761316872428e-06, "logits/chosen": -0.02292916737496853, "logits/rejected": 0.4407041072845459, "logps/chosen": -237.1365509033203, "logps/rejected": -234.0122833251953, "loss": 0.6818, "rewards/accuracies": 0.9375, "rewards/chosen": 0.008963329717516899, "rewards/margins": 0.023605378344655037, "rewards/margins_max": 0.0348113588988781, "rewards/margins_min": 0.012399397790431976, "rewards/margins_std": 0.015847649425268173, "rewards/rejected": -0.014642049558460712, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.5, "learning_rate": 2.263374485596708e-06, "logits/chosen": 0.06019078567624092, "logits/rejected": 0.6456455588340759, "logps/chosen": -252.55941772460938, "logps/rejected": -202.68516540527344, "loss": 0.6787, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01171482540667057, "rewards/margins": 0.0284560676664114, "rewards/margins_max": 0.040376029908657074, "rewards/margins_min": 0.016536109149456024, "rewards/margins_std": 0.016857367008924484, "rewards/rejected": -0.01674124039709568, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.380859375, "learning_rate": 2.469135802469136e-06, "logits/chosen": 0.03018159233033657, "logits/rejected": 0.5444492101669312, "logps/chosen": -230.00732421875, "logps/rejected": -204.04888916015625, "loss": 0.6755, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.010969040915369987, "rewards/margins": 0.03301534429192543, "rewards/margins_max": 0.045270394533872604, "rewards/margins_min": 0.020760290324687958, "rewards/margins_std": 0.017331259325146675, "rewards/rejected": -0.022046301513910294, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.474609375, "learning_rate": 2.674897119341564e-06, "logits/chosen": 0.1473396122455597, "logits/rejected": 0.6573908925056458, "logps/chosen": -263.9186096191406, "logps/rejected": -234.0851593017578, "loss": 0.6704, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.019587906077504158, "rewards/margins": 0.04693462699651718, "rewards/margins_max": 0.06604455411434174, "rewards/margins_min": 0.02782469615340233, "rewards/margins_std": 0.027025526389479637, "rewards/rejected": -0.027346724644303322, "step": 130 }, { "epoch": 0.06, "grad_norm": 0.46484375, "learning_rate": 2.880658436213992e-06, "logits/chosen": 0.1025664433836937, "logits/rejected": 0.6043235063552856, "logps/chosen": -249.485595703125, "logps/rejected": -218.0284423828125, "loss": 0.6651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.020640945062041283, "rewards/margins": 0.054881542921066284, "rewards/margins_max": 0.0779130607843399, "rewards/margins_min": 0.03185003623366356, "rewards/margins_std": 0.032571472227573395, "rewards/rejected": -0.03424059972167015, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.51171875, "learning_rate": 3.08641975308642e-06, "logits/chosen": 0.03741316497325897, "logits/rejected": 0.730408787727356, "logps/chosen": -271.1098327636719, "logps/rejected": -231.9659423828125, "loss": 0.658, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03794144093990326, "rewards/margins": 0.07607483863830566, "rewards/margins_max": 0.10968559980392456, "rewards/margins_min": 0.04246408864855766, "rewards/margins_std": 0.04753277823328972, "rewards/rejected": -0.038133405148983, "step": 150 }, { "epoch": 0.07, "grad_norm": 0.44140625, "learning_rate": 3.292181069958848e-06, "logits/chosen": 0.027915984392166138, "logits/rejected": 0.5170690417289734, "logps/chosen": -227.6484375, "logps/rejected": -201.67355346679688, "loss": 0.6572, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.029438817873597145, "rewards/margins": 0.07748283445835114, "rewards/margins_max": 0.1150212287902832, "rewards/margins_min": 0.03994445875287056, "rewards/margins_std": 0.05308728292584419, "rewards/rejected": -0.04804402217268944, "step": 160 }, { "epoch": 0.07, "grad_norm": 0.427734375, "learning_rate": 3.4979423868312762e-06, "logits/chosen": 0.06907240301370621, "logits/rejected": 0.5936463475227356, "logps/chosen": -229.10653686523438, "logps/rejected": -239.67593383789062, "loss": 0.6451, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.05052729696035385, "rewards/margins": 0.09715622663497925, "rewards/margins_max": 0.14001211524009705, "rewards/margins_min": 0.05430033057928085, "rewards/margins_std": 0.060607392340898514, "rewards/rejected": -0.046628933399915695, "step": 170 }, { "epoch": 0.07, "grad_norm": 0.388671875, "learning_rate": 3.7037037037037037e-06, "logits/chosen": 0.23458850383758545, "logits/rejected": 0.604918360710144, "logps/chosen": -207.04891967773438, "logps/rejected": -222.05416870117188, "loss": 0.6477, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.021370261907577515, "rewards/margins": 0.09243619441986084, "rewards/margins_max": 0.13056758046150208, "rewards/margins_min": 0.0543048158288002, "rewards/margins_std": 0.05392590910196304, "rewards/rejected": -0.07106593251228333, "step": 180 }, { "epoch": 0.08, "grad_norm": 0.53515625, "learning_rate": 3.909465020576132e-06, "logits/chosen": 0.1551034152507782, "logits/rejected": 0.7508169412612915, "logps/chosen": -252.1875457763672, "logps/rejected": -227.7545166015625, "loss": 0.6276, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.03474985808134079, "rewards/margins": 0.13373127579689026, "rewards/margins_max": 0.19556060433387756, "rewards/margins_min": 0.07190193980932236, "rewards/margins_std": 0.0874398797750473, "rewards/rejected": -0.09898141771554947, "step": 190 }, { "epoch": 0.08, "grad_norm": 0.51171875, "learning_rate": 4.11522633744856e-06, "logits/chosen": 0.06500478088855743, "logits/rejected": 0.7195091247558594, "logps/chosen": -267.84259033203125, "logps/rejected": -238.9481658935547, "loss": 0.6207, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0433725044131279, "rewards/margins": 0.16162040829658508, "rewards/margins_max": 0.2354629933834076, "rewards/margins_min": 0.08777783066034317, "rewards/margins_std": 0.10442917048931122, "rewards/rejected": -0.11824791133403778, "step": 200 }, { "epoch": 0.09, "grad_norm": 0.5078125, "learning_rate": 4.3209876543209875e-06, "logits/chosen": 0.07097109407186508, "logits/rejected": 0.5758925676345825, "logps/chosen": -244.7234649658203, "logps/rejected": -232.4202117919922, "loss": 0.6185, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.027079004794359207, "rewards/margins": 0.17639262974262238, "rewards/margins_max": 0.2551138401031494, "rewards/margins_min": 0.09767140448093414, "rewards/margins_std": 0.1113286241889, "rewards/rejected": -0.14931362867355347, "step": 210 }, { "epoch": 0.09, "grad_norm": 0.474609375, "learning_rate": 4.526748971193416e-06, "logits/chosen": 0.10515166819095612, "logits/rejected": 0.6553866267204285, "logps/chosen": -236.6344451904297, "logps/rejected": -224.96749877929688, "loss": 0.6016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.01706048846244812, "rewards/margins": 0.21246306598186493, "rewards/margins_max": 0.31518980860710144, "rewards/margins_min": 0.10973634570837021, "rewards/margins_std": 0.14527757465839386, "rewards/rejected": -0.1954026073217392, "step": 220 }, { "epoch": 0.09, "grad_norm": 0.53125, "learning_rate": 4.732510288065844e-06, "logits/chosen": -0.010027505457401276, "logits/rejected": 0.5649107098579407, "logps/chosen": -292.68572998046875, "logps/rejected": -271.10955810546875, "loss": 0.5803, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.009528962895274162, "rewards/margins": 0.2713000476360321, "rewards/margins_max": 0.4035729765892029, "rewards/margins_min": 0.13902710378170013, "rewards/margins_std": 0.18706218898296356, "rewards/rejected": -0.2808290421962738, "step": 230 }, { "epoch": 0.1, "grad_norm": 0.6015625, "learning_rate": 4.938271604938272e-06, "logits/chosen": 0.028728529810905457, "logits/rejected": 0.5883212685585022, "logps/chosen": -252.4741973876953, "logps/rejected": -263.96063232421875, "loss": 0.5355, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.026816654950380325, "rewards/margins": 0.36577218770980835, "rewards/margins_max": 0.5184392333030701, "rewards/margins_min": 0.21310511231422424, "rewards/margins_std": 0.2159038782119751, "rewards/rejected": -0.39258888363838196, "step": 240 }, { "epoch": 0.1, "grad_norm": 0.55859375, "learning_rate": 4.999873380880316e-06, "logits/chosen": -0.04030367732048035, "logits/rejected": 0.5767666697502136, "logps/chosen": -280.7464904785156, "logps/rejected": -289.3246154785156, "loss": 0.5451, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.04451828449964523, "rewards/margins": 0.31411212682724, "rewards/margins_max": 0.4603235125541687, "rewards/margins_min": 0.16790074110031128, "rewards/margins_std": 0.20677416026592255, "rewards/rejected": -0.3586304783821106, "step": 250 }, { "epoch": 0.11, "grad_norm": 0.52734375, "learning_rate": 4.999253236476256e-06, "logits/chosen": 0.11786775290966034, "logits/rejected": 0.7519556879997253, "logps/chosen": -285.5740966796875, "logps/rejected": -260.8897399902344, "loss": 0.5191, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.16191735863685608, "rewards/margins": 0.3822050988674164, "rewards/margins_max": 0.6438002586364746, "rewards/margins_min": 0.12060992419719696, "rewards/margins_std": 0.36995142698287964, "rewards/rejected": -0.5441225171089172, "step": 260 }, { "epoch": 0.11, "grad_norm": 0.62109375, "learning_rate": 4.998116438252842e-06, "logits/chosen": -0.01648726500570774, "logits/rejected": 0.596198558807373, "logps/chosen": -308.7812194824219, "logps/rejected": -326.95343017578125, "loss": 0.4666, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.16989798843860626, "rewards/margins": 0.7350731492042542, "rewards/margins_max": 1.1802551746368408, "rewards/margins_min": 0.28989118337631226, "rewards/margins_std": 0.6295824646949768, "rewards/rejected": -0.9049711227416992, "step": 270 }, { "epoch": 0.12, "grad_norm": 0.56640625, "learning_rate": 4.9964632212127305e-06, "logits/chosen": 0.0899326428771019, "logits/rejected": 0.6752752065658569, "logps/chosen": -290.4791564941406, "logps/rejected": -322.41815185546875, "loss": 0.4527, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.32237640023231506, "rewards/margins": 0.6386328935623169, "rewards/margins_max": 0.9289911985397339, "rewards/margins_min": 0.3482745587825775, "rewards/margins_std": 0.41062870621681213, "rewards/rejected": -0.9610093235969543, "step": 280 }, { "epoch": 0.12, "grad_norm": 0.7265625, "learning_rate": 4.994293927114362e-06, "logits/chosen": 0.06901798397302628, "logits/rejected": 0.6215580105781555, "logps/chosen": -290.79632568359375, "logps/rejected": -373.4809265136719, "loss": 0.4322, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.4801758825778961, "rewards/margins": 1.097102403640747, "rewards/margins_max": 1.8094851970672607, "rewards/margins_min": 0.3847196698188782, "rewards/margins_std": 1.0074613094329834, "rewards/rejected": -1.5772783756256104, "step": 290 }, { "epoch": 0.12, "grad_norm": 0.66796875, "learning_rate": 4.991609004401324e-06, "logits/chosen": -0.018922004848718643, "logits/rejected": 0.6193499565124512, "logps/chosen": -317.2272644042969, "logps/rejected": -400.44696044921875, "loss": 0.3836, "rewards/accuracies": 0.9375, "rewards/chosen": -0.622567355632782, "rewards/margins": 1.1482007503509521, "rewards/margins_max": 1.7601646184921265, "rewards/margins_min": 0.5362368226051331, "rewards/margins_std": 0.8654475212097168, "rewards/rejected": -1.770768165588379, "step": 300 }, { "epoch": 0.13, "grad_norm": 1.4375, "learning_rate": 4.988409008109638e-06, "logits/chosen": 0.18614912033081055, "logits/rejected": 0.5903946161270142, "logps/chosen": -306.6070251464844, "logps/rejected": -419.8663024902344, "loss": 0.3599, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.8805279731750488, "rewards/margins": 1.354397177696228, "rewards/margins_max": 2.106048107147217, "rewards/margins_min": 0.6027460098266602, "rewards/margins_std": 1.062995195388794, "rewards/rejected": -2.2349250316619873, "step": 310 }, { "epoch": 0.13, "grad_norm": 3.484375, "learning_rate": 4.984694599753024e-06, "logits/chosen": 0.04539443925023079, "logits/rejected": 0.5839862823486328, "logps/chosen": -364.6256408691406, "logps/rejected": -489.1861877441406, "loss": 0.3496, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.2731292247772217, "rewards/margins": 1.5762968063354492, "rewards/margins_max": 2.456343650817871, "rewards/margins_min": 0.696249783039093, "rewards/margins_std": 1.2445745468139648, "rewards/rejected": -2.849426031112671, "step": 320 }, { "epoch": 0.14, "grad_norm": 1.140625, "learning_rate": 4.980466547186149e-06, "logits/chosen": -0.06857666373252869, "logits/rejected": 0.6623315811157227, "logps/chosen": -401.6612243652344, "logps/rejected": -569.5274047851562, "loss": 0.2962, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7086451053619385, "rewards/margins": 2.181455135345459, "rewards/margins_max": 3.330390453338623, "rewards/margins_min": 1.0325195789337158, "rewards/margins_std": 1.6248401403427124, "rewards/rejected": -3.8901000022888184, "step": 330 }, { "epoch": 0.14, "grad_norm": 0.859375, "learning_rate": 4.975725724445898e-06, "logits/chosen": 0.18517382442951202, "logits/rejected": 0.7153784036636353, "logps/chosen": -425.679443359375, "logps/rejected": -613.3704833984375, "loss": 0.3906, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.7429125308990479, "rewards/margins": 2.2590205669403076, "rewards/margins_max": 3.9183871746063232, "rewards/margins_min": 0.5996544361114502, "rewards/margins_std": 2.34669828414917, "rewards/rejected": -4.0019330978393555, "step": 340 }, { "epoch": 0.14, "grad_norm": 1.7109375, "learning_rate": 4.9704731115706805e-06, "logits/chosen": 0.06402029097080231, "logits/rejected": 0.6806127429008484, "logps/chosen": -403.9203186035156, "logps/rejected": -743.2752075195312, "loss": 0.2744, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.463424801826477, "rewards/margins": 3.6999382972717285, "rewards/margins_max": 6.0486063957214355, "rewards/margins_min": 1.3512706756591797, "rewards/margins_std": 3.3215174674987793, "rewards/rejected": -5.163362979888916, "step": 350 }, { "epoch": 0.15, "grad_norm": 4.71875, "learning_rate": 4.964709794397846e-06, "logits/chosen": 0.17624667286872864, "logits/rejected": 0.8073934316635132, "logps/chosen": -420.7196350097656, "logps/rejected": -765.087158203125, "loss": 0.2887, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7227516174316406, "rewards/margins": 3.5676417350769043, "rewards/margins_max": 6.014761447906494, "rewards/margins_min": 1.1205217838287354, "rewards/margins_std": 3.460750102996826, "rewards/rejected": -5.290392875671387, "step": 360 }, { "epoch": 0.15, "grad_norm": 0.546875, "learning_rate": 4.9584369643392076e-06, "logits/chosen": 0.145114004611969, "logits/rejected": 0.8146367073059082, "logps/chosen": -478.583984375, "logps/rejected": -902.0680541992188, "loss": 0.2173, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.5044260025024414, "rewards/margins": 4.49790096282959, "rewards/margins_max": 7.457921028137207, "rewards/margins_min": 1.5378811359405518, "rewards/margins_std": 4.186100006103516, "rewards/rejected": -7.002326965332031, "step": 370 }, { "epoch": 0.16, "grad_norm": 3.046875, "learning_rate": 4.951655918134749e-06, "logits/chosen": 0.10492346435785294, "logits/rejected": 0.6990815997123718, "logps/chosen": -523.7138671875, "logps/rejected": -888.4390869140625, "loss": 0.2891, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.9183859825134277, "rewards/margins": 3.839510679244995, "rewards/margins_max": 6.660401344299316, "rewards/margins_min": 1.0186195373535156, "rewards/margins_std": 3.989342451095581, "rewards/rejected": -6.757896423339844, "step": 380 }, { "epoch": 0.16, "grad_norm": 2.03125, "learning_rate": 4.944368057584568e-06, "logits/chosen": 0.10440587997436523, "logits/rejected": 0.8017401695251465, "logps/chosen": -489.6419982910156, "logps/rejected": -886.7931518554688, "loss": 0.2525, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3617236614227295, "rewards/margins": 4.341439247131348, "rewards/margins_max": 6.716013431549072, "rewards/margins_min": 1.9668653011322021, "rewards/margins_std": 3.358154773712158, "rewards/rejected": -6.703163146972656, "step": 390 }, { "epoch": 0.16, "grad_norm": 0.88671875, "learning_rate": 4.936574889259076e-06, "logits/chosen": 0.20124737918376923, "logits/rejected": 0.9363399744033813, "logps/chosen": -510.68682861328125, "logps/rejected": -808.8845825195312, "loss": 0.2781, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1757853031158447, "rewards/margins": 3.780200958251953, "rewards/margins_max": 6.390686511993408, "rewards/margins_min": 1.1697145700454712, "rewards/margins_std": 3.691784620285034, "rewards/rejected": -5.9559855461120605, "step": 400 }, { "epoch": 0.17, "grad_norm": 0.7421875, "learning_rate": 4.928278024187572e-06, "logits/chosen": 0.07302796840667725, "logits/rejected": 0.7525766491889954, "logps/chosen": -441.041015625, "logps/rejected": -718.113525390625, "loss": 0.2566, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.917728066444397, "rewards/margins": 3.044644832611084, "rewards/margins_max": 4.838767051696777, "rewards/margins_min": 1.2505226135253906, "rewards/margins_std": 2.537271738052368, "rewards/rejected": -4.962372779846191, "step": 410 }, { "epoch": 0.17, "grad_norm": 1.4140625, "learning_rate": 4.91947917752519e-06, "logits/chosen": 0.2481038123369217, "logits/rejected": 0.8633731603622437, "logps/chosen": -495.15234375, "logps/rejected": -929.7926635742188, "loss": 0.2143, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.3359451293945312, "rewards/margins": 4.435263156890869, "rewards/margins_max": 6.896138668060303, "rewards/margins_min": 1.9743881225585938, "rewards/margins_std": 3.480203151702881, "rewards/rejected": -6.7712082862854, "step": 420 }, { "epoch": 0.18, "grad_norm": 1.171875, "learning_rate": 4.91018016819835e-06, "logits/chosen": 0.19703389704227448, "logits/rejected": 0.8145115971565247, "logps/chosen": -463.3583068847656, "logps/rejected": -735.3023681640625, "loss": 0.3076, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0541656017303467, "rewards/margins": 3.166626214981079, "rewards/margins_max": 5.232685565948486, "rewards/margins_min": 1.1005662679672241, "rewards/margins_std": 2.921849489212036, "rewards/rejected": -5.220791816711426, "step": 430 }, { "epoch": 0.18, "grad_norm": 1.3203125, "learning_rate": 4.900382918528732e-06, "logits/chosen": 0.37838277220726013, "logits/rejected": 0.9560055732727051, "logps/chosen": -490.0171813964844, "logps/rejected": -867.9541015625, "loss": 0.2098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.327396869659424, "rewards/margins": 4.201695919036865, "rewards/margins_max": 6.30963659286499, "rewards/margins_min": 2.0937557220458984, "rewards/margins_std": 2.9810779094696045, "rewards/rejected": -6.529093265533447, "step": 440 }, { "epoch": 0.19, "grad_norm": 0.416015625, "learning_rate": 4.890089453835894e-06, "logits/chosen": 0.16315485537052155, "logits/rejected": 0.8696213960647583, "logps/chosen": -516.959716796875, "logps/rejected": -999.4393310546875, "loss": 0.1884, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.290456533432007, "rewards/margins": 5.147913932800293, "rewards/margins_max": 8.1465425491333, "rewards/margins_min": 2.1492867469787598, "rewards/margins_std": 4.240699768066406, "rewards/rejected": -7.438370704650879, "step": 450 }, { "epoch": 0.19, "grad_norm": 0.828125, "learning_rate": 4.879301902018592e-06, "logits/chosen": 0.2864415943622589, "logits/rejected": 0.7803254127502441, "logps/chosen": -533.4550170898438, "logps/rejected": -1063.6312255859375, "loss": 0.2423, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9031295776367188, "rewards/margins": 5.533447265625, "rewards/margins_max": 8.971426010131836, "rewards/margins_min": 2.0954694747924805, "rewards/margins_std": 4.862034797668457, "rewards/rejected": -8.436576843261719, "step": 460 }, { "epoch": 0.19, "grad_norm": 7.3125, "learning_rate": 4.868022493114887e-06, "logits/chosen": 0.33959221839904785, "logits/rejected": 1.040248155593872, "logps/chosen": -664.7828369140625, "logps/rejected": -1284.1910400390625, "loss": 0.1801, "rewards/accuracies": 0.9375, "rewards/chosen": -4.016963958740234, "rewards/margins": 6.7210187911987305, "rewards/margins_max": 10.781229019165039, "rewards/margins_min": 2.6608097553253174, "rewards/margins_std": 5.742003440856934, "rewards/rejected": -10.737983703613281, "step": 470 }, { "epoch": 0.2, "grad_norm": 1.8828125, "learning_rate": 4.856253558841153e-06, "logits/chosen": 0.43446415662765503, "logits/rejected": 1.01176118850708, "logps/chosen": -664.5281982421875, "logps/rejected": -1319.181640625, "loss": 0.3663, "rewards/accuracies": 0.9375, "rewards/chosen": -4.380959987640381, "rewards/margins": 6.677268028259277, "rewards/margins_max": 10.735626220703125, "rewards/margins_min": 2.618910074234009, "rewards/margins_std": 5.739384651184082, "rewards/rejected": -11.0582275390625, "step": 480 }, { "epoch": 0.2, "grad_norm": 0.9453125, "learning_rate": 4.843997532110051e-06, "logits/chosen": 0.4099550247192383, "logits/rejected": 0.9675588607788086, "logps/chosen": -634.2232666015625, "logps/rejected": -1535.5347900390625, "loss": 0.1502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.844944477081299, "rewards/margins": 9.073770523071289, "rewards/margins_max": 14.664604187011719, "rewards/margins_min": 3.482936143875122, "rewards/margins_std": 7.906632423400879, "rewards/rejected": -12.91871452331543, "step": 490 }, { "epoch": 0.21, "grad_norm": 0.341796875, "learning_rate": 4.831256946527591e-06, "logits/chosen": 0.41468414664268494, "logits/rejected": 1.1351321935653687, "logps/chosen": -591.6776123046875, "logps/rejected": -1291.275146484375, "loss": 0.2315, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6611881256103516, "rewards/margins": 7.434420585632324, "rewards/margins_max": 11.837034225463867, "rewards/margins_min": 3.0318071842193604, "rewards/margins_std": 6.226236820220947, "rewards/rejected": -11.095609664916992, "step": 500 }, { "epoch": 0.21, "grad_norm": 1.0625, "learning_rate": 4.818034435869377e-06, "logits/chosen": 0.5877698063850403, "logits/rejected": 1.2467072010040283, "logps/chosen": -623.4757080078125, "logps/rejected": -1281.064697265625, "loss": 0.1391, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.8208084106445312, "rewards/margins": 6.917235374450684, "rewards/margins_max": 10.565729141235352, "rewards/margins_min": 3.268742322921753, "rewards/margins_std": 5.159748077392578, "rewards/rejected": -10.738044738769531, "step": 510 }, { "epoch": 0.21, "grad_norm": 3.890625, "learning_rate": 4.804332733536141e-06, "logits/chosen": 0.45656394958496094, "logits/rejected": 1.1674026250839233, "logps/chosen": -701.67041015625, "logps/rejected": -1496.7086181640625, "loss": 0.2265, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.459706783294678, "rewards/margins": 8.300594329833984, "rewards/margins_max": 13.440702438354492, "rewards/margins_min": 3.1604866981506348, "rewards/margins_std": 7.2692108154296875, "rewards/rejected": -12.76030158996582, "step": 520 }, { "epoch": 0.22, "grad_norm": 0.75390625, "learning_rate": 4.790154671988696e-06, "logits/chosen": 0.707282304763794, "logits/rejected": 1.2839213609695435, "logps/chosen": -713.0794067382812, "logps/rejected": -1470.809326171875, "loss": 0.1294, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.646553993225098, "rewards/margins": 7.7992706298828125, "rewards/margins_max": 12.847709655761719, "rewards/margins_min": 2.7508316040039062, "rewards/margins_std": 7.1395721435546875, "rewards/rejected": -12.44582462310791, "step": 530 }, { "epoch": 0.22, "grad_norm": 22.625, "learning_rate": 4.775503182162386e-06, "logits/chosen": 0.6817615032196045, "logits/rejected": 1.3176844120025635, "logps/chosen": -850.7849731445312, "logps/rejected": -1690.505126953125, "loss": 0.253, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.02855110168457, "rewards/margins": 8.858359336853027, "rewards/margins_max": 13.857948303222656, "rewards/margins_min": 3.858771800994873, "rewards/margins_std": 7.0704851150512695, "rewards/rejected": -14.886911392211914, "step": 540 }, { "epoch": 0.23, "grad_norm": 1.8671875, "learning_rate": 4.7603812928612e-06, "logits/chosen": 0.4829481542110443, "logits/rejected": 1.1649879217147827, "logps/chosen": -747.2049560546875, "logps/rejected": -1385.47216796875, "loss": 0.4339, "rewards/accuracies": 0.9375, "rewards/chosen": -4.999864101409912, "rewards/margins": 6.820773124694824, "rewards/margins_max": 10.468523025512695, "rewards/margins_min": 3.1730237007141113, "rewards/margins_std": 5.158697605133057, "rewards/rejected": -11.820637702941895, "step": 550 }, { "epoch": 0.23, "grad_norm": 1.5078125, "learning_rate": 4.744792130131653e-06, "logits/chosen": 0.3002074360847473, "logits/rejected": 1.0043690204620361, "logps/chosen": -662.5621948242188, "logps/rejected": -1360.820068359375, "loss": 0.1538, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9544594287872314, "rewards/margins": 7.2028093338012695, "rewards/margins_max": 11.087037086486816, "rewards/margins_min": 3.318582534790039, "rewards/margins_std": 5.493126392364502, "rewards/rejected": -11.157269477844238, "step": 560 }, { "epoch": 0.23, "grad_norm": 0.63671875, "learning_rate": 4.728738916616552e-06, "logits/chosen": 0.5242341756820679, "logits/rejected": 1.1999857425689697, "logps/chosen": -646.2457885742188, "logps/rejected": -1409.1556396484375, "loss": 0.2874, "rewards/accuracies": 0.9375, "rewards/chosen": -4.057827949523926, "rewards/margins": 7.776298522949219, "rewards/margins_max": 11.975003242492676, "rewards/margins_min": 3.57759428024292, "rewards/margins_std": 5.937864780426025, "rewards/rejected": -11.834127426147461, "step": 570 }, { "epoch": 0.24, "grad_norm": 4.71875, "learning_rate": 4.712224970888801e-06, "logits/chosen": 0.580299973487854, "logits/rejected": 1.3875830173492432, "logps/chosen": -721.7586059570312, "logps/rejected": -1617.5888671875, "loss": 0.2512, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.572092056274414, "rewards/margins": 9.203888893127441, "rewards/margins_max": 14.512364387512207, "rewards/margins_min": 3.8954148292541504, "rewards/margins_std": 7.507315635681152, "rewards/rejected": -13.775980949401855, "step": 580 }, { "epoch": 0.24, "grad_norm": 4.375, "learning_rate": 4.69525370676538e-06, "logits/chosen": 0.5429633855819702, "logits/rejected": 1.3331568241119385, "logps/chosen": -695.3401489257812, "logps/rejected": -1387.588134765625, "loss": 0.2468, "rewards/accuracies": 0.9375, "rewards/chosen": -4.615943908691406, "rewards/margins": 7.4264984130859375, "rewards/margins_max": 11.769147872924805, "rewards/margins_min": 3.0838465690612793, "rewards/margins_std": 6.141435623168945, "rewards/rejected": -12.042441368103027, "step": 590 }, { "epoch": 0.25, "grad_norm": 2.859375, "learning_rate": 4.677828632601625e-06, "logits/chosen": 0.49036288261413574, "logits/rejected": 1.2113770246505737, "logps/chosen": -631.5177001953125, "logps/rejected": -1210.844482421875, "loss": 0.128, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.760044813156128, "rewards/margins": 6.260881423950195, "rewards/margins_max": 9.719388961791992, "rewards/margins_min": 2.8023738861083984, "rewards/margins_std": 4.891068458557129, "rewards/rejected": -10.020925521850586, "step": 600 }, { "epoch": 0.25, "grad_norm": 6.875, "learning_rate": 4.65995335056597e-06, "logits/chosen": 0.4661685824394226, "logits/rejected": 1.1997615098953247, "logps/chosen": -697.4072265625, "logps/rejected": -1316.599853515625, "loss": 0.2737, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.225058555603027, "rewards/margins": 6.9119462966918945, "rewards/margins_max": 10.23686408996582, "rewards/margins_min": 3.5870280265808105, "rewards/margins_std": 4.702144145965576, "rewards/rejected": -11.137005805969238, "step": 610 }, { "epoch": 0.26, "grad_norm": 0.90234375, "learning_rate": 4.6416315558952985e-06, "logits/chosen": 0.5700492858886719, "logits/rejected": 1.2297275066375732, "logps/chosen": -648.19482421875, "logps/rejected": -1285.586181640625, "loss": 0.2359, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.9675934314727783, "rewards/margins": 6.879029273986816, "rewards/margins_max": 11.239900588989258, "rewards/margins_min": 2.518155336380005, "rewards/margins_std": 6.167205810546875, "rewards/rejected": -10.846620559692383, "step": 620 }, { "epoch": 0.26, "grad_norm": 0.6640625, "learning_rate": 4.622867036131045e-06, "logits/chosen": 0.4446844160556793, "logits/rejected": 1.1179345846176147, "logps/chosen": -699.2020263671875, "logps/rejected": -1307.1138916015625, "loss": 0.1037, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.177753925323486, "rewards/margins": 6.502650260925293, "rewards/margins_max": 10.074499130249023, "rewards/margins_min": 2.9308011531829834, "rewards/margins_std": 5.051357746124268, "rewards/rejected": -10.680402755737305, "step": 630 }, { "epoch": 0.26, "grad_norm": 1.984375, "learning_rate": 4.60366367033623e-06, "logits/chosen": 0.40803995728492737, "logits/rejected": 1.114776849746704, "logps/chosen": -723.8427734375, "logps/rejected": -1445.0594482421875, "loss": 0.1899, "rewards/accuracies": 0.9375, "rewards/chosen": -4.7852630615234375, "rewards/margins": 7.497411251068115, "rewards/margins_max": 11.162347793579102, "rewards/margins_min": 3.8324737548828125, "rewards/margins_std": 5.183003902435303, "rewards/rejected": -12.282673835754395, "step": 640 }, { "epoch": 0.27, "grad_norm": 1.1015625, "learning_rate": 4.5840254282935604e-06, "logits/chosen": 0.5937483310699463, "logits/rejected": 1.2330735921859741, "logps/chosen": -796.9608154296875, "logps/rejected": -1525.527587890625, "loss": 0.2084, "rewards/accuracies": 0.9375, "rewards/chosen": -5.397136688232422, "rewards/margins": 7.485539436340332, "rewards/margins_max": 11.793124198913574, "rewards/margins_min": 3.177953004837036, "rewards/margins_std": 6.091846942901611, "rewards/rejected": -12.882675170898438, "step": 650 }, { "epoch": 0.27, "grad_norm": 4.875, "learning_rate": 4.56395636968479e-06, "logits/chosen": 0.6977173089981079, "logits/rejected": 1.2386213541030884, "logps/chosen": -645.6939697265625, "logps/rejected": -1467.990234375, "loss": 0.1096, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.246734619140625, "rewards/margins": 8.112640380859375, "rewards/margins_max": 12.321252822875977, "rewards/margins_min": 3.904027223587036, "rewards/margins_std": 5.951877593994141, "rewards/rejected": -12.359376907348633, "step": 660 }, { "epoch": 0.28, "grad_norm": 0.333984375, "learning_rate": 4.543460643251481e-06, "logits/chosen": 0.5177757740020752, "logits/rejected": 1.1193509101867676, "logps/chosen": -690.61572265625, "logps/rejected": -1552.57568359375, "loss": 0.1118, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.566229820251465, "rewards/margins": 8.843162536621094, "rewards/margins_max": 13.59516429901123, "rewards/margins_min": 4.091159820556641, "rewards/margins_std": 6.720346927642822, "rewards/rejected": -13.409391403198242, "step": 670 }, { "epoch": 0.28, "grad_norm": 1.578125, "learning_rate": 4.522542485937369e-06, "logits/chosen": 0.6444950103759766, "logits/rejected": 1.3874423503875732, "logps/chosen": -798.6126708984375, "logps/rejected": -1724.8717041015625, "loss": 0.1363, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.293717861175537, "rewards/margins": 9.743268966674805, "rewards/margins_max": 15.012044906616211, "rewards/margins_min": 4.474490165710449, "rewards/margins_std": 7.451178073883057, "rewards/rejected": -15.036985397338867, "step": 680 }, { "epoch": 0.28, "grad_norm": 1.6953125, "learning_rate": 4.5012062220124845e-06, "logits/chosen": 0.5247820019721985, "logits/rejected": 1.264107346534729, "logps/chosen": -724.8851928710938, "logps/rejected": -1684.8916015625, "loss": 0.1534, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.79865026473999, "rewards/margins": 9.892607688903809, "rewards/margins_max": 15.237088203430176, "rewards/margins_min": 4.548129081726074, "rewards/margins_std": 7.558236122131348, "rewards/rejected": -14.691259384155273, "step": 690 }, { "epoch": 0.29, "grad_norm": 0.28125, "learning_rate": 4.479456262179228e-06, "logits/chosen": 0.5434385538101196, "logits/rejected": 1.2754974365234375, "logps/chosen": -826.7483520507812, "logps/rejected": -1494.5345458984375, "loss": 0.1545, "rewards/accuracies": 0.9375, "rewards/chosen": -5.704430103302002, "rewards/margins": 7.395480155944824, "rewards/margins_max": 11.325884819030762, "rewards/margins_min": 3.465075969696045, "rewards/margins_std": 5.558432102203369, "rewards/rejected": -13.099909782409668, "step": 700 }, { "epoch": 0.29, "grad_norm": 0.65234375, "learning_rate": 4.4572971026605726e-06, "logits/chosen": 0.5515539646148682, "logits/rejected": 1.3576513528823853, "logps/chosen": -805.1383666992188, "logps/rejected": -1752.7252197265625, "loss": 0.1508, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.342719078063965, "rewards/margins": 9.904947280883789, "rewards/margins_max": 14.326292037963867, "rewards/margins_min": 5.4836015701293945, "rewards/margins_std": 6.252727031707764, "rewards/rejected": -15.24766731262207, "step": 710 }, { "epoch": 0.3, "grad_norm": 0.408203125, "learning_rate": 4.434733324270592e-06, "logits/chosen": 0.5185344815254211, "logits/rejected": 1.1416656970977783, "logps/chosen": -690.1055908203125, "logps/rejected": -1510.14453125, "loss": 0.1959, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.4360551834106445, "rewards/margins": 8.723947525024414, "rewards/margins_max": 12.72030258178711, "rewards/margins_min": 4.727590084075928, "rewards/margins_std": 5.651700973510742, "rewards/rejected": -13.160001754760742, "step": 720 }, { "epoch": 0.3, "grad_norm": 0.85546875, "learning_rate": 4.411769591467497e-06, "logits/chosen": 0.4622286856174469, "logits/rejected": 1.096407175064087, "logps/chosen": -706.9508056640625, "logps/rejected": -1390.883544921875, "loss": 0.1088, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.562924385070801, "rewards/margins": 7.009686470031738, "rewards/margins_max": 10.363374710083008, "rewards/margins_min": 3.6559970378875732, "rewards/margins_std": 4.742833137512207, "rewards/rejected": -11.572611808776855, "step": 730 }, { "epoch": 0.3, "grad_norm": 1.46875, "learning_rate": 4.3884106513893895e-06, "logits/chosen": 0.5636991262435913, "logits/rejected": 1.2218422889709473, "logps/chosen": -723.6990966796875, "logps/rejected": -1594.478515625, "loss": 0.1631, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.5525712966918945, "rewards/margins": 9.058218002319336, "rewards/margins_max": 13.891519546508789, "rewards/margins_min": 4.224917411804199, "rewards/margins_std": 6.835320949554443, "rewards/rejected": -13.61078929901123, "step": 740 }, { "epoch": 0.31, "grad_norm": 0.984375, "learning_rate": 4.364661332872913e-06, "logits/chosen": 0.4284195005893707, "logits/rejected": 1.1675077676773071, "logps/chosen": -757.3233642578125, "logps/rejected": -1814.9117431640625, "loss": 0.1645, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.945916652679443, "rewards/margins": 11.039398193359375, "rewards/margins_max": 16.22256851196289, "rewards/margins_min": 5.856227874755859, "rewards/margins_std": 7.330111026763916, "rewards/rejected": -15.985315322875977, "step": 750 }, { "epoch": 0.31, "grad_norm": 0.494140625, "learning_rate": 4.340526545455016e-06, "logits/chosen": 0.5042354464530945, "logits/rejected": 1.2818940877914429, "logps/chosen": -712.5407104492188, "logps/rejected": -1623.8824462890625, "loss": 0.1499, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.52707576751709, "rewards/margins": 9.589614868164062, "rewards/margins_max": 15.405502319335938, "rewards/margins_min": 3.77372670173645, "rewards/margins_std": 8.224907875061035, "rewards/rejected": -14.116689682006836, "step": 760 }, { "epoch": 0.32, "grad_norm": 4.5, "learning_rate": 4.31601127835805e-06, "logits/chosen": 0.4573752284049988, "logits/rejected": 1.2255313396453857, "logps/chosen": -803.6546630859375, "logps/rejected": -1744.1165771484375, "loss": 0.1508, "rewards/accuracies": 0.9375, "rewards/chosen": -4.967880725860596, "rewards/margins": 10.099109649658203, "rewards/margins_max": 15.592402458190918, "rewards/margins_min": 4.605815887451172, "rewards/margins_std": 7.768690586090088, "rewards/rejected": -15.066988945007324, "step": 770 }, { "epoch": 0.32, "grad_norm": 1.296875, "learning_rate": 4.291120599458366e-06, "logits/chosen": 0.6284778118133545, "logits/rejected": 1.3736778497695923, "logps/chosen": -744.2955322265625, "logps/rejected": -1689.0992431640625, "loss": 0.108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.914001941680908, "rewards/margins": 9.785839080810547, "rewards/margins_max": 14.224902153015137, "rewards/margins_min": 5.346776008605957, "rewards/margins_std": 6.2777838706970215, "rewards/rejected": -14.69983959197998, "step": 780 }, { "epoch": 0.33, "grad_norm": 0.83203125, "learning_rate": 4.265859654238676e-06, "logits/chosen": 0.518182635307312, "logits/rejected": 1.266416311264038, "logps/chosen": -795.889404296875, "logps/rejected": -1599.6748046875, "loss": 0.1042, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.121318340301514, "rewards/margins": 8.590972900390625, "rewards/margins_max": 12.986808776855469, "rewards/margins_min": 4.195137023925781, "rewards/margins_std": 6.216650485992432, "rewards/rejected": -13.71229076385498, "step": 790 }, { "epoch": 0.33, "grad_norm": 3.09375, "learning_rate": 4.240233664724358e-06, "logits/chosen": 0.5838888883590698, "logits/rejected": 1.3169996738433838, "logps/chosen": -767.9039916992188, "logps/rejected": -1709.415283203125, "loss": 0.1546, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.298363208770752, "rewards/margins": 9.643119812011719, "rewards/margins_max": 14.92499828338623, "rewards/margins_min": 4.361241340637207, "rewards/margins_std": 7.469703674316406, "rewards/rejected": -14.941482543945312, "step": 800 }, { "epoch": 0.33, "grad_norm": 0.5859375, "learning_rate": 4.2142479284039445e-06, "logits/chosen": 0.5468761920928955, "logits/rejected": 1.227432131767273, "logps/chosen": -770.5872802734375, "logps/rejected": -1546.19384765625, "loss": 0.0909, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.143388748168945, "rewards/margins": 8.180723190307617, "rewards/margins_max": 13.141166687011719, "rewards/margins_min": 3.2202792167663574, "rewards/margins_std": 7.015126705169678, "rewards/rejected": -13.324111938476562, "step": 810 }, { "epoch": 0.34, "grad_norm": 2.84375, "learning_rate": 4.187907817134005e-06, "logits/chosen": 0.5028406381607056, "logits/rejected": 1.2698485851287842, "logps/chosen": -769.3389282226562, "logps/rejected": -2071.339111328125, "loss": 0.0668, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.078272342681885, "rewards/margins": 13.281135559082031, "rewards/margins_max": 18.268909454345703, "rewards/margins_min": 8.293363571166992, "rewards/margins_std": 7.053775787353516, "rewards/rejected": -18.359407424926758, "step": 820 }, { "epoch": 0.34, "grad_norm": 0.96484375, "learning_rate": 4.161218776028661e-06, "logits/chosen": 0.4837300181388855, "logits/rejected": 1.2130780220031738, "logps/chosen": -780.1266479492188, "logps/rejected": -2050.310302734375, "loss": 0.2191, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.4014573097229, "rewards/margins": 12.880502700805664, "rewards/margins_max": 20.169330596923828, "rewards/margins_min": 5.591673851013184, "rewards/margins_std": 10.30795955657959, "rewards/rejected": -18.281957626342773, "step": 830 }, { "epoch": 0.35, "grad_norm": 0.443359375, "learning_rate": 4.134186322333951e-06, "logits/chosen": 0.5044664144515991, "logits/rejected": 1.2629055976867676, "logps/chosen": -710.2357788085938, "logps/rejected": -1879.140625, "loss": 0.1806, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.55966329574585, "rewards/margins": 11.992768287658691, "rewards/margins_max": 17.50288200378418, "rewards/margins_min": 6.482656002044678, "rewards/margins_std": 7.79247522354126, "rewards/rejected": -16.552433013916016, "step": 840 }, { "epoch": 0.35, "grad_norm": 2.796875, "learning_rate": 4.106816044287292e-06, "logits/chosen": 0.5818988084793091, "logits/rejected": 1.2744948863983154, "logps/chosen": -702.9332885742188, "logps/rejected": -1656.512939453125, "loss": 0.1058, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6068434715271, "rewards/margins": 9.725323677062988, "rewards/margins_max": 13.76390266418457, "rewards/margins_min": 5.68674373626709, "rewards/margins_std": 5.711414337158203, "rewards/rejected": -14.332165718078613, "step": 850 }, { "epoch": 0.35, "grad_norm": 0.37109375, "learning_rate": 4.079113599962257e-06, "logits/chosen": 0.6045584082603455, "logits/rejected": 1.40791916847229, "logps/chosen": -795.3938598632812, "logps/rejected": -1809.0947265625, "loss": 0.0772, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.249255657196045, "rewards/margins": 10.42540168762207, "rewards/margins_max": 15.904159545898438, "rewards/margins_min": 4.9466447830200195, "rewards/margins_std": 7.748133182525635, "rewards/rejected": -15.674657821655273, "step": 860 }, { "epoch": 0.36, "grad_norm": 0.92578125, "learning_rate": 4.051084716098921e-06, "logits/chosen": 0.5180607438087463, "logits/rejected": 1.220595121383667, "logps/chosen": -676.9715576171875, "logps/rejected": -1734.108642578125, "loss": 0.1499, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.324917793273926, "rewards/margins": 10.716940879821777, "rewards/margins_max": 15.53416633605957, "rewards/margins_min": 5.899716377258301, "rewards/margins_std": 6.812585353851318, "rewards/rejected": -15.04185962677002, "step": 870 }, { "epoch": 0.36, "grad_norm": 4.46875, "learning_rate": 4.022735186920008e-06, "logits/chosen": 0.487175315618515, "logits/rejected": 1.2153656482696533, "logps/chosen": -689.3336791992188, "logps/rejected": -1664.072998046875, "loss": 0.1004, "rewards/accuracies": 0.9375, "rewards/chosen": -4.373027801513672, "rewards/margins": 10.115726470947266, "rewards/margins_max": 15.508180618286133, "rewards/margins_min": 4.72327184677124, "rewards/margins_std": 7.626082420349121, "rewards/rejected": -14.488754272460938, "step": 880 }, { "epoch": 0.37, "grad_norm": 7.4375, "learning_rate": 3.994070872933097e-06, "logits/chosen": 0.4529595375061035, "logits/rejected": 1.1865074634552002, "logps/chosen": -645.4569091796875, "logps/rejected": -1371.4666748046875, "loss": 0.1495, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.04251766204834, "rewards/margins": 7.7823076248168945, "rewards/margins_max": 11.296024322509766, "rewards/margins_min": 4.268589973449707, "rewards/margins_std": 4.969146251678467, "rewards/rejected": -11.824824333190918, "step": 890 }, { "epoch": 0.37, "grad_norm": 3.46875, "learning_rate": 3.965097699719109e-06, "logits/chosen": 0.5944451093673706, "logits/rejected": 1.3090002536773682, "logps/chosen": -762.5585327148438, "logps/rejected": -1599.050537109375, "loss": 0.1855, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.843676567077637, "rewards/margins": 8.63348388671875, "rewards/margins_max": 13.529146194458008, "rewards/margins_min": 3.7378222942352295, "rewards/margins_std": 6.9235124588012695, "rewards/rejected": -13.477160453796387, "step": 900 }, { "epoch": 0.37, "grad_norm": 2.5625, "learning_rate": 3.935821656707359e-06, "logits/chosen": 0.5119448304176331, "logits/rejected": 1.1734087467193604, "logps/chosen": -652.0521850585938, "logps/rejected": -1535.1744384765625, "loss": 0.1104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.284106254577637, "rewards/margins": 8.888346672058105, "rewards/margins_max": 13.543110847473145, "rewards/margins_min": 4.233582496643066, "rewards/margins_std": 6.582831382751465, "rewards/rejected": -13.172452926635742, "step": 910 }, { "epoch": 0.38, "grad_norm": 0.6640625, "learning_rate": 3.9062487959374e-06, "logits/chosen": 0.41363000869750977, "logits/rejected": 1.170240879058838, "logps/chosen": -667.77783203125, "logps/rejected": -1525.485595703125, "loss": 0.1262, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.287820816040039, "rewards/margins": 9.004143714904785, "rewards/margins_max": 13.4224214553833, "rewards/margins_min": 4.585866451263428, "rewards/margins_std": 6.248387336730957, "rewards/rejected": -13.291964530944824, "step": 920 }, { "epoch": 0.38, "grad_norm": 2.75, "learning_rate": 3.8763852308079244e-06, "logits/chosen": 0.5807031393051147, "logits/rejected": 1.292966365814209, "logps/chosen": -698.1134643554688, "logps/rejected": -1579.521240234375, "loss": 0.1198, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.37972354888916, "rewards/margins": 9.12094497680664, "rewards/margins_max": 14.104803085327148, "rewards/margins_min": 4.137085914611816, "rewards/margins_std": 7.048240661621094, "rewards/rejected": -13.5006685256958, "step": 930 }, { "epoch": 0.39, "grad_norm": 0.875, "learning_rate": 3.8462371348129805e-06, "logits/chosen": 0.539486289024353, "logits/rejected": 1.2316633462905884, "logps/chosen": -694.4327392578125, "logps/rejected": -1500.2738037109375, "loss": 0.1419, "rewards/accuracies": 0.9375, "rewards/chosen": -4.638047218322754, "rewards/margins": 8.356195449829102, "rewards/margins_max": 12.973353385925293, "rewards/margins_min": 3.7390365600585938, "rewards/margins_std": 6.5296478271484375, "rewards/rejected": -12.994241714477539, "step": 940 }, { "epoch": 0.39, "grad_norm": 0.10693359375, "learning_rate": 3.815810740265769e-06, "logits/chosen": 0.5020047426223755, "logits/rejected": 1.345840573310852, "logps/chosen": -702.5892333984375, "logps/rejected": -1638.1595458984375, "loss": 0.1495, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.527231216430664, "rewards/margins": 9.787068367004395, "rewards/margins_max": 14.262059211730957, "rewards/margins_min": 5.312075614929199, "rewards/margins_std": 6.328594207763672, "rewards/rejected": -14.314300537109375, "step": 950 }, { "epoch": 0.4, "grad_norm": 6.03125, "learning_rate": 3.785112337010284e-06, "logits/chosen": 0.6428021192550659, "logits/rejected": 1.342193365097046, "logps/chosen": -698.9358520507812, "logps/rejected": -1490.9117431640625, "loss": 0.1081, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.455615043640137, "rewards/margins": 8.250611305236816, "rewards/margins_max": 11.710147857666016, "rewards/margins_min": 4.791074275970459, "rewards/margins_std": 4.892523765563965, "rewards/rejected": -12.706225395202637, "step": 960 }, { "epoch": 0.4, "grad_norm": 0.6328125, "learning_rate": 3.7541482711210474e-06, "logits/chosen": 0.49654191732406616, "logits/rejected": 1.2780824899673462, "logps/chosen": -770.838623046875, "logps/rejected": -1911.4193115234375, "loss": 0.1141, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.841742038726807, "rewards/margins": 11.952981948852539, "rewards/margins_max": 17.861114501953125, "rewards/margins_min": 6.044848442077637, "rewards/margins_std": 8.355361938476562, "rewards/rejected": -16.794721603393555, "step": 970 }, { "epoch": 0.4, "grad_norm": 1.9765625, "learning_rate": 3.722924943591232e-06, "logits/chosen": 0.5268442034721375, "logits/rejected": 1.2990949153900146, "logps/chosen": -794.0743408203125, "logps/rejected": -1849.2060546875, "loss": 0.0905, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.302125930786133, "rewards/margins": 11.0300874710083, "rewards/margins_max": 15.024978637695312, "rewards/margins_min": 7.035195827484131, "rewards/margins_std": 5.649630069732666, "rewards/rejected": -16.33221435546875, "step": 980 }, { "epoch": 0.41, "grad_norm": 1.3203125, "learning_rate": 3.691448809009427e-06, "logits/chosen": 0.627538800239563, "logits/rejected": 1.3176391124725342, "logps/chosen": -826.3603515625, "logps/rejected": -1807.1185302734375, "loss": 0.165, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.355879783630371, "rewards/margins": 10.351202011108398, "rewards/margins_max": 15.11308479309082, "rewards/margins_min": 5.589318752288818, "rewards/margins_std": 6.73431921005249, "rewards/rejected": -15.70708179473877, "step": 990 }, { "epoch": 0.41, "grad_norm": 0.84375, "learning_rate": 3.659726374225323e-06, "logits/chosen": 0.47057127952575684, "logits/rejected": 1.1657397747039795, "logps/chosen": -652.9990234375, "logps/rejected": -1534.808349609375, "loss": 0.0925, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.1840925216674805, "rewards/margins": 8.879453659057617, "rewards/margins_max": 13.41234302520752, "rewards/margins_min": 4.346565246582031, "rewards/margins_std": 6.410472869873047, "rewards/rejected": -13.063547134399414, "step": 1000 }, { "epoch": 0.42, "grad_norm": 1.1171875, "learning_rate": 3.6277641970045975e-06, "logits/chosen": 0.5770415663719177, "logits/rejected": 1.3713629245758057, "logps/chosen": -804.15576171875, "logps/rejected": -1777.1861572265625, "loss": 0.1385, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.3897552490234375, "rewards/margins": 10.179000854492188, "rewards/margins_max": 15.141960144042969, "rewards/margins_min": 5.216042995452881, "rewards/margins_std": 7.018682956695557, "rewards/rejected": -15.568756103515625, "step": 1010 }, { "epoch": 0.42, "grad_norm": 1.2265625, "learning_rate": 3.5955688846732677e-06, "logits/chosen": 0.5724108815193176, "logits/rejected": 1.2440688610076904, "logps/chosen": -775.36328125, "logps/rejected": -2097.964599609375, "loss": 0.1097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.414142608642578, "rewards/margins": 13.599000930786133, "rewards/margins_max": 21.20298957824707, "rewards/margins_min": 5.995011329650879, "rewards/margins_std": 10.753664016723633, "rewards/rejected": -19.013145446777344, "step": 1020 }, { "epoch": 0.42, "grad_norm": 3.421875, "learning_rate": 3.563147092751807e-06, "logits/chosen": 0.5183674097061157, "logits/rejected": 1.306438684463501, "logps/chosen": -913.7030029296875, "logps/rejected": -1972.9140625, "loss": 0.087, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.088994026184082, "rewards/margins": 11.24118423461914, "rewards/margins_max": 17.46712875366211, "rewards/margins_min": 5.0152411460876465, "rewards/margins_std": 8.804813385009766, "rewards/rejected": -17.330181121826172, "step": 1030 }, { "epoch": 0.43, "grad_norm": 2.65625, "learning_rate": 3.5305055235792906e-06, "logits/chosen": 0.5217747688293457, "logits/rejected": 1.2815120220184326, "logps/chosen": -753.7869262695312, "logps/rejected": -1882.3675537109375, "loss": 0.1165, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.9851508140563965, "rewards/margins": 11.520073890686035, "rewards/margins_max": 17.029399871826172, "rewards/margins_min": 6.010746955871582, "rewards/margins_std": 7.791365623474121, "rewards/rejected": -16.505224227905273, "step": 1040 }, { "epoch": 0.43, "grad_norm": 9.625, "learning_rate": 3.4976509249278673e-06, "logits/chosen": 0.6170846819877625, "logits/rejected": 1.3059895038604736, "logps/chosen": -820.76708984375, "logps/rejected": -1968.9228515625, "loss": 0.1944, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.856749534606934, "rewards/margins": 11.451845169067383, "rewards/margins_max": 17.031482696533203, "rewards/margins_min": 5.8722076416015625, "rewards/margins_std": 7.890799045562744, "rewards/rejected": -17.30859375, "step": 1050 }, { "epoch": 0.44, "grad_norm": 10.0, "learning_rate": 3.4645900886078388e-06, "logits/chosen": 0.47162705659866333, "logits/rejected": 1.2156587839126587, "logps/chosen": -745.6646728515625, "logps/rejected": -1683.563232421875, "loss": 0.1421, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.883286476135254, "rewards/margins": 9.705187797546387, "rewards/margins_max": 14.16901683807373, "rewards/margins_min": 5.241359710693359, "rewards/margins_std": 6.312806129455566, "rewards/rejected": -14.588473320007324, "step": 1060 }, { "epoch": 0.44, "grad_norm": 1.3515625, "learning_rate": 3.4313298490636328e-06, "logits/chosen": 0.542891800403595, "logits/rejected": 1.327044129371643, "logps/chosen": -745.6140747070312, "logps/rejected": -1832.4476318359375, "loss": 0.1122, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.060235023498535, "rewards/margins": 11.030832290649414, "rewards/margins_max": 15.847732543945312, "rewards/margins_min": 6.21393346786499, "rewards/margins_std": 6.812124729156494, "rewards/rejected": -16.091068267822266, "step": 1070 }, { "epoch": 0.44, "grad_norm": 1.40625, "learning_rate": 3.3978770819609647e-06, "logits/chosen": 0.5193914175033569, "logits/rejected": 1.2432626485824585, "logps/chosen": -718.4923095703125, "logps/rejected": -1824.8092041015625, "loss": 0.0604, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.907055377960205, "rewards/margins": 11.238981246948242, "rewards/margins_max": 16.069297790527344, "rewards/margins_min": 6.40866756439209, "rewards/margins_std": 6.831096649169922, "rewards/rejected": -16.146038055419922, "step": 1080 }, { "epoch": 0.45, "grad_norm": 2.0, "learning_rate": 3.364238702765477e-06, "logits/chosen": 0.6283344030380249, "logits/rejected": 1.1587202548980713, "logps/chosen": -784.2772216796875, "logps/rejected": -1649.103515625, "loss": 0.093, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.440009117126465, "rewards/margins": 8.746278762817383, "rewards/margins_max": 12.856730461120605, "rewards/margins_min": 4.63582706451416, "rewards/margins_std": 5.813055992126465, "rewards/rejected": -14.186288833618164, "step": 1090 }, { "epoch": 0.45, "grad_norm": 0.11669921875, "learning_rate": 3.3304216653131566e-06, "logits/chosen": 0.4906349778175354, "logits/rejected": 1.1233699321746826, "logps/chosen": -726.7098388671875, "logps/rejected": -1864.184326171875, "loss": 0.0985, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.014734268188477, "rewards/margins": 11.505403518676758, "rewards/margins_max": 17.862531661987305, "rewards/margins_min": 5.148275375366211, "rewards/margins_std": 8.990338325500488, "rewards/rejected": -16.520137786865234, "step": 1100 }, { "epoch": 0.46, "grad_norm": 0.65234375, "learning_rate": 3.2964329603728046e-06, "logits/chosen": 0.4619167447090149, "logits/rejected": 1.1618800163269043, "logps/chosen": -792.4830322265625, "logps/rejected": -1843.839111328125, "loss": 0.1262, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.372784614562988, "rewards/margins": 10.8291597366333, "rewards/margins_max": 15.854537963867188, "rewards/margins_min": 5.803778648376465, "rewards/margins_std": 7.106959342956543, "rewards/rejected": -16.201942443847656, "step": 1110 }, { "epoch": 0.46, "grad_norm": 0.59765625, "learning_rate": 3.262279614200892e-06, "logits/chosen": 0.5689177513122559, "logits/rejected": 1.27706778049469, "logps/chosen": -735.7247314453125, "logps/rejected": -1631.17578125, "loss": 0.1125, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.984173774719238, "rewards/margins": 9.399529457092285, "rewards/margins_max": 13.701881408691406, "rewards/margins_min": 5.097177505493164, "rewards/margins_std": 6.084444046020508, "rewards/rejected": -14.383702278137207, "step": 1120 }, { "epoch": 0.47, "grad_norm": 0.322265625, "learning_rate": 3.2279686870890637e-06, "logits/chosen": 0.4834915101528168, "logits/rejected": 1.2427217960357666, "logps/chosen": -703.0142822265625, "logps/rejected": -1653.974365234375, "loss": 0.0839, "rewards/accuracies": 0.9375, "rewards/chosen": -4.775191307067871, "rewards/margins": 9.554444313049316, "rewards/margins_max": 14.255029678344727, "rewards/margins_min": 4.853858470916748, "rewards/margins_std": 6.6476311683654785, "rewards/rejected": -14.329633712768555, "step": 1130 }, { "epoch": 0.47, "grad_norm": 0.57421875, "learning_rate": 3.193507271904612e-06, "logits/chosen": 0.44650688767433167, "logits/rejected": 1.2217845916748047, "logps/chosen": -858.9959106445312, "logps/rejected": -1833.328125, "loss": 0.0916, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.681185722351074, "rewards/margins": 10.469725608825684, "rewards/margins_max": 15.999313354492188, "rewards/margins_min": 4.940136909484863, "rewards/margins_std": 7.8200178146362305, "rewards/rejected": -16.150911331176758, "step": 1140 }, { "epoch": 0.47, "grad_norm": 0.3046875, "learning_rate": 3.158902492624218e-06, "logits/chosen": 0.4523468613624573, "logits/rejected": 1.2057933807373047, "logps/chosen": -841.3018798828125, "logps/rejected": -1831.9058837890625, "loss": 0.0927, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.5725226402282715, "rewards/margins": 10.594744682312012, "rewards/margins_max": 15.057345390319824, "rewards/margins_min": 6.132142543792725, "rewards/margins_std": 6.311070442199707, "rewards/rejected": -16.167264938354492, "step": 1150 }, { "epoch": 0.48, "grad_norm": 4.125, "learning_rate": 3.1241615028612563e-06, "logits/chosen": 0.5951441526412964, "logits/rejected": 1.2352155447006226, "logps/chosen": -768.4414672851562, "logps/rejected": -1707.2896728515625, "loss": 0.2036, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0751752853393555, "rewards/margins": 9.864356994628906, "rewards/margins_max": 14.157461166381836, "rewards/margins_min": 5.57125186920166, "rewards/margins_std": 6.071366786956787, "rewards/rejected": -14.939532279968262, "step": 1160 }, { "epoch": 0.48, "grad_norm": 0.765625, "learning_rate": 3.0892914843869838e-06, "logits/chosen": 0.5745668411254883, "logits/rejected": 1.3735682964324951, "logps/chosen": -716.9601440429688, "logps/rejected": -1639.5445556640625, "loss": 0.0789, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.607513904571533, "rewards/margins": 9.50660228729248, "rewards/margins_max": 14.321581840515137, "rewards/margins_min": 4.691622734069824, "rewards/margins_std": 6.809409141540527, "rewards/rejected": -14.114115715026855, "step": 1170 }, { "epoch": 0.49, "grad_norm": 0.3125, "learning_rate": 3.054299645645889e-06, "logits/chosen": 0.574237048625946, "logits/rejected": 1.1578586101531982, "logps/chosen": -723.4119262695312, "logps/rejected": -1720.607666015625, "loss": 0.1266, "rewards/accuracies": 0.9375, "rewards/chosen": -4.802746772766113, "rewards/margins": 10.177275657653809, "rewards/margins_max": 15.720603942871094, "rewards/margins_min": 4.633947372436523, "rewards/margins_std": 7.839449882507324, "rewards/rejected": -14.980023384094238, "step": 1180 }, { "epoch": 0.49, "grad_norm": 0.7578125, "learning_rate": 3.01919322026555e-06, "logits/chosen": 0.57005774974823, "logits/rejected": 1.3801429271697998, "logps/chosen": -777.7997436523438, "logps/rejected": -1871.4964599609375, "loss": 0.12, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.197485446929932, "rewards/margins": 11.358160018920898, "rewards/margins_max": 16.549087524414062, "rewards/margins_min": 6.167231559753418, "rewards/margins_std": 7.341080665588379, "rewards/rejected": -16.555644989013672, "step": 1190 }, { "epoch": 0.49, "grad_norm": 0.26171875, "learning_rate": 2.9839794655612674e-06, "logits/chosen": 0.4680374562740326, "logits/rejected": 1.2621392011642456, "logps/chosen": -701.8974609375, "logps/rejected": -1736.7503662109375, "loss": 0.1476, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.581705570220947, "rewards/margins": 10.844728469848633, "rewards/margins_max": 15.584383964538574, "rewards/margins_min": 6.105072498321533, "rewards/margins_std": 6.702885627746582, "rewards/rejected": -15.426434516906738, "step": 1200 }, { "epoch": 0.5, "grad_norm": 0.2890625, "learning_rate": 2.9486656610358143e-06, "logits/chosen": 0.48323068022727966, "logits/rejected": 1.2080551385879517, "logps/chosen": -702.0436401367188, "logps/rejected": -1731.0406494140625, "loss": 0.0973, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.697200775146484, "rewards/margins": 10.60991096496582, "rewards/margins_max": 16.56106948852539, "rewards/margins_min": 4.658753871917725, "rewards/margins_std": 8.416207313537598, "rewards/rejected": -15.307113647460938, "step": 1210 }, { "epoch": 0.5, "grad_norm": 0.78125, "learning_rate": 2.9132591068745884e-06, "logits/chosen": 0.5117800235748291, "logits/rejected": 1.158496618270874, "logps/chosen": -699.7086791992188, "logps/rejected": -1694.8929443359375, "loss": 0.118, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.589441776275635, "rewards/margins": 9.935541152954102, "rewards/margins_max": 14.52270793914795, "rewards/margins_min": 5.348374366760254, "rewards/margins_std": 6.487233638763428, "rewards/rejected": -14.524983406066895, "step": 1220 }, { "epoch": 0.51, "grad_norm": 1.34375, "learning_rate": 2.8777671224364966e-06, "logits/chosen": 0.5292683243751526, "logits/rejected": 1.3735748529434204, "logps/chosen": -793.8040771484375, "logps/rejected": -2016.2437744140625, "loss": 0.1016, "rewards/accuracies": 0.9375, "rewards/chosen": -5.2961931228637695, "rewards/margins": 12.560154914855957, "rewards/margins_max": 19.488279342651367, "rewards/margins_min": 5.632030487060547, "rewards/margins_std": 9.797847747802734, "rewards/rejected": -17.856348037719727, "step": 1230 }, { "epoch": 0.51, "grad_norm": 1.1796875, "learning_rate": 2.842197044740873e-06, "logits/chosen": 0.5125163793563843, "logits/rejected": 1.1910914182662964, "logps/chosen": -716.28271484375, "logps/rejected": -1681.4847412109375, "loss": 0.1034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.732181549072266, "rewards/margins": 9.891237258911133, "rewards/margins_max": 14.310302734375, "rewards/margins_min": 5.472168922424316, "rewards/margins_std": 6.249504566192627, "rewards/rejected": -14.623417854309082, "step": 1240 }, { "epoch": 0.51, "grad_norm": 0.408203125, "learning_rate": 2.8065562269507464e-06, "logits/chosen": 0.6009246110916138, "logits/rejected": 1.1898201704025269, "logps/chosen": -778.5814819335938, "logps/rejected": -2116.184814453125, "loss": 0.0969, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.313014030456543, "rewards/margins": 13.718805313110352, "rewards/margins_max": 19.344837188720703, "rewards/margins_min": 8.092771530151367, "rewards/margins_std": 7.956411838531494, "rewards/rejected": -19.031816482543945, "step": 1250 }, { "epoch": 0.52, "grad_norm": 1.75, "learning_rate": 2.7708520368527687e-06, "logits/chosen": 0.6829395294189453, "logits/rejected": 1.4511185884475708, "logps/chosen": -764.1370849609375, "logps/rejected": -1753.5296630859375, "loss": 0.0735, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.051453590393066, "rewards/margins": 10.619558334350586, "rewards/margins_max": 16.41791534423828, "rewards/margins_min": 4.821201324462891, "rewards/margins_std": 8.200114250183105, "rewards/rejected": -15.671010971069336, "step": 1260 }, { "epoch": 0.52, "grad_norm": 0.6875, "learning_rate": 2.735091855334122e-06, "logits/chosen": 0.5935325622558594, "logits/rejected": 1.272655963897705, "logps/chosen": -780.0508422851562, "logps/rejected": -1765.977294921875, "loss": 0.1392, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.5274786949157715, "rewards/margins": 10.1564302444458, "rewards/margins_max": 15.466066360473633, "rewards/margins_min": 4.846795082092285, "rewards/margins_std": 7.508957862854004, "rewards/rejected": -15.68390941619873, "step": 1270 }, { "epoch": 0.53, "grad_norm": 8.8125, "learning_rate": 2.6992830748567204e-06, "logits/chosen": 0.601089596748352, "logits/rejected": 1.3095543384552002, "logps/chosen": -735.0325317382812, "logps/rejected": -1644.493408203125, "loss": 0.1702, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.930995464324951, "rewards/margins": 9.481109619140625, "rewards/margins_max": 13.667490005493164, "rewards/margins_min": 5.294730186462402, "rewards/margins_std": 5.920435428619385, "rewards/rejected": -14.412104606628418, "step": 1280 }, { "epoch": 0.53, "grad_norm": 1.078125, "learning_rate": 2.6634330979290133e-06, "logits/chosen": 0.5804930925369263, "logits/rejected": 1.1953046321868896, "logps/chosen": -664.8348388671875, "logps/rejected": -1494.647216796875, "loss": 0.0935, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.399333477020264, "rewards/margins": 8.430289268493652, "rewards/margins_max": 12.645450592041016, "rewards/margins_min": 4.215127944946289, "rewards/margins_std": 5.9611382484436035, "rewards/rejected": -12.829623222351074, "step": 1290 }, { "epoch": 0.54, "grad_norm": 2.5625, "learning_rate": 2.6275493355757166e-06, "logits/chosen": 0.5969884395599365, "logits/rejected": 1.2400842905044556, "logps/chosen": -675.366943359375, "logps/rejected": -1664.972412109375, "loss": 0.1129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.368256568908691, "rewards/margins": 10.127969741821289, "rewards/margins_max": 15.879676818847656, "rewards/margins_min": 4.376260757446289, "rewards/margins_std": 8.134143829345703, "rewards/rejected": -14.496225357055664, "step": 1300 }, { "epoch": 0.54, "grad_norm": 1.234375, "learning_rate": 2.5916392058057754e-06, "logits/chosen": 0.6539616584777832, "logits/rejected": 1.2341909408569336, "logps/chosen": -646.232421875, "logps/rejected": -1600.320556640625, "loss": 0.08, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.306728363037109, "rewards/margins": 9.608429908752441, "rewards/margins_max": 13.071403503417969, "rewards/margins_min": 6.145455837249756, "rewards/margins_std": 4.897385597229004, "rewards/rejected": -13.91515827178955, "step": 1310 }, { "epoch": 0.54, "grad_norm": 0.443359375, "learning_rate": 2.5557101320789005e-06, "logits/chosen": 0.43818527460098267, "logits/rejected": 1.1889019012451172, "logps/chosen": -740.1410522460938, "logps/rejected": -1722.2867431640625, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.829034805297852, "rewards/margins": 10.092082023620605, "rewards/margins_max": 14.363845825195312, "rewards/margins_min": 5.820317268371582, "rewards/margins_std": 6.041186332702637, "rewards/rejected": -14.921116828918457, "step": 1320 }, { "epoch": 0.55, "grad_norm": 0.8515625, "learning_rate": 2.519769541770954e-06, "logits/chosen": 0.6074897646903992, "logits/rejected": 1.3447411060333252, "logps/chosen": -747.0977783203125, "logps/rejected": -1610.9615478515625, "loss": 0.1078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.645691871643066, "rewards/margins": 9.302739143371582, "rewards/margins_max": 13.083358764648438, "rewards/margins_min": 5.522116661071777, "rewards/margins_std": 5.34660530090332, "rewards/rejected": -13.948430061340332, "step": 1330 }, { "epoch": 0.55, "grad_norm": 2.4375, "learning_rate": 2.4838248646385458e-06, "logits/chosen": 0.4675142765045166, "logits/rejected": 1.2192738056182861, "logps/chosen": -713.5174560546875, "logps/rejected": -1698.417724609375, "loss": 0.1027, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.643341064453125, "rewards/margins": 10.201144218444824, "rewards/margins_max": 15.106475830078125, "rewards/margins_min": 5.29581356048584, "rewards/margins_std": 6.937185764312744, "rewards/rejected": -14.844487190246582, "step": 1340 }, { "epoch": 0.56, "grad_norm": 4.5, "learning_rate": 2.447883531283127e-06, "logits/chosen": 0.480851411819458, "logits/rejected": 1.3301368951797485, "logps/chosen": -769.1808471679688, "logps/rejected": -1694.986328125, "loss": 0.1297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.698918342590332, "rewards/margins": 10.097999572753906, "rewards/margins_max": 13.938295364379883, "rewards/margins_min": 6.257704257965088, "rewards/margins_std": 5.4309983253479, "rewards/rejected": -14.796917915344238, "step": 1350 }, { "epoch": 0.56, "grad_norm": 0.7734375, "learning_rate": 2.4119529716149126e-06, "logits/chosen": 0.5563157796859741, "logits/rejected": 1.2523400783538818, "logps/chosen": -786.4019775390625, "logps/rejected": -1475.5535888671875, "loss": 0.1001, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.191001892089844, "rewards/margins": 7.456092834472656, "rewards/margins_max": 10.319292068481445, "rewards/margins_min": 4.592894077301025, "rewards/margins_std": 4.0491743087768555, "rewards/rejected": -12.647093772888184, "step": 1360 }, { "epoch": 0.56, "grad_norm": 4.15625, "learning_rate": 2.376040613316944e-06, "logits/chosen": 0.46584218740463257, "logits/rejected": 1.1385295391082764, "logps/chosen": -699.6835327148438, "logps/rejected": -1963.931884765625, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": -4.705713272094727, "rewards/margins": 12.683069229125977, "rewards/margins_max": 18.711505889892578, "rewards/margins_min": 6.654633522033691, "rewards/margins_std": 8.525496482849121, "rewards/rejected": -17.388782501220703, "step": 1370 }, { "epoch": 0.57, "grad_norm": 0.62109375, "learning_rate": 2.340153880309619e-06, "logits/chosen": 0.6857975721359253, "logits/rejected": 1.3219093084335327, "logps/chosen": -779.4803466796875, "logps/rejected": -1705.5458984375, "loss": 0.0992, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.236274242401123, "rewards/margins": 9.680828094482422, "rewards/margins_max": 14.501251220703125, "rewards/margins_min": 4.860402584075928, "rewards/margins_std": 6.81710958480835, "rewards/rejected": -14.917101860046387, "step": 1380 }, { "epoch": 0.57, "grad_norm": 1.75, "learning_rate": 2.3043001912159892e-06, "logits/chosen": 0.5691137313842773, "logits/rejected": 1.298168659210205, "logps/chosen": -779.42431640625, "logps/rejected": -1864.0394287109375, "loss": 0.0727, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.378870010375977, "rewards/margins": 11.22581672668457, "rewards/margins_max": 16.610761642456055, "rewards/margins_min": 5.840869903564453, "rewards/margins_std": 7.615464687347412, "rewards/rejected": -16.604686737060547, "step": 1390 }, { "epoch": 0.58, "grad_norm": 16.25, "learning_rate": 2.268486957828159e-06, "logits/chosen": 0.6387670636177063, "logits/rejected": 1.1569719314575195, "logps/chosen": -729.2189331054688, "logps/rejected": -1777.036376953125, "loss": 0.213, "rewards/accuracies": 0.9375, "rewards/chosen": -5.217606544494629, "rewards/margins": 10.492830276489258, "rewards/margins_max": 15.320582389831543, "rewards/margins_min": 5.665076732635498, "rewards/margins_std": 6.827474117279053, "rewards/rejected": -15.71043586730957, "step": 1400 }, { "epoch": 0.58, "grad_norm": 1.671875, "learning_rate": 2.232721583575099e-06, "logits/chosen": 0.4919258654117584, "logits/rejected": 1.2310242652893066, "logps/chosen": -778.7653198242188, "logps/rejected": -1702.4404296875, "loss": 0.1083, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0904669761657715, "rewards/margins": 9.812942504882812, "rewards/margins_max": 14.663922309875488, "rewards/margins_min": 4.961963176727295, "rewards/margins_std": 6.860320091247559, "rewards/rejected": -14.903407096862793, "step": 1410 }, { "epoch": 0.58, "grad_norm": 3.640625, "learning_rate": 2.1970114619921804e-06, "logits/chosen": 0.5403339862823486, "logits/rejected": 1.279847264289856, "logps/chosen": -782.6492309570312, "logps/rejected": -1954.859130859375, "loss": 0.0877, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.303040504455566, "rewards/margins": 11.789950370788574, "rewards/margins_max": 17.79424476623535, "rewards/margins_min": 5.785655975341797, "rewards/margins_std": 8.491353988647461, "rewards/rejected": -17.09299087524414, "step": 1420 }, { "epoch": 0.59, "grad_norm": 1.0859375, "learning_rate": 2.1613639751927636e-06, "logits/chosen": 0.5678201913833618, "logits/rejected": 1.2467429637908936, "logps/chosen": -794.25341796875, "logps/rejected": -1776.8765869140625, "loss": 0.1397, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.573482513427734, "rewards/margins": 10.259510040283203, "rewards/margins_max": 15.224624633789062, "rewards/margins_min": 5.294394493103027, "rewards/margins_std": 7.021732330322266, "rewards/rejected": -15.832992553710938, "step": 1430 }, { "epoch": 0.59, "grad_norm": 0.3984375, "learning_rate": 2.1257864923421405e-06, "logits/chosen": 0.5252267122268677, "logits/rejected": 1.167436957359314, "logps/chosen": -733.9835205078125, "logps/rejected": -1947.4000244140625, "loss": 0.1001, "rewards/accuracies": 1.0, "rewards/chosen": -4.864560127258301, "rewards/margins": 12.287667274475098, "rewards/margins_max": 17.395977020263672, "rewards/margins_min": 7.179357051849365, "rewards/margins_std": 7.224241733551025, "rewards/rejected": -17.152225494384766, "step": 1440 }, { "epoch": 0.6, "grad_norm": 2.078125, "learning_rate": 2.0902863681341546e-06, "logits/chosen": 0.592448353767395, "logits/rejected": 1.254591703414917, "logps/chosen": -762.2680053710938, "logps/rejected": -1617.958740234375, "loss": 0.1161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.002324104309082, "rewards/margins": 8.928361892700195, "rewards/margins_max": 13.639185905456543, "rewards/margins_min": 4.2175397872924805, "rewards/margins_std": 6.662109375, "rewards/rejected": -13.930686950683594, "step": 1450 }, { "epoch": 0.6, "grad_norm": 1.203125, "learning_rate": 2.0548709412708235e-06, "logits/chosen": 0.46100831031799316, "logits/rejected": 1.1205612421035767, "logps/chosen": -758.50244140625, "logps/rejected": -1687.9713134765625, "loss": 0.1154, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.229290962219238, "rewards/margins": 9.612086296081543, "rewards/margins_max": 13.77092456817627, "rewards/margins_min": 5.453249931335449, "rewards/margins_std": 5.881483554840088, "rewards/rejected": -14.841377258300781, "step": 1460 }, { "epoch": 0.61, "grad_norm": 0.828125, "learning_rate": 2.019547532945246e-06, "logits/chosen": 0.5935944318771362, "logits/rejected": 1.189452886581421, "logps/chosen": -698.9295654296875, "logps/rejected": -1723.960205078125, "loss": 0.0487, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.868830680847168, "rewards/margins": 10.070269584655762, "rewards/margins_max": 15.034326553344727, "rewards/margins_min": 5.1062116622924805, "rewards/margins_std": 7.020236015319824, "rewards/rejected": -14.939099311828613, "step": 1470 }, { "epoch": 0.61, "grad_norm": 2.25, "learning_rate": 1.9843234453281503e-06, "logits/chosen": 0.5408506989479065, "logits/rejected": 1.2704055309295654, "logps/chosen": -782.6818237304688, "logps/rejected": -1806.959228515625, "loss": 0.0791, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.255041122436523, "rewards/margins": 10.520251274108887, "rewards/margins_max": 15.160499572753906, "rewards/margins_min": 5.880003929138184, "rewards/margins_std": 6.562302589416504, "rewards/rejected": -15.775293350219727, "step": 1480 }, { "epoch": 0.61, "grad_norm": 3.515625, "learning_rate": 1.949205960058361e-06, "logits/chosen": 0.4531838297843933, "logits/rejected": 1.268123745918274, "logps/chosen": -836.1715698242188, "logps/rejected": -1688.537353515625, "loss": 0.1847, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.429810523986816, "rewards/margins": 9.195915222167969, "rewards/margins_max": 13.804384231567383, "rewards/margins_min": 4.587449073791504, "rewards/margins_std": 6.517356872558594, "rewards/rejected": -14.625727653503418, "step": 1490 }, { "epoch": 0.62, "grad_norm": 0.96875, "learning_rate": 1.914202336737517e-06, "logits/chosen": 0.4794815182685852, "logits/rejected": 1.1748192310333252, "logps/chosen": -741.9356079101562, "logps/rejected": -1867.9176025390625, "loss": 0.0846, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.017170429229736, "rewards/margins": 11.546915054321289, "rewards/margins_max": 17.515758514404297, "rewards/margins_min": 5.5780720710754395, "rewards/margins_std": 8.441219329833984, "rewards/rejected": -16.564085006713867, "step": 1500 }, { "epoch": 0.62, "grad_norm": 0.10595703125, "learning_rate": 1.8793198114293419e-06, "logits/chosen": 0.5758123993873596, "logits/rejected": 1.2776639461517334, "logps/chosen": -671.192138671875, "logps/rejected": -1982.5927734375, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": -4.604527473449707, "rewards/margins": 13.000114440917969, "rewards/margins_max": 19.173341751098633, "rewards/margins_min": 6.826885223388672, "rewards/margins_std": 8.730262756347656, "rewards/rejected": -17.604642868041992, "step": 1510 }, { "epoch": 0.63, "grad_norm": 0.87890625, "learning_rate": 1.8445655951637797e-06, "logits/chosen": 0.5493451952934265, "logits/rejected": 1.379970908164978, "logps/chosen": -748.9559326171875, "logps/rejected": -1775.750244140625, "loss": 0.0733, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9326372146606445, "rewards/margins": 10.712015151977539, "rewards/margins_max": 15.60670280456543, "rewards/margins_min": 5.817324638366699, "rewards/margins_std": 6.922135829925537, "rewards/rejected": -15.64465045928955, "step": 1520 }, { "epoch": 0.63, "grad_norm": 2.5625, "learning_rate": 1.809946872446312e-06, "logits/chosen": 0.5186041593551636, "logits/rejected": 1.1791023015975952, "logps/chosen": -725.4397583007812, "logps/rejected": -1589.250732421875, "loss": 0.0827, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.731976509094238, "rewards/margins": 9.246491432189941, "rewards/margins_max": 14.182947158813477, "rewards/margins_min": 4.310037136077881, "rewards/margins_std": 6.981202125549316, "rewards/rejected": -13.978469848632812, "step": 1530 }, { "epoch": 0.63, "grad_norm": 1.171875, "learning_rate": 1.7754707997727471e-06, "logits/chosen": 0.6401320695877075, "logits/rejected": 1.1825447082519531, "logps/chosen": -791.119140625, "logps/rejected": -1879.62109375, "loss": 0.0851, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.561471939086914, "rewards/margins": 10.975897789001465, "rewards/margins_max": 15.100332260131836, "rewards/margins_min": 6.85146427154541, "rewards/margins_std": 5.832830429077148, "rewards/rejected": -16.537368774414062, "step": 1540 }, { "epoch": 0.64, "grad_norm": 6.0625, "learning_rate": 1.7411445041498099e-06, "logits/chosen": 0.5857383012771606, "logits/rejected": 1.3303660154342651, "logps/chosen": -796.5535888671875, "logps/rejected": -2160.908203125, "loss": 0.1357, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.3245439529418945, "rewards/margins": 13.754674911499023, "rewards/margins_max": 19.97846221923828, "rewards/margins_min": 7.530887603759766, "rewards/margins_std": 8.801763534545898, "rewards/rejected": -19.079219818115234, "step": 1550 }, { "epoch": 0.64, "grad_norm": 1.0703125, "learning_rate": 1.7069750816218218e-06, "logits/chosen": 0.5591040849685669, "logits/rejected": 1.376008152961731, "logps/chosen": -757.9560546875, "logps/rejected": -1931.6070556640625, "loss": 0.0526, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.156350135803223, "rewards/margins": 12.076948165893555, "rewards/margins_max": 17.336977005004883, "rewards/margins_min": 6.816922664642334, "rewards/margins_std": 7.438802242279053, "rewards/rejected": -17.233299255371094, "step": 1560 }, { "epoch": 0.65, "grad_norm": 3.390625, "learning_rate": 1.6729695958037856e-06, "logits/chosen": 0.5422581434249878, "logits/rejected": 1.107097864151001, "logps/chosen": -806.7074584960938, "logps/rejected": -1815.927734375, "loss": 0.0766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.696779727935791, "rewards/margins": 10.256368637084961, "rewards/margins_max": 15.249975204467773, "rewards/margins_min": 5.262759685516357, "rewards/margins_std": 7.062028408050537, "rewards/rejected": -15.953149795532227, "step": 1570 }, { "epoch": 0.65, "grad_norm": 4.21875, "learning_rate": 1.6391350764211675e-06, "logits/chosen": 0.47015446424484253, "logits/rejected": 1.3002904653549194, "logps/chosen": -784.7755737304688, "logps/rejected": -1845.008544921875, "loss": 0.0581, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.293187618255615, "rewards/margins": 11.025456428527832, "rewards/margins_max": 15.80299186706543, "rewards/margins_min": 6.247918605804443, "rewards/margins_std": 6.756457328796387, "rewards/rejected": -16.31864356994629, "step": 1580 }, { "epoch": 0.65, "grad_norm": 5.65625, "learning_rate": 1.6054785178566944e-06, "logits/chosen": 0.39869189262390137, "logits/rejected": 1.1358020305633545, "logps/chosen": -790.1834106445312, "logps/rejected": -1959.004638671875, "loss": 0.1276, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.488138198852539, "rewards/margins": 12.004860877990723, "rewards/margins_max": 17.329792022705078, "rewards/margins_min": 6.679928779602051, "rewards/margins_std": 7.530592441558838, "rewards/rejected": -17.493000030517578, "step": 1590 }, { "epoch": 0.66, "grad_norm": 0.6328125, "learning_rate": 1.5720068777044479e-06, "logits/chosen": 0.5967472195625305, "logits/rejected": 1.3974864482879639, "logps/chosen": -806.0808715820312, "logps/rejected": -1895.9869384765625, "loss": 0.092, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.477103233337402, "rewards/margins": 11.376334190368652, "rewards/margins_max": 16.01239776611328, "rewards/margins_min": 6.74027156829834, "rewards/margins_std": 6.5563836097717285, "rewards/rejected": -16.853437423706055, "step": 1600 }, { "epoch": 0.66, "grad_norm": 3.671875, "learning_rate": 1.5387270753315726e-06, "logits/chosen": 0.5518096089363098, "logits/rejected": 1.32808256149292, "logps/chosen": -816.740234375, "logps/rejected": -2068.84326171875, "loss": 0.1744, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.405552387237549, "rewards/margins": 12.889608383178711, "rewards/margins_max": 20.060169219970703, "rewards/margins_min": 5.719046592712402, "rewards/margins_std": 10.140707015991211, "rewards/rejected": -18.295162200927734, "step": 1610 }, { "epoch": 0.67, "grad_norm": 0.7421875, "learning_rate": 1.5056459904478738e-06, "logits/chosen": 0.5233970880508423, "logits/rejected": 1.1991077661514282, "logps/chosen": -799.1451416015625, "logps/rejected": -1898.0875244140625, "loss": 0.1284, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.3765363693237305, "rewards/margins": 11.210358619689941, "rewards/margins_max": 15.882084846496582, "rewards/margins_min": 6.538631439208984, "rewards/margins_std": 6.606819152832031, "rewards/rejected": -16.586894989013672, "step": 1620 }, { "epoch": 0.67, "grad_norm": 7.25, "learning_rate": 1.4727704616836297e-06, "logits/chosen": 0.4744800925254822, "logits/rejected": 1.247642993927002, "logps/chosen": -778.9432373046875, "logps/rejected": -1884.1217041015625, "loss": 0.0875, "rewards/accuracies": 0.9375, "rewards/chosen": -5.104981422424316, "rewards/margins": 11.539787292480469, "rewards/margins_max": 16.65086555480957, "rewards/margins_min": 6.428709506988525, "rewards/margins_std": 7.228156089782715, "rewards/rejected": -16.6447696685791, "step": 1630 }, { "epoch": 0.68, "grad_norm": 2.90625, "learning_rate": 1.4401072851758835e-06, "logits/chosen": 0.5687705278396606, "logits/rejected": 1.1934126615524292, "logps/chosen": -706.02294921875, "logps/rejected": -1641.071044921875, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": -4.839352607727051, "rewards/margins": 9.687154769897461, "rewards/margins_max": 12.636642456054688, "rewards/margins_min": 6.737668514251709, "rewards/margins_std": 4.17120361328125, "rewards/rejected": -14.526507377624512, "step": 1640 }, { "epoch": 0.68, "grad_norm": 9.0625, "learning_rate": 1.4076632131635226e-06, "logits/chosen": 0.46886777877807617, "logits/rejected": 1.1962741613388062, "logps/chosen": -732.7435302734375, "logps/rejected": -1612.6419677734375, "loss": 0.1473, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.984394550323486, "rewards/margins": 9.234537124633789, "rewards/margins_max": 13.321691513061523, "rewards/margins_min": 5.147382736206055, "rewards/margins_std": 5.780109882354736, "rewards/rejected": -14.218931198120117, "step": 1650 }, { "epoch": 0.68, "grad_norm": 1.1328125, "learning_rate": 1.3754449525914359e-06, "logits/chosen": 0.5064732432365417, "logits/rejected": 1.1770398616790771, "logps/chosen": -800.9207153320312, "logps/rejected": -1703.955810546875, "loss": 0.0867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.16342306137085, "rewards/margins": 9.53877067565918, "rewards/margins_max": 14.161503791809082, "rewards/margins_min": 4.916037559509277, "rewards/margins_std": 6.537531852722168, "rewards/rejected": -14.702194213867188, "step": 1660 }, { "epoch": 0.69, "grad_norm": 11.9375, "learning_rate": 1.343459163724032e-06, "logits/chosen": 0.6023787260055542, "logits/rejected": 1.207897424697876, "logps/chosen": -743.8614501953125, "logps/rejected": -1714.038330078125, "loss": 0.1015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.851070404052734, "rewards/margins": 10.04643440246582, "rewards/margins_max": 14.574777603149414, "rewards/margins_min": 5.518091678619385, "rewards/margins_std": 6.404044151306152, "rewards/rejected": -14.897504806518555, "step": 1670 }, { "epoch": 0.69, "grad_norm": 3.3125, "learning_rate": 1.311712458768406e-06, "logits/chosen": 0.6761046648025513, "logits/rejected": 1.2278960943222046, "logps/chosen": -726.6144409179688, "logps/rejected": -1568.93359375, "loss": 0.1191, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09859561920166, "rewards/margins": 8.505921363830566, "rewards/margins_max": 12.416373252868652, "rewards/margins_min": 4.595466613769531, "rewards/margins_std": 5.530216217041016, "rewards/rejected": -13.604515075683594, "step": 1680 }, { "epoch": 0.7, "grad_norm": 0.41015625, "learning_rate": 1.280211400507444e-06, "logits/chosen": 0.6303955316543579, "logits/rejected": 1.32115638256073, "logps/chosen": -698.5577392578125, "logps/rejected": -1850.8424072265625, "loss": 0.0572, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.735129356384277, "rewards/margins": 11.382537841796875, "rewards/margins_max": 16.18451499938965, "rewards/margins_min": 6.580558776855469, "rewards/margins_std": 6.791023254394531, "rewards/rejected": -16.117666244506836, "step": 1690 }, { "epoch": 0.7, "grad_norm": 1.65625, "learning_rate": 1.2489625009431409e-06, "logits/chosen": 0.5856636762619019, "logits/rejected": 1.2052780389785767, "logps/chosen": -733.9873046875, "logps/rejected": -1687.567138671875, "loss": 0.1764, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.016100883483887, "rewards/margins": 9.677934646606445, "rewards/margins_max": 15.035099983215332, "rewards/margins_min": 4.320771217346191, "rewards/margins_std": 7.576174259185791, "rewards/rejected": -14.694036483764648, "step": 1700 }, { "epoch": 0.7, "grad_norm": 1.6640625, "learning_rate": 1.2179722199504213e-06, "logits/chosen": 0.5713605284690857, "logits/rejected": 1.207334280014038, "logps/chosen": -728.2240600585938, "logps/rejected": -1619.398681640625, "loss": 0.1203, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.969111919403076, "rewards/margins": 9.243115425109863, "rewards/margins_max": 13.461108207702637, "rewards/margins_min": 5.025121212005615, "rewards/margins_std": 5.96514368057251, "rewards/rejected": -14.212226867675781, "step": 1710 }, { "epoch": 0.71, "grad_norm": 3.328125, "learning_rate": 1.187246963941731e-06, "logits/chosen": 0.5765690803527832, "logits/rejected": 1.1067253351211548, "logps/chosen": -698.3675537109375, "logps/rejected": -1611.295654296875, "loss": 0.1114, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.707912445068359, "rewards/margins": 9.343481063842773, "rewards/margins_max": 13.480981826782227, "rewards/margins_min": 5.205979347229004, "rewards/margins_std": 5.851310729980469, "rewards/rejected": -14.05139446258545, "step": 1720 }, { "epoch": 0.71, "grad_norm": 1.2265625, "learning_rate": 1.1567930845426802e-06, "logits/chosen": 0.41190090775489807, "logits/rejected": 1.0678179264068604, "logps/chosen": -716.1203002929688, "logps/rejected": -1831.5726318359375, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": -4.89259147644043, "rewards/margins": 11.263982772827148, "rewards/margins_max": 16.847904205322266, "rewards/margins_min": 5.680062294006348, "rewards/margins_std": 7.89685583114624, "rewards/rejected": -16.156574249267578, "step": 1730 }, { "epoch": 0.72, "grad_norm": 1.640625, "learning_rate": 1.1266168772790195e-06, "logits/chosen": 0.3195948004722595, "logits/rejected": 1.1387958526611328, "logps/chosen": -776.84228515625, "logps/rejected": -1585.8587646484375, "loss": 0.1529, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.140332221984863, "rewards/margins": 8.726045608520508, "rewards/margins_max": 12.733012199401855, "rewards/margins_min": 4.719078063964844, "rewards/margins_std": 5.666707515716553, "rewards/rejected": -13.866376876831055, "step": 1740 }, { "epoch": 0.72, "grad_norm": 0.3984375, "learning_rate": 1.0967245802752044e-06, "logits/chosen": 0.5815094113349915, "logits/rejected": 1.331162691116333, "logps/chosen": -753.7833862304688, "logps/rejected": -1863.087646484375, "loss": 0.0754, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.917272090911865, "rewards/margins": 11.66606330871582, "rewards/margins_max": 16.2323055267334, "rewards/margins_min": 7.099822044372559, "rewards/margins_std": 6.457640171051025, "rewards/rejected": -16.583335876464844, "step": 1750 }, { "epoch": 0.72, "grad_norm": 1.9921875, "learning_rate": 1.0671223729648338e-06, "logits/chosen": 0.5788689851760864, "logits/rejected": 1.1679919958114624, "logps/chosen": -738.84423828125, "logps/rejected": -1693.3870849609375, "loss": 0.1364, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.073354244232178, "rewards/margins": 9.768596649169922, "rewards/margins_max": 14.380800247192383, "rewards/margins_min": 5.156393051147461, "rewards/margins_std": 6.522641658782959, "rewards/rejected": -14.841951370239258, "step": 1760 }, { "epoch": 0.73, "grad_norm": 1.7421875, "learning_rate": 1.0378163748132102e-06, "logits/chosen": 0.49502748250961304, "logits/rejected": 1.2685495615005493, "logps/chosen": -712.3984375, "logps/rejected": -1658.44921875, "loss": 0.0608, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.415982246398926, "rewards/margins": 9.93490219116211, "rewards/margins_max": 14.178258895874023, "rewards/margins_min": 5.691543102264404, "rewards/margins_std": 6.001015663146973, "rewards/rejected": -14.350883483886719, "step": 1770 }, { "epoch": 0.73, "grad_norm": 6.125, "learning_rate": 1.008812644052311e-06, "logits/chosen": 0.4484991431236267, "logits/rejected": 1.1256628036499023, "logps/chosen": -690.5672607421875, "logps/rejected": -1668.564697265625, "loss": 0.0796, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.382365703582764, "rewards/margins": 10.01590347290039, "rewards/margins_max": 13.478933334350586, "rewards/margins_min": 6.552873134613037, "rewards/margins_std": 4.897465229034424, "rewards/rejected": -14.398269653320312, "step": 1780 }, { "epoch": 0.74, "grad_norm": 0.796875, "learning_rate": 9.801171764284072e-07, "logits/chosen": 0.5813416838645935, "logits/rejected": 1.228780746459961, "logps/chosen": -712.302734375, "logps/rejected": -1828.1448974609375, "loss": 0.0642, "rewards/accuracies": 1.0, "rewards/chosen": -4.676440238952637, "rewards/margins": 11.309109687805176, "rewards/margins_max": 16.14137840270996, "rewards/margins_min": 6.4768385887146, "rewards/margins_std": 6.833861351013184, "rewards/rejected": -15.985549926757812, "step": 1790 }, { "epoch": 0.74, "grad_norm": 1.4453125, "learning_rate": 9.517359039626043e-07, "logits/chosen": 0.5194617509841919, "logits/rejected": 1.1831514835357666, "logps/chosen": -732.8680419921875, "logps/rejected": -1726.790283203125, "loss": 0.0712, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.601668357849121, "rewards/margins": 10.169393539428711, "rewards/margins_max": 15.003515243530273, "rewards/margins_min": 5.335273742675781, "rewards/margins_std": 6.836478233337402, "rewards/rejected": -14.771062850952148, "step": 1800 }, { "epoch": 0.75, "grad_norm": 0.83203125, "learning_rate": 9.23674693724555e-07, "logits/chosen": 0.2990169823169708, "logits/rejected": 0.9671838879585266, "logps/chosen": -760.0450439453125, "logps/rejected": -1871.8310546875, "loss": 0.0583, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.976813316345215, "rewards/margins": 11.278745651245117, "rewards/margins_max": 14.921788215637207, "rewards/margins_min": 7.63570499420166, "rewards/margins_std": 5.152037620544434, "rewards/rejected": -16.255558013916016, "step": 1810 }, { "epoch": 0.75, "grad_norm": 1.6328125, "learning_rate": 8.959393466195973e-07, "logits/chosen": 0.41968780755996704, "logits/rejected": 1.290880799293518, "logps/chosen": -761.1870727539062, "logps/rejected": -1643.1441650390625, "loss": 0.0557, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9296698570251465, "rewards/margins": 9.346491813659668, "rewards/margins_max": 12.647331237792969, "rewards/margins_min": 6.045652866363525, "rewards/margins_std": 4.6680908203125, "rewards/rejected": -14.276163101196289, "step": 1820 }, { "epoch": 0.75, "grad_norm": 0.86328125, "learning_rate": 8.685355961895783e-07, "logits/chosen": 0.687114417552948, "logits/rejected": 1.4132459163665771, "logps/chosen": -767.54248046875, "logps/rejected": -1862.729248046875, "loss": 0.0503, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.025767803192139, "rewards/margins": 11.429033279418945, "rewards/margins_max": 17.451461791992188, "rewards/margins_min": 5.406604290008545, "rewards/margins_std": 8.517000198364258, "rewards/rejected": -16.45479965209961, "step": 1830 }, { "epoch": 0.76, "grad_norm": 0.73046875, "learning_rate": 8.414691074275916e-07, "logits/chosen": 0.4633597433567047, "logits/rejected": 1.248290777206421, "logps/chosen": -777.6952514648438, "logps/rejected": -1863.720458984375, "loss": 0.0822, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.001014232635498, "rewards/margins": 11.318872451782227, "rewards/margins_max": 15.89136028289795, "rewards/margins_min": 6.7463860511779785, "rewards/margins_std": 6.466473579406738, "rewards/rejected": -16.319889068603516, "step": 1840 }, { "epoch": 0.76, "grad_norm": 0.9453125, "learning_rate": 8.147454756068937e-07, "logits/chosen": 0.5497418642044067, "logits/rejected": 1.2043471336364746, "logps/chosen": -709.6234130859375, "logps/rejected": -1719.3043212890625, "loss": 0.0768, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.652140140533447, "rewards/margins": 10.480083465576172, "rewards/margins_max": 15.241083145141602, "rewards/margins_min": 5.719081401824951, "rewards/margins_std": 6.733071804046631, "rewards/rejected": -15.132222175598145, "step": 1850 }, { "epoch": 0.77, "grad_norm": 1.1640625, "learning_rate": 7.883702251242298e-07, "logits/chosen": 0.45454102754592896, "logits/rejected": 1.1140748262405396, "logps/chosen": -678.3165283203125, "logps/rejected": -1609.6807861328125, "loss": 0.1038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.47580623626709, "rewards/margins": 9.60850715637207, "rewards/margins_max": 13.529436111450195, "rewards/margins_min": 5.6875810623168945, "rewards/margins_std": 5.545028209686279, "rewards/rejected": -14.084314346313477, "step": 1860 }, { "epoch": 0.77, "grad_norm": 0.34765625, "learning_rate": 7.623488083578148e-07, "logits/chosen": 0.48715901374816895, "logits/rejected": 1.142924189567566, "logps/chosen": -676.9874267578125, "logps/rejected": -1663.1302490234375, "loss": 0.1045, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.443617820739746, "rewards/margins": 10.04162311553955, "rewards/margins_max": 15.552085876464844, "rewards/margins_min": 4.531158447265625, "rewards/margins_std": 7.792973518371582, "rewards/rejected": -14.485241889953613, "step": 1870 }, { "epoch": 0.77, "grad_norm": 1.03125, "learning_rate": 7.366866045401968e-07, "logits/chosen": 0.5052765607833862, "logits/rejected": 1.288438081741333, "logps/chosen": -724.85302734375, "logps/rejected": -1638.660400390625, "loss": 0.085, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.777608394622803, "rewards/margins": 9.423945426940918, "rewards/margins_max": 13.84093952178955, "rewards/margins_min": 5.006953239440918, "rewards/margins_std": 6.246571063995361, "rewards/rejected": -14.201555252075195, "step": 1880 }, { "epoch": 0.78, "grad_norm": 3.140625, "learning_rate": 7.113889186462477e-07, "logits/chosen": 0.6119362115859985, "logits/rejected": 1.1571754217147827, "logps/chosen": -736.3836669921875, "logps/rejected": -1706.408447265625, "loss": 0.079, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75775146484375, "rewards/margins": 10.034549713134766, "rewards/margins_max": 14.87476921081543, "rewards/margins_min": 5.194329738616943, "rewards/margins_std": 6.845104217529297, "rewards/rejected": -14.7923002243042, "step": 1890 }, { "epoch": 0.78, "grad_norm": 0.6328125, "learning_rate": 6.864609802964978e-07, "logits/chosen": 0.5309674143791199, "logits/rejected": 1.2003862857818604, "logps/chosen": -700.8447265625, "logps/rejected": -1731.847412109375, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": -4.624502658843994, "rewards/margins": 10.476162910461426, "rewards/margins_max": 15.123468399047852, "rewards/margins_min": 5.828855991363525, "rewards/margins_std": 6.572283744812012, "rewards/rejected": -15.100665092468262, "step": 1900 }, { "epoch": 0.79, "grad_norm": 6.1875, "learning_rate": 6.619079426760545e-07, "logits/chosen": 0.49570074677467346, "logits/rejected": 1.1981004476547241, "logps/chosen": -769.2633056640625, "logps/rejected": -1941.0648193359375, "loss": 0.0931, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.233429908752441, "rewards/margins": 12.01481819152832, "rewards/margins_max": 17.587182998657227, "rewards/margins_min": 6.4424543380737305, "rewards/margins_std": 7.8805131912231445, "rewards/rejected": -17.248249053955078, "step": 1910 }, { "epoch": 0.79, "grad_norm": 10.3125, "learning_rate": 6.377348814693174e-07, "logits/chosen": 0.5919948220252991, "logits/rejected": 1.398564338684082, "logps/chosen": -762.436279296875, "logps/rejected": -1769.9222412109375, "loss": 0.113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.956363201141357, "rewards/margins": 10.631746292114258, "rewards/margins_max": 15.959482192993164, "rewards/margins_min": 5.304008483886719, "rewards/margins_std": 7.5345563888549805, "rewards/rejected": -15.588109016418457, "step": 1920 }, { "epoch": 0.79, "grad_norm": 0.2138671875, "learning_rate": 6.139467938107169e-07, "logits/chosen": 0.38951975107192993, "logits/rejected": 1.1649284362792969, "logps/chosen": -778.3822021484375, "logps/rejected": -2023.4833984375, "loss": 0.0779, "rewards/accuracies": 1.0, "rewards/chosen": -5.072902679443359, "rewards/margins": 12.735228538513184, "rewards/margins_max": 17.881254196166992, "rewards/margins_min": 7.589202880859375, "rewards/margins_std": 7.277578830718994, "rewards/rejected": -17.80813217163086, "step": 1930 }, { "epoch": 0.8, "grad_norm": 0.625, "learning_rate": 5.905485972516903e-07, "logits/chosen": 0.5617870092391968, "logits/rejected": 1.2924219369888306, "logps/chosen": -818.1054077148438, "logps/rejected": -1890.950927734375, "loss": 0.13, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.535712718963623, "rewards/margins": 11.305818557739258, "rewards/margins_max": 17.704103469848633, "rewards/margins_min": 4.907529830932617, "rewards/margins_std": 9.048542976379395, "rewards/rejected": -16.841529846191406, "step": 1940 }, { "epoch": 0.8, "grad_norm": 1.03125, "learning_rate": 5.675451287441072e-07, "logits/chosen": 0.7306760549545288, "logits/rejected": 1.395262360572815, "logps/chosen": -816.7340087890625, "logps/rejected": -1670.490234375, "loss": 0.1545, "rewards/accuracies": 0.9375, "rewards/chosen": -5.269486427307129, "rewards/margins": 9.290313720703125, "rewards/margins_max": 13.515324592590332, "rewards/margins_min": 5.065301895141602, "rewards/margins_std": 5.975068092346191, "rewards/rejected": -14.55980110168457, "step": 1950 }, { "epoch": 0.81, "grad_norm": 5.28125, "learning_rate": 5.449411436403632e-07, "logits/chosen": 0.7268288135528564, "logits/rejected": 1.3329485654830933, "logps/chosen": -696.4193115234375, "logps/rejected": -1806.069091796875, "loss": 0.0933, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.67076301574707, "rewards/margins": 11.251920700073242, "rewards/margins_max": 16.409025192260742, "rewards/margins_min": 6.094817638397217, "rewards/margins_std": 7.2932448387146, "rewards/rejected": -15.92268180847168, "step": 1960 }, { "epoch": 0.81, "grad_norm": 4.28125, "learning_rate": 5.227413147103336e-07, "logits/chosen": 0.5869401693344116, "logits/rejected": 1.2344766855239868, "logps/chosen": -729.2957153320312, "logps/rejected": -1567.1939697265625, "loss": 0.1098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.183860778808594, "rewards/margins": 8.676237106323242, "rewards/margins_max": 12.739700317382812, "rewards/margins_min": 4.6127729415893555, "rewards/margins_std": 5.746604919433594, "rewards/rejected": -13.86009693145752, "step": 1970 }, { "epoch": 0.82, "grad_norm": 2.46875, "learning_rate": 5.009502311754081e-07, "logits/chosen": 0.5038915872573853, "logits/rejected": 1.1727396249771118, "logps/chosen": -724.4193725585938, "logps/rejected": -1736.091064453125, "loss": 0.146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8910675048828125, "rewards/margins": 10.278018951416016, "rewards/margins_max": 15.358955383300781, "rewards/margins_min": 5.197081565856934, "rewards/margins_std": 7.1855292320251465, "rewards/rejected": -15.169085502624512, "step": 1980 }, { "epoch": 0.82, "grad_norm": 0.703125, "learning_rate": 4.795723977597844e-07, "logits/chosen": 0.5357404947280884, "logits/rejected": 1.154956579208374, "logps/chosen": -719.9663696289062, "logps/rejected": -1709.3209228515625, "loss": 0.0843, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.783188819885254, "rewards/margins": 10.301424026489258, "rewards/margins_max": 14.93207836151123, "rewards/margins_min": 5.670768737792969, "rewards/margins_std": 6.54873514175415, "rewards/rejected": -15.084611892700195, "step": 1990 }, { "epoch": 0.82, "grad_norm": 1.28125, "learning_rate": 4.586122337592444e-07, "logits/chosen": 0.48415178060531616, "logits/rejected": 1.2925946712493896, "logps/chosen": -734.2363891601562, "logps/rejected": -1889.037353515625, "loss": 0.0469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.827243804931641, "rewards/margins": 11.82060432434082, "rewards/margins_max": 17.99706268310547, "rewards/margins_min": 5.644143104553223, "rewards/margins_std": 8.734832763671875, "rewards/rejected": -16.647846221923828, "step": 2000 }, { "epoch": 0.83, "grad_norm": 2.8125, "learning_rate": 4.380740721275786e-07, "logits/chosen": 0.6227355003356934, "logits/rejected": 1.2969437837600708, "logps/chosen": -780.354736328125, "logps/rejected": -1921.7880859375, "loss": 0.0832, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.117171287536621, "rewards/margins": 11.844526290893555, "rewards/margins_max": 17.806795120239258, "rewards/margins_min": 5.882256031036377, "rewards/margins_std": 8.43192195892334, "rewards/rejected": -16.96169662475586, "step": 2010 }, { "epoch": 0.83, "grad_norm": 0.197265625, "learning_rate": 4.1796215858086577e-07, "logits/chosen": 0.6349445581436157, "logits/rejected": 1.3867194652557373, "logps/chosen": -799.1439208984375, "logps/rejected": -1852.981689453125, "loss": 0.1059, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.414491176605225, "rewards/margins": 11.105630874633789, "rewards/margins_max": 17.207439422607422, "rewards/margins_min": 5.003822326660156, "rewards/margins_std": 8.629260063171387, "rewards/rejected": -16.520122528076172, "step": 2020 }, { "epoch": 0.84, "grad_norm": 0.2412109375, "learning_rate": 3.982806507197831e-07, "logits/chosen": 0.6008701324462891, "logits/rejected": 1.230450987815857, "logps/chosen": -744.5529174804688, "logps/rejected": -1744.4771728515625, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": -4.9854936599731445, "rewards/margins": 9.673846244812012, "rewards/margins_max": 13.759689331054688, "rewards/margins_min": 5.5880022048950195, "rewards/margins_std": 5.778256416320801, "rewards/rejected": -14.659339904785156, "step": 2030 }, { "epoch": 0.84, "grad_norm": 3.765625, "learning_rate": 3.790336171701331e-07, "logits/chosen": 0.5796440839767456, "logits/rejected": 1.2289059162139893, "logps/chosen": -720.2902221679688, "logps/rejected": -1942.671142578125, "loss": 0.0773, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.671596050262451, "rewards/margins": 12.241273880004883, "rewards/margins_max": 16.92319107055664, "rewards/margins_min": 7.55935525894165, "rewards/margins_std": 6.621232509613037, "rewards/rejected": -16.912870407104492, "step": 2040 }, { "epoch": 0.84, "grad_norm": 1.515625, "learning_rate": 3.6022503674176537e-07, "logits/chosen": 0.5198200941085815, "logits/rejected": 1.3343006372451782, "logps/chosen": -796.9490966796875, "logps/rejected": -1900.691162109375, "loss": 0.09, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.069548606872559, "rewards/margins": 11.76386547088623, "rewards/margins_max": 17.033977508544922, "rewards/margins_min": 6.4937543869018555, "rewards/margins_std": 7.453061580657959, "rewards/rejected": -16.83341407775879, "step": 2050 }, { "epoch": 0.85, "grad_norm": 1.0625, "learning_rate": 3.4185879760606525e-07, "logits/chosen": 0.5187299847602844, "logits/rejected": 1.1834654808044434, "logps/chosen": -736.1161499023438, "logps/rejected": -1869.806640625, "loss": 0.0581, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.959606647491455, "rewards/margins": 11.514524459838867, "rewards/margins_max": 16.145587921142578, "rewards/margins_min": 6.883460998535156, "rewards/margins_std": 6.549312591552734, "rewards/rejected": -16.474130630493164, "step": 2060 }, { "epoch": 0.85, "grad_norm": 0.5859375, "learning_rate": 3.2393869649217454e-07, "logits/chosen": 0.5701602697372437, "logits/rejected": 1.3300843238830566, "logps/chosen": -761.6326904296875, "logps/rejected": -1840.357177734375, "loss": 0.0662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.093400478363037, "rewards/margins": 11.083673477172852, "rewards/margins_max": 15.876144409179688, "rewards/margins_min": 6.291202545166016, "rewards/margins_std": 6.777576446533203, "rewards/rejected": -16.177074432373047, "step": 2070 }, { "epoch": 0.86, "grad_norm": 2.5, "learning_rate": 3.064684379021207e-07, "logits/chosen": 0.43363428115844727, "logits/rejected": 1.0424432754516602, "logps/chosen": -684.9832763671875, "logps/rejected": -1844.010009765625, "loss": 0.054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.640398979187012, "rewards/margins": 11.688333511352539, "rewards/margins_max": 16.679744720458984, "rewards/margins_min": 6.696922302246094, "rewards/margins_std": 7.058920860290527, "rewards/rejected": -16.328731536865234, "step": 2080 }, { "epoch": 0.86, "grad_norm": 1.2421875, "learning_rate": 2.894516333450115e-07, "logits/chosen": 0.5114481449127197, "logits/rejected": 1.14482843875885, "logps/chosen": -735.6387939453125, "logps/rejected": -1779.406005859375, "loss": 0.1105, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.890014171600342, "rewards/margins": 10.568880081176758, "rewards/margins_max": 14.828028678894043, "rewards/margins_min": 6.309730529785156, "rewards/margins_std": 6.023346900939941, "rewards/rejected": -15.458892822265625, "step": 2090 }, { "epoch": 0.86, "grad_norm": 0.4921875, "learning_rate": 2.728918005904513e-07, "logits/chosen": 0.3923017084598541, "logits/rejected": 1.0707480907440186, "logps/chosen": -806.395751953125, "logps/rejected": -1856.805908203125, "loss": 0.1777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.439145565032959, "rewards/margins": 10.84535026550293, "rewards/margins_max": 16.374027252197266, "rewards/margins_min": 5.316674709320068, "rewards/margins_std": 7.818729400634766, "rewards/rejected": -16.284496307373047, "step": 2100 }, { "epoch": 0.87, "grad_norm": 0.87109375, "learning_rate": 2.5679236294133493e-07, "logits/chosen": 0.5716456174850464, "logits/rejected": 1.229247808456421, "logps/chosen": -732.1973876953125, "logps/rejected": -1738.814697265625, "loss": 0.0752, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.678011894226074, "rewards/margins": 10.548458099365234, "rewards/margins_max": 15.896784782409668, "rewards/margins_min": 5.200132369995117, "rewards/margins_std": 7.563673496246338, "rewards/rejected": -15.226470947265625, "step": 2110 }, { "epoch": 0.87, "grad_norm": 2.453125, "learning_rate": 2.4115664852617294e-07, "logits/chosen": 0.5404381155967712, "logits/rejected": 1.2678707838058472, "logps/chosen": -750.8515014648438, "logps/rejected": -1882.701416015625, "loss": 0.1361, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.122988700866699, "rewards/margins": 11.382095336914062, "rewards/margins_max": 17.519346237182617, "rewards/margins_min": 5.244842529296875, "rewards/margins_std": 8.679386138916016, "rewards/rejected": -16.505083084106445, "step": 2120 }, { "epoch": 0.88, "grad_norm": 0.458984375, "learning_rate": 2.2598788961108897e-07, "logits/chosen": 0.5512218475341797, "logits/rejected": 1.235686182975769, "logps/chosen": -703.8411865234375, "logps/rejected": -1646.0699462890625, "loss": 0.0847, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.887363910675049, "rewards/margins": 9.765475273132324, "rewards/margins_max": 14.917486190795898, "rewards/margins_min": 4.613465309143066, "rewards/margins_std": 7.286043643951416, "rewards/rejected": -14.652839660644531, "step": 2130 }, { "epoch": 0.88, "grad_norm": 1.4609375, "learning_rate": 2.1128922193163564e-07, "logits/chosen": 0.5618628263473511, "logits/rejected": 1.2874120473861694, "logps/chosen": -742.8013916015625, "logps/rejected": -1916.2711181640625, "loss": 0.1083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.009154796600342, "rewards/margins": 12.049230575561523, "rewards/margins_max": 17.573997497558594, "rewards/margins_min": 6.5244646072387695, "rewards/margins_std": 7.813199043273926, "rewards/rejected": -17.058387756347656, "step": 2140 }, { "epoch": 0.89, "grad_norm": 3.203125, "learning_rate": 1.9706368404456472e-07, "logits/chosen": 0.4528091549873352, "logits/rejected": 1.111604928970337, "logps/chosen": -745.5892333984375, "logps/rejected": -1809.7484130859375, "loss": 0.0911, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.078321933746338, "rewards/margins": 10.962045669555664, "rewards/margins_max": 15.926958084106445, "rewards/margins_min": 5.997132778167725, "rewards/margins_std": 7.02144718170166, "rewards/rejected": -16.040367126464844, "step": 2150 }, { "epoch": 0.89, "grad_norm": 3.75, "learning_rate": 1.8331421669968708e-07, "logits/chosen": 0.6266171336174011, "logits/rejected": 1.3515210151672363, "logps/chosen": -768.0899047851562, "logps/rejected": -1722.0845947265625, "loss": 0.1013, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.038366794586182, "rewards/margins": 9.89087200164795, "rewards/margins_max": 14.385602951049805, "rewards/margins_min": 5.396140098571777, "rewards/margins_std": 6.356511116027832, "rewards/rejected": -14.929239273071289, "step": 2160 }, { "epoch": 0.89, "grad_norm": 5.0, "learning_rate": 1.7004366223194984e-07, "logits/chosen": 0.5014376044273376, "logits/rejected": 1.2441834211349487, "logps/chosen": -747.4188232421875, "logps/rejected": -1764.168212890625, "loss": 0.1175, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.955763339996338, "rewards/margins": 10.59716510772705, "rewards/margins_max": 15.370004653930664, "rewards/margins_min": 5.8243255615234375, "rewards/margins_std": 6.749813079833984, "rewards/rejected": -15.552927017211914, "step": 2170 }, { "epoch": 0.9, "grad_norm": 1.21875, "learning_rate": 1.5725476397386197e-07, "logits/chosen": 0.3932679295539856, "logits/rejected": 1.2315890789031982, "logps/chosen": -707.3714599609375, "logps/rejected": -1953.8558349609375, "loss": 0.0576, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.406428337097168, "rewards/margins": 12.903097152709961, "rewards/margins_max": 18.890384674072266, "rewards/margins_min": 6.915809631347656, "rewards/margins_std": 8.467303276062012, "rewards/rejected": -17.309528350830078, "step": 2180 }, { "epoch": 0.9, "grad_norm": 1.0625, "learning_rate": 1.4495016568838198e-07, "logits/chosen": 0.44051748514175415, "logits/rejected": 1.1644173860549927, "logps/chosen": -771.8656616210938, "logps/rejected": -1771.4254150390625, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.945860385894775, "rewards/margins": 10.547651290893555, "rewards/margins_max": 15.509611129760742, "rewards/margins_min": 5.585693359375, "rewards/margins_std": 7.017270565032959, "rewards/rejected": -15.493513107299805, "step": 2190 }, { "epoch": 0.91, "grad_norm": 0.90625, "learning_rate": 1.3313241102239056e-07, "logits/chosen": 0.6278412342071533, "logits/rejected": 1.4244401454925537, "logps/chosen": -682.3499755859375, "logps/rejected": -1634.0439453125, "loss": 0.085, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.619953155517578, "rewards/margins": 9.823567390441895, "rewards/margins_max": 13.926409721374512, "rewards/margins_min": 5.720723628997803, "rewards/margins_std": 5.802296161651611, "rewards/rejected": -14.443519592285156, "step": 2200 }, { "epoch": 0.91, "grad_norm": 1.078125, "learning_rate": 1.2180394298086095e-07, "logits/chosen": 0.5217547416687012, "logits/rejected": 1.2030553817749023, "logps/chosen": -736.8463745117188, "logps/rejected": -1752.949951171875, "loss": 0.0513, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.804549217224121, "rewards/margins": 10.467925071716309, "rewards/margins_max": 14.837709426879883, "rewards/margins_min": 6.098140716552734, "rewards/margins_std": 6.179808139801025, "rewards/rejected": -15.272473335266113, "step": 2210 }, { "epoch": 0.91, "grad_norm": 1.328125, "learning_rate": 1.1096710342183042e-07, "logits/chosen": 0.4959636628627777, "logits/rejected": 1.1241891384124756, "logps/chosen": -719.9772338867188, "logps/rejected": -1887.967529296875, "loss": 0.0614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.843437671661377, "rewards/margins": 12.002474784851074, "rewards/margins_max": 17.96475601196289, "rewards/margins_min": 6.040192604064941, "rewards/margins_std": 8.431940078735352, "rewards/rejected": -16.84591293334961, "step": 2220 }, { "epoch": 0.92, "grad_norm": 3.421875, "learning_rate": 1.0062413257228676e-07, "logits/chosen": 0.5790583491325378, "logits/rejected": 1.3127405643463135, "logps/chosen": -810.7774658203125, "logps/rejected": -1996.8209228515625, "loss": 0.0655, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.4459967613220215, "rewards/margins": 12.252093315124512, "rewards/margins_max": 17.894405364990234, "rewards/margins_min": 6.6097846031188965, "rewards/margins_std": 7.979430198669434, "rewards/rejected": -17.698089599609375, "step": 2230 }, { "epoch": 0.92, "grad_norm": 6.78125, "learning_rate": 9.077716856505825e-08, "logits/chosen": 0.5055748224258423, "logits/rejected": 1.3198411464691162, "logps/chosen": -762.1981201171875, "logps/rejected": -1757.9710693359375, "loss": 0.1311, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.086750030517578, "rewards/margins": 10.44882583618164, "rewards/margins_max": 14.819003105163574, "rewards/margins_min": 6.078649044036865, "rewards/margins_std": 6.180363178253174, "rewards/rejected": -15.535575866699219, "step": 2240 }, { "epoch": 0.93, "grad_norm": 2.671875, "learning_rate": 8.142824699681501e-08, "logits/chosen": 0.5170903205871582, "logits/rejected": 1.170555830001831, "logps/chosen": -716.73583984375, "logps/rejected": -1645.933349609375, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": -4.820572376251221, "rewards/margins": 9.699037551879883, "rewards/margins_max": 15.123661994934082, "rewards/margins_min": 4.274412155151367, "rewards/margins_std": 7.671577453613281, "rewards/rejected": -14.519609451293945, "step": 2250 }, { "epoch": 0.93, "grad_norm": 1.5859375, "learning_rate": 7.257930050726003e-08, "logits/chosen": 0.5653474926948547, "logits/rejected": 1.3792940378189087, "logps/chosen": -771.275634765625, "logps/rejected": -1766.762939453125, "loss": 0.095, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0928826332092285, "rewards/margins": 10.51708698272705, "rewards/margins_max": 15.350242614746094, "rewards/margins_min": 5.683931827545166, "rewards/margins_std": 6.835114479064941, "rewards/rejected": -15.609970092773438, "step": 2260 }, { "epoch": 0.93, "grad_norm": 0.78125, "learning_rate": 6.423215837961045e-08, "logits/chosen": 0.5271497368812561, "logits/rejected": 1.3941797018051147, "logps/chosen": -722.5052490234375, "logps/rejected": -1865.8720703125, "loss": 0.0869, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.881400108337402, "rewards/margins": 11.674361228942871, "rewards/margins_max": 16.58696746826172, "rewards/margins_min": 6.76175594329834, "rewards/margins_std": 6.947473049163818, "rewards/rejected": -16.555761337280273, "step": 2270 }, { "epoch": 0.94, "grad_norm": 4.6875, "learning_rate": 5.6388546162442215e-08, "logits/chosen": 0.6665564775466919, "logits/rejected": 1.2474958896636963, "logps/chosen": -746.4205322265625, "logps/rejected": -1758.126708984375, "loss": 0.111, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9972076416015625, "rewards/margins": 10.384644508361816, "rewards/margins_max": 15.061482429504395, "rewards/margins_min": 5.707806587219238, "rewards/margins_std": 6.614047050476074, "rewards/rejected": -15.381853103637695, "step": 2280 }, { "epoch": 0.94, "grad_norm": 2.453125, "learning_rate": 4.905008531297661e-08, "logits/chosen": 0.43994975090026855, "logits/rejected": 1.1360465288162231, "logps/chosen": -817.5567016601562, "logps/rejected": -1885.947998046875, "loss": 0.1015, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.406924247741699, "rewards/margins": 11.238929748535156, "rewards/margins_max": 16.946365356445312, "rewards/margins_min": 5.531497955322266, "rewards/margins_std": 8.071528434753418, "rewards/rejected": -16.645854949951172, "step": 2290 }, { "epoch": 0.95, "grad_norm": 2.9375, "learning_rate": 4.2218292861889444e-08, "logits/chosen": 0.5859326124191284, "logits/rejected": 1.2896353006362915, "logps/chosen": -773.9613037109375, "logps/rejected": -1735.0863037109375, "loss": 0.0918, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.150477409362793, "rewards/margins": 10.18777084350586, "rewards/margins_max": 15.025718688964844, "rewards/margins_min": 5.34982442855835, "rewards/margins_std": 6.84188985824585, "rewards/rejected": -15.338247299194336, "step": 2300 }, { "epoch": 0.95, "grad_norm": 0.4765625, "learning_rate": 3.589458109970467e-08, "logits/chosen": 0.5294678807258606, "logits/rejected": 1.2444860935211182, "logps/chosen": -743.5758666992188, "logps/rejected": -1683.407958984375, "loss": 0.1648, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.184317588806152, "rewards/margins": 9.598274230957031, "rewards/margins_max": 14.4242582321167, "rewards/margins_min": 4.7722883224487305, "rewards/margins_std": 6.824974060058594, "rewards/rejected": -14.78258991241455, "step": 2310 }, { "epoch": 0.96, "grad_norm": 2.265625, "learning_rate": 3.008025728484132e-08, "logits/chosen": 0.5059491991996765, "logits/rejected": 1.3568942546844482, "logps/chosen": -737.1229858398438, "logps/rejected": -1987.5445556640625, "loss": 0.091, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.759067535400391, "rewards/margins": 12.818672180175781, "rewards/margins_max": 18.131336212158203, "rewards/margins_min": 7.506007194519043, "rewards/margins_std": 7.513242244720459, "rewards/rejected": -17.577739715576172, "step": 2320 }, { "epoch": 0.96, "grad_norm": 0.41015625, "learning_rate": 2.4776523373372385e-08, "logits/chosen": 0.5932313799858093, "logits/rejected": 1.2811510562896729, "logps/chosen": -719.3793334960938, "logps/rejected": -1677.4947509765625, "loss": 0.075, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.827818870544434, "rewards/margins": 9.848733901977539, "rewards/margins_max": 14.251507759094238, "rewards/margins_min": 5.44596004486084, "rewards/margins_std": 6.226462364196777, "rewards/rejected": -14.676549911499023, "step": 2330 }, { "epoch": 0.96, "grad_norm": 2.953125, "learning_rate": 1.998447577055307e-08, "logits/chosen": 0.531481146812439, "logits/rejected": 1.223459005355835, "logps/chosen": -786.4287109375, "logps/rejected": -1908.134765625, "loss": 0.1179, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.9455766677856445, "rewards/margins": 11.500458717346191, "rewards/margins_max": 16.30417823791504, "rewards/margins_min": 6.696742057800293, "rewards/margins_std": 6.793482303619385, "rewards/rejected": -16.44603729248047, "step": 2340 }, { "epoch": 0.97, "grad_norm": 1.765625, "learning_rate": 1.5705105104167617e-08, "logits/chosen": 0.446284681558609, "logits/rejected": 1.1109905242919922, "logps/chosen": -792.7479858398438, "logps/rejected": -1819.526611328125, "loss": 0.0377, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.261641502380371, "rewards/margins": 10.497610092163086, "rewards/margins_max": 15.17370891571045, "rewards/margins_min": 5.821512699127197, "rewards/margins_std": 6.613001346588135, "rewards/rejected": -15.759251594543457, "step": 2350 }, { "epoch": 0.97, "grad_norm": 0.7265625, "learning_rate": 1.1939296019744529e-08, "logits/chosen": 0.5473194122314453, "logits/rejected": 1.104913353919983, "logps/chosen": -683.9682006835938, "logps/rejected": -1851.5924072265625, "loss": 0.076, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.581644058227539, "rewards/margins": 11.716299057006836, "rewards/margins_max": 16.266063690185547, "rewards/margins_min": 7.166537284851074, "rewards/margins_std": 6.434335231781006, "rewards/rejected": -16.29794692993164, "step": 2360 }, { "epoch": 0.98, "grad_norm": 0.345703125, "learning_rate": 8.687826997678116e-09, "logits/chosen": 0.5780460834503174, "logits/rejected": 1.2975406646728516, "logps/chosen": -729.7520751953125, "logps/rejected": -1835.2408447265625, "loss": 0.0673, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.86895751953125, "rewards/margins": 11.184858322143555, "rewards/margins_max": 15.93799114227295, "rewards/margins_min": 6.431723117828369, "rewards/margins_std": 6.721946716308594, "rewards/rejected": -16.053813934326172, "step": 2370 }, { "epoch": 0.98, "grad_norm": 0.6484375, "learning_rate": 5.951370192300576e-09, "logits/chosen": 0.5345112085342407, "logits/rejected": 1.213844656944275, "logps/chosen": -702.4107055664062, "logps/rejected": -1682.540283203125, "loss": 0.0844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.7045135498046875, "rewards/margins": 10.013148307800293, "rewards/margins_max": 14.771675109863281, "rewards/margins_min": 5.254621982574463, "rewards/margins_std": 6.729572296142578, "rewards/rejected": -14.71766185760498, "step": 2380 }, { "epoch": 0.98, "grad_norm": 0.57421875, "learning_rate": 3.730491292930072e-09, "logits/chosen": 0.5803747773170471, "logits/rejected": 1.2554329633712769, "logps/chosen": -735.1619262695312, "logps/rejected": -1728.2470703125, "loss": 0.065, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.109474182128906, "rewards/margins": 10.042867660522461, "rewards/margins_max": 15.090121269226074, "rewards/margins_min": 4.995615482330322, "rewards/margins_std": 7.1378936767578125, "rewards/rejected": -15.152341842651367, "step": 2390 }, { "epoch": 0.99, "grad_norm": 2.140625, "learning_rate": 2.0256494069306744e-09, "logits/chosen": 0.5878351926803589, "logits/rejected": 1.3185656070709229, "logps/chosen": -693.908203125, "logps/rejected": -1897.8695068359375, "loss": 0.1782, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.697625160217285, "rewards/margins": 12.107023239135742, "rewards/margins_max": 17.24478530883789, "rewards/margins_min": 6.969258785247803, "rewards/margins_std": 7.265894889831543, "rewards/rejected": -16.80464744567871, "step": 2400 }, { "epoch": 0.99, "grad_norm": 1.828125, "learning_rate": 8.371969648043876e-10, "logits/chosen": 0.6715101003646851, "logits/rejected": 1.3766255378723145, "logps/chosen": -733.6317138671875, "logps/rejected": -1728.0810546875, "loss": 0.1379, "rewards/accuracies": 0.9375, "rewards/chosen": -4.896046161651611, "rewards/margins": 10.241785049438477, "rewards/margins_max": 14.780932426452637, "rewards/margins_min": 5.702638626098633, "rewards/margins_std": 6.419322967529297, "rewards/rejected": -15.13783073425293, "step": 2410 }, { "epoch": 1.0, "grad_norm": 0.59765625, "learning_rate": 1.653796473341518e-10, "logits/chosen": 0.4578518271446228, "logits/rejected": 1.291621446609497, "logps/chosen": -713.6257934570312, "logps/rejected": -1659.1171875, "loss": 0.0983, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.576380729675293, "rewards/margins": 9.997756958007812, "rewards/margins_max": 14.97430419921875, "rewards/margins_min": 5.021212577819824, "rewards/margins_std": 7.037899017333984, "rewards/rejected": -14.574139595031738, "step": 2420 }, { "epoch": 1.0, "eval_logits/chosen": 1.1567333936691284, "eval_logits/rejected": 1.3502662181854248, "eval_logps/chosen": -815.366455078125, "eval_logps/rejected": -885.2385864257812, "eval_loss": 0.8997361063957214, "eval_rewards/accuracies": 0.5877500176429749, "eval_rewards/chosen": -4.770576000213623, "eval_rewards/margins": 0.8909361362457275, "eval_rewards/margins_max": 6.031704425811768, "eval_rewards/margins_min": -2.9257826805114746, "eval_rewards/margins_std": 2.9022324085235596, "eval_rewards/rejected": -5.66151237487793, "eval_runtime": 1670.0359, "eval_samples_per_second": 4.79, "eval_steps_per_second": 0.299, "step": 2428 }, { "epoch": 1.0, "step": 2428, "total_flos": 0.0, "train_loss": 0.19075011757787017, "train_runtime": 22524.5017, "train_samples_per_second": 1.725, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 2428, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }