diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4423 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2428, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 2.0576131687242803e-08, + "logits/chosen": 0.24564924836158752, + "logits/rejected": 1.0062695741653442, + "logps/chosen": -229.83255004882812, + "logps/rejected": -164.65399169921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.384765625, + "learning_rate": 2.05761316872428e-07, + "logits/chosen": -0.0490909218788147, + "logits/rejected": 0.6121826171875, + "logps/chosen": -238.83880615234375, + "logps/rejected": -207.5596923828125, + "loss": 0.6931, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -0.00032901231315918267, + "rewards/margins": 0.0006913852412253618, + "rewards/margins_max": 0.002890574047341943, + "rewards/margins_min": -0.0015078035648912191, + "rewards/margins_std": 0.0031101228669285774, + "rewards/rejected": -0.001020397525280714, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.443359375, + "learning_rate": 4.11522633744856e-07, + "logits/chosen": 0.05002685636281967, + "logits/rejected": 0.6022137403488159, + "logps/chosen": -255.0900115966797, + "logps/rejected": -220.280517578125, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00017908953304868191, + "rewards/margins": 0.0004872865101788193, + "rewards/margins_max": 0.0039043165743350983, + "rewards/margins_min": -0.0029297438450157642, + "rewards/margins_std": 0.004832410719245672, + "rewards/rejected": -0.0003081969916820526, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.36328125, + "learning_rate": 6.17283950617284e-07, + "logits/chosen": 0.07209397852420807, + "logits/rejected": 0.5803325176239014, + "logps/chosen": -241.93930053710938, + "logps/rejected": -229.0738067626953, + "loss": 0.6925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0010193719062954187, + "rewards/margins": 0.001458501792512834, + "rewards/margins_max": 0.0036475714296102524, + "rewards/margins_min": -0.0007305679609999061, + "rewards/margins_std": 0.0030958119314163923, + "rewards/rejected": -0.0004391298571135849, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.40625, + "learning_rate": 8.23045267489712e-07, + "logits/chosen": 0.08637161552906036, + "logits/rejected": 0.6608158946037292, + "logps/chosen": -272.7409973144531, + "logps/rejected": -232.7211151123047, + "loss": 0.6918, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.0017092287307605147, + "rewards/margins": 0.002595087978988886, + "rewards/margins_max": 0.0045972722582519054, + "rewards/margins_min": 0.0005929030594415963, + "rewards/margins_std": 0.0028315167874097824, + "rewards/rejected": -0.0008858589571900666, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.39453125, + "learning_rate": 1.02880658436214e-06, + "logits/chosen": 0.039637185633182526, + "logits/rejected": 0.42562946677207947, + "logps/chosen": -248.4722137451172, + "logps/rejected": -249.7132568359375, + "loss": 0.6907, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.0025177341885864735, + "rewards/margins": 0.004976312164217234, + "rewards/margins_max": 0.008509628474712372, + "rewards/margins_min": 0.0014429950388148427, + "rewards/margins_std": 0.004996864590793848, + "rewards/rejected": -0.002458578208461404, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.412109375, + "learning_rate": 1.234567901234568e-06, + "logits/chosen": 0.030338022857904434, + "logits/rejected": 0.6016219854354858, + "logps/chosen": -242.9213409423828, + "logps/rejected": -205.34011840820312, + "loss": 0.6897, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.002770364750176668, + "rewards/margins": 0.006898685358464718, + "rewards/margins_max": 0.01105786394327879, + "rewards/margins_min": 0.002739507704973221, + "rewards/margins_std": 0.005881965160369873, + "rewards/rejected": -0.00412832060828805, + "step": 60 + }, + { + "epoch": 0.03, + "grad_norm": 0.43359375, + "learning_rate": 1.440329218106996e-06, + "logits/chosen": 0.12884962558746338, + "logits/rejected": 0.6521704196929932, + "logps/chosen": -233.1442108154297, + "logps/rejected": -180.2538299560547, + "loss": 0.6884, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.0031278387177735567, + "rewards/margins": 0.008645228110253811, + "rewards/margins_max": 0.012719206511974335, + "rewards/margins_min": 0.004571248777210712, + "rewards/margins_std": 0.005761477164924145, + "rewards/rejected": -0.005517390090972185, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.44140625, + "learning_rate": 1.646090534979424e-06, + "logits/chosen": -0.02626526914536953, + "logits/rejected": 0.4111458361148834, + "logps/chosen": -235.10330200195312, + "logps/rejected": -224.97488403320312, + "loss": 0.686, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.006755024194717407, + "rewards/margins": 0.014125975780189037, + "rewards/margins_max": 0.020800447091460228, + "rewards/margins_min": 0.007451505865901709, + "rewards/margins_std": 0.009439127519726753, + "rewards/rejected": -0.007370952516794205, + "step": 80 + }, + { + "epoch": 0.04, + "grad_norm": 0.4921875, + "learning_rate": 1.8518518518518519e-06, + "logits/chosen": 0.26549288630485535, + "logits/rejected": 0.6299537420272827, + "logps/chosen": -205.5663604736328, + "logps/rejected": -195.4409637451172, + "loss": 0.6846, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.00643587950617075, + "rewards/margins": 0.016837503761053085, + "rewards/margins_max": 0.02518610656261444, + "rewards/margins_min": 0.008488905616104603, + "rewards/margins_std": 0.011806704103946686, + "rewards/rejected": -0.010401626117527485, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.427734375, + "learning_rate": 2.05761316872428e-06, + "logits/chosen": -0.02292916737496853, + "logits/rejected": 0.4407041072845459, + "logps/chosen": -237.1365509033203, + "logps/rejected": -234.0122833251953, + "loss": 0.6818, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.008963329717516899, + "rewards/margins": 0.023605378344655037, + "rewards/margins_max": 0.0348113588988781, + "rewards/margins_min": 0.012399397790431976, + "rewards/margins_std": 0.015847649425268173, + "rewards/rejected": -0.014642049558460712, + "step": 100 + }, + { + "epoch": 0.05, + "grad_norm": 0.5, + "learning_rate": 2.263374485596708e-06, + "logits/chosen": 0.06019078567624092, + "logits/rejected": 0.6456455588340759, + "logps/chosen": -252.55941772460938, + "logps/rejected": -202.68516540527344, + "loss": 0.6787, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01171482540667057, + "rewards/margins": 0.0284560676664114, + "rewards/margins_max": 0.040376029908657074, + "rewards/margins_min": 0.016536109149456024, + "rewards/margins_std": 0.016857367008924484, + "rewards/rejected": -0.01674124039709568, + "step": 110 + }, + { + "epoch": 0.05, + "grad_norm": 0.380859375, + "learning_rate": 2.469135802469136e-06, + "logits/chosen": 0.03018159233033657, + "logits/rejected": 0.5444492101669312, + "logps/chosen": -230.00732421875, + "logps/rejected": -204.04888916015625, + "loss": 0.6755, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.010969040915369987, + "rewards/margins": 0.03301534429192543, + "rewards/margins_max": 0.045270394533872604, + "rewards/margins_min": 0.020760290324687958, + "rewards/margins_std": 0.017331259325146675, + "rewards/rejected": -0.022046301513910294, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 0.474609375, + "learning_rate": 2.674897119341564e-06, + "logits/chosen": 0.1473396122455597, + "logits/rejected": 0.6573908925056458, + "logps/chosen": -263.9186096191406, + "logps/rejected": -234.0851593017578, + "loss": 0.6704, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.019587906077504158, + "rewards/margins": 0.04693462699651718, + "rewards/margins_max": 0.06604455411434174, + "rewards/margins_min": 0.02782469615340233, + "rewards/margins_std": 0.027025526389479637, + "rewards/rejected": -0.027346724644303322, + "step": 130 + }, + { + "epoch": 0.06, + "grad_norm": 0.46484375, + "learning_rate": 2.880658436213992e-06, + "logits/chosen": 0.1025664433836937, + "logits/rejected": 0.6043235063552856, + "logps/chosen": -249.485595703125, + "logps/rejected": -218.0284423828125, + "loss": 0.6651, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020640945062041283, + "rewards/margins": 0.054881542921066284, + "rewards/margins_max": 0.0779130607843399, + "rewards/margins_min": 0.03185003623366356, + "rewards/margins_std": 0.032571472227573395, + "rewards/rejected": -0.03424059972167015, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.51171875, + "learning_rate": 3.08641975308642e-06, + "logits/chosen": 0.03741316497325897, + "logits/rejected": 0.730408787727356, + "logps/chosen": -271.1098327636719, + "logps/rejected": -231.9659423828125, + "loss": 0.658, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03794144093990326, + "rewards/margins": 0.07607483863830566, + "rewards/margins_max": 0.10968559980392456, + "rewards/margins_min": 0.04246408864855766, + "rewards/margins_std": 0.04753277823328972, + "rewards/rejected": -0.038133405148983, + "step": 150 + }, + { + "epoch": 0.07, + "grad_norm": 0.44140625, + "learning_rate": 3.292181069958848e-06, + "logits/chosen": 0.027915984392166138, + "logits/rejected": 0.5170690417289734, + "logps/chosen": -227.6484375, + "logps/rejected": -201.67355346679688, + "loss": 0.6572, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.029438817873597145, + "rewards/margins": 0.07748283445835114, + "rewards/margins_max": 0.1150212287902832, + "rewards/margins_min": 0.03994445875287056, + "rewards/margins_std": 0.05308728292584419, + "rewards/rejected": -0.04804402217268944, + "step": 160 + }, + { + "epoch": 0.07, + "grad_norm": 0.427734375, + "learning_rate": 3.4979423868312762e-06, + "logits/chosen": 0.06907240301370621, + "logits/rejected": 0.5936463475227356, + "logps/chosen": -229.10653686523438, + "logps/rejected": -239.67593383789062, + "loss": 0.6451, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.05052729696035385, + "rewards/margins": 0.09715622663497925, + "rewards/margins_max": 0.14001211524009705, + "rewards/margins_min": 0.05430033057928085, + "rewards/margins_std": 0.060607392340898514, + "rewards/rejected": -0.046628933399915695, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 0.388671875, + "learning_rate": 3.7037037037037037e-06, + "logits/chosen": 0.23458850383758545, + "logits/rejected": 0.604918360710144, + "logps/chosen": -207.04891967773438, + "logps/rejected": -222.05416870117188, + "loss": 0.6477, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.021370261907577515, + "rewards/margins": 0.09243619441986084, + "rewards/margins_max": 0.13056758046150208, + "rewards/margins_min": 0.0543048158288002, + "rewards/margins_std": 0.05392590910196304, + "rewards/rejected": -0.07106593251228333, + "step": 180 + }, + { + "epoch": 0.08, + "grad_norm": 0.53515625, + "learning_rate": 3.909465020576132e-06, + "logits/chosen": 0.1551034152507782, + "logits/rejected": 0.7508169412612915, + "logps/chosen": -252.1875457763672, + "logps/rejected": -227.7545166015625, + "loss": 0.6276, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.03474985808134079, + "rewards/margins": 0.13373127579689026, + "rewards/margins_max": 0.19556060433387756, + "rewards/margins_min": 0.07190193980932236, + "rewards/margins_std": 0.0874398797750473, + "rewards/rejected": -0.09898141771554947, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.51171875, + "learning_rate": 4.11522633744856e-06, + "logits/chosen": 0.06500478088855743, + "logits/rejected": 0.7195091247558594, + "logps/chosen": -267.84259033203125, + "logps/rejected": -238.9481658935547, + "loss": 0.6207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0433725044131279, + "rewards/margins": 0.16162040829658508, + "rewards/margins_max": 0.2354629933834076, + "rewards/margins_min": 0.08777783066034317, + "rewards/margins_std": 0.10442917048931122, + "rewards/rejected": -0.11824791133403778, + "step": 200 + }, + { + "epoch": 0.09, + "grad_norm": 0.5078125, + "learning_rate": 4.3209876543209875e-06, + "logits/chosen": 0.07097109407186508, + "logits/rejected": 0.5758925676345825, + "logps/chosen": -244.7234649658203, + "logps/rejected": -232.4202117919922, + "loss": 0.6185, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.027079004794359207, + "rewards/margins": 0.17639262974262238, + "rewards/margins_max": 0.2551138401031494, + "rewards/margins_min": 0.09767140448093414, + "rewards/margins_std": 0.1113286241889, + "rewards/rejected": -0.14931362867355347, + "step": 210 + }, + { + "epoch": 0.09, + "grad_norm": 0.474609375, + "learning_rate": 4.526748971193416e-06, + "logits/chosen": 0.10515166819095612, + "logits/rejected": 0.6553866267204285, + "logps/chosen": -236.6344451904297, + "logps/rejected": -224.96749877929688, + "loss": 0.6016, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01706048846244812, + "rewards/margins": 0.21246306598186493, + "rewards/margins_max": 0.31518980860710144, + "rewards/margins_min": 0.10973634570837021, + "rewards/margins_std": 0.14527757465839386, + "rewards/rejected": -0.1954026073217392, + "step": 220 + }, + { + "epoch": 0.09, + "grad_norm": 0.53125, + "learning_rate": 4.732510288065844e-06, + "logits/chosen": -0.010027505457401276, + "logits/rejected": 0.5649107098579407, + "logps/chosen": -292.68572998046875, + "logps/rejected": -271.10955810546875, + "loss": 0.5803, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.009528962895274162, + "rewards/margins": 0.2713000476360321, + "rewards/margins_max": 0.4035729765892029, + "rewards/margins_min": 0.13902710378170013, + "rewards/margins_std": 0.18706218898296356, + "rewards/rejected": -0.2808290421962738, + "step": 230 + }, + { + "epoch": 0.1, + "grad_norm": 0.6015625, + "learning_rate": 4.938271604938272e-06, + "logits/chosen": 0.028728529810905457, + "logits/rejected": 0.5883212685585022, + "logps/chosen": -252.4741973876953, + "logps/rejected": -263.96063232421875, + "loss": 0.5355, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.026816654950380325, + "rewards/margins": 0.36577218770980835, + "rewards/margins_max": 0.5184392333030701, + "rewards/margins_min": 0.21310511231422424, + "rewards/margins_std": 0.2159038782119751, + "rewards/rejected": -0.39258888363838196, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 0.55859375, + "learning_rate": 4.999873380880316e-06, + "logits/chosen": -0.04030367732048035, + "logits/rejected": 0.5767666697502136, + "logps/chosen": -280.7464904785156, + "logps/rejected": -289.3246154785156, + "loss": 0.5451, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.04451828449964523, + "rewards/margins": 0.31411212682724, + "rewards/margins_max": 0.4603235125541687, + "rewards/margins_min": 0.16790074110031128, + "rewards/margins_std": 0.20677416026592255, + "rewards/rejected": -0.3586304783821106, + "step": 250 + }, + { + "epoch": 0.11, + "grad_norm": 0.52734375, + "learning_rate": 4.999253236476256e-06, + "logits/chosen": 0.11786775290966034, + "logits/rejected": 0.7519556879997253, + "logps/chosen": -285.5740966796875, + "logps/rejected": -260.8897399902344, + "loss": 0.5191, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.16191735863685608, + "rewards/margins": 0.3822050988674164, + "rewards/margins_max": 0.6438002586364746, + "rewards/margins_min": 0.12060992419719696, + "rewards/margins_std": 0.36995142698287964, + "rewards/rejected": -0.5441225171089172, + "step": 260 + }, + { + "epoch": 0.11, + "grad_norm": 0.62109375, + "learning_rate": 4.998116438252842e-06, + "logits/chosen": -0.01648726500570774, + "logits/rejected": 0.596198558807373, + "logps/chosen": -308.7812194824219, + "logps/rejected": -326.95343017578125, + "loss": 0.4666, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.16989798843860626, + "rewards/margins": 0.7350731492042542, + "rewards/margins_max": 1.1802551746368408, + "rewards/margins_min": 0.28989118337631226, + "rewards/margins_std": 0.6295824646949768, + "rewards/rejected": -0.9049711227416992, + "step": 270 + }, + { + "epoch": 0.12, + "grad_norm": 0.56640625, + "learning_rate": 4.9964632212127305e-06, + "logits/chosen": 0.0899326428771019, + "logits/rejected": 0.6752752065658569, + "logps/chosen": -290.4791564941406, + "logps/rejected": -322.41815185546875, + "loss": 0.4527, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.32237640023231506, + "rewards/margins": 0.6386328935623169, + "rewards/margins_max": 0.9289911985397339, + "rewards/margins_min": 0.3482745587825775, + "rewards/margins_std": 0.41062870621681213, + "rewards/rejected": -0.9610093235969543, + "step": 280 + }, + { + "epoch": 0.12, + "grad_norm": 0.7265625, + "learning_rate": 4.994293927114362e-06, + "logits/chosen": 0.06901798397302628, + "logits/rejected": 0.6215580105781555, + "logps/chosen": -290.79632568359375, + "logps/rejected": -373.4809265136719, + "loss": 0.4322, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.4801758825778961, + "rewards/margins": 1.097102403640747, + "rewards/margins_max": 1.8094851970672607, + "rewards/margins_min": 0.3847196698188782, + "rewards/margins_std": 1.0074613094329834, + "rewards/rejected": -1.5772783756256104, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 0.66796875, + "learning_rate": 4.991609004401324e-06, + "logits/chosen": -0.018922004848718643, + "logits/rejected": 0.6193499565124512, + "logps/chosen": -317.2272644042969, + "logps/rejected": -400.44696044921875, + "loss": 0.3836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.622567355632782, + "rewards/margins": 1.1482007503509521, + "rewards/margins_max": 1.7601646184921265, + "rewards/margins_min": 0.5362368226051331, + "rewards/margins_std": 0.8654475212097168, + "rewards/rejected": -1.770768165588379, + "step": 300 + }, + { + "epoch": 0.13, + "grad_norm": 1.4375, + "learning_rate": 4.988409008109638e-06, + "logits/chosen": 0.18614912033081055, + "logits/rejected": 0.5903946161270142, + "logps/chosen": -306.6070251464844, + "logps/rejected": -419.8663024902344, + "loss": 0.3599, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.8805279731750488, + "rewards/margins": 1.354397177696228, + "rewards/margins_max": 2.106048107147217, + "rewards/margins_min": 0.6027460098266602, + "rewards/margins_std": 1.062995195388794, + "rewards/rejected": -2.2349250316619873, + "step": 310 + }, + { + "epoch": 0.13, + "grad_norm": 3.484375, + "learning_rate": 4.984694599753024e-06, + "logits/chosen": 0.04539443925023079, + "logits/rejected": 0.5839862823486328, + "logps/chosen": -364.6256408691406, + "logps/rejected": -489.1861877441406, + "loss": 0.3496, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.2731292247772217, + "rewards/margins": 1.5762968063354492, + "rewards/margins_max": 2.456343650817871, + "rewards/margins_min": 0.696249783039093, + "rewards/margins_std": 1.2445745468139648, + "rewards/rejected": -2.849426031112671, + "step": 320 + }, + { + "epoch": 0.14, + "grad_norm": 1.140625, + "learning_rate": 4.980466547186149e-06, + "logits/chosen": -0.06857666373252869, + "logits/rejected": 0.6623315811157227, + "logps/chosen": -401.6612243652344, + "logps/rejected": -569.5274047851562, + "loss": 0.2962, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7086451053619385, + "rewards/margins": 2.181455135345459, + "rewards/margins_max": 3.330390453338623, + "rewards/margins_min": 1.0325195789337158, + "rewards/margins_std": 1.6248401403427124, + "rewards/rejected": -3.8901000022888184, + "step": 330 + }, + { + "epoch": 0.14, + "grad_norm": 0.859375, + "learning_rate": 4.975725724445898e-06, + "logits/chosen": 0.18517382442951202, + "logits/rejected": 0.7153784036636353, + "logps/chosen": -425.679443359375, + "logps/rejected": -613.3704833984375, + "loss": 0.3906, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.7429125308990479, + "rewards/margins": 2.2590205669403076, + "rewards/margins_max": 3.9183871746063232, + "rewards/margins_min": 0.5996544361114502, + "rewards/margins_std": 2.34669828414917, + "rewards/rejected": -4.0019330978393555, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 1.7109375, + "learning_rate": 4.9704731115706805e-06, + "logits/chosen": 0.06402029097080231, + "logits/rejected": 0.6806127429008484, + "logps/chosen": -403.9203186035156, + "logps/rejected": -743.2752075195312, + "loss": 0.2744, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.463424801826477, + "rewards/margins": 3.6999382972717285, + "rewards/margins_max": 6.0486063957214355, + "rewards/margins_min": 1.3512706756591797, + "rewards/margins_std": 3.3215174674987793, + "rewards/rejected": -5.163362979888916, + "step": 350 + }, + { + "epoch": 0.15, + "grad_norm": 4.71875, + "learning_rate": 4.964709794397846e-06, + "logits/chosen": 0.17624667286872864, + "logits/rejected": 0.8073934316635132, + "logps/chosen": -420.7196350097656, + "logps/rejected": -765.087158203125, + "loss": 0.2887, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.7227516174316406, + "rewards/margins": 3.5676417350769043, + "rewards/margins_max": 6.014761447906494, + "rewards/margins_min": 1.1205217838287354, + "rewards/margins_std": 3.460750102996826, + "rewards/rejected": -5.290392875671387, + "step": 360 + }, + { + "epoch": 0.15, + "grad_norm": 0.546875, + "learning_rate": 4.9584369643392076e-06, + "logits/chosen": 0.145114004611969, + "logits/rejected": 0.8146367073059082, + "logps/chosen": -478.583984375, + "logps/rejected": -902.0680541992188, + "loss": 0.2173, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.5044260025024414, + "rewards/margins": 4.49790096282959, + "rewards/margins_max": 7.457921028137207, + "rewards/margins_min": 1.5378811359405518, + "rewards/margins_std": 4.186100006103516, + "rewards/rejected": -7.002326965332031, + "step": 370 + }, + { + "epoch": 0.16, + "grad_norm": 3.046875, + "learning_rate": 4.951655918134749e-06, + "logits/chosen": 0.10492346435785294, + "logits/rejected": 0.6990815997123718, + "logps/chosen": -523.7138671875, + "logps/rejected": -888.4390869140625, + "loss": 0.2891, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.9183859825134277, + "rewards/margins": 3.839510679244995, + "rewards/margins_max": 6.660401344299316, + "rewards/margins_min": 1.0186195373535156, + "rewards/margins_std": 3.989342451095581, + "rewards/rejected": -6.757896423339844, + "step": 380 + }, + { + "epoch": 0.16, + "grad_norm": 2.03125, + "learning_rate": 4.944368057584568e-06, + "logits/chosen": 0.10440587997436523, + "logits/rejected": 0.8017401695251465, + "logps/chosen": -489.6419982910156, + "logps/rejected": -886.7931518554688, + "loss": 0.2525, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.3617236614227295, + "rewards/margins": 4.341439247131348, + "rewards/margins_max": 6.716013431549072, + "rewards/margins_min": 1.9668653011322021, + "rewards/margins_std": 3.358154773712158, + "rewards/rejected": -6.703163146972656, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 0.88671875, + "learning_rate": 4.936574889259076e-06, + "logits/chosen": 0.20124737918376923, + "logits/rejected": 0.9363399744033813, + "logps/chosen": -510.68682861328125, + "logps/rejected": -808.8845825195312, + "loss": 0.2781, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.1757853031158447, + "rewards/margins": 3.780200958251953, + "rewards/margins_max": 6.390686511993408, + "rewards/margins_min": 1.1697145700454712, + "rewards/margins_std": 3.691784620285034, + "rewards/rejected": -5.9559855461120605, + "step": 400 + }, + { + "epoch": 0.17, + "grad_norm": 0.7421875, + "learning_rate": 4.928278024187572e-06, + "logits/chosen": 0.07302796840667725, + "logits/rejected": 0.7525766491889954, + "logps/chosen": -441.041015625, + "logps/rejected": -718.113525390625, + "loss": 0.2566, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.917728066444397, + "rewards/margins": 3.044644832611084, + "rewards/margins_max": 4.838767051696777, + "rewards/margins_min": 1.2505226135253906, + "rewards/margins_std": 2.537271738052368, + "rewards/rejected": -4.962372779846191, + "step": 410 + }, + { + "epoch": 0.17, + "grad_norm": 1.4140625, + "learning_rate": 4.91947917752519e-06, + "logits/chosen": 0.2481038123369217, + "logits/rejected": 0.8633731603622437, + "logps/chosen": -495.15234375, + "logps/rejected": -929.7926635742188, + "loss": 0.2143, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.3359451293945312, + "rewards/margins": 4.435263156890869, + "rewards/margins_max": 6.896138668060303, + "rewards/margins_min": 1.9743881225585938, + "rewards/margins_std": 3.480203151702881, + "rewards/rejected": -6.7712082862854, + "step": 420 + }, + { + "epoch": 0.18, + "grad_norm": 1.171875, + "learning_rate": 4.91018016819835e-06, + "logits/chosen": 0.19703389704227448, + "logits/rejected": 0.8145115971565247, + "logps/chosen": -463.3583068847656, + "logps/rejected": -735.3023681640625, + "loss": 0.3076, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.0541656017303467, + "rewards/margins": 3.166626214981079, + "rewards/margins_max": 5.232685565948486, + "rewards/margins_min": 1.1005662679672241, + "rewards/margins_std": 2.921849489212036, + "rewards/rejected": -5.220791816711426, + "step": 430 + }, + { + "epoch": 0.18, + "grad_norm": 1.3203125, + "learning_rate": 4.900382918528732e-06, + "logits/chosen": 0.37838277220726013, + "logits/rejected": 0.9560055732727051, + "logps/chosen": -490.0171813964844, + "logps/rejected": -867.9541015625, + "loss": 0.2098, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.327396869659424, + "rewards/margins": 4.201695919036865, + "rewards/margins_max": 6.30963659286499, + "rewards/margins_min": 2.0937557220458984, + "rewards/margins_std": 2.9810779094696045, + "rewards/rejected": -6.529093265533447, + "step": 440 + }, + { + "epoch": 0.19, + "grad_norm": 0.416015625, + "learning_rate": 4.890089453835894e-06, + "logits/chosen": 0.16315485537052155, + "logits/rejected": 0.8696213960647583, + "logps/chosen": -516.959716796875, + "logps/rejected": -999.4393310546875, + "loss": 0.1884, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.290456533432007, + "rewards/margins": 5.147913932800293, + "rewards/margins_max": 8.1465425491333, + "rewards/margins_min": 2.1492867469787598, + "rewards/margins_std": 4.240699768066406, + "rewards/rejected": -7.438370704650879, + "step": 450 + }, + { + "epoch": 0.19, + "grad_norm": 0.828125, + "learning_rate": 4.879301902018592e-06, + "logits/chosen": 0.2864415943622589, + "logits/rejected": 0.7803254127502441, + "logps/chosen": -533.4550170898438, + "logps/rejected": -1063.6312255859375, + "loss": 0.2423, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.9031295776367188, + "rewards/margins": 5.533447265625, + "rewards/margins_max": 8.971426010131836, + "rewards/margins_min": 2.0954694747924805, + "rewards/margins_std": 4.862034797668457, + "rewards/rejected": -8.436576843261719, + "step": 460 + }, + { + "epoch": 0.19, + "grad_norm": 7.3125, + "learning_rate": 4.868022493114887e-06, + "logits/chosen": 0.33959221839904785, + "logits/rejected": 1.040248155593872, + "logps/chosen": -664.7828369140625, + "logps/rejected": -1284.1910400390625, + "loss": 0.1801, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.016963958740234, + "rewards/margins": 6.7210187911987305, + "rewards/margins_max": 10.781229019165039, + "rewards/margins_min": 2.6608097553253174, + "rewards/margins_std": 5.742003440856934, + "rewards/rejected": -10.737983703613281, + "step": 470 + }, + { + "epoch": 0.2, + "grad_norm": 1.8828125, + "learning_rate": 4.856253558841153e-06, + "logits/chosen": 0.43446415662765503, + "logits/rejected": 1.01176118850708, + "logps/chosen": -664.5281982421875, + "logps/rejected": -1319.181640625, + "loss": 0.3663, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.380959987640381, + "rewards/margins": 6.677268028259277, + "rewards/margins_max": 10.735626220703125, + "rewards/margins_min": 2.618910074234009, + "rewards/margins_std": 5.739384651184082, + "rewards/rejected": -11.0582275390625, + "step": 480 + }, + { + "epoch": 0.2, + "grad_norm": 0.9453125, + "learning_rate": 4.843997532110051e-06, + "logits/chosen": 0.4099550247192383, + "logits/rejected": 0.9675588607788086, + "logps/chosen": -634.2232666015625, + "logps/rejected": -1535.5347900390625, + "loss": 0.1502, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.844944477081299, + "rewards/margins": 9.073770523071289, + "rewards/margins_max": 14.664604187011719, + "rewards/margins_min": 3.482936143875122, + "rewards/margins_std": 7.906632423400879, + "rewards/rejected": -12.91871452331543, + "step": 490 + }, + { + "epoch": 0.21, + "grad_norm": 0.341796875, + "learning_rate": 4.831256946527591e-06, + "logits/chosen": 0.41468414664268494, + "logits/rejected": 1.1351321935653687, + "logps/chosen": -591.6776123046875, + "logps/rejected": -1291.275146484375, + "loss": 0.2315, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.6611881256103516, + "rewards/margins": 7.434420585632324, + "rewards/margins_max": 11.837034225463867, + "rewards/margins_min": 3.0318071842193604, + "rewards/margins_std": 6.226236820220947, + "rewards/rejected": -11.095609664916992, + "step": 500 + }, + { + "epoch": 0.21, + "grad_norm": 1.0625, + "learning_rate": 4.818034435869377e-06, + "logits/chosen": 0.5877698063850403, + "logits/rejected": 1.2467072010040283, + "logps/chosen": -623.4757080078125, + "logps/rejected": -1281.064697265625, + "loss": 0.1391, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.8208084106445312, + "rewards/margins": 6.917235374450684, + "rewards/margins_max": 10.565729141235352, + "rewards/margins_min": 3.268742322921753, + "rewards/margins_std": 5.159748077392578, + "rewards/rejected": -10.738044738769531, + "step": 510 + }, + { + "epoch": 0.21, + "grad_norm": 3.890625, + "learning_rate": 4.804332733536141e-06, + "logits/chosen": 0.45656394958496094, + "logits/rejected": 1.1674026250839233, + "logps/chosen": -701.67041015625, + "logps/rejected": -1496.7086181640625, + "loss": 0.2265, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.459706783294678, + "rewards/margins": 8.300594329833984, + "rewards/margins_max": 13.440702438354492, + "rewards/margins_min": 3.1604866981506348, + "rewards/margins_std": 7.2692108154296875, + "rewards/rejected": -12.76030158996582, + "step": 520 + }, + { + "epoch": 0.22, + "grad_norm": 0.75390625, + "learning_rate": 4.790154671988696e-06, + "logits/chosen": 0.707282304763794, + "logits/rejected": 1.2839213609695435, + "logps/chosen": -713.0794067382812, + "logps/rejected": -1470.809326171875, + "loss": 0.1294, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.646553993225098, + "rewards/margins": 7.7992706298828125, + "rewards/margins_max": 12.847709655761719, + "rewards/margins_min": 2.7508316040039062, + "rewards/margins_std": 7.1395721435546875, + "rewards/rejected": -12.44582462310791, + "step": 530 + }, + { + "epoch": 0.22, + "grad_norm": 22.625, + "learning_rate": 4.775503182162386e-06, + "logits/chosen": 0.6817615032196045, + "logits/rejected": 1.3176844120025635, + "logps/chosen": -850.7849731445312, + "logps/rejected": -1690.505126953125, + "loss": 0.253, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.02855110168457, + "rewards/margins": 8.858359336853027, + "rewards/margins_max": 13.857948303222656, + "rewards/margins_min": 3.858771800994873, + "rewards/margins_std": 7.0704851150512695, + "rewards/rejected": -14.886911392211914, + "step": 540 + }, + { + "epoch": 0.23, + "grad_norm": 1.8671875, + "learning_rate": 4.7603812928612e-06, + "logits/chosen": 0.4829481542110443, + "logits/rejected": 1.1649879217147827, + "logps/chosen": -747.2049560546875, + "logps/rejected": -1385.47216796875, + "loss": 0.4339, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.999864101409912, + "rewards/margins": 6.820773124694824, + "rewards/margins_max": 10.468523025512695, + "rewards/margins_min": 3.1730237007141113, + "rewards/margins_std": 5.158697605133057, + "rewards/rejected": -11.820637702941895, + "step": 550 + }, + { + "epoch": 0.23, + "grad_norm": 1.5078125, + "learning_rate": 4.744792130131653e-06, + "logits/chosen": 0.3002074360847473, + "logits/rejected": 1.0043690204620361, + "logps/chosen": -662.5621948242188, + "logps/rejected": -1360.820068359375, + "loss": 0.1538, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.9544594287872314, + "rewards/margins": 7.2028093338012695, + "rewards/margins_max": 11.087037086486816, + "rewards/margins_min": 3.318582534790039, + "rewards/margins_std": 5.493126392364502, + "rewards/rejected": -11.157269477844238, + "step": 560 + }, + { + "epoch": 0.23, + "grad_norm": 0.63671875, + "learning_rate": 4.728738916616552e-06, + "logits/chosen": 0.5242341756820679, + "logits/rejected": 1.1999857425689697, + "logps/chosen": -646.2457885742188, + "logps/rejected": -1409.1556396484375, + "loss": 0.2874, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.057827949523926, + "rewards/margins": 7.776298522949219, + "rewards/margins_max": 11.975003242492676, + "rewards/margins_min": 3.57759428024292, + "rewards/margins_std": 5.937864780426025, + "rewards/rejected": -11.834127426147461, + "step": 570 + }, + { + "epoch": 0.24, + "grad_norm": 4.71875, + "learning_rate": 4.712224970888801e-06, + "logits/chosen": 0.580299973487854, + "logits/rejected": 1.3875830173492432, + "logps/chosen": -721.7586059570312, + "logps/rejected": -1617.5888671875, + "loss": 0.2512, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.572092056274414, + "rewards/margins": 9.203888893127441, + "rewards/margins_max": 14.512364387512207, + "rewards/margins_min": 3.8954148292541504, + "rewards/margins_std": 7.507315635681152, + "rewards/rejected": -13.775980949401855, + "step": 580 + }, + { + "epoch": 0.24, + "grad_norm": 4.375, + "learning_rate": 4.69525370676538e-06, + "logits/chosen": 0.5429633855819702, + "logits/rejected": 1.3331568241119385, + "logps/chosen": -695.3401489257812, + "logps/rejected": -1387.588134765625, + "loss": 0.2468, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.615943908691406, + "rewards/margins": 7.4264984130859375, + "rewards/margins_max": 11.769147872924805, + "rewards/margins_min": 3.0838465690612793, + "rewards/margins_std": 6.141435623168945, + "rewards/rejected": -12.042441368103027, + "step": 590 + }, + { + "epoch": 0.25, + "grad_norm": 2.859375, + "learning_rate": 4.677828632601625e-06, + "logits/chosen": 0.49036288261413574, + "logits/rejected": 1.2113770246505737, + "logps/chosen": -631.5177001953125, + "logps/rejected": -1210.844482421875, + "loss": 0.128, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.760044813156128, + "rewards/margins": 6.260881423950195, + "rewards/margins_max": 9.719388961791992, + "rewards/margins_min": 2.8023738861083984, + "rewards/margins_std": 4.891068458557129, + "rewards/rejected": -10.020925521850586, + "step": 600 + }, + { + "epoch": 0.25, + "grad_norm": 6.875, + "learning_rate": 4.65995335056597e-06, + "logits/chosen": 0.4661685824394226, + "logits/rejected": 1.1997615098953247, + "logps/chosen": -697.4072265625, + "logps/rejected": -1316.599853515625, + "loss": 0.2737, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.225058555603027, + "rewards/margins": 6.9119462966918945, + "rewards/margins_max": 10.23686408996582, + "rewards/margins_min": 3.5870280265808105, + "rewards/margins_std": 4.702144145965576, + "rewards/rejected": -11.137005805969238, + "step": 610 + }, + { + "epoch": 0.26, + "grad_norm": 0.90234375, + "learning_rate": 4.6416315558952985e-06, + "logits/chosen": 0.5700492858886719, + "logits/rejected": 1.2297275066375732, + "logps/chosen": -648.19482421875, + "logps/rejected": -1285.586181640625, + "loss": 0.2359, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.9675934314727783, + "rewards/margins": 6.879029273986816, + "rewards/margins_max": 11.239900588989258, + "rewards/margins_min": 2.518155336380005, + "rewards/margins_std": 6.167205810546875, + "rewards/rejected": -10.846620559692383, + "step": 620 + }, + { + "epoch": 0.26, + "grad_norm": 0.6640625, + "learning_rate": 4.622867036131045e-06, + "logits/chosen": 0.4446844160556793, + "logits/rejected": 1.1179345846176147, + "logps/chosen": -699.2020263671875, + "logps/rejected": -1307.1138916015625, + "loss": 0.1037, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.177753925323486, + "rewards/margins": 6.502650260925293, + "rewards/margins_max": 10.074499130249023, + "rewards/margins_min": 2.9308011531829834, + "rewards/margins_std": 5.051357746124268, + "rewards/rejected": -10.680402755737305, + "step": 630 + }, + { + "epoch": 0.26, + "grad_norm": 1.984375, + "learning_rate": 4.60366367033623e-06, + "logits/chosen": 0.40803995728492737, + "logits/rejected": 1.114776849746704, + "logps/chosen": -723.8427734375, + "logps/rejected": -1445.0594482421875, + "loss": 0.1899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.7852630615234375, + "rewards/margins": 7.497411251068115, + "rewards/margins_max": 11.162347793579102, + "rewards/margins_min": 3.8324737548828125, + "rewards/margins_std": 5.183003902435303, + "rewards/rejected": -12.282673835754395, + "step": 640 + }, + { + "epoch": 0.27, + "grad_norm": 1.1015625, + "learning_rate": 4.5840254282935604e-06, + "logits/chosen": 0.5937483310699463, + "logits/rejected": 1.2330735921859741, + "logps/chosen": -796.9608154296875, + "logps/rejected": -1525.527587890625, + "loss": 0.2084, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.397136688232422, + "rewards/margins": 7.485539436340332, + "rewards/margins_max": 11.793124198913574, + "rewards/margins_min": 3.177953004837036, + "rewards/margins_std": 6.091846942901611, + "rewards/rejected": -12.882675170898438, + "step": 650 + }, + { + "epoch": 0.27, + "grad_norm": 4.875, + "learning_rate": 4.56395636968479e-06, + "logits/chosen": 0.6977173089981079, + "logits/rejected": 1.2386213541030884, + "logps/chosen": -645.6939697265625, + "logps/rejected": -1467.990234375, + "loss": 0.1096, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.246734619140625, + "rewards/margins": 8.112640380859375, + "rewards/margins_max": 12.321252822875977, + "rewards/margins_min": 3.904027223587036, + "rewards/margins_std": 5.951877593994141, + "rewards/rejected": -12.359376907348633, + "step": 660 + }, + { + "epoch": 0.28, + "grad_norm": 0.333984375, + "learning_rate": 4.543460643251481e-06, + "logits/chosen": 0.5177757740020752, + "logits/rejected": 1.1193509101867676, + "logps/chosen": -690.61572265625, + "logps/rejected": -1552.57568359375, + "loss": 0.1118, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.566229820251465, + "rewards/margins": 8.843162536621094, + "rewards/margins_max": 13.59516429901123, + "rewards/margins_min": 4.091159820556641, + "rewards/margins_std": 6.720346927642822, + "rewards/rejected": -13.409391403198242, + "step": 670 + }, + { + "epoch": 0.28, + "grad_norm": 1.578125, + "learning_rate": 4.522542485937369e-06, + "logits/chosen": 0.6444950103759766, + "logits/rejected": 1.3874423503875732, + "logps/chosen": -798.6126708984375, + "logps/rejected": -1724.8717041015625, + "loss": 0.1363, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.293717861175537, + "rewards/margins": 9.743268966674805, + "rewards/margins_max": 15.012044906616211, + "rewards/margins_min": 4.474490165710449, + "rewards/margins_std": 7.451178073883057, + "rewards/rejected": -15.036985397338867, + "step": 680 + }, + { + "epoch": 0.28, + "grad_norm": 1.6953125, + "learning_rate": 4.5012062220124845e-06, + "logits/chosen": 0.5247820019721985, + "logits/rejected": 1.264107346534729, + "logps/chosen": -724.8851928710938, + "logps/rejected": -1684.8916015625, + "loss": 0.1534, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.79865026473999, + "rewards/margins": 9.892607688903809, + "rewards/margins_max": 15.237088203430176, + "rewards/margins_min": 4.548129081726074, + "rewards/margins_std": 7.558236122131348, + "rewards/rejected": -14.691259384155273, + "step": 690 + }, + { + "epoch": 0.29, + "grad_norm": 0.28125, + "learning_rate": 4.479456262179228e-06, + "logits/chosen": 0.5434385538101196, + "logits/rejected": 1.2754974365234375, + "logps/chosen": -826.7483520507812, + "logps/rejected": -1494.5345458984375, + "loss": 0.1545, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.704430103302002, + "rewards/margins": 7.395480155944824, + "rewards/margins_max": 11.325884819030762, + "rewards/margins_min": 3.465075969696045, + "rewards/margins_std": 5.558432102203369, + "rewards/rejected": -13.099909782409668, + "step": 700 + }, + { + "epoch": 0.29, + "grad_norm": 0.65234375, + "learning_rate": 4.4572971026605726e-06, + "logits/chosen": 0.5515539646148682, + "logits/rejected": 1.3576513528823853, + "logps/chosen": -805.1383666992188, + "logps/rejected": -1752.7252197265625, + "loss": 0.1508, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.342719078063965, + "rewards/margins": 9.904947280883789, + "rewards/margins_max": 14.326292037963867, + "rewards/margins_min": 5.4836015701293945, + "rewards/margins_std": 6.252727031707764, + "rewards/rejected": -15.24766731262207, + "step": 710 + }, + { + "epoch": 0.3, + "grad_norm": 0.408203125, + "learning_rate": 4.434733324270592e-06, + "logits/chosen": 0.5185344815254211, + "logits/rejected": 1.1416656970977783, + "logps/chosen": -690.1055908203125, + "logps/rejected": -1510.14453125, + "loss": 0.1959, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.4360551834106445, + "rewards/margins": 8.723947525024414, + "rewards/margins_max": 12.72030258178711, + "rewards/margins_min": 4.727590084075928, + "rewards/margins_std": 5.651700973510742, + "rewards/rejected": -13.160001754760742, + "step": 720 + }, + { + "epoch": 0.3, + "grad_norm": 0.85546875, + "learning_rate": 4.411769591467497e-06, + "logits/chosen": 0.4622286856174469, + "logits/rejected": 1.096407175064087, + "logps/chosen": -706.9508056640625, + "logps/rejected": -1390.883544921875, + "loss": 0.1088, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.562924385070801, + "rewards/margins": 7.009686470031738, + "rewards/margins_max": 10.363374710083008, + "rewards/margins_min": 3.6559970378875732, + "rewards/margins_std": 4.742833137512207, + "rewards/rejected": -11.572611808776855, + "step": 730 + }, + { + "epoch": 0.3, + "grad_norm": 1.46875, + "learning_rate": 4.3884106513893895e-06, + "logits/chosen": 0.5636991262435913, + "logits/rejected": 1.2218422889709473, + "logps/chosen": -723.6990966796875, + "logps/rejected": -1594.478515625, + "loss": 0.1631, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.5525712966918945, + "rewards/margins": 9.058218002319336, + "rewards/margins_max": 13.891519546508789, + "rewards/margins_min": 4.224917411804199, + "rewards/margins_std": 6.835320949554443, + "rewards/rejected": -13.61078929901123, + "step": 740 + }, + { + "epoch": 0.31, + "grad_norm": 0.984375, + "learning_rate": 4.364661332872913e-06, + "logits/chosen": 0.4284195005893707, + "logits/rejected": 1.1675077676773071, + "logps/chosen": -757.3233642578125, + "logps/rejected": -1814.9117431640625, + "loss": 0.1645, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.945916652679443, + "rewards/margins": 11.039398193359375, + "rewards/margins_max": 16.22256851196289, + "rewards/margins_min": 5.856227874755859, + "rewards/margins_std": 7.330111026763916, + "rewards/rejected": -15.985315322875977, + "step": 750 + }, + { + "epoch": 0.31, + "grad_norm": 0.494140625, + "learning_rate": 4.340526545455016e-06, + "logits/chosen": 0.5042354464530945, + "logits/rejected": 1.2818940877914429, + "logps/chosen": -712.5407104492188, + "logps/rejected": -1623.8824462890625, + "loss": 0.1499, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.52707576751709, + "rewards/margins": 9.589614868164062, + "rewards/margins_max": 15.405502319335938, + "rewards/margins_min": 3.77372670173645, + "rewards/margins_std": 8.224907875061035, + "rewards/rejected": -14.116689682006836, + "step": 760 + }, + { + "epoch": 0.32, + "grad_norm": 4.5, + "learning_rate": 4.31601127835805e-06, + "logits/chosen": 0.4573752284049988, + "logits/rejected": 1.2255313396453857, + "logps/chosen": -803.6546630859375, + "logps/rejected": -1744.1165771484375, + "loss": 0.1508, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.967880725860596, + "rewards/margins": 10.099109649658203, + "rewards/margins_max": 15.592402458190918, + "rewards/margins_min": 4.605815887451172, + "rewards/margins_std": 7.768690586090088, + "rewards/rejected": -15.066988945007324, + "step": 770 + }, + { + "epoch": 0.32, + "grad_norm": 1.296875, + "learning_rate": 4.291120599458366e-06, + "logits/chosen": 0.6284778118133545, + "logits/rejected": 1.3736778497695923, + "logps/chosen": -744.2955322265625, + "logps/rejected": -1689.0992431640625, + "loss": 0.108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.914001941680908, + "rewards/margins": 9.785839080810547, + "rewards/margins_max": 14.224902153015137, + "rewards/margins_min": 5.346776008605957, + "rewards/margins_std": 6.2777838706970215, + "rewards/rejected": -14.69983959197998, + "step": 780 + }, + { + "epoch": 0.33, + "grad_norm": 0.83203125, + "learning_rate": 4.265859654238676e-06, + "logits/chosen": 0.518182635307312, + "logits/rejected": 1.266416311264038, + "logps/chosen": -795.889404296875, + "logps/rejected": -1599.6748046875, + "loss": 0.1042, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.121318340301514, + "rewards/margins": 8.590972900390625, + "rewards/margins_max": 12.986808776855469, + "rewards/margins_min": 4.195137023925781, + "rewards/margins_std": 6.216650485992432, + "rewards/rejected": -13.71229076385498, + "step": 790 + }, + { + "epoch": 0.33, + "grad_norm": 3.09375, + "learning_rate": 4.240233664724358e-06, + "logits/chosen": 0.5838888883590698, + "logits/rejected": 1.3169996738433838, + "logps/chosen": -767.9039916992188, + "logps/rejected": -1709.415283203125, + "loss": 0.1546, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.298363208770752, + "rewards/margins": 9.643119812011719, + "rewards/margins_max": 14.92499828338623, + "rewards/margins_min": 4.361241340637207, + "rewards/margins_std": 7.469703674316406, + "rewards/rejected": -14.941482543945312, + "step": 800 + }, + { + "epoch": 0.33, + "grad_norm": 0.5859375, + "learning_rate": 4.2142479284039445e-06, + "logits/chosen": 0.5468761920928955, + "logits/rejected": 1.227432131767273, + "logps/chosen": -770.5872802734375, + "logps/rejected": -1546.19384765625, + "loss": 0.0909, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.143388748168945, + "rewards/margins": 8.180723190307617, + "rewards/margins_max": 13.141166687011719, + "rewards/margins_min": 3.2202792167663574, + "rewards/margins_std": 7.015126705169678, + "rewards/rejected": -13.324111938476562, + "step": 810 + }, + { + "epoch": 0.34, + "grad_norm": 2.84375, + "learning_rate": 4.187907817134005e-06, + "logits/chosen": 0.5028406381607056, + "logits/rejected": 1.2698485851287842, + "logps/chosen": -769.3389282226562, + "logps/rejected": -2071.339111328125, + "loss": 0.0668, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.078272342681885, + "rewards/margins": 13.281135559082031, + "rewards/margins_max": 18.268909454345703, + "rewards/margins_min": 8.293363571166992, + "rewards/margins_std": 7.053775787353516, + "rewards/rejected": -18.359407424926758, + "step": 820 + }, + { + "epoch": 0.34, + "grad_norm": 0.96484375, + "learning_rate": 4.161218776028661e-06, + "logits/chosen": 0.4837300181388855, + "logits/rejected": 1.2130780220031738, + "logps/chosen": -780.1266479492188, + "logps/rejected": -2050.310302734375, + "loss": 0.2191, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.4014573097229, + "rewards/margins": 12.880502700805664, + "rewards/margins_max": 20.169330596923828, + "rewards/margins_min": 5.591673851013184, + "rewards/margins_std": 10.30795955657959, + "rewards/rejected": -18.281957626342773, + "step": 830 + }, + { + "epoch": 0.35, + "grad_norm": 0.443359375, + "learning_rate": 4.134186322333951e-06, + "logits/chosen": 0.5044664144515991, + "logits/rejected": 1.2629055976867676, + "logps/chosen": -710.2357788085938, + "logps/rejected": -1879.140625, + "loss": 0.1806, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.55966329574585, + "rewards/margins": 11.992768287658691, + "rewards/margins_max": 17.50288200378418, + "rewards/margins_min": 6.482656002044678, + "rewards/margins_std": 7.79247522354126, + "rewards/rejected": -16.552433013916016, + "step": 840 + }, + { + "epoch": 0.35, + "grad_norm": 2.796875, + "learning_rate": 4.106816044287292e-06, + "logits/chosen": 0.5818988084793091, + "logits/rejected": 1.2744948863983154, + "logps/chosen": -702.9332885742188, + "logps/rejected": -1656.512939453125, + "loss": 0.1058, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.6068434715271, + "rewards/margins": 9.725323677062988, + "rewards/margins_max": 13.76390266418457, + "rewards/margins_min": 5.68674373626709, + "rewards/margins_std": 5.711414337158203, + "rewards/rejected": -14.332165718078613, + "step": 850 + }, + { + "epoch": 0.35, + "grad_norm": 0.37109375, + "learning_rate": 4.079113599962257e-06, + "logits/chosen": 0.6045584082603455, + "logits/rejected": 1.40791916847229, + "logps/chosen": -795.3938598632812, + "logps/rejected": -1809.0947265625, + "loss": 0.0772, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.249255657196045, + "rewards/margins": 10.42540168762207, + "rewards/margins_max": 15.904159545898438, + "rewards/margins_min": 4.9466447830200195, + "rewards/margins_std": 7.748133182525635, + "rewards/rejected": -15.674657821655273, + "step": 860 + }, + { + "epoch": 0.36, + "grad_norm": 0.92578125, + "learning_rate": 4.051084716098921e-06, + "logits/chosen": 0.5180607438087463, + "logits/rejected": 1.220595121383667, + "logps/chosen": -676.9715576171875, + "logps/rejected": -1734.108642578125, + "loss": 0.1499, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.324917793273926, + "rewards/margins": 10.716940879821777, + "rewards/margins_max": 15.53416633605957, + "rewards/margins_min": 5.899716377258301, + "rewards/margins_std": 6.812585353851318, + "rewards/rejected": -15.04185962677002, + "step": 870 + }, + { + "epoch": 0.36, + "grad_norm": 4.46875, + "learning_rate": 4.022735186920008e-06, + "logits/chosen": 0.487175315618515, + "logits/rejected": 1.2153656482696533, + "logps/chosen": -689.3336791992188, + "logps/rejected": -1664.072998046875, + "loss": 0.1004, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.373027801513672, + "rewards/margins": 10.115726470947266, + "rewards/margins_max": 15.508180618286133, + "rewards/margins_min": 4.72327184677124, + "rewards/margins_std": 7.626082420349121, + "rewards/rejected": -14.488754272460938, + "step": 880 + }, + { + "epoch": 0.37, + "grad_norm": 7.4375, + "learning_rate": 3.994070872933097e-06, + "logits/chosen": 0.4529595375061035, + "logits/rejected": 1.1865074634552002, + "logps/chosen": -645.4569091796875, + "logps/rejected": -1371.4666748046875, + "loss": 0.1495, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.04251766204834, + "rewards/margins": 7.7823076248168945, + "rewards/margins_max": 11.296024322509766, + "rewards/margins_min": 4.268589973449707, + "rewards/margins_std": 4.969146251678467, + "rewards/rejected": -11.824824333190918, + "step": 890 + }, + { + "epoch": 0.37, + "grad_norm": 3.46875, + "learning_rate": 3.965097699719109e-06, + "logits/chosen": 0.5944451093673706, + "logits/rejected": 1.3090002536773682, + "logps/chosen": -762.5585327148438, + "logps/rejected": -1599.050537109375, + "loss": 0.1855, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.843676567077637, + "rewards/margins": 8.63348388671875, + "rewards/margins_max": 13.529146194458008, + "rewards/margins_min": 3.7378222942352295, + "rewards/margins_std": 6.9235124588012695, + "rewards/rejected": -13.477160453796387, + "step": 900 + }, + { + "epoch": 0.37, + "grad_norm": 2.5625, + "learning_rate": 3.935821656707359e-06, + "logits/chosen": 0.5119448304176331, + "logits/rejected": 1.1734087467193604, + "logps/chosen": -652.0521850585938, + "logps/rejected": -1535.1744384765625, + "loss": 0.1104, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.284106254577637, + "rewards/margins": 8.888346672058105, + "rewards/margins_max": 13.543110847473145, + "rewards/margins_min": 4.233582496643066, + "rewards/margins_std": 6.582831382751465, + "rewards/rejected": -13.172452926635742, + "step": 910 + }, + { + "epoch": 0.38, + "grad_norm": 0.6640625, + "learning_rate": 3.9062487959374e-06, + "logits/chosen": 0.41363000869750977, + "logits/rejected": 1.170240879058838, + "logps/chosen": -667.77783203125, + "logps/rejected": -1525.485595703125, + "loss": 0.1262, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.287820816040039, + "rewards/margins": 9.004143714904785, + "rewards/margins_max": 13.4224214553833, + "rewards/margins_min": 4.585866451263428, + "rewards/margins_std": 6.248387336730957, + "rewards/rejected": -13.291964530944824, + "step": 920 + }, + { + "epoch": 0.38, + "grad_norm": 2.75, + "learning_rate": 3.8763852308079244e-06, + "logits/chosen": 0.5807031393051147, + "logits/rejected": 1.292966365814209, + "logps/chosen": -698.1134643554688, + "logps/rejected": -1579.521240234375, + "loss": 0.1198, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.37972354888916, + "rewards/margins": 9.12094497680664, + "rewards/margins_max": 14.104803085327148, + "rewards/margins_min": 4.137085914611816, + "rewards/margins_std": 7.048240661621094, + "rewards/rejected": -13.5006685256958, + "step": 930 + }, + { + "epoch": 0.39, + "grad_norm": 0.875, + "learning_rate": 3.8462371348129805e-06, + "logits/chosen": 0.539486289024353, + "logits/rejected": 1.2316633462905884, + "logps/chosen": -694.4327392578125, + "logps/rejected": -1500.2738037109375, + "loss": 0.1419, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.638047218322754, + "rewards/margins": 8.356195449829102, + "rewards/margins_max": 12.973353385925293, + "rewards/margins_min": 3.7390365600585938, + "rewards/margins_std": 6.5296478271484375, + "rewards/rejected": -12.994241714477539, + "step": 940 + }, + { + "epoch": 0.39, + "grad_norm": 0.10693359375, + "learning_rate": 3.815810740265769e-06, + "logits/chosen": 0.5020047426223755, + "logits/rejected": 1.345840573310852, + "logps/chosen": -702.5892333984375, + "logps/rejected": -1638.1595458984375, + "loss": 0.1495, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.527231216430664, + "rewards/margins": 9.787068367004395, + "rewards/margins_max": 14.262059211730957, + "rewards/margins_min": 5.312075614929199, + "rewards/margins_std": 6.328594207763672, + "rewards/rejected": -14.314300537109375, + "step": 950 + }, + { + "epoch": 0.4, + "grad_norm": 6.03125, + "learning_rate": 3.785112337010284e-06, + "logits/chosen": 0.6428021192550659, + "logits/rejected": 1.342193365097046, + "logps/chosen": -698.9358520507812, + "logps/rejected": -1490.9117431640625, + "loss": 0.1081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.455615043640137, + "rewards/margins": 8.250611305236816, + "rewards/margins_max": 11.710147857666016, + "rewards/margins_min": 4.791074275970459, + "rewards/margins_std": 4.892523765563965, + "rewards/rejected": -12.706225395202637, + "step": 960 + }, + { + "epoch": 0.4, + "grad_norm": 0.6328125, + "learning_rate": 3.7541482711210474e-06, + "logits/chosen": 0.49654191732406616, + "logits/rejected": 1.2780824899673462, + "logps/chosen": -770.838623046875, + "logps/rejected": -1911.4193115234375, + "loss": 0.1141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.841742038726807, + "rewards/margins": 11.952981948852539, + "rewards/margins_max": 17.861114501953125, + "rewards/margins_min": 6.044848442077637, + "rewards/margins_std": 8.355361938476562, + "rewards/rejected": -16.794721603393555, + "step": 970 + }, + { + "epoch": 0.4, + "grad_norm": 1.9765625, + "learning_rate": 3.722924943591232e-06, + "logits/chosen": 0.5268442034721375, + "logits/rejected": 1.2990949153900146, + "logps/chosen": -794.0743408203125, + "logps/rejected": -1849.2060546875, + "loss": 0.0905, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.302125930786133, + "rewards/margins": 11.0300874710083, + "rewards/margins_max": 15.024978637695312, + "rewards/margins_min": 7.035195827484131, + "rewards/margins_std": 5.649630069732666, + "rewards/rejected": -16.33221435546875, + "step": 980 + }, + { + "epoch": 0.41, + "grad_norm": 1.3203125, + "learning_rate": 3.691448809009427e-06, + "logits/chosen": 0.627538800239563, + "logits/rejected": 1.3176391124725342, + "logps/chosen": -826.3603515625, + "logps/rejected": -1807.1185302734375, + "loss": 0.165, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.355879783630371, + "rewards/margins": 10.351202011108398, + "rewards/margins_max": 15.11308479309082, + "rewards/margins_min": 5.589318752288818, + "rewards/margins_std": 6.73431921005249, + "rewards/rejected": -15.70708179473877, + "step": 990 + }, + { + "epoch": 0.41, + "grad_norm": 0.84375, + "learning_rate": 3.659726374225323e-06, + "logits/chosen": 0.47057127952575684, + "logits/rejected": 1.1657397747039795, + "logps/chosen": -652.9990234375, + "logps/rejected": -1534.808349609375, + "loss": 0.0925, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.1840925216674805, + "rewards/margins": 8.879453659057617, + "rewards/margins_max": 13.41234302520752, + "rewards/margins_min": 4.346565246582031, + "rewards/margins_std": 6.410472869873047, + "rewards/rejected": -13.063547134399414, + "step": 1000 + }, + { + "epoch": 0.42, + "grad_norm": 1.1171875, + "learning_rate": 3.6277641970045975e-06, + "logits/chosen": 0.5770415663719177, + "logits/rejected": 1.3713629245758057, + "logps/chosen": -804.15576171875, + "logps/rejected": -1777.1861572265625, + "loss": 0.1385, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.3897552490234375, + "rewards/margins": 10.179000854492188, + "rewards/margins_max": 15.141960144042969, + "rewards/margins_min": 5.216042995452881, + "rewards/margins_std": 7.018682956695557, + "rewards/rejected": -15.568756103515625, + "step": 1010 + }, + { + "epoch": 0.42, + "grad_norm": 1.2265625, + "learning_rate": 3.5955688846732677e-06, + "logits/chosen": 0.5724108815193176, + "logits/rejected": 1.2440688610076904, + "logps/chosen": -775.36328125, + "logps/rejected": -2097.964599609375, + "loss": 0.1097, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.414142608642578, + "rewards/margins": 13.599000930786133, + "rewards/margins_max": 21.20298957824707, + "rewards/margins_min": 5.995011329650879, + "rewards/margins_std": 10.753664016723633, + "rewards/rejected": -19.013145446777344, + "step": 1020 + }, + { + "epoch": 0.42, + "grad_norm": 3.421875, + "learning_rate": 3.563147092751807e-06, + "logits/chosen": 0.5183674097061157, + "logits/rejected": 1.306438684463501, + "logps/chosen": -913.7030029296875, + "logps/rejected": -1972.9140625, + "loss": 0.087, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.088994026184082, + "rewards/margins": 11.24118423461914, + "rewards/margins_max": 17.46712875366211, + "rewards/margins_min": 5.0152411460876465, + "rewards/margins_std": 8.804813385009766, + "rewards/rejected": -17.330181121826172, + "step": 1030 + }, + { + "epoch": 0.43, + "grad_norm": 2.65625, + "learning_rate": 3.5305055235792906e-06, + "logits/chosen": 0.5217747688293457, + "logits/rejected": 1.2815120220184326, + "logps/chosen": -753.7869262695312, + "logps/rejected": -1882.3675537109375, + "loss": 0.1165, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.9851508140563965, + "rewards/margins": 11.520073890686035, + "rewards/margins_max": 17.029399871826172, + "rewards/margins_min": 6.010746955871582, + "rewards/margins_std": 7.791365623474121, + "rewards/rejected": -16.505224227905273, + "step": 1040 + }, + { + "epoch": 0.43, + "grad_norm": 9.625, + "learning_rate": 3.4976509249278673e-06, + "logits/chosen": 0.6170846819877625, + "logits/rejected": 1.3059895038604736, + "logps/chosen": -820.76708984375, + "logps/rejected": -1968.9228515625, + "loss": 0.1944, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.856749534606934, + "rewards/margins": 11.451845169067383, + "rewards/margins_max": 17.031482696533203, + "rewards/margins_min": 5.8722076416015625, + "rewards/margins_std": 7.890799045562744, + "rewards/rejected": -17.30859375, + "step": 1050 + }, + { + "epoch": 0.44, + "grad_norm": 10.0, + "learning_rate": 3.4645900886078388e-06, + "logits/chosen": 0.47162705659866333, + "logits/rejected": 1.2156587839126587, + "logps/chosen": -745.6646728515625, + "logps/rejected": -1683.563232421875, + "loss": 0.1421, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.883286476135254, + "rewards/margins": 9.705187797546387, + "rewards/margins_max": 14.16901683807373, + "rewards/margins_min": 5.241359710693359, + "rewards/margins_std": 6.312806129455566, + "rewards/rejected": -14.588473320007324, + "step": 1060 + }, + { + "epoch": 0.44, + "grad_norm": 1.3515625, + "learning_rate": 3.4313298490636328e-06, + "logits/chosen": 0.542891800403595, + "logits/rejected": 1.327044129371643, + "logps/chosen": -745.6140747070312, + "logps/rejected": -1832.4476318359375, + "loss": 0.1122, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.060235023498535, + "rewards/margins": 11.030832290649414, + "rewards/margins_max": 15.847732543945312, + "rewards/margins_min": 6.21393346786499, + "rewards/margins_std": 6.812124729156494, + "rewards/rejected": -16.091068267822266, + "step": 1070 + }, + { + "epoch": 0.44, + "grad_norm": 1.40625, + "learning_rate": 3.3978770819609647e-06, + "logits/chosen": 0.5193914175033569, + "logits/rejected": 1.2432626485824585, + "logps/chosen": -718.4923095703125, + "logps/rejected": -1824.8092041015625, + "loss": 0.0604, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.907055377960205, + "rewards/margins": 11.238981246948242, + "rewards/margins_max": 16.069297790527344, + "rewards/margins_min": 6.40866756439209, + "rewards/margins_std": 6.831096649169922, + "rewards/rejected": -16.146038055419922, + "step": 1080 + }, + { + "epoch": 0.45, + "grad_norm": 2.0, + "learning_rate": 3.364238702765477e-06, + "logits/chosen": 0.6283344030380249, + "logits/rejected": 1.1587202548980713, + "logps/chosen": -784.2772216796875, + "logps/rejected": -1649.103515625, + "loss": 0.093, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.440009117126465, + "rewards/margins": 8.746278762817383, + "rewards/margins_max": 12.856730461120605, + "rewards/margins_min": 4.63582706451416, + "rewards/margins_std": 5.813055992126465, + "rewards/rejected": -14.186288833618164, + "step": 1090 + }, + { + "epoch": 0.45, + "grad_norm": 0.11669921875, + "learning_rate": 3.3304216653131566e-06, + "logits/chosen": 0.4906349778175354, + "logits/rejected": 1.1233699321746826, + "logps/chosen": -726.7098388671875, + "logps/rejected": -1864.184326171875, + "loss": 0.0985, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.014734268188477, + "rewards/margins": 11.505403518676758, + "rewards/margins_max": 17.862531661987305, + "rewards/margins_min": 5.148275375366211, + "rewards/margins_std": 8.990338325500488, + "rewards/rejected": -16.520137786865234, + "step": 1100 + }, + { + "epoch": 0.46, + "grad_norm": 0.65234375, + "learning_rate": 3.2964329603728046e-06, + "logits/chosen": 0.4619167447090149, + "logits/rejected": 1.1618800163269043, + "logps/chosen": -792.4830322265625, + "logps/rejected": -1843.839111328125, + "loss": 0.1262, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.372784614562988, + "rewards/margins": 10.8291597366333, + "rewards/margins_max": 15.854537963867188, + "rewards/margins_min": 5.803778648376465, + "rewards/margins_std": 7.106959342956543, + "rewards/rejected": -16.201942443847656, + "step": 1110 + }, + { + "epoch": 0.46, + "grad_norm": 0.59765625, + "learning_rate": 3.262279614200892e-06, + "logits/chosen": 0.5689177513122559, + "logits/rejected": 1.27706778049469, + "logps/chosen": -735.7247314453125, + "logps/rejected": -1631.17578125, + "loss": 0.1125, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.984173774719238, + "rewards/margins": 9.399529457092285, + "rewards/margins_max": 13.701881408691406, + "rewards/margins_min": 5.097177505493164, + "rewards/margins_std": 6.084444046020508, + "rewards/rejected": -14.383702278137207, + "step": 1120 + }, + { + "epoch": 0.47, + "grad_norm": 0.322265625, + "learning_rate": 3.2279686870890637e-06, + "logits/chosen": 0.4834915101528168, + "logits/rejected": 1.2427217960357666, + "logps/chosen": -703.0142822265625, + "logps/rejected": -1653.974365234375, + "loss": 0.0839, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.775191307067871, + "rewards/margins": 9.554444313049316, + "rewards/margins_max": 14.255029678344727, + "rewards/margins_min": 4.853858470916748, + "rewards/margins_std": 6.6476311683654785, + "rewards/rejected": -14.329633712768555, + "step": 1130 + }, + { + "epoch": 0.47, + "grad_norm": 0.57421875, + "learning_rate": 3.193507271904612e-06, + "logits/chosen": 0.44650688767433167, + "logits/rejected": 1.2217845916748047, + "logps/chosen": -858.9959106445312, + "logps/rejected": -1833.328125, + "loss": 0.0916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.681185722351074, + "rewards/margins": 10.469725608825684, + "rewards/margins_max": 15.999313354492188, + "rewards/margins_min": 4.940136909484863, + "rewards/margins_std": 7.8200178146362305, + "rewards/rejected": -16.150911331176758, + "step": 1140 + }, + { + "epoch": 0.47, + "grad_norm": 0.3046875, + "learning_rate": 3.158902492624218e-06, + "logits/chosen": 0.4523468613624573, + "logits/rejected": 1.2057933807373047, + "logps/chosen": -841.3018798828125, + "logps/rejected": -1831.9058837890625, + "loss": 0.0927, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.5725226402282715, + "rewards/margins": 10.594744682312012, + "rewards/margins_max": 15.057345390319824, + "rewards/margins_min": 6.132142543792725, + "rewards/margins_std": 6.311070442199707, + "rewards/rejected": -16.167264938354492, + "step": 1150 + }, + { + "epoch": 0.48, + "grad_norm": 4.125, + "learning_rate": 3.1241615028612563e-06, + "logits/chosen": 0.5951441526412964, + "logits/rejected": 1.2352155447006226, + "logps/chosen": -768.4414672851562, + "logps/rejected": -1707.2896728515625, + "loss": 0.2036, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.0751752853393555, + "rewards/margins": 9.864356994628906, + "rewards/margins_max": 14.157461166381836, + "rewards/margins_min": 5.57125186920166, + "rewards/margins_std": 6.071366786956787, + "rewards/rejected": -14.939532279968262, + "step": 1160 + }, + { + "epoch": 0.48, + "grad_norm": 0.765625, + "learning_rate": 3.0892914843869838e-06, + "logits/chosen": 0.5745668411254883, + "logits/rejected": 1.3735682964324951, + "logps/chosen": -716.9601440429688, + "logps/rejected": -1639.5445556640625, + "loss": 0.0789, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.607513904571533, + "rewards/margins": 9.50660228729248, + "rewards/margins_max": 14.321581840515137, + "rewards/margins_min": 4.691622734069824, + "rewards/margins_std": 6.809409141540527, + "rewards/rejected": -14.114115715026855, + "step": 1170 + }, + { + "epoch": 0.49, + "grad_norm": 0.3125, + "learning_rate": 3.054299645645889e-06, + "logits/chosen": 0.574237048625946, + "logits/rejected": 1.1578586101531982, + "logps/chosen": -723.4119262695312, + "logps/rejected": -1720.607666015625, + "loss": 0.1266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.802746772766113, + "rewards/margins": 10.177275657653809, + "rewards/margins_max": 15.720603942871094, + "rewards/margins_min": 4.633947372436523, + "rewards/margins_std": 7.839449882507324, + "rewards/rejected": -14.980023384094238, + "step": 1180 + }, + { + "epoch": 0.49, + "grad_norm": 0.7578125, + "learning_rate": 3.01919322026555e-06, + "logits/chosen": 0.57005774974823, + "logits/rejected": 1.3801429271697998, + "logps/chosen": -777.7997436523438, + "logps/rejected": -1871.4964599609375, + "loss": 0.12, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.197485446929932, + "rewards/margins": 11.358160018920898, + "rewards/margins_max": 16.549087524414062, + "rewards/margins_min": 6.167231559753418, + "rewards/margins_std": 7.341080665588379, + "rewards/rejected": -16.555644989013672, + "step": 1190 + }, + { + "epoch": 0.49, + "grad_norm": 0.26171875, + "learning_rate": 2.9839794655612674e-06, + "logits/chosen": 0.4680374562740326, + "logits/rejected": 1.2621392011642456, + "logps/chosen": -701.8974609375, + "logps/rejected": -1736.7503662109375, + "loss": 0.1476, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.581705570220947, + "rewards/margins": 10.844728469848633, + "rewards/margins_max": 15.584383964538574, + "rewards/margins_min": 6.105072498321533, + "rewards/margins_std": 6.702885627746582, + "rewards/rejected": -15.426434516906738, + "step": 1200 + }, + { + "epoch": 0.5, + "grad_norm": 0.2890625, + "learning_rate": 2.9486656610358143e-06, + "logits/chosen": 0.48323068022727966, + "logits/rejected": 1.2080551385879517, + "logps/chosen": -702.0436401367188, + "logps/rejected": -1731.0406494140625, + "loss": 0.0973, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.697200775146484, + "rewards/margins": 10.60991096496582, + "rewards/margins_max": 16.56106948852539, + "rewards/margins_min": 4.658753871917725, + "rewards/margins_std": 8.416207313537598, + "rewards/rejected": -15.307113647460938, + "step": 1210 + }, + { + "epoch": 0.5, + "grad_norm": 0.78125, + "learning_rate": 2.9132591068745884e-06, + "logits/chosen": 0.5117800235748291, + "logits/rejected": 1.158496618270874, + "logps/chosen": -699.7086791992188, + "logps/rejected": -1694.8929443359375, + "loss": 0.118, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.589441776275635, + "rewards/margins": 9.935541152954102, + "rewards/margins_max": 14.52270793914795, + "rewards/margins_min": 5.348374366760254, + "rewards/margins_std": 6.487233638763428, + "rewards/rejected": -14.524983406066895, + "step": 1220 + }, + { + "epoch": 0.51, + "grad_norm": 1.34375, + "learning_rate": 2.8777671224364966e-06, + "logits/chosen": 0.5292683243751526, + "logits/rejected": 1.3735748529434204, + "logps/chosen": -793.8040771484375, + "logps/rejected": -2016.2437744140625, + "loss": 0.1016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.2961931228637695, + "rewards/margins": 12.560154914855957, + "rewards/margins_max": 19.488279342651367, + "rewards/margins_min": 5.632030487060547, + "rewards/margins_std": 9.797847747802734, + "rewards/rejected": -17.856348037719727, + "step": 1230 + }, + { + "epoch": 0.51, + "grad_norm": 1.1796875, + "learning_rate": 2.842197044740873e-06, + "logits/chosen": 0.5125163793563843, + "logits/rejected": 1.1910914182662964, + "logps/chosen": -716.28271484375, + "logps/rejected": -1681.4847412109375, + "loss": 0.1034, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.732181549072266, + "rewards/margins": 9.891237258911133, + "rewards/margins_max": 14.310302734375, + "rewards/margins_min": 5.472168922424316, + "rewards/margins_std": 6.249504566192627, + "rewards/rejected": -14.623417854309082, + "step": 1240 + }, + { + "epoch": 0.51, + "grad_norm": 0.408203125, + "learning_rate": 2.8065562269507464e-06, + "logits/chosen": 0.6009246110916138, + "logits/rejected": 1.1898201704025269, + "logps/chosen": -778.5814819335938, + "logps/rejected": -2116.184814453125, + "loss": 0.0969, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.313014030456543, + "rewards/margins": 13.718805313110352, + "rewards/margins_max": 19.344837188720703, + "rewards/margins_min": 8.092771530151367, + "rewards/margins_std": 7.956411838531494, + "rewards/rejected": -19.031816482543945, + "step": 1250 + }, + { + "epoch": 0.52, + "grad_norm": 1.75, + "learning_rate": 2.7708520368527687e-06, + "logits/chosen": 0.6829395294189453, + "logits/rejected": 1.4511185884475708, + "logps/chosen": -764.1370849609375, + "logps/rejected": -1753.5296630859375, + "loss": 0.0735, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.051453590393066, + "rewards/margins": 10.619558334350586, + "rewards/margins_max": 16.41791534423828, + "rewards/margins_min": 4.821201324462891, + "rewards/margins_std": 8.200114250183105, + "rewards/rejected": -15.671010971069336, + "step": 1260 + }, + { + "epoch": 0.52, + "grad_norm": 0.6875, + "learning_rate": 2.735091855334122e-06, + "logits/chosen": 0.5935325622558594, + "logits/rejected": 1.272655963897705, + "logps/chosen": -780.0508422851562, + "logps/rejected": -1765.977294921875, + "loss": 0.1392, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.5274786949157715, + "rewards/margins": 10.1564302444458, + "rewards/margins_max": 15.466066360473633, + "rewards/margins_min": 4.846795082092285, + "rewards/margins_std": 7.508957862854004, + "rewards/rejected": -15.68390941619873, + "step": 1270 + }, + { + "epoch": 0.53, + "grad_norm": 8.8125, + "learning_rate": 2.6992830748567204e-06, + "logits/chosen": 0.601089596748352, + "logits/rejected": 1.3095543384552002, + "logps/chosen": -735.0325317382812, + "logps/rejected": -1644.493408203125, + "loss": 0.1702, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.930995464324951, + "rewards/margins": 9.481109619140625, + "rewards/margins_max": 13.667490005493164, + "rewards/margins_min": 5.294730186462402, + "rewards/margins_std": 5.920435428619385, + "rewards/rejected": -14.412104606628418, + "step": 1280 + }, + { + "epoch": 0.53, + "grad_norm": 1.078125, + "learning_rate": 2.6634330979290133e-06, + "logits/chosen": 0.5804930925369263, + "logits/rejected": 1.1953046321868896, + "logps/chosen": -664.8348388671875, + "logps/rejected": -1494.647216796875, + "loss": 0.0935, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.399333477020264, + "rewards/margins": 8.430289268493652, + "rewards/margins_max": 12.645450592041016, + "rewards/margins_min": 4.215127944946289, + "rewards/margins_std": 5.9611382484436035, + "rewards/rejected": -12.829623222351074, + "step": 1290 + }, + { + "epoch": 0.54, + "grad_norm": 2.5625, + "learning_rate": 2.6275493355757166e-06, + "logits/chosen": 0.5969884395599365, + "logits/rejected": 1.2400842905044556, + "logps/chosen": -675.366943359375, + "logps/rejected": -1664.972412109375, + "loss": 0.1129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.368256568908691, + "rewards/margins": 10.127969741821289, + "rewards/margins_max": 15.879676818847656, + "rewards/margins_min": 4.376260757446289, + "rewards/margins_std": 8.134143829345703, + "rewards/rejected": -14.496225357055664, + "step": 1300 + }, + { + "epoch": 0.54, + "grad_norm": 1.234375, + "learning_rate": 2.5916392058057754e-06, + "logits/chosen": 0.6539616584777832, + "logits/rejected": 1.2341909408569336, + "logps/chosen": -646.232421875, + "logps/rejected": -1600.320556640625, + "loss": 0.08, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.306728363037109, + "rewards/margins": 9.608429908752441, + "rewards/margins_max": 13.071403503417969, + "rewards/margins_min": 6.145455837249756, + "rewards/margins_std": 4.897385597229004, + "rewards/rejected": -13.91515827178955, + "step": 1310 + }, + { + "epoch": 0.54, + "grad_norm": 0.443359375, + "learning_rate": 2.5557101320789005e-06, + "logits/chosen": 0.43818527460098267, + "logits/rejected": 1.1889019012451172, + "logps/chosen": -740.1410522460938, + "logps/rejected": -1722.2867431640625, + "loss": 0.0504, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.829034805297852, + "rewards/margins": 10.092082023620605, + "rewards/margins_max": 14.363845825195312, + "rewards/margins_min": 5.820317268371582, + "rewards/margins_std": 6.041186332702637, + "rewards/rejected": -14.921116828918457, + "step": 1320 + }, + { + "epoch": 0.55, + "grad_norm": 0.8515625, + "learning_rate": 2.519769541770954e-06, + "logits/chosen": 0.6074897646903992, + "logits/rejected": 1.3447411060333252, + "logps/chosen": -747.0977783203125, + "logps/rejected": -1610.9615478515625, + "loss": 0.1078, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.645691871643066, + "rewards/margins": 9.302739143371582, + "rewards/margins_max": 13.083358764648438, + "rewards/margins_min": 5.522116661071777, + "rewards/margins_std": 5.34660530090332, + "rewards/rejected": -13.948430061340332, + "step": 1330 + }, + { + "epoch": 0.55, + "grad_norm": 2.4375, + "learning_rate": 2.4838248646385458e-06, + "logits/chosen": 0.4675142765045166, + "logits/rejected": 1.2192738056182861, + "logps/chosen": -713.5174560546875, + "logps/rejected": -1698.417724609375, + "loss": 0.1027, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.643341064453125, + "rewards/margins": 10.201144218444824, + "rewards/margins_max": 15.106475830078125, + "rewards/margins_min": 5.29581356048584, + "rewards/margins_std": 6.937185764312744, + "rewards/rejected": -14.844487190246582, + "step": 1340 + }, + { + "epoch": 0.56, + "grad_norm": 4.5, + "learning_rate": 2.447883531283127e-06, + "logits/chosen": 0.480851411819458, + "logits/rejected": 1.3301368951797485, + "logps/chosen": -769.1808471679688, + "logps/rejected": -1694.986328125, + "loss": 0.1297, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.698918342590332, + "rewards/margins": 10.097999572753906, + "rewards/margins_max": 13.938295364379883, + "rewards/margins_min": 6.257704257965088, + "rewards/margins_std": 5.4309983253479, + "rewards/rejected": -14.796917915344238, + "step": 1350 + }, + { + "epoch": 0.56, + "grad_norm": 0.7734375, + "learning_rate": 2.4119529716149126e-06, + "logits/chosen": 0.5563157796859741, + "logits/rejected": 1.2523400783538818, + "logps/chosen": -786.4019775390625, + "logps/rejected": -1475.5535888671875, + "loss": 0.1001, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.191001892089844, + "rewards/margins": 7.456092834472656, + "rewards/margins_max": 10.319292068481445, + "rewards/margins_min": 4.592894077301025, + "rewards/margins_std": 4.0491743087768555, + "rewards/rejected": -12.647093772888184, + "step": 1360 + }, + { + "epoch": 0.56, + "grad_norm": 4.15625, + "learning_rate": 2.376040613316944e-06, + "logits/chosen": 0.46584218740463257, + "logits/rejected": 1.1385295391082764, + "logps/chosen": -699.6835327148438, + "logps/rejected": -1963.931884765625, + "loss": 0.0969, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.705713272094727, + "rewards/margins": 12.683069229125977, + "rewards/margins_max": 18.711505889892578, + "rewards/margins_min": 6.654633522033691, + "rewards/margins_std": 8.525496482849121, + "rewards/rejected": -17.388782501220703, + "step": 1370 + }, + { + "epoch": 0.57, + "grad_norm": 0.62109375, + "learning_rate": 2.340153880309619e-06, + "logits/chosen": 0.6857975721359253, + "logits/rejected": 1.3219093084335327, + "logps/chosen": -779.4803466796875, + "logps/rejected": -1705.5458984375, + "loss": 0.0992, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.236274242401123, + "rewards/margins": 9.680828094482422, + "rewards/margins_max": 14.501251220703125, + "rewards/margins_min": 4.860402584075928, + "rewards/margins_std": 6.81710958480835, + "rewards/rejected": -14.917101860046387, + "step": 1380 + }, + { + "epoch": 0.57, + "grad_norm": 1.75, + "learning_rate": 2.3043001912159892e-06, + "logits/chosen": 0.5691137313842773, + "logits/rejected": 1.298168659210205, + "logps/chosen": -779.42431640625, + "logps/rejected": -1864.0394287109375, + "loss": 0.0727, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.378870010375977, + "rewards/margins": 11.22581672668457, + "rewards/margins_max": 16.610761642456055, + "rewards/margins_min": 5.840869903564453, + "rewards/margins_std": 7.615464687347412, + "rewards/rejected": -16.604686737060547, + "step": 1390 + }, + { + "epoch": 0.58, + "grad_norm": 16.25, + "learning_rate": 2.268486957828159e-06, + "logits/chosen": 0.6387670636177063, + "logits/rejected": 1.1569719314575195, + "logps/chosen": -729.2189331054688, + "logps/rejected": -1777.036376953125, + "loss": 0.213, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.217606544494629, + "rewards/margins": 10.492830276489258, + "rewards/margins_max": 15.320582389831543, + "rewards/margins_min": 5.665076732635498, + "rewards/margins_std": 6.827474117279053, + "rewards/rejected": -15.71043586730957, + "step": 1400 + }, + { + "epoch": 0.58, + "grad_norm": 1.671875, + "learning_rate": 2.232721583575099e-06, + "logits/chosen": 0.4919258654117584, + "logits/rejected": 1.2310242652893066, + "logps/chosen": -778.7653198242188, + "logps/rejected": -1702.4404296875, + "loss": 0.1083, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.0904669761657715, + "rewards/margins": 9.812942504882812, + "rewards/margins_max": 14.663922309875488, + "rewards/margins_min": 4.961963176727295, + "rewards/margins_std": 6.860320091247559, + "rewards/rejected": -14.903407096862793, + "step": 1410 + }, + { + "epoch": 0.58, + "grad_norm": 3.640625, + "learning_rate": 2.1970114619921804e-06, + "logits/chosen": 0.5403339862823486, + "logits/rejected": 1.279847264289856, + "logps/chosen": -782.6492309570312, + "logps/rejected": -1954.859130859375, + "loss": 0.0877, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.303040504455566, + "rewards/margins": 11.789950370788574, + "rewards/margins_max": 17.79424476623535, + "rewards/margins_min": 5.785655975341797, + "rewards/margins_std": 8.491353988647461, + "rewards/rejected": -17.09299087524414, + "step": 1420 + }, + { + "epoch": 0.59, + "grad_norm": 1.0859375, + "learning_rate": 2.1613639751927636e-06, + "logits/chosen": 0.5678201913833618, + "logits/rejected": 1.2467429637908936, + "logps/chosen": -794.25341796875, + "logps/rejected": -1776.8765869140625, + "loss": 0.1397, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.573482513427734, + "rewards/margins": 10.259510040283203, + "rewards/margins_max": 15.224624633789062, + "rewards/margins_min": 5.294394493103027, + "rewards/margins_std": 7.021732330322266, + "rewards/rejected": -15.832992553710938, + "step": 1430 + }, + { + "epoch": 0.59, + "grad_norm": 0.3984375, + "learning_rate": 2.1257864923421405e-06, + "logits/chosen": 0.5252267122268677, + "logits/rejected": 1.167436957359314, + "logps/chosen": -733.9835205078125, + "logps/rejected": -1947.4000244140625, + "loss": 0.1001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.864560127258301, + "rewards/margins": 12.287667274475098, + "rewards/margins_max": 17.395977020263672, + "rewards/margins_min": 7.179357051849365, + "rewards/margins_std": 7.224241733551025, + "rewards/rejected": -17.152225494384766, + "step": 1440 + }, + { + "epoch": 0.6, + "grad_norm": 2.078125, + "learning_rate": 2.0902863681341546e-06, + "logits/chosen": 0.592448353767395, + "logits/rejected": 1.254591703414917, + "logps/chosen": -762.2680053710938, + "logps/rejected": -1617.958740234375, + "loss": 0.1161, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.002324104309082, + "rewards/margins": 8.928361892700195, + "rewards/margins_max": 13.639185905456543, + "rewards/margins_min": 4.2175397872924805, + "rewards/margins_std": 6.662109375, + "rewards/rejected": -13.930686950683594, + "step": 1450 + }, + { + "epoch": 0.6, + "grad_norm": 1.203125, + "learning_rate": 2.0548709412708235e-06, + "logits/chosen": 0.46100831031799316, + "logits/rejected": 1.1205612421035767, + "logps/chosen": -758.50244140625, + "logps/rejected": -1687.9713134765625, + "loss": 0.1154, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.229290962219238, + "rewards/margins": 9.612086296081543, + "rewards/margins_max": 13.77092456817627, + "rewards/margins_min": 5.453249931335449, + "rewards/margins_std": 5.881483554840088, + "rewards/rejected": -14.841377258300781, + "step": 1460 + }, + { + "epoch": 0.61, + "grad_norm": 0.828125, + "learning_rate": 2.019547532945246e-06, + "logits/chosen": 0.5935944318771362, + "logits/rejected": 1.189452886581421, + "logps/chosen": -698.9295654296875, + "logps/rejected": -1723.960205078125, + "loss": 0.0487, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.868830680847168, + "rewards/margins": 10.070269584655762, + "rewards/margins_max": 15.034326553344727, + "rewards/margins_min": 5.1062116622924805, + "rewards/margins_std": 7.020236015319824, + "rewards/rejected": -14.939099311828613, + "step": 1470 + }, + { + "epoch": 0.61, + "grad_norm": 2.25, + "learning_rate": 1.9843234453281503e-06, + "logits/chosen": 0.5408506989479065, + "logits/rejected": 1.2704055309295654, + "logps/chosen": -782.6818237304688, + "logps/rejected": -1806.959228515625, + "loss": 0.0791, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.255041122436523, + "rewards/margins": 10.520251274108887, + "rewards/margins_max": 15.160499572753906, + "rewards/margins_min": 5.880003929138184, + "rewards/margins_std": 6.562302589416504, + "rewards/rejected": -15.775293350219727, + "step": 1480 + }, + { + "epoch": 0.61, + "grad_norm": 3.515625, + "learning_rate": 1.949205960058361e-06, + "logits/chosen": 0.4531838297843933, + "logits/rejected": 1.268123745918274, + "logps/chosen": -836.1715698242188, + "logps/rejected": -1688.537353515625, + "loss": 0.1847, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.429810523986816, + "rewards/margins": 9.195915222167969, + "rewards/margins_max": 13.804384231567383, + "rewards/margins_min": 4.587449073791504, + "rewards/margins_std": 6.517356872558594, + "rewards/rejected": -14.625727653503418, + "step": 1490 + }, + { + "epoch": 0.62, + "grad_norm": 0.96875, + "learning_rate": 1.914202336737517e-06, + "logits/chosen": 0.4794815182685852, + "logits/rejected": 1.1748192310333252, + "logps/chosen": -741.9356079101562, + "logps/rejected": -1867.9176025390625, + "loss": 0.0846, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.017170429229736, + "rewards/margins": 11.546915054321289, + "rewards/margins_max": 17.515758514404297, + "rewards/margins_min": 5.5780720710754395, + "rewards/margins_std": 8.441219329833984, + "rewards/rejected": -16.564085006713867, + "step": 1500 + }, + { + "epoch": 0.62, + "grad_norm": 0.10595703125, + "learning_rate": 1.8793198114293419e-06, + "logits/chosen": 0.5758123993873596, + "logits/rejected": 1.2776639461517334, + "logps/chosen": -671.192138671875, + "logps/rejected": -1982.5927734375, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.604527473449707, + "rewards/margins": 13.000114440917969, + "rewards/margins_max": 19.173341751098633, + "rewards/margins_min": 6.826885223388672, + "rewards/margins_std": 8.730262756347656, + "rewards/rejected": -17.604642868041992, + "step": 1510 + }, + { + "epoch": 0.63, + "grad_norm": 0.87890625, + "learning_rate": 1.8445655951637797e-06, + "logits/chosen": 0.5493451952934265, + "logits/rejected": 1.379970908164978, + "logps/chosen": -748.9559326171875, + "logps/rejected": -1775.750244140625, + "loss": 0.0733, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.9326372146606445, + "rewards/margins": 10.712015151977539, + "rewards/margins_max": 15.60670280456543, + "rewards/margins_min": 5.817324638366699, + "rewards/margins_std": 6.922135829925537, + "rewards/rejected": -15.64465045928955, + "step": 1520 + }, + { + "epoch": 0.63, + "grad_norm": 2.5625, + "learning_rate": 1.809946872446312e-06, + "logits/chosen": 0.5186041593551636, + "logits/rejected": 1.1791023015975952, + "logps/chosen": -725.4397583007812, + "logps/rejected": -1589.250732421875, + "loss": 0.0827, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.731976509094238, + "rewards/margins": 9.246491432189941, + "rewards/margins_max": 14.182947158813477, + "rewards/margins_min": 4.310037136077881, + "rewards/margins_std": 6.981202125549316, + "rewards/rejected": -13.978469848632812, + "step": 1530 + }, + { + "epoch": 0.63, + "grad_norm": 1.171875, + "learning_rate": 1.7754707997727471e-06, + "logits/chosen": 0.6401320695877075, + "logits/rejected": 1.1825447082519531, + "logps/chosen": -791.119140625, + "logps/rejected": -1879.62109375, + "loss": 0.0851, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.561471939086914, + "rewards/margins": 10.975897789001465, + "rewards/margins_max": 15.100332260131836, + "rewards/margins_min": 6.85146427154541, + "rewards/margins_std": 5.832830429077148, + "rewards/rejected": -16.537368774414062, + "step": 1540 + }, + { + "epoch": 0.64, + "grad_norm": 6.0625, + "learning_rate": 1.7411445041498099e-06, + "logits/chosen": 0.5857383012771606, + "logits/rejected": 1.3303660154342651, + "logps/chosen": -796.5535888671875, + "logps/rejected": -2160.908203125, + "loss": 0.1357, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.3245439529418945, + "rewards/margins": 13.754674911499023, + "rewards/margins_max": 19.97846221923828, + "rewards/margins_min": 7.530887603759766, + "rewards/margins_std": 8.801763534545898, + "rewards/rejected": -19.079219818115234, + "step": 1550 + }, + { + "epoch": 0.64, + "grad_norm": 1.0703125, + "learning_rate": 1.7069750816218218e-06, + "logits/chosen": 0.5591040849685669, + "logits/rejected": 1.376008152961731, + "logps/chosen": -757.9560546875, + "logps/rejected": -1931.6070556640625, + "loss": 0.0526, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.156350135803223, + "rewards/margins": 12.076948165893555, + "rewards/margins_max": 17.336977005004883, + "rewards/margins_min": 6.816922664642334, + "rewards/margins_std": 7.438802242279053, + "rewards/rejected": -17.233299255371094, + "step": 1560 + }, + { + "epoch": 0.65, + "grad_norm": 3.390625, + "learning_rate": 1.6729695958037856e-06, + "logits/chosen": 0.5422581434249878, + "logits/rejected": 1.107097864151001, + "logps/chosen": -806.7074584960938, + "logps/rejected": -1815.927734375, + "loss": 0.0766, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.696779727935791, + "rewards/margins": 10.256368637084961, + "rewards/margins_max": 15.249975204467773, + "rewards/margins_min": 5.262759685516357, + "rewards/margins_std": 7.062028408050537, + "rewards/rejected": -15.953149795532227, + "step": 1570 + }, + { + "epoch": 0.65, + "grad_norm": 4.21875, + "learning_rate": 1.6391350764211675e-06, + "logits/chosen": 0.47015446424484253, + "logits/rejected": 1.3002904653549194, + "logps/chosen": -784.7755737304688, + "logps/rejected": -1845.008544921875, + "loss": 0.0581, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.293187618255615, + "rewards/margins": 11.025456428527832, + "rewards/margins_max": 15.80299186706543, + "rewards/margins_min": 6.247918605804443, + "rewards/margins_std": 6.756457328796387, + "rewards/rejected": -16.31864356994629, + "step": 1580 + }, + { + "epoch": 0.65, + "grad_norm": 5.65625, + "learning_rate": 1.6054785178566944e-06, + "logits/chosen": 0.39869189262390137, + "logits/rejected": 1.1358020305633545, + "logps/chosen": -790.1834106445312, + "logps/rejected": -1959.004638671875, + "loss": 0.1276, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.488138198852539, + "rewards/margins": 12.004860877990723, + "rewards/margins_max": 17.329792022705078, + "rewards/margins_min": 6.679928779602051, + "rewards/margins_std": 7.530592441558838, + "rewards/rejected": -17.493000030517578, + "step": 1590 + }, + { + "epoch": 0.66, + "grad_norm": 0.6328125, + "learning_rate": 1.5720068777044479e-06, + "logits/chosen": 0.5967472195625305, + "logits/rejected": 1.3974864482879639, + "logps/chosen": -806.0808715820312, + "logps/rejected": -1895.9869384765625, + "loss": 0.092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.477103233337402, + "rewards/margins": 11.376334190368652, + "rewards/margins_max": 16.01239776611328, + "rewards/margins_min": 6.74027156829834, + "rewards/margins_std": 6.5563836097717285, + "rewards/rejected": -16.853437423706055, + "step": 1600 + }, + { + "epoch": 0.66, + "grad_norm": 3.671875, + "learning_rate": 1.5387270753315726e-06, + "logits/chosen": 0.5518096089363098, + "logits/rejected": 1.32808256149292, + "logps/chosen": -816.740234375, + "logps/rejected": -2068.84326171875, + "loss": 0.1744, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.405552387237549, + "rewards/margins": 12.889608383178711, + "rewards/margins_max": 20.060169219970703, + "rewards/margins_min": 5.719046592712402, + "rewards/margins_std": 10.140707015991211, + "rewards/rejected": -18.295162200927734, + "step": 1610 + }, + { + "epoch": 0.67, + "grad_norm": 0.7421875, + "learning_rate": 1.5056459904478738e-06, + "logits/chosen": 0.5233970880508423, + "logits/rejected": 1.1991077661514282, + "logps/chosen": -799.1451416015625, + "logps/rejected": -1898.0875244140625, + "loss": 0.1284, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.3765363693237305, + "rewards/margins": 11.210358619689941, + "rewards/margins_max": 15.882084846496582, + "rewards/margins_min": 6.538631439208984, + "rewards/margins_std": 6.606819152832031, + "rewards/rejected": -16.586894989013672, + "step": 1620 + }, + { + "epoch": 0.67, + "grad_norm": 7.25, + "learning_rate": 1.4727704616836297e-06, + "logits/chosen": 0.4744800925254822, + "logits/rejected": 1.247642993927002, + "logps/chosen": -778.9432373046875, + "logps/rejected": -1884.1217041015625, + "loss": 0.0875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.104981422424316, + "rewards/margins": 11.539787292480469, + "rewards/margins_max": 16.65086555480957, + "rewards/margins_min": 6.428709506988525, + "rewards/margins_std": 7.228156089782715, + "rewards/rejected": -16.6447696685791, + "step": 1630 + }, + { + "epoch": 0.68, + "grad_norm": 2.90625, + "learning_rate": 1.4401072851758835e-06, + "logits/chosen": 0.5687705278396606, + "logits/rejected": 1.1934126615524292, + "logps/chosen": -706.02294921875, + "logps/rejected": -1641.071044921875, + "loss": 0.0828, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.839352607727051, + "rewards/margins": 9.687154769897461, + "rewards/margins_max": 12.636642456054688, + "rewards/margins_min": 6.737668514251709, + "rewards/margins_std": 4.17120361328125, + "rewards/rejected": -14.526507377624512, + "step": 1640 + }, + { + "epoch": 0.68, + "grad_norm": 9.0625, + "learning_rate": 1.4076632131635226e-06, + "logits/chosen": 0.46886777877807617, + "logits/rejected": 1.1962741613388062, + "logps/chosen": -732.7435302734375, + "logps/rejected": -1612.6419677734375, + "loss": 0.1473, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.984394550323486, + "rewards/margins": 9.234537124633789, + "rewards/margins_max": 13.321691513061523, + "rewards/margins_min": 5.147382736206055, + "rewards/margins_std": 5.780109882354736, + "rewards/rejected": -14.218931198120117, + "step": 1650 + }, + { + "epoch": 0.68, + "grad_norm": 1.1328125, + "learning_rate": 1.3754449525914359e-06, + "logits/chosen": 0.5064732432365417, + "logits/rejected": 1.1770398616790771, + "logps/chosen": -800.9207153320312, + "logps/rejected": -1703.955810546875, + "loss": 0.0867, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.16342306137085, + "rewards/margins": 9.53877067565918, + "rewards/margins_max": 14.161503791809082, + "rewards/margins_min": 4.916037559509277, + "rewards/margins_std": 6.537531852722168, + "rewards/rejected": -14.702194213867188, + "step": 1660 + }, + { + "epoch": 0.69, + "grad_norm": 11.9375, + "learning_rate": 1.343459163724032e-06, + "logits/chosen": 0.6023787260055542, + "logits/rejected": 1.207897424697876, + "logps/chosen": -743.8614501953125, + "logps/rejected": -1714.038330078125, + "loss": 0.1015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.851070404052734, + "rewards/margins": 10.04643440246582, + "rewards/margins_max": 14.574777603149414, + "rewards/margins_min": 5.518091678619385, + "rewards/margins_std": 6.404044151306152, + "rewards/rejected": -14.897504806518555, + "step": 1670 + }, + { + "epoch": 0.69, + "grad_norm": 3.3125, + "learning_rate": 1.311712458768406e-06, + "logits/chosen": 0.6761046648025513, + "logits/rejected": 1.2278960943222046, + "logps/chosen": -726.6144409179688, + "logps/rejected": -1568.93359375, + "loss": 0.1191, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.09859561920166, + "rewards/margins": 8.505921363830566, + "rewards/margins_max": 12.416373252868652, + "rewards/margins_min": 4.595466613769531, + "rewards/margins_std": 5.530216217041016, + "rewards/rejected": -13.604515075683594, + "step": 1680 + }, + { + "epoch": 0.7, + "grad_norm": 0.41015625, + "learning_rate": 1.280211400507444e-06, + "logits/chosen": 0.6303955316543579, + "logits/rejected": 1.32115638256073, + "logps/chosen": -698.5577392578125, + "logps/rejected": -1850.8424072265625, + "loss": 0.0572, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.735129356384277, + "rewards/margins": 11.382537841796875, + "rewards/margins_max": 16.18451499938965, + "rewards/margins_min": 6.580558776855469, + "rewards/margins_std": 6.791023254394531, + "rewards/rejected": -16.117666244506836, + "step": 1690 + }, + { + "epoch": 0.7, + "grad_norm": 1.65625, + "learning_rate": 1.2489625009431409e-06, + "logits/chosen": 0.5856636762619019, + "logits/rejected": 1.2052780389785767, + "logps/chosen": -733.9873046875, + "logps/rejected": -1687.567138671875, + "loss": 0.1764, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.016100883483887, + "rewards/margins": 9.677934646606445, + "rewards/margins_max": 15.035099983215332, + "rewards/margins_min": 4.320771217346191, + "rewards/margins_std": 7.576174259185791, + "rewards/rejected": -14.694036483764648, + "step": 1700 + }, + { + "epoch": 0.7, + "grad_norm": 1.6640625, + "learning_rate": 1.2179722199504213e-06, + "logits/chosen": 0.5713605284690857, + "logits/rejected": 1.207334280014038, + "logps/chosen": -728.2240600585938, + "logps/rejected": -1619.398681640625, + "loss": 0.1203, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.969111919403076, + "rewards/margins": 9.243115425109863, + "rewards/margins_max": 13.461108207702637, + "rewards/margins_min": 5.025121212005615, + "rewards/margins_std": 5.96514368057251, + "rewards/rejected": -14.212226867675781, + "step": 1710 + }, + { + "epoch": 0.71, + "grad_norm": 3.328125, + "learning_rate": 1.187246963941731e-06, + "logits/chosen": 0.5765690803527832, + "logits/rejected": 1.1067253351211548, + "logps/chosen": -698.3675537109375, + "logps/rejected": -1611.295654296875, + "loss": 0.1114, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.707912445068359, + "rewards/margins": 9.343481063842773, + "rewards/margins_max": 13.480981826782227, + "rewards/margins_min": 5.205979347229004, + "rewards/margins_std": 5.851310729980469, + "rewards/rejected": -14.05139446258545, + "step": 1720 + }, + { + "epoch": 0.71, + "grad_norm": 1.2265625, + "learning_rate": 1.1567930845426802e-06, + "logits/chosen": 0.41190090775489807, + "logits/rejected": 1.0678179264068604, + "logps/chosen": -716.1203002929688, + "logps/rejected": -1831.5726318359375, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.89259147644043, + "rewards/margins": 11.263982772827148, + "rewards/margins_max": 16.847904205322266, + "rewards/margins_min": 5.680062294006348, + "rewards/margins_std": 7.89685583114624, + "rewards/rejected": -16.156574249267578, + "step": 1730 + }, + { + "epoch": 0.72, + "grad_norm": 1.640625, + "learning_rate": 1.1266168772790195e-06, + "logits/chosen": 0.3195948004722595, + "logits/rejected": 1.1387958526611328, + "logps/chosen": -776.84228515625, + "logps/rejected": -1585.8587646484375, + "loss": 0.1529, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.140332221984863, + "rewards/margins": 8.726045608520508, + "rewards/margins_max": 12.733012199401855, + "rewards/margins_min": 4.719078063964844, + "rewards/margins_std": 5.666707515716553, + "rewards/rejected": -13.866376876831055, + "step": 1740 + }, + { + "epoch": 0.72, + "grad_norm": 0.3984375, + "learning_rate": 1.0967245802752044e-06, + "logits/chosen": 0.5815094113349915, + "logits/rejected": 1.331162691116333, + "logps/chosen": -753.7833862304688, + "logps/rejected": -1863.087646484375, + "loss": 0.0754, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.917272090911865, + "rewards/margins": 11.66606330871582, + "rewards/margins_max": 16.2323055267334, + "rewards/margins_min": 7.099822044372559, + "rewards/margins_std": 6.457640171051025, + "rewards/rejected": -16.583335876464844, + "step": 1750 + }, + { + "epoch": 0.72, + "grad_norm": 1.9921875, + "learning_rate": 1.0671223729648338e-06, + "logits/chosen": 0.5788689851760864, + "logits/rejected": 1.1679919958114624, + "logps/chosen": -738.84423828125, + "logps/rejected": -1693.3870849609375, + "loss": 0.1364, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.073354244232178, + "rewards/margins": 9.768596649169922, + "rewards/margins_max": 14.380800247192383, + "rewards/margins_min": 5.156393051147461, + "rewards/margins_std": 6.522641658782959, + "rewards/rejected": -14.841951370239258, + "step": 1760 + }, + { + "epoch": 0.73, + "grad_norm": 1.7421875, + "learning_rate": 1.0378163748132102e-06, + "logits/chosen": 0.49502748250961304, + "logits/rejected": 1.2685495615005493, + "logps/chosen": -712.3984375, + "logps/rejected": -1658.44921875, + "loss": 0.0608, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.415982246398926, + "rewards/margins": 9.93490219116211, + "rewards/margins_max": 14.178258895874023, + "rewards/margins_min": 5.691543102264404, + "rewards/margins_std": 6.001015663146973, + "rewards/rejected": -14.350883483886719, + "step": 1770 + }, + { + "epoch": 0.73, + "grad_norm": 6.125, + "learning_rate": 1.008812644052311e-06, + "logits/chosen": 0.4484991431236267, + "logits/rejected": 1.1256628036499023, + "logps/chosen": -690.5672607421875, + "logps/rejected": -1668.564697265625, + "loss": 0.0796, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.382365703582764, + "rewards/margins": 10.01590347290039, + "rewards/margins_max": 13.478933334350586, + "rewards/margins_min": 6.552873134613037, + "rewards/margins_std": 4.897465229034424, + "rewards/rejected": -14.398269653320312, + "step": 1780 + }, + { + "epoch": 0.74, + "grad_norm": 0.796875, + "learning_rate": 9.801171764284072e-07, + "logits/chosen": 0.5813416838645935, + "logits/rejected": 1.228780746459961, + "logps/chosen": -712.302734375, + "logps/rejected": -1828.1448974609375, + "loss": 0.0642, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.676440238952637, + "rewards/margins": 11.309109687805176, + "rewards/margins_max": 16.14137840270996, + "rewards/margins_min": 6.4768385887146, + "rewards/margins_std": 6.833861351013184, + "rewards/rejected": -15.985549926757812, + "step": 1790 + }, + { + "epoch": 0.74, + "grad_norm": 1.4453125, + "learning_rate": 9.517359039626043e-07, + "logits/chosen": 0.5194617509841919, + "logits/rejected": 1.1831514835357666, + "logps/chosen": -732.8680419921875, + "logps/rejected": -1726.790283203125, + "loss": 0.0712, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.601668357849121, + "rewards/margins": 10.169393539428711, + "rewards/margins_max": 15.003515243530273, + "rewards/margins_min": 5.335273742675781, + "rewards/margins_std": 6.836478233337402, + "rewards/rejected": -14.771062850952148, + "step": 1800 + }, + { + "epoch": 0.75, + "grad_norm": 0.83203125, + "learning_rate": 9.23674693724555e-07, + "logits/chosen": 0.2990169823169708, + "logits/rejected": 0.9671838879585266, + "logps/chosen": -760.0450439453125, + "logps/rejected": -1871.8310546875, + "loss": 0.0583, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.976813316345215, + "rewards/margins": 11.278745651245117, + "rewards/margins_max": 14.921788215637207, + "rewards/margins_min": 7.63570499420166, + "rewards/margins_std": 5.152037620544434, + "rewards/rejected": -16.255558013916016, + "step": 1810 + }, + { + "epoch": 0.75, + "grad_norm": 1.6328125, + "learning_rate": 8.959393466195973e-07, + "logits/chosen": 0.41968780755996704, + "logits/rejected": 1.290880799293518, + "logps/chosen": -761.1870727539062, + "logps/rejected": -1643.1441650390625, + "loss": 0.0557, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.9296698570251465, + "rewards/margins": 9.346491813659668, + "rewards/margins_max": 12.647331237792969, + "rewards/margins_min": 6.045652866363525, + "rewards/margins_std": 4.6680908203125, + "rewards/rejected": -14.276163101196289, + "step": 1820 + }, + { + "epoch": 0.75, + "grad_norm": 0.86328125, + "learning_rate": 8.685355961895783e-07, + "logits/chosen": 0.687114417552948, + "logits/rejected": 1.4132459163665771, + "logps/chosen": -767.54248046875, + "logps/rejected": -1862.729248046875, + "loss": 0.0503, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.025767803192139, + "rewards/margins": 11.429033279418945, + "rewards/margins_max": 17.451461791992188, + "rewards/margins_min": 5.406604290008545, + "rewards/margins_std": 8.517000198364258, + "rewards/rejected": -16.45479965209961, + "step": 1830 + }, + { + "epoch": 0.76, + "grad_norm": 0.73046875, + "learning_rate": 8.414691074275916e-07, + "logits/chosen": 0.4633597433567047, + "logits/rejected": 1.248290777206421, + "logps/chosen": -777.6952514648438, + "logps/rejected": -1863.720458984375, + "loss": 0.0822, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.001014232635498, + "rewards/margins": 11.318872451782227, + "rewards/margins_max": 15.89136028289795, + "rewards/margins_min": 6.7463860511779785, + "rewards/margins_std": 6.466473579406738, + "rewards/rejected": -16.319889068603516, + "step": 1840 + }, + { + "epoch": 0.76, + "grad_norm": 0.9453125, + "learning_rate": 8.147454756068937e-07, + "logits/chosen": 0.5497418642044067, + "logits/rejected": 1.2043471336364746, + "logps/chosen": -709.6234130859375, + "logps/rejected": -1719.3043212890625, + "loss": 0.0768, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.652140140533447, + "rewards/margins": 10.480083465576172, + "rewards/margins_max": 15.241083145141602, + "rewards/margins_min": 5.719081401824951, + "rewards/margins_std": 6.733071804046631, + "rewards/rejected": -15.132222175598145, + "step": 1850 + }, + { + "epoch": 0.77, + "grad_norm": 1.1640625, + "learning_rate": 7.883702251242298e-07, + "logits/chosen": 0.45454102754592896, + "logits/rejected": 1.1140748262405396, + "logps/chosen": -678.3165283203125, + "logps/rejected": -1609.6807861328125, + "loss": 0.1038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.47580623626709, + "rewards/margins": 9.60850715637207, + "rewards/margins_max": 13.529436111450195, + "rewards/margins_min": 5.6875810623168945, + "rewards/margins_std": 5.545028209686279, + "rewards/rejected": -14.084314346313477, + "step": 1860 + }, + { + "epoch": 0.77, + "grad_norm": 0.34765625, + "learning_rate": 7.623488083578148e-07, + "logits/chosen": 0.48715901374816895, + "logits/rejected": 1.142924189567566, + "logps/chosen": -676.9874267578125, + "logps/rejected": -1663.1302490234375, + "loss": 0.1045, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.443617820739746, + "rewards/margins": 10.04162311553955, + "rewards/margins_max": 15.552085876464844, + "rewards/margins_min": 4.531158447265625, + "rewards/margins_std": 7.792973518371582, + "rewards/rejected": -14.485241889953613, + "step": 1870 + }, + { + "epoch": 0.77, + "grad_norm": 1.03125, + "learning_rate": 7.366866045401968e-07, + "logits/chosen": 0.5052765607833862, + "logits/rejected": 1.288438081741333, + "logps/chosen": -724.85302734375, + "logps/rejected": -1638.660400390625, + "loss": 0.085, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.777608394622803, + "rewards/margins": 9.423945426940918, + "rewards/margins_max": 13.84093952178955, + "rewards/margins_min": 5.006953239440918, + "rewards/margins_std": 6.246571063995361, + "rewards/rejected": -14.201555252075195, + "step": 1880 + }, + { + "epoch": 0.78, + "grad_norm": 3.140625, + "learning_rate": 7.113889186462477e-07, + "logits/chosen": 0.6119362115859985, + "logits/rejected": 1.1571754217147827, + "logps/chosen": -736.3836669921875, + "logps/rejected": -1706.408447265625, + "loss": 0.079, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.75775146484375, + "rewards/margins": 10.034549713134766, + "rewards/margins_max": 14.87476921081543, + "rewards/margins_min": 5.194329738616943, + "rewards/margins_std": 6.845104217529297, + "rewards/rejected": -14.7923002243042, + "step": 1890 + }, + { + "epoch": 0.78, + "grad_norm": 0.6328125, + "learning_rate": 6.864609802964978e-07, + "logits/chosen": 0.5309674143791199, + "logits/rejected": 1.2003862857818604, + "logps/chosen": -700.8447265625, + "logps/rejected": -1731.847412109375, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.624502658843994, + "rewards/margins": 10.476162910461426, + "rewards/margins_max": 15.123468399047852, + "rewards/margins_min": 5.828855991363525, + "rewards/margins_std": 6.572283744812012, + "rewards/rejected": -15.100665092468262, + "step": 1900 + }, + { + "epoch": 0.79, + "grad_norm": 6.1875, + "learning_rate": 6.619079426760545e-07, + "logits/chosen": 0.49570074677467346, + "logits/rejected": 1.1981004476547241, + "logps/chosen": -769.2633056640625, + "logps/rejected": -1941.0648193359375, + "loss": 0.0931, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.233429908752441, + "rewards/margins": 12.01481819152832, + "rewards/margins_max": 17.587182998657227, + "rewards/margins_min": 6.4424543380737305, + "rewards/margins_std": 7.8805131912231445, + "rewards/rejected": -17.248249053955078, + "step": 1910 + }, + { + "epoch": 0.79, + "grad_norm": 10.3125, + "learning_rate": 6.377348814693174e-07, + "logits/chosen": 0.5919948220252991, + "logits/rejected": 1.398564338684082, + "logps/chosen": -762.436279296875, + "logps/rejected": -1769.9222412109375, + "loss": 0.113, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.956363201141357, + "rewards/margins": 10.631746292114258, + "rewards/margins_max": 15.959482192993164, + "rewards/margins_min": 5.304008483886719, + "rewards/margins_std": 7.5345563888549805, + "rewards/rejected": -15.588109016418457, + "step": 1920 + }, + { + "epoch": 0.79, + "grad_norm": 0.2138671875, + "learning_rate": 6.139467938107169e-07, + "logits/chosen": 0.38951975107192993, + "logits/rejected": 1.1649284362792969, + "logps/chosen": -778.3822021484375, + "logps/rejected": -2023.4833984375, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.072902679443359, + "rewards/margins": 12.735228538513184, + "rewards/margins_max": 17.881254196166992, + "rewards/margins_min": 7.589202880859375, + "rewards/margins_std": 7.277578830718994, + "rewards/rejected": -17.80813217163086, + "step": 1930 + }, + { + "epoch": 0.8, + "grad_norm": 0.625, + "learning_rate": 5.905485972516903e-07, + "logits/chosen": 0.5617870092391968, + "logits/rejected": 1.2924219369888306, + "logps/chosen": -818.1054077148438, + "logps/rejected": -1890.950927734375, + "loss": 0.13, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.535712718963623, + "rewards/margins": 11.305818557739258, + "rewards/margins_max": 17.704103469848633, + "rewards/margins_min": 4.907529830932617, + "rewards/margins_std": 9.048542976379395, + "rewards/rejected": -16.841529846191406, + "step": 1940 + }, + { + "epoch": 0.8, + "grad_norm": 1.03125, + "learning_rate": 5.675451287441072e-07, + "logits/chosen": 0.7306760549545288, + "logits/rejected": 1.395262360572815, + "logps/chosen": -816.7340087890625, + "logps/rejected": -1670.490234375, + "loss": 0.1545, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.269486427307129, + "rewards/margins": 9.290313720703125, + "rewards/margins_max": 13.515324592590332, + "rewards/margins_min": 5.065301895141602, + "rewards/margins_std": 5.975068092346191, + "rewards/rejected": -14.55980110168457, + "step": 1950 + }, + { + "epoch": 0.81, + "grad_norm": 5.28125, + "learning_rate": 5.449411436403632e-07, + "logits/chosen": 0.7268288135528564, + "logits/rejected": 1.3329485654830933, + "logps/chosen": -696.4193115234375, + "logps/rejected": -1806.069091796875, + "loss": 0.0933, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.67076301574707, + "rewards/margins": 11.251920700073242, + "rewards/margins_max": 16.409025192260742, + "rewards/margins_min": 6.094817638397217, + "rewards/margins_std": 7.2932448387146, + "rewards/rejected": -15.92268180847168, + "step": 1960 + }, + { + "epoch": 0.81, + "grad_norm": 4.28125, + "learning_rate": 5.227413147103336e-07, + "logits/chosen": 0.5869401693344116, + "logits/rejected": 1.2344766855239868, + "logps/chosen": -729.2957153320312, + "logps/rejected": -1567.1939697265625, + "loss": 0.1098, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.183860778808594, + "rewards/margins": 8.676237106323242, + "rewards/margins_max": 12.739700317382812, + "rewards/margins_min": 4.6127729415893555, + "rewards/margins_std": 5.746604919433594, + "rewards/rejected": -13.86009693145752, + "step": 1970 + }, + { + "epoch": 0.82, + "grad_norm": 2.46875, + "learning_rate": 5.009502311754081e-07, + "logits/chosen": 0.5038915872573853, + "logits/rejected": 1.1727396249771118, + "logps/chosen": -724.4193725585938, + "logps/rejected": -1736.091064453125, + "loss": 0.146, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.8910675048828125, + "rewards/margins": 10.278018951416016, + "rewards/margins_max": 15.358955383300781, + "rewards/margins_min": 5.197081565856934, + "rewards/margins_std": 7.1855292320251465, + "rewards/rejected": -15.169085502624512, + "step": 1980 + }, + { + "epoch": 0.82, + "grad_norm": 0.703125, + "learning_rate": 4.795723977597844e-07, + "logits/chosen": 0.5357404947280884, + "logits/rejected": 1.154956579208374, + "logps/chosen": -719.9663696289062, + "logps/rejected": -1709.3209228515625, + "loss": 0.0843, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.783188819885254, + "rewards/margins": 10.301424026489258, + "rewards/margins_max": 14.93207836151123, + "rewards/margins_min": 5.670768737792969, + "rewards/margins_std": 6.54873514175415, + "rewards/rejected": -15.084611892700195, + "step": 1990 + }, + { + "epoch": 0.82, + "grad_norm": 1.28125, + "learning_rate": 4.586122337592444e-07, + "logits/chosen": 0.48415178060531616, + "logits/rejected": 1.2925946712493896, + "logps/chosen": -734.2363891601562, + "logps/rejected": -1889.037353515625, + "loss": 0.0469, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.827243804931641, + "rewards/margins": 11.82060432434082, + "rewards/margins_max": 17.99706268310547, + "rewards/margins_min": 5.644143104553223, + "rewards/margins_std": 8.734832763671875, + "rewards/rejected": -16.647846221923828, + "step": 2000 + }, + { + "epoch": 0.83, + "grad_norm": 2.8125, + "learning_rate": 4.380740721275786e-07, + "logits/chosen": 0.6227355003356934, + "logits/rejected": 1.2969437837600708, + "logps/chosen": -780.354736328125, + "logps/rejected": -1921.7880859375, + "loss": 0.0832, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.117171287536621, + "rewards/margins": 11.844526290893555, + "rewards/margins_max": 17.806795120239258, + "rewards/margins_min": 5.882256031036377, + "rewards/margins_std": 8.43192195892334, + "rewards/rejected": -16.96169662475586, + "step": 2010 + }, + { + "epoch": 0.83, + "grad_norm": 0.197265625, + "learning_rate": 4.1796215858086577e-07, + "logits/chosen": 0.6349445581436157, + "logits/rejected": 1.3867194652557373, + "logps/chosen": -799.1439208984375, + "logps/rejected": -1852.981689453125, + "loss": 0.1059, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.414491176605225, + "rewards/margins": 11.105630874633789, + "rewards/margins_max": 17.207439422607422, + "rewards/margins_min": 5.003822326660156, + "rewards/margins_std": 8.629260063171387, + "rewards/rejected": -16.520122528076172, + "step": 2020 + }, + { + "epoch": 0.84, + "grad_norm": 0.2412109375, + "learning_rate": 3.982806507197831e-07, + "logits/chosen": 0.6008701324462891, + "logits/rejected": 1.230450987815857, + "logps/chosen": -744.5529174804688, + "logps/rejected": -1744.4771728515625, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9854936599731445, + "rewards/margins": 9.673846244812012, + "rewards/margins_max": 13.759689331054688, + "rewards/margins_min": 5.5880022048950195, + "rewards/margins_std": 5.778256416320801, + "rewards/rejected": -14.659339904785156, + "step": 2030 + }, + { + "epoch": 0.84, + "grad_norm": 3.765625, + "learning_rate": 3.790336171701331e-07, + "logits/chosen": 0.5796440839767456, + "logits/rejected": 1.2289059162139893, + "logps/chosen": -720.2902221679688, + "logps/rejected": -1942.671142578125, + "loss": 0.0773, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.671596050262451, + "rewards/margins": 12.241273880004883, + "rewards/margins_max": 16.92319107055664, + "rewards/margins_min": 7.55935525894165, + "rewards/margins_std": 6.621232509613037, + "rewards/rejected": -16.912870407104492, + "step": 2040 + }, + { + "epoch": 0.84, + "grad_norm": 1.515625, + "learning_rate": 3.6022503674176537e-07, + "logits/chosen": 0.5198200941085815, + "logits/rejected": 1.3343006372451782, + "logps/chosen": -796.9490966796875, + "logps/rejected": -1900.691162109375, + "loss": 0.09, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.069548606872559, + "rewards/margins": 11.76386547088623, + "rewards/margins_max": 17.033977508544922, + "rewards/margins_min": 6.4937543869018555, + "rewards/margins_std": 7.453061580657959, + "rewards/rejected": -16.83341407775879, + "step": 2050 + }, + { + "epoch": 0.85, + "grad_norm": 1.0625, + "learning_rate": 3.4185879760606525e-07, + "logits/chosen": 0.5187299847602844, + "logits/rejected": 1.1834654808044434, + "logps/chosen": -736.1161499023438, + "logps/rejected": -1869.806640625, + "loss": 0.0581, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.959606647491455, + "rewards/margins": 11.514524459838867, + "rewards/margins_max": 16.145587921142578, + "rewards/margins_min": 6.883460998535156, + "rewards/margins_std": 6.549312591552734, + "rewards/rejected": -16.474130630493164, + "step": 2060 + }, + { + "epoch": 0.85, + "grad_norm": 0.5859375, + "learning_rate": 3.2393869649217454e-07, + "logits/chosen": 0.5701602697372437, + "logits/rejected": 1.3300843238830566, + "logps/chosen": -761.6326904296875, + "logps/rejected": -1840.357177734375, + "loss": 0.0662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.093400478363037, + "rewards/margins": 11.083673477172852, + "rewards/margins_max": 15.876144409179688, + "rewards/margins_min": 6.291202545166016, + "rewards/margins_std": 6.777576446533203, + "rewards/rejected": -16.177074432373047, + "step": 2070 + }, + { + "epoch": 0.86, + "grad_norm": 2.5, + "learning_rate": 3.064684379021207e-07, + "logits/chosen": 0.43363428115844727, + "logits/rejected": 1.0424432754516602, + "logps/chosen": -684.9832763671875, + "logps/rejected": -1844.010009765625, + "loss": 0.054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.640398979187012, + "rewards/margins": 11.688333511352539, + "rewards/margins_max": 16.679744720458984, + "rewards/margins_min": 6.696922302246094, + "rewards/margins_std": 7.058920860290527, + "rewards/rejected": -16.328731536865234, + "step": 2080 + }, + { + "epoch": 0.86, + "grad_norm": 1.2421875, + "learning_rate": 2.894516333450115e-07, + "logits/chosen": 0.5114481449127197, + "logits/rejected": 1.14482843875885, + "logps/chosen": -735.6387939453125, + "logps/rejected": -1779.406005859375, + "loss": 0.1105, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.890014171600342, + "rewards/margins": 10.568880081176758, + "rewards/margins_max": 14.828028678894043, + "rewards/margins_min": 6.309730529785156, + "rewards/margins_std": 6.023346900939941, + "rewards/rejected": -15.458892822265625, + "step": 2090 + }, + { + "epoch": 0.86, + "grad_norm": 0.4921875, + "learning_rate": 2.728918005904513e-07, + "logits/chosen": 0.3923017084598541, + "logits/rejected": 1.0707480907440186, + "logps/chosen": -806.395751953125, + "logps/rejected": -1856.805908203125, + "loss": 0.1777, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.439145565032959, + "rewards/margins": 10.84535026550293, + "rewards/margins_max": 16.374027252197266, + "rewards/margins_min": 5.316674709320068, + "rewards/margins_std": 7.818729400634766, + "rewards/rejected": -16.284496307373047, + "step": 2100 + }, + { + "epoch": 0.87, + "grad_norm": 0.87109375, + "learning_rate": 2.5679236294133493e-07, + "logits/chosen": 0.5716456174850464, + "logits/rejected": 1.229247808456421, + "logps/chosen": -732.1973876953125, + "logps/rejected": -1738.814697265625, + "loss": 0.0752, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.678011894226074, + "rewards/margins": 10.548458099365234, + "rewards/margins_max": 15.896784782409668, + "rewards/margins_min": 5.200132369995117, + "rewards/margins_std": 7.563673496246338, + "rewards/rejected": -15.226470947265625, + "step": 2110 + }, + { + "epoch": 0.87, + "grad_norm": 2.453125, + "learning_rate": 2.4115664852617294e-07, + "logits/chosen": 0.5404381155967712, + "logits/rejected": 1.2678707838058472, + "logps/chosen": -750.8515014648438, + "logps/rejected": -1882.701416015625, + "loss": 0.1361, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.122988700866699, + "rewards/margins": 11.382095336914062, + "rewards/margins_max": 17.519346237182617, + "rewards/margins_min": 5.244842529296875, + "rewards/margins_std": 8.679386138916016, + "rewards/rejected": -16.505083084106445, + "step": 2120 + }, + { + "epoch": 0.88, + "grad_norm": 0.458984375, + "learning_rate": 2.2598788961108897e-07, + "logits/chosen": 0.5512218475341797, + "logits/rejected": 1.235686182975769, + "logps/chosen": -703.8411865234375, + "logps/rejected": -1646.0699462890625, + "loss": 0.0847, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.887363910675049, + "rewards/margins": 9.765475273132324, + "rewards/margins_max": 14.917486190795898, + "rewards/margins_min": 4.613465309143066, + "rewards/margins_std": 7.286043643951416, + "rewards/rejected": -14.652839660644531, + "step": 2130 + }, + { + "epoch": 0.88, + "grad_norm": 1.4609375, + "learning_rate": 2.1128922193163564e-07, + "logits/chosen": 0.5618628263473511, + "logits/rejected": 1.2874120473861694, + "logps/chosen": -742.8013916015625, + "logps/rejected": -1916.2711181640625, + "loss": 0.1083, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.009154796600342, + "rewards/margins": 12.049230575561523, + "rewards/margins_max": 17.573997497558594, + "rewards/margins_min": 6.5244646072387695, + "rewards/margins_std": 7.813199043273926, + "rewards/rejected": -17.058387756347656, + "step": 2140 + }, + { + "epoch": 0.89, + "grad_norm": 3.203125, + "learning_rate": 1.9706368404456472e-07, + "logits/chosen": 0.4528091549873352, + "logits/rejected": 1.111604928970337, + "logps/chosen": -745.5892333984375, + "logps/rejected": -1809.7484130859375, + "loss": 0.0911, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.078321933746338, + "rewards/margins": 10.962045669555664, + "rewards/margins_max": 15.926958084106445, + "rewards/margins_min": 5.997132778167725, + "rewards/margins_std": 7.02144718170166, + "rewards/rejected": -16.040367126464844, + "step": 2150 + }, + { + "epoch": 0.89, + "grad_norm": 3.75, + "learning_rate": 1.8331421669968708e-07, + "logits/chosen": 0.6266171336174011, + "logits/rejected": 1.3515210151672363, + "logps/chosen": -768.0899047851562, + "logps/rejected": -1722.0845947265625, + "loss": 0.1013, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.038366794586182, + "rewards/margins": 9.89087200164795, + "rewards/margins_max": 14.385602951049805, + "rewards/margins_min": 5.396140098571777, + "rewards/margins_std": 6.356511116027832, + "rewards/rejected": -14.929239273071289, + "step": 2160 + }, + { + "epoch": 0.89, + "grad_norm": 5.0, + "learning_rate": 1.7004366223194984e-07, + "logits/chosen": 0.5014376044273376, + "logits/rejected": 1.2441834211349487, + "logps/chosen": -747.4188232421875, + "logps/rejected": -1764.168212890625, + "loss": 0.1175, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.955763339996338, + "rewards/margins": 10.59716510772705, + "rewards/margins_max": 15.370004653930664, + "rewards/margins_min": 5.8243255615234375, + "rewards/margins_std": 6.749813079833984, + "rewards/rejected": -15.552927017211914, + "step": 2170 + }, + { + "epoch": 0.9, + "grad_norm": 1.21875, + "learning_rate": 1.5725476397386197e-07, + "logits/chosen": 0.3932679295539856, + "logits/rejected": 1.2315890789031982, + "logps/chosen": -707.3714599609375, + "logps/rejected": -1953.8558349609375, + "loss": 0.0576, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.406428337097168, + "rewards/margins": 12.903097152709961, + "rewards/margins_max": 18.890384674072266, + "rewards/margins_min": 6.915809631347656, + "rewards/margins_std": 8.467303276062012, + "rewards/rejected": -17.309528350830078, + "step": 2180 + }, + { + "epoch": 0.9, + "grad_norm": 1.0625, + "learning_rate": 1.4495016568838198e-07, + "logits/chosen": 0.44051748514175415, + "logits/rejected": 1.1644173860549927, + "logps/chosen": -771.8656616210938, + "logps/rejected": -1771.4254150390625, + "loss": 0.079, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.945860385894775, + "rewards/margins": 10.547651290893555, + "rewards/margins_max": 15.509611129760742, + "rewards/margins_min": 5.585693359375, + "rewards/margins_std": 7.017270565032959, + "rewards/rejected": -15.493513107299805, + "step": 2190 + }, + { + "epoch": 0.91, + "grad_norm": 0.90625, + "learning_rate": 1.3313241102239056e-07, + "logits/chosen": 0.6278412342071533, + "logits/rejected": 1.4244401454925537, + "logps/chosen": -682.3499755859375, + "logps/rejected": -1634.0439453125, + "loss": 0.085, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.619953155517578, + "rewards/margins": 9.823567390441895, + "rewards/margins_max": 13.926409721374512, + "rewards/margins_min": 5.720723628997803, + "rewards/margins_std": 5.802296161651611, + "rewards/rejected": -14.443519592285156, + "step": 2200 + }, + { + "epoch": 0.91, + "grad_norm": 1.078125, + "learning_rate": 1.2180394298086095e-07, + "logits/chosen": 0.5217547416687012, + "logits/rejected": 1.2030553817749023, + "logps/chosen": -736.8463745117188, + "logps/rejected": -1752.949951171875, + "loss": 0.0513, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.804549217224121, + "rewards/margins": 10.467925071716309, + "rewards/margins_max": 14.837709426879883, + "rewards/margins_min": 6.098140716552734, + "rewards/margins_std": 6.179808139801025, + "rewards/rejected": -15.272473335266113, + "step": 2210 + }, + { + "epoch": 0.91, + "grad_norm": 1.328125, + "learning_rate": 1.1096710342183042e-07, + "logits/chosen": 0.4959636628627777, + "logits/rejected": 1.1241891384124756, + "logps/chosen": -719.9772338867188, + "logps/rejected": -1887.967529296875, + "loss": 0.0614, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.843437671661377, + "rewards/margins": 12.002474784851074, + "rewards/margins_max": 17.96475601196289, + "rewards/margins_min": 6.040192604064941, + "rewards/margins_std": 8.431940078735352, + "rewards/rejected": -16.84591293334961, + "step": 2220 + }, + { + "epoch": 0.92, + "grad_norm": 3.421875, + "learning_rate": 1.0062413257228676e-07, + "logits/chosen": 0.5790583491325378, + "logits/rejected": 1.3127405643463135, + "logps/chosen": -810.7774658203125, + "logps/rejected": -1996.8209228515625, + "loss": 0.0655, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.4459967613220215, + "rewards/margins": 12.252093315124512, + "rewards/margins_max": 17.894405364990234, + "rewards/margins_min": 6.6097846031188965, + "rewards/margins_std": 7.979430198669434, + "rewards/rejected": -17.698089599609375, + "step": 2230 + }, + { + "epoch": 0.92, + "grad_norm": 6.78125, + "learning_rate": 9.077716856505825e-08, + "logits/chosen": 0.5055748224258423, + "logits/rejected": 1.3198411464691162, + "logps/chosen": -762.1981201171875, + "logps/rejected": -1757.9710693359375, + "loss": 0.1311, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.086750030517578, + "rewards/margins": 10.44882583618164, + "rewards/margins_max": 14.819003105163574, + "rewards/margins_min": 6.078649044036865, + "rewards/margins_std": 6.180363178253174, + "rewards/rejected": -15.535575866699219, + "step": 2240 + }, + { + "epoch": 0.93, + "grad_norm": 2.671875, + "learning_rate": 8.142824699681501e-08, + "logits/chosen": 0.5170903205871582, + "logits/rejected": 1.170555830001831, + "logps/chosen": -716.73583984375, + "logps/rejected": -1645.933349609375, + "loss": 0.0802, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.820572376251221, + "rewards/margins": 9.699037551879883, + "rewards/margins_max": 15.123661994934082, + "rewards/margins_min": 4.274412155151367, + "rewards/margins_std": 7.671577453613281, + "rewards/rejected": -14.519609451293945, + "step": 2250 + }, + { + "epoch": 0.93, + "grad_norm": 1.5859375, + "learning_rate": 7.257930050726003e-08, + "logits/chosen": 0.5653474926948547, + "logits/rejected": 1.3792940378189087, + "logps/chosen": -771.275634765625, + "logps/rejected": -1766.762939453125, + "loss": 0.095, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.0928826332092285, + "rewards/margins": 10.51708698272705, + "rewards/margins_max": 15.350242614746094, + "rewards/margins_min": 5.683931827545166, + "rewards/margins_std": 6.835114479064941, + "rewards/rejected": -15.609970092773438, + "step": 2260 + }, + { + "epoch": 0.93, + "grad_norm": 0.78125, + "learning_rate": 6.423215837961045e-08, + "logits/chosen": 0.5271497368812561, + "logits/rejected": 1.3941797018051147, + "logps/chosen": -722.5052490234375, + "logps/rejected": -1865.8720703125, + "loss": 0.0869, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.881400108337402, + "rewards/margins": 11.674361228942871, + "rewards/margins_max": 16.58696746826172, + "rewards/margins_min": 6.76175594329834, + "rewards/margins_std": 6.947473049163818, + "rewards/rejected": -16.555761337280273, + "step": 2270 + }, + { + "epoch": 0.94, + "grad_norm": 4.6875, + "learning_rate": 5.6388546162442215e-08, + "logits/chosen": 0.6665564775466919, + "logits/rejected": 1.2474958896636963, + "logps/chosen": -746.4205322265625, + "logps/rejected": -1758.126708984375, + "loss": 0.111, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.9972076416015625, + "rewards/margins": 10.384644508361816, + "rewards/margins_max": 15.061482429504395, + "rewards/margins_min": 5.707806587219238, + "rewards/margins_std": 6.614047050476074, + "rewards/rejected": -15.381853103637695, + "step": 2280 + }, + { + "epoch": 0.94, + "grad_norm": 2.453125, + "learning_rate": 4.905008531297661e-08, + "logits/chosen": 0.43994975090026855, + "logits/rejected": 1.1360465288162231, + "logps/chosen": -817.5567016601562, + "logps/rejected": -1885.947998046875, + "loss": 0.1015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.406924247741699, + "rewards/margins": 11.238929748535156, + "rewards/margins_max": 16.946365356445312, + "rewards/margins_min": 5.531497955322266, + "rewards/margins_std": 8.071528434753418, + "rewards/rejected": -16.645854949951172, + "step": 2290 + }, + { + "epoch": 0.95, + "grad_norm": 2.9375, + "learning_rate": 4.2218292861889444e-08, + "logits/chosen": 0.5859326124191284, + "logits/rejected": 1.2896353006362915, + "logps/chosen": -773.9613037109375, + "logps/rejected": -1735.0863037109375, + "loss": 0.0918, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.150477409362793, + "rewards/margins": 10.18777084350586, + "rewards/margins_max": 15.025718688964844, + "rewards/margins_min": 5.34982442855835, + "rewards/margins_std": 6.84188985824585, + "rewards/rejected": -15.338247299194336, + "step": 2300 + }, + { + "epoch": 0.95, + "grad_norm": 0.4765625, + "learning_rate": 3.589458109970467e-08, + "logits/chosen": 0.5294678807258606, + "logits/rejected": 1.2444860935211182, + "logps/chosen": -743.5758666992188, + "logps/rejected": -1683.407958984375, + "loss": 0.1648, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.184317588806152, + "rewards/margins": 9.598274230957031, + "rewards/margins_max": 14.4242582321167, + "rewards/margins_min": 4.7722883224487305, + "rewards/margins_std": 6.824974060058594, + "rewards/rejected": -14.78258991241455, + "step": 2310 + }, + { + "epoch": 0.96, + "grad_norm": 2.265625, + "learning_rate": 3.008025728484132e-08, + "logits/chosen": 0.5059491991996765, + "logits/rejected": 1.3568942546844482, + "logps/chosen": -737.1229858398438, + "logps/rejected": -1987.5445556640625, + "loss": 0.091, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.759067535400391, + "rewards/margins": 12.818672180175781, + "rewards/margins_max": 18.131336212158203, + "rewards/margins_min": 7.506007194519043, + "rewards/margins_std": 7.513242244720459, + "rewards/rejected": -17.577739715576172, + "step": 2320 + }, + { + "epoch": 0.96, + "grad_norm": 0.41015625, + "learning_rate": 2.4776523373372385e-08, + "logits/chosen": 0.5932313799858093, + "logits/rejected": 1.2811510562896729, + "logps/chosen": -719.3793334960938, + "logps/rejected": -1677.4947509765625, + "loss": 0.075, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.827818870544434, + "rewards/margins": 9.848733901977539, + "rewards/margins_max": 14.251507759094238, + "rewards/margins_min": 5.44596004486084, + "rewards/margins_std": 6.226462364196777, + "rewards/rejected": -14.676549911499023, + "step": 2330 + }, + { + "epoch": 0.96, + "grad_norm": 2.953125, + "learning_rate": 1.998447577055307e-08, + "logits/chosen": 0.531481146812439, + "logits/rejected": 1.223459005355835, + "logps/chosen": -786.4287109375, + "logps/rejected": -1908.134765625, + "loss": 0.1179, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.9455766677856445, + "rewards/margins": 11.500458717346191, + "rewards/margins_max": 16.30417823791504, + "rewards/margins_min": 6.696742057800293, + "rewards/margins_std": 6.793482303619385, + "rewards/rejected": -16.44603729248047, + "step": 2340 + }, + { + "epoch": 0.97, + "grad_norm": 1.765625, + "learning_rate": 1.5705105104167617e-08, + "logits/chosen": 0.446284681558609, + "logits/rejected": 1.1109905242919922, + "logps/chosen": -792.7479858398438, + "logps/rejected": -1819.526611328125, + "loss": 0.0377, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -5.261641502380371, + "rewards/margins": 10.497610092163086, + "rewards/margins_max": 15.17370891571045, + "rewards/margins_min": 5.821512699127197, + "rewards/margins_std": 6.613001346588135, + "rewards/rejected": -15.759251594543457, + "step": 2350 + }, + { + "epoch": 0.97, + "grad_norm": 0.7265625, + "learning_rate": 1.1939296019744529e-08, + "logits/chosen": 0.5473194122314453, + "logits/rejected": 1.104913353919983, + "logps/chosen": -683.9682006835938, + "logps/rejected": -1851.5924072265625, + "loss": 0.076, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.581644058227539, + "rewards/margins": 11.716299057006836, + "rewards/margins_max": 16.266063690185547, + "rewards/margins_min": 7.166537284851074, + "rewards/margins_std": 6.434335231781006, + "rewards/rejected": -16.29794692993164, + "step": 2360 + }, + { + "epoch": 0.98, + "grad_norm": 0.345703125, + "learning_rate": 8.687826997678116e-09, + "logits/chosen": 0.5780460834503174, + "logits/rejected": 1.2975406646728516, + "logps/chosen": -729.7520751953125, + "logps/rejected": -1835.2408447265625, + "loss": 0.0673, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.86895751953125, + "rewards/margins": 11.184858322143555, + "rewards/margins_max": 15.93799114227295, + "rewards/margins_min": 6.431723117828369, + "rewards/margins_std": 6.721946716308594, + "rewards/rejected": -16.053813934326172, + "step": 2370 + }, + { + "epoch": 0.98, + "grad_norm": 0.6484375, + "learning_rate": 5.951370192300576e-09, + "logits/chosen": 0.5345112085342407, + "logits/rejected": 1.213844656944275, + "logps/chosen": -702.4107055664062, + "logps/rejected": -1682.540283203125, + "loss": 0.0844, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.7045135498046875, + "rewards/margins": 10.013148307800293, + "rewards/margins_max": 14.771675109863281, + "rewards/margins_min": 5.254621982574463, + "rewards/margins_std": 6.729572296142578, + "rewards/rejected": -14.71766185760498, + "step": 2380 + }, + { + "epoch": 0.98, + "grad_norm": 0.57421875, + "learning_rate": 3.730491292930072e-09, + "logits/chosen": 0.5803747773170471, + "logits/rejected": 1.2554329633712769, + "logps/chosen": -735.1619262695312, + "logps/rejected": -1728.2470703125, + "loss": 0.065, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.109474182128906, + "rewards/margins": 10.042867660522461, + "rewards/margins_max": 15.090121269226074, + "rewards/margins_min": 4.995615482330322, + "rewards/margins_std": 7.1378936767578125, + "rewards/rejected": -15.152341842651367, + "step": 2390 + }, + { + "epoch": 0.99, + "grad_norm": 2.140625, + "learning_rate": 2.0256494069306744e-09, + "logits/chosen": 0.5878351926803589, + "logits/rejected": 1.3185656070709229, + "logps/chosen": -693.908203125, + "logps/rejected": -1897.8695068359375, + "loss": 0.1782, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -4.697625160217285, + "rewards/margins": 12.107023239135742, + "rewards/margins_max": 17.24478530883789, + "rewards/margins_min": 6.969258785247803, + "rewards/margins_std": 7.265894889831543, + "rewards/rejected": -16.80464744567871, + "step": 2400 + }, + { + "epoch": 0.99, + "grad_norm": 1.828125, + "learning_rate": 8.371969648043876e-10, + "logits/chosen": 0.6715101003646851, + "logits/rejected": 1.3766255378723145, + "logps/chosen": -733.6317138671875, + "logps/rejected": -1728.0810546875, + "loss": 0.1379, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.896046161651611, + "rewards/margins": 10.241785049438477, + "rewards/margins_max": 14.780932426452637, + "rewards/margins_min": 5.702638626098633, + "rewards/margins_std": 6.419322967529297, + "rewards/rejected": -15.13783073425293, + "step": 2410 + }, + { + "epoch": 1.0, + "grad_norm": 0.59765625, + "learning_rate": 1.653796473341518e-10, + "logits/chosen": 0.4578518271446228, + "logits/rejected": 1.291621446609497, + "logps/chosen": -713.6257934570312, + "logps/rejected": -1659.1171875, + "loss": 0.0983, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -4.576380729675293, + "rewards/margins": 9.997756958007812, + "rewards/margins_max": 14.97430419921875, + "rewards/margins_min": 5.021212577819824, + "rewards/margins_std": 7.037899017333984, + "rewards/rejected": -14.574139595031738, + "step": 2420 + }, + { + "epoch": 1.0, + "eval_logits/chosen": 1.1567333936691284, + "eval_logits/rejected": 1.3502662181854248, + "eval_logps/chosen": -815.366455078125, + "eval_logps/rejected": -885.2385864257812, + "eval_loss": 0.8997361063957214, + "eval_rewards/accuracies": 0.5877500176429749, + "eval_rewards/chosen": -4.770576000213623, + "eval_rewards/margins": 0.8909361362457275, + "eval_rewards/margins_max": 6.031704425811768, + "eval_rewards/margins_min": -2.9257826805114746, + "eval_rewards/margins_std": 2.9022324085235596, + "eval_rewards/rejected": -5.66151237487793, + "eval_runtime": 1670.0359, + "eval_samples_per_second": 4.79, + "eval_steps_per_second": 0.299, + "step": 2428 + }, + { + "epoch": 1.0, + "step": 2428, + "total_flos": 0.0, + "train_loss": 0.19075011757787017, + "train_runtime": 22524.5017, + "train_samples_per_second": 1.725, + "train_steps_per_second": 0.108 + } + ], + "logging_steps": 10, + "max_steps": 2428, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}