{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 38, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -0.7997890710830688, "debug/policy_chosen_logps": -225.56033325195312, "debug/policy_rejected_logits": -0.7811033725738525, "debug/policy_rejected_logps": -194.36915588378906, "debug/reference_chosen_logps": -225.56033325195312, "debug/reference_rejected_logps": -194.36915588378906, "epoch": 0.02631578947368421, "grad_norm": 10.059155945927795, "learning_rate": 1e-06, "logits/chosen": -0.7997890710830688, "logits/rejected": -0.7811033725738525, "logps/chosen": -225.56033325195312, "logps/rejected": -194.36915588378906, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -0.787833571434021, "debug/policy_chosen_logps": -192.9886474609375, "debug/policy_rejected_logits": -0.8347375988960266, "debug/policy_rejected_logps": -183.05389404296875, "debug/reference_chosen_logps": -193.40695190429688, "debug/reference_rejected_logps": -182.5074920654297, "epoch": 0.05263157894736842, "grad_norm": 9.106220560523727, "learning_rate": 1e-06, "logits/chosen": -0.787833571434021, "logits/rejected": -0.8347375988960266, "logps/chosen": -192.9886474609375, "logps/rejected": -183.05389404296875, "loss": 0.4992, "rewards/accuracies": 0.75, "rewards/chosen": 0.004183177836239338, "rewards/margins": 0.009647198021411896, "rewards/rejected": -0.0054640197195112705, "step": 2 }, { "debug/policy_chosen_logits": -0.7686588168144226, "debug/policy_chosen_logps": -166.56585693359375, "debug/policy_rejected_logits": -0.823331892490387, "debug/policy_rejected_logps": -206.15472412109375, "debug/reference_chosen_logps": -166.86782836914062, "debug/reference_rejected_logps": -206.29469299316406, "epoch": 0.07894736842105263, "grad_norm": 9.89971860615226, "learning_rate": 1e-06, "logits/chosen": -0.7686588168144226, "logits/rejected": -0.823331892490387, "logps/chosen": -166.56585693359375, "logps/rejected": -206.15472412109375, "loss": 0.4972, "rewards/accuracies": 0.625, "rewards/chosen": 0.0030196956358850002, "rewards/margins": 0.0016200444661080837, "rewards/rejected": 0.0013996504712849855, "step": 3 }, { "debug/policy_chosen_logits": -0.8045904040336609, "debug/policy_chosen_logps": -167.75006103515625, "debug/policy_rejected_logits": -0.9390885829925537, "debug/policy_rejected_logps": -183.21987915039062, "debug/reference_chosen_logps": -168.35263061523438, "debug/reference_rejected_logps": -183.38255310058594, "epoch": 0.10526315789473684, "grad_norm": 9.923126638306748, "learning_rate": 1e-06, "logits/chosen": -0.8045904040336609, "logits/rejected": -0.9390885829925537, "logps/chosen": -167.75006103515625, "logps/rejected": -183.21987915039062, "loss": 0.498, "rewards/accuracies": 0.375, "rewards/chosen": 0.006025714334100485, "rewards/margins": 0.004398994147777557, "rewards/rejected": 0.001626720535568893, "step": 4 }, { "debug/policy_chosen_logits": -0.8231886625289917, "debug/policy_chosen_logps": -179.09014892578125, "debug/policy_rejected_logits": -0.8653745055198669, "debug/policy_rejected_logps": -173.03317260742188, "debug/reference_chosen_logps": -179.02658081054688, "debug/reference_rejected_logps": -172.79678344726562, "epoch": 0.13157894736842105, "grad_norm": 10.543939600237236, "learning_rate": 1e-06, "logits/chosen": -0.8231886625289917, "logits/rejected": -0.8653745055198669, "logps/chosen": -179.09014892578125, "logps/rejected": -173.03317260742188, "loss": 0.4976, "rewards/accuracies": 0.625, "rewards/chosen": -0.0006354906363412738, "rewards/margins": 0.0017283153720200062, "rewards/rejected": -0.002363805891945958, "step": 5 }, { "debug/policy_chosen_logits": -0.9182553887367249, "debug/policy_chosen_logps": -175.53131103515625, "debug/policy_rejected_logits": -0.9559266567230225, "debug/policy_rejected_logps": -208.5589141845703, "debug/reference_chosen_logps": -176.2925567626953, "debug/reference_rejected_logps": -208.50613403320312, "epoch": 0.15789473684210525, "grad_norm": 11.060840140231088, "learning_rate": 1e-06, "logits/chosen": -0.9182553887367249, "logits/rejected": -0.9559266567230225, "logps/chosen": -175.53131103515625, "logps/rejected": -208.5589141845703, "loss": 0.4986, "rewards/accuracies": 0.75, "rewards/chosen": 0.0076125627383589745, "rewards/margins": 0.00814034417271614, "rewards/rejected": -0.0005277825985103846, "step": 6 }, { "debug/policy_chosen_logits": -0.8945421576499939, "debug/policy_chosen_logps": -154.65753173828125, "debug/policy_rejected_logits": -0.9487111568450928, "debug/policy_rejected_logps": -207.63418579101562, "debug/reference_chosen_logps": -153.6591796875, "debug/reference_rejected_logps": -208.0142364501953, "epoch": 0.18421052631578946, "grad_norm": 10.113024121054902, "learning_rate": 1e-06, "logits/chosen": -0.8945421576499939, "logits/rejected": -0.9487111568450928, "logps/chosen": -154.65753173828125, "logps/rejected": -207.63418579101562, "loss": 0.4995, "rewards/accuracies": 0.5, "rewards/chosen": -0.009983415715396404, "rewards/margins": -0.013783845119178295, "rewards/rejected": 0.0038004303351044655, "step": 7 }, { "debug/policy_chosen_logits": -0.8738376498222351, "debug/policy_chosen_logps": -183.52603149414062, "debug/policy_rejected_logits": -0.9855321645736694, "debug/policy_rejected_logps": -162.13426208496094, "debug/reference_chosen_logps": -182.94473266601562, "debug/reference_rejected_logps": -161.57327270507812, "epoch": 0.21052631578947367, "grad_norm": 11.253835109468124, "learning_rate": 1e-06, "logits/chosen": -0.8738376498222351, "logits/rejected": -0.9855321645736694, "logps/chosen": -183.52603149414062, "logps/rejected": -162.13426208496094, "loss": 0.4972, "rewards/accuracies": 0.75, "rewards/chosen": -0.005812949500977993, "rewards/margins": -0.00020312238484621048, "rewards/rejected": -0.005609826650470495, "step": 8 }, { "debug/policy_chosen_logits": -0.7741842269897461, "debug/policy_chosen_logps": -155.53382873535156, "debug/policy_rejected_logits": -0.8124099373817444, "debug/policy_rejected_logps": -194.9011993408203, "debug/reference_chosen_logps": -155.30392456054688, "debug/reference_rejected_logps": -194.22259521484375, "epoch": 0.23684210526315788, "grad_norm": 10.505846566132414, "learning_rate": 1e-06, "logits/chosen": -0.7741842269897461, "logits/rejected": -0.8124099373817444, "logps/chosen": -155.53382873535156, "logps/rejected": -194.9011993408203, "loss": 0.4954, "rewards/accuracies": 0.625, "rewards/chosen": -0.0022989464923739433, "rewards/margins": 0.00448720995336771, "rewards/rejected": -0.006786155980080366, "step": 9 }, { "debug/policy_chosen_logits": -0.8215097188949585, "debug/policy_chosen_logps": -169.93019104003906, "debug/policy_rejected_logits": -1.0346907377243042, "debug/policy_rejected_logps": -170.85223388671875, "debug/reference_chosen_logps": -171.34588623046875, "debug/reference_rejected_logps": -169.4602508544922, "epoch": 0.2631578947368421, "grad_norm": 10.830589104143412, "learning_rate": 1e-06, "logits/chosen": -0.8215097188949585, "logits/rejected": -1.0346907377243042, "logps/chosen": -169.93019104003906, "logps/rejected": -170.85223388671875, "loss": 0.4913, "rewards/accuracies": 0.75, "rewards/chosen": 0.014157085679471493, "rewards/margins": 0.02807692438364029, "rewards/rejected": -0.013919839635491371, "step": 10 }, { "debug/policy_chosen_logits": -0.979742705821991, "debug/policy_chosen_logps": -150.38729858398438, "debug/policy_rejected_logits": -0.8156647086143494, "debug/policy_rejected_logps": -194.0210418701172, "debug/reference_chosen_logps": -149.69064331054688, "debug/reference_rejected_logps": -192.29783630371094, "epoch": 0.2894736842105263, "grad_norm": 11.460987827797899, "learning_rate": 1e-06, "logits/chosen": -0.979742705821991, "logits/rejected": -0.8156647086143494, "logps/chosen": -150.38729858398438, "logps/rejected": -194.0210418701172, "loss": 0.494, "rewards/accuracies": 0.625, "rewards/chosen": -0.006966524291783571, "rewards/margins": 0.010265503078699112, "rewards/rejected": -0.017232026904821396, "step": 11 }, { "debug/policy_chosen_logits": -0.9658709764480591, "debug/policy_chosen_logps": -156.47149658203125, "debug/policy_rejected_logits": -0.9509150385856628, "debug/policy_rejected_logps": -190.25225830078125, "debug/reference_chosen_logps": -156.70692443847656, "debug/reference_rejected_logps": -187.9228515625, "epoch": 0.3157894736842105, "grad_norm": 12.5990794235455, "learning_rate": 1e-06, "logits/chosen": -0.9658709764480591, "logits/rejected": -0.9509150385856628, "logps/chosen": -156.47149658203125, "logps/rejected": -190.25225830078125, "loss": 0.5009, "rewards/accuracies": 0.875, "rewards/chosen": 0.002354326192289591, "rewards/margins": 0.02564830705523491, "rewards/rejected": -0.023293981328606606, "step": 12 }, { "debug/policy_chosen_logits": -0.9551741480827332, "debug/policy_chosen_logps": -159.93605041503906, "debug/policy_rejected_logits": -1.077520489692688, "debug/policy_rejected_logps": -175.4044189453125, "debug/reference_chosen_logps": -162.68727111816406, "debug/reference_rejected_logps": -176.97789001464844, "epoch": 0.34210526315789475, "grad_norm": 11.764195807799357, "learning_rate": 1e-06, "logits/chosen": -0.9551741480827332, "logits/rejected": -1.077520489692688, "logps/chosen": -159.93605041503906, "logps/rejected": -175.4044189453125, "loss": 0.4988, "rewards/accuracies": 0.625, "rewards/chosen": 0.02751227281987667, "rewards/margins": 0.011777523905038834, "rewards/rejected": 0.015734750777482986, "step": 13 }, { "debug/policy_chosen_logits": -0.8166245222091675, "debug/policy_chosen_logps": -192.5528106689453, "debug/policy_rejected_logits": -0.7194980382919312, "debug/policy_rejected_logps": -193.76840209960938, "debug/reference_chosen_logps": -192.17568969726562, "debug/reference_rejected_logps": -193.7100372314453, "epoch": 0.3684210526315789, "grad_norm": 13.966553473864488, "learning_rate": 1e-06, "logits/chosen": -0.8166245222091675, "logits/rejected": -0.7194980382919312, "logps/chosen": -192.5528106689453, "logps/rejected": -193.76840209960938, "loss": 0.4966, "rewards/accuracies": 0.625, "rewards/chosen": -0.0037712082266807556, "rewards/margins": -0.0031875791028141975, "rewards/rejected": -0.000583629822358489, "step": 14 }, { "debug/policy_chosen_logits": -0.8419144749641418, "debug/policy_chosen_logps": -156.88470458984375, "debug/policy_rejected_logits": -0.7425439357757568, "debug/policy_rejected_logps": -156.3319091796875, "debug/reference_chosen_logps": -157.73068237304688, "debug/reference_rejected_logps": -156.30401611328125, "epoch": 0.39473684210526316, "grad_norm": 12.533768329750274, "learning_rate": 1e-06, "logits/chosen": -0.8419144749641418, "logits/rejected": -0.7425439357757568, "logps/chosen": -156.88470458984375, "logps/rejected": -156.3319091796875, "loss": 0.5016, "rewards/accuracies": 0.375, "rewards/chosen": 0.008459766395390034, "rewards/margins": 0.008738689124584198, "rewards/rejected": -0.00027891993522644043, "step": 15 }, { "debug/policy_chosen_logits": -0.9986197352409363, "debug/policy_chosen_logps": -178.00741577148438, "debug/policy_rejected_logits": -1.0260645151138306, "debug/policy_rejected_logps": -168.3215789794922, "debug/reference_chosen_logps": -175.48635864257812, "debug/reference_rejected_logps": -159.8695068359375, "epoch": 0.42105263157894735, "grad_norm": 13.68956407505082, "learning_rate": 1e-06, "logits/chosen": -0.9986197352409363, "logits/rejected": -1.0260645151138306, "logps/chosen": -178.00741577148438, "logps/rejected": -168.3215789794922, "loss": 0.5002, "rewards/accuracies": 0.875, "rewards/chosen": -0.02521066553890705, "rewards/margins": 0.059310123324394226, "rewards/rejected": -0.08452078700065613, "step": 16 }, { "debug/policy_chosen_logits": -0.9570299983024597, "debug/policy_chosen_logps": -155.34194946289062, "debug/policy_rejected_logits": -0.8540157079696655, "debug/policy_rejected_logps": -158.7489013671875, "debug/reference_chosen_logps": -153.97943115234375, "debug/reference_rejected_logps": -161.26361083984375, "epoch": 0.4473684210526316, "grad_norm": 11.796187058429084, "learning_rate": 1e-06, "logits/chosen": -0.9570299983024597, "logits/rejected": -0.8540157079696655, "logps/chosen": -155.34194946289062, "logps/rejected": -158.7489013671875, "loss": 0.4944, "rewards/accuracies": 0.25, "rewards/chosen": -0.013625269755721092, "rewards/margins": -0.03877229616045952, "rewards/rejected": 0.025147024542093277, "step": 17 }, { "debug/policy_chosen_logits": -0.9488164186477661, "debug/policy_chosen_logps": -158.7830047607422, "debug/policy_rejected_logits": -0.6659660935401917, "debug/policy_rejected_logps": -177.89169311523438, "debug/reference_chosen_logps": -160.00875854492188, "debug/reference_rejected_logps": -178.63717651367188, "epoch": 0.47368421052631576, "grad_norm": 12.509178210406022, "learning_rate": 1e-06, "logits/chosen": -0.9488164186477661, "logits/rejected": -0.6659660935401917, "logps/chosen": -158.7830047607422, "logps/rejected": -177.89169311523438, "loss": 0.5023, "rewards/accuracies": 0.625, "rewards/chosen": 0.012257632799446583, "rewards/margins": 0.004802837502211332, "rewards/rejected": 0.007454794831573963, "step": 18 }, { "debug/policy_chosen_logits": -1.0241601467132568, "debug/policy_chosen_logps": -146.33169555664062, "debug/policy_rejected_logits": -0.9203600883483887, "debug/policy_rejected_logps": -185.3747100830078, "debug/reference_chosen_logps": -144.0775146484375, "debug/reference_rejected_logps": -182.81109619140625, "epoch": 0.5, "grad_norm": 14.193595912188917, "learning_rate": 1e-06, "logits/chosen": -1.0241601467132568, "logits/rejected": -0.9203600883483887, "logps/chosen": -146.33169555664062, "logps/rejected": -185.3747100830078, "loss": 0.4921, "rewards/accuracies": 0.625, "rewards/chosen": -0.022541627287864685, "rewards/margins": 0.003094470128417015, "rewards/rejected": -0.02563609927892685, "step": 19 }, { "debug/policy_chosen_logits": -0.7719177603721619, "debug/policy_chosen_logps": -170.9295196533203, "debug/policy_rejected_logits": -0.8293173909187317, "debug/policy_rejected_logps": -213.50628662109375, "debug/reference_chosen_logps": -173.196044921875, "debug/reference_rejected_logps": -216.7070770263672, "epoch": 0.5263157894736842, "grad_norm": 12.254973925319538, "learning_rate": 1e-06, "logits/chosen": -0.7719177603721619, "logits/rejected": -0.8293173909187317, "logps/chosen": -170.9295196533203, "logps/rejected": -213.50628662109375, "loss": 0.5029, "rewards/accuracies": 0.25, "rewards/chosen": 0.02266528084874153, "rewards/margins": -0.009342546574771404, "rewards/rejected": 0.03200782835483551, "step": 20 }, { "debug/policy_chosen_logits": -0.8476613163948059, "debug/policy_chosen_logps": -176.2569122314453, "debug/policy_rejected_logits": -0.8481642007827759, "debug/policy_rejected_logps": -157.81607055664062, "debug/reference_chosen_logps": -175.54322814941406, "debug/reference_rejected_logps": -157.49563598632812, "epoch": 0.5526315789473685, "grad_norm": 13.463741700626029, "learning_rate": 1e-06, "logits/chosen": -0.8476613163948059, "logits/rejected": -0.8481642007827759, "logps/chosen": -176.2569122314453, "logps/rejected": -157.81607055664062, "loss": 0.5035, "rewards/accuracies": 0.375, "rewards/chosen": -0.007136850152164698, "rewards/margins": -0.00393272377550602, "rewards/rejected": -0.0032041254453361034, "step": 21 }, { "debug/policy_chosen_logits": -0.9509191513061523, "debug/policy_chosen_logps": -163.81971740722656, "debug/policy_rejected_logits": -1.03162682056427, "debug/policy_rejected_logps": -166.19873046875, "debug/reference_chosen_logps": -163.58975219726562, "debug/reference_rejected_logps": -163.93687438964844, "epoch": 0.5789473684210527, "grad_norm": 12.346043910937565, "learning_rate": 1e-06, "logits/chosen": -0.9509191513061523, "logits/rejected": -1.03162682056427, "logps/chosen": -163.81971740722656, "logps/rejected": -166.19873046875, "loss": 0.5033, "rewards/accuracies": 0.5, "rewards/chosen": -0.0022995760664343834, "rewards/margins": 0.020319033414125443, "rewards/rejected": -0.02261860854923725, "step": 22 }, { "debug/policy_chosen_logits": -0.747829794883728, "debug/policy_chosen_logps": -205.87387084960938, "debug/policy_rejected_logits": -0.7931165099143982, "debug/policy_rejected_logps": -202.55450439453125, "debug/reference_chosen_logps": -207.89743041992188, "debug/reference_rejected_logps": -202.11566162109375, "epoch": 0.6052631578947368, "grad_norm": 13.406646451993275, "learning_rate": 1e-06, "logits/chosen": -0.747829794883728, "logits/rejected": -0.7931165099143982, "logps/chosen": -205.87387084960938, "logps/rejected": -202.55450439453125, "loss": 0.5008, "rewards/accuracies": 0.75, "rewards/chosen": 0.020235728472471237, "rewards/margins": 0.024624040350317955, "rewards/rejected": -0.004388311877846718, "step": 23 }, { "debug/policy_chosen_logits": -0.7797695994377136, "debug/policy_chosen_logps": -183.11962890625, "debug/policy_rejected_logits": -0.8020097017288208, "debug/policy_rejected_logps": -179.82424926757812, "debug/reference_chosen_logps": -182.20074462890625, "debug/reference_rejected_logps": -179.76992797851562, "epoch": 0.631578947368421, "grad_norm": 13.008691124944571, "learning_rate": 1e-06, "logits/chosen": -0.7797695994377136, "logits/rejected": -0.8020097017288208, "logps/chosen": -183.11962890625, "logps/rejected": -179.82424926757812, "loss": 0.5058, "rewards/accuracies": 0.375, "rewards/chosen": -0.00918867252767086, "rewards/margins": -0.008645400404930115, "rewards/rejected": -0.0005432693287730217, "step": 24 }, { "debug/policy_chosen_logits": -0.9057678580284119, "debug/policy_chosen_logps": -176.46397399902344, "debug/policy_rejected_logits": -0.7624039649963379, "debug/policy_rejected_logps": -191.3661346435547, "debug/reference_chosen_logps": -175.11502075195312, "debug/reference_rejected_logps": -189.20126342773438, "epoch": 0.6578947368421053, "grad_norm": 12.793107149155997, "learning_rate": 1e-06, "logits/chosen": -0.9057678580284119, "logits/rejected": -0.7624039649963379, "logps/chosen": -176.46397399902344, "logps/rejected": -191.3661346435547, "loss": 0.4976, "rewards/accuracies": 0.75, "rewards/chosen": -0.01348949410021305, "rewards/margins": 0.008159112185239792, "rewards/rejected": -0.021648606285452843, "step": 25 }, { "debug/policy_chosen_logits": -0.8435552716255188, "debug/policy_chosen_logps": -168.30003356933594, "debug/policy_rejected_logits": -0.8235043883323669, "debug/policy_rejected_logps": -181.6990203857422, "debug/reference_chosen_logps": -170.16717529296875, "debug/reference_rejected_logps": -178.15789794921875, "epoch": 0.6842105263157895, "grad_norm": 16.18283932437879, "learning_rate": 1e-06, "logits/chosen": -0.8435552716255188, "logits/rejected": -0.8235043883323669, "logps/chosen": -168.30003356933594, "logps/rejected": -181.6990203857422, "loss": 0.5103, "rewards/accuracies": 0.75, "rewards/chosen": 0.018671445548534393, "rewards/margins": 0.05408259481191635, "rewards/rejected": -0.03541114926338196, "step": 26 }, { "debug/policy_chosen_logits": -0.89864182472229, "debug/policy_chosen_logps": -190.04278564453125, "debug/policy_rejected_logits": -0.765967845916748, "debug/policy_rejected_logps": -206.12942504882812, "debug/reference_chosen_logps": -186.39132690429688, "debug/reference_rejected_logps": -204.19808959960938, "epoch": 0.7105263157894737, "grad_norm": 11.950723408804826, "learning_rate": 1e-06, "logits/chosen": -0.89864182472229, "logits/rejected": -0.765967845916748, "logps/chosen": -190.04278564453125, "logps/rejected": -206.12942504882812, "loss": 0.4956, "rewards/accuracies": 0.25, "rewards/chosen": -0.0365147590637207, "rewards/margins": -0.01720167137682438, "rewards/rejected": -0.019313087686896324, "step": 27 }, { "debug/policy_chosen_logits": -0.9197551608085632, "debug/policy_chosen_logps": -159.85580444335938, "debug/policy_rejected_logits": -0.8922968506813049, "debug/policy_rejected_logps": -154.68045043945312, "debug/reference_chosen_logps": -157.56961059570312, "debug/reference_rejected_logps": -153.11502075195312, "epoch": 0.7368421052631579, "grad_norm": 11.128732665194095, "learning_rate": 1e-06, "logits/chosen": -0.9197551608085632, "logits/rejected": -0.8922968506813049, "logps/chosen": -159.85580444335938, "logps/rejected": -154.68045043945312, "loss": 0.491, "rewards/accuracies": 0.375, "rewards/chosen": -0.022861871868371964, "rewards/margins": -0.007207621354609728, "rewards/rejected": -0.015654249116778374, "step": 28 }, { "debug/policy_chosen_logits": -0.9484947919845581, "debug/policy_chosen_logps": -152.24813842773438, "debug/policy_rejected_logits": -0.8805264234542847, "debug/policy_rejected_logps": -168.650634765625, "debug/reference_chosen_logps": -152.11996459960938, "debug/reference_rejected_logps": -169.3289794921875, "epoch": 0.7631578947368421, "grad_norm": 12.54768054735427, "learning_rate": 1e-06, "logits/chosen": -0.9484947919845581, "logits/rejected": -0.8805264234542847, "logps/chosen": -152.24813842773438, "logps/rejected": -168.650634765625, "loss": 0.5024, "rewards/accuracies": 0.375, "rewards/chosen": -0.001281691249459982, "rewards/margins": -0.008065233007073402, "rewards/rejected": 0.0067835417576134205, "step": 29 }, { "debug/policy_chosen_logits": -0.9098101258277893, "debug/policy_chosen_logps": -144.8257598876953, "debug/policy_rejected_logits": -0.9026345014572144, "debug/policy_rejected_logps": -178.5587921142578, "debug/reference_chosen_logps": -146.36148071289062, "debug/reference_rejected_logps": -177.2120361328125, "epoch": 0.7894736842105263, "grad_norm": 11.052110219658221, "learning_rate": 1e-06, "logits/chosen": -0.9098101258277893, "logits/rejected": -0.9026345014572144, "logps/chosen": -144.8257598876953, "logps/rejected": -178.5587921142578, "loss": 0.4892, "rewards/accuracies": 0.75, "rewards/chosen": 0.015357255935668945, "rewards/margins": 0.028824787586927414, "rewards/rejected": -0.013467530719935894, "step": 30 }, { "debug/policy_chosen_logits": -0.8571977615356445, "debug/policy_chosen_logps": -149.52532958984375, "debug/policy_rejected_logits": -0.8092418313026428, "debug/policy_rejected_logps": -160.9595947265625, "debug/reference_chosen_logps": -151.14004516601562, "debug/reference_rejected_logps": -160.58103942871094, "epoch": 0.8157894736842105, "grad_norm": 11.744991657971045, "learning_rate": 1e-06, "logits/chosen": -0.8571977615356445, "logits/rejected": -0.8092418313026428, "logps/chosen": -149.52532958984375, "logps/rejected": -160.9595947265625, "loss": 0.5024, "rewards/accuracies": 0.625, "rewards/chosen": 0.016147155314683914, "rewards/margins": 0.019932862371206284, "rewards/rejected": -0.00378570519387722, "step": 31 }, { "debug/policy_chosen_logits": -0.8337549567222595, "debug/policy_chosen_logps": -145.87689208984375, "debug/policy_rejected_logits": -0.774303138256073, "debug/policy_rejected_logps": -159.3026123046875, "debug/reference_chosen_logps": -147.57308959960938, "debug/reference_rejected_logps": -158.9627685546875, "epoch": 0.8421052631578947, "grad_norm": 10.910880747605056, "learning_rate": 1e-06, "logits/chosen": -0.8337549567222595, "logits/rejected": -0.774303138256073, "logps/chosen": -145.87689208984375, "logps/rejected": -159.3026123046875, "loss": 0.491, "rewards/accuracies": 0.75, "rewards/chosen": 0.01696179248392582, "rewards/margins": 0.020359963178634644, "rewards/rejected": -0.0033981711603701115, "step": 32 }, { "debug/policy_chosen_logits": -0.9494245648384094, "debug/policy_chosen_logps": -162.24545288085938, "debug/policy_rejected_logits": -1.0418776273727417, "debug/policy_rejected_logps": -194.7138671875, "debug/reference_chosen_logps": -162.0795135498047, "debug/reference_rejected_logps": -193.1683807373047, "epoch": 0.868421052631579, "grad_norm": 10.621881858417531, "learning_rate": 1e-06, "logits/chosen": -0.9494245648384094, "logits/rejected": -1.0418776273727417, "logps/chosen": -162.24545288085938, "logps/rejected": -194.7138671875, "loss": 0.5043, "rewards/accuracies": 0.75, "rewards/chosen": -0.0016593635082244873, "rewards/margins": 0.013795491307973862, "rewards/rejected": -0.015454854816198349, "step": 33 }, { "debug/policy_chosen_logits": -0.7892115712165833, "debug/policy_chosen_logps": -153.41824340820312, "debug/policy_rejected_logits": -0.7181702852249146, "debug/policy_rejected_logps": -175.0258026123047, "debug/reference_chosen_logps": -152.8340606689453, "debug/reference_rejected_logps": -174.9475555419922, "epoch": 0.8947368421052632, "grad_norm": 11.510187964278074, "learning_rate": 1e-06, "logits/chosen": -0.7892115712165833, "logits/rejected": -0.7181702852249146, "logps/chosen": -153.41824340820312, "logps/rejected": -175.0258026123047, "loss": 0.4869, "rewards/accuracies": 0.625, "rewards/chosen": -0.005841732025146484, "rewards/margins": -0.005059261806309223, "rewards/rejected": -0.0007824706844985485, "step": 34 }, { "debug/policy_chosen_logits": -0.6696067452430725, "debug/policy_chosen_logps": -184.62283325195312, "debug/policy_rejected_logits": -0.6000730395317078, "debug/policy_rejected_logps": -208.19822692871094, "debug/reference_chosen_logps": -184.1222686767578, "debug/reference_rejected_logps": -207.0794677734375, "epoch": 0.9210526315789473, "grad_norm": 10.959485144211554, "learning_rate": 1e-06, "logits/chosen": -0.6696067452430725, "logits/rejected": -0.6000730395317078, "logps/chosen": -184.62283325195312, "logps/rejected": -208.19822692871094, "loss": 0.4963, "rewards/accuracies": 0.75, "rewards/chosen": -0.005005750805139542, "rewards/margins": 0.006181859411299229, "rewards/rejected": -0.011187611147761345, "step": 35 }, { "debug/policy_chosen_logits": -0.8848521113395691, "debug/policy_chosen_logps": -160.16763305664062, "debug/policy_rejected_logits": -0.8665605187416077, "debug/policy_rejected_logps": -213.43040466308594, "debug/reference_chosen_logps": -160.1614990234375, "debug/reference_rejected_logps": -211.8297882080078, "epoch": 0.9473684210526315, "grad_norm": 11.865862578065203, "learning_rate": 1e-06, "logits/chosen": -0.8848521113395691, "logits/rejected": -0.8665605187416077, "logps/chosen": -160.16763305664062, "logps/rejected": -213.43040466308594, "loss": 0.5022, "rewards/accuracies": 0.625, "rewards/chosen": -6.138812750577927e-05, "rewards/margins": 0.015944700688123703, "rewards/rejected": -0.016006087884306908, "step": 36 }, { "debug/policy_chosen_logits": -0.6179525256156921, "debug/policy_chosen_logps": -215.09716796875, "debug/policy_rejected_logits": -0.8251385688781738, "debug/policy_rejected_logps": -191.83261108398438, "debug/reference_chosen_logps": -214.5953369140625, "debug/reference_rejected_logps": -192.8199462890625, "epoch": 0.9736842105263158, "grad_norm": 12.158709477313657, "learning_rate": 1e-06, "logits/chosen": -0.6179525256156921, "logits/rejected": -0.8251385688781738, "logps/chosen": -215.09716796875, "logps/rejected": -191.83261108398438, "loss": 0.4844, "rewards/accuracies": 0.375, "rewards/chosen": -0.005018271971493959, "rewards/margins": -0.014891558326780796, "rewards/rejected": 0.00987328588962555, "step": 37 }, { "debug/policy_chosen_logits": -0.794685959815979, "debug/policy_chosen_logps": -220.12693786621094, "debug/policy_rejected_logits": -0.8812568187713623, "debug/policy_rejected_logps": -166.08612060546875, "debug/reference_chosen_logps": -219.10116577148438, "debug/reference_rejected_logps": -165.18740844726562, "epoch": 1.0, "grad_norm": 10.941646073739074, "learning_rate": 1e-06, "logits/chosen": -0.794685959815979, "logits/rejected": -0.8812568187713623, "logps/chosen": -220.12693786621094, "logps/rejected": -166.08612060546875, "loss": 0.4589, "rewards/accuracies": 0.375, "rewards/chosen": -0.010257730260491371, "rewards/margins": -0.001270495355129242, "rewards/rejected": -0.00898723490536213, "step": 38 }, { "epoch": 1.0, "step": 38, "total_flos": 0.0, "train_loss": 0.49694290051334783, "train_runtime": 142.4117, "train_samples_per_second": 16.923, "train_steps_per_second": 0.267 } ], "logging_steps": 1, "max_steps": 38, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }