{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 64, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -1.0596340894699097, "debug/policy_chosen_logps": -179.04273986816406, "debug/policy_rejected_logits": -1.1748394966125488, "debug/policy_rejected_logps": -295.01690673828125, "debug/reference_chosen_logps": -179.04273986816406, "debug/reference_rejected_logps": -295.01690673828125, "epoch": 0.015625, "grad_norm": 52.30319105460711, "learning_rate": 1e-06, "logits/chosen": -1.0596340894699097, "logits/rejected": -1.1748394966125488, "logps/chosen": -179.04273986816406, "logps/rejected": -295.01690673828125, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -1.1150486469268799, "debug/policy_chosen_logps": -124.63790893554688, "debug/policy_rejected_logits": -1.0623761415481567, "debug/policy_rejected_logps": -270.75244140625, "debug/reference_chosen_logps": -125.14633178710938, "debug/reference_rejected_logps": -271.20208740234375, "epoch": 0.03125, "grad_norm": 29.130704023833047, "learning_rate": 1e-06, "logits/chosen": -1.1150486469268799, "logits/rejected": -1.0623761415481567, "logps/chosen": -124.63790893554688, "logps/rejected": -270.75244140625, "loss": 0.4989, "rewards/accuracies": 0.375, "rewards/chosen": 0.005084190517663956, "rewards/margins": 0.0005879019154235721, "rewards/rejected": 0.0044962880201637745, "step": 2 }, { "debug/policy_chosen_logits": -1.1071562767028809, "debug/policy_chosen_logps": -136.3170166015625, "debug/policy_rejected_logits": -1.1613606214523315, "debug/policy_rejected_logps": -268.709228515625, "debug/reference_chosen_logps": -137.68783569335938, "debug/reference_rejected_logps": -268.8507995605469, "epoch": 0.046875, "grad_norm": 24.965184935253273, "learning_rate": 1e-06, "logits/chosen": -1.1071562767028809, "logits/rejected": -1.1613606214523315, "logps/chosen": -136.3170166015625, "logps/rejected": -268.709228515625, "loss": 0.4914, "rewards/accuracies": 0.75, "rewards/chosen": 0.013708190061151981, "rewards/margins": 0.012292098253965378, "rewards/rejected": 0.0014160918071866035, "step": 3 }, { "debug/policy_chosen_logits": -1.066061019897461, "debug/policy_chosen_logps": -153.8428192138672, "debug/policy_rejected_logits": -1.1866570711135864, "debug/policy_rejected_logps": -274.9277648925781, "debug/reference_chosen_logps": -155.69000244140625, "debug/reference_rejected_logps": -275.12884521484375, "epoch": 0.0625, "grad_norm": 24.49810670915077, "learning_rate": 1e-06, "logits/chosen": -1.066061019897461, "logits/rejected": -1.1866570711135864, "logps/chosen": -153.8428192138672, "logps/rejected": -274.9277648925781, "loss": 0.4777, "rewards/accuracies": 0.625, "rewards/chosen": 0.018471689894795418, "rewards/margins": 0.016461096704006195, "rewards/rejected": 0.0020105931907892227, "step": 4 }, { "debug/policy_chosen_logits": -1.0783909559249878, "debug/policy_chosen_logps": -161.8551483154297, "debug/policy_rejected_logits": -1.1809625625610352, "debug/policy_rejected_logps": -291.5763244628906, "debug/reference_chosen_logps": -165.77706909179688, "debug/reference_rejected_logps": -290.215087890625, "epoch": 0.078125, "grad_norm": 22.066344534464825, "learning_rate": 1e-06, "logits/chosen": -1.0783909559249878, "logits/rejected": -1.1809625625610352, "logps/chosen": -161.8551483154297, "logps/rejected": -291.5763244628906, "loss": 0.4425, "rewards/accuracies": 0.625, "rewards/chosen": 0.039219196885824203, "rewards/margins": 0.05283135548233986, "rewards/rejected": -0.013612156733870506, "step": 5 }, { "debug/policy_chosen_logits": -1.0005463361740112, "debug/policy_chosen_logps": -177.85003662109375, "debug/policy_rejected_logits": -1.0288403034210205, "debug/policy_rejected_logps": -263.21014404296875, "debug/reference_chosen_logps": -178.246337890625, "debug/reference_rejected_logps": -263.5099182128906, "epoch": 0.09375, "grad_norm": 41.16778948079108, "learning_rate": 1e-06, "logits/chosen": -1.0005463361740112, "logits/rejected": -1.0288403034210205, "logps/chosen": -177.85003662109375, "logps/rejected": -263.21014404296875, "loss": 0.4659, "rewards/accuracies": 0.625, "rewards/chosen": 0.003962935879826546, "rewards/margins": 0.0009648129343986511, "rewards/rejected": 0.0029981210827827454, "step": 6 }, { "debug/policy_chosen_logits": -0.9317433834075928, "debug/policy_chosen_logps": -155.7017822265625, "debug/policy_rejected_logits": -1.3209773302078247, "debug/policy_rejected_logps": -308.2155456542969, "debug/reference_chosen_logps": -165.14569091796875, "debug/reference_rejected_logps": -295.0081481933594, "epoch": 0.109375, "grad_norm": 15.904262612549944, "learning_rate": 1e-06, "logits/chosen": -0.9317433834075928, "logits/rejected": -1.3209773302078247, "logps/chosen": -155.7017822265625, "logps/rejected": -308.2155456542969, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 0.0944390594959259, "rewards/margins": 0.22651299834251404, "rewards/rejected": -0.13207395374774933, "step": 7 }, { "debug/policy_chosen_logits": -1.0539729595184326, "debug/policy_chosen_logps": -173.66781616210938, "debug/policy_rejected_logits": -1.0206472873687744, "debug/policy_rejected_logps": -271.9588317871094, "debug/reference_chosen_logps": -177.30899047851562, "debug/reference_rejected_logps": -263.9579162597656, "epoch": 0.125, "grad_norm": 15.028091497342194, "learning_rate": 1e-06, "logits/chosen": -1.0539729595184326, "logits/rejected": -1.0206472873687744, "logps/chosen": -173.66781616210938, "logps/rejected": -271.9588317871094, "loss": 0.4255, "rewards/accuracies": 0.875, "rewards/chosen": 0.036411646753549576, "rewards/margins": 0.1164209246635437, "rewards/rejected": -0.08000928163528442, "step": 8 }, { "debug/policy_chosen_logits": -0.9866081476211548, "debug/policy_chosen_logps": -176.56866455078125, "debug/policy_rejected_logits": -0.9740838408470154, "debug/policy_rejected_logps": -272.35650634765625, "debug/reference_chosen_logps": -177.0741729736328, "debug/reference_rejected_logps": -260.4818420410156, "epoch": 0.140625, "grad_norm": 32.8906220838234, "learning_rate": 1e-06, "logits/chosen": -0.9866081476211548, "logits/rejected": -0.9740838408470154, "logps/chosen": -176.56866455078125, "logps/rejected": -272.35650634765625, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 0.005055226851254702, "rewards/margins": 0.12380212545394897, "rewards/rejected": -0.11874689161777496, "step": 9 }, { "debug/policy_chosen_logits": -0.9441277980804443, "debug/policy_chosen_logps": -164.24789428710938, "debug/policy_rejected_logits": -1.1364271640777588, "debug/policy_rejected_logps": -292.0938720703125, "debug/reference_chosen_logps": -160.7564697265625, "debug/reference_rejected_logps": -257.1752014160156, "epoch": 0.15625, "grad_norm": 28.811843166780264, "learning_rate": 1e-06, "logits/chosen": -0.9441277980804443, "logits/rejected": -1.1364271640777588, "logps/chosen": -164.24789428710938, "logps/rejected": -292.0938720703125, "loss": 0.4237, "rewards/accuracies": 0.875, "rewards/chosen": -0.03491419926285744, "rewards/margins": 0.3142724931240082, "rewards/rejected": -0.3491867184638977, "step": 10 }, { "debug/policy_chosen_logits": -0.9175143837928772, "debug/policy_chosen_logps": -214.65664672851562, "debug/policy_rejected_logits": -1.1515822410583496, "debug/policy_rejected_logps": -244.6530303955078, "debug/reference_chosen_logps": -207.79930114746094, "debug/reference_rejected_logps": -230.90333557128906, "epoch": 0.171875, "grad_norm": 38.46428758925275, "learning_rate": 1e-06, "logits/chosen": -0.9175143837928772, "logits/rejected": -1.1515822410583496, "logps/chosen": -214.65664672851562, "logps/rejected": -244.6530303955078, "loss": 0.4949, "rewards/accuracies": 0.5, "rewards/chosen": -0.06857340782880783, "rewards/margins": 0.06892354786396027, "rewards/rejected": -0.1374969631433487, "step": 11 }, { "debug/policy_chosen_logits": -0.8965519070625305, "debug/policy_chosen_logps": -153.26284790039062, "debug/policy_rejected_logits": -1.1321805715560913, "debug/policy_rejected_logps": -318.78076171875, "debug/reference_chosen_logps": -154.14707946777344, "debug/reference_rejected_logps": -289.067138671875, "epoch": 0.1875, "grad_norm": 41.433140559474445, "learning_rate": 1e-06, "logits/chosen": -0.8965519070625305, "logits/rejected": -1.1321805715560913, "logps/chosen": -153.26284790039062, "logps/rejected": -318.78076171875, "loss": 0.4907, "rewards/accuracies": 0.75, "rewards/chosen": 0.008842326700687408, "rewards/margins": 0.30597835779190063, "rewards/rejected": -0.2971360683441162, "step": 12 }, { "debug/policy_chosen_logits": -0.945601761341095, "debug/policy_chosen_logps": -122.90229797363281, "debug/policy_rejected_logits": -1.0716924667358398, "debug/policy_rejected_logps": -274.2931823730469, "debug/reference_chosen_logps": -120.32145690917969, "debug/reference_rejected_logps": -250.55557250976562, "epoch": 0.203125, "grad_norm": 26.79881614435138, "learning_rate": 1e-06, "logits/chosen": -0.945601761341095, "logits/rejected": -1.0716924667358398, "logps/chosen": -122.90229797363281, "logps/rejected": -274.2931823730469, "loss": 0.4694, "rewards/accuracies": 0.625, "rewards/chosen": -0.02580837532877922, "rewards/margins": 0.2115677297115326, "rewards/rejected": -0.2373761087656021, "step": 13 }, { "debug/policy_chosen_logits": -1.0047388076782227, "debug/policy_chosen_logps": -200.4830780029297, "debug/policy_rejected_logits": -1.1980981826782227, "debug/policy_rejected_logps": -315.792236328125, "debug/reference_chosen_logps": -190.80075073242188, "debug/reference_rejected_logps": -281.5347595214844, "epoch": 0.21875, "grad_norm": 27.316365360407435, "learning_rate": 1e-06, "logits/chosen": -1.0047388076782227, "logits/rejected": -1.1980981826782227, "logps/chosen": -200.4830780029297, "logps/rejected": -315.792236328125, "loss": 0.4324, "rewards/accuracies": 0.5, "rewards/chosen": -0.09682333469390869, "rewards/margins": 0.24575121700763702, "rewards/rejected": -0.3425745368003845, "step": 14 }, { "debug/policy_chosen_logits": -1.1348706483840942, "debug/policy_chosen_logps": -208.73074340820312, "debug/policy_rejected_logits": -1.121549129486084, "debug/policy_rejected_logps": -310.7353210449219, "debug/reference_chosen_logps": -204.0843048095703, "debug/reference_rejected_logps": -281.996337890625, "epoch": 0.234375, "grad_norm": 58.18504208169894, "learning_rate": 1e-06, "logits/chosen": -1.1348706483840942, "logits/rejected": -1.121549129486084, "logps/chosen": -208.73074340820312, "logps/rejected": -310.7353210449219, "loss": 0.4662, "rewards/accuracies": 0.75, "rewards/chosen": -0.04646441712975502, "rewards/margins": 0.24092541635036469, "rewards/rejected": -0.2873898148536682, "step": 15 }, { "debug/policy_chosen_logits": -0.9974825978279114, "debug/policy_chosen_logps": -154.0273895263672, "debug/policy_rejected_logits": -1.1503194570541382, "debug/policy_rejected_logps": -307.7276611328125, "debug/reference_chosen_logps": -154.69586181640625, "debug/reference_rejected_logps": -273.1531677246094, "epoch": 0.25, "grad_norm": 56.48600158612175, "learning_rate": 1e-06, "logits/chosen": -0.9974825978279114, "logits/rejected": -1.1503194570541382, "logps/chosen": -154.0273895263672, "logps/rejected": -307.7276611328125, "loss": 0.4093, "rewards/accuracies": 0.875, "rewards/chosen": 0.006684892810881138, "rewards/margins": 0.3524298071861267, "rewards/rejected": -0.34574490785598755, "step": 16 }, { "debug/policy_chosen_logits": -1.0567247867584229, "debug/policy_chosen_logps": -137.61720275878906, "debug/policy_rejected_logits": -1.0961592197418213, "debug/policy_rejected_logps": -313.12060546875, "debug/reference_chosen_logps": -135.6652069091797, "debug/reference_rejected_logps": -297.18695068359375, "epoch": 0.265625, "grad_norm": 31.458528575785774, "learning_rate": 1e-06, "logits/chosen": -1.0567247867584229, "logits/rejected": -1.0961592197418213, "logps/chosen": -137.61720275878906, "logps/rejected": -313.12060546875, "loss": 0.4473, "rewards/accuracies": 0.875, "rewards/chosen": -0.01951989158987999, "rewards/margins": 0.13981682062149048, "rewards/rejected": -0.15933671593666077, "step": 17 }, { "debug/policy_chosen_logits": -1.0550764799118042, "debug/policy_chosen_logps": -143.5434112548828, "debug/policy_rejected_logits": -1.3183400630950928, "debug/policy_rejected_logps": -359.35418701171875, "debug/reference_chosen_logps": -157.90188598632812, "debug/reference_rejected_logps": -317.474853515625, "epoch": 0.28125, "grad_norm": 24.003283570475016, "learning_rate": 1e-06, "logits/chosen": -1.0550764799118042, "logits/rejected": -1.3183400630950928, "logps/chosen": -143.5434112548828, "logps/rejected": -359.35418701171875, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 0.14358465373516083, "rewards/margins": 0.5623779296875, "rewards/rejected": -0.418793261051178, "step": 18 }, { "debug/policy_chosen_logits": -1.1249719858169556, "debug/policy_chosen_logps": -163.00375366210938, "debug/policy_rejected_logits": -1.1258165836334229, "debug/policy_rejected_logps": -283.2430725097656, "debug/reference_chosen_logps": -166.72418212890625, "debug/reference_rejected_logps": -264.2232360839844, "epoch": 0.296875, "grad_norm": 41.98096605753313, "learning_rate": 1e-06, "logits/chosen": -1.1249719858169556, "logits/rejected": -1.1258165836334229, "logps/chosen": -163.00375366210938, "logps/rejected": -283.2430725097656, "loss": 0.4597, "rewards/accuracies": 0.75, "rewards/chosen": 0.03720443695783615, "rewards/margins": 0.22740286588668823, "rewards/rejected": -0.19019843637943268, "step": 19 }, { "debug/policy_chosen_logits": -1.000652551651001, "debug/policy_chosen_logps": -174.8540802001953, "debug/policy_rejected_logits": -1.075732946395874, "debug/policy_rejected_logps": -248.3970947265625, "debug/reference_chosen_logps": -179.03424072265625, "debug/reference_rejected_logps": -235.50778198242188, "epoch": 0.3125, "grad_norm": 26.892461198324778, "learning_rate": 1e-06, "logits/chosen": -1.000652551651001, "logits/rejected": -1.075732946395874, "logps/chosen": -174.8540802001953, "logps/rejected": -248.3970947265625, "loss": 0.4325, "rewards/accuracies": 0.75, "rewards/chosen": 0.041801512241363525, "rewards/margins": 0.1706947386264801, "rewards/rejected": -0.12889322638511658, "step": 20 }, { "debug/policy_chosen_logits": -1.0587340593338013, "debug/policy_chosen_logps": -148.18423461914062, "debug/policy_rejected_logits": -1.435739278793335, "debug/policy_rejected_logps": -331.1427001953125, "debug/reference_chosen_logps": -151.2082061767578, "debug/reference_rejected_logps": -314.77117919921875, "epoch": 0.328125, "grad_norm": 15.800648562809261, "learning_rate": 1e-06, "logits/chosen": -1.0587340593338013, "logits/rejected": -1.435739278793335, "logps/chosen": -148.18423461914062, "logps/rejected": -331.1427001953125, "loss": 0.3982, "rewards/accuracies": 0.875, "rewards/chosen": 0.030239801853895187, "rewards/margins": 0.1939551830291748, "rewards/rejected": -0.16371536254882812, "step": 21 }, { "debug/policy_chosen_logits": -1.0205200910568237, "debug/policy_chosen_logps": -157.31350708007812, "debug/policy_rejected_logits": -1.0888888835906982, "debug/policy_rejected_logps": -346.0768127441406, "debug/reference_chosen_logps": -161.5574493408203, "debug/reference_rejected_logps": -338.91650390625, "epoch": 0.34375, "grad_norm": 21.49065797596958, "learning_rate": 1e-06, "logits/chosen": -1.0205200910568237, "logits/rejected": -1.0888888835906982, "logps/chosen": -157.31350708007812, "logps/rejected": -346.0768127441406, "loss": 0.4361, "rewards/accuracies": 0.75, "rewards/chosen": 0.042439430952072144, "rewards/margins": 0.11404269933700562, "rewards/rejected": -0.07160326838493347, "step": 22 }, { "debug/policy_chosen_logits": -1.1462302207946777, "debug/policy_chosen_logps": -195.76788330078125, "debug/policy_rejected_logits": -1.2484185695648193, "debug/policy_rejected_logps": -277.576904296875, "debug/reference_chosen_logps": -198.74685668945312, "debug/reference_rejected_logps": -265.5393981933594, "epoch": 0.359375, "grad_norm": 17.749863549342045, "learning_rate": 1e-06, "logits/chosen": -1.1462302207946777, "logits/rejected": -1.2484185695648193, "logps/chosen": -195.76788330078125, "logps/rejected": -277.576904296875, "loss": 0.4165, "rewards/accuracies": 0.625, "rewards/chosen": 0.029789581894874573, "rewards/margins": 0.1501646637916565, "rewards/rejected": -0.12037509679794312, "step": 23 }, { "debug/policy_chosen_logits": -0.952358067035675, "debug/policy_chosen_logps": -115.6708984375, "debug/policy_rejected_logits": -1.036898136138916, "debug/policy_rejected_logps": -245.47000122070312, "debug/reference_chosen_logps": -131.1976776123047, "debug/reference_rejected_logps": -238.638427734375, "epoch": 0.375, "grad_norm": 16.031924320507283, "learning_rate": 1e-06, "logits/chosen": -0.952358067035675, "logits/rejected": -1.036898136138916, "logps/chosen": -115.6708984375, "logps/rejected": -245.47000122070312, "loss": 0.3771, "rewards/accuracies": 0.875, "rewards/chosen": 0.1552678644657135, "rewards/margins": 0.22358371317386627, "rewards/rejected": -0.06831584870815277, "step": 24 }, { "debug/policy_chosen_logits": -1.070897102355957, "debug/policy_chosen_logps": -178.87374877929688, "debug/policy_rejected_logits": -1.1623822450637817, "debug/policy_rejected_logps": -243.98184204101562, "debug/reference_chosen_logps": -179.05862426757812, "debug/reference_rejected_logps": -244.07818603515625, "epoch": 0.390625, "grad_norm": 38.66586744942012, "learning_rate": 1e-06, "logits/chosen": -1.070897102355957, "logits/rejected": -1.1623822450637817, "logps/chosen": -178.87374877929688, "logps/rejected": -243.98184204101562, "loss": 0.4396, "rewards/accuracies": 0.375, "rewards/chosen": 0.001848660409450531, "rewards/margins": 0.0008851997554302216, "rewards/rejected": 0.000963456928730011, "step": 25 }, { "debug/policy_chosen_logits": -1.1025017499923706, "debug/policy_chosen_logps": -173.5986328125, "debug/policy_rejected_logits": -1.1473654508590698, "debug/policy_rejected_logps": -245.47994995117188, "debug/reference_chosen_logps": -186.88778686523438, "debug/reference_rejected_logps": -241.27210998535156, "epoch": 0.40625, "grad_norm": 50.21384448251296, "learning_rate": 1e-06, "logits/chosen": -1.1025017499923706, "logits/rejected": -1.1473654508590698, "logps/chosen": -173.5986328125, "logps/rejected": -245.47994995117188, "loss": 0.4023, "rewards/accuracies": 0.625, "rewards/chosen": 0.13289162516593933, "rewards/margins": 0.1749698668718338, "rewards/rejected": -0.04207824170589447, "step": 26 }, { "debug/policy_chosen_logits": -1.0175386667251587, "debug/policy_chosen_logps": -94.42786407470703, "debug/policy_rejected_logits": -1.1346431970596313, "debug/policy_rejected_logps": -240.36541748046875, "debug/reference_chosen_logps": -106.56871032714844, "debug/reference_rejected_logps": -240.29310607910156, "epoch": 0.421875, "grad_norm": 17.96088818186707, "learning_rate": 1e-06, "logits/chosen": -1.0175386667251587, "logits/rejected": -1.1346431970596313, "logps/chosen": -94.42786407470703, "logps/rejected": -240.36541748046875, "loss": 0.39, "rewards/accuracies": 0.75, "rewards/chosen": 0.12140841782093048, "rewards/margins": 0.12213139981031418, "rewards/rejected": -0.0007229708135128021, "step": 27 }, { "debug/policy_chosen_logits": -1.1408073902130127, "debug/policy_chosen_logps": -126.60142517089844, "debug/policy_rejected_logits": -1.20956289768219, "debug/policy_rejected_logps": -313.8656311035156, "debug/reference_chosen_logps": -130.33799743652344, "debug/reference_rejected_logps": -291.3277893066406, "epoch": 0.4375, "grad_norm": 29.993745130410183, "learning_rate": 1e-06, "logits/chosen": -1.1408073902130127, "logits/rejected": -1.20956289768219, "logps/chosen": -126.60142517089844, "logps/rejected": -313.8656311035156, "loss": 0.3947, "rewards/accuracies": 0.875, "rewards/chosen": 0.03736574202775955, "rewards/margins": 0.2627440094947815, "rewards/rejected": -0.22537828981876373, "step": 28 }, { "debug/policy_chosen_logits": -1.0291798114776611, "debug/policy_chosen_logps": -192.92529296875, "debug/policy_rejected_logits": -1.2137432098388672, "debug/policy_rejected_logps": -315.1015930175781, "debug/reference_chosen_logps": -184.9921875, "debug/reference_rejected_logps": -301.6517639160156, "epoch": 0.453125, "grad_norm": 51.5929899248971, "learning_rate": 1e-06, "logits/chosen": -1.0291798114776611, "logits/rejected": -1.2137432098388672, "logps/chosen": -192.92529296875, "logps/rejected": -315.1015930175781, "loss": 0.4253, "rewards/accuracies": 0.625, "rewards/chosen": -0.0793309286236763, "rewards/margins": 0.05516732484102249, "rewards/rejected": -0.1344982385635376, "step": 29 }, { "debug/policy_chosen_logits": -1.033249020576477, "debug/policy_chosen_logps": -129.13734436035156, "debug/policy_rejected_logits": -1.1481682062149048, "debug/policy_rejected_logps": -319.0918884277344, "debug/reference_chosen_logps": -134.66598510742188, "debug/reference_rejected_logps": -297.1129150390625, "epoch": 0.46875, "grad_norm": 41.13041833853564, "learning_rate": 1e-06, "logits/chosen": -1.033249020576477, "logits/rejected": -1.1481682062149048, "logps/chosen": -129.13734436035156, "logps/rejected": -319.0918884277344, "loss": 0.4069, "rewards/accuracies": 0.875, "rewards/chosen": 0.05528645217418671, "rewards/margins": 0.2750762701034546, "rewards/rejected": -0.2197897881269455, "step": 30 }, { "debug/policy_chosen_logits": -1.1428550481796265, "debug/policy_chosen_logps": -174.7340087890625, "debug/policy_rejected_logits": -1.017913818359375, "debug/policy_rejected_logps": -238.23471069335938, "debug/reference_chosen_logps": -180.0450897216797, "debug/reference_rejected_logps": -228.79031372070312, "epoch": 0.484375, "grad_norm": 54.8216481339695, "learning_rate": 1e-06, "logits/chosen": -1.1428550481796265, "logits/rejected": -1.017913818359375, "logps/chosen": -174.7340087890625, "logps/rejected": -238.23471069335938, "loss": 0.4467, "rewards/accuracies": 0.5, "rewards/chosen": 0.053110986948013306, "rewards/margins": 0.1475549191236496, "rewards/rejected": -0.0944439247250557, "step": 31 }, { "debug/policy_chosen_logits": -1.0067996978759766, "debug/policy_chosen_logps": -145.49851989746094, "debug/policy_rejected_logits": -1.210583209991455, "debug/policy_rejected_logps": -274.90240478515625, "debug/reference_chosen_logps": -151.12542724609375, "debug/reference_rejected_logps": -264.36016845703125, "epoch": 0.5, "grad_norm": 31.943528016300796, "learning_rate": 1e-06, "logits/chosen": -1.0067996978759766, "logits/rejected": -1.210583209991455, "logps/chosen": -145.49851989746094, "logps/rejected": -274.90240478515625, "loss": 0.3966, "rewards/accuracies": 0.625, "rewards/chosen": 0.05626893788576126, "rewards/margins": 0.16169115900993347, "rewards/rejected": -0.10542222112417221, "step": 32 }, { "debug/policy_chosen_logits": -1.1181310415267944, "debug/policy_chosen_logps": -154.81201171875, "debug/policy_rejected_logits": -1.2310353517532349, "debug/policy_rejected_logps": -287.8173828125, "debug/reference_chosen_logps": -170.07876586914062, "debug/reference_rejected_logps": -274.1385498046875, "epoch": 0.515625, "grad_norm": 18.618810659036946, "learning_rate": 1e-06, "logits/chosen": -1.1181310415267944, "logits/rejected": -1.2310353517532349, "logps/chosen": -154.81201171875, "logps/rejected": -287.8173828125, "loss": 0.3581, "rewards/accuracies": 0.875, "rewards/chosen": 0.1526675671339035, "rewards/margins": 0.2894558906555176, "rewards/rejected": -0.13678830862045288, "step": 33 }, { "debug/policy_chosen_logits": -1.0529826879501343, "debug/policy_chosen_logps": -128.18128967285156, "debug/policy_rejected_logits": -1.2277421951293945, "debug/policy_rejected_logps": -326.91705322265625, "debug/reference_chosen_logps": -147.74295043945312, "debug/reference_rejected_logps": -300.6445617675781, "epoch": 0.53125, "grad_norm": 19.76877319971208, "learning_rate": 1e-06, "logits/chosen": -1.0529826879501343, "logits/rejected": -1.2277421951293945, "logps/chosen": -128.18128967285156, "logps/rejected": -326.91705322265625, "loss": 0.3702, "rewards/accuracies": 0.875, "rewards/chosen": 0.19561666250228882, "rewards/margins": 0.4583418369293213, "rewards/rejected": -0.26272517442703247, "step": 34 }, { "debug/policy_chosen_logits": -1.0484968423843384, "debug/policy_chosen_logps": -177.14181518554688, "debug/policy_rejected_logits": -1.0831434726715088, "debug/policy_rejected_logps": -277.63067626953125, "debug/reference_chosen_logps": -184.79954528808594, "debug/reference_rejected_logps": -262.337646484375, "epoch": 0.546875, "grad_norm": 13.405977545151604, "learning_rate": 1e-06, "logits/chosen": -1.0484968423843384, "logits/rejected": -1.0831434726715088, "logps/chosen": -177.14181518554688, "logps/rejected": -277.63067626953125, "loss": 0.3774, "rewards/accuracies": 0.625, "rewards/chosen": 0.07657738029956818, "rewards/margins": 0.22950761020183563, "rewards/rejected": -0.15293022990226746, "step": 35 }, { "debug/policy_chosen_logits": -1.070804476737976, "debug/policy_chosen_logps": -119.32257080078125, "debug/policy_rejected_logits": -1.1960089206695557, "debug/policy_rejected_logps": -257.6097412109375, "debug/reference_chosen_logps": -134.1144561767578, "debug/reference_rejected_logps": -249.19239807128906, "epoch": 0.5625, "grad_norm": 54.78669264655883, "learning_rate": 1e-06, "logits/chosen": -1.070804476737976, "logits/rejected": -1.1960089206695557, "logps/chosen": -119.32257080078125, "logps/rejected": -257.6097412109375, "loss": 0.4202, "rewards/accuracies": 0.875, "rewards/chosen": 0.1479189097881317, "rewards/margins": 0.23209232091903687, "rewards/rejected": -0.08417341113090515, "step": 36 }, { "debug/policy_chosen_logits": -1.0936942100524902, "debug/policy_chosen_logps": -198.59994506835938, "debug/policy_rejected_logits": -1.1287853717803955, "debug/policy_rejected_logps": -267.81048583984375, "debug/reference_chosen_logps": -206.68980407714844, "debug/reference_rejected_logps": -260.12896728515625, "epoch": 0.578125, "grad_norm": 18.618309162410206, "learning_rate": 1e-06, "logits/chosen": -1.0936942100524902, "logits/rejected": -1.1287853717803955, "logps/chosen": -198.59994506835938, "logps/rejected": -267.81048583984375, "loss": 0.3922, "rewards/accuracies": 0.5, "rewards/chosen": 0.08089858293533325, "rewards/margins": 0.15771383047103882, "rewards/rejected": -0.07681524008512497, "step": 37 }, { "debug/policy_chosen_logits": -1.0987818241119385, "debug/policy_chosen_logps": -156.1143798828125, "debug/policy_rejected_logits": -1.016094446182251, "debug/policy_rejected_logps": -280.1226806640625, "debug/reference_chosen_logps": -174.13986206054688, "debug/reference_rejected_logps": -272.59063720703125, "epoch": 0.59375, "grad_norm": 48.92722394829403, "learning_rate": 1e-06, "logits/chosen": -1.0987818241119385, "logits/rejected": -1.016094446182251, "logps/chosen": -156.1143798828125, "logps/rejected": -280.1226806640625, "loss": 0.4235, "rewards/accuracies": 0.875, "rewards/chosen": 0.18025492131710052, "rewards/margins": 0.2555754780769348, "rewards/rejected": -0.07532056421041489, "step": 38 }, { "debug/policy_chosen_logits": -1.1431177854537964, "debug/policy_chosen_logps": -121.45298767089844, "debug/policy_rejected_logits": -1.2573899030685425, "debug/policy_rejected_logps": -243.77618408203125, "debug/reference_chosen_logps": -132.9182891845703, "debug/reference_rejected_logps": -236.6573486328125, "epoch": 0.609375, "grad_norm": 28.262320173832173, "learning_rate": 1e-06, "logits/chosen": -1.1431177854537964, "logits/rejected": -1.2573899030685425, "logps/chosen": -121.45298767089844, "logps/rejected": -243.77618408203125, "loss": 0.3976, "rewards/accuracies": 0.75, "rewards/chosen": 0.11465291678905487, "rewards/margins": 0.18584111332893372, "rewards/rejected": -0.07118818163871765, "step": 39 }, { "debug/policy_chosen_logits": -1.1291528940200806, "debug/policy_chosen_logps": -124.88560485839844, "debug/policy_rejected_logits": -1.1997623443603516, "debug/policy_rejected_logps": -341.6507568359375, "debug/reference_chosen_logps": -145.1587677001953, "debug/reference_rejected_logps": -316.4557189941406, "epoch": 0.625, "grad_norm": 17.205504877297493, "learning_rate": 1e-06, "logits/chosen": -1.1291528940200806, "logits/rejected": -1.1997623443603516, "logps/chosen": -124.88560485839844, "logps/rejected": -341.6507568359375, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 0.202731654047966, "rewards/margins": 0.4546818137168884, "rewards/rejected": -0.25195014476776123, "step": 40 }, { "debug/policy_chosen_logits": -1.119407057762146, "debug/policy_chosen_logps": -155.58392333984375, "debug/policy_rejected_logits": -1.165313720703125, "debug/policy_rejected_logps": -216.57156372070312, "debug/reference_chosen_logps": -161.89459228515625, "debug/reference_rejected_logps": -214.2755126953125, "epoch": 0.640625, "grad_norm": 20.732094832807366, "learning_rate": 1e-06, "logits/chosen": -1.119407057762146, "logits/rejected": -1.165313720703125, "logps/chosen": -155.58392333984375, "logps/rejected": -216.57156372070312, "loss": 0.3763, "rewards/accuracies": 0.75, "rewards/chosen": 0.06310684233903885, "rewards/margins": 0.0860673040151596, "rewards/rejected": -0.02296045981347561, "step": 41 }, { "debug/policy_chosen_logits": -1.2078365087509155, "debug/policy_chosen_logps": -137.1336212158203, "debug/policy_rejected_logits": -1.2154945135116577, "debug/policy_rejected_logps": -227.4922637939453, "debug/reference_chosen_logps": -139.9180145263672, "debug/reference_rejected_logps": -215.813232421875, "epoch": 0.65625, "grad_norm": 34.027873181354636, "learning_rate": 1e-06, "logits/chosen": -1.2078365087509155, "logits/rejected": -1.2154945135116577, "logps/chosen": -137.1336212158203, "logps/rejected": -227.4922637939453, "loss": 0.4182, "rewards/accuracies": 0.75, "rewards/chosen": 0.02784401923418045, "rewards/margins": 0.14463430643081665, "rewards/rejected": -0.1167902946472168, "step": 42 }, { "debug/policy_chosen_logits": -1.0180912017822266, "debug/policy_chosen_logps": -173.8270263671875, "debug/policy_rejected_logits": -1.1830826997756958, "debug/policy_rejected_logps": -286.73638916015625, "debug/reference_chosen_logps": -174.58895874023438, "debug/reference_rejected_logps": -263.51458740234375, "epoch": 0.671875, "grad_norm": 26.885686366047068, "learning_rate": 1e-06, "logits/chosen": -1.0180912017822266, "logits/rejected": -1.1830826997756958, "logps/chosen": -173.8270263671875, "logps/rejected": -286.73638916015625, "loss": 0.3939, "rewards/accuracies": 0.75, "rewards/chosen": 0.007619347423315048, "rewards/margins": 0.2398374080657959, "rewards/rejected": -0.23221805691719055, "step": 43 }, { "debug/policy_chosen_logits": -0.8629423379898071, "debug/policy_chosen_logps": -186.4468994140625, "debug/policy_rejected_logits": -1.196955680847168, "debug/policy_rejected_logps": -291.8290710449219, "debug/reference_chosen_logps": -192.09939575195312, "debug/reference_rejected_logps": -283.04547119140625, "epoch": 0.6875, "grad_norm": 15.341359477798175, "learning_rate": 1e-06, "logits/chosen": -0.8629423379898071, "logits/rejected": -1.196955680847168, "logps/chosen": -186.4468994140625, "logps/rejected": -291.8290710449219, "loss": 0.3941, "rewards/accuracies": 0.875, "rewards/chosen": 0.05652495473623276, "rewards/margins": 0.14436087012290955, "rewards/rejected": -0.08783592283725739, "step": 44 }, { "debug/policy_chosen_logits": -1.1474281549453735, "debug/policy_chosen_logps": -185.1705322265625, "debug/policy_rejected_logits": -1.2113550901412964, "debug/policy_rejected_logps": -299.13165283203125, "debug/reference_chosen_logps": -184.02684020996094, "debug/reference_rejected_logps": -283.3847961425781, "epoch": 0.703125, "grad_norm": 27.424785120293386, "learning_rate": 1e-06, "logits/chosen": -1.1474281549453735, "logits/rejected": -1.2113550901412964, "logps/chosen": -185.1705322265625, "logps/rejected": -299.13165283203125, "loss": 0.4015, "rewards/accuracies": 0.75, "rewards/chosen": -0.011436812579631805, "rewards/margins": 0.1460317075252533, "rewards/rejected": -0.1574685126543045, "step": 45 }, { "debug/policy_chosen_logits": -1.0573773384094238, "debug/policy_chosen_logps": -127.71075439453125, "debug/policy_rejected_logits": -1.0924162864685059, "debug/policy_rejected_logps": -323.93768310546875, "debug/reference_chosen_logps": -139.21630859375, "debug/reference_rejected_logps": -311.1994323730469, "epoch": 0.71875, "grad_norm": 17.144934905131425, "learning_rate": 1e-06, "logits/chosen": -1.0573773384094238, "logits/rejected": -1.0924162864685059, "logps/chosen": -127.71075439453125, "logps/rejected": -323.93768310546875, "loss": 0.3624, "rewards/accuracies": 0.75, "rewards/chosen": 0.11505550146102905, "rewards/margins": 0.24243810772895813, "rewards/rejected": -0.12738259136676788, "step": 46 }, { "debug/policy_chosen_logits": -1.0909616947174072, "debug/policy_chosen_logps": -137.27731323242188, "debug/policy_rejected_logits": -1.2138352394104004, "debug/policy_rejected_logps": -241.8701171875, "debug/reference_chosen_logps": -147.23553466796875, "debug/reference_rejected_logps": -222.49639892578125, "epoch": 0.734375, "grad_norm": 12.93169650628382, "learning_rate": 1e-06, "logits/chosen": -1.0909616947174072, "logits/rejected": -1.2138352394104004, "logps/chosen": -137.27731323242188, "logps/rejected": -241.8701171875, "loss": 0.3217, "rewards/accuracies": 0.875, "rewards/chosen": 0.0995820164680481, "rewards/margins": 0.2933192849159241, "rewards/rejected": -0.19373726844787598, "step": 47 }, { "debug/policy_chosen_logits": -1.096240520477295, "debug/policy_chosen_logps": -232.75778198242188, "debug/policy_rejected_logits": -1.1766290664672852, "debug/policy_rejected_logps": -306.53369140625, "debug/reference_chosen_logps": -230.5318145751953, "debug/reference_rejected_logps": -294.82598876953125, "epoch": 0.75, "grad_norm": 26.099751982850893, "learning_rate": 1e-06, "logits/chosen": -1.096240520477295, "logits/rejected": -1.1766290664672852, "logps/chosen": -232.75778198242188, "logps/rejected": -306.53369140625, "loss": 0.4361, "rewards/accuracies": 0.625, "rewards/chosen": -0.022259674966335297, "rewards/margins": 0.09481699019670486, "rewards/rejected": -0.11707665771245956, "step": 48 }, { "debug/policy_chosen_logits": -1.1644705533981323, "debug/policy_chosen_logps": -166.67062377929688, "debug/policy_rejected_logits": -1.293932557106018, "debug/policy_rejected_logps": -293.45050048828125, "debug/reference_chosen_logps": -175.53598022460938, "debug/reference_rejected_logps": -276.24322509765625, "epoch": 0.765625, "grad_norm": 18.426480334845714, "learning_rate": 1e-06, "logits/chosen": -1.1644705533981323, "logits/rejected": -1.293932557106018, "logps/chosen": -166.67062377929688, "logps/rejected": -293.45050048828125, "loss": 0.4144, "rewards/accuracies": 0.75, "rewards/chosen": 0.08865345269441605, "rewards/margins": 0.2607261538505554, "rewards/rejected": -0.17207267880439758, "step": 49 }, { "debug/policy_chosen_logits": -1.2152189016342163, "debug/policy_chosen_logps": -170.15440368652344, "debug/policy_rejected_logits": -1.2675527334213257, "debug/policy_rejected_logps": -284.37353515625, "debug/reference_chosen_logps": -173.90533447265625, "debug/reference_rejected_logps": -265.96417236328125, "epoch": 0.78125, "grad_norm": 19.567832925259168, "learning_rate": 1e-06, "logits/chosen": -1.2152189016342163, "logits/rejected": -1.2675527334213257, "logps/chosen": -170.15440368652344, "logps/rejected": -284.37353515625, "loss": 0.3895, "rewards/accuracies": 0.75, "rewards/chosen": 0.03750941902399063, "rewards/margins": 0.22160324454307556, "rewards/rejected": -0.18409383296966553, "step": 50 }, { "debug/policy_chosen_logits": -1.1334317922592163, "debug/policy_chosen_logps": -127.97447204589844, "debug/policy_rejected_logits": -1.07590651512146, "debug/policy_rejected_logps": -220.5333251953125, "debug/reference_chosen_logps": -136.77487182617188, "debug/reference_rejected_logps": -219.20693969726562, "epoch": 0.796875, "grad_norm": 37.00007516828202, "learning_rate": 1e-06, "logits/chosen": -1.1334317922592163, "logits/rejected": -1.07590651512146, "logps/chosen": -127.97447204589844, "logps/rejected": -220.5333251953125, "loss": 0.3521, "rewards/accuracies": 0.625, "rewards/chosen": 0.0880039781332016, "rewards/margins": 0.10126776248216629, "rewards/rejected": -0.013263778761029243, "step": 51 }, { "debug/policy_chosen_logits": -1.207089900970459, "debug/policy_chosen_logps": -149.59579467773438, "debug/policy_rejected_logits": -1.3598229885101318, "debug/policy_rejected_logps": -312.65423583984375, "debug/reference_chosen_logps": -160.83349609375, "debug/reference_rejected_logps": -290.1050109863281, "epoch": 0.8125, "grad_norm": 34.43193601355931, "learning_rate": 1e-06, "logits/chosen": -1.207089900970459, "logits/rejected": -1.3598229885101318, "logps/chosen": -149.59579467773438, "logps/rejected": -312.65423583984375, "loss": 0.3701, "rewards/accuracies": 0.875, "rewards/chosen": 0.11237694323062897, "rewards/margins": 0.3378693461418152, "rewards/rejected": -0.22549240291118622, "step": 52 }, { "debug/policy_chosen_logits": -1.0188125371932983, "debug/policy_chosen_logps": -185.8585205078125, "debug/policy_rejected_logits": -1.0791672468185425, "debug/policy_rejected_logps": -251.5456085205078, "debug/reference_chosen_logps": -191.01089477539062, "debug/reference_rejected_logps": -245.11524963378906, "epoch": 0.828125, "grad_norm": 14.59754124103045, "learning_rate": 1e-06, "logits/chosen": -1.0188125371932983, "logits/rejected": -1.0791672468185425, "logps/chosen": -185.8585205078125, "logps/rejected": -251.5456085205078, "loss": 0.3737, "rewards/accuracies": 0.75, "rewards/chosen": 0.051523588597774506, "rewards/margins": 0.1158272996544838, "rewards/rejected": -0.0643036961555481, "step": 53 }, { "debug/policy_chosen_logits": -1.0893926620483398, "debug/policy_chosen_logps": -149.98660278320312, "debug/policy_rejected_logits": -1.0650213956832886, "debug/policy_rejected_logps": -274.9080810546875, "debug/reference_chosen_logps": -158.46145629882812, "debug/reference_rejected_logps": -258.4507141113281, "epoch": 0.84375, "grad_norm": 14.810580428901549, "learning_rate": 1e-06, "logits/chosen": -1.0893926620483398, "logits/rejected": -1.0650213956832886, "logps/chosen": -149.98660278320312, "logps/rejected": -274.9080810546875, "loss": 0.3213, "rewards/accuracies": 0.75, "rewards/chosen": 0.0847485214471817, "rewards/margins": 0.24932223558425903, "rewards/rejected": -0.16457369923591614, "step": 54 }, { "debug/policy_chosen_logits": -1.2362074851989746, "debug/policy_chosen_logps": -102.0992202758789, "debug/policy_rejected_logits": -1.3010079860687256, "debug/policy_rejected_logps": -289.4234313964844, "debug/reference_chosen_logps": -120.96076965332031, "debug/reference_rejected_logps": -275.486083984375, "epoch": 0.859375, "grad_norm": 13.99372317117744, "learning_rate": 1e-06, "logits/chosen": -1.2362074851989746, "logits/rejected": -1.3010079860687256, "logps/chosen": -102.0992202758789, "logps/rejected": -289.4234313964844, "loss": 0.4006, "rewards/accuracies": 1.0, "rewards/chosen": 0.18861553072929382, "rewards/margins": 0.32798925042152405, "rewards/rejected": -0.13937373459339142, "step": 55 }, { "debug/policy_chosen_logits": -1.0959794521331787, "debug/policy_chosen_logps": -176.76089477539062, "debug/policy_rejected_logits": -1.291311264038086, "debug/policy_rejected_logps": -313.87506103515625, "debug/reference_chosen_logps": -185.58998107910156, "debug/reference_rejected_logps": -290.28045654296875, "epoch": 0.875, "grad_norm": 37.669129247782706, "learning_rate": 1e-06, "logits/chosen": -1.0959794521331787, "logits/rejected": -1.291311264038086, "logps/chosen": -176.76089477539062, "logps/rejected": -313.87506103515625, "loss": 0.3289, "rewards/accuracies": 0.625, "rewards/chosen": 0.08829064667224884, "rewards/margins": 0.3242364823818207, "rewards/rejected": -0.23594582080841064, "step": 56 }, { "debug/policy_chosen_logits": -1.0723934173583984, "debug/policy_chosen_logps": -127.6189193725586, "debug/policy_rejected_logits": -1.1941779851913452, "debug/policy_rejected_logps": -263.9356689453125, "debug/reference_chosen_logps": -139.6109161376953, "debug/reference_rejected_logps": -251.62448120117188, "epoch": 0.890625, "grad_norm": 15.916622092420505, "learning_rate": 1e-06, "logits/chosen": -1.0723934173583984, "logits/rejected": -1.1941779851913452, "logps/chosen": -127.6189193725586, "logps/rejected": -263.9356689453125, "loss": 0.3641, "rewards/accuracies": 0.875, "rewards/chosen": 0.11991991102695465, "rewards/margins": 0.24303147196769714, "rewards/rejected": -0.12311156839132309, "step": 57 }, { "debug/policy_chosen_logits": -1.083500862121582, "debug/policy_chosen_logps": -212.94515991210938, "debug/policy_rejected_logits": -1.196679711341858, "debug/policy_rejected_logps": -263.7575378417969, "debug/reference_chosen_logps": -221.95928955078125, "debug/reference_rejected_logps": -267.07586669921875, "epoch": 0.90625, "grad_norm": 26.520012974267605, "learning_rate": 1e-06, "logits/chosen": -1.083500862121582, "logits/rejected": -1.196679711341858, "logps/chosen": -212.94515991210938, "logps/rejected": -263.7575378417969, "loss": 0.4082, "rewards/accuracies": 0.5, "rewards/chosen": 0.09014149010181427, "rewards/margins": 0.05695834010839462, "rewards/rejected": 0.03318314626812935, "step": 58 }, { "debug/policy_chosen_logits": -1.2750979661941528, "debug/policy_chosen_logps": -120.48554229736328, "debug/policy_rejected_logits": -1.2684656381607056, "debug/policy_rejected_logps": -331.54986572265625, "debug/reference_chosen_logps": -130.17742919921875, "debug/reference_rejected_logps": -307.9356689453125, "epoch": 0.921875, "grad_norm": 19.676716039926774, "learning_rate": 1e-06, "logits/chosen": -1.2750979661941528, "logits/rejected": -1.2684656381607056, "logps/chosen": -120.48554229736328, "logps/rejected": -331.54986572265625, "loss": 0.3486, "rewards/accuracies": 0.875, "rewards/chosen": 0.09691886603832245, "rewards/margins": 0.3330605924129486, "rewards/rejected": -0.23614171147346497, "step": 59 }, { "debug/policy_chosen_logits": -1.1700026988983154, "debug/policy_chosen_logps": -164.0662078857422, "debug/policy_rejected_logits": -1.0647200345993042, "debug/policy_rejected_logps": -289.3599548339844, "debug/reference_chosen_logps": -172.45896911621094, "debug/reference_rejected_logps": -276.63592529296875, "epoch": 0.9375, "grad_norm": 16.5920657057711, "learning_rate": 1e-06, "logits/chosen": -1.1700026988983154, "logits/rejected": -1.0647200345993042, "logps/chosen": -164.0662078857422, "logps/rejected": -289.3599548339844, "loss": 0.3551, "rewards/accuracies": 0.875, "rewards/chosen": 0.08392763137817383, "rewards/margins": 0.21116778254508972, "rewards/rejected": -0.1272401511669159, "step": 60 }, { "debug/policy_chosen_logits": -1.2429842948913574, "debug/policy_chosen_logps": -164.30996704101562, "debug/policy_rejected_logits": -1.2771668434143066, "debug/policy_rejected_logps": -291.66436767578125, "debug/reference_chosen_logps": -178.3618927001953, "debug/reference_rejected_logps": -263.1362609863281, "epoch": 0.953125, "grad_norm": 17.917957649513887, "learning_rate": 1e-06, "logits/chosen": -1.2429842948913574, "logits/rejected": -1.2771668434143066, "logps/chosen": -164.30996704101562, "logps/rejected": -291.66436767578125, "loss": 0.333, "rewards/accuracies": 1.0, "rewards/chosen": 0.14051929116249084, "rewards/margins": 0.42580026388168335, "rewards/rejected": -0.2852809429168701, "step": 61 }, { "debug/policy_chosen_logits": -1.1629077196121216, "debug/policy_chosen_logps": -171.4347381591797, "debug/policy_rejected_logits": -1.2383259534835815, "debug/policy_rejected_logps": -257.24322509765625, "debug/reference_chosen_logps": -176.8075408935547, "debug/reference_rejected_logps": -236.5648193359375, "epoch": 0.96875, "grad_norm": 27.22416658714319, "learning_rate": 1e-06, "logits/chosen": -1.1629077196121216, "logits/rejected": -1.2383259534835815, "logps/chosen": -171.4347381591797, "logps/rejected": -257.24322509765625, "loss": 0.3685, "rewards/accuracies": 0.75, "rewards/chosen": 0.053728047758340836, "rewards/margins": 0.2605122923851013, "rewards/rejected": -0.20678424835205078, "step": 62 }, { "debug/policy_chosen_logits": -1.167179822921753, "debug/policy_chosen_logps": -241.232666015625, "debug/policy_rejected_logits": -1.1904563903808594, "debug/policy_rejected_logps": -349.2745361328125, "debug/reference_chosen_logps": -237.09837341308594, "debug/reference_rejected_logps": -312.7959289550781, "epoch": 0.984375, "grad_norm": 35.440096057306455, "learning_rate": 1e-06, "logits/chosen": -1.167179822921753, "logits/rejected": -1.1904563903808594, "logps/chosen": -241.232666015625, "logps/rejected": -349.2745361328125, "loss": 0.3671, "rewards/accuracies": 0.875, "rewards/chosen": -0.04134296253323555, "rewards/margins": 0.32344281673431396, "rewards/rejected": -0.3647857904434204, "step": 63 }, { "debug/policy_chosen_logits": -1.1739040613174438, "debug/policy_chosen_logps": -139.26182556152344, "debug/policy_rejected_logits": -1.2884361743927002, "debug/policy_rejected_logps": -280.71124267578125, "debug/reference_chosen_logps": -148.495361328125, "debug/reference_rejected_logps": -259.9752197265625, "epoch": 1.0, "grad_norm": 43.313159052812985, "learning_rate": 1e-06, "logits/chosen": -1.1739040613174438, "logits/rejected": -1.2884361743927002, "logps/chosen": -139.26182556152344, "logps/rejected": -280.71124267578125, "loss": 0.3453, "rewards/accuracies": 0.625, "rewards/chosen": 0.09233523905277252, "rewards/margins": 0.2996952533721924, "rewards/rejected": -0.20735999941825867, "step": 64 }, { "epoch": 1.0, "step": 64, "total_flos": 0.0, "train_loss": 0.40815131505951285, "train_runtime": 194.2921, "train_samples_per_second": 20.979, "train_steps_per_second": 0.329 } ], "logging_steps": 1, "max_steps": 64, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }