{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8063872255489022, "eval_steps": 500, "global_step": 404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001996007984031936, "grad_norm": 19.011175567956343, "learning_rate": 1.9607843137254902e-08, "logits/chosen": -0.23683343827724457, "logits/rejected": -0.2160334289073944, "logps/chosen": -0.7725335359573364, "logps/rejected": -0.7464257478713989, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00998003992015968, "grad_norm": 24.85303405730939, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.17754913866519928, "logits/rejected": -0.1510540395975113, "logps/chosen": -0.6808110475540161, "logps/rejected": -0.7670315504074097, "loss": 0.6946, "rewards/accuracies": 0.296875, "rewards/chosen": -0.003806930035352707, "rewards/margins": -0.0012099393643438816, "rewards/rejected": -0.0025969906710088253, "step": 5 }, { "epoch": 0.01996007984031936, "grad_norm": 27.331245165405218, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.21193155646324158, "logits/rejected": -0.14373356103897095, "logps/chosen": -0.6621003746986389, "logps/rejected": -0.7422515153884888, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": -0.003911865875124931, "rewards/margins": -0.00975899025797844, "rewards/rejected": 0.005847124848514795, "step": 10 }, { "epoch": 0.029940119760479042, "grad_norm": 18.13812492001746, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.2625748813152313, "logits/rejected": -0.21968011558055878, "logps/chosen": -0.7216169238090515, "logps/rejected": -0.7573580741882324, "loss": 0.6955, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0036083627492189407, "rewards/margins": 0.0002948194742202759, "rewards/rejected": -0.0039031822234392166, "step": 15 }, { "epoch": 0.03992015968063872, "grad_norm": 29.30189659403405, "learning_rate": 3.92156862745098e-07, "logits/chosen": -0.2208031415939331, "logits/rejected": -0.21727153658866882, "logps/chosen": -0.806796669960022, "logps/rejected": -0.7866016626358032, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.011953282169997692, "rewards/margins": 0.00740851229056716, "rewards/rejected": 0.0045447684824466705, "step": 20 }, { "epoch": 0.0499001996007984, "grad_norm": 17.484098617193414, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.23456409573554993, "logits/rejected": -0.20080241560935974, "logps/chosen": -0.752682089805603, "logps/rejected": -0.81329745054245, "loss": 0.6933, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.008837291970849037, "rewards/margins": -0.008301705121994019, "rewards/rejected": -0.0005355868488550186, "step": 25 }, { "epoch": 0.059880239520958084, "grad_norm": 22.79460314618375, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.19971349835395813, "logits/rejected": -0.1838277131319046, "logps/chosen": -0.741012454032898, "logps/rejected": -0.8006389737129211, "loss": 0.6905, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.007402450777590275, "rewards/margins": 0.008965044282376766, "rewards/rejected": -0.01636749505996704, "step": 30 }, { "epoch": 0.06986027944111776, "grad_norm": 19.972022304372274, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.20151765644550323, "logits/rejected": -0.19096672534942627, "logps/chosen": -0.7682427167892456, "logps/rejected": -0.8185433149337769, "loss": 0.6912, "rewards/accuracies": 0.4375, "rewards/chosen": -0.011349962092936039, "rewards/margins": -0.0024173937272280455, "rewards/rejected": -0.00893256813287735, "step": 35 }, { "epoch": 0.07984031936127745, "grad_norm": 18.601712274112785, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.24523372948169708, "logits/rejected": -0.2008388340473175, "logps/chosen": -0.784850537776947, "logps/rejected": -0.8351114392280579, "loss": 0.6888, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.019527489319443703, "rewards/margins": 0.02737308107316494, "rewards/rejected": -0.04690057039260864, "step": 40 }, { "epoch": 0.08982035928143713, "grad_norm": 22.21021114642223, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.20831866562366486, "logits/rejected": -0.19121626019477844, "logps/chosen": -0.7837399244308472, "logps/rejected": -0.8101400136947632, "loss": 0.6787, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02101544663310051, "rewards/margins": 0.036486249417066574, "rewards/rejected": -0.057501696050167084, "step": 45 }, { "epoch": 0.0998003992015968, "grad_norm": 35.946736239250875, "learning_rate": 9.80392156862745e-07, "logits/chosen": -0.2484116554260254, "logits/rejected": -0.21271376311779022, "logps/chosen": -0.7204190492630005, "logps/rejected": -0.8208059072494507, "loss": 0.6745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.018808891996741295, "rewards/margins": 0.07578931748867035, "rewards/rejected": -0.0945982038974762, "step": 50 }, { "epoch": 0.10978043912175649, "grad_norm": 17.586104380746775, "learning_rate": 9.99805057520177e-07, "logits/chosen": -0.22446580231189728, "logits/rejected": -0.21255891025066376, "logps/chosen": -0.768947958946228, "logps/rejected": -0.7895203828811646, "loss": 0.6709, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03166341781616211, "rewards/margins": 0.14431020617485046, "rewards/rejected": -0.17597363889217377, "step": 55 }, { "epoch": 0.11976047904191617, "grad_norm": 16.613461293187417, "learning_rate": 9.990133642141357e-07, "logits/chosen": -0.24056999385356903, "logits/rejected": -0.2318592071533203, "logps/chosen": -0.7434613108634949, "logps/rejected": -0.8091262578964233, "loss": 0.6692, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08080559223890305, "rewards/margins": 0.17325755953788757, "rewards/rejected": -0.2540631592273712, "step": 60 }, { "epoch": 0.12974051896207583, "grad_norm": 20.90647324488342, "learning_rate": 9.976136999909155e-07, "logits/chosen": -0.18930143117904663, "logits/rejected": -0.17337587475776672, "logps/chosen": -0.7673609852790833, "logps/rejected": -0.7971418499946594, "loss": 0.6651, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06646038591861725, "rewards/margins": 0.06751471757888794, "rewards/rejected": -0.133975088596344, "step": 65 }, { "epoch": 0.13972055888223553, "grad_norm": 29.878943207680674, "learning_rate": 9.956077701257707e-07, "logits/chosen": -0.2409631907939911, "logits/rejected": -0.2077023983001709, "logps/chosen": -0.7360613346099854, "logps/rejected": -0.7820955514907837, "loss": 0.6524, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.04467972368001938, "rewards/margins": 0.07468457520008087, "rewards/rejected": -0.11936430633068085, "step": 70 }, { "epoch": 0.1497005988023952, "grad_norm": 23.171174553493415, "learning_rate": 9.929980185352525e-07, "logits/chosen": -0.3143348693847656, "logits/rejected": -0.31170645356178284, "logps/chosen": -0.7954690456390381, "logps/rejected": -0.8250002861022949, "loss": 0.6568, "rewards/accuracies": 0.75, "rewards/chosen": -0.09316639602184296, "rewards/margins": 0.060890208929777145, "rewards/rejected": -0.1540566086769104, "step": 75 }, { "epoch": 0.1596806387225549, "grad_norm": 17.553227599037466, "learning_rate": 9.89787624799672e-07, "logits/chosen": -0.38358816504478455, "logits/rejected": -0.32817578315734863, "logps/chosen": -0.7703452110290527, "logps/rejected": -0.8151019811630249, "loss": 0.6486, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10643620789051056, "rewards/margins": 0.10724379867315292, "rewards/rejected": -0.2136799842119217, "step": 80 }, { "epoch": 0.16966067864271456, "grad_norm": 13.431691131153842, "learning_rate": 9.859805002892731e-07, "logits/chosen": -0.3225359618663788, "logits/rejected": -0.3592807650566101, "logps/chosen": -0.7858971357345581, "logps/rejected": -0.8596378564834595, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": -0.1400977075099945, "rewards/margins": 0.16820211708545685, "rewards/rejected": -0.30829980969429016, "step": 85 }, { "epoch": 0.17964071856287425, "grad_norm": 27.30667277119807, "learning_rate": 9.81581283398829e-07, "logits/chosen": -0.34176284074783325, "logits/rejected": -0.34205198287963867, "logps/chosen": -0.7768250703811646, "logps/rejected": -0.820307731628418, "loss": 0.6451, "rewards/accuracies": 0.75, "rewards/chosen": -0.07238658517599106, "rewards/margins": 0.23312333226203918, "rewards/rejected": -0.30550986528396606, "step": 90 }, { "epoch": 0.18962075848303392, "grad_norm": 20.047219809230725, "learning_rate": 9.765953338964734e-07, "logits/chosen": -0.4065936207771301, "logits/rejected": -0.37638360261917114, "logps/chosen": -0.8715718388557434, "logps/rejected": -0.9510416984558105, "loss": 0.6336, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1532100886106491, "rewards/margins": 0.2994120717048645, "rewards/rejected": -0.4526221752166748, "step": 95 }, { "epoch": 0.1996007984031936, "grad_norm": 36.23944490923282, "learning_rate": 9.710287263936483e-07, "logits/chosen": -0.43931150436401367, "logits/rejected": -0.44534721970558167, "logps/chosen": -0.8958739042282104, "logps/rejected": -0.9571765065193176, "loss": 0.6432, "rewards/accuracies": 0.75, "rewards/chosen": -0.27455487847328186, "rewards/margins": 0.25604015588760376, "rewards/rejected": -0.5305949449539185, "step": 100 }, { "epoch": 0.20958083832335328, "grad_norm": 25.004187021683798, "learning_rate": 9.648882429441256e-07, "logits/chosen": -0.44500723481178284, "logits/rejected": -0.4199863076210022, "logps/chosen": -0.7884619832038879, "logps/rejected": -0.7948769330978394, "loss": 0.6333, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.23280362784862518, "rewards/margins": 0.13038918375968933, "rewards/rejected": -0.3631927967071533, "step": 105 }, { "epoch": 0.21956087824351297, "grad_norm": 15.650165706867742, "learning_rate": 9.581813647811197e-07, "logits/chosen": -0.47238340973854065, "logits/rejected": -0.46014589071273804, "logps/chosen": -0.7871894240379333, "logps/rejected": -0.8054457902908325, "loss": 0.6277, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25777310132980347, "rewards/margins": 0.21288836002349854, "rewards/rejected": -0.470661461353302, "step": 110 }, { "epoch": 0.22954091816367264, "grad_norm": 18.380315632232364, "learning_rate": 9.509162632025569e-07, "logits/chosen": -0.5215901732444763, "logits/rejected": -0.5049930214881897, "logps/chosen": -0.9149976968765259, "logps/rejected": -0.9515384435653687, "loss": 0.6358, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3461766839027405, "rewards/margins": 0.15195634961128235, "rewards/rejected": -0.49813300371170044, "step": 115 }, { "epoch": 0.23952095808383234, "grad_norm": 21.138881533254413, "learning_rate": 9.431017896156073e-07, "logits/chosen": -0.4705902636051178, "logits/rejected": -0.4593280255794525, "logps/chosen": -0.8069537281990051, "logps/rejected": -0.8675839304924011, "loss": 0.614, "rewards/accuracies": 0.625, "rewards/chosen": -0.29863637685775757, "rewards/margins": 0.31691476702690125, "rewards/rejected": -0.6155511140823364, "step": 120 }, { "epoch": 0.249500998003992, "grad_norm": 18.813175578776896, "learning_rate": 9.347474647526095e-07, "logits/chosen": -0.4284445345401764, "logits/rejected": -0.3746599555015564, "logps/chosen": -0.7812046408653259, "logps/rejected": -0.8142662048339844, "loss": 0.6011, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3037291169166565, "rewards/margins": 0.3044063150882721, "rewards/rejected": -0.6081355214118958, "step": 125 }, { "epoch": 0.25948103792415167, "grad_norm": 17.070591448816156, "learning_rate": 9.258634670715237e-07, "logits/chosen": -0.5489827394485474, "logits/rejected": -0.5030359029769897, "logps/chosen": -0.7899664044380188, "logps/rejected": -0.8873406648635864, "loss": 0.6085, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3300507664680481, "rewards/margins": 0.2699635326862335, "rewards/rejected": -0.600014328956604, "step": 130 }, { "epoch": 0.2694610778443114, "grad_norm": 18.560449924688594, "learning_rate": 9.164606203550497e-07, "logits/chosen": -0.5027534365653992, "logits/rejected": -0.45569664239883423, "logps/chosen": -0.7548262476921082, "logps/rejected": -0.8240019083023071, "loss": 0.5993, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.33843767642974854, "rewards/margins": 0.27864545583724976, "rewards/rejected": -0.6170830726623535, "step": 135 }, { "epoch": 0.27944111776447106, "grad_norm": 15.456587593946033, "learning_rate": 9.065503805235137e-07, "logits/chosen": -0.5412222146987915, "logits/rejected": -0.5133959054946899, "logps/chosen": -0.8278282284736633, "logps/rejected": -0.8514043092727661, "loss": 0.6118, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.515911340713501, "rewards/margins": 0.23073478043079376, "rewards/rejected": -0.7466461658477783, "step": 140 }, { "epoch": 0.2894211576846307, "grad_norm": 13.75653652171432, "learning_rate": 8.961448216775953e-07, "logits/chosen": -0.531816303730011, "logits/rejected": -0.5432392358779907, "logps/chosen": -0.8441339731216431, "logps/rejected": -0.9052772521972656, "loss": 0.6047, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6214492917060852, "rewards/margins": 0.40321844816207886, "rewards/rejected": -1.024667739868164, "step": 145 }, { "epoch": 0.2994011976047904, "grad_norm": 14.061323030297434, "learning_rate": 8.852566213878946e-07, "logits/chosen": -0.5842114686965942, "logits/rejected": -0.5797411203384399, "logps/chosen": -0.7897329926490784, "logps/rejected": -0.8609731793403625, "loss": 0.6038, "rewards/accuracies": 0.75, "rewards/chosen": -0.49532920122146606, "rewards/margins": 0.36390385031700134, "rewards/rejected": -0.859233021736145, "step": 150 }, { "epoch": 0.3093812375249501, "grad_norm": 12.749472652825297, "learning_rate": 8.73899045249266e-07, "logits/chosen": -0.6203972697257996, "logits/rejected": -0.6109431982040405, "logps/chosen": -0.8315925598144531, "logps/rejected": -0.9241877794265747, "loss": 0.5848, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6116100549697876, "rewards/margins": 0.5322220325469971, "rewards/rejected": -1.1438319683074951, "step": 155 }, { "epoch": 0.3193612774451098, "grad_norm": 18.453987933361336, "learning_rate": 8.620859307187338e-07, "logits/chosen": -0.5765129923820496, "logits/rejected": -0.5761350393295288, "logps/chosen": -0.8263224363327026, "logps/rejected": -0.8567667007446289, "loss": 0.5871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5846480131149292, "rewards/margins": 0.3195909559726715, "rewards/rejected": -0.9042388796806335, "step": 160 }, { "epoch": 0.32934131736526945, "grad_norm": 24.93482299590875, "learning_rate": 8.498316702566826e-07, "logits/chosen": -0.6133869886398315, "logits/rejected": -0.5582712292671204, "logps/chosen": -0.828209400177002, "logps/rejected": -0.893822193145752, "loss": 0.5702, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8109213709831238, "rewards/margins": 0.3340582847595215, "rewards/rejected": -1.14497971534729, "step": 165 }, { "epoch": 0.3393213572854291, "grad_norm": 14.606101127926557, "learning_rate": 8.371511937918617e-07, "logits/chosen": -0.6824047565460205, "logits/rejected": -0.664223313331604, "logps/chosen": -0.8030799031257629, "logps/rejected": -0.8865512013435364, "loss": 0.6006, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6996228694915771, "rewards/margins": 0.5749022960662842, "rewards/rejected": -1.2745250463485718, "step": 170 }, { "epoch": 0.34930139720558884, "grad_norm": 14.615105697665864, "learning_rate": 8.240599505315654e-07, "logits/chosen": -0.6872956156730652, "logits/rejected": -0.6839295625686646, "logps/chosen": -0.8706089854240417, "logps/rejected": -0.9267762899398804, "loss": 0.5737, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7884671688079834, "rewards/margins": 0.42132559418678284, "rewards/rejected": -1.2097927331924438, "step": 175 }, { "epoch": 0.3592814371257485, "grad_norm": 17.325663221044394, "learning_rate": 8.105738901391551e-07, "logits/chosen": -0.702431857585907, "logits/rejected": -0.6738135814666748, "logps/chosen": -0.8363839387893677, "logps/rejected": -0.8939155340194702, "loss": 0.5712, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.724533200263977, "rewards/margins": 0.3838126063346863, "rewards/rejected": -1.108345866203308, "step": 180 }, { "epoch": 0.36926147704590817, "grad_norm": 16.365607087163703, "learning_rate": 7.967094433018508e-07, "logits/chosen": -0.7076471447944641, "logits/rejected": -0.6945314407348633, "logps/chosen": -0.9004107713699341, "logps/rejected": -1.046083688735962, "loss": 0.5742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9093888401985168, "rewards/margins": 0.6952089071273804, "rewards/rejected": -1.6045976877212524, "step": 185 }, { "epoch": 0.37924151696606784, "grad_norm": 43.74908736902799, "learning_rate": 7.82483501712469e-07, "logits/chosen": -0.6856539845466614, "logits/rejected": -0.6615663766860962, "logps/chosen": -0.9109293222427368, "logps/rejected": -0.9081963300704956, "loss": 0.5967, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9922858476638794, "rewards/margins": 0.4050213396549225, "rewards/rejected": -1.39730703830719, "step": 190 }, { "epoch": 0.38922155688622756, "grad_norm": 19.60550699714766, "learning_rate": 7.679133974894982e-07, "logits/chosen": -0.6713054180145264, "logits/rejected": -0.6843950748443604, "logps/chosen": -0.8581186532974243, "logps/rejected": -0.9895851016044617, "loss": 0.5531, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9614213705062866, "rewards/margins": 0.7596980929374695, "rewards/rejected": -1.7211196422576904, "step": 195 }, { "epoch": 0.3992015968063872, "grad_norm": 15.996993544634163, "learning_rate": 7.530168820605818e-07, "logits/chosen": -0.6824347972869873, "logits/rejected": -0.6842206120491028, "logps/chosen": -0.9382361173629761, "logps/rejected": -0.9921613931655884, "loss": 0.5583, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1333320140838623, "rewards/margins": 0.5768089890480042, "rewards/rejected": -1.7101409435272217, "step": 200 }, { "epoch": 0.4091816367265469, "grad_norm": 15.558869167043554, "learning_rate": 7.378121045351377e-07, "logits/chosen": -0.6672806739807129, "logits/rejected": -0.6542935371398926, "logps/chosen": -0.8662320375442505, "logps/rejected": -0.9581985473632812, "loss": 0.5465, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0950018167495728, "rewards/margins": 0.47961243987083435, "rewards/rejected": -1.57461416721344, "step": 205 }, { "epoch": 0.41916167664670656, "grad_norm": 28.887132861516903, "learning_rate": 7.223175895924637e-07, "logits/chosen": -0.690104603767395, "logits/rejected": -0.6761881709098816, "logps/chosen": -0.863226592540741, "logps/rejected": -0.9225906133651733, "loss": 0.5553, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0103793144226074, "rewards/margins": 0.6174372434616089, "rewards/rejected": -1.6278165578842163, "step": 210 }, { "epoch": 0.4291417165668663, "grad_norm": 15.081565074387589, "learning_rate": 7.065522149122709e-07, "logits/chosen": -0.7095133662223816, "logits/rejected": -0.6953072547912598, "logps/chosen": -0.9815180897712708, "logps/rejected": -1.0388023853302002, "loss": 0.5393, "rewards/accuracies": 0.75, "rewards/chosen": -1.4431049823760986, "rewards/margins": 0.5409745573997498, "rewards/rejected": -1.9840797185897827, "step": 215 }, { "epoch": 0.43912175648702595, "grad_norm": 27.859691103168426, "learning_rate": 6.905351881751371e-07, "logits/chosen": -0.7326300144195557, "logits/rejected": -0.7271891832351685, "logps/chosen": -0.886702835559845, "logps/rejected": -0.9949976205825806, "loss": 0.5524, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.178215742111206, "rewards/margins": 0.5899935364723206, "rewards/rejected": -1.7682092189788818, "step": 220 }, { "epoch": 0.4491017964071856, "grad_norm": 20.049309145299137, "learning_rate": 6.742860236609076e-07, "logits/chosen": -0.7312344312667847, "logits/rejected": -0.7114800810813904, "logps/chosen": -0.9219420552253723, "logps/rejected": -0.9769749641418457, "loss": 0.5397, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2608001232147217, "rewards/margins": 0.6559053659439087, "rewards/rejected": -1.9167054891586304, "step": 225 }, { "epoch": 0.4590818363273453, "grad_norm": 21.600145701212536, "learning_rate": 6.578245184735512e-07, "logits/chosen": -0.7300236225128174, "logits/rejected": -0.7002195119857788, "logps/chosen": -0.9393006563186646, "logps/rejected": -0.9583779573440552, "loss": 0.5285, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3502466678619385, "rewards/margins": 0.5810690522193909, "rewards/rejected": -1.9313156604766846, "step": 230 }, { "epoch": 0.469061876247505, "grad_norm": 15.102196954163185, "learning_rate": 6.411707284214383e-07, "logits/chosen": -0.7769094705581665, "logits/rejected": -0.784206211566925, "logps/chosen": -0.937769889831543, "logps/rejected": -1.0256552696228027, "loss": 0.5481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3135572671890259, "rewards/margins": 0.6154013872146606, "rewards/rejected": -1.928958535194397, "step": 235 }, { "epoch": 0.47904191616766467, "grad_norm": 22.589088897581874, "learning_rate": 6.243449435824276e-07, "logits/chosen": -0.7115018367767334, "logits/rejected": -0.6854550242424011, "logps/chosen": -0.8389812707901001, "logps/rejected": -0.9537237882614136, "loss": 0.5204, "rewards/accuracies": 0.75, "rewards/chosen": -1.3218843936920166, "rewards/margins": 0.7666546106338501, "rewards/rejected": -2.088538885116577, "step": 240 }, { "epoch": 0.48902195608782434, "grad_norm": 17.05950897502288, "learning_rate": 6.073676635835316e-07, "logits/chosen": -0.7151886820793152, "logits/rejected": -0.7269682884216309, "logps/chosen": -0.8353130221366882, "logps/rejected": -1.045952320098877, "loss": 0.5198, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2784829139709473, "rewards/margins": 1.028085470199585, "rewards/rejected": -2.3065686225891113, "step": 245 }, { "epoch": 0.499001996007984, "grad_norm": 22.204919215887884, "learning_rate": 5.9025957262528e-07, "logits/chosen": -0.7427772879600525, "logits/rejected": -0.7114007472991943, "logps/chosen": -0.8959082365036011, "logps/rejected": -1.015169382095337, "loss": 0.5047, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4343379735946655, "rewards/margins": 0.8260199427604675, "rewards/rejected": -2.260357618331909, "step": 250 }, { "epoch": 0.5089820359281437, "grad_norm": 29.103918291256367, "learning_rate": 5.730415142812058e-07, "logits/chosen": -0.7786175608634949, "logits/rejected": -0.7618826627731323, "logps/chosen": -0.9678372144699097, "logps/rejected": -1.090914011001587, "loss": 0.5111, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7259149551391602, "rewards/margins": 0.8811995387077332, "rewards/rejected": -2.607114315032959, "step": 255 }, { "epoch": 0.5189620758483033, "grad_norm": 17.21881262945149, "learning_rate": 5.557344661031627e-07, "logits/chosen": -0.7823535799980164, "logits/rejected": -0.7699525356292725, "logps/chosen": -0.9408347010612488, "logps/rejected": -1.0648287534713745, "loss": 0.5406, "rewards/accuracies": 0.75, "rewards/chosen": -1.5965430736541748, "rewards/margins": 1.0114134550094604, "rewards/rejected": -2.6079564094543457, "step": 260 }, { "epoch": 0.5289421157684631, "grad_norm": 15.711105031107639, "learning_rate": 5.383595140634093e-07, "logits/chosen": -0.7325602769851685, "logits/rejected": -0.7186430096626282, "logps/chosen": -0.943433403968811, "logps/rejected": -1.0095100402832031, "loss": 0.5276, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7036699056625366, "rewards/margins": 0.8646278381347656, "rewards/rejected": -2.5682976245880127, "step": 265 }, { "epoch": 0.5389221556886228, "grad_norm": 13.20029610671174, "learning_rate": 5.209378268645997e-07, "logits/chosen": -0.7176939249038696, "logits/rejected": -0.7001760601997375, "logps/chosen": -0.8967610597610474, "logps/rejected": -1.0068597793579102, "loss": 0.5582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.737532377243042, "rewards/margins": 0.5873457193374634, "rewards/rejected": -2.324878215789795, "step": 270 }, { "epoch": 0.5489021956087824, "grad_norm": 20.138444852189, "learning_rate": 5.034906301489807e-07, "logits/chosen": -0.7280402779579163, "logits/rejected": -0.717974066734314, "logps/chosen": -0.9318618774414062, "logps/rejected": -1.0344207286834717, "loss": 0.5238, "rewards/accuracies": 0.75, "rewards/chosen": -1.814552903175354, "rewards/margins": 0.8326984643936157, "rewards/rejected": -2.647251605987549, "step": 275 }, { "epoch": 0.5588822355289421, "grad_norm": 37.63172145615237, "learning_rate": 4.860391806382156e-07, "logits/chosen": -0.8018016815185547, "logits/rejected": -0.8149466514587402, "logps/chosen": -0.9740177392959595, "logps/rejected": -1.010301113128662, "loss": 0.5217, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8418344259262085, "rewards/margins": 0.7042922973632812, "rewards/rejected": -2.5461268424987793, "step": 280 }, { "epoch": 0.5688622754491018, "grad_norm": 20.60319293019308, "learning_rate": 4.686047402353433e-07, "logits/chosen": -0.7303526997566223, "logits/rejected": -0.7184230089187622, "logps/chosen": -0.9589718580245972, "logps/rejected": -1.045867681503296, "loss": 0.523, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8164526224136353, "rewards/margins": 0.7647993564605713, "rewards/rejected": -2.581251621246338, "step": 285 }, { "epoch": 0.5788423153692615, "grad_norm": 31.323512708787323, "learning_rate": 4.512085501204253e-07, "logits/chosen": -0.7896037101745605, "logits/rejected": -0.7570759057998657, "logps/chosen": -0.9357205629348755, "logps/rejected": -1.0159623622894287, "loss": 0.5636, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7307599782943726, "rewards/margins": 0.7415143251419067, "rewards/rejected": -2.4722743034362793, "step": 290 }, { "epoch": 0.5888223552894212, "grad_norm": 16.113569930607966, "learning_rate": 4.338718048714387e-07, "logits/chosen": -0.7748357057571411, "logits/rejected": -0.7527580857276917, "logps/chosen": -0.9312615394592285, "logps/rejected": -0.9944890141487122, "loss": 0.5261, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7469574213027954, "rewards/margins": 0.5817708969116211, "rewards/rejected": -2.328728675842285, "step": 295 }, { "epoch": 0.5988023952095808, "grad_norm": 28.923235923466983, "learning_rate": 4.166156266419489e-07, "logits/chosen": -0.7699166536331177, "logits/rejected": -0.7807640433311462, "logps/chosen": -0.8693283200263977, "logps/rejected": -1.0598359107971191, "loss": 0.5223, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6791868209838867, "rewards/margins": 1.093235969543457, "rewards/rejected": -2.7724227905273438, "step": 300 }, { "epoch": 0.6087824351297405, "grad_norm": 43.24348090401382, "learning_rate": 3.9946103942701775e-07, "logits/chosen": -0.713184118270874, "logits/rejected": -0.7183653116226196, "logps/chosen": -0.877181887626648, "logps/rejected": -1.0504357814788818, "loss": 0.5006, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6295019388198853, "rewards/margins": 0.8480024337768555, "rewards/rejected": -2.477504253387451, "step": 305 }, { "epoch": 0.6187624750499002, "grad_norm": 18.17532898488996, "learning_rate": 3.8242894344870495e-07, "logits/chosen": -0.7871264815330505, "logits/rejected": -0.7895926237106323, "logps/chosen": -0.9971866607666016, "logps/rejected": -1.110032558441162, "loss": 0.5224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.071504592895508, "rewards/margins": 0.8322666883468628, "rewards/rejected": -2.90377140045166, "step": 310 }, { "epoch": 0.6287425149700598, "grad_norm": 22.503824878122693, "learning_rate": 3.6554008969236715e-07, "logits/chosen": -0.7987596392631531, "logits/rejected": -0.8034515380859375, "logps/chosen": -0.9355812072753906, "logps/rejected": -1.101746916770935, "loss": 0.4993, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8815546035766602, "rewards/margins": 1.0128275156021118, "rewards/rejected": -2.8943822383880615, "step": 315 }, { "epoch": 0.6387225548902196, "grad_norm": 18.4314421544152, "learning_rate": 3.488150546247778e-07, "logits/chosen": -0.7646247148513794, "logits/rejected": -0.7596274018287659, "logps/chosen": -0.9071539640426636, "logps/rejected": -0.9974797964096069, "loss": 0.5067, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.761649489402771, "rewards/margins": 0.7561764717102051, "rewards/rejected": -2.5178260803222656, "step": 320 }, { "epoch": 0.6487025948103793, "grad_norm": 13.46220126387281, "learning_rate": 3.3227421512487255e-07, "logits/chosen": -0.7663795948028564, "logits/rejected": -0.7689803838729858, "logps/chosen": -0.9501086473464966, "logps/rejected": -1.0315439701080322, "loss": 0.5057, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9115231037139893, "rewards/margins": 0.7186993360519409, "rewards/rejected": -2.630222797393799, "step": 325 }, { "epoch": 0.6586826347305389, "grad_norm": 43.00875479745304, "learning_rate": 3.15937723657661e-07, "logits/chosen": -0.8140354156494141, "logits/rejected": -0.8173881769180298, "logps/chosen": -0.9712478518486023, "logps/rejected": -1.0843861103057861, "loss": 0.5873, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.222109794616699, "rewards/margins": 0.6900671720504761, "rewards/rejected": -2.9121768474578857, "step": 330 }, { "epoch": 0.6686626746506986, "grad_norm": 32.97485130184056, "learning_rate": 2.9982548372155256e-07, "logits/chosen": -0.7841066718101501, "logits/rejected": -0.801399827003479, "logps/chosen": -0.9092627763748169, "logps/rejected": -1.0644382238388062, "loss": 0.5121, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8617267608642578, "rewards/margins": 1.0364662408828735, "rewards/rejected": -2.898192882537842, "step": 335 }, { "epoch": 0.6786427145708582, "grad_norm": 50.76904692556944, "learning_rate": 2.8395712559900874e-07, "logits/chosen": -0.8244466781616211, "logits/rejected": -0.8209226727485657, "logps/chosen": -1.0543348789215088, "logps/rejected": -1.1451005935668945, "loss": 0.5205, "rewards/accuracies": 0.75, "rewards/chosen": -2.412365674972534, "rewards/margins": 0.8550116419792175, "rewards/rejected": -3.2673773765563965, "step": 340 }, { "epoch": 0.688622754491018, "grad_norm": 27.915007397999354, "learning_rate": 2.683519824400692e-07, "logits/chosen": -0.8374012112617493, "logits/rejected": -0.8106748461723328, "logps/chosen": -0.9777308702468872, "logps/rejected": -1.1284379959106445, "loss": 0.5234, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.029723882675171, "rewards/margins": 1.0547873973846436, "rewards/rejected": -3.0845110416412354, "step": 345 }, { "epoch": 0.6986027944111777, "grad_norm": 22.882772557486724, "learning_rate": 2.530290667078846e-07, "logits/chosen": -0.8036109805107117, "logits/rejected": -0.7900259494781494, "logps/chosen": -0.9737772941589355, "logps/rejected": -1.085715651512146, "loss": 0.5015, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1590065956115723, "rewards/margins": 0.7729440927505493, "rewards/rejected": -2.931950807571411, "step": 350 }, { "epoch": 0.7085828343313373, "grad_norm": 18.205286801888395, "learning_rate": 2.380070470149605e-07, "logits/chosen": -0.8169373273849487, "logits/rejected": -0.8327714204788208, "logps/chosen": -0.9806594848632812, "logps/rejected": -1.1701897382736206, "loss": 0.496, "rewards/accuracies": 0.75, "rewards/chosen": -2.151632308959961, "rewards/margins": 1.1866142749786377, "rewards/rejected": -3.3382461071014404, "step": 355 }, { "epoch": 0.718562874251497, "grad_norm": 41.98073419560719, "learning_rate": 2.23304225378328e-07, "logits/chosen": -0.8116399049758911, "logits/rejected": -0.7772837281227112, "logps/chosen": -1.0802838802337646, "logps/rejected": -1.1937668323516846, "loss": 0.5124, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.334536075592041, "rewards/margins": 1.168308138847351, "rewards/rejected": -3.5028443336486816, "step": 360 }, { "epoch": 0.7285429141716567, "grad_norm": 27.12760666687319, "learning_rate": 2.0893851492135532e-07, "logits/chosen": -0.8288819193840027, "logits/rejected": -0.8238687515258789, "logps/chosen": -0.9590953588485718, "logps/rejected": -1.1089394092559814, "loss": 0.5151, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.967963457107544, "rewards/margins": 1.0413198471069336, "rewards/rejected": -3.0092828273773193, "step": 365 }, { "epoch": 0.7385229540918163, "grad_norm": 28.594288392858182, "learning_rate": 1.9492741804936618e-07, "logits/chosen": -0.8266215324401855, "logits/rejected": -0.8546527028083801, "logps/chosen": -1.0014851093292236, "logps/rejected": -1.0811948776245117, "loss": 0.5134, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.126549482345581, "rewards/margins": 0.8130427598953247, "rewards/rejected": -2.939592123031616, "step": 370 }, { "epoch": 0.7485029940119761, "grad_norm": 16.69085619906614, "learning_rate": 1.812880051256551e-07, "logits/chosen": -0.8046598434448242, "logits/rejected": -0.8114801645278931, "logps/chosen": -0.9752721786499023, "logps/rejected": -1.0586512088775635, "loss": 0.4999, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1684842109680176, "rewards/margins": 0.6114571690559387, "rewards/rejected": -2.7799415588378906, "step": 375 }, { "epoch": 0.7584830339321357, "grad_norm": 15.627879784878328, "learning_rate": 1.6803689367387918e-07, "logits/chosen": -0.823495090007782, "logits/rejected": -0.7943788170814514, "logps/chosen": -0.8508175015449524, "logps/rejected": -0.9670391082763672, "loss": 0.5187, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9837039709091187, "rewards/margins": 0.5095661878585815, "rewards/rejected": -2.4932703971862793, "step": 380 }, { "epoch": 0.7684630738522954, "grad_norm": 14.609956803422161, "learning_rate": 1.551902281321651e-07, "logits/chosen": -0.8435534238815308, "logits/rejected": -0.8091138601303101, "logps/chosen": -0.998641848564148, "logps/rejected": -1.133357286453247, "loss": 0.4954, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9779011011123657, "rewards/margins": 1.3220001459121704, "rewards/rejected": -3.2999014854431152, "step": 385 }, { "epoch": 0.7784431137724551, "grad_norm": 27.689053757448693, "learning_rate": 1.4276366018359842e-07, "logits/chosen": -0.8398737907409668, "logits/rejected": -0.811150848865509, "logps/chosen": -0.9453309178352356, "logps/rejected": -1.0934703350067139, "loss": 0.5225, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0561106204986572, "rewards/margins": 1.0419596433639526, "rewards/rejected": -3.0980706214904785, "step": 390 }, { "epoch": 0.7884231536926147, "grad_norm": 37.02718002584944, "learning_rate": 1.3077232968705805e-07, "logits/chosen": -0.8516527414321899, "logits/rejected": -0.8527445793151855, "logps/chosen": -0.969285786151886, "logps/rejected": -1.0676743984222412, "loss": 0.4928, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9902063608169556, "rewards/margins": 0.7401792407035828, "rewards/rejected": -2.7303855419158936, "step": 395 }, { "epoch": 0.7984031936127745, "grad_norm": 14.525059669424861, "learning_rate": 1.192308462316317e-07, "logits/chosen": -0.7803691625595093, "logits/rejected": -0.7951369285583496, "logps/chosen": -0.9650856852531433, "logps/rejected": -1.1078399419784546, "loss": 0.4959, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.048185110092163, "rewards/margins": 0.8834241628646851, "rewards/rejected": -2.9316093921661377, "step": 400 } ], "logging_steps": 5, "max_steps": 501, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 101, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }