{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 5804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00034458993797381116, "grad_norm": 1.5268679857254028, "learning_rate": 8.605851979345955e-11, "logits/chosen": -3.024087429046631, "logits/rejected": -2.988196611404419, "logps/chosen": -47.308799743652344, "logps/rejected": -44.131954193115234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0034458993797381117, "grad_norm": 1.6066739559173584, "learning_rate": 8.605851979345954e-10, "logits/chosen": -3.0891270637512207, "logits/rejected": -3.071465015411377, "logps/chosen": -51.676212310791016, "logps/rejected": -51.65631103515625, "loss": 0.6931, "rewards/accuracies": 0.4409722089767456, "rewards/chosen": 1.6248530300799757e-05, "rewards/margins": 8.904636342776939e-05, "rewards/rejected": -7.279782585101202e-05, "step": 10 }, { "epoch": 0.006891798759476223, "grad_norm": 1.4660340547561646, "learning_rate": 1.7211703958691908e-09, "logits/chosen": -3.098437786102295, "logits/rejected": -3.0707993507385254, "logps/chosen": -56.029632568359375, "logps/rejected": -54.557579040527344, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 4.5064934965921566e-05, "rewards/margins": 8.188869287550915e-06, "rewards/rejected": 3.687605931190774e-05, "step": 20 }, { "epoch": 0.010337698139214336, "grad_norm": 1.8557682037353516, "learning_rate": 2.5817555938037863e-09, "logits/chosen": -3.127624988555908, "logits/rejected": -3.104484796524048, "logps/chosen": -55.351043701171875, "logps/rejected": -52.83721923828125, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 4.745673322759103e-06, "rewards/margins": -3.1675353966420516e-05, "rewards/rejected": 3.6421035474631935e-05, "step": 30 }, { "epoch": 0.013783597518952447, "grad_norm": 1.5628290176391602, "learning_rate": 3.4423407917383816e-09, "logits/chosen": -3.1053009033203125, "logits/rejected": -3.078575611114502, "logps/chosen": -56.41291427612305, "logps/rejected": -53.65864181518555, "loss": 0.6931, "rewards/accuracies": 0.546875, "rewards/chosen": 0.00019920275371987373, "rewards/margins": 0.00018488746718503535, "rewards/rejected": 1.4315284715848975e-05, "step": 40 }, { "epoch": 0.01722949689869056, "grad_norm": 1.7454065084457397, "learning_rate": 4.302925989672977e-09, "logits/chosen": -3.0816972255706787, "logits/rejected": -3.0432658195495605, "logps/chosen": -54.70732879638672, "logps/rejected": -51.237525939941406, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 6.539397872984409e-06, "rewards/margins": 2.3190634237835184e-05, "rewards/rejected": -1.6651232726871967e-05, "step": 50 }, { "epoch": 0.02067539627842867, "grad_norm": 1.8053064346313477, "learning_rate": 5.163511187607573e-09, "logits/chosen": -3.093721866607666, "logits/rejected": -3.074064254760742, "logps/chosen": -54.40106964111328, "logps/rejected": -53.95695877075195, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00021476794790942222, "rewards/margins": -0.00016086632967926562, "rewards/rejected": -5.390158548834734e-05, "step": 60 }, { "epoch": 0.024121295658166782, "grad_norm": 1.7542425394058228, "learning_rate": 6.024096385542168e-09, "logits/chosen": -3.111924648284912, "logits/rejected": -3.09572172164917, "logps/chosen": -54.690452575683594, "logps/rejected": -53.768272399902344, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -6.438524906116072e-06, "rewards/margins": 8.780825010035187e-05, "rewards/rejected": -9.424677409697324e-05, "step": 70 }, { "epoch": 0.027567195037904894, "grad_norm": 1.5708752870559692, "learning_rate": 6.884681583476763e-09, "logits/chosen": -3.0582454204559326, "logits/rejected": -3.0389389991760254, "logps/chosen": -53.05244064331055, "logps/rejected": -53.5573844909668, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 4.5851076720282435e-05, "rewards/margins": 9.506057540420443e-05, "rewards/rejected": -4.92095023219008e-05, "step": 80 }, { "epoch": 0.031013094417643005, "grad_norm": 1.7699593305587769, "learning_rate": 7.745266781411359e-09, "logits/chosen": -3.0694618225097656, "logits/rejected": -3.050314426422119, "logps/chosen": -56.33974075317383, "logps/rejected": -52.55852127075195, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 5.394589970819652e-05, "rewards/margins": 0.0001279563148273155, "rewards/rejected": -7.40104223950766e-05, "step": 90 }, { "epoch": 0.03445899379738112, "grad_norm": 1.757240891456604, "learning_rate": 8.605851979345954e-09, "logits/chosen": -3.0916152000427246, "logits/rejected": -3.0621144771575928, "logps/chosen": -56.1510124206543, "logps/rejected": -53.62493896484375, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 2.1132344045327045e-05, "rewards/margins": -2.9667915441677906e-05, "rewards/rejected": 5.0800241297110915e-05, "step": 100 }, { "epoch": 0.03445899379738112, "eval_logits/chosen": -3.1631598472595215, "eval_logits/rejected": -3.1574864387512207, "eval_logps/chosen": -58.709110260009766, "eval_logps/rejected": -63.17157745361328, "eval_loss": 0.6931766271591187, "eval_rewards/accuracies": 0.4804832637310028, "eval_rewards/chosen": 2.7851780032506213e-05, "eval_rewards/margins": -5.7606255722930655e-05, "eval_rewards/rejected": 8.545803575543687e-05, "eval_runtime": 384.4303, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 100 }, { "epoch": 0.03790489317711923, "grad_norm": 1.6156939268112183, "learning_rate": 9.46643717728055e-09, "logits/chosen": -3.032536745071411, "logits/rejected": -3.0125365257263184, "logps/chosen": -52.728843688964844, "logps/rejected": -54.300689697265625, "loss": 0.6931, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 2.9080998501740396e-05, "rewards/margins": 2.5842193736025365e-06, "rewards/rejected": 2.6496752980165184e-05, "step": 110 }, { "epoch": 0.04135079255685734, "grad_norm": 1.5372369289398193, "learning_rate": 1.0327022375215145e-08, "logits/chosen": -3.0377848148345947, "logits/rejected": -3.007380962371826, "logps/chosen": -52.42706298828125, "logps/rejected": -51.04817581176758, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 4.8243564378935844e-05, "rewards/margins": 3.2927131542237476e-05, "rewards/rejected": 1.5316421922761947e-05, "step": 120 }, { "epoch": 0.044796691936595454, "grad_norm": 1.7136356830596924, "learning_rate": 1.1187607573149742e-08, "logits/chosen": -3.1061851978302, "logits/rejected": -3.089826822280884, "logps/chosen": -53.56927490234375, "logps/rejected": -53.8600959777832, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 5.350602441467345e-05, "rewards/margins": 8.011364843696356e-05, "rewards/rejected": -2.6607634936226532e-05, "step": 130 }, { "epoch": 0.048242591316333565, "grad_norm": 1.8630205392837524, "learning_rate": 1.2048192771084337e-08, "logits/chosen": -3.0788228511810303, "logits/rejected": -3.0526034832000732, "logps/chosen": -55.3593864440918, "logps/rejected": -53.862342834472656, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 6.969690730329603e-05, "rewards/margins": 9.481948654865846e-05, "rewards/rejected": -2.5122590159298852e-05, "step": 140 }, { "epoch": 0.051688490696071676, "grad_norm": 1.626455545425415, "learning_rate": 1.2908777969018933e-08, "logits/chosen": -3.025050640106201, "logits/rejected": -3.0135657787323, "logps/chosen": -54.11144256591797, "logps/rejected": -54.102447509765625, "loss": 0.6932, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -2.0424617105163634e-05, "rewards/margins": -0.0001254867820534855, "rewards/rejected": 0.00010506215039640665, "step": 150 }, { "epoch": 0.05513439007580979, "grad_norm": 1.6855826377868652, "learning_rate": 1.3769363166953526e-08, "logits/chosen": -3.044309616088867, "logits/rejected": -3.02839732170105, "logps/chosen": -54.01361083984375, "logps/rejected": -51.27421188354492, "loss": 0.6931, "rewards/accuracies": 0.515625, "rewards/chosen": 8.679249958731816e-07, "rewards/margins": 6.541889160871506e-05, "rewards/rejected": -6.455096445279196e-05, "step": 160 }, { "epoch": 0.0585802894555479, "grad_norm": 1.666648507118225, "learning_rate": 1.4629948364888123e-08, "logits/chosen": -3.044278621673584, "logits/rejected": -3.0216360092163086, "logps/chosen": -53.77402877807617, "logps/rejected": -52.072715759277344, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 5.8081197494175285e-05, "rewards/margins": 5.6167882576119155e-05, "rewards/rejected": 1.913309006340569e-06, "step": 170 }, { "epoch": 0.06202618883528601, "grad_norm": 1.6648340225219727, "learning_rate": 1.5490533562822718e-08, "logits/chosen": -3.053169012069702, "logits/rejected": -3.0219600200653076, "logps/chosen": -55.412353515625, "logps/rejected": -52.04193115234375, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00013867543020751327, "rewards/margins": -7.246668974403292e-05, "rewards/rejected": -6.620875501539558e-05, "step": 180 }, { "epoch": 0.06547208821502412, "grad_norm": 1.527038812637329, "learning_rate": 1.6351118760757314e-08, "logits/chosen": -3.153803586959839, "logits/rejected": -3.126765489578247, "logps/chosen": -52.947410583496094, "logps/rejected": -51.77288055419922, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.9722734325332567e-05, "rewards/margins": 4.115318733965978e-05, "rewards/rejected": -7.087593985488638e-05, "step": 190 }, { "epoch": 0.06891798759476224, "grad_norm": 1.733069896697998, "learning_rate": 1.7211703958691908e-08, "logits/chosen": -3.0855889320373535, "logits/rejected": -3.064948081970215, "logps/chosen": -54.34423828125, "logps/rejected": -53.98308181762695, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 8.129735942929983e-05, "rewards/margins": 3.768491296796128e-05, "rewards/rejected": 4.361245737527497e-05, "step": 200 }, { "epoch": 0.06891798759476224, "eval_logits/chosen": -3.163177728652954, "eval_logits/rejected": -3.1575300693511963, "eval_logps/chosen": -58.711936950683594, "eval_logps/rejected": -63.17683410644531, "eval_loss": 0.6931644678115845, "eval_rewards/accuracies": 0.4862918257713318, "eval_rewards/chosen": -4.3645013647619635e-07, "eval_rewards/margins": -3.331492916913703e-05, "eval_rewards/rejected": 3.287848085165024e-05, "eval_runtime": 384.7152, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 200 }, { "epoch": 0.07236388697450034, "grad_norm": 1.7262409925460815, "learning_rate": 1.8072289156626504e-08, "logits/chosen": -3.0673792362213135, "logits/rejected": -3.0613677501678467, "logps/chosen": -52.46643829345703, "logps/rejected": -54.71671676635742, "loss": 0.6932, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -0.00010696313984226435, "rewards/margins": -0.00014883882249705493, "rewards/rejected": 4.187567901681177e-05, "step": 210 }, { "epoch": 0.07580978635423846, "grad_norm": 1.730053186416626, "learning_rate": 1.89328743545611e-08, "logits/chosen": -3.110975742340088, "logits/rejected": -3.086411237716675, "logps/chosen": -53.7027702331543, "logps/rejected": -53.711639404296875, "loss": 0.6931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -4.8385940317530185e-05, "rewards/margins": 2.1820562324137427e-05, "rewards/rejected": -7.020650082267821e-05, "step": 220 }, { "epoch": 0.07925568573397657, "grad_norm": 1.6545181274414062, "learning_rate": 1.9793459552495694e-08, "logits/chosen": -3.0402209758758545, "logits/rejected": -3.014207363128662, "logps/chosen": -56.14927291870117, "logps/rejected": -53.79438400268555, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.179082664748421e-05, "rewards/margins": -1.239746325154556e-05, "rewards/rejected": 6.066344440114335e-07, "step": 230 }, { "epoch": 0.08270158511371468, "grad_norm": 1.8088655471801758, "learning_rate": 2.065404475043029e-08, "logits/chosen": -3.0466361045837402, "logits/rejected": -3.0282115936279297, "logps/chosen": -53.76200485229492, "logps/rejected": -55.18682861328125, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -9.619267075322568e-05, "rewards/margins": -2.319897794222925e-05, "rewards/rejected": -7.2993672802113e-05, "step": 240 }, { "epoch": 0.08614748449345279, "grad_norm": 1.5655380487442017, "learning_rate": 2.1514629948364887e-08, "logits/chosen": -2.986396551132202, "logits/rejected": -2.9472384452819824, "logps/chosen": -57.795745849609375, "logps/rejected": -51.48908615112305, "loss": 0.6931, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.0001305347977904603, "rewards/margins": 0.00012307525321375579, "rewards/rejected": -0.0002536100219003856, "step": 250 }, { "epoch": 0.08959338387319091, "grad_norm": 1.5828660726547241, "learning_rate": 2.2375215146299484e-08, "logits/chosen": -3.03403902053833, "logits/rejected": -3.010531187057495, "logps/chosen": -57.34452438354492, "logps/rejected": -51.8224983215332, "loss": 0.6931, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -3.587424725992605e-05, "rewards/margins": 6.571992707904428e-05, "rewards/rejected": -0.00010159417433897033, "step": 260 }, { "epoch": 0.09303928325292901, "grad_norm": 1.5742228031158447, "learning_rate": 2.3235800344234077e-08, "logits/chosen": -3.047518491744995, "logits/rejected": -3.018275499343872, "logps/chosen": -54.280174255371094, "logps/rejected": -52.072166442871094, "loss": 0.6931, "rewards/accuracies": 0.515625, "rewards/chosen": -4.243188232067041e-05, "rewards/margins": 0.00016101889195851982, "rewards/rejected": -0.00020345079246908426, "step": 270 }, { "epoch": 0.09648518263266713, "grad_norm": 1.7620676755905151, "learning_rate": 2.4096385542168673e-08, "logits/chosen": -3.0869364738464355, "logits/rejected": -3.073918581008911, "logps/chosen": -52.85234451293945, "logps/rejected": -53.48882293701172, "loss": 0.6931, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.00011127178004244342, "rewards/margins": 0.00011203387839486822, "rewards/rejected": -0.0002233056875411421, "step": 280 }, { "epoch": 0.09993108201240523, "grad_norm": 1.469874620437622, "learning_rate": 2.495697074010327e-08, "logits/chosen": -3.0485854148864746, "logits/rejected": -3.041419267654419, "logps/chosen": -51.24589920043945, "logps/rejected": -53.544586181640625, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.00010780607408378273, "rewards/margins": 1.8475009710527956e-05, "rewards/rejected": -0.00012628106924239546, "step": 290 }, { "epoch": 0.10337698139214335, "grad_norm": 1.7838884592056274, "learning_rate": 2.5817555938037866e-08, "logits/chosen": -3.0377209186553955, "logits/rejected": -3.0146260261535645, "logps/chosen": -54.3782844543457, "logps/rejected": -55.74077224731445, "loss": 0.6931, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 3.057599315070547e-05, "rewards/margins": 0.00016555582988075912, "rewards/rejected": -0.00013497984036803246, "step": 300 }, { "epoch": 0.10337698139214335, "eval_logits/chosen": -3.1631579399108887, "eval_logits/rejected": -3.157500743865967, "eval_logps/chosen": -58.70083236694336, "eval_logps/rejected": -63.1627197265625, "eval_loss": 0.6931794285774231, "eval_rewards/accuracies": 0.4756040871143341, "eval_rewards/chosen": 0.00011061962868552655, "eval_rewards/margins": -6.344748544506729e-05, "eval_rewards/rejected": 0.00017406711413059384, "eval_runtime": 384.668, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 300 }, { "epoch": 0.10682288077188146, "grad_norm": 1.6819980144500732, "learning_rate": 2.667814113597246e-08, "logits/chosen": -3.069870710372925, "logits/rejected": -3.0552239418029785, "logps/chosen": -53.801971435546875, "logps/rejected": -53.27043914794922, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00010162549733649939, "rewards/margins": -3.66856183973141e-05, "rewards/rejected": -6.49398862151429e-05, "step": 310 }, { "epoch": 0.11026878015161957, "grad_norm": 1.8077443838119507, "learning_rate": 2.7538726333907053e-08, "logits/chosen": -3.116912841796875, "logits/rejected": -3.097919225692749, "logps/chosen": -53.39704513549805, "logps/rejected": -52.60417556762695, "loss": 0.6931, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -8.59591382322833e-05, "rewards/margins": 5.404234252637252e-05, "rewards/rejected": -0.00014000148803461343, "step": 320 }, { "epoch": 0.11371467953135768, "grad_norm": 1.6062852144241333, "learning_rate": 2.8399311531841653e-08, "logits/chosen": -3.0585761070251465, "logits/rejected": -3.056246042251587, "logps/chosen": -53.057029724121094, "logps/rejected": -53.562095642089844, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -5.562259320868179e-05, "rewards/margins": 0.00014037203800398856, "rewards/rejected": -0.0001959946530405432, "step": 330 }, { "epoch": 0.1171605789110958, "grad_norm": 1.6745755672454834, "learning_rate": 2.9259896729776246e-08, "logits/chosen": -3.0025081634521484, "logits/rejected": -2.9875540733337402, "logps/chosen": -53.48546600341797, "logps/rejected": -54.20134735107422, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00016055100422818214, "rewards/margins": -7.695326348766685e-05, "rewards/rejected": -8.359774074051529e-05, "step": 340 }, { "epoch": 0.1206064782908339, "grad_norm": 1.6029772758483887, "learning_rate": 3.012048192771084e-08, "logits/chosen": -3.1061463356018066, "logits/rejected": -3.0769925117492676, "logps/chosen": -57.422828674316406, "logps/rejected": -51.77228927612305, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.00017459427181165665, "rewards/margins": -6.433665112126619e-05, "rewards/rejected": -0.00011025762796634808, "step": 350 }, { "epoch": 0.12405237767057202, "grad_norm": 1.6956379413604736, "learning_rate": 3.0981067125645436e-08, "logits/chosen": -3.044156551361084, "logits/rejected": -3.027600049972534, "logps/chosen": -53.99070358276367, "logps/rejected": -54.564674377441406, "loss": 0.6931, "rewards/accuracies": 0.546875, "rewards/chosen": -7.801742322044447e-05, "rewards/margins": 9.001044963952154e-05, "rewards/rejected": -0.00016802789468783885, "step": 360 }, { "epoch": 0.12749827705031014, "grad_norm": 1.6766176223754883, "learning_rate": 3.184165232358003e-08, "logits/chosen": -3.0830063819885254, "logits/rejected": -3.0575692653656006, "logps/chosen": -55.62574005126953, "logps/rejected": -53.142921447753906, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1775503480748739e-05, "rewards/margins": 0.00031634545302949846, "rewards/rejected": -0.0003281210083514452, "step": 370 }, { "epoch": 0.13094417643004824, "grad_norm": 1.7956558465957642, "learning_rate": 3.270223752151463e-08, "logits/chosen": -3.122870922088623, "logits/rejected": -3.089454412460327, "logps/chosen": -55.22282791137695, "logps/rejected": -51.79097366333008, "loss": 0.693, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 1.655972118896898e-05, "rewards/margins": 0.0003105142677668482, "rewards/rejected": -0.00029395456658676267, "step": 380 }, { "epoch": 0.13439007580978635, "grad_norm": 1.6882342100143433, "learning_rate": 3.356282271944922e-08, "logits/chosen": -3.0961623191833496, "logits/rejected": -3.068988084793091, "logps/chosen": -53.05096435546875, "logps/rejected": -51.719261169433594, "loss": 0.6931, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.00021331440075300634, "rewards/margins": 8.027394505916163e-05, "rewards/rejected": -0.0002935883530881256, "step": 390 }, { "epoch": 0.13783597518952448, "grad_norm": 1.7367823123931885, "learning_rate": 3.4423407917383815e-08, "logits/chosen": -3.04445743560791, "logits/rejected": -3.0147831439971924, "logps/chosen": -54.4653205871582, "logps/rejected": -54.02545166015625, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00013222855341155082, "rewards/margins": 0.00030294861062429845, "rewards/rejected": -0.00043517714948393404, "step": 400 }, { "epoch": 0.13783597518952448, "eval_logits/chosen": -3.1628847122192383, "eval_logits/rejected": -3.1572272777557373, "eval_logps/chosen": -58.69401550292969, "eval_logps/rejected": -63.163734436035156, "eval_loss": 0.6931403875350952, "eval_rewards/accuracies": 0.5006970167160034, "eval_rewards/chosen": 0.00017883002874441445, "eval_rewards/margins": 1.4981026652094442e-05, "eval_rewards/rejected": 0.00016384897753596306, "eval_runtime": 384.3938, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 400 }, { "epoch": 0.14128187456926258, "grad_norm": 1.5201988220214844, "learning_rate": 3.5283993115318415e-08, "logits/chosen": -3.0868654251098633, "logits/rejected": -3.059985637664795, "logps/chosen": -54.123985290527344, "logps/rejected": -53.197288513183594, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.00031447256333194673, "rewards/margins": 0.00020825877436436713, "rewards/rejected": -0.000522731221280992, "step": 410 }, { "epoch": 0.1447277739490007, "grad_norm": 1.7084550857543945, "learning_rate": 3.614457831325301e-08, "logits/chosen": -3.091403007507324, "logits/rejected": -3.070308208465576, "logps/chosen": -54.363189697265625, "logps/rejected": -51.84135055541992, "loss": 0.6929, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.0002584571484476328, "rewards/margins": 0.0005626108613796532, "rewards/rejected": -0.0008210679516196251, "step": 420 }, { "epoch": 0.1481736733287388, "grad_norm": 1.7150694131851196, "learning_rate": 3.70051635111876e-08, "logits/chosen": -3.031503200531006, "logits/rejected": -3.0161654949188232, "logps/chosen": -51.86822509765625, "logps/rejected": -53.66876220703125, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0002805610129144043, "rewards/margins": 0.00019659681129269302, "rewards/rejected": -0.00047715791151858866, "step": 430 }, { "epoch": 0.15161957270847692, "grad_norm": 1.5232292413711548, "learning_rate": 3.78657487091222e-08, "logits/chosen": -3.075045108795166, "logits/rejected": -3.061749219894409, "logps/chosen": -51.54164505004883, "logps/rejected": -52.585777282714844, "loss": 0.6929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00025903843925334513, "rewards/margins": 0.00041876602335833013, "rewards/rejected": -0.0006778044044040143, "step": 440 }, { "epoch": 0.15506547208821503, "grad_norm": 1.5611236095428467, "learning_rate": 3.8726333907056795e-08, "logits/chosen": -3.070613145828247, "logits/rejected": -3.0467374324798584, "logps/chosen": -56.33530807495117, "logps/rejected": -53.32719802856445, "loss": 0.693, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.0002340213832212612, "rewards/margins": 0.0003091735125053674, "rewards/rejected": -0.0005431949393823743, "step": 450 }, { "epoch": 0.15851137146795313, "grad_norm": 1.6240078210830688, "learning_rate": 3.958691910499139e-08, "logits/chosen": -3.0725510120391846, "logits/rejected": -3.0467612743377686, "logps/chosen": -52.70389938354492, "logps/rejected": -50.810211181640625, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0002929639595095068, "rewards/margins": 0.00025620352244004607, "rewards/rejected": -0.0005491675110533834, "step": 460 }, { "epoch": 0.16195727084769124, "grad_norm": 1.5414916276931763, "learning_rate": 4.044750430292599e-08, "logits/chosen": -3.1445844173431396, "logits/rejected": -3.1178226470947266, "logps/chosen": -56.37430953979492, "logps/rejected": -54.76704788208008, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.00029713130788877606, "rewards/margins": 0.0004932652227580547, "rewards/rejected": -0.0007903965306468308, "step": 470 }, { "epoch": 0.16540317022742937, "grad_norm": 1.7192236185073853, "learning_rate": 4.130808950086058e-08, "logits/chosen": -2.9115607738494873, "logits/rejected": -2.9044690132141113, "logps/chosen": -53.07261276245117, "logps/rejected": -55.70893478393555, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0006627115653827786, "rewards/margins": 0.00027101318119093776, "rewards/rejected": -0.0009337246301583946, "step": 480 }, { "epoch": 0.16884906960716747, "grad_norm": 1.8793227672576904, "learning_rate": 4.216867469879518e-08, "logits/chosen": -3.1179113388061523, "logits/rejected": -3.0901989936828613, "logps/chosen": -58.39832305908203, "logps/rejected": -53.71763229370117, "loss": 0.693, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0004218885151203722, "rewards/margins": 0.000330808776197955, "rewards/rejected": -0.000752697407733649, "step": 490 }, { "epoch": 0.17229496898690558, "grad_norm": 1.5864213705062866, "learning_rate": 4.3029259896729774e-08, "logits/chosen": -3.0117759704589844, "logits/rejected": -2.9860451221466064, "logps/chosen": -55.77704620361328, "logps/rejected": -52.258628845214844, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0005830780719406903, "rewards/margins": 0.00019635938224382699, "rewards/rejected": -0.0007794374832883477, "step": 500 }, { "epoch": 0.17229496898690558, "eval_logits/chosen": -3.16249942779541, "eval_logits/rejected": -3.1568548679351807, "eval_logps/chosen": -58.682533264160156, "eval_logps/rejected": -63.158958435058594, "eval_loss": 0.6931070685386658, "eval_rewards/accuracies": 0.494191437959671, "eval_rewards/chosen": 0.0002936015371233225, "eval_rewards/margins": 8.19716660771519e-05, "eval_rewards/rejected": 0.0002116298710461706, "eval_runtime": 384.608, "eval_samples_per_second": 11.191, "eval_steps_per_second": 1.399, "step": 500 }, { "epoch": 0.17574086836664368, "grad_norm": 1.7202733755111694, "learning_rate": 4.388984509466437e-08, "logits/chosen": -3.015113353729248, "logits/rejected": -3.002572536468506, "logps/chosen": -55.490325927734375, "logps/rejected": -56.18473434448242, "loss": 0.6928, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.0006010312354192138, "rewards/margins": 0.0006601332570426166, "rewards/rejected": -0.0012611645506694913, "step": 510 }, { "epoch": 0.17918676774638181, "grad_norm": 1.5302149057388306, "learning_rate": 4.475043029259897e-08, "logits/chosen": -3.113670825958252, "logits/rejected": -3.0949208736419678, "logps/chosen": -53.60992431640625, "logps/rejected": -53.83058547973633, "loss": 0.6929, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0005134848761372268, "rewards/margins": 0.0005421391688287258, "rewards/rejected": -0.0010556241031736135, "step": 520 }, { "epoch": 0.18263266712611992, "grad_norm": 1.6602168083190918, "learning_rate": 4.561101549053356e-08, "logits/chosen": -2.997028112411499, "logits/rejected": -2.9696555137634277, "logps/chosen": -56.8779296875, "logps/rejected": -52.88819122314453, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0005577536066994071, "rewards/margins": 0.0005765163223259151, "rewards/rejected": -0.001134270103648305, "step": 530 }, { "epoch": 0.18607856650585802, "grad_norm": 1.5818272829055786, "learning_rate": 4.6471600688468154e-08, "logits/chosen": -3.134260892868042, "logits/rejected": -3.105567455291748, "logps/chosen": -55.92725372314453, "logps/rejected": -52.12944412231445, "loss": 0.6928, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0006873260135762393, "rewards/margins": 0.0007128252182155848, "rewards/rejected": -0.0014001511735841632, "step": 540 }, { "epoch": 0.18952446588559613, "grad_norm": 1.5802710056304932, "learning_rate": 4.7332185886402753e-08, "logits/chosen": -3.032979965209961, "logits/rejected": -3.027398109436035, "logps/chosen": -51.641571044921875, "logps/rejected": -53.729759216308594, "loss": 0.693, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.0007201815606094897, "rewards/margins": 0.0003699318622238934, "rewards/rejected": -0.001090113422833383, "step": 550 }, { "epoch": 0.19297036526533426, "grad_norm": 1.6604018211364746, "learning_rate": 4.8192771084337347e-08, "logits/chosen": -3.0785210132598877, "logits/rejected": -3.0737929344177246, "logps/chosen": -54.71602249145508, "logps/rejected": -55.244483947753906, "loss": 0.6928, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0008259072201326489, "rewards/margins": 0.0006385392043739557, "rewards/rejected": -0.0014644463080912828, "step": 560 }, { "epoch": 0.19641626464507236, "grad_norm": 1.7432787418365479, "learning_rate": 4.905335628227194e-08, "logits/chosen": -3.1137382984161377, "logits/rejected": -3.084575891494751, "logps/chosen": -54.5408821105957, "logps/rejected": -53.760047912597656, "loss": 0.6927, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0008401373634114861, "rewards/margins": 0.0008745190571062267, "rewards/rejected": -0.0017146564787253737, "step": 570 }, { "epoch": 0.19986216402481047, "grad_norm": 1.535184383392334, "learning_rate": 4.991394148020654e-08, "logits/chosen": -3.0551018714904785, "logits/rejected": -3.041159152984619, "logps/chosen": -53.745262145996094, "logps/rejected": -54.66472625732422, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": -0.0005425583804026246, "rewards/margins": 0.0009367944439873099, "rewards/rejected": -0.0014793528243899345, "step": 580 }, { "epoch": 0.2033080634045486, "grad_norm": 1.7447534799575806, "learning_rate": 4.9999633685875244e-08, "logits/chosen": -2.9614148139953613, "logits/rejected": -2.9393579959869385, "logps/chosen": -52.52238845825195, "logps/rejected": -53.02446365356445, "loss": 0.6926, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0008184127509593964, "rewards/margins": 0.0010490169515833259, "rewards/rejected": -0.0018674297025427222, "step": 590 }, { "epoch": 0.2067539627842867, "grad_norm": 1.4818027019500732, "learning_rate": 4.9998367428608654e-08, "logits/chosen": -3.0609617233276367, "logits/rejected": -3.0357613563537598, "logps/chosen": -56.0200309753418, "logps/rejected": -50.999908447265625, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0008805854013189673, "rewards/margins": 0.0007603298290632665, "rewards/rejected": -0.0016409152885898948, "step": 600 }, { "epoch": 0.2067539627842867, "eval_logits/chosen": -3.1612586975097656, "eval_logits/rejected": -3.155609369277954, "eval_logps/chosen": -58.64760208129883, "eval_logps/rejected": -63.13199234008789, "eval_loss": 0.693067729473114, "eval_rewards/accuracies": 0.5023234486579895, "eval_rewards/chosen": 0.0006429245695471764, "eval_rewards/margins": 0.00016164187400136143, "eval_rewards/rejected": 0.0004812826809938997, "eval_runtime": 384.5126, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 600 }, { "epoch": 0.2101998621640248, "grad_norm": 1.5632553100585938, "learning_rate": 4.999619675160485e-08, "logits/chosen": -3.081904649734497, "logits/rejected": -3.0513620376586914, "logps/chosen": -53.667579650878906, "logps/rejected": -52.85166549682617, "loss": 0.6926, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.000827993091661483, "rewards/margins": 0.00119282235391438, "rewards/rejected": -0.002020815387368202, "step": 610 }, { "epoch": 0.2136457615437629, "grad_norm": 1.629331111907959, "learning_rate": 4.999312173339707e-08, "logits/chosen": -3.0883796215057373, "logits/rejected": -3.0587565898895264, "logps/chosen": -54.1837272644043, "logps/rejected": -52.726890563964844, "loss": 0.6926, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0008975326200015843, "rewards/margins": 0.0011468585580587387, "rewards/rejected": -0.0020443913526833057, "step": 620 }, { "epoch": 0.21709166092350105, "grad_norm": 1.5568369626998901, "learning_rate": 4.998914248523688e-08, "logits/chosen": -3.063368558883667, "logits/rejected": -3.0294599533081055, "logps/chosen": -53.4654655456543, "logps/rejected": -50.96184539794922, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0012578107416629791, "rewards/margins": 0.0010019788751378655, "rewards/rejected": -0.002259789500385523, "step": 630 }, { "epoch": 0.22053756030323915, "grad_norm": 1.6668413877487183, "learning_rate": 4.998425915109009e-08, "logits/chosen": -3.084184169769287, "logits/rejected": -3.084564685821533, "logps/chosen": -51.63975143432617, "logps/rejected": -57.51726531982422, "loss": 0.6927, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0011606714688241482, "rewards/margins": 0.0009209074196405709, "rewards/rejected": -0.002081578830257058, "step": 640 }, { "epoch": 0.22398345968297725, "grad_norm": 1.5113067626953125, "learning_rate": 4.9978471907631604e-08, "logits/chosen": -3.0601143836975098, "logits/rejected": -3.0378782749176025, "logps/chosen": -52.5632438659668, "logps/rejected": -52.316322326660156, "loss": 0.6927, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0015532078687101603, "rewards/margins": 0.0008993824012577534, "rewards/rejected": -0.00245259003713727, "step": 650 }, { "epoch": 0.22742935906271536, "grad_norm": 1.772717833518982, "learning_rate": 4.9971780964238976e-08, "logits/chosen": -3.086937427520752, "logits/rejected": -3.0564024448394775, "logps/chosen": -54.497314453125, "logps/rejected": -50.39165496826172, "loss": 0.6922, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0011715441942214966, "rewards/margins": 0.00196392135694623, "rewards/rejected": -0.0031354655511677265, "step": 660 }, { "epoch": 0.2308752584424535, "grad_norm": 1.6012787818908691, "learning_rate": 4.996418656298486e-08, "logits/chosen": -3.0736892223358154, "logits/rejected": -3.0475335121154785, "logps/chosen": -55.251243591308594, "logps/rejected": -51.75853729248047, "loss": 0.6922, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0007274464005604386, "rewards/margins": 0.0018245524261146784, "rewards/rejected": -0.002551998710259795, "step": 670 }, { "epoch": 0.2343211578221916, "grad_norm": 1.6301146745681763, "learning_rate": 4.995568897862825e-08, "logits/chosen": -3.0388023853302, "logits/rejected": -3.020338773727417, "logps/chosen": -54.80192184448242, "logps/rejected": -54.8926887512207, "loss": 0.6926, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.001500314916484058, "rewards/margins": 0.0011322436621412635, "rewards/rejected": -0.002632558811455965, "step": 680 }, { "epoch": 0.2377670572019297, "grad_norm": 1.5935677289962769, "learning_rate": 4.994628851860456e-08, "logits/chosen": -3.0772476196289062, "logits/rejected": -3.058328866958618, "logps/chosen": -53.629234313964844, "logps/rejected": -52.86656951904297, "loss": 0.6922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.001254276023246348, "rewards/margins": 0.0018880158895626664, "rewards/rejected": -0.003142292145639658, "step": 690 }, { "epoch": 0.2412129565816678, "grad_norm": 1.5870788097381592, "learning_rate": 4.993598552301446e-08, "logits/chosen": -3.0830237865448, "logits/rejected": -3.056472063064575, "logps/chosen": -56.46472930908203, "logps/rejected": -53.47216033935547, "loss": 0.692, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.0011341057252138853, "rewards/margins": 0.0022793817333877087, "rewards/rejected": -0.0034134879242628813, "step": 700 }, { "epoch": 0.2412129565816678, "eval_logits/chosen": -3.1599061489105225, "eval_logits/rejected": -3.1542632579803467, "eval_logps/chosen": -58.6091423034668, "eval_logps/rejected": -63.11530685424805, "eval_loss": 0.6929602026939392, "eval_rewards/accuracies": 0.5413568615913391, "eval_rewards/chosen": 0.0010274943197146058, "eval_rewards/margins": 0.00037930175312794745, "eval_rewards/rejected": 0.0006481926538981497, "eval_runtime": 384.5168, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 700 }, { "epoch": 0.24465885596140594, "grad_norm": 1.4943526983261108, "learning_rate": 4.9924780364611574e-08, "logits/chosen": -3.0165517330169678, "logits/rejected": -3.013462781906128, "logps/chosen": -52.749977111816406, "logps/rejected": -54.24238204956055, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0018606961239129305, "rewards/margins": 0.0011449294397607446, "rewards/rejected": -0.0030056254472583532, "step": 710 }, { "epoch": 0.24810475534114404, "grad_norm": 1.5908006429672241, "learning_rate": 4.9912673448789055e-08, "logits/chosen": -3.0585460662841797, "logits/rejected": -3.0370562076568604, "logps/chosen": -52.21095657348633, "logps/rejected": -52.79679489135742, "loss": 0.6922, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0018738650251179934, "rewards/margins": 0.001822577090933919, "rewards/rejected": -0.003696442348882556, "step": 720 }, { "epoch": 0.25155065472088217, "grad_norm": 1.6703251600265503, "learning_rate": 4.989966521356484e-08, "logits/chosen": -3.0375118255615234, "logits/rejected": -3.00249981880188, "logps/chosen": -53.77020263671875, "logps/rejected": -51.606651306152344, "loss": 0.692, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0018283795798197389, "rewards/margins": 0.0023119805846363306, "rewards/rejected": -0.004140359815210104, "step": 730 }, { "epoch": 0.2549965541006203, "grad_norm": 1.5016419887542725, "learning_rate": 4.9885756129565855e-08, "logits/chosen": -3.187788963317871, "logits/rejected": -3.1516430377960205, "logps/chosen": -54.3302116394043, "logps/rejected": -53.6005859375, "loss": 0.692, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.0016302086878567934, "rewards/margins": 0.0022917932365089655, "rewards/rejected": -0.003922001924365759, "step": 740 }, { "epoch": 0.2584424534803584, "grad_norm": 1.652618169784546, "learning_rate": 4.9870946700010963e-08, "logits/chosen": -3.0573348999023438, "logits/rejected": -3.043586254119873, "logps/chosen": -53.8933219909668, "logps/rejected": -53.5998649597168, "loss": 0.692, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.002071264898404479, "rewards/margins": 0.002245786366984248, "rewards/rejected": -0.0043170517310500145, "step": 750 }, { "epoch": 0.2618883528600965, "grad_norm": 1.5802987813949585, "learning_rate": 4.985523746069277e-08, "logits/chosen": -3.0178141593933105, "logits/rejected": -2.99599289894104, "logps/chosen": -55.6571159362793, "logps/rejected": -52.568031311035156, "loss": 0.6923, "rewards/accuracies": 0.578125, "rewards/chosen": -0.002288275398313999, "rewards/margins": 0.001769710099324584, "rewards/rejected": -0.004057985730469227, "step": 760 }, { "epoch": 0.2653342522398346, "grad_norm": 1.6327557563781738, "learning_rate": 4.9838628979958226e-08, "logits/chosen": -3.025636911392212, "logits/rejected": -3.001251697540283, "logps/chosen": -54.04003143310547, "logps/rejected": -51.364707946777344, "loss": 0.6919, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.002429876709356904, "rewards/margins": 0.0024292119778692722, "rewards/rejected": -0.00485908892005682, "step": 770 }, { "epoch": 0.2687801516195727, "grad_norm": 1.6433818340301514, "learning_rate": 4.982112185868809e-08, "logits/chosen": -3.016303300857544, "logits/rejected": -2.9975686073303223, "logps/chosen": -52.39423751831055, "logps/rejected": -51.11736297607422, "loss": 0.692, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.002856833627447486, "rewards/margins": 0.002312553348019719, "rewards/rejected": -0.005169386975467205, "step": 780 }, { "epoch": 0.2722260509993108, "grad_norm": 1.7514872550964355, "learning_rate": 4.980271673027517e-08, "logits/chosen": -3.042994737625122, "logits/rejected": -3.038914442062378, "logps/chosen": -52.71642303466797, "logps/rejected": -55.53397750854492, "loss": 0.6923, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0030075139366090298, "rewards/margins": 0.0017282769549638033, "rewards/rejected": -0.004735790658742189, "step": 790 }, { "epoch": 0.27567195037904896, "grad_norm": 1.5455923080444336, "learning_rate": 4.9783414260601395e-08, "logits/chosen": -3.052412509918213, "logits/rejected": -3.0258965492248535, "logps/chosen": -53.40227127075195, "logps/rejected": -53.219749450683594, "loss": 0.6923, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.0024742651730775833, "rewards/margins": 0.00176246277987957, "rewards/rejected": -0.004236727952957153, "step": 800 }, { "epoch": 0.27567195037904896, "eval_logits/chosen": -3.158536672592163, "eval_logits/rejected": -3.1529228687286377, "eval_logps/chosen": -58.58611297607422, "eval_logps/rejected": -63.12191390991211, "eval_loss": 0.6928143501281738, "eval_rewards/accuracies": 0.5587825179100037, "eval_rewards/chosen": 0.0012578194728121161, "eval_rewards/margins": 0.000675736868288368, "eval_rewards/rejected": 0.0005820823716931045, "eval_runtime": 384.4564, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 800 }, { "epoch": 0.27911784975878706, "grad_norm": 1.6539554595947266, "learning_rate": 4.976321514801376e-08, "logits/chosen": -3.0484116077423096, "logits/rejected": -3.0248422622680664, "logps/chosen": -53.9824333190918, "logps/rejected": -56.396575927734375, "loss": 0.6922, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0031535557936877012, "rewards/margins": 0.001993196550756693, "rewards/rejected": -0.0051467521116137505, "step": 810 }, { "epoch": 0.28256374913852517, "grad_norm": 1.732709288597107, "learning_rate": 4.974212012329902e-08, "logits/chosen": -3.1013197898864746, "logits/rejected": -3.0694527626037598, "logps/chosen": -56.43525314331055, "logps/rejected": -52.12022018432617, "loss": 0.6913, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0020319526083767414, "rewards/margins": 0.003823560895398259, "rewards/rejected": -0.005855513270944357, "step": 820 }, { "epoch": 0.28600964851826327, "grad_norm": 1.6947790384292603, "learning_rate": 4.97201299496573e-08, "logits/chosen": -3.1101901531219482, "logits/rejected": -3.0727791786193848, "logps/chosen": -56.2694206237793, "logps/rejected": -52.88818359375, "loss": 0.6913, "rewards/accuracies": 0.671875, "rewards/chosen": -0.0016542660305276513, "rewards/margins": 0.0037212413735687733, "rewards/rejected": -0.005375507287681103, "step": 830 }, { "epoch": 0.2894555478980014, "grad_norm": 1.650404453277588, "learning_rate": 4.969724542267442e-08, "logits/chosen": -3.0947813987731934, "logits/rejected": -3.0706396102905273, "logps/chosen": -55.293067932128906, "logps/rejected": -55.382408142089844, "loss": 0.6914, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.002315860940143466, "rewards/margins": 0.003439708147197962, "rewards/rejected": -0.005755568854510784, "step": 840 }, { "epoch": 0.2929014472777395, "grad_norm": 1.7584972381591797, "learning_rate": 4.967346737029316e-08, "logits/chosen": -3.0069823265075684, "logits/rejected": -3.001641273498535, "logps/chosen": -52.9250602722168, "logps/rejected": -54.31984329223633, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003708853619173169, "rewards/margins": 0.001558844349347055, "rewards/rejected": -0.005267697852104902, "step": 850 }, { "epoch": 0.2963473466574776, "grad_norm": 1.7942557334899902, "learning_rate": 4.964879665278331e-08, "logits/chosen": -3.0983834266662598, "logits/rejected": -3.0633091926574707, "logps/chosen": -57.68050003051758, "logps/rejected": -53.3126106262207, "loss": 0.6918, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.00262661837041378, "rewards/margins": 0.0028027433436363935, "rewards/rejected": -0.005429361946880817, "step": 860 }, { "epoch": 0.2997932460372157, "grad_norm": 1.6487774848937988, "learning_rate": 4.9623234162710505e-08, "logits/chosen": -3.072366952896118, "logits/rejected": -3.0587127208709717, "logps/chosen": -53.41727828979492, "logps/rejected": -53.83111572265625, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003994829952716827, "rewards/margins": 0.001481076586060226, "rewards/rejected": -0.005475906189531088, "step": 870 }, { "epoch": 0.30323914541695385, "grad_norm": 1.6196177005767822, "learning_rate": 4.959678082490396e-08, "logits/chosen": -3.0850799083709717, "logits/rejected": -3.0610485076904297, "logps/chosen": -55.864410400390625, "logps/rejected": -55.08472442626953, "loss": 0.6915, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0025982023216784, "rewards/margins": 0.0033730953000485897, "rewards/rejected": -0.00597129762172699, "step": 880 }, { "epoch": 0.30668504479669195, "grad_norm": 1.7501091957092285, "learning_rate": 4.9569437596423006e-08, "logits/chosen": -3.0634050369262695, "logits/rejected": -3.0458738803863525, "logps/chosen": -55.1644401550293, "logps/rejected": -54.405372619628906, "loss": 0.6917, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.004143073223531246, "rewards/margins": 0.002887632232159376, "rewards/rejected": -0.007030704524368048, "step": 890 }, { "epoch": 0.31013094417643006, "grad_norm": 1.6706045866012573, "learning_rate": 4.954120546652246e-08, "logits/chosen": -3.1494853496551514, "logits/rejected": -3.1262238025665283, "logps/chosen": -52.21024703979492, "logps/rejected": -52.95808792114258, "loss": 0.6912, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.0031147089321166277, "rewards/margins": 0.0038592598866671324, "rewards/rejected": -0.006973968353122473, "step": 900 }, { "epoch": 0.31013094417643006, "eval_logits/chosen": -3.1557774543762207, "eval_logits/rejected": -3.1501119136810303, "eval_logps/chosen": -58.54638671875, "eval_logps/rejected": -63.11029815673828, "eval_loss": 0.6926776766777039, "eval_rewards/accuracies": 0.5659851431846619, "eval_rewards/chosen": 0.0016550758155062795, "eval_rewards/margins": 0.0009568364475853741, "eval_rewards/rejected": 0.0006982393097132444, "eval_runtime": 384.6781, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 900 }, { "epoch": 0.31357684355616816, "grad_norm": 1.5960333347320557, "learning_rate": 4.9512085456616845e-08, "logits/chosen": -3.119786024093628, "logits/rejected": -3.0845987796783447, "logps/chosen": -56.18413162231445, "logps/rejected": -52.79380416870117, "loss": 0.6916, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.0029451637528836727, "rewards/margins": 0.003129506018012762, "rewards/rejected": -0.006074669770896435, "step": 910 }, { "epoch": 0.31702274293590627, "grad_norm": 1.6668741703033447, "learning_rate": 4.948207862024345e-08, "logits/chosen": -3.104740619659424, "logits/rejected": -3.0957703590393066, "logps/chosen": -55.60581588745117, "logps/rejected": -55.4895133972168, "loss": 0.6923, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.004241111688315868, "rewards/margins": 0.0017586927860975266, "rewards/rejected": -0.005999804940074682, "step": 920 }, { "epoch": 0.32046864231564437, "grad_norm": 1.724545955657959, "learning_rate": 4.9451186043024136e-08, "logits/chosen": -3.041184902191162, "logits/rejected": -3.02308988571167, "logps/chosen": -55.5092887878418, "logps/rejected": -54.970558166503906, "loss": 0.6907, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.0025846255011856556, "rewards/margins": 0.004906138870865107, "rewards/rejected": -0.007490763906389475, "step": 930 }, { "epoch": 0.3239145416953825, "grad_norm": 1.652547001838684, "learning_rate": 4.941940884262618e-08, "logits/chosen": -3.0771613121032715, "logits/rejected": -3.045358896255493, "logps/chosen": -54.669090270996094, "logps/rejected": -53.556488037109375, "loss": 0.6902, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0033632400445640087, "rewards/margins": 0.0059061916545033455, "rewards/rejected": -0.009269431233406067, "step": 940 }, { "epoch": 0.32736044107512063, "grad_norm": 1.7784395217895508, "learning_rate": 4.938674816872173e-08, "logits/chosen": -3.0897176265716553, "logits/rejected": -3.070265054702759, "logps/chosen": -55.5160026550293, "logps/rejected": -54.05512237548828, "loss": 0.691, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.00267231953330338, "rewards/margins": 0.00433064391836524, "rewards/rejected": -0.007002964615821838, "step": 950 }, { "epoch": 0.33080634045485874, "grad_norm": 1.64098060131073, "learning_rate": 4.935320520294628e-08, "logits/chosen": -3.0121991634368896, "logits/rejected": -2.980740785598755, "logps/chosen": -55.34636306762695, "logps/rejected": -54.396728515625, "loss": 0.6908, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.004281845409423113, "rewards/margins": 0.004687915090471506, "rewards/rejected": -0.008969759568572044, "step": 960 }, { "epoch": 0.33425223983459684, "grad_norm": 1.5851314067840576, "learning_rate": 4.931878115885591e-08, "logits/chosen": -3.029776096343994, "logits/rejected": -3.0019984245300293, "logps/chosen": -52.888648986816406, "logps/rejected": -52.870269775390625, "loss": 0.6909, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.004734398797154427, "rewards/margins": 0.004596917890012264, "rewards/rejected": -0.009331315755844116, "step": 970 }, { "epoch": 0.33769813921433495, "grad_norm": 1.506502389907837, "learning_rate": 4.9283477281883315e-08, "logits/chosen": -3.05110764503479, "logits/rejected": -3.0407040119171143, "logps/chosen": -53.92938232421875, "logps/rejected": -55.473533630371094, "loss": 0.6915, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0049580661579966545, "rewards/margins": 0.00340564688667655, "rewards/rejected": -0.008363713510334492, "step": 980 }, { "epoch": 0.34114403859407305, "grad_norm": 1.5995771884918213, "learning_rate": 4.9247294849292856e-08, "logits/chosen": -3.052551507949829, "logits/rejected": -3.035020589828491, "logps/chosen": -56.33484649658203, "logps/rejected": -53.77924346923828, "loss": 0.6913, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.004676403012126684, "rewards/margins": 0.0036912120413035154, "rewards/rejected": -0.008367614820599556, "step": 990 }, { "epoch": 0.34458993797381116, "grad_norm": 1.676594614982605, "learning_rate": 4.9210235170134244e-08, "logits/chosen": -3.0952134132385254, "logits/rejected": -3.080845832824707, "logps/chosen": -50.99755096435547, "logps/rejected": -53.95912551879883, "loss": 0.6909, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.005998981185257435, "rewards/margins": 0.004682415165007114, "rewards/rejected": -0.01068139635026455, "step": 1000 }, { "epoch": 0.34458993797381116, "eval_logits/chosen": -3.1538045406341553, "eval_logits/rejected": -3.148146629333496, "eval_logps/chosen": -58.5271110534668, "eval_logps/rejected": -63.12847900390625, "eval_loss": 0.6924968361854553, "eval_rewards/accuracies": 0.5645910501480103, "eval_rewards/chosen": 0.0018478184938430786, "eval_rewards/margins": 0.00133140804246068, "eval_rewards/rejected": 0.0005164103349670768, "eval_runtime": 384.1578, "eval_samples_per_second": 11.204, "eval_steps_per_second": 1.4, "step": 1000 }, { "epoch": 0.34803583735354926, "grad_norm": 1.7784764766693115, "learning_rate": 4.917229958519526e-08, "logits/chosen": -3.0560386180877686, "logits/rejected": -3.0308563709259033, "logps/chosen": -53.219703674316406, "logps/rejected": -54.31676483154297, "loss": 0.6903, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005041136406362057, "rewards/margins": 0.005721528083086014, "rewards/rejected": -0.010762663558125496, "step": 1010 }, { "epoch": 0.35148173673328736, "grad_norm": 1.6811140775680542, "learning_rate": 4.9133489466953204e-08, "logits/chosen": -3.0694663524627686, "logits/rejected": -3.0522894859313965, "logps/chosen": -55.4501838684082, "logps/rejected": -55.52228546142578, "loss": 0.6913, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.005660675000399351, "rewards/margins": 0.0038871937431395054, "rewards/rejected": -0.009547867812216282, "step": 1020 }, { "epoch": 0.3549276361130255, "grad_norm": 1.588110089302063, "learning_rate": 4.909380621952524e-08, "logits/chosen": -3.112725257873535, "logits/rejected": -3.081007957458496, "logps/chosen": -53.6580696105957, "logps/rejected": -53.864463806152344, "loss": 0.6908, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0038233071099966764, "rewards/margins": 0.004857801832258701, "rewards/rejected": -0.008681108243763447, "step": 1030 }, { "epoch": 0.35837353549276363, "grad_norm": 1.6203489303588867, "learning_rate": 4.9053251278617604e-08, "logits/chosen": -3.0942468643188477, "logits/rejected": -3.0676419734954834, "logps/chosen": -54.168235778808594, "logps/rejected": -53.65093231201172, "loss": 0.6914, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -0.00587734580039978, "rewards/margins": 0.0035653274971991777, "rewards/rejected": -0.00944267213344574, "step": 1040 }, { "epoch": 0.36181943487250173, "grad_norm": 1.779880166053772, "learning_rate": 4.9011826111473685e-08, "logits/chosen": -3.063671112060547, "logits/rejected": -3.050913095474243, "logps/chosen": -55.40948486328125, "logps/rejected": -53.851905822753906, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": -0.0053887562826275826, "rewards/margins": 0.003592419670894742, "rewards/rejected": -0.008981176652014256, "step": 1050 }, { "epoch": 0.36526533425223984, "grad_norm": 1.7278673648834229, "learning_rate": 4.89695322168209e-08, "logits/chosen": -3.0266060829162598, "logits/rejected": -3.0144662857055664, "logps/chosen": -51.79481887817383, "logps/rejected": -54.04804611206055, "loss": 0.691, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005571847315877676, "rewards/margins": 0.004314024467021227, "rewards/rejected": -0.009885871782898903, "step": 1060 }, { "epoch": 0.36871123363197794, "grad_norm": 1.6478562355041504, "learning_rate": 4.89263711248165e-08, "logits/chosen": -2.9854063987731934, "logits/rejected": -2.97200608253479, "logps/chosen": -52.4482307434082, "logps/rejected": -54.30084991455078, "loss": 0.6914, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.008854800835251808, "rewards/margins": 0.0037074810825288296, "rewards/rejected": -0.012562280520796776, "step": 1070 }, { "epoch": 0.37215713301171605, "grad_norm": 1.499592900276184, "learning_rate": 4.8882344396992184e-08, "logits/chosen": -3.009160041809082, "logits/rejected": -2.978325843811035, "logps/chosen": -55.19956588745117, "logps/rejected": -49.916481018066406, "loss": 0.6899, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.004767565988004208, "rewards/margins": 0.006583952344954014, "rewards/rejected": -0.011351518332958221, "step": 1080 }, { "epoch": 0.37560303239145415, "grad_norm": 1.8961305618286133, "learning_rate": 4.883745362619765e-08, "logits/chosen": -3.1398892402648926, "logits/rejected": -3.112936496734619, "logps/chosen": -56.82250213623047, "logps/rejected": -53.331634521484375, "loss": 0.6903, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006448267959058285, "rewards/margins": 0.0057860021479427814, "rewards/rejected": -0.012234269641339779, "step": 1090 }, { "epoch": 0.37904893177119225, "grad_norm": 1.9023057222366333, "learning_rate": 4.8791700436542915e-08, "logits/chosen": -3.1427078247070312, "logits/rejected": -3.127922773361206, "logps/chosen": -53.18560791015625, "logps/rejected": -55.44195556640625, "loss": 0.6907, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.006103273946791887, "rewards/margins": 0.005071353167295456, "rewards/rejected": -0.011174628511071205, "step": 1100 }, { "epoch": 0.37904893177119225, "eval_logits/chosen": -3.151279926300049, "eval_logits/rejected": -3.145657777786255, "eval_logps/chosen": -58.5153694152832, "eval_logps/rejected": -63.14689636230469, "eval_loss": 0.6923530697822571, "eval_rewards/accuracies": 0.5604089498519897, "eval_rewards/chosen": 0.0019652547780424356, "eval_rewards/margins": 0.0016330406069755554, "eval_rewards/rejected": 0.0003322141710668802, "eval_runtime": 384.3912, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 1100 }, { "epoch": 0.3824948311509304, "grad_norm": 1.788101315498352, "learning_rate": 4.874508648333959e-08, "logits/chosen": -3.028311252593994, "logits/rejected": -3.0164096355438232, "logps/chosen": -54.771568298339844, "logps/rejected": -54.829750061035156, "loss": 0.6903, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.004693931899964809, "rewards/margins": 0.005819953512400389, "rewards/rejected": -0.01051388494670391, "step": 1110 }, { "epoch": 0.3859407305306685, "grad_norm": 1.7864511013031006, "learning_rate": 4.8697613453040974e-08, "logits/chosen": -3.0762057304382324, "logits/rejected": -3.0420901775360107, "logps/chosen": -55.974891662597656, "logps/rejected": -53.59397506713867, "loss": 0.6898, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.00731278071179986, "rewards/margins": 0.0069493455812335014, "rewards/rejected": -0.014262126758694649, "step": 1120 }, { "epoch": 0.3893866299104066, "grad_norm": 1.6350274085998535, "learning_rate": 4.864928306318104e-08, "logits/chosen": -2.9895944595336914, "logits/rejected": -2.9592795372009277, "logps/chosen": -58.717308044433594, "logps/rejected": -56.6348876953125, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.006385552231222391, "rewards/margins": 0.006410741712898016, "rewards/rejected": -0.012796293012797832, "step": 1130 }, { "epoch": 0.3928325292901447, "grad_norm": 1.6556651592254639, "learning_rate": 4.860009706231234e-08, "logits/chosen": -2.992837429046631, "logits/rejected": -2.971184730529785, "logps/chosen": -54.53647994995117, "logps/rejected": -54.9399528503418, "loss": 0.6906, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.007840116508305073, "rewards/margins": 0.005326352082192898, "rewards/rejected": -0.013166469521820545, "step": 1140 }, { "epoch": 0.39627842866988283, "grad_norm": 1.8056648969650269, "learning_rate": 4.8550057229942654e-08, "logits/chosen": -3.019346237182617, "logits/rejected": -2.987921953201294, "logps/chosen": -54.99890899658203, "logps/rejected": -55.09080123901367, "loss": 0.6892, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.005849289242178202, "rewards/margins": 0.007977871224284172, "rewards/rejected": -0.013827161863446236, "step": 1150 }, { "epoch": 0.39972432804962094, "grad_norm": 1.6521730422973633, "learning_rate": 4.849916537647071e-08, "logits/chosen": -3.0615134239196777, "logits/rejected": -3.032876491546631, "logps/chosen": -55.03501510620117, "logps/rejected": -52.776710510253906, "loss": 0.6897, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.007752572186291218, "rewards/margins": 0.007105639670044184, "rewards/rejected": -0.014858213253319263, "step": 1160 }, { "epoch": 0.40317022742935904, "grad_norm": 1.550277829170227, "learning_rate": 4.844742334312059e-08, "logits/chosen": -3.0416417121887207, "logits/rejected": -3.0246546268463135, "logps/chosen": -54.85407257080078, "logps/rejected": -55.321006774902344, "loss": 0.6897, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.007986031472682953, "rewards/margins": 0.007059283554553986, "rewards/rejected": -0.015045315027236938, "step": 1170 }, { "epoch": 0.4066161268090972, "grad_norm": 1.6416267156600952, "learning_rate": 4.8394833001875206e-08, "logits/chosen": -3.0568747520446777, "logits/rejected": -3.0389015674591064, "logps/chosen": -55.069847106933594, "logps/rejected": -54.663665771484375, "loss": 0.6898, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.007581622339785099, "rewards/margins": 0.0068257213570177555, "rewards/rejected": -0.014407342299818993, "step": 1180 }, { "epoch": 0.4100620261888353, "grad_norm": 1.6217989921569824, "learning_rate": 4.834139625540851e-08, "logits/chosen": -3.044999599456787, "logits/rejected": -3.029223918914795, "logps/chosen": -54.880348205566406, "logps/rejected": -54.82160568237305, "loss": 0.6901, "rewards/accuracies": 0.640625, "rewards/chosen": -0.008282794617116451, "rewards/margins": 0.00628671795129776, "rewards/rejected": -0.014569511637091637, "step": 1190 }, { "epoch": 0.4135079255685734, "grad_norm": 1.6303709745407104, "learning_rate": 4.828711503701667e-08, "logits/chosen": -3.1172478199005127, "logits/rejected": -3.093177556991577, "logps/chosen": -55.1821174621582, "logps/rejected": -54.7901725769043, "loss": 0.6898, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.009148449636995792, "rewards/margins": 0.0069651217199862, "rewards/rejected": -0.01611356995999813, "step": 1200 }, { "epoch": 0.4135079255685734, "eval_logits/chosen": -3.1480495929718018, "eval_logits/rejected": -3.14241886138916, "eval_logps/chosen": -58.530601501464844, "eval_logps/rejected": -63.214256286621094, "eval_loss": 0.6921030282974243, "eval_rewards/accuracies": 0.5743494629859924, "eval_rewards/chosen": 0.0018129091477021575, "eval_rewards/margins": 0.0021542287431657314, "eval_rewards/rejected": -0.0003413195663597435, "eval_runtime": 384.364, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 1200 }, { "epoch": 0.4169538249483115, "grad_norm": 1.654517412185669, "learning_rate": 4.823199131054816e-08, "logits/chosen": -3.113901138305664, "logits/rejected": -3.082550525665283, "logps/chosen": -55.60883331298828, "logps/rejected": -53.55535888671875, "loss": 0.6889, "rewards/accuracies": 0.609375, "rewards/chosen": -0.005926042329519987, "rewards/margins": 0.00877379346638918, "rewards/rejected": -0.014699837192893028, "step": 1210 }, { "epoch": 0.4203997243280496, "grad_norm": 1.735965371131897, "learning_rate": 4.8176027070332646e-08, "logits/chosen": -3.0720534324645996, "logits/rejected": -3.05411958694458, "logps/chosen": -55.67919921875, "logps/rejected": -54.55861282348633, "loss": 0.6897, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.007292713038623333, "rewards/margins": 0.007126192562282085, "rewards/rejected": -0.014418904669582844, "step": 1220 }, { "epoch": 0.4238456237077877, "grad_norm": 1.8626728057861328, "learning_rate": 4.811922434110889e-08, "logits/chosen": -3.0178306102752686, "logits/rejected": -2.9898033142089844, "logps/chosen": -55.622398376464844, "logps/rejected": -54.017578125, "loss": 0.6903, "rewards/accuracies": 0.609375, "rewards/chosen": -0.01022510789334774, "rewards/margins": 0.005996005143970251, "rewards/rejected": -0.01622111350297928, "step": 1230 }, { "epoch": 0.4272915230875258, "grad_norm": 1.7841885089874268, "learning_rate": 4.806158517795148e-08, "logits/chosen": -3.11910343170166, "logits/rejected": -3.0926966667175293, "logps/chosen": -55.5859375, "logps/rejected": -53.11517333984375, "loss": 0.6899, "rewards/accuracies": 0.640625, "rewards/chosen": -0.008747449144721031, "rewards/margins": 0.006734578870236874, "rewards/rejected": -0.015482030808925629, "step": 1240 }, { "epoch": 0.43073742246726393, "grad_norm": 1.685684084892273, "learning_rate": 4.800311166619646e-08, "logits/chosen": -3.0900144577026367, "logits/rejected": -3.073540210723877, "logps/chosen": -54.59074783325195, "logps/rejected": -55.54296875, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.011259722523391247, "rewards/margins": 0.003347150282934308, "rewards/rejected": -0.014606873504817486, "step": 1250 }, { "epoch": 0.4341833218470021, "grad_norm": 1.6296279430389404, "learning_rate": 4.794380592136591e-08, "logits/chosen": -2.9728333950042725, "logits/rejected": -2.9540724754333496, "logps/chosen": -53.40140914916992, "logps/rejected": -52.111854553222656, "loss": 0.69, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009744682349264622, "rewards/margins": 0.006456801202148199, "rewards/rejected": -0.016201484948396683, "step": 1260 }, { "epoch": 0.4376292212267402, "grad_norm": 1.6278390884399414, "learning_rate": 4.788367008909139e-08, "logits/chosen": -3.076103448867798, "logits/rejected": -3.0685179233551025, "logps/chosen": -53.551544189453125, "logps/rejected": -55.704368591308594, "loss": 0.6907, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.01196536235511303, "rewards/margins": 0.005077089183032513, "rewards/rejected": -0.017042452469468117, "step": 1270 }, { "epoch": 0.4410751206064783, "grad_norm": 1.77932870388031, "learning_rate": 4.782270634503631e-08, "logits/chosen": -3.0844674110412598, "logits/rejected": -3.0556795597076416, "logps/chosen": -57.6111946105957, "logps/rejected": -55.6341667175293, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": -0.006394694559276104, "rewards/margins": 0.009988361969590187, "rewards/rejected": -0.016383057460188866, "step": 1280 }, { "epoch": 0.4445210199862164, "grad_norm": 1.7250643968582153, "learning_rate": 4.776091689481725e-08, "logits/chosen": -3.075246810913086, "logits/rejected": -3.056575059890747, "logps/chosen": -57.056922912597656, "logps/rejected": -56.40248489379883, "loss": 0.6908, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.01063154824078083, "rewards/margins": 0.004961362108588219, "rewards/rejected": -0.015592910349369049, "step": 1290 }, { "epoch": 0.4479669193659545, "grad_norm": 1.6703568696975708, "learning_rate": 4.7698303973924136e-08, "logits/chosen": -3.051967144012451, "logits/rejected": -3.009127140045166, "logps/chosen": -57.8825798034668, "logps/rejected": -51.91571807861328, "loss": 0.688, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009872758761048317, "rewards/margins": 0.010731091722846031, "rewards/rejected": -0.0206038486212492, "step": 1300 }, { "epoch": 0.4479669193659545, "eval_logits/chosen": -3.144803047180176, "eval_logits/rejected": -3.139164924621582, "eval_logps/chosen": -58.535133361816406, "eval_logps/rejected": -63.26057052612305, "eval_loss": 0.6919035315513611, "eval_rewards/accuracies": 0.574117124080658, "eval_rewards/chosen": 0.0017676005372777581, "eval_rewards/margins": 0.0025721341371536255, "eval_rewards/rejected": -0.0008045334252528846, "eval_runtime": 384.3396, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 1300 }, { "epoch": 0.4514128187456926, "grad_norm": 1.5745819807052612, "learning_rate": 4.7634869847639334e-08, "logits/chosen": -3.062584638595581, "logits/rejected": -3.0274033546447754, "logps/chosen": -55.93726348876953, "logps/rejected": -53.12489700317383, "loss": 0.6885, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.011147616431117058, "rewards/margins": 0.009501030668616295, "rewards/rejected": -0.020648647099733353, "step": 1310 }, { "epoch": 0.4548587181254307, "grad_norm": 1.6925148963928223, "learning_rate": 4.757061681095577e-08, "logits/chosen": -3.001038074493408, "logits/rejected": -2.9740278720855713, "logps/chosen": -53.52008056640625, "logps/rejected": -53.030731201171875, "loss": 0.6888, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.010656071826815605, "rewards/margins": 0.008936294354498386, "rewards/rejected": -0.019592367112636566, "step": 1320 }, { "epoch": 0.4583046175051689, "grad_norm": 1.7766765356063843, "learning_rate": 4.750554718849381e-08, "logits/chosen": -2.998490810394287, "logits/rejected": -2.9684324264526367, "logps/chosen": -56.33980178833008, "logps/rejected": -55.1796875, "loss": 0.6892, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.009794693440198898, "rewards/margins": 0.008208018727600574, "rewards/rejected": -0.018002711236476898, "step": 1330 }, { "epoch": 0.461750516884907, "grad_norm": 1.8487436771392822, "learning_rate": 4.743966333441723e-08, "logits/chosen": -3.0082459449768066, "logits/rejected": -2.980231761932373, "logps/chosen": -57.111976623535156, "logps/rejected": -54.838539123535156, "loss": 0.6885, "rewards/accuracies": 0.640625, "rewards/chosen": -0.010152112692594528, "rewards/margins": 0.009611548855900764, "rewards/rejected": -0.019763659685850143, "step": 1340 }, { "epoch": 0.4651964162646451, "grad_norm": 1.653855800628662, "learning_rate": 4.7372967632348016e-08, "logits/chosen": -3.0194742679595947, "logits/rejected": -2.9947919845581055, "logps/chosen": -53.08203887939453, "logps/rejected": -53.925636291503906, "loss": 0.6878, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.011461066082119942, "rewards/margins": 0.010980509221553802, "rewards/rejected": -0.022441575303673744, "step": 1350 }, { "epoch": 0.4686423156443832, "grad_norm": 1.7544745206832886, "learning_rate": 4.7305462495280103e-08, "logits/chosen": -3.0522308349609375, "logits/rejected": -3.044433832168579, "logps/chosen": -56.90116500854492, "logps/rejected": -56.19316482543945, "loss": 0.6909, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.011542152613401413, "rewards/margins": 0.004821972921490669, "rewards/rejected": -0.016364123672246933, "step": 1360 }, { "epoch": 0.4720882150241213, "grad_norm": 1.8341665267944336, "learning_rate": 4.723715036549211e-08, "logits/chosen": -3.041679620742798, "logits/rejected": -3.020946979522705, "logps/chosen": -56.77973556518555, "logps/rejected": -54.12712478637695, "loss": 0.6895, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.010667492635548115, "rewards/margins": 0.00756409764289856, "rewards/rejected": -0.0182315893471241, "step": 1370 }, { "epoch": 0.4755341144038594, "grad_norm": 1.7911666631698608, "learning_rate": 4.7168033714458986e-08, "logits/chosen": -2.9922127723693848, "logits/rejected": -2.9809184074401855, "logps/chosen": -53.072364807128906, "logps/rejected": -56.233436584472656, "loss": 0.6892, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.013029935769736767, "rewards/margins": 0.00814075767993927, "rewards/rejected": -0.021170692518353462, "step": 1380 }, { "epoch": 0.4789800137835975, "grad_norm": 1.6858317852020264, "learning_rate": 4.7098115042762554e-08, "logits/chosen": -3.0591578483581543, "logits/rejected": -3.028982400894165, "logps/chosen": -55.264068603515625, "logps/rejected": -53.70880126953125, "loss": 0.6901, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.012422731146216393, "rewards/margins": 0.006396573968231678, "rewards/rejected": -0.018819306045770645, "step": 1390 }, { "epoch": 0.4824259131633356, "grad_norm": 1.8360016345977783, "learning_rate": 4.702739688000106e-08, "logits/chosen": -3.07702898979187, "logits/rejected": -3.0475409030914307, "logps/chosen": -56.95623779296875, "logps/rejected": -55.4343376159668, "loss": 0.6888, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.010582750663161278, "rewards/margins": 0.009008489549160004, "rewards/rejected": -0.019591238349676132, "step": 1400 }, { "epoch": 0.4824259131633356, "eval_logits/chosen": -3.142038106918335, "eval_logits/rejected": -3.136406183242798, "eval_logps/chosen": -58.60540771484375, "eval_logps/rejected": -63.37487030029297, "eval_loss": 0.6916956305503845, "eval_rewards/accuracies": 0.5722583532333374, "eval_rewards/chosen": 0.0010648738825693727, "eval_rewards/margins": 0.003012324683368206, "eval_rewards/rejected": -0.001947450335137546, "eval_runtime": 384.5538, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 1400 }, { "epoch": 0.48587181254307377, "grad_norm": 1.7492179870605469, "learning_rate": 4.695588178469768e-08, "logits/chosen": -3.0327694416046143, "logits/rejected": -3.014371871948242, "logps/chosen": -56.01494598388672, "logps/rejected": -56.41218948364258, "loss": 0.6891, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.011834423989057541, "rewards/margins": 0.00837056152522564, "rewards/rejected": -0.02020498737692833, "step": 1410 }, { "epoch": 0.48931771192281187, "grad_norm": 1.6037969589233398, "learning_rate": 4.688357234420793e-08, "logits/chosen": -2.9960224628448486, "logits/rejected": -2.9826016426086426, "logps/chosen": -55.37086868286133, "logps/rejected": -55.3111686706543, "loss": 0.6891, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.013024615123867989, "rewards/margins": 0.008364452980458736, "rewards/rejected": -0.0213890690356493, "step": 1420 }, { "epoch": 0.49276361130255, "grad_norm": 1.633384346961975, "learning_rate": 4.681047117462605e-08, "logits/chosen": -3.034104824066162, "logits/rejected": -3.0122103691101074, "logps/chosen": -54.78871536254883, "logps/rejected": -55.884490966796875, "loss": 0.6869, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.011832429096102715, "rewards/margins": 0.012881122529506683, "rewards/rejected": -0.02471354976296425, "step": 1430 }, { "epoch": 0.4962095106822881, "grad_norm": 1.8843494653701782, "learning_rate": 4.673658092069036e-08, "logits/chosen": -3.103280544281006, "logits/rejected": -3.0781211853027344, "logps/chosen": -56.96147537231445, "logps/rejected": -54.37610626220703, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.012295748107135296, "rewards/margins": 0.011648855172097683, "rewards/rejected": -0.023944605141878128, "step": 1440 }, { "epoch": 0.4996554100620262, "grad_norm": 1.6782125234603882, "learning_rate": 4.666190425568761e-08, "logits/chosen": -3.1093602180480957, "logits/rejected": -3.1019256114959717, "logps/chosen": -53.99622344970703, "logps/rejected": -53.916046142578125, "loss": 0.6902, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.01273355446755886, "rewards/margins": 0.006157218478620052, "rewards/rejected": -0.018890773877501488, "step": 1450 }, { "epoch": 0.5031013094417643, "grad_norm": 1.6588455438613892, "learning_rate": 4.658644388135622e-08, "logits/chosen": -3.080059289932251, "logits/rejected": -3.072068691253662, "logps/chosen": -56.12352752685547, "logps/rejected": -57.99721145629883, "loss": 0.6883, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01284876000136137, "rewards/margins": 0.01009051688015461, "rewards/rejected": -0.022939275950193405, "step": 1460 }, { "epoch": 0.5065472088215024, "grad_norm": 1.5984660387039185, "learning_rate": 4.651020252778855e-08, "logits/chosen": -3.040616035461426, "logits/rejected": -3.007890224456787, "logps/chosen": -53.751983642578125, "logps/rejected": -53.546485900878906, "loss": 0.6882, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.012961966916918755, "rewards/margins": 0.010237214155495167, "rewards/rejected": -0.023199182003736496, "step": 1470 }, { "epoch": 0.5099931082012406, "grad_norm": 1.9550174474716187, "learning_rate": 4.6433182953332116e-08, "logits/chosen": -3.0700085163116455, "logits/rejected": -3.047051429748535, "logps/chosen": -55.44309616088867, "logps/rejected": -55.55956268310547, "loss": 0.6882, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.013085572049021721, "rewards/margins": 0.010364504531025887, "rewards/rejected": -0.023450080305337906, "step": 1480 }, { "epoch": 0.5134390075809786, "grad_norm": 1.6405853033065796, "learning_rate": 4.635538794448982e-08, "logits/chosen": -2.9804530143737793, "logits/rejected": -2.9533791542053223, "logps/chosen": -55.882049560546875, "logps/rejected": -54.833106994628906, "loss": 0.6874, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.012936905026435852, "rewards/margins": 0.011920436285436153, "rewards/rejected": -0.02485733851790428, "step": 1490 }, { "epoch": 0.5168849069607168, "grad_norm": 1.5811128616333008, "learning_rate": 4.627682031581913e-08, "logits/chosen": -3.031785726547241, "logits/rejected": -3.015012264251709, "logps/chosen": -55.61000442504883, "logps/rejected": -56.501609802246094, "loss": 0.6886, "rewards/accuracies": 0.609375, "rewards/chosen": -0.012989061884582043, "rewards/margins": 0.009481636807322502, "rewards/rejected": -0.02247069776058197, "step": 1500 }, { "epoch": 0.5168849069607168, "eval_logits/chosen": -3.1381611824035645, "eval_logits/rejected": -3.1325039863586426, "eval_logps/chosen": -58.68781661987305, "eval_logps/rejected": -63.50574493408203, "eval_loss": 0.6914681792259216, "eval_rewards/accuracies": 0.5736523866653442, "eval_rewards/chosen": 0.00024078537535388023, "eval_rewards/margins": 0.003497007070109248, "eval_rewards/rejected": -0.0032562220003455877, "eval_runtime": 384.6728, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 1500 }, { "epoch": 0.5203308063404548, "grad_norm": 1.8088315725326538, "learning_rate": 4.619748290983022e-08, "logits/chosen": -3.1065077781677246, "logits/rejected": -3.0788774490356445, "logps/chosen": -56.224525451660156, "logps/rejected": -54.217689514160156, "loss": 0.6884, "rewards/accuracies": 0.578125, "rewards/chosen": -0.015114299952983856, "rewards/margins": 0.01001181174069643, "rewards/rejected": -0.025126110762357712, "step": 1510 }, { "epoch": 0.523776705720193, "grad_norm": 1.5406330823898315, "learning_rate": 4.611737859688317e-08, "logits/chosen": -3.102351188659668, "logits/rejected": -3.0882105827331543, "logps/chosen": -55.27317428588867, "logps/rejected": -57.054290771484375, "loss": 0.6903, "rewards/accuracies": 0.5625, "rewards/chosen": -0.018418293446302414, "rewards/margins": 0.006099226884543896, "rewards/rejected": -0.024517521262168884, "step": 1520 }, { "epoch": 0.5272226050999311, "grad_norm": 1.724234700202942, "learning_rate": 4.6036510275084114e-08, "logits/chosen": -3.031350612640381, "logits/rejected": -3.015336036682129, "logps/chosen": -56.156715393066406, "logps/rejected": -56.938873291015625, "loss": 0.6907, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.015397797338664532, "rewards/margins": 0.0053660026751458645, "rewards/rejected": -0.02076379954814911, "step": 1530 }, { "epoch": 0.5306685044796692, "grad_norm": 1.558053970336914, "learning_rate": 4.5954880870180344e-08, "logits/chosen": -2.9651525020599365, "logits/rejected": -2.9437994956970215, "logps/chosen": -56.369361877441406, "logps/rejected": -57.072608947753906, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.019688406959176064, "rewards/margins": 0.007340868003666401, "rewards/rejected": -0.02702927589416504, "step": 1540 }, { "epoch": 0.5341144038594073, "grad_norm": 1.822608470916748, "learning_rate": 4.587249333545453e-08, "logits/chosen": -3.0255002975463867, "logits/rejected": -3.0009472370147705, "logps/chosen": -55.275184631347656, "logps/rejected": -55.24955368041992, "loss": 0.6878, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.015089382417500019, "rewards/margins": 0.011031006462872028, "rewards/rejected": -0.026120388880372047, "step": 1550 }, { "epoch": 0.5375603032391454, "grad_norm": 1.7990676164627075, "learning_rate": 4.578935065161782e-08, "logits/chosen": -3.008293867111206, "logits/rejected": -3.001120090484619, "logps/chosen": -55.151153564453125, "logps/rejected": -57.88622283935547, "loss": 0.69, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.018395179882645607, "rewards/margins": 0.006701651960611343, "rewards/rejected": -0.0250968299806118, "step": 1560 }, { "epoch": 0.5410062026188835, "grad_norm": 1.6503915786743164, "learning_rate": 4.570545582670201e-08, "logits/chosen": -3.008349657058716, "logits/rejected": -2.9964206218719482, "logps/chosen": -53.5858039855957, "logps/rejected": -55.97298049926758, "loss": 0.6897, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.01696760021150112, "rewards/margins": 0.007355398032814264, "rewards/rejected": -0.0243229977786541, "step": 1570 }, { "epoch": 0.5444521019986216, "grad_norm": 1.6711068153381348, "learning_rate": 4.5620811895950746e-08, "logits/chosen": -3.0385825634002686, "logits/rejected": -3.009280204772949, "logps/chosen": -55.50677490234375, "logps/rejected": -55.899436950683594, "loss": 0.6858, "rewards/accuracies": 0.609375, "rewards/chosen": -0.012491394765675068, "rewards/margins": 0.015325082466006279, "rewards/rejected": -0.02781647816300392, "step": 1580 }, { "epoch": 0.5478980013783598, "grad_norm": 1.53829824924469, "learning_rate": 4.553542192170966e-08, "logits/chosen": -3.060349225997925, "logits/rejected": -3.025090217590332, "logps/chosen": -56.067955017089844, "logps/rejected": -53.33306884765625, "loss": 0.6865, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.014950567856431007, "rewards/margins": 0.013809828087687492, "rewards/rejected": -0.02876039408147335, "step": 1590 }, { "epoch": 0.5513439007580979, "grad_norm": 1.8090956211090088, "learning_rate": 4.5449288993315615e-08, "logits/chosen": -3.047544002532959, "logits/rejected": -3.0376791954040527, "logps/chosen": -55.40331268310547, "logps/rejected": -56.957740783691406, "loss": 0.6885, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.01590132713317871, "rewards/margins": 0.00968841277062893, "rewards/rejected": -0.02558973990380764, "step": 1600 }, { "epoch": 0.5513439007580979, "eval_logits/chosen": -3.1351428031921387, "eval_logits/rejected": -3.1294989585876465, "eval_logps/chosen": -58.74068832397461, "eval_logps/rejected": -63.60573196411133, "eval_loss": 0.6912448406219482, "eval_rewards/accuracies": 0.5769051909446716, "eval_rewards/chosen": -0.00028791907243430614, "eval_rewards/margins": 0.0039681848138570786, "eval_rewards/rejected": -0.004256103653460741, "eval_runtime": 384.8288, "eval_samples_per_second": 11.184, "eval_steps_per_second": 1.398, "step": 1600 }, { "epoch": 0.554789800137836, "grad_norm": 1.7352491617202759, "learning_rate": 4.536241622698493e-08, "logits/chosen": -2.9448189735412598, "logits/rejected": -2.929994821548462, "logps/chosen": -53.825408935546875, "logps/rejected": -55.268638610839844, "loss": 0.6889, "rewards/accuracies": 0.59375, "rewards/chosen": -0.016777951270341873, "rewards/margins": 0.009007781744003296, "rewards/rejected": -0.02578573301434517, "step": 1610 }, { "epoch": 0.5582356995175741, "grad_norm": 1.6469370126724243, "learning_rate": 4.5274806765700636e-08, "logits/chosen": -3.078242540359497, "logits/rejected": -3.0581135749816895, "logps/chosen": -56.996124267578125, "logps/rejected": -57.18330764770508, "loss": 0.6888, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01799107901751995, "rewards/margins": 0.009230229072272778, "rewards/rejected": -0.027221307158470154, "step": 1620 }, { "epoch": 0.5616815988973122, "grad_norm": 1.6323329210281372, "learning_rate": 4.518646377909875e-08, "logits/chosen": -3.038658857345581, "logits/rejected": -3.0193114280700684, "logps/chosen": -53.77470779418945, "logps/rejected": -56.057960510253906, "loss": 0.6868, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.017207052558660507, "rewards/margins": 0.013134879060089588, "rewards/rejected": -0.03034193441271782, "step": 1630 }, { "epoch": 0.5651274982770503, "grad_norm": 1.6922211647033691, "learning_rate": 4.5097390463353626e-08, "logits/chosen": -3.10493540763855, "logits/rejected": -3.097712516784668, "logps/chosen": -54.40106201171875, "logps/rejected": -58.211830139160156, "loss": 0.6887, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0212591253221035, "rewards/margins": 0.009430269710719585, "rewards/rejected": -0.03068939410150051, "step": 1640 }, { "epoch": 0.5685733976567884, "grad_norm": 1.8530436754226685, "learning_rate": 4.5007590041062295e-08, "logits/chosen": -3.092582941055298, "logits/rejected": -3.0659327507019043, "logps/chosen": -55.72516632080078, "logps/rejected": -56.46184539794922, "loss": 0.6867, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.016838040202856064, "rewards/margins": 0.013281027786433697, "rewards/rejected": -0.030119070783257484, "step": 1650 }, { "epoch": 0.5720192970365265, "grad_norm": 1.7546192407608032, "learning_rate": 4.4917065761127907e-08, "logits/chosen": -3.00036358833313, "logits/rejected": -2.968268871307373, "logps/chosen": -55.17334747314453, "logps/rejected": -53.06787109375, "loss": 0.6861, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.017701324075460434, "rewards/margins": 0.014755235984921455, "rewards/rejected": -0.03245655819773674, "step": 1660 }, { "epoch": 0.5754651964162646, "grad_norm": 1.7550996541976929, "learning_rate": 4.482582089864214e-08, "logits/chosen": -3.0978662967681885, "logits/rejected": -3.072986602783203, "logps/chosen": -56.618186950683594, "logps/rejected": -56.91802978515625, "loss": 0.6858, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.015549172647297382, "rewards/margins": 0.015354210510849953, "rewards/rejected": -0.03090338036417961, "step": 1670 }, { "epoch": 0.5789110957960028, "grad_norm": 1.7752835750579834, "learning_rate": 4.473385875476675e-08, "logits/chosen": -3.021265745162964, "logits/rejected": -2.9964027404785156, "logps/chosen": -57.2200813293457, "logps/rejected": -58.078155517578125, "loss": 0.6866, "rewards/accuracies": 0.59375, "rewards/chosen": -0.01923130638897419, "rewards/margins": 0.013771514408290386, "rewards/rejected": -0.033002819865942, "step": 1680 }, { "epoch": 0.5823569951757409, "grad_norm": 1.647120714187622, "learning_rate": 4.464118265661414e-08, "logits/chosen": -3.063469648361206, "logits/rejected": -3.0422072410583496, "logps/chosen": -54.968994140625, "logps/rejected": -55.223297119140625, "loss": 0.6865, "rewards/accuracies": 0.609375, "rewards/chosen": -0.017609132453799248, "rewards/margins": 0.013836865313351154, "rewards/rejected": -0.031446002423763275, "step": 1690 }, { "epoch": 0.585802894555479, "grad_norm": 1.740160584449768, "learning_rate": 4.454779595712694e-08, "logits/chosen": -3.06133770942688, "logits/rejected": -3.0284037590026855, "logps/chosen": -55.181007385253906, "logps/rejected": -54.75300979614258, "loss": 0.6861, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.01888805255293846, "rewards/margins": 0.014551205560564995, "rewards/rejected": -0.03343925625085831, "step": 1700 }, { "epoch": 0.585802894555479, "eval_logits/chosen": -3.1310038566589355, "eval_logits/rejected": -3.125328540802002, "eval_logps/chosen": -58.872894287109375, "eval_logps/rejected": -63.80035400390625, "eval_loss": 0.6909549236297607, "eval_rewards/accuracies": 0.5745818018913269, "eval_rewards/chosen": -0.0016099718632176518, "eval_rewards/margins": 0.0045923274010419846, "eval_rewards/rejected": -0.006202299147844315, "eval_runtime": 384.7457, "eval_samples_per_second": 11.187, "eval_steps_per_second": 1.398, "step": 1700 }, { "epoch": 0.5892487939352171, "grad_norm": 1.9542814493179321, "learning_rate": 4.4453702034956785e-08, "logits/chosen": -2.9857747554779053, "logits/rejected": -2.959059000015259, "logps/chosen": -53.07960891723633, "logps/rejected": -53.60251998901367, "loss": 0.6859, "rewards/accuracies": 0.640625, "rewards/chosen": -0.019093209877610207, "rewards/margins": 0.015096555463969707, "rewards/rejected": -0.03418976441025734, "step": 1710 }, { "epoch": 0.5926946933149552, "grad_norm": 1.7722985744476318, "learning_rate": 4.435890429434197e-08, "logits/chosen": -3.0411362648010254, "logits/rejected": -3.0229859352111816, "logps/chosen": -54.2125129699707, "logps/rejected": -57.09526824951172, "loss": 0.6878, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.022556353360414505, "rewards/margins": 0.011233055964112282, "rewards/rejected": -0.033789411187171936, "step": 1720 }, { "epoch": 0.5961405926946933, "grad_norm": 1.6309845447540283, "learning_rate": 4.426340616498437e-08, "logits/chosen": -3.139572858810425, "logits/rejected": -3.112715005874634, "logps/chosen": -57.57489013671875, "logps/rejected": -57.2770881652832, "loss": 0.6874, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.017954563722014427, "rewards/margins": 0.012080615386366844, "rewards/rejected": -0.03003517910838127, "step": 1730 }, { "epoch": 0.5995864920744314, "grad_norm": 1.7834956645965576, "learning_rate": 4.416721110192535e-08, "logits/chosen": -3.0707812309265137, "logits/rejected": -3.0469467639923096, "logps/chosen": -57.28666305541992, "logps/rejected": -56.38450241088867, "loss": 0.6862, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.018242638558149338, "rewards/margins": 0.014507819898426533, "rewards/rejected": -0.032750457525253296, "step": 1740 }, { "epoch": 0.6030323914541695, "grad_norm": 1.556868314743042, "learning_rate": 4.407032258542071e-08, "logits/chosen": -3.015918731689453, "logits/rejected": -2.9973530769348145, "logps/chosen": -55.209632873535156, "logps/rejected": -58.076988220214844, "loss": 0.6864, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.020735980942845345, "rewards/margins": 0.0141109898686409, "rewards/rejected": -0.03484697267413139, "step": 1750 }, { "epoch": 0.6064782908339077, "grad_norm": 1.7012434005737305, "learning_rate": 4.3972744120814834e-08, "logits/chosen": -2.9489564895629883, "logits/rejected": -2.926241874694824, "logps/chosen": -56.891990661621094, "logps/rejected": -55.18810272216797, "loss": 0.6877, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.024412229657173157, "rewards/margins": 0.011419257149100304, "rewards/rejected": -0.03583148494362831, "step": 1760 }, { "epoch": 0.6099241902136457, "grad_norm": 1.8544518947601318, "learning_rate": 4.387447923841383e-08, "logits/chosen": -3.044332504272461, "logits/rejected": -3.0102765560150146, "logps/chosen": -56.91345977783203, "logps/rejected": -55.81806182861328, "loss": 0.6855, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.019783537834882736, "rewards/margins": 0.015991825610399246, "rewards/rejected": -0.03577536344528198, "step": 1770 }, { "epoch": 0.6133700895933839, "grad_norm": 1.6657090187072754, "learning_rate": 4.377553149335783e-08, "logits/chosen": -3.0116004943847656, "logits/rejected": -2.992893934249878, "logps/chosen": -56.234764099121094, "logps/rejected": -56.17246627807617, "loss": 0.6879, "rewards/accuracies": 0.59375, "rewards/chosen": -0.025032171979546547, "rewards/margins": 0.01119538675993681, "rewards/rejected": -0.03622755408287048, "step": 1780 }, { "epoch": 0.616815988973122, "grad_norm": 1.7814066410064697, "learning_rate": 4.367590446549234e-08, "logits/chosen": -3.0905067920684814, "logits/rejected": -3.0614664554595947, "logps/chosen": -59.580650329589844, "logps/rejected": -59.031532287597656, "loss": 0.6853, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.020469004288315773, "rewards/margins": 0.016377722844481468, "rewards/rejected": -0.03684672713279724, "step": 1790 }, { "epoch": 0.6202618883528601, "grad_norm": 1.8220956325531006, "learning_rate": 4.357560175923876e-08, "logits/chosen": -3.0348381996154785, "logits/rejected": -3.0145459175109863, "logps/chosen": -57.11140060424805, "logps/rejected": -56.62044143676758, "loss": 0.6872, "rewards/accuracies": 0.609375, "rewards/chosen": -0.025753721594810486, "rewards/margins": 0.012716737575829029, "rewards/rejected": -0.03847045823931694, "step": 1800 }, { "epoch": 0.6202618883528601, "eval_logits/chosen": -3.126986503601074, "eval_logits/rejected": -3.1213557720184326, "eval_logps/chosen": -59.06035232543945, "eval_logps/rejected": -64.03250122070312, "eval_loss": 0.6907546520233154, "eval_rewards/accuracies": 0.5838754773139954, "eval_rewards/chosen": -0.0034845659974962473, "eval_rewards/margins": 0.005039151292294264, "eval_rewards/rejected": -0.00852371659129858, "eval_runtime": 384.947, "eval_samples_per_second": 11.181, "eval_steps_per_second": 1.398, "step": 1800 }, { "epoch": 0.6237077877325982, "grad_norm": 1.7491681575775146, "learning_rate": 4.347462700346395e-08, "logits/chosen": -2.9711337089538574, "logits/rejected": -2.9505839347839355, "logps/chosen": -55.86274337768555, "logps/rejected": -55.249229431152344, "loss": 0.6867, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.023622069507837296, "rewards/margins": 0.01367507316172123, "rewards/rejected": -0.037297140806913376, "step": 1810 }, { "epoch": 0.6271536871123363, "grad_norm": 1.7659626007080078, "learning_rate": 4.337298385134896e-08, "logits/chosen": -3.0822091102600098, "logits/rejected": -3.0714335441589355, "logps/chosen": -55.31257247924805, "logps/rejected": -59.754554748535156, "loss": 0.6855, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.02220662496984005, "rewards/margins": 0.016015606001019478, "rewards/rejected": -0.03822223097085953, "step": 1820 }, { "epoch": 0.6305995864920745, "grad_norm": 1.7817442417144775, "learning_rate": 4.327067598025686e-08, "logits/chosen": -3.037313938140869, "logits/rejected": -3.025007963180542, "logps/chosen": -54.705711364746094, "logps/rejected": -57.494056701660156, "loss": 0.6874, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.022355765104293823, "rewards/margins": 0.012223458848893642, "rewards/rejected": -0.03457922488451004, "step": 1830 }, { "epoch": 0.6340454858718125, "grad_norm": 1.644735336303711, "learning_rate": 4.316770709159966e-08, "logits/chosen": -3.0322089195251465, "logits/rejected": -2.9977164268493652, "logps/chosen": -56.084678649902344, "logps/rejected": -55.208106994628906, "loss": 0.6857, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.023211777210235596, "rewards/margins": 0.015626708045601845, "rewards/rejected": -0.03883848711848259, "step": 1840 }, { "epoch": 0.6374913852515507, "grad_norm": 1.698492169380188, "learning_rate": 4.306408091070445e-08, "logits/chosen": -2.997825860977173, "logits/rejected": -2.988842010498047, "logps/chosen": -57.037139892578125, "logps/rejected": -63.068214416503906, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": -0.021915119141340256, "rewards/margins": 0.016361277550458908, "rewards/rejected": -0.038276396691799164, "step": 1850 }, { "epoch": 0.6409372846312887, "grad_norm": 1.78380286693573, "learning_rate": 4.29598011866786e-08, "logits/chosen": -3.060973882675171, "logits/rejected": -3.0355865955352783, "logps/chosen": -55.15557861328125, "logps/rejected": -56.25408172607422, "loss": 0.6847, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.02272849902510643, "rewards/margins": 0.01771937869489193, "rewards/rejected": -0.04044787958264351, "step": 1860 }, { "epoch": 0.6443831840110269, "grad_norm": 1.8728792667388916, "learning_rate": 4.285487169227408e-08, "logits/chosen": -3.0278170108795166, "logits/rejected": -3.004077911376953, "logps/chosen": -56.48118209838867, "logps/rejected": -57.5565299987793, "loss": 0.6861, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025048470124602318, "rewards/margins": 0.014941403642296791, "rewards/rejected": -0.03998987004160881, "step": 1870 }, { "epoch": 0.647829083390765, "grad_norm": 1.72579026222229, "learning_rate": 4.2749296223751055e-08, "logits/chosen": -3.076904535293579, "logits/rejected": -3.061049699783325, "logps/chosen": -57.126136779785156, "logps/rejected": -58.63282012939453, "loss": 0.6859, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025299783796072006, "rewards/margins": 0.015334163792431355, "rewards/rejected": -0.040633946657180786, "step": 1880 }, { "epoch": 0.6512749827705031, "grad_norm": 1.6683820486068726, "learning_rate": 4.264307860074045e-08, "logits/chosen": -3.045011281967163, "logits/rejected": -3.015040159225464, "logps/chosen": -56.37493896484375, "logps/rejected": -55.854759216308594, "loss": 0.6869, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.025647884234786034, "rewards/margins": 0.013144433498382568, "rewards/rejected": -0.03879231959581375, "step": 1890 }, { "epoch": 0.6547208821502413, "grad_norm": 2.010503053665161, "learning_rate": 4.253622266610579e-08, "logits/chosen": -3.0306594371795654, "logits/rejected": -2.999948501586914, "logps/chosen": -58.246925354003906, "logps/rejected": -55.802650451660156, "loss": 0.6862, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024062279611825943, "rewards/margins": 0.01467285118997097, "rewards/rejected": -0.03873513266444206, "step": 1900 }, { "epoch": 0.6547208821502413, "eval_logits/chosen": -3.121394157409668, "eval_logits/rejected": -3.1157350540161133, "eval_logps/chosen": -59.24892807006836, "eval_logps/rejected": -64.2826156616211, "eval_loss": 0.6904721260070801, "eval_rewards/accuracies": 0.580157995223999, "eval_rewards/chosen": -0.005370323546230793, "eval_rewards/margins": 0.005654662381857634, "eval_rewards/rejected": -0.011024984531104565, "eval_runtime": 384.523, "eval_samples_per_second": 11.193, "eval_steps_per_second": 1.399, "step": 1900 }, { "epoch": 0.6581667815299793, "grad_norm": 1.7444661855697632, "learning_rate": 4.24287322858042e-08, "logits/chosen": -3.049037456512451, "logits/rejected": -3.018188953399658, "logps/chosen": -58.074134826660156, "logps/rejected": -56.562660217285156, "loss": 0.6851, "rewards/accuracies": 0.640625, "rewards/chosen": -0.022909775376319885, "rewards/margins": 0.016958903521299362, "rewards/rejected": -0.03986867889761925, "step": 1910 }, { "epoch": 0.6616126809097175, "grad_norm": 1.6985474824905396, "learning_rate": 4.2320611348746484e-08, "logits/chosen": -3.040519952774048, "logits/rejected": -3.0109729766845703, "logps/chosen": -58.26678466796875, "logps/rejected": -59.101173400878906, "loss": 0.6835, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01885036565363407, "rewards/margins": 0.020201385021209717, "rewards/rejected": -0.03905175253748894, "step": 1920 }, { "epoch": 0.6650585802894555, "grad_norm": 1.9158495664596558, "learning_rate": 4.221186376665648e-08, "logits/chosen": -3.1348938941955566, "logits/rejected": -3.1139190196990967, "logps/chosen": -58.493896484375, "logps/rejected": -57.275177001953125, "loss": 0.6878, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.027531281113624573, "rewards/margins": 0.011677147820591927, "rewards/rejected": -0.03920843079686165, "step": 1930 }, { "epoch": 0.6685044796691937, "grad_norm": 1.7860701084136963, "learning_rate": 4.210249347392949e-08, "logits/chosen": -3.0448033809661865, "logits/rejected": -3.034738063812256, "logps/chosen": -57.56203079223633, "logps/rejected": -58.446510314941406, "loss": 0.6877, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.026961099356412888, "rewards/margins": 0.011725891381502151, "rewards/rejected": -0.03868699073791504, "step": 1940 }, { "epoch": 0.6719503790489317, "grad_norm": 1.7874598503112793, "learning_rate": 4.199250442748998e-08, "logits/chosen": -3.0932488441467285, "logits/rejected": -3.051062822341919, "logps/chosen": -57.90545654296875, "logps/rejected": -55.56101608276367, "loss": 0.6838, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.02397969365119934, "rewards/margins": 0.01958349347114563, "rewards/rejected": -0.04356318712234497, "step": 1950 }, { "epoch": 0.6753962784286699, "grad_norm": 1.737123966217041, "learning_rate": 4.188190060664839e-08, "logits/chosen": -3.0911624431610107, "logits/rejected": -3.0598087310791016, "logps/chosen": -59.625099182128906, "logps/rejected": -57.037322998046875, "loss": 0.6835, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.02469709701836109, "rewards/margins": 0.02040681801736355, "rewards/rejected": -0.04510391876101494, "step": 1960 }, { "epoch": 0.6788421778084079, "grad_norm": 1.9258623123168945, "learning_rate": 4.1770686012957165e-08, "logits/chosen": -3.038626194000244, "logits/rejected": -3.026581287384033, "logps/chosen": -54.969688415527344, "logps/rejected": -57.46106719970703, "loss": 0.6855, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.025907838717103004, "rewards/margins": 0.016301840543746948, "rewards/rejected": -0.0422096848487854, "step": 1970 }, { "epoch": 0.6822880771881461, "grad_norm": 1.9813158512115479, "learning_rate": 4.1658864670066e-08, "logits/chosen": -3.0048956871032715, "logits/rejected": -2.9818878173828125, "logps/chosen": -56.708648681640625, "logps/rejected": -59.611244201660156, "loss": 0.6845, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.026203909888863564, "rewards/margins": 0.018200259655714035, "rewards/rejected": -0.04440417140722275, "step": 1980 }, { "epoch": 0.6857339765678843, "grad_norm": 1.7088649272918701, "learning_rate": 4.154644062357629e-08, "logits/chosen": -3.008385419845581, "logits/rejected": -2.9863734245300293, "logps/chosen": -56.71128463745117, "logps/rejected": -57.5838737487793, "loss": 0.6862, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.030050843954086304, "rewards/margins": 0.014781134203076363, "rewards/rejected": -0.044831980019807816, "step": 1990 }, { "epoch": 0.6891798759476223, "grad_norm": 1.862519383430481, "learning_rate": 4.143341794089469e-08, "logits/chosen": -3.094057559967041, "logits/rejected": -3.0693211555480957, "logps/chosen": -58.01648712158203, "logps/rejected": -58.354042053222656, "loss": 0.6859, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.028060251846909523, "rewards/margins": 0.015334056690335274, "rewards/rejected": -0.0433943085372448, "step": 2000 }, { "epoch": 0.6891798759476223, "eval_logits/chosen": -3.117588758468628, "eval_logits/rejected": -3.111891746520996, "eval_logps/chosen": -59.51368713378906, "eval_logps/rejected": -64.59818267822266, "eval_loss": 0.6902511119842529, "eval_rewards/accuracies": 0.5868958830833435, "eval_rewards/chosen": -0.008017915301024914, "eval_rewards/margins": 0.006162704434245825, "eval_rewards/rejected": -0.014180620200932026, "eval_runtime": 385.0437, "eval_samples_per_second": 11.178, "eval_steps_per_second": 1.397, "step": 2000 }, { "epoch": 0.6926257753273605, "grad_norm": 1.9090850353240967, "learning_rate": 4.1319800711086036e-08, "logits/chosen": -3.015761375427246, "logits/rejected": -3.00416898727417, "logps/chosen": -53.12788772583008, "logps/rejected": -57.91374588012695, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": -0.03025425598025322, "rewards/margins": 0.018167123198509216, "rewards/rejected": -0.048421382904052734, "step": 2010 }, { "epoch": 0.6960716747070985, "grad_norm": 1.8756160736083984, "learning_rate": 4.120559304472536e-08, "logits/chosen": -3.0625293254852295, "logits/rejected": -3.0455217361450195, "logps/chosen": -58.8310546875, "logps/rejected": -59.229408264160156, "loss": 0.6856, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02707960642874241, "rewards/margins": 0.01600963994860649, "rewards/rejected": -0.04308924451470375, "step": 2020 }, { "epoch": 0.6995175740868367, "grad_norm": 1.7497738599777222, "learning_rate": 4.10907990737492e-08, "logits/chosen": -3.0139236450195312, "logits/rejected": -2.9887921810150146, "logps/chosen": -56.769683837890625, "logps/rejected": -57.542869567871094, "loss": 0.6849, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.03307128697633743, "rewards/margins": 0.017539430409669876, "rewards/rejected": -0.05061071366071701, "step": 2030 }, { "epoch": 0.7029634734665747, "grad_norm": 1.9670817852020264, "learning_rate": 4.0975422951306095e-08, "logits/chosen": -3.007533311843872, "logits/rejected": -2.992617607116699, "logps/chosen": -56.2066535949707, "logps/rejected": -58.416419982910156, "loss": 0.6856, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.03340793401002884, "rewards/margins": 0.01625545509159565, "rewards/rejected": -0.04966338723897934, "step": 2040 }, { "epoch": 0.7064093728463129, "grad_norm": 1.6379413604736328, "learning_rate": 4.08594688516063e-08, "logits/chosen": -3.0516562461853027, "logits/rejected": -3.021713972091675, "logps/chosen": -56.81854248046875, "logps/rejected": -56.809295654296875, "loss": 0.6824, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.026581842452287674, "rewards/margins": 0.02265850640833378, "rewards/rejected": -0.0492403544485569, "step": 2050 }, { "epoch": 0.709855272226051, "grad_norm": 1.8194429874420166, "learning_rate": 4.0742940969770864e-08, "logits/chosen": -2.977799892425537, "logits/rejected": -2.9734408855438232, "logps/chosen": -56.75453567504883, "logps/rejected": -58.05543899536133, "loss": 0.6878, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.03740880638360977, "rewards/margins": 0.011692319065332413, "rewards/rejected": -0.049101125448942184, "step": 2060 }, { "epoch": 0.7133011716057891, "grad_norm": 1.7173527479171753, "learning_rate": 4.062584352167971e-08, "logits/chosen": -3.059565782546997, "logits/rejected": -3.033684492111206, "logps/chosen": -57.686187744140625, "logps/rejected": -56.419921875, "loss": 0.6843, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.029393959790468216, "rewards/margins": 0.018624670803546906, "rewards/rejected": -0.04801863431930542, "step": 2070 }, { "epoch": 0.7167470709855273, "grad_norm": 1.6900782585144043, "learning_rate": 4.0508180743819255e-08, "logits/chosen": -3.028625965118408, "logits/rejected": -2.9993066787719727, "logps/chosen": -58.17266082763672, "logps/rejected": -55.96406173706055, "loss": 0.6833, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.03412729874253273, "rewards/margins": 0.020834842696785927, "rewards/rejected": -0.05496213957667351, "step": 2080 }, { "epoch": 0.7201929703652653, "grad_norm": 1.733019471168518, "learning_rate": 4.038995689312901e-08, "logits/chosen": -3.0275211334228516, "logits/rejected": -3.0161385536193848, "logps/chosen": -56.534019470214844, "logps/rejected": -59.515838623046875, "loss": 0.6866, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.0316321887075901, "rewards/margins": 0.014154776930809021, "rewards/rejected": -0.04578696936368942, "step": 2090 }, { "epoch": 0.7236388697450035, "grad_norm": 1.9503872394561768, "learning_rate": 4.027117624684765e-08, "logits/chosen": -3.033446788787842, "logits/rejected": -3.018099784851074, "logps/chosen": -56.406578063964844, "logps/rejected": -57.39359664916992, "loss": 0.6846, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030192747712135315, "rewards/margins": 0.018196851015090942, "rewards/rejected": -0.048389606177806854, "step": 2100 }, { "epoch": 0.7236388697450035, "eval_logits/chosen": -3.11161208152771, "eval_logits/rejected": -3.1059489250183105, "eval_logps/chosen": -59.78423309326172, "eval_logps/rejected": -64.94279479980469, "eval_loss": 0.6899173259735107, "eval_rewards/accuracies": 0.5829461216926575, "eval_rewards/chosen": -0.01072339154779911, "eval_rewards/margins": 0.006903324741870165, "eval_rewards/rejected": -0.017626715824007988, "eval_runtime": 384.4459, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 2100 }, { "epoch": 0.7270847691247415, "grad_norm": 1.7966265678405762, "learning_rate": 4.0151843102358255e-08, "logits/chosen": -2.9671010971069336, "logits/rejected": -2.9459662437438965, "logps/chosen": -57.2274284362793, "logps/rejected": -58.16614532470703, "loss": 0.6839, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03469712659716606, "rewards/margins": 0.019563738256692886, "rewards/rejected": -0.05426086112856865, "step": 2110 }, { "epoch": 0.7305306685044797, "grad_norm": 1.7504278421401978, "learning_rate": 4.0031961777032796e-08, "logits/chosen": -3.030264377593994, "logits/rejected": -3.007645606994629, "logps/chosen": -58.34819412231445, "logps/rejected": -60.5435905456543, "loss": 0.6828, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03323102742433548, "rewards/margins": 0.021936681121587753, "rewards/rejected": -0.055167704820632935, "step": 2120 }, { "epoch": 0.7339765678842178, "grad_norm": 1.8038078546524048, "learning_rate": 3.991153660807599e-08, "logits/chosen": -2.9835686683654785, "logits/rejected": -2.9676289558410645, "logps/chosen": -58.33893966674805, "logps/rejected": -58.84006881713867, "loss": 0.6862, "rewards/accuracies": 0.609375, "rewards/chosen": -0.03550464287400246, "rewards/margins": 0.01528511755168438, "rewards/rejected": -0.050789762288331985, "step": 2130 }, { "epoch": 0.7374224672639559, "grad_norm": 1.7348674535751343, "learning_rate": 3.979057195236834e-08, "logits/chosen": -2.9713988304138184, "logits/rejected": -2.945173740386963, "logps/chosen": -54.82487869262695, "logps/rejected": -56.58161544799805, "loss": 0.6831, "rewards/accuracies": 0.640625, "rewards/chosen": -0.03161891549825668, "rewards/margins": 0.021184608340263367, "rewards/rejected": -0.05280352383852005, "step": 2140 }, { "epoch": 0.740868366643694, "grad_norm": 1.775078296661377, "learning_rate": 3.9669072186308496e-08, "logits/chosen": -3.017688035964966, "logits/rejected": -3.000037670135498, "logps/chosen": -57.452980041503906, "logps/rejected": -57.556190490722656, "loss": 0.6853, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0355488583445549, "rewards/margins": 0.016615424305200577, "rewards/rejected": -0.05216427892446518, "step": 2150 }, { "epoch": 0.7443142660234321, "grad_norm": 1.6424776315689087, "learning_rate": 3.9547041705655e-08, "logits/chosen": -3.0127129554748535, "logits/rejected": -2.988957643508911, "logps/chosen": -59.63279342651367, "logps/rejected": -58.4204216003418, "loss": 0.6836, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.0328582227230072, "rewards/margins": 0.020023513585329056, "rewards/rejected": -0.05288173630833626, "step": 2160 }, { "epoch": 0.7477601654031703, "grad_norm": 1.6106654405593872, "learning_rate": 3.942448492536717e-08, "logits/chosen": -2.959467649459839, "logits/rejected": -2.934079647064209, "logps/chosen": -56.54290771484375, "logps/rejected": -56.23845672607422, "loss": 0.6848, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.04299347847700119, "rewards/margins": 0.017936604097485542, "rewards/rejected": -0.06093008443713188, "step": 2170 }, { "epoch": 0.7512060647829083, "grad_norm": 1.8046953678131104, "learning_rate": 3.930140627944539e-08, "logits/chosen": -2.9991118907928467, "logits/rejected": -2.9810397624969482, "logps/chosen": -55.923851013183594, "logps/rejected": -57.81981658935547, "loss": 0.6846, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.03583117201924324, "rewards/margins": 0.01813691481947899, "rewards/rejected": -0.05396808311343193, "step": 2180 }, { "epoch": 0.7546519641626465, "grad_norm": 1.6673351526260376, "learning_rate": 3.9177810220770714e-08, "logits/chosen": -3.047650098800659, "logits/rejected": -3.028332233428955, "logps/chosen": -58.14948272705078, "logps/rejected": -58.61583709716797, "loss": 0.6856, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.035184066742658615, "rewards/margins": 0.016083333641290665, "rewards/rejected": -0.05126740410923958, "step": 2190 }, { "epoch": 0.7580978635423845, "grad_norm": 1.6753615140914917, "learning_rate": 3.905370122094375e-08, "logits/chosen": -2.993964672088623, "logits/rejected": -2.978240966796875, "logps/chosen": -58.799041748046875, "logps/rejected": -59.7036247253418, "loss": 0.6861, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03728087246417999, "rewards/margins": 0.015164054930210114, "rewards/rejected": -0.0524449348449707, "step": 2200 }, { "epoch": 0.7580978635423845, "eval_logits/chosen": -3.108147144317627, "eval_logits/rejected": -3.102468252182007, "eval_logps/chosen": -60.04545974731445, "eval_logps/rejected": -65.24906158447266, "eval_loss": 0.6897270083427429, "eval_rewards/accuracies": 0.5868958830833435, "eval_rewards/chosen": -0.013335632160305977, "eval_rewards/margins": 0.007353761233389378, "eval_rewards/rejected": -0.02068939432501793, "eval_runtime": 384.2736, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 2200 }, { "epoch": 0.7615437629221227, "grad_norm": 1.6932401657104492, "learning_rate": 3.892908377012286e-08, "logits/chosen": -3.028898000717163, "logits/rejected": -2.9983444213867188, "logps/chosen": -58.44950485229492, "logps/rejected": -58.597312927246094, "loss": 0.6826, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.03595678508281708, "rewards/margins": 0.02251359447836876, "rewards/rejected": -0.05847037956118584, "step": 2210 }, { "epoch": 0.7649896623018608, "grad_norm": 1.7982257604599, "learning_rate": 3.8803962376861776e-08, "logits/chosen": -3.013913869857788, "logits/rejected": -2.982335329055786, "logps/chosen": -60.489967346191406, "logps/rejected": -58.84846115112305, "loss": 0.6848, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.038976140320301056, "rewards/margins": 0.017788967117667198, "rewards/rejected": -0.0567651093006134, "step": 2220 }, { "epoch": 0.7684355616815989, "grad_norm": 1.8083417415618896, "learning_rate": 3.86783415679464e-08, "logits/chosen": -2.9765384197235107, "logits/rejected": -2.966383457183838, "logps/chosen": -58.33076858520508, "logps/rejected": -59.06047439575195, "loss": 0.6861, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.041071441024541855, "rewards/margins": 0.015299441292881966, "rewards/rejected": -0.05637087672948837, "step": 2230 }, { "epoch": 0.771881461061337, "grad_norm": 1.6782116889953613, "learning_rate": 3.8552225888231084e-08, "logits/chosen": -3.083024740219116, "logits/rejected": -3.064627170562744, "logps/chosen": -57.59839630126953, "logps/rejected": -57.254180908203125, "loss": 0.6843, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.041320037096738815, "rewards/margins": 0.018795475363731384, "rewards/rejected": -0.0601155161857605, "step": 2240 }, { "epoch": 0.7753273604410751, "grad_norm": 1.9075140953063965, "learning_rate": 3.842561990047419e-08, "logits/chosen": -2.994036912918091, "logits/rejected": -2.97294282913208, "logps/chosen": -57.764801025390625, "logps/rejected": -57.16424560546875, "loss": 0.685, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.04026733711361885, "rewards/margins": 0.017371635884046555, "rewards/rejected": -0.05763896554708481, "step": 2250 }, { "epoch": 0.7787732598208132, "grad_norm": 1.7869060039520264, "learning_rate": 3.829852818517301e-08, "logits/chosen": -3.080289363861084, "logits/rejected": -3.0555710792541504, "logps/chosen": -57.22008514404297, "logps/rejected": -58.78937530517578, "loss": 0.685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.04204849153757095, "rewards/margins": 0.017353584989905357, "rewards/rejected": -0.05940207093954086, "step": 2260 }, { "epoch": 0.7822191592005513, "grad_norm": 1.9243491888046265, "learning_rate": 3.8170955340398024e-08, "logits/chosen": -2.994227886199951, "logits/rejected": -2.9837756156921387, "logps/chosen": -57.66072463989258, "logps/rejected": -58.144371032714844, "loss": 0.6861, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04407629370689392, "rewards/margins": 0.015283094719052315, "rewards/rejected": -0.05935938283801079, "step": 2270 }, { "epoch": 0.7856650585802895, "grad_norm": 1.8666582107543945, "learning_rate": 3.804290598162661e-08, "logits/chosen": -3.0024993419647217, "logits/rejected": -2.971998691558838, "logps/chosen": -59.71482467651367, "logps/rejected": -59.344825744628906, "loss": 0.6804, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.03427065163850784, "rewards/margins": 0.02677471563220024, "rewards/rejected": -0.06104537099599838, "step": 2280 }, { "epoch": 0.7891109579600276, "grad_norm": 1.7960370779037476, "learning_rate": 3.7914384741575963e-08, "logits/chosen": -2.9649546146392822, "logits/rejected": -2.9389636516571045, "logps/chosen": -56.56310272216797, "logps/rejected": -57.679107666015625, "loss": 0.6835, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.03888117894530296, "rewards/margins": 0.020335400477051735, "rewards/rejected": -0.05921658128499985, "step": 2290 }, { "epoch": 0.7925568573397657, "grad_norm": 1.7282792329788208, "learning_rate": 3.778539627003561e-08, "logits/chosen": -2.988661289215088, "logits/rejected": -2.9646308422088623, "logps/chosen": -58.56208038330078, "logps/rejected": -59.764549255371094, "loss": 0.6836, "rewards/accuracies": 0.625, "rewards/chosen": -0.044373877346515656, "rewards/margins": 0.02048449032008648, "rewards/rejected": -0.06485836952924728, "step": 2300 }, { "epoch": 0.7925568573397657, "eval_logits/chosen": -3.1044154167175293, "eval_logits/rejected": -3.098724603652954, "eval_logps/chosen": -60.39040756225586, "eval_logps/rejected": -65.65296936035156, "eval_loss": 0.6894731521606445, "eval_rewards/accuracies": 0.5922397971153259, "eval_rewards/chosen": -0.016785062849521637, "eval_rewards/margins": 0.007943346165120602, "eval_rewards/rejected": -0.024728409945964813, "eval_runtime": 384.4947, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 2300 }, { "epoch": 0.7960027567195038, "grad_norm": 1.946254849433899, "learning_rate": 3.7655945233699046e-08, "logits/chosen": -2.975891590118408, "logits/rejected": -2.958909749984741, "logps/chosen": -57.987144470214844, "logps/rejected": -59.5714111328125, "loss": 0.6849, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.045588064938783646, "rewards/margins": 0.017828360199928284, "rewards/rejected": -0.06341642886400223, "step": 2310 }, { "epoch": 0.7994486560992419, "grad_norm": 1.8851513862609863, "learning_rate": 3.7526036315995024e-08, "logits/chosen": -3.035238742828369, "logits/rejected": -3.0107016563415527, "logps/chosen": -60.47370147705078, "logps/rejected": -62.362586975097656, "loss": 0.6846, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.04454559087753296, "rewards/margins": 0.018349364399909973, "rewards/rejected": -0.06289495527744293, "step": 2320 }, { "epoch": 0.80289455547898, "grad_norm": 1.8436781167984009, "learning_rate": 3.739567421691803e-08, "logits/chosen": -3.0294172763824463, "logits/rejected": -3.0034923553466797, "logps/chosen": -58.335426330566406, "logps/rejected": -58.9324836730957, "loss": 0.6823, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.04106391221284866, "rewards/margins": 0.02297310158610344, "rewards/rejected": -0.0640370100736618, "step": 2330 }, { "epoch": 0.8063404548587181, "grad_norm": 1.6702615022659302, "learning_rate": 3.726486365285828e-08, "logits/chosen": -3.010782241821289, "logits/rejected": -2.9824962615966797, "logps/chosen": -56.69477462768555, "logps/rejected": -56.83466339111328, "loss": 0.6828, "rewards/accuracies": 0.609375, "rewards/chosen": -0.04235810041427612, "rewards/margins": 0.02205110900104046, "rewards/rejected": -0.06440921127796173, "step": 2340 }, { "epoch": 0.8097863542384562, "grad_norm": 1.8457934856414795, "learning_rate": 3.713360935643105e-08, "logits/chosen": -3.0336556434631348, "logits/rejected": -3.0187900066375732, "logps/chosen": -58.77387237548828, "logps/rejected": -61.80967330932617, "loss": 0.6849, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.050251416862010956, "rewards/margins": 0.017895471304655075, "rewards/rejected": -0.06814688444137573, "step": 2350 }, { "epoch": 0.8132322536181944, "grad_norm": 1.8453093767166138, "learning_rate": 3.7001916076305515e-08, "logits/chosen": -3.008915662765503, "logits/rejected": -2.974138021469116, "logps/chosen": -61.52025604248047, "logps/rejected": -59.27776336669922, "loss": 0.6804, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.04145478457212448, "rewards/margins": 0.027239182963967323, "rewards/rejected": -0.06869396567344666, "step": 2360 }, { "epoch": 0.8166781529979324, "grad_norm": 2.0200343132019043, "learning_rate": 3.686978857703287e-08, "logits/chosen": -2.981760263442993, "logits/rejected": -2.9633803367614746, "logps/chosen": -58.65894317626953, "logps/rejected": -58.9062385559082, "loss": 0.6848, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.04565044492483139, "rewards/margins": 0.0179426372051239, "rewards/rejected": -0.06359308958053589, "step": 2370 }, { "epoch": 0.8201240523776706, "grad_norm": 2.0217244625091553, "learning_rate": 3.6737231638874e-08, "logits/chosen": -2.9946258068084717, "logits/rejected": -2.9741220474243164, "logps/chosen": -58.064208984375, "logps/rejected": -59.4661979675293, "loss": 0.6825, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.041321393102407455, "rewards/margins": 0.022583045065402985, "rewards/rejected": -0.06390444189310074, "step": 2380 }, { "epoch": 0.8235699517574087, "grad_norm": 1.9615968465805054, "learning_rate": 3.660425005762656e-08, "logits/chosen": -2.9946742057800293, "logits/rejected": -2.9747567176818848, "logps/chosen": -60.14471435546875, "logps/rejected": -61.221946716308594, "loss": 0.6836, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04614607244729996, "rewards/margins": 0.020358974114060402, "rewards/rejected": -0.0665050521492958, "step": 2390 }, { "epoch": 0.8270158511371468, "grad_norm": 1.947713017463684, "learning_rate": 3.647084864445137e-08, "logits/chosen": -2.9955036640167236, "logits/rejected": -2.97472882270813, "logps/chosen": -59.67714309692383, "logps/rejected": -58.160728454589844, "loss": 0.6847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0478370264172554, "rewards/margins": 0.018191467970609665, "rewards/rejected": -0.06602849066257477, "step": 2400 }, { "epoch": 0.8270158511371468, "eval_logits/chosen": -3.100661516189575, "eval_logits/rejected": -3.0949013233184814, "eval_logps/chosen": -60.806888580322266, "eval_logps/rejected": -66.14020538330078, "eval_loss": 0.6891672611236572, "eval_rewards/accuracies": 0.5868958830833435, "eval_rewards/chosen": -0.020949942991137505, "eval_rewards/margins": 0.00865084771066904, "eval_rewards/rejected": -0.02960078790783882, "eval_runtime": 384.3481, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 2400 }, { "epoch": 0.8304617505168849, "grad_norm": 1.8975228071212769, "learning_rate": 3.633703222569846e-08, "logits/chosen": -2.9866557121276855, "logits/rejected": -2.9641973972320557, "logps/chosen": -58.975730895996094, "logps/rejected": -59.30794143676758, "loss": 0.6819, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.047889094799757004, "rewards/margins": 0.02423194609582424, "rewards/rejected": -0.0721210390329361, "step": 2410 }, { "epoch": 0.833907649896623, "grad_norm": 2.121293783187866, "learning_rate": 3.620280564273241e-08, "logits/chosen": -2.993213176727295, "logits/rejected": -2.966607093811035, "logps/chosen": -59.87786865234375, "logps/rejected": -59.63630294799805, "loss": 0.6834, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.05004245787858963, "rewards/margins": 0.021116720512509346, "rewards/rejected": -0.07115916907787323, "step": 2420 }, { "epoch": 0.8373535492763611, "grad_norm": 1.939035415649414, "learning_rate": 3.606817375175716e-08, "logits/chosen": -3.0517094135284424, "logits/rejected": -3.0259761810302734, "logps/chosen": -61.3172492980957, "logps/rejected": -59.26446533203125, "loss": 0.6831, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05021647736430168, "rewards/margins": 0.0214863158762455, "rewards/rejected": -0.07170280069112778, "step": 2430 }, { "epoch": 0.8407994486560992, "grad_norm": 1.7287660837173462, "learning_rate": 3.5933141423640376e-08, "logits/chosen": -3.0139081478118896, "logits/rejected": -2.9831302165985107, "logps/chosen": -60.192726135253906, "logps/rejected": -58.50376510620117, "loss": 0.681, "rewards/accuracies": 0.640625, "rewards/chosen": -0.04867525398731232, "rewards/margins": 0.02579745091497898, "rewards/rejected": -0.07447270303964615, "step": 2440 }, { "epoch": 0.8442453480358374, "grad_norm": 1.7438111305236816, "learning_rate": 3.579771354373721e-08, "logits/chosen": -2.996495008468628, "logits/rejected": -2.968963861465454, "logps/chosen": -59.237281799316406, "logps/rejected": -59.39753341674805, "loss": 0.6812, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.04925781860947609, "rewards/margins": 0.025830138474702835, "rewards/rejected": -0.07508794963359833, "step": 2450 }, { "epoch": 0.8476912474155754, "grad_norm": 1.944701910018921, "learning_rate": 3.5661895011713494e-08, "logits/chosen": -3.036695718765259, "logits/rejected": -3.0117411613464355, "logps/chosen": -60.16267013549805, "logps/rejected": -60.866302490234375, "loss": 0.6798, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05073127895593643, "rewards/margins": 0.02843235433101654, "rewards/rejected": -0.07916363328695297, "step": 2460 }, { "epoch": 0.8511371467953136, "grad_norm": 2.0044193267822266, "learning_rate": 3.552569074136858e-08, "logits/chosen": -3.09183931350708, "logits/rejected": -3.071075916290283, "logps/chosen": -60.33824920654297, "logps/rejected": -63.61500930786133, "loss": 0.6829, "rewards/accuracies": 0.609375, "rewards/chosen": -0.058057595044374466, "rewards/margins": 0.021942198276519775, "rewards/rejected": -0.07999978959560394, "step": 2470 }, { "epoch": 0.8545830461750517, "grad_norm": 1.7847065925598145, "learning_rate": 3.5389105660457474e-08, "logits/chosen": -2.915327787399292, "logits/rejected": -2.9085865020751953, "logps/chosen": -58.845794677734375, "logps/rejected": -61.94459915161133, "loss": 0.687, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.053052984178066254, "rewards/margins": 0.013705862686038017, "rewards/rejected": -0.06675885617733002, "step": 2480 }, { "epoch": 0.8580289455547898, "grad_norm": 1.920035719871521, "learning_rate": 3.525214471051258e-08, "logits/chosen": -2.9986464977264404, "logits/rejected": -2.990365982055664, "logps/chosen": -57.812095642089844, "logps/rejected": -60.44374465942383, "loss": 0.687, "rewards/accuracies": 0.578125, "rewards/chosen": -0.05696609616279602, "rewards/margins": 0.013766164891421795, "rewards/rejected": -0.07073226571083069, "step": 2490 }, { "epoch": 0.8614748449345279, "grad_norm": 1.8735101222991943, "learning_rate": 3.511481284666496e-08, "logits/chosen": -2.9600300788879395, "logits/rejected": -2.9459688663482666, "logps/chosen": -58.956642150878906, "logps/rejected": -60.98919677734375, "loss": 0.6838, "rewards/accuracies": 0.59375, "rewards/chosen": -0.051624685525894165, "rewards/margins": 0.020304836332798004, "rewards/rejected": -0.07192952930927277, "step": 2500 }, { "epoch": 0.8614748449345279, "eval_logits/chosen": -3.0967652797698975, "eval_logits/rejected": -3.0910022258758545, "eval_logps/chosen": -61.215702056884766, "eval_logps/rejected": -66.6113052368164, "eval_loss": 0.6889049410820007, "eval_rewards/accuracies": 0.5903810262680054, "eval_rewards/chosen": -0.025038031861186028, "eval_rewards/margins": 0.009273835457861423, "eval_rewards/rejected": -0.03431186452507973, "eval_runtime": 384.2297, "eval_samples_per_second": 11.202, "eval_steps_per_second": 1.4, "step": 2500 }, { "epoch": 0.864920744314266, "grad_norm": 1.9118610620498657, "learning_rate": 3.4977115037464985e-08, "logits/chosen": -2.9711413383483887, "logits/rejected": -2.944815158843994, "logps/chosen": -58.86348342895508, "logps/rejected": -59.43571090698242, "loss": 0.6827, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.05588380619883537, "rewards/margins": 0.022921234369277954, "rewards/rejected": -0.07880503684282303, "step": 2510 }, { "epoch": 0.8683666436940042, "grad_norm": 1.8457341194152832, "learning_rate": 3.483905626470265e-08, "logits/chosen": -3.032449960708618, "logits/rejected": -3.0067577362060547, "logps/chosen": -60.454444885253906, "logps/rejected": -59.48230743408203, "loss": 0.6854, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.05567127466201782, "rewards/margins": 0.017124414443969727, "rewards/rejected": -0.07279568165540695, "step": 2520 }, { "epoch": 0.8718125430737422, "grad_norm": 1.744009256362915, "learning_rate": 3.470064152322728e-08, "logits/chosen": -2.9465372562408447, "logits/rejected": -2.9316067695617676, "logps/chosen": -58.74321746826172, "logps/rejected": -61.12950897216797, "loss": 0.6837, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.05705111101269722, "rewards/margins": 0.020534086972475052, "rewards/rejected": -0.07758519053459167, "step": 2530 }, { "epoch": 0.8752584424534804, "grad_norm": 1.977340579032898, "learning_rate": 3.4561875820766864e-08, "logits/chosen": -3.083641290664673, "logits/rejected": -3.051438808441162, "logps/chosen": -60.03351593017578, "logps/rejected": -57.312896728515625, "loss": 0.6795, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04907160997390747, "rewards/margins": 0.028961068019270897, "rewards/rejected": -0.07803267240524292, "step": 2540 }, { "epoch": 0.8787043418332184, "grad_norm": 1.8939472436904907, "learning_rate": 3.442276417774684e-08, "logits/chosen": -2.963914394378662, "logits/rejected": -2.930088520050049, "logps/chosen": -60.17865753173828, "logps/rejected": -60.11580276489258, "loss": 0.6806, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.054998576641082764, "rewards/margins": 0.027044588699936867, "rewards/rejected": -0.08204315602779388, "step": 2550 }, { "epoch": 0.8821502412129566, "grad_norm": 1.9534904956817627, "learning_rate": 3.4283311627108525e-08, "logits/chosen": -3.0542969703674316, "logits/rejected": -3.020608901977539, "logps/chosen": -61.627174377441406, "logps/rejected": -59.780677795410156, "loss": 0.6841, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.05754048749804497, "rewards/margins": 0.019636686891317368, "rewards/rejected": -0.07717718183994293, "step": 2560 }, { "epoch": 0.8855961405926946, "grad_norm": 1.8603425025939941, "learning_rate": 3.4143523214126946e-08, "logits/chosen": -3.0562100410461426, "logits/rejected": -3.0208911895751953, "logps/chosen": -60.71092987060547, "logps/rejected": -59.271934509277344, "loss": 0.6801, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.05018734186887741, "rewards/margins": 0.02779127098619938, "rewards/rejected": -0.07797860354185104, "step": 2570 }, { "epoch": 0.8890420399724328, "grad_norm": 1.9357118606567383, "learning_rate": 3.4003403996228354e-08, "logits/chosen": -3.037440538406372, "logits/rejected": -3.014925718307495, "logps/chosen": -58.04810333251953, "logps/rejected": -59.7235107421875, "loss": 0.6814, "rewards/accuracies": 0.640625, "rewards/chosen": -0.05774794891476631, "rewards/margins": 0.02505926415324211, "rewards/rejected": -0.08280721306800842, "step": 2580 }, { "epoch": 0.892487939352171, "grad_norm": 1.9867465496063232, "learning_rate": 3.386295904280725e-08, "logits/chosen": -3.059483289718628, "logits/rejected": -3.0427792072296143, "logps/chosen": -60.69512176513672, "logps/rejected": -60.93449783325195, "loss": 0.682, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.053453318774700165, "rewards/margins": 0.024111952632665634, "rewards/rejected": -0.0775652676820755, "step": 2590 }, { "epoch": 0.895933838731909, "grad_norm": 1.870638132095337, "learning_rate": 3.3722193435042965e-08, "logits/chosen": -3.014085054397583, "logits/rejected": -2.9972643852233887, "logps/chosen": -58.396324157714844, "logps/rejected": -61.89160919189453, "loss": 0.6841, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.05936922878026962, "rewards/margins": 0.01977996714413166, "rewards/rejected": -0.07914920151233673, "step": 2600 }, { "epoch": 0.895933838731909, "eval_logits/chosen": -3.0933353900909424, "eval_logits/rejected": -3.087651014328003, "eval_logps/chosen": -61.549625396728516, "eval_logps/rejected": -67.02259826660156, "eval_loss": 0.6885599493980408, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.028377274051308632, "eval_rewards/margins": 0.010047496296465397, "eval_rewards/rejected": -0.038424767553806305, "eval_runtime": 384.4537, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 2600 }, { "epoch": 0.8993797381116472, "grad_norm": 2.0609378814697266, "learning_rate": 3.358111226571583e-08, "logits/chosen": -3.0247185230255127, "logits/rejected": -3.0110678672790527, "logps/chosen": -57.71208572387695, "logps/rejected": -60.53589630126953, "loss": 0.6841, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.055869292467832565, "rewards/margins": 0.019581874832510948, "rewards/rejected": -0.07545118033885956, "step": 2610 }, { "epoch": 0.9028256374913852, "grad_norm": 2.041883945465088, "learning_rate": 3.3439720639022914e-08, "logits/chosen": -2.97845458984375, "logits/rejected": -2.961709499359131, "logps/chosen": -59.412628173828125, "logps/rejected": -63.06536865234375, "loss": 0.6794, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.05394721031188965, "rewards/margins": 0.029407376423478127, "rewards/rejected": -0.08335459232330322, "step": 2620 }, { "epoch": 0.9062715368711234, "grad_norm": 2.0693163871765137, "learning_rate": 3.32980236703934e-08, "logits/chosen": -2.970226764678955, "logits/rejected": -2.9422554969787598, "logps/chosen": -59.864227294921875, "logps/rejected": -59.469871520996094, "loss": 0.6825, "rewards/accuracies": 0.59375, "rewards/chosen": -0.059826843440532684, "rewards/margins": 0.02292069047689438, "rewards/rejected": -0.08274753391742706, "step": 2630 }, { "epoch": 0.9097174362508614, "grad_norm": 1.9728206396102905, "learning_rate": 3.3156026486303463e-08, "logits/chosen": -3.065502882003784, "logits/rejected": -3.0424623489379883, "logps/chosen": -59.48903274536133, "logps/rejected": -59.970909118652344, "loss": 0.6814, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06527196615934372, "rewards/margins": 0.02535632625222206, "rewards/rejected": -0.09062829613685608, "step": 2640 }, { "epoch": 0.9131633356305996, "grad_norm": 1.9187884330749512, "learning_rate": 3.301373422409082e-08, "logits/chosen": -2.965153217315674, "logits/rejected": -2.9357123374938965, "logps/chosen": -60.7358283996582, "logps/rejected": -61.5562744140625, "loss": 0.6806, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05258868262171745, "rewards/margins": 0.02703956887125969, "rewards/rejected": -0.07962825149297714, "step": 2650 }, { "epoch": 0.9166092350103378, "grad_norm": 1.8104689121246338, "learning_rate": 3.287115203176887e-08, "logits/chosen": -3.0565733909606934, "logits/rejected": -3.023632287979126, "logps/chosen": -63.6373176574707, "logps/rejected": -59.726165771484375, "loss": 0.681, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.05793512985110283, "rewards/margins": 0.02604956366121769, "rewards/rejected": -0.08398470282554626, "step": 2660 }, { "epoch": 0.9200551343900758, "grad_norm": 2.0160820484161377, "learning_rate": 3.2728285067840426e-08, "logits/chosen": -2.970146417617798, "logits/rejected": -2.945462942123413, "logps/chosen": -59.9456901550293, "logps/rejected": -60.949256896972656, "loss": 0.683, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.06337870657444, "rewards/margins": 0.022111540660262108, "rewards/rejected": -0.08549024909734726, "step": 2670 }, { "epoch": 0.923501033769814, "grad_norm": 1.902436375617981, "learning_rate": 3.258513850111112e-08, "logits/chosen": -3.000199556350708, "logits/rejected": -2.9784107208251953, "logps/chosen": -59.2768669128418, "logps/rejected": -62.97795867919922, "loss": 0.6821, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.06348595023155212, "rewards/margins": 0.024028928950428963, "rewards/rejected": -0.08751488476991653, "step": 2680 }, { "epoch": 0.926946933149552, "grad_norm": 1.916286826133728, "learning_rate": 3.244171751050235e-08, "logits/chosen": -3.0135703086853027, "logits/rejected": -2.984884023666382, "logps/chosen": -60.307334899902344, "logps/rejected": -60.89008712768555, "loss": 0.6804, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.056459106504917145, "rewards/margins": 0.027269680052995682, "rewards/rejected": -0.08372878283262253, "step": 2690 }, { "epoch": 0.9303928325292902, "grad_norm": 1.8760229349136353, "learning_rate": 3.229802728486395e-08, "logits/chosen": -2.998236894607544, "logits/rejected": -2.9683120250701904, "logps/chosen": -61.063201904296875, "logps/rejected": -61.303009033203125, "loss": 0.6824, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06226380914449692, "rewards/margins": 0.023240935057401657, "rewards/rejected": -0.08550475537776947, "step": 2700 }, { "epoch": 0.9303928325292902, "eval_logits/chosen": -3.089698076248169, "eval_logits/rejected": -3.0839478969573975, "eval_logps/chosen": -61.91862869262695, "eval_logps/rejected": -67.45933532714844, "eval_loss": 0.6882630586624146, "eval_rewards/accuracies": 0.5855018496513367, "eval_rewards/chosen": -0.032067302614450455, "eval_rewards/margins": 0.010724782943725586, "eval_rewards/rejected": -0.04279208183288574, "eval_runtime": 384.3637, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 2700 }, { "epoch": 0.9338387319090282, "grad_norm": 1.9357118606567383, "learning_rate": 3.215407302278644e-08, "logits/chosen": -3.050286054611206, "logits/rejected": -3.033205032348633, "logps/chosen": -61.737022399902344, "logps/rejected": -62.07782745361328, "loss": 0.686, "rewards/accuracies": 0.578125, "rewards/chosen": -0.06558212637901306, "rewards/margins": 0.015997232869267464, "rewards/rejected": -0.08157936483621597, "step": 2710 }, { "epoch": 0.9372846312887664, "grad_norm": 1.9162743091583252, "learning_rate": 3.200985993241298e-08, "logits/chosen": -3.0029079914093018, "logits/rejected": -2.9888548851013184, "logps/chosen": -61.6423454284668, "logps/rejected": -62.909019470214844, "loss": 0.6807, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.06742598116397858, "rewards/margins": 0.026688963174819946, "rewards/rejected": -0.09411494433879852, "step": 2720 }, { "epoch": 0.9407305306685044, "grad_norm": 1.9952343702316284, "learning_rate": 3.1865393231250884e-08, "logits/chosen": -2.979914426803589, "logits/rejected": -2.9662184715270996, "logps/chosen": -61.29094314575195, "logps/rejected": -61.263267517089844, "loss": 0.6844, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.06579684466123581, "rewards/margins": 0.019393552094697952, "rewards/rejected": -0.08519040048122406, "step": 2730 }, { "epoch": 0.9441764300482426, "grad_norm": 2.008897304534912, "learning_rate": 3.172067814598291e-08, "logits/chosen": -2.989284038543701, "logits/rejected": -2.966811180114746, "logps/chosen": -58.96021270751953, "logps/rejected": -60.29884719848633, "loss": 0.682, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06809289008378983, "rewards/margins": 0.024256303906440735, "rewards/rejected": -0.09234919399023056, "step": 2740 }, { "epoch": 0.9476223294279807, "grad_norm": 1.9547452926635742, "learning_rate": 3.1575719912278146e-08, "logits/chosen": -3.0216493606567383, "logits/rejected": -2.99959659576416, "logps/chosen": -61.20268630981445, "logps/rejected": -62.02368927001953, "loss": 0.6837, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06514692306518555, "rewards/margins": 0.020700866356492043, "rewards/rejected": -0.08584778755903244, "step": 2750 }, { "epoch": 0.9510682288077188, "grad_norm": 2.0816023349761963, "learning_rate": 3.143052377460257e-08, "logits/chosen": -2.964078903198242, "logits/rejected": -2.9428696632385254, "logps/chosen": -60.8036994934082, "logps/rejected": -60.589134216308594, "loss": 0.6832, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.06277766823768616, "rewards/margins": 0.021733291447162628, "rewards/rejected": -0.08451096713542938, "step": 2760 }, { "epoch": 0.954514128187457, "grad_norm": 1.8665035963058472, "learning_rate": 3.128509498602933e-08, "logits/chosen": -3.0003015995025635, "logits/rejected": -2.9855659008026123, "logps/chosen": -62.09807586669922, "logps/rejected": -63.236656188964844, "loss": 0.6822, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.05971125513315201, "rewards/margins": 0.023781852796673775, "rewards/rejected": -0.08349311351776123, "step": 2770 }, { "epoch": 0.957960027567195, "grad_norm": 2.0942790508270264, "learning_rate": 3.113943880804867e-08, "logits/chosen": -2.926558017730713, "logits/rejected": -2.9052395820617676, "logps/chosen": -59.053306579589844, "logps/rejected": -60.78388214111328, "loss": 0.6833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06801018118858337, "rewards/margins": 0.02199479192495346, "rewards/rejected": -0.09000497311353683, "step": 2780 }, { "epoch": 0.9614059269469332, "grad_norm": 1.8221739530563354, "learning_rate": 3.0993560510377636e-08, "logits/chosen": -2.9689197540283203, "logits/rejected": -2.9574332237243652, "logps/chosen": -58.8261833190918, "logps/rejected": -64.0365982055664, "loss": 0.6853, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.06904148310422897, "rewards/margins": 0.017710790038108826, "rewards/rejected": -0.0867522731423378, "step": 2790 }, { "epoch": 0.9648518263266712, "grad_norm": 1.8329358100891113, "learning_rate": 3.084746537076932e-08, "logits/chosen": -3.058472156524658, "logits/rejected": -3.035177707672119, "logps/chosen": -60.95489501953125, "logps/rejected": -64.85383605957031, "loss": 0.6824, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06287367641925812, "rewards/margins": 0.023363064974546432, "rewards/rejected": -0.08623673766851425, "step": 2800 }, { "epoch": 0.9648518263266712, "eval_logits/chosen": -3.0867879390716553, "eval_logits/rejected": -3.081066608428955, "eval_logps/chosen": -62.05662536621094, "eval_logps/rejected": -67.6514663696289, "eval_loss": 0.688012421131134, "eval_rewards/accuracies": 0.5929368138313293, "eval_rewards/chosen": -0.03344728797674179, "eval_rewards/margins": 0.0112661337479949, "eval_rewards/rejected": -0.044713422656059265, "eval_runtime": 384.6351, "eval_samples_per_second": 11.19, "eval_steps_per_second": 1.399, "step": 2800 }, { "epoch": 0.9682977257064094, "grad_norm": 2.0032522678375244, "learning_rate": 3.070115867482202e-08, "logits/chosen": -3.013986825942993, "logits/rejected": -2.993983507156372, "logps/chosen": -60.495140075683594, "logps/rejected": -62.9716796875, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": -0.06785044074058533, "rewards/margins": 0.026449289172887802, "rewards/rejected": -0.09429973363876343, "step": 2810 }, { "epoch": 0.9717436250861475, "grad_norm": 2.1149818897247314, "learning_rate": 3.0554645715787926e-08, "logits/chosen": -3.0158185958862305, "logits/rejected": -2.9941751956939697, "logps/chosen": -60.9375, "logps/rejected": -61.80029296875, "loss": 0.6832, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.06010693311691284, "rewards/margins": 0.021654192358255386, "rewards/rejected": -0.08176112920045853, "step": 2820 }, { "epoch": 0.9751895244658856, "grad_norm": 1.9543670415878296, "learning_rate": 3.040793179438167e-08, "logits/chosen": -2.915527582168579, "logits/rejected": -2.910946846008301, "logps/chosen": -58.91196823120117, "logps/rejected": -60.60822677612305, "loss": 0.6839, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06974777579307556, "rewards/margins": 0.020312650129199028, "rewards/rejected": -0.09006042778491974, "step": 2830 }, { "epoch": 0.9786354238456237, "grad_norm": 1.9782330989837646, "learning_rate": 3.026102221858853e-08, "logits/chosen": -2.9936022758483887, "logits/rejected": -2.96232271194458, "logps/chosen": -60.83803176879883, "logps/rejected": -61.680091857910156, "loss": 0.6801, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.0662757009267807, "rewards/margins": 0.028109584003686905, "rewards/rejected": -0.09438528120517731, "step": 2840 }, { "epoch": 0.9820813232253618, "grad_norm": 1.9992231130599976, "learning_rate": 3.0113922303472386e-08, "logits/chosen": -2.9486582279205322, "logits/rejected": -2.924018383026123, "logps/chosen": -64.09626770019531, "logps/rejected": -61.55986404418945, "loss": 0.6797, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.062040697783231735, "rewards/margins": 0.028691541403532028, "rewards/rejected": -0.09073223918676376, "step": 2850 }, { "epoch": 0.9855272226051, "grad_norm": 2.026111125946045, "learning_rate": 2.9966637370983444e-08, "logits/chosen": -3.0231735706329346, "logits/rejected": -2.996793031692505, "logps/chosen": -60.3083381652832, "logps/rejected": -61.34482955932617, "loss": 0.6773, "rewards/accuracies": 0.640625, "rewards/chosen": -0.06745734810829163, "rewards/margins": 0.03389859199523926, "rewards/rejected": -0.10135593265295029, "step": 2860 }, { "epoch": 0.988973121984838, "grad_norm": 1.9893444776535034, "learning_rate": 2.981917274976568e-08, "logits/chosen": -3.01432728767395, "logits/rejected": -2.9927713871002197, "logps/chosen": -62.2581787109375, "logps/rejected": -62.59688186645508, "loss": 0.6808, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06791607290506363, "rewards/margins": 0.026418615132570267, "rewards/rejected": -0.094334676861763, "step": 2870 }, { "epoch": 0.9924190213645762, "grad_norm": 2.100264072418213, "learning_rate": 2.967153377496405e-08, "logits/chosen": -3.0361812114715576, "logits/rejected": -3.0079872608184814, "logps/chosen": -63.25773239135742, "logps/rejected": -62.428985595703125, "loss": 0.6802, "rewards/accuracies": 0.609375, "rewards/chosen": -0.06720717996358871, "rewards/margins": 0.027882525697350502, "rewards/rejected": -0.09508970379829407, "step": 2880 }, { "epoch": 0.9958649207443143, "grad_norm": 2.0951766967773438, "learning_rate": 2.9523725788031473e-08, "logits/chosen": -2.951953649520874, "logits/rejected": -2.9324944019317627, "logps/chosen": -58.438323974609375, "logps/rejected": -62.59917068481445, "loss": 0.6824, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.06279212236404419, "rewards/margins": 0.023123985156416893, "rewards/rejected": -0.08591610938310623, "step": 2890 }, { "epoch": 0.9993108201240524, "grad_norm": 1.9077363014221191, "learning_rate": 2.9375754136535602e-08, "logits/chosen": -2.9489340782165527, "logits/rejected": -2.917557954788208, "logps/chosen": -57.6536979675293, "logps/rejected": -62.20427703857422, "loss": 0.6812, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06541205942630768, "rewards/margins": 0.026028072461485863, "rewards/rejected": -0.09144014120101929, "step": 2900 }, { "epoch": 0.9993108201240524, "eval_logits/chosen": -3.0832176208496094, "eval_logits/rejected": -3.077537775039673, "eval_logps/chosen": -62.34251022338867, "eval_logps/rejected": -67.98897552490234, "eval_loss": 0.6877877116203308, "eval_rewards/accuracies": 0.5906133651733398, "eval_rewards/chosen": -0.03630611672997475, "eval_rewards/margins": 0.01178241241723299, "eval_rewards/rejected": -0.04808852821588516, "eval_runtime": 384.1302, "eval_samples_per_second": 11.205, "eval_steps_per_second": 1.401, "step": 2900 }, { "epoch": 1.0027567195037905, "grad_norm": 1.9044243097305298, "learning_rate": 2.922762417396531e-08, "logits/chosen": -3.0532944202423096, "logits/rejected": -3.0282721519470215, "logps/chosen": -60.56848907470703, "logps/rejected": -61.425506591796875, "loss": 0.6794, "rewards/accuracies": 0.640625, "rewards/chosen": -0.0701994001865387, "rewards/margins": 0.029416104778647423, "rewards/rejected": -0.09961550682783127, "step": 2910 }, { "epoch": 1.0062026188835287, "grad_norm": 1.9171464443206787, "learning_rate": 2.9079341259537044e-08, "logits/chosen": -2.955644130706787, "logits/rejected": -2.9289050102233887, "logps/chosen": -59.532318115234375, "logps/rejected": -63.27251434326172, "loss": 0.6794, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.06590018421411514, "rewards/margins": 0.029649043455719948, "rewards/rejected": -0.09554923325777054, "step": 2920 }, { "epoch": 1.0096485182632666, "grad_norm": 2.3012123107910156, "learning_rate": 2.893091075800092e-08, "logits/chosen": -2.9719414710998535, "logits/rejected": -2.956596851348877, "logps/chosen": -58.37969207763672, "logps/rejected": -63.62909698486328, "loss": 0.6805, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07035626471042633, "rewards/margins": 0.027267420664429665, "rewards/rejected": -0.09762369096279144, "step": 2930 }, { "epoch": 1.0130944176430048, "grad_norm": 1.9921698570251465, "learning_rate": 2.878233803944663e-08, "logits/chosen": -2.975639820098877, "logits/rejected": -2.9587063789367676, "logps/chosen": -61.81868362426758, "logps/rejected": -63.246192932128906, "loss": 0.6802, "rewards/accuracies": 0.625, "rewards/chosen": -0.06357287615537643, "rewards/margins": 0.02810371294617653, "rewards/rejected": -0.09167659282684326, "step": 2940 }, { "epoch": 1.016540317022743, "grad_norm": 1.9301649332046509, "learning_rate": 2.863362847910914e-08, "logits/chosen": -2.9926259517669678, "logits/rejected": -2.975491762161255, "logps/chosen": -61.97704315185547, "logps/rejected": -65.73519897460938, "loss": 0.6805, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.06561344116926193, "rewards/margins": 0.027345973998308182, "rewards/rejected": -0.09295941889286041, "step": 2950 }, { "epoch": 1.019986216402481, "grad_norm": 2.173574209213257, "learning_rate": 2.8484787457174276e-08, "logits/chosen": -2.967440128326416, "logits/rejected": -2.9601893424987793, "logps/chosen": -58.916534423828125, "logps/rejected": -64.43171691894531, "loss": 0.6847, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.06918451935052872, "rewards/margins": 0.01901223696768284, "rewards/rejected": -0.0881967544555664, "step": 2960 }, { "epoch": 1.0234321157822193, "grad_norm": 1.9122604131698608, "learning_rate": 2.833582035858399e-08, "logits/chosen": -2.9992122650146484, "logits/rejected": -2.9755702018737793, "logps/chosen": -59.261016845703125, "logps/rejected": -62.033790588378906, "loss": 0.6796, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07028323411941528, "rewards/margins": 0.0288868248462677, "rewards/rejected": -0.09917005896568298, "step": 2970 }, { "epoch": 1.0268780151619572, "grad_norm": 1.9919573068618774, "learning_rate": 2.81867325728416e-08, "logits/chosen": -2.8979835510253906, "logits/rejected": -2.870373487472534, "logps/chosen": -62.300819396972656, "logps/rejected": -61.94474411010742, "loss": 0.6802, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06286562979221344, "rewards/margins": 0.027720922604203224, "rewards/rejected": -0.09058655053377151, "step": 2980 }, { "epoch": 1.0303239145416954, "grad_norm": 1.9404138326644897, "learning_rate": 2.8037529493816785e-08, "logits/chosen": -2.9238524436950684, "logits/rejected": -2.914903402328491, "logps/chosen": -59.53535842895508, "logps/rejected": -62.575408935546875, "loss": 0.6838, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07301677763462067, "rewards/margins": 0.02059059776365757, "rewards/rejected": -0.09360738098621368, "step": 2990 }, { "epoch": 1.0337698139214335, "grad_norm": 1.7792117595672607, "learning_rate": 2.788821651955044e-08, "logits/chosen": -2.9866509437561035, "logits/rejected": -2.9607677459716797, "logps/chosen": -60.698631286621094, "logps/rejected": -61.67456817626953, "loss": 0.6819, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07110203802585602, "rewards/margins": 0.024299288168549538, "rewards/rejected": -0.09540131688117981, "step": 3000 }, { "epoch": 1.0337698139214335, "eval_logits/chosen": -3.0797338485717773, "eval_logits/rejected": -3.074019193649292, "eval_logps/chosen": -62.44396209716797, "eval_logps/rejected": -68.11664581298828, "eval_loss": 0.6876720786094666, "eval_rewards/accuracies": 0.5931691527366638, "eval_rewards/chosen": -0.03732062131166458, "eval_rewards/margins": 0.012044590897858143, "eval_rewards/rejected": -0.04936521500349045, "eval_runtime": 384.3196, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 3000 }, { "epoch": 1.0372157133011717, "grad_norm": 1.8993059396743774, "learning_rate": 2.773879905205936e-08, "logits/chosen": -3.061885356903076, "logits/rejected": -3.0365428924560547, "logps/chosen": -60.7781867980957, "logps/rejected": -60.112342834472656, "loss": 0.6823, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07066932320594788, "rewards/margins": 0.023926690220832825, "rewards/rejected": -0.0945960059762001, "step": 3010 }, { "epoch": 1.0406616126809096, "grad_norm": 1.7303686141967773, "learning_rate": 2.7589282497140826e-08, "logits/chosen": -2.90051531791687, "logits/rejected": -2.8854193687438965, "logps/chosen": -59.404823303222656, "logps/rejected": -60.72951126098633, "loss": 0.6837, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.07631709426641464, "rewards/margins": 0.021014804020524025, "rewards/rejected": -0.09733189642429352, "step": 3020 }, { "epoch": 1.0441075120606478, "grad_norm": 1.7988367080688477, "learning_rate": 2.7439672264177017e-08, "logits/chosen": -2.9338347911834717, "logits/rejected": -2.9088456630706787, "logps/chosen": -63.385589599609375, "logps/rejected": -62.660133361816406, "loss": 0.6798, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.06967137008905411, "rewards/margins": 0.02861020527780056, "rewards/rejected": -0.09828157722949982, "step": 3030 }, { "epoch": 1.047553411440386, "grad_norm": 2.00886869430542, "learning_rate": 2.7289973765939316e-08, "logits/chosen": -2.961655378341675, "logits/rejected": -2.9402718544006348, "logps/chosen": -63.45885467529297, "logps/rejected": -62.679222106933594, "loss": 0.6788, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06949891895055771, "rewards/margins": 0.030794551596045494, "rewards/rejected": -0.10029347240924835, "step": 3040 }, { "epoch": 1.050999310820124, "grad_norm": 1.9969176054000854, "learning_rate": 2.7140192418392456e-08, "logits/chosen": -2.998516082763672, "logits/rejected": -2.9880592823028564, "logps/chosen": -60.910118103027344, "logps/rejected": -62.53899002075195, "loss": 0.6787, "rewards/accuracies": 0.640625, "rewards/chosen": -0.06814457476139069, "rewards/margins": 0.031134705990552902, "rewards/rejected": -0.09927927702665329, "step": 3050 }, { "epoch": 1.0544452101998623, "grad_norm": 2.0115458965301514, "learning_rate": 2.699033364049858e-08, "logits/chosen": -3.0406007766723633, "logits/rejected": -3.008848190307617, "logps/chosen": -59.562950134277344, "logps/rejected": -63.31853103637695, "loss": 0.6779, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06552625447511673, "rewards/margins": 0.03282874822616577, "rewards/rejected": -0.0983550101518631, "step": 3060 }, { "epoch": 1.0578911095796002, "grad_norm": 2.114454746246338, "learning_rate": 2.684040285402122e-08, "logits/chosen": -3.008307933807373, "logits/rejected": -2.9894957542419434, "logps/chosen": -61.14850997924805, "logps/rejected": -62.975914001464844, "loss": 0.6824, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.07413209974765778, "rewards/margins": 0.023501945659518242, "rewards/rejected": -0.09763404726982117, "step": 3070 }, { "epoch": 1.0613370089593384, "grad_norm": 1.9892925024032593, "learning_rate": 2.6690405483329103e-08, "logits/chosen": -2.9078526496887207, "logits/rejected": -2.895010471343994, "logps/chosen": -59.090423583984375, "logps/rejected": -60.913917541503906, "loss": 0.6813, "rewards/accuracies": 0.625, "rewards/chosen": -0.07098571211099625, "rewards/margins": 0.025464143604040146, "rewards/rejected": -0.09644986689090729, "step": 3080 }, { "epoch": 1.0647829083390765, "grad_norm": 2.013815402984619, "learning_rate": 2.6540346955199894e-08, "logits/chosen": -3.019172191619873, "logits/rejected": -2.998669385910034, "logps/chosen": -65.81825256347656, "logps/rejected": -64.49800109863281, "loss": 0.6805, "rewards/accuracies": 0.609375, "rewards/chosen": -0.07010366022586823, "rewards/margins": 0.027619188651442528, "rewards/rejected": -0.0977228507399559, "step": 3090 }, { "epoch": 1.0682288077188147, "grad_norm": 1.989579200744629, "learning_rate": 2.6390232698623925e-08, "logits/chosen": -2.934821605682373, "logits/rejected": -2.9127309322357178, "logps/chosen": -63.19129180908203, "logps/rejected": -63.9019775390625, "loss": 0.6796, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.06846065074205399, "rewards/margins": 0.029360279440879822, "rewards/rejected": -0.09782092273235321, "step": 3100 }, { "epoch": 1.0682288077188147, "eval_logits/chosen": -3.075859546661377, "eval_logits/rejected": -3.070146322250366, "eval_logps/chosen": -62.62958908081055, "eval_logps/rejected": -68.35604858398438, "eval_loss": 0.6874265074729919, "eval_rewards/accuracies": 0.5987453460693359, "eval_rewards/chosen": -0.039176926016807556, "eval_rewards/margins": 0.012582373805344105, "eval_rewards/rejected": -0.051759302616119385, "eval_runtime": 384.4384, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.399, "step": 3100 }, { "epoch": 1.0716747070985528, "grad_norm": 1.7961196899414062, "learning_rate": 2.624006814460772e-08, "logits/chosen": -2.987920045852661, "logits/rejected": -2.9627695083618164, "logps/chosen": -60.823204040527344, "logps/rejected": -62.9400520324707, "loss": 0.6798, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.07341060042381287, "rewards/margins": 0.02877485193312168, "rewards/rejected": -0.10218546539545059, "step": 3110 }, { "epoch": 1.0751206064782908, "grad_norm": 2.1381261348724365, "learning_rate": 2.608985872597749e-08, "logits/chosen": -3.026332139968872, "logits/rejected": -3.0009701251983643, "logps/chosen": -64.1605224609375, "logps/rejected": -65.11061096191406, "loss": 0.6773, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06550760567188263, "rewards/margins": 0.034324102103710175, "rewards/rejected": -0.09983170032501221, "step": 3120 }, { "epoch": 1.078566505858029, "grad_norm": 2.180638313293457, "learning_rate": 2.5939609877182672e-08, "logits/chosen": -2.9584217071533203, "logits/rejected": -2.9367153644561768, "logps/chosen": -62.10315704345703, "logps/rejected": -62.0599250793457, "loss": 0.6811, "rewards/accuracies": 0.578125, "rewards/chosen": -0.07277052104473114, "rewards/margins": 0.02645127847790718, "rewards/rejected": -0.09922181069850922, "step": 3130 }, { "epoch": 1.082012405237767, "grad_norm": 2.0120506286621094, "learning_rate": 2.5789327034099196e-08, "logits/chosen": -3.004655599594116, "logits/rejected": -2.991770029067993, "logps/chosen": -60.61780548095703, "logps/rejected": -63.46196746826172, "loss": 0.6799, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.06961344182491302, "rewards/margins": 0.02859930694103241, "rewards/rejected": -0.09821274131536484, "step": 3140 }, { "epoch": 1.0854583046175053, "grad_norm": 1.911136269569397, "learning_rate": 2.5639015633832895e-08, "logits/chosen": -2.9630255699157715, "logits/rejected": -2.9336097240448, "logps/chosen": -59.48429489135742, "logps/rejected": -62.6728630065918, "loss": 0.6797, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.07576960325241089, "rewards/margins": 0.028852378949522972, "rewards/rejected": -0.10462198406457901, "step": 3150 }, { "epoch": 1.0889042039972432, "grad_norm": 1.9514598846435547, "learning_rate": 2.548868111452281e-08, "logits/chosen": -2.9592223167419434, "logits/rejected": -2.932917833328247, "logps/chosen": -60.18004608154297, "logps/rejected": -62.1012077331543, "loss": 0.6837, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -0.07590612769126892, "rewards/margins": 0.020728668197989464, "rewards/rejected": -0.09663479030132294, "step": 3160 }, { "epoch": 1.0923501033769814, "grad_norm": 2.163417339324951, "learning_rate": 2.5338328915144336e-08, "logits/chosen": -2.948763132095337, "logits/rejected": -2.9275569915771484, "logps/chosen": -61.674354553222656, "logps/rejected": -63.544090270996094, "loss": 0.6789, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.07235665619373322, "rewards/margins": 0.030740192160010338, "rewards/rejected": -0.1030968576669693, "step": 3170 }, { "epoch": 1.0957960027567195, "grad_norm": 1.9518697261810303, "learning_rate": 2.5187964475312597e-08, "logits/chosen": -2.948805570602417, "logits/rejected": -2.944000482559204, "logps/chosen": -61.725982666015625, "logps/rejected": -65.04237365722656, "loss": 0.6823, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07234276086091995, "rewards/margins": 0.023657139390707016, "rewards/rejected": -0.09599989652633667, "step": 3180 }, { "epoch": 1.0992419021364577, "grad_norm": 2.125180721282959, "learning_rate": 2.503759323508552e-08, "logits/chosen": -3.0046706199645996, "logits/rejected": -2.9987471103668213, "logps/chosen": -60.51375198364258, "logps/rejected": -64.95499420166016, "loss": 0.6847, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.07574325799942017, "rewards/margins": 0.0189945288002491, "rewards/rejected": -0.09473778307437897, "step": 3190 }, { "epoch": 1.1026878015161956, "grad_norm": 2.1917178630828857, "learning_rate": 2.4887220634767067e-08, "logits/chosen": -3.024988889694214, "logits/rejected": -2.9950709342956543, "logps/chosen": -63.498199462890625, "logps/rejected": -63.7032356262207, "loss": 0.6776, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.06609619408845901, "rewards/margins": 0.033227093517780304, "rewards/rejected": -0.09932328760623932, "step": 3200 }, { "epoch": 1.1026878015161956, "eval_logits/chosen": -3.0731794834136963, "eval_logits/rejected": -3.067418098449707, "eval_logps/chosen": -62.80425262451172, "eval_logps/rejected": -68.5819091796875, "eval_loss": 0.6871995329856873, "eval_rewards/accuracies": 0.5906133651733398, "eval_rewards/chosen": -0.04092356562614441, "eval_rewards/margins": 0.013094342313706875, "eval_rewards/rejected": -0.05401790514588356, "eval_runtime": 384.4612, "eval_samples_per_second": 11.195, "eval_steps_per_second": 1.399, "step": 3200 }, { "epoch": 1.1061337008959338, "grad_norm": 1.9683195352554321, "learning_rate": 2.4736852114710417e-08, "logits/chosen": -3.0620484352111816, "logits/rejected": -3.0399696826934814, "logps/chosen": -61.793212890625, "logps/rejected": -62.917564392089844, "loss": 0.6799, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.07215774804353714, "rewards/margins": 0.028695005923509598, "rewards/rejected": -0.10085275024175644, "step": 3210 }, { "epoch": 1.109579600275672, "grad_norm": 2.0262603759765625, "learning_rate": 2.458649311512114e-08, "logits/chosen": -2.94061541557312, "logits/rejected": -2.924363851547241, "logps/chosen": -59.32355499267578, "logps/rejected": -60.30609130859375, "loss": 0.6839, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.07498239725828171, "rewards/margins": 0.020939212292432785, "rewards/rejected": -0.09592162072658539, "step": 3220 }, { "epoch": 1.11302549965541, "grad_norm": 2.0922205448150635, "learning_rate": 2.443614907586034e-08, "logits/chosen": -2.944875717163086, "logits/rejected": -2.9296464920043945, "logps/chosen": -61.308441162109375, "logps/rejected": -65.41650390625, "loss": 0.6785, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0720820352435112, "rewards/margins": 0.03132672980427742, "rewards/rejected": -0.10340876877307892, "step": 3230 }, { "epoch": 1.1164713990351482, "grad_norm": 2.039001941680908, "learning_rate": 2.4285825436247875e-08, "logits/chosen": -2.9310574531555176, "logits/rejected": -2.901254415512085, "logps/chosen": -61.543861389160156, "logps/rejected": -61.45790481567383, "loss": 0.6769, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.06877841800451279, "rewards/margins": 0.03471370413899422, "rewards/rejected": -0.1034921258687973, "step": 3240 }, { "epoch": 1.1199172984148862, "grad_norm": 2.0376031398773193, "learning_rate": 2.413552763486558e-08, "logits/chosen": -3.0727105140686035, "logits/rejected": -3.0629258155822754, "logps/chosen": -63.042259216308594, "logps/rejected": -63.58977508544922, "loss": 0.684, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.08128751814365387, "rewards/margins": 0.0204035434871912, "rewards/rejected": -0.10169105231761932, "step": 3250 }, { "epoch": 1.1233631977946243, "grad_norm": 2.0699033737182617, "learning_rate": 2.3985261109360457e-08, "logits/chosen": -2.9842300415039062, "logits/rejected": -2.948054313659668, "logps/chosen": -61.926849365234375, "logps/rejected": -62.5003662109375, "loss": 0.6765, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.0672898143529892, "rewards/margins": 0.035381607711315155, "rewards/rejected": -0.10267140716314316, "step": 3260 }, { "epoch": 1.1268090971743625, "grad_norm": 2.1659767627716064, "learning_rate": 2.3835031296247988e-08, "logits/chosen": -2.9566397666931152, "logits/rejected": -2.9284496307373047, "logps/chosen": -62.831298828125, "logps/rejected": -64.27127838134766, "loss": 0.6789, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.0733214020729065, "rewards/margins": 0.03064587712287903, "rewards/rejected": -0.10396728664636612, "step": 3270 }, { "epoch": 1.1302549965541007, "grad_norm": 1.9470397233963013, "learning_rate": 2.3684843630715446e-08, "logits/chosen": -2.9487035274505615, "logits/rejected": -2.9159140586853027, "logps/chosen": -61.108551025390625, "logps/rejected": -62.888671875, "loss": 0.6753, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.07333475351333618, "rewards/margins": 0.037748340517282486, "rewards/rejected": -0.11108310520648956, "step": 3280 }, { "epoch": 1.1337008959338388, "grad_norm": 1.8661203384399414, "learning_rate": 2.3534703546425203e-08, "logits/chosen": -3.003861904144287, "logits/rejected": -2.9693593978881836, "logps/chosen": -61.7313346862793, "logps/rejected": -61.66279220581055, "loss": 0.6798, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.07435169070959091, "rewards/margins": 0.028780505061149597, "rewards/rejected": -0.1031322032213211, "step": 3290 }, { "epoch": 1.1371467953135768, "grad_norm": 1.8614228963851929, "learning_rate": 2.338461647531821e-08, "logits/chosen": -3.0054256916046143, "logits/rejected": -2.986172914505005, "logps/chosen": -62.00334548950195, "logps/rejected": -63.677406311035156, "loss": 0.6824, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.07767204940319061, "rewards/margins": 0.023757968097925186, "rewards/rejected": -0.1014300212264061, "step": 3300 }, { "epoch": 1.1371467953135768, "eval_logits/chosen": -3.0700747966766357, "eval_logits/rejected": -3.064316749572754, "eval_logps/chosen": -63.07503128051758, "eval_logps/rejected": -68.88993835449219, "eval_loss": 0.6870441436767578, "eval_rewards/accuracies": 0.5945631861686707, "eval_rewards/chosen": -0.04363138601183891, "eval_rewards/margins": 0.013466770760715008, "eval_rewards/rejected": -0.057098157703876495, "eval_runtime": 384.3926, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 3300 }, { "epoch": 1.140592694693315, "grad_norm": 2.364335536956787, "learning_rate": 2.3234587847417447e-08, "logits/chosen": -2.9901270866394043, "logits/rejected": -2.969078540802002, "logps/chosen": -60.3624382019043, "logps/rejected": -62.73461151123047, "loss": 0.6811, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08183901011943817, "rewards/margins": 0.025942673906683922, "rewards/rejected": -0.10778167098760605, "step": 3310 }, { "epoch": 1.144038594073053, "grad_norm": 1.817582607269287, "learning_rate": 2.3084623090631447e-08, "logits/chosen": -2.979372501373291, "logits/rejected": -2.9465384483337402, "logps/chosen": -61.04265213012695, "logps/rejected": -60.45512008666992, "loss": 0.6781, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.07833977043628693, "rewards/margins": 0.032259080559015274, "rewards/rejected": -0.1105988472700119, "step": 3320 }, { "epoch": 1.1474844934527912, "grad_norm": 2.0053155422210693, "learning_rate": 2.2934727630557967e-08, "logits/chosen": -3.0550856590270996, "logits/rejected": -3.0343222618103027, "logps/chosen": -59.455810546875, "logps/rejected": -64.58208465576172, "loss": 0.6778, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07518685609102249, "rewards/margins": 0.03304674103856087, "rewards/rejected": -0.10823359340429306, "step": 3330 }, { "epoch": 1.1509303928325294, "grad_norm": 2.0891268253326416, "learning_rate": 2.278490689028765e-08, "logits/chosen": -2.9639458656311035, "logits/rejected": -2.9505555629730225, "logps/chosen": -59.85667037963867, "logps/rejected": -64.63966369628906, "loss": 0.6794, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.07990622520446777, "rewards/margins": 0.029703548178076744, "rewards/rejected": -0.10960976779460907, "step": 3340 }, { "epoch": 1.1543762922122673, "grad_norm": 2.108522891998291, "learning_rate": 2.263516629020784e-08, "logits/chosen": -2.969435930252075, "logits/rejected": -2.953042507171631, "logps/chosen": -63.971824645996094, "logps/rejected": -66.6336441040039, "loss": 0.6772, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.07327866554260254, "rewards/margins": 0.034395571798086166, "rewards/rejected": -0.107674241065979, "step": 3350 }, { "epoch": 1.1578221915920055, "grad_norm": 1.9754632711410522, "learning_rate": 2.2485511247806493e-08, "logits/chosen": -2.967040538787842, "logits/rejected": -2.9482016563415527, "logps/chosen": -61.31055450439453, "logps/rejected": -63.2735595703125, "loss": 0.6766, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07140516489744186, "rewards/margins": 0.03584301844239235, "rewards/rejected": -0.10724818706512451, "step": 3360 }, { "epoch": 1.1612680909717437, "grad_norm": 1.840278148651123, "learning_rate": 2.233594717747614e-08, "logits/chosen": -2.9520742893218994, "logits/rejected": -2.9398889541625977, "logps/chosen": -63.01189041137695, "logps/rejected": -65.7177505493164, "loss": 0.6813, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.07444876432418823, "rewards/margins": 0.025889653712511063, "rewards/rejected": -0.100338414311409, "step": 3370 }, { "epoch": 1.1647139903514818, "grad_norm": 1.9508111476898193, "learning_rate": 2.2186479490318026e-08, "logits/chosen": -3.0001091957092285, "logits/rejected": -2.9788658618927, "logps/chosen": -60.84368896484375, "logps/rejected": -62.41579055786133, "loss": 0.6812, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08587892353534698, "rewards/margins": 0.0261564739048481, "rewards/rejected": -0.11203539371490479, "step": 3380 }, { "epoch": 1.1681598897312198, "grad_norm": 2.1116995811462402, "learning_rate": 2.203711359394635e-08, "logits/chosen": -2.984722852706909, "logits/rejected": -2.966047763824463, "logps/chosen": -62.917694091796875, "logps/rejected": -65.20925903320312, "loss": 0.6794, "rewards/accuracies": 0.625, "rewards/chosen": -0.07749287784099579, "rewards/margins": 0.029815923422574997, "rewards/rejected": -0.10730880498886108, "step": 3390 }, { "epoch": 1.171605789110958, "grad_norm": 2.0296571254730225, "learning_rate": 2.1887854892292585e-08, "logits/chosen": -2.9498519897460938, "logits/rejected": -2.9285740852355957, "logps/chosen": -59.8641242980957, "logps/rejected": -63.11505126953125, "loss": 0.6787, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.08392973244190216, "rewards/margins": 0.031144190579652786, "rewards/rejected": -0.11507391929626465, "step": 3400 }, { "epoch": 1.171605789110958, "eval_logits/chosen": -3.0668275356292725, "eval_logits/rejected": -3.061082363128662, "eval_logps/chosen": -63.29131317138672, "eval_logps/rejected": -69.14148712158203, "eval_loss": 0.6868980526924133, "eval_rewards/accuracies": 0.5940985083580017, "eval_rewards/chosen": -0.04579411447048187, "eval_rewards/margins": 0.013819512911140919, "eval_rewards/rejected": -0.059613630175590515, "eval_runtime": 384.4849, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 3400 }, { "epoch": 1.175051688490696, "grad_norm": 2.0198545455932617, "learning_rate": 2.1738708785409993e-08, "logits/chosen": -3.011244058609009, "logits/rejected": -2.9873194694519043, "logps/chosen": -62.25665283203125, "logps/rejected": -63.576087951660156, "loss": 0.6729, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.06893068552017212, "rewards/margins": 0.04300907254219055, "rewards/rejected": -0.11193976551294327, "step": 3410 }, { "epoch": 1.1784975878704342, "grad_norm": 2.206679105758667, "learning_rate": 2.1589680669278273e-08, "logits/chosen": -3.0483782291412354, "logits/rejected": -3.0321924686431885, "logps/chosen": -63.279327392578125, "logps/rejected": -64.15853881835938, "loss": 0.6839, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.07862303406000137, "rewards/margins": 0.020904963836073875, "rewards/rejected": -0.0995279997587204, "step": 3420 }, { "epoch": 1.1819434872501722, "grad_norm": 1.9603242874145508, "learning_rate": 2.14407759356083e-08, "logits/chosen": -2.9316813945770264, "logits/rejected": -2.8970885276794434, "logps/chosen": -64.16649627685547, "logps/rejected": -64.34681701660156, "loss": 0.6753, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.07526973634958267, "rewards/margins": 0.038385529071092606, "rewards/rejected": -0.11365525424480438, "step": 3430 }, { "epoch": 1.1853893866299103, "grad_norm": 2.079928159713745, "learning_rate": 2.1291999971647077e-08, "logits/chosen": -2.967278003692627, "logits/rejected": -2.946516752243042, "logps/chosen": -61.39111328125, "logps/rejected": -62.85634231567383, "loss": 0.681, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.087169349193573, "rewards/margins": 0.026739057153463364, "rewards/rejected": -0.11390841007232666, "step": 3440 }, { "epoch": 1.1888352860096485, "grad_norm": 2.0616533756256104, "learning_rate": 2.1143358159982836e-08, "logits/chosen": -2.967345952987671, "logits/rejected": -2.9432525634765625, "logps/chosen": -60.566123962402344, "logps/rejected": -63.50170135498047, "loss": 0.6796, "rewards/accuracies": 0.625, "rewards/chosen": -0.07633674144744873, "rewards/margins": 0.029550742357969284, "rewards/rejected": -0.10588748753070831, "step": 3450 }, { "epoch": 1.1922811853893867, "grad_norm": 2.107825756072998, "learning_rate": 2.0994855878350274e-08, "logits/chosen": -3.0657877922058105, "logits/rejected": -3.0379040241241455, "logps/chosen": -64.24806213378906, "logps/rejected": -65.42948913574219, "loss": 0.6783, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.07753048092126846, "rewards/margins": 0.03188103809952736, "rewards/rejected": -0.10941150039434433, "step": 3460 }, { "epoch": 1.1957270847691248, "grad_norm": 2.052567720413208, "learning_rate": 2.084649849943604e-08, "logits/chosen": -2.89976167678833, "logits/rejected": -2.862915277481079, "logps/chosen": -64.68438720703125, "logps/rejected": -63.40746307373047, "loss": 0.6739, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.06885982304811478, "rewards/margins": 0.04099997133016586, "rewards/rejected": -0.10985980182886124, "step": 3470 }, { "epoch": 1.1991729841488628, "grad_norm": 1.953604817390442, "learning_rate": 2.0698291390684307e-08, "logits/chosen": -3.04626727104187, "logits/rejected": -3.0169880390167236, "logps/chosen": -62.52994918823242, "logps/rejected": -61.918418884277344, "loss": 0.6763, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.07942342013120651, "rewards/margins": 0.03616594150662422, "rewards/rejected": -0.11558938026428223, "step": 3480 }, { "epoch": 1.202618883528601, "grad_norm": 1.9615064859390259, "learning_rate": 2.0550239914102593e-08, "logits/chosen": -2.9919002056121826, "logits/rejected": -2.959578275680542, "logps/chosen": -61.7932243347168, "logps/rejected": -63.043975830078125, "loss": 0.6747, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.07409648597240448, "rewards/margins": 0.03935806080698967, "rewards/rejected": -0.11345455795526505, "step": 3490 }, { "epoch": 1.206064782908339, "grad_norm": 2.0523669719696045, "learning_rate": 2.0402349426067798e-08, "logits/chosen": -3.0102508068084717, "logits/rejected": -2.99145770072937, "logps/chosen": -65.25118255615234, "logps/rejected": -65.96293640136719, "loss": 0.6801, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.0861230120062828, "rewards/margins": 0.028741363435983658, "rewards/rejected": -0.11486438661813736, "step": 3500 }, { "epoch": 1.206064782908339, "eval_logits/chosen": -3.0645828247070312, "eval_logits/rejected": -3.058849573135376, "eval_logps/chosen": -63.53173065185547, "eval_logps/rejected": -69.41845703125, "eval_loss": 0.6867420673370361, "eval_rewards/accuracies": 0.5929368138313293, "eval_rewards/chosen": -0.04819829761981964, "eval_rewards/margins": 0.014185106381773949, "eval_rewards/rejected": -0.06238340213894844, "eval_runtime": 384.3982, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 3500 }, { "epoch": 1.2095106822880772, "grad_norm": 2.0280165672302246, "learning_rate": 2.0254625277132383e-08, "logits/chosen": -2.9636659622192383, "logits/rejected": -2.9369940757751465, "logps/chosen": -61.99406051635742, "logps/rejected": -64.51708984375, "loss": 0.6755, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.08114029467105865, "rewards/margins": 0.03792557865381241, "rewards/rejected": -0.11906588077545166, "step": 3510 }, { "epoch": 1.2129565816678154, "grad_norm": 2.152580976486206, "learning_rate": 2.0107072811830786e-08, "logits/chosen": -2.9667038917541504, "logits/rejected": -2.9484071731567383, "logps/chosen": -62.68111038208008, "logps/rejected": -66.8736343383789, "loss": 0.6795, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08092640340328217, "rewards/margins": 0.029753107577562332, "rewards/rejected": -0.1106795221567154, "step": 3520 }, { "epoch": 1.2164024810475533, "grad_norm": 2.1510519981384277, "learning_rate": 1.9959697368486107e-08, "logits/chosen": -2.968773365020752, "logits/rejected": -2.9507293701171875, "logps/chosen": -61.249794006347656, "logps/rejected": -65.181396484375, "loss": 0.6787, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08916781842708588, "rewards/margins": 0.031191542744636536, "rewards/rejected": -0.1203593760728836, "step": 3530 }, { "epoch": 1.2198483804272915, "grad_norm": 2.2037718296051025, "learning_rate": 1.9812504279016915e-08, "logits/chosen": -3.019176483154297, "logits/rejected": -2.990429162979126, "logps/chosen": -62.22697830200195, "logps/rejected": -64.77278137207031, "loss": 0.6794, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08191390335559845, "rewards/margins": 0.029749279841780663, "rewards/rejected": -0.11166318506002426, "step": 3540 }, { "epoch": 1.2232942798070296, "grad_norm": 2.0587522983551025, "learning_rate": 1.9665498868744378e-08, "logits/chosen": -3.0102896690368652, "logits/rejected": -2.994324207305908, "logps/chosen": -63.87359619140625, "logps/rejected": -66.18248748779297, "loss": 0.679, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08578020334243774, "rewards/margins": 0.03075292706489563, "rewards/rejected": -0.11653313785791397, "step": 3550 }, { "epoch": 1.2267401791867678, "grad_norm": 2.123263120651245, "learning_rate": 1.95186864561996e-08, "logits/chosen": -2.9897990226745605, "logits/rejected": -2.949155330657959, "logps/chosen": -65.28316497802734, "logps/rejected": -62.66881561279297, "loss": 0.6774, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.075536347925663, "rewards/margins": 0.03343961387872696, "rewards/rejected": -0.10897596180438995, "step": 3560 }, { "epoch": 1.230186078566506, "grad_norm": 1.9812653064727783, "learning_rate": 1.9372072352931186e-08, "logits/chosen": -2.9194540977478027, "logits/rejected": -2.9036872386932373, "logps/chosen": -61.8432502746582, "logps/rejected": -63.653343200683594, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0801379531621933, "rewards/margins": 0.030780458822846413, "rewards/rejected": -0.11091840267181396, "step": 3570 }, { "epoch": 1.233631977946244, "grad_norm": 2.041963577270508, "learning_rate": 1.9225661863313063e-08, "logits/chosen": -3.0060529708862305, "logits/rejected": -2.9751877784729004, "logps/chosen": -63.72297286987305, "logps/rejected": -66.69009399414062, "loss": 0.6775, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.0869833379983902, "rewards/margins": 0.034079745411872864, "rewards/rejected": -0.12106309086084366, "step": 3580 }, { "epoch": 1.237077877325982, "grad_norm": 2.1056067943573, "learning_rate": 1.9079460284352616e-08, "logits/chosen": -2.989348888397217, "logits/rejected": -2.968977689743042, "logps/chosen": -64.03874206542969, "logps/rejected": -65.50846862792969, "loss": 0.6805, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08604481816291809, "rewards/margins": 0.027634482830762863, "rewards/rejected": -0.11367928981781006, "step": 3590 }, { "epoch": 1.2405237767057202, "grad_norm": 2.2646119594573975, "learning_rate": 1.893347290549901e-08, "logits/chosen": -2.9118189811706543, "logits/rejected": -2.8909945487976074, "logps/chosen": -61.814781188964844, "logps/rejected": -66.79518127441406, "loss": 0.6797, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08930834382772446, "rewards/margins": 0.029664453119039536, "rewards/rejected": -0.11897280067205429, "step": 3600 }, { "epoch": 1.2405237767057202, "eval_logits/chosen": -3.0616257190704346, "eval_logits/rejected": -3.0558876991271973, "eval_logps/chosen": -63.699798583984375, "eval_logps/rejected": -69.62061309814453, "eval_loss": 0.6865953207015991, "eval_rewards/accuracies": 0.5915427803993225, "eval_rewards/chosen": -0.049879107624292374, "eval_rewards/margins": 0.014525760896503925, "eval_rewards/rejected": -0.06440486758947372, "eval_runtime": 384.6476, "eval_samples_per_second": 11.189, "eval_steps_per_second": 1.399, "step": 3600 }, { "epoch": 1.2439696760854584, "grad_norm": 2.077509641647339, "learning_rate": 1.878770500845181e-08, "logits/chosen": -2.971928119659424, "logits/rejected": -2.9434304237365723, "logps/chosen": -64.08241271972656, "logps/rejected": -63.58278274536133, "loss": 0.6798, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.08303068578243256, "rewards/margins": 0.02932748757302761, "rewards/rejected": -0.11235816776752472, "step": 3610 }, { "epoch": 1.2474155754651963, "grad_norm": 2.0165388584136963, "learning_rate": 1.8642161866969946e-08, "logits/chosen": -2.955901622772217, "logits/rejected": -2.940150260925293, "logps/chosen": -64.76017761230469, "logps/rejected": -65.11692810058594, "loss": 0.6827, "rewards/accuracies": 0.578125, "rewards/chosen": -0.08926106989383698, "rewards/margins": 0.023149941116571426, "rewards/rejected": -0.1124110072851181, "step": 3620 }, { "epoch": 1.2508614748449345, "grad_norm": 2.059694528579712, "learning_rate": 1.8496848746680856e-08, "logits/chosen": -3.020172119140625, "logits/rejected": -2.988006114959717, "logps/chosen": -62.92469024658203, "logps/rejected": -64.46615600585938, "loss": 0.679, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08797968924045563, "rewards/margins": 0.03054668940603733, "rewards/rejected": -0.11852637678384781, "step": 3630 }, { "epoch": 1.2543073742246726, "grad_norm": 2.0419082641601562, "learning_rate": 1.8351770904890036e-08, "logits/chosen": -3.047367572784424, "logits/rejected": -3.0146524906158447, "logps/chosen": -63.642982482910156, "logps/rejected": -64.89458465576172, "loss": 0.6747, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.08200275897979736, "rewards/margins": 0.039216045290231705, "rewards/rejected": -0.12121880054473877, "step": 3640 }, { "epoch": 1.2577532736044108, "grad_norm": 2.025785207748413, "learning_rate": 1.8206933590390786e-08, "logits/chosen": -2.825869560241699, "logits/rejected": -2.8109097480773926, "logps/chosen": -61.419586181640625, "logps/rejected": -64.72754669189453, "loss": 0.6847, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.0889354795217514, "rewards/margins": 0.018987303599715233, "rewards/rejected": -0.10792279243469238, "step": 3650 }, { "epoch": 1.2611991729841487, "grad_norm": 2.176628351211548, "learning_rate": 1.8062342043274324e-08, "logits/chosen": -3.015015125274658, "logits/rejected": -2.9903564453125, "logps/chosen": -63.53308868408203, "logps/rejected": -63.09611892700195, "loss": 0.681, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08826801180839539, "rewards/margins": 0.027484869584441185, "rewards/rejected": -0.11575287580490112, "step": 3660 }, { "epoch": 1.264645072363887, "grad_norm": 2.1196792125701904, "learning_rate": 1.7918001494740237e-08, "logits/chosen": -2.9659061431884766, "logits/rejected": -2.9388701915740967, "logps/chosen": -62.460182189941406, "logps/rejected": -64.67069244384766, "loss": 0.6761, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.08238350600004196, "rewards/margins": 0.03688802570104599, "rewards/rejected": -0.11927153170108795, "step": 3670 }, { "epoch": 1.268090971743625, "grad_norm": 2.1168670654296875, "learning_rate": 1.777391716690718e-08, "logits/chosen": -2.9998278617858887, "logits/rejected": -2.9749624729156494, "logps/chosen": -64.06501770019531, "logps/rejected": -63.614013671875, "loss": 0.6785, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08456394076347351, "rewards/margins": 0.031805459409952164, "rewards/rejected": -0.11636941134929657, "step": 3680 }, { "epoch": 1.2715368711233632, "grad_norm": 2.04862380027771, "learning_rate": 1.7630094272623956e-08, "logits/chosen": -2.8991315364837646, "logits/rejected": -2.8779456615448, "logps/chosen": -61.9876708984375, "logps/rejected": -63.97594451904297, "loss": 0.6819, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09000525623559952, "rewards/margins": 0.025113940238952637, "rewards/rejected": -0.11511919647455215, "step": 3690 }, { "epoch": 1.2749827705031014, "grad_norm": 2.112943410873413, "learning_rate": 1.748653801528095e-08, "logits/chosen": -2.9221792221069336, "logits/rejected": -2.8926749229431152, "logps/chosen": -62.65099334716797, "logps/rejected": -63.81853103637695, "loss": 0.6783, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08818589150905609, "rewards/margins": 0.03205002099275589, "rewards/rejected": -0.12023589760065079, "step": 3700 }, { "epoch": 1.2749827705031014, "eval_logits/chosen": -3.0599453449249268, "eval_logits/rejected": -3.05422043800354, "eval_logps/chosen": -63.817176818847656, "eval_logps/rejected": -69.7728042602539, "eval_loss": 0.686441957950592, "eval_rewards/accuracies": 0.5903810262680054, "eval_rewards/chosen": -0.051052823662757874, "eval_rewards/margins": 0.014873947948217392, "eval_rewards/rejected": -0.06592677533626556, "eval_runtime": 384.3427, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 3700 }, { "epoch": 1.2784286698828393, "grad_norm": 2.092205047607422, "learning_rate": 1.734325358862181e-08, "logits/chosen": -2.863043785095215, "logits/rejected": -2.8345303535461426, "logps/chosen": -62.89555740356445, "logps/rejected": -64.21585845947266, "loss": 0.6783, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07933399826288223, "rewards/margins": 0.03237777203321457, "rewards/rejected": -0.1117117628455162, "step": 3710 }, { "epoch": 1.2818745692625775, "grad_norm": 2.1363396644592285, "learning_rate": 1.7200246176555605e-08, "logits/chosen": -2.9469122886657715, "logits/rejected": -2.9180445671081543, "logps/chosen": -64.09722900390625, "logps/rejected": -65.64142608642578, "loss": 0.6761, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09392695873975754, "rewards/margins": 0.036279790103435516, "rewards/rejected": -0.13020673394203186, "step": 3720 }, { "epoch": 1.2853204686423156, "grad_norm": 1.9918975830078125, "learning_rate": 1.7057520952969256e-08, "logits/chosen": -2.9698033332824707, "logits/rejected": -2.957746982574463, "logps/chosen": -61.49199676513672, "logps/rejected": -64.32593536376953, "loss": 0.679, "rewards/accuracies": 0.625, "rewards/chosen": -0.08447978645563126, "rewards/margins": 0.030800744891166687, "rewards/rejected": -0.11528053134679794, "step": 3730 }, { "epoch": 1.2887663680220538, "grad_norm": 1.9824419021606445, "learning_rate": 1.6915083081540328e-08, "logits/chosen": -2.9427554607391357, "logits/rejected": -2.9203765392303467, "logps/chosen": -63.301551818847656, "logps/rejected": -61.42414093017578, "loss": 0.6819, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0865807980298996, "rewards/margins": 0.02510489523410797, "rewards/rejected": -0.11168569326400757, "step": 3740 }, { "epoch": 1.292212267401792, "grad_norm": 1.8477709293365479, "learning_rate": 1.6772937715550234e-08, "logits/chosen": -2.9018781185150146, "logits/rejected": -2.882781505584717, "logps/chosen": -61.00310134887695, "logps/rejected": -64.47721099853516, "loss": 0.6796, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08985722810029984, "rewards/margins": 0.029285427182912827, "rewards/rejected": -0.11914266645908356, "step": 3750 }, { "epoch": 1.29565816678153, "grad_norm": 2.3149027824401855, "learning_rate": 1.6631089997697788e-08, "logits/chosen": -2.9336066246032715, "logits/rejected": -2.9068706035614014, "logps/chosen": -63.48087692260742, "logps/rejected": -64.29015350341797, "loss": 0.6777, "rewards/accuracies": 0.640625, "rewards/chosen": -0.08386462181806564, "rewards/margins": 0.0336444191634655, "rewards/rejected": -0.11750902980566025, "step": 3760 }, { "epoch": 1.299104066161268, "grad_norm": 1.8848634958267212, "learning_rate": 1.648954505991315e-08, "logits/chosen": -2.9510250091552734, "logits/rejected": -2.946815013885498, "logps/chosen": -60.505516052246094, "logps/rejected": -64.12696838378906, "loss": 0.6821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08626588433980942, "rewards/margins": 0.024447450414299965, "rewards/rejected": -0.11071332544088364, "step": 3770 }, { "epoch": 1.3025499655410062, "grad_norm": 2.1947124004364014, "learning_rate": 1.634830802317215e-08, "logits/chosen": -2.992312431335449, "logits/rejected": -2.9723546504974365, "logps/chosen": -59.96136474609375, "logps/rejected": -65.40067291259766, "loss": 0.6743, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.07778512686491013, "rewards/margins": 0.04034097120165825, "rewards/rejected": -0.11812610924243927, "step": 3780 }, { "epoch": 1.3059958649207444, "grad_norm": 2.0879478454589844, "learning_rate": 1.6207383997311025e-08, "logits/chosen": -2.9999008178710938, "logits/rejected": -2.9816055297851562, "logps/chosen": -64.22468566894531, "logps/rejected": -65.10344696044922, "loss": 0.6753, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07819289714097977, "rewards/margins": 0.03828134760260582, "rewards/rejected": -0.11647425591945648, "step": 3790 }, { "epoch": 1.3094417643004825, "grad_norm": 1.9533721208572388, "learning_rate": 1.6066778080841532e-08, "logits/chosen": -2.996748208999634, "logits/rejected": -2.9594829082489014, "logps/chosen": -64.72180938720703, "logps/rejected": -62.791358947753906, "loss": 0.6771, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09291788935661316, "rewards/margins": 0.034953463822603226, "rewards/rejected": -0.1278713345527649, "step": 3800 }, { "epoch": 1.3094417643004825, "eval_logits/chosen": -3.0579733848571777, "eval_logits/rejected": -3.052189826965332, "eval_logps/chosen": -63.923500061035156, "eval_logps/rejected": -69.89812469482422, "eval_loss": 0.6863577365875244, "eval_rewards/accuracies": 0.5920074582099915, "eval_rewards/chosen": -0.052116088569164276, "eval_rewards/margins": 0.01506392378360033, "eval_rewards/rejected": -0.06718001514673233, "eval_runtime": 384.3289, "eval_samples_per_second": 11.199, "eval_steps_per_second": 1.4, "step": 3800 }, { "epoch": 1.3128876636802205, "grad_norm": 2.1521520614624023, "learning_rate": 1.5926495360766518e-08, "logits/chosen": -2.952903985977173, "logits/rejected": -2.9230117797851562, "logps/chosen": -63.550682067871094, "logps/rejected": -63.614044189453125, "loss": 0.678, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.08338330686092377, "rewards/margins": 0.03267858177423477, "rewards/rejected": -0.11606190353631973, "step": 3810 }, { "epoch": 1.3163335630599586, "grad_norm": 2.238865852355957, "learning_rate": 1.5786540912395846e-08, "logits/chosen": -2.890934467315674, "logits/rejected": -2.885941982269287, "logps/chosen": -62.68060302734375, "logps/rejected": -64.84013366699219, "loss": 0.6803, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.08805284649133682, "rewards/margins": 0.028121262788772583, "rewards/rejected": -0.11617410182952881, "step": 3820 }, { "epoch": 1.3197794624396968, "grad_norm": 2.5020928382873535, "learning_rate": 1.564691979916278e-08, "logits/chosen": -2.9928812980651855, "logits/rejected": -2.960798740386963, "logps/chosen": -66.76985931396484, "logps/rejected": -67.78041076660156, "loss": 0.6774, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09056535363197327, "rewards/margins": 0.03447294607758522, "rewards/rejected": -0.1250382959842682, "step": 3830 }, { "epoch": 1.323225361819435, "grad_norm": 2.00618839263916, "learning_rate": 1.5507637072440824e-08, "logits/chosen": -2.9697744846343994, "logits/rejected": -2.9465317726135254, "logps/chosen": -62.504920959472656, "logps/rejected": -63.22511672973633, "loss": 0.677, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08414576202630997, "rewards/margins": 0.034877367317676544, "rewards/rejected": -0.11902312934398651, "step": 3840 }, { "epoch": 1.3266712611991731, "grad_norm": 2.2420315742492676, "learning_rate": 1.5368697771360922e-08, "logits/chosen": -3.0211071968078613, "logits/rejected": -2.982313632965088, "logps/chosen": -62.169822692871094, "logps/rejected": -62.871063232421875, "loss": 0.6762, "rewards/accuracies": 0.640625, "rewards/chosen": -0.0782238095998764, "rewards/margins": 0.03669989854097366, "rewards/rejected": -0.11492369323968887, "step": 3850 }, { "epoch": 1.330117160578911, "grad_norm": 2.1969306468963623, "learning_rate": 1.523010692262918e-08, "logits/chosen": -2.981954336166382, "logits/rejected": -2.956352710723877, "logps/chosen": -63.6324462890625, "logps/rejected": -63.42945098876953, "loss": 0.6806, "rewards/accuracies": 0.625, "rewards/chosen": -0.09774903953075409, "rewards/margins": 0.027855467051267624, "rewards/rejected": -0.125604510307312, "step": 3860 }, { "epoch": 1.3335630599586492, "grad_norm": 2.0029098987579346, "learning_rate": 1.5091869540345003e-08, "logits/chosen": -2.9232115745544434, "logits/rejected": -2.903977155685425, "logps/chosen": -61.975440979003906, "logps/rejected": -65.58573913574219, "loss": 0.6774, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09161805361509323, "rewards/margins": 0.03398921340703964, "rewards/rejected": -0.12560728192329407, "step": 3870 }, { "epoch": 1.3370089593383874, "grad_norm": 2.2118442058563232, "learning_rate": 1.495399062581966e-08, "logits/chosen": -2.8885598182678223, "logits/rejected": -2.8633341789245605, "logps/chosen": -60.8160400390625, "logps/rejected": -64.75898742675781, "loss": 0.6735, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.08591288328170776, "rewards/margins": 0.04224963113665581, "rewards/rejected": -0.12816253304481506, "step": 3880 }, { "epoch": 1.3404548587181253, "grad_norm": 2.047248601913452, "learning_rate": 1.481647516739537e-08, "logits/chosen": -2.9682374000549316, "logits/rejected": -2.947425127029419, "logps/chosen": -61.350563049316406, "logps/rejected": -65.50421905517578, "loss": 0.6806, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.09522966295480728, "rewards/margins": 0.027817973867058754, "rewards/rejected": -0.12304762750864029, "step": 3890 }, { "epoch": 1.3439007580978635, "grad_norm": 2.1648757457733154, "learning_rate": 1.4679328140264815e-08, "logits/chosen": -2.9958930015563965, "logits/rejected": -2.985562801361084, "logps/chosen": -62.2169189453125, "logps/rejected": -65.93193054199219, "loss": 0.6785, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09441477060317993, "rewards/margins": 0.03176296874880791, "rewards/rejected": -0.12617774307727814, "step": 3900 }, { "epoch": 1.3439007580978635, "eval_logits/chosen": -3.055612087249756, "eval_logits/rejected": -3.0498664379119873, "eval_logps/chosen": -64.0693359375, "eval_logps/rejected": -70.08139038085938, "eval_loss": 0.6861928701400757, "eval_rewards/accuracies": 0.5922397971153259, "eval_rewards/chosen": -0.053574394434690475, "eval_rewards/margins": 0.015438344329595566, "eval_rewards/rejected": -0.06901273876428604, "eval_runtime": 384.3531, "eval_samples_per_second": 11.198, "eval_steps_per_second": 1.4, "step": 3900 }, { "epoch": 1.3473466574776016, "grad_norm": 1.999382734298706, "learning_rate": 1.4542554506291169e-08, "logits/chosen": -3.0110673904418945, "logits/rejected": -2.9811036586761475, "logps/chosen": -65.58875274658203, "logps/rejected": -66.58174133300781, "loss": 0.6762, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07897213101387024, "rewards/margins": 0.036501772701740265, "rewards/rejected": -0.11547388881444931, "step": 3910 }, { "epoch": 1.3507925568573398, "grad_norm": 1.908639907836914, "learning_rate": 1.4406159213828506e-08, "logits/chosen": -2.8948254585266113, "logits/rejected": -2.8793833255767822, "logps/chosen": -61.128509521484375, "logps/rejected": -66.39567565917969, "loss": 0.6788, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09444743394851685, "rewards/margins": 0.031206313520669937, "rewards/rejected": -0.12565374374389648, "step": 3920 }, { "epoch": 1.354238456237078, "grad_norm": 2.048689365386963, "learning_rate": 1.427014719754287e-08, "logits/chosen": -2.902984142303467, "logits/rejected": -2.86833119392395, "logps/chosen": -63.13886642456055, "logps/rejected": -63.15281295776367, "loss": 0.6758, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.08902109414339066, "rewards/margins": 0.037360358983278275, "rewards/rejected": -0.12638147175312042, "step": 3930 }, { "epoch": 1.3576843556168159, "grad_norm": 2.2913434505462646, "learning_rate": 1.4134523378233698e-08, "logits/chosen": -2.973330497741699, "logits/rejected": -2.956834316253662, "logps/chosen": -63.40961456298828, "logps/rejected": -65.80083465576172, "loss": 0.6777, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0901583805680275, "rewards/margins": 0.03346914425492287, "rewards/rejected": -0.12362752109766006, "step": 3940 }, { "epoch": 1.361130254996554, "grad_norm": 2.275186538696289, "learning_rate": 1.3999292662655754e-08, "logits/chosen": -2.937798261642456, "logits/rejected": -2.9266557693481445, "logps/chosen": -62.40935516357422, "logps/rejected": -66.2726821899414, "loss": 0.6811, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09133242070674896, "rewards/margins": 0.026894664391875267, "rewards/rejected": -0.11822707951068878, "step": 3950 }, { "epoch": 1.3645761543762922, "grad_norm": 2.071085214614868, "learning_rate": 1.3864459943341675e-08, "logits/chosen": -2.936525583267212, "logits/rejected": -2.912968158721924, "logps/chosen": -64.78622436523438, "logps/rejected": -64.55671691894531, "loss": 0.6792, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.08964154869318008, "rewards/margins": 0.030369237065315247, "rewards/rejected": -0.12001077830791473, "step": 3960 }, { "epoch": 1.3680220537560304, "grad_norm": 2.226252794265747, "learning_rate": 1.3730030098424927e-08, "logits/chosen": -2.943465232849121, "logits/rejected": -2.92138671875, "logps/chosen": -67.17789459228516, "logps/rejected": -66.60552978515625, "loss": 0.6744, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08730453252792358, "rewards/margins": 0.04065509885549545, "rewards/rejected": -0.12795962393283844, "step": 3970 }, { "epoch": 1.3714679531357685, "grad_norm": 2.1076691150665283, "learning_rate": 1.3596007991463298e-08, "logits/chosen": -2.858703851699829, "logits/rejected": -2.8402962684631348, "logps/chosen": -60.17496871948242, "logps/rejected": -64.92707061767578, "loss": 0.6801, "rewards/accuracies": 0.609375, "rewards/chosen": -0.09182435274124146, "rewards/margins": 0.02867903746664524, "rewards/rejected": -0.12050338089466095, "step": 3980 }, { "epoch": 1.3749138525155065, "grad_norm": 2.15260910987854, "learning_rate": 1.3462398471262992e-08, "logits/chosen": -2.9901604652404785, "logits/rejected": -2.9676501750946045, "logps/chosen": -65.43225860595703, "logps/rejected": -66.92817687988281, "loss": 0.6769, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.09293768554925919, "rewards/margins": 0.03545621410012245, "rewards/rejected": -0.12839388847351074, "step": 3990 }, { "epoch": 1.3783597518952446, "grad_norm": 2.315973997116089, "learning_rate": 1.3329206371703166e-08, "logits/chosen": -2.9690933227539062, "logits/rejected": -2.9580492973327637, "logps/chosen": -61.68376541137695, "logps/rejected": -65.7838134765625, "loss": 0.6807, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0909140557050705, "rewards/margins": 0.027417322620749474, "rewards/rejected": -0.11833137273788452, "step": 4000 }, { "epoch": 1.3783597518952446, "eval_logits/chosen": -3.054121732711792, "eval_logits/rejected": -3.048370838165283, "eval_logps/chosen": -64.221435546875, "eval_logps/rejected": -70.25933074951172, "eval_loss": 0.6860847473144531, "eval_rewards/accuracies": 0.5908457040786743, "eval_rewards/chosen": -0.05509539321064949, "eval_rewards/margins": 0.01569669507443905, "eval_rewards/rejected": -0.07079208642244339, "eval_runtime": 384.3005, "eval_samples_per_second": 11.2, "eval_steps_per_second": 1.4, "step": 4000 }, { "epoch": 1.3818056512749828, "grad_norm": 2.1167848110198975, "learning_rate": 1.3196436511561027e-08, "logits/chosen": -2.933992385864258, "logits/rejected": -2.903357982635498, "logps/chosen": -67.85502624511719, "logps/rejected": -65.80936431884766, "loss": 0.6792, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0915619507431984, "rewards/margins": 0.030700866132974625, "rewards/rejected": -0.12226282060146332, "step": 4010 }, { "epoch": 1.385251550654721, "grad_norm": 2.022864818572998, "learning_rate": 1.3064093694337552e-08, "logits/chosen": -2.918813705444336, "logits/rejected": -2.895181655883789, "logps/chosen": -62.11616134643555, "logps/rejected": -65.38654327392578, "loss": 0.6779, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.09331174194812775, "rewards/margins": 0.03332715108990669, "rewards/rejected": -0.12663887441158295, "step": 4020 }, { "epoch": 1.388697450034459, "grad_norm": 2.205150842666626, "learning_rate": 1.2932182708083659e-08, "logits/chosen": -2.9874258041381836, "logits/rejected": -2.959113836288452, "logps/chosen": -63.654685974121094, "logps/rejected": -65.35508728027344, "loss": 0.6748, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08431627601385117, "rewards/margins": 0.039567284286022186, "rewards/rejected": -0.12388356775045395, "step": 4030 }, { "epoch": 1.392143349414197, "grad_norm": 2.237994909286499, "learning_rate": 1.2800708325226967e-08, "logits/chosen": -2.9087979793548584, "logits/rejected": -2.8909714221954346, "logps/chosen": -62.17333221435547, "logps/rejected": -65.90696716308594, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": -0.08950966596603394, "rewards/margins": 0.029924685135483742, "rewards/rejected": -0.11943434178829193, "step": 4040 }, { "epoch": 1.3955892487939352, "grad_norm": 2.220797538757324, "learning_rate": 1.2669675302399174e-08, "logits/chosen": -2.9189624786376953, "logits/rejected": -2.901270866394043, "logps/chosen": -62.6547737121582, "logps/rejected": -66.87588500976562, "loss": 0.6791, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09054653346538544, "rewards/margins": 0.030481979250907898, "rewards/rejected": -0.12102852761745453, "step": 4050 }, { "epoch": 1.3990351481736734, "grad_norm": 2.227811813354492, "learning_rate": 1.2539088380263958e-08, "logits/chosen": -2.9427545070648193, "logits/rejected": -2.9144515991210938, "logps/chosen": -64.08809661865234, "logps/rejected": -64.04742431640625, "loss": 0.6791, "rewards/accuracies": 0.625, "rewards/chosen": -0.0900612324476242, "rewards/margins": 0.030776774510741234, "rewards/rejected": -0.12083800882101059, "step": 4060 }, { "epoch": 1.4024810475534115, "grad_norm": 2.1904659271240234, "learning_rate": 1.240895228334542e-08, "logits/chosen": -2.9165213108062744, "logits/rejected": -2.894753932952881, "logps/chosen": -62.841758728027344, "logps/rejected": -63.72822189331055, "loss": 0.6797, "rewards/accuracies": 0.578125, "rewards/chosen": -0.09068237990140915, "rewards/margins": 0.029550915583968163, "rewards/rejected": -0.12023331224918365, "step": 4070 }, { "epoch": 1.4059269469331497, "grad_norm": 2.2701804637908936, "learning_rate": 1.2279271719857196e-08, "logits/chosen": -2.946728229522705, "logits/rejected": -2.9265058040618896, "logps/chosen": -63.28468704223633, "logps/rejected": -65.9094009399414, "loss": 0.6745, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.09204259514808655, "rewards/margins": 0.04024175554513931, "rewards/rejected": -0.13228434324264526, "step": 4080 }, { "epoch": 1.4093728463128876, "grad_norm": 2.4241137504577637, "learning_rate": 1.2150051381532137e-08, "logits/chosen": -2.972846746444702, "logits/rejected": -2.9496474266052246, "logps/chosen": -66.7400894165039, "logps/rejected": -65.05125427246094, "loss": 0.6803, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.08738545328378677, "rewards/margins": 0.028310012072324753, "rewards/rejected": -0.11569547653198242, "step": 4090 }, { "epoch": 1.4128187456926258, "grad_norm": 2.0823371410369873, "learning_rate": 1.2021295943452495e-08, "logits/chosen": -2.9438533782958984, "logits/rejected": -2.9151291847229004, "logps/chosen": -65.56358337402344, "logps/rejected": -65.81645202636719, "loss": 0.6769, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.090767040848732, "rewards/margins": 0.03532954305410385, "rewards/rejected": -0.12609657645225525, "step": 4100 }, { "epoch": 1.4128187456926258, "eval_logits/chosen": -3.052527904510498, "eval_logits/rejected": -3.046738386154175, "eval_logps/chosen": -64.33756256103516, "eval_logps/rejected": -70.39879608154297, "eval_loss": 0.6859830021858215, "eval_rewards/accuracies": 0.5929368138313293, "eval_rewards/chosen": -0.05625665932893753, "eval_rewards/margins": 0.01593007706105709, "eval_rewards/rejected": -0.07218674570322037, "eval_runtime": 384.5631, "eval_samples_per_second": 11.192, "eval_steps_per_second": 1.399, "step": 4100 }, { "epoch": 1.416264645072364, "grad_norm": 2.0488412380218506, "learning_rate": 1.1893010063880853e-08, "logits/chosen": -2.9012386798858643, "logits/rejected": -2.879481792449951, "logps/chosen": -63.4229621887207, "logps/rejected": -66.8333969116211, "loss": 0.6799, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09323953092098236, "rewards/margins": 0.029085148125886917, "rewards/rejected": -0.12232469022274017, "step": 4110 }, { "epoch": 1.4197105444521019, "grad_norm": 2.0539770126342773, "learning_rate": 1.1765198384091577e-08, "logits/chosen": -2.9786791801452637, "logits/rejected": -2.954512357711792, "logps/chosen": -64.86912536621094, "logps/rejected": -64.17298126220703, "loss": 0.6774, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.09158007800579071, "rewards/margins": 0.03424326702952385, "rewards/rejected": -0.12582334876060486, "step": 4120 }, { "epoch": 1.42315644383184, "grad_norm": 2.2448647022247314, "learning_rate": 1.1637865528202845e-08, "logits/chosen": -3.007713794708252, "logits/rejected": -2.9852776527404785, "logps/chosen": -65.87146759033203, "logps/rejected": -67.21484375, "loss": 0.6768, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08839482069015503, "rewards/margins": 0.03572607785463333, "rewards/rejected": -0.12412089109420776, "step": 4130 }, { "epoch": 1.4266023432115782, "grad_norm": 2.146552085876465, "learning_rate": 1.1511016103009425e-08, "logits/chosen": -2.973470449447632, "logits/rejected": -2.962482452392578, "logps/chosen": -63.569091796875, "logps/rejected": -66.65575408935547, "loss": 0.6852, "rewards/accuracies": 0.5625, "rewards/chosen": -0.106827512383461, "rewards/margins": 0.0185137577354908, "rewards/rejected": -0.1253412663936615, "step": 4140 }, { "epoch": 1.4300482425913164, "grad_norm": 2.1673996448516846, "learning_rate": 1.1384654697815973e-08, "logits/chosen": -2.997055768966675, "logits/rejected": -2.9608774185180664, "logps/chosen": -67.5733642578125, "logps/rejected": -66.72795104980469, "loss": 0.6768, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.08815548568964005, "rewards/margins": 0.035506755113601685, "rewards/rejected": -0.12366221845149994, "step": 4150 }, { "epoch": 1.4334941419710545, "grad_norm": 1.9435546398162842, "learning_rate": 1.1258785884270972e-08, "logits/chosen": -2.899432420730591, "logits/rejected": -2.8756940364837646, "logps/chosen": -62.95705032348633, "logps/rejected": -64.1615219116211, "loss": 0.6794, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08565764874219894, "rewards/margins": 0.029919400811195374, "rewards/rejected": -0.11557704210281372, "step": 4160 }, { "epoch": 1.4369400413507925, "grad_norm": 2.0451626777648926, "learning_rate": 1.1133414216201372e-08, "logits/chosen": -2.969944715499878, "logits/rejected": -2.9574761390686035, "logps/chosen": -61.43464279174805, "logps/rejected": -65.87757873535156, "loss": 0.6786, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.09905441105365753, "rewards/margins": 0.03209648281335831, "rewards/rejected": -0.13115090131759644, "step": 4170 }, { "epoch": 1.4403859407305306, "grad_norm": 2.095008134841919, "learning_rate": 1.1008544229447836e-08, "logits/chosen": -2.8462460041046143, "logits/rejected": -2.8191661834716797, "logps/chosen": -63.70579147338867, "logps/rejected": -65.39628601074219, "loss": 0.678, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0984322726726532, "rewards/margins": 0.03305213525891304, "rewards/rejected": -0.13148441910743713, "step": 4180 }, { "epoch": 1.4438318401102688, "grad_norm": 1.9745500087738037, "learning_rate": 1.0884180441700588e-08, "logits/chosen": -2.981600761413574, "logits/rejected": -2.9631736278533936, "logps/chosen": -60.92009353637695, "logps/rejected": -66.35347747802734, "loss": 0.6809, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09771338850259781, "rewards/margins": 0.02702244557440281, "rewards/rejected": -0.12473583221435547, "step": 4190 }, { "epoch": 1.447277739490007, "grad_norm": 2.2227063179016113, "learning_rate": 1.0760327352336024e-08, "logits/chosen": -2.9796640872955322, "logits/rejected": -2.9522905349731445, "logps/chosen": -62.57625198364258, "logps/rejected": -66.95372009277344, "loss": 0.6722, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.09075101464986801, "rewards/margins": 0.04433668404817581, "rewards/rejected": -0.13508769869804382, "step": 4200 }, { "epoch": 1.447277739490007, "eval_logits/chosen": -3.051344633102417, "eval_logits/rejected": -3.045560359954834, "eval_logps/chosen": -64.48454284667969, "eval_logps/rejected": -70.56287384033203, "eval_loss": 0.6859112977981567, "eval_rewards/accuracies": 0.5945631861686707, "eval_rewards/chosen": -0.057726457715034485, "eval_rewards/margins": 0.01610107533633709, "eval_rewards/rejected": -0.07382753491401672, "eval_runtime": 384.4021, "eval_samples_per_second": 11.197, "eval_steps_per_second": 1.4, "step": 4200 }, { "epoch": 1.450723638869745, "grad_norm": 2.0472159385681152, "learning_rate": 1.0636989442253914e-08, "logits/chosen": -2.865917682647705, "logits/rejected": -2.8423261642456055, "logps/chosen": -64.19918060302734, "logps/rejected": -65.27023315429688, "loss": 0.6737, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0954306572675705, "rewards/margins": 0.041881926357746124, "rewards/rejected": -0.13731257617473602, "step": 4210 }, { "epoch": 1.454169538249483, "grad_norm": 2.3937063217163086, "learning_rate": 1.0514171173715245e-08, "logits/chosen": -2.9541773796081543, "logits/rejected": -2.937562942504883, "logps/chosen": -63.5893440246582, "logps/rejected": -66.68025207519531, "loss": 0.6788, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09447337687015533, "rewards/margins": 0.03139979764819145, "rewards/rejected": -0.12587317824363708, "step": 4220 }, { "epoch": 1.4576154376292212, "grad_norm": 2.123077392578125, "learning_rate": 1.039187699018085e-08, "logits/chosen": -2.9126861095428467, "logits/rejected": -2.8963119983673096, "logps/chosen": -59.836151123046875, "logps/rejected": -66.63883972167969, "loss": 0.675, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09426899254322052, "rewards/margins": 0.038940686732530594, "rewards/rejected": -0.13320967555046082, "step": 4230 }, { "epoch": 1.4610613370089593, "grad_norm": 2.055708408355713, "learning_rate": 1.0270111316150585e-08, "logits/chosen": -2.9423460960388184, "logits/rejected": -2.9130282402038574, "logps/chosen": -64.48773956298828, "logps/rejected": -65.76631927490234, "loss": 0.6773, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.09935905039310455, "rewards/margins": 0.0346347838640213, "rewards/rejected": -0.13399383425712585, "step": 4240 }, { "epoch": 1.4645072363886975, "grad_norm": 2.029475688934326, "learning_rate": 1.0148878557003299e-08, "logits/chosen": -2.924720048904419, "logits/rejected": -2.9136366844177246, "logps/chosen": -64.05335998535156, "logps/rejected": -68.36885070800781, "loss": 0.6771, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09529970586299896, "rewards/margins": 0.03515272215008736, "rewards/rejected": -0.13045242428779602, "step": 4250 }, { "epoch": 1.4679531357684357, "grad_norm": 2.112820625305176, "learning_rate": 1.0028183098837409e-08, "logits/chosen": -2.9278299808502197, "logits/rejected": -2.892338275909424, "logps/chosen": -65.2867202758789, "logps/rejected": -63.962562561035156, "loss": 0.6734, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09577113389968872, "rewards/margins": 0.04274382442235947, "rewards/rejected": -0.13851496577262878, "step": 4260 }, { "epoch": 1.4713990351481736, "grad_norm": 2.1950058937072754, "learning_rate": 9.908029308312266e-09, "logits/chosen": -2.973106861114502, "logits/rejected": -2.9561305046081543, "logps/chosen": -63.97013473510742, "logps/rejected": -65.02579498291016, "loss": 0.6825, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.09679555147886276, "rewards/margins": 0.024157661944627762, "rewards/rejected": -0.12095322459936142, "step": 4270 }, { "epoch": 1.4748449345279118, "grad_norm": 2.039232015609741, "learning_rate": 9.788421532490134e-09, "logits/chosen": -3.002267360687256, "logits/rejected": -2.986990451812744, "logps/chosen": -63.63147735595703, "logps/rejected": -66.7364501953125, "loss": 0.6781, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.09608881175518036, "rewards/margins": 0.03295501321554184, "rewards/rejected": -0.1290438175201416, "step": 4280 }, { "epoch": 1.47829083390765, "grad_norm": 2.1652369499206543, "learning_rate": 9.669364098678912e-09, "logits/chosen": -2.946402072906494, "logits/rejected": -2.9230525493621826, "logps/chosen": -63.1927604675293, "logps/rejected": -66.66388702392578, "loss": 0.6724, "rewards/accuracies": 0.671875, "rewards/chosen": -0.08946090936660767, "rewards/margins": 0.044275470077991486, "rewards/rejected": -0.13373637199401855, "step": 4290 }, { "epoch": 1.481736733287388, "grad_norm": 2.233903646469116, "learning_rate": 9.550861314275613e-09, "logits/chosen": -2.9582359790802, "logits/rejected": -2.9278035163879395, "logps/chosen": -63.89642333984375, "logps/rejected": -64.79170227050781, "loss": 0.6769, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.08680012077093124, "rewards/margins": 0.03546776622533798, "rewards/rejected": -0.12226790189743042, "step": 4300 }, { "epoch": 1.481736733287388, "eval_logits/chosen": -3.049945592880249, "eval_logits/rejected": -3.0441927909851074, "eval_logps/chosen": -64.53496551513672, "eval_logps/rejected": -70.63489532470703, "eval_loss": 0.6858123540878296, "eval_rewards/accuracies": 0.5938661694526672, "eval_rewards/chosen": -0.05823073908686638, "eval_rewards/margins": 0.01631702482700348, "eval_rewards/rejected": -0.07454776763916016, "eval_runtime": 384.2598, "eval_samples_per_second": 11.201, "eval_steps_per_second": 1.4, "step": 4300 }, { "epoch": 1.4851826326671262, "grad_norm": 2.15106201171875, "learning_rate": 9.432917466610505e-09, "logits/chosen": -2.9038984775543213, "logits/rejected": -2.8834316730499268, "logps/chosen": -64.21763610839844, "logps/rejected": -63.90519332885742, "loss": 0.6784, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0975811555981636, "rewards/margins": 0.032436296343803406, "rewards/rejected": -0.1300174444913864, "step": 4310 }, { "epoch": 1.4886285320468642, "grad_norm": 2.1343038082122803, "learning_rate": 9.315536822791976e-09, "logits/chosen": -2.915823459625244, "logits/rejected": -2.8945200443267822, "logps/chosen": -62.840476989746094, "logps/rejected": -64.214111328125, "loss": 0.6811, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09117156267166138, "rewards/margins": 0.02677854336798191, "rewards/rejected": -0.11795011907815933, "step": 4320 }, { "epoch": 1.4920744314266023, "grad_norm": 2.0389392375946045, "learning_rate": 9.198723629552205e-09, "logits/chosen": -2.90847110748291, "logits/rejected": -2.8905720710754395, "logps/chosen": -63.195213317871094, "logps/rejected": -65.94998168945312, "loss": 0.6747, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.0903761237859726, "rewards/margins": 0.040145523846149445, "rewards/rejected": -0.13052165508270264, "step": 4330 }, { "epoch": 1.4955203308063405, "grad_norm": 2.3100173473358154, "learning_rate": 9.08248211309346e-09, "logits/chosen": -2.9821550846099854, "logits/rejected": -2.9700443744659424, "logps/chosen": -61.75278854370117, "logps/rejected": -66.24479675292969, "loss": 0.6793, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10415707528591156, "rewards/margins": 0.03043574094772339, "rewards/rejected": -0.13459283113479614, "step": 4340 }, { "epoch": 1.4989662301860784, "grad_norm": 2.2711362838745117, "learning_rate": 8.966816478935255e-09, "logits/chosen": -2.9720687866210938, "logits/rejected": -2.9405291080474854, "logps/chosen": -65.44647979736328, "logps/rejected": -63.351318359375, "loss": 0.6772, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.09429587423801422, "rewards/margins": 0.03446224331855774, "rewards/rejected": -0.12875810265541077, "step": 4350 }, { "epoch": 1.5024121295658168, "grad_norm": 2.1748154163360596, "learning_rate": 8.851730911762168e-09, "logits/chosen": -2.936713695526123, "logits/rejected": -2.917022943496704, "logps/chosen": -63.976097106933594, "logps/rejected": -66.04635620117188, "loss": 0.6801, "rewards/accuracies": 0.609375, "rewards/chosen": -0.09384501725435257, "rewards/margins": 0.02860867977142334, "rewards/rejected": -0.12245368957519531, "step": 4360 }, { "epoch": 1.5058580289455548, "grad_norm": 2.08954119682312, "learning_rate": 8.73722957527242e-09, "logits/chosen": -2.9445550441741943, "logits/rejected": -2.9229681491851807, "logps/chosen": -62.3499870300293, "logps/rejected": -65.8035888671875, "loss": 0.6801, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10512308776378632, "rewards/margins": 0.028672099113464355, "rewards/rejected": -0.13379518687725067, "step": 4370 }, { "epoch": 1.509303928325293, "grad_norm": 2.039855718612671, "learning_rate": 8.623316612027284e-09, "logits/chosen": -2.9334137439727783, "logits/rejected": -2.9160149097442627, "logps/chosen": -61.61011505126953, "logps/rejected": -67.03166198730469, "loss": 0.6754, "rewards/accuracies": 0.671875, "rewards/chosen": -0.09012607485055923, "rewards/margins": 0.03832302242517471, "rewards/rejected": -0.12844911217689514, "step": 4380 }, { "epoch": 1.512749827705031, "grad_norm": 2.3842740058898926, "learning_rate": 8.509996143301196e-09, "logits/chosen": -2.967993974685669, "logits/rejected": -2.9537787437438965, "logps/chosen": -61.7550163269043, "logps/rejected": -65.1753921508789, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": -0.09670481830835342, "rewards/margins": 0.03900107368826866, "rewards/rejected": -0.1357058882713318, "step": 4390 }, { "epoch": 1.516195727084769, "grad_norm": 2.1551573276519775, "learning_rate": 8.397272268932618e-09, "logits/chosen": -2.9331369400024414, "logits/rejected": -2.90215802192688, "logps/chosen": -64.6319580078125, "logps/rejected": -65.17381286621094, "loss": 0.6785, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0967051237821579, "rewards/margins": 0.03241132199764252, "rewards/rejected": -0.12911644577980042, "step": 4400 }, { "epoch": 1.516195727084769, "eval_logits/chosen": -3.049013376235962, "eval_logits/rejected": -3.0432231426239014, "eval_logps/chosen": -64.5702896118164, "eval_logps/rejected": -70.67756652832031, "eval_loss": 0.6857818365097046, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.05858391523361206, "eval_rewards/margins": 0.016390599310398102, "eval_rewards/rejected": -0.07497451454401016, "eval_runtime": 384.4066, "eval_samples_per_second": 11.196, "eval_steps_per_second": 1.4, "step": 4400 }, { "epoch": 1.5196416264645074, "grad_norm": 2.1354825496673584, "learning_rate": 8.285149067175734e-09, "logits/chosen": -2.999840259552002, "logits/rejected": -2.978172779083252, "logps/chosen": -60.5115852355957, "logps/rejected": -64.41769409179688, "loss": 0.6777, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0981929823756218, "rewards/margins": 0.03404136374592781, "rewards/rejected": -0.1322343647480011, "step": 4410 }, { "epoch": 1.5230875258442453, "grad_norm": 2.2247259616851807, "learning_rate": 8.173630594552924e-09, "logits/chosen": -2.8461086750030518, "logits/rejected": -2.8300206661224365, "logps/chosen": -62.6235237121582, "logps/rejected": -65.7718734741211, "loss": 0.6777, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0915134847164154, "rewards/margins": 0.03351100534200668, "rewards/rejected": -0.1250244826078415, "step": 4420 }, { "epoch": 1.5265334252239835, "grad_norm": 2.1211180686950684, "learning_rate": 8.062720885707983e-09, "logits/chosen": -3.0012969970703125, "logits/rejected": -2.981078624725342, "logps/chosen": -61.487266540527344, "logps/rejected": -65.78907775878906, "loss": 0.6772, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.08771520853042603, "rewards/margins": 0.0344039723277092, "rewards/rejected": -0.12211918830871582, "step": 4430 }, { "epoch": 1.5299793246037217, "grad_norm": 2.0191056728363037, "learning_rate": 7.95242395326011e-09, "logits/chosen": -2.9936842918395996, "logits/rejected": -2.9634318351745605, "logps/chosen": -65.0488510131836, "logps/rejected": -66.46080017089844, "loss": 0.6784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09938286244869232, "rewards/margins": 0.03235219046473503, "rewards/rejected": -0.13173505663871765, "step": 4440 }, { "epoch": 1.5334252239834596, "grad_norm": 2.232675075531006, "learning_rate": 7.842743787658812e-09, "logits/chosen": -2.952040910720825, "logits/rejected": -2.923306941986084, "logps/chosen": -64.1955337524414, "logps/rejected": -64.81131744384766, "loss": 0.6773, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09505288302898407, "rewards/margins": 0.03423549234867096, "rewards/rejected": -0.12928839027881622, "step": 4450 }, { "epoch": 1.5368711233631978, "grad_norm": 2.273517370223999, "learning_rate": 7.733684357039492e-09, "logits/chosen": -2.9811654090881348, "logits/rejected": -2.9586715698242188, "logps/chosen": -66.3079833984375, "logps/rejected": -66.52787017822266, "loss": 0.6748, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09947341680526733, "rewards/margins": 0.039953988045454025, "rewards/rejected": -0.13942742347717285, "step": 4460 }, { "epoch": 1.540317022742936, "grad_norm": 2.2406179904937744, "learning_rate": 7.62524960707986e-09, "logits/chosen": -2.9634737968444824, "logits/rejected": -2.938974618911743, "logps/chosen": -64.69285583496094, "logps/rejected": -65.86080169677734, "loss": 0.6789, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.09530860185623169, "rewards/margins": 0.03149518743157387, "rewards/rejected": -0.12680378556251526, "step": 4470 }, { "epoch": 1.5437629221226739, "grad_norm": 2.190739870071411, "learning_rate": 7.517443460857229e-09, "logits/chosen": -2.923884391784668, "logits/rejected": -2.9112801551818848, "logps/chosen": -64.11064147949219, "logps/rejected": -67.6056137084961, "loss": 0.6783, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0985846072435379, "rewards/margins": 0.03296193480491638, "rewards/rejected": -0.1315465271472931, "step": 4480 }, { "epoch": 1.5472088215024122, "grad_norm": 2.0800647735595703, "learning_rate": 7.410269818706574e-09, "logits/chosen": -2.979024648666382, "logits/rejected": -2.949497938156128, "logps/chosen": -63.89508819580078, "logps/rejected": -64.94587707519531, "loss": 0.6794, "rewards/accuracies": 0.640625, "rewards/chosen": -0.10012755542993546, "rewards/margins": 0.03033982776105404, "rewards/rejected": -0.13046738505363464, "step": 4490 }, { "epoch": 1.5506547208821502, "grad_norm": 2.15670108795166, "learning_rate": 7.303732558079379e-09, "logits/chosen": -2.946776866912842, "logits/rejected": -2.923444986343384, "logps/chosen": -64.97744750976562, "logps/rejected": -66.2874984741211, "loss": 0.6735, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.08795378357172012, "rewards/margins": 0.04191059246659279, "rewards/rejected": -0.12986436486244202, "step": 4500 }, { "epoch": 1.5506547208821502, "eval_logits/chosen": -3.04790997505188, "eval_logits/rejected": -3.042128801345825, "eval_logps/chosen": -64.68529510498047, "eval_logps/rejected": -70.79721069335938, "eval_loss": 0.6857719421386719, "eval_rewards/accuracies": 0.5920074582099915, "eval_rewards/chosen": -0.05973397567868233, "eval_rewards/margins": 0.016436897218227386, "eval_rewards/rejected": -0.07617087662220001, "eval_runtime": 384.4992, "eval_samples_per_second": 11.194, "eval_steps_per_second": 1.399, "step": 4500 }, { "epoch": 1.5541006202618883, "grad_norm": 2.1693248748779297, "learning_rate": 7.197835533403404e-09, "logits/chosen": -2.910331964492798, "logits/rejected": -2.889179229736328, "logps/chosen": -64.47348022460938, "logps/rejected": -65.3521957397461, "loss": 0.6756, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09691820293664932, "rewards/margins": 0.037731070071458817, "rewards/rejected": -0.13464927673339844, "step": 4510 }, { "epoch": 1.5575465196416265, "grad_norm": 2.151700496673584, "learning_rate": 7.092582575943218e-09, "logits/chosen": -2.9018807411193848, "logits/rejected": -2.8939027786254883, "logps/chosen": -61.126686096191406, "logps/rejected": -65.8957290649414, "loss": 0.6794, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09966224431991577, "rewards/margins": 0.03021586500108242, "rewards/rejected": -0.12987811863422394, "step": 4520 }, { "epoch": 1.5609924190213644, "grad_norm": 2.032938241958618, "learning_rate": 6.9879774936615645e-09, "logits/chosen": -2.9603374004364014, "logits/rejected": -2.9376699924468994, "logps/chosen": -64.51271057128906, "logps/rejected": -65.9655990600586, "loss": 0.6801, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10277440398931503, "rewards/margins": 0.0286547988653183, "rewards/rejected": -0.13142919540405273, "step": 4530 }, { "epoch": 1.5644383184011028, "grad_norm": 2.025911569595337, "learning_rate": 6.884024071081632e-09, "logits/chosen": -2.9223618507385254, "logits/rejected": -2.91190767288208, "logps/chosen": -62.47723388671875, "logps/rejected": -67.90995025634766, "loss": 0.6767, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09204110503196716, "rewards/margins": 0.03594468906521797, "rewards/rejected": -0.12798579037189484, "step": 4540 }, { "epoch": 1.5678842177808407, "grad_norm": 2.2641587257385254, "learning_rate": 6.7807260691501196e-09, "logits/chosen": -2.928924560546875, "logits/rejected": -2.8977210521698, "logps/chosen": -65.50735473632812, "logps/rejected": -65.04016876220703, "loss": 0.6763, "rewards/accuracies": 0.625, "rewards/chosen": -0.09474433958530426, "rewards/margins": 0.03602954000234604, "rewards/rejected": -0.1307738721370697, "step": 4550 }, { "epoch": 1.571330117160579, "grad_norm": 2.3552682399749756, "learning_rate": 6.67808722510112e-09, "logits/chosen": -2.9768292903900146, "logits/rejected": -2.945518732070923, "logps/chosen": -66.51522064208984, "logps/rejected": -66.87415313720703, "loss": 0.6733, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08753177523612976, "rewards/margins": 0.042732808738946915, "rewards/rejected": -0.13026458024978638, "step": 4560 }, { "epoch": 1.574776016540317, "grad_norm": 2.2089123725891113, "learning_rate": 6.576111252321001e-09, "logits/chosen": -2.938915729522705, "logits/rejected": -2.9010097980499268, "logps/chosen": -64.55790710449219, "logps/rejected": -64.71064758300781, "loss": 0.6716, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08407676219940186, "rewards/margins": 0.04665817320346832, "rewards/rejected": -0.13073492050170898, "step": 4570 }, { "epoch": 1.578221915920055, "grad_norm": 1.9244182109832764, "learning_rate": 6.474801840213995e-09, "logits/chosen": -2.9675724506378174, "logits/rejected": -2.9516897201538086, "logps/chosen": -63.53330612182617, "logps/rejected": -67.43972778320312, "loss": 0.6753, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.09795354306697845, "rewards/margins": 0.0386352464556694, "rewards/rejected": -0.13658878207206726, "step": 4580 }, { "epoch": 1.5816678152997934, "grad_norm": 2.2728357315063477, "learning_rate": 6.3741626540687156e-09, "logits/chosen": -2.9654381275177, "logits/rejected": -2.938734531402588, "logps/chosen": -65.29521179199219, "logps/rejected": -64.78874206542969, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1050899475812912, "rewards/margins": 0.03461449593305588, "rewards/rejected": -0.13970443606376648, "step": 4590 }, { "epoch": 1.5851137146795313, "grad_norm": 2.1909267902374268, "learning_rate": 6.274197334925596e-09, "logits/chosen": -3.022188186645508, "logits/rejected": -3.008340358734131, "logps/chosen": -63.90410614013672, "logps/rejected": -67.8543930053711, "loss": 0.6786, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10399514436721802, "rewards/margins": 0.03185725957155228, "rewards/rejected": -0.13585242629051208, "step": 4600 }, { "epoch": 1.5851137146795313, "eval_logits/chosen": -3.0471458435058594, "eval_logits/rejected": -3.0413613319396973, "eval_logps/chosen": -64.74617004394531, "eval_logps/rejected": -70.86981964111328, "eval_loss": 0.6857200264930725, "eval_rewards/accuracies": 0.5966542959213257, "eval_rewards/chosen": -0.06034281104803085, "eval_rewards/margins": 0.016554230824112892, "eval_rewards/rejected": -0.076897032558918, "eval_runtime": 383.2954, "eval_samples_per_second": 11.229, "eval_steps_per_second": 1.404, "step": 4600 }, { "epoch": 1.5885596140592695, "grad_norm": 2.1440834999084473, "learning_rate": 6.174909499445125e-09, "logits/chosen": -2.898419141769409, "logits/rejected": -2.873417377471924, "logps/chosen": -63.03126907348633, "logps/rejected": -64.13099670410156, "loss": 0.6778, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10101036727428436, "rewards/margins": 0.03360176831483841, "rewards/rejected": -0.13461214303970337, "step": 4610 }, { "epoch": 1.5920055134390076, "grad_norm": 2.136821746826172, "learning_rate": 6.07630273977699e-09, "logits/chosen": -2.8824410438537598, "logits/rejected": -2.8729777336120605, "logps/chosen": -62.408958435058594, "logps/rejected": -66.50130462646484, "loss": 0.6791, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.096805140376091, "rewards/margins": 0.03096623346209526, "rewards/rejected": -0.12777137756347656, "step": 4620 }, { "epoch": 1.5954514128187456, "grad_norm": 2.1429455280303955, "learning_rate": 5.978380623430152e-09, "logits/chosen": -2.9070467948913574, "logits/rejected": -2.8850066661834717, "logps/chosen": -61.453765869140625, "logps/rejected": -64.62207794189453, "loss": 0.6805, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.10029391199350357, "rewards/margins": 0.027933398261666298, "rewards/rejected": -0.12822730839252472, "step": 4630 }, { "epoch": 1.598897312198484, "grad_norm": 2.195216417312622, "learning_rate": 5.8811466931437624e-09, "logits/chosen": -2.99377703666687, "logits/rejected": -2.9689157009124756, "logps/chosen": -64.21118927001953, "logps/rejected": -66.28094482421875, "loss": 0.6773, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09979419410228729, "rewards/margins": 0.034975916147232056, "rewards/rejected": -0.13477011024951935, "step": 4640 }, { "epoch": 1.602343211578222, "grad_norm": 2.260923385620117, "learning_rate": 5.784604466758955e-09, "logits/chosen": -2.8885650634765625, "logits/rejected": -2.8749794960021973, "logps/chosen": -64.46866607666016, "logps/rejected": -66.56623840332031, "loss": 0.68, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10299519449472427, "rewards/margins": 0.028997087851166725, "rewards/rejected": -0.13199228048324585, "step": 4650 }, { "epoch": 1.60578911095796, "grad_norm": 2.0964856147766113, "learning_rate": 5.688757437091632e-09, "logits/chosen": -2.9830551147460938, "logits/rejected": -2.958256244659424, "logps/chosen": -62.82770538330078, "logps/rejected": -66.51475524902344, "loss": 0.6765, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10538817942142487, "rewards/margins": 0.03581491857767105, "rewards/rejected": -0.14120309054851532, "step": 4660 }, { "epoch": 1.6092350103376982, "grad_norm": 2.319704294204712, "learning_rate": 5.593609071806061e-09, "logits/chosen": -2.9287524223327637, "logits/rejected": -2.9094161987304688, "logps/chosen": -64.73773956298828, "logps/rejected": -67.68860626220703, "loss": 0.6757, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.10544122755527496, "rewards/margins": 0.03820051625370979, "rewards/rejected": -0.14364174008369446, "step": 4670 }, { "epoch": 1.6126809097174362, "grad_norm": 1.9816330671310425, "learning_rate": 5.499162813289407e-09, "logits/chosen": -2.9848568439483643, "logits/rejected": -2.9498484134674072, "logps/chosen": -64.04865264892578, "logps/rejected": -66.0094223022461, "loss": 0.6731, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09432440996170044, "rewards/margins": 0.04329352080821991, "rewards/rejected": -0.13761794567108154, "step": 4680 }, { "epoch": 1.6161268090971743, "grad_norm": 2.197380781173706, "learning_rate": 5.405422078527233e-09, "logits/chosen": -3.0338993072509766, "logits/rejected": -3.0085062980651855, "logps/chosen": -63.707061767578125, "logps/rejected": -64.739013671875, "loss": 0.6816, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0958305075764656, "rewards/margins": 0.02577478252351284, "rewards/rejected": -0.1216052919626236, "step": 4690 }, { "epoch": 1.6195727084769125, "grad_norm": 2.2032158374786377, "learning_rate": 5.312390258979841e-09, "logits/chosen": -2.8527743816375732, "logits/rejected": -2.8366029262542725, "logps/chosen": -63.841766357421875, "logps/rejected": -67.6465072631836, "loss": 0.6803, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10257796943187714, "rewards/margins": 0.02846847102046013, "rewards/rejected": -0.13104644417762756, "step": 4700 }, { "epoch": 1.6195727084769125, "eval_logits/chosen": -3.0465500354766846, "eval_logits/rejected": -3.040802240371704, "eval_logps/chosen": -64.74346160888672, "eval_logps/rejected": -70.8780746459961, "eval_loss": 0.6856712698936462, "eval_rewards/accuracies": 0.597815990447998, "eval_rewards/chosen": -0.06031567603349686, "eval_rewards/margins": 0.01666383258998394, "eval_rewards/rejected": -0.07697951048612595, "eval_runtime": 382.8006, "eval_samples_per_second": 11.243, "eval_steps_per_second": 1.405, "step": 4700 }, { "epoch": 1.6230186078566504, "grad_norm": 2.3515686988830566, "learning_rate": 5.220070720459571e-09, "logits/chosen": -2.943565845489502, "logits/rejected": -2.9256954193115234, "logps/chosen": -64.6860122680664, "logps/rejected": -65.74381256103516, "loss": 0.6777, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.1022968515753746, "rewards/margins": 0.03354020044207573, "rewards/rejected": -0.13583704829216003, "step": 4710 }, { "epoch": 1.6264645072363888, "grad_norm": 2.104708671569824, "learning_rate": 5.1284668030090485e-09, "logits/chosen": -2.9260170459747314, "logits/rejected": -2.917184829711914, "logps/chosen": -61.97522735595703, "logps/rejected": -64.06890869140625, "loss": 0.6824, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1094181165099144, "rewards/margins": 0.02429693005979061, "rewards/rejected": -0.13371506333351135, "step": 4720 }, { "epoch": 1.6299104066161267, "grad_norm": 2.6643316745758057, "learning_rate": 5.037581820780335e-09, "logits/chosen": -2.986668348312378, "logits/rejected": -2.9803264141082764, "logps/chosen": -63.790992736816406, "logps/rejected": -66.82393646240234, "loss": 0.6772, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1097106784582138, "rewards/margins": 0.035210516303777695, "rewards/rejected": -0.1449211984872818, "step": 4730 }, { "epoch": 1.633356305995865, "grad_norm": 2.132471799850464, "learning_rate": 4.947419061915037e-09, "logits/chosen": -2.8456203937530518, "logits/rejected": -2.8134427070617676, "logps/chosen": -64.08192443847656, "logps/rejected": -65.60581970214844, "loss": 0.6782, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.10763104259967804, "rewards/margins": 0.03280221298336983, "rewards/rejected": -0.14043326675891876, "step": 4740 }, { "epoch": 1.636802205375603, "grad_norm": 1.9625790119171143, "learning_rate": 4.857981788425305e-09, "logits/chosen": -2.9289822578430176, "logits/rejected": -2.9128329753875732, "logps/chosen": -63.73872756958008, "logps/rejected": -66.92320251464844, "loss": 0.6775, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.09859279543161392, "rewards/margins": 0.03389236703515053, "rewards/rejected": -0.13248515129089355, "step": 4750 }, { "epoch": 1.640248104755341, "grad_norm": 2.143996000289917, "learning_rate": 4.7692732360758634e-09, "logits/chosen": -2.9664969444274902, "logits/rejected": -2.951070785522461, "logps/chosen": -61.64174270629883, "logps/rejected": -69.74330139160156, "loss": 0.6756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10031326115131378, "rewards/margins": 0.03815789893269539, "rewards/rejected": -0.13847115635871887, "step": 4760 }, { "epoch": 1.6436940041350794, "grad_norm": 2.208103656768799, "learning_rate": 4.68129661426693e-09, "logits/chosen": -2.9453184604644775, "logits/rejected": -2.9222664833068848, "logps/chosen": -64.02022552490234, "logps/rejected": -66.49871826171875, "loss": 0.6756, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10512950271368027, "rewards/margins": 0.03840099275112152, "rewards/rejected": -0.14353050291538239, "step": 4770 }, { "epoch": 1.6471399035148173, "grad_norm": 2.6903138160705566, "learning_rate": 4.594055105918071e-09, "logits/chosen": -2.9889702796936035, "logits/rejected": -2.9868006706237793, "logps/chosen": -63.49650955200195, "logps/rejected": -68.11885070800781, "loss": 0.679, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.10073445737361908, "rewards/margins": 0.031192371621727943, "rewards/rejected": -0.13192683458328247, "step": 4780 }, { "epoch": 1.6505858028945555, "grad_norm": 2.081115245819092, "learning_rate": 4.507551867353093e-09, "logits/chosen": -2.967514991760254, "logits/rejected": -2.9510347843170166, "logps/chosen": -62.506919860839844, "logps/rejected": -67.48614501953125, "loss": 0.6767, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.1013697013258934, "rewards/margins": 0.03546713665127754, "rewards/rejected": -0.13683685660362244, "step": 4790 }, { "epoch": 1.6540317022742936, "grad_norm": 2.1103594303131104, "learning_rate": 4.4217900281858236e-09, "logits/chosen": -2.9971890449523926, "logits/rejected": -2.9710536003112793, "logps/chosen": -65.31468963623047, "logps/rejected": -67.2812728881836, "loss": 0.6789, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09736104309558868, "rewards/margins": 0.03208557516336441, "rewards/rejected": -0.1294466108083725, "step": 4800 }, { "epoch": 1.6540317022742936, "eval_logits/chosen": -3.045680284500122, "eval_logits/rejected": -3.039926052093506, "eval_logps/chosen": -64.78036499023438, "eval_logps/rejected": -70.9262924194336, "eval_loss": 0.6856197118759155, "eval_rewards/accuracies": 0.5929368138313293, "eval_rewards/chosen": -0.060684628784656525, "eval_rewards/margins": 0.016777031123638153, "eval_rewards/rejected": -0.07746166735887527, "eval_runtime": 383.212, "eval_samples_per_second": 11.231, "eval_steps_per_second": 1.404, "step": 4800 }, { "epoch": 1.6574776016540316, "grad_norm": 2.2263193130493164, "learning_rate": 4.336772691206877e-09, "logits/chosen": -2.993753433227539, "logits/rejected": -2.97983455657959, "logps/chosen": -64.71400451660156, "logps/rejected": -67.22502136230469, "loss": 0.6806, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09266643226146698, "rewards/margins": 0.02778668701648712, "rewards/rejected": -0.1204531341791153, "step": 4810 }, { "epoch": 1.66092350103377, "grad_norm": 2.1050963401794434, "learning_rate": 4.252502932271423e-09, "logits/chosen": -2.9959557056427, "logits/rejected": -2.966916561126709, "logps/chosen": -66.26362609863281, "logps/rejected": -65.68085479736328, "loss": 0.6776, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10536062717437744, "rewards/margins": 0.03362419083714485, "rewards/rejected": -0.1389847993850708, "step": 4820 }, { "epoch": 1.664369400413508, "grad_norm": 2.623863697052002, "learning_rate": 4.168983800187892e-09, "logits/chosen": -2.9033761024475098, "logits/rejected": -2.8900322914123535, "logps/chosen": -63.26140594482422, "logps/rejected": -66.57955169677734, "loss": 0.6819, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.10230235755443573, "rewards/margins": 0.025283504277467728, "rewards/rejected": -0.12758587300777435, "step": 4830 }, { "epoch": 1.667815299793246, "grad_norm": 2.2002789974212646, "learning_rate": 4.086218316607654e-09, "logits/chosen": -2.985736608505249, "logits/rejected": -2.9631614685058594, "logps/chosen": -64.74356079101562, "logps/rejected": -65.73597717285156, "loss": 0.6772, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.09450788795948029, "rewards/margins": 0.035158026963472366, "rewards/rejected": -0.12966591119766235, "step": 4840 }, { "epoch": 1.6712611991729842, "grad_norm": 2.155360460281372, "learning_rate": 4.004209475915732e-09, "logits/chosen": -2.8897435665130615, "logits/rejected": -2.8641912937164307, "logps/chosen": -64.5123519897461, "logps/rejected": -67.00016784667969, "loss": 0.6782, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10245025157928467, "rewards/margins": 0.033127255737781525, "rewards/rejected": -0.1355775147676468, "step": 4850 }, { "epoch": 1.6747070985527222, "grad_norm": 2.3200953006744385, "learning_rate": 3.9229602451224554e-09, "logits/chosen": -2.875943183898926, "logits/rejected": -2.8616209030151367, "logps/chosen": -63.52336502075195, "logps/rejected": -64.73592376708984, "loss": 0.6793, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10029806196689606, "rewards/margins": 0.030711418017745018, "rewards/rejected": -0.13100948929786682, "step": 4860 }, { "epoch": 1.6781529979324605, "grad_norm": 1.9992374181747437, "learning_rate": 3.8424735637560965e-09, "logits/chosen": -2.949007034301758, "logits/rejected": -2.9328529834747314, "logps/chosen": -64.01113891601562, "logps/rejected": -66.80305480957031, "loss": 0.6794, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09905761480331421, "rewards/margins": 0.030456161126494408, "rewards/rejected": -0.12951377034187317, "step": 4870 }, { "epoch": 1.6815988973121985, "grad_norm": 2.017805576324463, "learning_rate": 3.762752343756531e-09, "logits/chosen": -2.971914768218994, "logits/rejected": -2.9384541511535645, "logps/chosen": -63.82556915283203, "logps/rejected": -64.0127182006836, "loss": 0.6745, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09669949859380722, "rewards/margins": 0.04025216028094292, "rewards/rejected": -0.13695165514945984, "step": 4880 }, { "epoch": 1.6850447966919366, "grad_norm": 2.0400807857513428, "learning_rate": 3.683799469369919e-09, "logits/chosen": -2.929330825805664, "logits/rejected": -2.895925521850586, "logps/chosen": -67.0200424194336, "logps/rejected": -66.51060485839844, "loss": 0.6738, "rewards/accuracies": 0.671875, "rewards/chosen": -0.09524741768836975, "rewards/margins": 0.042062025517225266, "rewards/rejected": -0.13730944693088531, "step": 4890 }, { "epoch": 1.6884906960716748, "grad_norm": 2.2557101249694824, "learning_rate": 3.6056177970442995e-09, "logits/chosen": -2.9533286094665527, "logits/rejected": -2.935368061065674, "logps/chosen": -64.12738037109375, "logps/rejected": -69.1882553100586, "loss": 0.6723, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09020443260669708, "rewards/margins": 0.04525913670659065, "rewards/rejected": -0.13546356558799744, "step": 4900 }, { "epoch": 1.6884906960716748, "eval_logits/chosen": -3.0447659492492676, "eval_logits/rejected": -3.0389816761016846, "eval_logps/chosen": -64.8213119506836, "eval_logps/rejected": -70.97413635253906, "eval_loss": 0.6855877041816711, "eval_rewards/accuracies": 0.5985130071640015, "eval_rewards/chosen": -0.06109423562884331, "eval_rewards/margins": 0.016845842823386192, "eval_rewards/rejected": -0.07794006913900375, "eval_runtime": 382.7336, "eval_samples_per_second": 11.245, "eval_steps_per_second": 1.406, "step": 4900 }, { "epoch": 1.6919365954514127, "grad_norm": 2.2069523334503174, "learning_rate": 3.528210155326289e-09, "logits/chosen": -2.9314873218536377, "logits/rejected": -2.9017434120178223, "logps/chosen": -64.22460174560547, "logps/rejected": -68.1082992553711, "loss": 0.6711, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.09454147517681122, "rewards/margins": 0.04726605862379074, "rewards/rejected": -0.14180754125118256, "step": 4910 }, { "epoch": 1.6953824948311509, "grad_norm": 2.172011613845825, "learning_rate": 3.4515793447587342e-09, "logits/chosen": -2.960524082183838, "logits/rejected": -2.9422361850738525, "logps/chosen": -67.4477310180664, "logps/rejected": -68.8906478881836, "loss": 0.6813, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.10514837503433228, "rewards/margins": 0.026843750849366188, "rewards/rejected": -0.1319921314716339, "step": 4920 }, { "epoch": 1.698828394210889, "grad_norm": 2.2750051021575928, "learning_rate": 3.3757281377793793e-09, "logits/chosen": -2.9778566360473633, "logits/rejected": -2.9670004844665527, "logps/chosen": -65.68384552001953, "logps/rejected": -65.3903579711914, "loss": 0.6826, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.10219858586788177, "rewards/margins": 0.023961419239640236, "rewards/rejected": -0.12615999579429626, "step": 4930 }, { "epoch": 1.7022742935906272, "grad_norm": 2.0987696647644043, "learning_rate": 3.3006592786205793e-09, "logits/chosen": -2.9296352863311768, "logits/rejected": -2.914144992828369, "logps/chosen": -62.513702392578125, "logps/rejected": -66.65951538085938, "loss": 0.6776, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.1011534184217453, "rewards/margins": 0.0337558388710022, "rewards/rejected": -0.1349092572927475, "step": 4940 }, { "epoch": 1.7057201929703654, "grad_norm": 2.069197416305542, "learning_rate": 3.226375483210017e-09, "logits/chosen": -2.890801191329956, "logits/rejected": -2.8730523586273193, "logps/chosen": -63.73029327392578, "logps/rejected": -65.79854583740234, "loss": 0.6735, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09668199717998505, "rewards/margins": 0.04293070361018181, "rewards/rejected": -0.13961270451545715, "step": 4950 }, { "epoch": 1.7091660923501033, "grad_norm": 2.062432050704956, "learning_rate": 3.152879439072409e-09, "logits/chosen": -3.0091025829315186, "logits/rejected": -2.9726009368896484, "logps/chosen": -64.92745208740234, "logps/rejected": -63.91938400268555, "loss": 0.674, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09798876196146011, "rewards/margins": 0.04130322486162186, "rewards/rejected": -0.13929200172424316, "step": 4960 }, { "epoch": 1.7126119917298415, "grad_norm": 2.0526952743530273, "learning_rate": 3.0801738052323224e-09, "logits/chosen": -2.986570358276367, "logits/rejected": -2.982990264892578, "logps/chosen": -63.114768981933594, "logps/rejected": -70.09123229980469, "loss": 0.6813, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.09630588442087173, "rewards/margins": 0.026602035388350487, "rewards/rejected": -0.12290791422128677, "step": 4970 }, { "epoch": 1.7160578911095796, "grad_norm": 2.1260287761688232, "learning_rate": 3.0082612121179434e-09, "logits/chosen": -3.0010061264038086, "logits/rejected": -2.9745213985443115, "logps/chosen": -64.26847076416016, "logps/rejected": -66.21635437011719, "loss": 0.6814, "rewards/accuracies": 0.609375, "rewards/chosen": -0.10567764937877655, "rewards/margins": 0.026792770251631737, "rewards/rejected": -0.13247041404247284, "step": 4980 }, { "epoch": 1.7195037904893176, "grad_norm": 2.227865695953369, "learning_rate": 2.9371442614659096e-09, "logits/chosen": -2.9317967891693115, "logits/rejected": -2.915430784225464, "logps/chosen": -62.3828125, "logps/rejected": -66.34077453613281, "loss": 0.6767, "rewards/accuracies": 0.640625, "rewards/chosen": -0.10304798930883408, "rewards/margins": 0.035633284598588943, "rewards/rejected": -0.13868127763271332, "step": 4990 }, { "epoch": 1.722949689869056, "grad_norm": 2.139073610305786, "learning_rate": 2.8668255262271985e-09, "logits/chosen": -2.99995493888855, "logits/rejected": -2.965244770050049, "logps/chosen": -64.31925201416016, "logps/rejected": -66.57722473144531, "loss": 0.6767, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09578624367713928, "rewards/margins": 0.03556639701128006, "rewards/rejected": -0.13135263323783875, "step": 5000 }, { "epoch": 1.722949689869056, "eval_logits/chosen": -3.0446012020111084, "eval_logits/rejected": -3.0388095378875732, "eval_logps/chosen": -64.83766174316406, "eval_logps/rejected": -70.99246215820312, "eval_loss": 0.6855791211128235, "eval_rewards/accuracies": 0.5959572196006775, "eval_rewards/chosen": -0.06125764176249504, "eval_rewards/margins": 0.016865791752934456, "eval_rewards/rejected": -0.07812343537807465, "eval_runtime": 383.0259, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 5000 }, { "epoch": 1.7263955892487939, "grad_norm": 2.236734390258789, "learning_rate": 2.7973075504740317e-09, "logits/chosen": -2.9484639167785645, "logits/rejected": -2.921968460083008, "logps/chosen": -63.20332717895508, "logps/rejected": -65.99695587158203, "loss": 0.6749, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.10177341848611832, "rewards/margins": 0.039411745965480804, "rewards/rejected": -0.14118514955043793, "step": 5010 }, { "epoch": 1.729841488628532, "grad_norm": 2.2744734287261963, "learning_rate": 2.7285928493078174e-09, "logits/chosen": -3.028642177581787, "logits/rejected": -3.005147933959961, "logps/chosen": -65.42494201660156, "logps/rejected": -65.94140625, "loss": 0.6777, "rewards/accuracies": 0.625, "rewards/chosen": -0.09308835119009018, "rewards/margins": 0.033491719514131546, "rewards/rejected": -0.12658005952835083, "step": 5020 }, { "epoch": 1.7332873880082702, "grad_norm": 1.9225391149520874, "learning_rate": 2.660683908768191e-09, "logits/chosen": -2.914492130279541, "logits/rejected": -2.909972667694092, "logps/chosen": -62.579566955566406, "logps/rejected": -66.6541976928711, "loss": 0.6889, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.11225111782550812, "rewards/margins": 0.011178082786500454, "rewards/rejected": -0.12342919409275055, "step": 5030 }, { "epoch": 1.7367332873880081, "grad_norm": 2.368408679962158, "learning_rate": 2.5935831857430283e-09, "logits/chosen": -2.9320456981658936, "logits/rejected": -2.91937255859375, "logps/chosen": -64.72118377685547, "logps/rejected": -67.03709411621094, "loss": 0.6795, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10224765539169312, "rewards/margins": 0.030294686555862427, "rewards/rejected": -0.13254234194755554, "step": 5040 }, { "epoch": 1.7401791867677465, "grad_norm": 2.43100905418396, "learning_rate": 2.527293107879602e-09, "logits/chosen": -3.0009281635284424, "logits/rejected": -2.9862048625946045, "logps/chosen": -64.345458984375, "logps/rejected": -69.44126892089844, "loss": 0.6762, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0966586247086525, "rewards/margins": 0.03664679080247879, "rewards/rejected": -0.13330543041229248, "step": 5050 }, { "epoch": 1.7436250861474845, "grad_norm": 2.1548216342926025, "learning_rate": 2.4618160734967168e-09, "logits/chosen": -2.9960198402404785, "logits/rejected": -2.989203453063965, "logps/chosen": -62.73298263549805, "logps/rejected": -66.20845794677734, "loss": 0.6796, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10584567487239838, "rewards/margins": 0.02997753582894802, "rewards/rejected": -0.13582322001457214, "step": 5060 }, { "epoch": 1.7470709855272226, "grad_norm": 2.455150604248047, "learning_rate": 2.397154451497957e-09, "logits/chosen": -2.993448257446289, "logits/rejected": -2.961956024169922, "logps/chosen": -65.48920440673828, "logps/rejected": -66.84199523925781, "loss": 0.6727, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.08603125810623169, "rewards/margins": 0.044304654002189636, "rewards/rejected": -0.13033589720726013, "step": 5070 }, { "epoch": 1.7505168849069608, "grad_norm": 2.25144362449646, "learning_rate": 2.333310581285988e-09, "logits/chosen": -2.9321632385253906, "logits/rejected": -2.9152073860168457, "logps/chosen": -64.05274963378906, "logps/rejected": -67.70162963867188, "loss": 0.677, "rewards/accuracies": 0.625, "rewards/chosen": -0.10330088436603546, "rewards/margins": 0.03567297011613846, "rewards/rejected": -0.13897386193275452, "step": 5080 }, { "epoch": 1.7539627842866987, "grad_norm": 2.1542866230010986, "learning_rate": 2.27028677267789e-09, "logits/chosen": -2.918034553527832, "logits/rejected": -2.898874282836914, "logps/chosen": -63.1301155090332, "logps/rejected": -66.78999328613281, "loss": 0.6772, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09454191476106644, "rewards/margins": 0.03478521481156349, "rewards/rejected": -0.12932713329792023, "step": 5090 }, { "epoch": 1.757408683666437, "grad_norm": 2.031991720199585, "learning_rate": 2.2080853058216274e-09, "logits/chosen": -2.960707426071167, "logits/rejected": -2.938732385635376, "logps/chosen": -61.773902893066406, "logps/rejected": -64.69305419921875, "loss": 0.6774, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.10084456205368042, "rewards/margins": 0.03442123904824257, "rewards/rejected": -0.1352657973766327, "step": 5100 }, { "epoch": 1.757408683666437, "eval_logits/chosen": -3.0444910526275635, "eval_logits/rejected": -3.0386948585510254, "eval_logps/chosen": -64.86609649658203, "eval_logps/rejected": -71.0175552368164, "eval_loss": 0.685597836971283, "eval_rewards/accuracies": 0.5938661694526672, "eval_rewards/chosen": -0.0615420788526535, "eval_rewards/margins": 0.01683231070637703, "eval_rewards/rejected": -0.07837438583374023, "eval_runtime": 383.0373, "eval_samples_per_second": 11.237, "eval_steps_per_second": 1.405, "step": 5100 }, { "epoch": 1.760854583046175, "grad_norm": 2.0644540786743164, "learning_rate": 2.1467084311135226e-09, "logits/chosen": -2.9693267345428467, "logits/rejected": -2.9648232460021973, "logps/chosen": -61.736297607421875, "logps/rejected": -67.93791198730469, "loss": 0.6782, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.10537086427211761, "rewards/margins": 0.032763343304395676, "rewards/rejected": -0.1381342113018036, "step": 5110 }, { "epoch": 1.7643004824259132, "grad_norm": 2.284945011138916, "learning_rate": 2.0861583691168637e-09, "logits/chosen": -2.8928122520446777, "logits/rejected": -2.866619110107422, "logps/chosen": -62.84453582763672, "logps/rejected": -66.07553100585938, "loss": 0.6769, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10395550727844238, "rewards/margins": 0.035675663501024246, "rewards/rejected": -0.13963118195533752, "step": 5120 }, { "epoch": 1.7677463818056514, "grad_norm": 2.1467080116271973, "learning_rate": 2.0264373104815602e-09, "logits/chosen": -2.9610791206359863, "logits/rejected": -2.9423394203186035, "logps/chosen": -65.15470123291016, "logps/rejected": -66.92606353759766, "loss": 0.6769, "rewards/accuracies": 0.640625, "rewards/chosen": -0.10134553909301758, "rewards/margins": 0.035156022757291794, "rewards/rejected": -0.13650156557559967, "step": 5130 }, { "epoch": 1.7711922811853893, "grad_norm": 2.283031463623047, "learning_rate": 1.967547415864862e-09, "logits/chosen": -2.924994945526123, "logits/rejected": -2.9106478691101074, "logps/chosen": -65.27745819091797, "logps/rejected": -68.17292785644531, "loss": 0.6774, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10397525876760483, "rewards/margins": 0.03433610126376152, "rewards/rejected": -0.13831135630607605, "step": 5140 }, { "epoch": 1.7746381805651275, "grad_norm": 2.338268756866455, "learning_rate": 1.909490815853232e-09, "logits/chosen": -2.9108078479766846, "logits/rejected": -2.9008777141571045, "logps/chosen": -63.09922409057617, "logps/rejected": -68.67888641357422, "loss": 0.6767, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.10057171434164047, "rewards/margins": 0.03636406734585762, "rewards/rejected": -0.1369357705116272, "step": 5150 }, { "epoch": 1.7780840799448656, "grad_norm": 2.083338499069214, "learning_rate": 1.8522696108852348e-09, "logits/chosen": -2.9106225967407227, "logits/rejected": -2.8816580772399902, "logps/chosen": -66.49388122558594, "logps/rejected": -66.64405059814453, "loss": 0.6738, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.09981705248355865, "rewards/margins": 0.041750866919755936, "rewards/rejected": -0.1415679156780243, "step": 5160 }, { "epoch": 1.7815299793246038, "grad_norm": 2.3757216930389404, "learning_rate": 1.795885871175537e-09, "logits/chosen": -2.967358112335205, "logits/rejected": -2.9483819007873535, "logps/chosen": -64.157958984375, "logps/rejected": -68.16368865966797, "loss": 0.6767, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10042033344507217, "rewards/margins": 0.03579426556825638, "rewards/rejected": -0.13621459901332855, "step": 5170 }, { "epoch": 1.784975878704342, "grad_norm": 2.0755326747894287, "learning_rate": 1.7403416366400385e-09, "logits/chosen": -2.9863715171813965, "logits/rejected": -2.9560065269470215, "logps/chosen": -64.21257781982422, "logps/rejected": -64.93058776855469, "loss": 0.6755, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0919819101691246, "rewards/margins": 0.03802384436130524, "rewards/rejected": -0.13000576198101044, "step": 5180 }, { "epoch": 1.7884217780840799, "grad_norm": 2.1907148361206055, "learning_rate": 1.6856389168220547e-09, "logits/chosen": -2.8582282066345215, "logits/rejected": -2.836745262145996, "logps/chosen": -62.414825439453125, "logps/rejected": -66.34150695800781, "loss": 0.6783, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.10486672073602676, "rewards/margins": 0.0326237753033638, "rewards/rejected": -0.13749048113822937, "step": 5190 }, { "epoch": 1.791867677463818, "grad_norm": 2.2690272331237793, "learning_rate": 1.6317796908195985e-09, "logits/chosen": -2.9531779289245605, "logits/rejected": -2.9294018745422363, "logps/chosen": -64.02655792236328, "logps/rejected": -68.70619201660156, "loss": 0.6748, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.10051832348108292, "rewards/margins": 0.03954003006219864, "rewards/rejected": -0.14005833864212036, "step": 5200 }, { "epoch": 1.791867677463818, "eval_logits/chosen": -3.0440940856933594, "eval_logits/rejected": -3.038301467895508, "eval_logps/chosen": -64.87361145019531, "eval_logps/rejected": -71.03768920898438, "eval_loss": 0.6855441331863403, "eval_rewards/accuracies": 0.5938661694526672, "eval_rewards/chosen": -0.06161721795797348, "eval_rewards/margins": 0.016958480700850487, "eval_rewards/rejected": -0.07857570797204971, "eval_runtime": 382.8884, "eval_samples_per_second": 11.241, "eval_steps_per_second": 1.405, "step": 5200 }, { "epoch": 1.7953135768435562, "grad_norm": 2.0647106170654297, "learning_rate": 1.5787659072137944e-09, "logits/chosen": -2.946464776992798, "logits/rejected": -2.913212299346924, "logps/chosen": -64.98263549804688, "logps/rejected": -67.13221740722656, "loss": 0.6751, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09663587808609009, "rewards/margins": 0.03867806866765022, "rewards/rejected": -0.13531394302845, "step": 5210 }, { "epoch": 1.7987594762232941, "grad_norm": 2.2280561923980713, "learning_rate": 1.5265994839983893e-09, "logits/chosen": -3.0170464515686035, "logits/rejected": -3.0034279823303223, "logps/chosen": -66.4332504272461, "logps/rejected": -68.14265441894531, "loss": 0.68, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1006295308470726, "rewards/margins": 0.02905629202723503, "rewards/rejected": -0.12968583405017853, "step": 5220 }, { "epoch": 1.8022053756030325, "grad_norm": 2.3320565223693848, "learning_rate": 1.4752823085103476e-09, "logits/chosen": -2.971269130706787, "logits/rejected": -2.9558329582214355, "logps/chosen": -63.98261642456055, "logps/rejected": -68.88826751708984, "loss": 0.6762, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09884113073348999, "rewards/margins": 0.03671208769083023, "rewards/rejected": -0.13555321097373962, "step": 5230 }, { "epoch": 1.8056512749827704, "grad_norm": 2.208251953125, "learning_rate": 1.4248162373615536e-09, "logits/chosen": -2.8929576873779297, "logits/rejected": -2.862753391265869, "logps/chosen": -63.76152801513672, "logps/rejected": -65.0456771850586, "loss": 0.6765, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0982288122177124, "rewards/margins": 0.03631594032049179, "rewards/rejected": -0.1345447450876236, "step": 5240 }, { "epoch": 1.8090971743625086, "grad_norm": 2.184211015701294, "learning_rate": 1.37520309637168e-09, "logits/chosen": -2.9574813842773438, "logits/rejected": -2.9355812072753906, "logps/chosen": -62.548095703125, "logps/rejected": -67.11660766601562, "loss": 0.6749, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09293146431446075, "rewards/margins": 0.03991398215293884, "rewards/rejected": -0.1328454464673996, "step": 5250 }, { "epoch": 1.8125430737422468, "grad_norm": 2.1589388847351074, "learning_rate": 1.326444680502098e-09, "logits/chosen": -2.974087715148926, "logits/rejected": -2.9640023708343506, "logps/chosen": -63.1976203918457, "logps/rejected": -67.73504638671875, "loss": 0.6759, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.10160807520151138, "rewards/margins": 0.03716858848929405, "rewards/rejected": -0.13877665996551514, "step": 5260 }, { "epoch": 1.8159889731219847, "grad_norm": 2.1708314418792725, "learning_rate": 1.2785427537909481e-09, "logits/chosen": -2.9062724113464355, "logits/rejected": -2.882266044616699, "logps/chosen": -63.059059143066406, "logps/rejected": -64.22677612304688, "loss": 0.6784, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.10223253071308136, "rewards/margins": 0.03243548795580864, "rewards/rejected": -0.1346680223941803, "step": 5270 }, { "epoch": 1.819434872501723, "grad_norm": 2.188985586166382, "learning_rate": 1.2314990492893278e-09, "logits/chosen": -2.9831230640411377, "logits/rejected": -2.9730265140533447, "logps/chosen": -65.0916976928711, "logps/rejected": -68.92658996582031, "loss": 0.6781, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.10400126874446869, "rewards/margins": 0.033252011984586716, "rewards/rejected": -0.1372532844543457, "step": 5280 }, { "epoch": 1.822880771881461, "grad_norm": 2.0532279014587402, "learning_rate": 1.185315268998574e-09, "logits/chosen": -2.9915385246276855, "logits/rejected": -2.9743523597717285, "logps/chosen": -64.66096496582031, "logps/rejected": -68.3445053100586, "loss": 0.6768, "rewards/accuracies": 0.671875, "rewards/chosen": -0.10006566345691681, "rewards/margins": 0.035489775240421295, "rewards/rejected": -0.1355554461479187, "step": 5290 }, { "epoch": 1.8263266712611992, "grad_norm": 2.076235055923462, "learning_rate": 1.1399930838086962e-09, "logits/chosen": -2.983145236968994, "logits/rejected": -2.957339286804199, "logps/chosen": -64.48441314697266, "logps/rejected": -66.11888122558594, "loss": 0.6761, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.10090861469507217, "rewards/margins": 0.03716624155640602, "rewards/rejected": -0.1380748599767685, "step": 5300 }, { "epoch": 1.8263266712611992, "eval_logits/chosen": -3.043860912322998, "eval_logits/rejected": -3.038022518157959, "eval_logps/chosen": -64.87781524658203, "eval_logps/rejected": -71.046875, "eval_loss": 0.685516357421875, "eval_rewards/accuracies": 0.5950278639793396, "eval_rewards/chosen": -0.06165924295783043, "eval_rewards/margins": 0.017008250579237938, "eval_rewards/rejected": -0.07866749167442322, "eval_runtime": 382.9671, "eval_samples_per_second": 11.239, "eval_steps_per_second": 1.405, "step": 5300 }, { "epoch": 1.8297725706409373, "grad_norm": 2.3067526817321777, "learning_rate": 1.095534133437928e-09, "logits/chosen": -2.9042887687683105, "logits/rejected": -2.887679100036621, "logps/chosen": -62.7168083190918, "logps/rejected": -65.52967834472656, "loss": 0.6788, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10282865911722183, "rewards/margins": 0.03149380534887314, "rewards/rejected": -0.13432244956493378, "step": 5310 }, { "epoch": 1.8332184700206753, "grad_norm": 2.2261345386505127, "learning_rate": 1.051940026373399e-09, "logits/chosen": -2.9474563598632812, "logits/rejected": -2.9266719818115234, "logps/chosen": -64.75906372070312, "logps/rejected": -67.07856750488281, "loss": 0.6768, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10096816718578339, "rewards/margins": 0.03607845678925514, "rewards/rejected": -0.13704662024974823, "step": 5320 }, { "epoch": 1.8366643694004137, "grad_norm": 2.0321414470672607, "learning_rate": 1.0092123398129343e-09, "logits/chosen": -2.9617254734039307, "logits/rejected": -2.944420337677002, "logps/chosen": -64.52486419677734, "logps/rejected": -68.56999206542969, "loss": 0.6761, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.10365060716867447, "rewards/margins": 0.0370609350502491, "rewards/rejected": -0.14071156084537506, "step": 5330 }, { "epoch": 1.8401102687801516, "grad_norm": 2.1324450969696045, "learning_rate": 9.673526196080029e-10, "logits/chosen": -2.952425003051758, "logits/rejected": -2.9208550453186035, "logps/chosen": -64.861328125, "logps/rejected": -65.89351654052734, "loss": 0.6745, "rewards/accuracies": 0.625, "rewards/chosen": -0.10647694021463394, "rewards/margins": 0.04073434695601463, "rewards/rejected": -0.14721128344535828, "step": 5340 }, { "epoch": 1.8435561681598898, "grad_norm": 1.9503605365753174, "learning_rate": 9.263623802078014e-10, "logits/chosen": -2.9817001819610596, "logits/rejected": -2.958287000656128, "logps/chosen": -64.78105926513672, "logps/rejected": -66.6141357421875, "loss": 0.6775, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09822291880846024, "rewards/margins": 0.034339554607868195, "rewards/rejected": -0.13256247341632843, "step": 5350 }, { "epoch": 1.847002067539628, "grad_norm": 2.1737399101257324, "learning_rate": 8.862431046044172e-10, "logits/chosen": -2.934706449508667, "logits/rejected": -2.904364824295044, "logps/chosen": -63.380645751953125, "logps/rejected": -64.81143951416016, "loss": 0.6767, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.10364832729101181, "rewards/margins": 0.0358717143535614, "rewards/rejected": -0.13952001929283142, "step": 5360 }, { "epoch": 1.8504479669193659, "grad_norm": 2.612893581390381, "learning_rate": 8.469962442792355e-10, "logits/chosen": -2.9706287384033203, "logits/rejected": -2.947335720062256, "logps/chosen": -64.75955200195312, "logps/rejected": -66.10575866699219, "loss": 0.6788, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.10434827953577042, "rewards/margins": 0.03156623989343643, "rewards/rejected": -0.13591453433036804, "step": 5370 }, { "epoch": 1.853893866299104, "grad_norm": 2.0904970169067383, "learning_rate": 8.086232191503839e-10, "logits/chosen": -2.9142327308654785, "logits/rejected": -2.8894944190979004, "logps/chosen": -62.5820198059082, "logps/rejected": -66.3860855102539, "loss": 0.6748, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.09712487459182739, "rewards/margins": 0.039482034742832184, "rewards/rejected": -0.13660690188407898, "step": 5380 }, { "epoch": 1.8573397656788422, "grad_norm": 2.241687297821045, "learning_rate": 7.711254175213705e-10, "logits/chosen": -2.968531847000122, "logits/rejected": -2.951683759689331, "logps/chosen": -64.6425552368164, "logps/rejected": -67.84783935546875, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09521899372339249, "rewards/margins": 0.03488728031516075, "rewards/rejected": -0.13010628521442413, "step": 5390 }, { "epoch": 1.8607856650585803, "grad_norm": 2.245168685913086, "learning_rate": 7.345041960308663e-10, "logits/chosen": -3.0399162769317627, "logits/rejected": -3.0135927200317383, "logps/chosen": -61.74034881591797, "logps/rejected": -67.44328308105469, "loss": 0.6738, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09590132534503937, "rewards/margins": 0.041902266442775726, "rewards/rejected": -0.1378035843372345, "step": 5400 }, { "epoch": 1.8607856650585803, "eval_logits/chosen": -3.043799638748169, "eval_logits/rejected": -3.038034200668335, "eval_logps/chosen": -64.88846588134766, "eval_logps/rejected": -71.0632553100586, "eval_loss": 0.6854896545410156, "eval_rewards/accuracies": 0.5985130071640015, "eval_rewards/chosen": -0.06176569312810898, "eval_rewards/margins": 0.017065657302737236, "eval_rewards/rejected": -0.07883134484291077, "eval_runtime": 383.0482, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.405, "step": 5400 }, { "epoch": 1.8642315644383185, "grad_norm": 1.990185022354126, "learning_rate": 6.987608796036132e-10, "logits/chosen": -2.9385836124420166, "logits/rejected": -2.9050357341766357, "logps/chosen": -64.62165832519531, "logps/rejected": -64.18122863769531, "loss": 0.6744, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.09384270012378693, "rewards/margins": 0.0403972789645195, "rewards/rejected": -0.13424000144004822, "step": 5410 }, { "epoch": 1.8676774638180564, "grad_norm": 2.0287582874298096, "learning_rate": 6.638967614024937e-10, "logits/chosen": -2.953791856765747, "logits/rejected": -2.924999713897705, "logps/chosen": -62.311973571777344, "logps/rejected": -65.3152847290039, "loss": 0.6781, "rewards/accuracies": 0.625, "rewards/chosen": -0.09908489882946014, "rewards/margins": 0.033005110919475555, "rewards/rejected": -0.1320900171995163, "step": 5420 }, { "epoch": 1.8711233631977946, "grad_norm": 2.1748769283294678, "learning_rate": 6.299131027817401e-10, "logits/chosen": -2.963130235671997, "logits/rejected": -2.9388515949249268, "logps/chosen": -62.30155563354492, "logps/rejected": -66.90140533447266, "loss": 0.6753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09591764956712723, "rewards/margins": 0.038752518594264984, "rewards/rejected": -0.1346701681613922, "step": 5430 }, { "epoch": 1.8745692625775328, "grad_norm": 2.203587770462036, "learning_rate": 5.968111332413095e-10, "logits/chosen": -2.9019947052001953, "logits/rejected": -2.8894219398498535, "logps/chosen": -62.104217529296875, "logps/rejected": -67.97303771972656, "loss": 0.6753, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09756793826818466, "rewards/margins": 0.03838181495666504, "rewards/rejected": -0.1359497606754303, "step": 5440 }, { "epoch": 1.8780151619572707, "grad_norm": 2.005970001220703, "learning_rate": 5.645920503823898e-10, "logits/chosen": -3.024892568588257, "logits/rejected": -2.9996724128723145, "logps/chosen": -63.72404861450195, "logps/rejected": -65.83196258544922, "loss": 0.677, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09647668153047562, "rewards/margins": 0.03470000997185707, "rewards/rejected": -0.13117669522762299, "step": 5450 }, { "epoch": 1.881461061337009, "grad_norm": 2.098788022994995, "learning_rate": 5.332570198640779e-10, "logits/chosen": -2.9300484657287598, "logits/rejected": -2.9011282920837402, "logps/chosen": -65.2178955078125, "logps/rejected": -67.35456848144531, "loss": 0.6729, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09465857595205307, "rewards/margins": 0.04338352754712105, "rewards/rejected": -0.13804210722446442, "step": 5460 }, { "epoch": 1.884906960716747, "grad_norm": 2.382202625274658, "learning_rate": 5.028071753612167e-10, "logits/chosen": -2.869131565093994, "logits/rejected": -2.8438637256622314, "logps/chosen": -65.35198974609375, "logps/rejected": -66.67762756347656, "loss": 0.678, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10016383975744247, "rewards/margins": 0.033315885812044144, "rewards/rejected": -0.13347972929477692, "step": 5470 }, { "epoch": 1.8883528600964852, "grad_norm": 2.279611587524414, "learning_rate": 4.73243618523353e-10, "logits/chosen": -2.9345710277557373, "logits/rejected": -2.9160847663879395, "logps/chosen": -64.69429779052734, "logps/rejected": -67.59037780761719, "loss": 0.6772, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10485859960317612, "rewards/margins": 0.03470388427376747, "rewards/rejected": -0.1395624727010727, "step": 5480 }, { "epoch": 1.8917987594762233, "grad_norm": 2.3026857376098633, "learning_rate": 4.4456741893491023e-10, "logits/chosen": -2.977466583251953, "logits/rejected": -2.9550909996032715, "logps/chosen": -65.40562438964844, "logps/rejected": -68.69434356689453, "loss": 0.6757, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09687596559524536, "rewards/margins": 0.03861897066235542, "rewards/rejected": -0.13549493253231049, "step": 5490 }, { "epoch": 1.8952446588559613, "grad_norm": 2.6653425693511963, "learning_rate": 4.1677961407647345e-10, "logits/chosen": -2.9842467308044434, "logits/rejected": -2.9766757488250732, "logps/chosen": -63.60997772216797, "logps/rejected": -68.15296173095703, "loss": 0.6821, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.10921657085418701, "rewards/margins": 0.024912817403674126, "rewards/rejected": -0.13412940502166748, "step": 5500 }, { "epoch": 1.8952446588559613, "eval_logits/chosen": -3.043705701828003, "eval_logits/rejected": -3.037916421890259, "eval_logps/chosen": -64.89189147949219, "eval_logps/rejected": -71.06378936767578, "eval_loss": 0.6855042576789856, "eval_rewards/accuracies": 0.5934014916419983, "eval_rewards/chosen": -0.06179996207356453, "eval_rewards/margins": 0.01703677512705326, "eval_rewards/rejected": -0.07883673906326294, "eval_runtime": 382.8045, "eval_samples_per_second": 11.243, "eval_steps_per_second": 1.405, "step": 5500 }, { "epoch": 1.8986905582356997, "grad_norm": 2.232734203338623, "learning_rate": 3.8988120928726274e-10, "logits/chosen": -3.009869337081909, "logits/rejected": -2.984719753265381, "logps/chosen": -63.23027801513672, "logps/rejected": -65.56352233886719, "loss": 0.6737, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09125113487243652, "rewards/margins": 0.0418318547308445, "rewards/rejected": -0.13308298587799072, "step": 5510 }, { "epoch": 1.9021364576154376, "grad_norm": 2.130568504333496, "learning_rate": 3.6387317772875457e-10, "logits/chosen": -2.999778985977173, "logits/rejected": -2.9881069660186768, "logps/chosen": -65.87593841552734, "logps/rejected": -69.14570617675781, "loss": 0.6753, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10248447954654694, "rewards/margins": 0.03881872445344925, "rewards/rejected": -0.1413031965494156, "step": 5520 }, { "epoch": 1.9055823569951758, "grad_norm": 2.2380268573760986, "learning_rate": 3.3875646034947634e-10, "logits/chosen": -2.9597513675689697, "logits/rejected": -2.928396224975586, "logps/chosen": -62.51630401611328, "logps/rejected": -67.06792449951172, "loss": 0.6739, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09396801888942719, "rewards/margins": 0.04169601574540138, "rewards/rejected": -0.13566403090953827, "step": 5530 }, { "epoch": 1.909028256374914, "grad_norm": 2.1978256702423096, "learning_rate": 3.145319658509699e-10, "logits/chosen": -2.9985365867614746, "logits/rejected": -2.961423635482788, "logps/chosen": -63.85062789916992, "logps/rejected": -65.30555725097656, "loss": 0.6709, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09511172026395798, "rewards/margins": 0.04764672368764877, "rewards/rejected": -0.14275844395160675, "step": 5540 }, { "epoch": 1.9124741557546519, "grad_norm": 2.0679409503936768, "learning_rate": 2.9120057065490365e-10, "logits/chosen": -2.8995628356933594, "logits/rejected": -2.882148265838623, "logps/chosen": -64.40314483642578, "logps/rejected": -66.68975830078125, "loss": 0.6783, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10052362829446793, "rewards/margins": 0.032626282423734665, "rewards/rejected": -0.1331499069929123, "step": 5550 }, { "epoch": 1.9159200551343902, "grad_norm": 2.1227855682373047, "learning_rate": 2.687631188713735e-10, "logits/chosen": -3.0305912494659424, "logits/rejected": -3.0032315254211426, "logps/chosen": -67.49004364013672, "logps/rejected": -68.33811950683594, "loss": 0.675, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.10781770944595337, "rewards/margins": 0.039582930505275726, "rewards/rejected": -0.1474006474018097, "step": 5560 }, { "epoch": 1.9193659545141282, "grad_norm": 2.0987863540649414, "learning_rate": 2.4722042226835993e-10, "logits/chosen": -2.889064073562622, "logits/rejected": -2.8677499294281006, "logps/chosen": -65.31095886230469, "logps/rejected": -65.89826965332031, "loss": 0.6793, "rewards/accuracies": 0.625, "rewards/chosen": -0.10466232150793076, "rewards/margins": 0.0305057130753994, "rewards/rejected": -0.13516804575920105, "step": 5570 }, { "epoch": 1.9228118538938663, "grad_norm": 2.0254909992218018, "learning_rate": 2.2657326024235755e-10, "logits/chosen": -3.000871419906616, "logits/rejected": -2.9850192070007324, "logps/chosen": -64.88275909423828, "logps/rejected": -66.03553771972656, "loss": 0.6822, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.09776648133993149, "rewards/margins": 0.02495993860065937, "rewards/rejected": -0.12272641807794571, "step": 5580 }, { "epoch": 1.9262577532736045, "grad_norm": 2.1231415271759033, "learning_rate": 2.0682237979018636e-10, "logits/chosen": -2.937507152557373, "logits/rejected": -2.9114415645599365, "logps/chosen": -65.44194030761719, "logps/rejected": -63.282936096191406, "loss": 0.6772, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.09877943992614746, "rewards/margins": 0.03481840714812279, "rewards/rejected": -0.13359785079956055, "step": 5590 }, { "epoch": 1.9297036526533424, "grad_norm": 2.185314893722534, "learning_rate": 1.8796849548195215e-10, "logits/chosen": -3.00420880317688, "logits/rejected": -2.9785873889923096, "logps/chosen": -63.3000373840332, "logps/rejected": -66.32828521728516, "loss": 0.6724, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.09202561527490616, "rewards/margins": 0.04433347284793854, "rewards/rejected": -0.1363590806722641, "step": 5600 }, { "epoch": 1.9297036526533424, "eval_logits/chosen": -3.0436623096466064, "eval_logits/rejected": -3.037909507751465, "eval_logps/chosen": -64.89786529541016, "eval_logps/rejected": -71.06352996826172, "eval_loss": 0.6855349540710449, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.06185971945524216, "eval_rewards/margins": 0.016974303871393204, "eval_rewards/rejected": -0.07883401960134506, "eval_runtime": 383.1627, "eval_samples_per_second": 11.233, "eval_steps_per_second": 1.404, "step": 5600 }, { "epoch": 1.9331495520330806, "grad_norm": 2.23110032081604, "learning_rate": 1.7001228943520075e-10, "logits/chosen": -2.909824848175049, "logits/rejected": -2.8948612213134766, "logps/chosen": -61.312904357910156, "logps/rejected": -68.531982421875, "loss": 0.6767, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.10082676261663437, "rewards/margins": 0.03634323552250862, "rewards/rejected": -0.1371699869632721, "step": 5610 }, { "epoch": 1.9365954514128187, "grad_norm": 2.072162628173828, "learning_rate": 1.5295441129024312e-10, "logits/chosen": -2.9221153259277344, "logits/rejected": -2.9079785346984863, "logps/chosen": -65.98023986816406, "logps/rejected": -65.65779113769531, "loss": 0.6825, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.1077071875333786, "rewards/margins": 0.024027761071920395, "rewards/rejected": -0.1317349374294281, "step": 5620 }, { "epoch": 1.940041350792557, "grad_norm": 2.2267067432403564, "learning_rate": 1.3679547818664927e-10, "logits/chosen": -2.920483112335205, "logits/rejected": -2.8899428844451904, "logps/chosen": -65.67730712890625, "logps/rejected": -66.02433013916016, "loss": 0.6754, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09616454690694809, "rewards/margins": 0.03814741224050522, "rewards/rejected": -0.1343119889497757, "step": 5630 }, { "epoch": 1.943487250172295, "grad_norm": 2.207374095916748, "learning_rate": 1.2153607474091332e-10, "logits/chosen": -2.9759020805358887, "logits/rejected": -2.9456405639648438, "logps/chosen": -65.62007141113281, "logps/rejected": -65.35276794433594, "loss": 0.6748, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.09680736809968948, "rewards/margins": 0.039464615285396576, "rewards/rejected": -0.13627198338508606, "step": 5640 }, { "epoch": 1.946933149552033, "grad_norm": 2.3040826320648193, "learning_rate": 1.0717675302531482e-10, "logits/chosen": -2.946089029312134, "logits/rejected": -2.925248861312866, "logps/chosen": -68.79296875, "logps/rejected": -67.72799682617188, "loss": 0.6776, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.09351503849029541, "rewards/margins": 0.03450186923146248, "rewards/rejected": -0.1280169039964676, "step": 5650 }, { "epoch": 1.9503790489317712, "grad_norm": 2.2648158073425293, "learning_rate": 9.371803254794308e-11, "logits/chosen": -3.002150297164917, "logits/rejected": -2.984257459640503, "logps/chosen": -62.06635665893555, "logps/rejected": -66.76404571533203, "loss": 0.6714, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09374018013477325, "rewards/margins": 0.046740688383579254, "rewards/rejected": -0.1404808610677719, "step": 5660 }, { "epoch": 1.9538249483115093, "grad_norm": 2.20288348197937, "learning_rate": 8.116040023388448e-11, "logits/chosen": -2.9187135696411133, "logits/rejected": -2.8906636238098145, "logps/chosen": -64.13285064697266, "logps/rejected": -65.82918548583984, "loss": 0.6736, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1016378179192543, "rewards/margins": 0.04218579828739166, "rewards/rejected": -0.14382360875606537, "step": 5670 }, { "epoch": 1.9572708476912473, "grad_norm": 2.0635271072387695, "learning_rate": 6.950431040763371e-11, "logits/chosen": -2.9942047595977783, "logits/rejected": -2.9737515449523926, "logps/chosen": -65.24699401855469, "logps/rejected": -66.28753662109375, "loss": 0.6781, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10666315257549286, "rewards/margins": 0.033177699893713, "rewards/rejected": -0.13984087109565735, "step": 5680 }, { "epoch": 1.9607167470709856, "grad_norm": 2.0154037475585938, "learning_rate": 5.875018477663752e-11, "logits/chosen": -2.9185800552368164, "logits/rejected": -2.8994216918945312, "logps/chosen": -64.3661880493164, "logps/rejected": -65.51478576660156, "loss": 0.6779, "rewards/accuracies": 0.609375, "rewards/chosen": -0.10311891883611679, "rewards/margins": 0.03355059400200844, "rewards/rejected": -0.13666951656341553, "step": 5690 }, { "epoch": 1.9641626464507236, "grad_norm": 2.2101991176605225, "learning_rate": 4.8898412416040203e-11, "logits/chosen": -2.9987101554870605, "logits/rejected": -2.971622943878174, "logps/chosen": -63.42491912841797, "logps/rejected": -67.65690612792969, "loss": 0.6745, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09306361526250839, "rewards/margins": 0.04037128761410713, "rewards/rejected": -0.13343490660190582, "step": 5700 }, { "epoch": 1.9641626464507236, "eval_logits/chosen": -3.0438084602355957, "eval_logits/rejected": -3.037998914718628, "eval_logps/chosen": -64.90370178222656, "eval_logps/rejected": -71.07880401611328, "eval_loss": 0.685490071773529, "eval_rewards/accuracies": 0.595724880695343, "eval_rewards/chosen": -0.06191807612776756, "eval_rewards/margins": 0.01706887222826481, "eval_rewards/rejected": -0.07898694276809692, "eval_runtime": 382.8267, "eval_samples_per_second": 11.243, "eval_steps_per_second": 1.405, "step": 5700 }, { "epoch": 1.9676085458304617, "grad_norm": 2.2737042903900146, "learning_rate": 3.994934975461439e-11, "logits/chosen": -2.9715285301208496, "logits/rejected": -2.9414966106414795, "logps/chosen": -64.19334411621094, "logps/rejected": -65.5436782836914, "loss": 0.6778, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.09507995843887329, "rewards/margins": 0.033717669546604156, "rewards/rejected": -0.12879762053489685, "step": 5710 }, { "epoch": 1.9710544452102, "grad_norm": 2.0696685314178467, "learning_rate": 3.190332056186018e-11, "logits/chosen": -2.9221713542938232, "logits/rejected": -2.9031994342803955, "logps/chosen": -64.0558853149414, "logps/rejected": -65.50578308105469, "loss": 0.6817, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.10295647382736206, "rewards/margins": 0.02570858970284462, "rewards/rejected": -0.12866505980491638, "step": 5720 }, { "epoch": 1.9745003445899378, "grad_norm": 2.249941110610962, "learning_rate": 2.4760615936289532e-11, "logits/chosen": -2.905991315841675, "logits/rejected": -2.884575366973877, "logps/chosen": -64.17007446289062, "logps/rejected": -67.10668182373047, "loss": 0.6746, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10047076642513275, "rewards/margins": 0.04041288048028946, "rewards/rejected": -0.1408836394548416, "step": 5730 }, { "epoch": 1.9779462439696762, "grad_norm": 2.211496353149414, "learning_rate": 1.8521494294898578e-11, "logits/chosen": -2.997009754180908, "logits/rejected": -2.9652652740478516, "logps/chosen": -66.19706726074219, "logps/rejected": -64.80754089355469, "loss": 0.6729, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09728576242923737, "rewards/margins": 0.04374868795275688, "rewards/rejected": -0.14103445410728455, "step": 5740 }, { "epoch": 1.9813921433494142, "grad_norm": 1.8820128440856934, "learning_rate": 1.318618136381955e-11, "logits/chosen": -2.9602997303009033, "logits/rejected": -2.9354488849639893, "logps/chosen": -62.84728240966797, "logps/rejected": -63.922279357910156, "loss": 0.6815, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -0.10357487201690674, "rewards/margins": 0.026113441213965416, "rewards/rejected": -0.1296883076429367, "step": 5750 }, { "epoch": 1.9848380427291523, "grad_norm": 2.149010181427002, "learning_rate": 8.75487017014953e-12, "logits/chosen": -2.9395554065704346, "logits/rejected": -2.9065656661987305, "logps/chosen": -66.78532409667969, "logps/rejected": -65.26570129394531, "loss": 0.6724, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.09145772457122803, "rewards/margins": 0.04497409611940384, "rewards/rejected": -0.13643182814121246, "step": 5760 }, { "epoch": 1.9882839421088905, "grad_norm": 2.0296709537506104, "learning_rate": 5.227721034969934e-12, "logits/chosen": -2.9788923263549805, "logits/rejected": -2.9554014205932617, "logps/chosen": -61.77091598510742, "logps/rejected": -65.63267517089844, "loss": 0.6753, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0966632291674614, "rewards/margins": 0.038477830588817596, "rewards/rejected": -0.1351410448551178, "step": 5770 }, { "epoch": 1.9917298414886284, "grad_norm": 2.1751058101654053, "learning_rate": 2.6048615675483555e-12, "logits/chosen": -2.9429450035095215, "logits/rejected": -2.9181408882141113, "logps/chosen": -63.3074836730957, "logps/rejected": -65.84387969970703, "loss": 0.6777, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.10151276737451553, "rewards/margins": 0.033877789974212646, "rewards/rejected": -0.13539054989814758, "step": 5780 }, { "epoch": 1.9951757408683668, "grad_norm": 2.299027919769287, "learning_rate": 8.863866607144999e-13, "logits/chosen": -2.9754512310028076, "logits/rejected": -2.9556355476379395, "logps/chosen": -66.12899780273438, "logps/rejected": -69.43649291992188, "loss": 0.6805, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -0.10944394022226334, "rewards/margins": 0.028050214052200317, "rewards/rejected": -0.13749414682388306, "step": 5790 }, { "epoch": 1.9986216402481047, "grad_norm": 2.1351449489593506, "learning_rate": 7.235848743236683e-14, "logits/chosen": -2.9115071296691895, "logits/rejected": -2.88586688041687, "logps/chosen": -64.3592758178711, "logps/rejected": -65.3360366821289, "loss": 0.6767, "rewards/accuracies": 0.640625, "rewards/chosen": -0.09383897483348846, "rewards/margins": 0.03549923375248909, "rewards/rejected": -0.12933818995952606, "step": 5800 }, { "epoch": 1.9986216402481047, "eval_logits/chosen": -3.0438718795776367, "eval_logits/rejected": -3.0380868911743164, "eval_logps/chosen": -64.8961181640625, "eval_logps/rejected": -71.05843353271484, "eval_loss": 0.6855542659759521, "eval_rewards/accuracies": 0.5954925417900085, "eval_rewards/chosen": -0.06184223294258118, "eval_rewards/margins": 0.016940835863351822, "eval_rewards/rejected": -0.0787830725312233, "eval_runtime": 383.0406, "eval_samples_per_second": 11.236, "eval_steps_per_second": 1.405, "step": 5800 }, { "epoch": 2.0, "step": 5804, "total_flos": 0.0, "train_loss": 0.6830481951767456, "train_runtime": 69306.1943, "train_samples_per_second": 2.68, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 5804, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }