{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 1, "global_step": 1180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01694915254237288, "grad_norm": 35.90666675768779, "learning_rate": 4.2372881355932205e-09, "logits/chosen": 14.802189826965332, "logits/rejected": 14.350337982177734, "logps/chosen": -11.634295463562012, "logps/rejected": -13.271549224853516, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03389830508474576, "grad_norm": 36.054608327135526, "learning_rate": 8.474576271186441e-09, "logits/chosen": 12.264768600463867, "logits/rejected": 12.60151195526123, "logps/chosen": -12.491461753845215, "logps/rejected": -14.43135929107666, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.05084745762711865, "grad_norm": 35.49641367727226, "learning_rate": 1.2711864406779661e-08, "logits/chosen": 16.878520965576172, "logits/rejected": 13.18266487121582, "logps/chosen": -9.232068061828613, "logps/rejected": -17.441871643066406, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0281001478433609, "rewards/margins": 0.0005521401762962341, "rewards/rejected": 0.027548007667064667, "step": 3 }, { "epoch": 0.06779661016949153, "grad_norm": 39.16805126128429, "learning_rate": 1.6949152542372882e-08, "logits/chosen": 13.905435562133789, "logits/rejected": 14.600114822387695, "logps/chosen": -8.362595558166504, "logps/rejected": -12.51268196105957, "loss": 0.7114, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00230209156870842, "rewards/margins": -0.0342477448284626, "rewards/rejected": 0.03194565325975418, "step": 4 }, { "epoch": 0.0847457627118644, "grad_norm": 35.99356386144667, "learning_rate": 2.11864406779661e-08, "logits/chosen": 11.287035942077637, "logits/rejected": 11.306232452392578, "logps/chosen": -11.428975105285645, "logps/rejected": -9.96834659576416, "loss": 0.6957, "rewards/accuracies": 0.3125, "rewards/chosen": -0.06366005539894104, "rewards/margins": -0.12454245984554291, "rewards/rejected": 0.06088239327073097, "step": 5 }, { "epoch": 0.1016949152542373, "grad_norm": 38.91747229948154, "learning_rate": 2.5423728813559323e-08, "logits/chosen": 9.630241394042969, "logits/rejected": 12.905715942382812, "logps/chosen": -11.144521713256836, "logps/rejected": -11.662797927856445, "loss": 0.7051, "rewards/accuracies": 0.3125, "rewards/chosen": -0.035100989043712616, "rewards/margins": -0.046221472322940826, "rewards/rejected": 0.01112048327922821, "step": 6 }, { "epoch": 0.11864406779661017, "grad_norm": 33.24376209141878, "learning_rate": 2.966101694915254e-08, "logits/chosen": 12.640701293945312, "logits/rejected": 15.201984405517578, "logps/chosen": -10.405349731445312, "logps/rejected": -12.381065368652344, "loss": 0.6935, "rewards/accuracies": 0.4375, "rewards/chosen": -0.043475911021232605, "rewards/margins": -0.03674769774079323, "rewards/rejected": -0.006728213280439377, "step": 7 }, { "epoch": 0.13559322033898305, "grad_norm": 33.93739002283182, "learning_rate": 3.3898305084745764e-08, "logits/chosen": 12.368184089660645, "logits/rejected": 10.674546241760254, "logps/chosen": -8.561328887939453, "logps/rejected": -13.179322242736816, "loss": 0.7008, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03725726902484894, "rewards/margins": 0.03952927142381668, "rewards/rejected": -0.002272002398967743, "step": 8 }, { "epoch": 0.15254237288135594, "grad_norm": 33.764877302185795, "learning_rate": 3.813559322033898e-08, "logits/chosen": 9.811960220336914, "logits/rejected": 12.911907196044922, "logps/chosen": -10.999805450439453, "logps/rejected": -9.398270606994629, "loss": 0.7076, "rewards/accuracies": 0.375, "rewards/chosen": -0.012884721159934998, "rewards/margins": -0.04326022416353226, "rewards/rejected": 0.03037550300359726, "step": 9 }, { "epoch": 0.1694915254237288, "grad_norm": 30.816455835047037, "learning_rate": 4.23728813559322e-08, "logits/chosen": 11.430095672607422, "logits/rejected": 14.928672790527344, "logps/chosen": -13.877382278442383, "logps/rejected": -10.65487289428711, "loss": 0.6892, "rewards/accuracies": 0.5625, "rewards/chosen": -0.011533252894878387, "rewards/margins": -0.014867149293422699, "rewards/rejected": 0.0033338963985443115, "step": 10 }, { "epoch": 0.1864406779661017, "grad_norm": 33.938752767793865, "learning_rate": 4.661016949152542e-08, "logits/chosen": 13.052696228027344, "logits/rejected": 10.422144889831543, "logps/chosen": -10.610231399536133, "logps/rejected": -17.128759384155273, "loss": 0.7095, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00851757824420929, "rewards/margins": 0.14825767278671265, "rewards/rejected": -0.15677525103092194, "step": 11 }, { "epoch": 0.2033898305084746, "grad_norm": 33.17188984628549, "learning_rate": 5.0847457627118645e-08, "logits/chosen": 12.457121849060059, "logits/rejected": 13.516885757446289, "logps/chosen": -8.226775169372559, "logps/rejected": -10.04336166381836, "loss": 0.6985, "rewards/accuracies": 0.4375, "rewards/chosen": -0.03804200887680054, "rewards/margins": -0.024549927562475204, "rewards/rejected": -0.013492081314325333, "step": 12 }, { "epoch": 0.22033898305084745, "grad_norm": 33.400283269891275, "learning_rate": 5.508474576271186e-08, "logits/chosen": 12.457138061523438, "logits/rejected": 12.47688102722168, "logps/chosen": -10.179587364196777, "logps/rejected": -10.717220306396484, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.02907254546880722, "rewards/margins": 0.004769522696733475, "rewards/rejected": 0.024303022772073746, "step": 13 }, { "epoch": 0.23728813559322035, "grad_norm": 34.76291522662287, "learning_rate": 5.932203389830508e-08, "logits/chosen": 10.563559532165527, "logits/rejected": 7.753628730773926, "logps/chosen": -12.341764450073242, "logps/rejected": -22.06093978881836, "loss": 0.7001, "rewards/accuracies": 0.5, "rewards/chosen": 0.03608255088329315, "rewards/margins": 0.0593097060918808, "rewards/rejected": -0.023227155208587646, "step": 14 }, { "epoch": 0.2542372881355932, "grad_norm": 34.19169656059493, "learning_rate": 6.35593220338983e-08, "logits/chosen": 17.89315414428711, "logits/rejected": 16.98625373840332, "logps/chosen": -8.502942085266113, "logps/rejected": -14.32985782623291, "loss": 0.6923, "rewards/accuracies": 0.4375, "rewards/chosen": -0.013201236724853516, "rewards/margins": -0.03858032077550888, "rewards/rejected": 0.025379084050655365, "step": 15 }, { "epoch": 0.2711864406779661, "grad_norm": 33.123595149389935, "learning_rate": 6.779661016949153e-08, "logits/chosen": 13.16607666015625, "logits/rejected": 13.377177238464355, "logps/chosen": -9.61911392211914, "logps/rejected": -10.953754425048828, "loss": 0.6818, "rewards/accuracies": 0.75, "rewards/chosen": 0.03972737491130829, "rewards/margins": 0.11253049969673157, "rewards/rejected": -0.07280312478542328, "step": 16 }, { "epoch": 0.288135593220339, "grad_norm": 33.75941443866429, "learning_rate": 7.203389830508475e-08, "logits/chosen": 14.967769622802734, "logits/rejected": 16.429962158203125, "logps/chosen": -7.799526691436768, "logps/rejected": -11.779705047607422, "loss": 0.6918, "rewards/accuracies": 0.375, "rewards/chosen": 0.038096603006124496, "rewards/margins": -0.038292884826660156, "rewards/rejected": 0.07638949155807495, "step": 17 }, { "epoch": 0.3050847457627119, "grad_norm": 35.23652082493691, "learning_rate": 7.627118644067796e-08, "logits/chosen": 12.490198135375977, "logits/rejected": 12.536822319030762, "logps/chosen": -7.3060431480407715, "logps/rejected": -8.912150382995605, "loss": 0.6897, "rewards/accuracies": 0.5625, "rewards/chosen": 0.040348462760448456, "rewards/margins": 0.032108671963214874, "rewards/rejected": 0.008239790797233582, "step": 18 }, { "epoch": 0.3220338983050847, "grad_norm": 38.14490304653045, "learning_rate": 8.050847457627117e-08, "logits/chosen": 15.213144302368164, "logits/rejected": 13.012377738952637, "logps/chosen": -6.454628944396973, "logps/rejected": -13.794097900390625, "loss": 0.6924, "rewards/accuracies": 0.75, "rewards/chosen": 0.032369788736104965, "rewards/margins": 0.0747758001089096, "rewards/rejected": -0.04240601509809494, "step": 19 }, { "epoch": 0.3389830508474576, "grad_norm": 32.64981912960275, "learning_rate": 8.47457627118644e-08, "logits/chosen": 12.59264087677002, "logits/rejected": 13.965180397033691, "logps/chosen": -11.896605491638184, "logps/rejected": -10.930459976196289, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04358021169900894, "rewards/margins": -0.07576116919517517, "rewards/rejected": 0.03218095749616623, "step": 20 }, { "epoch": 0.3559322033898305, "grad_norm": 32.48889752101817, "learning_rate": 8.898305084745762e-08, "logits/chosen": 14.423205375671387, "logits/rejected": 11.800539016723633, "logps/chosen": -9.143863677978516, "logps/rejected": -10.3524169921875, "loss": 0.7, "rewards/accuracies": 0.625, "rewards/chosen": -0.03891824558377266, "rewards/margins": 0.013888102024793625, "rewards/rejected": -0.052806347608566284, "step": 21 }, { "epoch": 0.3728813559322034, "grad_norm": 33.74637684508929, "learning_rate": 9.322033898305084e-08, "logits/chosen": 11.911141395568848, "logits/rejected": 13.506253242492676, "logps/chosen": -7.444547653198242, "logps/rejected": -10.66550350189209, "loss": 0.7014, "rewards/accuracies": 0.5, "rewards/chosen": 0.023162133991718292, "rewards/margins": -0.015818648040294647, "rewards/rejected": 0.03898078203201294, "step": 22 }, { "epoch": 0.3898305084745763, "grad_norm": 34.986738432295624, "learning_rate": 9.745762711864407e-08, "logits/chosen": 12.02149772644043, "logits/rejected": 17.192615509033203, "logps/chosen": -9.3740234375, "logps/rejected": -12.00601863861084, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": -0.015855148434638977, "rewards/margins": -0.013700824230909348, "rewards/rejected": -0.0021543242037296295, "step": 23 }, { "epoch": 0.4067796610169492, "grad_norm": 35.517675126988536, "learning_rate": 1.0169491525423729e-07, "logits/chosen": 14.339700698852539, "logits/rejected": 16.159269332885742, "logps/chosen": -10.706647872924805, "logps/rejected": -14.890447616577148, "loss": 0.6947, "rewards/accuracies": 0.625, "rewards/chosen": 0.018318627029657364, "rewards/margins": 0.04457775130867958, "rewards/rejected": -0.026259124279022217, "step": 24 }, { "epoch": 0.423728813559322, "grad_norm": 36.16238724964674, "learning_rate": 1.059322033898305e-07, "logits/chosen": 12.644755363464355, "logits/rejected": 14.913492202758789, "logps/chosen": -12.103818893432617, "logps/rejected": -14.517330169677734, "loss": 0.7023, "rewards/accuracies": 0.3125, "rewards/chosen": 0.03186403214931488, "rewards/margins": -0.08982641249895096, "rewards/rejected": 0.12169044464826584, "step": 25 }, { "epoch": 0.4406779661016949, "grad_norm": 34.58148372441541, "learning_rate": 1.1016949152542372e-07, "logits/chosen": 16.518342971801758, "logits/rejected": 15.1292142868042, "logps/chosen": -8.292799949645996, "logps/rejected": -11.320685386657715, "loss": 0.6993, "rewards/accuracies": 0.8125, "rewards/chosen": 0.052115682512521744, "rewards/margins": 0.12297011911869049, "rewards/rejected": -0.07085443288087845, "step": 26 }, { "epoch": 0.4576271186440678, "grad_norm": 37.35790607963465, "learning_rate": 1.1440677966101695e-07, "logits/chosen": 13.867864608764648, "logits/rejected": 14.343127250671387, "logps/chosen": -10.37988567352295, "logps/rejected": -14.269682884216309, "loss": 0.7057, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04574650526046753, "rewards/margins": 0.07950454950332642, "rewards/rejected": -0.03375804424285889, "step": 27 }, { "epoch": 0.4745762711864407, "grad_norm": 32.982896424641616, "learning_rate": 1.1864406779661017e-07, "logits/chosen": 12.250972747802734, "logits/rejected": 14.854573249816895, "logps/chosen": -8.742226600646973, "logps/rejected": -10.888657569885254, "loss": 0.6889, "rewards/accuracies": 0.875, "rewards/chosen": 0.05363437905907631, "rewards/margins": 0.06778371334075928, "rewards/rejected": -0.014149338006973267, "step": 28 }, { "epoch": 0.4915254237288136, "grad_norm": 33.38571979724709, "learning_rate": 1.228813559322034e-07, "logits/chosen": 13.085405349731445, "logits/rejected": 11.611092567443848, "logps/chosen": -8.457245826721191, "logps/rejected": -11.445099830627441, "loss": 0.6947, "rewards/accuracies": 0.3125, "rewards/chosen": -0.017215125262737274, "rewards/margins": -0.09329110383987427, "rewards/rejected": 0.076075978577137, "step": 29 }, { "epoch": 0.5084745762711864, "grad_norm": 36.829555030284546, "learning_rate": 1.271186440677966e-07, "logits/chosen": 12.227721214294434, "logits/rejected": 12.230497360229492, "logps/chosen": -12.069671630859375, "logps/rejected": -13.662412643432617, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": -0.005466394126415253, "rewards/margins": 0.025227222591638565, "rewards/rejected": -0.030693616718053818, "step": 30 }, { "epoch": 0.5254237288135594, "grad_norm": 35.20409336336582, "learning_rate": 1.3135593220338984e-07, "logits/chosen": 10.929096221923828, "logits/rejected": 13.200987815856934, "logps/chosen": -11.148241996765137, "logps/rejected": -12.558090209960938, "loss": 0.6954, "rewards/accuracies": 0.375, "rewards/chosen": -0.05978451669216156, "rewards/margins": -0.03725840896368027, "rewards/rejected": -0.022526109591126442, "step": 31 }, { "epoch": 0.5423728813559322, "grad_norm": 35.6615556471091, "learning_rate": 1.3559322033898305e-07, "logits/chosen": 10.920124053955078, "logits/rejected": 11.50726318359375, "logps/chosen": -10.602476119995117, "logps/rejected": -14.5696382522583, "loss": 0.7016, "rewards/accuracies": 0.6875, "rewards/chosen": 0.040968988090753555, "rewards/margins": 0.10942812263965607, "rewards/rejected": -0.06845913827419281, "step": 32 }, { "epoch": 0.559322033898305, "grad_norm": 36.15961322810205, "learning_rate": 1.3983050847457625e-07, "logits/chosen": 14.529788970947266, "logits/rejected": 13.727234840393066, "logps/chosen": -10.854070663452148, "logps/rejected": -17.989377975463867, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03216388076543808, "rewards/margins": -0.0035803131759166718, "rewards/rejected": -0.028583567589521408, "step": 33 }, { "epoch": 0.576271186440678, "grad_norm": 35.17806533487164, "learning_rate": 1.440677966101695e-07, "logits/chosen": 13.581205368041992, "logits/rejected": 16.608570098876953, "logps/chosen": -8.734363555908203, "logps/rejected": -11.665853500366211, "loss": 0.705, "rewards/accuracies": 0.5, "rewards/chosen": 0.04102402180433273, "rewards/margins": -0.013214722275733948, "rewards/rejected": 0.05423874408006668, "step": 34 }, { "epoch": 0.5932203389830508, "grad_norm": 37.36296835575698, "learning_rate": 1.483050847457627e-07, "logits/chosen": 18.549928665161133, "logits/rejected": 16.754329681396484, "logps/chosen": -7.439009666442871, "logps/rejected": -17.70270347595215, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.02313202992081642, "rewards/margins": -0.03136378899216652, "rewards/rejected": 0.05449581891298294, "step": 35 }, { "epoch": 0.6101694915254238, "grad_norm": 34.998157162726095, "learning_rate": 1.5254237288135593e-07, "logits/chosen": 12.90966796875, "logits/rejected": 13.895225524902344, "logps/chosen": -8.504295349121094, "logps/rejected": -10.12105941772461, "loss": 0.7, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01411505788564682, "rewards/margins": -0.08190415799617767, "rewards/rejected": 0.06778909265995026, "step": 36 }, { "epoch": 0.6271186440677966, "grad_norm": 36.426085212372584, "learning_rate": 1.5677966101694915e-07, "logits/chosen": 15.71446418762207, "logits/rejected": 16.843730926513672, "logps/chosen": -10.690804481506348, "logps/rejected": -9.679794311523438, "loss": 0.6884, "rewards/accuracies": 0.625, "rewards/chosen": 0.02468421310186386, "rewards/margins": 0.05196612328290939, "rewards/rejected": -0.027281910181045532, "step": 37 }, { "epoch": 0.6440677966101694, "grad_norm": 35.237590028285744, "learning_rate": 1.6101694915254234e-07, "logits/chosen": 16.050735473632812, "logits/rejected": 12.214224815368652, "logps/chosen": -11.611053466796875, "logps/rejected": -20.413999557495117, "loss": 0.6781, "rewards/accuracies": 0.75, "rewards/chosen": 0.04573787748813629, "rewards/margins": 0.12110111117362976, "rewards/rejected": -0.07536323368549347, "step": 38 }, { "epoch": 0.6610169491525424, "grad_norm": 33.16682284542454, "learning_rate": 1.6525423728813559e-07, "logits/chosen": 12.356346130371094, "logits/rejected": 15.030713081359863, "logps/chosen": -14.8650541305542, "logps/rejected": -17.080093383789062, "loss": 0.7061, "rewards/accuracies": 0.6875, "rewards/chosen": 0.027096569538116455, "rewards/margins": 0.07355399429798126, "rewards/rejected": -0.04645742475986481, "step": 39 }, { "epoch": 0.6779661016949152, "grad_norm": 34.805189868761, "learning_rate": 1.694915254237288e-07, "logits/chosen": 11.95240592956543, "logits/rejected": 11.009622573852539, "logps/chosen": -7.867511749267578, "logps/rejected": -13.811777114868164, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0293671116232872, "rewards/margins": 0.0956479161977768, "rewards/rejected": -0.06628081202507019, "step": 40 }, { "epoch": 0.6949152542372882, "grad_norm": 35.56777767064069, "learning_rate": 1.7372881355932202e-07, "logits/chosen": 15.345881462097168, "logits/rejected": 10.699317932128906, "logps/chosen": -8.943795204162598, "logps/rejected": -15.38254451751709, "loss": 0.6941, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008422672748565674, "rewards/margins": 0.005998820066452026, "rewards/rejected": 0.0024238526821136475, "step": 41 }, { "epoch": 0.711864406779661, "grad_norm": 32.67790911292791, "learning_rate": 1.7796610169491524e-07, "logits/chosen": 13.560546875, "logits/rejected": 13.775779724121094, "logps/chosen": -6.909335136413574, "logps/rejected": -10.375238418579102, "loss": 0.6786, "rewards/accuracies": 0.625, "rewards/chosen": 0.0649556964635849, "rewards/margins": 0.06534145772457123, "rewards/rejected": -0.0003857612609863281, "step": 42 }, { "epoch": 0.7288135593220338, "grad_norm": 37.03733040428068, "learning_rate": 1.8220338983050846e-07, "logits/chosen": 12.041620254516602, "logits/rejected": 14.68472957611084, "logps/chosen": -15.107216835021973, "logps/rejected": -14.546531677246094, "loss": 0.7004, "rewards/accuracies": 0.4375, "rewards/chosen": 0.028534866869449615, "rewards/margins": -0.001826740801334381, "rewards/rejected": 0.030361607670783997, "step": 43 }, { "epoch": 0.7457627118644068, "grad_norm": 32.84788840363755, "learning_rate": 1.8644067796610168e-07, "logits/chosen": 14.930094718933105, "logits/rejected": 17.373828887939453, "logps/chosen": -12.436162948608398, "logps/rejected": -16.592809677124023, "loss": 0.715, "rewards/accuracies": 0.25, "rewards/chosen": -0.033111147582530975, "rewards/margins": -0.061914995312690735, "rewards/rejected": 0.02880384773015976, "step": 44 }, { "epoch": 0.7627118644067796, "grad_norm": 33.21001966655798, "learning_rate": 1.906779661016949e-07, "logits/chosen": 12.37698745727539, "logits/rejected": 12.670955657958984, "logps/chosen": -9.219972610473633, "logps/rejected": -16.6455078125, "loss": 0.7021, "rewards/accuracies": 0.375, "rewards/chosen": -0.06785572320222855, "rewards/margins": -0.00494810938835144, "rewards/rejected": -0.0629076212644577, "step": 45 }, { "epoch": 0.7796610169491526, "grad_norm": 34.44107326645517, "learning_rate": 1.9491525423728814e-07, "logits/chosen": 12.624075889587402, "logits/rejected": 13.895041465759277, "logps/chosen": -10.401080131530762, "logps/rejected": -12.175220489501953, "loss": 0.6812, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010198384523391724, "rewards/margins": 0.061251018196344376, "rewards/rejected": -0.05105263367295265, "step": 46 }, { "epoch": 0.7966101694915254, "grad_norm": 35.32959175447224, "learning_rate": 1.9915254237288134e-07, "logits/chosen": 12.925259590148926, "logits/rejected": 14.440532684326172, "logps/chosen": -12.382270812988281, "logps/rejected": -11.013741493225098, "loss": 0.6828, "rewards/accuracies": 0.625, "rewards/chosen": 0.04461570829153061, "rewards/margins": 0.06247672438621521, "rewards/rejected": -0.0178610160946846, "step": 47 }, { "epoch": 0.8135593220338984, "grad_norm": 35.75324287683784, "learning_rate": 2.0338983050847458e-07, "logits/chosen": 13.918556213378906, "logits/rejected": 14.322575569152832, "logps/chosen": -11.076623916625977, "logps/rejected": -16.675630569458008, "loss": 0.6545, "rewards/accuracies": 0.875, "rewards/chosen": 0.05672513693571091, "rewards/margins": 0.19194935262203217, "rewards/rejected": -0.13522422313690186, "step": 48 }, { "epoch": 0.8305084745762712, "grad_norm": 31.28472915428401, "learning_rate": 2.076271186440678e-07, "logits/chosen": 14.541460037231445, "logits/rejected": 16.299976348876953, "logps/chosen": -7.96132230758667, "logps/rejected": -9.541074752807617, "loss": 0.6772, "rewards/accuracies": 0.625, "rewards/chosen": 0.021021980792284012, "rewards/margins": 0.09664987027645111, "rewards/rejected": -0.0756278932094574, "step": 49 }, { "epoch": 0.847457627118644, "grad_norm": 30.522178193238684, "learning_rate": 2.11864406779661e-07, "logits/chosen": 12.885459899902344, "logits/rejected": 14.966306686401367, "logps/chosen": -8.131207466125488, "logps/rejected": -14.213689804077148, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": 0.035199426114559174, "rewards/margins": 0.02447110041975975, "rewards/rejected": 0.010728325694799423, "step": 50 }, { "epoch": 0.864406779661017, "grad_norm": 36.47219407480772, "learning_rate": 2.1610169491525424e-07, "logits/chosen": 15.588902473449707, "logits/rejected": 15.775507926940918, "logps/chosen": -11.82562255859375, "logps/rejected": -19.91376495361328, "loss": 0.6735, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00851084291934967, "rewards/margins": 0.03356066346168518, "rewards/rejected": -0.02504982054233551, "step": 51 }, { "epoch": 0.8813559322033898, "grad_norm": 34.09388988223221, "learning_rate": 2.2033898305084743e-07, "logits/chosen": 11.618022918701172, "logits/rejected": 12.69122314453125, "logps/chosen": -11.289080619812012, "logps/rejected": -14.602575302124023, "loss": 0.6646, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02516426146030426, "rewards/margins": 0.05319439619779587, "rewards/rejected": -0.028030134737491608, "step": 52 }, { "epoch": 0.8983050847457628, "grad_norm": 35.24051417693975, "learning_rate": 2.2457627118644068e-07, "logits/chosen": 13.005451202392578, "logits/rejected": 15.351945877075195, "logps/chosen": -9.357630729675293, "logps/rejected": -12.059124946594238, "loss": 0.6868, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00409119576215744, "rewards/margins": -0.020457960665225983, "rewards/rejected": 0.016366764903068542, "step": 53 }, { "epoch": 0.9152542372881356, "grad_norm": 31.849779567852845, "learning_rate": 2.288135593220339e-07, "logits/chosen": 15.671916007995605, "logits/rejected": 15.454113006591797, "logps/chosen": -10.503100395202637, "logps/rejected": -10.274392127990723, "loss": 0.6771, "rewards/accuracies": 0.625, "rewards/chosen": 0.06473356485366821, "rewards/margins": 0.06081206351518631, "rewards/rejected": 0.003921501338481903, "step": 54 }, { "epoch": 0.9322033898305084, "grad_norm": 33.88074879519746, "learning_rate": 2.330508474576271e-07, "logits/chosen": 13.267528533935547, "logits/rejected": 13.739250183105469, "logps/chosen": -11.132416725158691, "logps/rejected": -13.396270751953125, "loss": 0.6706, "rewards/accuracies": 0.75, "rewards/chosen": 0.09757491946220398, "rewards/margins": 0.16509681940078735, "rewards/rejected": -0.06752191483974457, "step": 55 }, { "epoch": 0.9491525423728814, "grad_norm": 33.372486380706356, "learning_rate": 2.3728813559322033e-07, "logits/chosen": 15.237256050109863, "logits/rejected": 15.872620582580566, "logps/chosen": -8.667885780334473, "logps/rejected": -11.945577621459961, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": 0.01572229713201523, "rewards/margins": 0.08544715493917465, "rewards/rejected": -0.06972485780715942, "step": 56 }, { "epoch": 0.9661016949152542, "grad_norm": 32.086485860665775, "learning_rate": 2.4152542372881355e-07, "logits/chosen": 17.312305450439453, "logits/rejected": 15.455362319946289, "logps/chosen": -13.120665550231934, "logps/rejected": -18.88037872314453, "loss": 0.6377, "rewards/accuracies": 0.5, "rewards/chosen": 0.07254119217395782, "rewards/margins": 0.16330546140670776, "rewards/rejected": -0.09076426923274994, "step": 57 }, { "epoch": 0.9830508474576272, "grad_norm": 32.83780367143196, "learning_rate": 2.457627118644068e-07, "logits/chosen": 12.152743339538574, "logits/rejected": 15.03101634979248, "logps/chosen": -13.570626258850098, "logps/rejected": -15.182299613952637, "loss": 0.6645, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05790925770998001, "rewards/margins": 0.16572001576423645, "rewards/rejected": -0.10781076550483704, "step": 58 }, { "epoch": 1.0, "grad_norm": 33.173046993726444, "learning_rate": 2.5e-07, "logits/chosen": 13.197469711303711, "logits/rejected": 12.665360450744629, "logps/chosen": -9.60036849975586, "logps/rejected": -13.494672775268555, "loss": 0.6636, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05081889033317566, "rewards/margins": 0.08736976981163025, "rewards/rejected": -0.03655087947845459, "step": 59 }, { "epoch": 1.0169491525423728, "grad_norm": 34.12246854349812, "learning_rate": 2.542372881355932e-07, "logits/chosen": 16.638633728027344, "logits/rejected": 14.497950553894043, "logps/chosen": -7.4926934242248535, "logps/rejected": -9.887690544128418, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": 0.02329724282026291, "rewards/margins": 0.08896321058273315, "rewards/rejected": -0.06566596031188965, "step": 60 }, { "epoch": 1.0338983050847457, "grad_norm": 32.48073277255417, "learning_rate": 2.584745762711864e-07, "logits/chosen": 13.186823844909668, "logits/rejected": 12.39754581451416, "logps/chosen": -7.40197229385376, "logps/rejected": -9.912740707397461, "loss": 0.6404, "rewards/accuracies": 0.75, "rewards/chosen": 0.0660327598452568, "rewards/margins": 0.08431532979011536, "rewards/rejected": -0.01828256994485855, "step": 61 }, { "epoch": 1.0508474576271187, "grad_norm": 31.810793473176147, "learning_rate": 2.6271186440677967e-07, "logits/chosen": 9.925346374511719, "logits/rejected": 12.604837417602539, "logps/chosen": -9.344438552856445, "logps/rejected": -12.725444793701172, "loss": 0.6681, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016713112592697144, "rewards/margins": 0.06962401419878006, "rewards/rejected": -0.0863371342420578, "step": 62 }, { "epoch": 1.0677966101694916, "grad_norm": 31.270602966387727, "learning_rate": 2.6694915254237286e-07, "logits/chosen": 12.649188041687012, "logits/rejected": 11.9768648147583, "logps/chosen": -10.402519226074219, "logps/rejected": -16.783111572265625, "loss": 0.6522, "rewards/accuracies": 0.625, "rewards/chosen": -0.022982105612754822, "rewards/margins": 0.12150964140892029, "rewards/rejected": -0.1444917470216751, "step": 63 }, { "epoch": 1.0847457627118644, "grad_norm": 32.65672591502704, "learning_rate": 2.711864406779661e-07, "logits/chosen": 14.806360244750977, "logits/rejected": 17.47248077392578, "logps/chosen": -11.753227233886719, "logps/rejected": -15.529240608215332, "loss": 0.6439, "rewards/accuracies": 0.5, "rewards/chosen": -0.002056751400232315, "rewards/margins": 0.05395819991827011, "rewards/rejected": -0.056014951318502426, "step": 64 }, { "epoch": 1.1016949152542372, "grad_norm": 30.439729822840757, "learning_rate": 2.754237288135593e-07, "logits/chosen": 13.062178611755371, "logits/rejected": 15.618377685546875, "logps/chosen": -10.648831367492676, "logps/rejected": -17.41346549987793, "loss": 0.6379, "rewards/accuracies": 0.5625, "rewards/chosen": 0.014630310237407684, "rewards/margins": 0.04037298262119293, "rewards/rejected": -0.025742672383785248, "step": 65 }, { "epoch": 1.11864406779661, "grad_norm": 33.491004186333484, "learning_rate": 2.796610169491525e-07, "logits/chosen": 11.60132884979248, "logits/rejected": 12.90021800994873, "logps/chosen": -10.080259323120117, "logps/rejected": -14.711465835571289, "loss": 0.6572, "rewards/accuracies": 0.5625, "rewards/chosen": -0.021400080993771553, "rewards/margins": 0.12133632600307465, "rewards/rejected": -0.14273640513420105, "step": 66 }, { "epoch": 1.1355932203389831, "grad_norm": 31.23541277963251, "learning_rate": 2.838983050847458e-07, "logits/chosen": 12.128422737121582, "logits/rejected": 14.061056137084961, "logps/chosen": -9.6464204788208, "logps/rejected": -9.97637939453125, "loss": 0.654, "rewards/accuracies": 0.4375, "rewards/chosen": -0.069414421916008, "rewards/margins": 0.04286954551935196, "rewards/rejected": -0.11228397488594055, "step": 67 }, { "epoch": 1.152542372881356, "grad_norm": 31.64971711981137, "learning_rate": 2.88135593220339e-07, "logits/chosen": 13.560795783996582, "logits/rejected": 15.241508483886719, "logps/chosen": -9.716387748718262, "logps/rejected": -16.67053985595703, "loss": 0.6505, "rewards/accuracies": 0.625, "rewards/chosen": 0.08715072274208069, "rewards/margins": 0.1715814769268036, "rewards/rejected": -0.0844307616353035, "step": 68 }, { "epoch": 1.1694915254237288, "grad_norm": 31.743762826648183, "learning_rate": 2.923728813559322e-07, "logits/chosen": 12.650971412658691, "logits/rejected": 12.528360366821289, "logps/chosen": -8.169756889343262, "logps/rejected": -11.55956745147705, "loss": 0.644, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004966534674167633, "rewards/margins": 0.04012346267700195, "rewards/rejected": -0.045089997351169586, "step": 69 }, { "epoch": 1.1864406779661016, "grad_norm": 27.05553118573593, "learning_rate": 2.966101694915254e-07, "logits/chosen": 13.4546537399292, "logits/rejected": 12.574522018432617, "logps/chosen": -8.672929763793945, "logps/rejected": -14.113090515136719, "loss": 0.6159, "rewards/accuracies": 0.8125, "rewards/chosen": 0.018237508833408356, "rewards/margins": 0.1680130809545517, "rewards/rejected": -0.14977556467056274, "step": 70 }, { "epoch": 1.2033898305084745, "grad_norm": 29.505587279506713, "learning_rate": 3.008474576271186e-07, "logits/chosen": 13.520177841186523, "logits/rejected": 13.430837631225586, "logps/chosen": -9.639127731323242, "logps/rejected": -13.659337043762207, "loss": 0.6309, "rewards/accuracies": 0.5, "rewards/chosen": -0.08626913279294968, "rewards/margins": 0.041161876171827316, "rewards/rejected": -0.1274310052394867, "step": 71 }, { "epoch": 1.2203389830508475, "grad_norm": 31.37091432132666, "learning_rate": 3.0508474576271186e-07, "logits/chosen": 11.49694538116455, "logits/rejected": 11.778303146362305, "logps/chosen": -11.05677604675293, "logps/rejected": -15.848922729492188, "loss": 0.6181, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16539010405540466, "rewards/margins": 0.4443053603172302, "rewards/rejected": -0.27891525626182556, "step": 72 }, { "epoch": 1.2372881355932204, "grad_norm": 29.563451670160653, "learning_rate": 3.093220338983051e-07, "logits/chosen": 15.092836380004883, "logits/rejected": 17.037139892578125, "logps/chosen": -9.66404914855957, "logps/rejected": -11.75003433227539, "loss": 0.6278, "rewards/accuracies": 0.6875, "rewards/chosen": 0.073647640645504, "rewards/margins": 0.2144932746887207, "rewards/rejected": -0.1408456414937973, "step": 73 }, { "epoch": 1.2542372881355932, "grad_norm": 32.962262587265975, "learning_rate": 3.135593220338983e-07, "logits/chosen": 12.5877103805542, "logits/rejected": 13.999431610107422, "logps/chosen": -9.522225379943848, "logps/rejected": -12.351940155029297, "loss": 0.6207, "rewards/accuracies": 0.75, "rewards/chosen": 0.0037403330206871033, "rewards/margins": 0.20362704992294312, "rewards/rejected": -0.19988670945167542, "step": 74 }, { "epoch": 1.271186440677966, "grad_norm": 31.381915402396228, "learning_rate": 3.177966101694915e-07, "logits/chosen": 11.979763984680176, "logits/rejected": 13.482416152954102, "logps/chosen": -8.215496063232422, "logps/rejected": -11.69947624206543, "loss": 0.6179, "rewards/accuracies": 0.5, "rewards/chosen": 0.01762581244111061, "rewards/margins": 0.14459285140037537, "rewards/rejected": -0.12696704268455505, "step": 75 }, { "epoch": 1.288135593220339, "grad_norm": 30.626294632951648, "learning_rate": 3.220338983050847e-07, "logits/chosen": 13.793957710266113, "logits/rejected": 14.482337951660156, "logps/chosen": -8.5573091506958, "logps/rejected": -12.453176498413086, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": -0.08360793441534042, "rewards/margins": 0.08104290813207626, "rewards/rejected": -0.1646508425474167, "step": 76 }, { "epoch": 1.305084745762712, "grad_norm": 28.664877026711565, "learning_rate": 3.26271186440678e-07, "logits/chosen": 11.402955055236816, "logits/rejected": 12.819219589233398, "logps/chosen": -14.218494415283203, "logps/rejected": -19.227710723876953, "loss": 0.6185, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13961467146873474, "rewards/margins": 0.4163624048233032, "rewards/rejected": -0.2767477333545685, "step": 77 }, { "epoch": 1.3220338983050848, "grad_norm": 30.319960677264497, "learning_rate": 3.3050847457627117e-07, "logits/chosen": 15.883766174316406, "logits/rejected": 15.542499542236328, "logps/chosen": -8.39336109161377, "logps/rejected": -14.079061508178711, "loss": 0.6211, "rewards/accuracies": 0.5, "rewards/chosen": -0.014538049697875977, "rewards/margins": 0.14333194494247437, "rewards/rejected": -0.15786999464035034, "step": 78 }, { "epoch": 1.3389830508474576, "grad_norm": 31.140285215166347, "learning_rate": 3.3474576271186436e-07, "logits/chosen": 16.721675872802734, "logits/rejected": 16.151338577270508, "logps/chosen": -8.125932693481445, "logps/rejected": -10.540145874023438, "loss": 0.606, "rewards/accuracies": 0.625, "rewards/chosen": 0.009473755955696106, "rewards/margins": 0.20827658474445343, "rewards/rejected": -0.19880282878875732, "step": 79 }, { "epoch": 1.3559322033898304, "grad_norm": 28.563629733456608, "learning_rate": 3.389830508474576e-07, "logits/chosen": 15.79937744140625, "logits/rejected": 15.94279670715332, "logps/chosen": -9.857381820678711, "logps/rejected": -15.813158988952637, "loss": 0.5986, "rewards/accuracies": 0.625, "rewards/chosen": -0.006986264139413834, "rewards/margins": 0.2487819939851761, "rewards/rejected": -0.2557682693004608, "step": 80 }, { "epoch": 1.3728813559322033, "grad_norm": 28.565703501108878, "learning_rate": 3.432203389830508e-07, "logits/chosen": 12.46065616607666, "logits/rejected": 12.378948211669922, "logps/chosen": -9.123543739318848, "logps/rejected": -12.124061584472656, "loss": 0.5794, "rewards/accuracies": 0.625, "rewards/chosen": 0.09940391778945923, "rewards/margins": 0.17284835875034332, "rewards/rejected": -0.0734444409608841, "step": 81 }, { "epoch": 1.3898305084745763, "grad_norm": 28.291061250399547, "learning_rate": 3.4745762711864405e-07, "logits/chosen": 9.341460227966309, "logits/rejected": 10.108131408691406, "logps/chosen": -7.376352787017822, "logps/rejected": -6.77155876159668, "loss": 0.6319, "rewards/accuracies": 0.625, "rewards/chosen": -8.321180939674377e-05, "rewards/margins": 0.13180197775363922, "rewards/rejected": -0.13188520073890686, "step": 82 }, { "epoch": 1.4067796610169492, "grad_norm": 29.854142613641514, "learning_rate": 3.516949152542373e-07, "logits/chosen": 12.311843872070312, "logits/rejected": 14.436249732971191, "logps/chosen": -10.377118110656738, "logps/rejected": -13.012676239013672, "loss": 0.6077, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10611987113952637, "rewards/margins": 0.29748332500457764, "rewards/rejected": -0.19136345386505127, "step": 83 }, { "epoch": 1.423728813559322, "grad_norm": 35.835030120584115, "learning_rate": 3.559322033898305e-07, "logits/chosen": 9.823848724365234, "logits/rejected": 11.089227676391602, "logps/chosen": -9.431833267211914, "logps/rejected": -10.586324691772461, "loss": 0.6234, "rewards/accuracies": 0.625, "rewards/chosen": 0.014358047395944595, "rewards/margins": 0.16177169978618622, "rewards/rejected": -0.14741365611553192, "step": 84 }, { "epoch": 1.4406779661016949, "grad_norm": 30.07505572818561, "learning_rate": 3.601694915254237e-07, "logits/chosen": 12.886320114135742, "logits/rejected": 13.918362617492676, "logps/chosen": -8.843177795410156, "logps/rejected": -10.307863235473633, "loss": 0.6085, "rewards/accuracies": 0.5625, "rewards/chosen": 0.02218719571828842, "rewards/margins": 0.27985820174217224, "rewards/rejected": -0.2576709985733032, "step": 85 }, { "epoch": 1.457627118644068, "grad_norm": 32.81575222960369, "learning_rate": 3.644067796610169e-07, "logits/chosen": 9.922818183898926, "logits/rejected": 8.803034782409668, "logps/chosen": -10.362354278564453, "logps/rejected": -14.556231498718262, "loss": 0.6153, "rewards/accuracies": 0.625, "rewards/chosen": 0.04402754455804825, "rewards/margins": 0.2732781767845154, "rewards/rejected": -0.22925063967704773, "step": 86 }, { "epoch": 1.4745762711864407, "grad_norm": 31.143487290913797, "learning_rate": 3.6864406779661017e-07, "logits/chosen": 13.284112930297852, "logits/rejected": 13.448722839355469, "logps/chosen": -8.040769577026367, "logps/rejected": -14.666115760803223, "loss": 0.5883, "rewards/accuracies": 0.875, "rewards/chosen": -0.025847814977169037, "rewards/margins": 0.34201639890670776, "rewards/rejected": -0.367864191532135, "step": 87 }, { "epoch": 1.4915254237288136, "grad_norm": 27.87355361043025, "learning_rate": 3.7288135593220336e-07, "logits/chosen": 11.64570426940918, "logits/rejected": 12.12906551361084, "logps/chosen": -10.226296424865723, "logps/rejected": -15.823101997375488, "loss": 0.5825, "rewards/accuracies": 0.8125, "rewards/chosen": 0.049983687698841095, "rewards/margins": 0.46055829524993896, "rewards/rejected": -0.41057464480400085, "step": 88 }, { "epoch": 1.5084745762711864, "grad_norm": 28.514766608508296, "learning_rate": 3.771186440677966e-07, "logits/chosen": 10.8696928024292, "logits/rejected": 12.655576705932617, "logps/chosen": -11.517805099487305, "logps/rejected": -13.284784317016602, "loss": 0.6049, "rewards/accuracies": 0.8125, "rewards/chosen": -0.029194893315434456, "rewards/margins": 0.3180268406867981, "rewards/rejected": -0.3472217321395874, "step": 89 }, { "epoch": 1.5254237288135593, "grad_norm": 30.117702099043008, "learning_rate": 3.813559322033898e-07, "logits/chosen": 16.787080764770508, "logits/rejected": 14.550413131713867, "logps/chosen": -10.547918319702148, "logps/rejected": -18.722667694091797, "loss": 0.58, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04852820187807083, "rewards/margins": 0.4919710159301758, "rewards/rejected": -0.44344282150268555, "step": 90 }, { "epoch": 1.542372881355932, "grad_norm": 27.570356926191867, "learning_rate": 3.8559322033898304e-07, "logits/chosen": 10.767607688903809, "logits/rejected": 13.919551849365234, "logps/chosen": -10.187631607055664, "logps/rejected": -12.379902839660645, "loss": 0.5751, "rewards/accuracies": 0.875, "rewards/chosen": 0.15341579914093018, "rewards/margins": 0.4169505834579468, "rewards/rejected": -0.2635347247123718, "step": 91 }, { "epoch": 1.559322033898305, "grad_norm": 32.16465688493385, "learning_rate": 3.898305084745763e-07, "logits/chosen": 9.810237884521484, "logits/rejected": 10.345829010009766, "logps/chosen": -9.855244636535645, "logps/rejected": -11.927900314331055, "loss": 0.5719, "rewards/accuracies": 0.625, "rewards/chosen": 0.01897813379764557, "rewards/margins": 0.12236368656158447, "rewards/rejected": -0.1033855527639389, "step": 92 }, { "epoch": 1.576271186440678, "grad_norm": 30.793068504647245, "learning_rate": 3.940677966101695e-07, "logits/chosen": 8.736921310424805, "logits/rejected": 12.200434684753418, "logps/chosen": -9.910384178161621, "logps/rejected": -13.29830265045166, "loss": 0.6152, "rewards/accuracies": 0.625, "rewards/chosen": 0.08518315851688385, "rewards/margins": 0.37882399559020996, "rewards/rejected": -0.2936408817768097, "step": 93 }, { "epoch": 1.5932203389830508, "grad_norm": 29.423734502829934, "learning_rate": 3.9830508474576267e-07, "logits/chosen": 12.854459762573242, "logits/rejected": 11.398431777954102, "logps/chosen": -8.642826080322266, "logps/rejected": -16.943462371826172, "loss": 0.588, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04286940395832062, "rewards/margins": 0.4083571434020996, "rewards/rejected": -0.4512265622615814, "step": 94 }, { "epoch": 1.6101694915254239, "grad_norm": 36.27015230712179, "learning_rate": 4.025423728813559e-07, "logits/chosen": 11.945028305053711, "logits/rejected": 11.686726570129395, "logps/chosen": -7.234800815582275, "logps/rejected": -14.528942108154297, "loss": 0.5922, "rewards/accuracies": 0.6875, "rewards/chosen": 0.024997025728225708, "rewards/margins": 0.5870181322097778, "rewards/rejected": -0.5620210766792297, "step": 95 }, { "epoch": 1.6271186440677967, "grad_norm": 31.19227016453443, "learning_rate": 4.0677966101694916e-07, "logits/chosen": 13.797384262084961, "logits/rejected": 14.542064666748047, "logps/chosen": -11.304056167602539, "logps/rejected": -11.900882720947266, "loss": 0.5784, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0709955096244812, "rewards/margins": 0.1039753258228302, "rewards/rejected": -0.032979816198349, "step": 96 }, { "epoch": 1.6440677966101696, "grad_norm": 27.4635272839829, "learning_rate": 4.1101694915254236e-07, "logits/chosen": 13.015619277954102, "logits/rejected": 11.862337112426758, "logps/chosen": -7.38511323928833, "logps/rejected": -10.880597114562988, "loss": 0.5627, "rewards/accuracies": 0.75, "rewards/chosen": 0.01526942104101181, "rewards/margins": 0.4992905557155609, "rewards/rejected": -0.4840211272239685, "step": 97 }, { "epoch": 1.6610169491525424, "grad_norm": 27.238291695236995, "learning_rate": 4.152542372881356e-07, "logits/chosen": 10.700840950012207, "logits/rejected": 11.397660255432129, "logps/chosen": -9.450662612915039, "logps/rejected": -12.259298324584961, "loss": 0.5442, "rewards/accuracies": 0.75, "rewards/chosen": 0.0722656399011612, "rewards/margins": 0.16584718227386475, "rewards/rejected": -0.09358153492212296, "step": 98 }, { "epoch": 1.6779661016949152, "grad_norm": 28.24564494185518, "learning_rate": 4.194915254237288e-07, "logits/chosen": 11.284452438354492, "logits/rejected": 14.807559967041016, "logps/chosen": -10.771215438842773, "logps/rejected": -11.973726272583008, "loss": 0.5979, "rewards/accuracies": 0.6875, "rewards/chosen": 0.051828235387802124, "rewards/margins": 0.19837984442710876, "rewards/rejected": -0.14655160903930664, "step": 99 }, { "epoch": 1.694915254237288, "grad_norm": 32.39712681739478, "learning_rate": 4.23728813559322e-07, "logits/chosen": 4.555298328399658, "logits/rejected": 9.199655532836914, "logps/chosen": -13.74587345123291, "logps/rejected": -13.853545188903809, "loss": 0.5942, "rewards/accuracies": 0.625, "rewards/chosen": 0.051489535719156265, "rewards/margins": 0.5184900760650635, "rewards/rejected": -0.46700048446655273, "step": 100 }, { "epoch": 1.711864406779661, "grad_norm": 27.79221873761442, "learning_rate": 4.279661016949153e-07, "logits/chosen": 13.323826789855957, "logits/rejected": 11.545331954956055, "logps/chosen": -8.155689239501953, "logps/rejected": -13.511098861694336, "loss": 0.5431, "rewards/accuracies": 0.625, "rewards/chosen": 0.009415552020072937, "rewards/margins": 0.5583093762397766, "rewards/rejected": -0.5488938689231873, "step": 101 }, { "epoch": 1.7288135593220337, "grad_norm": 28.264522962910007, "learning_rate": 4.322033898305085e-07, "logits/chosen": 8.916523933410645, "logits/rejected": 10.580684661865234, "logps/chosen": -10.751327514648438, "logps/rejected": -10.95659351348877, "loss": 0.5516, "rewards/accuracies": 0.75, "rewards/chosen": 0.1171780452132225, "rewards/margins": 0.515388548374176, "rewards/rejected": -0.39821046590805054, "step": 102 }, { "epoch": 1.7457627118644068, "grad_norm": 26.154236239225785, "learning_rate": 4.3644067796610167e-07, "logits/chosen": 13.819053649902344, "logits/rejected": 14.223098754882812, "logps/chosen": -8.082857131958008, "logps/rejected": -13.987797737121582, "loss": 0.5396, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05364694446325302, "rewards/margins": 0.7236331105232239, "rewards/rejected": -0.6699862480163574, "step": 103 }, { "epoch": 1.7627118644067796, "grad_norm": 27.50021000006553, "learning_rate": 4.4067796610169486e-07, "logits/chosen": 11.863626480102539, "logits/rejected": 10.52764892578125, "logps/chosen": -6.9673237800598145, "logps/rejected": -8.23715877532959, "loss": 0.5938, "rewards/accuracies": 0.875, "rewards/chosen": 0.14590652287006378, "rewards/margins": 0.16770131886005402, "rewards/rejected": -0.02179480344057083, "step": 104 }, { "epoch": 1.7796610169491527, "grad_norm": 28.474373560254055, "learning_rate": 4.449152542372881e-07, "logits/chosen": 16.65330696105957, "logits/rejected": 16.556903839111328, "logps/chosen": -11.08348274230957, "logps/rejected": -19.825658798217773, "loss": 0.5366, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06215184926986694, "rewards/margins": 0.8033376932144165, "rewards/rejected": -0.7411857843399048, "step": 105 }, { "epoch": 1.7966101694915255, "grad_norm": 27.473650011351655, "learning_rate": 4.4915254237288135e-07, "logits/chosen": 13.014068603515625, "logits/rejected": 13.48980712890625, "logps/chosen": -9.113512992858887, "logps/rejected": -13.425411224365234, "loss": 0.5333, "rewards/accuracies": 0.875, "rewards/chosen": 0.029549360275268555, "rewards/margins": 0.8095859289169312, "rewards/rejected": -0.7800365686416626, "step": 106 }, { "epoch": 1.8135593220338984, "grad_norm": 28.191261008380913, "learning_rate": 4.5338983050847454e-07, "logits/chosen": 13.008545875549316, "logits/rejected": 11.879390716552734, "logps/chosen": -9.581355094909668, "logps/rejected": -15.122631072998047, "loss": 0.5785, "rewards/accuracies": 0.375, "rewards/chosen": -0.0392971932888031, "rewards/margins": 0.02920607477426529, "rewards/rejected": -0.06850326806306839, "step": 107 }, { "epoch": 1.8305084745762712, "grad_norm": 31.619473725308314, "learning_rate": 4.576271186440678e-07, "logits/chosen": 10.609634399414062, "logits/rejected": 10.654576301574707, "logps/chosen": -7.444589138031006, "logps/rejected": -15.026788711547852, "loss": 0.5572, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08635948598384857, "rewards/margins": 0.7383730411529541, "rewards/rejected": -0.6520135402679443, "step": 108 }, { "epoch": 1.847457627118644, "grad_norm": 32.17526198659112, "learning_rate": 4.61864406779661e-07, "logits/chosen": 12.902010917663574, "logits/rejected": 11.985198020935059, "logps/chosen": -7.153221130371094, "logps/rejected": -13.385372161865234, "loss": 0.571, "rewards/accuracies": 0.5625, "rewards/chosen": 0.043889474123716354, "rewards/margins": 0.2885042130947113, "rewards/rejected": -0.24461475014686584, "step": 109 }, { "epoch": 1.8644067796610169, "grad_norm": 27.857085867574753, "learning_rate": 4.661016949152542e-07, "logits/chosen": 10.333656311035156, "logits/rejected": 12.67178726196289, "logps/chosen": -17.524898529052734, "logps/rejected": -15.563018798828125, "loss": 0.5713, "rewards/accuracies": 0.4375, "rewards/chosen": -0.018213175237178802, "rewards/margins": 0.060524992644786835, "rewards/rejected": -0.07873816788196564, "step": 110 }, { "epoch": 1.8813559322033897, "grad_norm": 28.764899008069285, "learning_rate": 4.7033898305084747e-07, "logits/chosen": 11.594640731811523, "logits/rejected": 11.983922004699707, "logps/chosen": -8.291284561157227, "logps/rejected": -14.15444564819336, "loss": 0.532, "rewards/accuracies": 0.6875, "rewards/chosen": 0.017433345317840576, "rewards/margins": 0.700124979019165, "rewards/rejected": -0.6826916337013245, "step": 111 }, { "epoch": 1.8983050847457628, "grad_norm": 28.655586786759734, "learning_rate": 4.7457627118644066e-07, "logits/chosen": 10.416437149047852, "logits/rejected": 8.874842643737793, "logps/chosen": -8.323904037475586, "logps/rejected": -12.19089126586914, "loss": 0.5498, "rewards/accuracies": 0.5, "rewards/chosen": 0.0059746429324150085, "rewards/margins": 0.41193652153015137, "rewards/rejected": -0.405961811542511, "step": 112 }, { "epoch": 1.9152542372881356, "grad_norm": 26.702839696044048, "learning_rate": 4.788135593220339e-07, "logits/chosen": 8.583914756774902, "logits/rejected": 11.768655776977539, "logps/chosen": -10.75674819946289, "logps/rejected": -13.667054176330566, "loss": 0.5249, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04430105537176132, "rewards/margins": 0.6275098323822021, "rewards/rejected": -0.5832087993621826, "step": 113 }, { "epoch": 1.9322033898305084, "grad_norm": 27.420724418994222, "learning_rate": 4.830508474576271e-07, "logits/chosen": 14.770956039428711, "logits/rejected": 17.315418243408203, "logps/chosen": -12.913301467895508, "logps/rejected": -18.054443359375, "loss": 0.5272, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04427202790975571, "rewards/margins": 0.9159756302833557, "rewards/rejected": -0.8717036843299866, "step": 114 }, { "epoch": 1.9491525423728815, "grad_norm": 29.355284582434965, "learning_rate": 4.872881355932203e-07, "logits/chosen": 14.43870735168457, "logits/rejected": 14.848833084106445, "logps/chosen": -10.624906539916992, "logps/rejected": -13.744641304016113, "loss": 0.5595, "rewards/accuracies": 0.625, "rewards/chosen": 0.06107292324304581, "rewards/margins": 0.63875812292099, "rewards/rejected": -0.5776851773262024, "step": 115 }, { "epoch": 1.9661016949152543, "grad_norm": 28.31017869263379, "learning_rate": 4.915254237288136e-07, "logits/chosen": 5.89711332321167, "logits/rejected": 8.889683723449707, "logps/chosen": -11.735050201416016, "logps/rejected": -16.08776092529297, "loss": 0.5137, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010795101523399353, "rewards/margins": 0.7134937047958374, "rewards/rejected": -0.7026985883712769, "step": 116 }, { "epoch": 1.9830508474576272, "grad_norm": 25.543721865788335, "learning_rate": 4.957627118644068e-07, "logits/chosen": 12.527676582336426, "logits/rejected": 13.638514518737793, "logps/chosen": -11.032867431640625, "logps/rejected": -16.750507354736328, "loss": 0.4784, "rewards/accuracies": 0.75, "rewards/chosen": 0.06131690740585327, "rewards/margins": 0.800919771194458, "rewards/rejected": -0.7396028637886047, "step": 117 }, { "epoch": 2.0, "grad_norm": 27.898132683327685, "learning_rate": 5e-07, "logits/chosen": 13.574739456176758, "logits/rejected": 13.461164474487305, "logps/chosen": -11.047101020812988, "logps/rejected": -14.235968589782715, "loss": 0.5421, "rewards/accuracies": 0.5, "rewards/chosen": 0.01252034679055214, "rewards/margins": 0.5880539417266846, "rewards/rejected": -0.5755336284637451, "step": 118 }, { "epoch": 2.016949152542373, "grad_norm": 24.420931650002732, "learning_rate": 4.99998906143358e-07, "logits/chosen": 8.436079025268555, "logits/rejected": 8.032705307006836, "logps/chosen": -8.76571273803711, "logps/rejected": -15.768316268920898, "loss": 0.5087, "rewards/accuracies": 0.8125, "rewards/chosen": -0.029925979673862457, "rewards/margins": 0.830885112285614, "rewards/rejected": -0.8608111143112183, "step": 119 }, { "epoch": 2.0338983050847457, "grad_norm": 24.437724993379234, "learning_rate": 4.999956245830044e-07, "logits/chosen": 8.596471786499023, "logits/rejected": 9.941722869873047, "logps/chosen": -10.456941604614258, "logps/rejected": -13.767266273498535, "loss": 0.4937, "rewards/accuracies": 0.75, "rewards/chosen": 0.04792099446058273, "rewards/margins": 0.7964321970939636, "rewards/rejected": -0.7485113143920898, "step": 120 }, { "epoch": 2.0508474576271185, "grad_norm": 26.385817925226156, "learning_rate": 4.999901553476555e-07, "logits/chosen": 9.167165756225586, "logits/rejected": 10.949917793273926, "logps/chosen": -11.454400062561035, "logps/rejected": -10.427359580993652, "loss": 0.5448, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0467328205704689, "rewards/margins": 0.23926034569740295, "rewards/rejected": -0.28599315881729126, "step": 121 }, { "epoch": 2.0677966101694913, "grad_norm": 26.51488464478305, "learning_rate": 4.999824984851718e-07, "logits/chosen": 11.507087707519531, "logits/rejected": 12.06271743774414, "logps/chosen": -14.055889129638672, "logps/rejected": -18.92905044555664, "loss": 0.506, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05761418491601944, "rewards/margins": 0.7195183038711548, "rewards/rejected": -0.6619042158126831, "step": 122 }, { "epoch": 2.084745762711864, "grad_norm": 25.46072474496047, "learning_rate": 4.999726540625574e-07, "logits/chosen": 9.345511436462402, "logits/rejected": 5.943528652191162, "logps/chosen": -9.2318754196167, "logps/rejected": -15.882528305053711, "loss": 0.4653, "rewards/accuracies": 0.8125, "rewards/chosen": 0.012520495802164078, "rewards/margins": 0.6578124761581421, "rewards/rejected": -0.6452920436859131, "step": 123 }, { "epoch": 2.1016949152542375, "grad_norm": 24.699856400048255, "learning_rate": 4.999606221659594e-07, "logits/chosen": 8.952792167663574, "logits/rejected": 9.421079635620117, "logps/chosen": -12.475508689880371, "logps/rejected": -15.839692115783691, "loss": 0.5147, "rewards/accuracies": 0.875, "rewards/chosen": 0.12024357169866562, "rewards/margins": 0.6879490613937378, "rewards/rejected": -0.567705512046814, "step": 124 }, { "epoch": 2.1186440677966103, "grad_norm": 24.645717371536033, "learning_rate": 4.999464029006672e-07, "logits/chosen": 11.617278099060059, "logits/rejected": 12.094820976257324, "logps/chosen": -8.739863395690918, "logps/rejected": -16.15949249267578, "loss": 0.4914, "rewards/accuracies": 0.875, "rewards/chosen": 0.10157185047864914, "rewards/margins": 1.0363792181015015, "rewards/rejected": -0.9348073601722717, "step": 125 }, { "epoch": 2.135593220338983, "grad_norm": 25.01415224410205, "learning_rate": 4.999299963911115e-07, "logits/chosen": 8.700315475463867, "logits/rejected": 9.787883758544922, "logps/chosen": -9.654312133789062, "logps/rejected": -10.987586975097656, "loss": 0.5217, "rewards/accuracies": 0.75, "rewards/chosen": 0.05968749523162842, "rewards/margins": 0.37098097801208496, "rewards/rejected": -0.31129348278045654, "step": 126 }, { "epoch": 2.152542372881356, "grad_norm": 24.639525161190353, "learning_rate": 4.999114027808631e-07, "logits/chosen": 9.593833923339844, "logits/rejected": 13.520515441894531, "logps/chosen": -9.5872802734375, "logps/rejected": -14.834587097167969, "loss": 0.4919, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03321126848459244, "rewards/margins": 1.004889965057373, "rewards/rejected": -0.9716786742210388, "step": 127 }, { "epoch": 2.169491525423729, "grad_norm": 25.714664571452275, "learning_rate": 4.998906222326321e-07, "logits/chosen": 10.212748527526855, "logits/rejected": 8.447935104370117, "logps/chosen": -14.084490776062012, "logps/rejected": -20.66387939453125, "loss": 0.5057, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07896807044744492, "rewards/margins": 1.2706775665283203, "rewards/rejected": -1.1917095184326172, "step": 128 }, { "epoch": 2.1864406779661016, "grad_norm": 26.119485701445264, "learning_rate": 4.99867654928266e-07, "logits/chosen": 13.961820602416992, "logits/rejected": 15.463374137878418, "logps/chosen": -13.826520919799805, "logps/rejected": -18.094711303710938, "loss": 0.4806, "rewards/accuracies": 0.75, "rewards/chosen": 0.17537821829319, "rewards/margins": 1.1025470495224, "rewards/rejected": -0.9271686673164368, "step": 129 }, { "epoch": 2.2033898305084745, "grad_norm": 25.992789405675488, "learning_rate": 4.998425010687483e-07, "logits/chosen": 10.33218002319336, "logits/rejected": 9.227500915527344, "logps/chosen": -10.892607688903809, "logps/rejected": -20.151803970336914, "loss": 0.5113, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04175170511007309, "rewards/margins": 1.3892616033554077, "rewards/rejected": -1.4310133457183838, "step": 130 }, { "epoch": 2.2203389830508473, "grad_norm": 31.86911962760966, "learning_rate": 4.998151608741969e-07, "logits/chosen": 11.884110450744629, "logits/rejected": 11.618425369262695, "logps/chosen": -9.044775009155273, "logps/rejected": -18.19227409362793, "loss": 0.4792, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08620918542146683, "rewards/margins": 1.41904616355896, "rewards/rejected": -1.3328371047973633, "step": 131 }, { "epoch": 2.23728813559322, "grad_norm": 25.915056331398198, "learning_rate": 4.997856345838614e-07, "logits/chosen": 10.366816520690918, "logits/rejected": 11.786434173583984, "logps/chosen": -10.501708030700684, "logps/rejected": -17.16826629638672, "loss": 0.4849, "rewards/accuracies": 0.875, "rewards/chosen": 0.08089535683393478, "rewards/margins": 1.430103063583374, "rewards/rejected": -1.3492075204849243, "step": 132 }, { "epoch": 2.2542372881355934, "grad_norm": 26.862491847754576, "learning_rate": 4.997539224561225e-07, "logits/chosen": 9.815896987915039, "logits/rejected": 8.910368919372559, "logps/chosen": -7.234807014465332, "logps/rejected": -10.03347110748291, "loss": 0.4671, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17896637320518494, "rewards/margins": 0.49002087116241455, "rewards/rejected": -0.3110544979572296, "step": 133 }, { "epoch": 2.2711864406779663, "grad_norm": 25.040024723467038, "learning_rate": 4.99720024768488e-07, "logits/chosen": 11.472432136535645, "logits/rejected": 12.454916000366211, "logps/chosen": -11.175409317016602, "logps/rejected": -14.160651206970215, "loss": 0.4648, "rewards/accuracies": 0.875, "rewards/chosen": 0.15106943249702454, "rewards/margins": 0.8142722845077515, "rewards/rejected": -0.6632028818130493, "step": 134 }, { "epoch": 2.288135593220339, "grad_norm": 28.03197887833454, "learning_rate": 4.996839418175918e-07, "logits/chosen": 13.856090545654297, "logits/rejected": 13.056228637695312, "logps/chosen": -9.581791877746582, "logps/rejected": -19.141881942749023, "loss": 0.4768, "rewards/accuracies": 0.75, "rewards/chosen": 0.15028533339500427, "rewards/margins": 1.267728328704834, "rewards/rejected": -1.1174428462982178, "step": 135 }, { "epoch": 2.305084745762712, "grad_norm": 24.660609353156655, "learning_rate": 4.996456739191904e-07, "logits/chosen": 12.02409839630127, "logits/rejected": 11.062095642089844, "logps/chosen": -9.458813667297363, "logps/rejected": -14.94155216217041, "loss": 0.4563, "rewards/accuracies": 0.75, "rewards/chosen": 0.08914165943861008, "rewards/margins": 0.9309415817260742, "rewards/rejected": -0.8417999148368835, "step": 136 }, { "epoch": 2.3220338983050848, "grad_norm": 26.028218734293073, "learning_rate": 4.996052214081608e-07, "logits/chosen": 9.56547737121582, "logits/rejected": 12.38303279876709, "logps/chosen": -9.920650482177734, "logps/rejected": -13.999842643737793, "loss": 0.5123, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08545555174350739, "rewards/margins": 0.8639965057373047, "rewards/rejected": -0.7785409688949585, "step": 137 }, { "epoch": 2.3389830508474576, "grad_norm": 25.90341742150353, "learning_rate": 4.995625846384966e-07, "logits/chosen": 12.650365829467773, "logits/rejected": 12.255863189697266, "logps/chosen": -10.3493070602417, "logps/rejected": -11.783307075500488, "loss": 0.4901, "rewards/accuracies": 0.5625, "rewards/chosen": 0.14267519116401672, "rewards/margins": 0.39001455903053284, "rewards/rejected": -0.2473393678665161, "step": 138 }, { "epoch": 2.3559322033898304, "grad_norm": 28.40837942784419, "learning_rate": 4.995177639833061e-07, "logits/chosen": 9.189220428466797, "logits/rejected": 9.674077033996582, "logps/chosen": -8.151750564575195, "logps/rejected": -12.80848503112793, "loss": 0.5066, "rewards/accuracies": 0.625, "rewards/chosen": 0.09756353497505188, "rewards/margins": 0.5721523761749268, "rewards/rejected": -0.4745888411998749, "step": 139 }, { "epoch": 2.3728813559322033, "grad_norm": 26.068404683176787, "learning_rate": 4.994707598348084e-07, "logits/chosen": 11.096114158630371, "logits/rejected": 9.946800231933594, "logps/chosen": -9.776742935180664, "logps/rejected": -19.20057487487793, "loss": 0.5062, "rewards/accuracies": 0.75, "rewards/chosen": 0.0900912657380104, "rewards/margins": 0.896281361579895, "rewards/rejected": -0.806190013885498, "step": 140 }, { "epoch": 2.389830508474576, "grad_norm": 26.279589404349043, "learning_rate": 4.994215726043297e-07, "logits/chosen": 13.786352157592773, "logits/rejected": 12.066240310668945, "logps/chosen": -10.624284744262695, "logps/rejected": -15.999090194702148, "loss": 0.4801, "rewards/accuracies": 0.75, "rewards/chosen": -0.11691068112850189, "rewards/margins": 0.8987638354301453, "rewards/rejected": -1.0156744718551636, "step": 141 }, { "epoch": 2.406779661016949, "grad_norm": 25.385659931811325, "learning_rate": 4.993702027223003e-07, "logits/chosen": 11.07485294342041, "logits/rejected": 12.823025703430176, "logps/chosen": -9.523748397827148, "logps/rejected": -19.099346160888672, "loss": 0.4461, "rewards/accuracies": 1.0, "rewards/chosen": 0.11692683398723602, "rewards/margins": 1.9571490287780762, "rewards/rejected": -1.8402220010757446, "step": 142 }, { "epoch": 2.423728813559322, "grad_norm": 28.17104878053314, "learning_rate": 4.993166506382505e-07, "logits/chosen": 12.44686508178711, "logits/rejected": 13.07184886932373, "logps/chosen": -7.501523971557617, "logps/rejected": -12.960819244384766, "loss": 0.5077, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08402667939662933, "rewards/margins": 1.0415351390838623, "rewards/rejected": -0.9575084447860718, "step": 143 }, { "epoch": 2.440677966101695, "grad_norm": 25.091201340296504, "learning_rate": 4.992609168208068e-07, "logits/chosen": 5.851665019989014, "logits/rejected": 9.226675033569336, "logps/chosen": -13.629108428955078, "logps/rejected": -12.965569496154785, "loss": 0.5181, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13420507311820984, "rewards/margins": 0.6294023990631104, "rewards/rejected": -0.49519726634025574, "step": 144 }, { "epoch": 2.457627118644068, "grad_norm": 23.73455464805867, "learning_rate": 4.992030017576875e-07, "logits/chosen": 11.717859268188477, "logits/rejected": 14.757806777954102, "logps/chosen": -14.967961311340332, "logps/rejected": -22.604053497314453, "loss": 0.4534, "rewards/accuracies": 0.75, "rewards/chosen": 0.0003174692392349243, "rewards/margins": 1.5173003673553467, "rewards/rejected": -1.5169827938079834, "step": 145 }, { "epoch": 2.4745762711864407, "grad_norm": 24.58533088616242, "learning_rate": 4.991429059556989e-07, "logits/chosen": 11.908763885498047, "logits/rejected": 12.790480613708496, "logps/chosen": -12.356618881225586, "logps/rejected": -15.337615966796875, "loss": 0.4701, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03432869166135788, "rewards/margins": 1.1666860580444336, "rewards/rejected": -1.201014757156372, "step": 146 }, { "epoch": 2.4915254237288136, "grad_norm": 26.21401018621895, "learning_rate": 4.990806299407305e-07, "logits/chosen": 10.173805236816406, "logits/rejected": 9.579325675964355, "logps/chosen": -9.430241584777832, "logps/rejected": -11.20843505859375, "loss": 0.5049, "rewards/accuracies": 0.4375, "rewards/chosen": 0.10169175267219543, "rewards/margins": 0.23200133442878723, "rewards/rejected": -0.1303095817565918, "step": 147 }, { "epoch": 2.5084745762711864, "grad_norm": 25.415342669771636, "learning_rate": 4.990161742577506e-07, "logits/chosen": 9.909467697143555, "logits/rejected": 10.56566047668457, "logps/chosen": -8.367213249206543, "logps/rejected": -14.22795581817627, "loss": 0.4641, "rewards/accuracies": 0.875, "rewards/chosen": 0.1012047678232193, "rewards/margins": 1.2070566415786743, "rewards/rejected": -1.1058518886566162, "step": 148 }, { "epoch": 2.5254237288135593, "grad_norm": 23.40734112242746, "learning_rate": 4.989495394708015e-07, "logits/chosen": 9.81765365600586, "logits/rejected": 13.320455551147461, "logps/chosen": -15.109245300292969, "logps/rejected": -17.49165916442871, "loss": 0.4431, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2697944641113281, "rewards/margins": 1.2794241905212402, "rewards/rejected": -1.0096296072006226, "step": 149 }, { "epoch": 2.542372881355932, "grad_norm": 24.624713192478026, "learning_rate": 4.988807261629942e-07, "logits/chosen": 10.242950439453125, "logits/rejected": 12.858686447143555, "logps/chosen": -10.181503295898438, "logps/rejected": -11.994699478149414, "loss": 0.4411, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02304082363843918, "rewards/margins": 1.0250604152679443, "rewards/rejected": -1.0020196437835693, "step": 150 }, { "epoch": 2.559322033898305, "grad_norm": 24.990088355481344, "learning_rate": 4.988097349365039e-07, "logits/chosen": 7.55122184753418, "logits/rejected": 9.793380737304688, "logps/chosen": -11.03127384185791, "logps/rejected": -17.439645767211914, "loss": 0.4557, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16880398988723755, "rewards/margins": 0.9546310901641846, "rewards/rejected": -0.7858270406723022, "step": 151 }, { "epoch": 2.576271186440678, "grad_norm": 24.98304302249675, "learning_rate": 4.987365664125646e-07, "logits/chosen": 9.416153907775879, "logits/rejected": 10.089354515075684, "logps/chosen": -10.366312026977539, "logps/rejected": -12.546512603759766, "loss": 0.4712, "rewards/accuracies": 0.75, "rewards/chosen": 0.005661562085151672, "rewards/margins": 0.5207768082618713, "rewards/rejected": -0.5151152610778809, "step": 152 }, { "epoch": 2.593220338983051, "grad_norm": 28.025863819113706, "learning_rate": 4.986612212314632e-07, "logits/chosen": 12.203873634338379, "logits/rejected": 10.61597728729248, "logps/chosen": -10.178508758544922, "logps/rejected": -15.887847900390625, "loss": 0.5206, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24399825930595398, "rewards/margins": 0.9938374161720276, "rewards/rejected": -0.749839186668396, "step": 153 }, { "epoch": 2.610169491525424, "grad_norm": 25.025894346113112, "learning_rate": 4.985837000525343e-07, "logits/chosen": 12.996820449829102, "logits/rejected": 11.24425220489502, "logps/chosen": -7.2731475830078125, "logps/rejected": -9.428850173950195, "loss": 0.5125, "rewards/accuracies": 0.75, "rewards/chosen": 0.09597513824701309, "rewards/margins": 0.3146960139274597, "rewards/rejected": -0.21872088313102722, "step": 154 }, { "epoch": 2.6271186440677967, "grad_norm": 23.275619582693544, "learning_rate": 4.985040035541542e-07, "logits/chosen": 13.19137191772461, "logits/rejected": 11.988751411437988, "logps/chosen": -9.661288261413574, "logps/rejected": -16.59218406677246, "loss": 0.4505, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06454382091760635, "rewards/margins": 1.3431710004806519, "rewards/rejected": -1.2786271572113037, "step": 155 }, { "epoch": 2.6440677966101696, "grad_norm": 24.401688438016123, "learning_rate": 4.984221324337356e-07, "logits/chosen": 12.422380447387695, "logits/rejected": 11.99139404296875, "logps/chosen": -7.512474536895752, "logps/rejected": -15.746870040893555, "loss": 0.4729, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12945565581321716, "rewards/margins": 1.181099534034729, "rewards/rejected": -1.0516438484191895, "step": 156 }, { "epoch": 2.6610169491525424, "grad_norm": 24.728783919464327, "learning_rate": 4.983380874077204e-07, "logits/chosen": 9.292040824890137, "logits/rejected": 11.206062316894531, "logps/chosen": -8.320369720458984, "logps/rejected": -9.853104591369629, "loss": 0.5238, "rewards/accuracies": 0.875, "rewards/chosen": 0.03576832637190819, "rewards/margins": 0.6088409423828125, "rewards/rejected": -0.5730725526809692, "step": 157 }, { "epoch": 2.6779661016949152, "grad_norm": 25.05007863888729, "learning_rate": 4.982518692115743e-07, "logits/chosen": 9.616560935974121, "logits/rejected": 10.010045051574707, "logps/chosen": -9.049758911132812, "logps/rejected": -12.729308128356934, "loss": 0.4896, "rewards/accuracies": 0.8125, "rewards/chosen": 0.08253885060548782, "rewards/margins": 0.7497320175170898, "rewards/rejected": -0.6671931743621826, "step": 158 }, { "epoch": 2.694915254237288, "grad_norm": 25.318700331957388, "learning_rate": 4.981634785997801e-07, "logits/chosen": 10.468634605407715, "logits/rejected": 10.967134475708008, "logps/chosen": -9.911856651306152, "logps/rejected": -12.62133502960205, "loss": 0.4631, "rewards/accuracies": 0.875, "rewards/chosen": 0.09426818788051605, "rewards/margins": 0.9589300751686096, "rewards/rejected": -0.8646619319915771, "step": 159 }, { "epoch": 2.711864406779661, "grad_norm": 23.73451475971574, "learning_rate": 4.980729163458311e-07, "logits/chosen": 7.005346775054932, "logits/rejected": 8.92467212677002, "logps/chosen": -9.383186340332031, "logps/rejected": -11.193894386291504, "loss": 0.4974, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19052043557167053, "rewards/margins": 0.8246796727180481, "rewards/rejected": -0.6341591477394104, "step": 160 }, { "epoch": 2.7288135593220337, "grad_norm": 23.8047083086088, "learning_rate": 4.979801832422243e-07, "logits/chosen": 12.244979858398438, "logits/rejected": 12.103124618530273, "logps/chosen": -6.470426559448242, "logps/rejected": -9.479026794433594, "loss": 0.4738, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04870966449379921, "rewards/margins": 0.3689252734184265, "rewards/rejected": -0.3202156126499176, "step": 161 }, { "epoch": 2.7457627118644066, "grad_norm": 23.564234251226335, "learning_rate": 4.978852801004533e-07, "logits/chosen": 9.73720932006836, "logits/rejected": 8.43739128112793, "logps/chosen": -10.515070915222168, "logps/rejected": -13.127669334411621, "loss": 0.4756, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1742311269044876, "rewards/margins": 0.537544310092926, "rewards/rejected": -0.36331313848495483, "step": 162 }, { "epoch": 2.7627118644067794, "grad_norm": 25.59634193506336, "learning_rate": 4.977882077510018e-07, "logits/chosen": 9.140570640563965, "logits/rejected": 11.064943313598633, "logps/chosen": -7.318413257598877, "logps/rejected": -16.608783721923828, "loss": 0.4596, "rewards/accuracies": 0.875, "rewards/chosen": 0.0647045448422432, "rewards/margins": 1.8001742362976074, "rewards/rejected": -1.7354698181152344, "step": 163 }, { "epoch": 2.7796610169491527, "grad_norm": 23.29024669512435, "learning_rate": 4.976889670433355e-07, "logits/chosen": 7.721864700317383, "logits/rejected": 11.411937713623047, "logps/chosen": -13.955901145935059, "logps/rejected": -17.241910934448242, "loss": 0.4214, "rewards/accuracies": 0.625, "rewards/chosen": 0.09154816716909409, "rewards/margins": 1.4261943101882935, "rewards/rejected": -1.334646224975586, "step": 164 }, { "epoch": 2.7966101694915255, "grad_norm": 24.3882056521309, "learning_rate": 4.975875588458953e-07, "logits/chosen": 9.531468391418457, "logits/rejected": 9.778902053833008, "logps/chosen": -13.27505111694336, "logps/rejected": -13.49073314666748, "loss": 0.5075, "rewards/accuracies": 0.625, "rewards/chosen": -0.06451751291751862, "rewards/margins": 0.2705305218696594, "rewards/rejected": -0.33504801988601685, "step": 165 }, { "epoch": 2.8135593220338984, "grad_norm": 23.673522300640947, "learning_rate": 4.974839840460894e-07, "logits/chosen": 12.992244720458984, "logits/rejected": 12.923137664794922, "logps/chosen": -5.916741371154785, "logps/rejected": -12.199560165405273, "loss": 0.4475, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09810908138751984, "rewards/margins": 1.2945395708084106, "rewards/rejected": -1.1964305639266968, "step": 166 }, { "epoch": 2.830508474576271, "grad_norm": 26.17099733938422, "learning_rate": 4.973782435502858e-07, "logits/chosen": 11.463674545288086, "logits/rejected": 11.526470184326172, "logps/chosen": -11.625694274902344, "logps/rejected": -18.08144760131836, "loss": 0.4692, "rewards/accuracies": 0.875, "rewards/chosen": 0.0962575376033783, "rewards/margins": 1.393619418144226, "rewards/rejected": -1.2973618507385254, "step": 167 }, { "epoch": 2.847457627118644, "grad_norm": 23.375003008104915, "learning_rate": 4.97270338283804e-07, "logits/chosen": 6.848824501037598, "logits/rejected": 6.401797294616699, "logps/chosen": -7.540995121002197, "logps/rejected": -10.878442764282227, "loss": 0.4503, "rewards/accuracies": 0.875, "rewards/chosen": 0.08859586715698242, "rewards/margins": 0.7138924598693848, "rewards/rejected": -0.6252965927124023, "step": 168 }, { "epoch": 2.864406779661017, "grad_norm": 24.319685449672303, "learning_rate": 4.97160269190907e-07, "logits/chosen": 11.042668342590332, "logits/rejected": 12.226555824279785, "logps/chosen": -9.243000030517578, "logps/rejected": -10.139057159423828, "loss": 0.4639, "rewards/accuracies": 0.875, "rewards/chosen": 0.1013827919960022, "rewards/margins": 0.4762725234031677, "rewards/rejected": -0.3748897314071655, "step": 169 }, { "epoch": 2.8813559322033897, "grad_norm": 23.719800999038476, "learning_rate": 4.970480372347933e-07, "logits/chosen": 5.58452844619751, "logits/rejected": 5.292469024658203, "logps/chosen": -8.558345794677734, "logps/rejected": -11.133590698242188, "loss": 0.4829, "rewards/accuracies": 0.8125, "rewards/chosen": 0.12931324541568756, "rewards/margins": 0.7034010887145996, "rewards/rejected": -0.5740878582000732, "step": 170 }, { "epoch": 2.898305084745763, "grad_norm": 26.325102228250078, "learning_rate": 4.969336433975886e-07, "logits/chosen": 10.18309211730957, "logits/rejected": 9.86977767944336, "logps/chosen": -9.139179229736328, "logps/rejected": -16.562314987182617, "loss": 0.512, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11633327603340149, "rewards/margins": 0.8496271371841431, "rewards/rejected": -0.733293890953064, "step": 171 }, { "epoch": 2.915254237288136, "grad_norm": 23.30874461173125, "learning_rate": 4.968170886803361e-07, "logits/chosen": 6.269238471984863, "logits/rejected": 5.880119323730469, "logps/chosen": -11.019979476928711, "logps/rejected": -14.050888061523438, "loss": 0.4454, "rewards/accuracies": 1.0, "rewards/chosen": 0.15095578134059906, "rewards/margins": 0.910696804523468, "rewards/rejected": -0.759740948677063, "step": 172 }, { "epoch": 2.9322033898305087, "grad_norm": 26.260317853292694, "learning_rate": 4.966983741029893e-07, "logits/chosen": 9.132369995117188, "logits/rejected": 7.901233673095703, "logps/chosen": -9.164523124694824, "logps/rejected": -15.087239265441895, "loss": 0.446, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06711259484291077, "rewards/margins": 0.5440139174461365, "rewards/rejected": -0.4769013524055481, "step": 173 }, { "epoch": 2.9491525423728815, "grad_norm": 25.432426091848676, "learning_rate": 4.965775007044019e-07, "logits/chosen": 7.7744951248168945, "logits/rejected": 12.220438003540039, "logps/chosen": -12.57607650756836, "logps/rejected": -19.119516372680664, "loss": 0.4158, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03255147486925125, "rewards/margins": 1.7571169137954712, "rewards/rejected": -1.7896684408187866, "step": 174 }, { "epoch": 2.9661016949152543, "grad_norm": 24.66287767238989, "learning_rate": 4.964544695423193e-07, "logits/chosen": 9.675177574157715, "logits/rejected": 10.935977935791016, "logps/chosen": -8.286613464355469, "logps/rejected": -9.868494033813477, "loss": 0.5054, "rewards/accuracies": 0.875, "rewards/chosen": 0.0919422060251236, "rewards/margins": 0.8523497581481934, "rewards/rejected": -0.760407567024231, "step": 175 }, { "epoch": 2.983050847457627, "grad_norm": 24.155134143035227, "learning_rate": 4.963292816933691e-07, "logits/chosen": 8.695963859558105, "logits/rejected": 8.458662033081055, "logps/chosen": -14.033685684204102, "logps/rejected": -18.679027557373047, "loss": 0.4945, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17841431498527527, "rewards/margins": 1.1647181510925293, "rewards/rejected": -0.9863038063049316, "step": 176 }, { "epoch": 3.0, "grad_norm": 24.711170078854128, "learning_rate": 4.96201938253052e-07, "logits/chosen": 10.343132019042969, "logits/rejected": 8.42280101776123, "logps/chosen": -9.407011032104492, "logps/rejected": -15.08864974975586, "loss": 0.5274, "rewards/accuracies": 0.875, "rewards/chosen": 0.08468975871801376, "rewards/margins": 0.7279720306396484, "rewards/rejected": -0.6432822942733765, "step": 177 }, { "epoch": 3.016949152542373, "grad_norm": 23.669291103097716, "learning_rate": 4.960724403357314e-07, "logits/chosen": 9.52169132232666, "logits/rejected": 9.99760913848877, "logps/chosen": -9.479093551635742, "logps/rejected": -11.957143783569336, "loss": 0.4462, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0010063312947750092, "rewards/margins": 0.8099563717842102, "rewards/rejected": -0.8089500665664673, "step": 178 }, { "epoch": 3.0338983050847457, "grad_norm": 22.62026751615169, "learning_rate": 4.959407890746248e-07, "logits/chosen": 7.577289581298828, "logits/rejected": 7.524887561798096, "logps/chosen": -8.504217147827148, "logps/rejected": -11.792627334594727, "loss": 0.4502, "rewards/accuracies": 1.0, "rewards/chosen": 0.2125588059425354, "rewards/margins": 1.011501669883728, "rewards/rejected": -0.7989429235458374, "step": 179 }, { "epoch": 3.0508474576271185, "grad_norm": 20.80462624892399, "learning_rate": 4.958069856217929e-07, "logits/chosen": 8.124763488769531, "logits/rejected": 8.247533798217773, "logps/chosen": -8.557960510253906, "logps/rejected": -12.606611251831055, "loss": 0.4059, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12852689623832703, "rewards/margins": 1.2782764434814453, "rewards/rejected": -1.149749517440796, "step": 180 }, { "epoch": 3.0677966101694913, "grad_norm": 21.12610029770411, "learning_rate": 4.956710311481302e-07, "logits/chosen": 9.805615425109863, "logits/rejected": 10.935819625854492, "logps/chosen": -8.481396675109863, "logps/rejected": -19.484060287475586, "loss": 0.4014, "rewards/accuracies": 0.9375, "rewards/chosen": 0.007218081504106522, "rewards/margins": 1.4923968315124512, "rewards/rejected": -1.4851785898208618, "step": 181 }, { "epoch": 3.084745762711864, "grad_norm": 21.84840584181411, "learning_rate": 4.955329268433542e-07, "logits/chosen": 8.906585693359375, "logits/rejected": 7.750245571136475, "logps/chosen": -11.109460830688477, "logps/rejected": -12.152922630310059, "loss": 0.4288, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04717100411653519, "rewards/margins": 1.006048321723938, "rewards/rejected": -0.9588773846626282, "step": 182 }, { "epoch": 3.1016949152542375, "grad_norm": 23.956931308245203, "learning_rate": 4.953926739159956e-07, "logits/chosen": 11.974356651306152, "logits/rejected": 13.56727409362793, "logps/chosen": -9.361337661743164, "logps/rejected": -15.33359146118164, "loss": 0.4187, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0379180982708931, "rewards/margins": 1.3598583936691284, "rewards/rejected": -1.3219404220581055, "step": 183 }, { "epoch": 3.1186440677966103, "grad_norm": 27.743137262738262, "learning_rate": 4.952502735933869e-07, "logits/chosen": 8.663519859313965, "logits/rejected": 8.262752532958984, "logps/chosen": -11.083414077758789, "logps/rejected": -18.736698150634766, "loss": 0.4226, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21909113228321075, "rewards/margins": 1.2035876512527466, "rewards/rejected": -0.9844965934753418, "step": 184 }, { "epoch": 3.135593220338983, "grad_norm": 22.371627214758053, "learning_rate": 4.951057271216525e-07, "logits/chosen": 11.679509162902832, "logits/rejected": 8.517683029174805, "logps/chosen": -8.450439453125, "logps/rejected": -15.144365310668945, "loss": 0.4182, "rewards/accuracies": 0.875, "rewards/chosen": 0.21107473969459534, "rewards/margins": 1.0494379997253418, "rewards/rejected": -0.8383632302284241, "step": 185 }, { "epoch": 3.152542372881356, "grad_norm": 22.7306705476798, "learning_rate": 4.949590357656974e-07, "logits/chosen": 12.699131965637207, "logits/rejected": 10.048919677734375, "logps/chosen": -10.202484130859375, "logps/rejected": -18.82345199584961, "loss": 0.3911, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15570127964019775, "rewards/margins": 1.4168462753295898, "rewards/rejected": -1.2611451148986816, "step": 186 }, { "epoch": 3.169491525423729, "grad_norm": 23.344069957384136, "learning_rate": 4.948102008091962e-07, "logits/chosen": 12.0389986038208, "logits/rejected": 12.07477855682373, "logps/chosen": -9.893170356750488, "logps/rejected": -17.06765365600586, "loss": 0.3773, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21715691685676575, "rewards/margins": 1.2834587097167969, "rewards/rejected": -1.0663018226623535, "step": 187 }, { "epoch": 3.1864406779661016, "grad_norm": 21.858930833074545, "learning_rate": 4.946592235545815e-07, "logits/chosen": 10.424958229064941, "logits/rejected": 10.42190170288086, "logps/chosen": -14.09011459350586, "logps/rejected": -17.815031051635742, "loss": 0.4075, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21966154873371124, "rewards/margins": 1.0660887956619263, "rewards/rejected": -0.8464272618293762, "step": 188 }, { "epoch": 3.2033898305084745, "grad_norm": 22.597248903481418, "learning_rate": 4.945061053230333e-07, "logits/chosen": 10.407960891723633, "logits/rejected": 7.39249324798584, "logps/chosen": -10.86440372467041, "logps/rejected": -25.083696365356445, "loss": 0.4143, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04429125040769577, "rewards/margins": 2.1463186740875244, "rewards/rejected": -2.102027416229248, "step": 189 }, { "epoch": 3.2203389830508473, "grad_norm": 20.70136919634923, "learning_rate": 4.943508474544666e-07, "logits/chosen": 9.571917533874512, "logits/rejected": 11.19800090789795, "logps/chosen": -7.767718315124512, "logps/rejected": -14.589805603027344, "loss": 0.3856, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17417725920677185, "rewards/margins": 1.5476055145263672, "rewards/rejected": -1.3734283447265625, "step": 190 }, { "epoch": 3.23728813559322, "grad_norm": 22.135267167333776, "learning_rate": 4.941934513075204e-07, "logits/chosen": 4.429686069488525, "logits/rejected": 9.858856201171875, "logps/chosen": -15.572264671325684, "logps/rejected": -16.965652465820312, "loss": 0.4091, "rewards/accuracies": 1.0, "rewards/chosen": 0.21184024214744568, "rewards/margins": 1.4814664125442505, "rewards/rejected": -1.269626259803772, "step": 191 }, { "epoch": 3.2542372881355934, "grad_norm": 21.499041506406705, "learning_rate": 4.94033918259545e-07, "logits/chosen": 13.0203275680542, "logits/rejected": 11.126030921936035, "logps/chosen": -9.320621490478516, "logps/rejected": -12.93569278717041, "loss": 0.4398, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15274274349212646, "rewards/margins": 1.1112730503082275, "rewards/rejected": -0.9585303068161011, "step": 192 }, { "epoch": 3.2711864406779663, "grad_norm": 20.87416005163883, "learning_rate": 4.938722497065909e-07, "logits/chosen": 8.742778778076172, "logits/rejected": 11.352007865905762, "logps/chosen": -10.20619010925293, "logps/rejected": -11.195541381835938, "loss": 0.3891, "rewards/accuracies": 1.0, "rewards/chosen": 0.18298028409481049, "rewards/margins": 1.1817280054092407, "rewards/rejected": -0.9987477660179138, "step": 193 }, { "epoch": 3.288135593220339, "grad_norm": 22.195177566377918, "learning_rate": 4.937084470633958e-07, "logits/chosen": 8.346685409545898, "logits/rejected": 12.437751770019531, "logps/chosen": -10.978531837463379, "logps/rejected": -16.579553604125977, "loss": 0.3915, "rewards/accuracies": 0.875, "rewards/chosen": 0.1756804883480072, "rewards/margins": 1.4822646379470825, "rewards/rejected": -1.306584119796753, "step": 194 }, { "epoch": 3.305084745762712, "grad_norm": 21.792253504042495, "learning_rate": 4.935425117633726e-07, "logits/chosen": 9.004284858703613, "logits/rejected": 9.3473482131958, "logps/chosen": -9.778764724731445, "logps/rejected": -11.804350852966309, "loss": 0.4259, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18937112390995026, "rewards/margins": 0.568367063999176, "rewards/rejected": -0.37899595499038696, "step": 195 }, { "epoch": 3.3220338983050848, "grad_norm": 22.229138483410182, "learning_rate": 4.933744452585966e-07, "logits/chosen": 7.870884895324707, "logits/rejected": 10.313871383666992, "logps/chosen": -9.134467124938965, "logps/rejected": -12.303489685058594, "loss": 0.4394, "rewards/accuracies": 0.875, "rewards/chosen": 0.12609566748142242, "rewards/margins": 1.0664055347442627, "rewards/rejected": -0.9403098821640015, "step": 196 }, { "epoch": 3.3389830508474576, "grad_norm": 23.149619465198537, "learning_rate": 4.932042490197933e-07, "logits/chosen": 3.4697465896606445, "logits/rejected": 6.568927764892578, "logps/chosen": -14.070352554321289, "logps/rejected": -14.763322830200195, "loss": 0.4446, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24967879056930542, "rewards/margins": 1.045470952987671, "rewards/rejected": -0.7957921624183655, "step": 197 }, { "epoch": 3.3559322033898304, "grad_norm": 22.984979929041984, "learning_rate": 4.930319245363248e-07, "logits/chosen": 9.373580932617188, "logits/rejected": 9.523336410522461, "logps/chosen": -5.632663726806641, "logps/rejected": -13.255867004394531, "loss": 0.4067, "rewards/accuracies": 0.875, "rewards/chosen": 0.13241903483867645, "rewards/margins": 1.1132423877716064, "rewards/rejected": -0.9808233380317688, "step": 198 }, { "epoch": 3.3728813559322033, "grad_norm": 20.69437622287786, "learning_rate": 4.928574733161775e-07, "logits/chosen": 7.692915439605713, "logits/rejected": 7.09061861038208, "logps/chosen": -7.571112632751465, "logps/rejected": -13.790243148803711, "loss": 0.4096, "rewards/accuracies": 0.875, "rewards/chosen": 0.20739738643169403, "rewards/margins": 1.091109037399292, "rewards/rejected": -0.8837117552757263, "step": 199 }, { "epoch": 3.389830508474576, "grad_norm": 19.60119003237251, "learning_rate": 4.926808968859483e-07, "logits/chosen": 5.386170387268066, "logits/rejected": 7.581046104431152, "logps/chosen": -8.80381965637207, "logps/rejected": -14.835564613342285, "loss": 0.3412, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20112082362174988, "rewards/margins": 1.5490384101867676, "rewards/rejected": -1.3479175567626953, "step": 200 }, { "epoch": 3.406779661016949, "grad_norm": 22.208560067748735, "learning_rate": 4.925021967908316e-07, "logits/chosen": 7.9428558349609375, "logits/rejected": 7.730855464935303, "logps/chosen": -7.504685878753662, "logps/rejected": -10.529887199401855, "loss": 0.414, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12443725764751434, "rewards/margins": 0.5505168437957764, "rewards/rejected": -0.4260796010494232, "step": 201 }, { "epoch": 3.423728813559322, "grad_norm": 20.495015232824937, "learning_rate": 4.923213745946059e-07, "logits/chosen": 9.569754600524902, "logits/rejected": 10.065143585205078, "logps/chosen": -6.785487174987793, "logps/rejected": -17.289140701293945, "loss": 0.3793, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2570599913597107, "rewards/margins": 1.8961150646209717, "rewards/rejected": -1.6390550136566162, "step": 202 }, { "epoch": 3.440677966101695, "grad_norm": 21.481240539670026, "learning_rate": 4.921384318796193e-07, "logits/chosen": 10.811732292175293, "logits/rejected": 13.857377052307129, "logps/chosen": -11.36143684387207, "logps/rejected": -14.535248756408691, "loss": 0.4095, "rewards/accuracies": 0.875, "rewards/chosen": 0.20432500541210175, "rewards/margins": 1.145780086517334, "rewards/rejected": -0.9414551258087158, "step": 203 }, { "epoch": 3.457627118644068, "grad_norm": 19.51511374436272, "learning_rate": 4.919533702467771e-07, "logits/chosen": 8.838293075561523, "logits/rejected": 11.331209182739258, "logps/chosen": -9.482587814331055, "logps/rejected": -17.566062927246094, "loss": 0.3544, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11788967251777649, "rewards/margins": 1.833619475364685, "rewards/rejected": -1.7157299518585205, "step": 204 }, { "epoch": 3.4745762711864407, "grad_norm": 20.794868312344995, "learning_rate": 4.91766191315526e-07, "logits/chosen": 7.887833595275879, "logits/rejected": 10.80614948272705, "logps/chosen": -12.706953048706055, "logps/rejected": -16.633255004882812, "loss": 0.3632, "rewards/accuracies": 0.9375, "rewards/chosen": 0.137722447514534, "rewards/margins": 1.4804778099060059, "rewards/rejected": -1.3427555561065674, "step": 205 }, { "epoch": 3.4915254237288136, "grad_norm": 22.111290792251523, "learning_rate": 4.915768967238417e-07, "logits/chosen": 6.357418060302734, "logits/rejected": 7.8472113609313965, "logps/chosen": -10.277403831481934, "logps/rejected": -10.781571388244629, "loss": 0.4027, "rewards/accuracies": 0.9375, "rewards/chosen": 0.32281461358070374, "rewards/margins": 0.8708096742630005, "rewards/rejected": -0.5479950904846191, "step": 206 }, { "epoch": 3.5084745762711864, "grad_norm": 22.2878678999551, "learning_rate": 4.913854881282131e-07, "logits/chosen": 10.417675018310547, "logits/rejected": 10.893251419067383, "logps/chosen": -9.740762710571289, "logps/rejected": -15.510869979858398, "loss": 0.3839, "rewards/accuracies": 0.8125, "rewards/chosen": 0.055107712745666504, "rewards/margins": 1.458693504333496, "rewards/rejected": -1.4035859107971191, "step": 207 }, { "epoch": 3.5254237288135593, "grad_norm": 20.99161748301627, "learning_rate": 4.91191967203629e-07, "logits/chosen": 6.382161617279053, "logits/rejected": 4.985961437225342, "logps/chosen": -7.836081027984619, "logps/rejected": -13.391523361206055, "loss": 0.4012, "rewards/accuracies": 1.0, "rewards/chosen": 0.1940561830997467, "rewards/margins": 1.115774154663086, "rewards/rejected": -0.9217178821563721, "step": 208 }, { "epoch": 3.542372881355932, "grad_norm": 20.40271917104514, "learning_rate": 4.909963356435624e-07, "logits/chosen": 9.568527221679688, "logits/rejected": 10.793418884277344, "logps/chosen": -7.237433433532715, "logps/rejected": -18.871238708496094, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 0.11054301261901855, "rewards/margins": 2.871230125427246, "rewards/rejected": -2.7606871128082275, "step": 209 }, { "epoch": 3.559322033898305, "grad_norm": 21.549650672810273, "learning_rate": 4.907985951599563e-07, "logits/chosen": 7.44000768661499, "logits/rejected": 10.265032768249512, "logps/chosen": -9.132471084594727, "logps/rejected": -13.506124496459961, "loss": 0.3843, "rewards/accuracies": 0.75, "rewards/chosen": 0.16352702677249908, "rewards/margins": 0.9725521802902222, "rewards/rejected": -0.8090251684188843, "step": 210 }, { "epoch": 3.576271186440678, "grad_norm": 20.570670881015133, "learning_rate": 4.905987474832087e-07, "logits/chosen": 9.317378044128418, "logits/rejected": 5.925313472747803, "logps/chosen": -14.4925537109375, "logps/rejected": -20.435415267944336, "loss": 0.3668, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1997138261795044, "rewards/margins": 1.3157970905303955, "rewards/rejected": -1.1160831451416016, "step": 211 }, { "epoch": 3.593220338983051, "grad_norm": 20.97327386001296, "learning_rate": 4.903967943621573e-07, "logits/chosen": 6.413149833679199, "logits/rejected": 9.610268592834473, "logps/chosen": -13.387876510620117, "logps/rejected": -18.008541107177734, "loss": 0.3388, "rewards/accuracies": 1.0, "rewards/chosen": 0.2004997879266739, "rewards/margins": 2.024320363998413, "rewards/rejected": -1.8238208293914795, "step": 212 }, { "epoch": 3.610169491525424, "grad_norm": 20.265155078908585, "learning_rate": 4.901927375640642e-07, "logits/chosen": 7.389824390411377, "logits/rejected": 9.059728622436523, "logps/chosen": -8.919748306274414, "logps/rejected": -15.606636047363281, "loss": 0.3897, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26465845108032227, "rewards/margins": 1.466994285583496, "rewards/rejected": -1.2023358345031738, "step": 213 }, { "epoch": 3.6271186440677967, "grad_norm": 20.578725152301015, "learning_rate": 4.899865788746005e-07, "logits/chosen": 8.600379943847656, "logits/rejected": 10.468989372253418, "logps/chosen": -12.677905082702637, "logps/rejected": -19.546112060546875, "loss": 0.3711, "rewards/accuracies": 0.875, "rewards/chosen": 0.18351367115974426, "rewards/margins": 2.7776191234588623, "rewards/rejected": -2.5941052436828613, "step": 214 }, { "epoch": 3.6440677966101696, "grad_norm": 18.091261285176355, "learning_rate": 4.897783200978305e-07, "logits/chosen": 7.124416828155518, "logits/rejected": 10.905740737915039, "logps/chosen": -10.265155792236328, "logps/rejected": -13.548370361328125, "loss": 0.3485, "rewards/accuracies": 0.875, "rewards/chosen": 0.31245559453964233, "rewards/margins": 1.6994444131851196, "rewards/rejected": -1.386988878250122, "step": 215 }, { "epoch": 3.6610169491525424, "grad_norm": 21.908270820999903, "learning_rate": 4.895679630561963e-07, "logits/chosen": 9.098257064819336, "logits/rejected": 8.591686248779297, "logps/chosen": -8.658933639526367, "logps/rejected": -12.28408145904541, "loss": 0.3971, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11362498998641968, "rewards/margins": 1.1206563711166382, "rewards/rejected": -1.0070313215255737, "step": 216 }, { "epoch": 3.6779661016949152, "grad_norm": 22.830751729597864, "learning_rate": 4.893555095905013e-07, "logits/chosen": 5.396052837371826, "logits/rejected": 7.402496337890625, "logps/chosen": -12.782821655273438, "logps/rejected": -17.051944732666016, "loss": 0.3867, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31852054595947266, "rewards/margins": 1.6121286153793335, "rewards/rejected": -1.2936079502105713, "step": 217 }, { "epoch": 3.694915254237288, "grad_norm": 20.000263876312037, "learning_rate": 4.891409615598949e-07, "logits/chosen": 6.522953033447266, "logits/rejected": 9.26037883758545, "logps/chosen": -9.460394859313965, "logps/rejected": -14.80904769897461, "loss": 0.3662, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2533794939517975, "rewards/margins": 1.6217294931411743, "rewards/rejected": -1.3683499097824097, "step": 218 }, { "epoch": 3.711864406779661, "grad_norm": 20.593308405599913, "learning_rate": 4.889243208418549e-07, "logits/chosen": 6.3479533195495605, "logits/rejected": 5.238455295562744, "logps/chosen": -11.332074165344238, "logps/rejected": -16.5391845703125, "loss": 0.3812, "rewards/accuracies": 0.875, "rewards/chosen": 0.03672366216778755, "rewards/margins": 1.5255634784698486, "rewards/rejected": -1.488839864730835, "step": 219 }, { "epoch": 3.7288135593220337, "grad_norm": 20.145188470497885, "learning_rate": 4.88705589332173e-07, "logits/chosen": 7.08650016784668, "logits/rejected": 8.531864166259766, "logps/chosen": -7.60097074508667, "logps/rejected": -11.326334953308105, "loss": 0.3778, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3768389821052551, "rewards/margins": 1.0568028688430786, "rewards/rejected": -0.6799638271331787, "step": 220 }, { "epoch": 3.7457627118644066, "grad_norm": 24.41361721957706, "learning_rate": 4.884847689449361e-07, "logits/chosen": 9.343852043151855, "logits/rejected": 6.67693567276001, "logps/chosen": -9.532071113586426, "logps/rejected": -20.449419021606445, "loss": 0.4415, "rewards/accuracies": 1.0, "rewards/chosen": 0.2507076859474182, "rewards/margins": 1.9644031524658203, "rewards/rejected": -1.7136951684951782, "step": 221 }, { "epoch": 3.7627118644067794, "grad_norm": 20.511147442858544, "learning_rate": 4.88261861612511e-07, "logits/chosen": 6.1774139404296875, "logits/rejected": 8.693314552307129, "logps/chosen": -10.729753494262695, "logps/rejected": -13.79563045501709, "loss": 0.3302, "rewards/accuracies": 0.875, "rewards/chosen": 0.20722553133964539, "rewards/margins": 1.8044683933258057, "rewards/rejected": -1.597243070602417, "step": 222 }, { "epoch": 3.7796610169491527, "grad_norm": 21.53866158472204, "learning_rate": 4.880368692855273e-07, "logits/chosen": 2.2718300819396973, "logits/rejected": 7.136542797088623, "logps/chosen": -12.451478958129883, "logps/rejected": -18.369718551635742, "loss": 0.3836, "rewards/accuracies": 0.75, "rewards/chosen": 0.11350546777248383, "rewards/margins": 1.4630939960479736, "rewards/rejected": -1.3495887517929077, "step": 223 }, { "epoch": 3.7966101694915255, "grad_norm": 21.097920842210723, "learning_rate": 4.878097939328596e-07, "logits/chosen": 7.12879753112793, "logits/rejected": 7.652994632720947, "logps/chosen": -10.000338554382324, "logps/rejected": -11.711467742919922, "loss": 0.3724, "rewards/accuracies": 1.0, "rewards/chosen": 0.22102656960487366, "rewards/margins": 1.0897376537322998, "rewards/rejected": -0.868710994720459, "step": 224 }, { "epoch": 3.8135593220338984, "grad_norm": 20.22398030765245, "learning_rate": 4.875806375416109e-07, "logits/chosen": 8.740015029907227, "logits/rejected": 13.572087287902832, "logps/chosen": -11.432252883911133, "logps/rejected": -12.60297679901123, "loss": 0.376, "rewards/accuracies": 0.8125, "rewards/chosen": 0.20887497067451477, "rewards/margins": 1.4403488636016846, "rewards/rejected": -1.2314739227294922, "step": 225 }, { "epoch": 3.830508474576271, "grad_norm": 20.621677218225578, "learning_rate": 4.873494021170954e-07, "logits/chosen": 9.282068252563477, "logits/rejected": 10.690045356750488, "logps/chosen": -8.891980171203613, "logps/rejected": -12.23794174194336, "loss": 0.3845, "rewards/accuracies": 0.875, "rewards/chosen": 0.015214920043945312, "rewards/margins": 1.3658978939056396, "rewards/rejected": -1.3506828546524048, "step": 226 }, { "epoch": 3.847457627118644, "grad_norm": 19.999247373468872, "learning_rate": 4.871160896828199e-07, "logits/chosen": 7.256874084472656, "logits/rejected": 10.629111289978027, "logps/chosen": -11.401512145996094, "logps/rejected": -16.319957733154297, "loss": 0.3531, "rewards/accuracies": 1.0, "rewards/chosen": 0.39265120029449463, "rewards/margins": 2.058181047439575, "rewards/rejected": -1.665529727935791, "step": 227 }, { "epoch": 3.864406779661017, "grad_norm": 21.02660956059569, "learning_rate": 4.868807022804678e-07, "logits/chosen": 8.826285362243652, "logits/rejected": 7.1804375648498535, "logps/chosen": -8.303820610046387, "logps/rejected": -18.380210876464844, "loss": 0.3719, "rewards/accuracies": 1.0, "rewards/chosen": 0.27124762535095215, "rewards/margins": 1.8690426349639893, "rewards/rejected": -1.5977948904037476, "step": 228 }, { "epoch": 3.8813559322033897, "grad_norm": 21.93005411442054, "learning_rate": 4.866432419698792e-07, "logits/chosen": 3.6723451614379883, "logits/rejected": 4.766155242919922, "logps/chosen": -10.370809555053711, "logps/rejected": -13.244355201721191, "loss": 0.409, "rewards/accuracies": 0.875, "rewards/chosen": 0.05697673559188843, "rewards/margins": 1.2714791297912598, "rewards/rejected": -1.2145024538040161, "step": 229 }, { "epoch": 3.898305084745763, "grad_norm": 20.227968165373298, "learning_rate": 4.864037108290347e-07, "logits/chosen": 8.437172889709473, "logits/rejected": 7.416140556335449, "logps/chosen": -9.189136505126953, "logps/rejected": -20.316320419311523, "loss": 0.3529, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09363769739866257, "rewards/margins": 2.3706040382385254, "rewards/rejected": -2.2769663333892822, "step": 230 }, { "epoch": 3.915254237288136, "grad_norm": 21.82010790729604, "learning_rate": 4.86162110954036e-07, "logits/chosen": 2.241933822631836, "logits/rejected": 4.145862579345703, "logps/chosen": -9.749147415161133, "logps/rejected": -9.986723899841309, "loss": 0.4128, "rewards/accuracies": 0.75, "rewards/chosen": 0.24059665203094482, "rewards/margins": 0.7164955139160156, "rewards/rejected": -0.4758988916873932, "step": 231 }, { "epoch": 3.9322033898305087, "grad_norm": 19.529430603984025, "learning_rate": 4.859184444590881e-07, "logits/chosen": 7.928268909454346, "logits/rejected": 7.564013481140137, "logps/chosen": -9.2333345413208, "logps/rejected": -11.309907913208008, "loss": 0.3496, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2154204249382019, "rewards/margins": 0.6944783926010132, "rewards/rejected": -0.47905805706977844, "step": 232 }, { "epoch": 3.9491525423728815, "grad_norm": 20.910743589602404, "learning_rate": 4.856727134764809e-07, "logits/chosen": 8.220526695251465, "logits/rejected": 9.600720405578613, "logps/chosen": -6.642482757568359, "logps/rejected": -15.970149993896484, "loss": 0.3602, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06534373760223389, "rewards/margins": 1.8218185901641846, "rewards/rejected": -1.7564747333526611, "step": 233 }, { "epoch": 3.9661016949152543, "grad_norm": 20.74274651410422, "learning_rate": 4.8542492015657e-07, "logits/chosen": 10.452482223510742, "logits/rejected": 13.264217376708984, "logps/chosen": -11.146896362304688, "logps/rejected": -18.784910202026367, "loss": 0.4025, "rewards/accuracies": 0.75, "rewards/chosen": 0.20790499448776245, "rewards/margins": 2.184065341949463, "rewards/rejected": -1.9761605262756348, "step": 234 }, { "epoch": 3.983050847457627, "grad_norm": 21.513229116400385, "learning_rate": 4.851750666677583e-07, "logits/chosen": 9.31413745880127, "logits/rejected": 8.258987426757812, "logps/chosen": -8.405204772949219, "logps/rejected": -13.575927734375, "loss": 0.4065, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12339913845062256, "rewards/margins": 0.7723197937011719, "rewards/rejected": -0.6489205956459045, "step": 235 }, { "epoch": 4.0, "grad_norm": 21.694593284683215, "learning_rate": 4.849231551964771e-07, "logits/chosen": 3.4820375442504883, "logits/rejected": 8.22888469696045, "logps/chosen": -10.688333511352539, "logps/rejected": -16.323883056640625, "loss": 0.3582, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1764301061630249, "rewards/margins": 2.344557285308838, "rewards/rejected": -2.1681272983551025, "step": 236 }, { "epoch": 4.016949152542373, "grad_norm": 17.165516200116453, "learning_rate": 4.846691879471666e-07, "logits/chosen": 7.24074125289917, "logits/rejected": 10.560139656066895, "logps/chosen": -12.36288833618164, "logps/rejected": -17.59840202331543, "loss": 0.2777, "rewards/accuracies": 1.0, "rewards/chosen": 0.20638218522071838, "rewards/margins": 2.1878502368927, "rewards/rejected": -1.9814679622650146, "step": 237 }, { "epoch": 4.033898305084746, "grad_norm": 18.267129061896295, "learning_rate": 4.844131671422569e-07, "logits/chosen": 4.005539417266846, "logits/rejected": 6.256585121154785, "logps/chosen": -9.463366508483887, "logps/rejected": -15.193839073181152, "loss": 0.3557, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3415685296058655, "rewards/margins": 1.9723048210144043, "rewards/rejected": -1.630736231803894, "step": 238 }, { "epoch": 4.0508474576271185, "grad_norm": 18.230196378772558, "learning_rate": 4.841550950221485e-07, "logits/chosen": 6.972199440002441, "logits/rejected": 8.425640106201172, "logps/chosen": -12.009435653686523, "logps/rejected": -17.968767166137695, "loss": 0.3264, "rewards/accuracies": 1.0, "rewards/chosen": 0.10153679549694061, "rewards/margins": 1.6484508514404297, "rewards/rejected": -1.546913981437683, "step": 239 }, { "epoch": 4.067796610169491, "grad_norm": 17.815979438227107, "learning_rate": 4.838949738451928e-07, "logits/chosen": 7.805523872375488, "logits/rejected": 11.780006408691406, "logps/chosen": -12.637031555175781, "logps/rejected": -20.631893157958984, "loss": 0.3414, "rewards/accuracies": 0.9375, "rewards/chosen": 0.028789594769477844, "rewards/margins": 2.5881569385528564, "rewards/rejected": -2.5593671798706055, "step": 240 }, { "epoch": 4.084745762711864, "grad_norm": 16.67503277302023, "learning_rate": 4.836328058876717e-07, "logits/chosen": 4.916990756988525, "logits/rejected": 5.723315238952637, "logps/chosen": -10.50915241241455, "logps/rejected": -12.09021282196045, "loss": 0.3289, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26938343048095703, "rewards/margins": 1.0055718421936035, "rewards/rejected": -0.7361884713172913, "step": 241 }, { "epoch": 4.101694915254237, "grad_norm": 18.459637642971867, "learning_rate": 4.833685934437787e-07, "logits/chosen": 8.218949317932129, "logits/rejected": 4.173165321350098, "logps/chosen": -9.293395042419434, "logps/rejected": -15.193696022033691, "loss": 0.3194, "rewards/accuracies": 0.875, "rewards/chosen": 0.3008555769920349, "rewards/margins": 1.5985466241836548, "rewards/rejected": -1.2976911067962646, "step": 242 }, { "epoch": 4.11864406779661, "grad_norm": 16.84514978201183, "learning_rate": 4.831023388255979e-07, "logits/chosen": 8.695550918579102, "logits/rejected": 11.430813789367676, "logps/chosen": -9.363493919372559, "logps/rejected": -18.699190139770508, "loss": 0.2861, "rewards/accuracies": 0.9375, "rewards/chosen": 0.11874284595251083, "rewards/margins": 2.3261449337005615, "rewards/rejected": -2.207401990890503, "step": 243 }, { "epoch": 4.135593220338983, "grad_norm": 17.815881783852166, "learning_rate": 4.828340443630846e-07, "logits/chosen": 5.674130916595459, "logits/rejected": 6.590822219848633, "logps/chosen": -8.572710037231445, "logps/rejected": -16.258394241333008, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 0.26820075511932373, "rewards/margins": 1.8350509405136108, "rewards/rejected": -1.566850185394287, "step": 244 }, { "epoch": 4.1525423728813555, "grad_norm": 19.076762128934657, "learning_rate": 4.825637124040441e-07, "logits/chosen": 6.261494159698486, "logits/rejected": 10.208662033081055, "logps/chosen": -12.498882293701172, "logps/rejected": -19.198043823242188, "loss": 0.352, "rewards/accuracies": 1.0, "rewards/chosen": 0.23819471895694733, "rewards/margins": 2.1077005863189697, "rewards/rejected": -1.8695058822631836, "step": 245 }, { "epoch": 4.169491525423728, "grad_norm": 19.346185362099984, "learning_rate": 4.822913453141117e-07, "logits/chosen": 6.8853654861450195, "logits/rejected": 6.2803955078125, "logps/chosen": -10.658153533935547, "logps/rejected": -18.49957847595215, "loss": 0.3353, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1655520498752594, "rewards/margins": 2.306309223175049, "rewards/rejected": -2.1407570838928223, "step": 246 }, { "epoch": 4.186440677966102, "grad_norm": 18.617703119188207, "learning_rate": 4.820169454767318e-07, "logits/chosen": 8.95741081237793, "logits/rejected": 10.199117660522461, "logps/chosen": -9.932829856872559, "logps/rejected": -17.307279586791992, "loss": 0.3198, "rewards/accuracies": 0.875, "rewards/chosen": 0.33301711082458496, "rewards/margins": 1.9417872428894043, "rewards/rejected": -1.6087702512741089, "step": 247 }, { "epoch": 4.203389830508475, "grad_norm": 19.448914932645447, "learning_rate": 4.81740515293137e-07, "logits/chosen": 4.610191345214844, "logits/rejected": 6.457115650177002, "logps/chosen": -10.178339958190918, "logps/rejected": -14.412067413330078, "loss": 0.3339, "rewards/accuracies": 0.875, "rewards/chosen": 0.337657630443573, "rewards/margins": 1.3123055696487427, "rewards/rejected": -0.9746479988098145, "step": 248 }, { "epoch": 4.220338983050848, "grad_norm": 19.07405219931799, "learning_rate": 4.814620571823274e-07, "logits/chosen": 3.8473613262176514, "logits/rejected": 4.449122428894043, "logps/chosen": -13.519302368164062, "logps/rejected": -16.95071792602539, "loss": 0.3406, "rewards/accuracies": 0.875, "rewards/chosen": 0.37721583247184753, "rewards/margins": 1.67903470993042, "rewards/rejected": -1.3018189668655396, "step": 249 }, { "epoch": 4.237288135593221, "grad_norm": 16.24922812475198, "learning_rate": 4.811815735810489e-07, "logits/chosen": 7.148613929748535, "logits/rejected": 6.880765914916992, "logps/chosen": -11.212467193603516, "logps/rejected": -19.45187759399414, "loss": 0.2912, "rewards/accuracies": 1.0, "rewards/chosen": 0.3127380907535553, "rewards/margins": 2.7761077880859375, "rewards/rejected": -2.463369846343994, "step": 250 }, { "epoch": 4.254237288135593, "grad_norm": 17.553230728699816, "learning_rate": 4.808990669437724e-07, "logits/chosen": 7.376377105712891, "logits/rejected": 6.04884672164917, "logps/chosen": -9.064533233642578, "logps/rejected": -17.936084747314453, "loss": 0.2991, "rewards/accuracies": 1.0, "rewards/chosen": 0.027374181896448135, "rewards/margins": 2.2804818153381348, "rewards/rejected": -2.253107786178589, "step": 251 }, { "epoch": 4.271186440677966, "grad_norm": 17.955648025407335, "learning_rate": 4.806145397426719e-07, "logits/chosen": 9.423786163330078, "logits/rejected": 10.659478187561035, "logps/chosen": -7.087960243225098, "logps/rejected": -12.86429500579834, "loss": 0.3305, "rewards/accuracies": 1.0, "rewards/chosen": 0.2393162101507187, "rewards/margins": 1.852463960647583, "rewards/rejected": -1.6131477355957031, "step": 252 }, { "epoch": 4.288135593220339, "grad_norm": 16.31241903989762, "learning_rate": 4.803279944676032e-07, "logits/chosen": 7.690080642700195, "logits/rejected": 4.676759719848633, "logps/chosen": -7.623948097229004, "logps/rejected": -16.224029541015625, "loss": 0.2743, "rewards/accuracies": 1.0, "rewards/chosen": 0.2924440801143646, "rewards/margins": 2.0772933959960938, "rewards/rejected": -1.7848492860794067, "step": 253 }, { "epoch": 4.305084745762712, "grad_norm": 17.555016757847234, "learning_rate": 4.800394336260819e-07, "logits/chosen": 6.041158676147461, "logits/rejected": 6.416534900665283, "logps/chosen": -7.688037872314453, "logps/rejected": -17.980430603027344, "loss": 0.3153, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1410144865512848, "rewards/margins": 2.001737356185913, "rewards/rejected": -1.8607230186462402, "step": 254 }, { "epoch": 4.322033898305085, "grad_norm": 18.480066982296236, "learning_rate": 4.797488597432616e-07, "logits/chosen": 2.997715473175049, "logits/rejected": 8.167374610900879, "logps/chosen": -15.387086868286133, "logps/rejected": -14.066549301147461, "loss": 0.2997, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4183131158351898, "rewards/margins": 1.9756748676300049, "rewards/rejected": -1.5573619604110718, "step": 255 }, { "epoch": 4.338983050847458, "grad_norm": 17.471201870112832, "learning_rate": 4.794562753619117e-07, "logits/chosen": 4.013498783111572, "logits/rejected": 4.414510250091553, "logps/chosen": -8.612306594848633, "logps/rejected": -14.004947662353516, "loss": 0.3257, "rewards/accuracies": 0.875, "rewards/chosen": 0.3176363706588745, "rewards/margins": 1.533604621887207, "rewards/rejected": -1.215968132019043, "step": 256 }, { "epoch": 4.3559322033898304, "grad_norm": 19.17554175787275, "learning_rate": 4.791616830423949e-07, "logits/chosen": 4.06189489364624, "logits/rejected": 6.982814311981201, "logps/chosen": -9.344228744506836, "logps/rejected": -15.20322036743164, "loss": 0.3233, "rewards/accuracies": 0.875, "rewards/chosen": 0.228972390294075, "rewards/margins": 1.8103268146514893, "rewards/rejected": -1.5813543796539307, "step": 257 }, { "epoch": 4.372881355932203, "grad_norm": 17.10464741278365, "learning_rate": 4.788650853626456e-07, "logits/chosen": 2.260240316390991, "logits/rejected": 6.677258014678955, "logps/chosen": -9.949880599975586, "logps/rejected": -13.5770845413208, "loss": 0.3136, "rewards/accuracies": 1.0, "rewards/chosen": 0.28663721680641174, "rewards/margins": 1.7761832475662231, "rewards/rejected": -1.4895460605621338, "step": 258 }, { "epoch": 4.389830508474576, "grad_norm": 18.82388276814341, "learning_rate": 4.785664849181465e-07, "logits/chosen": 4.183163642883301, "logits/rejected": 4.152279376983643, "logps/chosen": -6.331787586212158, "logps/rejected": -11.512680053710938, "loss": 0.33, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1800474226474762, "rewards/margins": 1.1619161367416382, "rewards/rejected": -0.9818687438964844, "step": 259 }, { "epoch": 4.406779661016949, "grad_norm": 17.663331505017794, "learning_rate": 4.78265884321906e-07, "logits/chosen": 6.67874813079834, "logits/rejected": 4.668858528137207, "logps/chosen": -10.521224021911621, "logps/rejected": -19.912858963012695, "loss": 0.3094, "rewards/accuracies": 1.0, "rewards/chosen": 0.1548883020877838, "rewards/margins": 2.28428316116333, "rewards/rejected": -2.12939453125, "step": 260 }, { "epoch": 4.423728813559322, "grad_norm": 17.311775354907358, "learning_rate": 4.779632862044361e-07, "logits/chosen": 5.639666557312012, "logits/rejected": 6.38239049911499, "logps/chosen": -8.55907154083252, "logps/rejected": -18.988445281982422, "loss": 0.298, "rewards/accuracies": 1.0, "rewards/chosen": 0.22225412726402283, "rewards/margins": 2.3125150203704834, "rewards/rejected": -2.0902609825134277, "step": 261 }, { "epoch": 4.440677966101695, "grad_norm": 17.440600929155575, "learning_rate": 4.776586932137283e-07, "logits/chosen": 3.9185001850128174, "logits/rejected": 6.49030065536499, "logps/chosen": -10.537013053894043, "logps/rejected": -14.84063720703125, "loss": 0.3009, "rewards/accuracies": 0.75, "rewards/chosen": 0.13585788011550903, "rewards/margins": 1.624847412109375, "rewards/rejected": -1.4889897108078003, "step": 262 }, { "epoch": 4.4576271186440675, "grad_norm": 17.13685594779347, "learning_rate": 4.773521080152311e-07, "logits/chosen": 2.5429065227508545, "logits/rejected": 5.310009002685547, "logps/chosen": -12.58240795135498, "logps/rejected": -20.367462158203125, "loss": 0.2836, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13424983620643616, "rewards/margins": 1.8994662761688232, "rewards/rejected": -1.765216588973999, "step": 263 }, { "epoch": 4.47457627118644, "grad_norm": 17.308598065836613, "learning_rate": 4.770435332918267e-07, "logits/chosen": 4.594322204589844, "logits/rejected": 8.247635841369629, "logps/chosen": -11.638622283935547, "logps/rejected": -18.57332992553711, "loss": 0.2982, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2245895266532898, "rewards/margins": 2.225818395614624, "rewards/rejected": -2.0012288093566895, "step": 264 }, { "epoch": 4.491525423728813, "grad_norm": 16.88232961307194, "learning_rate": 4.76732971743807e-07, "logits/chosen": 6.63725471496582, "logits/rejected": 8.631773948669434, "logps/chosen": -7.823244094848633, "logps/rejected": -17.624019622802734, "loss": 0.2864, "rewards/accuracies": 1.0, "rewards/chosen": 0.2145896852016449, "rewards/margins": 3.0045228004455566, "rewards/rejected": -2.789933204650879, "step": 265 }, { "epoch": 4.508474576271187, "grad_norm": 18.037592468816126, "learning_rate": 4.7642042608885056e-07, "logits/chosen": 5.675311088562012, "logits/rejected": 4.931497573852539, "logps/chosen": -11.616440773010254, "logps/rejected": -20.281126022338867, "loss": 0.336, "rewards/accuracies": 1.0, "rewards/chosen": 0.11937360465526581, "rewards/margins": 2.490065574645996, "rewards/rejected": -2.370692014694214, "step": 266 }, { "epoch": 4.52542372881356, "grad_norm": 18.473347743493118, "learning_rate": 4.761058990619986e-07, "logits/chosen": 2.7812986373901367, "logits/rejected": 3.597932815551758, "logps/chosen": -8.612319946289062, "logps/rejected": -15.196998596191406, "loss": 0.286, "rewards/accuracies": 0.9375, "rewards/chosen": 0.41151559352874756, "rewards/margins": 1.9993846416473389, "rewards/rejected": -1.5878691673278809, "step": 267 }, { "epoch": 4.5423728813559325, "grad_norm": 16.332029360751594, "learning_rate": 4.757893934156309e-07, "logits/chosen": 7.0438232421875, "logits/rejected": 6.700860023498535, "logps/chosen": -8.056821823120117, "logps/rejected": -19.757720947265625, "loss": 0.3042, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12074195593595505, "rewards/margins": 2.930652379989624, "rewards/rejected": -2.809910535812378, "step": 268 }, { "epoch": 4.559322033898305, "grad_norm": 17.180403373586987, "learning_rate": 4.754709119194418e-07, "logits/chosen": 12.024563789367676, "logits/rejected": 11.642518997192383, "logps/chosen": -9.898269653320312, "logps/rejected": -20.960111618041992, "loss": 0.2921, "rewards/accuracies": 1.0, "rewards/chosen": 0.20630189776420593, "rewards/margins": 2.34810209274292, "rewards/rejected": -2.1418001651763916, "step": 269 }, { "epoch": 4.576271186440678, "grad_norm": 18.08908952286864, "learning_rate": 4.7515045736041615e-07, "logits/chosen": 8.689579010009766, "logits/rejected": 9.39737606048584, "logps/chosen": -6.704339504241943, "logps/rejected": -15.77403736114502, "loss": 0.3367, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3202212452888489, "rewards/margins": 2.208920955657959, "rewards/rejected": -1.8886998891830444, "step": 270 }, { "epoch": 4.593220338983051, "grad_norm": 20.731272870943574, "learning_rate": 4.748280325428048e-07, "logits/chosen": 5.354104995727539, "logits/rejected": 6.051662445068359, "logps/chosen": -9.395597457885742, "logps/rejected": -16.60767936706543, "loss": 0.3087, "rewards/accuracies": 0.9375, "rewards/chosen": 0.25916188955307007, "rewards/margins": 2.148326873779297, "rewards/rejected": -1.889164924621582, "step": 271 }, { "epoch": 4.610169491525424, "grad_norm": 17.659884235704403, "learning_rate": 4.745036402880999e-07, "logits/chosen": 5.418169021606445, "logits/rejected": 6.012850761413574, "logps/chosen": -7.826754570007324, "logps/rejected": -18.563873291015625, "loss": 0.2977, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2566584050655365, "rewards/margins": 3.3900957107543945, "rewards/rejected": -3.133437395095825, "step": 272 }, { "epoch": 4.627118644067797, "grad_norm": 17.60711001593079, "learning_rate": 4.741772834350104e-07, "logits/chosen": 5.240487098693848, "logits/rejected": 5.013432025909424, "logps/chosen": -11.003212928771973, "logps/rejected": -17.381736755371094, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 0.22370412945747375, "rewards/margins": 1.2826619148254395, "rewards/rejected": -1.058957576751709, "step": 273 }, { "epoch": 4.6440677966101696, "grad_norm": 18.485652371137643, "learning_rate": 4.7384896483943726e-07, "logits/chosen": 6.851320266723633, "logits/rejected": 5.790674209594727, "logps/chosen": -6.928519248962402, "logps/rejected": -15.897483825683594, "loss": 0.3131, "rewards/accuracies": 1.0, "rewards/chosen": 0.2791842818260193, "rewards/margins": 1.9868839979171753, "rewards/rejected": -1.7076997756958008, "step": 274 }, { "epoch": 4.661016949152542, "grad_norm": 17.27421393446924, "learning_rate": 4.7351868737444825e-07, "logits/chosen": 9.60102367401123, "logits/rejected": 7.984978199005127, "logps/chosen": -6.305352210998535, "logps/rejected": -12.598247528076172, "loss": 0.3242, "rewards/accuracies": 1.0, "rewards/chosen": 0.3262409567832947, "rewards/margins": 1.0967921018600464, "rewards/rejected": -0.7705512046813965, "step": 275 }, { "epoch": 4.677966101694915, "grad_norm": 17.655139325548166, "learning_rate": 4.7318645393025305e-07, "logits/chosen": 6.915894031524658, "logits/rejected": 8.634527206420898, "logps/chosen": -9.319221496582031, "logps/rejected": -12.086976051330566, "loss": 0.3315, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3277999758720398, "rewards/margins": 1.5269964933395386, "rewards/rejected": -1.199196457862854, "step": 276 }, { "epoch": 4.694915254237288, "grad_norm": 17.011095996634904, "learning_rate": 4.7285226741417753e-07, "logits/chosen": 3.976977825164795, "logits/rejected": 7.2318501472473145, "logps/chosen": -10.538625717163086, "logps/rejected": -16.68062973022461, "loss": 0.2857, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3574908375740051, "rewards/margins": 2.6200029850006104, "rewards/rejected": -2.262512445449829, "step": 277 }, { "epoch": 4.711864406779661, "grad_norm": 17.65277324244951, "learning_rate": 4.7251613075063905e-07, "logits/chosen": 6.255118370056152, "logits/rejected": 6.385594844818115, "logps/chosen": -8.837267875671387, "logps/rejected": -17.89482307434082, "loss": 0.3087, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2775583565235138, "rewards/margins": 3.0456676483154297, "rewards/rejected": -2.7681093215942383, "step": 278 }, { "epoch": 4.728813559322034, "grad_norm": 16.6287870272674, "learning_rate": 4.721780468811201e-07, "logits/chosen": 4.9652299880981445, "logits/rejected": 8.796908378601074, "logps/chosen": -11.446728706359863, "logps/rejected": -15.115416526794434, "loss": 0.2724, "rewards/accuracies": 1.0, "rewards/chosen": 0.26096999645233154, "rewards/margins": 1.8385089635849, "rewards/rejected": -1.5775389671325684, "step": 279 }, { "epoch": 4.745762711864407, "grad_norm": 17.25865918677704, "learning_rate": 4.7183801876414286e-07, "logits/chosen": 5.870490074157715, "logits/rejected": 8.393874168395996, "logps/chosen": -8.048952102661133, "logps/rejected": -15.552209854125977, "loss": 0.2926, "rewards/accuracies": 1.0, "rewards/chosen": 0.1697208136320114, "rewards/margins": 2.0907211303710938, "rewards/rejected": -1.921000361442566, "step": 280 }, { "epoch": 4.762711864406779, "grad_norm": 17.566147545689287, "learning_rate": 4.7149604937524356e-07, "logits/chosen": 3.231328248977661, "logits/rejected": 3.152560234069824, "logps/chosen": -13.5642728805542, "logps/rejected": -19.40642547607422, "loss": 0.313, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5033832788467407, "rewards/margins": 1.4650967121124268, "rewards/rejected": -0.961713433265686, "step": 281 }, { "epoch": 4.779661016949152, "grad_norm": 15.6998720784932, "learning_rate": 4.7115214170694616e-07, "logits/chosen": 5.659430503845215, "logits/rejected": 9.334473609924316, "logps/chosen": -9.407824516296387, "logps/rejected": -18.11457061767578, "loss": 0.258, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1942892074584961, "rewards/margins": 2.8978137969970703, "rewards/rejected": -2.7035248279571533, "step": 282 }, { "epoch": 4.796610169491525, "grad_norm": 17.884964474492214, "learning_rate": 4.70806298768736e-07, "logits/chosen": 4.105790138244629, "logits/rejected": 6.558419227600098, "logps/chosen": -10.256916999816895, "logps/rejected": -11.880553245544434, "loss": 0.3091, "rewards/accuracies": 1.0, "rewards/chosen": 0.23252363502979279, "rewards/margins": 2.0612881183624268, "rewards/rejected": -1.8287646770477295, "step": 283 }, { "epoch": 4.813559322033898, "grad_norm": 19.503195523702683, "learning_rate": 4.70458523587034e-07, "logits/chosen": 5.803598403930664, "logits/rejected": 6.407049655914307, "logps/chosen": -9.721452713012695, "logps/rejected": -20.844100952148438, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 0.22893205285072327, "rewards/margins": 2.159566879272461, "rewards/rejected": -1.93063485622406, "step": 284 }, { "epoch": 4.830508474576272, "grad_norm": 17.660468010554457, "learning_rate": 4.701088192051695e-07, "logits/chosen": 4.224084377288818, "logits/rejected": 4.460755348205566, "logps/chosen": -11.501953125, "logps/rejected": -19.310644149780273, "loss": 0.3149, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5221762657165527, "rewards/margins": 2.612990379333496, "rewards/rejected": -2.0908143520355225, "step": 285 }, { "epoch": 4.847457627118644, "grad_norm": 15.44242173696732, "learning_rate": 4.697571886833543e-07, "logits/chosen": 5.924899101257324, "logits/rejected": 6.679788589477539, "logps/chosen": -7.886451721191406, "logps/rejected": -15.066838264465332, "loss": 0.2688, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3297101855278015, "rewards/margins": 2.0337629318237305, "rewards/rejected": -1.7040526866912842, "step": 286 }, { "epoch": 4.864406779661017, "grad_norm": 17.217120104408682, "learning_rate": 4.6940363509865553e-07, "logits/chosen": 5.9503583908081055, "logits/rejected": 6.5264105796813965, "logps/chosen": -10.39914608001709, "logps/rejected": -15.116087913513184, "loss": 0.2827, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23787519335746765, "rewards/margins": 1.552558422088623, "rewards/rejected": -1.314683198928833, "step": 287 }, { "epoch": 4.88135593220339, "grad_norm": 15.856929183783881, "learning_rate": 4.6904816154496854e-07, "logits/chosen": 6.741863250732422, "logits/rejected": 9.094884872436523, "logps/chosen": -13.021467208862305, "logps/rejected": -19.538970947265625, "loss": 0.2779, "rewards/accuracies": 0.9375, "rewards/chosen": -0.047575704753398895, "rewards/margins": 2.4877426624298096, "rewards/rejected": -2.53531813621521, "step": 288 }, { "epoch": 4.898305084745763, "grad_norm": 17.193550836836813, "learning_rate": 4.6869077113299025e-07, "logits/chosen": 5.560794830322266, "logits/rejected": 4.232423305511475, "logps/chosen": -9.655923843383789, "logps/rejected": -19.288549423217773, "loss": 0.318, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09008821845054626, "rewards/margins": 2.3839855194091797, "rewards/rejected": -2.2938973903656006, "step": 289 }, { "epoch": 4.915254237288136, "grad_norm": 17.956803106276503, "learning_rate": 4.6833146699019177e-07, "logits/chosen": 2.667480945587158, "logits/rejected": 3.094351053237915, "logps/chosen": -7.915768146514893, "logps/rejected": -12.158473014831543, "loss": 0.3469, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3719676733016968, "rewards/margins": 1.006954550743103, "rewards/rejected": -0.6349868774414062, "step": 290 }, { "epoch": 4.932203389830509, "grad_norm": 17.633436631978487, "learning_rate": 4.6797025226079074e-07, "logits/chosen": 7.160995960235596, "logits/rejected": 7.154771327972412, "logps/chosen": -8.890524864196777, "logps/rejected": -14.562738418579102, "loss": 0.2858, "rewards/accuracies": 1.0, "rewards/chosen": 0.040711283683776855, "rewards/margins": 1.5542259216308594, "rewards/rejected": -1.5135146379470825, "step": 291 }, { "epoch": 4.9491525423728815, "grad_norm": 18.029997556885586, "learning_rate": 4.676071301057243e-07, "logits/chosen": 6.709413528442383, "logits/rejected": 5.719005584716797, "logps/chosen": -10.442892074584961, "logps/rejected": -16.07100486755371, "loss": 0.3138, "rewards/accuracies": 1.0, "rewards/chosen": 0.20800533890724182, "rewards/margins": 1.062894582748413, "rewards/rejected": -0.8548891544342041, "step": 292 }, { "epoch": 4.966101694915254, "grad_norm": 18.15563498375766, "learning_rate": 4.67242103702621e-07, "logits/chosen": 4.255777359008789, "logits/rejected": 4.3217949867248535, "logps/chosen": -10.030130386352539, "logps/rejected": -16.675334930419922, "loss": 0.3034, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2056305855512619, "rewards/margins": 1.8735519647598267, "rewards/rejected": -1.6679213047027588, "step": 293 }, { "epoch": 4.983050847457627, "grad_norm": 16.468882573077856, "learning_rate": 4.668751762457733e-07, "logits/chosen": 6.377828598022461, "logits/rejected": 6.773748397827148, "logps/chosen": -8.659324645996094, "logps/rejected": -16.170530319213867, "loss": 0.2782, "rewards/accuracies": 1.0, "rewards/chosen": 0.22924266755580902, "rewards/margins": 1.764962911605835, "rewards/rejected": -1.5357201099395752, "step": 294 }, { "epoch": 5.0, "grad_norm": 15.799532890303503, "learning_rate": 4.6650635094610966e-07, "logits/chosen": 3.3482489585876465, "logits/rejected": 5.720559120178223, "logps/chosen": -9.48496150970459, "logps/rejected": -18.143577575683594, "loss": 0.2819, "rewards/accuracies": 1.0, "rewards/chosen": 0.14067193865776062, "rewards/margins": 2.612166166305542, "rewards/rejected": -2.471494197845459, "step": 295 }, { "epoch": 5.016949152542373, "grad_norm": 15.974359618646485, "learning_rate": 4.661356310311659e-07, "logits/chosen": 8.018171310424805, "logits/rejected": 8.172922134399414, "logps/chosen": -8.21717643737793, "logps/rejected": -18.745868682861328, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 0.1397463083267212, "rewards/margins": 2.444672107696533, "rewards/rejected": -2.3049259185791016, "step": 296 }, { "epoch": 5.033898305084746, "grad_norm": 15.886523739916017, "learning_rate": 4.657630197450576e-07, "logits/chosen": 2.65042781829834, "logits/rejected": 5.1384806632995605, "logps/chosen": -13.963510513305664, "logps/rejected": -24.165008544921875, "loss": 0.247, "rewards/accuracies": 1.0, "rewards/chosen": 0.4832766354084015, "rewards/margins": 3.386579990386963, "rewards/rejected": -2.9033029079437256, "step": 297 }, { "epoch": 5.0508474576271185, "grad_norm": 14.661014856292772, "learning_rate": 4.653885203484515e-07, "logits/chosen": 2.559359073638916, "logits/rejected": 4.847626209259033, "logps/chosen": -7.78449821472168, "logps/rejected": -13.427138328552246, "loss": 0.2826, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22836291790008545, "rewards/margins": 1.7525341510772705, "rewards/rejected": -1.524170994758606, "step": 298 }, { "epoch": 5.067796610169491, "grad_norm": 13.461143030238789, "learning_rate": 4.6501213611853673e-07, "logits/chosen": 5.067084789276123, "logits/rejected": 2.6621310710906982, "logps/chosen": -8.320889472961426, "logps/rejected": -19.413127899169922, "loss": 0.2194, "rewards/accuracies": 1.0, "rewards/chosen": 0.301086962223053, "rewards/margins": 2.9841580390930176, "rewards/rejected": -2.6830711364746094, "step": 299 }, { "epoch": 5.084745762711864, "grad_norm": 16.070827271212256, "learning_rate": 4.6463387034899643e-07, "logits/chosen": 6.388011932373047, "logits/rejected": 8.232110977172852, "logps/chosen": -9.567124366760254, "logps/rejected": -16.695533752441406, "loss": 0.271, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23213094472885132, "rewards/margins": 2.3436355590820312, "rewards/rejected": -2.1115047931671143, "step": 300 }, { "epoch": 5.101694915254237, "grad_norm": 14.41016072543074, "learning_rate": 4.642537263499788e-07, "logits/chosen": 3.605557441711426, "logits/rejected": 2.710143566131592, "logps/chosen": -6.5597028732299805, "logps/rejected": -16.52556037902832, "loss": 0.2586, "rewards/accuracies": 1.0, "rewards/chosen": 0.4225674867630005, "rewards/margins": 2.3175463676452637, "rewards/rejected": -1.8949788808822632, "step": 301 }, { "epoch": 5.11864406779661, "grad_norm": 13.805577818353347, "learning_rate": 4.6387170744806813e-07, "logits/chosen": 0.8485604524612427, "logits/rejected": 1.88863205909729, "logps/chosen": -12.944626808166504, "logps/rejected": -24.060951232910156, "loss": 0.2307, "rewards/accuracies": 1.0, "rewards/chosen": 0.1453239619731903, "rewards/margins": 3.271425247192383, "rewards/rejected": -3.12610125541687, "step": 302 }, { "epoch": 5.135593220338983, "grad_norm": 14.226631709836239, "learning_rate": 4.634878169862557e-07, "logits/chosen": 5.334882736206055, "logits/rejected": 4.9238667488098145, "logps/chosen": -9.677927017211914, "logps/rejected": -18.596200942993164, "loss": 0.2558, "rewards/accuracies": 0.875, "rewards/chosen": 0.2794516682624817, "rewards/margins": 1.9560503959655762, "rewards/rejected": -1.6765985488891602, "step": 303 }, { "epoch": 5.1525423728813555, "grad_norm": 14.025700719644135, "learning_rate": 4.6310205832391065e-07, "logits/chosen": 6.083189964294434, "logits/rejected": 10.50492000579834, "logps/chosen": -8.656221389770508, "logps/rejected": -19.979228973388672, "loss": 0.225, "rewards/accuracies": 1.0, "rewards/chosen": 0.12934735417366028, "rewards/margins": 3.684925079345703, "rewards/rejected": -3.5555777549743652, "step": 304 }, { "epoch": 5.169491525423728, "grad_norm": 15.45904891320478, "learning_rate": 4.6271443483675027e-07, "logits/chosen": 6.795848369598389, "logits/rejected": 7.770328521728516, "logps/chosen": -8.92910385131836, "logps/rejected": -12.093315124511719, "loss": 0.2799, "rewards/accuracies": 1.0, "rewards/chosen": 0.4935965836048126, "rewards/margins": 1.7713218927383423, "rewards/rejected": -1.2777252197265625, "step": 305 }, { "epoch": 5.186440677966102, "grad_norm": 24.29778055150147, "learning_rate": 4.6232494991681087e-07, "logits/chosen": 3.3796167373657227, "logits/rejected": 4.94102144241333, "logps/chosen": -10.549798011779785, "logps/rejected": -21.496875762939453, "loss": 0.2313, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19755910336971283, "rewards/margins": 3.451889753341675, "rewards/rejected": -3.254331111907959, "step": 306 }, { "epoch": 5.203389830508475, "grad_norm": 15.025755227790897, "learning_rate": 4.6193360697241766e-07, "logits/chosen": 6.718583583831787, "logits/rejected": 4.653804302215576, "logps/chosen": -10.026581764221191, "logps/rejected": -22.026464462280273, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 0.1300102323293686, "rewards/margins": 2.7540884017944336, "rewards/rejected": -2.624077796936035, "step": 307 }, { "epoch": 5.220338983050848, "grad_norm": 15.0870418032954, "learning_rate": 4.615404094281554e-07, "logits/chosen": 2.41433048248291, "logits/rejected": 3.862748146057129, "logps/chosen": -9.14163589477539, "logps/rejected": -14.814375877380371, "loss": 0.2391, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35920146107673645, "rewards/margins": 1.8867554664611816, "rewards/rejected": -1.527553915977478, "step": 308 }, { "epoch": 5.237288135593221, "grad_norm": 14.76721553235894, "learning_rate": 4.611453607248381e-07, "logits/chosen": 0.7226177453994751, "logits/rejected": 3.7342004776000977, "logps/chosen": -10.703145027160645, "logps/rejected": -15.224998474121094, "loss": 0.2534, "rewards/accuracies": 1.0, "rewards/chosen": 0.6355695724487305, "rewards/margins": 2.3711819648742676, "rewards/rejected": -1.7356122732162476, "step": 309 }, { "epoch": 5.254237288135593, "grad_norm": 15.471489712702958, "learning_rate": 4.607484643194788e-07, "logits/chosen": 4.764775276184082, "logits/rejected": 6.933917999267578, "logps/chosen": -9.363961219787598, "logps/rejected": -14.59107780456543, "loss": 0.2601, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3514489531517029, "rewards/margins": 2.547487735748291, "rewards/rejected": -2.1960389614105225, "step": 310 }, { "epoch": 5.271186440677966, "grad_norm": 13.781971702591731, "learning_rate": 4.6034972368525957e-07, "logits/chosen": 1.9174288511276245, "logits/rejected": 2.8629069328308105, "logps/chosen": -7.767963409423828, "logps/rejected": -16.579254150390625, "loss": 0.2545, "rewards/accuracies": 1.0, "rewards/chosen": 0.08915120363235474, "rewards/margins": 2.5531845092773438, "rewards/rejected": -2.4640331268310547, "step": 311 }, { "epoch": 5.288135593220339, "grad_norm": 14.885786269143404, "learning_rate": 4.599491423115014e-07, "logits/chosen": 5.801702499389648, "logits/rejected": 5.572926998138428, "logps/chosen": -9.809563636779785, "logps/rejected": -15.179691314697266, "loss": 0.2611, "rewards/accuracies": 1.0, "rewards/chosen": 0.34644848108291626, "rewards/margins": 1.769092082977295, "rewards/rejected": -1.4226434230804443, "step": 312 }, { "epoch": 5.305084745762712, "grad_norm": 13.557354407320414, "learning_rate": 4.595467237036329e-07, "logits/chosen": 4.226743221282959, "logits/rejected": 5.284225940704346, "logps/chosen": -8.43330192565918, "logps/rejected": -13.308187484741211, "loss": 0.2422, "rewards/accuracies": 1.0, "rewards/chosen": 0.41138046979904175, "rewards/margins": 1.7838656902313232, "rewards/rejected": -1.3724852800369263, "step": 313 }, { "epoch": 5.322033898305085, "grad_norm": 15.314012448739035, "learning_rate": 4.591424713831602e-07, "logits/chosen": 3.3238039016723633, "logits/rejected": 5.186252593994141, "logps/chosen": -8.77778434753418, "logps/rejected": -20.426774978637695, "loss": 0.2583, "rewards/accuracies": 1.0, "rewards/chosen": 0.3558368980884552, "rewards/margins": 2.579214572906494, "rewards/rejected": -2.2233777046203613, "step": 314 }, { "epoch": 5.338983050847458, "grad_norm": 13.567292315558147, "learning_rate": 4.587363888876361e-07, "logits/chosen": 3.5054311752319336, "logits/rejected": 3.967586040496826, "logps/chosen": -7.4029154777526855, "logps/rejected": -15.959949493408203, "loss": 0.2411, "rewards/accuracies": 1.0, "rewards/chosen": -0.016373835504055023, "rewards/margins": 2.316739559173584, "rewards/rejected": -2.333113431930542, "step": 315 }, { "epoch": 5.3559322033898304, "grad_norm": 16.02951616634702, "learning_rate": 4.583284797706287e-07, "logits/chosen": 1.9606868028640747, "logits/rejected": 1.1389808654785156, "logps/chosen": -5.59510612487793, "logps/rejected": -10.972127914428711, "loss": 0.2638, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3377338647842407, "rewards/margins": 1.3146023750305176, "rewards/rejected": -0.9768685102462769, "step": 316 }, { "epoch": 5.372881355932203, "grad_norm": 14.199893451013141, "learning_rate": 4.5791874760169093e-07, "logits/chosen": 3.0882699489593506, "logits/rejected": 3.207479476928711, "logps/chosen": -8.481260299682617, "logps/rejected": -12.788800239562988, "loss": 0.2415, "rewards/accuracies": 0.875, "rewards/chosen": 0.3231436312198639, "rewards/margins": 1.855533242225647, "rewards/rejected": -1.532389760017395, "step": 317 }, { "epoch": 5.389830508474576, "grad_norm": 14.275237175931048, "learning_rate": 4.575071959663288e-07, "logits/chosen": 6.5589470863342285, "logits/rejected": 6.860996246337891, "logps/chosen": -10.277453422546387, "logps/rejected": -21.133487701416016, "loss": 0.2277, "rewards/accuracies": 0.875, "rewards/chosen": 0.1361951231956482, "rewards/margins": 2.5179457664489746, "rewards/rejected": -2.3817505836486816, "step": 318 }, { "epoch": 5.406779661016949, "grad_norm": 14.61554056890025, "learning_rate": 4.570938284659702e-07, "logits/chosen": 3.4965403079986572, "logits/rejected": 4.776882648468018, "logps/chosen": -8.628873825073242, "logps/rejected": -14.101314544677734, "loss": 0.2375, "rewards/accuracies": 0.9375, "rewards/chosen": 0.28320109844207764, "rewards/margins": 1.9493257999420166, "rewards/rejected": -1.6661248207092285, "step": 319 }, { "epoch": 5.423728813559322, "grad_norm": 14.959871093742624, "learning_rate": 4.566786487179334e-07, "logits/chosen": 3.0084660053253174, "logits/rejected": 6.481236457824707, "logps/chosen": -10.179582595825195, "logps/rejected": -14.713674545288086, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 0.5509328842163086, "rewards/margins": 2.3514480590820312, "rewards/rejected": -1.8005151748657227, "step": 320 }, { "epoch": 5.440677966101695, "grad_norm": 14.548805507323163, "learning_rate": 4.5626166035539535e-07, "logits/chosen": 5.655772686004639, "logits/rejected": 8.056764602661133, "logps/chosen": -10.326370239257812, "logps/rejected": -17.405672073364258, "loss": 0.2405, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29461419582366943, "rewards/margins": 2.596235513687134, "rewards/rejected": -2.301621437072754, "step": 321 }, { "epoch": 5.4576271186440675, "grad_norm": 13.453869815890464, "learning_rate": 4.5584286702736007e-07, "logits/chosen": 2.1154050827026367, "logits/rejected": 2.4267899990081787, "logps/chosen": -8.321646690368652, "logps/rejected": -14.760269165039062, "loss": 0.2184, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35621577501296997, "rewards/margins": 1.7278010845184326, "rewards/rejected": -1.3715853691101074, "step": 322 }, { "epoch": 5.47457627118644, "grad_norm": 15.50160681365945, "learning_rate": 4.5542227239862654e-07, "logits/chosen": 3.743596076965332, "logits/rejected": 5.651052951812744, "logps/chosen": -8.57131290435791, "logps/rejected": -18.80699348449707, "loss": 0.2644, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2737744152545929, "rewards/margins": 2.8698837757110596, "rewards/rejected": -2.596109390258789, "step": 323 }, { "epoch": 5.491525423728813, "grad_norm": 13.868377244000722, "learning_rate": 4.5499988014975635e-07, "logits/chosen": 3.769761085510254, "logits/rejected": 4.874698162078857, "logps/chosen": -11.69039535522461, "logps/rejected": -17.498027801513672, "loss": 0.2212, "rewards/accuracies": 1.0, "rewards/chosen": 0.5320883989334106, "rewards/margins": 2.393782377243042, "rewards/rejected": -1.8616942167282104, "step": 324 }, { "epoch": 5.508474576271187, "grad_norm": 14.970272832391561, "learning_rate": 4.545756939770422e-07, "logits/chosen": 6.82029390335083, "logits/rejected": 7.361518383026123, "logps/chosen": -7.692866325378418, "logps/rejected": -20.093639373779297, "loss": 0.2336, "rewards/accuracies": 0.875, "rewards/chosen": 0.10636795312166214, "rewards/margins": 3.6429443359375, "rewards/rejected": -3.536576509475708, "step": 325 }, { "epoch": 5.52542372881356, "grad_norm": 14.52118449485477, "learning_rate": 4.54149717592475e-07, "logits/chosen": 7.36668062210083, "logits/rejected": 6.56944465637207, "logps/chosen": -10.787076950073242, "logps/rejected": -15.239648818969727, "loss": 0.2543, "rewards/accuracies": 1.0, "rewards/chosen": 0.1467917561531067, "rewards/margins": 1.7701894044876099, "rewards/rejected": -1.6233974695205688, "step": 326 }, { "epoch": 5.5423728813559325, "grad_norm": 14.294998066529104, "learning_rate": 4.537219547237114e-07, "logits/chosen": 8.102130889892578, "logits/rejected": 4.980132579803467, "logps/chosen": -7.948538303375244, "logps/rejected": -23.7041015625, "loss": 0.2387, "rewards/accuracies": 1.0, "rewards/chosen": 0.26779377460479736, "rewards/margins": 3.2698512077331543, "rewards/rejected": -3.0020575523376465, "step": 327 }, { "epoch": 5.559322033898305, "grad_norm": 13.319270840448068, "learning_rate": 4.5329240911404167e-07, "logits/chosen": 3.3187613487243652, "logits/rejected": 3.88653564453125, "logps/chosen": -5.821469783782959, "logps/rejected": -11.25735092163086, "loss": 0.2142, "rewards/accuracies": 0.9375, "rewards/chosen": 0.34002581238746643, "rewards/margins": 2.070734977722168, "rewards/rejected": -1.7307093143463135, "step": 328 }, { "epoch": 5.576271186440678, "grad_norm": 15.137980503523977, "learning_rate": 4.528610845223562e-07, "logits/chosen": 4.278947353363037, "logits/rejected": 4.24318265914917, "logps/chosen": -10.437315940856934, "logps/rejected": -21.30280113220215, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": 0.2784523665904999, "rewards/margins": 3.11037540435791, "rewards/rejected": -2.831923246383667, "step": 329 }, { "epoch": 5.593220338983051, "grad_norm": 13.121474650704997, "learning_rate": 4.5242798472311306e-07, "logits/chosen": 2.994493007659912, "logits/rejected": 3.7606515884399414, "logps/chosen": -7.844595909118652, "logps/rejected": -12.961830139160156, "loss": 0.1932, "rewards/accuracies": 1.0, "rewards/chosen": 0.39790380001068115, "rewards/margins": 2.1576051712036133, "rewards/rejected": -1.7597013711929321, "step": 330 }, { "epoch": 5.610169491525424, "grad_norm": 13.576296069584211, "learning_rate": 4.519931135063051e-07, "logits/chosen": 3.9403624534606934, "logits/rejected": 5.117575168609619, "logps/chosen": -8.309492111206055, "logps/rejected": -19.150638580322266, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 0.2991102933883667, "rewards/margins": 3.163229465484619, "rewards/rejected": -2.864119291305542, "step": 331 }, { "epoch": 5.627118644067797, "grad_norm": 12.920872703304005, "learning_rate": 4.515564746774265e-07, "logits/chosen": 1.2953753471374512, "logits/rejected": 3.210388422012329, "logps/chosen": -8.38802719116211, "logps/rejected": -16.91384506225586, "loss": 0.2175, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08069738000631332, "rewards/margins": 2.5369935035705566, "rewards/rejected": -2.45629620552063, "step": 332 }, { "epoch": 5.6440677966101696, "grad_norm": 13.209851039880665, "learning_rate": 4.5111807205743945e-07, "logits/chosen": 1.0812115669250488, "logits/rejected": 1.451087236404419, "logps/chosen": -11.948184967041016, "logps/rejected": -27.34232521057129, "loss": 0.2169, "rewards/accuracies": 1.0, "rewards/chosen": 0.26642170548439026, "rewards/margins": 3.5682811737060547, "rewards/rejected": -3.3018593788146973, "step": 333 }, { "epoch": 5.661016949152542, "grad_norm": 12.057003448494973, "learning_rate": 4.5067790948274085e-07, "logits/chosen": 2.796851873397827, "logits/rejected": 4.601650714874268, "logps/chosen": -7.660577297210693, "logps/rejected": -13.53750228881836, "loss": 0.2083, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6349484920501709, "rewards/margins": 2.123163938522339, "rewards/rejected": -1.488215446472168, "step": 334 }, { "epoch": 5.677966101694915, "grad_norm": 14.844392112328164, "learning_rate": 4.5023599080512896e-07, "logits/chosen": 4.955301284790039, "logits/rejected": 6.5992326736450195, "logps/chosen": -11.873647689819336, "logps/rejected": -17.834644317626953, "loss": 0.227, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2461644411087036, "rewards/margins": 2.8159372806549072, "rewards/rejected": -2.569772958755493, "step": 335 }, { "epoch": 5.694915254237288, "grad_norm": 14.019832702174794, "learning_rate": 4.4979231989176905e-07, "logits/chosen": 3.787350654602051, "logits/rejected": 3.4726927280426025, "logps/chosen": -7.431969165802002, "logps/rejected": -14.662132263183594, "loss": 0.211, "rewards/accuracies": 0.9375, "rewards/chosen": 0.34543120861053467, "rewards/margins": 2.9022555351257324, "rewards/rejected": -2.5568246841430664, "step": 336 }, { "epoch": 5.711864406779661, "grad_norm": 14.060606447822217, "learning_rate": 4.493469006251601e-07, "logits/chosen": 7.4255757331848145, "logits/rejected": 5.965138912200928, "logps/chosen": -10.696208000183105, "logps/rejected": -22.840984344482422, "loss": 0.2214, "rewards/accuracies": 1.0, "rewards/chosen": 0.09379026293754578, "rewards/margins": 3.5261425971984863, "rewards/rejected": -3.432352066040039, "step": 337 }, { "epoch": 5.728813559322034, "grad_norm": 14.237009692386826, "learning_rate": 4.488997369031008e-07, "logits/chosen": 3.596536636352539, "logits/rejected": 3.5504150390625, "logps/chosen": -6.90710973739624, "logps/rejected": -13.495116233825684, "loss": 0.2298, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3260919749736786, "rewards/margins": 1.7033262252807617, "rewards/rejected": -1.3772342205047607, "step": 338 }, { "epoch": 5.745762711864407, "grad_norm": 14.67091033902795, "learning_rate": 4.4845083263865514e-07, "logits/chosen": 2.061657428741455, "logits/rejected": 3.5068767070770264, "logps/chosen": -10.289358139038086, "logps/rejected": -16.851903915405273, "loss": 0.2412, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16148579120635986, "rewards/margins": 2.721928358078003, "rewards/rejected": -2.5604424476623535, "step": 339 }, { "epoch": 5.762711864406779, "grad_norm": 13.586237133155059, "learning_rate": 4.4800019176011847e-07, "logits/chosen": 3.836963415145874, "logits/rejected": 1.2509939670562744, "logps/chosen": -8.10957145690918, "logps/rejected": -17.303131103515625, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 0.13127543032169342, "rewards/margins": 2.2816011905670166, "rewards/rejected": -2.1503257751464844, "step": 340 }, { "epoch": 5.779661016949152, "grad_norm": 12.360126542845666, "learning_rate": 4.4754781821098286e-07, "logits/chosen": 1.5891176462173462, "logits/rejected": 3.962115526199341, "logps/chosen": -11.134986877441406, "logps/rejected": -16.536781311035156, "loss": 0.2008, "rewards/accuracies": 0.875, "rewards/chosen": 0.3413625955581665, "rewards/margins": 2.944985866546631, "rewards/rejected": -2.603623151779175, "step": 341 }, { "epoch": 5.796610169491525, "grad_norm": 14.480265988536571, "learning_rate": 4.470937159499028e-07, "logits/chosen": 2.7056524753570557, "logits/rejected": 1.9229453802108765, "logps/chosen": -7.356058597564697, "logps/rejected": -13.569494247436523, "loss": 0.2406, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2115810215473175, "rewards/margins": 2.17071270942688, "rewards/rejected": -1.9591315984725952, "step": 342 }, { "epoch": 5.813559322033898, "grad_norm": 14.17521457305599, "learning_rate": 4.4663788895066065e-07, "logits/chosen": 0.2597987651824951, "logits/rejected": 1.4270880222320557, "logps/chosen": -10.596329689025879, "logps/rejected": -15.119043350219727, "loss": 0.2373, "rewards/accuracies": 1.0, "rewards/chosen": 0.22062750160694122, "rewards/margins": 1.7454050779342651, "rewards/rejected": -1.5247777700424194, "step": 343 }, { "epoch": 5.830508474576272, "grad_norm": 13.160996137481407, "learning_rate": 4.4618034120213135e-07, "logits/chosen": 6.123937129974365, "logits/rejected": 5.778842449188232, "logps/chosen": -10.282336235046387, "logps/rejected": -23.72997283935547, "loss": 0.2025, "rewards/accuracies": 0.9375, "rewards/chosen": 0.15460780262947083, "rewards/margins": 3.5308094024658203, "rewards/rejected": -3.376201629638672, "step": 344 }, { "epoch": 5.847457627118644, "grad_norm": 12.86869971192147, "learning_rate": 4.4572107670824806e-07, "logits/chosen": 2.6840929985046387, "logits/rejected": 3.7933554649353027, "logps/chosen": -8.170498847961426, "logps/rejected": -17.079763412475586, "loss": 0.2101, "rewards/accuracies": 1.0, "rewards/chosen": 0.3931899964809418, "rewards/margins": 2.5557689666748047, "rewards/rejected": -2.162578582763672, "step": 345 }, { "epoch": 5.864406779661017, "grad_norm": 13.691838438775193, "learning_rate": 4.45260099487967e-07, "logits/chosen": -0.25332632660865784, "logits/rejected": 3.3269095420837402, "logps/chosen": -13.765853881835938, "logps/rejected": -15.467521667480469, "loss": 0.2287, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21706271171569824, "rewards/margins": 2.0856997966766357, "rewards/rejected": -1.8686370849609375, "step": 346 }, { "epoch": 5.88135593220339, "grad_norm": 14.00814231188169, "learning_rate": 4.4479741357523204e-07, "logits/chosen": 4.334939002990723, "logits/rejected": 5.97941780090332, "logps/chosen": -12.119684219360352, "logps/rejected": -21.12580680847168, "loss": 0.2097, "rewards/accuracies": 1.0, "rewards/chosen": 0.2201918661594391, "rewards/margins": 3.210035800933838, "rewards/rejected": -2.9898438453674316, "step": 347 }, { "epoch": 5.898305084745763, "grad_norm": 14.913136382206837, "learning_rate": 4.4433302301893983e-07, "logits/chosen": 4.66344690322876, "logits/rejected": 2.972669839859009, "logps/chosen": -6.611598491668701, "logps/rejected": -15.623875617980957, "loss": 0.2632, "rewards/accuracies": 0.9375, "rewards/chosen": 0.37935081124305725, "rewards/margins": 1.7020004987716675, "rewards/rejected": -1.3226497173309326, "step": 348 }, { "epoch": 5.915254237288136, "grad_norm": 16.427089053073924, "learning_rate": 4.438669318829037e-07, "logits/chosen": 0.8019882440567017, "logits/rejected": 2.8923540115356445, "logps/chosen": -9.76960563659668, "logps/rejected": -16.897960662841797, "loss": 0.2374, "rewards/accuracies": 0.9375, "rewards/chosen": 0.39484477043151855, "rewards/margins": 1.9051614999771118, "rewards/rejected": -1.5103168487548828, "step": 349 }, { "epoch": 5.932203389830509, "grad_norm": 12.808532494066004, "learning_rate": 4.433991442458188e-07, "logits/chosen": 0.4681244492530823, "logits/rejected": 5.965130805969238, "logps/chosen": -12.381902694702148, "logps/rejected": -12.533705711364746, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 0.378155380487442, "rewards/margins": 1.931134581565857, "rewards/rejected": -1.5529789924621582, "step": 350 }, { "epoch": 5.9491525423728815, "grad_norm": 12.913971857694005, "learning_rate": 4.4292966420122613e-07, "logits/chosen": 6.719675540924072, "logits/rejected": 6.983699798583984, "logps/chosen": -9.271611213684082, "logps/rejected": -18.097660064697266, "loss": 0.1996, "rewards/accuracies": 1.0, "rewards/chosen": 0.14226177334785461, "rewards/margins": 2.8130500316619873, "rewards/rejected": -2.670788288116455, "step": 351 }, { "epoch": 5.966101694915254, "grad_norm": 14.302051781978857, "learning_rate": 4.4245849585747655e-07, "logits/chosen": 2.9296581745147705, "logits/rejected": 1.7646398544311523, "logps/chosen": -7.856363296508789, "logps/rejected": -15.518735885620117, "loss": 0.2496, "rewards/accuracies": 0.875, "rewards/chosen": 0.3406248092651367, "rewards/margins": 2.079533815383911, "rewards/rejected": -1.7389090061187744, "step": 352 }, { "epoch": 5.983050847457627, "grad_norm": 14.425920748137944, "learning_rate": 4.41985643337695e-07, "logits/chosen": 5.654947280883789, "logits/rejected": 6.085771560668945, "logps/chosen": -10.340957641601562, "logps/rejected": -20.093236923217773, "loss": 0.2211, "rewards/accuracies": 1.0, "rewards/chosen": 0.28536200523376465, "rewards/margins": 2.75244140625, "rewards/rejected": -2.4670796394348145, "step": 353 }, { "epoch": 6.0, "grad_norm": 14.592286734921624, "learning_rate": 4.415111107797445e-07, "logits/chosen": 3.670210599899292, "logits/rejected": 3.209463119506836, "logps/chosen": -5.669580459594727, "logps/rejected": -11.425719261169434, "loss": 0.2338, "rewards/accuracies": 0.875, "rewards/chosen": 0.14339257776737213, "rewards/margins": 1.7983019351959229, "rewards/rejected": -1.6549094915390015, "step": 354 }, { "epoch": 6.016949152542373, "grad_norm": 12.720475020903885, "learning_rate": 4.410349023361897e-07, "logits/chosen": 2.473912239074707, "logits/rejected": 5.576210975646973, "logps/chosen": -13.222108840942383, "logps/rejected": -21.111351013183594, "loss": 0.204, "rewards/accuracies": 1.0, "rewards/chosen": 0.32114630937576294, "rewards/margins": 3.3848023414611816, "rewards/rejected": -3.0636560916900635, "step": 355 }, { "epoch": 6.033898305084746, "grad_norm": 11.400040105830213, "learning_rate": 4.4055702217426085e-07, "logits/chosen": 0.8407840132713318, "logits/rejected": 1.0289647579193115, "logps/chosen": -6.723813056945801, "logps/rejected": -15.182109832763672, "loss": 0.1874, "rewards/accuracies": 1.0, "rewards/chosen": 0.3703401982784271, "rewards/margins": 3.1831440925598145, "rewards/rejected": -2.8128037452697754, "step": 356 }, { "epoch": 6.0508474576271185, "grad_norm": 12.244242829788378, "learning_rate": 4.40077474475817e-07, "logits/chosen": 1.6306920051574707, "logits/rejected": 0.06805920600891113, "logps/chosen": -10.396472930908203, "logps/rejected": -20.96820068359375, "loss": 0.2088, "rewards/accuracies": 0.875, "rewards/chosen": 0.41604703664779663, "rewards/margins": 2.542591094970703, "rewards/rejected": -2.126544237136841, "step": 357 }, { "epoch": 6.067796610169491, "grad_norm": 12.052664494870422, "learning_rate": 4.395962634373096e-07, "logits/chosen": 3.180180549621582, "logits/rejected": 4.185020923614502, "logps/chosen": -7.90302848815918, "logps/rejected": -16.847829818725586, "loss": 0.1947, "rewards/accuracies": 1.0, "rewards/chosen": 0.3513253927230835, "rewards/margins": 2.89188551902771, "rewards/rejected": -2.540560245513916, "step": 358 }, { "epoch": 6.084745762711864, "grad_norm": 11.56580544056517, "learning_rate": 4.3911339326974584e-07, "logits/chosen": 5.388302803039551, "logits/rejected": 4.333691596984863, "logps/chosen": -8.595808982849121, "logps/rejected": -21.796253204345703, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": 0.07518626749515533, "rewards/margins": 4.378305435180664, "rewards/rejected": -4.303119659423828, "step": 359 }, { "epoch": 6.101694915254237, "grad_norm": 12.493899194292592, "learning_rate": 4.386288681986516e-07, "logits/chosen": 5.380532264709473, "logits/rejected": 6.32417631149292, "logps/chosen": -9.998344421386719, "logps/rejected": -18.48993682861328, "loss": 0.1899, "rewards/accuracies": 1.0, "rewards/chosen": 0.20978182554244995, "rewards/margins": 3.287187337875366, "rewards/rejected": -3.0774056911468506, "step": 360 }, { "epoch": 6.11864406779661, "grad_norm": 12.08153937994342, "learning_rate": 4.3814269246403456e-07, "logits/chosen": 2.539783239364624, "logits/rejected": 5.148946762084961, "logps/chosen": -8.870214462280273, "logps/rejected": -16.426620483398438, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": 0.35913634300231934, "rewards/margins": 2.705012559890747, "rewards/rejected": -2.3458759784698486, "step": 361 }, { "epoch": 6.135593220338983, "grad_norm": 11.40653610156602, "learning_rate": 4.3765487032034737e-07, "logits/chosen": -2.6000680923461914, "logits/rejected": -1.892746925354004, "logps/chosen": -9.39770221710205, "logps/rejected": -16.299577713012695, "loss": 0.1838, "rewards/accuracies": 1.0, "rewards/chosen": 0.5937013626098633, "rewards/margins": 2.4688711166381836, "rewards/rejected": -1.8751695156097412, "step": 362 }, { "epoch": 6.1525423728813555, "grad_norm": 12.510337865575506, "learning_rate": 4.371654060364498e-07, "logits/chosen": -1.1375783681869507, "logits/rejected": 0.94313645362854, "logps/chosen": -7.737663269042969, "logps/rejected": -11.724308967590332, "loss": 0.2046, "rewards/accuracies": 1.0, "rewards/chosen": 0.366363525390625, "rewards/margins": 1.983730673789978, "rewards/rejected": -1.6173672676086426, "step": 363 }, { "epoch": 6.169491525423728, "grad_norm": 11.55035638277415, "learning_rate": 4.366743038955719e-07, "logits/chosen": 2.8494374752044678, "logits/rejected": 4.877542495727539, "logps/chosen": -11.219352722167969, "logps/rejected": -17.120630264282227, "loss": 0.1796, "rewards/accuracies": 1.0, "rewards/chosen": 0.37523311376571655, "rewards/margins": 2.4836084842681885, "rewards/rejected": -2.1083755493164062, "step": 364 }, { "epoch": 6.186440677966102, "grad_norm": 12.048369200248766, "learning_rate": 4.361815681952765e-07, "logits/chosen": -1.1336802244186401, "logits/rejected": 2.4179670810699463, "logps/chosen": -9.88912582397461, "logps/rejected": -12.742366790771484, "loss": 0.2062, "rewards/accuracies": 1.0, "rewards/chosen": 0.42945533990859985, "rewards/margins": 2.399317502975464, "rewards/rejected": -1.9698621034622192, "step": 365 }, { "epoch": 6.203389830508475, "grad_norm": 11.244435376738915, "learning_rate": 4.3568720324742126e-07, "logits/chosen": 5.716616153717041, "logits/rejected": 6.348829746246338, "logps/chosen": -8.830109596252441, "logps/rejected": -20.635944366455078, "loss": 0.1765, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2376314103603363, "rewards/margins": 2.6832034587860107, "rewards/rejected": -2.4455718994140625, "step": 366 }, { "epoch": 6.220338983050848, "grad_norm": 11.895191500315683, "learning_rate": 4.351912133781212e-07, "logits/chosen": 2.2640492916107178, "logits/rejected": 3.6460046768188477, "logps/chosen": -7.952423095703125, "logps/rejected": -10.814785957336426, "loss": 0.2105, "rewards/accuracies": 1.0, "rewards/chosen": 0.5884957313537598, "rewards/margins": 1.6586506366729736, "rewards/rejected": -1.0701547861099243, "step": 367 }, { "epoch": 6.237288135593221, "grad_norm": 11.319942761188097, "learning_rate": 4.3469360292771096e-07, "logits/chosen": -0.033799976110458374, "logits/rejected": 1.3880383968353271, "logps/chosen": -8.749774932861328, "logps/rejected": -15.943452835083008, "loss": 0.1799, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45636826753616333, "rewards/margins": 2.9518792629241943, "rewards/rejected": -2.495511054992676, "step": 368 }, { "epoch": 6.254237288135593, "grad_norm": 11.763342241935318, "learning_rate": 4.3419437625070634e-07, "logits/chosen": 0.5066416263580322, "logits/rejected": 2.990574836730957, "logps/chosen": -8.957874298095703, "logps/rejected": -13.657116889953613, "loss": 0.2052, "rewards/accuracies": 1.0, "rewards/chosen": 0.3993526101112366, "rewards/margins": 2.0434536933898926, "rewards/rejected": -1.6441012620925903, "step": 369 }, { "epoch": 6.271186440677966, "grad_norm": 12.602104399660293, "learning_rate": 4.336935377157668e-07, "logits/chosen": -0.9369416832923889, "logits/rejected": 1.4667794704437256, "logps/chosen": -9.532140731811523, "logps/rejected": -20.544631958007812, "loss": 0.1962, "rewards/accuracies": 1.0, "rewards/chosen": 0.45484790205955505, "rewards/margins": 3.5946388244628906, "rewards/rejected": -3.1397910118103027, "step": 370 }, { "epoch": 6.288135593220339, "grad_norm": 12.229135597671917, "learning_rate": 4.3319109170565676e-07, "logits/chosen": 3.292764663696289, "logits/rejected": 2.433797597885132, "logps/chosen": -8.80831527709961, "logps/rejected": -18.14662742614746, "loss": 0.1968, "rewards/accuracies": 1.0, "rewards/chosen": 0.22665278613567352, "rewards/margins": 2.3303961753845215, "rewards/rejected": -2.103743076324463, "step": 371 }, { "epoch": 6.305084745762712, "grad_norm": 11.372981937167529, "learning_rate": 4.3268704261720745e-07, "logits/chosen": -0.022482722997665405, "logits/rejected": 1.80076265335083, "logps/chosen": -9.301206588745117, "logps/rejected": -17.570545196533203, "loss": 0.1803, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3969723582267761, "rewards/margins": 2.1740834712982178, "rewards/rejected": -1.7771109342575073, "step": 372 }, { "epoch": 6.322033898305085, "grad_norm": 12.165079363604997, "learning_rate": 4.321813948612785e-07, "logits/chosen": 4.012811183929443, "logits/rejected": 6.188882827758789, "logps/chosen": -8.608428955078125, "logps/rejected": -14.812787055969238, "loss": 0.1815, "rewards/accuracies": 1.0, "rewards/chosen": 0.282976359128952, "rewards/margins": 2.6606106758117676, "rewards/rejected": -2.3776345252990723, "step": 373 }, { "epoch": 6.338983050847458, "grad_norm": 11.471916125631306, "learning_rate": 4.31674152862719e-07, "logits/chosen": 4.980255126953125, "logits/rejected": 2.6463794708251953, "logps/chosen": -7.498459815979004, "logps/rejected": -16.970762252807617, "loss": 0.1787, "rewards/accuracies": 1.0, "rewards/chosen": 0.12228435277938843, "rewards/margins": 2.9980876445770264, "rewards/rejected": -2.875803232192993, "step": 374 }, { "epoch": 6.3559322033898304, "grad_norm": 11.343224798553283, "learning_rate": 4.311653210603293e-07, "logits/chosen": -2.399540901184082, "logits/rejected": 2.918029546737671, "logps/chosen": -13.647743225097656, "logps/rejected": -23.035350799560547, "loss": 0.1718, "rewards/accuracies": 1.0, "rewards/chosen": 0.2999897003173828, "rewards/margins": 3.9240126609802246, "rewards/rejected": -3.624022960662842, "step": 375 }, { "epoch": 6.372881355932203, "grad_norm": 12.578548456336826, "learning_rate": 4.306549039068218e-07, "logits/chosen": 0.9330506920814514, "logits/rejected": 1.1180548667907715, "logps/chosen": -8.449841499328613, "logps/rejected": -13.091574668884277, "loss": 0.1939, "rewards/accuracies": 1.0, "rewards/chosen": 0.5931252837181091, "rewards/margins": 2.422457218170166, "rewards/rejected": -1.8293319940567017, "step": 376 }, { "epoch": 6.389830508474576, "grad_norm": 11.960020308408605, "learning_rate": 4.301429058687819e-07, "logits/chosen": 3.4172210693359375, "logits/rejected": 0.8663151264190674, "logps/chosen": -7.801892280578613, "logps/rejected": -23.506187438964844, "loss": 0.1877, "rewards/accuracies": 1.0, "rewards/chosen": 0.642574667930603, "rewards/margins": 4.627707004547119, "rewards/rejected": -3.9851319789886475, "step": 377 }, { "epoch": 6.406779661016949, "grad_norm": 12.061426426986905, "learning_rate": 4.296293314266294e-07, "logits/chosen": 0.8170385360717773, "logits/rejected": 3.078815460205078, "logps/chosen": -9.448894500732422, "logps/rejected": -19.359838485717773, "loss": 0.1744, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19764302670955658, "rewards/margins": 3.298597574234009, "rewards/rejected": -3.100954532623291, "step": 378 }, { "epoch": 6.423728813559322, "grad_norm": 12.206478327976352, "learning_rate": 4.2911418507457876e-07, "logits/chosen": -2.6637320518493652, "logits/rejected": -1.9219914674758911, "logps/chosen": -10.741297721862793, "logps/rejected": -16.847227096557617, "loss": 0.1775, "rewards/accuracies": 1.0, "rewards/chosen": 0.36193591356277466, "rewards/margins": 2.7289984226226807, "rewards/rejected": -2.367062568664551, "step": 379 }, { "epoch": 6.440677966101695, "grad_norm": 11.10482434906408, "learning_rate": 4.285974713206e-07, "logits/chosen": 0.4309033751487732, "logits/rejected": 4.5554327964782715, "logps/chosen": -9.133827209472656, "logps/rejected": -21.082012176513672, "loss": 0.1891, "rewards/accuracies": 1.0, "rewards/chosen": 0.343951940536499, "rewards/margins": 4.336812973022461, "rewards/rejected": -3.992860794067383, "step": 380 }, { "epoch": 6.4576271186440675, "grad_norm": 12.183583706496009, "learning_rate": 4.280791946863794e-07, "logits/chosen": -2.6161293983459473, "logits/rejected": -2.8485751152038574, "logps/chosen": -9.101305961608887, "logps/rejected": -17.507917404174805, "loss": 0.1954, "rewards/accuracies": 0.875, "rewards/chosen": 0.23070837557315826, "rewards/margins": 2.2931227684020996, "rewards/rejected": -2.0624141693115234, "step": 381 }, { "epoch": 6.47457627118644, "grad_norm": 11.626006216103972, "learning_rate": 4.275593597072795e-07, "logits/chosen": 1.787920594215393, "logits/rejected": 4.995972633361816, "logps/chosen": -10.633196830749512, "logps/rejected": -16.80924415588379, "loss": 0.1567, "rewards/accuracies": 1.0, "rewards/chosen": 0.3805018961429596, "rewards/margins": 2.5669453144073486, "rewards/rejected": -2.186443567276001, "step": 382 }, { "epoch": 6.491525423728813, "grad_norm": 11.632616465398266, "learning_rate": 4.270379709323001e-07, "logits/chosen": 6.529628276824951, "logits/rejected": 4.649651050567627, "logps/chosen": -12.372982025146484, "logps/rejected": -26.309274673461914, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": -0.03969073295593262, "rewards/margins": 4.078218460083008, "rewards/rejected": -4.117908477783203, "step": 383 }, { "epoch": 6.508474576271187, "grad_norm": 39.632679942094846, "learning_rate": 4.265150329240376e-07, "logits/chosen": 3.1415085792541504, "logits/rejected": 5.441884994506836, "logps/chosen": -8.506572723388672, "logps/rejected": -14.266595840454102, "loss": 0.1893, "rewards/accuracies": 0.9375, "rewards/chosen": 0.27693843841552734, "rewards/margins": 2.0275750160217285, "rewards/rejected": -1.7506364583969116, "step": 384 }, { "epoch": 6.52542372881356, "grad_norm": 11.50776715255362, "learning_rate": 4.259905502586457e-07, "logits/chosen": -4.798701763153076, "logits/rejected": -3.0812864303588867, "logps/chosen": -9.191747665405273, "logps/rejected": -15.61034107208252, "loss": 0.1962, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3073745667934418, "rewards/margins": 2.615311861038208, "rewards/rejected": -2.3079373836517334, "step": 385 }, { "epoch": 6.5423728813559325, "grad_norm": 13.297389708968069, "learning_rate": 4.254645275257953e-07, "logits/chosen": 2.7727794647216797, "logits/rejected": 1.049268364906311, "logps/chosen": -9.638428688049316, "logps/rejected": -19.182289123535156, "loss": 0.1946, "rewards/accuracies": 1.0, "rewards/chosen": 0.21066324412822723, "rewards/margins": 2.508415937423706, "rewards/rejected": -2.297752618789673, "step": 386 }, { "epoch": 6.559322033898305, "grad_norm": 11.982375330029488, "learning_rate": 4.24936969328634e-07, "logits/chosen": 1.112790584564209, "logits/rejected": 0.8258357048034668, "logps/chosen": -6.70481014251709, "logps/rejected": -16.066164016723633, "loss": 0.2029, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12247809022665024, "rewards/margins": 2.890721559524536, "rewards/rejected": -2.7682437896728516, "step": 387 }, { "epoch": 6.576271186440678, "grad_norm": 13.521630088491362, "learning_rate": 4.244078802837462e-07, "logits/chosen": 2.7497763633728027, "logits/rejected": 3.7148149013519287, "logps/chosen": -9.611839294433594, "logps/rejected": -12.181758880615234, "loss": 0.208, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35590308904647827, "rewards/margins": 1.9600085020065308, "rewards/rejected": -1.6041054725646973, "step": 388 }, { "epoch": 6.593220338983051, "grad_norm": 11.804130811346875, "learning_rate": 4.238772650211123e-07, "logits/chosen": 2.2374987602233887, "logits/rejected": 2.2601318359375, "logps/chosen": -7.375918388366699, "logps/rejected": -18.406349182128906, "loss": 0.1699, "rewards/accuracies": 1.0, "rewards/chosen": 0.30262285470962524, "rewards/margins": 3.1041243076324463, "rewards/rejected": -2.801501750946045, "step": 389 }, { "epoch": 6.610169491525424, "grad_norm": 12.379786650904354, "learning_rate": 4.233451281840685e-07, "logits/chosen": 3.232478618621826, "logits/rejected": 3.9859728813171387, "logps/chosen": -8.100415229797363, "logps/rejected": -15.276487350463867, "loss": 0.1983, "rewards/accuracies": 0.9375, "rewards/chosen": 0.42072463035583496, "rewards/margins": 2.0583455562591553, "rewards/rejected": -1.6376209259033203, "step": 390 }, { "epoch": 6.627118644067797, "grad_norm": 11.67383799621466, "learning_rate": 4.2281147442926636e-07, "logits/chosen": -0.6115279197692871, "logits/rejected": -1.3101024627685547, "logps/chosen": -6.683277130126953, "logps/rejected": -13.343738555908203, "loss": 0.1943, "rewards/accuracies": 0.875, "rewards/chosen": 0.28772106766700745, "rewards/margins": 2.299579620361328, "rewards/rejected": -2.0118587017059326, "step": 391 }, { "epoch": 6.6440677966101696, "grad_norm": 11.123053528086508, "learning_rate": 4.222763084266313e-07, "logits/chosen": 0.5712847709655762, "logits/rejected": 4.945223331451416, "logps/chosen": -9.13448715209961, "logps/rejected": -18.582347869873047, "loss": 0.1701, "rewards/accuracies": 1.0, "rewards/chosen": 0.05717560648918152, "rewards/margins": 3.1925296783447266, "rewards/rejected": -3.135354518890381, "step": 392 }, { "epoch": 6.661016949152542, "grad_norm": 11.036054591352455, "learning_rate": 4.217396348593224e-07, "logits/chosen": 1.5376551151275635, "logits/rejected": 3.0253007411956787, "logps/chosen": -12.541561126708984, "logps/rejected": -21.630020141601562, "loss": 0.183, "rewards/accuracies": 1.0, "rewards/chosen": 0.24118569493293762, "rewards/margins": 3.7048823833465576, "rewards/rejected": -3.4636964797973633, "step": 393 }, { "epoch": 6.677966101694915, "grad_norm": 13.009053036332219, "learning_rate": 4.2120145842369137e-07, "logits/chosen": 2.204005718231201, "logits/rejected": 1.796671986579895, "logps/chosen": -7.913434028625488, "logps/rejected": -19.2489070892334, "loss": 0.1972, "rewards/accuracies": 1.0, "rewards/chosen": 0.2287922501564026, "rewards/margins": 3.174703598022461, "rewards/rejected": -2.945911407470703, "step": 394 }, { "epoch": 6.694915254237288, "grad_norm": 11.383455653308374, "learning_rate": 4.206617838292411e-07, "logits/chosen": 2.897174119949341, "logits/rejected": 5.16046667098999, "logps/chosen": -8.984628677368164, "logps/rejected": -21.642620086669922, "loss": 0.1618, "rewards/accuracies": 0.9375, "rewards/chosen": 0.250646710395813, "rewards/margins": 4.119473457336426, "rewards/rejected": -3.8688266277313232, "step": 395 }, { "epoch": 6.711864406779661, "grad_norm": 11.683503762881484, "learning_rate": 4.201206157985846e-07, "logits/chosen": 3.2230072021484375, "logits/rejected": 4.522119045257568, "logps/chosen": -7.677939414978027, "logps/rejected": -15.542098045349121, "loss": 0.1786, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2948039770126343, "rewards/margins": 2.671304941177368, "rewards/rejected": -2.3765010833740234, "step": 396 }, { "epoch": 6.728813559322034, "grad_norm": 12.164249408349729, "learning_rate": 4.1957795906740403e-07, "logits/chosen": -2.505526542663574, "logits/rejected": -2.6718220710754395, "logps/chosen": -6.572645664215088, "logps/rejected": -13.47610092163086, "loss": 0.1799, "rewards/accuracies": 1.0, "rewards/chosen": 0.5586151480674744, "rewards/margins": 2.246644973754883, "rewards/rejected": -1.6880300045013428, "step": 397 }, { "epoch": 6.745762711864407, "grad_norm": 11.22721717289838, "learning_rate": 4.1903381838440853e-07, "logits/chosen": -0.2245522439479828, "logits/rejected": -1.0013046264648438, "logps/chosen": -8.802066802978516, "logps/rejected": -16.421092987060547, "loss": 0.1753, "rewards/accuracies": 1.0, "rewards/chosen": 0.3151366710662842, "rewards/margins": 2.0241858959198, "rewards/rejected": -1.7090493440628052, "step": 398 }, { "epoch": 6.762711864406779, "grad_norm": 11.2826876856426, "learning_rate": 4.1848819851129345e-07, "logits/chosen": 0.24271805584430695, "logits/rejected": -0.0052538588643074036, "logps/chosen": -11.282722473144531, "logps/rejected": -17.96334457397461, "loss": 0.1618, "rewards/accuracies": 1.0, "rewards/chosen": 0.5386340022087097, "rewards/margins": 3.651751756668091, "rewards/rejected": -3.1131176948547363, "step": 399 }, { "epoch": 6.779661016949152, "grad_norm": 12.565118366696153, "learning_rate": 4.179411042226982e-07, "logits/chosen": 2.8625733852386475, "logits/rejected": 5.791099548339844, "logps/chosen": -12.863717079162598, "logps/rejected": -20.775606155395508, "loss": 0.1896, "rewards/accuracies": 0.9375, "rewards/chosen": -0.035915374755859375, "rewards/margins": 3.369300365447998, "rewards/rejected": -3.4052159786224365, "step": 400 }, { "epoch": 6.796610169491525, "grad_norm": 11.833011467732364, "learning_rate": 4.173925403061644e-07, "logits/chosen": 0.865548849105835, "logits/rejected": 1.7070629596710205, "logps/chosen": -11.461730003356934, "logps/rejected": -29.828683853149414, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": 0.28670307993888855, "rewards/margins": 4.051923751831055, "rewards/rejected": -3.7652201652526855, "step": 401 }, { "epoch": 6.813559322033898, "grad_norm": 11.010173899657152, "learning_rate": 4.1684251156209437e-07, "logits/chosen": 3.118610382080078, "logits/rejected": 3.4988858699798584, "logps/chosen": -9.700953483581543, "logps/rejected": -21.816139221191406, "loss": 0.1706, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2815594971179962, "rewards/margins": 3.5297186374664307, "rewards/rejected": -3.248159170150757, "step": 402 }, { "epoch": 6.830508474576272, "grad_norm": 12.448855337269192, "learning_rate": 4.16291022803709e-07, "logits/chosen": 1.8236751556396484, "logits/rejected": 3.2149503231048584, "logps/chosen": -9.841229438781738, "logps/rejected": -15.228986740112305, "loss": 0.1961, "rewards/accuracies": 1.0, "rewards/chosen": 0.3855394721031189, "rewards/margins": 2.7110955715179443, "rewards/rejected": -2.3255560398101807, "step": 403 }, { "epoch": 6.847457627118644, "grad_norm": 13.363779328365679, "learning_rate": 4.1573807885700523e-07, "logits/chosen": 2.889011859893799, "logits/rejected": 3.5058770179748535, "logps/chosen": -11.454680442810059, "logps/rejected": -24.09137725830078, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": 0.421461820602417, "rewards/margins": 3.8386220932006836, "rewards/rejected": -3.4171600341796875, "step": 404 }, { "epoch": 6.864406779661017, "grad_norm": 12.307644038866737, "learning_rate": 4.151836845607144e-07, "logits/chosen": 3.0431900024414062, "logits/rejected": 3.4132754802703857, "logps/chosen": -11.601396560668945, "logps/rejected": -18.625232696533203, "loss": 0.1713, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6460351943969727, "rewards/margins": 3.304206609725952, "rewards/rejected": -2.6581716537475586, "step": 405 }, { "epoch": 6.88135593220339, "grad_norm": 11.401784945653475, "learning_rate": 4.146278447662597e-07, "logits/chosen": 4.435127258300781, "logits/rejected": 3.706249237060547, "logps/chosen": -7.475669860839844, "logps/rejected": -11.912774085998535, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/chosen": 0.37761247158050537, "rewards/margins": 2.3568031787872314, "rewards/rejected": -1.979190707206726, "step": 406 }, { "epoch": 6.898305084745763, "grad_norm": 10.845200868398582, "learning_rate": 4.1407056433771324e-07, "logits/chosen": 4.9845380783081055, "logits/rejected": 4.5369486808776855, "logps/chosen": -9.58263874053955, "logps/rejected": -20.3066349029541, "loss": 0.1542, "rewards/accuracies": 1.0, "rewards/chosen": -0.0040725767612457275, "rewards/margins": 3.683826208114624, "rewards/rejected": -3.687898874282837, "step": 407 }, { "epoch": 6.915254237288136, "grad_norm": 11.322141311865895, "learning_rate": 4.1351184815175456e-07, "logits/chosen": -0.577551543712616, "logits/rejected": 1.696087121963501, "logps/chosen": -11.518467903137207, "logps/rejected": -24.423654556274414, "loss": 0.1658, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30860117077827454, "rewards/margins": 3.8829498291015625, "rewards/rejected": -3.5743489265441895, "step": 408 }, { "epoch": 6.932203389830509, "grad_norm": 10.870149379931851, "learning_rate": 4.1295170109762677e-07, "logits/chosen": 1.2290717363357544, "logits/rejected": 2.379326105117798, "logps/chosen": -8.551403999328613, "logps/rejected": -16.353551864624023, "loss": 0.1642, "rewards/accuracies": 1.0, "rewards/chosen": 0.384010374546051, "rewards/margins": 3.345313787460327, "rewards/rejected": -2.961303472518921, "step": 409 }, { "epoch": 6.9491525423728815, "grad_norm": 11.781844222361585, "learning_rate": 4.1239012807709444e-07, "logits/chosen": -0.22932285070419312, "logits/rejected": 2.510235548019409, "logps/chosen": -9.169673919677734, "logps/rejected": -22.106855392456055, "loss": 0.1667, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12222418189048767, "rewards/margins": 3.730660915374756, "rewards/rejected": -3.6084365844726562, "step": 410 }, { "epoch": 6.966101694915254, "grad_norm": 11.10813604751353, "learning_rate": 4.1182713400440074e-07, "logits/chosen": 0.7548007965087891, "logits/rejected": 4.313141345977783, "logps/chosen": -10.161952018737793, "logps/rejected": -18.345703125, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 0.5395796298980713, "rewards/margins": 3.6666393280029297, "rewards/rejected": -3.1270599365234375, "step": 411 }, { "epoch": 6.983050847457627, "grad_norm": 10.950373908675447, "learning_rate": 4.112627238062238e-07, "logits/chosen": 0.7141157388687134, "logits/rejected": 2.862635850906372, "logps/chosen": -7.2346720695495605, "logps/rejected": -14.257221221923828, "loss": 0.1791, "rewards/accuracies": 1.0, "rewards/chosen": 0.6277063488960266, "rewards/margins": 2.4815032482147217, "rewards/rejected": -1.8537968397140503, "step": 412 }, { "epoch": 7.0, "grad_norm": 12.070707974127568, "learning_rate": 4.106969024216348e-07, "logits/chosen": 3.3420722484588623, "logits/rejected": 2.9186367988586426, "logps/chosen": -8.318929672241211, "logps/rejected": -19.41041374206543, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": 0.258792519569397, "rewards/margins": 3.3974623680114746, "rewards/rejected": -3.138669729232788, "step": 413 }, { "epoch": 7.016949152542373, "grad_norm": 10.149398873013046, "learning_rate": 4.101296748020533e-07, "logits/chosen": -1.3689672946929932, "logits/rejected": -0.603845477104187, "logps/chosen": -10.28722858428955, "logps/rejected": -19.97595977783203, "loss": 0.1591, "rewards/accuracies": 0.9375, "rewards/chosen": 0.41477838158607483, "rewards/margins": 3.526080846786499, "rewards/rejected": -3.1113028526306152, "step": 414 }, { "epoch": 7.033898305084746, "grad_norm": 9.47123309900026, "learning_rate": 4.09561045911205e-07, "logits/chosen": 0.6049144268035889, "logits/rejected": 0.7107745409011841, "logps/chosen": -8.770289421081543, "logps/rejected": -14.066744804382324, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 0.39408162236213684, "rewards/margins": 2.6371524333953857, "rewards/rejected": -2.243070602416992, "step": 415 }, { "epoch": 7.0508474576271185, "grad_norm": 10.052821182121836, "learning_rate": 4.0899102072507773e-07, "logits/chosen": -0.4487457871437073, "logits/rejected": 0.46110111474990845, "logps/chosen": -7.779388427734375, "logps/rejected": -14.188379287719727, "loss": 0.1533, "rewards/accuracies": 1.0, "rewards/chosen": 0.5070340037345886, "rewards/margins": 2.450317621231079, "rewards/rejected": -1.9432835578918457, "step": 416 }, { "epoch": 7.067796610169491, "grad_norm": 11.587099181096272, "learning_rate": 4.084196042318783e-07, "logits/chosen": -3.301814079284668, "logits/rejected": -2.8254811763763428, "logps/chosen": -9.915962219238281, "logps/rejected": -14.52343463897705, "loss": 0.1823, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3799024820327759, "rewards/margins": 2.0725831985473633, "rewards/rejected": -1.6926804780960083, "step": 417 }, { "epoch": 7.084745762711864, "grad_norm": 9.783875636578681, "learning_rate": 4.0784680143198837e-07, "logits/chosen": 2.509547472000122, "logits/rejected": 4.715401649475098, "logps/chosen": -6.57558012008667, "logps/rejected": -17.49691390991211, "loss": 0.1474, "rewards/accuracies": 1.0, "rewards/chosen": 0.4137152135372162, "rewards/margins": 3.915511131286621, "rewards/rejected": -3.501796245574951, "step": 418 }, { "epoch": 7.101694915254237, "grad_norm": 10.663457705346287, "learning_rate": 4.0727261733792124e-07, "logits/chosen": 0.4356730580329895, "logits/rejected": 1.356868028640747, "logps/chosen": -7.484723091125488, "logps/rejected": -17.026552200317383, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": 0.37025827169418335, "rewards/margins": 3.1471047401428223, "rewards/rejected": -2.7768466472625732, "step": 419 }, { "epoch": 7.11864406779661, "grad_norm": 10.814503714644006, "learning_rate": 4.0669705697427754e-07, "logits/chosen": 5.528136253356934, "logits/rejected": 7.37587833404541, "logps/chosen": -13.677026748657227, "logps/rejected": -22.8214054107666, "loss": 0.1739, "rewards/accuracies": 1.0, "rewards/chosen": 0.6871755719184875, "rewards/margins": 3.2107880115509033, "rewards/rejected": -2.5236124992370605, "step": 420 }, { "epoch": 7.135593220338983, "grad_norm": 9.947743029988125, "learning_rate": 4.061201253777015e-07, "logits/chosen": 0.058755338191986084, "logits/rejected": 1.8083921670913696, "logps/chosen": -10.58738899230957, "logps/rejected": -18.75304412841797, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 0.1582307070493698, "rewards/margins": 3.144596815109253, "rewards/rejected": -2.9863665103912354, "step": 421 }, { "epoch": 7.1525423728813555, "grad_norm": 9.876244438495242, "learning_rate": 4.0554182759683675e-07, "logits/chosen": 1.8721250295639038, "logits/rejected": 2.4072189331054688, "logps/chosen": -6.363444805145264, "logps/rejected": -18.347625732421875, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 0.3553791344165802, "rewards/margins": 3.64908504486084, "rewards/rejected": -3.293706178665161, "step": 422 }, { "epoch": 7.169491525423728, "grad_norm": 10.133228361779297, "learning_rate": 4.049621686922823e-07, "logits/chosen": -0.28978219628334045, "logits/rejected": -0.5108487606048584, "logps/chosen": -9.293232917785645, "logps/rejected": -13.629794120788574, "loss": 0.1536, "rewards/accuracies": 1.0, "rewards/chosen": 0.1518397033214569, "rewards/margins": 2.251439332962036, "rewards/rejected": -2.099599838256836, "step": 423 }, { "epoch": 7.186440677966102, "grad_norm": 9.307930017331996, "learning_rate": 4.0438115373654795e-07, "logits/chosen": -3.549588203430176, "logits/rejected": -1.5921013355255127, "logps/chosen": -10.396879196166992, "logps/rejected": -15.234865188598633, "loss": 0.1351, "rewards/accuracies": 1.0, "rewards/chosen": 0.5600487589836121, "rewards/margins": 3.305903911590576, "rewards/rejected": -2.7458550930023193, "step": 424 }, { "epoch": 7.203389830508475, "grad_norm": 9.812274829008038, "learning_rate": 4.0379878781401046e-07, "logits/chosen": 0.6426932215690613, "logits/rejected": 0.6157368421554565, "logps/chosen": -8.68526554107666, "logps/rejected": -20.45929527282715, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": 0.32862192392349243, "rewards/margins": 3.3139867782592773, "rewards/rejected": -2.9853649139404297, "step": 425 }, { "epoch": 7.220338983050848, "grad_norm": 10.179065073117089, "learning_rate": 4.0321507602086836e-07, "logits/chosen": -0.9165471196174622, "logits/rejected": 1.5311022996902466, "logps/chosen": -10.904990196228027, "logps/rejected": -20.312908172607422, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 0.22916960716247559, "rewards/margins": 3.8574917316436768, "rewards/rejected": -3.6283223628997803, "step": 426 }, { "epoch": 7.237288135593221, "grad_norm": 9.70945311905393, "learning_rate": 4.026300234650979e-07, "logits/chosen": -4.223483085632324, "logits/rejected": -4.719742774963379, "logps/chosen": -10.978636741638184, "logps/rejected": -19.935565948486328, "loss": 0.1295, "rewards/accuracies": 1.0, "rewards/chosen": 0.6995031237602234, "rewards/margins": 2.482172727584839, "rewards/rejected": -1.7826696634292603, "step": 427 }, { "epoch": 7.254237288135593, "grad_norm": 9.565067688440125, "learning_rate": 4.020436352664079e-07, "logits/chosen": -1.7467882633209229, "logits/rejected": 0.7342086434364319, "logps/chosen": -9.958053588867188, "logps/rejected": -21.70626449584961, "loss": 0.1375, "rewards/accuracies": 1.0, "rewards/chosen": 0.24878710508346558, "rewards/margins": 4.359116554260254, "rewards/rejected": -4.110329627990723, "step": 428 }, { "epoch": 7.271186440677966, "grad_norm": 9.60457205625089, "learning_rate": 4.014559165561956e-07, "logits/chosen": 2.2311694622039795, "logits/rejected": 7.812325954437256, "logps/chosen": -9.30516242980957, "logps/rejected": -18.18703269958496, "loss": 0.1467, "rewards/accuracies": 1.0, "rewards/chosen": 0.2606554925441742, "rewards/margins": 3.457913398742676, "rewards/rejected": -3.1972575187683105, "step": 429 }, { "epoch": 7.288135593220339, "grad_norm": 9.82463351889948, "learning_rate": 4.0086687247750095e-07, "logits/chosen": -3.501877546310425, "logits/rejected": -3.8577065467834473, "logps/chosen": -7.8808465003967285, "logps/rejected": -15.253175735473633, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 0.4603765904903412, "rewards/margins": 2.3972482681274414, "rewards/rejected": -1.9368716478347778, "step": 430 }, { "epoch": 7.305084745762712, "grad_norm": 9.784793133340768, "learning_rate": 4.0027650818496226e-07, "logits/chosen": 5.14243745803833, "logits/rejected": 3.4985580444335938, "logps/chosen": -10.001608848571777, "logps/rejected": -26.74629020690918, "loss": 0.1407, "rewards/accuracies": 1.0, "rewards/chosen": -0.08906976878643036, "rewards/margins": 4.575994968414307, "rewards/rejected": -4.665064811706543, "step": 431 }, { "epoch": 7.322033898305085, "grad_norm": 10.071653241978936, "learning_rate": 3.996848288447707e-07, "logits/chosen": -4.888203144073486, "logits/rejected": -4.589768886566162, "logps/chosen": -7.157283782958984, "logps/rejected": -17.219411849975586, "loss": 0.1546, "rewards/accuracies": 1.0, "rewards/chosen": 0.3107476532459259, "rewards/margins": 2.8994295597076416, "rewards/rejected": -2.588681936264038, "step": 432 }, { "epoch": 7.338983050847458, "grad_norm": 10.22451359355856, "learning_rate": 3.9909183963462536e-07, "logits/chosen": -1.3133256435394287, "logits/rejected": 0.9133470058441162, "logps/chosen": -13.30894660949707, "logps/rejected": -25.437026977539062, "loss": 0.1597, "rewards/accuracies": 1.0, "rewards/chosen": 0.3724314570426941, "rewards/margins": 4.273044586181641, "rewards/rejected": -3.900613307952881, "step": 433 }, { "epoch": 7.3559322033898304, "grad_norm": 10.254114217242893, "learning_rate": 3.984975457436876e-07, "logits/chosen": -1.5063201189041138, "logits/rejected": 0.4495699405670166, "logps/chosen": -6.538341999053955, "logps/rejected": -13.631461143493652, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 0.3549283444881439, "rewards/margins": 2.530862808227539, "rewards/rejected": -2.1759345531463623, "step": 434 }, { "epoch": 7.372881355932203, "grad_norm": 9.787862344597166, "learning_rate": 3.979019523725361e-07, "logits/chosen": -0.7558501958847046, "logits/rejected": 1.1639087200164795, "logps/chosen": -9.700092315673828, "logps/rejected": -13.182903289794922, "loss": 0.1509, "rewards/accuracies": 1.0, "rewards/chosen": 0.4032173752784729, "rewards/margins": 2.2667555809020996, "rewards/rejected": -1.8635382652282715, "step": 435 }, { "epoch": 7.389830508474576, "grad_norm": 11.242250708423146, "learning_rate": 3.973050647331209e-07, "logits/chosen": -0.858704149723053, "logits/rejected": -0.11889542639255524, "logps/chosen": -11.472419738769531, "logps/rejected": -20.94268798828125, "loss": 0.1546, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4044639766216278, "rewards/margins": 3.0765256881713867, "rewards/rejected": -2.6720614433288574, "step": 436 }, { "epoch": 7.406779661016949, "grad_norm": 10.476314866762824, "learning_rate": 3.967068880487181e-07, "logits/chosen": -1.5288808345794678, "logits/rejected": -1.7149463891983032, "logps/chosen": -10.024654388427734, "logps/rejected": -21.697467803955078, "loss": 0.1652, "rewards/accuracies": 1.0, "rewards/chosen": 0.524260401725769, "rewards/margins": 4.019008636474609, "rewards/rejected": -3.494748592376709, "step": 437 }, { "epoch": 7.423728813559322, "grad_norm": 10.970528134679885, "learning_rate": 3.9610742755388406e-07, "logits/chosen": -2.111297369003296, "logits/rejected": 0.004118561744689941, "logps/chosen": -7.667480945587158, "logps/rejected": -13.297545433044434, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 0.4699084162712097, "rewards/margins": 2.736049175262451, "rewards/rejected": -2.2661406993865967, "step": 438 }, { "epoch": 7.440677966101695, "grad_norm": 9.842004888488471, "learning_rate": 3.955066884944094e-07, "logits/chosen": 0.6597349643707275, "logits/rejected": -0.04374626278877258, "logps/chosen": -11.741231918334961, "logps/rejected": -21.247833251953125, "loss": 0.1255, "rewards/accuracies": 1.0, "rewards/chosen": 0.4969398081302643, "rewards/margins": 3.5522348880767822, "rewards/rejected": -3.055295467376709, "step": 439 }, { "epoch": 7.4576271186440675, "grad_norm": 8.972971902148998, "learning_rate": 3.949046761272735e-07, "logits/chosen": 2.3338708877563477, "logits/rejected": 2.589047908782959, "logps/chosen": -5.845430850982666, "logps/rejected": -9.895733833312988, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": 0.6523416042327881, "rewards/margins": 1.911478042602539, "rewards/rejected": -1.2591363191604614, "step": 440 }, { "epoch": 7.47457627118644, "grad_norm": 10.066026413188887, "learning_rate": 3.9430139572059815e-07, "logits/chosen": -1.8239307403564453, "logits/rejected": -2.2824392318725586, "logps/chosen": -9.65200424194336, "logps/rejected": -22.091182708740234, "loss": 0.1518, "rewards/accuracies": 1.0, "rewards/chosen": 0.40659016370773315, "rewards/margins": 3.9431817531585693, "rewards/rejected": -3.5365915298461914, "step": 441 }, { "epoch": 7.491525423728813, "grad_norm": 10.09896085548317, "learning_rate": 3.9369685255360173e-07, "logits/chosen": 1.6324787139892578, "logits/rejected": -0.07427901029586792, "logps/chosen": -8.006882667541504, "logps/rejected": -11.395492553710938, "loss": 0.145, "rewards/accuracies": 1.0, "rewards/chosen": 0.6132466197013855, "rewards/margins": 2.4093189239501953, "rewards/rejected": -1.796072244644165, "step": 442 }, { "epoch": 7.508474576271187, "grad_norm": 10.035637606591012, "learning_rate": 3.9309105191655247e-07, "logits/chosen": -2.1883177757263184, "logits/rejected": -1.8495557308197021, "logps/chosen": -9.062346458435059, "logps/rejected": -20.01108741760254, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": 0.11416606605052948, "rewards/margins": 3.4943981170654297, "rewards/rejected": -3.380232334136963, "step": 443 }, { "epoch": 7.52542372881356, "grad_norm": 10.206926858110808, "learning_rate": 3.924839991107229e-07, "logits/chosen": -1.1415421962738037, "logits/rejected": 0.6667934656143188, "logps/chosen": -10.95050048828125, "logps/rejected": -24.7608642578125, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 0.4104726314544678, "rewards/margins": 4.508932113647461, "rewards/rejected": -4.098459720611572, "step": 444 }, { "epoch": 7.5423728813559325, "grad_norm": 9.72571019158722, "learning_rate": 3.918756994483429e-07, "logits/chosen": 2.068418025970459, "logits/rejected": 4.546534061431885, "logps/chosen": -7.813698768615723, "logps/rejected": -14.570735931396484, "loss": 0.1401, "rewards/accuracies": 1.0, "rewards/chosen": 0.4646166265010834, "rewards/margins": 2.6729631423950195, "rewards/rejected": -2.2083466053009033, "step": 445 }, { "epoch": 7.559322033898305, "grad_norm": 10.55354685964242, "learning_rate": 3.912661582525536e-07, "logits/chosen": -3.7456912994384766, "logits/rejected": -1.1216098070144653, "logps/chosen": -10.544078826904297, "logps/rejected": -15.885772705078125, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 0.6655651330947876, "rewards/margins": 2.157383680343628, "rewards/rejected": -1.4918183088302612, "step": 446 }, { "epoch": 7.576271186440678, "grad_norm": 8.874108679556333, "learning_rate": 3.906553808573604e-07, "logits/chosen": -1.0291190147399902, "logits/rejected": 0.6091427803039551, "logps/chosen": -9.206077575683594, "logps/rejected": -21.849958419799805, "loss": 0.1418, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3792378604412079, "rewards/margins": 4.100043296813965, "rewards/rejected": -3.7208056449890137, "step": 447 }, { "epoch": 7.593220338983051, "grad_norm": 9.89378514039557, "learning_rate": 3.9004337260758644e-07, "logits/chosen": 1.9020898342132568, "logits/rejected": 0.8474722504615784, "logps/chosen": -9.850662231445312, "logps/rejected": -23.219379425048828, "loss": 0.1377, "rewards/accuracies": 1.0, "rewards/chosen": 0.36318060755729675, "rewards/margins": 3.600226640701294, "rewards/rejected": -3.237046003341675, "step": 448 }, { "epoch": 7.610169491525424, "grad_norm": 10.865068959798018, "learning_rate": 3.894301388588264e-07, "logits/chosen": -0.9812300801277161, "logits/rejected": 0.4254589080810547, "logps/chosen": -10.075271606445312, "logps/rejected": -14.64460563659668, "loss": 0.1611, "rewards/accuracies": 1.0, "rewards/chosen": 0.38537198305130005, "rewards/margins": 2.6098413467407227, "rewards/rejected": -2.2244694232940674, "step": 449 }, { "epoch": 7.627118644067797, "grad_norm": 10.555586251241943, "learning_rate": 3.888156849773985e-07, "logits/chosen": -1.378682017326355, "logits/rejected": 3.440859794616699, "logps/chosen": -8.580948829650879, "logps/rejected": -17.025653839111328, "loss": 0.1538, "rewards/accuracies": 1.0, "rewards/chosen": 0.21920417249202728, "rewards/margins": 3.138881206512451, "rewards/rejected": -2.919677257537842, "step": 450 }, { "epoch": 7.6440677966101696, "grad_norm": 10.209560205921942, "learning_rate": 3.882000163402983e-07, "logits/chosen": 2.072082757949829, "logits/rejected": 3.475949764251709, "logps/chosen": -10.055817604064941, "logps/rejected": -17.178794860839844, "loss": 0.146, "rewards/accuracies": 1.0, "rewards/chosen": 0.36519867181777954, "rewards/margins": 3.059540271759033, "rewards/rejected": -2.6943416595458984, "step": 451 }, { "epoch": 7.661016949152542, "grad_norm": 10.663402608136824, "learning_rate": 3.8758313833515186e-07, "logits/chosen": 0.2695200443267822, "logits/rejected": 0.7485695481300354, "logps/chosen": -10.24669075012207, "logps/rejected": -22.458515167236328, "loss": 0.1532, "rewards/accuracies": 0.9375, "rewards/chosen": -0.023169204592704773, "rewards/margins": 4.44655704498291, "rewards/rejected": -4.469725608825684, "step": 452 }, { "epoch": 7.677966101694915, "grad_norm": 10.663452691065979, "learning_rate": 3.86965056360168e-07, "logits/chosen": 0.023821651935577393, "logits/rejected": 0.6948651075363159, "logps/chosen": -7.995731353759766, "logps/rejected": -17.48453140258789, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": 0.333486407995224, "rewards/margins": 3.026156425476074, "rewards/rejected": -2.6926703453063965, "step": 453 }, { "epoch": 7.694915254237288, "grad_norm": 10.29075290611524, "learning_rate": 3.8634577582409115e-07, "logits/chosen": 2.8640151023864746, "logits/rejected": 0.4052382707595825, "logps/chosen": -5.422905921936035, "logps/rejected": -15.552241325378418, "loss": 0.1561, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40284836292266846, "rewards/margins": 2.800489664077759, "rewards/rejected": -2.397641181945801, "step": 454 }, { "epoch": 7.711864406779661, "grad_norm": 8.656662324471474, "learning_rate": 3.857253021461545e-07, "logits/chosen": -2.2958803176879883, "logits/rejected": 0.16803821921348572, "logps/chosen": -8.433700561523438, "logps/rejected": -16.503564834594727, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 0.6839694380760193, "rewards/margins": 3.1981723308563232, "rewards/rejected": -2.514202833175659, "step": 455 }, { "epoch": 7.728813559322034, "grad_norm": 9.015380165768256, "learning_rate": 3.8510364075603185e-07, "logits/chosen": 0.7340269684791565, "logits/rejected": 2.136843681335449, "logps/chosen": -8.46870231628418, "logps/rejected": -23.236953735351562, "loss": 0.1216, "rewards/accuracies": 1.0, "rewards/chosen": -0.07831863313913345, "rewards/margins": 4.905647277832031, "rewards/rejected": -4.983965873718262, "step": 456 }, { "epoch": 7.745762711864407, "grad_norm": 11.048345802234925, "learning_rate": 3.84480797093791e-07, "logits/chosen": -0.974209189414978, "logits/rejected": -1.9440693855285645, "logps/chosen": -7.408576488494873, "logps/rejected": -13.765433311462402, "loss": 0.1279, "rewards/accuracies": 1.0, "rewards/chosen": 0.5464452505111694, "rewards/margins": 2.7349631786346436, "rewards/rejected": -2.1885178089141846, "step": 457 }, { "epoch": 7.762711864406779, "grad_norm": 9.33901404478676, "learning_rate": 3.8385677660984514e-07, "logits/chosen": 2.3597500324249268, "logits/rejected": 2.794603109359741, "logps/chosen": -10.939367294311523, "logps/rejected": -29.22560691833496, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 0.06992827355861664, "rewards/margins": 5.073367118835449, "rewards/rejected": -5.003438472747803, "step": 458 }, { "epoch": 7.779661016949152, "grad_norm": 10.001912701586786, "learning_rate": 3.83231584764906e-07, "logits/chosen": -6.860588550567627, "logits/rejected": -1.7909083366394043, "logps/chosen": -11.336816787719727, "logps/rejected": -23.062660217285156, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 0.20468257367610931, "rewards/margins": 3.9968860149383545, "rewards/rejected": -3.792203426361084, "step": 459 }, { "epoch": 7.796610169491525, "grad_norm": 8.988574047583393, "learning_rate": 3.826052270299356e-07, "logits/chosen": -1.0738238096237183, "logits/rejected": 0.2171797752380371, "logps/chosen": -8.178009033203125, "logps/rejected": -14.752239227294922, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 0.6255313754081726, "rewards/margins": 3.040125846862793, "rewards/rejected": -2.4145946502685547, "step": 460 }, { "epoch": 7.813559322033898, "grad_norm": 9.050244068170231, "learning_rate": 3.8197770888609846e-07, "logits/chosen": 0.6120907664299011, "logits/rejected": 2.2083263397216797, "logps/chosen": -9.618986129760742, "logps/rejected": -16.45279312133789, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 0.29597193002700806, "rewards/margins": 3.233238935470581, "rewards/rejected": -2.937267303466797, "step": 461 }, { "epoch": 7.830508474576272, "grad_norm": 9.875563292935468, "learning_rate": 3.813490358247137e-07, "logits/chosen": -0.11616599559783936, "logits/rejected": -3.459362030029297, "logps/chosen": -8.407455444335938, "logps/rejected": -23.708166122436523, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": 0.33320125937461853, "rewards/margins": 3.724797248840332, "rewards/rejected": -3.3915958404541016, "step": 462 }, { "epoch": 7.847457627118644, "grad_norm": 8.566033021813828, "learning_rate": 3.807192133472069e-07, "logits/chosen": 0.0005160421133041382, "logits/rejected": -0.07690905034542084, "logps/chosen": -9.008073806762695, "logps/rejected": -22.778635025024414, "loss": 0.1055, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08228336274623871, "rewards/margins": 4.054976463317871, "rewards/rejected": -4.1372599601745605, "step": 463 }, { "epoch": 7.864406779661017, "grad_norm": 9.207368732490258, "learning_rate": 3.80088246965062e-07, "logits/chosen": 0.7204670906066895, "logits/rejected": 0.8215815424919128, "logps/chosen": -7.013503551483154, "logps/rejected": -18.432910919189453, "loss": 0.1173, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24773481488227844, "rewards/margins": 3.5829970836639404, "rewards/rejected": -3.3352622985839844, "step": 464 }, { "epoch": 7.88135593220339, "grad_norm": 10.374440714970705, "learning_rate": 3.794561421997734e-07, "logits/chosen": -1.2234055995941162, "logits/rejected": 0.8710986375808716, "logps/chosen": -11.177177429199219, "logps/rejected": -19.628103256225586, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": 0.19700887799263, "rewards/margins": 2.63451886177063, "rewards/rejected": -2.437509775161743, "step": 465 }, { "epoch": 7.898305084745763, "grad_norm": 10.507765784627404, "learning_rate": 3.78822904582797e-07, "logits/chosen": -4.167044639587402, "logits/rejected": -2.423272132873535, "logps/chosen": -10.025394439697266, "logps/rejected": -18.540422439575195, "loss": 0.1409, "rewards/accuracies": 1.0, "rewards/chosen": 0.6198136806488037, "rewards/margins": 3.351442813873291, "rewards/rejected": -2.7316291332244873, "step": 466 }, { "epoch": 7.915254237288136, "grad_norm": 10.278415787981471, "learning_rate": 3.781885396555019e-07, "logits/chosen": -0.4185442328453064, "logits/rejected": 0.9195470809936523, "logps/chosen": -7.53629207611084, "logps/rejected": -18.34429931640625, "loss": 0.1598, "rewards/accuracies": 1.0, "rewards/chosen": 0.3725518584251404, "rewards/margins": 2.788588047027588, "rewards/rejected": -2.4160361289978027, "step": 467 }, { "epoch": 7.932203389830509, "grad_norm": 9.472880803747143, "learning_rate": 3.775530529691227e-07, "logits/chosen": 0.5384914875030518, "logits/rejected": -0.05707889795303345, "logps/chosen": -5.18881368637085, "logps/rejected": -16.295425415039062, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 0.3952220678329468, "rewards/margins": 3.6583235263824463, "rewards/rejected": -3.263101816177368, "step": 468 }, { "epoch": 7.9491525423728815, "grad_norm": 10.770547462530029, "learning_rate": 3.7691645008470997e-07, "logits/chosen": -0.21460148692131042, "logits/rejected": -1.5379085540771484, "logps/chosen": -10.350830078125, "logps/rejected": -23.51275062561035, "loss": 0.1776, "rewards/accuracies": 1.0, "rewards/chosen": 0.22241616249084473, "rewards/margins": 3.3379037380218506, "rewards/rejected": -3.115487813949585, "step": 469 }, { "epoch": 7.966101694915254, "grad_norm": 9.808497051053497, "learning_rate": 3.7627873657308206e-07, "logits/chosen": 0.3349153995513916, "logits/rejected": 0.8287909030914307, "logps/chosen": -7.198885440826416, "logps/rejected": -20.29372787475586, "loss": 0.1436, "rewards/accuracies": 1.0, "rewards/chosen": 0.5376969575881958, "rewards/margins": 3.864384174346924, "rewards/rejected": -3.3266870975494385, "step": 470 }, { "epoch": 7.983050847457627, "grad_norm": 9.697572526938723, "learning_rate": 3.7563991801477624e-07, "logits/chosen": -1.9151415824890137, "logits/rejected": 0.3530935049057007, "logps/chosen": -11.494583129882812, "logps/rejected": -14.665804862976074, "loss": 0.1399, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44558390974998474, "rewards/margins": 2.9561033248901367, "rewards/rejected": -2.51051926612854, "step": 471 }, { "epoch": 8.0, "grad_norm": 9.96471014960958, "learning_rate": 3.75e-07, "logits/chosen": -2.6446385383605957, "logits/rejected": 0.2764410376548767, "logps/chosen": -8.184099197387695, "logps/rejected": -14.341254234313965, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 0.7961742877960205, "rewards/margins": 3.1740105152130127, "rewards/rejected": -2.3778364658355713, "step": 472 }, { "epoch": 8.016949152542374, "grad_norm": 9.307834825130646, "learning_rate": 3.743589881285818e-07, "logits/chosen": -2.628649950027466, "logits/rejected": -2.251711368560791, "logps/chosen": -11.012161254882812, "logps/rejected": -15.441204071044922, "loss": 0.1556, "rewards/accuracies": 1.0, "rewards/chosen": 0.37242552638053894, "rewards/margins": 2.7831168174743652, "rewards/rejected": -2.410691022872925, "step": 473 }, { "epoch": 8.033898305084746, "grad_norm": 9.015474057721343, "learning_rate": 3.737168880099223e-07, "logits/chosen": 3.7287445068359375, "logits/rejected": 5.9911274909973145, "logps/chosen": -13.733976364135742, "logps/rejected": -18.655841827392578, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 0.7684003114700317, "rewards/margins": 2.949385643005371, "rewards/rejected": -2.18098521232605, "step": 474 }, { "epoch": 8.05084745762712, "grad_norm": 8.718508664791036, "learning_rate": 3.7307370526294553e-07, "logits/chosen": -0.2663005590438843, "logits/rejected": 3.8709819316864014, "logps/chosen": -12.367705345153809, "logps/rejected": -19.1514835357666, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 0.8267220854759216, "rewards/margins": 3.8209853172302246, "rewards/rejected": -2.994263172149658, "step": 475 }, { "epoch": 8.067796610169491, "grad_norm": 8.867026862703991, "learning_rate": 3.724294455160491e-07, "logits/chosen": 1.3887877464294434, "logits/rejected": 1.3702775239944458, "logps/chosen": -10.322896957397461, "logps/rejected": -20.81346893310547, "loss": 0.12, "rewards/accuracies": 0.9375, "rewards/chosen": 0.36723631620407104, "rewards/margins": 3.4072937965393066, "rewards/rejected": -3.040057420730591, "step": 476 }, { "epoch": 8.084745762711865, "grad_norm": 8.620420643542406, "learning_rate": 3.7178411440705556e-07, "logits/chosen": 1.245548129081726, "logits/rejected": 2.1392176151275635, "logps/chosen": -9.19916820526123, "logps/rejected": -18.40573501586914, "loss": 0.1326, "rewards/accuracies": 0.9375, "rewards/chosen": 0.26294079422950745, "rewards/margins": 3.6965670585632324, "rewards/rejected": -3.433626651763916, "step": 477 }, { "epoch": 8.101694915254237, "grad_norm": 8.485802757673804, "learning_rate": 3.7113771758316255e-07, "logits/chosen": -1.3473069667816162, "logits/rejected": -0.09861618280410767, "logps/chosen": -10.57664966583252, "logps/rejected": -15.270515441894531, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 0.6301343441009521, "rewards/margins": 2.7143757343292236, "rewards/rejected": -2.0842413902282715, "step": 478 }, { "epoch": 8.11864406779661, "grad_norm": 7.85197378030076, "learning_rate": 3.704902607008938e-07, "logits/chosen": -5.499878883361816, "logits/rejected": -2.5568430423736572, "logps/chosen": -12.366991996765137, "logps/rejected": -19.026046752929688, "loss": 0.1019, "rewards/accuracies": 1.0, "rewards/chosen": 0.5573859214782715, "rewards/margins": 3.0999138355255127, "rewards/rejected": -2.5425281524658203, "step": 479 }, { "epoch": 8.135593220338983, "grad_norm": 8.267677399685025, "learning_rate": 3.698417494260494e-07, "logits/chosen": -3.2886805534362793, "logits/rejected": 1.1429243087768555, "logps/chosen": -11.720975875854492, "logps/rejected": -20.598499298095703, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 0.3746921122074127, "rewards/margins": 4.293571949005127, "rewards/rejected": -3.918879985809326, "step": 480 }, { "epoch": 8.152542372881356, "grad_norm": 9.251264900051627, "learning_rate": 3.691921894336563e-07, "logits/chosen": -2.1884472370147705, "logits/rejected": 0.30788105726242065, "logps/chosen": -8.096731185913086, "logps/rejected": -19.28396987915039, "loss": 0.1377, "rewards/accuracies": 1.0, "rewards/chosen": 0.2780647873878479, "rewards/margins": 4.298841953277588, "rewards/rejected": -4.020777225494385, "step": 481 }, { "epoch": 8.169491525423728, "grad_norm": 8.218055554135557, "learning_rate": 3.685415864079185e-07, "logits/chosen": -0.6059857606887817, "logits/rejected": 3.4607341289520264, "logps/chosen": -11.999606132507324, "logps/rejected": -26.547691345214844, "loss": 0.1147, "rewards/accuracies": 1.0, "rewards/chosen": 0.3711860775947571, "rewards/margins": 4.486515998840332, "rewards/rejected": -4.115329742431641, "step": 482 }, { "epoch": 8.186440677966102, "grad_norm": 7.924489428934806, "learning_rate": 3.6788994604216764e-07, "logits/chosen": 1.4279998540878296, "logits/rejected": 0.009126663208007812, "logps/chosen": -8.281014442443848, "logps/rejected": -22.807849884033203, "loss": 0.0989, "rewards/accuracies": 1.0, "rewards/chosen": -0.12314409017562866, "rewards/margins": 4.270468711853027, "rewards/rejected": -4.393612861633301, "step": 483 }, { "epoch": 8.203389830508474, "grad_norm": 7.961845338761622, "learning_rate": 3.6723727403881275e-07, "logits/chosen": -1.9475922584533691, "logits/rejected": 1.0407593250274658, "logps/chosen": -10.443235397338867, "logps/rejected": -19.41378402709961, "loss": 0.1175, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6596993207931519, "rewards/margins": 3.718507766723633, "rewards/rejected": -3.0588088035583496, "step": 484 }, { "epoch": 8.220338983050848, "grad_norm": 9.857929233042386, "learning_rate": 3.665835761092908e-07, "logits/chosen": -6.954108238220215, "logits/rejected": -5.27763032913208, "logps/chosen": -10.771953582763672, "logps/rejected": -15.511201858520508, "loss": 0.1441, "rewards/accuracies": 1.0, "rewards/chosen": 0.36188673973083496, "rewards/margins": 2.4951171875, "rewards/rejected": -2.133230209350586, "step": 485 }, { "epoch": 8.23728813559322, "grad_norm": 8.273526335703513, "learning_rate": 3.659288579740163e-07, "logits/chosen": -0.05287922918796539, "logits/rejected": 3.0080344676971436, "logps/chosen": -13.40024185180664, "logps/rejected": -19.83489418029785, "loss": 0.1245, "rewards/accuracies": 1.0, "rewards/chosen": 0.5862030982971191, "rewards/margins": 3.320868492126465, "rewards/rejected": -2.7346653938293457, "step": 486 }, { "epoch": 8.254237288135593, "grad_norm": 8.034704759700503, "learning_rate": 3.6527312536233147e-07, "logits/chosen": 3.8432722091674805, "logits/rejected": 2.92391037940979, "logps/chosen": -7.863999366760254, "logps/rejected": -19.7979793548584, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 0.3808861970901489, "rewards/margins": 3.8671412467956543, "rewards/rejected": -3.486255168914795, "step": 487 }, { "epoch": 8.271186440677965, "grad_norm": 7.529231991013856, "learning_rate": 3.646163840124561e-07, "logits/chosen": -2.5350327491760254, "logits/rejected": -4.587088584899902, "logps/chosen": -9.05827522277832, "logps/rejected": -19.514944076538086, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": 0.37517881393432617, "rewards/margins": 3.4480714797973633, "rewards/rejected": -3.072892904281616, "step": 488 }, { "epoch": 8.288135593220339, "grad_norm": 8.139721923043574, "learning_rate": 3.639586396714374e-07, "logits/chosen": -2.1481618881225586, "logits/rejected": -3.349088191986084, "logps/chosen": -7.184938907623291, "logps/rejected": -14.99942398071289, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 0.43559348583221436, "rewards/margins": 3.009117603302002, "rewards/rejected": -2.573524236679077, "step": 489 }, { "epoch": 8.305084745762711, "grad_norm": 8.643611452832333, "learning_rate": 3.6329989809509933e-07, "logits/chosen": -1.287747859954834, "logits/rejected": 0.13855817914009094, "logps/chosen": -8.292243957519531, "logps/rejected": -19.404233932495117, "loss": 0.1129, "rewards/accuracies": 1.0, "rewards/chosen": 0.5273905396461487, "rewards/margins": 4.194699764251709, "rewards/rejected": -3.667309045791626, "step": 490 }, { "epoch": 8.322033898305085, "grad_norm": 8.788616790979715, "learning_rate": 3.626401650479927e-07, "logits/chosen": -1.215428352355957, "logits/rejected": -0.3887985944747925, "logps/chosen": -7.66940975189209, "logps/rejected": -18.244686126708984, "loss": 0.1232, "rewards/accuracies": 1.0, "rewards/chosen": 0.2789570093154907, "rewards/margins": 3.3360958099365234, "rewards/rejected": -3.057138681411743, "step": 491 }, { "epoch": 8.338983050847457, "grad_norm": 8.939118589459243, "learning_rate": 3.6197944630334465e-07, "logits/chosen": -3.850637912750244, "logits/rejected": -3.5334858894348145, "logps/chosen": -8.223553657531738, "logps/rejected": -18.250638961791992, "loss": 0.131, "rewards/accuracies": 0.9375, "rewards/chosen": 0.587809145450592, "rewards/margins": 3.762028694152832, "rewards/rejected": -3.1742193698883057, "step": 492 }, { "epoch": 8.35593220338983, "grad_norm": 8.69530410733376, "learning_rate": 3.6131774764300785e-07, "logits/chosen": 1.4184993505477905, "logits/rejected": 1.3965762853622437, "logps/chosen": -8.896749496459961, "logps/rejected": -15.651692390441895, "loss": 0.1241, "rewards/accuracies": 1.0, "rewards/chosen": 0.4817228317260742, "rewards/margins": 2.8906168937683105, "rewards/rejected": -2.4088940620422363, "step": 493 }, { "epoch": 8.372881355932204, "grad_norm": 7.73443230414031, "learning_rate": 3.6065507485741e-07, "logits/chosen": -0.946702241897583, "logits/rejected": 2.4696478843688965, "logps/chosen": -10.482526779174805, "logps/rejected": -19.93212127685547, "loss": 0.1066, "rewards/accuracies": 1.0, "rewards/chosen": 0.24369022250175476, "rewards/margins": 3.84941029548645, "rewards/rejected": -3.605720281600952, "step": 494 }, { "epoch": 8.389830508474576, "grad_norm": 8.36862719416917, "learning_rate": 3.5999143374550334e-07, "logits/chosen": -2.797267198562622, "logits/rejected": -0.19566676020622253, "logps/chosen": -13.18895149230957, "logps/rejected": -21.73252296447754, "loss": 0.1216, "rewards/accuracies": 1.0, "rewards/chosen": 0.17871427536010742, "rewards/margins": 3.6249942779541016, "rewards/rejected": -3.446279764175415, "step": 495 }, { "epoch": 8.40677966101695, "grad_norm": 8.949802588720999, "learning_rate": 3.593268301147139e-07, "logits/chosen": -0.16709482669830322, "logits/rejected": 1.0957790613174438, "logps/chosen": -9.600016593933105, "logps/rejected": -18.39324188232422, "loss": 0.1255, "rewards/accuracies": 1.0, "rewards/chosen": 0.3831973075866699, "rewards/margins": 3.5357797145843506, "rewards/rejected": -3.1525821685791016, "step": 496 }, { "epoch": 8.423728813559322, "grad_norm": 8.700313769157882, "learning_rate": 3.586612697808902e-07, "logits/chosen": -3.6146655082702637, "logits/rejected": -0.2656732201576233, "logps/chosen": -9.611889839172363, "logps/rejected": -17.33000373840332, "loss": 0.1255, "rewards/accuracies": 1.0, "rewards/chosen": 0.5966626405715942, "rewards/margins": 3.35612416267395, "rewards/rejected": -2.7594618797302246, "step": 497 }, { "epoch": 8.440677966101696, "grad_norm": 8.226137318837338, "learning_rate": 3.579947585682532e-07, "logits/chosen": -0.020598918199539185, "logits/rejected": -1.0154800415039062, "logps/chosen": -8.14766788482666, "logps/rejected": -22.971723556518555, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 0.3941883444786072, "rewards/margins": 4.393586158752441, "rewards/rejected": -3.9993979930877686, "step": 498 }, { "epoch": 8.457627118644067, "grad_norm": 8.87014354799229, "learning_rate": 3.573273023093446e-07, "logits/chosen": -1.4326260089874268, "logits/rejected": 0.6970917582511902, "logps/chosen": -11.785725593566895, "logps/rejected": -22.613496780395508, "loss": 0.1144, "rewards/accuracies": 1.0, "rewards/chosen": 0.1566278487443924, "rewards/margins": 4.172502040863037, "rewards/rejected": -4.01587438583374, "step": 499 }, { "epoch": 8.474576271186441, "grad_norm": 8.695490568604848, "learning_rate": 3.5665890684497605e-07, "logits/chosen": 3.693131685256958, "logits/rejected": 2.3012924194335938, "logps/chosen": -9.150883674621582, "logps/rejected": -21.891143798828125, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 0.2128208726644516, "rewards/margins": 4.2375898361206055, "rewards/rejected": -4.024769306182861, "step": 500 }, { "epoch": 8.491525423728813, "grad_norm": 8.223688535482426, "learning_rate": 3.559895780241781e-07, "logits/chosen": -2.5774970054626465, "logits/rejected": 2.790902614593506, "logps/chosen": -13.14190673828125, "logps/rejected": -14.023480415344238, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 0.5792840719223022, "rewards/margins": 2.0238969326019287, "rewards/rejected": -1.444612741470337, "step": 501 }, { "epoch": 8.508474576271187, "grad_norm": 8.31992146309658, "learning_rate": 3.553193217041489e-07, "logits/chosen": -2.0898971557617188, "logits/rejected": -1.4773600101470947, "logps/chosen": -10.491347312927246, "logps/rejected": -17.18193244934082, "loss": 0.1033, "rewards/accuracies": 1.0, "rewards/chosen": 0.5015342831611633, "rewards/margins": 3.777496337890625, "rewards/rejected": -3.2759618759155273, "step": 502 }, { "epoch": 8.525423728813559, "grad_norm": 8.792292957123891, "learning_rate": 3.546481437502032e-07, "logits/chosen": 0.26719558238983154, "logits/rejected": -0.14614611864089966, "logps/chosen": -9.920225143432617, "logps/rejected": -20.05290412902832, "loss": 0.1432, "rewards/accuracies": 1.0, "rewards/chosen": 0.27186596393585205, "rewards/margins": 3.6169862747192383, "rewards/rejected": -3.3451201915740967, "step": 503 }, { "epoch": 8.542372881355933, "grad_norm": 8.522509964933947, "learning_rate": 3.539760500357206e-07, "logits/chosen": -2.289971351623535, "logits/rejected": 0.8186124563217163, "logps/chosen": -10.877229690551758, "logps/rejected": -17.28212547302246, "loss": 0.1088, "rewards/accuracies": 1.0, "rewards/chosen": 0.41209787130355835, "rewards/margins": 3.048102378845215, "rewards/rejected": -2.6360044479370117, "step": 504 }, { "epoch": 8.559322033898304, "grad_norm": 7.291393026249104, "learning_rate": 3.533030464420945e-07, "logits/chosen": -3.1184239387512207, "logits/rejected": -2.1694648265838623, "logps/chosen": -12.604846000671387, "logps/rejected": -22.09140396118164, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": 0.24311965703964233, "rewards/margins": 3.960993528366089, "rewards/rejected": -3.71787428855896, "step": 505 }, { "epoch": 8.576271186440678, "grad_norm": 8.644017276232843, "learning_rate": 3.526291388586806e-07, "logits/chosen": 0.4400590658187866, "logits/rejected": -2.208275079727173, "logps/chosen": -7.662543296813965, "logps/rejected": -19.525211334228516, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 0.34743475914001465, "rewards/margins": 3.0827534198760986, "rewards/rejected": -2.735318660736084, "step": 506 }, { "epoch": 8.59322033898305, "grad_norm": 7.526044625787631, "learning_rate": 3.5195433318274515e-07, "logits/chosen": -1.0548756122589111, "logits/rejected": -0.435749351978302, "logps/chosen": -10.05634880065918, "logps/rejected": -19.988792419433594, "loss": 0.0953, "rewards/accuracies": 1.0, "rewards/chosen": 0.1322399377822876, "rewards/margins": 3.569153308868408, "rewards/rejected": -3.43691349029541, "step": 507 }, { "epoch": 8.610169491525424, "grad_norm": 7.766458527111232, "learning_rate": 3.5127863531941335e-07, "logits/chosen": -1.3730721473693848, "logits/rejected": -3.2859532833099365, "logps/chosen": -8.660323143005371, "logps/rejected": -23.785953521728516, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 0.35617661476135254, "rewards/margins": 4.860879898071289, "rewards/rejected": -4.504702568054199, "step": 508 }, { "epoch": 8.627118644067796, "grad_norm": 9.345905694030758, "learning_rate": 3.5060205118161816e-07, "logits/chosen": -1.1814749240875244, "logits/rejected": 2.160520553588867, "logps/chosen": -12.414260864257812, "logps/rejected": -21.517459869384766, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 0.5118588209152222, "rewards/margins": 4.0089640617370605, "rewards/rejected": -3.497105360031128, "step": 509 }, { "epoch": 8.64406779661017, "grad_norm": 8.036367102131655, "learning_rate": 3.49924586690048e-07, "logits/chosen": -6.377943515777588, "logits/rejected": -2.662942886352539, "logps/chosen": -13.506998062133789, "logps/rejected": -15.274138450622559, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 0.6932135820388794, "rewards/margins": 2.7833752632141113, "rewards/rejected": -2.0901618003845215, "step": 510 }, { "epoch": 8.661016949152543, "grad_norm": 8.350289239582875, "learning_rate": 3.4924624777309504e-07, "logits/chosen": -0.48674535751342773, "logits/rejected": 0.6451427936553955, "logps/chosen": -9.116971969604492, "logps/rejected": -24.52935028076172, "loss": 0.113, "rewards/accuracies": 1.0, "rewards/chosen": -0.22862689197063446, "rewards/margins": 4.943426132202148, "rewards/rejected": -5.172053337097168, "step": 511 }, { "epoch": 8.677966101694915, "grad_norm": 8.998528825313201, "learning_rate": 3.4856704036680355e-07, "logits/chosen": 0.11914621293544769, "logits/rejected": -0.580173909664154, "logps/chosen": -8.883934020996094, "logps/rejected": -19.471128463745117, "loss": 0.1169, "rewards/accuracies": 1.0, "rewards/chosen": 0.2940840423107147, "rewards/margins": 2.7323853969573975, "rewards/rejected": -2.4383013248443604, "step": 512 }, { "epoch": 8.694915254237289, "grad_norm": 7.853420086841468, "learning_rate": 3.4788697041481786e-07, "logits/chosen": -0.5393266081809998, "logits/rejected": -1.3182928562164307, "logps/chosen": -7.222754001617432, "logps/rejected": -24.662555694580078, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 0.7391365766525269, "rewards/margins": 4.932739734649658, "rewards/rejected": -4.193603515625, "step": 513 }, { "epoch": 8.711864406779661, "grad_norm": 9.392762891289523, "learning_rate": 3.472060438683302e-07, "logits/chosen": -2.0386834144592285, "logits/rejected": 0.20640242099761963, "logps/chosen": -13.925080299377441, "logps/rejected": -24.007619857788086, "loss": 0.1249, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1902267336845398, "rewards/margins": 4.3349409103393555, "rewards/rejected": -4.144713878631592, "step": 514 }, { "epoch": 8.728813559322035, "grad_norm": 8.50842429830542, "learning_rate": 3.4652426668602863e-07, "logits/chosen": -1.551865577697754, "logits/rejected": -1.284023642539978, "logps/chosen": -7.325331687927246, "logps/rejected": -18.12867546081543, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 0.25790315866470337, "rewards/margins": 4.056914329528809, "rewards/rejected": -3.799011468887329, "step": 515 }, { "epoch": 8.745762711864407, "grad_norm": 8.880551860604504, "learning_rate": 3.4584164483404535e-07, "logits/chosen": -7.2592034339904785, "logits/rejected": -6.2020769119262695, "logps/chosen": -5.269482135772705, "logps/rejected": -12.893440246582031, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 0.49459022283554077, "rewards/margins": 3.262979030609131, "rewards/rejected": -2.7683887481689453, "step": 516 }, { "epoch": 8.76271186440678, "grad_norm": 8.4228523758051, "learning_rate": 3.4515818428590393e-07, "logits/chosen": -0.3235975503921509, "logits/rejected": 4.60111665725708, "logps/chosen": -8.66872501373291, "logps/rejected": -18.84377098083496, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 0.2904120087623596, "rewards/margins": 3.9645678997039795, "rewards/rejected": -3.6741552352905273, "step": 517 }, { "epoch": 8.779661016949152, "grad_norm": 8.78305720155635, "learning_rate": 3.444738910224671e-07, "logits/chosen": -2.8099265098571777, "logits/rejected": -2.0938467979431152, "logps/chosen": -9.935127258300781, "logps/rejected": -16.927433013916016, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 0.39175349473953247, "rewards/margins": 2.82077693939209, "rewards/rejected": -2.429023265838623, "step": 518 }, { "epoch": 8.796610169491526, "grad_norm": 9.039209454575285, "learning_rate": 3.437887710318848e-07, "logits/chosen": -2.7464828491210938, "logits/rejected": -0.2714478373527527, "logps/chosen": -8.532281875610352, "logps/rejected": -18.432205200195312, "loss": 0.1052, "rewards/accuracies": 1.0, "rewards/chosen": 0.6497199535369873, "rewards/margins": 3.2518179416656494, "rewards/rejected": -2.602097511291504, "step": 519 }, { "epoch": 8.813559322033898, "grad_norm": 9.059999115636574, "learning_rate": 3.4310283030954146e-07, "logits/chosen": -4.571805953979492, "logits/rejected": 1.363074779510498, "logps/chosen": -10.748517990112305, "logps/rejected": -19.172378540039062, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 0.40731969475746155, "rewards/margins": 3.988267660140991, "rewards/rejected": -3.5809478759765625, "step": 520 }, { "epoch": 8.830508474576272, "grad_norm": 8.9775888768164, "learning_rate": 3.4241607485800363e-07, "logits/chosen": 2.731873035430908, "logits/rejected": 4.494401931762695, "logps/chosen": -8.267045974731445, "logps/rejected": -22.165498733520508, "loss": 0.1322, "rewards/accuracies": 0.9375, "rewards/chosen": 0.14054572582244873, "rewards/margins": 4.468079090118408, "rewards/rejected": -4.327533721923828, "step": 521 }, { "epoch": 8.847457627118644, "grad_norm": 8.025227471056242, "learning_rate": 3.417285106869673e-07, "logits/chosen": -1.5573604106903076, "logits/rejected": -1.8264210224151611, "logps/chosen": -11.341778755187988, "logps/rejected": -19.851980209350586, "loss": 0.1038, "rewards/accuracies": 1.0, "rewards/chosen": 0.5154696702957153, "rewards/margins": 3.7622079849243164, "rewards/rejected": -3.2467384338378906, "step": 522 }, { "epoch": 8.864406779661017, "grad_norm": 9.101302965192287, "learning_rate": 3.4104014381320555e-07, "logits/chosen": 0.13104411959648132, "logits/rejected": -0.2756563723087311, "logps/chosen": -10.39586067199707, "logps/rejected": -18.803264617919922, "loss": 0.1399, "rewards/accuracies": 1.0, "rewards/chosen": 0.21374063193798065, "rewards/margins": 3.2453413009643555, "rewards/rejected": -3.0316004753112793, "step": 523 }, { "epoch": 8.88135593220339, "grad_norm": 9.570891193025966, "learning_rate": 3.403509802605159e-07, "logits/chosen": 1.3925392627716064, "logits/rejected": 0.888724148273468, "logps/chosen": -8.070535659790039, "logps/rejected": -20.617847442626953, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": -0.059673890471458435, "rewards/margins": 4.375804901123047, "rewards/rejected": -4.435479164123535, "step": 524 }, { "epoch": 8.898305084745763, "grad_norm": 8.31602132944693, "learning_rate": 3.396610260596673e-07, "logits/chosen": -0.9920660257339478, "logits/rejected": 1.4377026557922363, "logps/chosen": -12.001575469970703, "logps/rejected": -24.17807388305664, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 0.287417471408844, "rewards/margins": 3.8803770542144775, "rewards/rejected": -3.5929596424102783, "step": 525 }, { "epoch": 8.915254237288135, "grad_norm": 9.29327403297671, "learning_rate": 3.389702872483477e-07, "logits/chosen": -5.677180290222168, "logits/rejected": -4.914144515991211, "logps/chosen": -8.847039222717285, "logps/rejected": -15.790771484375, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 0.6716170907020569, "rewards/margins": 2.9184117317199707, "rewards/rejected": -2.2467947006225586, "step": 526 }, { "epoch": 8.932203389830509, "grad_norm": 7.609548901847679, "learning_rate": 3.38278769871111e-07, "logits/chosen": -3.445300579071045, "logits/rejected": -0.7922675013542175, "logps/chosen": -8.873404502868652, "logps/rejected": -18.53123664855957, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 0.34169894456863403, "rewards/margins": 3.791015386581421, "rewards/rejected": -3.4493167400360107, "step": 527 }, { "epoch": 8.94915254237288, "grad_norm": 8.765971983118401, "learning_rate": 3.375864799793242e-07, "logits/chosen": -2.4345242977142334, "logits/rejected": -0.5172919034957886, "logps/chosen": -9.083272933959961, "logps/rejected": -16.6153564453125, "loss": 0.1149, "rewards/accuracies": 1.0, "rewards/chosen": 0.4702245891094208, "rewards/margins": 3.8038463592529297, "rewards/rejected": -3.3336217403411865, "step": 528 }, { "epoch": 8.966101694915254, "grad_norm": 9.413770924733209, "learning_rate": 3.368934236311143e-07, "logits/chosen": 0.7276126742362976, "logits/rejected": 0.228562593460083, "logps/chosen": -11.080302238464355, "logps/rejected": -17.104206085205078, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 0.5077924728393555, "rewards/margins": 2.7850677967071533, "rewards/rejected": -2.2772750854492188, "step": 529 }, { "epoch": 8.983050847457626, "grad_norm": 8.323751589388499, "learning_rate": 3.361996068913159e-07, "logits/chosen": -4.382465839385986, "logits/rejected": -1.4773523807525635, "logps/chosen": -8.98313045501709, "logps/rejected": -20.052814483642578, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 0.1586902141571045, "rewards/margins": 4.111644744873047, "rewards/rejected": -3.9529545307159424, "step": 530 }, { "epoch": 9.0, "grad_norm": 7.938869997211871, "learning_rate": 3.355050358314172e-07, "logits/chosen": -3.1977574825286865, "logits/rejected": -1.4724467992782593, "logps/chosen": -8.792470932006836, "logps/rejected": -23.886981964111328, "loss": 0.103, "rewards/accuracies": 0.9375, "rewards/chosen": 0.39109301567077637, "rewards/margins": 5.085795879364014, "rewards/rejected": -4.6947021484375, "step": 531 }, { "epoch": 9.016949152542374, "grad_norm": 8.223351028454932, "learning_rate": 3.348097165295075e-07, "logits/chosen": -3.76702880859375, "logits/rejected": -3.2690422534942627, "logps/chosen": -10.75297737121582, "logps/rejected": -22.901792526245117, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": -0.2717806398868561, "rewards/margins": 4.02412223815918, "rewards/rejected": -4.295902729034424, "step": 532 }, { "epoch": 9.033898305084746, "grad_norm": 7.03793187410003, "learning_rate": 3.341136550702241e-07, "logits/chosen": -2.2170581817626953, "logits/rejected": -2.0040411949157715, "logps/chosen": -11.038703918457031, "logps/rejected": -21.323108673095703, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 0.13433638215065002, "rewards/margins": 3.86452317237854, "rewards/rejected": -3.730186700820923, "step": 533 }, { "epoch": 9.05084745762712, "grad_norm": 7.602772575996602, "learning_rate": 3.334168575446985e-07, "logits/chosen": -5.060166835784912, "logits/rejected": -4.465640544891357, "logps/chosen": -11.088794708251953, "logps/rejected": -18.969402313232422, "loss": 0.1035, "rewards/accuracies": 1.0, "rewards/chosen": 0.3551236391067505, "rewards/margins": 3.3989412784576416, "rewards/rejected": -3.0438175201416016, "step": 534 }, { "epoch": 9.067796610169491, "grad_norm": 6.440621371807224, "learning_rate": 3.327193300505035e-07, "logits/chosen": -0.4635174870491028, "logits/rejected": -2.2793354988098145, "logps/chosen": -9.368732452392578, "logps/rejected": -23.782150268554688, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": 0.38101768493652344, "rewards/margins": 3.7073676586151123, "rewards/rejected": -3.326350212097168, "step": 535 }, { "epoch": 9.084745762711865, "grad_norm": 7.385659302243604, "learning_rate": 3.3202107869159967e-07, "logits/chosen": -5.72601842880249, "logits/rejected": -2.5037083625793457, "logps/chosen": -10.806863784790039, "logps/rejected": -22.083637237548828, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 0.5078091621398926, "rewards/margins": 3.9449386596679688, "rewards/rejected": -3.4371299743652344, "step": 536 }, { "epoch": 9.101694915254237, "grad_norm": 7.252768835140123, "learning_rate": 3.313221095782822e-07, "logits/chosen": -4.031281471252441, "logits/rejected": -0.6388194561004639, "logps/chosen": -11.841888427734375, "logps/rejected": -23.276737213134766, "loss": 0.1023, "rewards/accuracies": 1.0, "rewards/chosen": 0.25421464443206787, "rewards/margins": 3.5815634727478027, "rewards/rejected": -3.3273487091064453, "step": 537 }, { "epoch": 9.11864406779661, "grad_norm": 7.284892827250599, "learning_rate": 3.306224288271272e-07, "logits/chosen": -1.3736153841018677, "logits/rejected": -0.9906230568885803, "logps/chosen": -6.747933387756348, "logps/rejected": -14.778159141540527, "loss": 0.0871, "rewards/accuracies": 1.0, "rewards/chosen": 0.5323705673217773, "rewards/margins": 3.7420241832733154, "rewards/rejected": -3.20965313911438, "step": 538 }, { "epoch": 9.135593220338983, "grad_norm": 8.256668503021041, "learning_rate": 3.2992204256093807e-07, "logits/chosen": -6.387026786804199, "logits/rejected": -6.708343505859375, "logps/chosen": -9.023993492126465, "logps/rejected": -20.56617546081543, "loss": 0.1127, "rewards/accuracies": 1.0, "rewards/chosen": 0.5091872215270996, "rewards/margins": 4.272690296173096, "rewards/rejected": -3.763503313064575, "step": 539 }, { "epoch": 9.152542372881356, "grad_norm": 7.188798518332266, "learning_rate": 3.2922095690869224e-07, "logits/chosen": -3.261608362197876, "logits/rejected": -1.6736987829208374, "logps/chosen": -7.519070148468018, "logps/rejected": -19.2508487701416, "loss": 0.0863, "rewards/accuracies": 1.0, "rewards/chosen": 0.38497814536094666, "rewards/margins": 4.242589950561523, "rewards/rejected": -3.857612133026123, "step": 540 }, { "epoch": 9.169491525423728, "grad_norm": 7.53021595804872, "learning_rate": 3.2851917800548725e-07, "logits/chosen": -2.0664360523223877, "logits/rejected": -2.3352723121643066, "logps/chosen": -11.097540855407715, "logps/rejected": -26.991085052490234, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 0.5812000036239624, "rewards/margins": 5.354377746582031, "rewards/rejected": -4.773178577423096, "step": 541 }, { "epoch": 9.186440677966102, "grad_norm": 9.484835899086871, "learning_rate": 3.278167119924871e-07, "logits/chosen": -3.301103353500366, "logits/rejected": -1.0746486186981201, "logps/chosen": -10.791460990905762, "logps/rejected": -16.7052059173584, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": 0.34394583106040955, "rewards/margins": 3.833061456680298, "rewards/rejected": -3.4891157150268555, "step": 542 }, { "epoch": 9.203389830508474, "grad_norm": 7.058316114345679, "learning_rate": 3.2711356501686886e-07, "logits/chosen": -2.03525710105896, "logits/rejected": -1.2074201107025146, "logps/chosen": -9.512802124023438, "logps/rejected": -23.904556274414062, "loss": 0.0913, "rewards/accuracies": 0.9375, "rewards/chosen": 0.09114706516265869, "rewards/margins": 4.969361305236816, "rewards/rejected": -4.878214359283447, "step": 543 }, { "epoch": 9.220338983050848, "grad_norm": 6.5571727716017545, "learning_rate": 3.2640974323176843e-07, "logits/chosen": -6.242308616638184, "logits/rejected": -4.988770008087158, "logps/chosen": -7.075240135192871, "logps/rejected": -18.64996910095215, "loss": 0.086, "rewards/accuracies": 0.9375, "rewards/chosen": 0.35722628235816956, "rewards/margins": 4.036096096038818, "rewards/rejected": -3.6788697242736816, "step": 544 }, { "epoch": 9.23728813559322, "grad_norm": 7.197720127226005, "learning_rate": 3.257052527962269e-07, "logits/chosen": -7.732370853424072, "logits/rejected": -6.282393455505371, "logps/chosen": -10.46985912322998, "logps/rejected": -16.54497528076172, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 0.39111438393592834, "rewards/margins": 2.915557384490967, "rewards/rejected": -2.5244431495666504, "step": 545 }, { "epoch": 9.254237288135593, "grad_norm": 8.563229116322319, "learning_rate": 3.250000998751365e-07, "logits/chosen": -1.0374642610549927, "logits/rejected": -0.5755556225776672, "logps/chosen": -8.41156005859375, "logps/rejected": -18.4447078704834, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 0.36313942074775696, "rewards/margins": 3.338423728942871, "rewards/rejected": -2.9752843379974365, "step": 546 }, { "epoch": 9.271186440677965, "grad_norm": 8.40637322936493, "learning_rate": 3.2429429063918694e-07, "logits/chosen": -0.7971823215484619, "logits/rejected": -2.113372325897217, "logps/chosen": -7.886877536773682, "logps/rejected": -16.608707427978516, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 0.35710078477859497, "rewards/margins": 3.141465663909912, "rewards/rejected": -2.784365177154541, "step": 547 }, { "epoch": 9.288135593220339, "grad_norm": 7.915859269522675, "learning_rate": 3.235878312648112e-07, "logits/chosen": -1.2610503435134888, "logits/rejected": -2.4442033767700195, "logps/chosen": -6.95026969909668, "logps/rejected": -21.118806838989258, "loss": 0.1182, "rewards/accuracies": 1.0, "rewards/chosen": 0.18833884596824646, "rewards/margins": 4.624729633331299, "rewards/rejected": -4.436390399932861, "step": 548 }, { "epoch": 9.305084745762711, "grad_norm": 8.306527345428725, "learning_rate": 3.2288072793413147e-07, "logits/chosen": -3.399160385131836, "logits/rejected": -2.3630385398864746, "logps/chosen": -9.676424026489258, "logps/rejected": -16.499492645263672, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 0.6130249500274658, "rewards/margins": 3.416779041290283, "rewards/rejected": -2.8037540912628174, "step": 549 }, { "epoch": 9.322033898305085, "grad_norm": 7.279865363183246, "learning_rate": 3.2217298683490525e-07, "logits/chosen": -2.800569772720337, "logits/rejected": -3.325770139694214, "logps/chosen": -7.290124893188477, "logps/rejected": -14.377586364746094, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 0.6072530746459961, "rewards/margins": 3.4065473079681396, "rewards/rejected": -2.7992939949035645, "step": 550 }, { "epoch": 9.338983050847457, "grad_norm": 7.729749155687732, "learning_rate": 3.214646141604709e-07, "logits/chosen": -6.679795742034912, "logits/rejected": -2.8563101291656494, "logps/chosen": -14.170299530029297, "logps/rejected": -16.713886260986328, "loss": 0.1021, "rewards/accuracies": 1.0, "rewards/chosen": 0.821824848651886, "rewards/margins": 2.613480567932129, "rewards/rejected": -1.7916556596755981, "step": 551 }, { "epoch": 9.35593220338983, "grad_norm": 7.005656049681332, "learning_rate": 3.2075561610969347e-07, "logits/chosen": -2.8699162006378174, "logits/rejected": -0.10029095411300659, "logps/chosen": -12.255592346191406, "logps/rejected": -25.76495361328125, "loss": 0.0979, "rewards/accuracies": 0.9375, "rewards/chosen": -0.16769805550575256, "rewards/margins": 4.608583450317383, "rewards/rejected": -4.77628231048584, "step": 552 }, { "epoch": 9.372881355932204, "grad_norm": 7.258931690887565, "learning_rate": 3.200459988869111e-07, "logits/chosen": -2.4785892963409424, "logits/rejected": 0.07733534276485443, "logps/chosen": -10.000097274780273, "logps/rejected": -19.482460021972656, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 0.09004150331020355, "rewards/margins": 4.15089225769043, "rewards/rejected": -4.060850620269775, "step": 553 }, { "epoch": 9.389830508474576, "grad_norm": 7.006363318272141, "learning_rate": 3.193357687018797e-07, "logits/chosen": 4.693775653839111, "logits/rejected": 4.2272820472717285, "logps/chosen": -10.474517822265625, "logps/rejected": -27.757699966430664, "loss": 0.0903, "rewards/accuracies": 1.0, "rewards/chosen": -0.13867327570915222, "rewards/margins": 5.896880626678467, "rewards/rejected": -6.035553932189941, "step": 554 }, { "epoch": 9.40677966101695, "grad_norm": 7.35604084339138, "learning_rate": 3.186249317697194e-07, "logits/chosen": 2.288403272628784, "logits/rejected": 6.304751873016357, "logps/chosen": -14.09248161315918, "logps/rejected": -24.245834350585938, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 0.20254839956760406, "rewards/margins": 4.757707595825195, "rewards/rejected": -4.555159091949463, "step": 555 }, { "epoch": 9.423728813559322, "grad_norm": 7.092403419199733, "learning_rate": 3.1791349431085965e-07, "logits/chosen": -1.7431526184082031, "logits/rejected": 1.8960849046707153, "logps/chosen": -9.011571884155273, "logps/rejected": -23.43387794494629, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 0.27249670028686523, "rewards/margins": 5.282358169555664, "rewards/rejected": -5.009861946105957, "step": 556 }, { "epoch": 9.440677966101696, "grad_norm": 8.175083653294603, "learning_rate": 3.1720146255098537e-07, "logits/chosen": -2.9318580627441406, "logits/rejected": -0.37776219844818115, "logps/chosen": -10.542088508605957, "logps/rejected": -25.695281982421875, "loss": 0.0941, "rewards/accuracies": 1.0, "rewards/chosen": -0.04570953547954559, "rewards/margins": 5.096731662750244, "rewards/rejected": -5.142441749572754, "step": 557 }, { "epoch": 9.457627118644067, "grad_norm": 7.329507900240542, "learning_rate": 3.1648884272098177e-07, "logits/chosen": -5.442708969116211, "logits/rejected": -2.653074026107788, "logps/chosen": -9.153959274291992, "logps/rejected": -12.319401741027832, "loss": 0.0968, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6025005578994751, "rewards/margins": 2.6678457260131836, "rewards/rejected": -2.065345287322998, "step": 558 }, { "epoch": 9.474576271186441, "grad_norm": 7.872704826602449, "learning_rate": 3.157756410568803e-07, "logits/chosen": -5.923219680786133, "logits/rejected": -4.431153297424316, "logps/chosen": -10.067678451538086, "logps/rejected": -16.93128204345703, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": 0.24123656749725342, "rewards/margins": 2.855665922164917, "rewards/rejected": -2.614429235458374, "step": 559 }, { "epoch": 9.491525423728813, "grad_norm": 23.783216732996646, "learning_rate": 3.150618637998041e-07, "logits/chosen": -0.8499359488487244, "logits/rejected": 0.2799752950668335, "logps/chosen": -8.114256858825684, "logps/rejected": -21.467369079589844, "loss": 0.1144, "rewards/accuracies": 1.0, "rewards/chosen": 0.47225862741470337, "rewards/margins": 4.720645904541016, "rewards/rejected": -4.248387336730957, "step": 560 }, { "epoch": 9.508474576271187, "grad_norm": 7.705930836558833, "learning_rate": 3.1434751719591305e-07, "logits/chosen": -6.150752544403076, "logits/rejected": -7.172792434692383, "logps/chosen": -10.911544799804688, "logps/rejected": -21.380325317382812, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": 0.19267332553863525, "rewards/margins": 3.4200611114501953, "rewards/rejected": -3.2273876667022705, "step": 561 }, { "epoch": 9.525423728813559, "grad_norm": 7.005357396590622, "learning_rate": 3.136326074963494e-07, "logits/chosen": -3.0026967525482178, "logits/rejected": -3.8512182235717773, "logps/chosen": -7.109619140625, "logps/rejected": -14.219776153564453, "loss": 0.0911, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4708208739757538, "rewards/margins": 2.4455618858337402, "rewards/rejected": -1.974740982055664, "step": 562 }, { "epoch": 9.542372881355933, "grad_norm": 7.270706931110439, "learning_rate": 3.1291714095718294e-07, "logits/chosen": 2.667048454284668, "logits/rejected": 2.047011375427246, "logps/chosen": -6.924215793609619, "logps/rejected": -19.053524017333984, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 0.019068308174610138, "rewards/margins": 4.171212196350098, "rewards/rejected": -4.152143955230713, "step": 563 }, { "epoch": 9.559322033898304, "grad_norm": 7.465791344564993, "learning_rate": 3.122011238393562e-07, "logits/chosen": -4.9882612228393555, "logits/rejected": -4.5862298011779785, "logps/chosen": -7.148963928222656, "logps/rejected": -14.193519592285156, "loss": 0.106, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16198576986789703, "rewards/margins": 2.9971303939819336, "rewards/rejected": -2.835144519805908, "step": 564 }, { "epoch": 9.576271186440678, "grad_norm": 6.422080871628065, "learning_rate": 3.1148456240862993e-07, "logits/chosen": -3.4394102096557617, "logits/rejected": 0.08007746934890747, "logps/chosen": -11.544246673583984, "logps/rejected": -27.53937530517578, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": -0.0750107690691948, "rewards/margins": 5.368021011352539, "rewards/rejected": -5.443032264709473, "step": 565 }, { "epoch": 9.59322033898305, "grad_norm": 6.9205639042896365, "learning_rate": 3.1076746293552785e-07, "logits/chosen": -2.836104154586792, "logits/rejected": -1.123127818107605, "logps/chosen": -6.161153793334961, "logps/rejected": -22.165803909301758, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": 0.13143664598464966, "rewards/margins": 5.175368309020996, "rewards/rejected": -5.043931007385254, "step": 566 }, { "epoch": 9.610169491525424, "grad_norm": 7.223836249722002, "learning_rate": 3.1004983169528225e-07, "logits/chosen": -1.548119068145752, "logits/rejected": -1.392944097518921, "logps/chosen": -7.682084083557129, "logps/rejected": -19.642316818237305, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 0.27215924859046936, "rewards/margins": 4.37581729888916, "rewards/rejected": -4.103658199310303, "step": 567 }, { "epoch": 9.627118644067796, "grad_norm": 8.772767733241357, "learning_rate": 3.0933167496777873e-07, "logits/chosen": -4.728096008300781, "logits/rejected": -3.047065496444702, "logps/chosen": -8.965621948242188, "logps/rejected": -16.117294311523438, "loss": 0.1167, "rewards/accuracies": 1.0, "rewards/chosen": 0.35291966795921326, "rewards/margins": 3.55570387840271, "rewards/rejected": -3.202784299850464, "step": 568 }, { "epoch": 9.64406779661017, "grad_norm": 7.844719017241512, "learning_rate": 3.0861299903750115e-07, "logits/chosen": -1.934074878692627, "logits/rejected": -0.8083376884460449, "logps/chosen": -9.919522285461426, "logps/rejected": -26.1778564453125, "loss": 0.1001, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3015662431716919, "rewards/margins": 5.8502116203308105, "rewards/rejected": -5.548645496368408, "step": 569 }, { "epoch": 9.661016949152543, "grad_norm": 7.188158645049839, "learning_rate": 3.0789381019347724e-07, "logits/chosen": -1.5368669033050537, "logits/rejected": -2.4642491340637207, "logps/chosen": -6.37646484375, "logps/rejected": -16.949405670166016, "loss": 0.0949, "rewards/accuracies": 1.0, "rewards/chosen": 0.7365732192993164, "rewards/margins": 3.487222909927368, "rewards/rejected": -2.7506494522094727, "step": 570 }, { "epoch": 9.677966101694915, "grad_norm": 7.111649275956974, "learning_rate": 3.071741147292229e-07, "logits/chosen": 0.26448094844818115, "logits/rejected": 1.6492478847503662, "logps/chosen": -10.899922370910645, "logps/rejected": -22.683032989501953, "loss": 0.0849, "rewards/accuracies": 1.0, "rewards/chosen": 0.09248735755681992, "rewards/margins": 3.582092761993408, "rewards/rejected": -3.48960542678833, "step": 571 }, { "epoch": 9.694915254237289, "grad_norm": 8.336863079121871, "learning_rate": 3.0645391894268734e-07, "logits/chosen": 1.917724847793579, "logits/rejected": -0.7316230535507202, "logps/chosen": -10.622394561767578, "logps/rejected": -26.208999633789062, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 0.2793700397014618, "rewards/margins": 4.600907325744629, "rewards/rejected": -4.321537017822266, "step": 572 }, { "epoch": 9.711864406779661, "grad_norm": 6.886792783120748, "learning_rate": 3.057332291361983e-07, "logits/chosen": -3.8753855228424072, "logits/rejected": -0.0695408284664154, "logps/chosen": -13.180882453918457, "logps/rejected": -26.468891143798828, "loss": 0.0851, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012535899877548218, "rewards/margins": 5.588648319244385, "rewards/rejected": -5.5873942375183105, "step": 573 }, { "epoch": 9.728813559322035, "grad_norm": 6.669887643578233, "learning_rate": 3.050120516164062e-07, "logits/chosen": -6.483968734741211, "logits/rejected": -3.4908714294433594, "logps/chosen": -9.973274230957031, "logps/rejected": -24.82187271118164, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 0.3250216543674469, "rewards/margins": 4.841268539428711, "rewards/rejected": -4.516247272491455, "step": 574 }, { "epoch": 9.745762711864407, "grad_norm": 7.103637533168144, "learning_rate": 3.042903926942297e-07, "logits/chosen": -3.9618115425109863, "logits/rejected": -2.0849714279174805, "logps/chosen": -12.599139213562012, "logps/rejected": -23.257089614868164, "loss": 0.089, "rewards/accuracies": 1.0, "rewards/chosen": 0.22399751842021942, "rewards/margins": 5.019968509674072, "rewards/rejected": -4.795970439910889, "step": 575 }, { "epoch": 9.76271186440678, "grad_norm": 7.530244017046209, "learning_rate": 3.0356825868480014e-07, "logits/chosen": -4.090577125549316, "logits/rejected": -4.245079040527344, "logps/chosen": -8.236763000488281, "logps/rejected": -16.553316116333008, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 0.24407736957073212, "rewards/margins": 3.3017423152923584, "rewards/rejected": -3.0576651096343994, "step": 576 }, { "epoch": 9.779661016949152, "grad_norm": 6.697617479828195, "learning_rate": 3.0284565590740607e-07, "logits/chosen": -3.7148067951202393, "logits/rejected": -2.6984448432922363, "logps/chosen": -7.846210479736328, "logps/rejected": -23.022817611694336, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": 0.580223023891449, "rewards/margins": 4.813726902008057, "rewards/rejected": -4.233503341674805, "step": 577 }, { "epoch": 9.796610169491526, "grad_norm": 8.516783557390537, "learning_rate": 3.021225906854383e-07, "logits/chosen": -0.506150484085083, "logits/rejected": 0.20308029651641846, "logps/chosen": -9.165265083312988, "logps/rejected": -18.618465423583984, "loss": 0.117, "rewards/accuracies": 1.0, "rewards/chosen": 0.2960697114467621, "rewards/margins": 4.122585773468018, "rewards/rejected": -3.8265163898468018, "step": 578 }, { "epoch": 9.813559322033898, "grad_norm": 6.993927074929075, "learning_rate": 3.013990693463344e-07, "logits/chosen": -4.301695823669434, "logits/rejected": -0.8326593637466431, "logps/chosen": -10.31252384185791, "logps/rejected": -18.033140182495117, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 0.5753101706504822, "rewards/margins": 4.074686050415039, "rewards/rejected": -3.499375820159912, "step": 579 }, { "epoch": 9.830508474576272, "grad_norm": 7.627560630006767, "learning_rate": 3.006750982215234e-07, "logits/chosen": -5.807773590087891, "logits/rejected": -3.786421298980713, "logps/chosen": -10.408415794372559, "logps/rejected": -18.810245513916016, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 0.49460792541503906, "rewards/margins": 3.6803393363952637, "rewards/rejected": -3.1857314109802246, "step": 580 }, { "epoch": 9.847457627118644, "grad_norm": 7.042175636446051, "learning_rate": 2.9995068364637023e-07, "logits/chosen": -0.17031973600387573, "logits/rejected": 0.04508787393569946, "logps/chosen": -6.5416436195373535, "logps/rejected": -20.05823516845703, "loss": 0.0947, "rewards/accuracies": 1.0, "rewards/chosen": 0.3072429895401001, "rewards/margins": 3.74953031539917, "rewards/rejected": -3.4422874450683594, "step": 581 }, { "epoch": 9.864406779661017, "grad_norm": 7.956967460140704, "learning_rate": 2.9922583196012035e-07, "logits/chosen": -3.6927130222320557, "logits/rejected": -2.2789618968963623, "logps/chosen": -7.582281589508057, "logps/rejected": -15.243419647216797, "loss": 0.1074, "rewards/accuracies": 1.0, "rewards/chosen": 0.5607558488845825, "rewards/margins": 3.3911430835723877, "rewards/rejected": -2.8303873538970947, "step": 582 }, { "epoch": 9.88135593220339, "grad_norm": 6.546803898387923, "learning_rate": 2.985005495058446e-07, "logits/chosen": -0.39755862951278687, "logits/rejected": 1.024298906326294, "logps/chosen": -7.9590888023376465, "logps/rejected": -18.462146759033203, "loss": 0.0802, "rewards/accuracies": 1.0, "rewards/chosen": 0.16562320291996002, "rewards/margins": 4.475764274597168, "rewards/rejected": -4.310141086578369, "step": 583 }, { "epoch": 9.898305084745763, "grad_norm": 7.55002338431414, "learning_rate": 2.9777484263038303e-07, "logits/chosen": -3.177365303039551, "logits/rejected": -2.0949947834014893, "logps/chosen": -10.2957763671875, "logps/rejected": -23.79370880126953, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 0.552627682685852, "rewards/margins": 4.92409086227417, "rewards/rejected": -4.371463298797607, "step": 584 }, { "epoch": 9.915254237288135, "grad_norm": 6.614441847101758, "learning_rate": 2.9704871768429016e-07, "logits/chosen": -5.1951799392700195, "logits/rejected": -2.615152597427368, "logps/chosen": -10.698162078857422, "logps/rejected": -20.58434295654297, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 0.5016829967498779, "rewards/margins": 4.080619812011719, "rewards/rejected": -3.5789365768432617, "step": 585 }, { "epoch": 9.932203389830509, "grad_norm": 7.549594151251059, "learning_rate": 2.9632218102177856e-07, "logits/chosen": -4.657960891723633, "logits/rejected": -1.054868459701538, "logps/chosen": -8.243659973144531, "logps/rejected": -18.8740234375, "loss": 0.0885, "rewards/accuracies": 1.0, "rewards/chosen": 0.2816173732280731, "rewards/margins": 3.8149704933166504, "rewards/rejected": -3.533353328704834, "step": 586 }, { "epoch": 9.94915254237288, "grad_norm": 6.816475554516112, "learning_rate": 2.9559523900066393e-07, "logits/chosen": -1.284484624862671, "logits/rejected": 0.5438723564147949, "logps/chosen": -8.53143310546875, "logps/rejected": -17.49203872680664, "loss": 0.0993, "rewards/accuracies": 1.0, "rewards/chosen": 0.3136657178401947, "rewards/margins": 3.7685282230377197, "rewards/rejected": -3.454862117767334, "step": 587 }, { "epoch": 9.966101694915254, "grad_norm": 7.345727939212483, "learning_rate": 2.948678979823092e-07, "logits/chosen": -6.1464433670043945, "logits/rejected": -3.958988666534424, "logps/chosen": -12.240889549255371, "logps/rejected": -18.993349075317383, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 0.6476763486862183, "rewards/margins": 3.881906270980835, "rewards/rejected": -3.234229803085327, "step": 588 }, { "epoch": 9.983050847457626, "grad_norm": 7.228864234275865, "learning_rate": 2.941401643315686e-07, "logits/chosen": -4.1069722175598145, "logits/rejected": -6.370417594909668, "logps/chosen": -6.387879371643066, "logps/rejected": -18.39813232421875, "loss": 0.0833, "rewards/accuracies": 1.0, "rewards/chosen": 0.6458454132080078, "rewards/margins": 3.298928737640381, "rewards/rejected": -2.653083324432373, "step": 589 }, { "epoch": 10.0, "grad_norm": 7.2671869247406855, "learning_rate": 2.934120444167326e-07, "logits/chosen": -5.320376873016357, "logits/rejected": 1.0061277151107788, "logps/chosen": -7.889092445373535, "logps/rejected": -15.941211700439453, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 0.37462252378463745, "rewards/margins": 3.2933833599090576, "rewards/rejected": -2.9187610149383545, "step": 590 }, { "epoch": 10.016949152542374, "grad_norm": 6.5332566713967335, "learning_rate": 2.926835446094716e-07, "logits/chosen": -2.9494495391845703, "logits/rejected": -2.0883994102478027, "logps/chosen": -9.400456428527832, "logps/rejected": -18.402284622192383, "loss": 0.0724, "rewards/accuracies": 1.0, "rewards/chosen": 0.8871166110038757, "rewards/margins": 3.988942861557007, "rewards/rejected": -3.1018261909484863, "step": 591 }, { "epoch": 10.033898305084746, "grad_norm": 6.87699072202025, "learning_rate": 2.919546712847804e-07, "logits/chosen": -0.45388537645339966, "logits/rejected": -1.2807650566101074, "logps/chosen": -10.14447021484375, "logps/rejected": -25.88265609741211, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 0.40496236085891724, "rewards/margins": 4.389737129211426, "rewards/rejected": -3.984774589538574, "step": 592 }, { "epoch": 10.05084745762712, "grad_norm": 7.807442385203102, "learning_rate": 2.9122543082092246e-07, "logits/chosen": 0.45776844024658203, "logits/rejected": -0.1286400556564331, "logps/chosen": -10.70620346069336, "logps/rejected": -24.24842071533203, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 0.3894589841365814, "rewards/margins": 4.869680404663086, "rewards/rejected": -4.480220794677734, "step": 593 }, { "epoch": 10.067796610169491, "grad_norm": 6.669283919691698, "learning_rate": 2.9049582959937393e-07, "logits/chosen": -6.220808029174805, "logits/rejected": -4.7131781578063965, "logps/chosen": -11.595624923706055, "logps/rejected": -19.858522415161133, "loss": 0.0753, "rewards/accuracies": 1.0, "rewards/chosen": 0.48867008090019226, "rewards/margins": 3.6481728553771973, "rewards/rejected": -3.1595029830932617, "step": 594 }, { "epoch": 10.084745762711865, "grad_norm": 6.1751896707000995, "learning_rate": 2.89765874004768e-07, "logits/chosen": -4.981171607971191, "logits/rejected": -4.294511795043945, "logps/chosen": -9.736584663391113, "logps/rejected": -21.595191955566406, "loss": 0.0829, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3352711796760559, "rewards/margins": 4.752230644226074, "rewards/rejected": -4.416959762573242, "step": 595 }, { "epoch": 10.101694915254237, "grad_norm": 6.519996226278627, "learning_rate": 2.890355704248388e-07, "logits/chosen": -5.755083084106445, "logits/rejected": -6.4575982093811035, "logps/chosen": -8.622859001159668, "logps/rejected": -17.94690704345703, "loss": 0.0984, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07391562312841415, "rewards/margins": 3.822164535522461, "rewards/rejected": -3.748249053955078, "step": 596 }, { "epoch": 10.11864406779661, "grad_norm": 6.870500428526926, "learning_rate": 2.8830492525036587e-07, "logits/chosen": -5.710309982299805, "logits/rejected": -5.270742416381836, "logps/chosen": -9.003686904907227, "logps/rejected": -21.707897186279297, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 0.4468466341495514, "rewards/margins": 4.289839744567871, "rewards/rejected": -3.8429930210113525, "step": 597 }, { "epoch": 10.135593220338983, "grad_norm": 6.82302561225996, "learning_rate": 2.875739448751176e-07, "logits/chosen": -2.099620819091797, "logits/rejected": -2.333075761795044, "logps/chosen": -8.265347480773926, "logps/rejected": -18.967870712280273, "loss": 0.0907, "rewards/accuracies": 1.0, "rewards/chosen": 0.41659247875213623, "rewards/margins": 4.089541912078857, "rewards/rejected": -3.6729493141174316, "step": 598 }, { "epoch": 10.152542372881356, "grad_norm": 6.822421678285191, "learning_rate": 2.8684263569579603e-07, "logits/chosen": -3.583547830581665, "logits/rejected": -2.7302725315093994, "logps/chosen": -9.004634857177734, "logps/rejected": -17.12502670288086, "loss": 0.0991, "rewards/accuracies": 1.0, "rewards/chosen": 0.4453964829444885, "rewards/margins": 3.3031258583068848, "rewards/rejected": -2.857728958129883, "step": 599 }, { "epoch": 10.169491525423728, "grad_norm": 5.433585077950351, "learning_rate": 2.8611100411198035e-07, "logits/chosen": -4.182163715362549, "logits/rejected": -3.384242534637451, "logps/chosen": -6.375938415527344, "logps/rejected": -14.676736831665039, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 0.5399359464645386, "rewards/margins": 3.6615355014801025, "rewards/rejected": -3.1215996742248535, "step": 600 }, { "epoch": 10.186440677966102, "grad_norm": 5.767263681153962, "learning_rate": 2.853790565260712e-07, "logits/chosen": -5.147862434387207, "logits/rejected": -4.546764373779297, "logps/chosen": -5.3771514892578125, "logps/rejected": -18.10373878479004, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 0.17288702726364136, "rewards/margins": 4.0040411949157715, "rewards/rejected": -3.8311543464660645, "step": 601 }, { "epoch": 10.203389830508474, "grad_norm": 6.640532340744767, "learning_rate": 2.846467993432342e-07, "logits/chosen": -3.296685218811035, "logits/rejected": -2.427095890045166, "logps/chosen": -10.636516571044922, "logps/rejected": -20.535572052001953, "loss": 0.0723, "rewards/accuracies": 1.0, "rewards/chosen": 0.5197865962982178, "rewards/margins": 4.117679595947266, "rewards/rejected": -3.5978927612304688, "step": 602 }, { "epoch": 10.220338983050848, "grad_norm": 6.768047660140451, "learning_rate": 2.8391423897134454e-07, "logits/chosen": 0.3417333662509918, "logits/rejected": -0.7083436846733093, "logps/chosen": -10.16650104522705, "logps/rejected": -27.248626708984375, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": -0.18731170892715454, "rewards/margins": 5.726678848266602, "rewards/rejected": -5.913990497589111, "step": 603 }, { "epoch": 10.23728813559322, "grad_norm": 7.058789634806724, "learning_rate": 2.8318138182093047e-07, "logits/chosen": -0.23608046770095825, "logits/rejected": -1.7864173650741577, "logps/chosen": -7.249157428741455, "logps/rejected": -24.418014526367188, "loss": 0.0905, "rewards/accuracies": 1.0, "rewards/chosen": 0.44495925307273865, "rewards/margins": 5.741467475891113, "rewards/rejected": -5.296508312225342, "step": 604 }, { "epoch": 10.254237288135593, "grad_norm": 6.401714878017643, "learning_rate": 2.8244823430511725e-07, "logits/chosen": -4.940008163452148, "logits/rejected": -4.818403720855713, "logps/chosen": -10.677308082580566, "logps/rejected": -20.77131462097168, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 0.5091035962104797, "rewards/margins": 4.647761344909668, "rewards/rejected": -4.138657569885254, "step": 605 }, { "epoch": 10.271186440677965, "grad_norm": 6.1298913343452845, "learning_rate": 2.8171480283957117e-07, "logits/chosen": -4.699034214019775, "logits/rejected": -3.9625940322875977, "logps/chosen": -7.439517974853516, "logps/rejected": -15.482069969177246, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 0.34953752160072327, "rewards/margins": 3.54054594039917, "rewards/rejected": -3.1910083293914795, "step": 606 }, { "epoch": 10.288135593220339, "grad_norm": 7.1393461037986246, "learning_rate": 2.8098109384244315e-07, "logits/chosen": -6.9001569747924805, "logits/rejected": -4.933772563934326, "logps/chosen": -8.911727905273438, "logps/rejected": -17.510486602783203, "loss": 0.0795, "rewards/accuracies": 1.0, "rewards/chosen": 0.5511279702186584, "rewards/margins": 3.9823532104492188, "rewards/rejected": -3.431225299835205, "step": 607 }, { "epoch": 10.305084745762711, "grad_norm": 6.982071096916762, "learning_rate": 2.8024711373431297e-07, "logits/chosen": -0.6912120580673218, "logits/rejected": 1.0574913024902344, "logps/chosen": -11.467482566833496, "logps/rejected": -25.087936401367188, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 0.07352522760629654, "rewards/margins": 5.616614818572998, "rewards/rejected": -5.543089866638184, "step": 608 }, { "epoch": 10.322033898305085, "grad_norm": 7.205356800311916, "learning_rate": 2.795128689381327e-07, "logits/chosen": -5.367947101593018, "logits/rejected": -6.2112226486206055, "logps/chosen": -9.169400215148926, "logps/rejected": -20.133502960205078, "loss": 0.0956, "rewards/accuracies": 1.0, "rewards/chosen": 0.30789613723754883, "rewards/margins": 3.6526689529418945, "rewards/rejected": -3.3447728157043457, "step": 609 }, { "epoch": 10.338983050847457, "grad_norm": 6.828839846657555, "learning_rate": 2.787783658791707e-07, "logits/chosen": -1.397652506828308, "logits/rejected": 0.4360477924346924, "logps/chosen": -12.576018333435059, "logps/rejected": -24.168855667114258, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 0.1414359211921692, "rewards/margins": 5.024531364440918, "rewards/rejected": -4.883095741271973, "step": 610 }, { "epoch": 10.35593220338983, "grad_norm": 6.148294911277272, "learning_rate": 2.7804361098495547e-07, "logits/chosen": -0.5597133636474609, "logits/rejected": 2.5741095542907715, "logps/chosen": -14.489322662353516, "logps/rejected": -28.154443740844727, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": -0.3005380630493164, "rewards/margins": 5.198799133300781, "rewards/rejected": -5.499336242675781, "step": 611 }, { "epoch": 10.372881355932204, "grad_norm": 5.923649965413688, "learning_rate": 2.7730861068521913e-07, "logits/chosen": -6.315664768218994, "logits/rejected": -6.235509872436523, "logps/chosen": -8.011595726013184, "logps/rejected": -15.550950050354004, "loss": 0.079, "rewards/accuracies": 1.0, "rewards/chosen": 0.6943686008453369, "rewards/margins": 2.8963940143585205, "rewards/rejected": -2.2020251750946045, "step": 612 }, { "epoch": 10.389830508474576, "grad_norm": 7.280729728178173, "learning_rate": 2.7657337141184134e-07, "logits/chosen": -10.305787086486816, "logits/rejected": -6.9726457595825195, "logps/chosen": -9.634078025817871, "logps/rejected": -17.562524795532227, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 0.4106173515319824, "rewards/margins": 3.9181861877441406, "rewards/rejected": -3.507568597793579, "step": 613 }, { "epoch": 10.40677966101695, "grad_norm": 5.774792538092833, "learning_rate": 2.75837899598793e-07, "logits/chosen": -8.121630668640137, "logits/rejected": -7.595673561096191, "logps/chosen": -7.544984817504883, "logps/rejected": -17.279922485351562, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 0.5641006231307983, "rewards/margins": 3.87762188911438, "rewards/rejected": -3.313521146774292, "step": 614 }, { "epoch": 10.423728813559322, "grad_norm": 6.278562216017734, "learning_rate": 2.7510220168207996e-07, "logits/chosen": -4.6755170822143555, "logits/rejected": -1.8191864490509033, "logps/chosen": -8.91657543182373, "logps/rejected": -21.299772262573242, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": 0.18500903248786926, "rewards/margins": 4.516014575958252, "rewards/rejected": -4.331005573272705, "step": 615 }, { "epoch": 10.440677966101696, "grad_norm": 9.075465593759464, "learning_rate": 2.743662840996866e-07, "logits/chosen": -5.730457782745361, "logits/rejected": -4.239251613616943, "logps/chosen": -17.843042373657227, "logps/rejected": -25.541109085083008, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 0.508560299873352, "rewards/margins": 3.5839507579803467, "rewards/rejected": -3.075390577316284, "step": 616 }, { "epoch": 10.457627118644067, "grad_norm": 6.8993665196076055, "learning_rate": 2.736301532915196e-07, "logits/chosen": -1.6923491954803467, "logits/rejected": 0.2853749990463257, "logps/chosen": -10.134171485900879, "logps/rejected": -18.974462509155273, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 0.15618927776813507, "rewards/margins": 3.761019468307495, "rewards/rejected": -3.604830503463745, "step": 617 }, { "epoch": 10.474576271186441, "grad_norm": 6.81887848068769, "learning_rate": 2.7289381569935167e-07, "logits/chosen": -0.2558657228946686, "logits/rejected": -0.3321121633052826, "logps/chosen": -10.028180122375488, "logps/rejected": -21.12971305847168, "loss": 0.0789, "rewards/accuracies": 0.9375, "rewards/chosen": 0.24801495671272278, "rewards/margins": 4.917619228363037, "rewards/rejected": -4.6696038246154785, "step": 618 }, { "epoch": 10.491525423728813, "grad_norm": 6.438977935034483, "learning_rate": 2.7215727776676476e-07, "logits/chosen": -0.4277447760105133, "logits/rejected": -3.587709426879883, "logps/chosen": -8.177022933959961, "logps/rejected": -21.388124465942383, "loss": 0.0751, "rewards/accuracies": 1.0, "rewards/chosen": 0.3959938883781433, "rewards/margins": 4.029510974884033, "rewards/rejected": -3.633517265319824, "step": 619 }, { "epoch": 10.508474576271187, "grad_norm": 6.455920427735316, "learning_rate": 2.714205459390942e-07, "logits/chosen": -5.1421990394592285, "logits/rejected": -0.8779691457748413, "logps/chosen": -12.399337768554688, "logps/rejected": -30.643911361694336, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 0.21704591810703278, "rewards/margins": 5.969428062438965, "rewards/rejected": -5.752381324768066, "step": 620 }, { "epoch": 10.525423728813559, "grad_norm": 6.321866244111642, "learning_rate": 2.7068362666337213e-07, "logits/chosen": -1.289355754852295, "logits/rejected": -2.6756012439727783, "logps/chosen": -9.784612655639648, "logps/rejected": -23.54802703857422, "loss": 0.0771, "rewards/accuracies": 1.0, "rewards/chosen": -0.008989214897155762, "rewards/margins": 3.653341293334961, "rewards/rejected": -3.6623306274414062, "step": 621 }, { "epoch": 10.542372881355933, "grad_norm": 6.433422295166518, "learning_rate": 2.6994652638827075e-07, "logits/chosen": -4.133284568786621, "logits/rejected": -2.0943939685821533, "logps/chosen": -8.259592056274414, "logps/rejected": -22.115522384643555, "loss": 0.076, "rewards/accuracies": 1.0, "rewards/chosen": 0.17010678350925446, "rewards/margins": 5.009650707244873, "rewards/rejected": -4.83954381942749, "step": 622 }, { "epoch": 10.559322033898304, "grad_norm": 6.57812996928514, "learning_rate": 2.6920925156404644e-07, "logits/chosen": -4.605247497558594, "logits/rejected": -1.550370216369629, "logps/chosen": -11.494009971618652, "logps/rejected": -20.429784774780273, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": 0.3749243915081024, "rewards/margins": 3.869821310043335, "rewards/rejected": -3.494896650314331, "step": 623 }, { "epoch": 10.576271186440678, "grad_norm": 6.387704872855451, "learning_rate": 2.684718086424828e-07, "logits/chosen": -2.2779276371002197, "logits/rejected": -4.718961238861084, "logps/chosen": -8.743425369262695, "logps/rejected": -21.723159790039062, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 0.06385959684848785, "rewards/margins": 4.543567657470703, "rewards/rejected": -4.479708194732666, "step": 624 }, { "epoch": 10.59322033898305, "grad_norm": 6.832554855689087, "learning_rate": 2.677342040768346e-07, "logits/chosen": -9.985498428344727, "logits/rejected": -11.14006233215332, "logps/chosen": -6.7391886711120605, "logps/rejected": -12.377889633178711, "loss": 0.0874, "rewards/accuracies": 1.0, "rewards/chosen": 0.4747909903526306, "rewards/margins": 3.002551317214966, "rewards/rejected": -2.5277605056762695, "step": 625 }, { "epoch": 10.610169491525424, "grad_norm": 6.673592359136678, "learning_rate": 2.669964443217711e-07, "logits/chosen": -4.82094669342041, "logits/rejected": -1.3282444477081299, "logps/chosen": -9.106605529785156, "logps/rejected": -19.12204933166504, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": 0.47643429040908813, "rewards/margins": 4.661433696746826, "rewards/rejected": -4.184998989105225, "step": 626 }, { "epoch": 10.627118644067796, "grad_norm": 5.581163046682789, "learning_rate": 2.662585358333194e-07, "logits/chosen": -0.6283246278762817, "logits/rejected": 1.6329344511032104, "logps/chosen": -8.678421020507812, "logps/rejected": -19.060213088989258, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": 0.324451744556427, "rewards/margins": 4.503561019897461, "rewards/rejected": -4.1791090965271, "step": 627 }, { "epoch": 10.64406779661017, "grad_norm": 6.532943519152192, "learning_rate": 2.655204850688085e-07, "logits/chosen": -6.0032806396484375, "logits/rejected": -6.153221130371094, "logps/chosen": -9.993376731872559, "logps/rejected": -19.9271240234375, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 0.19192150235176086, "rewards/margins": 3.7709221839904785, "rewards/rejected": -3.57900071144104, "step": 628 }, { "epoch": 10.661016949152543, "grad_norm": 6.791855490737261, "learning_rate": 2.6478229848681217e-07, "logits/chosen": -1.7947826385498047, "logits/rejected": 1.245511770248413, "logps/chosen": -14.064645767211914, "logps/rejected": -31.12624740600586, "loss": 0.0803, "rewards/accuracies": 1.0, "rewards/chosen": 0.21336092054843903, "rewards/margins": 4.568236351013184, "rewards/rejected": -4.3548760414123535, "step": 629 }, { "epoch": 10.677966101694915, "grad_norm": 5.998567580225641, "learning_rate": 2.6404398254709283e-07, "logits/chosen": -6.073047637939453, "logits/rejected": -3.7751412391662598, "logps/chosen": -10.10608959197998, "logps/rejected": -17.09992218017578, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 0.5900631546974182, "rewards/margins": 3.2727785110473633, "rewards/rejected": -2.682715654373169, "step": 630 }, { "epoch": 10.694915254237289, "grad_norm": 6.347527247412782, "learning_rate": 2.633055437105446e-07, "logits/chosen": -2.816498041152954, "logits/rejected": -1.7764393091201782, "logps/chosen": -9.664608001708984, "logps/rejected": -18.57364273071289, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 0.41163548827171326, "rewards/margins": 4.226436614990234, "rewards/rejected": -3.8148012161254883, "step": 631 }, { "epoch": 10.711864406779661, "grad_norm": 7.657664667513132, "learning_rate": 2.6256698843913765e-07, "logits/chosen": -2.3866946697235107, "logits/rejected": -3.51227068901062, "logps/chosen": -9.459527969360352, "logps/rejected": -23.772916793823242, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": -0.11973743140697479, "rewards/margins": 4.514531135559082, "rewards/rejected": -4.634268283843994, "step": 632 }, { "epoch": 10.728813559322035, "grad_norm": 7.398836752330527, "learning_rate": 2.6182832319586045e-07, "logits/chosen": -7.299160957336426, "logits/rejected": -3.781843423843384, "logps/chosen": -13.079208374023438, "logps/rejected": -16.370201110839844, "loss": 0.0971, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5853415131568909, "rewards/margins": 2.8693883419036865, "rewards/rejected": -2.2840466499328613, "step": 633 }, { "epoch": 10.745762711864407, "grad_norm": 5.588120159912262, "learning_rate": 2.6108955444466407e-07, "logits/chosen": -3.830315351486206, "logits/rejected": -3.4206957817077637, "logps/chosen": -9.309571266174316, "logps/rejected": -21.44207191467285, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -0.001529298722743988, "rewards/margins": 4.183349609375, "rewards/rejected": -4.184878826141357, "step": 634 }, { "epoch": 10.76271186440678, "grad_norm": 6.535406399654082, "learning_rate": 2.6035068865040556e-07, "logits/chosen": -5.687456130981445, "logits/rejected": -7.988400936126709, "logps/chosen": -10.812385559082031, "logps/rejected": -24.102890014648438, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": -0.08460170030593872, "rewards/margins": 3.884694814682007, "rewards/rejected": -3.96929669380188, "step": 635 }, { "epoch": 10.779661016949152, "grad_norm": 5.789102480389914, "learning_rate": 2.596117322787907e-07, "logits/chosen": -5.722503185272217, "logits/rejected": -7.290640830993652, "logps/chosen": -6.970130920410156, "logps/rejected": -20.396114349365234, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 0.18026211857795715, "rewards/margins": 3.8470044136047363, "rewards/rejected": -3.6667425632476807, "step": 636 }, { "epoch": 10.796610169491526, "grad_norm": 6.355357984144602, "learning_rate": 2.588726917963183e-07, "logits/chosen": -9.194799423217773, "logits/rejected": -6.714663505554199, "logps/chosen": -11.45416259765625, "logps/rejected": -18.329471588134766, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": 0.32029563188552856, "rewards/margins": 3.6258060932159424, "rewards/rejected": -3.3055105209350586, "step": 637 }, { "epoch": 10.813559322033898, "grad_norm": 6.858758613575888, "learning_rate": 2.58133573670223e-07, "logits/chosen": -6.48911190032959, "logits/rejected": -7.002935409545898, "logps/chosen": -12.041778564453125, "logps/rejected": -26.613550186157227, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": 0.21291455626487732, "rewards/margins": 4.537956237792969, "rewards/rejected": -4.3250412940979, "step": 638 }, { "epoch": 10.830508474576272, "grad_norm": 6.385436811626369, "learning_rate": 2.5739438436841923e-07, "logits/chosen": -4.354445934295654, "logits/rejected": -1.8925871849060059, "logps/chosen": -6.685940742492676, "logps/rejected": -17.4467716217041, "loss": 0.0772, "rewards/accuracies": 1.0, "rewards/chosen": 0.6550213694572449, "rewards/margins": 4.483064651489258, "rewards/rejected": -3.828043222427368, "step": 639 }, { "epoch": 10.847457627118644, "grad_norm": 7.5412230424923115, "learning_rate": 2.566551303594437e-07, "logits/chosen": -4.869339942932129, "logits/rejected": -4.128410339355469, "logps/chosen": -8.929168701171875, "logps/rejected": -21.09054946899414, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 0.3591841459274292, "rewards/margins": 4.5191779136657715, "rewards/rejected": -4.1599931716918945, "step": 640 }, { "epoch": 10.864406779661017, "grad_norm": 7.2669719823592, "learning_rate": 2.559158181123998e-07, "logits/chosen": -7.467385292053223, "logits/rejected": -5.042152404785156, "logps/chosen": -9.017333030700684, "logps/rejected": -22.0870361328125, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 0.3601089119911194, "rewards/margins": 4.670373916625977, "rewards/rejected": -4.310265064239502, "step": 641 }, { "epoch": 10.88135593220339, "grad_norm": 6.052749440889756, "learning_rate": 2.5517645409690045e-07, "logits/chosen": -5.470639705657959, "logits/rejected": -1.8173260688781738, "logps/chosen": -7.751472473144531, "logps/rejected": -19.017440795898438, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 0.41869232058525085, "rewards/margins": 4.302485466003418, "rewards/rejected": -3.8837931156158447, "step": 642 }, { "epoch": 10.898305084745763, "grad_norm": 6.902697924495179, "learning_rate": 2.544370447830115e-07, "logits/chosen": -5.353979110717773, "logits/rejected": -4.941605567932129, "logps/chosen": -6.707101345062256, "logps/rejected": -21.526065826416016, "loss": 0.0789, "rewards/accuracies": 1.0, "rewards/chosen": 0.31695109605789185, "rewards/margins": 4.552976608276367, "rewards/rejected": -4.236025810241699, "step": 643 }, { "epoch": 10.915254237288135, "grad_norm": 6.361652513747436, "learning_rate": 2.5369759664119533e-07, "logits/chosen": -4.966207504272461, "logits/rejected": -6.723392963409424, "logps/chosen": -7.939865589141846, "logps/rejected": -20.92068862915039, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": 0.5712255239486694, "rewards/margins": 4.196403980255127, "rewards/rejected": -3.625178337097168, "step": 644 }, { "epoch": 10.932203389830509, "grad_norm": 6.733540459748604, "learning_rate": 2.52958116142254e-07, "logits/chosen": -4.9014058113098145, "logits/rejected": -4.164872646331787, "logps/chosen": -13.86292839050293, "logps/rejected": -21.466909408569336, "loss": 0.086, "rewards/accuracies": 1.0, "rewards/chosen": 0.17265652120113373, "rewards/margins": 3.9277124404907227, "rewards/rejected": -3.7550559043884277, "step": 645 }, { "epoch": 10.94915254237288, "grad_norm": 5.831373240324112, "learning_rate": 2.522186097572727e-07, "logits/chosen": -4.4887495040893555, "logits/rejected": -3.214918613433838, "logps/chosen": -9.671069145202637, "logps/rejected": -20.055328369140625, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 0.14893700182437897, "rewards/margins": 3.7495431900024414, "rewards/rejected": -3.6006062030792236, "step": 646 }, { "epoch": 10.966101694915254, "grad_norm": 6.262791048779491, "learning_rate": 2.514790839575634e-07, "logits/chosen": -3.553161144256592, "logits/rejected": -3.589228630065918, "logps/chosen": -8.84348201751709, "logps/rejected": -21.927766799926758, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 0.36416810750961304, "rewards/margins": 4.637812614440918, "rewards/rejected": -4.27364444732666, "step": 647 }, { "epoch": 10.983050847457626, "grad_norm": 6.806327508709908, "learning_rate": 2.507395452146074e-07, "logits/chosen": -7.03935432434082, "logits/rejected": -5.568624496459961, "logps/chosen": -9.785100936889648, "logps/rejected": -17.360050201416016, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 0.31295379996299744, "rewards/margins": 3.047978401184082, "rewards/rejected": -2.7350244522094727, "step": 648 }, { "epoch": 11.0, "grad_norm": 7.811002818864437, "learning_rate": 2.5e-07, "logits/chosen": -0.1611688733100891, "logits/rejected": 1.0501610040664673, "logps/chosen": -9.675187110900879, "logps/rejected": -22.014890670776367, "loss": 0.0948, "rewards/accuracies": 1.0, "rewards/chosen": 0.20374059677124023, "rewards/margins": 5.133134841918945, "rewards/rejected": -4.929394721984863, "step": 649 }, { "epoch": 11.016949152542374, "grad_norm": 6.850549464625482, "learning_rate": 2.4926045478539256e-07, "logits/chosen": -4.6589884757995605, "logits/rejected": -6.010820388793945, "logps/chosen": -7.22625207901001, "logps/rejected": -20.878189086914062, "loss": 0.0887, "rewards/accuracies": 1.0, "rewards/chosen": 0.2166891098022461, "rewards/margins": 4.188607692718506, "rewards/rejected": -3.9719185829162598, "step": 650 }, { "epoch": 11.033898305084746, "grad_norm": 5.919496708388121, "learning_rate": 2.485209160424366e-07, "logits/chosen": -6.218087673187256, "logits/rejected": -5.569622993469238, "logps/chosen": -8.723810195922852, "logps/rejected": -16.413066864013672, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": 0.4738181531429291, "rewards/margins": 3.431518316268921, "rewards/rejected": -2.957700490951538, "step": 651 }, { "epoch": 11.05084745762712, "grad_norm": 6.0611919294355925, "learning_rate": 2.477813902427272e-07, "logits/chosen": -3.49674654006958, "logits/rejected": -1.136789321899414, "logps/chosen": -11.503744125366211, "logps/rejected": -25.44773292541504, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 0.26914724707603455, "rewards/margins": 5.494405269622803, "rewards/rejected": -5.2252583503723145, "step": 652 }, { "epoch": 11.067796610169491, "grad_norm": 6.0573343175910725, "learning_rate": 2.47041883857746e-07, "logits/chosen": -5.083340167999268, "logits/rejected": 0.8065661191940308, "logps/chosen": -11.080831527709961, "logps/rejected": -31.152498245239258, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 0.15857788920402527, "rewards/margins": 6.871697902679443, "rewards/rejected": -6.713120460510254, "step": 653 }, { "epoch": 11.084745762711865, "grad_norm": 5.56917503889775, "learning_rate": 2.463024033588046e-07, "logits/chosen": -9.74613094329834, "logits/rejected": -6.1578569412231445, "logps/chosen": -11.67827320098877, "logps/rejected": -19.71649932861328, "loss": 0.0635, "rewards/accuracies": 1.0, "rewards/chosen": 0.24430200457572937, "rewards/margins": 4.060480117797852, "rewards/rejected": -3.816178798675537, "step": 654 }, { "epoch": 11.101694915254237, "grad_norm": 6.5567009060929795, "learning_rate": 2.455629552169885e-07, "logits/chosen": -3.267183542251587, "logits/rejected": -4.768028259277344, "logps/chosen": -7.853949546813965, "logps/rejected": -21.788009643554688, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": 0.3967207968235016, "rewards/margins": 4.657608985900879, "rewards/rejected": -4.260887622833252, "step": 655 }, { "epoch": 11.11864406779661, "grad_norm": 6.581903262468121, "learning_rate": 2.448235459030996e-07, "logits/chosen": -5.512940883636475, "logits/rejected": -3.1800599098205566, "logps/chosen": -9.868915557861328, "logps/rejected": -18.821285247802734, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 0.6598990559577942, "rewards/margins": 4.025852680206299, "rewards/rejected": -3.3659536838531494, "step": 656 }, { "epoch": 11.135593220338983, "grad_norm": 5.596217563555503, "learning_rate": 2.4408418188760024e-07, "logits/chosen": 0.2966378331184387, "logits/rejected": -2.103040933609009, "logps/chosen": -9.515241622924805, "logps/rejected": -22.730234146118164, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": -0.0926487147808075, "rewards/margins": 4.650430679321289, "rewards/rejected": -4.743080139160156, "step": 657 }, { "epoch": 11.152542372881356, "grad_norm": 5.538518104376337, "learning_rate": 2.433448696405563e-07, "logits/chosen": -3.6215038299560547, "logits/rejected": -5.576323509216309, "logps/chosen": -8.517637252807617, "logps/rejected": -22.78083610534668, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 0.04233184829354286, "rewards/margins": 3.963124990463257, "rewards/rejected": -3.920792818069458, "step": 658 }, { "epoch": 11.169491525423728, "grad_norm": 5.6139538426783355, "learning_rate": 2.426056156315808e-07, "logits/chosen": -2.620246410369873, "logits/rejected": -2.592529296875, "logps/chosen": -9.597460746765137, "logps/rejected": -19.386272430419922, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 0.47715649008750916, "rewards/margins": 4.265936851501465, "rewards/rejected": -3.7887799739837646, "step": 659 }, { "epoch": 11.186440677966102, "grad_norm": 5.209822541646444, "learning_rate": 2.4186642632977697e-07, "logits/chosen": -1.9901256561279297, "logits/rejected": -8.246919631958008, "logps/chosen": -12.452547073364258, "logps/rejected": -33.33019256591797, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 0.12005601823329926, "rewards/margins": 5.84098482131958, "rewards/rejected": -5.720929145812988, "step": 660 }, { "epoch": 11.203389830508474, "grad_norm": 5.930477352951743, "learning_rate": 2.4112730820368174e-07, "logits/chosen": -8.356374740600586, "logits/rejected": -7.305706024169922, "logps/chosen": -7.346405982971191, "logps/rejected": -16.096595764160156, "loss": 0.0738, "rewards/accuracies": 1.0, "rewards/chosen": 0.24272796511650085, "rewards/margins": 3.5312068462371826, "rewards/rejected": -3.2884786128997803, "step": 661 }, { "epoch": 11.220338983050848, "grad_norm": 5.591612639284657, "learning_rate": 2.403882677212093e-07, "logits/chosen": -4.104741096496582, "logits/rejected": -0.9902318716049194, "logps/chosen": -10.457313537597656, "logps/rejected": -21.854230880737305, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 0.4416648745536804, "rewards/margins": 4.539022445678711, "rewards/rejected": -4.097356796264648, "step": 662 }, { "epoch": 11.23728813559322, "grad_norm": 6.348118722950897, "learning_rate": 2.3964931134959447e-07, "logits/chosen": -4.671243667602539, "logits/rejected": -2.5600554943084717, "logps/chosen": -9.85940170288086, "logps/rejected": -19.52853775024414, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 0.5523775815963745, "rewards/margins": 4.105027675628662, "rewards/rejected": -3.552649736404419, "step": 663 }, { "epoch": 11.254237288135593, "grad_norm": 5.369729124450522, "learning_rate": 2.3891044555533586e-07, "logits/chosen": -5.128889083862305, "logits/rejected": -3.718061923980713, "logps/chosen": -8.306253433227539, "logps/rejected": -17.347557067871094, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": 0.7414266467094421, "rewards/margins": 3.849276065826416, "rewards/rejected": -3.107849597930908, "step": 664 }, { "epoch": 11.271186440677965, "grad_norm": 5.612901635708372, "learning_rate": 2.381716768041395e-07, "logits/chosen": -0.0374007374048233, "logits/rejected": -1.22528874874115, "logps/chosen": -11.215349197387695, "logps/rejected": -35.25605392456055, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 0.17258042097091675, "rewards/margins": 6.277578353881836, "rewards/rejected": -6.1049981117248535, "step": 665 }, { "epoch": 11.288135593220339, "grad_norm": 5.1813207304899835, "learning_rate": 2.374330115608624e-07, "logits/chosen": -7.9421257972717285, "logits/rejected": -6.050485134124756, "logps/chosen": -10.09483528137207, "logps/rejected": -19.81658935546875, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 0.29093074798583984, "rewards/margins": 4.107818603515625, "rewards/rejected": -3.816887855529785, "step": 666 }, { "epoch": 11.305084745762711, "grad_norm": 5.514831436596614, "learning_rate": 2.3669445628945538e-07, "logits/chosen": -3.944185256958008, "logits/rejected": -1.4804234504699707, "logps/chosen": -10.032756805419922, "logps/rejected": -24.357975006103516, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 0.18329276144504547, "rewards/margins": 5.466524600982666, "rewards/rejected": -5.28323221206665, "step": 667 }, { "epoch": 11.322033898305085, "grad_norm": 6.63887413664787, "learning_rate": 2.3595601745290725e-07, "logits/chosen": -6.884547710418701, "logits/rejected": -7.041402339935303, "logps/chosen": -7.076028347015381, "logps/rejected": -14.57210922241211, "loss": 0.0842, "rewards/accuracies": 1.0, "rewards/chosen": 0.6141047477722168, "rewards/margins": 2.990374803543091, "rewards/rejected": -2.376270055770874, "step": 668 }, { "epoch": 11.338983050847457, "grad_norm": 6.548504856420001, "learning_rate": 2.3521770151318784e-07, "logits/chosen": -4.259178638458252, "logits/rejected": -0.449784517288208, "logps/chosen": -12.715999603271484, "logps/rejected": -26.444982528686523, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 0.2763742506504059, "rewards/margins": 5.15761137008667, "rewards/rejected": -4.881237030029297, "step": 669 }, { "epoch": 11.35593220338983, "grad_norm": 5.985995367055966, "learning_rate": 2.344795149311915e-07, "logits/chosen": -4.259335517883301, "logits/rejected": -3.376661777496338, "logps/chosen": -10.479347229003906, "logps/rejected": -23.774837493896484, "loss": 0.0654, "rewards/accuracies": 1.0, "rewards/chosen": 0.12627626955509186, "rewards/margins": 5.6051483154296875, "rewards/rejected": -5.478872299194336, "step": 670 }, { "epoch": 11.372881355932204, "grad_norm": 5.127913932930688, "learning_rate": 2.3374146416668062e-07, "logits/chosen": -2.8877933025360107, "logits/rejected": -3.261695384979248, "logps/chosen": -11.766398429870605, "logps/rejected": -24.1757755279541, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 0.2051323503255844, "rewards/margins": 5.357700347900391, "rewards/rejected": -5.1525678634643555, "step": 671 }, { "epoch": 11.389830508474576, "grad_norm": 6.20562392512599, "learning_rate": 2.3300355567822893e-07, "logits/chosen": -4.444530010223389, "logits/rejected": -4.568050384521484, "logps/chosen": -6.976926803588867, "logps/rejected": -17.10205841064453, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 0.17499062418937683, "rewards/margins": 3.3871519565582275, "rewards/rejected": -3.212161064147949, "step": 672 }, { "epoch": 11.40677966101695, "grad_norm": 5.413356962659108, "learning_rate": 2.3226579592316537e-07, "logits/chosen": -6.2134528160095215, "logits/rejected": -5.652405261993408, "logps/chosen": -10.496795654296875, "logps/rejected": -20.421342849731445, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.4378880262374878, "rewards/margins": 4.105364799499512, "rewards/rejected": -3.6674766540527344, "step": 673 }, { "epoch": 11.423728813559322, "grad_norm": 5.2094796996358514, "learning_rate": 2.315281913575172e-07, "logits/chosen": -7.498478889465332, "logits/rejected": -5.903684139251709, "logps/chosen": -9.15145492553711, "logps/rejected": -19.5275936126709, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 0.4203518033027649, "rewards/margins": 4.569279193878174, "rewards/rejected": -4.148927688598633, "step": 674 }, { "epoch": 11.440677966101696, "grad_norm": 6.6716832648994915, "learning_rate": 2.3079074843595354e-07, "logits/chosen": -5.070525169372559, "logits/rejected": -0.550679087638855, "logps/chosen": -10.488363265991211, "logps/rejected": -22.19812774658203, "loss": 0.0783, "rewards/accuracies": 0.9375, "rewards/chosen": -0.12508079409599304, "rewards/margins": 4.43815803527832, "rewards/rejected": -4.563239097595215, "step": 675 }, { "epoch": 11.457627118644067, "grad_norm": 5.956099811481841, "learning_rate": 2.300534736117292e-07, "logits/chosen": -9.618514060974121, "logits/rejected": -5.595645904541016, "logps/chosen": -12.044401168823242, "logps/rejected": -19.89811134338379, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 0.46925345063209534, "rewards/margins": 4.163661479949951, "rewards/rejected": -3.6944079399108887, "step": 676 }, { "epoch": 11.474576271186441, "grad_norm": 6.389828344273028, "learning_rate": 2.2931637333662785e-07, "logits/chosen": -8.7474365234375, "logits/rejected": -6.6938796043396, "logps/chosen": -8.219264030456543, "logps/rejected": -13.722729682922363, "loss": 0.0689, "rewards/accuracies": 1.0, "rewards/chosen": 0.4416693150997162, "rewards/margins": 2.8839468955993652, "rewards/rejected": -2.442277669906616, "step": 677 }, { "epoch": 11.491525423728813, "grad_norm": 6.000219805644383, "learning_rate": 2.2857945406090578e-07, "logits/chosen": -2.0206470489501953, "logits/rejected": -2.674741744995117, "logps/chosen": -7.267889499664307, "logps/rejected": -19.866804122924805, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 0.17590032517910004, "rewards/margins": 3.9376792907714844, "rewards/rejected": -3.7617790699005127, "step": 678 }, { "epoch": 11.508474576271187, "grad_norm": 8.721240193785789, "learning_rate": 2.2784272223323527e-07, "logits/chosen": -3.9873390197753906, "logits/rejected": -3.3875303268432617, "logps/chosen": -7.695770740509033, "logps/rejected": -14.943570137023926, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": 0.27957093715667725, "rewards/margins": 3.1959755420684814, "rewards/rejected": -2.9164047241210938, "step": 679 }, { "epoch": 11.525423728813559, "grad_norm": 5.950185450594928, "learning_rate": 2.271061843006484e-07, "logits/chosen": -5.7673821449279785, "logits/rejected": -6.350340843200684, "logps/chosen": -6.748774528503418, "logps/rejected": -19.038349151611328, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 0.4857510030269623, "rewards/margins": 3.901714563369751, "rewards/rejected": -3.415963649749756, "step": 680 }, { "epoch": 11.542372881355933, "grad_norm": 5.8543694845957726, "learning_rate": 2.263698467084804e-07, "logits/chosen": -0.47089025378227234, "logits/rejected": -6.059227466583252, "logps/chosen": -8.830419540405273, "logps/rejected": -32.42097091674805, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": -0.03702600300312042, "rewards/margins": 5.787201881408691, "rewards/rejected": -5.824227809906006, "step": 681 }, { "epoch": 11.559322033898304, "grad_norm": 6.175920832831188, "learning_rate": 2.2563371590031338e-07, "logits/chosen": -5.129776954650879, "logits/rejected": -5.330681324005127, "logps/chosen": -8.20430850982666, "logps/rejected": -20.813114166259766, "loss": 0.0765, "rewards/accuracies": 1.0, "rewards/chosen": 0.30605781078338623, "rewards/margins": 4.445309162139893, "rewards/rejected": -4.139252185821533, "step": 682 }, { "epoch": 11.576271186440678, "grad_norm": 6.113551707587027, "learning_rate": 2.2489779831792004e-07, "logits/chosen": -5.852320671081543, "logits/rejected": -5.197740077972412, "logps/chosen": -8.29546070098877, "logps/rejected": -18.657737731933594, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": 0.22355306148529053, "rewards/margins": 3.745954990386963, "rewards/rejected": -3.522402048110962, "step": 683 }, { "epoch": 11.59322033898305, "grad_norm": 6.50222345937985, "learning_rate": 2.2416210040120701e-07, "logits/chosen": -5.681268215179443, "logits/rejected": -0.8399734497070312, "logps/chosen": -10.5161714553833, "logps/rejected": -18.5819034576416, "loss": 0.0858, "rewards/accuracies": 1.0, "rewards/chosen": 0.4941365420818329, "rewards/margins": 4.430834770202637, "rewards/rejected": -3.9366986751556396, "step": 684 }, { "epoch": 11.610169491525424, "grad_norm": 6.07353126745076, "learning_rate": 2.2342662858815867e-07, "logits/chosen": -3.1647231578826904, "logits/rejected": -2.3442485332489014, "logps/chosen": -12.0752534866333, "logps/rejected": -26.96086883544922, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 0.2649264633655548, "rewards/margins": 4.969583988189697, "rewards/rejected": -4.704657554626465, "step": 685 }, { "epoch": 11.627118644067796, "grad_norm": 5.5404557799180685, "learning_rate": 2.2269138931478082e-07, "logits/chosen": -5.84234094619751, "logits/rejected": -5.363994121551514, "logps/chosen": -7.794708251953125, "logps/rejected": -14.919957160949707, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 0.5045041441917419, "rewards/margins": 3.4364819526672363, "rewards/rejected": -2.9319777488708496, "step": 686 }, { "epoch": 11.64406779661017, "grad_norm": 7.170816477696418, "learning_rate": 2.2195638901504448e-07, "logits/chosen": -7.435305595397949, "logits/rejected": -2.8277387619018555, "logps/chosen": -8.017237663269043, "logps/rejected": -14.647412300109863, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 0.4582928419113159, "rewards/margins": 3.0573978424072266, "rewards/rejected": -2.599104881286621, "step": 687 }, { "epoch": 11.661016949152543, "grad_norm": 5.025012358974028, "learning_rate": 2.2122163412082927e-07, "logits/chosen": -10.500146865844727, "logits/rejected": -7.918832778930664, "logps/chosen": -8.945805549621582, "logps/rejected": -18.38872528076172, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 0.3134109377861023, "rewards/margins": 4.1275715827941895, "rewards/rejected": -3.8141608238220215, "step": 688 }, { "epoch": 11.677966101694915, "grad_norm": 5.966921308745028, "learning_rate": 2.2048713106186737e-07, "logits/chosen": -0.8204070329666138, "logits/rejected": -3.1629364490509033, "logps/chosen": -7.831719875335693, "logps/rejected": -28.17206573486328, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": -0.04579088091850281, "rewards/margins": 6.21815299987793, "rewards/rejected": -6.263944149017334, "step": 689 }, { "epoch": 11.694915254237289, "grad_norm": 5.50353479601336, "learning_rate": 2.197528862656871e-07, "logits/chosen": -2.501692533493042, "logits/rejected": -3.2817344665527344, "logps/chosen": -7.619758605957031, "logps/rejected": -18.473947525024414, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 0.3159796893596649, "rewards/margins": 3.803527355194092, "rewards/rejected": -3.4875473976135254, "step": 690 }, { "epoch": 11.711864406779661, "grad_norm": 5.440614178576993, "learning_rate": 2.190189061575569e-07, "logits/chosen": -3.884366512298584, "logits/rejected": -6.531481742858887, "logps/chosen": -6.7981486320495605, "logps/rejected": -24.70162582397461, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": 0.21428728103637695, "rewards/margins": 5.436726093292236, "rewards/rejected": -5.222438812255859, "step": 691 }, { "epoch": 11.728813559322035, "grad_norm": 4.6174992967015385, "learning_rate": 2.1828519716042886e-07, "logits/chosen": -5.7983717918396, "logits/rejected": -4.691634178161621, "logps/chosen": -9.15369987487793, "logps/rejected": -23.07421875, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 0.2911912202835083, "rewards/margins": 4.7826385498046875, "rewards/rejected": -4.491447448730469, "step": 692 }, { "epoch": 11.745762711864407, "grad_norm": 5.455493758962524, "learning_rate": 2.1755176569488273e-07, "logits/chosen": -1.9908509254455566, "logits/rejected": 1.1757967472076416, "logps/chosen": -8.67979621887207, "logps/rejected": -19.0824031829834, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": 0.4927636981010437, "rewards/margins": 4.18226432800293, "rewards/rejected": -3.6895008087158203, "step": 693 }, { "epoch": 11.76271186440678, "grad_norm": 6.939821229573557, "learning_rate": 2.168186181790695e-07, "logits/chosen": -2.7588908672332764, "logits/rejected": -3.0735645294189453, "logps/chosen": -8.97663688659668, "logps/rejected": -26.288864135742188, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": 0.5594474673271179, "rewards/margins": 5.872201919555664, "rewards/rejected": -5.312753677368164, "step": 694 }, { "epoch": 11.779661016949152, "grad_norm": 6.5447294626149315, "learning_rate": 2.1608576102865547e-07, "logits/chosen": -3.781676769256592, "logits/rejected": -3.6431427001953125, "logps/chosen": -11.655670166015625, "logps/rejected": -20.786060333251953, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": -0.08528602868318558, "rewards/margins": 3.7099151611328125, "rewards/rejected": -3.795201301574707, "step": 695 }, { "epoch": 11.796610169491526, "grad_norm": 5.9844635273883995, "learning_rate": 2.1535320065676578e-07, "logits/chosen": -2.6350269317626953, "logits/rejected": -1.517306923866272, "logps/chosen": -9.438491821289062, "logps/rejected": -23.236515045166016, "loss": 0.0745, "rewards/accuracies": 1.0, "rewards/chosen": 0.21704833209514618, "rewards/margins": 5.231599807739258, "rewards/rejected": -5.014551639556885, "step": 696 }, { "epoch": 11.813559322033898, "grad_norm": 6.08157149813121, "learning_rate": 2.1462094347392884e-07, "logits/chosen": -5.0445051193237305, "logits/rejected": -4.414592742919922, "logps/chosen": -9.716800689697266, "logps/rejected": -20.61695098876953, "loss": 0.0742, "rewards/accuracies": 0.9375, "rewards/chosen": 0.30450063943862915, "rewards/margins": 4.38283634185791, "rewards/rejected": -4.078335762023926, "step": 697 }, { "epoch": 11.830508474576272, "grad_norm": 5.443804980328509, "learning_rate": 2.1388899588801963e-07, "logits/chosen": -6.142149925231934, "logits/rejected": -6.617219924926758, "logps/chosen": -9.288625717163086, "logps/rejected": -21.615276336669922, "loss": 0.0612, "rewards/accuracies": 1.0, "rewards/chosen": -0.029895126819610596, "rewards/margins": 4.5009918212890625, "rewards/rejected": -4.530886650085449, "step": 698 }, { "epoch": 11.847457627118644, "grad_norm": 5.261191623846564, "learning_rate": 2.131573643042039e-07, "logits/chosen": -2.0807480812072754, "logits/rejected": -5.6200079917907715, "logps/chosen": -9.350138664245605, "logps/rejected": -24.03493309020996, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 0.460003137588501, "rewards/margins": 4.993373394012451, "rewards/rejected": -4.533370018005371, "step": 699 }, { "epoch": 11.864406779661017, "grad_norm": 8.431810204613281, "learning_rate": 2.1242605512488245e-07, "logits/chosen": -2.674501895904541, "logits/rejected": -2.5828213691711426, "logps/chosen": -8.736321449279785, "logps/rejected": -21.578317642211914, "loss": 0.06, "rewards/accuracies": 1.0, "rewards/chosen": 0.9485386610031128, "rewards/margins": 4.861410140991211, "rewards/rejected": -3.9128713607788086, "step": 700 }, { "epoch": 11.88135593220339, "grad_norm": 6.151330495250116, "learning_rate": 2.116950747496342e-07, "logits/chosen": -5.063408851623535, "logits/rejected": -3.6914243698120117, "logps/chosen": -10.055356979370117, "logps/rejected": -25.340904235839844, "loss": 0.0794, "rewards/accuracies": 1.0, "rewards/chosen": 0.1456144005060196, "rewards/margins": 5.068149089813232, "rewards/rejected": -4.922534465789795, "step": 701 }, { "epoch": 11.898305084745763, "grad_norm": 5.355508219261581, "learning_rate": 2.1096442957516116e-07, "logits/chosen": -3.058699131011963, "logits/rejected": -3.3943307399749756, "logps/chosen": -7.250519275665283, "logps/rejected": -17.219770431518555, "loss": 0.0737, "rewards/accuracies": 1.0, "rewards/chosen": 0.6987836956977844, "rewards/margins": 3.889528751373291, "rewards/rejected": -3.190744638442993, "step": 702 }, { "epoch": 11.915254237288135, "grad_norm": 6.202575722704129, "learning_rate": 2.10234125995232e-07, "logits/chosen": -2.8255527019500732, "logits/rejected": -4.576816082000732, "logps/chosen": -5.164361000061035, "logps/rejected": -15.157373428344727, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": 0.5878112316131592, "rewards/margins": 3.485034227371216, "rewards/rejected": -2.8972229957580566, "step": 703 }, { "epoch": 11.932203389830509, "grad_norm": 5.559092253894643, "learning_rate": 2.0950417040062607e-07, "logits/chosen": -2.9859113693237305, "logits/rejected": -2.63259220123291, "logps/chosen": -7.869375705718994, "logps/rejected": -16.07195281982422, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 0.16196058690547943, "rewards/margins": 3.7745909690856934, "rewards/rejected": -3.612630605697632, "step": 704 }, { "epoch": 11.94915254237288, "grad_norm": 5.674022553504681, "learning_rate": 2.0877456917907757e-07, "logits/chosen": -2.325315475463867, "logits/rejected": -2.9422802925109863, "logps/chosen": -11.614786148071289, "logps/rejected": -21.07794952392578, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": 0.35842809081077576, "rewards/margins": 4.692770481109619, "rewards/rejected": -4.334342002868652, "step": 705 }, { "epoch": 11.966101694915254, "grad_norm": 5.541880018229243, "learning_rate": 2.0804532871521957e-07, "logits/chosen": -4.538088798522949, "logits/rejected": -2.091925621032715, "logps/chosen": -7.1544880867004395, "logps/rejected": -23.85821533203125, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 0.29876646399497986, "rewards/margins": 4.888092041015625, "rewards/rejected": -4.589325904846191, "step": 706 }, { "epoch": 11.983050847457626, "grad_norm": 6.255400445675394, "learning_rate": 2.0731645539052842e-07, "logits/chosen": -4.6718974113464355, "logits/rejected": -4.809595584869385, "logps/chosen": -6.028842926025391, "logps/rejected": -19.23764419555664, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8090718388557434, "rewards/margins": 5.258846282958984, "rewards/rejected": -4.449774265289307, "step": 707 }, { "epoch": 12.0, "grad_norm": 5.115387616418879, "learning_rate": 2.065879555832674e-07, "logits/chosen": -7.058655738830566, "logits/rejected": -5.943211555480957, "logps/chosen": -8.043971061706543, "logps/rejected": -24.434711456298828, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 0.44858622550964355, "rewards/margins": 5.742889881134033, "rewards/rejected": -5.294303894042969, "step": 708 }, { "epoch": 12.016949152542374, "grad_norm": 5.263119162726668, "learning_rate": 2.0585983566843142e-07, "logits/chosen": -4.840389728546143, "logits/rejected": -2.096611499786377, "logps/chosen": -7.987666130065918, "logps/rejected": -25.624900817871094, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 0.5033466815948486, "rewards/margins": 6.154224872589111, "rewards/rejected": -5.65087890625, "step": 709 }, { "epoch": 12.033898305084746, "grad_norm": 5.453331088324317, "learning_rate": 2.0513210201769083e-07, "logits/chosen": -2.46553897857666, "logits/rejected": -2.2434611320495605, "logps/chosen": -9.034485816955566, "logps/rejected": -20.003265380859375, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 0.40343374013900757, "rewards/margins": 4.3479838371276855, "rewards/rejected": -3.944549560546875, "step": 710 }, { "epoch": 12.05084745762712, "grad_norm": 6.185812331075285, "learning_rate": 2.0440476099933602e-07, "logits/chosen": -10.844772338867188, "logits/rejected": -7.556892395019531, "logps/chosen": -10.017423629760742, "logps/rejected": -15.887184143066406, "loss": 0.0675, "rewards/accuracies": 1.0, "rewards/chosen": 0.8177813291549683, "rewards/margins": 3.716082811355591, "rewards/rejected": -2.898301362991333, "step": 711 }, { "epoch": 12.067796610169491, "grad_norm": 6.247377514005049, "learning_rate": 2.0367781897822144e-07, "logits/chosen": -4.39729118347168, "logits/rejected": -2.3856217861175537, "logps/chosen": -9.275962829589844, "logps/rejected": -21.623519897460938, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": -0.19558584690093994, "rewards/margins": 4.383027076721191, "rewards/rejected": -4.578612327575684, "step": 712 }, { "epoch": 12.084745762711865, "grad_norm": 5.032949557222304, "learning_rate": 2.0295128231570984e-07, "logits/chosen": -1.5685780048370361, "logits/rejected": -4.263853549957275, "logps/chosen": -7.182946681976318, "logps/rejected": -22.17525863647461, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 0.3165295720100403, "rewards/margins": 5.067912578582764, "rewards/rejected": -4.751383304595947, "step": 713 }, { "epoch": 12.101694915254237, "grad_norm": 5.183886724119971, "learning_rate": 2.0222515736961692e-07, "logits/chosen": -4.001754283905029, "logits/rejected": -0.3880186378955841, "logps/chosen": -11.332290649414062, "logps/rejected": -31.85840606689453, "loss": 0.0617, "rewards/accuracies": 1.0, "rewards/chosen": -0.14374534785747528, "rewards/margins": 6.776453971862793, "rewards/rejected": -6.920198917388916, "step": 714 }, { "epoch": 12.11864406779661, "grad_norm": 6.1280771227485165, "learning_rate": 2.0149945049415546e-07, "logits/chosen": -2.7417237758636475, "logits/rejected": -3.603642225265503, "logps/chosen": -7.821854591369629, "logps/rejected": -17.309070587158203, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.09123823791742325, "rewards/margins": 3.7358312606811523, "rewards/rejected": -3.64459228515625, "step": 715 }, { "epoch": 12.135593220338983, "grad_norm": 4.628880282204041, "learning_rate": 2.0077416803987963e-07, "logits/chosen": -3.0061495304107666, "logits/rejected": -0.7661735415458679, "logps/chosen": -10.945513725280762, "logps/rejected": -21.080097198486328, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": 0.4002856910228729, "rewards/margins": 4.950218677520752, "rewards/rejected": -4.549932956695557, "step": 716 }, { "epoch": 12.152542372881356, "grad_norm": 5.481576880897221, "learning_rate": 2.0004931635362982e-07, "logits/chosen": -6.2298903465271, "logits/rejected": -4.165480136871338, "logps/chosen": -7.415446758270264, "logps/rejected": -15.942654609680176, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 0.5828918218612671, "rewards/margins": 3.7826273441314697, "rewards/rejected": -3.199735403060913, "step": 717 }, { "epoch": 12.169491525423728, "grad_norm": 6.912292590497126, "learning_rate": 1.993249017784766e-07, "logits/chosen": -2.234239101409912, "logits/rejected": -5.491997241973877, "logps/chosen": -9.92994213104248, "logps/rejected": -25.07433319091797, "loss": 0.0734, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20779605209827423, "rewards/margins": 4.874294757843018, "rewards/rejected": -5.082091331481934, "step": 718 }, { "epoch": 12.186440677966102, "grad_norm": 5.57790520194197, "learning_rate": 1.9860093065366557e-07, "logits/chosen": -4.336525917053223, "logits/rejected": -4.3841552734375, "logps/chosen": -8.806467056274414, "logps/rejected": -17.178787231445312, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 0.4746827781200409, "rewards/margins": 4.141698360443115, "rewards/rejected": -3.667015552520752, "step": 719 }, { "epoch": 12.203389830508474, "grad_norm": 4.367463057983305, "learning_rate": 1.9787740931456164e-07, "logits/chosen": -4.236856937408447, "logits/rejected": -2.3356170654296875, "logps/chosen": -8.016395568847656, "logps/rejected": -24.094526290893555, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": -0.06415438652038574, "rewards/margins": 5.513944149017334, "rewards/rejected": -5.578098773956299, "step": 720 }, { "epoch": 12.220338983050848, "grad_norm": 4.980211475796958, "learning_rate": 1.971543440925939e-07, "logits/chosen": -2.7962565422058105, "logits/rejected": -1.1794178485870361, "logps/chosen": -7.825432300567627, "logps/rejected": -19.563974380493164, "loss": 0.0525, "rewards/accuracies": 1.0, "rewards/chosen": 0.6326717138290405, "rewards/margins": 4.928226947784424, "rewards/rejected": -4.2955546379089355, "step": 721 }, { "epoch": 12.23728813559322, "grad_norm": 5.0620318025271365, "learning_rate": 1.9643174131519984e-07, "logits/chosen": -2.530458450317383, "logits/rejected": -3.3694586753845215, "logps/chosen": -6.862641334533691, "logps/rejected": -18.251625061035156, "loss": 0.0549, "rewards/accuracies": 1.0, "rewards/chosen": 0.48348268866539, "rewards/margins": 4.146088600158691, "rewards/rejected": -3.6626057624816895, "step": 722 }, { "epoch": 12.254237288135593, "grad_norm": 4.935925973312889, "learning_rate": 1.9570960730577032e-07, "logits/chosen": -3.856574058532715, "logits/rejected": -2.0448527336120605, "logps/chosen": -10.856523513793945, "logps/rejected": -23.267667770385742, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": 0.4581919014453888, "rewards/margins": 5.144651412963867, "rewards/rejected": -4.686459541320801, "step": 723 }, { "epoch": 12.271186440677965, "grad_norm": 5.192261425939682, "learning_rate": 1.949879483835939e-07, "logits/chosen": -5.752046585083008, "logits/rejected": -4.203609943389893, "logps/chosen": -6.465901851654053, "logps/rejected": -17.767559051513672, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": 0.44363754987716675, "rewards/margins": 4.3280720710754395, "rewards/rejected": -3.8844351768493652, "step": 724 }, { "epoch": 12.288135593220339, "grad_norm": 6.164340215886749, "learning_rate": 1.9426677086380183e-07, "logits/chosen": -4.548156261444092, "logits/rejected": -4.889017581939697, "logps/chosen": -7.712782859802246, "logps/rejected": -18.098644256591797, "loss": 0.0855, "rewards/accuracies": 1.0, "rewards/chosen": 0.684238612651825, "rewards/margins": 4.058871269226074, "rewards/rejected": -3.3746328353881836, "step": 725 }, { "epoch": 12.305084745762711, "grad_norm": 5.311974710959193, "learning_rate": 1.9354608105731267e-07, "logits/chosen": -2.2404370307922363, "logits/rejected": -4.380362033843994, "logps/chosen": -9.613097190856934, "logps/rejected": -27.764812469482422, "loss": 0.0674, "rewards/accuracies": 1.0, "rewards/chosen": -0.3453305661678314, "rewards/margins": 6.031594753265381, "rewards/rejected": -6.376925468444824, "step": 726 }, { "epoch": 12.322033898305085, "grad_norm": 5.378269931109981, "learning_rate": 1.9282588527077713e-07, "logits/chosen": -2.930131435394287, "logits/rejected": -1.716202735900879, "logps/chosen": -10.826783180236816, "logps/rejected": -19.09994125366211, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 0.4629979431629181, "rewards/margins": 4.1832051277160645, "rewards/rejected": -3.7202072143554688, "step": 727 }, { "epoch": 12.338983050847457, "grad_norm": 5.279035335530526, "learning_rate": 1.9210618980652273e-07, "logits/chosen": -2.356511354446411, "logits/rejected": -2.470968246459961, "logps/chosen": -7.703697204589844, "logps/rejected": -24.5485897064209, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": -0.13995756208896637, "rewards/margins": 5.8754167556762695, "rewards/rejected": -6.015374660491943, "step": 728 }, { "epoch": 12.35593220338983, "grad_norm": 5.022723359498386, "learning_rate": 1.9138700096249883e-07, "logits/chosen": -5.711466312408447, "logits/rejected": -1.9884896278381348, "logps/chosen": -11.275728225708008, "logps/rejected": -26.373491287231445, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 0.19736520946025848, "rewards/margins": 5.823462963104248, "rewards/rejected": -5.626098155975342, "step": 729 }, { "epoch": 12.372881355932204, "grad_norm": 5.309462197313035, "learning_rate": 1.9066832503222128e-07, "logits/chosen": -4.810001373291016, "logits/rejected": 0.02377176284790039, "logps/chosen": -11.607244491577148, "logps/rejected": -23.0208740234375, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 0.31827598810195923, "rewards/margins": 4.640523433685303, "rewards/rejected": -4.32224702835083, "step": 730 }, { "epoch": 12.389830508474576, "grad_norm": 4.6506237974520666, "learning_rate": 1.899501683047177e-07, "logits/chosen": -6.378528118133545, "logits/rejected": -8.353179931640625, "logps/chosen": -8.333313941955566, "logps/rejected": -23.246667861938477, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 0.32500994205474854, "rewards/margins": 5.235971927642822, "rewards/rejected": -4.910961151123047, "step": 731 }, { "epoch": 12.40677966101695, "grad_norm": 5.738231213417738, "learning_rate": 1.892325370644721e-07, "logits/chosen": -6.588768005371094, "logits/rejected": -4.150164604187012, "logps/chosen": -10.492864608764648, "logps/rejected": -16.70100212097168, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 0.6345978379249573, "rewards/margins": 3.688321590423584, "rewards/rejected": -3.0537235736846924, "step": 732 }, { "epoch": 12.423728813559322, "grad_norm": 3.9687270061891198, "learning_rate": 1.8851543759137007e-07, "logits/chosen": -8.90609359741211, "logits/rejected": -6.525782108306885, "logps/chosen": -7.384334087371826, "logps/rejected": -19.212717056274414, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 0.3278910517692566, "rewards/margins": 5.070918083190918, "rewards/rejected": -4.743027687072754, "step": 733 }, { "epoch": 12.440677966101696, "grad_norm": 4.6943629699640725, "learning_rate": 1.8779887616064382e-07, "logits/chosen": -5.408133029937744, "logits/rejected": -2.645357608795166, "logps/chosen": -10.299897193908691, "logps/rejected": -20.829387664794922, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 0.3884427547454834, "rewards/margins": 4.583587169647217, "rewards/rejected": -4.195144176483154, "step": 734 }, { "epoch": 12.457627118644067, "grad_norm": 5.171698330959028, "learning_rate": 1.8708285904281712e-07, "logits/chosen": -4.199699878692627, "logits/rejected": -2.1007070541381836, "logps/chosen": -7.45955228805542, "logps/rejected": -18.949785232543945, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": 0.09868350625038147, "rewards/margins": 4.4975457191467285, "rewards/rejected": -4.398862361907959, "step": 735 }, { "epoch": 12.474576271186441, "grad_norm": 5.075833146805721, "learning_rate": 1.8636739250365056e-07, "logits/chosen": -4.879715919494629, "logits/rejected": -2.4047141075134277, "logps/chosen": -11.846234321594238, "logps/rejected": -20.621950149536133, "loss": 0.0591, "rewards/accuracies": 1.0, "rewards/chosen": 0.5136569142341614, "rewards/margins": 4.873780250549316, "rewards/rejected": -4.360123634338379, "step": 736 }, { "epoch": 12.491525423728813, "grad_norm": 4.994169132268942, "learning_rate": 1.8565248280408698e-07, "logits/chosen": -0.9355611801147461, "logits/rejected": -2.9057836532592773, "logps/chosen": -8.956902503967285, "logps/rejected": -23.145809173583984, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": -0.0478157103061676, "rewards/margins": 4.980042457580566, "rewards/rejected": -5.027858257293701, "step": 737 }, { "epoch": 12.508474576271187, "grad_norm": 4.615304739621222, "learning_rate": 1.8493813620019595e-07, "logits/chosen": -6.629713535308838, "logits/rejected": -4.551311492919922, "logps/chosen": -11.932062149047852, "logps/rejected": -27.589099884033203, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": -0.35736513137817383, "rewards/margins": 5.075533866882324, "rewards/rejected": -5.432898998260498, "step": 738 }, { "epoch": 12.525423728813559, "grad_norm": 4.899248416753311, "learning_rate": 1.8422435894311973e-07, "logits/chosen": -7.817357540130615, "logits/rejected": -6.282201766967773, "logps/chosen": -7.510641574859619, "logps/rejected": -20.142223358154297, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 0.23153254389762878, "rewards/margins": 4.454207897186279, "rewards/rejected": -4.222675323486328, "step": 739 }, { "epoch": 12.542372881355933, "grad_norm": 4.572546849568084, "learning_rate": 1.8351115727901829e-07, "logits/chosen": -3.3442132472991943, "logits/rejected": -3.23862886428833, "logps/chosen": -8.925397872924805, "logps/rejected": -25.86237907409668, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 0.3724052906036377, "rewards/margins": 5.9618425369262695, "rewards/rejected": -5.589437007904053, "step": 740 }, { "epoch": 12.559322033898304, "grad_norm": 5.890950067266909, "learning_rate": 1.8279853744901464e-07, "logits/chosen": -7.5882344245910645, "logits/rejected": -7.436162948608398, "logps/chosen": -9.983795166015625, "logps/rejected": -19.23735237121582, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 0.12066636979579926, "rewards/margins": 3.7871222496032715, "rewards/rejected": -3.6664562225341797, "step": 741 }, { "epoch": 12.576271186440678, "grad_norm": 5.125645184008797, "learning_rate": 1.8208650568914033e-07, "logits/chosen": -6.591865062713623, "logits/rejected": -5.549574851989746, "logps/chosen": -10.08639907836914, "logps/rejected": -19.847394943237305, "loss": 0.0567, "rewards/accuracies": 1.0, "rewards/chosen": 0.5254420638084412, "rewards/margins": 3.9040541648864746, "rewards/rejected": -3.3786120414733887, "step": 742 }, { "epoch": 12.59322033898305, "grad_norm": 5.0651911831079, "learning_rate": 1.8137506823028065e-07, "logits/chosen": -5.7834367752075195, "logits/rejected": -3.119711399078369, "logps/chosen": -10.867972373962402, "logps/rejected": -15.676458358764648, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.8671466112136841, "rewards/margins": 3.3360648155212402, "rewards/rejected": -2.4689180850982666, "step": 743 }, { "epoch": 12.610169491525424, "grad_norm": 5.631498137773476, "learning_rate": 1.8066423129812026e-07, "logits/chosen": -2.676837205886841, "logits/rejected": -1.9061572551727295, "logps/chosen": -11.207535743713379, "logps/rejected": -25.026824951171875, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 0.09902028739452362, "rewards/margins": 5.083338260650635, "rewards/rejected": -4.984318256378174, "step": 744 }, { "epoch": 12.627118644067796, "grad_norm": 5.140393808106211, "learning_rate": 1.7995400111308883e-07, "logits/chosen": -5.798111438751221, "logits/rejected": -4.84065580368042, "logps/chosen": -10.029037475585938, "logps/rejected": -19.738197326660156, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 0.39238882064819336, "rewards/margins": 4.5251593589782715, "rewards/rejected": -4.132770538330078, "step": 745 }, { "epoch": 12.64406779661017, "grad_norm": 5.651054369991429, "learning_rate": 1.7924438389030648e-07, "logits/chosen": -5.302846431732178, "logits/rejected": -3.5109519958496094, "logps/chosen": -12.664636611938477, "logps/rejected": -25.82659912109375, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": -0.18043912947177887, "rewards/margins": 5.055805206298828, "rewards/rejected": -5.236244201660156, "step": 746 }, { "epoch": 12.661016949152543, "grad_norm": 5.632292139453602, "learning_rate": 1.785353858395292e-07, "logits/chosen": -6.3755574226379395, "logits/rejected": -6.289969444274902, "logps/chosen": -9.670694351196289, "logps/rejected": -17.61966896057129, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 0.5971437692642212, "rewards/margins": 3.45253849029541, "rewards/rejected": -2.8553946018218994, "step": 747 }, { "epoch": 12.677966101694915, "grad_norm": 5.14742573667959, "learning_rate": 1.7782701316509478e-07, "logits/chosen": -5.7314066886901855, "logits/rejected": -3.3766136169433594, "logps/chosen": -10.241357803344727, "logps/rejected": -22.874204635620117, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": 0.4667729139328003, "rewards/margins": 5.620718479156494, "rewards/rejected": -5.1539459228515625, "step": 748 }, { "epoch": 12.694915254237289, "grad_norm": 5.452805903306714, "learning_rate": 1.7711927206586853e-07, "logits/chosen": -7.6883649826049805, "logits/rejected": -8.14977741241455, "logps/chosen": -10.177472114562988, "logps/rejected": -21.427345275878906, "loss": 0.063, "rewards/accuracies": 1.0, "rewards/chosen": 0.34390830993652344, "rewards/margins": 4.701626300811768, "rewards/rejected": -4.357717990875244, "step": 749 }, { "epoch": 12.711864406779661, "grad_norm": 4.870367464314958, "learning_rate": 1.7641216873518876e-07, "logits/chosen": -2.2180004119873047, "logits/rejected": -2.018139123916626, "logps/chosen": -8.882790565490723, "logps/rejected": -25.841060638427734, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 0.22583720088005066, "rewards/margins": 5.492011070251465, "rewards/rejected": -5.26617431640625, "step": 750 }, { "epoch": 12.728813559322035, "grad_norm": 5.243426061747342, "learning_rate": 1.7570570936081306e-07, "logits/chosen": -9.123380661010742, "logits/rejected": -8.786087036132812, "logps/chosen": -8.865097999572754, "logps/rejected": -18.413890838623047, "loss": 0.0664, "rewards/accuracies": 1.0, "rewards/chosen": 0.4756356179714203, "rewards/margins": 4.1750288009643555, "rewards/rejected": -3.6993932723999023, "step": 751 }, { "epoch": 12.745762711864407, "grad_norm": 6.5035914338241145, "learning_rate": 1.7499990012486348e-07, "logits/chosen": -4.059915542602539, "logits/rejected": -4.546253681182861, "logps/chosen": -8.72022819519043, "logps/rejected": -30.666339874267578, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": -0.05255214869976044, "rewards/margins": 5.699767112731934, "rewards/rejected": -5.7523193359375, "step": 752 }, { "epoch": 12.76271186440678, "grad_norm": 5.570786457572866, "learning_rate": 1.7429474720377312e-07, "logits/chosen": -7.756892204284668, "logits/rejected": -6.753458023071289, "logps/chosen": -8.187812805175781, "logps/rejected": -15.387481689453125, "loss": 0.0686, "rewards/accuracies": 1.0, "rewards/chosen": 0.4039037227630615, "rewards/margins": 3.7153241634368896, "rewards/rejected": -3.311420202255249, "step": 753 }, { "epoch": 12.779661016949152, "grad_norm": 6.154811299294534, "learning_rate": 1.735902567682315e-07, "logits/chosen": -2.9708542823791504, "logits/rejected": 0.3598502278327942, "logps/chosen": -11.489412307739258, "logps/rejected": -21.9397029876709, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 0.35112035274505615, "rewards/margins": 4.90283203125, "rewards/rejected": -4.551711559295654, "step": 754 }, { "epoch": 12.796610169491526, "grad_norm": 5.353849373734212, "learning_rate": 1.7288643498313104e-07, "logits/chosen": -6.833067417144775, "logits/rejected": -6.609170913696289, "logps/chosen": -9.3986177444458, "logps/rejected": -21.662946701049805, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": 0.4572463035583496, "rewards/margins": 4.598602294921875, "rewards/rejected": -4.141356468200684, "step": 755 }, { "epoch": 12.813559322033898, "grad_norm": 5.19613647792081, "learning_rate": 1.7218328800751285e-07, "logits/chosen": -6.809295177459717, "logits/rejected": -9.63763427734375, "logps/chosen": -11.164754867553711, "logps/rejected": -22.834970474243164, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": 0.1647118628025055, "rewards/margins": 4.19931697845459, "rewards/rejected": -4.034604549407959, "step": 756 }, { "epoch": 12.830508474576272, "grad_norm": 5.032797215130399, "learning_rate": 1.7148082199451286e-07, "logits/chosen": -2.553732395172119, "logits/rejected": -2.5321171283721924, "logps/chosen": -7.590460777282715, "logps/rejected": -25.94837188720703, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": 0.32335829734802246, "rewards/margins": 5.733863353729248, "rewards/rejected": -5.4105048179626465, "step": 757 }, { "epoch": 12.847457627118644, "grad_norm": 5.461302763721142, "learning_rate": 1.7077904309130782e-07, "logits/chosen": -5.5044026374816895, "logits/rejected": -3.5391619205474854, "logps/chosen": -8.875229835510254, "logps/rejected": -20.904315948486328, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.3172765374183655, "rewards/margins": 4.838804721832275, "rewards/rejected": -4.521528244018555, "step": 758 }, { "epoch": 12.864406779661017, "grad_norm": 5.630964578420358, "learning_rate": 1.7007795743906194e-07, "logits/chosen": -5.3108906745910645, "logits/rejected": -5.781566619873047, "logps/chosen": -8.304010391235352, "logps/rejected": -21.03519058227539, "loss": 0.0672, "rewards/accuracies": 1.0, "rewards/chosen": 0.25181227922439575, "rewards/margins": 4.968446254730225, "rewards/rejected": -4.716634750366211, "step": 759 }, { "epoch": 12.88135593220339, "grad_norm": 5.170697641451785, "learning_rate": 1.6937757117287276e-07, "logits/chosen": -6.395049095153809, "logits/rejected": -5.568882942199707, "logps/chosen": -12.260860443115234, "logps/rejected": -17.58432388305664, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 0.49597105383872986, "rewards/margins": 3.307673692703247, "rewards/rejected": -2.811702251434326, "step": 760 }, { "epoch": 12.898305084745763, "grad_norm": 5.63554625757226, "learning_rate": 1.6867789042171777e-07, "logits/chosen": -3.965508460998535, "logits/rejected": -3.2860910892486572, "logps/chosen": -11.189289093017578, "logps/rejected": -24.705078125, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/chosen": -0.03508012741804123, "rewards/margins": 5.3851799964904785, "rewards/rejected": -5.420260906219482, "step": 761 }, { "epoch": 12.915254237288135, "grad_norm": 5.826484833366724, "learning_rate": 1.6797892130840036e-07, "logits/chosen": -10.473329544067383, "logits/rejected": -9.278584480285645, "logps/chosen": -12.040523529052734, "logps/rejected": -23.865997314453125, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 0.543164849281311, "rewards/margins": 4.920264720916748, "rewards/rejected": -4.377099514007568, "step": 762 }, { "epoch": 12.932203389830509, "grad_norm": 5.169064532647496, "learning_rate": 1.6728066994949658e-07, "logits/chosen": -4.377648830413818, "logits/rejected": -4.380948543548584, "logps/chosen": -8.87318229675293, "logps/rejected": -22.197044372558594, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": 0.24227860569953918, "rewards/margins": 5.380238056182861, "rewards/rejected": -5.1379594802856445, "step": 763 }, { "epoch": 12.94915254237288, "grad_norm": 4.626923336921723, "learning_rate": 1.6658314245530148e-07, "logits/chosen": -3.902984619140625, "logits/rejected": -0.4850635528564453, "logps/chosen": -11.315608978271484, "logps/rejected": -27.440704345703125, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 0.050329677760601044, "rewards/margins": 6.103644847869873, "rewards/rejected": -6.053314685821533, "step": 764 }, { "epoch": 12.966101694915254, "grad_norm": 5.36856823765751, "learning_rate": 1.6588634492977582e-07, "logits/chosen": -6.079492568969727, "logits/rejected": -4.708076477050781, "logps/chosen": -12.596566200256348, "logps/rejected": -27.42017936706543, "loss": 0.0643, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2698909044265747, "rewards/margins": 5.405095100402832, "rewards/rejected": -5.135204315185547, "step": 765 }, { "epoch": 12.983050847457626, "grad_norm": 6.43088486955128, "learning_rate": 1.651902834704924e-07, "logits/chosen": -6.696640491485596, "logits/rejected": -3.6787824630737305, "logps/chosen": -8.136153221130371, "logps/rejected": -16.924489974975586, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 0.07825213670730591, "rewards/margins": 3.4701426029205322, "rewards/rejected": -3.391890525817871, "step": 766 }, { "epoch": 13.0, "grad_norm": 5.831494815474743, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -5.741722106933594, "logits/rejected": -3.713162422180176, "logps/chosen": -11.334854125976562, "logps/rejected": -20.919025421142578, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": 0.28780484199523926, "rewards/margins": 4.361874103546143, "rewards/rejected": -4.074069499969482, "step": 767 }, { "epoch": 13.016949152542374, "grad_norm": 5.025171216775976, "learning_rate": 1.6380039310868414e-07, "logits/chosen": -3.283559560775757, "logits/rejected": -3.4452576637268066, "logps/chosen": -10.504578590393066, "logps/rejected": -24.13515853881836, "loss": 0.0588, "rewards/accuracies": 1.0, "rewards/chosen": 0.18408998847007751, "rewards/margins": 4.807743072509766, "rewards/rejected": -4.623653411865234, "step": 768 }, { "epoch": 13.033898305084746, "grad_norm": 4.871931611746412, "learning_rate": 1.631065763688857e-07, "logits/chosen": -5.76885461807251, "logits/rejected": -5.4496684074401855, "logps/chosen": -7.35520601272583, "logps/rejected": -18.789472579956055, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 0.3069876730442047, "rewards/margins": 4.473510265350342, "rewards/rejected": -4.166522026062012, "step": 769 }, { "epoch": 13.05084745762712, "grad_norm": 4.8078397196063305, "learning_rate": 1.6241352002067588e-07, "logits/chosen": -5.301861763000488, "logits/rejected": -4.853543281555176, "logps/chosen": -9.35759162902832, "logps/rejected": -26.16400146484375, "loss": 0.0603, "rewards/accuracies": 1.0, "rewards/chosen": 0.29260993003845215, "rewards/margins": 5.623096942901611, "rewards/rejected": -5.33048677444458, "step": 770 }, { "epoch": 13.067796610169491, "grad_norm": 4.196532190439989, "learning_rate": 1.61721230128889e-07, "logits/chosen": -1.4309144020080566, "logits/rejected": -2.1246204376220703, "logps/chosen": -6.944146156311035, "logps/rejected": -26.82303237915039, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 0.12401877343654633, "rewards/margins": 6.203535079956055, "rewards/rejected": -6.079516410827637, "step": 771 }, { "epoch": 13.084745762711865, "grad_norm": 5.593577080714422, "learning_rate": 1.6102971275165227e-07, "logits/chosen": -9.79638957977295, "logits/rejected": -9.661079406738281, "logps/chosen": -8.487841606140137, "logps/rejected": -24.75509262084961, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": 0.19384168088436127, "rewards/margins": 5.573680400848389, "rewards/rejected": -5.379838943481445, "step": 772 }, { "epoch": 13.101694915254237, "grad_norm": 5.150797332106759, "learning_rate": 1.603389739403327e-07, "logits/chosen": -9.676387786865234, "logits/rejected": -6.1768341064453125, "logps/chosen": -8.65725326538086, "logps/rejected": -17.568424224853516, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 0.8734726309776306, "rewards/margins": 3.8431637287139893, "rewards/rejected": -2.969691276550293, "step": 773 }, { "epoch": 13.11864406779661, "grad_norm": 5.3467717221921935, "learning_rate": 1.5964901973948408e-07, "logits/chosen": -2.5845415592193604, "logits/rejected": -2.7042267322540283, "logps/chosen": -10.360690116882324, "logps/rejected": -22.25835418701172, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": 0.15665322542190552, "rewards/margins": 4.414241313934326, "rewards/rejected": -4.2575883865356445, "step": 774 }, { "epoch": 13.135593220338983, "grad_norm": 4.0651480960897945, "learning_rate": 1.5895985618679445e-07, "logits/chosen": -5.20015811920166, "logits/rejected": -5.799063205718994, "logps/chosen": -10.640718460083008, "logps/rejected": -29.960647583007812, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 0.23330990970134735, "rewards/margins": 6.547354698181152, "rewards/rejected": -6.314043998718262, "step": 775 }, { "epoch": 13.152542372881356, "grad_norm": 4.703853020014504, "learning_rate": 1.5827148931303275e-07, "logits/chosen": -2.3715322017669678, "logits/rejected": -3.2630908489227295, "logps/chosen": -9.838077545166016, "logps/rejected": -31.98811912536621, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -0.32375001907348633, "rewards/margins": 6.667496204376221, "rewards/rejected": -6.991246223449707, "step": 776 }, { "epoch": 13.169491525423728, "grad_norm": 5.442664426502708, "learning_rate": 1.5758392514199643e-07, "logits/chosen": -7.819540023803711, "logits/rejected": -10.007020950317383, "logps/chosen": -6.59602165222168, "logps/rejected": -20.028287887573242, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": 0.19013340771198273, "rewards/margins": 4.252629280090332, "rewards/rejected": -4.062495708465576, "step": 777 }, { "epoch": 13.186440677966102, "grad_norm": 4.768607649775109, "learning_rate": 1.5689716969045847e-07, "logits/chosen": -3.3728349208831787, "logits/rejected": 0.18725888431072235, "logps/chosen": -10.546842575073242, "logps/rejected": -24.771841049194336, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 0.25277644395828247, "rewards/margins": 5.527108192443848, "rewards/rejected": -5.274331569671631, "step": 778 }, { "epoch": 13.203389830508474, "grad_norm": 5.460483523807034, "learning_rate": 1.5621122896811522e-07, "logits/chosen": -4.294860363006592, "logits/rejected": -2.532308578491211, "logps/chosen": -9.680610656738281, "logps/rejected": -26.630155563354492, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 0.0907769426703453, "rewards/margins": 5.914505958557129, "rewards/rejected": -5.823729038238525, "step": 779 }, { "epoch": 13.220338983050848, "grad_norm": 5.4173458280819435, "learning_rate": 1.555261089775329e-07, "logits/chosen": -7.844461917877197, "logits/rejected": -6.5587477684021, "logps/chosen": -7.928609371185303, "logps/rejected": -20.538318634033203, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 0.5045410394668579, "rewards/margins": 4.878368854522705, "rewards/rejected": -4.373827934265137, "step": 780 }, { "epoch": 13.23728813559322, "grad_norm": 4.976900202865331, "learning_rate": 1.548418157140961e-07, "logits/chosen": -3.608536720275879, "logits/rejected": -6.020019054412842, "logps/chosen": -9.638221740722656, "logps/rejected": -27.089221954345703, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 0.0017676204442977905, "rewards/margins": 5.250937461853027, "rewards/rejected": -5.249169826507568, "step": 781 }, { "epoch": 13.254237288135593, "grad_norm": 4.686999391779152, "learning_rate": 1.5415835516595463e-07, "logits/chosen": -4.551198959350586, "logits/rejected": -5.998251914978027, "logps/chosen": -8.980722427368164, "logps/rejected": -18.155977249145508, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 0.38983237743377686, "rewards/margins": 4.47466516494751, "rewards/rejected": -4.084832191467285, "step": 782 }, { "epoch": 13.271186440677965, "grad_norm": 4.5955159595956605, "learning_rate": 1.5347573331397135e-07, "logits/chosen": -2.852198600769043, "logits/rejected": 1.1061744689941406, "logps/chosen": -14.022387504577637, "logps/rejected": -28.37848663330078, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 0.03788289427757263, "rewards/margins": 6.151963233947754, "rewards/rejected": -6.114079475402832, "step": 783 }, { "epoch": 13.288135593220339, "grad_norm": 5.598431066257892, "learning_rate": 1.5279395613166985e-07, "logits/chosen": -6.422479629516602, "logits/rejected": -2.8612544536590576, "logps/chosen": -10.72292709350586, "logps/rejected": -20.895212173461914, "loss": 0.0634, "rewards/accuracies": 1.0, "rewards/chosen": 0.2630663514137268, "rewards/margins": 4.2991042137146, "rewards/rejected": -4.036037921905518, "step": 784 }, { "epoch": 13.305084745762711, "grad_norm": 4.614906303156567, "learning_rate": 1.5211302958518214e-07, "logits/chosen": -5.681360244750977, "logits/rejected": -2.0222373008728027, "logps/chosen": -10.26196575164795, "logps/rejected": -23.793285369873047, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 0.12604331970214844, "rewards/margins": 5.187099933624268, "rewards/rejected": -5.061056137084961, "step": 785 }, { "epoch": 13.322033898305085, "grad_norm": 4.626071080438597, "learning_rate": 1.5143295963319642e-07, "logits/chosen": -10.177433013916016, "logits/rejected": -6.988361358642578, "logps/chosen": -8.759820938110352, "logps/rejected": -20.91160011291504, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 0.06292536109685898, "rewards/margins": 5.223330020904541, "rewards/rejected": -5.160404682159424, "step": 786 }, { "epoch": 13.338983050847457, "grad_norm": 4.643506672060413, "learning_rate": 1.5075375222690496e-07, "logits/chosen": -5.985132694244385, "logits/rejected": -4.864323139190674, "logps/chosen": -9.475852012634277, "logps/rejected": -23.88833999633789, "loss": 0.0482, "rewards/accuracies": 1.0, "rewards/chosen": 0.5428158044815063, "rewards/margins": 5.8675432205200195, "rewards/rejected": -5.3247270584106445, "step": 787 }, { "epoch": 13.35593220338983, "grad_norm": 5.251627791060617, "learning_rate": 1.5007541330995198e-07, "logits/chosen": -8.681631088256836, "logits/rejected": -5.984912395477295, "logps/chosen": -11.887125968933105, "logps/rejected": -21.75457763671875, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 0.10201403498649597, "rewards/margins": 3.9955077171325684, "rewards/rejected": -3.893493413925171, "step": 788 }, { "epoch": 13.372881355932204, "grad_norm": 5.767774243872824, "learning_rate": 1.4939794881838176e-07, "logits/chosen": -8.51830768585205, "logits/rejected": -5.052129745483398, "logps/chosen": -11.381778717041016, "logps/rejected": -18.73246955871582, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 0.6155125498771667, "rewards/margins": 3.5688974857330322, "rewards/rejected": -2.9533848762512207, "step": 789 }, { "epoch": 13.389830508474576, "grad_norm": 4.824365860620941, "learning_rate": 1.487213646805866e-07, "logits/chosen": -9.106437683105469, "logits/rejected": -6.022385597229004, "logps/chosen": -9.311746597290039, "logps/rejected": -12.84929084777832, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 0.3897091746330261, "rewards/margins": 3.2690494060516357, "rewards/rejected": -2.879340171813965, "step": 790 }, { "epoch": 13.40677966101695, "grad_norm": 5.650311473571556, "learning_rate": 1.4804566681725496e-07, "logits/chosen": -4.26154899597168, "logits/rejected": -3.876025915145874, "logps/chosen": -8.089516639709473, "logps/rejected": -18.416603088378906, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 0.020768344402313232, "rewards/margins": 4.610015392303467, "rewards/rejected": -4.589247226715088, "step": 791 }, { "epoch": 13.423728813559322, "grad_norm": 4.381684628603866, "learning_rate": 1.473708611413194e-07, "logits/chosen": -3.8422200679779053, "logits/rejected": -4.606509208679199, "logps/chosen": -8.617932319641113, "logps/rejected": -21.121320724487305, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 0.10309768468141556, "rewards/margins": 4.8339691162109375, "rewards/rejected": -4.730871200561523, "step": 792 }, { "epoch": 13.440677966101696, "grad_norm": 4.645592608030263, "learning_rate": 1.4669695355790552e-07, "logits/chosen": -6.06674861907959, "logits/rejected": -7.425795555114746, "logps/chosen": -12.599617004394531, "logps/rejected": -24.435813903808594, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 0.21700027585029602, "rewards/margins": 4.384744167327881, "rewards/rejected": -4.167743682861328, "step": 793 }, { "epoch": 13.457627118644067, "grad_norm": 4.489498968125063, "learning_rate": 1.4602394996427942e-07, "logits/chosen": -5.401136875152588, "logits/rejected": -4.132033348083496, "logps/chosen": -8.727059364318848, "logps/rejected": -16.557653427124023, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 0.43375176191329956, "rewards/margins": 3.8064019680023193, "rewards/rejected": -3.372650146484375, "step": 794 }, { "epoch": 13.474576271186441, "grad_norm": 5.383940537311471, "learning_rate": 1.4535185624979687e-07, "logits/chosen": -5.567165374755859, "logits/rejected": -4.549276828765869, "logps/chosen": -10.690811157226562, "logps/rejected": -23.260143280029297, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": -0.002038337290287018, "rewards/margins": 5.000219345092773, "rewards/rejected": -5.002257823944092, "step": 795 }, { "epoch": 13.491525423728813, "grad_norm": 4.796108266845068, "learning_rate": 1.4468067829585108e-07, "logits/chosen": -5.905200004577637, "logits/rejected": -6.398066997528076, "logps/chosen": -8.744976043701172, "logps/rejected": -25.000789642333984, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": 0.5944421291351318, "rewards/margins": 5.0441575050354, "rewards/rejected": -4.4497151374816895, "step": 796 }, { "epoch": 13.508474576271187, "grad_norm": 4.414074994930087, "learning_rate": 1.4401042197582192e-07, "logits/chosen": -4.122818946838379, "logits/rejected": -5.4599175453186035, "logps/chosen": -6.608936309814453, "logps/rejected": -23.324615478515625, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 0.14724817872047424, "rewards/margins": 5.297210216522217, "rewards/rejected": -5.149961471557617, "step": 797 }, { "epoch": 13.525423728813559, "grad_norm": 5.569789182924492, "learning_rate": 1.4334109315502392e-07, "logits/chosen": -8.031323432922363, "logits/rejected": -10.76991081237793, "logps/chosen": -10.417862892150879, "logps/rejected": -24.26569366455078, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -0.0844353437423706, "rewards/margins": 4.921995162963867, "rewards/rejected": -5.006430149078369, "step": 798 }, { "epoch": 13.542372881355933, "grad_norm": 4.99311530534324, "learning_rate": 1.4267269769065537e-07, "logits/chosen": -5.482168674468994, "logits/rejected": -2.70070743560791, "logps/chosen": -9.569720268249512, "logps/rejected": -21.80959129333496, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 0.5751017332077026, "rewards/margins": 5.077325344085693, "rewards/rejected": -4.502223968505859, "step": 799 }, { "epoch": 13.559322033898304, "grad_norm": 4.3955518518263155, "learning_rate": 1.4200524143174676e-07, "logits/chosen": -7.552706718444824, "logits/rejected": -4.6099114418029785, "logps/chosen": -10.882405281066895, "logps/rejected": -27.207590103149414, "loss": 0.0512, "rewards/accuracies": 0.9375, "rewards/chosen": -0.36488914489746094, "rewards/margins": 5.9867987632751465, "rewards/rejected": -6.351687431335449, "step": 800 }, { "epoch": 13.576271186440678, "grad_norm": 4.775382165631764, "learning_rate": 1.4133873021910976e-07, "logits/chosen": -1.3878110647201538, "logits/rejected": 1.4275829792022705, "logps/chosen": -9.222979545593262, "logps/rejected": -21.31915283203125, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": 0.2374023199081421, "rewards/margins": 4.305314540863037, "rewards/rejected": -4.0679121017456055, "step": 801 }, { "epoch": 13.59322033898305, "grad_norm": 5.569756855857142, "learning_rate": 1.4067316988528616e-07, "logits/chosen": -6.881744861602783, "logits/rejected": -3.4725799560546875, "logps/chosen": -11.406499862670898, "logps/rejected": -22.18549156188965, "loss": 0.0677, "rewards/accuracies": 1.0, "rewards/chosen": 0.6805269122123718, "rewards/margins": 5.206305027008057, "rewards/rejected": -4.525778293609619, "step": 802 }, { "epoch": 13.610169491525424, "grad_norm": 5.280571870724313, "learning_rate": 1.4000856625449664e-07, "logits/chosen": -9.837797164916992, "logits/rejected": -6.981907367706299, "logps/chosen": -10.905115127563477, "logps/rejected": -23.980911254882812, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 0.20386749505996704, "rewards/margins": 5.112873077392578, "rewards/rejected": -4.909005641937256, "step": 803 }, { "epoch": 13.627118644067796, "grad_norm": 4.716612003997788, "learning_rate": 1.3934492514259003e-07, "logits/chosen": -7.098214626312256, "logits/rejected": -7.429448127746582, "logps/chosen": -9.4953031539917, "logps/rejected": -22.437997817993164, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 0.6349058151245117, "rewards/margins": 5.2428460121154785, "rewards/rejected": -4.607940673828125, "step": 804 }, { "epoch": 13.64406779661017, "grad_norm": 5.461889345352294, "learning_rate": 1.3868225235699216e-07, "logits/chosen": -4.462307453155518, "logits/rejected": -4.318324565887451, "logps/chosen": -11.038957595825195, "logps/rejected": -25.736881256103516, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": -0.12135696411132812, "rewards/margins": 5.704156875610352, "rewards/rejected": -5.82551383972168, "step": 805 }, { "epoch": 13.661016949152543, "grad_norm": 4.634781728056408, "learning_rate": 1.3802055369665533e-07, "logits/chosen": -3.216710329055786, "logits/rejected": -1.861129641532898, "logps/chosen": -8.412116050720215, "logps/rejected": -21.057117462158203, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": 0.2881760001182556, "rewards/margins": 4.603650093078613, "rewards/rejected": -4.315474033355713, "step": 806 }, { "epoch": 13.677966101694915, "grad_norm": 4.989725061376435, "learning_rate": 1.373598349520073e-07, "logits/chosen": -6.761763572692871, "logits/rejected": -4.394807815551758, "logps/chosen": -6.5797576904296875, "logps/rejected": -18.811870574951172, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 0.08568145334720612, "rewards/margins": 4.4652605056762695, "rewards/rejected": -4.379578590393066, "step": 807 }, { "epoch": 13.694915254237289, "grad_norm": 4.680733186063038, "learning_rate": 1.3670010190490073e-07, "logits/chosen": -4.460152626037598, "logits/rejected": -1.1188685894012451, "logps/chosen": -13.548200607299805, "logps/rejected": -31.156145095825195, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": -0.218767911195755, "rewards/margins": 5.674424171447754, "rewards/rejected": -5.893191814422607, "step": 808 }, { "epoch": 13.711864406779661, "grad_norm": 4.663236624627772, "learning_rate": 1.3604136032856268e-07, "logits/chosen": -7.653531074523926, "logits/rejected": -5.643984794616699, "logps/chosen": -9.94079303741455, "logps/rejected": -27.503480911254883, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 0.43097323179244995, "rewards/margins": 5.7583465576171875, "rewards/rejected": -5.327373027801514, "step": 809 }, { "epoch": 13.728813559322035, "grad_norm": 4.730840953500074, "learning_rate": 1.3538361598754382e-07, "logits/chosen": -5.666280269622803, "logits/rejected": -3.015167474746704, "logps/chosen": -9.375484466552734, "logps/rejected": -27.635007858276367, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 0.13278841972351074, "rewards/margins": 6.769484519958496, "rewards/rejected": -6.636696815490723, "step": 810 }, { "epoch": 13.745762711864407, "grad_norm": 5.490668032046639, "learning_rate": 1.3472687463766848e-07, "logits/chosen": -4.392643928527832, "logits/rejected": -5.210497856140137, "logps/chosen": -9.478880882263184, "logps/rejected": -18.758119583129883, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 0.3752535581588745, "rewards/margins": 4.221700191497803, "rewards/rejected": -3.8464465141296387, "step": 811 }, { "epoch": 13.76271186440678, "grad_norm": 5.590368558372794, "learning_rate": 1.3407114202598368e-07, "logits/chosen": -6.054229259490967, "logits/rejected": -5.513613700866699, "logps/chosen": -7.454735279083252, "logps/rejected": -15.90285873413086, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": 0.13070914149284363, "rewards/margins": 3.5557241439819336, "rewards/rejected": -3.4250149726867676, "step": 812 }, { "epoch": 13.779661016949152, "grad_norm": 4.735779420046063, "learning_rate": 1.3341642389070926e-07, "logits/chosen": -4.318917751312256, "logits/rejected": -5.7969183921813965, "logps/chosen": -9.046194076538086, "logps/rejected": -22.119319915771484, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 0.45455262064933777, "rewards/margins": 4.551388263702393, "rewards/rejected": -4.096835613250732, "step": 813 }, { "epoch": 13.796610169491526, "grad_norm": 4.898977970422225, "learning_rate": 1.3276272596118728e-07, "logits/chosen": -5.6698713302612305, "logits/rejected": -6.0464863777160645, "logps/chosen": -14.398414611816406, "logps/rejected": -27.341615676879883, "loss": 0.0598, "rewards/accuracies": 1.0, "rewards/chosen": -0.03556087613105774, "rewards/margins": 5.173557758331299, "rewards/rejected": -5.209118843078613, "step": 814 }, { "epoch": 13.813559322033898, "grad_norm": 5.281083471645692, "learning_rate": 1.3211005395783244e-07, "logits/chosen": -7.419910430908203, "logits/rejected": -5.7298359870910645, "logps/chosen": -11.2025728225708, "logps/rejected": -25.59320068359375, "loss": 0.0676, "rewards/accuracies": 1.0, "rewards/chosen": 0.4744488298892975, "rewards/margins": 5.016169548034668, "rewards/rejected": -4.541720390319824, "step": 815 }, { "epoch": 13.830508474576272, "grad_norm": 5.083021458478777, "learning_rate": 1.3145841359208148e-07, "logits/chosen": -6.718883991241455, "logits/rejected": -5.874282360076904, "logps/chosen": -10.914901733398438, "logps/rejected": -26.438758850097656, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": 0.15237705409526825, "rewards/margins": 5.802890777587891, "rewards/rejected": -5.650513172149658, "step": 816 }, { "epoch": 13.847457627118644, "grad_norm": 5.160155412631123, "learning_rate": 1.308078105663437e-07, "logits/chosen": -4.9868268966674805, "logits/rejected": -6.539076805114746, "logps/chosen": -10.791238784790039, "logps/rejected": -22.204166412353516, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 0.15927231311798096, "rewards/margins": 4.936960697174072, "rewards/rejected": -4.777688503265381, "step": 817 }, { "epoch": 13.864406779661017, "grad_norm": 4.142873406210084, "learning_rate": 1.3015825057395058e-07, "logits/chosen": -4.892223358154297, "logits/rejected": -4.1584930419921875, "logps/chosen": -5.886479377746582, "logps/rejected": -17.27469253540039, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 0.7119540572166443, "rewards/margins": 4.798526287078857, "rewards/rejected": -4.086572170257568, "step": 818 }, { "epoch": 13.88135593220339, "grad_norm": 5.554456588133767, "learning_rate": 1.2950973929910619e-07, "logits/chosen": -9.246594429016113, "logits/rejected": -4.345804214477539, "logps/chosen": -10.645049095153809, "logps/rejected": -23.079753875732422, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": -0.04009474813938141, "rewards/margins": 5.105522155761719, "rewards/rejected": -5.14561653137207, "step": 819 }, { "epoch": 13.898305084745763, "grad_norm": 7.076055638827002, "learning_rate": 1.2886228241683748e-07, "logits/chosen": -6.933260917663574, "logits/rejected": -4.676466941833496, "logps/chosen": -9.63884449005127, "logps/rejected": -24.883792877197266, "loss": 0.0547, "rewards/accuracies": 1.0, "rewards/chosen": 0.6752225756645203, "rewards/margins": 6.20891809463501, "rewards/rejected": -5.533695697784424, "step": 820 }, { "epoch": 13.915254237288135, "grad_norm": 5.578307094576147, "learning_rate": 1.282158855929445e-07, "logits/chosen": -5.170792102813721, "logits/rejected": -4.953229904174805, "logps/chosen": -10.70461654663086, "logps/rejected": -23.960208892822266, "loss": 0.0577, "rewards/accuracies": 1.0, "rewards/chosen": 0.2784963846206665, "rewards/margins": 4.557374954223633, "rewards/rejected": -4.278878688812256, "step": 821 }, { "epoch": 13.932203389830509, "grad_norm": 5.40778040476045, "learning_rate": 1.275705544839509e-07, "logits/chosen": -9.584942817687988, "logits/rejected": -7.174380302429199, "logps/chosen": -11.899053573608398, "logps/rejected": -22.629398345947266, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 0.3301153779029846, "rewards/margins": 4.465504169464111, "rewards/rejected": -4.13538932800293, "step": 822 }, { "epoch": 13.94915254237288, "grad_norm": 4.935477856465082, "learning_rate": 1.2692629473705452e-07, "logits/chosen": -6.729738235473633, "logits/rejected": -5.973828315734863, "logps/chosen": -9.960297584533691, "logps/rejected": -18.796049118041992, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 0.06524574756622314, "rewards/margins": 4.380879878997803, "rewards/rejected": -4.315633773803711, "step": 823 }, { "epoch": 13.966101694915254, "grad_norm": 4.711427189745618, "learning_rate": 1.2628311199007762e-07, "logits/chosen": -5.152575492858887, "logits/rejected": -3.88008975982666, "logps/chosen": -9.844609260559082, "logps/rejected": -20.80732536315918, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": -0.09208326786756516, "rewards/margins": 4.425102710723877, "rewards/rejected": -4.517186164855957, "step": 824 }, { "epoch": 13.983050847457626, "grad_norm": 5.01651920767794, "learning_rate": 1.2564101187141828e-07, "logits/chosen": -8.57741641998291, "logits/rejected": -6.389130592346191, "logps/chosen": -6.646769046783447, "logps/rejected": -20.195098876953125, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 0.5168790817260742, "rewards/margins": 4.831752777099609, "rewards/rejected": -4.314873695373535, "step": 825 }, { "epoch": 14.0, "grad_norm": 4.532284739310805, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -5.515041351318359, "logits/rejected": -2.865309715270996, "logps/chosen": -13.2843656539917, "logps/rejected": -25.315540313720703, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": -0.19785913825035095, "rewards/margins": 5.822000980377197, "rewards/rejected": -6.01986026763916, "step": 826 }, { "epoch": 14.016949152542374, "grad_norm": 4.026799410999227, "learning_rate": 1.2436008198522374e-07, "logits/chosen": -5.045407295227051, "logits/rejected": -4.171008586883545, "logps/chosen": -9.222007751464844, "logps/rejected": -22.0806827545166, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": 0.6829844117164612, "rewards/margins": 5.049844741821289, "rewards/rejected": -4.366860389709473, "step": 827 }, { "epoch": 14.033898305084746, "grad_norm": 5.238355012181834, "learning_rate": 1.2372126342691797e-07, "logits/chosen": -4.42636775970459, "logits/rejected": -4.819035530090332, "logps/chosen": -11.446056365966797, "logps/rejected": -24.577653884887695, "loss": 0.0599, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1556832194328308, "rewards/margins": 4.2256999015808105, "rewards/rejected": -4.381383419036865, "step": 828 }, { "epoch": 14.05084745762712, "grad_norm": 5.126062091047994, "learning_rate": 1.2308354991529006e-07, "logits/chosen": -0.3282977342605591, "logits/rejected": -4.660909652709961, "logps/chosen": -8.18812370300293, "logps/rejected": -21.934959411621094, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 0.6030260324478149, "rewards/margins": 3.95294451713562, "rewards/rejected": -3.3499186038970947, "step": 829 }, { "epoch": 14.067796610169491, "grad_norm": 4.911341582897822, "learning_rate": 1.2244694703087727e-07, "logits/chosen": -7.362742900848389, "logits/rejected": -4.4875359535217285, "logps/chosen": -14.022963523864746, "logps/rejected": -23.285682678222656, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": 0.14519502222537994, "rewards/margins": 4.577618598937988, "rewards/rejected": -4.4324235916137695, "step": 830 }, { "epoch": 14.084745762711865, "grad_norm": 4.4598825344534045, "learning_rate": 1.2181146034449807e-07, "logits/chosen": -7.692203998565674, "logits/rejected": -5.220171928405762, "logps/chosen": -8.62202262878418, "logps/rejected": -17.138887405395508, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 0.3423607349395752, "rewards/margins": 4.227006912231445, "rewards/rejected": -3.884646415710449, "step": 831 }, { "epoch": 14.101694915254237, "grad_norm": 4.8353921985009665, "learning_rate": 1.2117709541720306e-07, "logits/chosen": -5.6293487548828125, "logits/rejected": -6.787869453430176, "logps/chosen": -14.639093399047852, "logps/rejected": -27.194129943847656, "loss": 0.0625, "rewards/accuracies": 1.0, "rewards/chosen": 0.09962216019630432, "rewards/margins": 4.305108070373535, "rewards/rejected": -4.205486297607422, "step": 832 }, { "epoch": 14.11864406779661, "grad_norm": 5.568638154486435, "learning_rate": 1.2054385780022655e-07, "logits/chosen": -4.221851825714111, "logits/rejected": -6.096976280212402, "logps/chosen": -8.260061264038086, "logps/rejected": -19.653345108032227, "loss": 0.064, "rewards/accuracies": 1.0, "rewards/chosen": 0.706796407699585, "rewards/margins": 4.410439968109131, "rewards/rejected": -3.703643560409546, "step": 833 }, { "epoch": 14.135593220338983, "grad_norm": 4.675450379157052, "learning_rate": 1.199117530349379e-07, "logits/chosen": -5.025335311889648, "logits/rejected": -5.071759223937988, "logps/chosen": -10.142559051513672, "logps/rejected": -21.743587493896484, "loss": 0.055, "rewards/accuracies": 1.0, "rewards/chosen": 0.0700664073228836, "rewards/margins": 4.8901286125183105, "rewards/rejected": -4.820062160491943, "step": 834 }, { "epoch": 14.152542372881356, "grad_norm": 4.592697469691946, "learning_rate": 1.192807866527931e-07, "logits/chosen": -7.847444534301758, "logits/rejected": -6.507003307342529, "logps/chosen": -10.491202354431152, "logps/rejected": -21.514968872070312, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 0.2933105528354645, "rewards/margins": 5.166686534881592, "rewards/rejected": -4.87337589263916, "step": 835 }, { "epoch": 14.169491525423728, "grad_norm": 5.075415557100472, "learning_rate": 1.1865096417528633e-07, "logits/chosen": -6.532116413116455, "logits/rejected": -4.443775177001953, "logps/chosen": -8.443931579589844, "logps/rejected": -23.405532836914062, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 0.06578953564167023, "rewards/margins": 5.535974502563477, "rewards/rejected": -5.470184326171875, "step": 836 }, { "epoch": 14.186440677966102, "grad_norm": 4.296095365548908, "learning_rate": 1.1802229111390155e-07, "logits/chosen": -1.0841858386993408, "logits/rejected": -3.881087303161621, "logps/chosen": -8.981799125671387, "logps/rejected": -31.96260643005371, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 0.05703720450401306, "rewards/margins": 7.540444374084473, "rewards/rejected": -7.483407497406006, "step": 837 }, { "epoch": 14.203389830508474, "grad_norm": 4.434101376540436, "learning_rate": 1.173947729700644e-07, "logits/chosen": -5.153438091278076, "logits/rejected": -4.827155113220215, "logps/chosen": -12.063497543334961, "logps/rejected": -26.99883270263672, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 0.024201437830924988, "rewards/margins": 5.5701093673706055, "rewards/rejected": -5.545908451080322, "step": 838 }, { "epoch": 14.220338983050848, "grad_norm": 4.706145613555284, "learning_rate": 1.1676841523509398e-07, "logits/chosen": -8.671034812927246, "logits/rejected": -6.122951507568359, "logps/chosen": -11.669129371643066, "logps/rejected": -21.667890548706055, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 0.29969891905784607, "rewards/margins": 4.794971942901611, "rewards/rejected": -4.495272636413574, "step": 839 }, { "epoch": 14.23728813559322, "grad_norm": 4.580061612060666, "learning_rate": 1.1614322339015484e-07, "logits/chosen": -3.6043882369995117, "logits/rejected": -4.311732292175293, "logps/chosen": -13.588913917541504, "logps/rejected": -29.708572387695312, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -0.38750195503234863, "rewards/margins": 6.415001392364502, "rewards/rejected": -6.802502632141113, "step": 840 }, { "epoch": 14.254237288135593, "grad_norm": 4.3425766056242265, "learning_rate": 1.1551920290620903e-07, "logits/chosen": -4.859739780426025, "logits/rejected": -6.222860336303711, "logps/chosen": -7.076902389526367, "logps/rejected": -19.721343994140625, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 0.23864012956619263, "rewards/margins": 4.4084906578063965, "rewards/rejected": -4.1698503494262695, "step": 841 }, { "epoch": 14.271186440677965, "grad_norm": 4.836047729889714, "learning_rate": 1.1489635924396815e-07, "logits/chosen": -8.631258964538574, "logits/rejected": -10.491349220275879, "logps/chosen": -12.263872146606445, "logps/rejected": -31.80056381225586, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": 0.026434574276208878, "rewards/margins": 5.849242687225342, "rewards/rejected": -5.822807788848877, "step": 842 }, { "epoch": 14.288135593220339, "grad_norm": 5.8126592232749354, "learning_rate": 1.1427469785384558e-07, "logits/chosen": -1.0946826934814453, "logits/rejected": -3.664766788482666, "logps/chosen": -10.965002059936523, "logps/rejected": -21.787538528442383, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 0.5762641429901123, "rewards/margins": 4.274231433868408, "rewards/rejected": -3.697967529296875, "step": 843 }, { "epoch": 14.305084745762711, "grad_norm": 5.0651053975440705, "learning_rate": 1.1365422417590878e-07, "logits/chosen": -3.439162254333496, "logits/rejected": 1.3838034868240356, "logps/chosen": -13.087545394897461, "logps/rejected": -30.054899215698242, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 0.21956637501716614, "rewards/margins": 6.333641529083252, "rewards/rejected": -6.114074230194092, "step": 844 }, { "epoch": 14.322033898305085, "grad_norm": 5.096385076595446, "learning_rate": 1.1303494363983196e-07, "logits/chosen": -6.096810340881348, "logits/rejected": -5.0730085372924805, "logps/chosen": -7.8111467361450195, "logps/rejected": -15.42635726928711, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.21816721558570862, "rewards/margins": 3.222285032272339, "rewards/rejected": -3.004117965698242, "step": 845 }, { "epoch": 14.338983050847457, "grad_norm": 5.033259197227216, "learning_rate": 1.1241686166484804e-07, "logits/chosen": -7.2436065673828125, "logits/rejected": -5.567520618438721, "logps/chosen": -10.714032173156738, "logps/rejected": -20.56938362121582, "loss": 0.0538, "rewards/accuracies": 0.9375, "rewards/chosen": 0.04737303406000137, "rewards/margins": 4.353013515472412, "rewards/rejected": -4.30564022064209, "step": 846 }, { "epoch": 14.35593220338983, "grad_norm": 4.57859881932484, "learning_rate": 1.1179998365970172e-07, "logits/chosen": -4.692959308624268, "logits/rejected": -2.923541784286499, "logps/chosen": -9.625382423400879, "logps/rejected": -21.11334228515625, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 0.3075934946537018, "rewards/margins": 5.190447807312012, "rewards/rejected": -4.882854461669922, "step": 847 }, { "epoch": 14.372881355932204, "grad_norm": 4.804675788465381, "learning_rate": 1.1118431502260162e-07, "logits/chosen": -4.305268287658691, "logits/rejected": -3.777709484100342, "logps/chosen": -6.272968292236328, "logps/rejected": -21.6550235748291, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 0.6106405258178711, "rewards/margins": 5.49233341217041, "rewards/rejected": -4.881693363189697, "step": 848 }, { "epoch": 14.389830508474576, "grad_norm": 4.617090621688473, "learning_rate": 1.1056986114117367e-07, "logits/chosen": -6.763372898101807, "logits/rejected": -5.744204521179199, "logps/chosen": -8.708209037780762, "logps/rejected": -17.52611541748047, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 0.39696207642555237, "rewards/margins": 3.6503450870513916, "rewards/rejected": -3.253383159637451, "step": 849 }, { "epoch": 14.40677966101695, "grad_norm": 4.362269121942627, "learning_rate": 1.0995662739241346e-07, "logits/chosen": -4.924978256225586, "logits/rejected": -5.879234313964844, "logps/chosen": -12.350116729736328, "logps/rejected": -28.583274841308594, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 0.15409313142299652, "rewards/margins": 5.438337802886963, "rewards/rejected": -5.284245014190674, "step": 850 }, { "epoch": 14.423728813559322, "grad_norm": 4.197218623377624, "learning_rate": 1.0934461914263965e-07, "logits/chosen": -6.324525833129883, "logits/rejected": -3.778895378112793, "logps/chosen": -8.720869064331055, "logps/rejected": -19.028125762939453, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 0.3219558000564575, "rewards/margins": 4.60241174697876, "rewards/rejected": -4.280456066131592, "step": 851 }, { "epoch": 14.440677966101696, "grad_norm": 4.923401961976329, "learning_rate": 1.087338417474464e-07, "logits/chosen": -5.350639820098877, "logits/rejected": -7.599613666534424, "logps/chosen": -8.26685619354248, "logps/rejected": -24.998441696166992, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -0.20556461811065674, "rewards/margins": 5.382943153381348, "rewards/rejected": -5.588507175445557, "step": 852 }, { "epoch": 14.457627118644067, "grad_norm": 3.7969022055546713, "learning_rate": 1.0812430055165709e-07, "logits/chosen": -5.8174920082092285, "logits/rejected": -6.306069374084473, "logps/chosen": -12.027060508728027, "logps/rejected": -25.023365020751953, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": 0.5196723937988281, "rewards/margins": 5.552087783813477, "rewards/rejected": -5.032415390014648, "step": 853 }, { "epoch": 14.474576271186441, "grad_norm": 4.15778836042614, "learning_rate": 1.0751600088927712e-07, "logits/chosen": -3.9236257076263428, "logits/rejected": -3.4127421379089355, "logps/chosen": -8.900745391845703, "logps/rejected": -25.145057678222656, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 0.26206323504447937, "rewards/margins": 5.242336273193359, "rewards/rejected": -4.9802727699279785, "step": 854 }, { "epoch": 14.491525423728813, "grad_norm": 4.203245294110667, "learning_rate": 1.0690894808344756e-07, "logits/chosen": -4.772054195404053, "logits/rejected": -3.865562915802002, "logps/chosen": -11.955608367919922, "logps/rejected": -25.580827713012695, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -0.23099081218242645, "rewards/margins": 5.626684188842773, "rewards/rejected": -5.857674598693848, "step": 855 }, { "epoch": 14.508474576271187, "grad_norm": 5.622321826841454, "learning_rate": 1.0630314744639829e-07, "logits/chosen": -5.114489555358887, "logits/rejected": -4.372469902038574, "logps/chosen": -10.947568893432617, "logps/rejected": -20.671138763427734, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 0.20464643836021423, "rewards/margins": 4.4105024337768555, "rewards/rejected": -4.205855369567871, "step": 856 }, { "epoch": 14.525423728813559, "grad_norm": 4.357657280031394, "learning_rate": 1.0569860427940178e-07, "logits/chosen": -8.80482006072998, "logits/rejected": -8.39877986907959, "logps/chosen": -9.7009859085083, "logps/rejected": -23.01410675048828, "loss": 0.0542, "rewards/accuracies": 1.0, "rewards/chosen": -0.0699918121099472, "rewards/margins": 4.5847649574279785, "rewards/rejected": -4.654757022857666, "step": 857 }, { "epoch": 14.542372881355933, "grad_norm": 4.473948955486904, "learning_rate": 1.050953238727264e-07, "logits/chosen": -3.8625569343566895, "logits/rejected": -4.641469478607178, "logps/chosen": -8.324604034423828, "logps/rejected": -19.920515060424805, "loss": 0.0536, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4261910319328308, "rewards/margins": 4.780582427978516, "rewards/rejected": -4.354391574859619, "step": 858 }, { "epoch": 14.559322033898304, "grad_norm": 4.416058452804211, "learning_rate": 1.0449331150559063e-07, "logits/chosen": -6.369832515716553, "logits/rejected": -5.154065132141113, "logps/chosen": -8.499757766723633, "logps/rejected": -18.445560455322266, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 0.5627738833427429, "rewards/margins": 4.360942363739014, "rewards/rejected": -3.798168420791626, "step": 859 }, { "epoch": 14.576271186440678, "grad_norm": 5.151908410777473, "learning_rate": 1.0389257244611601e-07, "logits/chosen": -9.489761352539062, "logits/rejected": -8.491812705993652, "logps/chosen": -10.10033893585205, "logps/rejected": -17.174280166625977, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": 0.6827844977378845, "rewards/margins": 3.9337456226348877, "rewards/rejected": -3.2509608268737793, "step": 860 }, { "epoch": 14.59322033898305, "grad_norm": 4.508708554427197, "learning_rate": 1.0329311195128193e-07, "logits/chosen": -6.457301139831543, "logits/rejected": -4.161015033721924, "logps/chosen": -8.108702659606934, "logps/rejected": -17.485946655273438, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 0.2239864021539688, "rewards/margins": 4.468382835388184, "rewards/rejected": -4.244396209716797, "step": 861 }, { "epoch": 14.610169491525424, "grad_norm": 4.153746327758954, "learning_rate": 1.0269493526687914e-07, "logits/chosen": -14.252533912658691, "logits/rejected": -9.489017486572266, "logps/chosen": -14.082448959350586, "logps/rejected": -21.335691452026367, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 0.5519404411315918, "rewards/margins": 4.63496732711792, "rewards/rejected": -4.08302640914917, "step": 862 }, { "epoch": 14.627118644067796, "grad_norm": 4.751730032840892, "learning_rate": 1.0209804762746396e-07, "logits/chosen": -6.768604278564453, "logits/rejected": -4.19866943359375, "logps/chosen": -11.16037654876709, "logps/rejected": -21.334613800048828, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": 0.26084911823272705, "rewards/margins": 4.942337989807129, "rewards/rejected": -4.681488990783691, "step": 863 }, { "epoch": 14.64406779661017, "grad_norm": 4.9134107504574205, "learning_rate": 1.0150245425631235e-07, "logits/chosen": -2.556035280227661, "logits/rejected": -4.578921794891357, "logps/chosen": -7.398219108581543, "logps/rejected": -19.08116340637207, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.21627086400985718, "rewards/margins": 4.1095075607299805, "rewards/rejected": -3.8932361602783203, "step": 864 }, { "epoch": 14.661016949152543, "grad_norm": 4.096385228647857, "learning_rate": 1.0090816036537461e-07, "logits/chosen": -7.6018877029418945, "logits/rejected": -7.894321441650391, "logps/chosen": -7.223687171936035, "logps/rejected": -23.89727783203125, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 0.1827097237110138, "rewards/margins": 5.98267936706543, "rewards/rejected": -5.799968719482422, "step": 865 }, { "epoch": 14.677966101694915, "grad_norm": 5.560154999681085, "learning_rate": 1.0031517115522925e-07, "logits/chosen": -5.176994323730469, "logits/rejected": -6.375298976898193, "logps/chosen": -6.474493026733398, "logps/rejected": -17.692594528198242, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 0.27740585803985596, "rewards/margins": 3.594590663909912, "rewards/rejected": -3.3171846866607666, "step": 866 }, { "epoch": 14.694915254237289, "grad_norm": 4.720320647825455, "learning_rate": 9.972349181503773e-08, "logits/chosen": -6.104518413543701, "logits/rejected": -6.355710983276367, "logps/chosen": -7.5375752449035645, "logps/rejected": -18.257652282714844, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 0.37600499391555786, "rewards/margins": 4.874422550201416, "rewards/rejected": -4.498417854309082, "step": 867 }, { "epoch": 14.711864406779661, "grad_norm": 4.5609809580288525, "learning_rate": 9.913312752249903e-08, "logits/chosen": -8.388524055480957, "logits/rejected": -8.490981101989746, "logps/chosen": -10.754916191101074, "logps/rejected": -23.119861602783203, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": -0.26782697439193726, "rewards/margins": 5.011294364929199, "rewards/rejected": -5.2791218757629395, "step": 868 }, { "epoch": 14.728813559322035, "grad_norm": 5.039462820883621, "learning_rate": 9.85440834438044e-08, "logits/chosen": -9.018473625183105, "logits/rejected": -6.955737590789795, "logps/chosen": -10.54946231842041, "logps/rejected": -22.88796615600586, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.005913838744163513, "rewards/margins": 4.554727077484131, "rewards/rejected": -4.548813343048096, "step": 869 }, { "epoch": 14.745762711864407, "grad_norm": 4.79804613276376, "learning_rate": 9.795636473359207e-08, "logits/chosen": -11.686652183532715, "logits/rejected": -9.616609573364258, "logps/chosen": -10.668220520019531, "logps/rejected": -19.37090492248535, "loss": 0.0568, "rewards/accuracies": 1.0, "rewards/chosen": 0.39386722445487976, "rewards/margins": 3.586113452911377, "rewards/rejected": -3.192246437072754, "step": 870 }, { "epoch": 14.76271186440678, "grad_norm": 5.238510670938149, "learning_rate": 9.736997653490214e-08, "logits/chosen": -3.2652204036712646, "logits/rejected": -3.1794800758361816, "logps/chosen": -6.136098861694336, "logps/rejected": -23.864599227905273, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 0.4961877763271332, "rewards/margins": 6.1632819175720215, "rewards/rejected": -5.6670942306518555, "step": 871 }, { "epoch": 14.779661016949152, "grad_norm": 4.158137074830622, "learning_rate": 9.678492397913165e-08, "logits/chosen": -9.135766983032227, "logits/rejected": -4.244735240936279, "logps/chosen": -15.490636825561523, "logps/rejected": -28.337665557861328, "loss": 0.0391, "rewards/accuracies": 1.0, "rewards/chosen": 0.1223401129245758, "rewards/margins": 6.128490924835205, "rewards/rejected": -6.00615119934082, "step": 872 }, { "epoch": 14.796610169491526, "grad_norm": 5.096270220086588, "learning_rate": 9.620121218598957e-08, "logits/chosen": -9.420333862304688, "logits/rejected": -7.442325115203857, "logps/chosen": -8.854183197021484, "logps/rejected": -21.26641082763672, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 0.6020601391792297, "rewards/margins": 6.024995803833008, "rewards/rejected": -5.422935962677002, "step": 873 }, { "epoch": 14.813559322033898, "grad_norm": 4.125705253274765, "learning_rate": 9.561884626345204e-08, "logits/chosen": -4.848762035369873, "logits/rejected": -5.289836406707764, "logps/chosen": -8.86702823638916, "logps/rejected": -19.24077606201172, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 0.1991223245859146, "rewards/margins": 3.673518180847168, "rewards/rejected": -3.474395990371704, "step": 874 }, { "epoch": 14.830508474576272, "grad_norm": 4.8180346160723735, "learning_rate": 9.503783130771778e-08, "logits/chosen": -5.396355152130127, "logits/rejected": -3.8392302989959717, "logps/chosen": -10.044886589050293, "logps/rejected": -24.429101943969727, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 0.6494219303131104, "rewards/margins": 5.044546127319336, "rewards/rejected": -4.3951239585876465, "step": 875 }, { "epoch": 14.847457627118644, "grad_norm": 4.910821646003884, "learning_rate": 9.445817240316332e-08, "logits/chosen": -6.426746368408203, "logits/rejected": -4.628366947174072, "logps/chosen": -7.040453910827637, "logps/rejected": -22.949737548828125, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": 0.2003602534532547, "rewards/margins": 5.139168739318848, "rewards/rejected": -4.938808441162109, "step": 876 }, { "epoch": 14.864406779661017, "grad_norm": 4.798926785767766, "learning_rate": 9.387987462229857e-08, "logits/chosen": -7.756277084350586, "logits/rejected": -7.002287864685059, "logps/chosen": -7.811505317687988, "logps/rejected": -20.826684951782227, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 0.46053773164749146, "rewards/margins": 4.936527252197266, "rewards/rejected": -4.475989818572998, "step": 877 }, { "epoch": 14.88135593220339, "grad_norm": 4.3625712644302626, "learning_rate": 9.330294302572242e-08, "logits/chosen": -7.890216827392578, "logits/rejected": -7.356686592102051, "logps/chosen": -6.975728988647461, "logps/rejected": -14.148273468017578, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": 0.5016485452651978, "rewards/margins": 3.3492910861968994, "rewards/rejected": -2.847642660140991, "step": 878 }, { "epoch": 14.898305084745763, "grad_norm": 5.106024259580478, "learning_rate": 9.272738266207871e-08, "logits/chosen": -6.573366165161133, "logits/rejected": -7.513491630554199, "logps/chosen": -10.01795482635498, "logps/rejected": -22.914081573486328, "loss": 0.058, "rewards/accuracies": 1.0, "rewards/chosen": 0.01393437385559082, "rewards/margins": 4.2963762283325195, "rewards/rejected": -4.282442092895508, "step": 879 }, { "epoch": 14.915254237288135, "grad_norm": 10.139064787648792, "learning_rate": 9.215319856801157e-08, "logits/chosen": -7.310423374176025, "logits/rejected": -3.5291025638580322, "logps/chosen": -9.833205223083496, "logps/rejected": -23.23526382446289, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 0.11848260462284088, "rewards/margins": 4.9702019691467285, "rewards/rejected": -4.851719856262207, "step": 880 }, { "epoch": 14.932203389830509, "grad_norm": 4.019168812870987, "learning_rate": 9.158039576812176e-08, "logits/chosen": -11.067143440246582, "logits/rejected": -9.051342010498047, "logps/chosen": -10.35849666595459, "logps/rejected": -22.214868545532227, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -0.25341349840164185, "rewards/margins": 4.845900535583496, "rewards/rejected": -5.099314212799072, "step": 881 }, { "epoch": 14.94915254237288, "grad_norm": 4.615927599216388, "learning_rate": 9.10089792749223e-08, "logits/chosen": -3.4698643684387207, "logits/rejected": -4.226621150970459, "logps/chosen": -7.663708686828613, "logps/rejected": -29.664623260498047, "loss": 0.0543, "rewards/accuracies": 1.0, "rewards/chosen": 0.4476359486579895, "rewards/margins": 7.534958362579346, "rewards/rejected": -7.08732271194458, "step": 882 }, { "epoch": 14.966101694915254, "grad_norm": 4.485706073296585, "learning_rate": 9.043895408879504e-08, "logits/chosen": -5.978943347930908, "logits/rejected": -3.7649073600769043, "logps/chosen": -8.316689491271973, "logps/rejected": -21.403587341308594, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 0.5265974402427673, "rewards/margins": 4.677036762237549, "rewards/rejected": -4.1504387855529785, "step": 883 }, { "epoch": 14.983050847457626, "grad_norm": 4.2730822752523965, "learning_rate": 8.987032519794666e-08, "logits/chosen": -11.319976806640625, "logits/rejected": -7.822196960449219, "logps/chosen": -10.064786911010742, "logps/rejected": -16.78985023498535, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 0.3657466769218445, "rewards/margins": 3.710754871368408, "rewards/rejected": -3.345008611679077, "step": 884 }, { "epoch": 15.0, "grad_norm": 4.149972484378902, "learning_rate": 8.930309757836516e-08, "logits/chosen": -6.867130756378174, "logits/rejected": -7.295436382293701, "logps/chosen": -6.439949035644531, "logps/rejected": -15.3391695022583, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": 0.10488628596067429, "rewards/margins": 3.8695998191833496, "rewards/rejected": -3.764713764190674, "step": 885 }, { "epoch": 15.016949152542374, "grad_norm": 3.781552998631691, "learning_rate": 8.87372761937761e-08, "logits/chosen": -9.987040519714355, "logits/rejected": -7.477032661437988, "logps/chosen": -10.782506942749023, "logps/rejected": -23.716060638427734, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 0.6830369234085083, "rewards/margins": 5.680238246917725, "rewards/rejected": -4.997201442718506, "step": 886 }, { "epoch": 15.033898305084746, "grad_norm": 4.663893838793768, "learning_rate": 8.817286599559931e-08, "logits/chosen": -3.7257680892944336, "logits/rejected": -3.5477094650268555, "logps/chosen": -8.740972518920898, "logps/rejected": -19.743122100830078, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 0.5974680185317993, "rewards/margins": 4.669073581695557, "rewards/rejected": -4.071605205535889, "step": 887 }, { "epoch": 15.05084745762712, "grad_norm": 5.010276787368709, "learning_rate": 8.760987192290556e-08, "logits/chosen": -5.386483192443848, "logits/rejected": -5.680109977722168, "logps/chosen": -8.287491798400879, "logps/rejected": -24.390138626098633, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 0.29073163866996765, "rewards/margins": 5.166617393493652, "rewards/rejected": -4.875885486602783, "step": 888 }, { "epoch": 15.067796610169491, "grad_norm": 4.300590568046875, "learning_rate": 8.704829890237326e-08, "logits/chosen": -4.816190719604492, "logits/rejected": -4.562450408935547, "logps/chosen": -8.473125457763672, "logps/rejected": -24.722415924072266, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 0.32061341404914856, "rewards/margins": 5.384640216827393, "rewards/rejected": -5.064026832580566, "step": 889 }, { "epoch": 15.084745762711865, "grad_norm": 4.056958939978676, "learning_rate": 8.648815184824543e-08, "logits/chosen": -8.428276062011719, "logits/rejected": -6.711330413818359, "logps/chosen": -13.22380256652832, "logps/rejected": -27.040882110595703, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": -0.07486316561698914, "rewards/margins": 5.384260654449463, "rewards/rejected": -5.4591240882873535, "step": 890 }, { "epoch": 15.101694915254237, "grad_norm": 4.116821025355182, "learning_rate": 8.592943566228669e-08, "logits/chosen": -7.393421173095703, "logits/rejected": -4.3779215812683105, "logps/chosen": -11.240400314331055, "logps/rejected": -23.093278884887695, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 0.6038632392883301, "rewards/margins": 5.859452724456787, "rewards/rejected": -5.255589962005615, "step": 891 }, { "epoch": 15.11864406779661, "grad_norm": 5.053699753605588, "learning_rate": 8.537215523374037e-08, "logits/chosen": -4.562039852142334, "logits/rejected": -4.087484359741211, "logps/chosen": -10.435542106628418, "logps/rejected": -17.37160873413086, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 0.45656490325927734, "rewards/margins": 3.8431553840637207, "rewards/rejected": -3.3865904808044434, "step": 892 }, { "epoch": 15.135593220338983, "grad_norm": 4.130713005185727, "learning_rate": 8.481631543928561e-08, "logits/chosen": -7.683807849884033, "logits/rejected": -6.336053848266602, "logps/chosen": -7.9568400382995605, "logps/rejected": -22.52141571044922, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 0.1728895902633667, "rewards/margins": 5.566967487335205, "rewards/rejected": -5.394078731536865, "step": 893 }, { "epoch": 15.152542372881356, "grad_norm": 3.8082945319240955, "learning_rate": 8.426192114299483e-08, "logits/chosen": -6.013499736785889, "logits/rejected": -6.379771709442139, "logps/chosen": -11.058837890625, "logps/rejected": -22.782060623168945, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 0.47234123945236206, "rewards/margins": 4.522031784057617, "rewards/rejected": -4.0496907234191895, "step": 894 }, { "epoch": 15.169491525423728, "grad_norm": 4.655654682322374, "learning_rate": 8.370897719629108e-08, "logits/chosen": -8.027939796447754, "logits/rejected": -5.110647678375244, "logps/chosen": -7.751594543457031, "logps/rejected": -16.943401336669922, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 0.3737262487411499, "rewards/margins": 4.178828239440918, "rewards/rejected": -3.8051021099090576, "step": 895 }, { "epoch": 15.186440677966102, "grad_norm": 4.585133437276424, "learning_rate": 8.315748843790562e-08, "logits/chosen": -8.03437614440918, "logits/rejected": -5.727774620056152, "logps/chosen": -13.897333145141602, "logps/rejected": -22.834522247314453, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 0.3705972135066986, "rewards/margins": 4.764908790588379, "rewards/rejected": -4.394311428070068, "step": 896 }, { "epoch": 15.203389830508474, "grad_norm": 4.2945931087395985, "learning_rate": 8.260745969383565e-08, "logits/chosen": -7.748586177825928, "logits/rejected": -5.3257622718811035, "logps/chosen": -6.9540486335754395, "logps/rejected": -15.131741523742676, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 0.15817368030548096, "rewards/margins": 4.12407112121582, "rewards/rejected": -3.965897560119629, "step": 897 }, { "epoch": 15.220338983050848, "grad_norm": 3.9554224005139877, "learning_rate": 8.205889577730179e-08, "logits/chosen": -5.89959716796875, "logits/rejected": -5.828517913818359, "logps/chosen": -9.16100788116455, "logps/rejected": -22.181655883789062, "loss": 0.0373, "rewards/accuracies": 1.0, "rewards/chosen": 0.26382455229759216, "rewards/margins": 5.5371880531311035, "rewards/rejected": -5.2733635902404785, "step": 898 }, { "epoch": 15.23728813559322, "grad_norm": 4.145154923229452, "learning_rate": 8.151180148870649e-08, "logits/chosen": -10.387269020080566, "logits/rejected": -9.048869132995605, "logps/chosen": -8.629597663879395, "logps/rejected": -18.4086971282959, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 0.4396933317184448, "rewards/margins": 4.135700702667236, "rewards/rejected": -3.696007251739502, "step": 899 }, { "epoch": 15.254237288135593, "grad_norm": 4.392474058377527, "learning_rate": 8.09661816155914e-08, "logits/chosen": -6.837314128875732, "logits/rejected": -4.007465362548828, "logps/chosen": -10.138846397399902, "logps/rejected": -29.386852264404297, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -0.17193610966205597, "rewards/margins": 6.914831161499023, "rewards/rejected": -7.086767196655273, "step": 900 }, { "epoch": 15.271186440677965, "grad_norm": 4.522224865877253, "learning_rate": 8.042204093259597e-08, "logits/chosen": -6.852179050445557, "logits/rejected": -4.5903706550598145, "logps/chosen": -13.024887084960938, "logps/rejected": -19.289447784423828, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": 0.1210104376077652, "rewards/margins": 4.075073719024658, "rewards/rejected": -3.9540631771087646, "step": 901 }, { "epoch": 15.288135593220339, "grad_norm": 4.4900190318899185, "learning_rate": 7.987938420141536e-08, "logits/chosen": -6.153905391693115, "logits/rejected": -6.648481845855713, "logps/chosen": -8.1083345413208, "logps/rejected": -25.058744430541992, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 0.31352919340133667, "rewards/margins": 5.847841262817383, "rewards/rejected": -5.5343122482299805, "step": 902 }, { "epoch": 15.305084745762711, "grad_norm": 4.582566620003048, "learning_rate": 7.93382161707589e-08, "logits/chosen": -3.9638917446136475, "logits/rejected": -1.5824838876724243, "logps/chosen": -9.085407257080078, "logps/rejected": -18.560510635375977, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 0.17204411327838898, "rewards/margins": 4.696191787719727, "rewards/rejected": -4.524147987365723, "step": 903 }, { "epoch": 15.322033898305085, "grad_norm": 4.889430458232006, "learning_rate": 7.879854157630861e-08, "logits/chosen": -4.202753067016602, "logits/rejected": -3.8201920986175537, "logps/chosen": -8.303861618041992, "logps/rejected": -20.284696578979492, "loss": 0.0582, "rewards/accuracies": 1.0, "rewards/chosen": 0.18788239359855652, "rewards/margins": 4.125871181488037, "rewards/rejected": -3.937988519668579, "step": 904 }, { "epoch": 15.338983050847457, "grad_norm": 4.582348879875144, "learning_rate": 7.826036514067755e-08, "logits/chosen": -6.020393371582031, "logits/rejected": -4.9403557777404785, "logps/chosen": -7.944501876831055, "logps/rejected": -21.464771270751953, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 0.35279831290245056, "rewards/margins": 4.910175323486328, "rewards/rejected": -4.557377338409424, "step": 905 }, { "epoch": 15.35593220338983, "grad_norm": 4.626664369260017, "learning_rate": 7.772369157336872e-08, "logits/chosen": -6.890298843383789, "logits/rejected": -5.290185928344727, "logps/chosen": -9.014580726623535, "logps/rejected": -18.544498443603516, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 0.2904837131500244, "rewards/margins": 4.766071796417236, "rewards/rejected": -4.475588321685791, "step": 906 }, { "epoch": 15.372881355932204, "grad_norm": 4.837067860496764, "learning_rate": 7.718852557073366e-08, "logits/chosen": -6.288779258728027, "logits/rejected": -5.576810359954834, "logps/chosen": -8.71249008178711, "logps/rejected": -19.18750762939453, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.333800345659256, "rewards/margins": 5.200597286224365, "rewards/rejected": -4.866796493530273, "step": 907 }, { "epoch": 15.389830508474576, "grad_norm": 4.508705583344171, "learning_rate": 7.665487181593145e-08, "logits/chosen": -4.20571231842041, "logits/rejected": 0.7605342268943787, "logps/chosen": -14.706962585449219, "logps/rejected": -32.21023941040039, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 0.05708075314760208, "rewards/margins": 6.945536136627197, "rewards/rejected": -6.888455390930176, "step": 908 }, { "epoch": 15.40677966101695, "grad_norm": 4.357475863641545, "learning_rate": 7.612273497888775e-08, "logits/chosen": -5.5518798828125, "logits/rejected": -6.663199424743652, "logps/chosen": -9.692988395690918, "logps/rejected": -27.73391342163086, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 0.16855725646018982, "rewards/margins": 5.336852550506592, "rewards/rejected": -5.168295860290527, "step": 909 }, { "epoch": 15.423728813559322, "grad_norm": 4.104051386873784, "learning_rate": 7.559211971625384e-08, "logits/chosen": -8.114259719848633, "logits/rejected": -8.306021690368652, "logps/chosen": -9.408538818359375, "logps/rejected": -24.232013702392578, "loss": 0.0498, "rewards/accuracies": 1.0, "rewards/chosen": 0.08460733294487, "rewards/margins": 5.02858304977417, "rewards/rejected": -4.943976402282715, "step": 910 }, { "epoch": 15.440677966101696, "grad_norm": 4.10363074035625, "learning_rate": 7.506303067136602e-08, "logits/chosen": -7.748012542724609, "logits/rejected": -7.093140125274658, "logps/chosen": -9.258726119995117, "logps/rejected": -23.056177139282227, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.5200488567352295, "rewards/margins": 4.712107181549072, "rewards/rejected": -4.192058086395264, "step": 911 }, { "epoch": 15.457627118644067, "grad_norm": 3.7318632292671077, "learning_rate": 7.453547247420464e-08, "logits/chosen": -8.921445846557617, "logits/rejected": -7.705672264099121, "logps/chosen": -9.047325134277344, "logps/rejected": -21.919553756713867, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 0.2997787892818451, "rewards/margins": 5.2199907302856445, "rewards/rejected": -4.9202117919921875, "step": 912 }, { "epoch": 15.474576271186441, "grad_norm": 4.40499674775813, "learning_rate": 7.400944974135426e-08, "logits/chosen": -8.905839920043945, "logits/rejected": -7.21429443359375, "logps/chosen": -11.844295501708984, "logps/rejected": -25.040693283081055, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 0.3408765494823456, "rewards/margins": 5.200284004211426, "rewards/rejected": -4.8594069480896, "step": 913 }, { "epoch": 15.491525423728813, "grad_norm": 4.093105987084931, "learning_rate": 7.348496707596242e-08, "logits/chosen": -10.217541694641113, "logits/rejected": -8.00607681274414, "logps/chosen": -7.044992446899414, "logps/rejected": -16.269014358520508, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 0.4086524248123169, "rewards/margins": 4.565752983093262, "rewards/rejected": -4.157100677490234, "step": 914 }, { "epoch": 15.508474576271187, "grad_norm": 4.748659150209963, "learning_rate": 7.296202906769997e-08, "logits/chosen": -6.844855308532715, "logits/rejected": -5.689482688903809, "logps/chosen": -14.72694206237793, "logps/rejected": -26.045547485351562, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": -0.07029888778924942, "rewards/margins": 5.19006872177124, "rewards/rejected": -5.260367393493652, "step": 915 }, { "epoch": 15.525423728813559, "grad_norm": 4.591864509571797, "learning_rate": 7.244064029272049e-08, "logits/chosen": -5.819035530090332, "logits/rejected": -6.179313659667969, "logps/chosen": -7.19488000869751, "logps/rejected": -23.933685302734375, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": 0.15764935314655304, "rewards/margins": 6.297710418701172, "rewards/rejected": -6.140060901641846, "step": 916 }, { "epoch": 15.542372881355933, "grad_norm": 4.155066071491463, "learning_rate": 7.192080531362065e-08, "logits/chosen": -6.063582897186279, "logits/rejected": -7.798738956451416, "logps/chosen": -8.79454231262207, "logps/rejected": -21.187654495239258, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 0.010221108794212341, "rewards/margins": 4.549063205718994, "rewards/rejected": -4.538841724395752, "step": 917 }, { "epoch": 15.559322033898304, "grad_norm": 4.8476750798687664, "learning_rate": 7.140252867939994e-08, "logits/chosen": -7.400679111480713, "logits/rejected": -5.948519706726074, "logps/chosen": -9.957500457763672, "logps/rejected": -25.932926177978516, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 0.2069414108991623, "rewards/margins": 5.775245189666748, "rewards/rejected": -5.568304061889648, "step": 918 }, { "epoch": 15.576271186440678, "grad_norm": 4.873888730122289, "learning_rate": 7.08858149254212e-08, "logits/chosen": -6.478437900543213, "logits/rejected": -5.060609340667725, "logps/chosen": -12.641325950622559, "logps/rejected": -19.79878807067871, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": -0.02003513276576996, "rewards/margins": 4.015776634216309, "rewards/rejected": -4.035811901092529, "step": 919 }, { "epoch": 15.59322033898305, "grad_norm": 4.337491313818972, "learning_rate": 7.037066857337057e-08, "logits/chosen": -5.538844108581543, "logits/rejected": -3.830111265182495, "logps/chosen": -12.809090614318848, "logps/rejected": -25.132450103759766, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": -0.20014286041259766, "rewards/margins": 5.1021037101745605, "rewards/rejected": -5.302246570587158, "step": 920 }, { "epoch": 15.610169491525424, "grad_norm": 4.467101523154651, "learning_rate": 6.985709413121804e-08, "logits/chosen": -7.16643762588501, "logits/rejected": -7.090363025665283, "logps/chosen": -11.620450973510742, "logps/rejected": -27.701143264770508, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": 0.13024349510669708, "rewards/margins": 5.549199104309082, "rewards/rejected": -5.4189558029174805, "step": 921 }, { "epoch": 15.627118644067796, "grad_norm": 4.206404750084994, "learning_rate": 6.934509609317821e-08, "logits/chosen": -5.28694486618042, "logits/rejected": -6.201131820678711, "logps/chosen": -8.330510139465332, "logps/rejected": -21.516273498535156, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": 0.0818605124950409, "rewards/margins": 4.693274974822998, "rewards/rejected": -4.611414432525635, "step": 922 }, { "epoch": 15.64406779661017, "grad_norm": 4.290802211777049, "learning_rate": 6.883467893967068e-08, "logits/chosen": -11.422243118286133, "logits/rejected": -6.941327095031738, "logps/chosen": -9.11874008178711, "logps/rejected": -20.560924530029297, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 0.3169085383415222, "rewards/margins": 5.488955020904541, "rewards/rejected": -5.172046661376953, "step": 923 }, { "epoch": 15.661016949152543, "grad_norm": 5.259403196135534, "learning_rate": 6.832584713728101e-08, "logits/chosen": -1.465713381767273, "logits/rejected": -1.5597341060638428, "logps/chosen": -11.35631275177002, "logps/rejected": -23.734394073486328, "loss": 0.059, "rewards/accuracies": 1.0, "rewards/chosen": 0.5144131779670715, "rewards/margins": 5.69120979309082, "rewards/rejected": -5.176797389984131, "step": 924 }, { "epoch": 15.677966101694915, "grad_norm": 5.295423529930121, "learning_rate": 6.781860513872154e-08, "logits/chosen": -6.635700702667236, "logits/rejected": -6.23319673538208, "logps/chosen": -12.26685905456543, "logps/rejected": -21.214061737060547, "loss": 0.064, "rewards/accuracies": 1.0, "rewards/chosen": 0.17025622725486755, "rewards/margins": 4.827388763427734, "rewards/rejected": -4.657133102416992, "step": 925 }, { "epoch": 15.694915254237289, "grad_norm": 4.315266296273012, "learning_rate": 6.731295738279255e-08, "logits/chosen": -7.001347064971924, "logits/rejected": -2.145578384399414, "logps/chosen": -11.142807006835938, "logps/rejected": -21.77847671508789, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 0.27062392234802246, "rewards/margins": 5.010962009429932, "rewards/rejected": -4.740338325500488, "step": 926 }, { "epoch": 15.711864406779661, "grad_norm": 4.940221201075603, "learning_rate": 6.680890829434324e-08, "logits/chosen": -9.319822311401367, "logits/rejected": -8.537477493286133, "logps/chosen": -8.557331085205078, "logps/rejected": -22.469524383544922, "loss": 0.0558, "rewards/accuracies": 1.0, "rewards/chosen": 0.33681201934814453, "rewards/margins": 5.529571533203125, "rewards/rejected": -5.1927595138549805, "step": 927 }, { "epoch": 15.728813559322035, "grad_norm": 4.13577945145429, "learning_rate": 6.630646228423323e-08, "logits/chosen": -7.591739177703857, "logits/rejected": -5.0761003494262695, "logps/chosen": -7.780755519866943, "logps/rejected": -21.055946350097656, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 0.29918402433395386, "rewards/margins": 5.199295997619629, "rewards/rejected": -4.900112152099609, "step": 928 }, { "epoch": 15.745762711864407, "grad_norm": 3.6967955744291823, "learning_rate": 6.580562374929369e-08, "logits/chosen": -4.919278621673584, "logits/rejected": -0.548151969909668, "logps/chosen": -11.255338668823242, "logps/rejected": -28.93407440185547, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": 0.3684440851211548, "rewards/margins": 7.152661323547363, "rewards/rejected": -6.784217834472656, "step": 929 }, { "epoch": 15.76271186440678, "grad_norm": 3.582661160567403, "learning_rate": 6.53063970722891e-08, "logits/chosen": -6.948887825012207, "logits/rejected": -5.574216365814209, "logps/chosen": -9.181246757507324, "logps/rejected": -28.59181022644043, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": 0.4202234148979187, "rewards/margins": 6.4141845703125, "rewards/rejected": -5.993961334228516, "step": 930 }, { "epoch": 15.779661016949152, "grad_norm": 5.039230655446437, "learning_rate": 6.480878662187883e-08, "logits/chosen": -5.391602516174316, "logits/rejected": -1.1296690702438354, "logps/chosen": -10.732582092285156, "logps/rejected": -26.942331314086914, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": 0.08686523139476776, "rewards/margins": 6.540577411651611, "rewards/rejected": -6.453711986541748, "step": 931 }, { "epoch": 15.796610169491526, "grad_norm": 5.904351655682266, "learning_rate": 6.431279675257872e-08, "logits/chosen": -5.443461894989014, "logits/rejected": -7.360386848449707, "logps/chosen": -8.354605674743652, "logps/rejected": -22.984634399414062, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": 0.02095790207386017, "rewards/margins": 5.452232360839844, "rewards/rejected": -5.4312744140625, "step": 932 }, { "epoch": 15.813559322033898, "grad_norm": 5.007899634061934, "learning_rate": 6.381843180472349e-08, "logits/chosen": -4.189509868621826, "logits/rejected": -4.424144744873047, "logps/chosen": -8.072770118713379, "logps/rejected": -24.014766693115234, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 0.2011854648590088, "rewards/margins": 5.383251667022705, "rewards/rejected": -5.182065963745117, "step": 933 }, { "epoch": 15.830508474576272, "grad_norm": 4.113407110361293, "learning_rate": 6.332569610442806e-08, "logits/chosen": -2.852726936340332, "logits/rejected": -3.0774269104003906, "logps/chosen": -6.4904680252075195, "logps/rejected": -27.27833366394043, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 0.4382808208465576, "rewards/margins": 7.506036758422852, "rewards/rejected": -7.067756175994873, "step": 934 }, { "epoch": 15.847457627118644, "grad_norm": 4.626182534981252, "learning_rate": 6.28345939635502e-08, "logits/chosen": -6.271831035614014, "logits/rejected": -6.549797058105469, "logps/chosen": -7.970676422119141, "logps/rejected": -21.457536697387695, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 0.29878151416778564, "rewards/margins": 4.9347920417785645, "rewards/rejected": -4.636011123657227, "step": 935 }, { "epoch": 15.864406779661017, "grad_norm": 4.100236768772272, "learning_rate": 6.23451296796526e-08, "logits/chosen": -12.010820388793945, "logits/rejected": -6.9747443199157715, "logps/chosen": -12.61644458770752, "logps/rejected": -26.862714767456055, "loss": 0.0354, "rewards/accuracies": 1.0, "rewards/chosen": 0.52858567237854, "rewards/margins": 5.081140995025635, "rewards/rejected": -4.552555084228516, "step": 936 }, { "epoch": 15.88135593220339, "grad_norm": 4.555450958773166, "learning_rate": 6.185730753596538e-08, "logits/chosen": -0.7543103098869324, "logits/rejected": 0.3908948302268982, "logps/chosen": -12.7721529006958, "logps/rejected": -25.825170516967773, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 0.09586603194475174, "rewards/margins": 5.416937351226807, "rewards/rejected": -5.321071147918701, "step": 937 }, { "epoch": 15.898305084745763, "grad_norm": 4.87105997384688, "learning_rate": 6.137113180134842e-08, "logits/chosen": -5.579267501831055, "logits/rejected": -7.177576065063477, "logps/chosen": -8.82749080657959, "logps/rejected": -22.285377502441406, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.26872116327285767, "rewards/margins": 4.700954437255859, "rewards/rejected": -4.4322333335876465, "step": 938 }, { "epoch": 15.915254237288135, "grad_norm": 4.729392094465314, "learning_rate": 6.088660673025416e-08, "logits/chosen": -5.619653224945068, "logits/rejected": -4.975070953369141, "logps/chosen": -9.873350143432617, "logps/rejected": -26.303316116333008, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 0.05529198795557022, "rewards/margins": 5.834388732910156, "rewards/rejected": -5.779096603393555, "step": 939 }, { "epoch": 15.932203389830509, "grad_norm": 4.103739242054735, "learning_rate": 6.04037365626904e-08, "logits/chosen": -7.370528221130371, "logits/rejected": -6.9952778816223145, "logps/chosen": -9.415689468383789, "logps/rejected": -19.225749969482422, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 0.4855545461177826, "rewards/margins": 4.5416669845581055, "rewards/rejected": -4.056112766265869, "step": 940 }, { "epoch": 15.94915254237288, "grad_norm": 4.4572375897616, "learning_rate": 5.992252552418303e-08, "logits/chosen": -10.922706604003906, "logits/rejected": -10.116929054260254, "logps/chosen": -10.026784896850586, "logps/rejected": -22.07077980041504, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 0.7581459879875183, "rewards/margins": 4.968456745147705, "rewards/rejected": -4.210310935974121, "step": 941 }, { "epoch": 15.966101694915254, "grad_norm": 4.518155772533724, "learning_rate": 5.9442977825739175e-08, "logits/chosen": -5.720646858215332, "logits/rejected": -6.363611221313477, "logps/chosen": -10.6333646774292, "logps/rejected": -30.965347290039062, "loss": 0.0605, "rewards/accuracies": 1.0, "rewards/chosen": 0.214798241853714, "rewards/margins": 6.588019847869873, "rewards/rejected": -6.373221397399902, "step": 942 }, { "epoch": 15.983050847457626, "grad_norm": 4.745042956801933, "learning_rate": 5.896509766381028e-08, "logits/chosen": -3.2927167415618896, "logits/rejected": -1.5687626600265503, "logps/chosen": -11.610235214233398, "logps/rejected": -25.762611389160156, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": -0.06053323298692703, "rewards/margins": 4.873441696166992, "rewards/rejected": -4.933974742889404, "step": 943 }, { "epoch": 16.0, "grad_norm": 4.817602731785699, "learning_rate": 5.848888922025552e-08, "logits/chosen": -8.419713973999023, "logits/rejected": -7.601513385772705, "logps/chosen": -7.9321699142456055, "logps/rejected": -19.71578598022461, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 0.3660358488559723, "rewards/margins": 5.004828453063965, "rewards/rejected": -4.638792991638184, "step": 944 }, { "epoch": 16.016949152542374, "grad_norm": 5.075858314557022, "learning_rate": 5.8014356662305e-08, "logits/chosen": -2.367865562438965, "logits/rejected": -4.307644367218018, "logps/chosen": -9.921137809753418, "logps/rejected": -26.57358169555664, "loss": 0.0723, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17190691828727722, "rewards/margins": 4.076783180236816, "rewards/rejected": -3.9048757553100586, "step": 945 }, { "epoch": 16.033898305084747, "grad_norm": 4.764182818211839, "learning_rate": 5.75415041425234e-08, "logits/chosen": -9.120944023132324, "logits/rejected": -9.688711166381836, "logps/chosen": -5.0205864906311035, "logps/rejected": -15.899490356445312, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 0.6438949108123779, "rewards/margins": 4.424144268035889, "rewards/rejected": -3.7802491188049316, "step": 946 }, { "epoch": 16.050847457627118, "grad_norm": 4.405345673656158, "learning_rate": 5.707033579877379e-08, "logits/chosen": -8.686363220214844, "logits/rejected": -5.756230354309082, "logps/chosen": -10.461504936218262, "logps/rejected": -22.015207290649414, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 0.43740159273147583, "rewards/margins": 5.130666732788086, "rewards/rejected": -4.693264961242676, "step": 947 }, { "epoch": 16.06779661016949, "grad_norm": 5.20667843942836, "learning_rate": 5.660085575418114e-08, "logits/chosen": -4.197178840637207, "logits/rejected": -4.616957664489746, "logps/chosen": -14.62967586517334, "logps/rejected": -23.51697540283203, "loss": 0.0545, "rewards/accuracies": 0.9375, "rewards/chosen": -0.24510294198989868, "rewards/margins": 3.7846622467041016, "rewards/rejected": -4.0297651290893555, "step": 948 }, { "epoch": 16.084745762711865, "grad_norm": 4.282529438480496, "learning_rate": 5.6133068117096335e-08, "logits/chosen": -6.429426670074463, "logits/rejected": -8.026588439941406, "logps/chosen": -7.786294937133789, "logps/rejected": -22.525127410888672, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 0.13193783164024353, "rewards/margins": 4.904080867767334, "rewards/rejected": -4.772143363952637, "step": 949 }, { "epoch": 16.10169491525424, "grad_norm": 4.621515043010338, "learning_rate": 5.566697698106024e-08, "logits/chosen": -10.278071403503418, "logits/rejected": -8.686415672302246, "logps/chosen": -10.784829139709473, "logps/rejected": -22.454185485839844, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 0.09567110240459442, "rewards/margins": 4.569484233856201, "rewards/rejected": -4.473813056945801, "step": 950 }, { "epoch": 16.11864406779661, "grad_norm": 4.871432065569196, "learning_rate": 5.5202586424767967e-08, "logits/chosen": -7.282120227813721, "logits/rejected": -6.065773010253906, "logps/chosen": -8.455615997314453, "logps/rejected": -14.357915878295898, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": 0.5921602249145508, "rewards/margins": 3.3937811851501465, "rewards/rejected": -2.8016209602355957, "step": 951 }, { "epoch": 16.135593220338983, "grad_norm": 3.876482262335242, "learning_rate": 5.473990051203298e-08, "logits/chosen": -9.93977165222168, "logits/rejected": -8.173554420471191, "logps/chosen": -12.71878719329834, "logps/rejected": -24.868427276611328, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": -0.108131542801857, "rewards/margins": 5.222832679748535, "rewards/rejected": -5.330964088439941, "step": 952 }, { "epoch": 16.152542372881356, "grad_norm": 4.705045971613401, "learning_rate": 5.4278923291751934e-08, "logits/chosen": -6.992777347564697, "logits/rejected": -6.69704532623291, "logps/chosen": -7.97955846786499, "logps/rejected": -16.74382781982422, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": 0.43731003999710083, "rewards/margins": 4.202198505401611, "rewards/rejected": -3.7648885250091553, "step": 953 }, { "epoch": 16.16949152542373, "grad_norm": 4.743973876453088, "learning_rate": 5.381965879786868e-08, "logits/chosen": -6.325166702270508, "logits/rejected": -5.70654296875, "logps/chosen": -10.991593360900879, "logps/rejected": -21.800357818603516, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": 0.21553093194961548, "rewards/margins": 4.7880539894104, "rewards/rejected": -4.57252311706543, "step": 954 }, { "epoch": 16.1864406779661, "grad_norm": 4.628586645255542, "learning_rate": 5.336211104933938e-08, "logits/chosen": -4.82962703704834, "logits/rejected": -7.258959770202637, "logps/chosen": -10.598827362060547, "logps/rejected": -23.605480194091797, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 0.16052578389644623, "rewards/margins": 4.829559803009033, "rewards/rejected": -4.669033527374268, "step": 955 }, { "epoch": 16.203389830508474, "grad_norm": 4.17471237430466, "learning_rate": 5.290628405009717e-08, "logits/chosen": -6.717578411102295, "logits/rejected": -7.951920032501221, "logps/chosen": -7.13959264755249, "logps/rejected": -21.573381423950195, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 0.28929591178894043, "rewards/margins": 4.62062931060791, "rewards/rejected": -4.331333637237549, "step": 956 }, { "epoch": 16.220338983050848, "grad_norm": 3.0367710551215716, "learning_rate": 5.2452181789017166e-08, "logits/chosen": -7.549493789672852, "logits/rejected": -6.55128812789917, "logps/chosen": -8.085127830505371, "logps/rejected": -23.510971069335938, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": -0.12076622247695923, "rewards/margins": 5.681987762451172, "rewards/rejected": -5.802753448486328, "step": 957 }, { "epoch": 16.23728813559322, "grad_norm": 4.244814839225662, "learning_rate": 5.1999808239881564e-08, "logits/chosen": -8.300333023071289, "logits/rejected": -7.503015041351318, "logps/chosen": -11.905001640319824, "logps/rejected": -26.17755699157715, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.7613542079925537, "rewards/margins": 5.459731578826904, "rewards/rejected": -4.69837760925293, "step": 958 }, { "epoch": 16.25423728813559, "grad_norm": 5.017624513304328, "learning_rate": 5.1549167361344875e-08, "logits/chosen": -5.223151206970215, "logits/rejected": -4.507918834686279, "logps/chosen": -8.082959175109863, "logps/rejected": -23.3592586517334, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 0.510887861251831, "rewards/margins": 5.510470867156982, "rewards/rejected": -4.9995832443237305, "step": 959 }, { "epoch": 16.271186440677965, "grad_norm": 4.500188138065856, "learning_rate": 5.1100263096899215e-08, "logits/chosen": -6.816982746124268, "logits/rejected": -5.620564937591553, "logps/chosen": -9.452603340148926, "logps/rejected": -28.52019500732422, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 0.2641323506832123, "rewards/margins": 5.596342086791992, "rewards/rejected": -5.332209587097168, "step": 960 }, { "epoch": 16.28813559322034, "grad_norm": 4.419413086409128, "learning_rate": 5.065309937483991e-08, "logits/chosen": -7.277461051940918, "logits/rejected": -6.101316452026367, "logps/chosen": -10.160829544067383, "logps/rejected": -19.631698608398438, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 0.03897915035486221, "rewards/margins": 4.435064315795898, "rewards/rejected": -4.396084785461426, "step": 961 }, { "epoch": 16.305084745762713, "grad_norm": 3.7105875745545043, "learning_rate": 5.020768010823101e-08, "logits/chosen": -3.605870246887207, "logits/rejected": -6.897829055786133, "logps/chosen": -13.389876365661621, "logps/rejected": -29.775251388549805, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.13197000324726105, "rewards/margins": 6.818739891052246, "rewards/rejected": -6.950709819793701, "step": 962 }, { "epoch": 16.322033898305083, "grad_norm": 4.664383108703518, "learning_rate": 4.976400919487106e-08, "logits/chosen": -6.504683971405029, "logits/rejected": -3.7136526107788086, "logps/chosen": -11.577073097229004, "logps/rejected": -28.113012313842773, "loss": 0.0505, "rewards/accuracies": 1.0, "rewards/chosen": 0.3003220558166504, "rewards/margins": 6.02211332321167, "rewards/rejected": -5.721792221069336, "step": 963 }, { "epoch": 16.338983050847457, "grad_norm": 3.7490296393228455, "learning_rate": 4.932209051725914e-08, "logits/chosen": -10.529375076293945, "logits/rejected": -10.49166488647461, "logps/chosen": -9.599217414855957, "logps/rejected": -21.940521240234375, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 0.44774866104125977, "rewards/margins": 4.731534957885742, "rewards/rejected": -4.283786296844482, "step": 964 }, { "epoch": 16.35593220338983, "grad_norm": 4.241281553767145, "learning_rate": 4.88819279425606e-08, "logits/chosen": -9.115670204162598, "logits/rejected": -7.715078830718994, "logps/chosen": -9.38962173461914, "logps/rejected": -24.184432983398438, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 0.03572952747344971, "rewards/margins": 4.836929798126221, "rewards/rejected": -4.8012003898620605, "step": 965 }, { "epoch": 16.372881355932204, "grad_norm": 4.283645502102022, "learning_rate": 4.844352532257351e-08, "logits/chosen": -7.930306911468506, "logits/rejected": -6.915701866149902, "logps/chosen": -10.192066192626953, "logps/rejected": -22.797836303710938, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 0.013980336487293243, "rewards/margins": 4.946038246154785, "rewards/rejected": -4.932058334350586, "step": 966 }, { "epoch": 16.389830508474578, "grad_norm": 3.6881742053724973, "learning_rate": 4.8006886493694885e-08, "logits/chosen": -9.653305053710938, "logits/rejected": -6.4271650314331055, "logps/chosen": -9.224141120910645, "logps/rejected": -23.99065589904785, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": 0.31926214694976807, "rewards/margins": 5.534373760223389, "rewards/rejected": -5.215112209320068, "step": 967 }, { "epoch": 16.406779661016948, "grad_norm": 4.051689710502391, "learning_rate": 4.757201527688692e-08, "logits/chosen": -0.17772293090820312, "logits/rejected": -2.985245704650879, "logps/chosen": -7.360447406768799, "logps/rejected": -19.838626861572266, "loss": 0.0442, "rewards/accuracies": 1.0, "rewards/chosen": 0.38335198163986206, "rewards/margins": 3.9256038665771484, "rewards/rejected": -3.5422518253326416, "step": 968 }, { "epoch": 16.423728813559322, "grad_norm": 4.438345671320595, "learning_rate": 4.713891547764384e-08, "logits/chosen": -3.5385613441467285, "logits/rejected": -2.1906332969665527, "logps/chosen": -11.252260208129883, "logps/rejected": -27.119346618652344, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 0.25384432077407837, "rewards/margins": 6.420773983001709, "rewards/rejected": -6.166929721832275, "step": 969 }, { "epoch": 16.440677966101696, "grad_norm": 4.119938068589781, "learning_rate": 4.67075908859583e-08, "logits/chosen": -5.7297210693359375, "logits/rejected": -3.09169864654541, "logps/chosen": -8.071650505065918, "logps/rejected": -20.209758758544922, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 0.26734113693237305, "rewards/margins": 5.061099052429199, "rewards/rejected": -4.793758392333984, "step": 970 }, { "epoch": 16.45762711864407, "grad_norm": 3.95463357002394, "learning_rate": 4.6278045276288565e-08, "logits/chosen": -7.240344047546387, "logits/rejected": -5.89601469039917, "logps/chosen": -11.00953483581543, "logps/rejected": -21.797813415527344, "loss": 0.0402, "rewards/accuracies": 1.0, "rewards/chosen": 0.6719949245452881, "rewards/margins": 5.352088928222656, "rewards/rejected": -4.680094242095947, "step": 971 }, { "epoch": 16.47457627118644, "grad_norm": 4.099529453892412, "learning_rate": 4.5850282407524975e-08, "logits/chosen": -4.743894577026367, "logits/rejected": -4.78099250793457, "logps/chosen": -8.380840301513672, "logps/rejected": -25.225353240966797, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -0.06848488748073578, "rewards/margins": 5.246893405914307, "rewards/rejected": -5.315378665924072, "step": 972 }, { "epoch": 16.491525423728813, "grad_norm": 5.069108591451907, "learning_rate": 4.5424306022957745e-08, "logits/chosen": -9.48492431640625, "logits/rejected": -11.09494400024414, "logps/chosen": -11.010992050170898, "logps/rejected": -17.152000427246094, "loss": 0.0605, "rewards/accuracies": 0.9375, "rewards/chosen": 0.36105436086654663, "rewards/margins": 3.41518497467041, "rewards/rejected": -3.0541305541992188, "step": 973 }, { "epoch": 16.508474576271187, "grad_norm": 4.282441228599341, "learning_rate": 4.5000119850243626e-08, "logits/chosen": -4.568205833435059, "logits/rejected": -3.961057662963867, "logps/chosen": -10.547223091125488, "logps/rejected": -31.581138610839844, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -0.23463180661201477, "rewards/margins": 7.356784343719482, "rewards/rejected": -7.591416358947754, "step": 974 }, { "epoch": 16.52542372881356, "grad_norm": 4.9671363698474575, "learning_rate": 4.457772760137349e-08, "logits/chosen": -4.574094772338867, "logits/rejected": -6.335107326507568, "logps/chosen": -9.245567321777344, "logps/rejected": -21.6871337890625, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 0.26459747552871704, "rewards/margins": 4.457276821136475, "rewards/rejected": -4.192679405212402, "step": 975 }, { "epoch": 16.54237288135593, "grad_norm": 4.077449203746537, "learning_rate": 4.415713297263987e-08, "logits/chosen": -9.965399742126465, "logits/rejected": -7.164189338684082, "logps/chosen": -11.206932067871094, "logps/rejected": -22.17264747619629, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 0.4054555594921112, "rewards/margins": 5.540596008300781, "rewards/rejected": -5.135139465332031, "step": 976 }, { "epoch": 16.559322033898304, "grad_norm": 4.516788897795623, "learning_rate": 4.3738339644604636e-08, "logits/chosen": -4.527769565582275, "logits/rejected": -3.9044039249420166, "logps/chosen": -9.113353729248047, "logps/rejected": -25.832937240600586, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 0.43333011865615845, "rewards/margins": 5.910764694213867, "rewards/rejected": -5.4774346351623535, "step": 977 }, { "epoch": 16.576271186440678, "grad_norm": 3.9477454836043058, "learning_rate": 4.3321351282066654e-08, "logits/chosen": -7.476665496826172, "logits/rejected": -6.898786544799805, "logps/chosen": -8.629772186279297, "logps/rejected": -23.448902130126953, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": 0.35876137018203735, "rewards/margins": 5.063984394073486, "rewards/rejected": -4.705223083496094, "step": 978 }, { "epoch": 16.593220338983052, "grad_norm": 4.17873330367698, "learning_rate": 4.290617153402984e-08, "logits/chosen": -4.806510925292969, "logits/rejected": -7.001298904418945, "logps/chosen": -7.695079326629639, "logps/rejected": -20.282554626464844, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": 0.4525095522403717, "rewards/margins": 4.603910446166992, "rewards/rejected": -4.151400566101074, "step": 979 }, { "epoch": 16.610169491525422, "grad_norm": 4.4906778542040575, "learning_rate": 4.249280403367114e-08, "logits/chosen": -8.551712989807129, "logits/rejected": -8.692821502685547, "logps/chosen": -7.653357982635498, "logps/rejected": -19.395835876464844, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 0.7328954339027405, "rewards/margins": 4.667267799377441, "rewards/rejected": -3.934372901916504, "step": 980 }, { "epoch": 16.627118644067796, "grad_norm": 4.582496619989306, "learning_rate": 4.208125239830901e-08, "logits/chosen": -4.369369029998779, "logits/rejected": -8.472222328186035, "logps/chosen": -6.89990758895874, "logps/rejected": -20.731672286987305, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 0.4413119852542877, "rewards/margins": 5.03058385848999, "rewards/rejected": -4.5892720222473145, "step": 981 }, { "epoch": 16.64406779661017, "grad_norm": 3.231930476691675, "learning_rate": 4.167152022937123e-08, "logits/chosen": -2.9806668758392334, "logits/rejected": -5.6147847175598145, "logps/chosen": -11.49258041381836, "logps/rejected": -30.111173629760742, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -0.06908190250396729, "rewards/margins": 6.325509071350098, "rewards/rejected": -6.394590377807617, "step": 982 }, { "epoch": 16.661016949152543, "grad_norm": 4.681138904637989, "learning_rate": 4.126361111236395e-08, "logits/chosen": -5.353288173675537, "logits/rejected": -8.29789924621582, "logps/chosen": -9.73030948638916, "logps/rejected": -21.84670639038086, "loss": 0.0521, "rewards/accuracies": 1.0, "rewards/chosen": 0.2567916512489319, "rewards/margins": 4.876055717468262, "rewards/rejected": -4.619264125823975, "step": 983 }, { "epoch": 16.677966101694913, "grad_norm": 4.336367247523663, "learning_rate": 4.08575286168398e-08, "logits/chosen": -7.448416709899902, "logits/rejected": -8.908486366271973, "logps/chosen": -5.859768867492676, "logps/rejected": -18.815467834472656, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 0.4510043263435364, "rewards/margins": 5.035971641540527, "rewards/rejected": -4.584967613220215, "step": 984 }, { "epoch": 16.694915254237287, "grad_norm": 3.9740284639920125, "learning_rate": 4.0453276296367134e-08, "logits/chosen": -6.158625602722168, "logits/rejected": -6.769649982452393, "logps/chosen": -8.739511489868164, "logps/rejected": -21.11366081237793, "loss": 0.0365, "rewards/accuracies": 1.0, "rewards/chosen": 0.20570890605449677, "rewards/margins": 4.766705513000488, "rewards/rejected": -4.560997009277344, "step": 985 }, { "epoch": 16.71186440677966, "grad_norm": 4.345605358423864, "learning_rate": 4.005085768849856e-08, "logits/chosen": -7.33724308013916, "logits/rejected": -7.3545241355896, "logps/chosen": -10.614309310913086, "logps/rejected": -25.42418670654297, "loss": 0.0572, "rewards/accuracies": 1.0, "rewards/chosen": -0.24098001420497894, "rewards/margins": 5.777004241943359, "rewards/rejected": -6.017984390258789, "step": 986 }, { "epoch": 16.728813559322035, "grad_norm": 5.174961944414816, "learning_rate": 3.965027631474035e-08, "logits/chosen": -5.207669734954834, "logits/rejected": -5.68195104598999, "logps/chosen": -8.797658920288086, "logps/rejected": -19.174301147460938, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 0.1935984045267105, "rewards/margins": 4.448380947113037, "rewards/rejected": -4.254782199859619, "step": 987 }, { "epoch": 16.74576271186441, "grad_norm": 4.914330167018493, "learning_rate": 3.9251535680521226e-08, "logits/chosen": -5.299089431762695, "logits/rejected": -5.562166690826416, "logps/chosen": -7.950815677642822, "logps/rejected": -23.660730361938477, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 0.5078102350234985, "rewards/margins": 5.57275390625, "rewards/rejected": -5.064943790435791, "step": 988 }, { "epoch": 16.76271186440678, "grad_norm": 4.423714547925529, "learning_rate": 3.885463927516189e-08, "logits/chosen": -7.4200263023376465, "logits/rejected": -6.687015533447266, "logps/chosen": -7.834423065185547, "logps/rejected": -20.39315414428711, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 0.6284862160682678, "rewards/margins": 4.580899238586426, "rewards/rejected": -3.9524128437042236, "step": 989 }, { "epoch": 16.779661016949152, "grad_norm": 3.9593534596195514, "learning_rate": 3.845959057184453e-08, "logits/chosen": -9.570975303649902, "logits/rejected": -10.773117065429688, "logps/chosen": -8.251781463623047, "logps/rejected": -19.07634735107422, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 0.48474591970443726, "rewards/margins": 3.843752384185791, "rewards/rejected": -3.359006404876709, "step": 990 }, { "epoch": 16.796610169491526, "grad_norm": 3.9910442343182297, "learning_rate": 3.806639302758227e-08, "logits/chosen": -5.000946044921875, "logits/rejected": -4.220808029174805, "logps/chosen": -8.912650108337402, "logps/rejected": -18.11932945251465, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 0.6203174591064453, "rewards/margins": 4.1827712059021, "rewards/rejected": -3.5624542236328125, "step": 991 }, { "epoch": 16.8135593220339, "grad_norm": 3.6120155500539144, "learning_rate": 3.767505008318914e-08, "logits/chosen": -5.242963790893555, "logits/rejected": -3.9384145736694336, "logps/chosen": -10.507009506225586, "logps/rejected": -27.637222290039062, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 0.36286771297454834, "rewards/margins": 5.9236369132995605, "rewards/rejected": -5.560769081115723, "step": 992 }, { "epoch": 16.83050847457627, "grad_norm": 4.329264208104702, "learning_rate": 3.728556516324971e-08, "logits/chosen": -3.621446371078491, "logits/rejected": 0.20612281560897827, "logps/chosen": -19.473979949951172, "logps/rejected": -28.62809181213379, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 0.3174746632575989, "rewards/margins": 5.846737861633301, "rewards/rejected": -5.529263496398926, "step": 993 }, { "epoch": 16.847457627118644, "grad_norm": 4.120460619502292, "learning_rate": 3.6897941676089365e-08, "logits/chosen": -9.164158821105957, "logits/rejected": -6.803136825561523, "logps/chosen": -14.740889549255371, "logps/rejected": -23.247711181640625, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 0.6667031049728394, "rewards/margins": 4.748815536499023, "rewards/rejected": -4.082113265991211, "step": 994 }, { "epoch": 16.864406779661017, "grad_norm": 3.5605423273208143, "learning_rate": 3.651218301374431e-08, "logits/chosen": -5.762743949890137, "logits/rejected": -4.326004505157471, "logps/chosen": -7.795363426208496, "logps/rejected": -30.06238555908203, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": -0.023066237568855286, "rewards/margins": 7.547117710113525, "rewards/rejected": -7.570183277130127, "step": 995 }, { "epoch": 16.88135593220339, "grad_norm": 4.515991407877982, "learning_rate": 3.612829255193192e-08, "logits/chosen": -6.335334777832031, "logits/rejected": -5.021296501159668, "logps/chosen": -10.029027938842773, "logps/rejected": -25.938034057617188, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 0.11987611651420593, "rewards/margins": 6.307064533233643, "rewards/rejected": -6.187188148498535, "step": 996 }, { "epoch": 16.89830508474576, "grad_norm": 4.162547805580534, "learning_rate": 3.574627365002122e-08, "logits/chosen": -4.132068634033203, "logits/rejected": -1.9441083669662476, "logps/chosen": -12.128137588500977, "logps/rejected": -35.27802658081055, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": -0.6331876516342163, "rewards/margins": 6.974603652954102, "rewards/rejected": -7.607790946960449, "step": 997 }, { "epoch": 16.915254237288135, "grad_norm": 5.532298109239178, "learning_rate": 3.536612965100361e-08, "logits/chosen": -6.583493232727051, "logits/rejected": -4.180866718292236, "logps/chosen": -10.267550468444824, "logps/rejected": -23.525606155395508, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": 0.0621405765414238, "rewards/margins": 5.680069923400879, "rewards/rejected": -5.617929458618164, "step": 998 }, { "epoch": 16.93220338983051, "grad_norm": 4.459495867860006, "learning_rate": 3.4987863881463296e-08, "logits/chosen": -7.422240257263184, "logits/rejected": -7.356123924255371, "logps/chosen": -6.346535682678223, "logps/rejected": -20.76394271850586, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 0.0896294116973877, "rewards/margins": 4.644069194793701, "rewards/rejected": -4.554439544677734, "step": 999 }, { "epoch": 16.949152542372882, "grad_norm": 4.133219670295683, "learning_rate": 3.461147965154845e-08, "logits/chosen": -6.083202838897705, "logits/rejected": -8.11595630645752, "logps/chosen": -6.606565475463867, "logps/rejected": -19.625391006469727, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 0.5629207491874695, "rewards/margins": 5.111311435699463, "rewards/rejected": -4.548391342163086, "step": 1000 }, { "epoch": 16.966101694915253, "grad_norm": 4.1740786495102755, "learning_rate": 3.423698025494234e-08, "logits/chosen": -10.186813354492188, "logits/rejected": -9.291478157043457, "logps/chosen": -7.909204483032227, "logps/rejected": -18.85116195678711, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 0.4344901144504547, "rewards/margins": 4.76976203918457, "rewards/rejected": -4.335272312164307, "step": 1001 }, { "epoch": 16.983050847457626, "grad_norm": 4.15892660544233, "learning_rate": 3.386436896883407e-08, "logits/chosen": -5.626341342926025, "logits/rejected": -3.084937810897827, "logps/chosen": -9.295838356018066, "logps/rejected": -24.96091079711914, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 0.06559734046459198, "rewards/margins": 5.8852386474609375, "rewards/rejected": -5.819642066955566, "step": 1002 }, { "epoch": 17.0, "grad_norm": 3.8140424835677633, "learning_rate": 3.349364905389032e-08, "logits/chosen": -9.058091163635254, "logits/rejected": -6.692200660705566, "logps/chosen": -12.010008811950684, "logps/rejected": -19.010009765625, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 0.41910070180892944, "rewards/margins": 3.528287887573242, "rewards/rejected": -3.109187364578247, "step": 1003 }, { "epoch": 17.016949152542374, "grad_norm": 4.197883218610352, "learning_rate": 3.3124823754226625e-08, "logits/chosen": -6.8970561027526855, "logits/rejected": -6.827741622924805, "logps/chosen": -8.531798362731934, "logps/rejected": -22.772443771362305, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 0.45479583740234375, "rewards/margins": 5.231786251068115, "rewards/rejected": -4.77699089050293, "step": 1004 }, { "epoch": 17.033898305084747, "grad_norm": 4.497698689530865, "learning_rate": 3.275789629737905e-08, "logits/chosen": -5.448521614074707, "logits/rejected": -5.657861709594727, "logps/chosen": -8.764067649841309, "logps/rejected": -20.854806900024414, "loss": 0.0516, "rewards/accuracies": 1.0, "rewards/chosen": 0.20594146847724915, "rewards/margins": 4.639956474304199, "rewards/rejected": -4.434015274047852, "step": 1005 }, { "epoch": 17.050847457627118, "grad_norm": 4.5378204622729825, "learning_rate": 3.2392869894275726e-08, "logits/chosen": -8.017354965209961, "logits/rejected": -7.641470909118652, "logps/chosen": -9.67556381225586, "logps/rejected": -24.360857009887695, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 0.02913103997707367, "rewards/margins": 5.586639404296875, "rewards/rejected": -5.55750846862793, "step": 1006 }, { "epoch": 17.06779661016949, "grad_norm": 4.100101534069955, "learning_rate": 3.2029747739209245e-08, "logits/chosen": -5.569543838500977, "logits/rejected": -6.685817718505859, "logps/chosen": -10.705131530761719, "logps/rejected": -23.21912956237793, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": 0.405699759721756, "rewards/margins": 5.077655792236328, "rewards/rejected": -4.671957015991211, "step": 1007 }, { "epoch": 17.084745762711865, "grad_norm": 4.211710710930013, "learning_rate": 3.166853300980821e-08, "logits/chosen": -6.40574836730957, "logits/rejected": -5.807041168212891, "logps/chosen": -7.398978233337402, "logps/rejected": -17.30287742614746, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 0.3856056034564972, "rewards/margins": 4.216588973999023, "rewards/rejected": -3.8309836387634277, "step": 1008 }, { "epoch": 17.10169491525424, "grad_norm": 4.273538651613034, "learning_rate": 3.130922886700968e-08, "logits/chosen": -10.244964599609375, "logits/rejected": -10.494848251342773, "logps/chosen": -8.509642601013184, "logps/rejected": -18.30198860168457, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 0.5988513231277466, "rewards/margins": 4.096191883087158, "rewards/rejected": -3.4973411560058594, "step": 1009 }, { "epoch": 17.11864406779661, "grad_norm": 4.25709039242675, "learning_rate": 3.095183845503144e-08, "logits/chosen": -7.940372467041016, "logits/rejected": -7.701578617095947, "logps/chosen": -10.985665321350098, "logps/rejected": -27.06873893737793, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 0.39520519971847534, "rewards/margins": 6.167887210845947, "rewards/rejected": -5.772682189941406, "step": 1010 }, { "epoch": 17.135593220338983, "grad_norm": 4.338372503042337, "learning_rate": 3.059636490134448e-08, "logits/chosen": -7.376964092254639, "logits/rejected": -7.629546642303467, "logps/chosen": -8.467211723327637, "logps/rejected": -22.428627014160156, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": 0.25212186574935913, "rewards/margins": 5.631921291351318, "rewards/rejected": -5.379799842834473, "step": 1011 }, { "epoch": 17.152542372881356, "grad_norm": 4.223004676181515, "learning_rate": 3.024281131664569e-08, "logits/chosen": -5.16343879699707, "logits/rejected": -6.354394912719727, "logps/chosen": -11.564605712890625, "logps/rejected": -31.499488830566406, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -0.3362666368484497, "rewards/margins": 7.0102033615112305, "rewards/rejected": -7.346470355987549, "step": 1012 }, { "epoch": 17.16949152542373, "grad_norm": 4.32681883158123, "learning_rate": 2.989118079483052e-08, "logits/chosen": -5.105967044830322, "logits/rejected": -4.726531028747559, "logps/chosen": -9.233987808227539, "logps/rejected": -22.715604782104492, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -0.053199321031570435, "rewards/margins": 4.993124008178711, "rewards/rejected": -5.046323776245117, "step": 1013 }, { "epoch": 17.1864406779661, "grad_norm": 4.308348206921959, "learning_rate": 2.9541476412966032e-08, "logits/chosen": -7.126892566680908, "logits/rejected": -7.172115802764893, "logps/chosen": -6.875704288482666, "logps/rejected": -17.938127517700195, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 0.6449772715568542, "rewards/margins": 4.4160332679748535, "rewards/rejected": -3.771056652069092, "step": 1014 }, { "epoch": 17.203389830508474, "grad_norm": 4.387522145212717, "learning_rate": 2.9193701231263967e-08, "logits/chosen": -7.878859043121338, "logits/rejected": -10.106086730957031, "logps/chosen": -8.252893447875977, "logps/rejected": -20.68426513671875, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": 0.31005167961120605, "rewards/margins": 4.4923577308654785, "rewards/rejected": -4.182306289672852, "step": 1015 }, { "epoch": 17.220338983050848, "grad_norm": 4.017533952520582, "learning_rate": 2.8847858293053805e-08, "logits/chosen": -4.7985711097717285, "logits/rejected": -3.0869522094726562, "logps/chosen": -12.94370174407959, "logps/rejected": -25.18770980834961, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -0.2541043162345886, "rewards/margins": 5.228729248046875, "rewards/rejected": -5.482833385467529, "step": 1016 }, { "epoch": 17.23728813559322, "grad_norm": 3.9242681899847254, "learning_rate": 2.8503950624756413e-08, "logits/chosen": -3.531855821609497, "logits/rejected": -2.8809752464294434, "logps/chosen": -9.461271286010742, "logps/rejected": -28.473838806152344, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 0.07847141474485397, "rewards/margins": 7.158161163330078, "rewards/rejected": -7.079689979553223, "step": 1017 }, { "epoch": 17.25423728813559, "grad_norm": 3.7660967758299075, "learning_rate": 2.816198123585714e-08, "logits/chosen": -6.652817249298096, "logits/rejected": -2.6611244678497314, "logps/chosen": -10.80378246307373, "logps/rejected": -20.912567138671875, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": 0.6135255098342896, "rewards/margins": 4.984477996826172, "rewards/rejected": -4.37095308303833, "step": 1018 }, { "epoch": 17.271186440677965, "grad_norm": 4.1856106009455765, "learning_rate": 2.782195311887997e-08, "logits/chosen": -8.43665599822998, "logits/rejected": -5.13716459274292, "logps/chosen": -7.108492374420166, "logps/rejected": -21.752134323120117, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 0.394392728805542, "rewards/margins": 5.154807090759277, "rewards/rejected": -4.760413646697998, "step": 1019 }, { "epoch": 17.28813559322034, "grad_norm": 3.9921502276466527, "learning_rate": 2.7483869249360912e-08, "logits/chosen": -9.98592758178711, "logits/rejected": -9.118145942687988, "logps/chosen": -9.33875846862793, "logps/rejected": -22.786651611328125, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": -0.24660375714302063, "rewards/margins": 4.794728755950928, "rewards/rejected": -5.041332721710205, "step": 1020 }, { "epoch": 17.305084745762713, "grad_norm": 4.1519514831089985, "learning_rate": 2.7147732585822425e-08, "logits/chosen": -6.564297199249268, "logits/rejected": -4.144745349884033, "logps/chosen": -8.098861694335938, "logps/rejected": -21.80293846130371, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 0.4752035140991211, "rewards/margins": 5.580191135406494, "rewards/rejected": -5.104987621307373, "step": 1021 }, { "epoch": 17.322033898305083, "grad_norm": 4.665077369798747, "learning_rate": 2.6813546069746978e-08, "logits/chosen": -6.514552116394043, "logits/rejected": -6.659337043762207, "logps/chosen": -13.189889907836914, "logps/rejected": -21.970909118652344, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": 0.22609473764896393, "rewards/margins": 4.234546661376953, "rewards/rejected": -4.008452415466309, "step": 1022 }, { "epoch": 17.338983050847457, "grad_norm": 4.136419957605596, "learning_rate": 2.6481312625551726e-08, "logits/chosen": -5.896478652954102, "logits/rejected": -5.556583404541016, "logps/chosen": -9.151643753051758, "logps/rejected": -23.223796844482422, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 0.22782325744628906, "rewards/margins": 4.698112964630127, "rewards/rejected": -4.47028923034668, "step": 1023 }, { "epoch": 17.35593220338983, "grad_norm": 4.0030922528193855, "learning_rate": 2.6151035160562747e-08, "logits/chosen": -5.044496059417725, "logits/rejected": -4.717165470123291, "logps/chosen": -13.499955177307129, "logps/rejected": -24.894947052001953, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.14019767940044403, "rewards/margins": 4.581329345703125, "rewards/rejected": -4.721526622772217, "step": 1024 }, { "epoch": 17.372881355932204, "grad_norm": 4.458265653482429, "learning_rate": 2.5822716564989605e-08, "logits/chosen": -5.746660232543945, "logits/rejected": -5.290646553039551, "logps/chosen": -8.748152732849121, "logps/rejected": -20.10523223876953, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 0.3295852839946747, "rewards/margins": 4.266857147216797, "rewards/rejected": -3.9372718334198, "step": 1025 }, { "epoch": 17.389830508474578, "grad_norm": 4.048704607082209, "learning_rate": 2.5496359711900117e-08, "logits/chosen": -7.465732574462891, "logits/rejected": -4.066118240356445, "logps/chosen": -11.653614044189453, "logps/rejected": -23.19156837463379, "loss": 0.0492, "rewards/accuracies": 1.0, "rewards/chosen": 0.04617789387702942, "rewards/margins": 4.813108444213867, "rewards/rejected": -4.766931056976318, "step": 1026 }, { "epoch": 17.406779661016948, "grad_norm": 4.763237864684059, "learning_rate": 2.5171967457195213e-08, "logits/chosen": -10.00543212890625, "logits/rejected": -7.8041534423828125, "logps/chosen": -10.356070518493652, "logps/rejected": -18.669546127319336, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 0.12074128538370132, "rewards/margins": 4.16883659362793, "rewards/rejected": -4.048095703125, "step": 1027 }, { "epoch": 17.423728813559322, "grad_norm": 3.807693575659305, "learning_rate": 2.4849542639583832e-08, "logits/chosen": -7.451657295227051, "logits/rejected": -4.065084457397461, "logps/chosen": -16.081920623779297, "logps/rejected": -29.290130615234375, "loss": 0.0417, "rewards/accuracies": 1.0, "rewards/chosen": 0.11072258651256561, "rewards/margins": 6.283128261566162, "rewards/rejected": -6.172406196594238, "step": 1028 }, { "epoch": 17.440677966101696, "grad_norm": 4.561097923698961, "learning_rate": 2.4529088080558202e-08, "logits/chosen": -7.064828395843506, "logits/rejected": -5.508311748504639, "logps/chosen": -10.803018569946289, "logps/rejected": -26.995498657226562, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": -0.13047471642494202, "rewards/margins": 6.475358963012695, "rewards/rejected": -6.605833053588867, "step": 1029 }, { "epoch": 17.45762711864407, "grad_norm": 4.094841195832772, "learning_rate": 2.4210606584369103e-08, "logits/chosen": -7.4426045417785645, "logits/rejected": -5.818709373474121, "logps/chosen": -5.992995262145996, "logps/rejected": -22.609210968017578, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 0.6274833083152771, "rewards/margins": 5.398805618286133, "rewards/rejected": -4.771321773529053, "step": 1030 }, { "epoch": 17.47457627118644, "grad_norm": 4.183739338552097, "learning_rate": 2.3894100938001372e-08, "logits/chosen": -7.961667060852051, "logits/rejected": -6.155003070831299, "logps/chosen": -9.56466007232666, "logps/rejected": -21.0900936126709, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 0.32264530658721924, "rewards/margins": 4.696996212005615, "rewards/rejected": -4.3743510246276855, "step": 1031 }, { "epoch": 17.491525423728813, "grad_norm": 4.53635220017222, "learning_rate": 2.3579573911149397e-08, "logits/chosen": -5.375433444976807, "logits/rejected": -2.4169154167175293, "logps/chosen": -13.866455078125, "logps/rejected": -29.32849884033203, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 0.09247753024101257, "rewards/margins": 6.1781907081604, "rewards/rejected": -6.085712432861328, "step": 1032 }, { "epoch": 17.508474576271187, "grad_norm": 4.7958550897023615, "learning_rate": 2.3267028256193034e-08, "logits/chosen": -2.3388025760650635, "logits/rejected": -4.182887077331543, "logps/chosen": -9.572820663452148, "logps/rejected": -19.907470703125, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": -0.11249032616615295, "rewards/margins": 4.0869975090026855, "rewards/rejected": -4.199487686157227, "step": 1033 }, { "epoch": 17.52542372881356, "grad_norm": 4.543938420668149, "learning_rate": 2.2956466708173304e-08, "logits/chosen": -7.524108409881592, "logits/rejected": -5.194488525390625, "logps/chosen": -10.331520080566406, "logps/rejected": -23.630775451660156, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 0.2804371118545532, "rewards/margins": 5.539437294006348, "rewards/rejected": -5.259000301361084, "step": 1034 }, { "epoch": 17.54237288135593, "grad_norm": 4.811132955244365, "learning_rate": 2.2647891984768853e-08, "logits/chosen": -4.286440849304199, "logits/rejected": -2.8485355377197266, "logps/chosen": -8.513978958129883, "logps/rejected": -24.73643684387207, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": -0.3493989109992981, "rewards/margins": 5.959595680236816, "rewards/rejected": -6.308995246887207, "step": 1035 }, { "epoch": 17.559322033898304, "grad_norm": 4.727659797948743, "learning_rate": 2.234130678627169e-08, "logits/chosen": -8.573627471923828, "logits/rejected": -8.825172424316406, "logps/chosen": -8.619029998779297, "logps/rejected": -18.161518096923828, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 0.2388451248407364, "rewards/margins": 4.048605442047119, "rewards/rejected": -3.809760332107544, "step": 1036 }, { "epoch": 17.576271186440678, "grad_norm": 3.612875798728581, "learning_rate": 2.2036713795563876e-08, "logits/chosen": -7.9563422203063965, "logits/rejected": -6.366887092590332, "logps/chosen": -8.55833625793457, "logps/rejected": -19.83431053161621, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": 0.2833678722381592, "rewards/margins": 4.818700313568115, "rewards/rejected": -4.535332202911377, "step": 1037 }, { "epoch": 17.593220338983052, "grad_norm": 3.498209777073271, "learning_rate": 2.1734115678093938e-08, "logits/chosen": -3.2180469036102295, "logits/rejected": -2.4762351512908936, "logps/chosen": -10.812689781188965, "logps/rejected": -24.2170352935791, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": 0.30085691809654236, "rewards/margins": 5.222414970397949, "rewards/rejected": -4.921557903289795, "step": 1038 }, { "epoch": 17.610169491525422, "grad_norm": 4.708256784933598, "learning_rate": 2.1433515081853594e-08, "logits/chosen": -4.538162708282471, "logits/rejected": -2.6741445064544678, "logps/chosen": -9.551587104797363, "logps/rejected": -27.897594451904297, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 0.22916676104068756, "rewards/margins": 6.038286209106445, "rewards/rejected": -5.809118747711182, "step": 1039 }, { "epoch": 17.627118644067796, "grad_norm": 3.917316226481937, "learning_rate": 2.1134914637354368e-08, "logits/chosen": -7.627310752868652, "logits/rejected": -7.482641220092773, "logps/chosen": -10.15949821472168, "logps/rejected": -23.192731857299805, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -0.25363242626190186, "rewards/margins": 4.933504581451416, "rewards/rejected": -5.187136650085449, "step": 1040 }, { "epoch": 17.64406779661017, "grad_norm": 4.028125385219284, "learning_rate": 2.0838316957605074e-08, "logits/chosen": -3.0444884300231934, "logits/rejected": -2.337344169616699, "logps/chosen": -7.633298873901367, "logps/rejected": -20.37324333190918, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 0.8732346296310425, "rewards/margins": 4.761894226074219, "rewards/rejected": -3.8886594772338867, "step": 1041 }, { "epoch": 17.661016949152543, "grad_norm": 4.097102323738361, "learning_rate": 2.0543724638088345e-08, "logits/chosen": -9.5912446975708, "logits/rejected": -8.915510177612305, "logps/chosen": -7.766392707824707, "logps/rejected": -18.353717803955078, "loss": 0.0389, "rewards/accuracies": 1.0, "rewards/chosen": 0.5832113027572632, "rewards/margins": 4.126861095428467, "rewards/rejected": -3.543649673461914, "step": 1042 }, { "epoch": 17.677966101694913, "grad_norm": 3.8524271511787362, "learning_rate": 2.0251140256738352e-08, "logits/chosen": -7.298845291137695, "logits/rejected": -5.406156063079834, "logps/chosen": -9.05811595916748, "logps/rejected": -18.826417922973633, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": -0.15431635081768036, "rewards/margins": 4.819045066833496, "rewards/rejected": -4.973361015319824, "step": 1043 }, { "epoch": 17.694915254237287, "grad_norm": 4.367851733163418, "learning_rate": 1.996056637391805e-08, "logits/chosen": -5.809117317199707, "logits/rejected": -6.983155250549316, "logps/chosen": -8.29562759399414, "logps/rejected": -21.50481414794922, "loss": 0.0433, "rewards/accuracies": 1.0, "rewards/chosen": 0.4013362526893616, "rewards/margins": 5.432553291320801, "rewards/rejected": -5.031217098236084, "step": 1044 }, { "epoch": 17.71186440677966, "grad_norm": 4.938930187142403, "learning_rate": 1.9672005532396756e-08, "logits/chosen": -5.387691020965576, "logits/rejected": -4.999997138977051, "logps/chosen": -6.544239044189453, "logps/rejected": -20.39162254333496, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 0.4906653165817261, "rewards/margins": 4.722982406616211, "rewards/rejected": -4.232316493988037, "step": 1045 }, { "epoch": 17.728813559322035, "grad_norm": 3.4996852036714134, "learning_rate": 1.938546025732807e-08, "logits/chosen": -4.897934436798096, "logits/rejected": -7.4452338218688965, "logps/chosen": -9.388957977294922, "logps/rejected": -24.98798370361328, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": 0.7924555540084839, "rewards/margins": 6.371291637420654, "rewards/rejected": -5.578836441040039, "step": 1046 }, { "epoch": 17.74576271186441, "grad_norm": 4.85977590522318, "learning_rate": 1.910093305622759e-08, "logits/chosen": -8.88355541229248, "logits/rejected": -9.622832298278809, "logps/chosen": -6.02414608001709, "logps/rejected": -18.758188247680664, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": 0.16433702409267426, "rewards/margins": 4.420986652374268, "rewards/rejected": -4.256649494171143, "step": 1047 }, { "epoch": 17.76271186440678, "grad_norm": 3.543759620480611, "learning_rate": 1.881842641895104e-08, "logits/chosen": -7.751880645751953, "logits/rejected": -7.712557315826416, "logps/chosen": -7.928264617919922, "logps/rejected": -22.417421340942383, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 0.4415779113769531, "rewards/margins": 5.357847213745117, "rewards/rejected": -4.916269779205322, "step": 1048 }, { "epoch": 17.779661016949152, "grad_norm": 3.708075011329131, "learning_rate": 1.853794281767257e-08, "logits/chosen": -5.700033664703369, "logits/rejected": -6.737403869628906, "logps/chosen": -9.658474922180176, "logps/rejected": -23.03371810913086, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 0.2553105354309082, "rewards/margins": 5.234497547149658, "rewards/rejected": -4.979187488555908, "step": 1049 }, { "epoch": 17.796610169491526, "grad_norm": 3.72922105134763, "learning_rate": 1.8259484706862948e-08, "logits/chosen": -6.088375568389893, "logits/rejected": -6.036728858947754, "logps/chosen": -9.52256965637207, "logps/rejected": -24.765438079833984, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.43426060676574707, "rewards/margins": 5.606012344360352, "rewards/rejected": -6.040272235870361, "step": 1050 }, { "epoch": 17.8135593220339, "grad_norm": 4.693389158310756, "learning_rate": 1.798305452326826e-08, "logits/chosen": -10.646463394165039, "logits/rejected": -7.9289679527282715, "logps/chosen": -12.208354949951172, "logps/rejected": -24.25341796875, "loss": 0.0557, "rewards/accuracies": 1.0, "rewards/chosen": 0.10868895053863525, "rewards/margins": 5.147779941558838, "rewards/rejected": -5.03909158706665, "step": 1051 }, { "epoch": 17.83050847457627, "grad_norm": 4.874265696950657, "learning_rate": 1.7708654685888336e-08, "logits/chosen": -8.247642517089844, "logits/rejected": -3.546093225479126, "logps/chosen": -7.774269104003906, "logps/rejected": -21.775653839111328, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 0.31666943430900574, "rewards/margins": 5.547506809234619, "rewards/rejected": -5.230836868286133, "step": 1052 }, { "epoch": 17.847457627118644, "grad_norm": 4.756915177911726, "learning_rate": 1.7436287595955944e-08, "logits/chosen": -3.051187038421631, "logits/rejected": -3.3574652671813965, "logps/chosen": -8.808416366577148, "logps/rejected": -24.266923904418945, "loss": 0.052, "rewards/accuracies": 1.0, "rewards/chosen": 0.7775027751922607, "rewards/margins": 4.976606369018555, "rewards/rejected": -4.199103355407715, "step": 1053 }, { "epoch": 17.864406779661017, "grad_norm": 4.879310156866597, "learning_rate": 1.7165955636915392e-08, "logits/chosen": -8.966073036193848, "logits/rejected": -8.02140998840332, "logps/chosen": -8.468016624450684, "logps/rejected": -20.691829681396484, "loss": 0.046, "rewards/accuracies": 1.0, "rewards/chosen": 0.2952655553817749, "rewards/margins": 4.937621593475342, "rewards/rejected": -4.642355442047119, "step": 1054 }, { "epoch": 17.88135593220339, "grad_norm": 4.916091941632118, "learning_rate": 1.6897661174402057e-08, "logits/chosen": -5.873769760131836, "logits/rejected": -5.605523586273193, "logps/chosen": -9.172090530395508, "logps/rejected": -26.349031448364258, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 0.1761208176612854, "rewards/margins": 6.015247344970703, "rewards/rejected": -5.8391265869140625, "step": 1055 }, { "epoch": 17.89830508474576, "grad_norm": 3.9104875119431144, "learning_rate": 1.6631406556221333e-08, "logits/chosen": -4.374204635620117, "logits/rejected": -4.46373987197876, "logps/chosen": -13.30536937713623, "logps/rejected": -24.613677978515625, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 0.5727440118789673, "rewards/margins": 5.206787586212158, "rewards/rejected": -4.6340436935424805, "step": 1056 }, { "epoch": 17.915254237288135, "grad_norm": 4.568419925624105, "learning_rate": 1.6367194112328288e-08, "logits/chosen": -5.129965782165527, "logits/rejected": -0.9094352722167969, "logps/chosen": -10.443857192993164, "logps/rejected": -30.575592041015625, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": -0.13605478405952454, "rewards/margins": 6.419304847717285, "rewards/rejected": -6.555359363555908, "step": 1057 }, { "epoch": 17.93220338983051, "grad_norm": 3.363814152935196, "learning_rate": 1.6105026154807215e-08, "logits/chosen": -8.336222648620605, "logits/rejected": -7.247498035430908, "logps/chosen": -9.268970489501953, "logps/rejected": -26.44733238220215, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -0.12646730244159698, "rewards/margins": 5.633796691894531, "rewards/rejected": -5.7602643966674805, "step": 1058 }, { "epoch": 17.949152542372882, "grad_norm": 3.992182802809226, "learning_rate": 1.5844904977851376e-08, "logits/chosen": -5.82818603515625, "logits/rejected": -7.631371021270752, "logps/chosen": -9.781405448913574, "logps/rejected": -27.68490982055664, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": -0.055077582597732544, "rewards/margins": 5.673110485076904, "rewards/rejected": -5.7281880378723145, "step": 1059 }, { "epoch": 17.966101694915253, "grad_norm": 5.580584613487915, "learning_rate": 1.558683285774304e-08, "logits/chosen": -7.345905780792236, "logits/rejected": -4.286530017852783, "logps/chosen": -8.853185653686523, "logps/rejected": -21.969799041748047, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": 0.22210994362831116, "rewards/margins": 4.812788486480713, "rewards/rejected": -4.5906782150268555, "step": 1060 }, { "epoch": 17.983050847457626, "grad_norm": 4.198433535721578, "learning_rate": 1.5330812052833402e-08, "logits/chosen": -4.884190082550049, "logits/rejected": -5.0173821449279785, "logps/chosen": -12.070671081542969, "logps/rejected": -24.6473388671875, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 0.4499714970588684, "rewards/margins": 4.89157772064209, "rewards/rejected": -4.441605567932129, "step": 1061 }, { "epoch": 18.0, "grad_norm": 3.5016579794830283, "learning_rate": 1.507684480352292e-08, "logits/chosen": -12.084006309509277, "logits/rejected": -5.082461833953857, "logps/chosen": -12.766876220703125, "logps/rejected": -17.5455322265625, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 0.653103232383728, "rewards/margins": 4.414988040924072, "rewards/rejected": -3.7618846893310547, "step": 1062 }, { "epoch": 18.016949152542374, "grad_norm": 4.312459721063018, "learning_rate": 1.4824933332241691e-08, "logits/chosen": -10.255056381225586, "logits/rejected": -6.813674449920654, "logps/chosen": -13.091928482055664, "logps/rejected": -25.015186309814453, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": -0.17327140271663666, "rewards/margins": 4.631661415100098, "rewards/rejected": -4.804932594299316, "step": 1063 }, { "epoch": 18.033898305084747, "grad_norm": 4.087149777468893, "learning_rate": 1.457507984343001e-08, "logits/chosen": -8.456153869628906, "logits/rejected": -5.839759826660156, "logps/chosen": -10.093239784240723, "logps/rejected": -25.46035385131836, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 0.1286679208278656, "rewards/margins": 5.974670886993408, "rewards/rejected": -5.846002578735352, "step": 1064 }, { "epoch": 18.050847457627118, "grad_norm": 3.9089755517483824, "learning_rate": 1.4327286523519083e-08, "logits/chosen": -8.93708610534668, "logits/rejected": -10.533562660217285, "logps/chosen": -5.521016597747803, "logps/rejected": -15.870177268981934, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": 0.36885491013526917, "rewards/margins": 3.37803316116333, "rewards/rejected": -3.0091781616210938, "step": 1065 }, { "epoch": 18.06779661016949, "grad_norm": 4.2051588856191175, "learning_rate": 1.4081555540911837e-08, "logits/chosen": -4.821186542510986, "logits/rejected": -6.939448356628418, "logps/chosen": -9.59068489074707, "logps/rejected": -27.474403381347656, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 0.003167472779750824, "rewards/margins": 5.379495620727539, "rewards/rejected": -5.376327991485596, "step": 1066 }, { "epoch": 18.084745762711865, "grad_norm": 4.36531145641213, "learning_rate": 1.383788904596403e-08, "logits/chosen": -6.268502235412598, "logits/rejected": -5.068531513214111, "logps/chosen": -10.58634090423584, "logps/rejected": -26.577701568603516, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": -0.05589911341667175, "rewards/margins": 5.647587776184082, "rewards/rejected": -5.70348596572876, "step": 1067 }, { "epoch": 18.10169491525424, "grad_norm": 4.64611842993512, "learning_rate": 1.3596289170965308e-08, "logits/chosen": -6.981442928314209, "logits/rejected": -3.912247657775879, "logps/chosen": -9.348136901855469, "logps/rejected": -23.2004337310791, "loss": 0.0595, "rewards/accuracies": 1.0, "rewards/chosen": 0.3433188199996948, "rewards/margins": 5.749058246612549, "rewards/rejected": -5.405739784240723, "step": 1068 }, { "epoch": 18.11864406779661, "grad_norm": 4.3465310006915745, "learning_rate": 1.3356758030120762e-08, "logits/chosen": -4.298106670379639, "logits/rejected": -3.6244497299194336, "logps/chosen": -11.371649742126465, "logps/rejected": -27.595565795898438, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008899867534637451, "rewards/margins": 5.844209671020508, "rewards/rejected": -5.845099449157715, "step": 1069 }, { "epoch": 18.135593220338983, "grad_norm": 4.191316361646987, "learning_rate": 1.3119297719532241e-08, "logits/chosen": -7.277906894683838, "logits/rejected": -4.062497615814209, "logps/chosen": -9.189289093017578, "logps/rejected": -19.087574005126953, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 0.21245090663433075, "rewards/margins": 4.8552045822143555, "rewards/rejected": -4.642754077911377, "step": 1070 }, { "epoch": 18.152542372881356, "grad_norm": 4.984892397415893, "learning_rate": 1.2883910317180003e-08, "logits/chosen": -4.394636631011963, "logits/rejected": -5.624868392944336, "logps/chosen": -10.798135757446289, "logps/rejected": -24.8155517578125, "loss": 0.0592, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2576065957546234, "rewards/margins": 5.6426544189453125, "rewards/rejected": -5.900261402130127, "step": 1071 }, { "epoch": 18.16949152542373, "grad_norm": 3.6473470262218677, "learning_rate": 1.265059788290468e-08, "logits/chosen": -7.392230987548828, "logits/rejected": -6.054595947265625, "logps/chosen": -11.147891998291016, "logps/rejected": -27.8052978515625, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -0.2997833490371704, "rewards/margins": 5.084535598754883, "rewards/rejected": -5.384318828582764, "step": 1072 }, { "epoch": 18.1864406779661, "grad_norm": 3.7167659894466394, "learning_rate": 1.2419362458389093e-08, "logits/chosen": -4.075685024261475, "logits/rejected": -1.814130187034607, "logps/chosen": -11.281499862670898, "logps/rejected": -23.99729347229004, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 0.09586359560489655, "rewards/margins": 5.473601341247559, "rewards/rejected": -5.377737998962402, "step": 1073 }, { "epoch": 18.203389830508474, "grad_norm": 4.526089243624855, "learning_rate": 1.219020606714044e-08, "logits/chosen": -4.463575839996338, "logits/rejected": -4.573124885559082, "logps/chosen": -9.437901496887207, "logps/rejected": -21.910377502441406, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 0.4638257622718811, "rewards/margins": 5.842223644256592, "rewards/rejected": -5.3783979415893555, "step": 1074 }, { "epoch": 18.220338983050848, "grad_norm": 3.9597685733153942, "learning_rate": 1.196313071447269e-08, "logits/chosen": -7.910276412963867, "logits/rejected": -6.576176643371582, "logps/chosen": -9.151101112365723, "logps/rejected": -25.661685943603516, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": 0.5685608983039856, "rewards/margins": 6.249361515045166, "rewards/rejected": -5.680800437927246, "step": 1075 }, { "epoch": 18.23728813559322, "grad_norm": 4.875313470692362, "learning_rate": 1.1738138387488938e-08, "logits/chosen": -9.282812118530273, "logits/rejected": -10.521957397460938, "logps/chosen": -9.483545303344727, "logps/rejected": -22.795330047607422, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 0.16500984132289886, "rewards/margins": 4.907614231109619, "rewards/rejected": -4.7426042556762695, "step": 1076 }, { "epoch": 18.25423728813559, "grad_norm": 4.45319886027985, "learning_rate": 1.1515231055063911e-08, "logits/chosen": -5.865035057067871, "logits/rejected": -6.470462322235107, "logps/chosen": -8.85952091217041, "logps/rejected": -21.33811378479004, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 0.23279842734336853, "rewards/margins": 4.53937292098999, "rewards/rejected": -4.306574821472168, "step": 1077 }, { "epoch": 18.271186440677965, "grad_norm": 3.7408544031494886, "learning_rate": 1.129441066782702e-08, "logits/chosen": -3.0924148559570312, "logits/rejected": -3.013120174407959, "logps/chosen": -9.44249153137207, "logps/rejected": -24.7534122467041, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -0.33274826407432556, "rewards/margins": 6.343916416168213, "rewards/rejected": -6.676664352416992, "step": 1078 }, { "epoch": 18.28813559322034, "grad_norm": 4.13897439636955, "learning_rate": 1.1075679158145002e-08, "logits/chosen": -5.453994274139404, "logits/rejected": -5.731247901916504, "logps/chosen": -8.67552375793457, "logps/rejected": -22.82550811767578, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": 0.3107266426086426, "rewards/margins": 5.333024501800537, "rewards/rejected": -5.022297382354736, "step": 1079 }, { "epoch": 18.305084745762713, "grad_norm": 3.7977422206050173, "learning_rate": 1.0859038440105161e-08, "logits/chosen": -7.962723731994629, "logits/rejected": -7.311408996582031, "logps/chosen": -9.005022048950195, "logps/rejected": -16.626529693603516, "loss": 0.0464, "rewards/accuracies": 1.0, "rewards/chosen": 0.3624380826950073, "rewards/margins": 4.189855098724365, "rewards/rejected": -3.8274168968200684, "step": 1080 }, { "epoch": 18.322033898305083, "grad_norm": 4.17151909264831, "learning_rate": 1.0644490409498636e-08, "logits/chosen": -8.780117988586426, "logits/rejected": -8.144914627075195, "logps/chosen": -14.520820617675781, "logps/rejected": -26.92098045349121, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": -0.09730023145675659, "rewards/margins": 5.36954402923584, "rewards/rejected": -5.46684455871582, "step": 1081 }, { "epoch": 18.338983050847457, "grad_norm": 4.266354260595334, "learning_rate": 1.0432036943803707e-08, "logits/chosen": -7.444359302520752, "logits/rejected": -7.710020542144775, "logps/chosen": -9.911048889160156, "logps/rejected": -20.40279769897461, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 0.37346360087394714, "rewards/margins": 4.519195079803467, "rewards/rejected": -4.145731449127197, "step": 1082 }, { "epoch": 18.35593220338983, "grad_norm": 3.7359193614404496, "learning_rate": 1.0221679902169461e-08, "logits/chosen": -8.830029487609863, "logits/rejected": -8.9429931640625, "logps/chosen": -7.692529678344727, "logps/rejected": -25.185260772705078, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": -0.25370359420776367, "rewards/margins": 6.122130393981934, "rewards/rejected": -6.375833511352539, "step": 1083 }, { "epoch": 18.372881355932204, "grad_norm": 4.376752842603358, "learning_rate": 1.0013421125399519e-08, "logits/chosen": -5.871336460113525, "logits/rejected": -3.8425660133361816, "logps/chosen": -10.602272033691406, "logps/rejected": -19.069087982177734, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 0.12840460240840912, "rewards/margins": 3.8662519454956055, "rewards/rejected": -3.737847328186035, "step": 1084 }, { "epoch": 18.389830508474578, "grad_norm": 4.335279206101846, "learning_rate": 9.80726243593577e-09, "logits/chosen": -6.976935386657715, "logits/rejected": -7.360747814178467, "logps/chosen": -9.483010292053223, "logps/rejected": -24.190677642822266, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 0.08014628291130066, "rewards/margins": 5.549870014190674, "rewards/rejected": -5.469723701477051, "step": 1085 }, { "epoch": 18.406779661016948, "grad_norm": 4.346101682290994, "learning_rate": 9.603205637842698e-09, "logits/chosen": -6.531325340270996, "logits/rejected": -7.804195404052734, "logps/chosen": -7.574182033538818, "logps/rejected": -21.1744384765625, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": 0.5449045300483704, "rewards/margins": 4.856822967529297, "rewards/rejected": -4.311918258666992, "step": 1086 }, { "epoch": 18.423728813559322, "grad_norm": 5.645565210955278, "learning_rate": 9.401252516791302e-09, "logits/chosen": -7.214651584625244, "logits/rejected": -5.821599006652832, "logps/chosen": -7.03403377532959, "logps/rejected": -22.67394256591797, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": 0.3359869122505188, "rewards/margins": 4.7078537940979, "rewards/rejected": -4.371866226196289, "step": 1087 }, { "epoch": 18.440677966101696, "grad_norm": 3.6044174957716923, "learning_rate": 9.201404840043725e-09, "logits/chosen": -8.01170539855957, "logits/rejected": -5.888266086578369, "logps/chosen": -9.264163970947266, "logps/rejected": -28.278165817260742, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 0.20131486654281616, "rewards/margins": 6.650158405303955, "rewards/rejected": -6.448843479156494, "step": 1088 }, { "epoch": 18.45762711864407, "grad_norm": 4.146359056361594, "learning_rate": 9.003664356437651e-09, "logits/chosen": -4.719537258148193, "logits/rejected": -4.794631004333496, "logps/chosen": -7.586413383483887, "logps/rejected": -18.851707458496094, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": 0.4252668023109436, "rewards/margins": 5.126582622528076, "rewards/rejected": -4.701315402984619, "step": 1089 }, { "epoch": 18.47457627118644, "grad_norm": 4.5351885213756695, "learning_rate": 8.808032796371017e-09, "logits/chosen": -9.071616172790527, "logits/rejected": -5.767463684082031, "logps/chosen": -11.853965759277344, "logps/rejected": -22.780263900756836, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": 0.14982807636260986, "rewards/margins": 5.077415466308594, "rewards/rejected": -4.927587509155273, "step": 1090 }, { "epoch": 18.491525423728813, "grad_norm": 4.15770864036275, "learning_rate": 8.614511871786828e-09, "logits/chosen": -5.206994533538818, "logits/rejected": -2.0901315212249756, "logps/chosen": -9.402706146240234, "logps/rejected": -19.950143814086914, "loss": 0.0575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.043576233088970184, "rewards/margins": 4.414394378662109, "rewards/rejected": -4.3708176612854, "step": 1091 }, { "epoch": 18.508474576271187, "grad_norm": 4.293926960441934, "learning_rate": 8.423103276158306e-09, "logits/chosen": -6.7276129722595215, "logits/rejected": -6.699288368225098, "logps/chosen": -10.428839683532715, "logps/rejected": -19.753442764282227, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 0.09627915173768997, "rewards/margins": 4.188758850097656, "rewards/rejected": -4.092479705810547, "step": 1092 }, { "epoch": 18.52542372881356, "grad_norm": 3.4689974534493664, "learning_rate": 8.233808684473959e-09, "logits/chosen": -6.171113014221191, "logits/rejected": -2.7044224739074707, "logps/chosen": -8.165644645690918, "logps/rejected": -26.037586212158203, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 0.3093603253364563, "rewards/margins": 6.091079235076904, "rewards/rejected": -5.781719207763672, "step": 1093 }, { "epoch": 18.54237288135593, "grad_norm": 4.108140260887782, "learning_rate": 8.046629753222955e-09, "logits/chosen": -4.509550094604492, "logits/rejected": -5.370866775512695, "logps/chosen": -9.223006248474121, "logps/rejected": -20.383045196533203, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": 0.22011250257492065, "rewards/margins": 4.451344013214111, "rewards/rejected": -4.231231689453125, "step": 1094 }, { "epoch": 18.559322033898304, "grad_norm": 4.0705282526634186, "learning_rate": 7.861568120380634e-09, "logits/chosen": -6.94540548324585, "logits/rejected": -6.977925777435303, "logps/chosen": -9.851380348205566, "logps/rejected": -25.77141571044922, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 0.09519022703170776, "rewards/margins": 5.343542098999023, "rewards/rejected": -5.24835205078125, "step": 1095 }, { "epoch": 18.576271186440678, "grad_norm": 4.011699369253127, "learning_rate": 7.678625405394157e-09, "logits/chosen": -5.413311004638672, "logits/rejected": -5.815068244934082, "logps/chosen": -8.874666213989258, "logps/rejected": -23.715831756591797, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 0.06100507080554962, "rewards/margins": 5.189451694488525, "rewards/rejected": -5.12844705581665, "step": 1096 }, { "epoch": 18.593220338983052, "grad_norm": 4.46202294370448, "learning_rate": 7.497803209168346e-09, "logits/chosen": -9.656492233276367, "logits/rejected": -8.445954322814941, "logps/chosen": -9.470484733581543, "logps/rejected": -19.208017349243164, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": 0.5084323287010193, "rewards/margins": 4.6789469718933105, "rewards/rejected": -4.1705145835876465, "step": 1097 }, { "epoch": 18.610169491525422, "grad_norm": 4.045645521696906, "learning_rate": 7.319103114051706e-09, "logits/chosen": -7.331865310668945, "logits/rejected": -7.503050804138184, "logps/chosen": -11.165914535522461, "logps/rejected": -21.40276336669922, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 0.15454208850860596, "rewards/margins": 4.488321304321289, "rewards/rejected": -4.3337788581848145, "step": 1098 }, { "epoch": 18.627118644067796, "grad_norm": 3.8341377299113075, "learning_rate": 7.142526683822536e-09, "logits/chosen": -11.285856246948242, "logits/rejected": -8.024409294128418, "logps/chosen": -11.741935729980469, "logps/rejected": -23.4906005859375, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 0.12301252037286758, "rewards/margins": 4.999948024749756, "rewards/rejected": -4.876935005187988, "step": 1099 }, { "epoch": 18.64406779661017, "grad_norm": 3.381727056154272, "learning_rate": 6.9680754636752e-09, "logits/chosen": -6.764732360839844, "logits/rejected": -2.8100860118865967, "logps/chosen": -12.623137474060059, "logps/rejected": -27.882747650146484, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 0.06339322030544281, "rewards/margins": 6.394018650054932, "rewards/rejected": -6.330626010894775, "step": 1100 }, { "epoch": 18.661016949152543, "grad_norm": 3.7073145614445306, "learning_rate": 6.7957509802067104e-09, "logits/chosen": -7.591949939727783, "logits/rejected": -4.612473487854004, "logps/chosen": -11.28479290008545, "logps/rejected": -26.188610076904297, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -0.2758193910121918, "rewards/margins": 6.186018943786621, "rewards/rejected": -6.461838722229004, "step": 1101 }, { "epoch": 18.677966101694913, "grad_norm": 3.999225968740304, "learning_rate": 6.625554741403333e-09, "logits/chosen": -6.6668806076049805, "logits/rejected": -6.30198335647583, "logps/chosen": -9.92651653289795, "logps/rejected": -21.492538452148438, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 0.06392458081245422, "rewards/margins": 4.802147388458252, "rewards/rejected": -4.738222599029541, "step": 1102 }, { "epoch": 18.694915254237287, "grad_norm": 3.8269904200968066, "learning_rate": 6.457488236627395e-09, "logits/chosen": -4.248052597045898, "logits/rejected": 0.813204824924469, "logps/chosen": -9.202914237976074, "logps/rejected": -23.664892196655273, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 0.42141085863113403, "rewards/margins": 6.465717792510986, "rewards/rejected": -6.044306755065918, "step": 1103 }, { "epoch": 18.71186440677966, "grad_norm": 4.022845003893159, "learning_rate": 6.291552936604133e-09, "logits/chosen": -7.192059516906738, "logits/rejected": -4.204983711242676, "logps/chosen": -10.985185623168945, "logps/rejected": -26.992740631103516, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 0.05412624776363373, "rewards/margins": 6.3019022941589355, "rewards/rejected": -6.247776031494141, "step": 1104 }, { "epoch": 18.728813559322035, "grad_norm": 4.631119734383148, "learning_rate": 6.127750293409006e-09, "logits/chosen": -7.748348712921143, "logits/rejected": -5.174810409545898, "logps/chosen": -10.34148120880127, "logps/rejected": -16.223203659057617, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 0.44356459379196167, "rewards/margins": 3.629425048828125, "rewards/rejected": -3.1858601570129395, "step": 1105 }, { "epoch": 18.74576271186441, "grad_norm": 4.056055906549981, "learning_rate": 5.966081740454931e-09, "logits/chosen": -6.4134321212768555, "logits/rejected": -5.019958972930908, "logps/chosen": -9.166025161743164, "logps/rejected": -25.963987350463867, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -0.254142165184021, "rewards/margins": 5.986145496368408, "rewards/rejected": -6.240288257598877, "step": 1106 }, { "epoch": 18.76271186440678, "grad_norm": 4.2510113340199815, "learning_rate": 5.806548692479623e-09, "logits/chosen": -4.511862277984619, "logits/rejected": -3.1294548511505127, "logps/chosen": -16.437929153442383, "logps/rejected": -32.30805969238281, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 0.07681269943714142, "rewards/margins": 5.85745096206665, "rewards/rejected": -5.780638217926025, "step": 1107 }, { "epoch": 18.779661016949152, "grad_norm": 3.9919824633820964, "learning_rate": 5.649152545533331e-09, "logits/chosen": -6.852721691131592, "logits/rejected": -6.540812015533447, "logps/chosen": -10.670454025268555, "logps/rejected": -21.6687068939209, "loss": 0.0409, "rewards/accuracies": 1.0, "rewards/chosen": 0.3178441822528839, "rewards/margins": 5.176324367523193, "rewards/rejected": -4.858480453491211, "step": 1108 }, { "epoch": 18.796610169491526, "grad_norm": 3.397614174449964, "learning_rate": 5.493894676966704e-09, "logits/chosen": -3.6471877098083496, "logits/rejected": -3.6686882972717285, "logps/chosen": -8.435153007507324, "logps/rejected": -25.47620391845703, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -0.010460421442985535, "rewards/margins": 5.795283317565918, "rewards/rejected": -5.80574369430542, "step": 1109 }, { "epoch": 18.8135593220339, "grad_norm": 3.9669584038833863, "learning_rate": 5.340776445418471e-09, "logits/chosen": -10.3143892288208, "logits/rejected": -7.56541633605957, "logps/chosen": -8.294013023376465, "logps/rejected": -23.466705322265625, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": 0.12921765446662903, "rewards/margins": 5.621309757232666, "rewards/rejected": -5.492092132568359, "step": 1110 }, { "epoch": 18.83050847457627, "grad_norm": 4.954316687519291, "learning_rate": 5.1897991908038396e-09, "logits/chosen": -6.567185401916504, "logits/rejected": -5.2818284034729, "logps/chosen": -7.029053211212158, "logps/rejected": -15.25006103515625, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": 0.30523422360420227, "rewards/margins": 3.355952739715576, "rewards/rejected": -3.0507185459136963, "step": 1111 }, { "epoch": 18.847457627118644, "grad_norm": 4.968688843586282, "learning_rate": 5.040964234302558e-09, "logits/chosen": -7.132032871246338, "logits/rejected": -6.4067888259887695, "logps/chosen": -8.73871898651123, "logps/rejected": -24.557842254638672, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 0.2146884799003601, "rewards/margins": 6.353532791137695, "rewards/rejected": -6.138844013214111, "step": 1112 }, { "epoch": 18.864406779661017, "grad_norm": 4.4839891569309795, "learning_rate": 4.894272878347483e-09, "logits/chosen": -6.7498345375061035, "logits/rejected": -4.952764987945557, "logps/chosen": -6.822923183441162, "logps/rejected": -18.971647262573242, "loss": 0.0431, "rewards/accuracies": 1.0, "rewards/chosen": 0.01051950454711914, "rewards/margins": 4.419131755828857, "rewards/rejected": -4.408612251281738, "step": 1113 }, { "epoch": 18.88135593220339, "grad_norm": 4.040720712271205, "learning_rate": 4.749726406613142e-09, "logits/chosen": -12.106992721557617, "logits/rejected": -5.57379674911499, "logps/chosen": -16.29015350341797, "logps/rejected": -23.340852737426758, "loss": 0.0416, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2273421436548233, "rewards/margins": 4.638840675354004, "rewards/rejected": -4.866182804107666, "step": 1114 }, { "epoch": 18.89830508474576, "grad_norm": 4.234068539111867, "learning_rate": 4.607326084004437e-09, "logits/chosen": -9.570833206176758, "logits/rejected": -7.380236625671387, "logps/chosen": -7.770480632781982, "logps/rejected": -17.556217193603516, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 0.6741684675216675, "rewards/margins": 3.8052420616149902, "rewards/rejected": -3.131073474884033, "step": 1115 }, { "epoch": 18.915254237288135, "grad_norm": 4.759020975492104, "learning_rate": 4.467073156645712e-09, "logits/chosen": -7.897947311401367, "logits/rejected": -8.628703117370605, "logps/chosen": -9.987771987915039, "logps/rejected": -21.86614990234375, "loss": 0.0566, "rewards/accuracies": 1.0, "rewards/chosen": 0.1313033401966095, "rewards/margins": 4.395023345947266, "rewards/rejected": -4.2637200355529785, "step": 1116 }, { "epoch": 18.93220338983051, "grad_norm": 4.223825609967969, "learning_rate": 4.328968851869758e-09, "logits/chosen": -7.241960525512695, "logits/rejected": -3.7764053344726562, "logps/chosen": -12.674301147460938, "logps/rejected": -25.130722045898438, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -0.07294300943613052, "rewards/margins": 5.546260356903076, "rewards/rejected": -5.619204044342041, "step": 1117 }, { "epoch": 18.949152542372882, "grad_norm": 4.603862164649134, "learning_rate": 4.193014378207044e-09, "logits/chosen": -5.850318908691406, "logits/rejected": -2.6857542991638184, "logps/chosen": -9.542285919189453, "logps/rejected": -24.915546417236328, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 0.2751765847206116, "rewards/margins": 5.750458240509033, "rewards/rejected": -5.475281715393066, "step": 1118 }, { "epoch": 18.966101694915253, "grad_norm": 4.244835334089812, "learning_rate": 4.059210925375173e-09, "logits/chosen": -4.551603317260742, "logits/rejected": -3.4954090118408203, "logps/chosen": -12.582915306091309, "logps/rejected": -23.43550682067871, "loss": 0.0514, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12680701911449432, "rewards/margins": 4.264057159423828, "rewards/rejected": -4.13724946975708, "step": 1119 }, { "epoch": 18.983050847457626, "grad_norm": 4.295140577343596, "learning_rate": 3.927559664268554e-09, "logits/chosen": -6.582233905792236, "logits/rejected": -5.693815231323242, "logps/chosen": -8.524242401123047, "logps/rejected": -21.318532943725586, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": 0.16140446066856384, "rewards/margins": 5.285299777984619, "rewards/rejected": -5.123895168304443, "step": 1120 }, { "epoch": 19.0, "grad_norm": 4.488787452254915, "learning_rate": 3.798061746947995e-09, "logits/chosen": -6.865042686462402, "logits/rejected": -4.247623920440674, "logps/chosen": -11.491613388061523, "logps/rejected": -19.202152252197266, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -0.16015183925628662, "rewards/margins": 4.161006927490234, "rewards/rejected": -4.321159362792969, "step": 1121 }, { "epoch": 19.016949152542374, "grad_norm": 4.64037685756284, "learning_rate": 3.6707183066307656e-09, "logits/chosen": -9.344770431518555, "logits/rejected": -4.774504661560059, "logps/chosen": -9.032370567321777, "logps/rejected": -29.84403419494629, "loss": 0.0615, "rewards/accuracies": 1.0, "rewards/chosen": -0.22558191418647766, "rewards/margins": 6.387495994567871, "rewards/rejected": -6.613077640533447, "step": 1122 }, { "epoch": 19.033898305084747, "grad_norm": 4.955290733050375, "learning_rate": 3.5455304576806346e-09, "logits/chosen": -5.308284282684326, "logits/rejected": -4.995919227600098, "logps/chosen": -9.144086837768555, "logps/rejected": -28.654260635375977, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 0.16971349716186523, "rewards/margins": 5.827495574951172, "rewards/rejected": -5.657781600952148, "step": 1123 }, { "epoch": 19.050847457627118, "grad_norm": 3.9959696885818605, "learning_rate": 3.4224992955980693e-09, "logits/chosen": -4.088801383972168, "logits/rejected": -1.3501896858215332, "logps/chosen": -10.250041007995605, "logps/rejected": -22.934467315673828, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 0.48192471265792847, "rewards/margins": 4.999414443969727, "rewards/rejected": -4.517489433288574, "step": 1124 }, { "epoch": 19.06779661016949, "grad_norm": 4.112989021260128, "learning_rate": 3.3016258970106903e-09, "logits/chosen": -8.40589714050293, "logits/rejected": -7.380486488342285, "logps/chosen": -9.625313758850098, "logps/rejected": -29.280725479125977, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -0.07790735363960266, "rewards/margins": 6.871036529541016, "rewards/rejected": -6.948944568634033, "step": 1125 }, { "epoch": 19.084745762711865, "grad_norm": 4.018426131140643, "learning_rate": 3.1829113196638613e-09, "logits/chosen": -5.221579551696777, "logits/rejected": -5.25046443939209, "logps/chosen": -9.330720901489258, "logps/rejected": -27.666893005371094, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": 0.04562787711620331, "rewards/margins": 6.5740461349487305, "rewards/rejected": -6.528418064117432, "step": 1126 }, { "epoch": 19.10169491525424, "grad_norm": 4.339584337044476, "learning_rate": 3.0663566024114183e-09, "logits/chosen": -10.538304328918457, "logits/rejected": -7.795166015625, "logps/chosen": -10.814375877380371, "logps/rejected": -19.972862243652344, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": 0.4461347460746765, "rewards/margins": 4.960249423980713, "rewards/rejected": -4.5141143798828125, "step": 1127 }, { "epoch": 19.11864406779661, "grad_norm": 3.642298752875519, "learning_rate": 2.951962765206567e-09, "logits/chosen": -8.135869979858398, "logits/rejected": -5.980784893035889, "logps/chosen": -6.342912673950195, "logps/rejected": -17.059585571289062, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 0.7364379167556763, "rewards/margins": 4.4074482917785645, "rewards/rejected": -3.6710104942321777, "step": 1128 }, { "epoch": 19.135593220338983, "grad_norm": 3.512517269165827, "learning_rate": 2.839730809092972e-09, "logits/chosen": -7.244131565093994, "logits/rejected": -4.6190571784973145, "logps/chosen": -10.269086837768555, "logps/rejected": -21.036928176879883, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": 0.09786014258861542, "rewards/margins": 5.341797828674316, "rewards/rejected": -5.243937969207764, "step": 1129 }, { "epoch": 19.152542372881356, "grad_norm": 4.495700708917495, "learning_rate": 2.7296617161960413e-09, "logits/chosen": -8.322103500366211, "logits/rejected": -6.5170578956604, "logps/chosen": -7.848247528076172, "logps/rejected": -23.73181915283203, "loss": 0.0514, "rewards/accuracies": 0.9375, "rewards/chosen": 0.46722692251205444, "rewards/margins": 5.935376167297363, "rewards/rejected": -5.4681501388549805, "step": 1130 }, { "epoch": 19.16949152542373, "grad_norm": 4.098312139251818, "learning_rate": 2.6217564497141574e-09, "logits/chosen": -5.403450965881348, "logits/rejected": -6.877000331878662, "logps/chosen": -11.477622985839844, "logps/rejected": -23.035629272460938, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 0.3956793546676636, "rewards/margins": 4.676126003265381, "rewards/rejected": -4.280446529388428, "step": 1131 }, { "epoch": 19.1864406779661, "grad_norm": 3.5602830148283773, "learning_rate": 2.516015953910544e-09, "logits/chosen": -5.9252705574035645, "logits/rejected": -4.196394443511963, "logps/chosen": -8.186470985412598, "logps/rejected": -21.580202102661133, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 0.11510688811540604, "rewards/margins": 5.886081218719482, "rewards/rejected": -5.770974159240723, "step": 1132 }, { "epoch": 19.203389830508474, "grad_norm": 4.3782832810573105, "learning_rate": 2.4124411541047162e-09, "logits/chosen": -5.3209991455078125, "logits/rejected": -2.6802937984466553, "logps/chosen": -12.969682693481445, "logps/rejected": -27.95191192626953, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 0.3466143012046814, "rewards/margins": 6.493988990783691, "rewards/rejected": -6.147375106811523, "step": 1133 }, { "epoch": 19.220338983050848, "grad_norm": 4.41272303319751, "learning_rate": 2.3110329566645158e-09, "logits/chosen": -9.802309036254883, "logits/rejected": -9.715230941772461, "logps/chosen": -8.948881149291992, "logps/rejected": -20.29668617248535, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 0.3533758223056793, "rewards/margins": 4.4907755851745605, "rewards/rejected": -4.137399673461914, "step": 1134 }, { "epoch": 19.23728813559322, "grad_norm": 3.9731944084938835, "learning_rate": 2.2117922489982286e-09, "logits/chosen": -9.187678337097168, "logits/rejected": -8.45703411102295, "logps/chosen": -10.043983459472656, "logps/rejected": -24.24431800842285, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": 0.21984882652759552, "rewards/margins": 5.44026517868042, "rewards/rejected": -5.220417022705078, "step": 1135 }, { "epoch": 19.25423728813559, "grad_norm": 3.8630961033176288, "learning_rate": 2.1147198995466466e-09, "logits/chosen": -6.825324535369873, "logits/rejected": -6.143636703491211, "logps/chosen": -8.299385070800781, "logps/rejected": -18.08413314819336, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 0.3547424376010895, "rewards/margins": 4.631790637969971, "rewards/rejected": -4.277048110961914, "step": 1136 }, { "epoch": 19.271186440677965, "grad_norm": 4.963351485073133, "learning_rate": 2.0198167577757107e-09, "logits/chosen": -7.602474212646484, "logits/rejected": -7.027194976806641, "logps/chosen": -10.680564880371094, "logps/rejected": -21.71973991394043, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 0.2703138291835785, "rewards/margins": 5.047008037567139, "rewards/rejected": -4.776694297790527, "step": 1137 }, { "epoch": 19.28813559322034, "grad_norm": 4.4201548943030575, "learning_rate": 1.927083654168854e-09, "logits/chosen": -8.062311172485352, "logits/rejected": -6.282785892486572, "logps/chosen": -9.119327545166016, "logps/rejected": -17.638656616210938, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 0.48620134592056274, "rewards/margins": 4.4218525886535645, "rewards/rejected": -3.9356513023376465, "step": 1138 }, { "epoch": 19.305084745762713, "grad_norm": 3.5473965133479024, "learning_rate": 1.8365214002198648e-09, "logits/chosen": -6.863635540008545, "logits/rejected": -3.4915430545806885, "logps/chosen": -11.302085876464844, "logps/rejected": -21.60725975036621, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 0.44304215908050537, "rewards/margins": 4.834182262420654, "rewards/rejected": -4.391139984130859, "step": 1139 }, { "epoch": 19.322033898305083, "grad_norm": 4.764976937865656, "learning_rate": 1.7481307884256725e-09, "logits/chosen": -10.479633331298828, "logits/rejected": -4.6769304275512695, "logps/chosen": -11.336868286132812, "logps/rejected": -19.19312858581543, "loss": 0.0564, "rewards/accuracies": 1.0, "rewards/chosen": 0.17866046726703644, "rewards/margins": 3.89330792427063, "rewards/rejected": -3.714647054672241, "step": 1140 }, { "epoch": 19.338983050847457, "grad_norm": 5.020310454336254, "learning_rate": 1.6619125922796019e-09, "logits/chosen": -7.57431173324585, "logits/rejected": -2.7706239223480225, "logps/chosen": -13.79558277130127, "logps/rejected": -31.725343704223633, "loss": 0.0471, "rewards/accuracies": 1.0, "rewards/chosen": -0.6086219549179077, "rewards/margins": 6.808389186859131, "rewards/rejected": -7.417011737823486, "step": 1141 }, { "epoch": 19.35593220338983, "grad_norm": 3.8807659082830033, "learning_rate": 1.5778675662643791e-09, "logits/chosen": -6.23847770690918, "logits/rejected": -5.861680030822754, "logps/chosen": -7.707354545593262, "logps/rejected": -19.576440811157227, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 0.25658178329467773, "rewards/margins": 4.902749061584473, "rewards/rejected": -4.646167278289795, "step": 1142 }, { "epoch": 19.372881355932204, "grad_norm": 3.5873067908971983, "learning_rate": 1.4959964458456931e-09, "logits/chosen": -6.67938756942749, "logits/rejected": -6.05525541305542, "logps/chosen": -11.312013626098633, "logps/rejected": -30.832914352416992, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 0.16357643902301788, "rewards/margins": 7.093705177307129, "rewards/rejected": -6.930128574371338, "step": 1143 }, { "epoch": 19.389830508474578, "grad_norm": 4.662610656958441, "learning_rate": 1.4162999474657266e-09, "logits/chosen": -9.130437850952148, "logits/rejected": -7.133908748626709, "logps/chosen": -7.804490566253662, "logps/rejected": -15.638628005981445, "loss": 0.0527, "rewards/accuracies": 1.0, "rewards/chosen": 0.4274514317512512, "rewards/margins": 4.310211658477783, "rewards/rejected": -3.882760524749756, "step": 1144 }, { "epoch": 19.406779661016948, "grad_norm": 4.059115335211127, "learning_rate": 1.3387787685368024e-09, "logits/chosen": -10.194669723510742, "logits/rejected": -11.102928161621094, "logps/chosen": -9.580450057983398, "logps/rejected": -22.048954010009766, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": 0.114285409450531, "rewards/margins": 4.883891582489014, "rewards/rejected": -4.769606113433838, "step": 1145 }, { "epoch": 19.423728813559322, "grad_norm": 4.899057307983236, "learning_rate": 1.2634335874353585e-09, "logits/chosen": -4.248291969299316, "logits/rejected": -4.8470587730407715, "logps/chosen": -9.899078369140625, "logps/rejected": -27.653818130493164, "loss": 0.049, "rewards/accuracies": 1.0, "rewards/chosen": -0.09164927154779434, "rewards/margins": 6.349340915679932, "rewards/rejected": -6.440991401672363, "step": 1146 }, { "epoch": 19.440677966101696, "grad_norm": 3.8704721367512676, "learning_rate": 1.1902650634960377e-09, "logits/chosen": -5.251467704772949, "logits/rejected": -6.173592567443848, "logps/chosen": -10.023374557495117, "logps/rejected": -19.808534622192383, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": -0.15790097415447235, "rewards/margins": 4.519407272338867, "rewards/rejected": -4.677308082580566, "step": 1147 }, { "epoch": 19.45762711864407, "grad_norm": 3.359884250589287, "learning_rate": 1.1192738370058574e-09, "logits/chosen": -7.491716384887695, "logits/rejected": -7.280231475830078, "logps/chosen": -6.225736618041992, "logps/rejected": -18.614547729492188, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 0.6800587177276611, "rewards/margins": 4.669227600097656, "rewards/rejected": -3.989168882369995, "step": 1148 }, { "epoch": 19.47457627118644, "grad_norm": 4.678996121466518, "learning_rate": 1.050460529198577e-09, "logits/chosen": -6.2738728523254395, "logits/rejected": -7.710474967956543, "logps/chosen": -7.842569351196289, "logps/rejected": -22.311418533325195, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 0.5984208583831787, "rewards/margins": 5.338993549346924, "rewards/rejected": -4.740572929382324, "step": 1149 }, { "epoch": 19.491525423728813, "grad_norm": 3.680168733564192, "learning_rate": 9.838257422493667e-10, "logits/chosen": -7.964971542358398, "logits/rejected": -7.558823108673096, "logps/chosen": -10.521120071411133, "logps/rejected": -21.55910873413086, "loss": 0.0429, "rewards/accuracies": 1.0, "rewards/chosen": 0.28645265102386475, "rewards/margins": 4.186437606811523, "rewards/rejected": -3.8999853134155273, "step": 1150 }, { "epoch": 19.508474576271187, "grad_norm": 4.966531922721566, "learning_rate": 9.193700592694531e-10, "logits/chosen": -4.467232704162598, "logits/rejected": -4.619475364685059, "logps/chosen": -7.945337295532227, "logps/rejected": -19.41752052307129, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": -0.2579636871814728, "rewards/margins": 4.400396823883057, "rewards/rejected": -4.658360481262207, "step": 1151 }, { "epoch": 19.52542372881356, "grad_norm": 4.404996879947679, "learning_rate": 8.570940443010655e-10, "logits/chosen": -6.992228031158447, "logits/rejected": -5.218314170837402, "logps/chosen": -7.047336578369141, "logps/rejected": -14.934014320373535, "loss": 0.0499, "rewards/accuracies": 1.0, "rewards/chosen": 0.3601531386375427, "rewards/margins": 3.750903844833374, "rewards/rejected": -3.3907506465911865, "step": 1152 }, { "epoch": 19.54237288135593, "grad_norm": 4.429632740370815, "learning_rate": 7.969982423124689e-10, "logits/chosen": -11.009224891662598, "logits/rejected": -7.346052646636963, "logps/chosen": -8.033683776855469, "logps/rejected": -17.191102981567383, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 0.7259281873703003, "rewards/margins": 4.4079718589782715, "rewards/rejected": -3.6820435523986816, "step": 1153 }, { "epoch": 19.559322033898304, "grad_norm": 3.7837176771704844, "learning_rate": 7.390831791931895e-10, "logits/chosen": -6.310678482055664, "logits/rejected": -5.403589725494385, "logps/chosen": -11.5776948928833, "logps/rejected": -30.27614402770996, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.053039468824863434, "rewards/margins": 6.467260360717773, "rewards/rejected": -6.520299434661865, "step": 1154 }, { "epoch": 19.576271186440678, "grad_norm": 4.812999650447907, "learning_rate": 6.83349361749408e-10, "logits/chosen": -5.7683844566345215, "logits/rejected": -5.301779270172119, "logps/chosen": -7.299991130828857, "logps/rejected": -17.533212661743164, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 0.4515947699546814, "rewards/margins": 4.172460556030273, "rewards/rejected": -3.7208657264709473, "step": 1155 }, { "epoch": 19.593220338983052, "grad_norm": 4.580659403548684, "learning_rate": 6.297972776996285e-10, "logits/chosen": -5.716320991516113, "logits/rejected": -3.708559989929199, "logps/chosen": -8.555032730102539, "logps/rejected": -20.43698501586914, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": 0.30857783555984497, "rewards/margins": 4.984573841094971, "rewards/rejected": -4.67599630355835, "step": 1156 }, { "epoch": 19.610169491525422, "grad_norm": 4.238548965488203, "learning_rate": 5.78427395670239e-10, "logits/chosen": -7.459744930267334, "logits/rejected": -5.083245754241943, "logps/chosen": -9.815725326538086, "logps/rejected": -23.77956199645996, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": -0.022835373878479004, "rewards/margins": 5.320274353027344, "rewards/rejected": -5.343109607696533, "step": 1157 }, { "epoch": 19.627118644067796, "grad_norm": 4.296226251958125, "learning_rate": 5.29240165191569e-10, "logits/chosen": -6.119602203369141, "logits/rejected": -3.446010112762451, "logps/chosen": -8.73384952545166, "logps/rejected": -21.660493850708008, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": 0.1783091127872467, "rewards/margins": 5.320026397705078, "rewards/rejected": -5.141717433929443, "step": 1158 }, { "epoch": 19.64406779661017, "grad_norm": 4.759685042111325, "learning_rate": 4.8223601669381e-10, "logits/chosen": -7.123500823974609, "logits/rejected": -5.013278484344482, "logps/chosen": -9.441107749938965, "logps/rejected": -22.8370304107666, "loss": 0.0544, "rewards/accuracies": 1.0, "rewards/chosen": 0.4494301974773407, "rewards/margins": 5.925429821014404, "rewards/rejected": -5.47599983215332, "step": 1159 }, { "epoch": 19.661016949152543, "grad_norm": 4.5051160807484605, "learning_rate": 4.3741536150337934e-10, "logits/chosen": -5.4635467529296875, "logits/rejected": -5.964138507843018, "logps/chosen": -10.20158576965332, "logps/rejected": -18.295839309692383, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 0.029481612145900726, "rewards/margins": 3.807978868484497, "rewards/rejected": -3.7784974575042725, "step": 1160 }, { "epoch": 19.677966101694913, "grad_norm": 4.532448176911285, "learning_rate": 3.9477859183925654e-10, "logits/chosen": -6.951817989349365, "logits/rejected": -6.393136978149414, "logps/chosen": -8.013717651367188, "logps/rejected": -21.919706344604492, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": 0.5353216528892517, "rewards/margins": 5.655571937561035, "rewards/rejected": -5.120250701904297, "step": 1161 }, { "epoch": 19.694915254237287, "grad_norm": 3.477802121577011, "learning_rate": 3.5432608080951386e-10, "logits/chosen": -7.770132541656494, "logits/rejected": -6.3644304275512695, "logps/chosen": -8.72642707824707, "logps/rejected": -19.723552703857422, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 0.2946310043334961, "rewards/margins": 5.015895366668701, "rewards/rejected": -4.721264839172363, "step": 1162 }, { "epoch": 19.71186440677966, "grad_norm": 3.602158767913972, "learning_rate": 3.160581824081798e-10, "logits/chosen": -8.145950317382812, "logits/rejected": -3.2530272006988525, "logps/chosen": -12.056177139282227, "logps/rejected": -27.599056243896484, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": 0.3399862051010132, "rewards/margins": 6.990494728088379, "rewards/rejected": -6.650508403778076, "step": 1163 }, { "epoch": 19.728813559322035, "grad_norm": 4.661769244717556, "learning_rate": 2.7997523151199186e-10, "logits/chosen": -5.5459675788879395, "logits/rejected": -4.609034538269043, "logps/chosen": -7.424615859985352, "logps/rejected": -21.676082611083984, "loss": 0.0418, "rewards/accuracies": 1.0, "rewards/chosen": 0.5069709420204163, "rewards/margins": 4.763805866241455, "rewards/rejected": -4.256834983825684, "step": 1164 }, { "epoch": 19.74576271186441, "grad_norm": 3.968610512394195, "learning_rate": 2.4607754387753753e-10, "logits/chosen": -8.559868812561035, "logits/rejected": -6.620041847229004, "logps/chosen": -10.406312942504883, "logps/rejected": -28.012836456298828, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": 0.023296553641557693, "rewards/margins": 7.254501819610596, "rewards/rejected": -7.231204986572266, "step": 1165 }, { "epoch": 19.76271186440678, "grad_norm": 4.078397515416586, "learning_rate": 2.1436541613853442e-10, "logits/chosen": -10.614221572875977, "logits/rejected": -6.317427635192871, "logps/chosen": -10.993356704711914, "logps/rejected": -20.877456665039062, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -0.15146180987358093, "rewards/margins": 4.370360374450684, "rewards/rejected": -4.521821975708008, "step": 1166 }, { "epoch": 19.779661016949152, "grad_norm": 3.7080068749451742, "learning_rate": 1.8483912580313787e-10, "logits/chosen": -7.377213478088379, "logits/rejected": -1.7765655517578125, "logps/chosen": -12.268579483032227, "logps/rejected": -25.69411277770996, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 0.23731692135334015, "rewards/margins": 5.506694316864014, "rewards/rejected": -5.2693772315979, "step": 1167 }, { "epoch": 19.796610169491526, "grad_norm": 4.082044069557541, "learning_rate": 1.574989312516095e-10, "logits/chosen": -4.877951622009277, "logits/rejected": -4.450176239013672, "logps/chosen": -10.917598724365234, "logps/rejected": -25.072479248046875, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -0.06078708916902542, "rewards/margins": 5.301979064941406, "rewards/rejected": -5.362766265869141, "step": 1168 }, { "epoch": 19.8135593220339, "grad_norm": 3.8856096679992516, "learning_rate": 1.3234507173393029e-10, "logits/chosen": -5.971585273742676, "logits/rejected": -2.853632926940918, "logps/chosen": -11.391792297363281, "logps/rejected": -19.528844833374023, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": 0.6338703036308289, "rewards/margins": 5.5579938888549805, "rewards/rejected": -4.924124240875244, "step": 1169 }, { "epoch": 19.83050847457627, "grad_norm": 3.9286703819740074, "learning_rate": 1.0937776736782978e-10, "logits/chosen": -9.924491882324219, "logits/rejected": -9.015110969543457, "logps/chosen": -7.7781782150268555, "logps/rejected": -16.215484619140625, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 0.5362356901168823, "rewards/margins": 4.200999736785889, "rewards/rejected": -3.664763927459717, "step": 1170 }, { "epoch": 19.847457627118644, "grad_norm": 4.526785004898851, "learning_rate": 8.859721913684337e-11, "logits/chosen": -9.032225608825684, "logits/rejected": -8.733574867248535, "logps/chosen": -13.94797134399414, "logps/rejected": -32.85288619995117, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 0.665097713470459, "rewards/margins": 5.278740882873535, "rewards/rejected": -4.613643646240234, "step": 1171 }, { "epoch": 19.864406779661017, "grad_norm": 4.606104834951581, "learning_rate": 7.000360888850809e-11, "logits/chosen": -8.41981029510498, "logits/rejected": -6.411755084991455, "logps/chosen": -11.847469329833984, "logps/rejected": -22.13727569580078, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": 0.2522156834602356, "rewards/margins": 5.082830429077148, "rewards/rejected": -4.830615043640137, "step": 1172 }, { "epoch": 19.88135593220339, "grad_norm": 4.297813577782475, "learning_rate": 5.35970993327528e-11, "logits/chosen": -9.063766479492188, "logits/rejected": -8.41303825378418, "logps/chosen": -8.27404499053955, "logps/rejected": -23.108671188354492, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": 0.1744605153799057, "rewards/margins": 5.226859092712402, "rewards/rejected": -5.052398681640625, "step": 1173 }, { "epoch": 19.89830508474576, "grad_norm": 4.297871723544772, "learning_rate": 3.9377834040538184e-11, "logits/chosen": -8.93664836883545, "logits/rejected": -6.258477210998535, "logps/chosen": -12.725931167602539, "logps/rejected": -24.41492462158203, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -0.029960155487060547, "rewards/margins": 4.409930229187012, "rewards/rejected": -4.439890384674072, "step": 1174 }, { "epoch": 19.915254237288135, "grad_norm": 4.197951910383987, "learning_rate": 2.7345937442552202e-11, "logits/chosen": -9.97773265838623, "logits/rejected": -6.583474159240723, "logps/chosen": -18.799739837646484, "logps/rejected": -23.433496475219727, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.007980942726135254, "rewards/margins": 4.562077045440674, "rewards/rejected": -4.570058345794678, "step": 1175 }, { "epoch": 19.93220338983051, "grad_norm": 4.040291712847653, "learning_rate": 1.7501514828183184e-11, "logits/chosen": -11.39698600769043, "logits/rejected": -12.546833038330078, "logps/chosen": -6.392749786376953, "logps/rejected": -22.700088500976562, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 0.04210100322961807, "rewards/margins": 5.175865173339844, "rewards/rejected": -5.133763790130615, "step": 1176 }, { "epoch": 19.949152542372882, "grad_norm": 4.065424008081176, "learning_rate": 9.844652344492832e-12, "logits/chosen": -8.840291976928711, "logits/rejected": -5.942503452301025, "logps/chosen": -7.859723091125488, "logps/rejected": -18.713224411010742, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": 0.774299144744873, "rewards/margins": 5.152860164642334, "rewards/rejected": -4.378561496734619, "step": 1177 }, { "epoch": 19.966101694915253, "grad_norm": 4.357195551014498, "learning_rate": 4.375416995577863e-12, "logits/chosen": -5.456747531890869, "logits/rejected": -4.642401218414307, "logps/chosen": -8.65817642211914, "logps/rejected": -18.599754333496094, "loss": 0.05, "rewards/accuracies": 1.0, "rewards/chosen": 0.2617643475532532, "rewards/margins": 4.422424793243408, "rewards/rejected": -4.160660743713379, "step": 1178 }, { "epoch": 19.983050847457626, "grad_norm": 3.6780075503878824, "learning_rate": 1.093856641931623e-12, "logits/chosen": -10.135025024414062, "logits/rejected": -5.059713840484619, "logps/chosen": -13.161264419555664, "logps/rejected": -26.968259811401367, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -0.34054410457611084, "rewards/margins": 5.3091139793396, "rewards/rejected": -5.649657249450684, "step": 1179 }, { "epoch": 20.0, "grad_norm": 4.1850799623874515, "learning_rate": 0.0, "logits/chosen": -7.4185967445373535, "logits/rejected": -7.274652004241943, "logps/chosen": -10.746148109436035, "logps/rejected": -25.2398738861084, "loss": 0.0439, "rewards/accuracies": 1.0, "rewards/chosen": 0.16116692125797272, "rewards/margins": 5.900046348571777, "rewards/rejected": -5.738879203796387, "step": 1180 }, { "epoch": 20.0, "step": 1180, "total_flos": 0.0, "train_loss": 0.18995573704399296, "train_runtime": 12341.7767, "train_samples_per_second": 12.232, "train_steps_per_second": 0.096 } ], "logging_steps": 1, "max_steps": 1180, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }