diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,17730 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 1, + "global_step": 1180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01694915254237288, + "grad_norm": 35.90666675768779, + "learning_rate": 4.2372881355932205e-09, + "logits/chosen": 14.802189826965332, + "logits/rejected": 14.350337982177734, + "logps/chosen": -11.634295463562012, + "logps/rejected": -13.271549224853516, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.03389830508474576, + "grad_norm": 36.054608327135526, + "learning_rate": 8.474576271186441e-09, + "logits/chosen": 12.264768600463867, + "logits/rejected": 12.60151195526123, + "logps/chosen": -12.491461753845215, + "logps/rejected": -14.43135929107666, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.05084745762711865, + "grad_norm": 35.49641367727226, + "learning_rate": 1.2711864406779661e-08, + "logits/chosen": 16.878520965576172, + "logits/rejected": 13.18266487121582, + "logps/chosen": -9.232068061828613, + "logps/rejected": -17.441871643066406, + "loss": 0.6929, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0281001478433609, + "rewards/margins": 0.0005521401762962341, + "rewards/rejected": 0.027548007667064667, + "step": 3 + }, + { + "epoch": 0.06779661016949153, + "grad_norm": 39.16805126128429, + "learning_rate": 1.6949152542372882e-08, + "logits/chosen": 13.905435562133789, + "logits/rejected": 14.600114822387695, + "logps/chosen": -8.362595558166504, + "logps/rejected": -12.51268196105957, + "loss": 0.7114, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00230209156870842, + "rewards/margins": -0.0342477448284626, + "rewards/rejected": 0.03194565325975418, + "step": 4 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 35.99356386144667, + "learning_rate": 2.11864406779661e-08, + "logits/chosen": 11.287035942077637, + "logits/rejected": 11.306232452392578, + "logps/chosen": -11.428975105285645, + "logps/rejected": -9.96834659576416, + "loss": 0.6957, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.06366005539894104, + "rewards/margins": -0.12454245984554291, + "rewards/rejected": 0.06088239327073097, + "step": 5 + }, + { + "epoch": 0.1016949152542373, + "grad_norm": 38.91747229948154, + "learning_rate": 2.5423728813559323e-08, + "logits/chosen": 9.630241394042969, + "logits/rejected": 12.905715942382812, + "logps/chosen": -11.144521713256836, + "logps/rejected": -11.662797927856445, + "loss": 0.7051, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.035100989043712616, + "rewards/margins": -0.046221472322940826, + "rewards/rejected": 0.01112048327922821, + "step": 6 + }, + { + "epoch": 0.11864406779661017, + "grad_norm": 33.24376209141878, + "learning_rate": 2.966101694915254e-08, + "logits/chosen": 12.640701293945312, + "logits/rejected": 15.201984405517578, + "logps/chosen": -10.405349731445312, + "logps/rejected": -12.381065368652344, + "loss": 0.6935, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.043475911021232605, + "rewards/margins": -0.03674769774079323, + "rewards/rejected": -0.006728213280439377, + "step": 7 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 33.93739002283182, + "learning_rate": 3.3898305084745764e-08, + "logits/chosen": 12.368184089660645, + "logits/rejected": 10.674546241760254, + "logps/chosen": -8.561328887939453, + "logps/rejected": -13.179322242736816, + "loss": 0.7008, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03725726902484894, + "rewards/margins": 0.03952927142381668, + "rewards/rejected": -0.002272002398967743, + "step": 8 + }, + { + "epoch": 0.15254237288135594, + "grad_norm": 33.764877302185795, + "learning_rate": 3.813559322033898e-08, + "logits/chosen": 9.811960220336914, + "logits/rejected": 12.911907196044922, + "logps/chosen": -10.999805450439453, + "logps/rejected": -9.398270606994629, + "loss": 0.7076, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.012884721159934998, + "rewards/margins": -0.04326022416353226, + "rewards/rejected": 0.03037550300359726, + "step": 9 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 30.816455835047037, + "learning_rate": 4.23728813559322e-08, + "logits/chosen": 11.430095672607422, + "logits/rejected": 14.928672790527344, + "logps/chosen": -13.877382278442383, + "logps/rejected": -10.65487289428711, + "loss": 0.6892, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.011533252894878387, + "rewards/margins": -0.014867149293422699, + "rewards/rejected": 0.0033338963985443115, + "step": 10 + }, + { + "epoch": 0.1864406779661017, + "grad_norm": 33.938752767793865, + "learning_rate": 4.661016949152542e-08, + "logits/chosen": 13.052696228027344, + "logits/rejected": 10.422144889831543, + "logps/chosen": -10.610231399536133, + "logps/rejected": -17.128759384155273, + "loss": 0.7095, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00851757824420929, + "rewards/margins": 0.14825767278671265, + "rewards/rejected": -0.15677525103092194, + "step": 11 + }, + { + "epoch": 0.2033898305084746, + "grad_norm": 33.17188984628549, + "learning_rate": 5.0847457627118645e-08, + "logits/chosen": 12.457121849060059, + "logits/rejected": 13.516885757446289, + "logps/chosen": -8.226775169372559, + "logps/rejected": -10.04336166381836, + "loss": 0.6985, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03804200887680054, + "rewards/margins": -0.024549927562475204, + "rewards/rejected": -0.013492081314325333, + "step": 12 + }, + { + "epoch": 0.22033898305084745, + "grad_norm": 33.400283269891275, + "learning_rate": 5.508474576271186e-08, + "logits/chosen": 12.457138061523438, + "logits/rejected": 12.47688102722168, + "logps/chosen": -10.179587364196777, + "logps/rejected": -10.717220306396484, + "loss": 0.686, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02907254546880722, + "rewards/margins": 0.004769522696733475, + "rewards/rejected": 0.024303022772073746, + "step": 13 + }, + { + "epoch": 0.23728813559322035, + "grad_norm": 34.76291522662287, + "learning_rate": 5.932203389830508e-08, + "logits/chosen": 10.563559532165527, + "logits/rejected": 7.753628730773926, + "logps/chosen": -12.341764450073242, + "logps/rejected": -22.06093978881836, + "loss": 0.7001, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03608255088329315, + "rewards/margins": 0.0593097060918808, + "rewards/rejected": -0.023227155208587646, + "step": 14 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 34.19169656059493, + "learning_rate": 6.35593220338983e-08, + "logits/chosen": 17.89315414428711, + "logits/rejected": 16.98625373840332, + "logps/chosen": -8.502942085266113, + "logps/rejected": -14.32985782623291, + "loss": 0.6923, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.013201236724853516, + "rewards/margins": -0.03858032077550888, + "rewards/rejected": 0.025379084050655365, + "step": 15 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 33.123595149389935, + "learning_rate": 6.779661016949153e-08, + "logits/chosen": 13.16607666015625, + "logits/rejected": 13.377177238464355, + "logps/chosen": -9.61911392211914, + "logps/rejected": -10.953754425048828, + "loss": 0.6818, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03972737491130829, + "rewards/margins": 0.11253049969673157, + "rewards/rejected": -0.07280312478542328, + "step": 16 + }, + { + "epoch": 0.288135593220339, + "grad_norm": 33.75941443866429, + "learning_rate": 7.203389830508475e-08, + "logits/chosen": 14.967769622802734, + "logits/rejected": 16.429962158203125, + "logps/chosen": -7.799526691436768, + "logps/rejected": -11.779705047607422, + "loss": 0.6918, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.038096603006124496, + "rewards/margins": -0.038292884826660156, + "rewards/rejected": 0.07638949155807495, + "step": 17 + }, + { + "epoch": 0.3050847457627119, + "grad_norm": 35.23652082493691, + "learning_rate": 7.627118644067796e-08, + "logits/chosen": 12.490198135375977, + "logits/rejected": 12.536822319030762, + "logps/chosen": -7.3060431480407715, + "logps/rejected": -8.912150382995605, + "loss": 0.6897, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.040348462760448456, + "rewards/margins": 0.032108671963214874, + "rewards/rejected": 0.008239790797233582, + "step": 18 + }, + { + "epoch": 0.3220338983050847, + "grad_norm": 38.14490304653045, + "learning_rate": 8.050847457627117e-08, + "logits/chosen": 15.213144302368164, + "logits/rejected": 13.012377738952637, + "logps/chosen": -6.454628944396973, + "logps/rejected": -13.794097900390625, + "loss": 0.6924, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032369788736104965, + "rewards/margins": 0.0747758001089096, + "rewards/rejected": -0.04240601509809494, + "step": 19 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 32.64981912960275, + "learning_rate": 8.47457627118644e-08, + "logits/chosen": 12.59264087677002, + "logits/rejected": 13.965180397033691, + "logps/chosen": -11.896605491638184, + "logps/rejected": -10.930459976196289, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04358021169900894, + "rewards/margins": -0.07576116919517517, + "rewards/rejected": 0.03218095749616623, + "step": 20 + }, + { + "epoch": 0.3559322033898305, + "grad_norm": 32.48889752101817, + "learning_rate": 8.898305084745762e-08, + "logits/chosen": 14.423205375671387, + "logits/rejected": 11.800539016723633, + "logps/chosen": -9.143863677978516, + "logps/rejected": -10.3524169921875, + "loss": 0.7, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03891824558377266, + "rewards/margins": 0.013888102024793625, + "rewards/rejected": -0.052806347608566284, + "step": 21 + }, + { + "epoch": 0.3728813559322034, + "grad_norm": 33.74637684508929, + "learning_rate": 9.322033898305084e-08, + "logits/chosen": 11.911141395568848, + "logits/rejected": 13.506253242492676, + "logps/chosen": -7.444547653198242, + "logps/rejected": -10.66550350189209, + "loss": 0.7014, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023162133991718292, + "rewards/margins": -0.015818648040294647, + "rewards/rejected": 0.03898078203201294, + "step": 22 + }, + { + "epoch": 0.3898305084745763, + "grad_norm": 34.986738432295624, + "learning_rate": 9.745762711864407e-08, + "logits/chosen": 12.02149772644043, + "logits/rejected": 17.192615509033203, + "logps/chosen": -9.3740234375, + "logps/rejected": -12.00601863861084, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015855148434638977, + "rewards/margins": -0.013700824230909348, + "rewards/rejected": -0.0021543242037296295, + "step": 23 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 35.517675126988536, + "learning_rate": 1.0169491525423729e-07, + "logits/chosen": 14.339700698852539, + "logits/rejected": 16.159269332885742, + "logps/chosen": -10.706647872924805, + "logps/rejected": -14.890447616577148, + "loss": 0.6947, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018318627029657364, + "rewards/margins": 0.04457775130867958, + "rewards/rejected": -0.026259124279022217, + "step": 24 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 36.16238724964674, + "learning_rate": 1.059322033898305e-07, + "logits/chosen": 12.644755363464355, + "logits/rejected": 14.913492202758789, + "logps/chosen": -12.103818893432617, + "logps/rejected": -14.517330169677734, + "loss": 0.7023, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.03186403214931488, + "rewards/margins": -0.08982641249895096, + "rewards/rejected": 0.12169044464826584, + "step": 25 + }, + { + "epoch": 0.4406779661016949, + "grad_norm": 34.58148372441541, + "learning_rate": 1.1016949152542372e-07, + "logits/chosen": 16.518342971801758, + "logits/rejected": 15.1292142868042, + "logps/chosen": -8.292799949645996, + "logps/rejected": -11.320685386657715, + "loss": 0.6993, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.052115682512521744, + "rewards/margins": 0.12297011911869049, + "rewards/rejected": -0.07085443288087845, + "step": 26 + }, + { + "epoch": 0.4576271186440678, + "grad_norm": 37.35790607963465, + "learning_rate": 1.1440677966101695e-07, + "logits/chosen": 13.867864608764648, + "logits/rejected": 14.343127250671387, + "logps/chosen": -10.37988567352295, + "logps/rejected": -14.269682884216309, + "loss": 0.7057, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04574650526046753, + "rewards/margins": 0.07950454950332642, + "rewards/rejected": -0.03375804424285889, + "step": 27 + }, + { + "epoch": 0.4745762711864407, + "grad_norm": 32.982896424641616, + "learning_rate": 1.1864406779661017e-07, + "logits/chosen": 12.250972747802734, + "logits/rejected": 14.854573249816895, + "logps/chosen": -8.742226600646973, + "logps/rejected": -10.888657569885254, + "loss": 0.6889, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05363437905907631, + "rewards/margins": 0.06778371334075928, + "rewards/rejected": -0.014149338006973267, + "step": 28 + }, + { + "epoch": 0.4915254237288136, + "grad_norm": 33.38571979724709, + "learning_rate": 1.228813559322034e-07, + "logits/chosen": 13.085405349731445, + "logits/rejected": 11.611092567443848, + "logps/chosen": -8.457245826721191, + "logps/rejected": -11.445099830627441, + "loss": 0.6947, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.017215125262737274, + "rewards/margins": -0.09329110383987427, + "rewards/rejected": 0.076075978577137, + "step": 29 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 36.829555030284546, + "learning_rate": 1.271186440677966e-07, + "logits/chosen": 12.227721214294434, + "logits/rejected": 12.230497360229492, + "logps/chosen": -12.069671630859375, + "logps/rejected": -13.662412643432617, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005466394126415253, + "rewards/margins": 0.025227222591638565, + "rewards/rejected": -0.030693616718053818, + "step": 30 + }, + { + "epoch": 0.5254237288135594, + "grad_norm": 35.20409336336582, + "learning_rate": 1.3135593220338984e-07, + "logits/chosen": 10.929096221923828, + "logits/rejected": 13.200987815856934, + "logps/chosen": -11.148241996765137, + "logps/rejected": -12.558090209960938, + "loss": 0.6954, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05978451669216156, + "rewards/margins": -0.03725840896368027, + "rewards/rejected": -0.022526109591126442, + "step": 31 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 35.6615556471091, + "learning_rate": 1.3559322033898305e-07, + "logits/chosen": 10.920124053955078, + "logits/rejected": 11.50726318359375, + "logps/chosen": -10.602476119995117, + "logps/rejected": -14.5696382522583, + "loss": 0.7016, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.040968988090753555, + "rewards/margins": 0.10942812263965607, + "rewards/rejected": -0.06845913827419281, + "step": 32 + }, + { + "epoch": 0.559322033898305, + "grad_norm": 36.15961322810205, + "learning_rate": 1.3983050847457625e-07, + "logits/chosen": 14.529788970947266, + "logits/rejected": 13.727234840393066, + "logps/chosen": -10.854070663452148, + "logps/rejected": -17.989377975463867, + "loss": 0.6931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03216388076543808, + "rewards/margins": -0.0035803131759166718, + "rewards/rejected": -0.028583567589521408, + "step": 33 + }, + { + "epoch": 0.576271186440678, + "grad_norm": 35.17806533487164, + "learning_rate": 1.440677966101695e-07, + "logits/chosen": 13.581205368041992, + "logits/rejected": 16.608570098876953, + "logps/chosen": -8.734363555908203, + "logps/rejected": -11.665853500366211, + "loss": 0.705, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04102402180433273, + "rewards/margins": -0.013214722275733948, + "rewards/rejected": 0.05423874408006668, + "step": 34 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 37.36296835575698, + "learning_rate": 1.483050847457627e-07, + "logits/chosen": 18.549928665161133, + "logits/rejected": 16.754329681396484, + "logps/chosen": -7.439009666442871, + "logps/rejected": -17.70270347595215, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02313202992081642, + "rewards/margins": -0.03136378899216652, + "rewards/rejected": 0.05449581891298294, + "step": 35 + }, + { + "epoch": 0.6101694915254238, + "grad_norm": 34.998157162726095, + "learning_rate": 1.5254237288135593e-07, + "logits/chosen": 12.90966796875, + "logits/rejected": 13.895225524902344, + "logps/chosen": -8.504295349121094, + "logps/rejected": -10.12105941772461, + "loss": 0.7, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.01411505788564682, + "rewards/margins": -0.08190415799617767, + "rewards/rejected": 0.06778909265995026, + "step": 36 + }, + { + "epoch": 0.6271186440677966, + "grad_norm": 36.426085212372584, + "learning_rate": 1.5677966101694915e-07, + "logits/chosen": 15.71446418762207, + "logits/rejected": 16.843730926513672, + "logps/chosen": -10.690804481506348, + "logps/rejected": -9.679794311523438, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02468421310186386, + "rewards/margins": 0.05196612328290939, + "rewards/rejected": -0.027281910181045532, + "step": 37 + }, + { + "epoch": 0.6440677966101694, + "grad_norm": 35.237590028285744, + "learning_rate": 1.6101694915254234e-07, + "logits/chosen": 16.050735473632812, + "logits/rejected": 12.214224815368652, + "logps/chosen": -11.611053466796875, + "logps/rejected": -20.413999557495117, + "loss": 0.6781, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04573787748813629, + "rewards/margins": 0.12110111117362976, + "rewards/rejected": -0.07536323368549347, + "step": 38 + }, + { + "epoch": 0.6610169491525424, + "grad_norm": 33.16682284542454, + "learning_rate": 1.6525423728813559e-07, + "logits/chosen": 12.356346130371094, + "logits/rejected": 15.030713081359863, + "logps/chosen": -14.8650541305542, + "logps/rejected": -17.080093383789062, + "loss": 0.7061, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.027096569538116455, + "rewards/margins": 0.07355399429798126, + "rewards/rejected": -0.04645742475986481, + "step": 39 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 34.805189868761, + "learning_rate": 1.694915254237288e-07, + "logits/chosen": 11.95240592956543, + "logits/rejected": 11.009622573852539, + "logps/chosen": -7.867511749267578, + "logps/rejected": -13.811777114868164, + "loss": 0.6888, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0293671116232872, + "rewards/margins": 0.0956479161977768, + "rewards/rejected": -0.06628081202507019, + "step": 40 + }, + { + "epoch": 0.6949152542372882, + "grad_norm": 35.56777767064069, + "learning_rate": 1.7372881355932202e-07, + "logits/chosen": 15.345881462097168, + "logits/rejected": 10.699317932128906, + "logps/chosen": -8.943795204162598, + "logps/rejected": -15.38254451751709, + "loss": 0.6941, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.008422672748565674, + "rewards/margins": 0.005998820066452026, + "rewards/rejected": 0.0024238526821136475, + "step": 41 + }, + { + "epoch": 0.711864406779661, + "grad_norm": 32.67790911292791, + "learning_rate": 1.7796610169491524e-07, + "logits/chosen": 13.560546875, + "logits/rejected": 13.775779724121094, + "logps/chosen": -6.909335136413574, + "logps/rejected": -10.375238418579102, + "loss": 0.6786, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0649556964635849, + "rewards/margins": 0.06534145772457123, + "rewards/rejected": -0.0003857612609863281, + "step": 42 + }, + { + "epoch": 0.7288135593220338, + "grad_norm": 37.03733040428068, + "learning_rate": 1.8220338983050846e-07, + "logits/chosen": 12.041620254516602, + "logits/rejected": 14.68472957611084, + "logps/chosen": -15.107216835021973, + "logps/rejected": -14.546531677246094, + "loss": 0.7004, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.028534866869449615, + "rewards/margins": -0.001826740801334381, + "rewards/rejected": 0.030361607670783997, + "step": 43 + }, + { + "epoch": 0.7457627118644068, + "grad_norm": 32.84788840363755, + "learning_rate": 1.8644067796610168e-07, + "logits/chosen": 14.930094718933105, + "logits/rejected": 17.373828887939453, + "logps/chosen": -12.436162948608398, + "logps/rejected": -16.592809677124023, + "loss": 0.715, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.033111147582530975, + "rewards/margins": -0.061914995312690735, + "rewards/rejected": 0.02880384773015976, + "step": 44 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 33.21001966655798, + "learning_rate": 1.906779661016949e-07, + "logits/chosen": 12.37698745727539, + "logits/rejected": 12.670955657958984, + "logps/chosen": -9.219972610473633, + "logps/rejected": -16.6455078125, + "loss": 0.7021, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06785572320222855, + "rewards/margins": -0.00494810938835144, + "rewards/rejected": -0.0629076212644577, + "step": 45 + }, + { + "epoch": 0.7796610169491526, + "grad_norm": 34.44107326645517, + "learning_rate": 1.9491525423728814e-07, + "logits/chosen": 12.624075889587402, + "logits/rejected": 13.895041465759277, + "logps/chosen": -10.401080131530762, + "logps/rejected": -12.175220489501953, + "loss": 0.6812, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010198384523391724, + "rewards/margins": 0.061251018196344376, + "rewards/rejected": -0.05105263367295265, + "step": 46 + }, + { + "epoch": 0.7966101694915254, + "grad_norm": 35.32959175447224, + "learning_rate": 1.9915254237288134e-07, + "logits/chosen": 12.925259590148926, + "logits/rejected": 14.440532684326172, + "logps/chosen": -12.382270812988281, + "logps/rejected": -11.013741493225098, + "loss": 0.6828, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04461570829153061, + "rewards/margins": 0.06247672438621521, + "rewards/rejected": -0.0178610160946846, + "step": 47 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 35.75324287683784, + "learning_rate": 2.0338983050847458e-07, + "logits/chosen": 13.918556213378906, + "logits/rejected": 14.322575569152832, + "logps/chosen": -11.076623916625977, + "logps/rejected": -16.675630569458008, + "loss": 0.6545, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05672513693571091, + "rewards/margins": 0.19194935262203217, + "rewards/rejected": -0.13522422313690186, + "step": 48 + }, + { + "epoch": 0.8305084745762712, + "grad_norm": 31.28472915428401, + "learning_rate": 2.076271186440678e-07, + "logits/chosen": 14.541460037231445, + "logits/rejected": 16.299976348876953, + "logps/chosen": -7.96132230758667, + "logps/rejected": -9.541074752807617, + "loss": 0.6772, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021021980792284012, + "rewards/margins": 0.09664987027645111, + "rewards/rejected": -0.0756278932094574, + "step": 49 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 30.522178193238684, + "learning_rate": 2.11864406779661e-07, + "logits/chosen": 12.885459899902344, + "logits/rejected": 14.966306686401367, + "logps/chosen": -8.131207466125488, + "logps/rejected": -14.213689804077148, + "loss": 0.6884, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.035199426114559174, + "rewards/margins": 0.02447110041975975, + "rewards/rejected": 0.010728325694799423, + "step": 50 + }, + { + "epoch": 0.864406779661017, + "grad_norm": 36.47219407480772, + "learning_rate": 2.1610169491525424e-07, + "logits/chosen": 15.588902473449707, + "logits/rejected": 15.775507926940918, + "logps/chosen": -11.82562255859375, + "logps/rejected": -19.91376495361328, + "loss": 0.6735, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.00851084291934967, + "rewards/margins": 0.03356066346168518, + "rewards/rejected": -0.02504982054233551, + "step": 51 + }, + { + "epoch": 0.8813559322033898, + "grad_norm": 34.09388988223221, + "learning_rate": 2.2033898305084743e-07, + "logits/chosen": 11.618022918701172, + "logits/rejected": 12.69122314453125, + "logps/chosen": -11.289080619812012, + "logps/rejected": -14.602575302124023, + "loss": 0.6646, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02516426146030426, + "rewards/margins": 0.05319439619779587, + "rewards/rejected": -0.028030134737491608, + "step": 52 + }, + { + "epoch": 0.8983050847457628, + "grad_norm": 35.24051417693975, + "learning_rate": 2.2457627118644068e-07, + "logits/chosen": 13.005451202392578, + "logits/rejected": 15.351945877075195, + "logps/chosen": -9.357630729675293, + "logps/rejected": -12.059124946594238, + "loss": 0.6868, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00409119576215744, + "rewards/margins": -0.020457960665225983, + "rewards/rejected": 0.016366764903068542, + "step": 53 + }, + { + "epoch": 0.9152542372881356, + "grad_norm": 31.849779567852845, + "learning_rate": 2.288135593220339e-07, + "logits/chosen": 15.671916007995605, + "logits/rejected": 15.454113006591797, + "logps/chosen": -10.503100395202637, + "logps/rejected": -10.274392127990723, + "loss": 0.6771, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06473356485366821, + "rewards/margins": 0.06081206351518631, + "rewards/rejected": 0.003921501338481903, + "step": 54 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 33.88074879519746, + "learning_rate": 2.330508474576271e-07, + "logits/chosen": 13.267528533935547, + "logits/rejected": 13.739250183105469, + "logps/chosen": -11.132416725158691, + "logps/rejected": -13.396270751953125, + "loss": 0.6706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09757491946220398, + "rewards/margins": 0.16509681940078735, + "rewards/rejected": -0.06752191483974457, + "step": 55 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 33.372486380706356, + "learning_rate": 2.3728813559322033e-07, + "logits/chosen": 15.237256050109863, + "logits/rejected": 15.872620582580566, + "logps/chosen": -8.667885780334473, + "logps/rejected": -11.945577621459961, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01572229713201523, + "rewards/margins": 0.08544715493917465, + "rewards/rejected": -0.06972485780715942, + "step": 56 + }, + { + "epoch": 0.9661016949152542, + "grad_norm": 32.086485860665775, + "learning_rate": 2.4152542372881355e-07, + "logits/chosen": 17.312305450439453, + "logits/rejected": 15.455362319946289, + "logps/chosen": -13.120665550231934, + "logps/rejected": -18.88037872314453, + "loss": 0.6377, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07254119217395782, + "rewards/margins": 0.16330546140670776, + "rewards/rejected": -0.09076426923274994, + "step": 57 + }, + { + "epoch": 0.9830508474576272, + "grad_norm": 32.83780367143196, + "learning_rate": 2.457627118644068e-07, + "logits/chosen": 12.152743339538574, + "logits/rejected": 15.03101634979248, + "logps/chosen": -13.570626258850098, + "logps/rejected": -15.182299613952637, + "loss": 0.6645, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05790925770998001, + "rewards/margins": 0.16572001576423645, + "rewards/rejected": -0.10781076550483704, + "step": 58 + }, + { + "epoch": 1.0, + "grad_norm": 33.173046993726444, + "learning_rate": 2.5e-07, + "logits/chosen": 13.197469711303711, + "logits/rejected": 12.665360450744629, + "logps/chosen": -9.60036849975586, + "logps/rejected": -13.494672775268555, + "loss": 0.6636, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05081889033317566, + "rewards/margins": 0.08736976981163025, + "rewards/rejected": -0.03655087947845459, + "step": 59 + }, + { + "epoch": 1.0169491525423728, + "grad_norm": 34.12246854349812, + "learning_rate": 2.542372881355932e-07, + "logits/chosen": 16.638633728027344, + "logits/rejected": 14.497950553894043, + "logps/chosen": -7.4926934242248535, + "logps/rejected": -9.887690544128418, + "loss": 0.6839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02329724282026291, + "rewards/margins": 0.08896321058273315, + "rewards/rejected": -0.06566596031188965, + "step": 60 + }, + { + "epoch": 1.0338983050847457, + "grad_norm": 32.48073277255417, + "learning_rate": 2.584745762711864e-07, + "logits/chosen": 13.186823844909668, + "logits/rejected": 12.39754581451416, + "logps/chosen": -7.40197229385376, + "logps/rejected": -9.912740707397461, + "loss": 0.6404, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0660327598452568, + "rewards/margins": 0.08431532979011536, + "rewards/rejected": -0.01828256994485855, + "step": 61 + }, + { + "epoch": 1.0508474576271187, + "grad_norm": 31.810793473176147, + "learning_rate": 2.6271186440677967e-07, + "logits/chosen": 9.925346374511719, + "logits/rejected": 12.604837417602539, + "logps/chosen": -9.344438552856445, + "logps/rejected": -12.725444793701172, + "loss": 0.6681, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.016713112592697144, + "rewards/margins": 0.06962401419878006, + "rewards/rejected": -0.0863371342420578, + "step": 62 + }, + { + "epoch": 1.0677966101694916, + "grad_norm": 31.270602966387727, + "learning_rate": 2.6694915254237286e-07, + "logits/chosen": 12.649188041687012, + "logits/rejected": 11.9768648147583, + "logps/chosen": -10.402519226074219, + "logps/rejected": -16.783111572265625, + "loss": 0.6522, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022982105612754822, + "rewards/margins": 0.12150964140892029, + "rewards/rejected": -0.1444917470216751, + "step": 63 + }, + { + "epoch": 1.0847457627118644, + "grad_norm": 32.65672591502704, + "learning_rate": 2.711864406779661e-07, + "logits/chosen": 14.806360244750977, + "logits/rejected": 17.47248077392578, + "logps/chosen": -11.753227233886719, + "logps/rejected": -15.529240608215332, + "loss": 0.6439, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002056751400232315, + "rewards/margins": 0.05395819991827011, + "rewards/rejected": -0.056014951318502426, + "step": 64 + }, + { + "epoch": 1.1016949152542372, + "grad_norm": 30.439729822840757, + "learning_rate": 2.754237288135593e-07, + "logits/chosen": 13.062178611755371, + "logits/rejected": 15.618377685546875, + "logps/chosen": -10.648831367492676, + "logps/rejected": -17.41346549987793, + "loss": 0.6379, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.014630310237407684, + "rewards/margins": 0.04037298262119293, + "rewards/rejected": -0.025742672383785248, + "step": 65 + }, + { + "epoch": 1.11864406779661, + "grad_norm": 33.491004186333484, + "learning_rate": 2.796610169491525e-07, + "logits/chosen": 11.60132884979248, + "logits/rejected": 12.90021800994873, + "logps/chosen": -10.080259323120117, + "logps/rejected": -14.711465835571289, + "loss": 0.6572, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.021400080993771553, + "rewards/margins": 0.12133632600307465, + "rewards/rejected": -0.14273640513420105, + "step": 66 + }, + { + "epoch": 1.1355932203389831, + "grad_norm": 31.23541277963251, + "learning_rate": 2.838983050847458e-07, + "logits/chosen": 12.128422737121582, + "logits/rejected": 14.061056137084961, + "logps/chosen": -9.6464204788208, + "logps/rejected": -9.97637939453125, + "loss": 0.654, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.069414421916008, + "rewards/margins": 0.04286954551935196, + "rewards/rejected": -0.11228397488594055, + "step": 67 + }, + { + "epoch": 1.152542372881356, + "grad_norm": 31.64971711981137, + "learning_rate": 2.88135593220339e-07, + "logits/chosen": 13.560795783996582, + "logits/rejected": 15.241508483886719, + "logps/chosen": -9.716387748718262, + "logps/rejected": -16.67053985595703, + "loss": 0.6505, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08715072274208069, + "rewards/margins": 0.1715814769268036, + "rewards/rejected": -0.0844307616353035, + "step": 68 + }, + { + "epoch": 1.1694915254237288, + "grad_norm": 31.743762826648183, + "learning_rate": 2.923728813559322e-07, + "logits/chosen": 12.650971412658691, + "logits/rejected": 12.528360366821289, + "logps/chosen": -8.169756889343262, + "logps/rejected": -11.55956745147705, + "loss": 0.644, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.004966534674167633, + "rewards/margins": 0.04012346267700195, + "rewards/rejected": -0.045089997351169586, + "step": 69 + }, + { + "epoch": 1.1864406779661016, + "grad_norm": 27.05553118573593, + "learning_rate": 2.966101694915254e-07, + "logits/chosen": 13.4546537399292, + "logits/rejected": 12.574522018432617, + "logps/chosen": -8.672929763793945, + "logps/rejected": -14.113090515136719, + "loss": 0.6159, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.018237508833408356, + "rewards/margins": 0.1680130809545517, + "rewards/rejected": -0.14977556467056274, + "step": 70 + }, + { + "epoch": 1.2033898305084745, + "grad_norm": 29.505587279506713, + "learning_rate": 3.008474576271186e-07, + "logits/chosen": 13.520177841186523, + "logits/rejected": 13.430837631225586, + "logps/chosen": -9.639127731323242, + "logps/rejected": -13.659337043762207, + "loss": 0.6309, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08626913279294968, + "rewards/margins": 0.041161876171827316, + "rewards/rejected": -0.1274310052394867, + "step": 71 + }, + { + "epoch": 1.2203389830508475, + "grad_norm": 31.37091432132666, + "learning_rate": 3.0508474576271186e-07, + "logits/chosen": 11.49694538116455, + "logits/rejected": 11.778303146362305, + "logps/chosen": -11.05677604675293, + "logps/rejected": -15.848922729492188, + "loss": 0.6181, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16539010405540466, + "rewards/margins": 0.4443053603172302, + "rewards/rejected": -0.27891525626182556, + "step": 72 + }, + { + "epoch": 1.2372881355932204, + "grad_norm": 29.563451670160653, + "learning_rate": 3.093220338983051e-07, + "logits/chosen": 15.092836380004883, + "logits/rejected": 17.037139892578125, + "logps/chosen": -9.66404914855957, + "logps/rejected": -11.75003433227539, + "loss": 0.6278, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.073647640645504, + "rewards/margins": 0.2144932746887207, + "rewards/rejected": -0.1408456414937973, + "step": 73 + }, + { + "epoch": 1.2542372881355932, + "grad_norm": 32.962262587265975, + "learning_rate": 3.135593220338983e-07, + "logits/chosen": 12.5877103805542, + "logits/rejected": 13.999431610107422, + "logps/chosen": -9.522225379943848, + "logps/rejected": -12.351940155029297, + "loss": 0.6207, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0037403330206871033, + "rewards/margins": 0.20362704992294312, + "rewards/rejected": -0.19988670945167542, + "step": 74 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 31.381915402396228, + "learning_rate": 3.177966101694915e-07, + "logits/chosen": 11.979763984680176, + "logits/rejected": 13.482416152954102, + "logps/chosen": -8.215496063232422, + "logps/rejected": -11.69947624206543, + "loss": 0.6179, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01762581244111061, + "rewards/margins": 0.14459285140037537, + "rewards/rejected": -0.12696704268455505, + "step": 75 + }, + { + "epoch": 1.288135593220339, + "grad_norm": 30.626294632951648, + "learning_rate": 3.220338983050847e-07, + "logits/chosen": 13.793957710266113, + "logits/rejected": 14.482337951660156, + "logps/chosen": -8.5573091506958, + "logps/rejected": -12.453176498413086, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08360793441534042, + "rewards/margins": 0.08104290813207626, + "rewards/rejected": -0.1646508425474167, + "step": 76 + }, + { + "epoch": 1.305084745762712, + "grad_norm": 28.664877026711565, + "learning_rate": 3.26271186440678e-07, + "logits/chosen": 11.402955055236816, + "logits/rejected": 12.819219589233398, + "logps/chosen": -14.218494415283203, + "logps/rejected": -19.227710723876953, + "loss": 0.6185, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13961467146873474, + "rewards/margins": 0.4163624048233032, + "rewards/rejected": -0.2767477333545685, + "step": 77 + }, + { + "epoch": 1.3220338983050848, + "grad_norm": 30.319960677264497, + "learning_rate": 3.3050847457627117e-07, + "logits/chosen": 15.883766174316406, + "logits/rejected": 15.542499542236328, + "logps/chosen": -8.39336109161377, + "logps/rejected": -14.079061508178711, + "loss": 0.6211, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014538049697875977, + "rewards/margins": 0.14333194494247437, + "rewards/rejected": -0.15786999464035034, + "step": 78 + }, + { + "epoch": 1.3389830508474576, + "grad_norm": 31.140285215166347, + "learning_rate": 3.3474576271186436e-07, + "logits/chosen": 16.721675872802734, + "logits/rejected": 16.151338577270508, + "logps/chosen": -8.125932693481445, + "logps/rejected": -10.540145874023438, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009473755955696106, + "rewards/margins": 0.20827658474445343, + "rewards/rejected": -0.19880282878875732, + "step": 79 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 28.563629733456608, + "learning_rate": 3.389830508474576e-07, + "logits/chosen": 15.79937744140625, + "logits/rejected": 15.94279670715332, + "logps/chosen": -9.857381820678711, + "logps/rejected": -15.813158988952637, + "loss": 0.5986, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006986264139413834, + "rewards/margins": 0.2487819939851761, + "rewards/rejected": -0.2557682693004608, + "step": 80 + }, + { + "epoch": 1.3728813559322033, + "grad_norm": 28.565703501108878, + "learning_rate": 3.432203389830508e-07, + "logits/chosen": 12.46065616607666, + "logits/rejected": 12.378948211669922, + "logps/chosen": -9.123543739318848, + "logps/rejected": -12.124061584472656, + "loss": 0.5794, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09940391778945923, + "rewards/margins": 0.17284835875034332, + "rewards/rejected": -0.0734444409608841, + "step": 81 + }, + { + "epoch": 1.3898305084745763, + "grad_norm": 28.291061250399547, + "learning_rate": 3.4745762711864405e-07, + "logits/chosen": 9.341460227966309, + "logits/rejected": 10.108131408691406, + "logps/chosen": -7.376352787017822, + "logps/rejected": -6.77155876159668, + "loss": 0.6319, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.321180939674377e-05, + "rewards/margins": 0.13180197775363922, + "rewards/rejected": -0.13188520073890686, + "step": 82 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 29.854142613641514, + "learning_rate": 3.516949152542373e-07, + "logits/chosen": 12.311843872070312, + "logits/rejected": 14.436249732971191, + "logps/chosen": -10.377118110656738, + "logps/rejected": -13.012676239013672, + "loss": 0.6077, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.10611987113952637, + "rewards/margins": 0.29748332500457764, + "rewards/rejected": -0.19136345386505127, + "step": 83 + }, + { + "epoch": 1.423728813559322, + "grad_norm": 35.835030120584115, + "learning_rate": 3.559322033898305e-07, + "logits/chosen": 9.823848724365234, + "logits/rejected": 11.089227676391602, + "logps/chosen": -9.431833267211914, + "logps/rejected": -10.586324691772461, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014358047395944595, + "rewards/margins": 0.16177169978618622, + "rewards/rejected": -0.14741365611553192, + "step": 84 + }, + { + "epoch": 1.4406779661016949, + "grad_norm": 30.07505572818561, + "learning_rate": 3.601694915254237e-07, + "logits/chosen": 12.886320114135742, + "logits/rejected": 13.918362617492676, + "logps/chosen": -8.843177795410156, + "logps/rejected": -10.307863235473633, + "loss": 0.6085, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02218719571828842, + "rewards/margins": 0.27985820174217224, + "rewards/rejected": -0.2576709985733032, + "step": 85 + }, + { + "epoch": 1.457627118644068, + "grad_norm": 32.81575222960369, + "learning_rate": 3.644067796610169e-07, + "logits/chosen": 9.922818183898926, + "logits/rejected": 8.803034782409668, + "logps/chosen": -10.362354278564453, + "logps/rejected": -14.556231498718262, + "loss": 0.6153, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04402754455804825, + "rewards/margins": 0.2732781767845154, + "rewards/rejected": -0.22925063967704773, + "step": 86 + }, + { + "epoch": 1.4745762711864407, + "grad_norm": 31.143487290913797, + "learning_rate": 3.6864406779661017e-07, + "logits/chosen": 13.284112930297852, + "logits/rejected": 13.448722839355469, + "logps/chosen": -8.040769577026367, + "logps/rejected": -14.666115760803223, + "loss": 0.5883, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.025847814977169037, + "rewards/margins": 0.34201639890670776, + "rewards/rejected": -0.367864191532135, + "step": 87 + }, + { + "epoch": 1.4915254237288136, + "grad_norm": 27.87355361043025, + "learning_rate": 3.7288135593220336e-07, + "logits/chosen": 11.64570426940918, + "logits/rejected": 12.12906551361084, + "logps/chosen": -10.226296424865723, + "logps/rejected": -15.823101997375488, + "loss": 0.5825, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.049983687698841095, + "rewards/margins": 0.46055829524993896, + "rewards/rejected": -0.41057464480400085, + "step": 88 + }, + { + "epoch": 1.5084745762711864, + "grad_norm": 28.514766608508296, + "learning_rate": 3.771186440677966e-07, + "logits/chosen": 10.8696928024292, + "logits/rejected": 12.655576705932617, + "logps/chosen": -11.517805099487305, + "logps/rejected": -13.284784317016602, + "loss": 0.6049, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.029194893315434456, + "rewards/margins": 0.3180268406867981, + "rewards/rejected": -0.3472217321395874, + "step": 89 + }, + { + "epoch": 1.5254237288135593, + "grad_norm": 30.117702099043008, + "learning_rate": 3.813559322033898e-07, + "logits/chosen": 16.787080764770508, + "logits/rejected": 14.550413131713867, + "logps/chosen": -10.547918319702148, + "logps/rejected": -18.722667694091797, + "loss": 0.58, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04852820187807083, + "rewards/margins": 0.4919710159301758, + "rewards/rejected": -0.44344282150268555, + "step": 90 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 27.570356926191867, + "learning_rate": 3.8559322033898304e-07, + "logits/chosen": 10.767607688903809, + "logits/rejected": 13.919551849365234, + "logps/chosen": -10.187631607055664, + "logps/rejected": -12.379902839660645, + "loss": 0.5751, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15341579914093018, + "rewards/margins": 0.4169505834579468, + "rewards/rejected": -0.2635347247123718, + "step": 91 + }, + { + "epoch": 1.559322033898305, + "grad_norm": 32.16465688493385, + "learning_rate": 3.898305084745763e-07, + "logits/chosen": 9.810237884521484, + "logits/rejected": 10.345829010009766, + "logps/chosen": -9.855244636535645, + "logps/rejected": -11.927900314331055, + "loss": 0.5719, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01897813379764557, + "rewards/margins": 0.12236368656158447, + "rewards/rejected": -0.1033855527639389, + "step": 92 + }, + { + "epoch": 1.576271186440678, + "grad_norm": 30.793068504647245, + "learning_rate": 3.940677966101695e-07, + "logits/chosen": 8.736921310424805, + "logits/rejected": 12.200434684753418, + "logps/chosen": -9.910384178161621, + "logps/rejected": -13.29830265045166, + "loss": 0.6152, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08518315851688385, + "rewards/margins": 0.37882399559020996, + "rewards/rejected": -0.2936408817768097, + "step": 93 + }, + { + "epoch": 1.5932203389830508, + "grad_norm": 29.423734502829934, + "learning_rate": 3.9830508474576267e-07, + "logits/chosen": 12.854459762573242, + "logits/rejected": 11.398431777954102, + "logps/chosen": -8.642826080322266, + "logps/rejected": -16.943462371826172, + "loss": 0.588, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04286940395832062, + "rewards/margins": 0.4083571434020996, + "rewards/rejected": -0.4512265622615814, + "step": 94 + }, + { + "epoch": 1.6101694915254239, + "grad_norm": 36.27015230712179, + "learning_rate": 4.025423728813559e-07, + "logits/chosen": 11.945028305053711, + "logits/rejected": 11.686726570129395, + "logps/chosen": -7.234800815582275, + "logps/rejected": -14.528942108154297, + "loss": 0.5922, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.024997025728225708, + "rewards/margins": 0.5870181322097778, + "rewards/rejected": -0.5620210766792297, + "step": 95 + }, + { + "epoch": 1.6271186440677967, + "grad_norm": 31.19227016453443, + "learning_rate": 4.0677966101694916e-07, + "logits/chosen": 13.797384262084961, + "logits/rejected": 14.542064666748047, + "logps/chosen": -11.304056167602539, + "logps/rejected": -11.900882720947266, + "loss": 0.5784, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0709955096244812, + "rewards/margins": 0.1039753258228302, + "rewards/rejected": -0.032979816198349, + "step": 96 + }, + { + "epoch": 1.6440677966101696, + "grad_norm": 27.4635272839829, + "learning_rate": 4.1101694915254236e-07, + "logits/chosen": 13.015619277954102, + "logits/rejected": 11.862337112426758, + "logps/chosen": -7.38511323928833, + "logps/rejected": -10.880597114562988, + "loss": 0.5627, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01526942104101181, + "rewards/margins": 0.4992905557155609, + "rewards/rejected": -0.4840211272239685, + "step": 97 + }, + { + "epoch": 1.6610169491525424, + "grad_norm": 27.238291695236995, + "learning_rate": 4.152542372881356e-07, + "logits/chosen": 10.700840950012207, + "logits/rejected": 11.397660255432129, + "logps/chosen": -9.450662612915039, + "logps/rejected": -12.259298324584961, + "loss": 0.5442, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0722656399011612, + "rewards/margins": 0.16584718227386475, + "rewards/rejected": -0.09358153492212296, + "step": 98 + }, + { + "epoch": 1.6779661016949152, + "grad_norm": 28.24564494185518, + "learning_rate": 4.194915254237288e-07, + "logits/chosen": 11.284452438354492, + "logits/rejected": 14.807559967041016, + "logps/chosen": -10.771215438842773, + "logps/rejected": -11.973726272583008, + "loss": 0.5979, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.051828235387802124, + "rewards/margins": 0.19837984442710876, + "rewards/rejected": -0.14655160903930664, + "step": 99 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 32.39712681739478, + "learning_rate": 4.23728813559322e-07, + "logits/chosen": 4.555298328399658, + "logits/rejected": 9.199655532836914, + "logps/chosen": -13.74587345123291, + "logps/rejected": -13.853545188903809, + "loss": 0.5942, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.051489535719156265, + "rewards/margins": 0.5184900760650635, + "rewards/rejected": -0.46700048446655273, + "step": 100 + }, + { + "epoch": 1.711864406779661, + "grad_norm": 27.79221873761442, + "learning_rate": 4.279661016949153e-07, + "logits/chosen": 13.323826789855957, + "logits/rejected": 11.545331954956055, + "logps/chosen": -8.155689239501953, + "logps/rejected": -13.511098861694336, + "loss": 0.5431, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009415552020072937, + "rewards/margins": 0.5583093762397766, + "rewards/rejected": -0.5488938689231873, + "step": 101 + }, + { + "epoch": 1.7288135593220337, + "grad_norm": 28.264522962910007, + "learning_rate": 4.322033898305085e-07, + "logits/chosen": 8.916523933410645, + "logits/rejected": 10.580684661865234, + "logps/chosen": -10.751327514648438, + "logps/rejected": -10.95659351348877, + "loss": 0.5516, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1171780452132225, + "rewards/margins": 0.515388548374176, + "rewards/rejected": -0.39821046590805054, + "step": 102 + }, + { + "epoch": 1.7457627118644068, + "grad_norm": 26.154236239225785, + "learning_rate": 4.3644067796610167e-07, + "logits/chosen": 13.819053649902344, + "logits/rejected": 14.223098754882812, + "logps/chosen": -8.082857131958008, + "logps/rejected": -13.987797737121582, + "loss": 0.5396, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05364694446325302, + "rewards/margins": 0.7236331105232239, + "rewards/rejected": -0.6699862480163574, + "step": 103 + }, + { + "epoch": 1.7627118644067796, + "grad_norm": 27.50021000006553, + "learning_rate": 4.4067796610169486e-07, + "logits/chosen": 11.863626480102539, + "logits/rejected": 10.52764892578125, + "logps/chosen": -6.9673237800598145, + "logps/rejected": -8.23715877532959, + "loss": 0.5938, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14590652287006378, + "rewards/margins": 0.16770131886005402, + "rewards/rejected": -0.02179480344057083, + "step": 104 + }, + { + "epoch": 1.7796610169491527, + "grad_norm": 28.474373560254055, + "learning_rate": 4.449152542372881e-07, + "logits/chosen": 16.65330696105957, + "logits/rejected": 16.556903839111328, + "logps/chosen": -11.08348274230957, + "logps/rejected": -19.825658798217773, + "loss": 0.5366, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.06215184926986694, + "rewards/margins": 0.8033376932144165, + "rewards/rejected": -0.7411857843399048, + "step": 105 + }, + { + "epoch": 1.7966101694915255, + "grad_norm": 27.473650011351655, + "learning_rate": 4.4915254237288135e-07, + "logits/chosen": 13.014068603515625, + "logits/rejected": 13.48980712890625, + "logps/chosen": -9.113512992858887, + "logps/rejected": -13.425411224365234, + "loss": 0.5333, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.029549360275268555, + "rewards/margins": 0.8095859289169312, + "rewards/rejected": -0.7800365686416626, + "step": 106 + }, + { + "epoch": 1.8135593220338984, + "grad_norm": 28.191261008380913, + "learning_rate": 4.5338983050847454e-07, + "logits/chosen": 13.008545875549316, + "logits/rejected": 11.879390716552734, + "logps/chosen": -9.581355094909668, + "logps/rejected": -15.122631072998047, + "loss": 0.5785, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0392971932888031, + "rewards/margins": 0.02920607477426529, + "rewards/rejected": -0.06850326806306839, + "step": 107 + }, + { + "epoch": 1.8305084745762712, + "grad_norm": 31.619473725308314, + "learning_rate": 4.576271186440678e-07, + "logits/chosen": 10.609634399414062, + "logits/rejected": 10.654576301574707, + "logps/chosen": -7.444589138031006, + "logps/rejected": -15.026788711547852, + "loss": 0.5572, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08635948598384857, + "rewards/margins": 0.7383730411529541, + "rewards/rejected": -0.6520135402679443, + "step": 108 + }, + { + "epoch": 1.847457627118644, + "grad_norm": 32.17526198659112, + "learning_rate": 4.61864406779661e-07, + "logits/chosen": 12.902010917663574, + "logits/rejected": 11.985198020935059, + "logps/chosen": -7.153221130371094, + "logps/rejected": -13.385372161865234, + "loss": 0.571, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.043889474123716354, + "rewards/margins": 0.2885042130947113, + "rewards/rejected": -0.24461475014686584, + "step": 109 + }, + { + "epoch": 1.8644067796610169, + "grad_norm": 27.857085867574753, + "learning_rate": 4.661016949152542e-07, + "logits/chosen": 10.333656311035156, + "logits/rejected": 12.67178726196289, + "logps/chosen": -17.524898529052734, + "logps/rejected": -15.563018798828125, + "loss": 0.5713, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.018213175237178802, + "rewards/margins": 0.060524992644786835, + "rewards/rejected": -0.07873816788196564, + "step": 110 + }, + { + "epoch": 1.8813559322033897, + "grad_norm": 28.764899008069285, + "learning_rate": 4.7033898305084747e-07, + "logits/chosen": 11.594640731811523, + "logits/rejected": 11.983922004699707, + "logps/chosen": -8.291284561157227, + "logps/rejected": -14.15444564819336, + "loss": 0.532, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.017433345317840576, + "rewards/margins": 0.700124979019165, + "rewards/rejected": -0.6826916337013245, + "step": 111 + }, + { + "epoch": 1.8983050847457628, + "grad_norm": 28.655586786759734, + "learning_rate": 4.7457627118644066e-07, + "logits/chosen": 10.416437149047852, + "logits/rejected": 8.874842643737793, + "logps/chosen": -8.323904037475586, + "logps/rejected": -12.19089126586914, + "loss": 0.5498, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0059746429324150085, + "rewards/margins": 0.41193652153015137, + "rewards/rejected": -0.405961811542511, + "step": 112 + }, + { + "epoch": 1.9152542372881356, + "grad_norm": 26.702839696044048, + "learning_rate": 4.788135593220339e-07, + "logits/chosen": 8.583914756774902, + "logits/rejected": 11.768655776977539, + "logps/chosen": -10.75674819946289, + "logps/rejected": -13.667054176330566, + "loss": 0.5249, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04430105537176132, + "rewards/margins": 0.6275098323822021, + "rewards/rejected": -0.5832087993621826, + "step": 113 + }, + { + "epoch": 1.9322033898305084, + "grad_norm": 27.420724418994222, + "learning_rate": 4.830508474576271e-07, + "logits/chosen": 14.770956039428711, + "logits/rejected": 17.315418243408203, + "logps/chosen": -12.913301467895508, + "logps/rejected": -18.054443359375, + "loss": 0.5272, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04427202790975571, + "rewards/margins": 0.9159756302833557, + "rewards/rejected": -0.8717036843299866, + "step": 114 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 29.355284582434965, + "learning_rate": 4.872881355932203e-07, + "logits/chosen": 14.43870735168457, + "logits/rejected": 14.848833084106445, + "logps/chosen": -10.624906539916992, + "logps/rejected": -13.744641304016113, + "loss": 0.5595, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06107292324304581, + "rewards/margins": 0.63875812292099, + "rewards/rejected": -0.5776851773262024, + "step": 115 + }, + { + "epoch": 1.9661016949152543, + "grad_norm": 28.31017869263379, + "learning_rate": 4.915254237288136e-07, + "logits/chosen": 5.89711332321167, + "logits/rejected": 8.889683723449707, + "logps/chosen": -11.735050201416016, + "logps/rejected": -16.08776092529297, + "loss": 0.5137, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.010795101523399353, + "rewards/margins": 0.7134937047958374, + "rewards/rejected": -0.7026985883712769, + "step": 116 + }, + { + "epoch": 1.9830508474576272, + "grad_norm": 25.543721865788335, + "learning_rate": 4.957627118644068e-07, + "logits/chosen": 12.527676582336426, + "logits/rejected": 13.638514518737793, + "logps/chosen": -11.032867431640625, + "logps/rejected": -16.750507354736328, + "loss": 0.4784, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06131690740585327, + "rewards/margins": 0.800919771194458, + "rewards/rejected": -0.7396028637886047, + "step": 117 + }, + { + "epoch": 2.0, + "grad_norm": 27.898132683327685, + "learning_rate": 5e-07, + "logits/chosen": 13.574739456176758, + "logits/rejected": 13.461164474487305, + "logps/chosen": -11.047101020812988, + "logps/rejected": -14.235968589782715, + "loss": 0.5421, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01252034679055214, + "rewards/margins": 0.5880539417266846, + "rewards/rejected": -0.5755336284637451, + "step": 118 + }, + { + "epoch": 2.016949152542373, + "grad_norm": 24.420931650002732, + "learning_rate": 4.99998906143358e-07, + "logits/chosen": 8.436079025268555, + "logits/rejected": 8.032705307006836, + "logps/chosen": -8.76571273803711, + "logps/rejected": -15.768316268920898, + "loss": 0.5087, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.029925979673862457, + "rewards/margins": 0.830885112285614, + "rewards/rejected": -0.8608111143112183, + "step": 119 + }, + { + "epoch": 2.0338983050847457, + "grad_norm": 24.437724993379234, + "learning_rate": 4.999956245830044e-07, + "logits/chosen": 8.596471786499023, + "logits/rejected": 9.941722869873047, + "logps/chosen": -10.456941604614258, + "logps/rejected": -13.767266273498535, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04792099446058273, + "rewards/margins": 0.7964321970939636, + "rewards/rejected": -0.7485113143920898, + "step": 120 + }, + { + "epoch": 2.0508474576271185, + "grad_norm": 26.385817925226156, + "learning_rate": 4.999901553476555e-07, + "logits/chosen": 9.167165756225586, + "logits/rejected": 10.949917793273926, + "logps/chosen": -11.454400062561035, + "logps/rejected": -10.427359580993652, + "loss": 0.5448, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0467328205704689, + "rewards/margins": 0.23926034569740295, + "rewards/rejected": -0.28599315881729126, + "step": 121 + }, + { + "epoch": 2.0677966101694913, + "grad_norm": 26.51488464478305, + "learning_rate": 4.999824984851718e-07, + "logits/chosen": 11.507087707519531, + "logits/rejected": 12.06271743774414, + "logps/chosen": -14.055889129638672, + "logps/rejected": -18.92905044555664, + "loss": 0.506, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05761418491601944, + "rewards/margins": 0.7195183038711548, + "rewards/rejected": -0.6619042158126831, + "step": 122 + }, + { + "epoch": 2.084745762711864, + "grad_norm": 25.46072474496047, + "learning_rate": 4.999726540625574e-07, + "logits/chosen": 9.345511436462402, + "logits/rejected": 5.943528652191162, + "logps/chosen": -9.2318754196167, + "logps/rejected": -15.882528305053711, + "loss": 0.4653, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.012520495802164078, + "rewards/margins": 0.6578124761581421, + "rewards/rejected": -0.6452920436859131, + "step": 123 + }, + { + "epoch": 2.1016949152542375, + "grad_norm": 24.699856400048255, + "learning_rate": 4.999606221659594e-07, + "logits/chosen": 8.952792167663574, + "logits/rejected": 9.421079635620117, + "logps/chosen": -12.475508689880371, + "logps/rejected": -15.839692115783691, + "loss": 0.5147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12024357169866562, + "rewards/margins": 0.6879490613937378, + "rewards/rejected": -0.567705512046814, + "step": 124 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 24.645717371536033, + "learning_rate": 4.999464029006672e-07, + "logits/chosen": 11.617278099060059, + "logits/rejected": 12.094820976257324, + "logps/chosen": -8.739863395690918, + "logps/rejected": -16.15949249267578, + "loss": 0.4914, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10157185047864914, + "rewards/margins": 1.0363792181015015, + "rewards/rejected": -0.9348073601722717, + "step": 125 + }, + { + "epoch": 2.135593220338983, + "grad_norm": 25.01415224410205, + "learning_rate": 4.999299963911115e-07, + "logits/chosen": 8.700315475463867, + "logits/rejected": 9.787883758544922, + "logps/chosen": -9.654312133789062, + "logps/rejected": -10.987586975097656, + "loss": 0.5217, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05968749523162842, + "rewards/margins": 0.37098097801208496, + "rewards/rejected": -0.31129348278045654, + "step": 126 + }, + { + "epoch": 2.152542372881356, + "grad_norm": 24.639525161190353, + "learning_rate": 4.999114027808631e-07, + "logits/chosen": 9.593833923339844, + "logits/rejected": 13.520515441894531, + "logps/chosen": -9.5872802734375, + "logps/rejected": -14.834587097167969, + "loss": 0.4919, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.03321126848459244, + "rewards/margins": 1.004889965057373, + "rewards/rejected": -0.9716786742210388, + "step": 127 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 25.714664571452275, + "learning_rate": 4.998906222326321e-07, + "logits/chosen": 10.212748527526855, + "logits/rejected": 8.447935104370117, + "logps/chosen": -14.084490776062012, + "logps/rejected": -20.66387939453125, + "loss": 0.5057, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07896807044744492, + "rewards/margins": 1.2706775665283203, + "rewards/rejected": -1.1917095184326172, + "step": 128 + }, + { + "epoch": 2.1864406779661016, + "grad_norm": 26.119485701445264, + "learning_rate": 4.99867654928266e-07, + "logits/chosen": 13.961820602416992, + "logits/rejected": 15.463374137878418, + "logps/chosen": -13.826520919799805, + "logps/rejected": -18.094711303710938, + "loss": 0.4806, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17537821829319, + "rewards/margins": 1.1025470495224, + "rewards/rejected": -0.9271686673164368, + "step": 129 + }, + { + "epoch": 2.2033898305084745, + "grad_norm": 25.992789405675488, + "learning_rate": 4.998425010687483e-07, + "logits/chosen": 10.33218002319336, + "logits/rejected": 9.227500915527344, + "logps/chosen": -10.892607688903809, + "logps/rejected": -20.151803970336914, + "loss": 0.5113, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04175170511007309, + "rewards/margins": 1.3892616033554077, + "rewards/rejected": -1.4310133457183838, + "step": 130 + }, + { + "epoch": 2.2203389830508473, + "grad_norm": 31.86911962760966, + "learning_rate": 4.998151608741969e-07, + "logits/chosen": 11.884110450744629, + "logits/rejected": 11.618425369262695, + "logps/chosen": -9.044775009155273, + "logps/rejected": -18.19227409362793, + "loss": 0.4792, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08620918542146683, + "rewards/margins": 1.41904616355896, + "rewards/rejected": -1.3328371047973633, + "step": 131 + }, + { + "epoch": 2.23728813559322, + "grad_norm": 25.915056331398198, + "learning_rate": 4.997856345838614e-07, + "logits/chosen": 10.366816520690918, + "logits/rejected": 11.786434173583984, + "logps/chosen": -10.501708030700684, + "logps/rejected": -17.16826629638672, + "loss": 0.4849, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08089535683393478, + "rewards/margins": 1.430103063583374, + "rewards/rejected": -1.3492075204849243, + "step": 132 + }, + { + "epoch": 2.2542372881355934, + "grad_norm": 26.862491847754576, + "learning_rate": 4.997539224561225e-07, + "logits/chosen": 9.815896987915039, + "logits/rejected": 8.910368919372559, + "logps/chosen": -7.234807014465332, + "logps/rejected": -10.03347110748291, + "loss": 0.4671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17896637320518494, + "rewards/margins": 0.49002087116241455, + "rewards/rejected": -0.3110544979572296, + "step": 133 + }, + { + "epoch": 2.2711864406779663, + "grad_norm": 25.040024723467038, + "learning_rate": 4.99720024768488e-07, + "logits/chosen": 11.472432136535645, + "logits/rejected": 12.454916000366211, + "logps/chosen": -11.175409317016602, + "logps/rejected": -14.160651206970215, + "loss": 0.4648, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15106943249702454, + "rewards/margins": 0.8142722845077515, + "rewards/rejected": -0.6632028818130493, + "step": 134 + }, + { + "epoch": 2.288135593220339, + "grad_norm": 28.03197887833454, + "learning_rate": 4.996839418175918e-07, + "logits/chosen": 13.856090545654297, + "logits/rejected": 13.056228637695312, + "logps/chosen": -9.581791877746582, + "logps/rejected": -19.141881942749023, + "loss": 0.4768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15028533339500427, + "rewards/margins": 1.267728328704834, + "rewards/rejected": -1.1174428462982178, + "step": 135 + }, + { + "epoch": 2.305084745762712, + "grad_norm": 24.660609353156655, + "learning_rate": 4.996456739191904e-07, + "logits/chosen": 12.02409839630127, + "logits/rejected": 11.062095642089844, + "logps/chosen": -9.458813667297363, + "logps/rejected": -14.94155216217041, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08914165943861008, + "rewards/margins": 0.9309415817260742, + "rewards/rejected": -0.8417999148368835, + "step": 136 + }, + { + "epoch": 2.3220338983050848, + "grad_norm": 26.028218734293073, + "learning_rate": 4.996052214081608e-07, + "logits/chosen": 9.56547737121582, + "logits/rejected": 12.38303279876709, + "logps/chosen": -9.920650482177734, + "logps/rejected": -13.999842643737793, + "loss": 0.5123, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08545555174350739, + "rewards/margins": 0.8639965057373047, + "rewards/rejected": -0.7785409688949585, + "step": 137 + }, + { + "epoch": 2.3389830508474576, + "grad_norm": 25.90341742150353, + "learning_rate": 4.995625846384966e-07, + "logits/chosen": 12.650365829467773, + "logits/rejected": 12.255863189697266, + "logps/chosen": -10.3493070602417, + "logps/rejected": -11.783307075500488, + "loss": 0.4901, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.14267519116401672, + "rewards/margins": 0.39001455903053284, + "rewards/rejected": -0.2473393678665161, + "step": 138 + }, + { + "epoch": 2.3559322033898304, + "grad_norm": 28.40837942784419, + "learning_rate": 4.995177639833061e-07, + "logits/chosen": 9.189220428466797, + "logits/rejected": 9.674077033996582, + "logps/chosen": -8.151750564575195, + "logps/rejected": -12.80848503112793, + "loss": 0.5066, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09756353497505188, + "rewards/margins": 0.5721523761749268, + "rewards/rejected": -0.4745888411998749, + "step": 139 + }, + { + "epoch": 2.3728813559322033, + "grad_norm": 26.068404683176787, + "learning_rate": 4.994707598348084e-07, + "logits/chosen": 11.096114158630371, + "logits/rejected": 9.946800231933594, + "logps/chosen": -9.776742935180664, + "logps/rejected": -19.20057487487793, + "loss": 0.5062, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0900912657380104, + "rewards/margins": 0.896281361579895, + "rewards/rejected": -0.806190013885498, + "step": 140 + }, + { + "epoch": 2.389830508474576, + "grad_norm": 26.279589404349043, + "learning_rate": 4.994215726043297e-07, + "logits/chosen": 13.786352157592773, + "logits/rejected": 12.066240310668945, + "logps/chosen": -10.624284744262695, + "logps/rejected": -15.999090194702148, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11691068112850189, + "rewards/margins": 0.8987638354301453, + "rewards/rejected": -1.0156744718551636, + "step": 141 + }, + { + "epoch": 2.406779661016949, + "grad_norm": 25.385659931811325, + "learning_rate": 4.993702027223003e-07, + "logits/chosen": 11.07485294342041, + "logits/rejected": 12.823025703430176, + "logps/chosen": -9.523748397827148, + "logps/rejected": -19.099346160888672, + "loss": 0.4461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11692683398723602, + "rewards/margins": 1.9571490287780762, + "rewards/rejected": -1.8402220010757446, + "step": 142 + }, + { + "epoch": 2.423728813559322, + "grad_norm": 28.17104878053314, + "learning_rate": 4.993166506382505e-07, + "logits/chosen": 12.44686508178711, + "logits/rejected": 13.07184886932373, + "logps/chosen": -7.501523971557617, + "logps/rejected": -12.960819244384766, + "loss": 0.5077, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08402667939662933, + "rewards/margins": 1.0415351390838623, + "rewards/rejected": -0.9575084447860718, + "step": 143 + }, + { + "epoch": 2.440677966101695, + "grad_norm": 25.091201340296504, + "learning_rate": 4.992609168208068e-07, + "logits/chosen": 5.851665019989014, + "logits/rejected": 9.226675033569336, + "logps/chosen": -13.629108428955078, + "logps/rejected": -12.965569496154785, + "loss": 0.5181, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13420507311820984, + "rewards/margins": 0.6294023990631104, + "rewards/rejected": -0.49519726634025574, + "step": 144 + }, + { + "epoch": 2.457627118644068, + "grad_norm": 23.73455464805867, + "learning_rate": 4.992030017576875e-07, + "logits/chosen": 11.717859268188477, + "logits/rejected": 14.757806777954102, + "logps/chosen": -14.967961311340332, + "logps/rejected": -22.604053497314453, + "loss": 0.4534, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0003174692392349243, + "rewards/margins": 1.5173003673553467, + "rewards/rejected": -1.5169827938079834, + "step": 145 + }, + { + "epoch": 2.4745762711864407, + "grad_norm": 24.58533088616242, + "learning_rate": 4.991429059556989e-07, + "logits/chosen": 11.908763885498047, + "logits/rejected": 12.790480613708496, + "logps/chosen": -12.356618881225586, + "logps/rejected": -15.337615966796875, + "loss": 0.4701, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.03432869166135788, + "rewards/margins": 1.1666860580444336, + "rewards/rejected": -1.201014757156372, + "step": 146 + }, + { + "epoch": 2.4915254237288136, + "grad_norm": 26.21401018621895, + "learning_rate": 4.990806299407305e-07, + "logits/chosen": 10.173805236816406, + "logits/rejected": 9.579325675964355, + "logps/chosen": -9.430241584777832, + "logps/rejected": -11.20843505859375, + "loss": 0.5049, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.10169175267219543, + "rewards/margins": 0.23200133442878723, + "rewards/rejected": -0.1303095817565918, + "step": 147 + }, + { + "epoch": 2.5084745762711864, + "grad_norm": 25.415342669771636, + "learning_rate": 4.990161742577506e-07, + "logits/chosen": 9.909467697143555, + "logits/rejected": 10.56566047668457, + "logps/chosen": -8.367213249206543, + "logps/rejected": -14.22795581817627, + "loss": 0.4641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1012047678232193, + "rewards/margins": 1.2070566415786743, + "rewards/rejected": -1.1058518886566162, + "step": 148 + }, + { + "epoch": 2.5254237288135593, + "grad_norm": 23.40734112242746, + "learning_rate": 4.989495394708015e-07, + "logits/chosen": 9.81765365600586, + "logits/rejected": 13.320455551147461, + "logps/chosen": -15.109245300292969, + "logps/rejected": -17.49165916442871, + "loss": 0.4431, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2697944641113281, + "rewards/margins": 1.2794241905212402, + "rewards/rejected": -1.0096296072006226, + "step": 149 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 24.624713192478026, + "learning_rate": 4.988807261629942e-07, + "logits/chosen": 10.242950439453125, + "logits/rejected": 12.858686447143555, + "logps/chosen": -10.181503295898438, + "logps/rejected": -11.994699478149414, + "loss": 0.4411, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.02304082363843918, + "rewards/margins": 1.0250604152679443, + "rewards/rejected": -1.0020196437835693, + "step": 150 + }, + { + "epoch": 2.559322033898305, + "grad_norm": 24.990088355481344, + "learning_rate": 4.988097349365039e-07, + "logits/chosen": 7.55122184753418, + "logits/rejected": 9.793380737304688, + "logps/chosen": -11.03127384185791, + "logps/rejected": -17.439645767211914, + "loss": 0.4557, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16880398988723755, + "rewards/margins": 0.9546310901641846, + "rewards/rejected": -0.7858270406723022, + "step": 151 + }, + { + "epoch": 2.576271186440678, + "grad_norm": 24.98304302249675, + "learning_rate": 4.987365664125646e-07, + "logits/chosen": 9.416153907775879, + "logits/rejected": 10.089354515075684, + "logps/chosen": -10.366312026977539, + "logps/rejected": -12.546512603759766, + "loss": 0.4712, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005661562085151672, + "rewards/margins": 0.5207768082618713, + "rewards/rejected": -0.5151152610778809, + "step": 152 + }, + { + "epoch": 2.593220338983051, + "grad_norm": 28.025863819113706, + "learning_rate": 4.986612212314632e-07, + "logits/chosen": 12.203873634338379, + "logits/rejected": 10.61597728729248, + "logps/chosen": -10.178508758544922, + "logps/rejected": -15.887847900390625, + "loss": 0.5206, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24399825930595398, + "rewards/margins": 0.9938374161720276, + "rewards/rejected": -0.749839186668396, + "step": 153 + }, + { + "epoch": 2.610169491525424, + "grad_norm": 25.025894346113112, + "learning_rate": 4.985837000525343e-07, + "logits/chosen": 12.996820449829102, + "logits/rejected": 11.24425220489502, + "logps/chosen": -7.2731475830078125, + "logps/rejected": -9.428850173950195, + "loss": 0.5125, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09597513824701309, + "rewards/margins": 0.3146960139274597, + "rewards/rejected": -0.21872088313102722, + "step": 154 + }, + { + "epoch": 2.6271186440677967, + "grad_norm": 23.275619582693544, + "learning_rate": 4.985040035541542e-07, + "logits/chosen": 13.19137191772461, + "logits/rejected": 11.988751411437988, + "logps/chosen": -9.661288261413574, + "logps/rejected": -16.59218406677246, + "loss": 0.4505, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.06454382091760635, + "rewards/margins": 1.3431710004806519, + "rewards/rejected": -1.2786271572113037, + "step": 155 + }, + { + "epoch": 2.6440677966101696, + "grad_norm": 24.401688438016123, + "learning_rate": 4.984221324337356e-07, + "logits/chosen": 12.422380447387695, + "logits/rejected": 11.99139404296875, + "logps/chosen": -7.512474536895752, + "logps/rejected": -15.746870040893555, + "loss": 0.4729, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.12945565581321716, + "rewards/margins": 1.181099534034729, + "rewards/rejected": -1.0516438484191895, + "step": 156 + }, + { + "epoch": 2.6610169491525424, + "grad_norm": 24.728783919464327, + "learning_rate": 4.983380874077204e-07, + "logits/chosen": 9.292040824890137, + "logits/rejected": 11.206062316894531, + "logps/chosen": -8.320369720458984, + "logps/rejected": -9.853104591369629, + "loss": 0.5238, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03576832637190819, + "rewards/margins": 0.6088409423828125, + "rewards/rejected": -0.5730725526809692, + "step": 157 + }, + { + "epoch": 2.6779661016949152, + "grad_norm": 25.05007863888729, + "learning_rate": 4.982518692115743e-07, + "logits/chosen": 9.616560935974121, + "logits/rejected": 10.010045051574707, + "logps/chosen": -9.049758911132812, + "logps/rejected": -12.729308128356934, + "loss": 0.4896, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08253885060548782, + "rewards/margins": 0.7497320175170898, + "rewards/rejected": -0.6671931743621826, + "step": 158 + }, + { + "epoch": 2.694915254237288, + "grad_norm": 25.318700331957388, + "learning_rate": 4.981634785997801e-07, + "logits/chosen": 10.468634605407715, + "logits/rejected": 10.967134475708008, + "logps/chosen": -9.911856651306152, + "logps/rejected": -12.62133502960205, + "loss": 0.4631, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09426818788051605, + "rewards/margins": 0.9589300751686096, + "rewards/rejected": -0.8646619319915771, + "step": 159 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 23.73451475971574, + "learning_rate": 4.980729163458311e-07, + "logits/chosen": 7.005346775054932, + "logits/rejected": 8.92467212677002, + "logps/chosen": -9.383186340332031, + "logps/rejected": -11.193894386291504, + "loss": 0.4974, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19052043557167053, + "rewards/margins": 0.8246796727180481, + "rewards/rejected": -0.6341591477394104, + "step": 160 + }, + { + "epoch": 2.7288135593220337, + "grad_norm": 23.8047083086088, + "learning_rate": 4.979801832422243e-07, + "logits/chosen": 12.244979858398438, + "logits/rejected": 12.103124618530273, + "logps/chosen": -6.470426559448242, + "logps/rejected": -9.479026794433594, + "loss": 0.4738, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04870966449379921, + "rewards/margins": 0.3689252734184265, + "rewards/rejected": -0.3202156126499176, + "step": 161 + }, + { + "epoch": 2.7457627118644066, + "grad_norm": 23.564234251226335, + "learning_rate": 4.978852801004533e-07, + "logits/chosen": 9.73720932006836, + "logits/rejected": 8.43739128112793, + "logps/chosen": -10.515070915222168, + "logps/rejected": -13.127669334411621, + "loss": 0.4756, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1742311269044876, + "rewards/margins": 0.537544310092926, + "rewards/rejected": -0.36331313848495483, + "step": 162 + }, + { + "epoch": 2.7627118644067794, + "grad_norm": 25.59634193506336, + "learning_rate": 4.977882077510018e-07, + "logits/chosen": 9.140570640563965, + "logits/rejected": 11.064943313598633, + "logps/chosen": -7.318413257598877, + "logps/rejected": -16.608783721923828, + "loss": 0.4596, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0647045448422432, + "rewards/margins": 1.8001742362976074, + "rewards/rejected": -1.7354698181152344, + "step": 163 + }, + { + "epoch": 2.7796610169491527, + "grad_norm": 23.29024669512435, + "learning_rate": 4.976889670433355e-07, + "logits/chosen": 7.721864700317383, + "logits/rejected": 11.411937713623047, + "logps/chosen": -13.955901145935059, + "logps/rejected": -17.241910934448242, + "loss": 0.4214, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09154816716909409, + "rewards/margins": 1.4261943101882935, + "rewards/rejected": -1.334646224975586, + "step": 164 + }, + { + "epoch": 2.7966101694915255, + "grad_norm": 24.3882056521309, + "learning_rate": 4.975875588458953e-07, + "logits/chosen": 9.531468391418457, + "logits/rejected": 9.778902053833008, + "logps/chosen": -13.27505111694336, + "logps/rejected": -13.49073314666748, + "loss": 0.5075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06451751291751862, + "rewards/margins": 0.2705305218696594, + "rewards/rejected": -0.33504801988601685, + "step": 165 + }, + { + "epoch": 2.8135593220338984, + "grad_norm": 23.673522300640947, + "learning_rate": 4.974839840460894e-07, + "logits/chosen": 12.992244720458984, + "logits/rejected": 12.923137664794922, + "logps/chosen": -5.916741371154785, + "logps/rejected": -12.199560165405273, + "loss": 0.4475, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09810908138751984, + "rewards/margins": 1.2945395708084106, + "rewards/rejected": -1.1964305639266968, + "step": 166 + }, + { + "epoch": 2.830508474576271, + "grad_norm": 26.17099733938422, + "learning_rate": 4.973782435502858e-07, + "logits/chosen": 11.463674545288086, + "logits/rejected": 11.526470184326172, + "logps/chosen": -11.625694274902344, + "logps/rejected": -18.08144760131836, + "loss": 0.4692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0962575376033783, + "rewards/margins": 1.393619418144226, + "rewards/rejected": -1.2973618507385254, + "step": 167 + }, + { + "epoch": 2.847457627118644, + "grad_norm": 23.375003008104915, + "learning_rate": 4.97270338283804e-07, + "logits/chosen": 6.848824501037598, + "logits/rejected": 6.401797294616699, + "logps/chosen": -7.540995121002197, + "logps/rejected": -10.878442764282227, + "loss": 0.4503, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08859586715698242, + "rewards/margins": 0.7138924598693848, + "rewards/rejected": -0.6252965927124023, + "step": 168 + }, + { + "epoch": 2.864406779661017, + "grad_norm": 24.319685449672303, + "learning_rate": 4.97160269190907e-07, + "logits/chosen": 11.042668342590332, + "logits/rejected": 12.226555824279785, + "logps/chosen": -9.243000030517578, + "logps/rejected": -10.139057159423828, + "loss": 0.4639, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1013827919960022, + "rewards/margins": 0.4762725234031677, + "rewards/rejected": -0.3748897314071655, + "step": 169 + }, + { + "epoch": 2.8813559322033897, + "grad_norm": 23.719800999038476, + "learning_rate": 4.970480372347933e-07, + "logits/chosen": 5.58452844619751, + "logits/rejected": 5.292469024658203, + "logps/chosen": -8.558345794677734, + "logps/rejected": -11.133590698242188, + "loss": 0.4829, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.12931324541568756, + "rewards/margins": 0.7034010887145996, + "rewards/rejected": -0.5740878582000732, + "step": 170 + }, + { + "epoch": 2.898305084745763, + "grad_norm": 26.325102228250078, + "learning_rate": 4.969336433975886e-07, + "logits/chosen": 10.18309211730957, + "logits/rejected": 9.86977767944336, + "logps/chosen": -9.139179229736328, + "logps/rejected": -16.562314987182617, + "loss": 0.512, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11633327603340149, + "rewards/margins": 0.8496271371841431, + "rewards/rejected": -0.733293890953064, + "step": 171 + }, + { + "epoch": 2.915254237288136, + "grad_norm": 23.30874461173125, + "learning_rate": 4.968170886803361e-07, + "logits/chosen": 6.269238471984863, + "logits/rejected": 5.880119323730469, + "logps/chosen": -11.019979476928711, + "logps/rejected": -14.050888061523438, + "loss": 0.4454, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15095578134059906, + "rewards/margins": 0.910696804523468, + "rewards/rejected": -0.759740948677063, + "step": 172 + }, + { + "epoch": 2.9322033898305087, + "grad_norm": 26.260317853292694, + "learning_rate": 4.966983741029893e-07, + "logits/chosen": 9.132369995117188, + "logits/rejected": 7.901233673095703, + "logps/chosen": -9.164523124694824, + "logps/rejected": -15.087239265441895, + "loss": 0.446, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06711259484291077, + "rewards/margins": 0.5440139174461365, + "rewards/rejected": -0.4769013524055481, + "step": 173 + }, + { + "epoch": 2.9491525423728815, + "grad_norm": 25.432426091848676, + "learning_rate": 4.965775007044019e-07, + "logits/chosen": 7.7744951248168945, + "logits/rejected": 12.220438003540039, + "logps/chosen": -12.57607650756836, + "logps/rejected": -19.119516372680664, + "loss": 0.4158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03255147486925125, + "rewards/margins": 1.7571169137954712, + "rewards/rejected": -1.7896684408187866, + "step": 174 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 24.66287767238989, + "learning_rate": 4.964544695423193e-07, + "logits/chosen": 9.675177574157715, + "logits/rejected": 10.935977935791016, + "logps/chosen": -8.286613464355469, + "logps/rejected": -9.868494033813477, + "loss": 0.5054, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0919422060251236, + "rewards/margins": 0.8523497581481934, + "rewards/rejected": -0.760407567024231, + "step": 175 + }, + { + "epoch": 2.983050847457627, + "grad_norm": 24.155134143035227, + "learning_rate": 4.963292816933691e-07, + "logits/chosen": 8.695963859558105, + "logits/rejected": 8.458662033081055, + "logps/chosen": -14.033685684204102, + "logps/rejected": -18.679027557373047, + "loss": 0.4945, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17841431498527527, + "rewards/margins": 1.1647181510925293, + "rewards/rejected": -0.9863038063049316, + "step": 176 + }, + { + "epoch": 3.0, + "grad_norm": 24.711170078854128, + "learning_rate": 4.96201938253052e-07, + "logits/chosen": 10.343132019042969, + "logits/rejected": 8.42280101776123, + "logps/chosen": -9.407011032104492, + "logps/rejected": -15.08864974975586, + "loss": 0.5274, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08468975871801376, + "rewards/margins": 0.7279720306396484, + "rewards/rejected": -0.6432822942733765, + "step": 177 + }, + { + "epoch": 3.016949152542373, + "grad_norm": 23.669291103097716, + "learning_rate": 4.960724403357314e-07, + "logits/chosen": 9.52169132232666, + "logits/rejected": 9.99760913848877, + "logps/chosen": -9.479093551635742, + "logps/rejected": -11.957143783569336, + "loss": 0.4462, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0010063312947750092, + "rewards/margins": 0.8099563717842102, + "rewards/rejected": -0.8089500665664673, + "step": 178 + }, + { + "epoch": 3.0338983050847457, + "grad_norm": 22.62026751615169, + "learning_rate": 4.959407890746248e-07, + "logits/chosen": 7.577289581298828, + "logits/rejected": 7.524887561798096, + "logps/chosen": -8.504217147827148, + "logps/rejected": -11.792627334594727, + "loss": 0.4502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2125588059425354, + "rewards/margins": 1.011501669883728, + "rewards/rejected": -0.7989429235458374, + "step": 179 + }, + { + "epoch": 3.0508474576271185, + "grad_norm": 20.80462624892399, + "learning_rate": 4.958069856217929e-07, + "logits/chosen": 8.124763488769531, + "logits/rejected": 8.247533798217773, + "logps/chosen": -8.557960510253906, + "logps/rejected": -12.606611251831055, + "loss": 0.4059, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12852689623832703, + "rewards/margins": 1.2782764434814453, + "rewards/rejected": -1.149749517440796, + "step": 180 + }, + { + "epoch": 3.0677966101694913, + "grad_norm": 21.12610029770411, + "learning_rate": 4.956710311481302e-07, + "logits/chosen": 9.805615425109863, + "logits/rejected": 10.935819625854492, + "logps/chosen": -8.481396675109863, + "logps/rejected": -19.484060287475586, + "loss": 0.4014, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.007218081504106522, + "rewards/margins": 1.4923968315124512, + "rewards/rejected": -1.4851785898208618, + "step": 181 + }, + { + "epoch": 3.084745762711864, + "grad_norm": 21.84840584181411, + "learning_rate": 4.955329268433542e-07, + "logits/chosen": 8.906585693359375, + "logits/rejected": 7.750245571136475, + "logps/chosen": -11.109460830688477, + "logps/rejected": -12.152922630310059, + "loss": 0.4288, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04717100411653519, + "rewards/margins": 1.006048321723938, + "rewards/rejected": -0.9588773846626282, + "step": 182 + }, + { + "epoch": 3.1016949152542375, + "grad_norm": 23.956931308245203, + "learning_rate": 4.953926739159956e-07, + "logits/chosen": 11.974356651306152, + "logits/rejected": 13.56727409362793, + "logps/chosen": -9.361337661743164, + "logps/rejected": -15.33359146118164, + "loss": 0.4187, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0379180982708931, + "rewards/margins": 1.3598583936691284, + "rewards/rejected": -1.3219404220581055, + "step": 183 + }, + { + "epoch": 3.1186440677966103, + "grad_norm": 27.743137262738262, + "learning_rate": 4.952502735933869e-07, + "logits/chosen": 8.663519859313965, + "logits/rejected": 8.262752532958984, + "logps/chosen": -11.083414077758789, + "logps/rejected": -18.736698150634766, + "loss": 0.4226, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.21909113228321075, + "rewards/margins": 1.2035876512527466, + "rewards/rejected": -0.9844965934753418, + "step": 184 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 22.371627214758053, + "learning_rate": 4.951057271216525e-07, + "logits/chosen": 11.679509162902832, + "logits/rejected": 8.517683029174805, + "logps/chosen": -8.450439453125, + "logps/rejected": -15.144365310668945, + "loss": 0.4182, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21107473969459534, + "rewards/margins": 1.0494379997253418, + "rewards/rejected": -0.8383632302284241, + "step": 185 + }, + { + "epoch": 3.152542372881356, + "grad_norm": 22.7306705476798, + "learning_rate": 4.949590357656974e-07, + "logits/chosen": 12.699131965637207, + "logits/rejected": 10.048919677734375, + "logps/chosen": -10.202484130859375, + "logps/rejected": -18.82345199584961, + "loss": 0.3911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15570127964019775, + "rewards/margins": 1.4168462753295898, + "rewards/rejected": -1.2611451148986816, + "step": 186 + }, + { + "epoch": 3.169491525423729, + "grad_norm": 23.344069957384136, + "learning_rate": 4.948102008091962e-07, + "logits/chosen": 12.0389986038208, + "logits/rejected": 12.07477855682373, + "logps/chosen": -9.893170356750488, + "logps/rejected": -17.06765365600586, + "loss": 0.3773, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.21715691685676575, + "rewards/margins": 1.2834587097167969, + "rewards/rejected": -1.0663018226623535, + "step": 187 + }, + { + "epoch": 3.1864406779661016, + "grad_norm": 21.858930833074545, + "learning_rate": 4.946592235545815e-07, + "logits/chosen": 10.424958229064941, + "logits/rejected": 10.42190170288086, + "logps/chosen": -14.09011459350586, + "logps/rejected": -17.815031051635742, + "loss": 0.4075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21966154873371124, + "rewards/margins": 1.0660887956619263, + "rewards/rejected": -0.8464272618293762, + "step": 188 + }, + { + "epoch": 3.2033898305084745, + "grad_norm": 22.597248903481418, + "learning_rate": 4.945061053230333e-07, + "logits/chosen": 10.407960891723633, + "logits/rejected": 7.39249324798584, + "logps/chosen": -10.86440372467041, + "logps/rejected": -25.083696365356445, + "loss": 0.4143, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04429125040769577, + "rewards/margins": 2.1463186740875244, + "rewards/rejected": -2.102027416229248, + "step": 189 + }, + { + "epoch": 3.2203389830508473, + "grad_norm": 20.70136919634923, + "learning_rate": 4.943508474544666e-07, + "logits/chosen": 9.571917533874512, + "logits/rejected": 11.19800090789795, + "logps/chosen": -7.767718315124512, + "logps/rejected": -14.589805603027344, + "loss": 0.3856, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17417725920677185, + "rewards/margins": 1.5476055145263672, + "rewards/rejected": -1.3734283447265625, + "step": 190 + }, + { + "epoch": 3.23728813559322, + "grad_norm": 22.135267167333776, + "learning_rate": 4.941934513075204e-07, + "logits/chosen": 4.429686069488525, + "logits/rejected": 9.858856201171875, + "logps/chosen": -15.572264671325684, + "logps/rejected": -16.965652465820312, + "loss": 0.4091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21184024214744568, + "rewards/margins": 1.4814664125442505, + "rewards/rejected": -1.269626259803772, + "step": 191 + }, + { + "epoch": 3.2542372881355934, + "grad_norm": 21.499041506406705, + "learning_rate": 4.94033918259545e-07, + "logits/chosen": 13.0203275680542, + "logits/rejected": 11.126030921936035, + "logps/chosen": -9.320621490478516, + "logps/rejected": -12.93569278717041, + "loss": 0.4398, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15274274349212646, + "rewards/margins": 1.1112730503082275, + "rewards/rejected": -0.9585303068161011, + "step": 192 + }, + { + "epoch": 3.2711864406779663, + "grad_norm": 20.87416005163883, + "learning_rate": 4.938722497065909e-07, + "logits/chosen": 8.742778778076172, + "logits/rejected": 11.352007865905762, + "logps/chosen": -10.20619010925293, + "logps/rejected": -11.195541381835938, + "loss": 0.3891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18298028409481049, + "rewards/margins": 1.1817280054092407, + "rewards/rejected": -0.9987477660179138, + "step": 193 + }, + { + "epoch": 3.288135593220339, + "grad_norm": 22.195177566377918, + "learning_rate": 4.937084470633958e-07, + "logits/chosen": 8.346685409545898, + "logits/rejected": 12.437751770019531, + "logps/chosen": -10.978531837463379, + "logps/rejected": -16.579553604125977, + "loss": 0.3915, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1756804883480072, + "rewards/margins": 1.4822646379470825, + "rewards/rejected": -1.306584119796753, + "step": 194 + }, + { + "epoch": 3.305084745762712, + "grad_norm": 21.792253504042495, + "learning_rate": 4.935425117633726e-07, + "logits/chosen": 9.004284858703613, + "logits/rejected": 9.3473482131958, + "logps/chosen": -9.778764724731445, + "logps/rejected": -11.804350852966309, + "loss": 0.4259, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.18937112390995026, + "rewards/margins": 0.568367063999176, + "rewards/rejected": -0.37899595499038696, + "step": 195 + }, + { + "epoch": 3.3220338983050848, + "grad_norm": 22.229138483410182, + "learning_rate": 4.933744452585966e-07, + "logits/chosen": 7.870884895324707, + "logits/rejected": 10.313871383666992, + "logps/chosen": -9.134467124938965, + "logps/rejected": -12.303489685058594, + "loss": 0.4394, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12609566748142242, + "rewards/margins": 1.0664055347442627, + "rewards/rejected": -0.9403098821640015, + "step": 196 + }, + { + "epoch": 3.3389830508474576, + "grad_norm": 23.149619465198537, + "learning_rate": 4.932042490197933e-07, + "logits/chosen": 3.4697465896606445, + "logits/rejected": 6.568927764892578, + "logps/chosen": -14.070352554321289, + "logps/rejected": -14.763322830200195, + "loss": 0.4446, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24967879056930542, + "rewards/margins": 1.045470952987671, + "rewards/rejected": -0.7957921624183655, + "step": 197 + }, + { + "epoch": 3.3559322033898304, + "grad_norm": 22.984979929041984, + "learning_rate": 4.930319245363248e-07, + "logits/chosen": 9.373580932617188, + "logits/rejected": 9.523336410522461, + "logps/chosen": -5.632663726806641, + "logps/rejected": -13.255867004394531, + "loss": 0.4067, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13241903483867645, + "rewards/margins": 1.1132423877716064, + "rewards/rejected": -0.9808233380317688, + "step": 198 + }, + { + "epoch": 3.3728813559322033, + "grad_norm": 20.69437622287786, + "learning_rate": 4.928574733161775e-07, + "logits/chosen": 7.692915439605713, + "logits/rejected": 7.09061861038208, + "logps/chosen": -7.571112632751465, + "logps/rejected": -13.790243148803711, + "loss": 0.4096, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20739738643169403, + "rewards/margins": 1.091109037399292, + "rewards/rejected": -0.8837117552757263, + "step": 199 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 19.60119003237251, + "learning_rate": 4.926808968859483e-07, + "logits/chosen": 5.386170387268066, + "logits/rejected": 7.581046104431152, + "logps/chosen": -8.80381965637207, + "logps/rejected": -14.835564613342285, + "loss": 0.3412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20112082362174988, + "rewards/margins": 1.5490384101867676, + "rewards/rejected": -1.3479175567626953, + "step": 200 + }, + { + "epoch": 3.406779661016949, + "grad_norm": 22.208560067748735, + "learning_rate": 4.925021967908316e-07, + "logits/chosen": 7.9428558349609375, + "logits/rejected": 7.730855464935303, + "logps/chosen": -7.504685878753662, + "logps/rejected": -10.529887199401855, + "loss": 0.414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12443725764751434, + "rewards/margins": 0.5505168437957764, + "rewards/rejected": -0.4260796010494232, + "step": 201 + }, + { + "epoch": 3.423728813559322, + "grad_norm": 20.495015232824937, + "learning_rate": 4.923213745946059e-07, + "logits/chosen": 9.569754600524902, + "logits/rejected": 10.065143585205078, + "logps/chosen": -6.785487174987793, + "logps/rejected": -17.289140701293945, + "loss": 0.3793, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2570599913597107, + "rewards/margins": 1.8961150646209717, + "rewards/rejected": -1.6390550136566162, + "step": 202 + }, + { + "epoch": 3.440677966101695, + "grad_norm": 21.481240539670026, + "learning_rate": 4.921384318796193e-07, + "logits/chosen": 10.811732292175293, + "logits/rejected": 13.857377052307129, + "logps/chosen": -11.36143684387207, + "logps/rejected": -14.535248756408691, + "loss": 0.4095, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20432500541210175, + "rewards/margins": 1.145780086517334, + "rewards/rejected": -0.9414551258087158, + "step": 203 + }, + { + "epoch": 3.457627118644068, + "grad_norm": 19.51511374436272, + "learning_rate": 4.919533702467771e-07, + "logits/chosen": 8.838293075561523, + "logits/rejected": 11.331209182739258, + "logps/chosen": -9.482587814331055, + "logps/rejected": -17.566062927246094, + "loss": 0.3544, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11788967251777649, + "rewards/margins": 1.833619475364685, + "rewards/rejected": -1.7157299518585205, + "step": 204 + }, + { + "epoch": 3.4745762711864407, + "grad_norm": 20.794868312344995, + "learning_rate": 4.91766191315526e-07, + "logits/chosen": 7.887833595275879, + "logits/rejected": 10.80614948272705, + "logps/chosen": -12.706953048706055, + "logps/rejected": -16.633255004882812, + "loss": 0.3632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.137722447514534, + "rewards/margins": 1.4804778099060059, + "rewards/rejected": -1.3427555561065674, + "step": 205 + }, + { + "epoch": 3.4915254237288136, + "grad_norm": 22.111290792251523, + "learning_rate": 4.915768967238417e-07, + "logits/chosen": 6.357418060302734, + "logits/rejected": 7.8472113609313965, + "logps/chosen": -10.277403831481934, + "logps/rejected": -10.781571388244629, + "loss": 0.4027, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.32281461358070374, + "rewards/margins": 0.8708096742630005, + "rewards/rejected": -0.5479950904846191, + "step": 206 + }, + { + "epoch": 3.5084745762711864, + "grad_norm": 22.2878678999551, + "learning_rate": 4.913854881282131e-07, + "logits/chosen": 10.417675018310547, + "logits/rejected": 10.893251419067383, + "logps/chosen": -9.740762710571289, + "logps/rejected": -15.510869979858398, + "loss": 0.3839, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.055107712745666504, + "rewards/margins": 1.458693504333496, + "rewards/rejected": -1.4035859107971191, + "step": 207 + }, + { + "epoch": 3.5254237288135593, + "grad_norm": 20.99161748301627, + "learning_rate": 4.91191967203629e-07, + "logits/chosen": 6.382161617279053, + "logits/rejected": 4.985961437225342, + "logps/chosen": -7.836081027984619, + "logps/rejected": -13.391523361206055, + "loss": 0.4012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1940561830997467, + "rewards/margins": 1.115774154663086, + "rewards/rejected": -0.9217178821563721, + "step": 208 + }, + { + "epoch": 3.542372881355932, + "grad_norm": 20.40271917104514, + "learning_rate": 4.909963356435624e-07, + "logits/chosen": 9.568527221679688, + "logits/rejected": 10.793418884277344, + "logps/chosen": -7.237433433532715, + "logps/rejected": -18.871238708496094, + "loss": 0.3936, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11054301261901855, + "rewards/margins": 2.871230125427246, + "rewards/rejected": -2.7606871128082275, + "step": 209 + }, + { + "epoch": 3.559322033898305, + "grad_norm": 21.549650672810273, + "learning_rate": 4.907985951599563e-07, + "logits/chosen": 7.44000768661499, + "logits/rejected": 10.265032768249512, + "logps/chosen": -9.132471084594727, + "logps/rejected": -13.506124496459961, + "loss": 0.3843, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16352702677249908, + "rewards/margins": 0.9725521802902222, + "rewards/rejected": -0.8090251684188843, + "step": 210 + }, + { + "epoch": 3.576271186440678, + "grad_norm": 20.570670881015133, + "learning_rate": 4.905987474832087e-07, + "logits/chosen": 9.317378044128418, + "logits/rejected": 5.925313472747803, + "logps/chosen": -14.4925537109375, + "logps/rejected": -20.435415267944336, + "loss": 0.3668, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1997138261795044, + "rewards/margins": 1.3157970905303955, + "rewards/rejected": -1.1160831451416016, + "step": 211 + }, + { + "epoch": 3.593220338983051, + "grad_norm": 20.97327386001296, + "learning_rate": 4.903967943621573e-07, + "logits/chosen": 6.413149833679199, + "logits/rejected": 9.610268592834473, + "logps/chosen": -13.387876510620117, + "logps/rejected": -18.008541107177734, + "loss": 0.3388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2004997879266739, + "rewards/margins": 2.024320363998413, + "rewards/rejected": -1.8238208293914795, + "step": 212 + }, + { + "epoch": 3.610169491525424, + "grad_norm": 20.265155078908585, + "learning_rate": 4.901927375640642e-07, + "logits/chosen": 7.389824390411377, + "logits/rejected": 9.059728622436523, + "logps/chosen": -8.919748306274414, + "logps/rejected": -15.606636047363281, + "loss": 0.3897, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.26465845108032227, + "rewards/margins": 1.466994285583496, + "rewards/rejected": -1.2023358345031738, + "step": 213 + }, + { + "epoch": 3.6271186440677967, + "grad_norm": 20.578725152301015, + "learning_rate": 4.899865788746005e-07, + "logits/chosen": 8.600379943847656, + "logits/rejected": 10.468989372253418, + "logps/chosen": -12.677905082702637, + "logps/rejected": -19.546112060546875, + "loss": 0.3711, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18351367115974426, + "rewards/margins": 2.7776191234588623, + "rewards/rejected": -2.5941052436828613, + "step": 214 + }, + { + "epoch": 3.6440677966101696, + "grad_norm": 18.091261285176355, + "learning_rate": 4.897783200978305e-07, + "logits/chosen": 7.124416828155518, + "logits/rejected": 10.905740737915039, + "logps/chosen": -10.265155792236328, + "logps/rejected": -13.548370361328125, + "loss": 0.3485, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31245559453964233, + "rewards/margins": 1.6994444131851196, + "rewards/rejected": -1.386988878250122, + "step": 215 + }, + { + "epoch": 3.6610169491525424, + "grad_norm": 21.908270820999903, + "learning_rate": 4.895679630561963e-07, + "logits/chosen": 9.098257064819336, + "logits/rejected": 8.591686248779297, + "logps/chosen": -8.658933639526367, + "logps/rejected": -12.28408145904541, + "loss": 0.3971, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11362498998641968, + "rewards/margins": 1.1206563711166382, + "rewards/rejected": -1.0070313215255737, + "step": 216 + }, + { + "epoch": 3.6779661016949152, + "grad_norm": 22.830751729597864, + "learning_rate": 4.893555095905013e-07, + "logits/chosen": 5.396052837371826, + "logits/rejected": 7.402496337890625, + "logps/chosen": -12.782821655273438, + "logps/rejected": -17.051944732666016, + "loss": 0.3867, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31852054595947266, + "rewards/margins": 1.6121286153793335, + "rewards/rejected": -1.2936079502105713, + "step": 217 + }, + { + "epoch": 3.694915254237288, + "grad_norm": 20.000263876312037, + "learning_rate": 4.891409615598949e-07, + "logits/chosen": 6.522953033447266, + "logits/rejected": 9.26037883758545, + "logps/chosen": -9.460394859313965, + "logps/rejected": -14.80904769897461, + "loss": 0.3662, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2533794939517975, + "rewards/margins": 1.6217294931411743, + "rewards/rejected": -1.3683499097824097, + "step": 218 + }, + { + "epoch": 3.711864406779661, + "grad_norm": 20.593308405599913, + "learning_rate": 4.889243208418549e-07, + "logits/chosen": 6.3479533195495605, + "logits/rejected": 5.238455295562744, + "logps/chosen": -11.332074165344238, + "logps/rejected": -16.5391845703125, + "loss": 0.3812, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03672366216778755, + "rewards/margins": 1.5255634784698486, + "rewards/rejected": -1.488839864730835, + "step": 219 + }, + { + "epoch": 3.7288135593220337, + "grad_norm": 20.145188470497885, + "learning_rate": 4.88705589332173e-07, + "logits/chosen": 7.08650016784668, + "logits/rejected": 8.531864166259766, + "logps/chosen": -7.60097074508667, + "logps/rejected": -11.326334953308105, + "loss": 0.3778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3768389821052551, + "rewards/margins": 1.0568028688430786, + "rewards/rejected": -0.6799638271331787, + "step": 220 + }, + { + "epoch": 3.7457627118644066, + "grad_norm": 24.41361721957706, + "learning_rate": 4.884847689449361e-07, + "logits/chosen": 9.343852043151855, + "logits/rejected": 6.67693567276001, + "logps/chosen": -9.532071113586426, + "logps/rejected": -20.449419021606445, + "loss": 0.4415, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2507076859474182, + "rewards/margins": 1.9644031524658203, + "rewards/rejected": -1.7136951684951782, + "step": 221 + }, + { + "epoch": 3.7627118644067794, + "grad_norm": 20.511147442858544, + "learning_rate": 4.88261861612511e-07, + "logits/chosen": 6.1774139404296875, + "logits/rejected": 8.693314552307129, + "logps/chosen": -10.729753494262695, + "logps/rejected": -13.79563045501709, + "loss": 0.3302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20722553133964539, + "rewards/margins": 1.8044683933258057, + "rewards/rejected": -1.597243070602417, + "step": 222 + }, + { + "epoch": 3.7796610169491527, + "grad_norm": 21.53866158472204, + "learning_rate": 4.880368692855273e-07, + "logits/chosen": 2.2718300819396973, + "logits/rejected": 7.136542797088623, + "logps/chosen": -12.451478958129883, + "logps/rejected": -18.369718551635742, + "loss": 0.3836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11350546777248383, + "rewards/margins": 1.4630939960479736, + "rewards/rejected": -1.3495887517929077, + "step": 223 + }, + { + "epoch": 3.7966101694915255, + "grad_norm": 21.097920842210723, + "learning_rate": 4.878097939328596e-07, + "logits/chosen": 7.12879753112793, + "logits/rejected": 7.652994632720947, + "logps/chosen": -10.000338554382324, + "logps/rejected": -11.711467742919922, + "loss": 0.3724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22102656960487366, + "rewards/margins": 1.0897376537322998, + "rewards/rejected": -0.868710994720459, + "step": 224 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 20.22398030765245, + "learning_rate": 4.875806375416109e-07, + "logits/chosen": 8.740015029907227, + "logits/rejected": 13.572087287902832, + "logps/chosen": -11.432252883911133, + "logps/rejected": -12.60297679901123, + "loss": 0.376, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20887497067451477, + "rewards/margins": 1.4403488636016846, + "rewards/rejected": -1.2314739227294922, + "step": 225 + }, + { + "epoch": 3.830508474576271, + "grad_norm": 20.621677218225578, + "learning_rate": 4.873494021170954e-07, + "logits/chosen": 9.282068252563477, + "logits/rejected": 10.690045356750488, + "logps/chosen": -8.891980171203613, + "logps/rejected": -12.23794174194336, + "loss": 0.3845, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015214920043945312, + "rewards/margins": 1.3658978939056396, + "rewards/rejected": -1.3506828546524048, + "step": 226 + }, + { + "epoch": 3.847457627118644, + "grad_norm": 19.999247373468872, + "learning_rate": 4.871160896828199e-07, + "logits/chosen": 7.256874084472656, + "logits/rejected": 10.629111289978027, + "logps/chosen": -11.401512145996094, + "logps/rejected": -16.319957733154297, + "loss": 0.3531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39265120029449463, + "rewards/margins": 2.058181047439575, + "rewards/rejected": -1.665529727935791, + "step": 227 + }, + { + "epoch": 3.864406779661017, + "grad_norm": 21.02660956059569, + "learning_rate": 4.868807022804678e-07, + "logits/chosen": 8.826285362243652, + "logits/rejected": 7.1804375648498535, + "logps/chosen": -8.303820610046387, + "logps/rejected": -18.380210876464844, + "loss": 0.3719, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27124762535095215, + "rewards/margins": 1.8690426349639893, + "rewards/rejected": -1.5977948904037476, + "step": 228 + }, + { + "epoch": 3.8813559322033897, + "grad_norm": 21.93005411442054, + "learning_rate": 4.866432419698792e-07, + "logits/chosen": 3.6723451614379883, + "logits/rejected": 4.766155242919922, + "logps/chosen": -10.370809555053711, + "logps/rejected": -13.244355201721191, + "loss": 0.409, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05697673559188843, + "rewards/margins": 1.2714791297912598, + "rewards/rejected": -1.2145024538040161, + "step": 229 + }, + { + "epoch": 3.898305084745763, + "grad_norm": 20.227968165373298, + "learning_rate": 4.864037108290347e-07, + "logits/chosen": 8.437172889709473, + "logits/rejected": 7.416140556335449, + "logps/chosen": -9.189136505126953, + "logps/rejected": -20.316320419311523, + "loss": 0.3529, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09363769739866257, + "rewards/margins": 2.3706040382385254, + "rewards/rejected": -2.2769663333892822, + "step": 230 + }, + { + "epoch": 3.915254237288136, + "grad_norm": 21.82010790729604, + "learning_rate": 4.86162110954036e-07, + "logits/chosen": 2.241933822631836, + "logits/rejected": 4.145862579345703, + "logps/chosen": -9.749147415161133, + "logps/rejected": -9.986723899841309, + "loss": 0.4128, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24059665203094482, + "rewards/margins": 0.7164955139160156, + "rewards/rejected": -0.4758988916873932, + "step": 231 + }, + { + "epoch": 3.9322033898305087, + "grad_norm": 19.529430603984025, + "learning_rate": 4.859184444590881e-07, + "logits/chosen": 7.928268909454346, + "logits/rejected": 7.564013481140137, + "logps/chosen": -9.2333345413208, + "logps/rejected": -11.309907913208008, + "loss": 0.3496, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2154204249382019, + "rewards/margins": 0.6944783926010132, + "rewards/rejected": -0.47905805706977844, + "step": 232 + }, + { + "epoch": 3.9491525423728815, + "grad_norm": 20.910743589602404, + "learning_rate": 4.856727134764809e-07, + "logits/chosen": 8.220526695251465, + "logits/rejected": 9.600720405578613, + "logps/chosen": -6.642482757568359, + "logps/rejected": -15.970149993896484, + "loss": 0.3602, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.06534373760223389, + "rewards/margins": 1.8218185901641846, + "rewards/rejected": -1.7564747333526611, + "step": 233 + }, + { + "epoch": 3.9661016949152543, + "grad_norm": 20.74274651410422, + "learning_rate": 4.8542492015657e-07, + "logits/chosen": 10.452482223510742, + "logits/rejected": 13.264217376708984, + "logps/chosen": -11.146896362304688, + "logps/rejected": -18.784910202026367, + "loss": 0.4025, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20790499448776245, + "rewards/margins": 2.184065341949463, + "rewards/rejected": -1.9761605262756348, + "step": 234 + }, + { + "epoch": 3.983050847457627, + "grad_norm": 21.513229116400385, + "learning_rate": 4.851750666677583e-07, + "logits/chosen": 9.31413745880127, + "logits/rejected": 8.258987426757812, + "logps/chosen": -8.405204772949219, + "logps/rejected": -13.575927734375, + "loss": 0.4065, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12339913845062256, + "rewards/margins": 0.7723197937011719, + "rewards/rejected": -0.6489205956459045, + "step": 235 + }, + { + "epoch": 4.0, + "grad_norm": 21.694593284683215, + "learning_rate": 4.849231551964771e-07, + "logits/chosen": 3.4820375442504883, + "logits/rejected": 8.22888469696045, + "logps/chosen": -10.688333511352539, + "logps/rejected": -16.323883056640625, + "loss": 0.3582, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1764301061630249, + "rewards/margins": 2.344557285308838, + "rewards/rejected": -2.1681272983551025, + "step": 236 + }, + { + "epoch": 4.016949152542373, + "grad_norm": 17.165516200116453, + "learning_rate": 4.846691879471666e-07, + "logits/chosen": 7.24074125289917, + "logits/rejected": 10.560139656066895, + "logps/chosen": -12.36288833618164, + "logps/rejected": -17.59840202331543, + "loss": 0.2777, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20638218522071838, + "rewards/margins": 2.1878502368927, + "rewards/rejected": -1.9814679622650146, + "step": 237 + }, + { + "epoch": 4.033898305084746, + "grad_norm": 18.267129061896295, + "learning_rate": 4.844131671422569e-07, + "logits/chosen": 4.005539417266846, + "logits/rejected": 6.256585121154785, + "logps/chosen": -9.463366508483887, + "logps/rejected": -15.193839073181152, + "loss": 0.3557, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3415685296058655, + "rewards/margins": 1.9723048210144043, + "rewards/rejected": -1.630736231803894, + "step": 238 + }, + { + "epoch": 4.0508474576271185, + "grad_norm": 18.230196378772558, + "learning_rate": 4.841550950221485e-07, + "logits/chosen": 6.972199440002441, + "logits/rejected": 8.425640106201172, + "logps/chosen": -12.009435653686523, + "logps/rejected": -17.968767166137695, + "loss": 0.3264, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10153679549694061, + "rewards/margins": 1.6484508514404297, + "rewards/rejected": -1.546913981437683, + "step": 239 + }, + { + "epoch": 4.067796610169491, + "grad_norm": 17.815979438227107, + "learning_rate": 4.838949738451928e-07, + "logits/chosen": 7.805523872375488, + "logits/rejected": 11.780006408691406, + "logps/chosen": -12.637031555175781, + "logps/rejected": -20.631893157958984, + "loss": 0.3414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.028789594769477844, + "rewards/margins": 2.5881569385528564, + "rewards/rejected": -2.5593671798706055, + "step": 240 + }, + { + "epoch": 4.084745762711864, + "grad_norm": 16.67503277302023, + "learning_rate": 4.836328058876717e-07, + "logits/chosen": 4.916990756988525, + "logits/rejected": 5.723315238952637, + "logps/chosen": -10.50915241241455, + "logps/rejected": -12.09021282196045, + "loss": 0.3289, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.26938343048095703, + "rewards/margins": 1.0055718421936035, + "rewards/rejected": -0.7361884713172913, + "step": 241 + }, + { + "epoch": 4.101694915254237, + "grad_norm": 18.459637642971867, + "learning_rate": 4.833685934437787e-07, + "logits/chosen": 8.218949317932129, + "logits/rejected": 4.173165321350098, + "logps/chosen": -9.293395042419434, + "logps/rejected": -15.193696022033691, + "loss": 0.3194, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3008555769920349, + "rewards/margins": 1.5985466241836548, + "rewards/rejected": -1.2976911067962646, + "step": 242 + }, + { + "epoch": 4.11864406779661, + "grad_norm": 16.84514978201183, + "learning_rate": 4.831023388255979e-07, + "logits/chosen": 8.695550918579102, + "logits/rejected": 11.430813789367676, + "logps/chosen": -9.363493919372559, + "logps/rejected": -18.699190139770508, + "loss": 0.2861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11874284595251083, + "rewards/margins": 2.3261449337005615, + "rewards/rejected": -2.207401990890503, + "step": 243 + }, + { + "epoch": 4.135593220338983, + "grad_norm": 17.815881783852166, + "learning_rate": 4.828340443630846e-07, + "logits/chosen": 5.674130916595459, + "logits/rejected": 6.590822219848633, + "logps/chosen": -8.572710037231445, + "logps/rejected": -16.258394241333008, + "loss": 0.3364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26820075511932373, + "rewards/margins": 1.8350509405136108, + "rewards/rejected": -1.566850185394287, + "step": 244 + }, + { + "epoch": 4.1525423728813555, + "grad_norm": 19.076762128934657, + "learning_rate": 4.825637124040441e-07, + "logits/chosen": 6.261494159698486, + "logits/rejected": 10.208662033081055, + "logps/chosen": -12.498882293701172, + "logps/rejected": -19.198043823242188, + "loss": 0.352, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23819471895694733, + "rewards/margins": 2.1077005863189697, + "rewards/rejected": -1.8695058822631836, + "step": 245 + }, + { + "epoch": 4.169491525423728, + "grad_norm": 19.346185362099984, + "learning_rate": 4.822913453141117e-07, + "logits/chosen": 6.8853654861450195, + "logits/rejected": 6.2803955078125, + "logps/chosen": -10.658153533935547, + "logps/rejected": -18.49957847595215, + "loss": 0.3353, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1655520498752594, + "rewards/margins": 2.306309223175049, + "rewards/rejected": -2.1407570838928223, + "step": 246 + }, + { + "epoch": 4.186440677966102, + "grad_norm": 18.617703119188207, + "learning_rate": 4.820169454767318e-07, + "logits/chosen": 8.95741081237793, + "logits/rejected": 10.199117660522461, + "logps/chosen": -9.932829856872559, + "logps/rejected": -17.307279586791992, + "loss": 0.3198, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33301711082458496, + "rewards/margins": 1.9417872428894043, + "rewards/rejected": -1.6087702512741089, + "step": 247 + }, + { + "epoch": 4.203389830508475, + "grad_norm": 19.448914932645447, + "learning_rate": 4.81740515293137e-07, + "logits/chosen": 4.610191345214844, + "logits/rejected": 6.457115650177002, + "logps/chosen": -10.178339958190918, + "logps/rejected": -14.412067413330078, + "loss": 0.3339, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.337657630443573, + "rewards/margins": 1.3123055696487427, + "rewards/rejected": -0.9746479988098145, + "step": 248 + }, + { + "epoch": 4.220338983050848, + "grad_norm": 19.07405219931799, + "learning_rate": 4.814620571823274e-07, + "logits/chosen": 3.8473613262176514, + "logits/rejected": 4.449122428894043, + "logps/chosen": -13.519302368164062, + "logps/rejected": -16.95071792602539, + "loss": 0.3406, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37721583247184753, + "rewards/margins": 1.67903470993042, + "rewards/rejected": -1.3018189668655396, + "step": 249 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 16.24922812475198, + "learning_rate": 4.811815735810489e-07, + "logits/chosen": 7.148613929748535, + "logits/rejected": 6.880765914916992, + "logps/chosen": -11.212467193603516, + "logps/rejected": -19.45187759399414, + "loss": 0.2912, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3127380907535553, + "rewards/margins": 2.7761077880859375, + "rewards/rejected": -2.463369846343994, + "step": 250 + }, + { + "epoch": 4.254237288135593, + "grad_norm": 17.553230728699816, + "learning_rate": 4.808990669437724e-07, + "logits/chosen": 7.376377105712891, + "logits/rejected": 6.04884672164917, + "logps/chosen": -9.064533233642578, + "logps/rejected": -17.936084747314453, + "loss": 0.2991, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.027374181896448135, + "rewards/margins": 2.2804818153381348, + "rewards/rejected": -2.253107786178589, + "step": 251 + }, + { + "epoch": 4.271186440677966, + "grad_norm": 17.955648025407335, + "learning_rate": 4.806145397426719e-07, + "logits/chosen": 9.423786163330078, + "logits/rejected": 10.659478187561035, + "logps/chosen": -7.087960243225098, + "logps/rejected": -12.86429500579834, + "loss": 0.3305, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2393162101507187, + "rewards/margins": 1.852463960647583, + "rewards/rejected": -1.6131477355957031, + "step": 252 + }, + { + "epoch": 4.288135593220339, + "grad_norm": 16.31241903989762, + "learning_rate": 4.803279944676032e-07, + "logits/chosen": 7.690080642700195, + "logits/rejected": 4.676759719848633, + "logps/chosen": -7.623948097229004, + "logps/rejected": -16.224029541015625, + "loss": 0.2743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2924440801143646, + "rewards/margins": 2.0772933959960938, + "rewards/rejected": -1.7848492860794067, + "step": 253 + }, + { + "epoch": 4.305084745762712, + "grad_norm": 17.555016757847234, + "learning_rate": 4.800394336260819e-07, + "logits/chosen": 6.041158676147461, + "logits/rejected": 6.416534900665283, + "logps/chosen": -7.688037872314453, + "logps/rejected": -17.980430603027344, + "loss": 0.3153, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1410144865512848, + "rewards/margins": 2.001737356185913, + "rewards/rejected": -1.8607230186462402, + "step": 254 + }, + { + "epoch": 4.322033898305085, + "grad_norm": 18.480066982296236, + "learning_rate": 4.797488597432616e-07, + "logits/chosen": 2.997715473175049, + "logits/rejected": 8.167374610900879, + "logps/chosen": -15.387086868286133, + "logps/rejected": -14.066549301147461, + "loss": 0.2997, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4183131158351898, + "rewards/margins": 1.9756748676300049, + "rewards/rejected": -1.5573619604110718, + "step": 255 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 17.471201870112832, + "learning_rate": 4.794562753619117e-07, + "logits/chosen": 4.013498783111572, + "logits/rejected": 4.414510250091553, + "logps/chosen": -8.612306594848633, + "logps/rejected": -14.004947662353516, + "loss": 0.3257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3176363706588745, + "rewards/margins": 1.533604621887207, + "rewards/rejected": -1.215968132019043, + "step": 256 + }, + { + "epoch": 4.3559322033898304, + "grad_norm": 19.17554175787275, + "learning_rate": 4.791616830423949e-07, + "logits/chosen": 4.06189489364624, + "logits/rejected": 6.982814311981201, + "logps/chosen": -9.344228744506836, + "logps/rejected": -15.20322036743164, + "loss": 0.3233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.228972390294075, + "rewards/margins": 1.8103268146514893, + "rewards/rejected": -1.5813543796539307, + "step": 257 + }, + { + "epoch": 4.372881355932203, + "grad_norm": 17.10464741278365, + "learning_rate": 4.788650853626456e-07, + "logits/chosen": 2.260240316390991, + "logits/rejected": 6.677258014678955, + "logps/chosen": -9.949880599975586, + "logps/rejected": -13.5770845413208, + "loss": 0.3136, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28663721680641174, + "rewards/margins": 1.7761832475662231, + "rewards/rejected": -1.4895460605621338, + "step": 258 + }, + { + "epoch": 4.389830508474576, + "grad_norm": 18.82388276814341, + "learning_rate": 4.785664849181465e-07, + "logits/chosen": 4.183163642883301, + "logits/rejected": 4.152279376983643, + "logps/chosen": -6.331787586212158, + "logps/rejected": -11.512680053710938, + "loss": 0.33, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1800474226474762, + "rewards/margins": 1.1619161367416382, + "rewards/rejected": -0.9818687438964844, + "step": 259 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 17.663331505017794, + "learning_rate": 4.78265884321906e-07, + "logits/chosen": 6.67874813079834, + "logits/rejected": 4.668858528137207, + "logps/chosen": -10.521224021911621, + "logps/rejected": -19.912858963012695, + "loss": 0.3094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1548883020877838, + "rewards/margins": 2.28428316116333, + "rewards/rejected": -2.12939453125, + "step": 260 + }, + { + "epoch": 4.423728813559322, + "grad_norm": 17.311775354907358, + "learning_rate": 4.779632862044361e-07, + "logits/chosen": 5.639666557312012, + "logits/rejected": 6.38239049911499, + "logps/chosen": -8.55907154083252, + "logps/rejected": -18.988445281982422, + "loss": 0.298, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22225412726402283, + "rewards/margins": 2.3125150203704834, + "rewards/rejected": -2.0902609825134277, + "step": 261 + }, + { + "epoch": 4.440677966101695, + "grad_norm": 17.440600929155575, + "learning_rate": 4.776586932137283e-07, + "logits/chosen": 3.9185001850128174, + "logits/rejected": 6.49030065536499, + "logps/chosen": -10.537013053894043, + "logps/rejected": -14.84063720703125, + "loss": 0.3009, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13585788011550903, + "rewards/margins": 1.624847412109375, + "rewards/rejected": -1.4889897108078003, + "step": 262 + }, + { + "epoch": 4.4576271186440675, + "grad_norm": 17.13685594779347, + "learning_rate": 4.773521080152311e-07, + "logits/chosen": 2.5429065227508545, + "logits/rejected": 5.310009002685547, + "logps/chosen": -12.58240795135498, + "logps/rejected": -20.367462158203125, + "loss": 0.2836, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13424983620643616, + "rewards/margins": 1.8994662761688232, + "rewards/rejected": -1.765216588973999, + "step": 263 + }, + { + "epoch": 4.47457627118644, + "grad_norm": 17.308598065836613, + "learning_rate": 4.770435332918267e-07, + "logits/chosen": 4.594322204589844, + "logits/rejected": 8.247635841369629, + "logps/chosen": -11.638622283935547, + "logps/rejected": -18.57332992553711, + "loss": 0.2982, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2245895266532898, + "rewards/margins": 2.225818395614624, + "rewards/rejected": -2.0012288093566895, + "step": 264 + }, + { + "epoch": 4.491525423728813, + "grad_norm": 16.88232961307194, + "learning_rate": 4.76732971743807e-07, + "logits/chosen": 6.63725471496582, + "logits/rejected": 8.631773948669434, + "logps/chosen": -7.823244094848633, + "logps/rejected": -17.624019622802734, + "loss": 0.2864, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2145896852016449, + "rewards/margins": 3.0045228004455566, + "rewards/rejected": -2.789933204650879, + "step": 265 + }, + { + "epoch": 4.508474576271187, + "grad_norm": 18.037592468816126, + "learning_rate": 4.7642042608885056e-07, + "logits/chosen": 5.675311088562012, + "logits/rejected": 4.931497573852539, + "logps/chosen": -11.616440773010254, + "logps/rejected": -20.281126022338867, + "loss": 0.336, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11937360465526581, + "rewards/margins": 2.490065574645996, + "rewards/rejected": -2.370692014694214, + "step": 266 + }, + { + "epoch": 4.52542372881356, + "grad_norm": 18.473347743493118, + "learning_rate": 4.761058990619986e-07, + "logits/chosen": 2.7812986373901367, + "logits/rejected": 3.597932815551758, + "logps/chosen": -8.612319946289062, + "logps/rejected": -15.196998596191406, + "loss": 0.286, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.41151559352874756, + "rewards/margins": 1.9993846416473389, + "rewards/rejected": -1.5878691673278809, + "step": 267 + }, + { + "epoch": 4.5423728813559325, + "grad_norm": 16.332029360751594, + "learning_rate": 4.757893934156309e-07, + "logits/chosen": 7.0438232421875, + "logits/rejected": 6.700860023498535, + "logps/chosen": -8.056821823120117, + "logps/rejected": -19.757720947265625, + "loss": 0.3042, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12074195593595505, + "rewards/margins": 2.930652379989624, + "rewards/rejected": -2.809910535812378, + "step": 268 + }, + { + "epoch": 4.559322033898305, + "grad_norm": 17.180403373586987, + "learning_rate": 4.754709119194418e-07, + "logits/chosen": 12.024563789367676, + "logits/rejected": 11.642518997192383, + "logps/chosen": -9.898269653320312, + "logps/rejected": -20.960111618041992, + "loss": 0.2921, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20630189776420593, + "rewards/margins": 2.34810209274292, + "rewards/rejected": -2.1418001651763916, + "step": 269 + }, + { + "epoch": 4.576271186440678, + "grad_norm": 18.08908952286864, + "learning_rate": 4.7515045736041615e-07, + "logits/chosen": 8.689579010009766, + "logits/rejected": 9.39737606048584, + "logps/chosen": -6.704339504241943, + "logps/rejected": -15.77403736114502, + "loss": 0.3367, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3202212452888489, + "rewards/margins": 2.208920955657959, + "rewards/rejected": -1.8886998891830444, + "step": 270 + }, + { + "epoch": 4.593220338983051, + "grad_norm": 20.731272870943574, + "learning_rate": 4.748280325428048e-07, + "logits/chosen": 5.354104995727539, + "logits/rejected": 6.051662445068359, + "logps/chosen": -9.395597457885742, + "logps/rejected": -16.60767936706543, + "loss": 0.3087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.25916188955307007, + "rewards/margins": 2.148326873779297, + "rewards/rejected": -1.889164924621582, + "step": 271 + }, + { + "epoch": 4.610169491525424, + "grad_norm": 17.659884235704403, + "learning_rate": 4.745036402880999e-07, + "logits/chosen": 5.418169021606445, + "logits/rejected": 6.012850761413574, + "logps/chosen": -7.826754570007324, + "logps/rejected": -18.563873291015625, + "loss": 0.2977, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2566584050655365, + "rewards/margins": 3.3900957107543945, + "rewards/rejected": -3.133437395095825, + "step": 272 + }, + { + "epoch": 4.627118644067797, + "grad_norm": 17.60711001593079, + "learning_rate": 4.741772834350104e-07, + "logits/chosen": 5.240487098693848, + "logits/rejected": 5.013432025909424, + "logps/chosen": -11.003212928771973, + "logps/rejected": -17.381736755371094, + "loss": 0.3194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22370412945747375, + "rewards/margins": 1.2826619148254395, + "rewards/rejected": -1.058957576751709, + "step": 273 + }, + { + "epoch": 4.6440677966101696, + "grad_norm": 18.485652371137643, + "learning_rate": 4.7384896483943726e-07, + "logits/chosen": 6.851320266723633, + "logits/rejected": 5.790674209594727, + "logps/chosen": -6.928519248962402, + "logps/rejected": -15.897483825683594, + "loss": 0.3131, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2791842818260193, + "rewards/margins": 1.9868839979171753, + "rewards/rejected": -1.7076997756958008, + "step": 274 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 17.27421393446924, + "learning_rate": 4.7351868737444825e-07, + "logits/chosen": 9.60102367401123, + "logits/rejected": 7.984978199005127, + "logps/chosen": -6.305352210998535, + "logps/rejected": -12.598247528076172, + "loss": 0.3242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3262409567832947, + "rewards/margins": 1.0967921018600464, + "rewards/rejected": -0.7705512046813965, + "step": 275 + }, + { + "epoch": 4.677966101694915, + "grad_norm": 17.655139325548166, + "learning_rate": 4.7318645393025305e-07, + "logits/chosen": 6.915894031524658, + "logits/rejected": 8.634527206420898, + "logps/chosen": -9.319221496582031, + "logps/rejected": -12.086976051330566, + "loss": 0.3315, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3277999758720398, + "rewards/margins": 1.5269964933395386, + "rewards/rejected": -1.199196457862854, + "step": 276 + }, + { + "epoch": 4.694915254237288, + "grad_norm": 17.011095996634904, + "learning_rate": 4.7285226741417753e-07, + "logits/chosen": 3.976977825164795, + "logits/rejected": 7.2318501472473145, + "logps/chosen": -10.538625717163086, + "logps/rejected": -16.68062973022461, + "loss": 0.2857, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3574908375740051, + "rewards/margins": 2.6200029850006104, + "rewards/rejected": -2.262512445449829, + "step": 277 + }, + { + "epoch": 4.711864406779661, + "grad_norm": 17.65277324244951, + "learning_rate": 4.7251613075063905e-07, + "logits/chosen": 6.255118370056152, + "logits/rejected": 6.385594844818115, + "logps/chosen": -8.837267875671387, + "logps/rejected": -17.89482307434082, + "loss": 0.3087, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2775583565235138, + "rewards/margins": 3.0456676483154297, + "rewards/rejected": -2.7681093215942383, + "step": 278 + }, + { + "epoch": 4.728813559322034, + "grad_norm": 16.6287870272674, + "learning_rate": 4.721780468811201e-07, + "logits/chosen": 4.9652299880981445, + "logits/rejected": 8.796908378601074, + "logps/chosen": -11.446728706359863, + "logps/rejected": -15.115416526794434, + "loss": 0.2724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26096999645233154, + "rewards/margins": 1.8385089635849, + "rewards/rejected": -1.5775389671325684, + "step": 279 + }, + { + "epoch": 4.745762711864407, + "grad_norm": 17.25865918677704, + "learning_rate": 4.7183801876414286e-07, + "logits/chosen": 5.870490074157715, + "logits/rejected": 8.393874168395996, + "logps/chosen": -8.048952102661133, + "logps/rejected": -15.552209854125977, + "loss": 0.2926, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1697208136320114, + "rewards/margins": 2.0907211303710938, + "rewards/rejected": -1.921000361442566, + "step": 280 + }, + { + "epoch": 4.762711864406779, + "grad_norm": 17.566147545689287, + "learning_rate": 4.7149604937524356e-07, + "logits/chosen": 3.231328248977661, + "logits/rejected": 3.152560234069824, + "logps/chosen": -13.5642728805542, + "logps/rejected": -19.40642547607422, + "loss": 0.313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5033832788467407, + "rewards/margins": 1.4650967121124268, + "rewards/rejected": -0.961713433265686, + "step": 281 + }, + { + "epoch": 4.779661016949152, + "grad_norm": 15.6998720784932, + "learning_rate": 4.7115214170694616e-07, + "logits/chosen": 5.659430503845215, + "logits/rejected": 9.334473609924316, + "logps/chosen": -9.407824516296387, + "logps/rejected": -18.11457061767578, + "loss": 0.258, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1942892074584961, + "rewards/margins": 2.8978137969970703, + "rewards/rejected": -2.7035248279571533, + "step": 282 + }, + { + "epoch": 4.796610169491525, + "grad_norm": 17.884964474492214, + "learning_rate": 4.70806298768736e-07, + "logits/chosen": 4.105790138244629, + "logits/rejected": 6.558419227600098, + "logps/chosen": -10.256916999816895, + "logps/rejected": -11.880553245544434, + "loss": 0.3091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23252363502979279, + "rewards/margins": 2.0612881183624268, + "rewards/rejected": -1.8287646770477295, + "step": 283 + }, + { + "epoch": 4.813559322033898, + "grad_norm": 19.503195523702683, + "learning_rate": 4.70458523587034e-07, + "logits/chosen": 5.803598403930664, + "logits/rejected": 6.407049655914307, + "logps/chosen": -9.721452713012695, + "logps/rejected": -20.844100952148438, + "loss": 0.3453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22893205285072327, + "rewards/margins": 2.159566879272461, + "rewards/rejected": -1.93063485622406, + "step": 284 + }, + { + "epoch": 4.830508474576272, + "grad_norm": 17.660468010554457, + "learning_rate": 4.701088192051695e-07, + "logits/chosen": 4.224084377288818, + "logits/rejected": 4.460755348205566, + "logps/chosen": -11.501953125, + "logps/rejected": -19.310644149780273, + "loss": 0.3149, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5221762657165527, + "rewards/margins": 2.612990379333496, + "rewards/rejected": -2.0908143520355225, + "step": 285 + }, + { + "epoch": 4.847457627118644, + "grad_norm": 15.44242173696732, + "learning_rate": 4.697571886833543e-07, + "logits/chosen": 5.924899101257324, + "logits/rejected": 6.679788589477539, + "logps/chosen": -7.886451721191406, + "logps/rejected": -15.066838264465332, + "loss": 0.2688, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3297101855278015, + "rewards/margins": 2.0337629318237305, + "rewards/rejected": -1.7040526866912842, + "step": 286 + }, + { + "epoch": 4.864406779661017, + "grad_norm": 17.217120104408682, + "learning_rate": 4.6940363509865553e-07, + "logits/chosen": 5.9503583908081055, + "logits/rejected": 6.5264105796813965, + "logps/chosen": -10.39914608001709, + "logps/rejected": -15.116087913513184, + "loss": 0.2827, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23787519335746765, + "rewards/margins": 1.552558422088623, + "rewards/rejected": -1.314683198928833, + "step": 287 + }, + { + "epoch": 4.88135593220339, + "grad_norm": 15.856929183783881, + "learning_rate": 4.6904816154496854e-07, + "logits/chosen": 6.741863250732422, + "logits/rejected": 9.094884872436523, + "logps/chosen": -13.021467208862305, + "logps/rejected": -19.538970947265625, + "loss": 0.2779, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.047575704753398895, + "rewards/margins": 2.4877426624298096, + "rewards/rejected": -2.53531813621521, + "step": 288 + }, + { + "epoch": 4.898305084745763, + "grad_norm": 17.193550836836813, + "learning_rate": 4.6869077113299025e-07, + "logits/chosen": 5.560794830322266, + "logits/rejected": 4.232423305511475, + "logps/chosen": -9.655923843383789, + "logps/rejected": -19.288549423217773, + "loss": 0.318, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09008821845054626, + "rewards/margins": 2.3839855194091797, + "rewards/rejected": -2.2938973903656006, + "step": 289 + }, + { + "epoch": 4.915254237288136, + "grad_norm": 17.956803106276503, + "learning_rate": 4.6833146699019177e-07, + "logits/chosen": 2.667480945587158, + "logits/rejected": 3.094351053237915, + "logps/chosen": -7.915768146514893, + "logps/rejected": -12.158473014831543, + "loss": 0.3469, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3719676733016968, + "rewards/margins": 1.006954550743103, + "rewards/rejected": -0.6349868774414062, + "step": 290 + }, + { + "epoch": 4.932203389830509, + "grad_norm": 17.633436631978487, + "learning_rate": 4.6797025226079074e-07, + "logits/chosen": 7.160995960235596, + "logits/rejected": 7.154771327972412, + "logps/chosen": -8.890524864196777, + "logps/rejected": -14.562738418579102, + "loss": 0.2858, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.040711283683776855, + "rewards/margins": 1.5542259216308594, + "rewards/rejected": -1.5135146379470825, + "step": 291 + }, + { + "epoch": 4.9491525423728815, + "grad_norm": 18.029997556885586, + "learning_rate": 4.676071301057243e-07, + "logits/chosen": 6.709413528442383, + "logits/rejected": 5.719005584716797, + "logps/chosen": -10.442892074584961, + "logps/rejected": -16.07100486755371, + "loss": 0.3138, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20800533890724182, + "rewards/margins": 1.062894582748413, + "rewards/rejected": -0.8548891544342041, + "step": 292 + }, + { + "epoch": 4.966101694915254, + "grad_norm": 18.15563498375766, + "learning_rate": 4.67242103702621e-07, + "logits/chosen": 4.255777359008789, + "logits/rejected": 4.3217949867248535, + "logps/chosen": -10.030130386352539, + "logps/rejected": -16.675334930419922, + "loss": 0.3034, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2056305855512619, + "rewards/margins": 1.8735519647598267, + "rewards/rejected": -1.6679213047027588, + "step": 293 + }, + { + "epoch": 4.983050847457627, + "grad_norm": 16.468882573077856, + "learning_rate": 4.668751762457733e-07, + "logits/chosen": 6.377828598022461, + "logits/rejected": 6.773748397827148, + "logps/chosen": -8.659324645996094, + "logps/rejected": -16.170530319213867, + "loss": 0.2782, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22924266755580902, + "rewards/margins": 1.764962911605835, + "rewards/rejected": -1.5357201099395752, + "step": 294 + }, + { + "epoch": 5.0, + "grad_norm": 15.799532890303503, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": 3.3482489585876465, + "logits/rejected": 5.720559120178223, + "logps/chosen": -9.48496150970459, + "logps/rejected": -18.143577575683594, + "loss": 0.2819, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14067193865776062, + "rewards/margins": 2.612166166305542, + "rewards/rejected": -2.471494197845459, + "step": 295 + }, + { + "epoch": 5.016949152542373, + "grad_norm": 15.974359618646485, + "learning_rate": 4.661356310311659e-07, + "logits/chosen": 8.018171310424805, + "logits/rejected": 8.172922134399414, + "logps/chosen": -8.21717643737793, + "logps/rejected": -18.745868682861328, + "loss": 0.2576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1397463083267212, + "rewards/margins": 2.444672107696533, + "rewards/rejected": -2.3049259185791016, + "step": 296 + }, + { + "epoch": 5.033898305084746, + "grad_norm": 15.886523739916017, + "learning_rate": 4.657630197450576e-07, + "logits/chosen": 2.65042781829834, + "logits/rejected": 5.1384806632995605, + "logps/chosen": -13.963510513305664, + "logps/rejected": -24.165008544921875, + "loss": 0.247, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4832766354084015, + "rewards/margins": 3.386579990386963, + "rewards/rejected": -2.9033029079437256, + "step": 297 + }, + { + "epoch": 5.0508474576271185, + "grad_norm": 14.661014856292772, + "learning_rate": 4.653885203484515e-07, + "logits/chosen": 2.559359073638916, + "logits/rejected": 4.847626209259033, + "logps/chosen": -7.78449821472168, + "logps/rejected": -13.427138328552246, + "loss": 0.2826, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22836291790008545, + "rewards/margins": 1.7525341510772705, + "rewards/rejected": -1.524170994758606, + "step": 298 + }, + { + "epoch": 5.067796610169491, + "grad_norm": 13.461143030238789, + "learning_rate": 4.6501213611853673e-07, + "logits/chosen": 5.067084789276123, + "logits/rejected": 2.6621310710906982, + "logps/chosen": -8.320889472961426, + "logps/rejected": -19.413127899169922, + "loss": 0.2194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.301086962223053, + "rewards/margins": 2.9841580390930176, + "rewards/rejected": -2.6830711364746094, + "step": 299 + }, + { + "epoch": 5.084745762711864, + "grad_norm": 16.070827271212256, + "learning_rate": 4.6463387034899643e-07, + "logits/chosen": 6.388011932373047, + "logits/rejected": 8.232110977172852, + "logps/chosen": -9.567124366760254, + "logps/rejected": -16.695533752441406, + "loss": 0.271, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23213094472885132, + "rewards/margins": 2.3436355590820312, + "rewards/rejected": -2.1115047931671143, + "step": 300 + }, + { + "epoch": 5.101694915254237, + "grad_norm": 14.41016072543074, + "learning_rate": 4.642537263499788e-07, + "logits/chosen": 3.605557441711426, + "logits/rejected": 2.710143566131592, + "logps/chosen": -6.5597028732299805, + "logps/rejected": -16.52556037902832, + "loss": 0.2586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4225674867630005, + "rewards/margins": 2.3175463676452637, + "rewards/rejected": -1.8949788808822632, + "step": 301 + }, + { + "epoch": 5.11864406779661, + "grad_norm": 13.805577818353347, + "learning_rate": 4.6387170744806813e-07, + "logits/chosen": 0.8485604524612427, + "logits/rejected": 1.88863205909729, + "logps/chosen": -12.944626808166504, + "logps/rejected": -24.060951232910156, + "loss": 0.2307, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1453239619731903, + "rewards/margins": 3.271425247192383, + "rewards/rejected": -3.12610125541687, + "step": 302 + }, + { + "epoch": 5.135593220338983, + "grad_norm": 14.226631709836239, + "learning_rate": 4.634878169862557e-07, + "logits/chosen": 5.334882736206055, + "logits/rejected": 4.9238667488098145, + "logps/chosen": -9.677927017211914, + "logps/rejected": -18.596200942993164, + "loss": 0.2558, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2794516682624817, + "rewards/margins": 1.9560503959655762, + "rewards/rejected": -1.6765985488891602, + "step": 303 + }, + { + "epoch": 5.1525423728813555, + "grad_norm": 14.025700719644135, + "learning_rate": 4.6310205832391065e-07, + "logits/chosen": 6.083189964294434, + "logits/rejected": 10.50492000579834, + "logps/chosen": -8.656221389770508, + "logps/rejected": -19.979228973388672, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12934735417366028, + "rewards/margins": 3.684925079345703, + "rewards/rejected": -3.5555777549743652, + "step": 304 + }, + { + "epoch": 5.169491525423728, + "grad_norm": 15.45904891320478, + "learning_rate": 4.6271443483675027e-07, + "logits/chosen": 6.795848369598389, + "logits/rejected": 7.770328521728516, + "logps/chosen": -8.92910385131836, + "logps/rejected": -12.093315124511719, + "loss": 0.2799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4935965836048126, + "rewards/margins": 1.7713218927383423, + "rewards/rejected": -1.2777252197265625, + "step": 305 + }, + { + "epoch": 5.186440677966102, + "grad_norm": 24.29778055150147, + "learning_rate": 4.6232494991681087e-07, + "logits/chosen": 3.3796167373657227, + "logits/rejected": 4.94102144241333, + "logps/chosen": -10.549798011779785, + "logps/rejected": -21.496875762939453, + "loss": 0.2313, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19755910336971283, + "rewards/margins": 3.451889753341675, + "rewards/rejected": -3.254331111907959, + "step": 306 + }, + { + "epoch": 5.203389830508475, + "grad_norm": 15.025755227790897, + "learning_rate": 4.6193360697241766e-07, + "logits/chosen": 6.718583583831787, + "logits/rejected": 4.653804302215576, + "logps/chosen": -10.026581764221191, + "logps/rejected": -22.026464462280273, + "loss": 0.2576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1300102323293686, + "rewards/margins": 2.7540884017944336, + "rewards/rejected": -2.624077796936035, + "step": 307 + }, + { + "epoch": 5.220338983050848, + "grad_norm": 15.0870418032954, + "learning_rate": 4.615404094281554e-07, + "logits/chosen": 2.41433048248291, + "logits/rejected": 3.862748146057129, + "logps/chosen": -9.14163589477539, + "logps/rejected": -14.814375877380371, + "loss": 0.2391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35920146107673645, + "rewards/margins": 1.8867554664611816, + "rewards/rejected": -1.527553915977478, + "step": 308 + }, + { + "epoch": 5.237288135593221, + "grad_norm": 14.76721553235894, + "learning_rate": 4.611453607248381e-07, + "logits/chosen": 0.7226177453994751, + "logits/rejected": 3.7342004776000977, + "logps/chosen": -10.703145027160645, + "logps/rejected": -15.224998474121094, + "loss": 0.2534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6355695724487305, + "rewards/margins": 2.3711819648742676, + "rewards/rejected": -1.7356122732162476, + "step": 309 + }, + { + "epoch": 5.254237288135593, + "grad_norm": 15.471489712702958, + "learning_rate": 4.607484643194788e-07, + "logits/chosen": 4.764775276184082, + "logits/rejected": 6.933917999267578, + "logps/chosen": -9.363961219787598, + "logps/rejected": -14.59107780456543, + "loss": 0.2601, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3514489531517029, + "rewards/margins": 2.547487735748291, + "rewards/rejected": -2.1960389614105225, + "step": 310 + }, + { + "epoch": 5.271186440677966, + "grad_norm": 13.781971702591731, + "learning_rate": 4.6034972368525957e-07, + "logits/chosen": 1.9174288511276245, + "logits/rejected": 2.8629069328308105, + "logps/chosen": -7.767963409423828, + "logps/rejected": -16.579254150390625, + "loss": 0.2545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08915120363235474, + "rewards/margins": 2.5531845092773438, + "rewards/rejected": -2.4640331268310547, + "step": 311 + }, + { + "epoch": 5.288135593220339, + "grad_norm": 14.885786269143404, + "learning_rate": 4.599491423115014e-07, + "logits/chosen": 5.801702499389648, + "logits/rejected": 5.572926998138428, + "logps/chosen": -9.809563636779785, + "logps/rejected": -15.179691314697266, + "loss": 0.2611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34644848108291626, + "rewards/margins": 1.769092082977295, + "rewards/rejected": -1.4226434230804443, + "step": 312 + }, + { + "epoch": 5.305084745762712, + "grad_norm": 13.557354407320414, + "learning_rate": 4.595467237036329e-07, + "logits/chosen": 4.226743221282959, + "logits/rejected": 5.284225940704346, + "logps/chosen": -8.43330192565918, + "logps/rejected": -13.308187484741211, + "loss": 0.2422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41138046979904175, + "rewards/margins": 1.7838656902313232, + "rewards/rejected": -1.3724852800369263, + "step": 313 + }, + { + "epoch": 5.322033898305085, + "grad_norm": 15.314012448739035, + "learning_rate": 4.591424713831602e-07, + "logits/chosen": 3.3238039016723633, + "logits/rejected": 5.186252593994141, + "logps/chosen": -8.77778434753418, + "logps/rejected": -20.426774978637695, + "loss": 0.2583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3558368980884552, + "rewards/margins": 2.579214572906494, + "rewards/rejected": -2.2233777046203613, + "step": 314 + }, + { + "epoch": 5.338983050847458, + "grad_norm": 13.567292315558147, + "learning_rate": 4.587363888876361e-07, + "logits/chosen": 3.5054311752319336, + "logits/rejected": 3.967586040496826, + "logps/chosen": -7.4029154777526855, + "logps/rejected": -15.959949493408203, + "loss": 0.2411, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.016373835504055023, + "rewards/margins": 2.316739559173584, + "rewards/rejected": -2.333113431930542, + "step": 315 + }, + { + "epoch": 5.3559322033898304, + "grad_norm": 16.02951616634702, + "learning_rate": 4.583284797706287e-07, + "logits/chosen": 1.9606868028640747, + "logits/rejected": 1.1389808654785156, + "logps/chosen": -5.59510612487793, + "logps/rejected": -10.972127914428711, + "loss": 0.2638, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3377338647842407, + "rewards/margins": 1.3146023750305176, + "rewards/rejected": -0.9768685102462769, + "step": 316 + }, + { + "epoch": 5.372881355932203, + "grad_norm": 14.199893451013141, + "learning_rate": 4.5791874760169093e-07, + "logits/chosen": 3.0882699489593506, + "logits/rejected": 3.207479476928711, + "logps/chosen": -8.481260299682617, + "logps/rejected": -12.788800239562988, + "loss": 0.2415, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3231436312198639, + "rewards/margins": 1.855533242225647, + "rewards/rejected": -1.532389760017395, + "step": 317 + }, + { + "epoch": 5.389830508474576, + "grad_norm": 14.275237175931048, + "learning_rate": 4.575071959663288e-07, + "logits/chosen": 6.5589470863342285, + "logits/rejected": 6.860996246337891, + "logps/chosen": -10.277453422546387, + "logps/rejected": -21.133487701416016, + "loss": 0.2277, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1361951231956482, + "rewards/margins": 2.5179457664489746, + "rewards/rejected": -2.3817505836486816, + "step": 318 + }, + { + "epoch": 5.406779661016949, + "grad_norm": 14.61554056890025, + "learning_rate": 4.570938284659702e-07, + "logits/chosen": 3.4965403079986572, + "logits/rejected": 4.776882648468018, + "logps/chosen": -8.628873825073242, + "logps/rejected": -14.101314544677734, + "loss": 0.2375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.28320109844207764, + "rewards/margins": 1.9493257999420166, + "rewards/rejected": -1.6661248207092285, + "step": 319 + }, + { + "epoch": 5.423728813559322, + "grad_norm": 14.959871093742624, + "learning_rate": 4.566786487179334e-07, + "logits/chosen": 3.0084660053253174, + "logits/rejected": 6.481236457824707, + "logps/chosen": -10.179582595825195, + "logps/rejected": -14.713674545288086, + "loss": 0.2662, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5509328842163086, + "rewards/margins": 2.3514480590820312, + "rewards/rejected": -1.8005151748657227, + "step": 320 + }, + { + "epoch": 5.440677966101695, + "grad_norm": 14.548805507323163, + "learning_rate": 4.5626166035539535e-07, + "logits/chosen": 5.655772686004639, + "logits/rejected": 8.056764602661133, + "logps/chosen": -10.326370239257812, + "logps/rejected": -17.405672073364258, + "loss": 0.2405, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.29461419582366943, + "rewards/margins": 2.596235513687134, + "rewards/rejected": -2.301621437072754, + "step": 321 + }, + { + "epoch": 5.4576271186440675, + "grad_norm": 13.453869815890464, + "learning_rate": 4.5584286702736007e-07, + "logits/chosen": 2.1154050827026367, + "logits/rejected": 2.4267899990081787, + "logps/chosen": -8.321646690368652, + "logps/rejected": -14.760269165039062, + "loss": 0.2184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35621577501296997, + "rewards/margins": 1.7278010845184326, + "rewards/rejected": -1.3715853691101074, + "step": 322 + }, + { + "epoch": 5.47457627118644, + "grad_norm": 15.50160681365945, + "learning_rate": 4.5542227239862654e-07, + "logits/chosen": 3.743596076965332, + "logits/rejected": 5.651052951812744, + "logps/chosen": -8.57131290435791, + "logps/rejected": -18.80699348449707, + "loss": 0.2644, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2737744152545929, + "rewards/margins": 2.8698837757110596, + "rewards/rejected": -2.596109390258789, + "step": 323 + }, + { + "epoch": 5.491525423728813, + "grad_norm": 13.868377244000722, + "learning_rate": 4.5499988014975635e-07, + "logits/chosen": 3.769761085510254, + "logits/rejected": 4.874698162078857, + "logps/chosen": -11.69039535522461, + "logps/rejected": -17.498027801513672, + "loss": 0.2212, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5320883989334106, + "rewards/margins": 2.393782377243042, + "rewards/rejected": -1.8616942167282104, + "step": 324 + }, + { + "epoch": 5.508474576271187, + "grad_norm": 14.970272832391561, + "learning_rate": 4.545756939770422e-07, + "logits/chosen": 6.82029390335083, + "logits/rejected": 7.361518383026123, + "logps/chosen": -7.692866325378418, + "logps/rejected": -20.093639373779297, + "loss": 0.2336, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10636795312166214, + "rewards/margins": 3.6429443359375, + "rewards/rejected": -3.536576509475708, + "step": 325 + }, + { + "epoch": 5.52542372881356, + "grad_norm": 14.52118449485477, + "learning_rate": 4.54149717592475e-07, + "logits/chosen": 7.36668062210083, + "logits/rejected": 6.56944465637207, + "logps/chosen": -10.787076950073242, + "logps/rejected": -15.239648818969727, + "loss": 0.2543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1467917561531067, + "rewards/margins": 1.7701894044876099, + "rewards/rejected": -1.6233974695205688, + "step": 326 + }, + { + "epoch": 5.5423728813559325, + "grad_norm": 14.294998066529104, + "learning_rate": 4.537219547237114e-07, + "logits/chosen": 8.102130889892578, + "logits/rejected": 4.980132579803467, + "logps/chosen": -7.948538303375244, + "logps/rejected": -23.7041015625, + "loss": 0.2387, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26779377460479736, + "rewards/margins": 3.2698512077331543, + "rewards/rejected": -3.0020575523376465, + "step": 327 + }, + { + "epoch": 5.559322033898305, + "grad_norm": 13.319270840448068, + "learning_rate": 4.5329240911404167e-07, + "logits/chosen": 3.3187613487243652, + "logits/rejected": 3.88653564453125, + "logps/chosen": -5.821469783782959, + "logps/rejected": -11.25735092163086, + "loss": 0.2142, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.34002581238746643, + "rewards/margins": 2.070734977722168, + "rewards/rejected": -1.7307093143463135, + "step": 328 + }, + { + "epoch": 5.576271186440678, + "grad_norm": 15.137980503523977, + "learning_rate": 4.528610845223562e-07, + "logits/chosen": 4.278947353363037, + "logits/rejected": 4.24318265914917, + "logps/chosen": -10.437315940856934, + "logps/rejected": -21.30280113220215, + "loss": 0.2289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2784523665904999, + "rewards/margins": 3.11037540435791, + "rewards/rejected": -2.831923246383667, + "step": 329 + }, + { + "epoch": 5.593220338983051, + "grad_norm": 13.121474650704997, + "learning_rate": 4.5242798472311306e-07, + "logits/chosen": 2.994493007659912, + "logits/rejected": 3.7606515884399414, + "logps/chosen": -7.844595909118652, + "logps/rejected": -12.961830139160156, + "loss": 0.1932, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39790380001068115, + "rewards/margins": 2.1576051712036133, + "rewards/rejected": -1.7597013711929321, + "step": 330 + }, + { + "epoch": 5.610169491525424, + "grad_norm": 13.576296069584211, + "learning_rate": 4.519931135063051e-07, + "logits/chosen": 3.9403624534606934, + "logits/rejected": 5.117575168609619, + "logps/chosen": -8.309492111206055, + "logps/rejected": -19.150638580322266, + "loss": 0.2126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2991102933883667, + "rewards/margins": 3.163229465484619, + "rewards/rejected": -2.864119291305542, + "step": 331 + }, + { + "epoch": 5.627118644067797, + "grad_norm": 12.920872703304005, + "learning_rate": 4.515564746774265e-07, + "logits/chosen": 1.2953753471374512, + "logits/rejected": 3.210388422012329, + "logps/chosen": -8.38802719116211, + "logps/rejected": -16.91384506225586, + "loss": 0.2175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08069738000631332, + "rewards/margins": 2.5369935035705566, + "rewards/rejected": -2.45629620552063, + "step": 332 + }, + { + "epoch": 5.6440677966101696, + "grad_norm": 13.209851039880665, + "learning_rate": 4.5111807205743945e-07, + "logits/chosen": 1.0812115669250488, + "logits/rejected": 1.451087236404419, + "logps/chosen": -11.948184967041016, + "logps/rejected": -27.34232521057129, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26642170548439026, + "rewards/margins": 3.5682811737060547, + "rewards/rejected": -3.3018593788146973, + "step": 333 + }, + { + "epoch": 5.661016949152542, + "grad_norm": 12.057003448494973, + "learning_rate": 4.5067790948274085e-07, + "logits/chosen": 2.796851873397827, + "logits/rejected": 4.601650714874268, + "logps/chosen": -7.660577297210693, + "logps/rejected": -13.53750228881836, + "loss": 0.2083, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6349484920501709, + "rewards/margins": 2.123163938522339, + "rewards/rejected": -1.488215446472168, + "step": 334 + }, + { + "epoch": 5.677966101694915, + "grad_norm": 14.844392112328164, + "learning_rate": 4.5023599080512896e-07, + "logits/chosen": 4.955301284790039, + "logits/rejected": 6.5992326736450195, + "logps/chosen": -11.873647689819336, + "logps/rejected": -17.834644317626953, + "loss": 0.227, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2461644411087036, + "rewards/margins": 2.8159372806549072, + "rewards/rejected": -2.569772958755493, + "step": 335 + }, + { + "epoch": 5.694915254237288, + "grad_norm": 14.019832702174794, + "learning_rate": 4.4979231989176905e-07, + "logits/chosen": 3.787350654602051, + "logits/rejected": 3.4726927280426025, + "logps/chosen": -7.431969165802002, + "logps/rejected": -14.662132263183594, + "loss": 0.211, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.34543120861053467, + "rewards/margins": 2.9022555351257324, + "rewards/rejected": -2.5568246841430664, + "step": 336 + }, + { + "epoch": 5.711864406779661, + "grad_norm": 14.060606447822217, + "learning_rate": 4.493469006251601e-07, + "logits/chosen": 7.4255757331848145, + "logits/rejected": 5.965138912200928, + "logps/chosen": -10.696208000183105, + "logps/rejected": -22.840984344482422, + "loss": 0.2214, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09379026293754578, + "rewards/margins": 3.5261425971984863, + "rewards/rejected": -3.432352066040039, + "step": 337 + }, + { + "epoch": 5.728813559322034, + "grad_norm": 14.237009692386826, + "learning_rate": 4.488997369031008e-07, + "logits/chosen": 3.596536636352539, + "logits/rejected": 3.5504150390625, + "logps/chosen": -6.90710973739624, + "logps/rejected": -13.495116233825684, + "loss": 0.2298, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3260919749736786, + "rewards/margins": 1.7033262252807617, + "rewards/rejected": -1.3772342205047607, + "step": 338 + }, + { + "epoch": 5.745762711864407, + "grad_norm": 14.67091033902795, + "learning_rate": 4.4845083263865514e-07, + "logits/chosen": 2.061657428741455, + "logits/rejected": 3.5068767070770264, + "logps/chosen": -10.289358139038086, + "logps/rejected": -16.851903915405273, + "loss": 0.2412, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16148579120635986, + "rewards/margins": 2.721928358078003, + "rewards/rejected": -2.5604424476623535, + "step": 339 + }, + { + "epoch": 5.762711864406779, + "grad_norm": 13.586237133155059, + "learning_rate": 4.4800019176011847e-07, + "logits/chosen": 3.836963415145874, + "logits/rejected": 1.2509939670562744, + "logps/chosen": -8.10957145690918, + "logps/rejected": -17.303131103515625, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13127543032169342, + "rewards/margins": 2.2816011905670166, + "rewards/rejected": -2.1503257751464844, + "step": 340 + }, + { + "epoch": 5.779661016949152, + "grad_norm": 12.360126542845666, + "learning_rate": 4.4754781821098286e-07, + "logits/chosen": 1.5891176462173462, + "logits/rejected": 3.962115526199341, + "logps/chosen": -11.134986877441406, + "logps/rejected": -16.536781311035156, + "loss": 0.2008, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3413625955581665, + "rewards/margins": 2.944985866546631, + "rewards/rejected": -2.603623151779175, + "step": 341 + }, + { + "epoch": 5.796610169491525, + "grad_norm": 14.480265988536571, + "learning_rate": 4.470937159499028e-07, + "logits/chosen": 2.7056524753570557, + "logits/rejected": 1.9229453802108765, + "logps/chosen": -7.356058597564697, + "logps/rejected": -13.569494247436523, + "loss": 0.2406, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2115810215473175, + "rewards/margins": 2.17071270942688, + "rewards/rejected": -1.9591315984725952, + "step": 342 + }, + { + "epoch": 5.813559322033898, + "grad_norm": 14.17521457305599, + "learning_rate": 4.4663788895066065e-07, + "logits/chosen": 0.2597987651824951, + "logits/rejected": 1.4270880222320557, + "logps/chosen": -10.596329689025879, + "logps/rejected": -15.119043350219727, + "loss": 0.2373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22062750160694122, + "rewards/margins": 1.7454050779342651, + "rewards/rejected": -1.5247777700424194, + "step": 343 + }, + { + "epoch": 5.830508474576272, + "grad_norm": 13.160996137481407, + "learning_rate": 4.4618034120213135e-07, + "logits/chosen": 6.123937129974365, + "logits/rejected": 5.778842449188232, + "logps/chosen": -10.282336235046387, + "logps/rejected": -23.72997283935547, + "loss": 0.2025, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15460780262947083, + "rewards/margins": 3.5308094024658203, + "rewards/rejected": -3.376201629638672, + "step": 344 + }, + { + "epoch": 5.847457627118644, + "grad_norm": 12.86869971192147, + "learning_rate": 4.4572107670824806e-07, + "logits/chosen": 2.6840929985046387, + "logits/rejected": 3.7933554649353027, + "logps/chosen": -8.170498847961426, + "logps/rejected": -17.079763412475586, + "loss": 0.2101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3931899964809418, + "rewards/margins": 2.5557689666748047, + "rewards/rejected": -2.162578582763672, + "step": 345 + }, + { + "epoch": 5.864406779661017, + "grad_norm": 13.691838438775193, + "learning_rate": 4.45260099487967e-07, + "logits/chosen": -0.25332632660865784, + "logits/rejected": 3.3269095420837402, + "logps/chosen": -13.765853881835938, + "logps/rejected": -15.467521667480469, + "loss": 0.2287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.21706271171569824, + "rewards/margins": 2.0856997966766357, + "rewards/rejected": -1.8686370849609375, + "step": 346 + }, + { + "epoch": 5.88135593220339, + "grad_norm": 14.00814231188169, + "learning_rate": 4.4479741357523204e-07, + "logits/chosen": 4.334939002990723, + "logits/rejected": 5.97941780090332, + "logps/chosen": -12.119684219360352, + "logps/rejected": -21.12580680847168, + "loss": 0.2097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2201918661594391, + "rewards/margins": 3.210035800933838, + "rewards/rejected": -2.9898438453674316, + "step": 347 + }, + { + "epoch": 5.898305084745763, + "grad_norm": 14.913136382206837, + "learning_rate": 4.4433302301893983e-07, + "logits/chosen": 4.66344690322876, + "logits/rejected": 2.972669839859009, + "logps/chosen": -6.611598491668701, + "logps/rejected": -15.623875617980957, + "loss": 0.2632, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.37935081124305725, + "rewards/margins": 1.7020004987716675, + "rewards/rejected": -1.3226497173309326, + "step": 348 + }, + { + "epoch": 5.915254237288136, + "grad_norm": 16.427089053073924, + "learning_rate": 4.438669318829037e-07, + "logits/chosen": 0.8019882440567017, + "logits/rejected": 2.8923540115356445, + "logps/chosen": -9.76960563659668, + "logps/rejected": -16.897960662841797, + "loss": 0.2374, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.39484477043151855, + "rewards/margins": 1.9051614999771118, + "rewards/rejected": -1.5103168487548828, + "step": 349 + }, + { + "epoch": 5.932203389830509, + "grad_norm": 12.808532494066004, + "learning_rate": 4.433991442458188e-07, + "logits/chosen": 0.4681244492530823, + "logits/rejected": 5.965130805969238, + "logps/chosen": -12.381902694702148, + "logps/rejected": -12.533705711364746, + "loss": 0.214, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.378155380487442, + "rewards/margins": 1.931134581565857, + "rewards/rejected": -1.5529789924621582, + "step": 350 + }, + { + "epoch": 5.9491525423728815, + "grad_norm": 12.913971857694005, + "learning_rate": 4.4292966420122613e-07, + "logits/chosen": 6.719675540924072, + "logits/rejected": 6.983699798583984, + "logps/chosen": -9.271611213684082, + "logps/rejected": -18.097660064697266, + "loss": 0.1996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14226177334785461, + "rewards/margins": 2.8130500316619873, + "rewards/rejected": -2.670788288116455, + "step": 351 + }, + { + "epoch": 5.966101694915254, + "grad_norm": 14.302051781978857, + "learning_rate": 4.4245849585747655e-07, + "logits/chosen": 2.9296581745147705, + "logits/rejected": 1.7646398544311523, + "logps/chosen": -7.856363296508789, + "logps/rejected": -15.518735885620117, + "loss": 0.2496, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3406248092651367, + "rewards/margins": 2.079533815383911, + "rewards/rejected": -1.7389090061187744, + "step": 352 + }, + { + "epoch": 5.983050847457627, + "grad_norm": 14.425920748137944, + "learning_rate": 4.41985643337695e-07, + "logits/chosen": 5.654947280883789, + "logits/rejected": 6.085771560668945, + "logps/chosen": -10.340957641601562, + "logps/rejected": -20.093236923217773, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28536200523376465, + "rewards/margins": 2.75244140625, + "rewards/rejected": -2.4670796394348145, + "step": 353 + }, + { + "epoch": 6.0, + "grad_norm": 14.592286734921624, + "learning_rate": 4.415111107797445e-07, + "logits/chosen": 3.670210599899292, + "logits/rejected": 3.209463119506836, + "logps/chosen": -5.669580459594727, + "logps/rejected": -11.425719261169434, + "loss": 0.2338, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14339257776737213, + "rewards/margins": 1.7983019351959229, + "rewards/rejected": -1.6549094915390015, + "step": 354 + }, + { + "epoch": 6.016949152542373, + "grad_norm": 12.720475020903885, + "learning_rate": 4.410349023361897e-07, + "logits/chosen": 2.473912239074707, + "logits/rejected": 5.576210975646973, + "logps/chosen": -13.222108840942383, + "logps/rejected": -21.111351013183594, + "loss": 0.204, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32114630937576294, + "rewards/margins": 3.3848023414611816, + "rewards/rejected": -3.0636560916900635, + "step": 355 + }, + { + "epoch": 6.033898305084746, + "grad_norm": 11.400040105830213, + "learning_rate": 4.4055702217426085e-07, + "logits/chosen": 0.8407840132713318, + "logits/rejected": 1.0289647579193115, + "logps/chosen": -6.723813056945801, + "logps/rejected": -15.182109832763672, + "loss": 0.1874, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3703401982784271, + "rewards/margins": 3.1831440925598145, + "rewards/rejected": -2.8128037452697754, + "step": 356 + }, + { + "epoch": 6.0508474576271185, + "grad_norm": 12.244242829788378, + "learning_rate": 4.40077474475817e-07, + "logits/chosen": 1.6306920051574707, + "logits/rejected": 0.06805920600891113, + "logps/chosen": -10.396472930908203, + "logps/rejected": -20.96820068359375, + "loss": 0.2088, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41604703664779663, + "rewards/margins": 2.542591094970703, + "rewards/rejected": -2.126544237136841, + "step": 357 + }, + { + "epoch": 6.067796610169491, + "grad_norm": 12.052664494870422, + "learning_rate": 4.395962634373096e-07, + "logits/chosen": 3.180180549621582, + "logits/rejected": 4.185020923614502, + "logps/chosen": -7.90302848815918, + "logps/rejected": -16.847829818725586, + "loss": 0.1947, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3513253927230835, + "rewards/margins": 2.89188551902771, + "rewards/rejected": -2.540560245513916, + "step": 358 + }, + { + "epoch": 6.084745762711864, + "grad_norm": 11.56580544056517, + "learning_rate": 4.3911339326974584e-07, + "logits/chosen": 5.388302803039551, + "logits/rejected": 4.333691596984863, + "logps/chosen": -8.595808982849121, + "logps/rejected": -21.796253204345703, + "loss": 0.1734, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07518626749515533, + "rewards/margins": 4.378305435180664, + "rewards/rejected": -4.303119659423828, + "step": 359 + }, + { + "epoch": 6.101694915254237, + "grad_norm": 12.493899194292592, + "learning_rate": 4.386288681986516e-07, + "logits/chosen": 5.380532264709473, + "logits/rejected": 6.32417631149292, + "logps/chosen": -9.998344421386719, + "logps/rejected": -18.48993682861328, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20978182554244995, + "rewards/margins": 3.287187337875366, + "rewards/rejected": -3.0774056911468506, + "step": 360 + }, + { + "epoch": 6.11864406779661, + "grad_norm": 12.08153937994342, + "learning_rate": 4.3814269246403456e-07, + "logits/chosen": 2.539783239364624, + "logits/rejected": 5.148946762084961, + "logps/chosen": -8.870214462280273, + "logps/rejected": -16.426620483398438, + "loss": 0.176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35913634300231934, + "rewards/margins": 2.705012559890747, + "rewards/rejected": -2.3458759784698486, + "step": 361 + }, + { + "epoch": 6.135593220338983, + "grad_norm": 11.40653610156602, + "learning_rate": 4.3765487032034737e-07, + "logits/chosen": -2.6000680923461914, + "logits/rejected": -1.892746925354004, + "logps/chosen": -9.39770221710205, + "logps/rejected": -16.299577713012695, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5937013626098633, + "rewards/margins": 2.4688711166381836, + "rewards/rejected": -1.8751695156097412, + "step": 362 + }, + { + "epoch": 6.1525423728813555, + "grad_norm": 12.510337865575506, + "learning_rate": 4.371654060364498e-07, + "logits/chosen": -1.1375783681869507, + "logits/rejected": 0.94313645362854, + "logps/chosen": -7.737663269042969, + "logps/rejected": -11.724308967590332, + "loss": 0.2046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.366363525390625, + "rewards/margins": 1.983730673789978, + "rewards/rejected": -1.6173672676086426, + "step": 363 + }, + { + "epoch": 6.169491525423728, + "grad_norm": 11.55035638277415, + "learning_rate": 4.366743038955719e-07, + "logits/chosen": 2.8494374752044678, + "logits/rejected": 4.877542495727539, + "logps/chosen": -11.219352722167969, + "logps/rejected": -17.120630264282227, + "loss": 0.1796, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37523311376571655, + "rewards/margins": 2.4836084842681885, + "rewards/rejected": -2.1083755493164062, + "step": 364 + }, + { + "epoch": 6.186440677966102, + "grad_norm": 12.048369200248766, + "learning_rate": 4.361815681952765e-07, + "logits/chosen": -1.1336802244186401, + "logits/rejected": 2.4179670810699463, + "logps/chosen": -9.88912582397461, + "logps/rejected": -12.742366790771484, + "loss": 0.2062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42945533990859985, + "rewards/margins": 2.399317502975464, + "rewards/rejected": -1.9698621034622192, + "step": 365 + }, + { + "epoch": 6.203389830508475, + "grad_norm": 11.244435376738915, + "learning_rate": 4.3568720324742126e-07, + "logits/chosen": 5.716616153717041, + "logits/rejected": 6.348829746246338, + "logps/chosen": -8.830109596252441, + "logps/rejected": -20.635944366455078, + "loss": 0.1765, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2376314103603363, + "rewards/margins": 2.6832034587860107, + "rewards/rejected": -2.4455718994140625, + "step": 366 + }, + { + "epoch": 6.220338983050848, + "grad_norm": 11.895191500315683, + "learning_rate": 4.351912133781212e-07, + "logits/chosen": 2.2640492916107178, + "logits/rejected": 3.6460046768188477, + "logps/chosen": -7.952423095703125, + "logps/rejected": -10.814785957336426, + "loss": 0.2105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5884957313537598, + "rewards/margins": 1.6586506366729736, + "rewards/rejected": -1.0701547861099243, + "step": 367 + }, + { + "epoch": 6.237288135593221, + "grad_norm": 11.319942761188097, + "learning_rate": 4.3469360292771096e-07, + "logits/chosen": -0.033799976110458374, + "logits/rejected": 1.3880383968353271, + "logps/chosen": -8.749774932861328, + "logps/rejected": -15.943452835083008, + "loss": 0.1799, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45636826753616333, + "rewards/margins": 2.9518792629241943, + "rewards/rejected": -2.495511054992676, + "step": 368 + }, + { + "epoch": 6.254237288135593, + "grad_norm": 11.763342241935318, + "learning_rate": 4.3419437625070634e-07, + "logits/chosen": 0.5066416263580322, + "logits/rejected": 2.990574836730957, + "logps/chosen": -8.957874298095703, + "logps/rejected": -13.657116889953613, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3993526101112366, + "rewards/margins": 2.0434536933898926, + "rewards/rejected": -1.6441012620925903, + "step": 369 + }, + { + "epoch": 6.271186440677966, + "grad_norm": 12.602104399660293, + "learning_rate": 4.336935377157668e-07, + "logits/chosen": -0.9369416832923889, + "logits/rejected": 1.4667794704437256, + "logps/chosen": -9.532140731811523, + "logps/rejected": -20.544631958007812, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45484790205955505, + "rewards/margins": 3.5946388244628906, + "rewards/rejected": -3.1397910118103027, + "step": 370 + }, + { + "epoch": 6.288135593220339, + "grad_norm": 12.229135597671917, + "learning_rate": 4.3319109170565676e-07, + "logits/chosen": 3.292764663696289, + "logits/rejected": 2.433797597885132, + "logps/chosen": -8.80831527709961, + "logps/rejected": -18.14662742614746, + "loss": 0.1968, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22665278613567352, + "rewards/margins": 2.3303961753845215, + "rewards/rejected": -2.103743076324463, + "step": 371 + }, + { + "epoch": 6.305084745762712, + "grad_norm": 11.372981937167529, + "learning_rate": 4.3268704261720745e-07, + "logits/chosen": -0.022482722997665405, + "logits/rejected": 1.80076265335083, + "logps/chosen": -9.301206588745117, + "logps/rejected": -17.570545196533203, + "loss": 0.1803, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3969723582267761, + "rewards/margins": 2.1740834712982178, + "rewards/rejected": -1.7771109342575073, + "step": 372 + }, + { + "epoch": 6.322033898305085, + "grad_norm": 12.165079363604997, + "learning_rate": 4.321813948612785e-07, + "logits/chosen": 4.012811183929443, + "logits/rejected": 6.188882827758789, + "logps/chosen": -8.608428955078125, + "logps/rejected": -14.812787055969238, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.282976359128952, + "rewards/margins": 2.6606106758117676, + "rewards/rejected": -2.3776345252990723, + "step": 373 + }, + { + "epoch": 6.338983050847458, + "grad_norm": 11.471916125631306, + "learning_rate": 4.31674152862719e-07, + "logits/chosen": 4.980255126953125, + "logits/rejected": 2.6463794708251953, + "logps/chosen": -7.498459815979004, + "logps/rejected": -16.970762252807617, + "loss": 0.1787, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12228435277938843, + "rewards/margins": 2.9980876445770264, + "rewards/rejected": -2.875803232192993, + "step": 374 + }, + { + "epoch": 6.3559322033898304, + "grad_norm": 11.343224798553283, + "learning_rate": 4.311653210603293e-07, + "logits/chosen": -2.399540901184082, + "logits/rejected": 2.918029546737671, + "logps/chosen": -13.647743225097656, + "logps/rejected": -23.035350799560547, + "loss": 0.1718, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2999897003173828, + "rewards/margins": 3.9240126609802246, + "rewards/rejected": -3.624022960662842, + "step": 375 + }, + { + "epoch": 6.372881355932203, + "grad_norm": 12.578548456336826, + "learning_rate": 4.306549039068218e-07, + "logits/chosen": 0.9330506920814514, + "logits/rejected": 1.1180548667907715, + "logps/chosen": -8.449841499328613, + "logps/rejected": -13.091574668884277, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5931252837181091, + "rewards/margins": 2.422457218170166, + "rewards/rejected": -1.8293319940567017, + "step": 376 + }, + { + "epoch": 6.389830508474576, + "grad_norm": 11.960020308408605, + "learning_rate": 4.301429058687819e-07, + "logits/chosen": 3.4172210693359375, + "logits/rejected": 0.8663151264190674, + "logps/chosen": -7.801892280578613, + "logps/rejected": -23.506187438964844, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.642574667930603, + "rewards/margins": 4.627707004547119, + "rewards/rejected": -3.9851319789886475, + "step": 377 + }, + { + "epoch": 6.406779661016949, + "grad_norm": 12.061426426986905, + "learning_rate": 4.296293314266294e-07, + "logits/chosen": 0.8170385360717773, + "logits/rejected": 3.078815460205078, + "logps/chosen": -9.448894500732422, + "logps/rejected": -19.359838485717773, + "loss": 0.1744, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19764302670955658, + "rewards/margins": 3.298597574234009, + "rewards/rejected": -3.100954532623291, + "step": 378 + }, + { + "epoch": 6.423728813559322, + "grad_norm": 12.206478327976352, + "learning_rate": 4.2911418507457876e-07, + "logits/chosen": -2.6637320518493652, + "logits/rejected": -1.9219914674758911, + "logps/chosen": -10.741297721862793, + "logps/rejected": -16.847227096557617, + "loss": 0.1775, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36193591356277466, + "rewards/margins": 2.7289984226226807, + "rewards/rejected": -2.367062568664551, + "step": 379 + }, + { + "epoch": 6.440677966101695, + "grad_norm": 11.10482434906408, + "learning_rate": 4.285974713206e-07, + "logits/chosen": 0.4309033751487732, + "logits/rejected": 4.5554327964782715, + "logps/chosen": -9.133827209472656, + "logps/rejected": -21.082012176513672, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.343951940536499, + "rewards/margins": 4.336812973022461, + "rewards/rejected": -3.992860794067383, + "step": 380 + }, + { + "epoch": 6.4576271186440675, + "grad_norm": 12.183583706496009, + "learning_rate": 4.280791946863794e-07, + "logits/chosen": -2.6161293983459473, + "logits/rejected": -2.8485751152038574, + "logps/chosen": -9.101305961608887, + "logps/rejected": -17.507917404174805, + "loss": 0.1954, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23070837557315826, + "rewards/margins": 2.2931227684020996, + "rewards/rejected": -2.0624141693115234, + "step": 381 + }, + { + "epoch": 6.47457627118644, + "grad_norm": 11.626006216103972, + "learning_rate": 4.275593597072795e-07, + "logits/chosen": 1.787920594215393, + "logits/rejected": 4.995972633361816, + "logps/chosen": -10.633196830749512, + "logps/rejected": -16.80924415588379, + "loss": 0.1567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3805018961429596, + "rewards/margins": 2.5669453144073486, + "rewards/rejected": -2.186443567276001, + "step": 382 + }, + { + "epoch": 6.491525423728813, + "grad_norm": 11.632616465398266, + "learning_rate": 4.270379709323001e-07, + "logits/chosen": 6.529628276824951, + "logits/rejected": 4.649651050567627, + "logps/chosen": -12.372982025146484, + "logps/rejected": -26.309274673461914, + "loss": 0.1696, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03969073295593262, + "rewards/margins": 4.078218460083008, + "rewards/rejected": -4.117908477783203, + "step": 383 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 39.632679942094846, + "learning_rate": 4.265150329240376e-07, + "logits/chosen": 3.1415085792541504, + "logits/rejected": 5.441884994506836, + "logps/chosen": -8.506572723388672, + "logps/rejected": -14.266595840454102, + "loss": 0.1893, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27693843841552734, + "rewards/margins": 2.0275750160217285, + "rewards/rejected": -1.7506364583969116, + "step": 384 + }, + { + "epoch": 6.52542372881356, + "grad_norm": 11.50776715255362, + "learning_rate": 4.259905502586457e-07, + "logits/chosen": -4.798701763153076, + "logits/rejected": -3.0812864303588867, + "logps/chosen": -9.191747665405273, + "logps/rejected": -15.61034107208252, + "loss": 0.1962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3073745667934418, + "rewards/margins": 2.615311861038208, + "rewards/rejected": -2.3079373836517334, + "step": 385 + }, + { + "epoch": 6.5423728813559325, + "grad_norm": 13.297389708968069, + "learning_rate": 4.254645275257953e-07, + "logits/chosen": 2.7727794647216797, + "logits/rejected": 1.049268364906311, + "logps/chosen": -9.638428688049316, + "logps/rejected": -19.182289123535156, + "loss": 0.1946, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21066324412822723, + "rewards/margins": 2.508415937423706, + "rewards/rejected": -2.297752618789673, + "step": 386 + }, + { + "epoch": 6.559322033898305, + "grad_norm": 11.982375330029488, + "learning_rate": 4.24936969328634e-07, + "logits/chosen": 1.112790584564209, + "logits/rejected": 0.8258357048034668, + "logps/chosen": -6.70481014251709, + "logps/rejected": -16.066164016723633, + "loss": 0.2029, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12247809022665024, + "rewards/margins": 2.890721559524536, + "rewards/rejected": -2.7682437896728516, + "step": 387 + }, + { + "epoch": 6.576271186440678, + "grad_norm": 13.521630088491362, + "learning_rate": 4.244078802837462e-07, + "logits/chosen": 2.7497763633728027, + "logits/rejected": 3.7148149013519287, + "logps/chosen": -9.611839294433594, + "logps/rejected": -12.181758880615234, + "loss": 0.208, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35590308904647827, + "rewards/margins": 1.9600085020065308, + "rewards/rejected": -1.6041054725646973, + "step": 388 + }, + { + "epoch": 6.593220338983051, + "grad_norm": 11.804130811346875, + "learning_rate": 4.238772650211123e-07, + "logits/chosen": 2.2374987602233887, + "logits/rejected": 2.2601318359375, + "logps/chosen": -7.375918388366699, + "logps/rejected": -18.406349182128906, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30262285470962524, + "rewards/margins": 3.1041243076324463, + "rewards/rejected": -2.801501750946045, + "step": 389 + }, + { + "epoch": 6.610169491525424, + "grad_norm": 12.379786650904354, + "learning_rate": 4.233451281840685e-07, + "logits/chosen": 3.232478618621826, + "logits/rejected": 3.9859728813171387, + "logps/chosen": -8.100415229797363, + "logps/rejected": -15.276487350463867, + "loss": 0.1983, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.42072463035583496, + "rewards/margins": 2.0583455562591553, + "rewards/rejected": -1.6376209259033203, + "step": 390 + }, + { + "epoch": 6.627118644067797, + "grad_norm": 11.67383799621466, + "learning_rate": 4.2281147442926636e-07, + "logits/chosen": -0.6115279197692871, + "logits/rejected": -1.3101024627685547, + "logps/chosen": -6.683277130126953, + "logps/rejected": -13.343738555908203, + "loss": 0.1943, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28772106766700745, + "rewards/margins": 2.299579620361328, + "rewards/rejected": -2.0118587017059326, + "step": 391 + }, + { + "epoch": 6.6440677966101696, + "grad_norm": 11.123053528086508, + "learning_rate": 4.222763084266313e-07, + "logits/chosen": 0.5712847709655762, + "logits/rejected": 4.945223331451416, + "logps/chosen": -9.13448715209961, + "logps/rejected": -18.582347869873047, + "loss": 0.1701, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05717560648918152, + "rewards/margins": 3.1925296783447266, + "rewards/rejected": -3.135354518890381, + "step": 392 + }, + { + "epoch": 6.661016949152542, + "grad_norm": 11.036054591352455, + "learning_rate": 4.217396348593224e-07, + "logits/chosen": 1.5376551151275635, + "logits/rejected": 3.0253007411956787, + "logps/chosen": -12.541561126708984, + "logps/rejected": -21.630020141601562, + "loss": 0.183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24118569493293762, + "rewards/margins": 3.7048823833465576, + "rewards/rejected": -3.4636964797973633, + "step": 393 + }, + { + "epoch": 6.677966101694915, + "grad_norm": 13.009053036332219, + "learning_rate": 4.2120145842369137e-07, + "logits/chosen": 2.204005718231201, + "logits/rejected": 1.796671986579895, + "logps/chosen": -7.913434028625488, + "logps/rejected": -19.2489070892334, + "loss": 0.1972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2287922501564026, + "rewards/margins": 3.174703598022461, + "rewards/rejected": -2.945911407470703, + "step": 394 + }, + { + "epoch": 6.694915254237288, + "grad_norm": 11.383455653308374, + "learning_rate": 4.206617838292411e-07, + "logits/chosen": 2.897174119949341, + "logits/rejected": 5.16046667098999, + "logps/chosen": -8.984628677368164, + "logps/rejected": -21.642620086669922, + "loss": 0.1618, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.250646710395813, + "rewards/margins": 4.119473457336426, + "rewards/rejected": -3.8688266277313232, + "step": 395 + }, + { + "epoch": 6.711864406779661, + "grad_norm": 11.683503762881484, + "learning_rate": 4.201206157985846e-07, + "logits/chosen": 3.2230072021484375, + "logits/rejected": 4.522119045257568, + "logps/chosen": -7.677939414978027, + "logps/rejected": -15.542098045349121, + "loss": 0.1786, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2948039770126343, + "rewards/margins": 2.671304941177368, + "rewards/rejected": -2.3765010833740234, + "step": 396 + }, + { + "epoch": 6.728813559322034, + "grad_norm": 12.164249408349729, + "learning_rate": 4.1957795906740403e-07, + "logits/chosen": -2.505526542663574, + "logits/rejected": -2.6718220710754395, + "logps/chosen": -6.572645664215088, + "logps/rejected": -13.47610092163086, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5586151480674744, + "rewards/margins": 2.246644973754883, + "rewards/rejected": -1.6880300045013428, + "step": 397 + }, + { + "epoch": 6.745762711864407, + "grad_norm": 11.22721717289838, + "learning_rate": 4.1903381838440853e-07, + "logits/chosen": -0.2245522439479828, + "logits/rejected": -1.0013046264648438, + "logps/chosen": -8.802066802978516, + "logps/rejected": -16.421092987060547, + "loss": 0.1753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3151366710662842, + "rewards/margins": 2.0241858959198, + "rewards/rejected": -1.7090493440628052, + "step": 398 + }, + { + "epoch": 6.762711864406779, + "grad_norm": 11.2826876856426, + "learning_rate": 4.1848819851129345e-07, + "logits/chosen": 0.24271805584430695, + "logits/rejected": -0.0052538588643074036, + "logps/chosen": -11.282722473144531, + "logps/rejected": -17.96334457397461, + "loss": 0.1618, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5386340022087097, + "rewards/margins": 3.651751756668091, + "rewards/rejected": -3.1131176948547363, + "step": 399 + }, + { + "epoch": 6.779661016949152, + "grad_norm": 12.565118366696153, + "learning_rate": 4.179411042226982e-07, + "logits/chosen": 2.8625733852386475, + "logits/rejected": 5.791099548339844, + "logps/chosen": -12.863717079162598, + "logps/rejected": -20.775606155395508, + "loss": 0.1896, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.035915374755859375, + "rewards/margins": 3.369300365447998, + "rewards/rejected": -3.4052159786224365, + "step": 400 + }, + { + "epoch": 6.796610169491525, + "grad_norm": 11.833011467732364, + "learning_rate": 4.173925403061644e-07, + "logits/chosen": 0.865548849105835, + "logits/rejected": 1.7070629596710205, + "logps/chosen": -11.461730003356934, + "logps/rejected": -29.828683853149414, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28670307993888855, + "rewards/margins": 4.051923751831055, + "rewards/rejected": -3.7652201652526855, + "step": 401 + }, + { + "epoch": 6.813559322033898, + "grad_norm": 11.010173899657152, + "learning_rate": 4.1684251156209437e-07, + "logits/chosen": 3.118610382080078, + "logits/rejected": 3.4988858699798584, + "logps/chosen": -9.700953483581543, + "logps/rejected": -21.816139221191406, + "loss": 0.1706, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2815594971179962, + "rewards/margins": 3.5297186374664307, + "rewards/rejected": -3.248159170150757, + "step": 402 + }, + { + "epoch": 6.830508474576272, + "grad_norm": 12.448855337269192, + "learning_rate": 4.16291022803709e-07, + "logits/chosen": 1.8236751556396484, + "logits/rejected": 3.2149503231048584, + "logps/chosen": -9.841229438781738, + "logps/rejected": -15.228986740112305, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3855394721031189, + "rewards/margins": 2.7110955715179443, + "rewards/rejected": -2.3255560398101807, + "step": 403 + }, + { + "epoch": 6.847457627118644, + "grad_norm": 13.363779328365679, + "learning_rate": 4.1573807885700523e-07, + "logits/chosen": 2.889011859893799, + "logits/rejected": 3.5058770179748535, + "logps/chosen": -11.454680442810059, + "logps/rejected": -24.09137725830078, + "loss": 0.1602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.421461820602417, + "rewards/margins": 3.8386220932006836, + "rewards/rejected": -3.4171600341796875, + "step": 404 + }, + { + "epoch": 6.864406779661017, + "grad_norm": 12.307644038866737, + "learning_rate": 4.151836845607144e-07, + "logits/chosen": 3.0431900024414062, + "logits/rejected": 3.4132754802703857, + "logps/chosen": -11.601396560668945, + "logps/rejected": -18.625232696533203, + "loss": 0.1713, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6460351943969727, + "rewards/margins": 3.304206609725952, + "rewards/rejected": -2.6581716537475586, + "step": 405 + }, + { + "epoch": 6.88135593220339, + "grad_norm": 11.401784945653475, + "learning_rate": 4.146278447662597e-07, + "logits/chosen": 4.435127258300781, + "logits/rejected": 3.706249237060547, + "logps/chosen": -7.475669860839844, + "logps/rejected": -11.912774085998535, + "loss": 0.1803, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37761247158050537, + "rewards/margins": 2.3568031787872314, + "rewards/rejected": -1.979190707206726, + "step": 406 + }, + { + "epoch": 6.898305084745763, + "grad_norm": 10.845200868398582, + "learning_rate": 4.1407056433771324e-07, + "logits/chosen": 4.9845380783081055, + "logits/rejected": 4.5369486808776855, + "logps/chosen": -9.58263874053955, + "logps/rejected": -20.3066349029541, + "loss": 0.1542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0040725767612457275, + "rewards/margins": 3.683826208114624, + "rewards/rejected": -3.687898874282837, + "step": 407 + }, + { + "epoch": 6.915254237288136, + "grad_norm": 11.322141311865895, + "learning_rate": 4.1351184815175456e-07, + "logits/chosen": -0.577551543712616, + "logits/rejected": 1.696087121963501, + "logps/chosen": -11.518467903137207, + "logps/rejected": -24.423654556274414, + "loss": 0.1658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30860117077827454, + "rewards/margins": 3.8829498291015625, + "rewards/rejected": -3.5743489265441895, + "step": 408 + }, + { + "epoch": 6.932203389830509, + "grad_norm": 10.870149379931851, + "learning_rate": 4.1295170109762677e-07, + "logits/chosen": 1.2290717363357544, + "logits/rejected": 2.379326105117798, + "logps/chosen": -8.551403999328613, + "logps/rejected": -16.353551864624023, + "loss": 0.1642, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.384010374546051, + "rewards/margins": 3.345313787460327, + "rewards/rejected": -2.961303472518921, + "step": 409 + }, + { + "epoch": 6.9491525423728815, + "grad_norm": 11.781844222361585, + "learning_rate": 4.1239012807709444e-07, + "logits/chosen": -0.22932285070419312, + "logits/rejected": 2.510235548019409, + "logps/chosen": -9.169673919677734, + "logps/rejected": -22.106855392456055, + "loss": 0.1667, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12222418189048767, + "rewards/margins": 3.730660915374756, + "rewards/rejected": -3.6084365844726562, + "step": 410 + }, + { + "epoch": 6.966101694915254, + "grad_norm": 11.10813604751353, + "learning_rate": 4.1182713400440074e-07, + "logits/chosen": 0.7548007965087891, + "logits/rejected": 4.313141345977783, + "logps/chosen": -10.161952018737793, + "logps/rejected": -18.345703125, + "loss": 0.1617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5395796298980713, + "rewards/margins": 3.6666393280029297, + "rewards/rejected": -3.1270599365234375, + "step": 411 + }, + { + "epoch": 6.983050847457627, + "grad_norm": 10.950373908675447, + "learning_rate": 4.112627238062238e-07, + "logits/chosen": 0.7141157388687134, + "logits/rejected": 2.862635850906372, + "logps/chosen": -7.2346720695495605, + "logps/rejected": -14.257221221923828, + "loss": 0.1791, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6277063488960266, + "rewards/margins": 2.4815032482147217, + "rewards/rejected": -1.8537968397140503, + "step": 412 + }, + { + "epoch": 7.0, + "grad_norm": 12.070707974127568, + "learning_rate": 4.106969024216348e-07, + "logits/chosen": 3.3420722484588623, + "logits/rejected": 2.9186367988586426, + "logps/chosen": -8.318929672241211, + "logps/rejected": -19.41041374206543, + "loss": 0.1917, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.258792519569397, + "rewards/margins": 3.3974623680114746, + "rewards/rejected": -3.138669729232788, + "step": 413 + }, + { + "epoch": 7.016949152542373, + "grad_norm": 10.149398873013046, + "learning_rate": 4.101296748020533e-07, + "logits/chosen": -1.3689672946929932, + "logits/rejected": -0.603845477104187, + "logps/chosen": -10.28722858428955, + "logps/rejected": -19.97595977783203, + "loss": 0.1591, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.41477838158607483, + "rewards/margins": 3.526080846786499, + "rewards/rejected": -3.1113028526306152, + "step": 414 + }, + { + "epoch": 7.033898305084746, + "grad_norm": 9.47123309900026, + "learning_rate": 4.09561045911205e-07, + "logits/chosen": 0.6049144268035889, + "logits/rejected": 0.7107745409011841, + "logps/chosen": -8.770289421081543, + "logps/rejected": -14.066744804382324, + "loss": 0.1395, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39408162236213684, + "rewards/margins": 2.6371524333953857, + "rewards/rejected": -2.243070602416992, + "step": 415 + }, + { + "epoch": 7.0508474576271185, + "grad_norm": 10.052821182121836, + "learning_rate": 4.0899102072507773e-07, + "logits/chosen": -0.4487457871437073, + "logits/rejected": 0.46110111474990845, + "logps/chosen": -7.779388427734375, + "logps/rejected": -14.188379287719727, + "loss": 0.1533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5070340037345886, + "rewards/margins": 2.450317621231079, + "rewards/rejected": -1.9432835578918457, + "step": 416 + }, + { + "epoch": 7.067796610169491, + "grad_norm": 11.587099181096272, + "learning_rate": 4.084196042318783e-07, + "logits/chosen": -3.301814079284668, + "logits/rejected": -2.8254811763763428, + "logps/chosen": -9.915962219238281, + "logps/rejected": -14.52343463897705, + "loss": 0.1823, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3799024820327759, + "rewards/margins": 2.0725831985473633, + "rewards/rejected": -1.6926804780960083, + "step": 417 + }, + { + "epoch": 7.084745762711864, + "grad_norm": 9.783875636578681, + "learning_rate": 4.0784680143198837e-07, + "logits/chosen": 2.509547472000122, + "logits/rejected": 4.715401649475098, + "logps/chosen": -6.57558012008667, + "logps/rejected": -17.49691390991211, + "loss": 0.1474, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4137152135372162, + "rewards/margins": 3.915511131286621, + "rewards/rejected": -3.501796245574951, + "step": 418 + }, + { + "epoch": 7.101694915254237, + "grad_norm": 10.663457705346287, + "learning_rate": 4.0727261733792124e-07, + "logits/chosen": 0.4356730580329895, + "logits/rejected": 1.356868028640747, + "logps/chosen": -7.484723091125488, + "logps/rejected": -17.026552200317383, + "loss": 0.1498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37025827169418335, + "rewards/margins": 3.1471047401428223, + "rewards/rejected": -2.7768466472625732, + "step": 419 + }, + { + "epoch": 7.11864406779661, + "grad_norm": 10.814503714644006, + "learning_rate": 4.0669705697427754e-07, + "logits/chosen": 5.528136253356934, + "logits/rejected": 7.37587833404541, + "logps/chosen": -13.677026748657227, + "logps/rejected": -22.8214054107666, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6871755719184875, + "rewards/margins": 3.2107880115509033, + "rewards/rejected": -2.5236124992370605, + "step": 420 + }, + { + "epoch": 7.135593220338983, + "grad_norm": 9.947743029988125, + "learning_rate": 4.061201253777015e-07, + "logits/chosen": 0.058755338191986084, + "logits/rejected": 1.8083921670913696, + "logps/chosen": -10.58738899230957, + "logps/rejected": -18.75304412841797, + "loss": 0.1398, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1582307070493698, + "rewards/margins": 3.144596815109253, + "rewards/rejected": -2.9863665103912354, + "step": 421 + }, + { + "epoch": 7.1525423728813555, + "grad_norm": 9.876244438495242, + "learning_rate": 4.0554182759683675e-07, + "logits/chosen": 1.8721250295639038, + "logits/rejected": 2.4072189331054688, + "logps/chosen": -6.363444805145264, + "logps/rejected": -18.347625732421875, + "loss": 0.1319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3553791344165802, + "rewards/margins": 3.64908504486084, + "rewards/rejected": -3.293706178665161, + "step": 422 + }, + { + "epoch": 7.169491525423728, + "grad_norm": 10.133228361779297, + "learning_rate": 4.049621686922823e-07, + "logits/chosen": -0.28978219628334045, + "logits/rejected": -0.5108487606048584, + "logps/chosen": -9.293232917785645, + "logps/rejected": -13.629794120788574, + "loss": 0.1536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1518397033214569, + "rewards/margins": 2.251439332962036, + "rewards/rejected": -2.099599838256836, + "step": 423 + }, + { + "epoch": 7.186440677966102, + "grad_norm": 9.307930017331996, + "learning_rate": 4.0438115373654795e-07, + "logits/chosen": -3.549588203430176, + "logits/rejected": -1.5921013355255127, + "logps/chosen": -10.396879196166992, + "logps/rejected": -15.234865188598633, + "loss": 0.1351, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5600487589836121, + "rewards/margins": 3.305903911590576, + "rewards/rejected": -2.7458550930023193, + "step": 424 + }, + { + "epoch": 7.203389830508475, + "grad_norm": 9.812274829008038, + "learning_rate": 4.0379878781401046e-07, + "logits/chosen": 0.6426932215690613, + "logits/rejected": 0.6157368421554565, + "logps/chosen": -8.68526554107666, + "logps/rejected": -20.45929527282715, + "loss": 0.1443, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32862192392349243, + "rewards/margins": 3.3139867782592773, + "rewards/rejected": -2.9853649139404297, + "step": 425 + }, + { + "epoch": 7.220338983050848, + "grad_norm": 10.179065073117089, + "learning_rate": 4.0321507602086836e-07, + "logits/chosen": -0.9165471196174622, + "logits/rejected": 1.5311022996902466, + "logps/chosen": -10.904990196228027, + "logps/rejected": -20.312908172607422, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22916960716247559, + "rewards/margins": 3.8574917316436768, + "rewards/rejected": -3.6283223628997803, + "step": 426 + }, + { + "epoch": 7.237288135593221, + "grad_norm": 9.70945311905393, + "learning_rate": 4.026300234650979e-07, + "logits/chosen": -4.223483085632324, + "logits/rejected": -4.719742774963379, + "logps/chosen": -10.978636741638184, + "logps/rejected": -19.935565948486328, + "loss": 0.1295, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6995031237602234, + "rewards/margins": 2.482172727584839, + "rewards/rejected": -1.7826696634292603, + "step": 427 + }, + { + "epoch": 7.254237288135593, + "grad_norm": 9.565067688440125, + "learning_rate": 4.020436352664079e-07, + "logits/chosen": -1.7467882633209229, + "logits/rejected": 0.7342086434364319, + "logps/chosen": -9.958053588867188, + "logps/rejected": -21.70626449584961, + "loss": 0.1375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24878710508346558, + "rewards/margins": 4.359116554260254, + "rewards/rejected": -4.110329627990723, + "step": 428 + }, + { + "epoch": 7.271186440677966, + "grad_norm": 9.60457205625089, + "learning_rate": 4.014559165561956e-07, + "logits/chosen": 2.2311694622039795, + "logits/rejected": 7.812325954437256, + "logps/chosen": -9.30516242980957, + "logps/rejected": -18.18703269958496, + "loss": 0.1467, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2606554925441742, + "rewards/margins": 3.457913398742676, + "rewards/rejected": -3.1972575187683105, + "step": 429 + }, + { + "epoch": 7.288135593220339, + "grad_norm": 9.82463351889948, + "learning_rate": 4.0086687247750095e-07, + "logits/chosen": -3.501877546310425, + "logits/rejected": -3.8577065467834473, + "logps/chosen": -7.8808465003967285, + "logps/rejected": -15.253175735473633, + "loss": 0.1347, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4603765904903412, + "rewards/margins": 2.3972482681274414, + "rewards/rejected": -1.9368716478347778, + "step": 430 + }, + { + "epoch": 7.305084745762712, + "grad_norm": 9.784793133340768, + "learning_rate": 4.0027650818496226e-07, + "logits/chosen": 5.14243745803833, + "logits/rejected": 3.4985580444335938, + "logps/chosen": -10.001608848571777, + "logps/rejected": -26.74629020690918, + "loss": 0.1407, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08906976878643036, + "rewards/margins": 4.575994968414307, + "rewards/rejected": -4.665064811706543, + "step": 431 + }, + { + "epoch": 7.322033898305085, + "grad_norm": 10.071653241978936, + "learning_rate": 3.996848288447707e-07, + "logits/chosen": -4.888203144073486, + "logits/rejected": -4.589768886566162, + "logps/chosen": -7.157283782958984, + "logps/rejected": -17.219411849975586, + "loss": 0.1546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3107476532459259, + "rewards/margins": 2.8994295597076416, + "rewards/rejected": -2.588681936264038, + "step": 432 + }, + { + "epoch": 7.338983050847458, + "grad_norm": 10.22451359355856, + "learning_rate": 3.9909183963462536e-07, + "logits/chosen": -1.3133256435394287, + "logits/rejected": 0.9133470058441162, + "logps/chosen": -13.30894660949707, + "logps/rejected": -25.437026977539062, + "loss": 0.1597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3724314570426941, + "rewards/margins": 4.273044586181641, + "rewards/rejected": -3.900613307952881, + "step": 433 + }, + { + "epoch": 7.3559322033898304, + "grad_norm": 10.254114217242893, + "learning_rate": 3.984975457436876e-07, + "logits/chosen": -1.5063201189041138, + "logits/rejected": 0.4495699405670166, + "logps/chosen": -6.538341999053955, + "logps/rejected": -13.631461143493652, + "loss": 0.1589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3549283444881439, + "rewards/margins": 2.530862808227539, + "rewards/rejected": -2.1759345531463623, + "step": 434 + }, + { + "epoch": 7.372881355932203, + "grad_norm": 9.787862344597166, + "learning_rate": 3.979019523725361e-07, + "logits/chosen": -0.7558501958847046, + "logits/rejected": 1.1639087200164795, + "logps/chosen": -9.700092315673828, + "logps/rejected": -13.182903289794922, + "loss": 0.1509, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4032173752784729, + "rewards/margins": 2.2667555809020996, + "rewards/rejected": -1.8635382652282715, + "step": 435 + }, + { + "epoch": 7.389830508474576, + "grad_norm": 11.242250708423146, + "learning_rate": 3.973050647331209e-07, + "logits/chosen": -0.858704149723053, + "logits/rejected": -0.11889542639255524, + "logps/chosen": -11.472419738769531, + "logps/rejected": -20.94268798828125, + "loss": 0.1546, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4044639766216278, + "rewards/margins": 3.0765256881713867, + "rewards/rejected": -2.6720614433288574, + "step": 436 + }, + { + "epoch": 7.406779661016949, + "grad_norm": 10.476314866762824, + "learning_rate": 3.967068880487181e-07, + "logits/chosen": -1.5288808345794678, + "logits/rejected": -1.7149463891983032, + "logps/chosen": -10.024654388427734, + "logps/rejected": -21.697467803955078, + "loss": 0.1652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.524260401725769, + "rewards/margins": 4.019008636474609, + "rewards/rejected": -3.494748592376709, + "step": 437 + }, + { + "epoch": 7.423728813559322, + "grad_norm": 10.970528134679885, + "learning_rate": 3.9610742755388406e-07, + "logits/chosen": -2.111297369003296, + "logits/rejected": 0.004118561744689941, + "logps/chosen": -7.667480945587158, + "logps/rejected": -13.297545433044434, + "loss": 0.1659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4699084162712097, + "rewards/margins": 2.736049175262451, + "rewards/rejected": -2.2661406993865967, + "step": 438 + }, + { + "epoch": 7.440677966101695, + "grad_norm": 9.842004888488471, + "learning_rate": 3.955066884944094e-07, + "logits/chosen": 0.6597349643707275, + "logits/rejected": -0.04374626278877258, + "logps/chosen": -11.741231918334961, + "logps/rejected": -21.247833251953125, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4969398081302643, + "rewards/margins": 3.5522348880767822, + "rewards/rejected": -3.055295467376709, + "step": 439 + }, + { + "epoch": 7.4576271186440675, + "grad_norm": 8.972971902148998, + "learning_rate": 3.949046761272735e-07, + "logits/chosen": 2.3338708877563477, + "logits/rejected": 2.589047908782959, + "logps/chosen": -5.845430850982666, + "logps/rejected": -9.895733833312988, + "loss": 0.1276, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6523416042327881, + "rewards/margins": 1.911478042602539, + "rewards/rejected": -1.2591363191604614, + "step": 440 + }, + { + "epoch": 7.47457627118644, + "grad_norm": 10.066026413188887, + "learning_rate": 3.9430139572059815e-07, + "logits/chosen": -1.8239307403564453, + "logits/rejected": -2.2824392318725586, + "logps/chosen": -9.65200424194336, + "logps/rejected": -22.091182708740234, + "loss": 0.1518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40659016370773315, + "rewards/margins": 3.9431817531585693, + "rewards/rejected": -3.5365915298461914, + "step": 441 + }, + { + "epoch": 7.491525423728813, + "grad_norm": 10.09896085548317, + "learning_rate": 3.9369685255360173e-07, + "logits/chosen": 1.6324787139892578, + "logits/rejected": -0.07427901029586792, + "logps/chosen": -8.006882667541504, + "logps/rejected": -11.395492553710938, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6132466197013855, + "rewards/margins": 2.4093189239501953, + "rewards/rejected": -1.796072244644165, + "step": 442 + }, + { + "epoch": 7.508474576271187, + "grad_norm": 10.035637606591012, + "learning_rate": 3.9309105191655247e-07, + "logits/chosen": -2.1883177757263184, + "logits/rejected": -1.8495557308197021, + "logps/chosen": -9.062346458435059, + "logps/rejected": -20.01108741760254, + "loss": 0.1561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11416606605052948, + "rewards/margins": 3.4943981170654297, + "rewards/rejected": -3.380232334136963, + "step": 443 + }, + { + "epoch": 7.52542372881356, + "grad_norm": 10.206926858110808, + "learning_rate": 3.924839991107229e-07, + "logits/chosen": -1.1415421962738037, + "logits/rejected": 0.6667934656143188, + "logps/chosen": -10.95050048828125, + "logps/rejected": -24.7608642578125, + "loss": 0.1442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4104726314544678, + "rewards/margins": 4.508932113647461, + "rewards/rejected": -4.098459720611572, + "step": 444 + }, + { + "epoch": 7.5423728813559325, + "grad_norm": 9.72571019158722, + "learning_rate": 3.918756994483429e-07, + "logits/chosen": 2.068418025970459, + "logits/rejected": 4.546534061431885, + "logps/chosen": -7.813698768615723, + "logps/rejected": -14.570735931396484, + "loss": 0.1401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4646166265010834, + "rewards/margins": 2.6729631423950195, + "rewards/rejected": -2.2083466053009033, + "step": 445 + }, + { + "epoch": 7.559322033898305, + "grad_norm": 10.55354685964242, + "learning_rate": 3.912661582525536e-07, + "logits/chosen": -3.7456912994384766, + "logits/rejected": -1.1216098070144653, + "logps/chosen": -10.544078826904297, + "logps/rejected": -15.885772705078125, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6655651330947876, + "rewards/margins": 2.157383680343628, + "rewards/rejected": -1.4918183088302612, + "step": 446 + }, + { + "epoch": 7.576271186440678, + "grad_norm": 8.874108679556333, + "learning_rate": 3.906553808573604e-07, + "logits/chosen": -1.0291190147399902, + "logits/rejected": 0.6091427803039551, + "logps/chosen": -9.206077575683594, + "logps/rejected": -21.849958419799805, + "loss": 0.1418, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3792378604412079, + "rewards/margins": 4.100043296813965, + "rewards/rejected": -3.7208056449890137, + "step": 447 + }, + { + "epoch": 7.593220338983051, + "grad_norm": 9.89378514039557, + "learning_rate": 3.9004337260758644e-07, + "logits/chosen": 1.9020898342132568, + "logits/rejected": 0.8474722504615784, + "logps/chosen": -9.850662231445312, + "logps/rejected": -23.219379425048828, + "loss": 0.1377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36318060755729675, + "rewards/margins": 3.600226640701294, + "rewards/rejected": -3.237046003341675, + "step": 448 + }, + { + "epoch": 7.610169491525424, + "grad_norm": 10.865068959798018, + "learning_rate": 3.894301388588264e-07, + "logits/chosen": -0.9812300801277161, + "logits/rejected": 0.4254589080810547, + "logps/chosen": -10.075271606445312, + "logps/rejected": -14.64460563659668, + "loss": 0.1611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38537198305130005, + "rewards/margins": 2.6098413467407227, + "rewards/rejected": -2.2244694232940674, + "step": 449 + }, + { + "epoch": 7.627118644067797, + "grad_norm": 10.555586251241943, + "learning_rate": 3.888156849773985e-07, + "logits/chosen": -1.378682017326355, + "logits/rejected": 3.440859794616699, + "logps/chosen": -8.580948829650879, + "logps/rejected": -17.025653839111328, + "loss": 0.1538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21920417249202728, + "rewards/margins": 3.138881206512451, + "rewards/rejected": -2.919677257537842, + "step": 450 + }, + { + "epoch": 7.6440677966101696, + "grad_norm": 10.209560205921942, + "learning_rate": 3.882000163402983e-07, + "logits/chosen": 2.072082757949829, + "logits/rejected": 3.475949764251709, + "logps/chosen": -10.055817604064941, + "logps/rejected": -17.178794860839844, + "loss": 0.146, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36519867181777954, + "rewards/margins": 3.059540271759033, + "rewards/rejected": -2.6943416595458984, + "step": 451 + }, + { + "epoch": 7.661016949152542, + "grad_norm": 10.663402608136824, + "learning_rate": 3.8758313833515186e-07, + "logits/chosen": 0.2695200443267822, + "logits/rejected": 0.7485695481300354, + "logps/chosen": -10.24669075012207, + "logps/rejected": -22.458515167236328, + "loss": 0.1532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.023169204592704773, + "rewards/margins": 4.44655704498291, + "rewards/rejected": -4.469725608825684, + "step": 452 + }, + { + "epoch": 7.677966101694915, + "grad_norm": 10.663452691065979, + "learning_rate": 3.86965056360168e-07, + "logits/chosen": 0.023821651935577393, + "logits/rejected": 0.6948651075363159, + "logps/chosen": -7.995731353759766, + "logps/rejected": -17.48453140258789, + "loss": 0.1685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.333486407995224, + "rewards/margins": 3.026156425476074, + "rewards/rejected": -2.6926703453063965, + "step": 453 + }, + { + "epoch": 7.694915254237288, + "grad_norm": 10.29075290611524, + "learning_rate": 3.8634577582409115e-07, + "logits/chosen": 2.8640151023864746, + "logits/rejected": 0.4052382707595825, + "logps/chosen": -5.422905921936035, + "logps/rejected": -15.552241325378418, + "loss": 0.1561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.40284836292266846, + "rewards/margins": 2.800489664077759, + "rewards/rejected": -2.397641181945801, + "step": 454 + }, + { + "epoch": 7.711864406779661, + "grad_norm": 8.656662324471474, + "learning_rate": 3.857253021461545e-07, + "logits/chosen": -2.2958803176879883, + "logits/rejected": 0.16803821921348572, + "logps/chosen": -8.433700561523438, + "logps/rejected": -16.503564834594727, + "loss": 0.1447, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6839694380760193, + "rewards/margins": 3.1981723308563232, + "rewards/rejected": -2.514202833175659, + "step": 455 + }, + { + "epoch": 7.728813559322034, + "grad_norm": 9.015380165768256, + "learning_rate": 3.8510364075603185e-07, + "logits/chosen": 0.7340269684791565, + "logits/rejected": 2.136843681335449, + "logps/chosen": -8.46870231628418, + "logps/rejected": -23.236953735351562, + "loss": 0.1216, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07831863313913345, + "rewards/margins": 4.905647277832031, + "rewards/rejected": -4.983965873718262, + "step": 456 + }, + { + "epoch": 7.745762711864407, + "grad_norm": 11.048345802234925, + "learning_rate": 3.84480797093791e-07, + "logits/chosen": -0.974209189414978, + "logits/rejected": -1.9440693855285645, + "logps/chosen": -7.408576488494873, + "logps/rejected": -13.765433311462402, + "loss": 0.1279, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5464452505111694, + "rewards/margins": 2.7349631786346436, + "rewards/rejected": -2.1885178089141846, + "step": 457 + }, + { + "epoch": 7.762711864406779, + "grad_norm": 9.33901404478676, + "learning_rate": 3.8385677660984514e-07, + "logits/chosen": 2.3597500324249268, + "logits/rejected": 2.794603109359741, + "logps/chosen": -10.939367294311523, + "logps/rejected": -29.22560691833496, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06992827355861664, + "rewards/margins": 5.073367118835449, + "rewards/rejected": -5.003438472747803, + "step": 458 + }, + { + "epoch": 7.779661016949152, + "grad_norm": 10.001912701586786, + "learning_rate": 3.83231584764906e-07, + "logits/chosen": -6.860588550567627, + "logits/rejected": -1.7909083366394043, + "logps/chosen": -11.336816787719727, + "logps/rejected": -23.062660217285156, + "loss": 0.1521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20468257367610931, + "rewards/margins": 3.9968860149383545, + "rewards/rejected": -3.792203426361084, + "step": 459 + }, + { + "epoch": 7.796610169491525, + "grad_norm": 8.988574047583393, + "learning_rate": 3.826052270299356e-07, + "logits/chosen": -1.0738238096237183, + "logits/rejected": 0.2171797752380371, + "logps/chosen": -8.178009033203125, + "logps/rejected": -14.752239227294922, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6255313754081726, + "rewards/margins": 3.040125846862793, + "rewards/rejected": -2.4145946502685547, + "step": 460 + }, + { + "epoch": 7.813559322033898, + "grad_norm": 9.050244068170231, + "learning_rate": 3.8197770888609846e-07, + "logits/chosen": 0.6120907664299011, + "logits/rejected": 2.2083263397216797, + "logps/chosen": -9.618986129760742, + "logps/rejected": -16.45279312133789, + "loss": 0.1183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29597193002700806, + "rewards/margins": 3.233238935470581, + "rewards/rejected": -2.937267303466797, + "step": 461 + }, + { + "epoch": 7.830508474576272, + "grad_norm": 9.875563292935468, + "learning_rate": 3.813490358247137e-07, + "logits/chosen": -0.11616599559783936, + "logits/rejected": -3.459362030029297, + "logps/chosen": -8.407455444335938, + "logps/rejected": -23.708166122436523, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33320125937461853, + "rewards/margins": 3.724797248840332, + "rewards/rejected": -3.3915958404541016, + "step": 462 + }, + { + "epoch": 7.847457627118644, + "grad_norm": 8.566033021813828, + "learning_rate": 3.807192133472069e-07, + "logits/chosen": 0.0005160421133041382, + "logits/rejected": -0.07690905034542084, + "logps/chosen": -9.008073806762695, + "logps/rejected": -22.778635025024414, + "loss": 0.1055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08228336274623871, + "rewards/margins": 4.054976463317871, + "rewards/rejected": -4.1372599601745605, + "step": 463 + }, + { + "epoch": 7.864406779661017, + "grad_norm": 9.207368732490258, + "learning_rate": 3.80088246965062e-07, + "logits/chosen": 0.7204670906066895, + "logits/rejected": 0.8215815424919128, + "logps/chosen": -7.013503551483154, + "logps/rejected": -18.432910919189453, + "loss": 0.1173, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24773481488227844, + "rewards/margins": 3.5829970836639404, + "rewards/rejected": -3.3352622985839844, + "step": 464 + }, + { + "epoch": 7.88135593220339, + "grad_norm": 10.374440714970705, + "learning_rate": 3.794561421997734e-07, + "logits/chosen": -1.2234055995941162, + "logits/rejected": 0.8710986375808716, + "logps/chosen": -11.177177429199219, + "logps/rejected": -19.628103256225586, + "loss": 0.1405, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19700887799263, + "rewards/margins": 2.63451886177063, + "rewards/rejected": -2.437509775161743, + "step": 465 + }, + { + "epoch": 7.898305084745763, + "grad_norm": 10.507765784627404, + "learning_rate": 3.78822904582797e-07, + "logits/chosen": -4.167044639587402, + "logits/rejected": -2.423272132873535, + "logps/chosen": -10.025394439697266, + "logps/rejected": -18.540422439575195, + "loss": 0.1409, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6198136806488037, + "rewards/margins": 3.351442813873291, + "rewards/rejected": -2.7316291332244873, + "step": 466 + }, + { + "epoch": 7.915254237288136, + "grad_norm": 10.278415787981471, + "learning_rate": 3.781885396555019e-07, + "logits/chosen": -0.4185442328453064, + "logits/rejected": 0.9195470809936523, + "logps/chosen": -7.53629207611084, + "logps/rejected": -18.34429931640625, + "loss": 0.1598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3725518584251404, + "rewards/margins": 2.788588047027588, + "rewards/rejected": -2.4160361289978027, + "step": 467 + }, + { + "epoch": 7.932203389830509, + "grad_norm": 9.472880803747143, + "learning_rate": 3.775530529691227e-07, + "logits/chosen": 0.5384914875030518, + "logits/rejected": -0.05707889795303345, + "logps/chosen": -5.18881368637085, + "logps/rejected": -16.295425415039062, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3952220678329468, + "rewards/margins": 3.6583235263824463, + "rewards/rejected": -3.263101816177368, + "step": 468 + }, + { + "epoch": 7.9491525423728815, + "grad_norm": 10.770547462530029, + "learning_rate": 3.7691645008470997e-07, + "logits/chosen": -0.21460148692131042, + "logits/rejected": -1.5379085540771484, + "logps/chosen": -10.350830078125, + "logps/rejected": -23.51275062561035, + "loss": 0.1776, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22241616249084473, + "rewards/margins": 3.3379037380218506, + "rewards/rejected": -3.115487813949585, + "step": 469 + }, + { + "epoch": 7.966101694915254, + "grad_norm": 9.808497051053497, + "learning_rate": 3.7627873657308206e-07, + "logits/chosen": 0.3349153995513916, + "logits/rejected": 0.8287909030914307, + "logps/chosen": -7.198885440826416, + "logps/rejected": -20.29372787475586, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5376969575881958, + "rewards/margins": 3.864384174346924, + "rewards/rejected": -3.3266870975494385, + "step": 470 + }, + { + "epoch": 7.983050847457627, + "grad_norm": 9.697572526938723, + "learning_rate": 3.7563991801477624e-07, + "logits/chosen": -1.9151415824890137, + "logits/rejected": 0.3530935049057007, + "logps/chosen": -11.494583129882812, + "logps/rejected": -14.665804862976074, + "loss": 0.1399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.44558390974998474, + "rewards/margins": 2.9561033248901367, + "rewards/rejected": -2.51051926612854, + "step": 471 + }, + { + "epoch": 8.0, + "grad_norm": 9.96471014960958, + "learning_rate": 3.75e-07, + "logits/chosen": -2.6446385383605957, + "logits/rejected": 0.2764410376548767, + "logps/chosen": -8.184099197387695, + "logps/rejected": -14.341254234313965, + "loss": 0.1472, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7961742877960205, + "rewards/margins": 3.1740105152130127, + "rewards/rejected": -2.3778364658355713, + "step": 472 + }, + { + "epoch": 8.016949152542374, + "grad_norm": 9.307834825130646, + "learning_rate": 3.743589881285818e-07, + "logits/chosen": -2.628649950027466, + "logits/rejected": -2.251711368560791, + "logps/chosen": -11.012161254882812, + "logps/rejected": -15.441204071044922, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37242552638053894, + "rewards/margins": 2.7831168174743652, + "rewards/rejected": -2.410691022872925, + "step": 473 + }, + { + "epoch": 8.033898305084746, + "grad_norm": 9.015474057721343, + "learning_rate": 3.737168880099223e-07, + "logits/chosen": 3.7287445068359375, + "logits/rejected": 5.9911274909973145, + "logps/chosen": -13.733976364135742, + "logps/rejected": -18.655841827392578, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7684003114700317, + "rewards/margins": 2.949385643005371, + "rewards/rejected": -2.18098521232605, + "step": 474 + }, + { + "epoch": 8.05084745762712, + "grad_norm": 8.718508664791036, + "learning_rate": 3.7307370526294553e-07, + "logits/chosen": -0.2663005590438843, + "logits/rejected": 3.8709819316864014, + "logps/chosen": -12.367705345153809, + "logps/rejected": -19.1514835357666, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8267220854759216, + "rewards/margins": 3.8209853172302246, + "rewards/rejected": -2.994263172149658, + "step": 475 + }, + { + "epoch": 8.067796610169491, + "grad_norm": 8.867026862703991, + "learning_rate": 3.724294455160491e-07, + "logits/chosen": 1.3887877464294434, + "logits/rejected": 1.3702775239944458, + "logps/chosen": -10.322896957397461, + "logps/rejected": -20.81346893310547, + "loss": 0.12, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.36723631620407104, + "rewards/margins": 3.4072937965393066, + "rewards/rejected": -3.040057420730591, + "step": 476 + }, + { + "epoch": 8.084745762711865, + "grad_norm": 8.620420643542406, + "learning_rate": 3.7178411440705556e-07, + "logits/chosen": 1.245548129081726, + "logits/rejected": 2.1392176151275635, + "logps/chosen": -9.19916820526123, + "logps/rejected": -18.40573501586914, + "loss": 0.1326, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.26294079422950745, + "rewards/margins": 3.6965670585632324, + "rewards/rejected": -3.433626651763916, + "step": 477 + }, + { + "epoch": 8.101694915254237, + "grad_norm": 8.485802757673804, + "learning_rate": 3.7113771758316255e-07, + "logits/chosen": -1.3473069667816162, + "logits/rejected": -0.09861618280410767, + "logps/chosen": -10.57664966583252, + "logps/rejected": -15.270515441894531, + "loss": 0.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6301343441009521, + "rewards/margins": 2.7143757343292236, + "rewards/rejected": -2.0842413902282715, + "step": 478 + }, + { + "epoch": 8.11864406779661, + "grad_norm": 7.85197378030076, + "learning_rate": 3.704902607008938e-07, + "logits/chosen": -5.499878883361816, + "logits/rejected": -2.5568430423736572, + "logps/chosen": -12.366991996765137, + "logps/rejected": -19.026046752929688, + "loss": 0.1019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5573859214782715, + "rewards/margins": 3.0999138355255127, + "rewards/rejected": -2.5425281524658203, + "step": 479 + }, + { + "epoch": 8.135593220338983, + "grad_norm": 8.267677399685025, + "learning_rate": 3.698417494260494e-07, + "logits/chosen": -3.2886805534362793, + "logits/rejected": 1.1429243087768555, + "logps/chosen": -11.720975875854492, + "logps/rejected": -20.598499298095703, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3746921122074127, + "rewards/margins": 4.293571949005127, + "rewards/rejected": -3.918879985809326, + "step": 480 + }, + { + "epoch": 8.152542372881356, + "grad_norm": 9.251264900051627, + "learning_rate": 3.691921894336563e-07, + "logits/chosen": -2.1884472370147705, + "logits/rejected": 0.30788105726242065, + "logps/chosen": -8.096731185913086, + "logps/rejected": -19.28396987915039, + "loss": 0.1377, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2780647873878479, + "rewards/margins": 4.298841953277588, + "rewards/rejected": -4.020777225494385, + "step": 481 + }, + { + "epoch": 8.169491525423728, + "grad_norm": 8.218055554135557, + "learning_rate": 3.685415864079185e-07, + "logits/chosen": -0.6059857606887817, + "logits/rejected": 3.4607341289520264, + "logps/chosen": -11.999606132507324, + "logps/rejected": -26.547691345214844, + "loss": 0.1147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3711860775947571, + "rewards/margins": 4.486515998840332, + "rewards/rejected": -4.115329742431641, + "step": 482 + }, + { + "epoch": 8.186440677966102, + "grad_norm": 7.924489428934806, + "learning_rate": 3.6788994604216764e-07, + "logits/chosen": 1.4279998540878296, + "logits/rejected": 0.009126663208007812, + "logps/chosen": -8.281014442443848, + "logps/rejected": -22.807849884033203, + "loss": 0.0989, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12314409017562866, + "rewards/margins": 4.270468711853027, + "rewards/rejected": -4.393612861633301, + "step": 483 + }, + { + "epoch": 8.203389830508474, + "grad_norm": 7.961845338761622, + "learning_rate": 3.6723727403881275e-07, + "logits/chosen": -1.9475922584533691, + "logits/rejected": 1.0407593250274658, + "logps/chosen": -10.443235397338867, + "logps/rejected": -19.41378402709961, + "loss": 0.1175, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6596993207931519, + "rewards/margins": 3.718507766723633, + "rewards/rejected": -3.0588088035583496, + "step": 484 + }, + { + "epoch": 8.220338983050848, + "grad_norm": 9.857929233042386, + "learning_rate": 3.665835761092908e-07, + "logits/chosen": -6.954108238220215, + "logits/rejected": -5.27763032913208, + "logps/chosen": -10.771953582763672, + "logps/rejected": -15.511201858520508, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36188673973083496, + "rewards/margins": 2.4951171875, + "rewards/rejected": -2.133230209350586, + "step": 485 + }, + { + "epoch": 8.23728813559322, + "grad_norm": 8.273526335703513, + "learning_rate": 3.659288579740163e-07, + "logits/chosen": -0.05287922918796539, + "logits/rejected": 3.0080344676971436, + "logps/chosen": -13.40024185180664, + "logps/rejected": -19.83489418029785, + "loss": 0.1245, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5862030982971191, + "rewards/margins": 3.320868492126465, + "rewards/rejected": -2.7346653938293457, + "step": 486 + }, + { + "epoch": 8.254237288135593, + "grad_norm": 8.034704759700503, + "learning_rate": 3.6527312536233147e-07, + "logits/chosen": 3.8432722091674805, + "logits/rejected": 2.92391037940979, + "logps/chosen": -7.863999366760254, + "logps/rejected": -19.7979793548584, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3808861970901489, + "rewards/margins": 3.8671412467956543, + "rewards/rejected": -3.486255168914795, + "step": 487 + }, + { + "epoch": 8.271186440677965, + "grad_norm": 7.529231991013856, + "learning_rate": 3.646163840124561e-07, + "logits/chosen": -2.5350327491760254, + "logits/rejected": -4.587088584899902, + "logps/chosen": -9.05827522277832, + "logps/rejected": -19.514944076538086, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37517881393432617, + "rewards/margins": 3.4480714797973633, + "rewards/rejected": -3.072892904281616, + "step": 488 + }, + { + "epoch": 8.288135593220339, + "grad_norm": 8.139721923043574, + "learning_rate": 3.639586396714374e-07, + "logits/chosen": -2.1481618881225586, + "logits/rejected": -3.349088191986084, + "logps/chosen": -7.184938907623291, + "logps/rejected": -14.99942398071289, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43559348583221436, + "rewards/margins": 3.009117603302002, + "rewards/rejected": -2.573524236679077, + "step": 489 + }, + { + "epoch": 8.305084745762711, + "grad_norm": 8.643611452832333, + "learning_rate": 3.6329989809509933e-07, + "logits/chosen": -1.287747859954834, + "logits/rejected": 0.13855817914009094, + "logps/chosen": -8.292243957519531, + "logps/rejected": -19.404233932495117, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5273905396461487, + "rewards/margins": 4.194699764251709, + "rewards/rejected": -3.667309045791626, + "step": 490 + }, + { + "epoch": 8.322033898305085, + "grad_norm": 8.788616790979715, + "learning_rate": 3.626401650479927e-07, + "logits/chosen": -1.215428352355957, + "logits/rejected": -0.3887985944747925, + "logps/chosen": -7.66940975189209, + "logps/rejected": -18.244686126708984, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2789570093154907, + "rewards/margins": 3.3360958099365234, + "rewards/rejected": -3.057138681411743, + "step": 491 + }, + { + "epoch": 8.338983050847457, + "grad_norm": 8.939118589459243, + "learning_rate": 3.6197944630334465e-07, + "logits/chosen": -3.850637912750244, + "logits/rejected": -3.5334858894348145, + "logps/chosen": -8.223553657531738, + "logps/rejected": -18.250638961791992, + "loss": 0.131, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.587809145450592, + "rewards/margins": 3.762028694152832, + "rewards/rejected": -3.1742193698883057, + "step": 492 + }, + { + "epoch": 8.35593220338983, + "grad_norm": 8.69530410733376, + "learning_rate": 3.6131774764300785e-07, + "logits/chosen": 1.4184993505477905, + "logits/rejected": 1.3965762853622437, + "logps/chosen": -8.896749496459961, + "logps/rejected": -15.651692390441895, + "loss": 0.1241, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4817228317260742, + "rewards/margins": 2.8906168937683105, + "rewards/rejected": -2.4088940620422363, + "step": 493 + }, + { + "epoch": 8.372881355932204, + "grad_norm": 7.73443230414031, + "learning_rate": 3.6065507485741e-07, + "logits/chosen": -0.946702241897583, + "logits/rejected": 2.4696478843688965, + "logps/chosen": -10.482526779174805, + "logps/rejected": -19.93212127685547, + "loss": 0.1066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24369022250175476, + "rewards/margins": 3.84941029548645, + "rewards/rejected": -3.605720281600952, + "step": 494 + }, + { + "epoch": 8.389830508474576, + "grad_norm": 8.36862719416917, + "learning_rate": 3.5999143374550334e-07, + "logits/chosen": -2.797267198562622, + "logits/rejected": -0.19566676020622253, + "logps/chosen": -13.18895149230957, + "logps/rejected": -21.73252296447754, + "loss": 0.1216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17871427536010742, + "rewards/margins": 3.6249942779541016, + "rewards/rejected": -3.446279764175415, + "step": 495 + }, + { + "epoch": 8.40677966101695, + "grad_norm": 8.949802588720999, + "learning_rate": 3.593268301147139e-07, + "logits/chosen": -0.16709482669830322, + "logits/rejected": 1.0957790613174438, + "logps/chosen": -9.600016593933105, + "logps/rejected": -18.39324188232422, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3831973075866699, + "rewards/margins": 3.5357797145843506, + "rewards/rejected": -3.1525821685791016, + "step": 496 + }, + { + "epoch": 8.423728813559322, + "grad_norm": 8.700313769157882, + "learning_rate": 3.586612697808902e-07, + "logits/chosen": -3.6146655082702637, + "logits/rejected": -0.2656732201576233, + "logps/chosen": -9.611889839172363, + "logps/rejected": -17.33000373840332, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5966626405715942, + "rewards/margins": 3.35612416267395, + "rewards/rejected": -2.7594618797302246, + "step": 497 + }, + { + "epoch": 8.440677966101696, + "grad_norm": 8.226137318837338, + "learning_rate": 3.579947585682532e-07, + "logits/chosen": -0.020598918199539185, + "logits/rejected": -1.0154800415039062, + "logps/chosen": -8.14766788482666, + "logps/rejected": -22.971723556518555, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3941883444786072, + "rewards/margins": 4.393586158752441, + "rewards/rejected": -3.9993979930877686, + "step": 498 + }, + { + "epoch": 8.457627118644067, + "grad_norm": 8.87014354799229, + "learning_rate": 3.573273023093446e-07, + "logits/chosen": -1.4326260089874268, + "logits/rejected": 0.6970917582511902, + "logps/chosen": -11.785725593566895, + "logps/rejected": -22.613496780395508, + "loss": 0.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1566278487443924, + "rewards/margins": 4.172502040863037, + "rewards/rejected": -4.01587438583374, + "step": 499 + }, + { + "epoch": 8.474576271186441, + "grad_norm": 8.695490568604848, + "learning_rate": 3.5665890684497605e-07, + "logits/chosen": 3.693131685256958, + "logits/rejected": 2.3012924194335938, + "logps/chosen": -9.150883674621582, + "logps/rejected": -21.891143798828125, + "loss": 0.1067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2128208726644516, + "rewards/margins": 4.2375898361206055, + "rewards/rejected": -4.024769306182861, + "step": 500 + }, + { + "epoch": 8.491525423728813, + "grad_norm": 8.223688535482426, + "learning_rate": 3.559895780241781e-07, + "logits/chosen": -2.5774970054626465, + "logits/rejected": 2.790902614593506, + "logps/chosen": -13.14190673828125, + "logps/rejected": -14.023480415344238, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5792840719223022, + "rewards/margins": 2.0238969326019287, + "rewards/rejected": -1.444612741470337, + "step": 501 + }, + { + "epoch": 8.508474576271187, + "grad_norm": 8.31992146309658, + "learning_rate": 3.553193217041489e-07, + "logits/chosen": -2.0898971557617188, + "logits/rejected": -1.4773600101470947, + "logps/chosen": -10.491347312927246, + "logps/rejected": -17.18193244934082, + "loss": 0.1033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5015342831611633, + "rewards/margins": 3.777496337890625, + "rewards/rejected": -3.2759618759155273, + "step": 502 + }, + { + "epoch": 8.525423728813559, + "grad_norm": 8.792292957123891, + "learning_rate": 3.546481437502032e-07, + "logits/chosen": 0.26719558238983154, + "logits/rejected": -0.14614611864089966, + "logps/chosen": -9.920225143432617, + "logps/rejected": -20.05290412902832, + "loss": 0.1432, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27186596393585205, + "rewards/margins": 3.6169862747192383, + "rewards/rejected": -3.3451201915740967, + "step": 503 + }, + { + "epoch": 8.542372881355933, + "grad_norm": 8.522509964933947, + "learning_rate": 3.539760500357206e-07, + "logits/chosen": -2.289971351623535, + "logits/rejected": 0.8186124563217163, + "logps/chosen": -10.877229690551758, + "logps/rejected": -17.28212547302246, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41209787130355835, + "rewards/margins": 3.048102378845215, + "rewards/rejected": -2.6360044479370117, + "step": 504 + }, + { + "epoch": 8.559322033898304, + "grad_norm": 7.291393026249104, + "learning_rate": 3.533030464420945e-07, + "logits/chosen": -3.1184239387512207, + "logits/rejected": -2.1694648265838623, + "logps/chosen": -12.604846000671387, + "logps/rejected": -22.09140396118164, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24311965703964233, + "rewards/margins": 3.960993528366089, + "rewards/rejected": -3.71787428855896, + "step": 505 + }, + { + "epoch": 8.576271186440678, + "grad_norm": 8.644017276232843, + "learning_rate": 3.526291388586806e-07, + "logits/chosen": 0.4400590658187866, + "logits/rejected": -2.208275079727173, + "logps/chosen": -7.662543296813965, + "logps/rejected": -19.525211334228516, + "loss": 0.1154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34743475914001465, + "rewards/margins": 3.0827534198760986, + "rewards/rejected": -2.735318660736084, + "step": 506 + }, + { + "epoch": 8.59322033898305, + "grad_norm": 7.526044625787631, + "learning_rate": 3.5195433318274515e-07, + "logits/chosen": -1.0548756122589111, + "logits/rejected": -0.435749351978302, + "logps/chosen": -10.05634880065918, + "logps/rejected": -19.988792419433594, + "loss": 0.0953, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1322399377822876, + "rewards/margins": 3.569153308868408, + "rewards/rejected": -3.43691349029541, + "step": 507 + }, + { + "epoch": 8.610169491525424, + "grad_norm": 7.766458527111232, + "learning_rate": 3.5127863531941335e-07, + "logits/chosen": -1.3730721473693848, + "logits/rejected": -3.2859532833099365, + "logps/chosen": -8.660323143005371, + "logps/rejected": -23.785953521728516, + "loss": 0.0976, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35617661476135254, + "rewards/margins": 4.860879898071289, + "rewards/rejected": -4.504702568054199, + "step": 508 + }, + { + "epoch": 8.627118644067796, + "grad_norm": 9.345905694030758, + "learning_rate": 3.5060205118161816e-07, + "logits/chosen": -1.1814749240875244, + "logits/rejected": 2.160520553588867, + "logps/chosen": -12.414260864257812, + "logps/rejected": -21.517459869384766, + "loss": 0.1291, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5118588209152222, + "rewards/margins": 4.0089640617370605, + "rewards/rejected": -3.497105360031128, + "step": 509 + }, + { + "epoch": 8.64406779661017, + "grad_norm": 8.036367102131655, + "learning_rate": 3.49924586690048e-07, + "logits/chosen": -6.377943515777588, + "logits/rejected": -2.662942886352539, + "logps/chosen": -13.506998062133789, + "logps/rejected": -15.274138450622559, + "loss": 0.1062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6932135820388794, + "rewards/margins": 2.7833752632141113, + "rewards/rejected": -2.0901618003845215, + "step": 510 + }, + { + "epoch": 8.661016949152543, + "grad_norm": 8.350289239582875, + "learning_rate": 3.4924624777309504e-07, + "logits/chosen": -0.48674535751342773, + "logits/rejected": 0.6451427936553955, + "logps/chosen": -9.116971969604492, + "logps/rejected": -24.52935028076172, + "loss": 0.113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22862689197063446, + "rewards/margins": 4.943426132202148, + "rewards/rejected": -5.172053337097168, + "step": 511 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 8.998528825313201, + "learning_rate": 3.4856704036680355e-07, + "logits/chosen": 0.11914621293544769, + "logits/rejected": -0.580173909664154, + "logps/chosen": -8.883934020996094, + "logps/rejected": -19.471128463745117, + "loss": 0.1169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2940840423107147, + "rewards/margins": 2.7323853969573975, + "rewards/rejected": -2.4383013248443604, + "step": 512 + }, + { + "epoch": 8.694915254237289, + "grad_norm": 7.853420086841468, + "learning_rate": 3.4788697041481786e-07, + "logits/chosen": -0.5393266081809998, + "logits/rejected": -1.3182928562164307, + "logps/chosen": -7.222754001617432, + "logps/rejected": -24.662555694580078, + "loss": 0.1037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7391365766525269, + "rewards/margins": 4.932739734649658, + "rewards/rejected": -4.193603515625, + "step": 513 + }, + { + "epoch": 8.711864406779661, + "grad_norm": 9.392762891289523, + "learning_rate": 3.472060438683302e-07, + "logits/chosen": -2.0386834144592285, + "logits/rejected": 0.20640242099761963, + "logps/chosen": -13.925080299377441, + "logps/rejected": -24.007619857788086, + "loss": 0.1249, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1902267336845398, + "rewards/margins": 4.3349409103393555, + "rewards/rejected": -4.144713878631592, + "step": 514 + }, + { + "epoch": 8.728813559322035, + "grad_norm": 8.50842429830542, + "learning_rate": 3.4652426668602863e-07, + "logits/chosen": -1.551865577697754, + "logits/rejected": -1.284023642539978, + "logps/chosen": -7.325331687927246, + "logps/rejected": -18.12867546081543, + "loss": 0.1124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25790315866470337, + "rewards/margins": 4.056914329528809, + "rewards/rejected": -3.799011468887329, + "step": 515 + }, + { + "epoch": 8.745762711864407, + "grad_norm": 8.880551860604504, + "learning_rate": 3.4584164483404535e-07, + "logits/chosen": -7.2592034339904785, + "logits/rejected": -6.2020769119262695, + "logps/chosen": -5.269482135772705, + "logps/rejected": -12.893440246582031, + "loss": 0.1124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49459022283554077, + "rewards/margins": 3.262979030609131, + "rewards/rejected": -2.7683887481689453, + "step": 516 + }, + { + "epoch": 8.76271186440678, + "grad_norm": 8.4228523758051, + "learning_rate": 3.4515818428590393e-07, + "logits/chosen": -0.3235975503921509, + "logits/rejected": 4.60111665725708, + "logps/chosen": -8.66872501373291, + "logps/rejected": -18.84377098083496, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2904120087623596, + "rewards/margins": 3.9645678997039795, + "rewards/rejected": -3.6741552352905273, + "step": 517 + }, + { + "epoch": 8.779661016949152, + "grad_norm": 8.78305720155635, + "learning_rate": 3.444738910224671e-07, + "logits/chosen": -2.8099265098571777, + "logits/rejected": -2.0938467979431152, + "logps/chosen": -9.935127258300781, + "logps/rejected": -16.927433013916016, + "loss": 0.1176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39175349473953247, + "rewards/margins": 2.82077693939209, + "rewards/rejected": -2.429023265838623, + "step": 518 + }, + { + "epoch": 8.796610169491526, + "grad_norm": 9.039209454575285, + "learning_rate": 3.437887710318848e-07, + "logits/chosen": -2.7464828491210938, + "logits/rejected": -0.2714478373527527, + "logps/chosen": -8.532281875610352, + "logps/rejected": -18.432205200195312, + "loss": 0.1052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6497199535369873, + "rewards/margins": 3.2518179416656494, + "rewards/rejected": -2.602097511291504, + "step": 519 + }, + { + "epoch": 8.813559322033898, + "grad_norm": 9.059999115636574, + "learning_rate": 3.4310283030954146e-07, + "logits/chosen": -4.571805953979492, + "logits/rejected": 1.363074779510498, + "logps/chosen": -10.748517990112305, + "logps/rejected": -19.172378540039062, + "loss": 0.1253, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40731969475746155, + "rewards/margins": 3.988267660140991, + "rewards/rejected": -3.5809478759765625, + "step": 520 + }, + { + "epoch": 8.830508474576272, + "grad_norm": 8.9775888768164, + "learning_rate": 3.4241607485800363e-07, + "logits/chosen": 2.731873035430908, + "logits/rejected": 4.494401931762695, + "logps/chosen": -8.267045974731445, + "logps/rejected": -22.165498733520508, + "loss": 0.1322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.14054572582244873, + "rewards/margins": 4.468079090118408, + "rewards/rejected": -4.327533721923828, + "step": 521 + }, + { + "epoch": 8.847457627118644, + "grad_norm": 8.025227471056242, + "learning_rate": 3.417285106869673e-07, + "logits/chosen": -1.5573604106903076, + "logits/rejected": -1.8264210224151611, + "logps/chosen": -11.341778755187988, + "logps/rejected": -19.851980209350586, + "loss": 0.1038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5154696702957153, + "rewards/margins": 3.7622079849243164, + "rewards/rejected": -3.2467384338378906, + "step": 522 + }, + { + "epoch": 8.864406779661017, + "grad_norm": 9.101302965192287, + "learning_rate": 3.4104014381320555e-07, + "logits/chosen": 0.13104411959648132, + "logits/rejected": -0.2756563723087311, + "logps/chosen": -10.39586067199707, + "logps/rejected": -18.803264617919922, + "loss": 0.1399, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21374063193798065, + "rewards/margins": 3.2453413009643555, + "rewards/rejected": -3.0316004753112793, + "step": 523 + }, + { + "epoch": 8.88135593220339, + "grad_norm": 9.570891193025966, + "learning_rate": 3.403509802605159e-07, + "logits/chosen": 1.3925392627716064, + "logits/rejected": 0.888724148273468, + "logps/chosen": -8.070535659790039, + "logps/rejected": -20.617847442626953, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.059673890471458435, + "rewards/margins": 4.375804901123047, + "rewards/rejected": -4.435479164123535, + "step": 524 + }, + { + "epoch": 8.898305084745763, + "grad_norm": 8.31602132944693, + "learning_rate": 3.396610260596673e-07, + "logits/chosen": -0.9920660257339478, + "logits/rejected": 1.4377026557922363, + "logps/chosen": -12.001575469970703, + "logps/rejected": -24.17807388305664, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.287417471408844, + "rewards/margins": 3.8803770542144775, + "rewards/rejected": -3.5929596424102783, + "step": 525 + }, + { + "epoch": 8.915254237288135, + "grad_norm": 9.29327403297671, + "learning_rate": 3.389702872483477e-07, + "logits/chosen": -5.677180290222168, + "logits/rejected": -4.914144515991211, + "logps/chosen": -8.847039222717285, + "logps/rejected": -15.790771484375, + "loss": 0.1361, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6716170907020569, + "rewards/margins": 2.9184117317199707, + "rewards/rejected": -2.2467947006225586, + "step": 526 + }, + { + "epoch": 8.932203389830509, + "grad_norm": 7.609548901847679, + "learning_rate": 3.38278769871111e-07, + "logits/chosen": -3.445300579071045, + "logits/rejected": -0.7922675013542175, + "logps/chosen": -8.873404502868652, + "logps/rejected": -18.53123664855957, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34169894456863403, + "rewards/margins": 3.791015386581421, + "rewards/rejected": -3.4493167400360107, + "step": 527 + }, + { + "epoch": 8.94915254237288, + "grad_norm": 8.765971983118401, + "learning_rate": 3.375864799793242e-07, + "logits/chosen": -2.4345242977142334, + "logits/rejected": -0.5172919034957886, + "logps/chosen": -9.083272933959961, + "logps/rejected": -16.6153564453125, + "loss": 0.1149, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4702245891094208, + "rewards/margins": 3.8038463592529297, + "rewards/rejected": -3.3336217403411865, + "step": 528 + }, + { + "epoch": 8.966101694915254, + "grad_norm": 9.413770924733209, + "learning_rate": 3.368934236311143e-07, + "logits/chosen": 0.7276126742362976, + "logits/rejected": 0.228562593460083, + "logps/chosen": -11.080302238464355, + "logps/rejected": -17.104206085205078, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5077924728393555, + "rewards/margins": 2.7850677967071533, + "rewards/rejected": -2.2772750854492188, + "step": 529 + }, + { + "epoch": 8.983050847457626, + "grad_norm": 8.323751589388499, + "learning_rate": 3.361996068913159e-07, + "logits/chosen": -4.382465839385986, + "logits/rejected": -1.4773523807525635, + "logps/chosen": -8.98313045501709, + "logps/rejected": -20.052814483642578, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1586902141571045, + "rewards/margins": 4.111644744873047, + "rewards/rejected": -3.9529545307159424, + "step": 530 + }, + { + "epoch": 9.0, + "grad_norm": 7.938869997211871, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -3.1977574825286865, + "logits/rejected": -1.4724467992782593, + "logps/chosen": -8.792470932006836, + "logps/rejected": -23.886981964111328, + "loss": 0.103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.39109301567077637, + "rewards/margins": 5.085795879364014, + "rewards/rejected": -4.6947021484375, + "step": 531 + }, + { + "epoch": 9.016949152542374, + "grad_norm": 8.223351028454932, + "learning_rate": 3.348097165295075e-07, + "logits/chosen": -3.76702880859375, + "logits/rejected": -3.2690422534942627, + "logps/chosen": -10.75297737121582, + "logps/rejected": -22.901792526245117, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2717806398868561, + "rewards/margins": 4.02412223815918, + "rewards/rejected": -4.295902729034424, + "step": 532 + }, + { + "epoch": 9.033898305084746, + "grad_norm": 7.03793187410003, + "learning_rate": 3.341136550702241e-07, + "logits/chosen": -2.2170581817626953, + "logits/rejected": -2.0040411949157715, + "logps/chosen": -11.038703918457031, + "logps/rejected": -21.323108673095703, + "loss": 0.0931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13433638215065002, + "rewards/margins": 3.86452317237854, + "rewards/rejected": -3.730186700820923, + "step": 533 + }, + { + "epoch": 9.05084745762712, + "grad_norm": 7.602772575996602, + "learning_rate": 3.334168575446985e-07, + "logits/chosen": -5.060166835784912, + "logits/rejected": -4.465640544891357, + "logps/chosen": -11.088794708251953, + "logps/rejected": -18.969402313232422, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3551236391067505, + "rewards/margins": 3.3989412784576416, + "rewards/rejected": -3.0438175201416016, + "step": 534 + }, + { + "epoch": 9.067796610169491, + "grad_norm": 6.440621371807224, + "learning_rate": 3.327193300505035e-07, + "logits/chosen": -0.4635174870491028, + "logits/rejected": -2.2793354988098145, + "logps/chosen": -9.368732452392578, + "logps/rejected": -23.782150268554688, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38101768493652344, + "rewards/margins": 3.7073676586151123, + "rewards/rejected": -3.326350212097168, + "step": 535 + }, + { + "epoch": 9.084745762711865, + "grad_norm": 7.385659302243604, + "learning_rate": 3.3202107869159967e-07, + "logits/chosen": -5.72601842880249, + "logits/rejected": -2.5037083625793457, + "logps/chosen": -10.806863784790039, + "logps/rejected": -22.083637237548828, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5078091621398926, + "rewards/margins": 3.9449386596679688, + "rewards/rejected": -3.4371299743652344, + "step": 536 + }, + { + "epoch": 9.101694915254237, + "grad_norm": 7.252768835140123, + "learning_rate": 3.313221095782822e-07, + "logits/chosen": -4.031281471252441, + "logits/rejected": -0.6388194561004639, + "logps/chosen": -11.841888427734375, + "logps/rejected": -23.276737213134766, + "loss": 0.1023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25421464443206787, + "rewards/margins": 3.5815634727478027, + "rewards/rejected": -3.3273487091064453, + "step": 537 + }, + { + "epoch": 9.11864406779661, + "grad_norm": 7.284892827250599, + "learning_rate": 3.306224288271272e-07, + "logits/chosen": -1.3736153841018677, + "logits/rejected": -0.9906230568885803, + "logps/chosen": -6.747933387756348, + "logps/rejected": -14.778159141540527, + "loss": 0.0871, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5323705673217773, + "rewards/margins": 3.7420241832733154, + "rewards/rejected": -3.20965313911438, + "step": 538 + }, + { + "epoch": 9.135593220338983, + "grad_norm": 8.256668503021041, + "learning_rate": 3.2992204256093807e-07, + "logits/chosen": -6.387026786804199, + "logits/rejected": -6.708343505859375, + "logps/chosen": -9.023993492126465, + "logps/rejected": -20.56617546081543, + "loss": 0.1127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5091872215270996, + "rewards/margins": 4.272690296173096, + "rewards/rejected": -3.763503313064575, + "step": 539 + }, + { + "epoch": 9.152542372881356, + "grad_norm": 7.188798518332266, + "learning_rate": 3.2922095690869224e-07, + "logits/chosen": -3.261608362197876, + "logits/rejected": -1.6736987829208374, + "logps/chosen": -7.519070148468018, + "logps/rejected": -19.2508487701416, + "loss": 0.0863, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38497814536094666, + "rewards/margins": 4.242589950561523, + "rewards/rejected": -3.857612133026123, + "step": 540 + }, + { + "epoch": 9.169491525423728, + "grad_norm": 7.53021595804872, + "learning_rate": 3.2851917800548725e-07, + "logits/chosen": -2.0664360523223877, + "logits/rejected": -2.3352723121643066, + "logps/chosen": -11.097540855407715, + "logps/rejected": -26.991085052490234, + "loss": 0.1013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5812000036239624, + "rewards/margins": 5.354377746582031, + "rewards/rejected": -4.773178577423096, + "step": 541 + }, + { + "epoch": 9.186440677966102, + "grad_norm": 9.484835899086871, + "learning_rate": 3.278167119924871e-07, + "logits/chosen": -3.301103353500366, + "logits/rejected": -1.0746486186981201, + "logps/chosen": -10.791460990905762, + "logps/rejected": -16.7052059173584, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34394583106040955, + "rewards/margins": 3.833061456680298, + "rewards/rejected": -3.4891157150268555, + "step": 542 + }, + { + "epoch": 9.203389830508474, + "grad_norm": 7.058316114345679, + "learning_rate": 3.2711356501686886e-07, + "logits/chosen": -2.03525710105896, + "logits/rejected": -1.2074201107025146, + "logps/chosen": -9.512802124023438, + "logps/rejected": -23.904556274414062, + "loss": 0.0913, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09114706516265869, + "rewards/margins": 4.969361305236816, + "rewards/rejected": -4.878214359283447, + "step": 543 + }, + { + "epoch": 9.220338983050848, + "grad_norm": 6.5571727716017545, + "learning_rate": 3.2640974323176843e-07, + "logits/chosen": -6.242308616638184, + "logits/rejected": -4.988770008087158, + "logps/chosen": -7.075240135192871, + "logps/rejected": -18.64996910095215, + "loss": 0.086, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.35722628235816956, + "rewards/margins": 4.036096096038818, + "rewards/rejected": -3.6788697242736816, + "step": 544 + }, + { + "epoch": 9.23728813559322, + "grad_norm": 7.197720127226005, + "learning_rate": 3.257052527962269e-07, + "logits/chosen": -7.732370853424072, + "logits/rejected": -6.282393455505371, + "logps/chosen": -10.46985912322998, + "logps/rejected": -16.54497528076172, + "loss": 0.0949, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39111438393592834, + "rewards/margins": 2.915557384490967, + "rewards/rejected": -2.5244431495666504, + "step": 545 + }, + { + "epoch": 9.254237288135593, + "grad_norm": 8.563229116322319, + "learning_rate": 3.250000998751365e-07, + "logits/chosen": -1.0374642610549927, + "logits/rejected": -0.5755556225776672, + "logps/chosen": -8.41156005859375, + "logps/rejected": -18.4447078704834, + "loss": 0.106, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36313942074775696, + "rewards/margins": 3.338423728942871, + "rewards/rejected": -2.9752843379974365, + "step": 546 + }, + { + "epoch": 9.271186440677965, + "grad_norm": 8.40637322936493, + "learning_rate": 3.2429429063918694e-07, + "logits/chosen": -0.7971823215484619, + "logits/rejected": -2.113372325897217, + "logps/chosen": -7.886877536773682, + "logps/rejected": -16.608707427978516, + "loss": 0.1032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35710078477859497, + "rewards/margins": 3.141465663909912, + "rewards/rejected": -2.784365177154541, + "step": 547 + }, + { + "epoch": 9.288135593220339, + "grad_norm": 7.915859269522675, + "learning_rate": 3.235878312648112e-07, + "logits/chosen": -1.2610503435134888, + "logits/rejected": -2.4442033767700195, + "logps/chosen": -6.95026969909668, + "logps/rejected": -21.118806838989258, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18833884596824646, + "rewards/margins": 4.624729633331299, + "rewards/rejected": -4.436390399932861, + "step": 548 + }, + { + "epoch": 9.305084745762711, + "grad_norm": 8.306527345428725, + "learning_rate": 3.2288072793413147e-07, + "logits/chosen": -3.399160385131836, + "logits/rejected": -2.3630385398864746, + "logps/chosen": -9.676424026489258, + "logps/rejected": -16.499492645263672, + "loss": 0.1194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6130249500274658, + "rewards/margins": 3.416779041290283, + "rewards/rejected": -2.8037540912628174, + "step": 549 + }, + { + "epoch": 9.322033898305085, + "grad_norm": 7.279865363183246, + "learning_rate": 3.2217298683490525e-07, + "logits/chosen": -2.800569772720337, + "logits/rejected": -3.325770139694214, + "logps/chosen": -7.290124893188477, + "logps/rejected": -14.377586364746094, + "loss": 0.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6072530746459961, + "rewards/margins": 3.4065473079681396, + "rewards/rejected": -2.7992939949035645, + "step": 550 + }, + { + "epoch": 9.338983050847457, + "grad_norm": 7.729749155687732, + "learning_rate": 3.214646141604709e-07, + "logits/chosen": -6.679795742034912, + "logits/rejected": -2.8563101291656494, + "logps/chosen": -14.170299530029297, + "logps/rejected": -16.713886260986328, + "loss": 0.1021, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.821824848651886, + "rewards/margins": 2.613480567932129, + "rewards/rejected": -1.7916556596755981, + "step": 551 + }, + { + "epoch": 9.35593220338983, + "grad_norm": 7.005656049681332, + "learning_rate": 3.2075561610969347e-07, + "logits/chosen": -2.8699162006378174, + "logits/rejected": -0.10029095411300659, + "logps/chosen": -12.255592346191406, + "logps/rejected": -25.76495361328125, + "loss": 0.0979, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.16769805550575256, + "rewards/margins": 4.608583450317383, + "rewards/rejected": -4.77628231048584, + "step": 552 + }, + { + "epoch": 9.372881355932204, + "grad_norm": 7.258931690887565, + "learning_rate": 3.200459988869111e-07, + "logits/chosen": -2.4785892963409424, + "logits/rejected": 0.07733534276485443, + "logps/chosen": -10.000097274780273, + "logps/rejected": -19.482460021972656, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09004150331020355, + "rewards/margins": 4.15089225769043, + "rewards/rejected": -4.060850620269775, + "step": 553 + }, + { + "epoch": 9.389830508474576, + "grad_norm": 7.006363318272141, + "learning_rate": 3.193357687018797e-07, + "logits/chosen": 4.693775653839111, + "logits/rejected": 4.2272820472717285, + "logps/chosen": -10.474517822265625, + "logps/rejected": -27.757699966430664, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13867327570915222, + "rewards/margins": 5.896880626678467, + "rewards/rejected": -6.035553932189941, + "step": 554 + }, + { + "epoch": 9.40677966101695, + "grad_norm": 7.35604084339138, + "learning_rate": 3.186249317697194e-07, + "logits/chosen": 2.288403272628784, + "logits/rejected": 6.304751873016357, + "logps/chosen": -14.09248161315918, + "logps/rejected": -24.245834350585938, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20254839956760406, + "rewards/margins": 4.757707595825195, + "rewards/rejected": -4.555159091949463, + "step": 555 + }, + { + "epoch": 9.423728813559322, + "grad_norm": 7.092403419199733, + "learning_rate": 3.1791349431085965e-07, + "logits/chosen": -1.7431526184082031, + "logits/rejected": 1.8960849046707153, + "logps/chosen": -9.011571884155273, + "logps/rejected": -23.43387794494629, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27249670028686523, + "rewards/margins": 5.282358169555664, + "rewards/rejected": -5.009861946105957, + "step": 556 + }, + { + "epoch": 9.440677966101696, + "grad_norm": 8.175083653294603, + "learning_rate": 3.1720146255098537e-07, + "logits/chosen": -2.9318580627441406, + "logits/rejected": -0.37776219844818115, + "logps/chosen": -10.542088508605957, + "logps/rejected": -25.695281982421875, + "loss": 0.0941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04570953547954559, + "rewards/margins": 5.096731662750244, + "rewards/rejected": -5.142441749572754, + "step": 557 + }, + { + "epoch": 9.457627118644067, + "grad_norm": 7.329507900240542, + "learning_rate": 3.1648884272098177e-07, + "logits/chosen": -5.442708969116211, + "logits/rejected": -2.653074026107788, + "logps/chosen": -9.153959274291992, + "logps/rejected": -12.319401741027832, + "loss": 0.0968, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6025005578994751, + "rewards/margins": 2.6678457260131836, + "rewards/rejected": -2.065345287322998, + "step": 558 + }, + { + "epoch": 9.474576271186441, + "grad_norm": 7.872704826602449, + "learning_rate": 3.157756410568803e-07, + "logits/chosen": -5.923219680786133, + "logits/rejected": -4.431153297424316, + "logps/chosen": -10.067678451538086, + "logps/rejected": -16.93128204345703, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24123656749725342, + "rewards/margins": 2.855665922164917, + "rewards/rejected": -2.614429235458374, + "step": 559 + }, + { + "epoch": 9.491525423728813, + "grad_norm": 23.783216732996646, + "learning_rate": 3.150618637998041e-07, + "logits/chosen": -0.8499359488487244, + "logits/rejected": 0.2799752950668335, + "logps/chosen": -8.114256858825684, + "logps/rejected": -21.467369079589844, + "loss": 0.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47225862741470337, + "rewards/margins": 4.720645904541016, + "rewards/rejected": -4.248387336730957, + "step": 560 + }, + { + "epoch": 9.508474576271187, + "grad_norm": 7.705930836558833, + "learning_rate": 3.1434751719591305e-07, + "logits/chosen": -6.150752544403076, + "logits/rejected": -7.172792434692383, + "logps/chosen": -10.911544799804688, + "logps/rejected": -21.380325317382812, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19267332553863525, + "rewards/margins": 3.4200611114501953, + "rewards/rejected": -3.2273876667022705, + "step": 561 + }, + { + "epoch": 9.525423728813559, + "grad_norm": 7.005357396590622, + "learning_rate": 3.136326074963494e-07, + "logits/chosen": -3.0026967525482178, + "logits/rejected": -3.8512182235717773, + "logps/chosen": -7.109619140625, + "logps/rejected": -14.219776153564453, + "loss": 0.0911, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4708208739757538, + "rewards/margins": 2.4455618858337402, + "rewards/rejected": -1.974740982055664, + "step": 562 + }, + { + "epoch": 9.542372881355933, + "grad_norm": 7.270706931110439, + "learning_rate": 3.1291714095718294e-07, + "logits/chosen": 2.667048454284668, + "logits/rejected": 2.047011375427246, + "logps/chosen": -6.924215793609619, + "logps/rejected": -19.053524017333984, + "loss": 0.0961, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019068308174610138, + "rewards/margins": 4.171212196350098, + "rewards/rejected": -4.152143955230713, + "step": 563 + }, + { + "epoch": 9.559322033898304, + "grad_norm": 7.465791344564993, + "learning_rate": 3.122011238393562e-07, + "logits/chosen": -4.9882612228393555, + "logits/rejected": -4.5862298011779785, + "logps/chosen": -7.148963928222656, + "logps/rejected": -14.193519592285156, + "loss": 0.106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16198576986789703, + "rewards/margins": 2.9971303939819336, + "rewards/rejected": -2.835144519805908, + "step": 564 + }, + { + "epoch": 9.576271186440678, + "grad_norm": 6.422080871628065, + "learning_rate": 3.1148456240862993e-07, + "logits/chosen": -3.4394102096557617, + "logits/rejected": 0.08007746934890747, + "logps/chosen": -11.544246673583984, + "logps/rejected": -27.53937530517578, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0750107690691948, + "rewards/margins": 5.368021011352539, + "rewards/rejected": -5.443032264709473, + "step": 565 + }, + { + "epoch": 9.59322033898305, + "grad_norm": 6.9205639042896365, + "learning_rate": 3.1076746293552785e-07, + "logits/chosen": -2.836104154586792, + "logits/rejected": -1.123127818107605, + "logps/chosen": -6.161153793334961, + "logps/rejected": -22.165803909301758, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13143664598464966, + "rewards/margins": 5.175368309020996, + "rewards/rejected": -5.043931007385254, + "step": 566 + }, + { + "epoch": 9.610169491525424, + "grad_norm": 7.223836249722002, + "learning_rate": 3.1004983169528225e-07, + "logits/chosen": -1.548119068145752, + "logits/rejected": -1.392944097518921, + "logps/chosen": -7.682084083557129, + "logps/rejected": -19.642316818237305, + "loss": 0.0945, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27215924859046936, + "rewards/margins": 4.37581729888916, + "rewards/rejected": -4.103658199310303, + "step": 567 + }, + { + "epoch": 9.627118644067796, + "grad_norm": 8.772767733241357, + "learning_rate": 3.0933167496777873e-07, + "logits/chosen": -4.728096008300781, + "logits/rejected": -3.047065496444702, + "logps/chosen": -8.965621948242188, + "logps/rejected": -16.117294311523438, + "loss": 0.1167, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35291966795921326, + "rewards/margins": 3.55570387840271, + "rewards/rejected": -3.202784299850464, + "step": 568 + }, + { + "epoch": 9.64406779661017, + "grad_norm": 7.844719017241512, + "learning_rate": 3.0861299903750115e-07, + "logits/chosen": -1.934074878692627, + "logits/rejected": -0.8083376884460449, + "logps/chosen": -9.919522285461426, + "logps/rejected": -26.1778564453125, + "loss": 0.1001, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3015662431716919, + "rewards/margins": 5.8502116203308105, + "rewards/rejected": -5.548645496368408, + "step": 569 + }, + { + "epoch": 9.661016949152543, + "grad_norm": 7.188158645049839, + "learning_rate": 3.0789381019347724e-07, + "logits/chosen": -1.5368669033050537, + "logits/rejected": -2.4642491340637207, + "logps/chosen": -6.37646484375, + "logps/rejected": -16.949405670166016, + "loss": 0.0949, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7365732192993164, + "rewards/margins": 3.487222909927368, + "rewards/rejected": -2.7506494522094727, + "step": 570 + }, + { + "epoch": 9.677966101694915, + "grad_norm": 7.111649275956974, + "learning_rate": 3.071741147292229e-07, + "logits/chosen": 0.26448094844818115, + "logits/rejected": 1.6492478847503662, + "logps/chosen": -10.899922370910645, + "logps/rejected": -22.683032989501953, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09248735755681992, + "rewards/margins": 3.582092761993408, + "rewards/rejected": -3.48960542678833, + "step": 571 + }, + { + "epoch": 9.694915254237289, + "grad_norm": 8.336863079121871, + "learning_rate": 3.0645391894268734e-07, + "logits/chosen": 1.917724847793579, + "logits/rejected": -0.7316230535507202, + "logps/chosen": -10.622394561767578, + "logps/rejected": -26.208999633789062, + "loss": 0.0972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2793700397014618, + "rewards/margins": 4.600907325744629, + "rewards/rejected": -4.321537017822266, + "step": 572 + }, + { + "epoch": 9.711864406779661, + "grad_norm": 6.886792783120748, + "learning_rate": 3.057332291361983e-07, + "logits/chosen": -3.8753855228424072, + "logits/rejected": -0.0695408284664154, + "logps/chosen": -13.180882453918457, + "logps/rejected": -26.468891143798828, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0012535899877548218, + "rewards/margins": 5.588648319244385, + "rewards/rejected": -5.5873942375183105, + "step": 573 + }, + { + "epoch": 9.728813559322035, + "grad_norm": 6.669887643578233, + "learning_rate": 3.050120516164062e-07, + "logits/chosen": -6.483968734741211, + "logits/rejected": -3.4908714294433594, + "logps/chosen": -9.973274230957031, + "logps/rejected": -24.82187271118164, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3250216543674469, + "rewards/margins": 4.841268539428711, + "rewards/rejected": -4.516247272491455, + "step": 574 + }, + { + "epoch": 9.745762711864407, + "grad_norm": 7.103637533168144, + "learning_rate": 3.042903926942297e-07, + "logits/chosen": -3.9618115425109863, + "logits/rejected": -2.0849714279174805, + "logps/chosen": -12.599139213562012, + "logps/rejected": -23.257089614868164, + "loss": 0.089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22399751842021942, + "rewards/margins": 5.019968509674072, + "rewards/rejected": -4.795970439910889, + "step": 575 + }, + { + "epoch": 9.76271186440678, + "grad_norm": 7.530244017046209, + "learning_rate": 3.0356825868480014e-07, + "logits/chosen": -4.090577125549316, + "logits/rejected": -4.245079040527344, + "logps/chosen": -8.236763000488281, + "logps/rejected": -16.553316116333008, + "loss": 0.0985, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24407736957073212, + "rewards/margins": 3.3017423152923584, + "rewards/rejected": -3.0576651096343994, + "step": 576 + }, + { + "epoch": 9.779661016949152, + "grad_norm": 6.697617479828195, + "learning_rate": 3.0284565590740607e-07, + "logits/chosen": -3.7148067951202393, + "logits/rejected": -2.6984448432922363, + "logps/chosen": -7.846210479736328, + "logps/rejected": -23.022817611694336, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.580223023891449, + "rewards/margins": 4.813726902008057, + "rewards/rejected": -4.233503341674805, + "step": 577 + }, + { + "epoch": 9.796610169491526, + "grad_norm": 8.516783557390537, + "learning_rate": 3.021225906854383e-07, + "logits/chosen": -0.506150484085083, + "logits/rejected": 0.20308029651641846, + "logps/chosen": -9.165265083312988, + "logps/rejected": -18.618465423583984, + "loss": 0.117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2960697114467621, + "rewards/margins": 4.122585773468018, + "rewards/rejected": -3.8265163898468018, + "step": 578 + }, + { + "epoch": 9.813559322033898, + "grad_norm": 6.993927074929075, + "learning_rate": 3.013990693463344e-07, + "logits/chosen": -4.301695823669434, + "logits/rejected": -0.8326593637466431, + "logps/chosen": -10.31252384185791, + "logps/rejected": -18.033140182495117, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5753101706504822, + "rewards/margins": 4.074686050415039, + "rewards/rejected": -3.499375820159912, + "step": 579 + }, + { + "epoch": 9.830508474576272, + "grad_norm": 7.627560630006767, + "learning_rate": 3.006750982215234e-07, + "logits/chosen": -5.807773590087891, + "logits/rejected": -3.786421298980713, + "logps/chosen": -10.408415794372559, + "logps/rejected": -18.810245513916016, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49460792541503906, + "rewards/margins": 3.6803393363952637, + "rewards/rejected": -3.1857314109802246, + "step": 580 + }, + { + "epoch": 9.847457627118644, + "grad_norm": 7.042175636446051, + "learning_rate": 2.9995068364637023e-07, + "logits/chosen": -0.17031973600387573, + "logits/rejected": 0.04508787393569946, + "logps/chosen": -6.5416436195373535, + "logps/rejected": -20.05823516845703, + "loss": 0.0947, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3072429895401001, + "rewards/margins": 3.74953031539917, + "rewards/rejected": -3.4422874450683594, + "step": 581 + }, + { + "epoch": 9.864406779661017, + "grad_norm": 7.956967460140704, + "learning_rate": 2.9922583196012035e-07, + "logits/chosen": -3.6927130222320557, + "logits/rejected": -2.2789618968963623, + "logps/chosen": -7.582281589508057, + "logps/rejected": -15.243419647216797, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5607558488845825, + "rewards/margins": 3.3911430835723877, + "rewards/rejected": -2.8303873538970947, + "step": 582 + }, + { + "epoch": 9.88135593220339, + "grad_norm": 6.546803898387923, + "learning_rate": 2.985005495058446e-07, + "logits/chosen": -0.39755862951278687, + "logits/rejected": 1.024298906326294, + "logps/chosen": -7.9590888023376465, + "logps/rejected": -18.462146759033203, + "loss": 0.0802, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16562320291996002, + "rewards/margins": 4.475764274597168, + "rewards/rejected": -4.310141086578369, + "step": 583 + }, + { + "epoch": 9.898305084745763, + "grad_norm": 7.55002338431414, + "learning_rate": 2.9777484263038303e-07, + "logits/chosen": -3.177365303039551, + "logits/rejected": -2.0949947834014893, + "logps/chosen": -10.2957763671875, + "logps/rejected": -23.79370880126953, + "loss": 0.102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.552627682685852, + "rewards/margins": 4.92409086227417, + "rewards/rejected": -4.371463298797607, + "step": 584 + }, + { + "epoch": 9.915254237288135, + "grad_norm": 6.614441847101758, + "learning_rate": 2.9704871768429016e-07, + "logits/chosen": -5.1951799392700195, + "logits/rejected": -2.615152597427368, + "logps/chosen": -10.698162078857422, + "logps/rejected": -20.58434295654297, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5016829967498779, + "rewards/margins": 4.080619812011719, + "rewards/rejected": -3.5789365768432617, + "step": 585 + }, + { + "epoch": 9.932203389830509, + "grad_norm": 7.549594151251059, + "learning_rate": 2.9632218102177856e-07, + "logits/chosen": -4.657960891723633, + "logits/rejected": -1.054868459701538, + "logps/chosen": -8.243659973144531, + "logps/rejected": -18.8740234375, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2816173732280731, + "rewards/margins": 3.8149704933166504, + "rewards/rejected": -3.533353328704834, + "step": 586 + }, + { + "epoch": 9.94915254237288, + "grad_norm": 6.816475554516112, + "learning_rate": 2.9559523900066393e-07, + "logits/chosen": -1.284484624862671, + "logits/rejected": 0.5438723564147949, + "logps/chosen": -8.53143310546875, + "logps/rejected": -17.49203872680664, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3136657178401947, + "rewards/margins": 3.7685282230377197, + "rewards/rejected": -3.454862117767334, + "step": 587 + }, + { + "epoch": 9.966101694915254, + "grad_norm": 7.345727939212483, + "learning_rate": 2.948678979823092e-07, + "logits/chosen": -6.1464433670043945, + "logits/rejected": -3.958988666534424, + "logps/chosen": -12.240889549255371, + "logps/rejected": -18.993349075317383, + "loss": 0.0981, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6476763486862183, + "rewards/margins": 3.881906270980835, + "rewards/rejected": -3.234229803085327, + "step": 588 + }, + { + "epoch": 9.983050847457626, + "grad_norm": 7.228864234275865, + "learning_rate": 2.941401643315686e-07, + "logits/chosen": -4.1069722175598145, + "logits/rejected": -6.370417594909668, + "logps/chosen": -6.387879371643066, + "logps/rejected": -18.39813232421875, + "loss": 0.0833, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6458454132080078, + "rewards/margins": 3.298928737640381, + "rewards/rejected": -2.653083324432373, + "step": 589 + }, + { + "epoch": 10.0, + "grad_norm": 7.2671869247406855, + "learning_rate": 2.934120444167326e-07, + "logits/chosen": -5.320376873016357, + "logits/rejected": 1.0061277151107788, + "logps/chosen": -7.889092445373535, + "logps/rejected": -15.941211700439453, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37462252378463745, + "rewards/margins": 3.2933833599090576, + "rewards/rejected": -2.9187610149383545, + "step": 590 + }, + { + "epoch": 10.016949152542374, + "grad_norm": 6.5332566713967335, + "learning_rate": 2.926835446094716e-07, + "logits/chosen": -2.9494495391845703, + "logits/rejected": -2.0883994102478027, + "logps/chosen": -9.400456428527832, + "logps/rejected": -18.402284622192383, + "loss": 0.0724, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8871166110038757, + "rewards/margins": 3.988942861557007, + "rewards/rejected": -3.1018261909484863, + "step": 591 + }, + { + "epoch": 10.033898305084746, + "grad_norm": 6.87699072202025, + "learning_rate": 2.919546712847804e-07, + "logits/chosen": -0.45388537645339966, + "logits/rejected": -1.2807650566101074, + "logps/chosen": -10.14447021484375, + "logps/rejected": -25.88265609741211, + "loss": 0.096, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40496236085891724, + "rewards/margins": 4.389737129211426, + "rewards/rejected": -3.984774589538574, + "step": 592 + }, + { + "epoch": 10.05084745762712, + "grad_norm": 7.807442385203102, + "learning_rate": 2.9122543082092246e-07, + "logits/chosen": 0.45776844024658203, + "logits/rejected": -0.1286400556564331, + "logps/chosen": -10.70620346069336, + "logps/rejected": -24.24842071533203, + "loss": 0.0835, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3894589841365814, + "rewards/margins": 4.869680404663086, + "rewards/rejected": -4.480220794677734, + "step": 593 + }, + { + "epoch": 10.067796610169491, + "grad_norm": 6.669283919691698, + "learning_rate": 2.9049582959937393e-07, + "logits/chosen": -6.220808029174805, + "logits/rejected": -4.7131781578063965, + "logps/chosen": -11.595624923706055, + "logps/rejected": -19.858522415161133, + "loss": 0.0753, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48867008090019226, + "rewards/margins": 3.6481728553771973, + "rewards/rejected": -3.1595029830932617, + "step": 594 + }, + { + "epoch": 10.084745762711865, + "grad_norm": 6.1751896707000995, + "learning_rate": 2.89765874004768e-07, + "logits/chosen": -4.981171607971191, + "logits/rejected": -4.294511795043945, + "logps/chosen": -9.736584663391113, + "logps/rejected": -21.595191955566406, + "loss": 0.0829, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3352711796760559, + "rewards/margins": 4.752230644226074, + "rewards/rejected": -4.416959762573242, + "step": 595 + }, + { + "epoch": 10.101694915254237, + "grad_norm": 6.519996226278627, + "learning_rate": 2.890355704248388e-07, + "logits/chosen": -5.755083084106445, + "logits/rejected": -6.4575982093811035, + "logps/chosen": -8.622859001159668, + "logps/rejected": -17.94690704345703, + "loss": 0.0984, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07391562312841415, + "rewards/margins": 3.822164535522461, + "rewards/rejected": -3.748249053955078, + "step": 596 + }, + { + "epoch": 10.11864406779661, + "grad_norm": 6.870500428526926, + "learning_rate": 2.8830492525036587e-07, + "logits/chosen": -5.710309982299805, + "logits/rejected": -5.270742416381836, + "logps/chosen": -9.003686904907227, + "logps/rejected": -21.707897186279297, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4468466341495514, + "rewards/margins": 4.289839744567871, + "rewards/rejected": -3.8429930210113525, + "step": 597 + }, + { + "epoch": 10.135593220338983, + "grad_norm": 6.82302561225996, + "learning_rate": 2.875739448751176e-07, + "logits/chosen": -2.099620819091797, + "logits/rejected": -2.333075761795044, + "logps/chosen": -8.265347480773926, + "logps/rejected": -18.967870712280273, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41659247875213623, + "rewards/margins": 4.089541912078857, + "rewards/rejected": -3.6729493141174316, + "step": 598 + }, + { + "epoch": 10.152542372881356, + "grad_norm": 6.822421678285191, + "learning_rate": 2.8684263569579603e-07, + "logits/chosen": -3.583547830581665, + "logits/rejected": -2.7302725315093994, + "logps/chosen": -9.004634857177734, + "logps/rejected": -17.12502670288086, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4453964829444885, + "rewards/margins": 3.3031258583068848, + "rewards/rejected": -2.857728958129883, + "step": 599 + }, + { + "epoch": 10.169491525423728, + "grad_norm": 5.433585077950351, + "learning_rate": 2.8611100411198035e-07, + "logits/chosen": -4.182163715362549, + "logits/rejected": -3.384242534637451, + "logps/chosen": -6.375938415527344, + "logps/rejected": -14.676736831665039, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5399359464645386, + "rewards/margins": 3.6615355014801025, + "rewards/rejected": -3.1215996742248535, + "step": 600 + }, + { + "epoch": 10.186440677966102, + "grad_norm": 5.767263681153962, + "learning_rate": 2.853790565260712e-07, + "logits/chosen": -5.147862434387207, + "logits/rejected": -4.546764373779297, + "logps/chosen": -5.3771514892578125, + "logps/rejected": -18.10373878479004, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17288702726364136, + "rewards/margins": 4.0040411949157715, + "rewards/rejected": -3.8311543464660645, + "step": 601 + }, + { + "epoch": 10.203389830508474, + "grad_norm": 6.640532340744767, + "learning_rate": 2.846467993432342e-07, + "logits/chosen": -3.296685218811035, + "logits/rejected": -2.427095890045166, + "logps/chosen": -10.636516571044922, + "logps/rejected": -20.535572052001953, + "loss": 0.0723, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5197865962982178, + "rewards/margins": 4.117679595947266, + "rewards/rejected": -3.5978927612304688, + "step": 602 + }, + { + "epoch": 10.220338983050848, + "grad_norm": 6.768047660140451, + "learning_rate": 2.8391423897134454e-07, + "logits/chosen": 0.3417333662509918, + "logits/rejected": -0.7083436846733093, + "logps/chosen": -10.16650104522705, + "logps/rejected": -27.248626708984375, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18731170892715454, + "rewards/margins": 5.726678848266602, + "rewards/rejected": -5.913990497589111, + "step": 603 + }, + { + "epoch": 10.23728813559322, + "grad_norm": 7.058789634806724, + "learning_rate": 2.8318138182093047e-07, + "logits/chosen": -0.23608046770095825, + "logits/rejected": -1.7864173650741577, + "logps/chosen": -7.249157428741455, + "logps/rejected": -24.418014526367188, + "loss": 0.0905, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44495925307273865, + "rewards/margins": 5.741467475891113, + "rewards/rejected": -5.296508312225342, + "step": 604 + }, + { + "epoch": 10.254237288135593, + "grad_norm": 6.401714878017643, + "learning_rate": 2.8244823430511725e-07, + "logits/chosen": -4.940008163452148, + "logits/rejected": -4.818403720855713, + "logps/chosen": -10.677308082580566, + "logps/rejected": -20.77131462097168, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5091035962104797, + "rewards/margins": 4.647761344909668, + "rewards/rejected": -4.138657569885254, + "step": 605 + }, + { + "epoch": 10.271186440677965, + "grad_norm": 6.1298913343452845, + "learning_rate": 2.8171480283957117e-07, + "logits/chosen": -4.699034214019775, + "logits/rejected": -3.9625940322875977, + "logps/chosen": -7.439517974853516, + "logps/rejected": -15.482069969177246, + "loss": 0.0782, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34953752160072327, + "rewards/margins": 3.54054594039917, + "rewards/rejected": -3.1910083293914795, + "step": 606 + }, + { + "epoch": 10.288135593220339, + "grad_norm": 7.1393461037986246, + "learning_rate": 2.8098109384244315e-07, + "logits/chosen": -6.9001569747924805, + "logits/rejected": -4.933772563934326, + "logps/chosen": -8.911727905273438, + "logps/rejected": -17.510486602783203, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5511279702186584, + "rewards/margins": 3.9823532104492188, + "rewards/rejected": -3.431225299835205, + "step": 607 + }, + { + "epoch": 10.305084745762711, + "grad_norm": 6.982071096916762, + "learning_rate": 2.8024711373431297e-07, + "logits/chosen": -0.6912120580673218, + "logits/rejected": 1.0574913024902344, + "logps/chosen": -11.467482566833496, + "logps/rejected": -25.087936401367188, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07352522760629654, + "rewards/margins": 5.616614818572998, + "rewards/rejected": -5.543089866638184, + "step": 608 + }, + { + "epoch": 10.322033898305085, + "grad_norm": 7.205356800311916, + "learning_rate": 2.795128689381327e-07, + "logits/chosen": -5.367947101593018, + "logits/rejected": -6.2112226486206055, + "logps/chosen": -9.169400215148926, + "logps/rejected": -20.133502960205078, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30789613723754883, + "rewards/margins": 3.6526689529418945, + "rewards/rejected": -3.3447728157043457, + "step": 609 + }, + { + "epoch": 10.338983050847457, + "grad_norm": 6.828839846657555, + "learning_rate": 2.787783658791707e-07, + "logits/chosen": -1.397652506828308, + "logits/rejected": 0.4360477924346924, + "logps/chosen": -12.576018333435059, + "logps/rejected": -24.168855667114258, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1414359211921692, + "rewards/margins": 5.024531364440918, + "rewards/rejected": -4.883095741271973, + "step": 610 + }, + { + "epoch": 10.35593220338983, + "grad_norm": 6.148294911277272, + "learning_rate": 2.7804361098495547e-07, + "logits/chosen": -0.5597133636474609, + "logits/rejected": 2.5741095542907715, + "logps/chosen": -14.489322662353516, + "logps/rejected": -28.154443740844727, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3005380630493164, + "rewards/margins": 5.198799133300781, + "rewards/rejected": -5.499336242675781, + "step": 611 + }, + { + "epoch": 10.372881355932204, + "grad_norm": 5.923649965413688, + "learning_rate": 2.7730861068521913e-07, + "logits/chosen": -6.315664768218994, + "logits/rejected": -6.235509872436523, + "logps/chosen": -8.011595726013184, + "logps/rejected": -15.550950050354004, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6943686008453369, + "rewards/margins": 2.8963940143585205, + "rewards/rejected": -2.2020251750946045, + "step": 612 + }, + { + "epoch": 10.389830508474576, + "grad_norm": 7.280729728178173, + "learning_rate": 2.7657337141184134e-07, + "logits/chosen": -10.305787086486816, + "logits/rejected": -6.9726457595825195, + "logps/chosen": -9.634078025817871, + "logps/rejected": -17.562524795532227, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4106173515319824, + "rewards/margins": 3.9181861877441406, + "rewards/rejected": -3.507568597793579, + "step": 613 + }, + { + "epoch": 10.40677966101695, + "grad_norm": 5.774792538092833, + "learning_rate": 2.75837899598793e-07, + "logits/chosen": -8.121630668640137, + "logits/rejected": -7.595673561096191, + "logps/chosen": -7.544984817504883, + "logps/rejected": -17.279922485351562, + "loss": 0.0726, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5641006231307983, + "rewards/margins": 3.87762188911438, + "rewards/rejected": -3.313521146774292, + "step": 614 + }, + { + "epoch": 10.423728813559322, + "grad_norm": 6.278562216017734, + "learning_rate": 2.7510220168207996e-07, + "logits/chosen": -4.6755170822143555, + "logits/rejected": -1.8191864490509033, + "logps/chosen": -8.91657543182373, + "logps/rejected": -21.299772262573242, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18500903248786926, + "rewards/margins": 4.516014575958252, + "rewards/rejected": -4.331005573272705, + "step": 615 + }, + { + "epoch": 10.440677966101696, + "grad_norm": 9.075465593759464, + "learning_rate": 2.743662840996866e-07, + "logits/chosen": -5.730457782745361, + "logits/rejected": -4.239251613616943, + "logps/chosen": -17.843042373657227, + "logps/rejected": -25.541109085083008, + "loss": 0.0999, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.508560299873352, + "rewards/margins": 3.5839507579803467, + "rewards/rejected": -3.075390577316284, + "step": 616 + }, + { + "epoch": 10.457627118644067, + "grad_norm": 6.8993665196076055, + "learning_rate": 2.736301532915196e-07, + "logits/chosen": -1.6923491954803467, + "logits/rejected": 0.2853749990463257, + "logps/chosen": -10.134171485900879, + "logps/rejected": -18.974462509155273, + "loss": 0.0835, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15618927776813507, + "rewards/margins": 3.761019468307495, + "rewards/rejected": -3.604830503463745, + "step": 617 + }, + { + "epoch": 10.474576271186441, + "grad_norm": 6.81887848068769, + "learning_rate": 2.7289381569935167e-07, + "logits/chosen": -0.2558657228946686, + "logits/rejected": -0.3321121633052826, + "logps/chosen": -10.028180122375488, + "logps/rejected": -21.12971305847168, + "loss": 0.0789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24801495671272278, + "rewards/margins": 4.917619228363037, + "rewards/rejected": -4.6696038246154785, + "step": 618 + }, + { + "epoch": 10.491525423728813, + "grad_norm": 6.438977935034483, + "learning_rate": 2.7215727776676476e-07, + "logits/chosen": -0.4277447760105133, + "logits/rejected": -3.587709426879883, + "logps/chosen": -8.177022933959961, + "logps/rejected": -21.388124465942383, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3959938883781433, + "rewards/margins": 4.029510974884033, + "rewards/rejected": -3.633517265319824, + "step": 619 + }, + { + "epoch": 10.508474576271187, + "grad_norm": 6.455920427735316, + "learning_rate": 2.714205459390942e-07, + "logits/chosen": -5.1421990394592285, + "logits/rejected": -0.8779691457748413, + "logps/chosen": -12.399337768554688, + "logps/rejected": -30.643911361694336, + "loss": 0.0767, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21704591810703278, + "rewards/margins": 5.969428062438965, + "rewards/rejected": -5.752381324768066, + "step": 620 + }, + { + "epoch": 10.525423728813559, + "grad_norm": 6.321866244111642, + "learning_rate": 2.7068362666337213e-07, + "logits/chosen": -1.289355754852295, + "logits/rejected": -2.6756012439727783, + "logps/chosen": -9.784612655639648, + "logps/rejected": -23.54802703857422, + "loss": 0.0771, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008989214897155762, + "rewards/margins": 3.653341293334961, + "rewards/rejected": -3.6623306274414062, + "step": 621 + }, + { + "epoch": 10.542372881355933, + "grad_norm": 6.433422295166518, + "learning_rate": 2.6994652638827075e-07, + "logits/chosen": -4.133284568786621, + "logits/rejected": -2.0943939685821533, + "logps/chosen": -8.259592056274414, + "logps/rejected": -22.115522384643555, + "loss": 0.076, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17010678350925446, + "rewards/margins": 5.009650707244873, + "rewards/rejected": -4.83954381942749, + "step": 622 + }, + { + "epoch": 10.559322033898304, + "grad_norm": 6.57812996928514, + "learning_rate": 2.6920925156404644e-07, + "logits/chosen": -4.605247497558594, + "logits/rejected": -1.550370216369629, + "logps/chosen": -11.494009971618652, + "logps/rejected": -20.429784774780273, + "loss": 0.0821, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3749243915081024, + "rewards/margins": 3.869821310043335, + "rewards/rejected": -3.494896650314331, + "step": 623 + }, + { + "epoch": 10.576271186440678, + "grad_norm": 6.387704872855451, + "learning_rate": 2.684718086424828e-07, + "logits/chosen": -2.2779276371002197, + "logits/rejected": -4.718961238861084, + "logps/chosen": -8.743425369262695, + "logps/rejected": -21.723159790039062, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06385959684848785, + "rewards/margins": 4.543567657470703, + "rewards/rejected": -4.479708194732666, + "step": 624 + }, + { + "epoch": 10.59322033898305, + "grad_norm": 6.832554855689087, + "learning_rate": 2.677342040768346e-07, + "logits/chosen": -9.985498428344727, + "logits/rejected": -11.14006233215332, + "logps/chosen": -6.7391886711120605, + "logps/rejected": -12.377889633178711, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4747909903526306, + "rewards/margins": 3.002551317214966, + "rewards/rejected": -2.5277605056762695, + "step": 625 + }, + { + "epoch": 10.610169491525424, + "grad_norm": 6.673592359136678, + "learning_rate": 2.669964443217711e-07, + "logits/chosen": -4.82094669342041, + "logits/rejected": -1.3282444477081299, + "logps/chosen": -9.106605529785156, + "logps/rejected": -19.12204933166504, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47643429040908813, + "rewards/margins": 4.661433696746826, + "rewards/rejected": -4.184998989105225, + "step": 626 + }, + { + "epoch": 10.627118644067796, + "grad_norm": 5.581163046682789, + "learning_rate": 2.662585358333194e-07, + "logits/chosen": -0.6283246278762817, + "logits/rejected": 1.6329344511032104, + "logps/chosen": -8.678421020507812, + "logps/rejected": -19.060213088989258, + "loss": 0.0674, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.324451744556427, + "rewards/margins": 4.503561019897461, + "rewards/rejected": -4.1791090965271, + "step": 627 + }, + { + "epoch": 10.64406779661017, + "grad_norm": 6.532943519152192, + "learning_rate": 2.655204850688085e-07, + "logits/chosen": -6.0032806396484375, + "logits/rejected": -6.153221130371094, + "logps/chosen": -9.993376731872559, + "logps/rejected": -19.9271240234375, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19192150235176086, + "rewards/margins": 3.7709221839904785, + "rewards/rejected": -3.57900071144104, + "step": 628 + }, + { + "epoch": 10.661016949152543, + "grad_norm": 6.791855490737261, + "learning_rate": 2.6478229848681217e-07, + "logits/chosen": -1.7947826385498047, + "logits/rejected": 1.245511770248413, + "logps/chosen": -14.064645767211914, + "logps/rejected": -31.12624740600586, + "loss": 0.0803, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21336092054843903, + "rewards/margins": 4.568236351013184, + "rewards/rejected": -4.3548760414123535, + "step": 629 + }, + { + "epoch": 10.677966101694915, + "grad_norm": 5.998567580225641, + "learning_rate": 2.6404398254709283e-07, + "logits/chosen": -6.073047637939453, + "logits/rejected": -3.7751412391662598, + "logps/chosen": -10.10608959197998, + "logps/rejected": -17.09992218017578, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5900631546974182, + "rewards/margins": 3.2727785110473633, + "rewards/rejected": -2.682715654373169, + "step": 630 + }, + { + "epoch": 10.694915254237289, + "grad_norm": 6.347527247412782, + "learning_rate": 2.633055437105446e-07, + "logits/chosen": -2.816498041152954, + "logits/rejected": -1.7764393091201782, + "logps/chosen": -9.664608001708984, + "logps/rejected": -18.57364273071289, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41163548827171326, + "rewards/margins": 4.226436614990234, + "rewards/rejected": -3.8148012161254883, + "step": 631 + }, + { + "epoch": 10.711864406779661, + "grad_norm": 7.657664667513132, + "learning_rate": 2.6256698843913765e-07, + "logits/chosen": -2.3866946697235107, + "logits/rejected": -3.51227068901062, + "logps/chosen": -9.459527969360352, + "logps/rejected": -23.772916793823242, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11973743140697479, + "rewards/margins": 4.514531135559082, + "rewards/rejected": -4.634268283843994, + "step": 632 + }, + { + "epoch": 10.728813559322035, + "grad_norm": 7.398836752330527, + "learning_rate": 2.6182832319586045e-07, + "logits/chosen": -7.299160957336426, + "logits/rejected": -3.781843423843384, + "logps/chosen": -13.079208374023438, + "logps/rejected": -16.370201110839844, + "loss": 0.0971, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5853415131568909, + "rewards/margins": 2.8693883419036865, + "rewards/rejected": -2.2840466499328613, + "step": 633 + }, + { + "epoch": 10.745762711864407, + "grad_norm": 5.588120159912262, + "learning_rate": 2.6108955444466407e-07, + "logits/chosen": -3.830315351486206, + "logits/rejected": -3.4206957817077637, + "logps/chosen": -9.309571266174316, + "logps/rejected": -21.44207191467285, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001529298722743988, + "rewards/margins": 4.183349609375, + "rewards/rejected": -4.184878826141357, + "step": 634 + }, + { + "epoch": 10.76271186440678, + "grad_norm": 6.535406399654082, + "learning_rate": 2.6035068865040556e-07, + "logits/chosen": -5.687456130981445, + "logits/rejected": -7.988400936126709, + "logps/chosen": -10.812385559082031, + "logps/rejected": -24.102890014648438, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08460170030593872, + "rewards/margins": 3.884694814682007, + "rewards/rejected": -3.96929669380188, + "step": 635 + }, + { + "epoch": 10.779661016949152, + "grad_norm": 5.789102480389914, + "learning_rate": 2.596117322787907e-07, + "logits/chosen": -5.722503185272217, + "logits/rejected": -7.290640830993652, + "logps/chosen": -6.970130920410156, + "logps/rejected": -20.396114349365234, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18026211857795715, + "rewards/margins": 3.8470044136047363, + "rewards/rejected": -3.6667425632476807, + "step": 636 + }, + { + "epoch": 10.796610169491526, + "grad_norm": 6.355357984144602, + "learning_rate": 2.588726917963183e-07, + "logits/chosen": -9.194799423217773, + "logits/rejected": -6.714663505554199, + "logps/chosen": -11.45416259765625, + "logps/rejected": -18.329471588134766, + "loss": 0.0836, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32029563188552856, + "rewards/margins": 3.6258060932159424, + "rewards/rejected": -3.3055105209350586, + "step": 637 + }, + { + "epoch": 10.813559322033898, + "grad_norm": 6.858758613575888, + "learning_rate": 2.58133573670223e-07, + "logits/chosen": -6.48911190032959, + "logits/rejected": -7.002935409545898, + "logps/chosen": -12.041778564453125, + "logps/rejected": -26.613550186157227, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21291455626487732, + "rewards/margins": 4.537956237792969, + "rewards/rejected": -4.3250412940979, + "step": 638 + }, + { + "epoch": 10.830508474576272, + "grad_norm": 6.385436811626369, + "learning_rate": 2.5739438436841923e-07, + "logits/chosen": -4.354445934295654, + "logits/rejected": -1.8925871849060059, + "logps/chosen": -6.685940742492676, + "logps/rejected": -17.4467716217041, + "loss": 0.0772, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6550213694572449, + "rewards/margins": 4.483064651489258, + "rewards/rejected": -3.828043222427368, + "step": 639 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 7.5412230424923115, + "learning_rate": 2.566551303594437e-07, + "logits/chosen": -4.869339942932129, + "logits/rejected": -4.128410339355469, + "logps/chosen": -8.929168701171875, + "logps/rejected": -21.09054946899414, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3591841459274292, + "rewards/margins": 4.5191779136657715, + "rewards/rejected": -4.1599931716918945, + "step": 640 + }, + { + "epoch": 10.864406779661017, + "grad_norm": 7.2669719823592, + "learning_rate": 2.559158181123998e-07, + "logits/chosen": -7.467385292053223, + "logits/rejected": -5.042152404785156, + "logps/chosen": -9.017333030700684, + "logps/rejected": -22.0870361328125, + "loss": 0.0789, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3601089119911194, + "rewards/margins": 4.670373916625977, + "rewards/rejected": -4.310265064239502, + "step": 641 + }, + { + "epoch": 10.88135593220339, + "grad_norm": 6.052749440889756, + "learning_rate": 2.5517645409690045e-07, + "logits/chosen": -5.470639705657959, + "logits/rejected": -1.8173260688781738, + "logps/chosen": -7.751472473144531, + "logps/rejected": -19.017440795898438, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41869232058525085, + "rewards/margins": 4.302485466003418, + "rewards/rejected": -3.8837931156158447, + "step": 642 + }, + { + "epoch": 10.898305084745763, + "grad_norm": 6.902697924495179, + "learning_rate": 2.544370447830115e-07, + "logits/chosen": -5.353979110717773, + "logits/rejected": -4.941605567932129, + "logps/chosen": -6.707101345062256, + "logps/rejected": -21.526065826416016, + "loss": 0.0789, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31695109605789185, + "rewards/margins": 4.552976608276367, + "rewards/rejected": -4.236025810241699, + "step": 643 + }, + { + "epoch": 10.915254237288135, + "grad_norm": 6.361652513747436, + "learning_rate": 2.5369759664119533e-07, + "logits/chosen": -4.966207504272461, + "logits/rejected": -6.723392963409424, + "logps/chosen": -7.939865589141846, + "logps/rejected": -20.92068862915039, + "loss": 0.0923, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5712255239486694, + "rewards/margins": 4.196403980255127, + "rewards/rejected": -3.625178337097168, + "step": 644 + }, + { + "epoch": 10.932203389830509, + "grad_norm": 6.733540459748604, + "learning_rate": 2.52958116142254e-07, + "logits/chosen": -4.9014058113098145, + "logits/rejected": -4.164872646331787, + "logps/chosen": -13.86292839050293, + "logps/rejected": -21.466909408569336, + "loss": 0.086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17265652120113373, + "rewards/margins": 3.9277124404907227, + "rewards/rejected": -3.7550559043884277, + "step": 645 + }, + { + "epoch": 10.94915254237288, + "grad_norm": 5.831373240324112, + "learning_rate": 2.522186097572727e-07, + "logits/chosen": -4.4887495040893555, + "logits/rejected": -3.214918613433838, + "logps/chosen": -9.671069145202637, + "logps/rejected": -20.055328369140625, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14893700182437897, + "rewards/margins": 3.7495431900024414, + "rewards/rejected": -3.6006062030792236, + "step": 646 + }, + { + "epoch": 10.966101694915254, + "grad_norm": 6.262791048779491, + "learning_rate": 2.514790839575634e-07, + "logits/chosen": -3.553161144256592, + "logits/rejected": -3.589228630065918, + "logps/chosen": -8.84348201751709, + "logps/rejected": -21.927766799926758, + "loss": 0.0775, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36416810750961304, + "rewards/margins": 4.637812614440918, + "rewards/rejected": -4.27364444732666, + "step": 647 + }, + { + "epoch": 10.983050847457626, + "grad_norm": 6.806327508709908, + "learning_rate": 2.507395452146074e-07, + "logits/chosen": -7.03935432434082, + "logits/rejected": -5.568624496459961, + "logps/chosen": -9.785100936889648, + "logps/rejected": -17.360050201416016, + "loss": 0.0916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31295379996299744, + "rewards/margins": 3.047978401184082, + "rewards/rejected": -2.7350244522094727, + "step": 648 + }, + { + "epoch": 11.0, + "grad_norm": 7.811002818864437, + "learning_rate": 2.5e-07, + "logits/chosen": -0.1611688733100891, + "logits/rejected": 1.0501610040664673, + "logps/chosen": -9.675187110900879, + "logps/rejected": -22.014890670776367, + "loss": 0.0948, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20374059677124023, + "rewards/margins": 5.133134841918945, + "rewards/rejected": -4.929394721984863, + "step": 649 + }, + { + "epoch": 11.016949152542374, + "grad_norm": 6.850549464625482, + "learning_rate": 2.4926045478539256e-07, + "logits/chosen": -4.6589884757995605, + "logits/rejected": -6.010820388793945, + "logps/chosen": -7.22625207901001, + "logps/rejected": -20.878189086914062, + "loss": 0.0887, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2166891098022461, + "rewards/margins": 4.188607692718506, + "rewards/rejected": -3.9719185829162598, + "step": 650 + }, + { + "epoch": 11.033898305084746, + "grad_norm": 5.919496708388121, + "learning_rate": 2.485209160424366e-07, + "logits/chosen": -6.218087673187256, + "logits/rejected": -5.569622993469238, + "logps/chosen": -8.723810195922852, + "logps/rejected": -16.413066864013672, + "loss": 0.0683, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4738181531429291, + "rewards/margins": 3.431518316268921, + "rewards/rejected": -2.957700490951538, + "step": 651 + }, + { + "epoch": 11.05084745762712, + "grad_norm": 6.0611919294355925, + "learning_rate": 2.477813902427272e-07, + "logits/chosen": -3.49674654006958, + "logits/rejected": -1.136789321899414, + "logps/chosen": -11.503744125366211, + "logps/rejected": -25.44773292541504, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26914724707603455, + "rewards/margins": 5.494405269622803, + "rewards/rejected": -5.2252583503723145, + "step": 652 + }, + { + "epoch": 11.067796610169491, + "grad_norm": 6.0573343175910725, + "learning_rate": 2.47041883857746e-07, + "logits/chosen": -5.083340167999268, + "logits/rejected": 0.8065661191940308, + "logps/chosen": -11.080831527709961, + "logps/rejected": -31.152498245239258, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15857788920402527, + "rewards/margins": 6.871697902679443, + "rewards/rejected": -6.713120460510254, + "step": 653 + }, + { + "epoch": 11.084745762711865, + "grad_norm": 5.56917503889775, + "learning_rate": 2.463024033588046e-07, + "logits/chosen": -9.74613094329834, + "logits/rejected": -6.1578569412231445, + "logps/chosen": -11.67827320098877, + "logps/rejected": -19.71649932861328, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24430200457572937, + "rewards/margins": 4.060480117797852, + "rewards/rejected": -3.816178798675537, + "step": 654 + }, + { + "epoch": 11.101694915254237, + "grad_norm": 6.5567009060929795, + "learning_rate": 2.455629552169885e-07, + "logits/chosen": -3.267183542251587, + "logits/rejected": -4.768028259277344, + "logps/chosen": -7.853949546813965, + "logps/rejected": -21.788009643554688, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3967207968235016, + "rewards/margins": 4.657608985900879, + "rewards/rejected": -4.260887622833252, + "step": 655 + }, + { + "epoch": 11.11864406779661, + "grad_norm": 6.581903262468121, + "learning_rate": 2.448235459030996e-07, + "logits/chosen": -5.512940883636475, + "logits/rejected": -3.1800599098205566, + "logps/chosen": -9.868915557861328, + "logps/rejected": -18.821285247802734, + "loss": 0.0757, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6598990559577942, + "rewards/margins": 4.025852680206299, + "rewards/rejected": -3.3659536838531494, + "step": 656 + }, + { + "epoch": 11.135593220338983, + "grad_norm": 5.596217563555503, + "learning_rate": 2.4408418188760024e-07, + "logits/chosen": 0.2966378331184387, + "logits/rejected": -2.103040933609009, + "logps/chosen": -9.515241622924805, + "logps/rejected": -22.730234146118164, + "loss": 0.07, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0926487147808075, + "rewards/margins": 4.650430679321289, + "rewards/rejected": -4.743080139160156, + "step": 657 + }, + { + "epoch": 11.152542372881356, + "grad_norm": 5.538518104376337, + "learning_rate": 2.433448696405563e-07, + "logits/chosen": -3.6215038299560547, + "logits/rejected": -5.576323509216309, + "logps/chosen": -8.517637252807617, + "logps/rejected": -22.78083610534668, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04233184829354286, + "rewards/margins": 3.963124990463257, + "rewards/rejected": -3.920792818069458, + "step": 658 + }, + { + "epoch": 11.169491525423728, + "grad_norm": 5.6139538426783355, + "learning_rate": 2.426056156315808e-07, + "logits/chosen": -2.620246410369873, + "logits/rejected": -2.592529296875, + "logps/chosen": -9.597460746765137, + "logps/rejected": -19.386272430419922, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47715649008750916, + "rewards/margins": 4.265936851501465, + "rewards/rejected": -3.7887799739837646, + "step": 659 + }, + { + "epoch": 11.186440677966102, + "grad_norm": 5.209822541646444, + "learning_rate": 2.4186642632977697e-07, + "logits/chosen": -1.9901256561279297, + "logits/rejected": -8.246919631958008, + "logps/chosen": -12.452547073364258, + "logps/rejected": -33.33019256591797, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12005601823329926, + "rewards/margins": 5.84098482131958, + "rewards/rejected": -5.720929145812988, + "step": 660 + }, + { + "epoch": 11.203389830508474, + "grad_norm": 5.930477352951743, + "learning_rate": 2.4112730820368174e-07, + "logits/chosen": -8.356374740600586, + "logits/rejected": -7.305706024169922, + "logps/chosen": -7.346405982971191, + "logps/rejected": -16.096595764160156, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24272796511650085, + "rewards/margins": 3.5312068462371826, + "rewards/rejected": -3.2884786128997803, + "step": 661 + }, + { + "epoch": 11.220338983050848, + "grad_norm": 5.591612639284657, + "learning_rate": 2.403882677212093e-07, + "logits/chosen": -4.104741096496582, + "logits/rejected": -0.9902318716049194, + "logps/chosen": -10.457313537597656, + "logps/rejected": -21.854230880737305, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4416648745536804, + "rewards/margins": 4.539022445678711, + "rewards/rejected": -4.097356796264648, + "step": 662 + }, + { + "epoch": 11.23728813559322, + "grad_norm": 6.348118722950897, + "learning_rate": 2.3964931134959447e-07, + "logits/chosen": -4.671243667602539, + "logits/rejected": -2.5600554943084717, + "logps/chosen": -9.85940170288086, + "logps/rejected": -19.52853775024414, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5523775815963745, + "rewards/margins": 4.105027675628662, + "rewards/rejected": -3.552649736404419, + "step": 663 + }, + { + "epoch": 11.254237288135593, + "grad_norm": 5.369729124450522, + "learning_rate": 2.3891044555533586e-07, + "logits/chosen": -5.128889083862305, + "logits/rejected": -3.718061923980713, + "logps/chosen": -8.306253433227539, + "logps/rejected": -17.347557067871094, + "loss": 0.0626, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7414266467094421, + "rewards/margins": 3.849276065826416, + "rewards/rejected": -3.107849597930908, + "step": 664 + }, + { + "epoch": 11.271186440677965, + "grad_norm": 5.612901635708372, + "learning_rate": 2.381716768041395e-07, + "logits/chosen": -0.0374007374048233, + "logits/rejected": -1.22528874874115, + "logps/chosen": -11.215349197387695, + "logps/rejected": -35.25605392456055, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17258042097091675, + "rewards/margins": 6.277578353881836, + "rewards/rejected": -6.1049981117248535, + "step": 665 + }, + { + "epoch": 11.288135593220339, + "grad_norm": 5.1813207304899835, + "learning_rate": 2.374330115608624e-07, + "logits/chosen": -7.9421257972717285, + "logits/rejected": -6.050485134124756, + "logps/chosen": -10.09483528137207, + "logps/rejected": -19.81658935546875, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29093074798583984, + "rewards/margins": 4.107818603515625, + "rewards/rejected": -3.816887855529785, + "step": 666 + }, + { + "epoch": 11.305084745762711, + "grad_norm": 5.514831436596614, + "learning_rate": 2.3669445628945538e-07, + "logits/chosen": -3.944185256958008, + "logits/rejected": -1.4804234504699707, + "logps/chosen": -10.032756805419922, + "logps/rejected": -24.357975006103516, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18329276144504547, + "rewards/margins": 5.466524600982666, + "rewards/rejected": -5.28323221206665, + "step": 667 + }, + { + "epoch": 11.322033898305085, + "grad_norm": 6.63887413664787, + "learning_rate": 2.3595601745290725e-07, + "logits/chosen": -6.884547710418701, + "logits/rejected": -7.041402339935303, + "logps/chosen": -7.076028347015381, + "logps/rejected": -14.57210922241211, + "loss": 0.0842, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6141047477722168, + "rewards/margins": 2.990374803543091, + "rewards/rejected": -2.376270055770874, + "step": 668 + }, + { + "epoch": 11.338983050847457, + "grad_norm": 6.548504856420001, + "learning_rate": 2.3521770151318784e-07, + "logits/chosen": -4.259178638458252, + "logits/rejected": -0.449784517288208, + "logps/chosen": -12.715999603271484, + "logps/rejected": -26.444982528686523, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2763742506504059, + "rewards/margins": 5.15761137008667, + "rewards/rejected": -4.881237030029297, + "step": 669 + }, + { + "epoch": 11.35593220338983, + "grad_norm": 5.985995367055966, + "learning_rate": 2.344795149311915e-07, + "logits/chosen": -4.259335517883301, + "logits/rejected": -3.376661777496338, + "logps/chosen": -10.479347229003906, + "logps/rejected": -23.774837493896484, + "loss": 0.0654, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12627626955509186, + "rewards/margins": 5.6051483154296875, + "rewards/rejected": -5.478872299194336, + "step": 670 + }, + { + "epoch": 11.372881355932204, + "grad_norm": 5.127913932930688, + "learning_rate": 2.3374146416668062e-07, + "logits/chosen": -2.8877933025360107, + "logits/rejected": -3.261695384979248, + "logps/chosen": -11.766398429870605, + "logps/rejected": -24.1757755279541, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2051323503255844, + "rewards/margins": 5.357700347900391, + "rewards/rejected": -5.1525678634643555, + "step": 671 + }, + { + "epoch": 11.389830508474576, + "grad_norm": 6.20562392512599, + "learning_rate": 2.3300355567822893e-07, + "logits/chosen": -4.444530010223389, + "logits/rejected": -4.568050384521484, + "logps/chosen": -6.976926803588867, + "logps/rejected": -17.10205841064453, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17499062418937683, + "rewards/margins": 3.3871519565582275, + "rewards/rejected": -3.212161064147949, + "step": 672 + }, + { + "epoch": 11.40677966101695, + "grad_norm": 5.413356962659108, + "learning_rate": 2.3226579592316537e-07, + "logits/chosen": -6.2134528160095215, + "logits/rejected": -5.652405261993408, + "logps/chosen": -10.496795654296875, + "logps/rejected": -20.421342849731445, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4378880262374878, + "rewards/margins": 4.105364799499512, + "rewards/rejected": -3.6674766540527344, + "step": 673 + }, + { + "epoch": 11.423728813559322, + "grad_norm": 5.2094796996358514, + "learning_rate": 2.315281913575172e-07, + "logits/chosen": -7.498478889465332, + "logits/rejected": -5.903684139251709, + "logps/chosen": -9.15145492553711, + "logps/rejected": -19.5275936126709, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4203518033027649, + "rewards/margins": 4.569279193878174, + "rewards/rejected": -4.148927688598633, + "step": 674 + }, + { + "epoch": 11.440677966101696, + "grad_norm": 6.6716832648994915, + "learning_rate": 2.3079074843595354e-07, + "logits/chosen": -5.070525169372559, + "logits/rejected": -0.550679087638855, + "logps/chosen": -10.488363265991211, + "logps/rejected": -22.19812774658203, + "loss": 0.0783, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12508079409599304, + "rewards/margins": 4.43815803527832, + "rewards/rejected": -4.563239097595215, + "step": 675 + }, + { + "epoch": 11.457627118644067, + "grad_norm": 5.956099811481841, + "learning_rate": 2.300534736117292e-07, + "logits/chosen": -9.618514060974121, + "logits/rejected": -5.595645904541016, + "logps/chosen": -12.044401168823242, + "logps/rejected": -19.89811134338379, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46925345063209534, + "rewards/margins": 4.163661479949951, + "rewards/rejected": -3.6944079399108887, + "step": 676 + }, + { + "epoch": 11.474576271186441, + "grad_norm": 6.389828344273028, + "learning_rate": 2.2931637333662785e-07, + "logits/chosen": -8.7474365234375, + "logits/rejected": -6.6938796043396, + "logps/chosen": -8.219264030456543, + "logps/rejected": -13.722729682922363, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4416693150997162, + "rewards/margins": 2.8839468955993652, + "rewards/rejected": -2.442277669906616, + "step": 677 + }, + { + "epoch": 11.491525423728813, + "grad_norm": 6.000219805644383, + "learning_rate": 2.2857945406090578e-07, + "logits/chosen": -2.0206470489501953, + "logits/rejected": -2.674741744995117, + "logps/chosen": -7.267889499664307, + "logps/rejected": -19.866804122924805, + "loss": 0.0817, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17590032517910004, + "rewards/margins": 3.9376792907714844, + "rewards/rejected": -3.7617790699005127, + "step": 678 + }, + { + "epoch": 11.508474576271187, + "grad_norm": 8.721240193785789, + "learning_rate": 2.2784272223323527e-07, + "logits/chosen": -3.9873390197753906, + "logits/rejected": -3.3875303268432617, + "logps/chosen": -7.695770740509033, + "logps/rejected": -14.943570137023926, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27957093715667725, + "rewards/margins": 3.1959755420684814, + "rewards/rejected": -2.9164047241210938, + "step": 679 + }, + { + "epoch": 11.525423728813559, + "grad_norm": 5.950185450594928, + "learning_rate": 2.271061843006484e-07, + "logits/chosen": -5.7673821449279785, + "logits/rejected": -6.350340843200684, + "logps/chosen": -6.748774528503418, + "logps/rejected": -19.038349151611328, + "loss": 0.0834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4857510030269623, + "rewards/margins": 3.901714563369751, + "rewards/rejected": -3.415963649749756, + "step": 680 + }, + { + "epoch": 11.542372881355933, + "grad_norm": 5.8543694845957726, + "learning_rate": 2.263698467084804e-07, + "logits/chosen": -0.47089025378227234, + "logits/rejected": -6.059227466583252, + "logps/chosen": -8.830419540405273, + "logps/rejected": -32.42097091674805, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03702600300312042, + "rewards/margins": 5.787201881408691, + "rewards/rejected": -5.824227809906006, + "step": 681 + }, + { + "epoch": 11.559322033898304, + "grad_norm": 6.175920832831188, + "learning_rate": 2.2563371590031338e-07, + "logits/chosen": -5.129776954650879, + "logits/rejected": -5.330681324005127, + "logps/chosen": -8.20430850982666, + "logps/rejected": -20.813114166259766, + "loss": 0.0765, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30605781078338623, + "rewards/margins": 4.445309162139893, + "rewards/rejected": -4.139252185821533, + "step": 682 + }, + { + "epoch": 11.576271186440678, + "grad_norm": 6.113551707587027, + "learning_rate": 2.2489779831792004e-07, + "logits/chosen": -5.852320671081543, + "logits/rejected": -5.197740077972412, + "logps/chosen": -8.29546070098877, + "logps/rejected": -18.657737731933594, + "loss": 0.0693, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22355306148529053, + "rewards/margins": 3.745954990386963, + "rewards/rejected": -3.522402048110962, + "step": 683 + }, + { + "epoch": 11.59322033898305, + "grad_norm": 6.50222345937985, + "learning_rate": 2.2416210040120701e-07, + "logits/chosen": -5.681268215179443, + "logits/rejected": -0.8399734497070312, + "logps/chosen": -10.5161714553833, + "logps/rejected": -18.5819034576416, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4941365420818329, + "rewards/margins": 4.430834770202637, + "rewards/rejected": -3.9366986751556396, + "step": 684 + }, + { + "epoch": 11.610169491525424, + "grad_norm": 6.07353126745076, + "learning_rate": 2.2342662858815867e-07, + "logits/chosen": -3.1647231578826904, + "logits/rejected": -2.3442485332489014, + "logps/chosen": -12.0752534866333, + "logps/rejected": -26.96086883544922, + "loss": 0.0778, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2649264633655548, + "rewards/margins": 4.969583988189697, + "rewards/rejected": -4.704657554626465, + "step": 685 + }, + { + "epoch": 11.627118644067796, + "grad_norm": 5.5404557799180685, + "learning_rate": 2.2269138931478082e-07, + "logits/chosen": -5.84234094619751, + "logits/rejected": -5.363994121551514, + "logps/chosen": -7.794708251953125, + "logps/rejected": -14.919957160949707, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5045041441917419, + "rewards/margins": 3.4364819526672363, + "rewards/rejected": -2.9319777488708496, + "step": 686 + }, + { + "epoch": 11.64406779661017, + "grad_norm": 7.170816477696418, + "learning_rate": 2.2195638901504448e-07, + "logits/chosen": -7.435305595397949, + "logits/rejected": -2.8277387619018555, + "logps/chosen": -8.017237663269043, + "logps/rejected": -14.647412300109863, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4582928419113159, + "rewards/margins": 3.0573978424072266, + "rewards/rejected": -2.599104881286621, + "step": 687 + }, + { + "epoch": 11.661016949152543, + "grad_norm": 5.025012358974028, + "learning_rate": 2.2122163412082927e-07, + "logits/chosen": -10.500146865844727, + "logits/rejected": -7.918832778930664, + "logps/chosen": -8.945805549621582, + "logps/rejected": -18.38872528076172, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3134109377861023, + "rewards/margins": 4.1275715827941895, + "rewards/rejected": -3.8141608238220215, + "step": 688 + }, + { + "epoch": 11.677966101694915, + "grad_norm": 5.966921308745028, + "learning_rate": 2.2048713106186737e-07, + "logits/chosen": -0.8204070329666138, + "logits/rejected": -3.1629364490509033, + "logps/chosen": -7.831719875335693, + "logps/rejected": -28.17206573486328, + "loss": 0.0662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04579088091850281, + "rewards/margins": 6.21815299987793, + "rewards/rejected": -6.263944149017334, + "step": 689 + }, + { + "epoch": 11.694915254237289, + "grad_norm": 5.50353479601336, + "learning_rate": 2.197528862656871e-07, + "logits/chosen": -2.501692533493042, + "logits/rejected": -3.2817344665527344, + "logps/chosen": -7.619758605957031, + "logps/rejected": -18.473947525024414, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3159796893596649, + "rewards/margins": 3.803527355194092, + "rewards/rejected": -3.4875473976135254, + "step": 690 + }, + { + "epoch": 11.711864406779661, + "grad_norm": 5.440614178576993, + "learning_rate": 2.190189061575569e-07, + "logits/chosen": -3.884366512298584, + "logits/rejected": -6.531481742858887, + "logps/chosen": -6.7981486320495605, + "logps/rejected": -24.70162582397461, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21428728103637695, + "rewards/margins": 5.436726093292236, + "rewards/rejected": -5.222438812255859, + "step": 691 + }, + { + "epoch": 11.728813559322035, + "grad_norm": 4.6174992967015385, + "learning_rate": 2.1828519716042886e-07, + "logits/chosen": -5.7983717918396, + "logits/rejected": -4.691634178161621, + "logps/chosen": -9.15369987487793, + "logps/rejected": -23.07421875, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2911912202835083, + "rewards/margins": 4.7826385498046875, + "rewards/rejected": -4.491447448730469, + "step": 692 + }, + { + "epoch": 11.745762711864407, + "grad_norm": 5.455493758962524, + "learning_rate": 2.1755176569488273e-07, + "logits/chosen": -1.9908509254455566, + "logits/rejected": 1.1757967472076416, + "logps/chosen": -8.67979621887207, + "logps/rejected": -19.0824031829834, + "loss": 0.0721, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4927636981010437, + "rewards/margins": 4.18226432800293, + "rewards/rejected": -3.6895008087158203, + "step": 693 + }, + { + "epoch": 11.76271186440678, + "grad_norm": 6.939821229573557, + "learning_rate": 2.168186181790695e-07, + "logits/chosen": -2.7588908672332764, + "logits/rejected": -3.0735645294189453, + "logps/chosen": -8.97663688659668, + "logps/rejected": -26.288864135742188, + "loss": 0.0776, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5594474673271179, + "rewards/margins": 5.872201919555664, + "rewards/rejected": -5.312753677368164, + "step": 694 + }, + { + "epoch": 11.779661016949152, + "grad_norm": 6.5447294626149315, + "learning_rate": 2.1608576102865547e-07, + "logits/chosen": -3.781676769256592, + "logits/rejected": -3.6431427001953125, + "logps/chosen": -11.655670166015625, + "logps/rejected": -20.786060333251953, + "loss": 0.0836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08528602868318558, + "rewards/margins": 3.7099151611328125, + "rewards/rejected": -3.795201301574707, + "step": 695 + }, + { + "epoch": 11.796610169491526, + "grad_norm": 5.9844635273883995, + "learning_rate": 2.1535320065676578e-07, + "logits/chosen": -2.6350269317626953, + "logits/rejected": -1.517306923866272, + "logps/chosen": -9.438491821289062, + "logps/rejected": -23.236515045166016, + "loss": 0.0745, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21704833209514618, + "rewards/margins": 5.231599807739258, + "rewards/rejected": -5.014551639556885, + "step": 696 + }, + { + "epoch": 11.813559322033898, + "grad_norm": 6.08157149813121, + "learning_rate": 2.1462094347392884e-07, + "logits/chosen": -5.0445051193237305, + "logits/rejected": -4.414592742919922, + "logps/chosen": -9.716800689697266, + "logps/rejected": -20.61695098876953, + "loss": 0.0742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30450063943862915, + "rewards/margins": 4.38283634185791, + "rewards/rejected": -4.078335762023926, + "step": 697 + }, + { + "epoch": 11.830508474576272, + "grad_norm": 5.443804980328509, + "learning_rate": 2.1388899588801963e-07, + "logits/chosen": -6.142149925231934, + "logits/rejected": -6.617219924926758, + "logps/chosen": -9.288625717163086, + "logps/rejected": -21.615276336669922, + "loss": 0.0612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029895126819610596, + "rewards/margins": 4.5009918212890625, + "rewards/rejected": -4.530886650085449, + "step": 698 + }, + { + "epoch": 11.847457627118644, + "grad_norm": 5.261191623846564, + "learning_rate": 2.131573643042039e-07, + "logits/chosen": -2.0807480812072754, + "logits/rejected": -5.6200079917907715, + "logps/chosen": -9.350138664245605, + "logps/rejected": -24.03493309020996, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.460003137588501, + "rewards/margins": 4.993373394012451, + "rewards/rejected": -4.533370018005371, + "step": 699 + }, + { + "epoch": 11.864406779661017, + "grad_norm": 8.431810204613281, + "learning_rate": 2.1242605512488245e-07, + "logits/chosen": -2.674501895904541, + "logits/rejected": -2.5828213691711426, + "logps/chosen": -8.736321449279785, + "logps/rejected": -21.578317642211914, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9485386610031128, + "rewards/margins": 4.861410140991211, + "rewards/rejected": -3.9128713607788086, + "step": 700 + }, + { + "epoch": 11.88135593220339, + "grad_norm": 6.151330495250116, + "learning_rate": 2.116950747496342e-07, + "logits/chosen": -5.063408851623535, + "logits/rejected": -3.6914243698120117, + "logps/chosen": -10.055356979370117, + "logps/rejected": -25.340904235839844, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1456144005060196, + "rewards/margins": 5.068149089813232, + "rewards/rejected": -4.922534465789795, + "step": 701 + }, + { + "epoch": 11.898305084745763, + "grad_norm": 5.355508219261581, + "learning_rate": 2.1096442957516116e-07, + "logits/chosen": -3.058699131011963, + "logits/rejected": -3.3943307399749756, + "logps/chosen": -7.250519275665283, + "logps/rejected": -17.219770431518555, + "loss": 0.0737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6987836956977844, + "rewards/margins": 3.889528751373291, + "rewards/rejected": -3.190744638442993, + "step": 702 + }, + { + "epoch": 11.915254237288135, + "grad_norm": 6.202575722704129, + "learning_rate": 2.10234125995232e-07, + "logits/chosen": -2.8255527019500732, + "logits/rejected": -4.576816082000732, + "logps/chosen": -5.164361000061035, + "logps/rejected": -15.157373428344727, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5878112316131592, + "rewards/margins": 3.485034227371216, + "rewards/rejected": -2.8972229957580566, + "step": 703 + }, + { + "epoch": 11.932203389830509, + "grad_norm": 5.559092253894643, + "learning_rate": 2.0950417040062607e-07, + "logits/chosen": -2.9859113693237305, + "logits/rejected": -2.63259220123291, + "logps/chosen": -7.869375705718994, + "logps/rejected": -16.07195281982422, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16196058690547943, + "rewards/margins": 3.7745909690856934, + "rewards/rejected": -3.612630605697632, + "step": 704 + }, + { + "epoch": 11.94915254237288, + "grad_norm": 5.674022553504681, + "learning_rate": 2.0877456917907757e-07, + "logits/chosen": -2.325315475463867, + "logits/rejected": -2.9422802925109863, + "logps/chosen": -11.614786148071289, + "logps/rejected": -21.07794952392578, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35842809081077576, + "rewards/margins": 4.692770481109619, + "rewards/rejected": -4.334342002868652, + "step": 705 + }, + { + "epoch": 11.966101694915254, + "grad_norm": 5.541880018229243, + "learning_rate": 2.0804532871521957e-07, + "logits/chosen": -4.538088798522949, + "logits/rejected": -2.091925621032715, + "logps/chosen": -7.1544880867004395, + "logps/rejected": -23.85821533203125, + "loss": 0.0609, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29876646399497986, + "rewards/margins": 4.888092041015625, + "rewards/rejected": -4.589325904846191, + "step": 706 + }, + { + "epoch": 11.983050847457626, + "grad_norm": 6.255400445675394, + "learning_rate": 2.0731645539052842e-07, + "logits/chosen": -4.6718974113464355, + "logits/rejected": -4.809595584869385, + "logps/chosen": -6.028842926025391, + "logps/rejected": -19.23764419555664, + "loss": 0.0684, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8090718388557434, + "rewards/margins": 5.258846282958984, + "rewards/rejected": -4.449774265289307, + "step": 707 + }, + { + "epoch": 12.0, + "grad_norm": 5.115387616418879, + "learning_rate": 2.065879555832674e-07, + "logits/chosen": -7.058655738830566, + "logits/rejected": -5.943211555480957, + "logps/chosen": -8.043971061706543, + "logps/rejected": -24.434711456298828, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44858622550964355, + "rewards/margins": 5.742889881134033, + "rewards/rejected": -5.294303894042969, + "step": 708 + }, + { + "epoch": 12.016949152542374, + "grad_norm": 5.263119162726668, + "learning_rate": 2.0585983566843142e-07, + "logits/chosen": -4.840389728546143, + "logits/rejected": -2.096611499786377, + "logps/chosen": -7.987666130065918, + "logps/rejected": -25.624900817871094, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5033466815948486, + "rewards/margins": 6.154224872589111, + "rewards/rejected": -5.65087890625, + "step": 709 + }, + { + "epoch": 12.033898305084746, + "grad_norm": 5.453331088324317, + "learning_rate": 2.0513210201769083e-07, + "logits/chosen": -2.46553897857666, + "logits/rejected": -2.2434611320495605, + "logps/chosen": -9.034485816955566, + "logps/rejected": -20.003265380859375, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40343374013900757, + "rewards/margins": 4.3479838371276855, + "rewards/rejected": -3.944549560546875, + "step": 710 + }, + { + "epoch": 12.05084745762712, + "grad_norm": 6.185812331075285, + "learning_rate": 2.0440476099933602e-07, + "logits/chosen": -10.844772338867188, + "logits/rejected": -7.556892395019531, + "logps/chosen": -10.017423629760742, + "logps/rejected": -15.887184143066406, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8177813291549683, + "rewards/margins": 3.716082811355591, + "rewards/rejected": -2.898301362991333, + "step": 711 + }, + { + "epoch": 12.067796610169491, + "grad_norm": 6.247377514005049, + "learning_rate": 2.0367781897822144e-07, + "logits/chosen": -4.39729118347168, + "logits/rejected": -2.3856217861175537, + "logps/chosen": -9.275962829589844, + "logps/rejected": -21.623519897460938, + "loss": 0.0837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19558584690093994, + "rewards/margins": 4.383027076721191, + "rewards/rejected": -4.578612327575684, + "step": 712 + }, + { + "epoch": 12.084745762711865, + "grad_norm": 5.032949557222304, + "learning_rate": 2.0295128231570984e-07, + "logits/chosen": -1.5685780048370361, + "logits/rejected": -4.263853549957275, + "logps/chosen": -7.182946681976318, + "logps/rejected": -22.17525863647461, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3165295720100403, + "rewards/margins": 5.067912578582764, + "rewards/rejected": -4.751383304595947, + "step": 713 + }, + { + "epoch": 12.101694915254237, + "grad_norm": 5.183886724119971, + "learning_rate": 2.0222515736961692e-07, + "logits/chosen": -4.001754283905029, + "logits/rejected": -0.3880186378955841, + "logps/chosen": -11.332290649414062, + "logps/rejected": -31.85840606689453, + "loss": 0.0617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14374534785747528, + "rewards/margins": 6.776453971862793, + "rewards/rejected": -6.920198917388916, + "step": 714 + }, + { + "epoch": 12.11864406779661, + "grad_norm": 6.1280771227485165, + "learning_rate": 2.0149945049415546e-07, + "logits/chosen": -2.7417237758636475, + "logits/rejected": -3.603642225265503, + "logps/chosen": -7.821854591369629, + "logps/rejected": -17.309070587158203, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09123823791742325, + "rewards/margins": 3.7358312606811523, + "rewards/rejected": -3.64459228515625, + "step": 715 + }, + { + "epoch": 12.135593220338983, + "grad_norm": 4.628880282204041, + "learning_rate": 2.0077416803987963e-07, + "logits/chosen": -3.0061495304107666, + "logits/rejected": -0.7661735415458679, + "logps/chosen": -10.945513725280762, + "logps/rejected": -21.080097198486328, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4002856910228729, + "rewards/margins": 4.950218677520752, + "rewards/rejected": -4.549932956695557, + "step": 716 + }, + { + "epoch": 12.152542372881356, + "grad_norm": 5.481576880897221, + "learning_rate": 2.0004931635362982e-07, + "logits/chosen": -6.2298903465271, + "logits/rejected": -4.165480136871338, + "logps/chosen": -7.415446758270264, + "logps/rejected": -15.942654609680176, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5828918218612671, + "rewards/margins": 3.7826273441314697, + "rewards/rejected": -3.199735403060913, + "step": 717 + }, + { + "epoch": 12.169491525423728, + "grad_norm": 6.912292590497126, + "learning_rate": 1.993249017784766e-07, + "logits/chosen": -2.234239101409912, + "logits/rejected": -5.491997241973877, + "logps/chosen": -9.92994213104248, + "logps/rejected": -25.07433319091797, + "loss": 0.0734, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20779605209827423, + "rewards/margins": 4.874294757843018, + "rewards/rejected": -5.082091331481934, + "step": 718 + }, + { + "epoch": 12.186440677966102, + "grad_norm": 5.57790520194197, + "learning_rate": 1.9860093065366557e-07, + "logits/chosen": -4.336525917053223, + "logits/rejected": -4.3841552734375, + "logps/chosen": -8.806467056274414, + "logps/rejected": -17.178787231445312, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4746827781200409, + "rewards/margins": 4.141698360443115, + "rewards/rejected": -3.667015552520752, + "step": 719 + }, + { + "epoch": 12.203389830508474, + "grad_norm": 4.367463057983305, + "learning_rate": 1.9787740931456164e-07, + "logits/chosen": -4.236856937408447, + "logits/rejected": -2.3356170654296875, + "logps/chosen": -8.016395568847656, + "logps/rejected": -24.094526290893555, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06415438652038574, + "rewards/margins": 5.513944149017334, + "rewards/rejected": -5.578098773956299, + "step": 720 + }, + { + "epoch": 12.220338983050848, + "grad_norm": 4.980211475796958, + "learning_rate": 1.971543440925939e-07, + "logits/chosen": -2.7962565422058105, + "logits/rejected": -1.1794178485870361, + "logps/chosen": -7.825432300567627, + "logps/rejected": -19.563974380493164, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6326717138290405, + "rewards/margins": 4.928226947784424, + "rewards/rejected": -4.2955546379089355, + "step": 721 + }, + { + "epoch": 12.23728813559322, + "grad_norm": 5.0620318025271365, + "learning_rate": 1.9643174131519984e-07, + "logits/chosen": -2.530458450317383, + "logits/rejected": -3.3694586753845215, + "logps/chosen": -6.862641334533691, + "logps/rejected": -18.251625061035156, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48348268866539, + "rewards/margins": 4.146088600158691, + "rewards/rejected": -3.6626057624816895, + "step": 722 + }, + { + "epoch": 12.254237288135593, + "grad_norm": 4.935925973312889, + "learning_rate": 1.9570960730577032e-07, + "logits/chosen": -3.856574058532715, + "logits/rejected": -2.0448527336120605, + "logps/chosen": -10.856523513793945, + "logps/rejected": -23.267667770385742, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4581919014453888, + "rewards/margins": 5.144651412963867, + "rewards/rejected": -4.686459541320801, + "step": 723 + }, + { + "epoch": 12.271186440677965, + "grad_norm": 5.192261425939682, + "learning_rate": 1.949879483835939e-07, + "logits/chosen": -5.752046585083008, + "logits/rejected": -4.203609943389893, + "logps/chosen": -6.465901851654053, + "logps/rejected": -17.767559051513672, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44363754987716675, + "rewards/margins": 4.3280720710754395, + "rewards/rejected": -3.8844351768493652, + "step": 724 + }, + { + "epoch": 12.288135593220339, + "grad_norm": 6.164340215886749, + "learning_rate": 1.9426677086380183e-07, + "logits/chosen": -4.548156261444092, + "logits/rejected": -4.889017581939697, + "logps/chosen": -7.712782859802246, + "logps/rejected": -18.098644256591797, + "loss": 0.0855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.684238612651825, + "rewards/margins": 4.058871269226074, + "rewards/rejected": -3.3746328353881836, + "step": 725 + }, + { + "epoch": 12.305084745762711, + "grad_norm": 5.311974710959193, + "learning_rate": 1.9354608105731267e-07, + "logits/chosen": -2.2404370307922363, + "logits/rejected": -4.380362033843994, + "logps/chosen": -9.613097190856934, + "logps/rejected": -27.764812469482422, + "loss": 0.0674, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3453305661678314, + "rewards/margins": 6.031594753265381, + "rewards/rejected": -6.376925468444824, + "step": 726 + }, + { + "epoch": 12.322033898305085, + "grad_norm": 5.378269931109981, + "learning_rate": 1.9282588527077713e-07, + "logits/chosen": -2.930131435394287, + "logits/rejected": -1.716202735900879, + "logps/chosen": -10.826783180236816, + "logps/rejected": -19.09994125366211, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4629979431629181, + "rewards/margins": 4.1832051277160645, + "rewards/rejected": -3.7202072143554688, + "step": 727 + }, + { + "epoch": 12.338983050847457, + "grad_norm": 5.279035335530526, + "learning_rate": 1.9210618980652273e-07, + "logits/chosen": -2.356511354446411, + "logits/rejected": -2.470968246459961, + "logps/chosen": -7.703697204589844, + "logps/rejected": -24.5485897064209, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13995756208896637, + "rewards/margins": 5.8754167556762695, + "rewards/rejected": -6.015374660491943, + "step": 728 + }, + { + "epoch": 12.35593220338983, + "grad_norm": 5.022723359498386, + "learning_rate": 1.9138700096249883e-07, + "logits/chosen": -5.711466312408447, + "logits/rejected": -1.9884896278381348, + "logps/chosen": -11.275728225708008, + "logps/rejected": -26.373491287231445, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19736520946025848, + "rewards/margins": 5.823462963104248, + "rewards/rejected": -5.626098155975342, + "step": 729 + }, + { + "epoch": 12.372881355932204, + "grad_norm": 5.309462197313035, + "learning_rate": 1.9066832503222128e-07, + "logits/chosen": -4.810001373291016, + "logits/rejected": 0.02377176284790039, + "logps/chosen": -11.607244491577148, + "logps/rejected": -23.0208740234375, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31827598810195923, + "rewards/margins": 4.640523433685303, + "rewards/rejected": -4.32224702835083, + "step": 730 + }, + { + "epoch": 12.389830508474576, + "grad_norm": 4.6506237974520666, + "learning_rate": 1.899501683047177e-07, + "logits/chosen": -6.378528118133545, + "logits/rejected": -8.353179931640625, + "logps/chosen": -8.333313941955566, + "logps/rejected": -23.246667861938477, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32500994205474854, + "rewards/margins": 5.235971927642822, + "rewards/rejected": -4.910961151123047, + "step": 731 + }, + { + "epoch": 12.40677966101695, + "grad_norm": 5.738231213417738, + "learning_rate": 1.892325370644721e-07, + "logits/chosen": -6.588768005371094, + "logits/rejected": -4.150164604187012, + "logps/chosen": -10.492864608764648, + "logps/rejected": -16.70100212097168, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6345978379249573, + "rewards/margins": 3.688321590423584, + "rewards/rejected": -3.0537235736846924, + "step": 732 + }, + { + "epoch": 12.423728813559322, + "grad_norm": 3.9687270061891198, + "learning_rate": 1.8851543759137007e-07, + "logits/chosen": -8.90609359741211, + "logits/rejected": -6.525782108306885, + "logps/chosen": -7.384334087371826, + "logps/rejected": -19.212717056274414, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3278910517692566, + "rewards/margins": 5.070918083190918, + "rewards/rejected": -4.743027687072754, + "step": 733 + }, + { + "epoch": 12.440677966101696, + "grad_norm": 4.6943629699640725, + "learning_rate": 1.8779887616064382e-07, + "logits/chosen": -5.408133029937744, + "logits/rejected": -2.645357608795166, + "logps/chosen": -10.299897193908691, + "logps/rejected": -20.829387664794922, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3884427547454834, + "rewards/margins": 4.583587169647217, + "rewards/rejected": -4.195144176483154, + "step": 734 + }, + { + "epoch": 12.457627118644067, + "grad_norm": 5.171698330959028, + "learning_rate": 1.8708285904281712e-07, + "logits/chosen": -4.199699878692627, + "logits/rejected": -2.1007070541381836, + "logps/chosen": -7.45955228805542, + "logps/rejected": -18.949785232543945, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09868350625038147, + "rewards/margins": 4.4975457191467285, + "rewards/rejected": -4.398862361907959, + "step": 735 + }, + { + "epoch": 12.474576271186441, + "grad_norm": 5.075833146805721, + "learning_rate": 1.8636739250365056e-07, + "logits/chosen": -4.879715919494629, + "logits/rejected": -2.4047141075134277, + "logps/chosen": -11.846234321594238, + "logps/rejected": -20.621950149536133, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5136569142341614, + "rewards/margins": 4.873780250549316, + "rewards/rejected": -4.360123634338379, + "step": 736 + }, + { + "epoch": 12.491525423728813, + "grad_norm": 4.994169132268942, + "learning_rate": 1.8565248280408698e-07, + "logits/chosen": -0.9355611801147461, + "logits/rejected": -2.9057836532592773, + "logps/chosen": -8.956902503967285, + "logps/rejected": -23.145809173583984, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0478157103061676, + "rewards/margins": 4.980042457580566, + "rewards/rejected": -5.027858257293701, + "step": 737 + }, + { + "epoch": 12.508474576271187, + "grad_norm": 4.615304739621222, + "learning_rate": 1.8493813620019595e-07, + "logits/chosen": -6.629713535308838, + "logits/rejected": -4.551311492919922, + "logps/chosen": -11.932062149047852, + "logps/rejected": -27.589099884033203, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35736513137817383, + "rewards/margins": 5.075533866882324, + "rewards/rejected": -5.432898998260498, + "step": 738 + }, + { + "epoch": 12.525423728813559, + "grad_norm": 4.899248416753311, + "learning_rate": 1.8422435894311973e-07, + "logits/chosen": -7.817357540130615, + "logits/rejected": -6.282201766967773, + "logps/chosen": -7.510641574859619, + "logps/rejected": -20.142223358154297, + "loss": 0.0645, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23153254389762878, + "rewards/margins": 4.454207897186279, + "rewards/rejected": -4.222675323486328, + "step": 739 + }, + { + "epoch": 12.542372881355933, + "grad_norm": 4.572546849568084, + "learning_rate": 1.8351115727901829e-07, + "logits/chosen": -3.3442132472991943, + "logits/rejected": -3.23862886428833, + "logps/chosen": -8.925397872924805, + "logps/rejected": -25.86237907409668, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3724052906036377, + "rewards/margins": 5.9618425369262695, + "rewards/rejected": -5.589437007904053, + "step": 740 + }, + { + "epoch": 12.559322033898304, + "grad_norm": 5.890950067266909, + "learning_rate": 1.8279853744901464e-07, + "logits/chosen": -7.5882344245910645, + "logits/rejected": -7.436162948608398, + "logps/chosen": -9.983795166015625, + "logps/rejected": -19.23735237121582, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12066636979579926, + "rewards/margins": 3.7871222496032715, + "rewards/rejected": -3.6664562225341797, + "step": 741 + }, + { + "epoch": 12.576271186440678, + "grad_norm": 5.125645184008797, + "learning_rate": 1.8208650568914033e-07, + "logits/chosen": -6.591865062713623, + "logits/rejected": -5.549574851989746, + "logps/chosen": -10.08639907836914, + "logps/rejected": -19.847394943237305, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5254420638084412, + "rewards/margins": 3.9040541648864746, + "rewards/rejected": -3.3786120414733887, + "step": 742 + }, + { + "epoch": 12.59322033898305, + "grad_norm": 5.0651911831079, + "learning_rate": 1.8137506823028065e-07, + "logits/chosen": -5.7834367752075195, + "logits/rejected": -3.119711399078369, + "logps/chosen": -10.867972373962402, + "logps/rejected": -15.676458358764648, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8671466112136841, + "rewards/margins": 3.3360648155212402, + "rewards/rejected": -2.4689180850982666, + "step": 743 + }, + { + "epoch": 12.610169491525424, + "grad_norm": 5.631498137773476, + "learning_rate": 1.8066423129812026e-07, + "logits/chosen": -2.676837205886841, + "logits/rejected": -1.9061572551727295, + "logps/chosen": -11.207535743713379, + "logps/rejected": -25.026824951171875, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09902028739452362, + "rewards/margins": 5.083338260650635, + "rewards/rejected": -4.984318256378174, + "step": 744 + }, + { + "epoch": 12.627118644067796, + "grad_norm": 5.140393808106211, + "learning_rate": 1.7995400111308883e-07, + "logits/chosen": -5.798111438751221, + "logits/rejected": -4.84065580368042, + "logps/chosen": -10.029037475585938, + "logps/rejected": -19.738197326660156, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39238882064819336, + "rewards/margins": 4.5251593589782715, + "rewards/rejected": -4.132770538330078, + "step": 745 + }, + { + "epoch": 12.64406779661017, + "grad_norm": 5.651054369991429, + "learning_rate": 1.7924438389030648e-07, + "logits/chosen": -5.302846431732178, + "logits/rejected": -3.5109519958496094, + "logps/chosen": -12.664636611938477, + "logps/rejected": -25.82659912109375, + "loss": 0.0665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18043912947177887, + "rewards/margins": 5.055805206298828, + "rewards/rejected": -5.236244201660156, + "step": 746 + }, + { + "epoch": 12.661016949152543, + "grad_norm": 5.632292139453602, + "learning_rate": 1.785353858395292e-07, + "logits/chosen": -6.3755574226379395, + "logits/rejected": -6.289969444274902, + "logps/chosen": -9.670694351196289, + "logps/rejected": -17.61966896057129, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5971437692642212, + "rewards/margins": 3.45253849029541, + "rewards/rejected": -2.8553946018218994, + "step": 747 + }, + { + "epoch": 12.677966101694915, + "grad_norm": 5.14742573667959, + "learning_rate": 1.7782701316509478e-07, + "logits/chosen": -5.7314066886901855, + "logits/rejected": -3.3766136169433594, + "logps/chosen": -10.241357803344727, + "logps/rejected": -22.874204635620117, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4667729139328003, + "rewards/margins": 5.620718479156494, + "rewards/rejected": -5.1539459228515625, + "step": 748 + }, + { + "epoch": 12.694915254237289, + "grad_norm": 5.452805903306714, + "learning_rate": 1.7711927206586853e-07, + "logits/chosen": -7.6883649826049805, + "logits/rejected": -8.14977741241455, + "logps/chosen": -10.177472114562988, + "logps/rejected": -21.427345275878906, + "loss": 0.063, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34390830993652344, + "rewards/margins": 4.701626300811768, + "rewards/rejected": -4.357717990875244, + "step": 749 + }, + { + "epoch": 12.711864406779661, + "grad_norm": 4.870367464314958, + "learning_rate": 1.7641216873518876e-07, + "logits/chosen": -2.2180004119873047, + "logits/rejected": -2.018139123916626, + "logps/chosen": -8.882790565490723, + "logps/rejected": -25.841060638427734, + "loss": 0.0528, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22583720088005066, + "rewards/margins": 5.492011070251465, + "rewards/rejected": -5.26617431640625, + "step": 750 + }, + { + "epoch": 12.728813559322035, + "grad_norm": 5.243426061747342, + "learning_rate": 1.7570570936081306e-07, + "logits/chosen": -9.123380661010742, + "logits/rejected": -8.786087036132812, + "logps/chosen": -8.865097999572754, + "logps/rejected": -18.413890838623047, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4756356179714203, + "rewards/margins": 4.1750288009643555, + "rewards/rejected": -3.6993932723999023, + "step": 751 + }, + { + "epoch": 12.745762711864407, + "grad_norm": 6.5035914338241145, + "learning_rate": 1.7499990012486348e-07, + "logits/chosen": -4.059915542602539, + "logits/rejected": -4.546253681182861, + "logps/chosen": -8.72022819519043, + "logps/rejected": -30.666339874267578, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05255214869976044, + "rewards/margins": 5.699767112731934, + "rewards/rejected": -5.7523193359375, + "step": 752 + }, + { + "epoch": 12.76271186440678, + "grad_norm": 5.570786457572866, + "learning_rate": 1.7429474720377312e-07, + "logits/chosen": -7.756892204284668, + "logits/rejected": -6.753458023071289, + "logps/chosen": -8.187812805175781, + "logps/rejected": -15.387481689453125, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4039037227630615, + "rewards/margins": 3.7153241634368896, + "rewards/rejected": -3.311420202255249, + "step": 753 + }, + { + "epoch": 12.779661016949152, + "grad_norm": 6.154811299294534, + "learning_rate": 1.735902567682315e-07, + "logits/chosen": -2.9708542823791504, + "logits/rejected": 0.3598502278327942, + "logps/chosen": -11.489412307739258, + "logps/rejected": -21.9397029876709, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35112035274505615, + "rewards/margins": 4.90283203125, + "rewards/rejected": -4.551711559295654, + "step": 754 + }, + { + "epoch": 12.796610169491526, + "grad_norm": 5.353849373734212, + "learning_rate": 1.7288643498313104e-07, + "logits/chosen": -6.833067417144775, + "logits/rejected": -6.609170913696289, + "logps/chosen": -9.3986177444458, + "logps/rejected": -21.662946701049805, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4572463035583496, + "rewards/margins": 4.598602294921875, + "rewards/rejected": -4.141356468200684, + "step": 755 + }, + { + "epoch": 12.813559322033898, + "grad_norm": 5.19613647792081, + "learning_rate": 1.7218328800751285e-07, + "logits/chosen": -6.809295177459717, + "logits/rejected": -9.63763427734375, + "logps/chosen": -11.164754867553711, + "logps/rejected": -22.834970474243164, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1647118628025055, + "rewards/margins": 4.19931697845459, + "rewards/rejected": -4.034604549407959, + "step": 756 + }, + { + "epoch": 12.830508474576272, + "grad_norm": 5.032797215130399, + "learning_rate": 1.7148082199451286e-07, + "logits/chosen": -2.553732395172119, + "logits/rejected": -2.5321171283721924, + "logps/chosen": -7.590460777282715, + "logps/rejected": -25.94837188720703, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32335829734802246, + "rewards/margins": 5.733863353729248, + "rewards/rejected": -5.4105048179626465, + "step": 757 + }, + { + "epoch": 12.847457627118644, + "grad_norm": 5.461302763721142, + "learning_rate": 1.7077904309130782e-07, + "logits/chosen": -5.5044026374816895, + "logits/rejected": -3.5391619205474854, + "logps/chosen": -8.875229835510254, + "logps/rejected": -20.904315948486328, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3172765374183655, + "rewards/margins": 4.838804721832275, + "rewards/rejected": -4.521528244018555, + "step": 758 + }, + { + "epoch": 12.864406779661017, + "grad_norm": 5.630964578420358, + "learning_rate": 1.7007795743906194e-07, + "logits/chosen": -5.3108906745910645, + "logits/rejected": -5.781566619873047, + "logps/chosen": -8.304010391235352, + "logps/rejected": -21.03519058227539, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25181227922439575, + "rewards/margins": 4.968446254730225, + "rewards/rejected": -4.716634750366211, + "step": 759 + }, + { + "epoch": 12.88135593220339, + "grad_norm": 5.170697641451785, + "learning_rate": 1.6937757117287276e-07, + "logits/chosen": -6.395049095153809, + "logits/rejected": -5.568882942199707, + "logps/chosen": -12.260860443115234, + "logps/rejected": -17.58432388305664, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49597105383872986, + "rewards/margins": 3.307673692703247, + "rewards/rejected": -2.811702251434326, + "step": 760 + }, + { + "epoch": 12.898305084745763, + "grad_norm": 5.63554625757226, + "learning_rate": 1.6867789042171777e-07, + "logits/chosen": -3.965508460998535, + "logits/rejected": -3.2860910892486572, + "logps/chosen": -11.189289093017578, + "logps/rejected": -24.705078125, + "loss": 0.0573, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03508012741804123, + "rewards/margins": 5.3851799964904785, + "rewards/rejected": -5.420260906219482, + "step": 761 + }, + { + "epoch": 12.915254237288135, + "grad_norm": 5.826484833366724, + "learning_rate": 1.6797892130840036e-07, + "logits/chosen": -10.473329544067383, + "logits/rejected": -9.278584480285645, + "logps/chosen": -12.040523529052734, + "logps/rejected": -23.865997314453125, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.543164849281311, + "rewards/margins": 4.920264720916748, + "rewards/rejected": -4.377099514007568, + "step": 762 + }, + { + "epoch": 12.932203389830509, + "grad_norm": 5.169064532647496, + "learning_rate": 1.6728066994949658e-07, + "logits/chosen": -4.377648830413818, + "logits/rejected": -4.380948543548584, + "logps/chosen": -8.87318229675293, + "logps/rejected": -22.197044372558594, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24227860569953918, + "rewards/margins": 5.380238056182861, + "rewards/rejected": -5.1379594802856445, + "step": 763 + }, + { + "epoch": 12.94915254237288, + "grad_norm": 4.626923336921723, + "learning_rate": 1.6658314245530148e-07, + "logits/chosen": -3.902984619140625, + "logits/rejected": -0.4850635528564453, + "logps/chosen": -11.315608978271484, + "logps/rejected": -27.440704345703125, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.050329677760601044, + "rewards/margins": 6.103644847869873, + "rewards/rejected": -6.053314685821533, + "step": 764 + }, + { + "epoch": 12.966101694915254, + "grad_norm": 5.36856823765751, + "learning_rate": 1.6588634492977582e-07, + "logits/chosen": -6.079492568969727, + "logits/rejected": -4.708076477050781, + "logps/chosen": -12.596566200256348, + "logps/rejected": -27.42017936706543, + "loss": 0.0643, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2698909044265747, + "rewards/margins": 5.405095100402832, + "rewards/rejected": -5.135204315185547, + "step": 765 + }, + { + "epoch": 12.983050847457626, + "grad_norm": 6.43088486955128, + "learning_rate": 1.651902834704924e-07, + "logits/chosen": -6.696640491485596, + "logits/rejected": -3.6787824630737305, + "logps/chosen": -8.136153221130371, + "logps/rejected": -16.924489974975586, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07825213670730591, + "rewards/margins": 3.4701426029205322, + "rewards/rejected": -3.391890525817871, + "step": 766 + }, + { + "epoch": 13.0, + "grad_norm": 5.831494815474743, + "learning_rate": 1.6449496416858282e-07, + "logits/chosen": -5.741722106933594, + "logits/rejected": -3.713162422180176, + "logps/chosen": -11.334854125976562, + "logps/rejected": -20.919025421142578, + "loss": 0.0589, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28780484199523926, + "rewards/margins": 4.361874103546143, + "rewards/rejected": -4.074069499969482, + "step": 767 + }, + { + "epoch": 13.016949152542374, + "grad_norm": 5.025171216775976, + "learning_rate": 1.6380039310868414e-07, + "logits/chosen": -3.283559560775757, + "logits/rejected": -3.4452576637268066, + "logps/chosen": -10.504578590393066, + "logps/rejected": -24.13515853881836, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18408998847007751, + "rewards/margins": 4.807743072509766, + "rewards/rejected": -4.623653411865234, + "step": 768 + }, + { + "epoch": 13.033898305084746, + "grad_norm": 4.871931611746412, + "learning_rate": 1.631065763688857e-07, + "logits/chosen": -5.76885461807251, + "logits/rejected": -5.4496684074401855, + "logps/chosen": -7.35520601272583, + "logps/rejected": -18.789472579956055, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3069876730442047, + "rewards/margins": 4.473510265350342, + "rewards/rejected": -4.166522026062012, + "step": 769 + }, + { + "epoch": 13.05084745762712, + "grad_norm": 4.8078397196063305, + "learning_rate": 1.6241352002067588e-07, + "logits/chosen": -5.301861763000488, + "logits/rejected": -4.853543281555176, + "logps/chosen": -9.35759162902832, + "logps/rejected": -26.16400146484375, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29260993003845215, + "rewards/margins": 5.623096942901611, + "rewards/rejected": -5.33048677444458, + "step": 770 + }, + { + "epoch": 13.067796610169491, + "grad_norm": 4.196532190439989, + "learning_rate": 1.61721230128889e-07, + "logits/chosen": -1.4309144020080566, + "logits/rejected": -2.1246204376220703, + "logps/chosen": -6.944146156311035, + "logps/rejected": -26.82303237915039, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12401877343654633, + "rewards/margins": 6.203535079956055, + "rewards/rejected": -6.079516410827637, + "step": 771 + }, + { + "epoch": 13.084745762711865, + "grad_norm": 5.593577080714422, + "learning_rate": 1.6102971275165227e-07, + "logits/chosen": -9.79638957977295, + "logits/rejected": -9.661079406738281, + "logps/chosen": -8.487841606140137, + "logps/rejected": -24.75509262084961, + "loss": 0.0673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19384168088436127, + "rewards/margins": 5.573680400848389, + "rewards/rejected": -5.379838943481445, + "step": 772 + }, + { + "epoch": 13.101694915254237, + "grad_norm": 5.150797332106759, + "learning_rate": 1.603389739403327e-07, + "logits/chosen": -9.676387786865234, + "logits/rejected": -6.1768341064453125, + "logps/chosen": -8.65725326538086, + "logps/rejected": -17.568424224853516, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8734726309776306, + "rewards/margins": 3.8431637287139893, + "rewards/rejected": -2.969691276550293, + "step": 773 + }, + { + "epoch": 13.11864406779661, + "grad_norm": 5.3467717221921935, + "learning_rate": 1.5964901973948408e-07, + "logits/chosen": -2.5845415592193604, + "logits/rejected": -2.7042267322540283, + "logps/chosen": -10.360690116882324, + "logps/rejected": -22.25835418701172, + "loss": 0.0629, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15665322542190552, + "rewards/margins": 4.414241313934326, + "rewards/rejected": -4.2575883865356445, + "step": 774 + }, + { + "epoch": 13.135593220338983, + "grad_norm": 4.0651480960897945, + "learning_rate": 1.5895985618679445e-07, + "logits/chosen": -5.20015811920166, + "logits/rejected": -5.799063205718994, + "logps/chosen": -10.640718460083008, + "logps/rejected": -29.960647583007812, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23330990970134735, + "rewards/margins": 6.547354698181152, + "rewards/rejected": -6.314043998718262, + "step": 775 + }, + { + "epoch": 13.152542372881356, + "grad_norm": 4.703853020014504, + "learning_rate": 1.5827148931303275e-07, + "logits/chosen": -2.3715322017669678, + "logits/rejected": -3.2630908489227295, + "logps/chosen": -9.838077545166016, + "logps/rejected": -31.98811912536621, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32375001907348633, + "rewards/margins": 6.667496204376221, + "rewards/rejected": -6.991246223449707, + "step": 776 + }, + { + "epoch": 13.169491525423728, + "grad_norm": 5.442664426502708, + "learning_rate": 1.5758392514199643e-07, + "logits/chosen": -7.819540023803711, + "logits/rejected": -10.007020950317383, + "logps/chosen": -6.59602165222168, + "logps/rejected": -20.028287887573242, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19013340771198273, + "rewards/margins": 4.252629280090332, + "rewards/rejected": -4.062495708465576, + "step": 777 + }, + { + "epoch": 13.186440677966102, + "grad_norm": 4.768607649775109, + "learning_rate": 1.5689716969045847e-07, + "logits/chosen": -3.3728349208831787, + "logits/rejected": 0.18725888431072235, + "logps/chosen": -10.546842575073242, + "logps/rejected": -24.771841049194336, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25277644395828247, + "rewards/margins": 5.527108192443848, + "rewards/rejected": -5.274331569671631, + "step": 778 + }, + { + "epoch": 13.203389830508474, + "grad_norm": 5.460483523807034, + "learning_rate": 1.5621122896811522e-07, + "logits/chosen": -4.294860363006592, + "logits/rejected": -2.532308578491211, + "logps/chosen": -9.680610656738281, + "logps/rejected": -26.630155563354492, + "loss": 0.0624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0907769426703453, + "rewards/margins": 5.914505958557129, + "rewards/rejected": -5.823729038238525, + "step": 779 + }, + { + "epoch": 13.220338983050848, + "grad_norm": 5.4173458280819435, + "learning_rate": 1.555261089775329e-07, + "logits/chosen": -7.844461917877197, + "logits/rejected": -6.5587477684021, + "logps/chosen": -7.928609371185303, + "logps/rejected": -20.538318634033203, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5045410394668579, + "rewards/margins": 4.878368854522705, + "rewards/rejected": -4.373827934265137, + "step": 780 + }, + { + "epoch": 13.23728813559322, + "grad_norm": 4.976900202865331, + "learning_rate": 1.548418157140961e-07, + "logits/chosen": -3.608536720275879, + "logits/rejected": -6.020019054412842, + "logps/chosen": -9.638221740722656, + "logps/rejected": -27.089221954345703, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0017676204442977905, + "rewards/margins": 5.250937461853027, + "rewards/rejected": -5.249169826507568, + "step": 781 + }, + { + "epoch": 13.254237288135593, + "grad_norm": 4.686999391779152, + "learning_rate": 1.5415835516595463e-07, + "logits/chosen": -4.551198959350586, + "logits/rejected": -5.998251914978027, + "logps/chosen": -8.980722427368164, + "logps/rejected": -18.155977249145508, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38983237743377686, + "rewards/margins": 4.47466516494751, + "rewards/rejected": -4.084832191467285, + "step": 782 + }, + { + "epoch": 13.271186440677965, + "grad_norm": 4.5955159595956605, + "learning_rate": 1.5347573331397135e-07, + "logits/chosen": -2.852198600769043, + "logits/rejected": 1.1061744689941406, + "logps/chosen": -14.022387504577637, + "logps/rejected": -28.37848663330078, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03788289427757263, + "rewards/margins": 6.151963233947754, + "rewards/rejected": -6.114079475402832, + "step": 783 + }, + { + "epoch": 13.288135593220339, + "grad_norm": 5.598431066257892, + "learning_rate": 1.5279395613166985e-07, + "logits/chosen": -6.422479629516602, + "logits/rejected": -2.8612544536590576, + "logps/chosen": -10.72292709350586, + "logps/rejected": -20.895212173461914, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2630663514137268, + "rewards/margins": 4.2991042137146, + "rewards/rejected": -4.036037921905518, + "step": 784 + }, + { + "epoch": 13.305084745762711, + "grad_norm": 4.614906303156567, + "learning_rate": 1.5211302958518214e-07, + "logits/chosen": -5.681360244750977, + "logits/rejected": -2.0222373008728027, + "logps/chosen": -10.26196575164795, + "logps/rejected": -23.793285369873047, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12604331970214844, + "rewards/margins": 5.187099933624268, + "rewards/rejected": -5.061056137084961, + "step": 785 + }, + { + "epoch": 13.322033898305085, + "grad_norm": 4.626071080438597, + "learning_rate": 1.5143295963319642e-07, + "logits/chosen": -10.177433013916016, + "logits/rejected": -6.988361358642578, + "logps/chosen": -8.759820938110352, + "logps/rejected": -20.91160011291504, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06292536109685898, + "rewards/margins": 5.223330020904541, + "rewards/rejected": -5.160404682159424, + "step": 786 + }, + { + "epoch": 13.338983050847457, + "grad_norm": 4.643506672060413, + "learning_rate": 1.5075375222690496e-07, + "logits/chosen": -5.985132694244385, + "logits/rejected": -4.864323139190674, + "logps/chosen": -9.475852012634277, + "logps/rejected": -23.88833999633789, + "loss": 0.0482, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5428158044815063, + "rewards/margins": 5.8675432205200195, + "rewards/rejected": -5.3247270584106445, + "step": 787 + }, + { + "epoch": 13.35593220338983, + "grad_norm": 5.251627791060617, + "learning_rate": 1.5007541330995198e-07, + "logits/chosen": -8.681631088256836, + "logits/rejected": -5.984912395477295, + "logps/chosen": -11.887125968933105, + "logps/rejected": -21.75457763671875, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10201403498649597, + "rewards/margins": 3.9955077171325684, + "rewards/rejected": -3.893493413925171, + "step": 788 + }, + { + "epoch": 13.372881355932204, + "grad_norm": 5.767774243872824, + "learning_rate": 1.4939794881838176e-07, + "logits/chosen": -8.51830768585205, + "logits/rejected": -5.052129745483398, + "logps/chosen": -11.381778717041016, + "logps/rejected": -18.73246955871582, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6155125498771667, + "rewards/margins": 3.5688974857330322, + "rewards/rejected": -2.9533848762512207, + "step": 789 + }, + { + "epoch": 13.389830508474576, + "grad_norm": 4.824365860620941, + "learning_rate": 1.487213646805866e-07, + "logits/chosen": -9.106437683105469, + "logits/rejected": -6.022385597229004, + "logps/chosen": -9.311746597290039, + "logps/rejected": -12.84929084777832, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3897091746330261, + "rewards/margins": 3.2690494060516357, + "rewards/rejected": -2.879340171813965, + "step": 790 + }, + { + "epoch": 13.40677966101695, + "grad_norm": 5.650311473571556, + "learning_rate": 1.4804566681725496e-07, + "logits/chosen": -4.26154899597168, + "logits/rejected": -3.876025915145874, + "logps/chosen": -8.089516639709473, + "logps/rejected": -18.416603088378906, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.020768344402313232, + "rewards/margins": 4.610015392303467, + "rewards/rejected": -4.589247226715088, + "step": 791 + }, + { + "epoch": 13.423728813559322, + "grad_norm": 4.381684628603866, + "learning_rate": 1.473708611413194e-07, + "logits/chosen": -3.8422200679779053, + "logits/rejected": -4.606509208679199, + "logps/chosen": -8.617932319641113, + "logps/rejected": -21.121320724487305, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10309768468141556, + "rewards/margins": 4.8339691162109375, + "rewards/rejected": -4.730871200561523, + "step": 792 + }, + { + "epoch": 13.440677966101696, + "grad_norm": 4.645592608030263, + "learning_rate": 1.4669695355790552e-07, + "logits/chosen": -6.06674861907959, + "logits/rejected": -7.425795555114746, + "logps/chosen": -12.599617004394531, + "logps/rejected": -24.435813903808594, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21700027585029602, + "rewards/margins": 4.384744167327881, + "rewards/rejected": -4.167743682861328, + "step": 793 + }, + { + "epoch": 13.457627118644067, + "grad_norm": 4.489498968125063, + "learning_rate": 1.4602394996427942e-07, + "logits/chosen": -5.401136875152588, + "logits/rejected": -4.132033348083496, + "logps/chosen": -8.727059364318848, + "logps/rejected": -16.557653427124023, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43375176191329956, + "rewards/margins": 3.8064019680023193, + "rewards/rejected": -3.372650146484375, + "step": 794 + }, + { + "epoch": 13.474576271186441, + "grad_norm": 5.383940537311471, + "learning_rate": 1.4535185624979687e-07, + "logits/chosen": -5.567165374755859, + "logits/rejected": -4.549276828765869, + "logps/chosen": -10.690811157226562, + "logps/rejected": -23.260143280029297, + "loss": 0.0618, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.002038337290287018, + "rewards/margins": 5.000219345092773, + "rewards/rejected": -5.002257823944092, + "step": 795 + }, + { + "epoch": 13.491525423728813, + "grad_norm": 4.796108266845068, + "learning_rate": 1.4468067829585108e-07, + "logits/chosen": -5.905200004577637, + "logits/rejected": -6.398066997528076, + "logps/chosen": -8.744976043701172, + "logps/rejected": -25.000789642333984, + "loss": 0.0558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5944421291351318, + "rewards/margins": 5.0441575050354, + "rewards/rejected": -4.4497151374816895, + "step": 796 + }, + { + "epoch": 13.508474576271187, + "grad_norm": 4.414074994930087, + "learning_rate": 1.4401042197582192e-07, + "logits/chosen": -4.122818946838379, + "logits/rejected": -5.4599175453186035, + "logps/chosen": -6.608936309814453, + "logps/rejected": -23.324615478515625, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14724817872047424, + "rewards/margins": 5.297210216522217, + "rewards/rejected": -5.149961471557617, + "step": 797 + }, + { + "epoch": 13.525423728813559, + "grad_norm": 5.569789182924492, + "learning_rate": 1.4334109315502392e-07, + "logits/chosen": -8.031323432922363, + "logits/rejected": -10.76991081237793, + "logps/chosen": -10.417862892150879, + "logps/rejected": -24.26569366455078, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0844353437423706, + "rewards/margins": 4.921995162963867, + "rewards/rejected": -5.006430149078369, + "step": 798 + }, + { + "epoch": 13.542372881355933, + "grad_norm": 4.99311530534324, + "learning_rate": 1.4267269769065537e-07, + "logits/chosen": -5.482168674468994, + "logits/rejected": -2.70070743560791, + "logps/chosen": -9.569720268249512, + "logps/rejected": -21.80959129333496, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5751017332077026, + "rewards/margins": 5.077325344085693, + "rewards/rejected": -4.502223968505859, + "step": 799 + }, + { + "epoch": 13.559322033898304, + "grad_norm": 4.3955518518263155, + "learning_rate": 1.4200524143174676e-07, + "logits/chosen": -7.552706718444824, + "logits/rejected": -4.6099114418029785, + "logps/chosen": -10.882405281066895, + "logps/rejected": -27.207590103149414, + "loss": 0.0512, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.36488914489746094, + "rewards/margins": 5.9867987632751465, + "rewards/rejected": -6.351687431335449, + "step": 800 + }, + { + "epoch": 13.576271186440678, + "grad_norm": 4.775382165631764, + "learning_rate": 1.4133873021910976e-07, + "logits/chosen": -1.3878110647201538, + "logits/rejected": 1.4275829792022705, + "logps/chosen": -9.222979545593262, + "logps/rejected": -21.31915283203125, + "loss": 0.0599, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2374023199081421, + "rewards/margins": 4.305314540863037, + "rewards/rejected": -4.0679121017456055, + "step": 801 + }, + { + "epoch": 13.59322033898305, + "grad_norm": 5.569756855857142, + "learning_rate": 1.4067316988528616e-07, + "logits/chosen": -6.881744861602783, + "logits/rejected": -3.4725799560546875, + "logps/chosen": -11.406499862670898, + "logps/rejected": -22.18549156188965, + "loss": 0.0677, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6805269122123718, + "rewards/margins": 5.206305027008057, + "rewards/rejected": -4.525778293609619, + "step": 802 + }, + { + "epoch": 13.610169491525424, + "grad_norm": 5.280571870724313, + "learning_rate": 1.4000856625449664e-07, + "logits/chosen": -9.837797164916992, + "logits/rejected": -6.981907367706299, + "logps/chosen": -10.905115127563477, + "logps/rejected": -23.980911254882812, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20386749505996704, + "rewards/margins": 5.112873077392578, + "rewards/rejected": -4.909005641937256, + "step": 803 + }, + { + "epoch": 13.627118644067796, + "grad_norm": 4.716612003997788, + "learning_rate": 1.3934492514259003e-07, + "logits/chosen": -7.098214626312256, + "logits/rejected": -7.429448127746582, + "logps/chosen": -9.4953031539917, + "logps/rejected": -22.437997817993164, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6349058151245117, + "rewards/margins": 5.2428460121154785, + "rewards/rejected": -4.607940673828125, + "step": 804 + }, + { + "epoch": 13.64406779661017, + "grad_norm": 5.461889345352294, + "learning_rate": 1.3868225235699216e-07, + "logits/chosen": -4.462307453155518, + "logits/rejected": -4.318324565887451, + "logps/chosen": -11.038957595825195, + "logps/rejected": -25.736881256103516, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12135696411132812, + "rewards/margins": 5.704156875610352, + "rewards/rejected": -5.82551383972168, + "step": 805 + }, + { + "epoch": 13.661016949152543, + "grad_norm": 4.634781728056408, + "learning_rate": 1.3802055369665533e-07, + "logits/chosen": -3.216710329055786, + "logits/rejected": -1.861129641532898, + "logps/chosen": -8.412116050720215, + "logps/rejected": -21.057117462158203, + "loss": 0.0558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2881760001182556, + "rewards/margins": 4.603650093078613, + "rewards/rejected": -4.315474033355713, + "step": 806 + }, + { + "epoch": 13.677966101694915, + "grad_norm": 4.989725061376435, + "learning_rate": 1.373598349520073e-07, + "logits/chosen": -6.761763572692871, + "logits/rejected": -4.394807815551758, + "logps/chosen": -6.5797576904296875, + "logps/rejected": -18.811870574951172, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08568145334720612, + "rewards/margins": 4.4652605056762695, + "rewards/rejected": -4.379578590393066, + "step": 807 + }, + { + "epoch": 13.694915254237289, + "grad_norm": 4.680733186063038, + "learning_rate": 1.3670010190490073e-07, + "logits/chosen": -4.460152626037598, + "logits/rejected": -1.1188685894012451, + "logps/chosen": -13.548200607299805, + "logps/rejected": -31.156145095825195, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.218767911195755, + "rewards/margins": 5.674424171447754, + "rewards/rejected": -5.893191814422607, + "step": 808 + }, + { + "epoch": 13.711864406779661, + "grad_norm": 4.663236624627772, + "learning_rate": 1.3604136032856268e-07, + "logits/chosen": -7.653531074523926, + "logits/rejected": -5.643984794616699, + "logps/chosen": -9.94079303741455, + "logps/rejected": -27.503480911254883, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43097323179244995, + "rewards/margins": 5.7583465576171875, + "rewards/rejected": -5.327373027801514, + "step": 809 + }, + { + "epoch": 13.728813559322035, + "grad_norm": 4.730840953500074, + "learning_rate": 1.3538361598754382e-07, + "logits/chosen": -5.666280269622803, + "logits/rejected": -3.015167474746704, + "logps/chosen": -9.375484466552734, + "logps/rejected": -27.635007858276367, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13278841972351074, + "rewards/margins": 6.769484519958496, + "rewards/rejected": -6.636696815490723, + "step": 810 + }, + { + "epoch": 13.745762711864407, + "grad_norm": 5.490668032046639, + "learning_rate": 1.3472687463766848e-07, + "logits/chosen": -4.392643928527832, + "logits/rejected": -5.210497856140137, + "logps/chosen": -9.478880882263184, + "logps/rejected": -18.758119583129883, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3752535581588745, + "rewards/margins": 4.221700191497803, + "rewards/rejected": -3.8464465141296387, + "step": 811 + }, + { + "epoch": 13.76271186440678, + "grad_norm": 5.590368558372794, + "learning_rate": 1.3407114202598368e-07, + "logits/chosen": -6.054229259490967, + "logits/rejected": -5.513613700866699, + "logps/chosen": -7.454735279083252, + "logps/rejected": -15.90285873413086, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13070914149284363, + "rewards/margins": 3.5557241439819336, + "rewards/rejected": -3.4250149726867676, + "step": 812 + }, + { + "epoch": 13.779661016949152, + "grad_norm": 4.735779420046063, + "learning_rate": 1.3341642389070926e-07, + "logits/chosen": -4.318917751312256, + "logits/rejected": -5.7969183921813965, + "logps/chosen": -9.046194076538086, + "logps/rejected": -22.119319915771484, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45455262064933777, + "rewards/margins": 4.551388263702393, + "rewards/rejected": -4.096835613250732, + "step": 813 + }, + { + "epoch": 13.796610169491526, + "grad_norm": 4.898977970422225, + "learning_rate": 1.3276272596118728e-07, + "logits/chosen": -5.6698713302612305, + "logits/rejected": -6.0464863777160645, + "logps/chosen": -14.398414611816406, + "logps/rejected": -27.341615676879883, + "loss": 0.0598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03556087613105774, + "rewards/margins": 5.173557758331299, + "rewards/rejected": -5.209118843078613, + "step": 814 + }, + { + "epoch": 13.813559322033898, + "grad_norm": 5.281083471645692, + "learning_rate": 1.3211005395783244e-07, + "logits/chosen": -7.419910430908203, + "logits/rejected": -5.7298359870910645, + "logps/chosen": -11.2025728225708, + "logps/rejected": -25.59320068359375, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4744488298892975, + "rewards/margins": 5.016169548034668, + "rewards/rejected": -4.541720390319824, + "step": 815 + }, + { + "epoch": 13.830508474576272, + "grad_norm": 5.083021458478777, + "learning_rate": 1.3145841359208148e-07, + "logits/chosen": -6.718883991241455, + "logits/rejected": -5.874282360076904, + "logps/chosen": -10.914901733398438, + "logps/rejected": -26.438758850097656, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15237705409526825, + "rewards/margins": 5.802890777587891, + "rewards/rejected": -5.650513172149658, + "step": 816 + }, + { + "epoch": 13.847457627118644, + "grad_norm": 5.160155412631123, + "learning_rate": 1.308078105663437e-07, + "logits/chosen": -4.9868268966674805, + "logits/rejected": -6.539076805114746, + "logps/chosen": -10.791238784790039, + "logps/rejected": -22.204166412353516, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15927231311798096, + "rewards/margins": 4.936960697174072, + "rewards/rejected": -4.777688503265381, + "step": 817 + }, + { + "epoch": 13.864406779661017, + "grad_norm": 4.142873406210084, + "learning_rate": 1.3015825057395058e-07, + "logits/chosen": -4.892223358154297, + "logits/rejected": -4.1584930419921875, + "logps/chosen": -5.886479377746582, + "logps/rejected": -17.27469253540039, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7119540572166443, + "rewards/margins": 4.798526287078857, + "rewards/rejected": -4.086572170257568, + "step": 818 + }, + { + "epoch": 13.88135593220339, + "grad_norm": 5.554456588133767, + "learning_rate": 1.2950973929910619e-07, + "logits/chosen": -9.246594429016113, + "logits/rejected": -4.345804214477539, + "logps/chosen": -10.645049095153809, + "logps/rejected": -23.079753875732422, + "loss": 0.0735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04009474813938141, + "rewards/margins": 5.105522155761719, + "rewards/rejected": -5.14561653137207, + "step": 819 + }, + { + "epoch": 13.898305084745763, + "grad_norm": 7.076055638827002, + "learning_rate": 1.2886228241683748e-07, + "logits/chosen": -6.933260917663574, + "logits/rejected": -4.676466941833496, + "logps/chosen": -9.63884449005127, + "logps/rejected": -24.883792877197266, + "loss": 0.0547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6752225756645203, + "rewards/margins": 6.20891809463501, + "rewards/rejected": -5.533695697784424, + "step": 820 + }, + { + "epoch": 13.915254237288135, + "grad_norm": 5.578307094576147, + "learning_rate": 1.282158855929445e-07, + "logits/chosen": -5.170792102813721, + "logits/rejected": -4.953229904174805, + "logps/chosen": -10.70461654663086, + "logps/rejected": -23.960208892822266, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2784963846206665, + "rewards/margins": 4.557374954223633, + "rewards/rejected": -4.278878688812256, + "step": 821 + }, + { + "epoch": 13.932203389830509, + "grad_norm": 5.40778040476045, + "learning_rate": 1.275705544839509e-07, + "logits/chosen": -9.584942817687988, + "logits/rejected": -7.174380302429199, + "logps/chosen": -11.899053573608398, + "logps/rejected": -22.629398345947266, + "loss": 0.0645, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3301153779029846, + "rewards/margins": 4.465504169464111, + "rewards/rejected": -4.13538932800293, + "step": 822 + }, + { + "epoch": 13.94915254237288, + "grad_norm": 4.935477856465082, + "learning_rate": 1.2692629473705452e-07, + "logits/chosen": -6.729738235473633, + "logits/rejected": -5.973828315734863, + "logps/chosen": -9.960297584533691, + "logps/rejected": -18.796049118041992, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06524574756622314, + "rewards/margins": 4.380879878997803, + "rewards/rejected": -4.315633773803711, + "step": 823 + }, + { + "epoch": 13.966101694915254, + "grad_norm": 4.711427189745618, + "learning_rate": 1.2628311199007762e-07, + "logits/chosen": -5.152575492858887, + "logits/rejected": -3.88008975982666, + "logps/chosen": -9.844609260559082, + "logps/rejected": -20.80732536315918, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09208326786756516, + "rewards/margins": 4.425102710723877, + "rewards/rejected": -4.517186164855957, + "step": 824 + }, + { + "epoch": 13.983050847457626, + "grad_norm": 5.01651920767794, + "learning_rate": 1.2564101187141828e-07, + "logits/chosen": -8.57741641998291, + "logits/rejected": -6.389130592346191, + "logps/chosen": -6.646769046783447, + "logps/rejected": -20.195098876953125, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5168790817260742, + "rewards/margins": 4.831752777099609, + "rewards/rejected": -4.314873695373535, + "step": 825 + }, + { + "epoch": 14.0, + "grad_norm": 4.532284739310805, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -5.515041351318359, + "logits/rejected": -2.865309715270996, + "logps/chosen": -13.2843656539917, + "logps/rejected": -25.315540313720703, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19785913825035095, + "rewards/margins": 5.822000980377197, + "rewards/rejected": -6.01986026763916, + "step": 826 + }, + { + "epoch": 14.016949152542374, + "grad_norm": 4.026799410999227, + "learning_rate": 1.2436008198522374e-07, + "logits/chosen": -5.045407295227051, + "logits/rejected": -4.171008586883545, + "logps/chosen": -9.222007751464844, + "logps/rejected": -22.0806827545166, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6829844117164612, + "rewards/margins": 5.049844741821289, + "rewards/rejected": -4.366860389709473, + "step": 827 + }, + { + "epoch": 14.033898305084746, + "grad_norm": 5.238355012181834, + "learning_rate": 1.2372126342691797e-07, + "logits/chosen": -4.42636775970459, + "logits/rejected": -4.819035530090332, + "logps/chosen": -11.446056365966797, + "logps/rejected": -24.577653884887695, + "loss": 0.0599, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1556832194328308, + "rewards/margins": 4.2256999015808105, + "rewards/rejected": -4.381383419036865, + "step": 828 + }, + { + "epoch": 14.05084745762712, + "grad_norm": 5.126062091047994, + "learning_rate": 1.2308354991529006e-07, + "logits/chosen": -0.3282977342605591, + "logits/rejected": -4.660909652709961, + "logps/chosen": -8.18812370300293, + "logps/rejected": -21.934959411621094, + "loss": 0.0624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6030260324478149, + "rewards/margins": 3.95294451713562, + "rewards/rejected": -3.3499186038970947, + "step": 829 + }, + { + "epoch": 14.067796610169491, + "grad_norm": 4.911341582897822, + "learning_rate": 1.2244694703087727e-07, + "logits/chosen": -7.362742900848389, + "logits/rejected": -4.4875359535217285, + "logps/chosen": -14.022963523864746, + "logps/rejected": -23.285682678222656, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14519502222537994, + "rewards/margins": 4.577618598937988, + "rewards/rejected": -4.4324235916137695, + "step": 830 + }, + { + "epoch": 14.084745762711865, + "grad_norm": 4.4598825344534045, + "learning_rate": 1.2181146034449807e-07, + "logits/chosen": -7.692203998565674, + "logits/rejected": -5.220171928405762, + "logps/chosen": -8.62202262878418, + "logps/rejected": -17.138887405395508, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3423607349395752, + "rewards/margins": 4.227006912231445, + "rewards/rejected": -3.884646415710449, + "step": 831 + }, + { + "epoch": 14.101694915254237, + "grad_norm": 4.8353921985009665, + "learning_rate": 1.2117709541720306e-07, + "logits/chosen": -5.6293487548828125, + "logits/rejected": -6.787869453430176, + "logps/chosen": -14.639093399047852, + "logps/rejected": -27.194129943847656, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09962216019630432, + "rewards/margins": 4.305108070373535, + "rewards/rejected": -4.205486297607422, + "step": 832 + }, + { + "epoch": 14.11864406779661, + "grad_norm": 5.568638154486435, + "learning_rate": 1.2054385780022655e-07, + "logits/chosen": -4.221851825714111, + "logits/rejected": -6.096976280212402, + "logps/chosen": -8.260061264038086, + "logps/rejected": -19.653345108032227, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.706796407699585, + "rewards/margins": 4.410439968109131, + "rewards/rejected": -3.703643560409546, + "step": 833 + }, + { + "epoch": 14.135593220338983, + "grad_norm": 4.675450379157052, + "learning_rate": 1.199117530349379e-07, + "logits/chosen": -5.025335311889648, + "logits/rejected": -5.071759223937988, + "logps/chosen": -10.142559051513672, + "logps/rejected": -21.743587493896484, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0700664073228836, + "rewards/margins": 4.8901286125183105, + "rewards/rejected": -4.820062160491943, + "step": 834 + }, + { + "epoch": 14.152542372881356, + "grad_norm": 4.592697469691946, + "learning_rate": 1.192807866527931e-07, + "logits/chosen": -7.847444534301758, + "logits/rejected": -6.507003307342529, + "logps/chosen": -10.491202354431152, + "logps/rejected": -21.514968872070312, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2933105528354645, + "rewards/margins": 5.166686534881592, + "rewards/rejected": -4.87337589263916, + "step": 835 + }, + { + "epoch": 14.169491525423728, + "grad_norm": 5.075415557100472, + "learning_rate": 1.1865096417528633e-07, + "logits/chosen": -6.532116413116455, + "logits/rejected": -4.443775177001953, + "logps/chosen": -8.443931579589844, + "logps/rejected": -23.405532836914062, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06578953564167023, + "rewards/margins": 5.535974502563477, + "rewards/rejected": -5.470184326171875, + "step": 836 + }, + { + "epoch": 14.186440677966102, + "grad_norm": 4.296095365548908, + "learning_rate": 1.1802229111390155e-07, + "logits/chosen": -1.0841858386993408, + "logits/rejected": -3.881087303161621, + "logps/chosen": -8.981799125671387, + "logps/rejected": -31.96260643005371, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05703720450401306, + "rewards/margins": 7.540444374084473, + "rewards/rejected": -7.483407497406006, + "step": 837 + }, + { + "epoch": 14.203389830508474, + "grad_norm": 4.434101376540436, + "learning_rate": 1.173947729700644e-07, + "logits/chosen": -5.153438091278076, + "logits/rejected": -4.827155113220215, + "logps/chosen": -12.063497543334961, + "logps/rejected": -26.99883270263672, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.024201437830924988, + "rewards/margins": 5.5701093673706055, + "rewards/rejected": -5.545908451080322, + "step": 838 + }, + { + "epoch": 14.220338983050848, + "grad_norm": 4.706145613555284, + "learning_rate": 1.1676841523509398e-07, + "logits/chosen": -8.671034812927246, + "logits/rejected": -6.122951507568359, + "logps/chosen": -11.669129371643066, + "logps/rejected": -21.667890548706055, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29969891905784607, + "rewards/margins": 4.794971942901611, + "rewards/rejected": -4.495272636413574, + "step": 839 + }, + { + "epoch": 14.23728813559322, + "grad_norm": 4.580061612060666, + "learning_rate": 1.1614322339015484e-07, + "logits/chosen": -3.6043882369995117, + "logits/rejected": -4.311732292175293, + "logps/chosen": -13.588913917541504, + "logps/rejected": -29.708572387695312, + "loss": 0.0456, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38750195503234863, + "rewards/margins": 6.415001392364502, + "rewards/rejected": -6.802502632141113, + "step": 840 + }, + { + "epoch": 14.254237288135593, + "grad_norm": 4.3425766056242265, + "learning_rate": 1.1551920290620903e-07, + "logits/chosen": -4.859739780426025, + "logits/rejected": -6.222860336303711, + "logps/chosen": -7.076902389526367, + "logps/rejected": -19.721343994140625, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23864012956619263, + "rewards/margins": 4.4084906578063965, + "rewards/rejected": -4.1698503494262695, + "step": 841 + }, + { + "epoch": 14.271186440677965, + "grad_norm": 4.836047729889714, + "learning_rate": 1.1489635924396815e-07, + "logits/chosen": -8.631258964538574, + "logits/rejected": -10.491349220275879, + "logps/chosen": -12.263872146606445, + "logps/rejected": -31.80056381225586, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026434574276208878, + "rewards/margins": 5.849242687225342, + "rewards/rejected": -5.822807788848877, + "step": 842 + }, + { + "epoch": 14.288135593220339, + "grad_norm": 5.8126592232749354, + "learning_rate": 1.1427469785384558e-07, + "logits/chosen": -1.0946826934814453, + "logits/rejected": -3.664766788482666, + "logps/chosen": -10.965002059936523, + "logps/rejected": -21.787538528442383, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5762641429901123, + "rewards/margins": 4.274231433868408, + "rewards/rejected": -3.697967529296875, + "step": 843 + }, + { + "epoch": 14.305084745762711, + "grad_norm": 5.0651053975440705, + "learning_rate": 1.1365422417590878e-07, + "logits/chosen": -3.439162254333496, + "logits/rejected": 1.3838034868240356, + "logps/chosen": -13.087545394897461, + "logps/rejected": -30.054899215698242, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21956637501716614, + "rewards/margins": 6.333641529083252, + "rewards/rejected": -6.114074230194092, + "step": 844 + }, + { + "epoch": 14.322033898305085, + "grad_norm": 5.096385076595446, + "learning_rate": 1.1303494363983196e-07, + "logits/chosen": -6.096810340881348, + "logits/rejected": -5.0730085372924805, + "logps/chosen": -7.8111467361450195, + "logps/rejected": -15.42635726928711, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21816721558570862, + "rewards/margins": 3.222285032272339, + "rewards/rejected": -3.004117965698242, + "step": 845 + }, + { + "epoch": 14.338983050847457, + "grad_norm": 5.033259197227216, + "learning_rate": 1.1241686166484804e-07, + "logits/chosen": -7.2436065673828125, + "logits/rejected": -5.567520618438721, + "logps/chosen": -10.714032173156738, + "logps/rejected": -20.56938362121582, + "loss": 0.0538, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04737303406000137, + "rewards/margins": 4.353013515472412, + "rewards/rejected": -4.30564022064209, + "step": 846 + }, + { + "epoch": 14.35593220338983, + "grad_norm": 4.57859881932484, + "learning_rate": 1.1179998365970172e-07, + "logits/chosen": -4.692959308624268, + "logits/rejected": -2.923541784286499, + "logps/chosen": -9.625382423400879, + "logps/rejected": -21.11334228515625, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3075934946537018, + "rewards/margins": 5.190447807312012, + "rewards/rejected": -4.882854461669922, + "step": 847 + }, + { + "epoch": 14.372881355932204, + "grad_norm": 4.804675788465381, + "learning_rate": 1.1118431502260162e-07, + "logits/chosen": -4.305268287658691, + "logits/rejected": -3.777709484100342, + "logps/chosen": -6.272968292236328, + "logps/rejected": -21.6550235748291, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6106405258178711, + "rewards/margins": 5.49233341217041, + "rewards/rejected": -4.881693363189697, + "step": 848 + }, + { + "epoch": 14.389830508474576, + "grad_norm": 4.617090621688473, + "learning_rate": 1.1056986114117367e-07, + "logits/chosen": -6.763372898101807, + "logits/rejected": -5.744204521179199, + "logps/chosen": -8.708209037780762, + "logps/rejected": -17.52611541748047, + "loss": 0.0435, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39696207642555237, + "rewards/margins": 3.6503450870513916, + "rewards/rejected": -3.253383159637451, + "step": 849 + }, + { + "epoch": 14.40677966101695, + "grad_norm": 4.362269121942627, + "learning_rate": 1.0995662739241346e-07, + "logits/chosen": -4.924978256225586, + "logits/rejected": -5.879234313964844, + "logps/chosen": -12.350116729736328, + "logps/rejected": -28.583274841308594, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15409313142299652, + "rewards/margins": 5.438337802886963, + "rewards/rejected": -5.284245014190674, + "step": 850 + }, + { + "epoch": 14.423728813559322, + "grad_norm": 4.197218623377624, + "learning_rate": 1.0934461914263965e-07, + "logits/chosen": -6.324525833129883, + "logits/rejected": -3.778895378112793, + "logps/chosen": -8.720869064331055, + "logps/rejected": -19.028125762939453, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3219558000564575, + "rewards/margins": 4.60241174697876, + "rewards/rejected": -4.280456066131592, + "step": 851 + }, + { + "epoch": 14.440677966101696, + "grad_norm": 4.923401961976329, + "learning_rate": 1.087338417474464e-07, + "logits/chosen": -5.350639820098877, + "logits/rejected": -7.599613666534424, + "logps/chosen": -8.26685619354248, + "logps/rejected": -24.998441696166992, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20556461811065674, + "rewards/margins": 5.382943153381348, + "rewards/rejected": -5.588507175445557, + "step": 852 + }, + { + "epoch": 14.457627118644067, + "grad_norm": 3.7969022055546713, + "learning_rate": 1.0812430055165709e-07, + "logits/chosen": -5.8174920082092285, + "logits/rejected": -6.306069374084473, + "logps/chosen": -12.027060508728027, + "logps/rejected": -25.023365020751953, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5196723937988281, + "rewards/margins": 5.552087783813477, + "rewards/rejected": -5.032415390014648, + "step": 853 + }, + { + "epoch": 14.474576271186441, + "grad_norm": 4.15778836042614, + "learning_rate": 1.0751600088927712e-07, + "logits/chosen": -3.9236257076263428, + "logits/rejected": -3.4127421379089355, + "logps/chosen": -8.900745391845703, + "logps/rejected": -25.145057678222656, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26206323504447937, + "rewards/margins": 5.242336273193359, + "rewards/rejected": -4.9802727699279785, + "step": 854 + }, + { + "epoch": 14.491525423728813, + "grad_norm": 4.203245294110667, + "learning_rate": 1.0690894808344756e-07, + "logits/chosen": -4.772054195404053, + "logits/rejected": -3.865562915802002, + "logps/chosen": -11.955608367919922, + "logps/rejected": -25.580827713012695, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23099081218242645, + "rewards/margins": 5.626684188842773, + "rewards/rejected": -5.857674598693848, + "step": 855 + }, + { + "epoch": 14.508474576271187, + "grad_norm": 5.622321826841454, + "learning_rate": 1.0630314744639829e-07, + "logits/chosen": -5.114489555358887, + "logits/rejected": -4.372469902038574, + "logps/chosen": -10.947568893432617, + "logps/rejected": -20.671138763427734, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20464643836021423, + "rewards/margins": 4.4105024337768555, + "rewards/rejected": -4.205855369567871, + "step": 856 + }, + { + "epoch": 14.525423728813559, + "grad_norm": 4.357657280031394, + "learning_rate": 1.0569860427940178e-07, + "logits/chosen": -8.80482006072998, + "logits/rejected": -8.39877986907959, + "logps/chosen": -9.7009859085083, + "logps/rejected": -23.01410675048828, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0699918121099472, + "rewards/margins": 4.5847649574279785, + "rewards/rejected": -4.654757022857666, + "step": 857 + }, + { + "epoch": 14.542372881355933, + "grad_norm": 4.473948955486904, + "learning_rate": 1.050953238727264e-07, + "logits/chosen": -3.8625569343566895, + "logits/rejected": -4.641469478607178, + "logps/chosen": -8.324604034423828, + "logps/rejected": -19.920515060424805, + "loss": 0.0536, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4261910319328308, + "rewards/margins": 4.780582427978516, + "rewards/rejected": -4.354391574859619, + "step": 858 + }, + { + "epoch": 14.559322033898304, + "grad_norm": 4.416058452804211, + "learning_rate": 1.0449331150559063e-07, + "logits/chosen": -6.369832515716553, + "logits/rejected": -5.154065132141113, + "logps/chosen": -8.499757766723633, + "logps/rejected": -18.445560455322266, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5627738833427429, + "rewards/margins": 4.360942363739014, + "rewards/rejected": -3.798168420791626, + "step": 859 + }, + { + "epoch": 14.576271186440678, + "grad_norm": 5.151908410777473, + "learning_rate": 1.0389257244611601e-07, + "logits/chosen": -9.489761352539062, + "logits/rejected": -8.491812705993652, + "logps/chosen": -10.10033893585205, + "logps/rejected": -17.174280166625977, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6827844977378845, + "rewards/margins": 3.9337456226348877, + "rewards/rejected": -3.2509608268737793, + "step": 860 + }, + { + "epoch": 14.59322033898305, + "grad_norm": 4.508708554427197, + "learning_rate": 1.0329311195128193e-07, + "logits/chosen": -6.457301139831543, + "logits/rejected": -4.161015033721924, + "logps/chosen": -8.108702659606934, + "logps/rejected": -17.485946655273438, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2239864021539688, + "rewards/margins": 4.468382835388184, + "rewards/rejected": -4.244396209716797, + "step": 861 + }, + { + "epoch": 14.610169491525424, + "grad_norm": 4.153746327758954, + "learning_rate": 1.0269493526687914e-07, + "logits/chosen": -14.252533912658691, + "logits/rejected": -9.489017486572266, + "logps/chosen": -14.082448959350586, + "logps/rejected": -21.335691452026367, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5519404411315918, + "rewards/margins": 4.63496732711792, + "rewards/rejected": -4.08302640914917, + "step": 862 + }, + { + "epoch": 14.627118644067796, + "grad_norm": 4.751730032840892, + "learning_rate": 1.0209804762746396e-07, + "logits/chosen": -6.768604278564453, + "logits/rejected": -4.19866943359375, + "logps/chosen": -11.16037654876709, + "logps/rejected": -21.334613800048828, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26084911823272705, + "rewards/margins": 4.942337989807129, + "rewards/rejected": -4.681488990783691, + "step": 863 + }, + { + "epoch": 14.64406779661017, + "grad_norm": 4.9134107504574205, + "learning_rate": 1.0150245425631235e-07, + "logits/chosen": -2.556035280227661, + "logits/rejected": -4.578921794891357, + "logps/chosen": -7.398219108581543, + "logps/rejected": -19.08116340637207, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21627086400985718, + "rewards/margins": 4.1095075607299805, + "rewards/rejected": -3.8932361602783203, + "step": 864 + }, + { + "epoch": 14.661016949152543, + "grad_norm": 4.096385228647857, + "learning_rate": 1.0090816036537461e-07, + "logits/chosen": -7.6018877029418945, + "logits/rejected": -7.894321441650391, + "logps/chosen": -7.223687171936035, + "logps/rejected": -23.89727783203125, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1827097237110138, + "rewards/margins": 5.98267936706543, + "rewards/rejected": -5.799968719482422, + "step": 865 + }, + { + "epoch": 14.677966101694915, + "grad_norm": 5.560154999681085, + "learning_rate": 1.0031517115522925e-07, + "logits/chosen": -5.176994323730469, + "logits/rejected": -6.375298976898193, + "logps/chosen": -6.474493026733398, + "logps/rejected": -17.692594528198242, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27740585803985596, + "rewards/margins": 3.594590663909912, + "rewards/rejected": -3.3171846866607666, + "step": 866 + }, + { + "epoch": 14.694915254237289, + "grad_norm": 4.720320647825455, + "learning_rate": 9.972349181503773e-08, + "logits/chosen": -6.104518413543701, + "logits/rejected": -6.355710983276367, + "logps/chosen": -7.5375752449035645, + "logps/rejected": -18.257652282714844, + "loss": 0.0533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37600499391555786, + "rewards/margins": 4.874422550201416, + "rewards/rejected": -4.498417854309082, + "step": 867 + }, + { + "epoch": 14.711864406779661, + "grad_norm": 4.5609809580288525, + "learning_rate": 9.913312752249903e-08, + "logits/chosen": -8.388524055480957, + "logits/rejected": -8.490981101989746, + "logps/chosen": -10.754916191101074, + "logps/rejected": -23.119861602783203, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26782697439193726, + "rewards/margins": 5.011294364929199, + "rewards/rejected": -5.2791218757629395, + "step": 868 + }, + { + "epoch": 14.728813559322035, + "grad_norm": 5.039462820883621, + "learning_rate": 9.85440834438044e-08, + "logits/chosen": -9.018473625183105, + "logits/rejected": -6.955737590789795, + "logps/chosen": -10.54946231842041, + "logps/rejected": -22.88796615600586, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.005913838744163513, + "rewards/margins": 4.554727077484131, + "rewards/rejected": -4.548813343048096, + "step": 869 + }, + { + "epoch": 14.745762711864407, + "grad_norm": 4.79804613276376, + "learning_rate": 9.795636473359207e-08, + "logits/chosen": -11.686652183532715, + "logits/rejected": -9.616609573364258, + "logps/chosen": -10.668220520019531, + "logps/rejected": -19.37090492248535, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39386722445487976, + "rewards/margins": 3.586113452911377, + "rewards/rejected": -3.192246437072754, + "step": 870 + }, + { + "epoch": 14.76271186440678, + "grad_norm": 5.238510670938149, + "learning_rate": 9.736997653490214e-08, + "logits/chosen": -3.2652204036712646, + "logits/rejected": -3.1794800758361816, + "logps/chosen": -6.136098861694336, + "logps/rejected": -23.864599227905273, + "loss": 0.0646, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4961877763271332, + "rewards/margins": 6.1632819175720215, + "rewards/rejected": -5.6670942306518555, + "step": 871 + }, + { + "epoch": 14.779661016949152, + "grad_norm": 4.158137074830622, + "learning_rate": 9.678492397913165e-08, + "logits/chosen": -9.135766983032227, + "logits/rejected": -4.244735240936279, + "logps/chosen": -15.490636825561523, + "logps/rejected": -28.337665557861328, + "loss": 0.0391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1223401129245758, + "rewards/margins": 6.128490924835205, + "rewards/rejected": -6.00615119934082, + "step": 872 + }, + { + "epoch": 14.796610169491526, + "grad_norm": 5.096270220086588, + "learning_rate": 9.620121218598957e-08, + "logits/chosen": -9.420333862304688, + "logits/rejected": -7.442325115203857, + "logps/chosen": -8.854183197021484, + "logps/rejected": -21.26641082763672, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6020601391792297, + "rewards/margins": 6.024995803833008, + "rewards/rejected": -5.422935962677002, + "step": 873 + }, + { + "epoch": 14.813559322033898, + "grad_norm": 4.125705253274765, + "learning_rate": 9.561884626345204e-08, + "logits/chosen": -4.848762035369873, + "logits/rejected": -5.289836406707764, + "logps/chosen": -8.86702823638916, + "logps/rejected": -19.24077606201172, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1991223245859146, + "rewards/margins": 3.673518180847168, + "rewards/rejected": -3.474395990371704, + "step": 874 + }, + { + "epoch": 14.830508474576272, + "grad_norm": 4.8180346160723735, + "learning_rate": 9.503783130771778e-08, + "logits/chosen": -5.396355152130127, + "logits/rejected": -3.8392302989959717, + "logps/chosen": -10.044886589050293, + "logps/rejected": -24.429101943969727, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6494219303131104, + "rewards/margins": 5.044546127319336, + "rewards/rejected": -4.3951239585876465, + "step": 875 + }, + { + "epoch": 14.847457627118644, + "grad_norm": 4.910821646003884, + "learning_rate": 9.445817240316332e-08, + "logits/chosen": -6.426746368408203, + "logits/rejected": -4.628366947174072, + "logps/chosen": -7.040453910827637, + "logps/rejected": -22.949737548828125, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2003602534532547, + "rewards/margins": 5.139168739318848, + "rewards/rejected": -4.938808441162109, + "step": 876 + }, + { + "epoch": 14.864406779661017, + "grad_norm": 4.798926785767766, + "learning_rate": 9.387987462229857e-08, + "logits/chosen": -7.756277084350586, + "logits/rejected": -7.002287864685059, + "logps/chosen": -7.811505317687988, + "logps/rejected": -20.826684951782227, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46053773164749146, + "rewards/margins": 4.936527252197266, + "rewards/rejected": -4.475989818572998, + "step": 877 + }, + { + "epoch": 14.88135593220339, + "grad_norm": 4.3625712644302626, + "learning_rate": 9.330294302572242e-08, + "logits/chosen": -7.890216827392578, + "logits/rejected": -7.356686592102051, + "logps/chosen": -6.975728988647461, + "logps/rejected": -14.148273468017578, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5016485452651978, + "rewards/margins": 3.3492910861968994, + "rewards/rejected": -2.847642660140991, + "step": 878 + }, + { + "epoch": 14.898305084745763, + "grad_norm": 5.106024259580478, + "learning_rate": 9.272738266207871e-08, + "logits/chosen": -6.573366165161133, + "logits/rejected": -7.513491630554199, + "logps/chosen": -10.01795482635498, + "logps/rejected": -22.914081573486328, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01393437385559082, + "rewards/margins": 4.2963762283325195, + "rewards/rejected": -4.282442092895508, + "step": 879 + }, + { + "epoch": 14.915254237288135, + "grad_norm": 10.139064787648792, + "learning_rate": 9.215319856801157e-08, + "logits/chosen": -7.310423374176025, + "logits/rejected": -3.5291025638580322, + "logps/chosen": -9.833205223083496, + "logps/rejected": -23.23526382446289, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11848260462284088, + "rewards/margins": 4.9702019691467285, + "rewards/rejected": -4.851719856262207, + "step": 880 + }, + { + "epoch": 14.932203389830509, + "grad_norm": 4.019168812870987, + "learning_rate": 9.158039576812176e-08, + "logits/chosen": -11.067143440246582, + "logits/rejected": -9.051342010498047, + "logps/chosen": -10.35849666595459, + "logps/rejected": -22.214868545532227, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25341349840164185, + "rewards/margins": 4.845900535583496, + "rewards/rejected": -5.099314212799072, + "step": 881 + }, + { + "epoch": 14.94915254237288, + "grad_norm": 4.615927599216388, + "learning_rate": 9.10089792749223e-08, + "logits/chosen": -3.4698643684387207, + "logits/rejected": -4.226621150970459, + "logps/chosen": -7.663708686828613, + "logps/rejected": -29.664623260498047, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4476359486579895, + "rewards/margins": 7.534958362579346, + "rewards/rejected": -7.08732271194458, + "step": 882 + }, + { + "epoch": 14.966101694915254, + "grad_norm": 4.485706073296585, + "learning_rate": 9.043895408879504e-08, + "logits/chosen": -5.978943347930908, + "logits/rejected": -3.7649073600769043, + "logps/chosen": -8.316689491271973, + "logps/rejected": -21.403587341308594, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5265974402427673, + "rewards/margins": 4.677036762237549, + "rewards/rejected": -4.1504387855529785, + "step": 883 + }, + { + "epoch": 14.983050847457626, + "grad_norm": 4.2730822752523965, + "learning_rate": 8.987032519794666e-08, + "logits/chosen": -11.319976806640625, + "logits/rejected": -7.822196960449219, + "logps/chosen": -10.064786911010742, + "logps/rejected": -16.78985023498535, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3657466769218445, + "rewards/margins": 3.710754871368408, + "rewards/rejected": -3.345008611679077, + "step": 884 + }, + { + "epoch": 15.0, + "grad_norm": 4.149972484378902, + "learning_rate": 8.930309757836516e-08, + "logits/chosen": -6.867130756378174, + "logits/rejected": -7.295436382293701, + "logps/chosen": -6.439949035644531, + "logps/rejected": -15.3391695022583, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10488628596067429, + "rewards/margins": 3.8695998191833496, + "rewards/rejected": -3.764713764190674, + "step": 885 + }, + { + "epoch": 15.016949152542374, + "grad_norm": 3.781552998631691, + "learning_rate": 8.87372761937761e-08, + "logits/chosen": -9.987040519714355, + "logits/rejected": -7.477032661437988, + "logps/chosen": -10.782506942749023, + "logps/rejected": -23.716060638427734, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6830369234085083, + "rewards/margins": 5.680238246917725, + "rewards/rejected": -4.997201442718506, + "step": 886 + }, + { + "epoch": 15.033898305084746, + "grad_norm": 4.663893838793768, + "learning_rate": 8.817286599559931e-08, + "logits/chosen": -3.7257680892944336, + "logits/rejected": -3.5477094650268555, + "logps/chosen": -8.740972518920898, + "logps/rejected": -19.743122100830078, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5974680185317993, + "rewards/margins": 4.669073581695557, + "rewards/rejected": -4.071605205535889, + "step": 887 + }, + { + "epoch": 15.05084745762712, + "grad_norm": 5.010276787368709, + "learning_rate": 8.760987192290556e-08, + "logits/chosen": -5.386483192443848, + "logits/rejected": -5.680109977722168, + "logps/chosen": -8.287491798400879, + "logps/rejected": -24.390138626098633, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29073163866996765, + "rewards/margins": 5.166617393493652, + "rewards/rejected": -4.875885486602783, + "step": 888 + }, + { + "epoch": 15.067796610169491, + "grad_norm": 4.300590568046875, + "learning_rate": 8.704829890237326e-08, + "logits/chosen": -4.816190719604492, + "logits/rejected": -4.562450408935547, + "logps/chosen": -8.473125457763672, + "logps/rejected": -24.722415924072266, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32061341404914856, + "rewards/margins": 5.384640216827393, + "rewards/rejected": -5.064026832580566, + "step": 889 + }, + { + "epoch": 15.084745762711865, + "grad_norm": 4.056958939978676, + "learning_rate": 8.648815184824543e-08, + "logits/chosen": -8.428276062011719, + "logits/rejected": -6.711330413818359, + "logps/chosen": -13.22380256652832, + "logps/rejected": -27.040882110595703, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07486316561698914, + "rewards/margins": 5.384260654449463, + "rewards/rejected": -5.4591240882873535, + "step": 890 + }, + { + "epoch": 15.101694915254237, + "grad_norm": 4.116821025355182, + "learning_rate": 8.592943566228669e-08, + "logits/chosen": -7.393421173095703, + "logits/rejected": -4.3779215812683105, + "logps/chosen": -11.240400314331055, + "logps/rejected": -23.093278884887695, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6038632392883301, + "rewards/margins": 5.859452724456787, + "rewards/rejected": -5.255589962005615, + "step": 891 + }, + { + "epoch": 15.11864406779661, + "grad_norm": 5.053699753605588, + "learning_rate": 8.537215523374037e-08, + "logits/chosen": -4.562039852142334, + "logits/rejected": -4.087484359741211, + "logps/chosen": -10.435542106628418, + "logps/rejected": -17.37160873413086, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45656490325927734, + "rewards/margins": 3.8431553840637207, + "rewards/rejected": -3.3865904808044434, + "step": 892 + }, + { + "epoch": 15.135593220338983, + "grad_norm": 4.130713005185727, + "learning_rate": 8.481631543928561e-08, + "logits/chosen": -7.683807849884033, + "logits/rejected": -6.336053848266602, + "logps/chosen": -7.9568400382995605, + "logps/rejected": -22.52141571044922, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1728895902633667, + "rewards/margins": 5.566967487335205, + "rewards/rejected": -5.394078731536865, + "step": 893 + }, + { + "epoch": 15.152542372881356, + "grad_norm": 3.8082945319240955, + "learning_rate": 8.426192114299483e-08, + "logits/chosen": -6.013499736785889, + "logits/rejected": -6.379771709442139, + "logps/chosen": -11.058837890625, + "logps/rejected": -22.782060623168945, + "loss": 0.0375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47234123945236206, + "rewards/margins": 4.522031784057617, + "rewards/rejected": -4.0496907234191895, + "step": 894 + }, + { + "epoch": 15.169491525423728, + "grad_norm": 4.655654682322374, + "learning_rate": 8.370897719629108e-08, + "logits/chosen": -8.027939796447754, + "logits/rejected": -5.110647678375244, + "logps/chosen": -7.751594543457031, + "logps/rejected": -16.943401336669922, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3737262487411499, + "rewards/margins": 4.178828239440918, + "rewards/rejected": -3.8051021099090576, + "step": 895 + }, + { + "epoch": 15.186440677966102, + "grad_norm": 4.585133437276424, + "learning_rate": 8.315748843790562e-08, + "logits/chosen": -8.03437614440918, + "logits/rejected": -5.727774620056152, + "logps/chosen": -13.897333145141602, + "logps/rejected": -22.834522247314453, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3705972135066986, + "rewards/margins": 4.764908790588379, + "rewards/rejected": -4.394311428070068, + "step": 896 + }, + { + "epoch": 15.203389830508474, + "grad_norm": 4.2945931087395985, + "learning_rate": 8.260745969383565e-08, + "logits/chosen": -7.748586177825928, + "logits/rejected": -5.3257622718811035, + "logps/chosen": -6.9540486335754395, + "logps/rejected": -15.131741523742676, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15817368030548096, + "rewards/margins": 4.12407112121582, + "rewards/rejected": -3.965897560119629, + "step": 897 + }, + { + "epoch": 15.220338983050848, + "grad_norm": 3.9554224005139877, + "learning_rate": 8.205889577730179e-08, + "logits/chosen": -5.89959716796875, + "logits/rejected": -5.828517913818359, + "logps/chosen": -9.16100788116455, + "logps/rejected": -22.181655883789062, + "loss": 0.0373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26382455229759216, + "rewards/margins": 5.5371880531311035, + "rewards/rejected": -5.2733635902404785, + "step": 898 + }, + { + "epoch": 15.23728813559322, + "grad_norm": 4.145154923229452, + "learning_rate": 8.151180148870649e-08, + "logits/chosen": -10.387269020080566, + "logits/rejected": -9.048869132995605, + "logps/chosen": -8.629597663879395, + "logps/rejected": -18.4086971282959, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4396933317184448, + "rewards/margins": 4.135700702667236, + "rewards/rejected": -3.696007251739502, + "step": 899 + }, + { + "epoch": 15.254237288135593, + "grad_norm": 4.392474058377527, + "learning_rate": 8.09661816155914e-08, + "logits/chosen": -6.837314128875732, + "logits/rejected": -4.007465362548828, + "logps/chosen": -10.138846397399902, + "logps/rejected": -29.386852264404297, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17193610966205597, + "rewards/margins": 6.914831161499023, + "rewards/rejected": -7.086767196655273, + "step": 900 + }, + { + "epoch": 15.271186440677965, + "grad_norm": 4.522224865877253, + "learning_rate": 8.042204093259597e-08, + "logits/chosen": -6.852179050445557, + "logits/rejected": -4.5903706550598145, + "logps/chosen": -13.024887084960938, + "logps/rejected": -19.289447784423828, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1210104376077652, + "rewards/margins": 4.075073719024658, + "rewards/rejected": -3.9540631771087646, + "step": 901 + }, + { + "epoch": 15.288135593220339, + "grad_norm": 4.4900190318899185, + "learning_rate": 7.987938420141536e-08, + "logits/chosen": -6.153905391693115, + "logits/rejected": -6.648481845855713, + "logps/chosen": -8.1083345413208, + "logps/rejected": -25.058744430541992, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31352919340133667, + "rewards/margins": 5.847841262817383, + "rewards/rejected": -5.5343122482299805, + "step": 902 + }, + { + "epoch": 15.305084745762711, + "grad_norm": 4.582566620003048, + "learning_rate": 7.93382161707589e-08, + "logits/chosen": -3.9638917446136475, + "logits/rejected": -1.5824838876724243, + "logps/chosen": -9.085407257080078, + "logps/rejected": -18.560510635375977, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17204411327838898, + "rewards/margins": 4.696191787719727, + "rewards/rejected": -4.524147987365723, + "step": 903 + }, + { + "epoch": 15.322033898305085, + "grad_norm": 4.889430458232006, + "learning_rate": 7.879854157630861e-08, + "logits/chosen": -4.202753067016602, + "logits/rejected": -3.8201920986175537, + "logps/chosen": -8.303861618041992, + "logps/rejected": -20.284696578979492, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18788239359855652, + "rewards/margins": 4.125871181488037, + "rewards/rejected": -3.937988519668579, + "step": 904 + }, + { + "epoch": 15.338983050847457, + "grad_norm": 4.582348879875144, + "learning_rate": 7.826036514067755e-08, + "logits/chosen": -6.020393371582031, + "logits/rejected": -4.9403557777404785, + "logps/chosen": -7.944501876831055, + "logps/rejected": -21.464771270751953, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35279831290245056, + "rewards/margins": 4.910175323486328, + "rewards/rejected": -4.557377338409424, + "step": 905 + }, + { + "epoch": 15.35593220338983, + "grad_norm": 4.626664369260017, + "learning_rate": 7.772369157336872e-08, + "logits/chosen": -6.890298843383789, + "logits/rejected": -5.290185928344727, + "logps/chosen": -9.014580726623535, + "logps/rejected": -18.544498443603516, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2904837131500244, + "rewards/margins": 4.766071796417236, + "rewards/rejected": -4.475588321685791, + "step": 906 + }, + { + "epoch": 15.372881355932204, + "grad_norm": 4.837067860496764, + "learning_rate": 7.718852557073366e-08, + "logits/chosen": -6.288779258728027, + "logits/rejected": -5.576810359954834, + "logps/chosen": -8.71249008178711, + "logps/rejected": -19.18750762939453, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.333800345659256, + "rewards/margins": 5.200597286224365, + "rewards/rejected": -4.866796493530273, + "step": 907 + }, + { + "epoch": 15.389830508474576, + "grad_norm": 4.508705583344171, + "learning_rate": 7.665487181593145e-08, + "logits/chosen": -4.20571231842041, + "logits/rejected": 0.7605342268943787, + "logps/chosen": -14.706962585449219, + "logps/rejected": -32.21023941040039, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05708075314760208, + "rewards/margins": 6.945536136627197, + "rewards/rejected": -6.888455390930176, + "step": 908 + }, + { + "epoch": 15.40677966101695, + "grad_norm": 4.357475863641545, + "learning_rate": 7.612273497888775e-08, + "logits/chosen": -5.5518798828125, + "logits/rejected": -6.663199424743652, + "logps/chosen": -9.692988395690918, + "logps/rejected": -27.73391342163086, + "loss": 0.0447, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16855725646018982, + "rewards/margins": 5.336852550506592, + "rewards/rejected": -5.168295860290527, + "step": 909 + }, + { + "epoch": 15.423728813559322, + "grad_norm": 4.104051386873784, + "learning_rate": 7.559211971625384e-08, + "logits/chosen": -8.114259719848633, + "logits/rejected": -8.306021690368652, + "logps/chosen": -9.408538818359375, + "logps/rejected": -24.232013702392578, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08460733294487, + "rewards/margins": 5.02858304977417, + "rewards/rejected": -4.943976402282715, + "step": 910 + }, + { + "epoch": 15.440677966101696, + "grad_norm": 4.10363074035625, + "learning_rate": 7.506303067136602e-08, + "logits/chosen": -7.748012542724609, + "logits/rejected": -7.093140125274658, + "logps/chosen": -9.258726119995117, + "logps/rejected": -23.056177139282227, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5200488567352295, + "rewards/margins": 4.712107181549072, + "rewards/rejected": -4.192058086395264, + "step": 911 + }, + { + "epoch": 15.457627118644067, + "grad_norm": 3.7318632292671077, + "learning_rate": 7.453547247420464e-08, + "logits/chosen": -8.921445846557617, + "logits/rejected": -7.705672264099121, + "logps/chosen": -9.047325134277344, + "logps/rejected": -21.919553756713867, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2997787892818451, + "rewards/margins": 5.2199907302856445, + "rewards/rejected": -4.9202117919921875, + "step": 912 + }, + { + "epoch": 15.474576271186441, + "grad_norm": 4.40499674775813, + "learning_rate": 7.400944974135426e-08, + "logits/chosen": -8.905839920043945, + "logits/rejected": -7.21429443359375, + "logps/chosen": -11.844295501708984, + "logps/rejected": -25.040693283081055, + "loss": 0.04, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3408765494823456, + "rewards/margins": 5.200284004211426, + "rewards/rejected": -4.8594069480896, + "step": 913 + }, + { + "epoch": 15.491525423728813, + "grad_norm": 4.093105987084931, + "learning_rate": 7.348496707596242e-08, + "logits/chosen": -10.217541694641113, + "logits/rejected": -8.00607681274414, + "logps/chosen": -7.044992446899414, + "logps/rejected": -16.269014358520508, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4086524248123169, + "rewards/margins": 4.565752983093262, + "rewards/rejected": -4.157100677490234, + "step": 914 + }, + { + "epoch": 15.508474576271187, + "grad_norm": 4.748659150209963, + "learning_rate": 7.296202906769997e-08, + "logits/chosen": -6.844855308532715, + "logits/rejected": -5.689482688903809, + "logps/chosen": -14.72694206237793, + "logps/rejected": -26.045547485351562, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07029888778924942, + "rewards/margins": 5.19006872177124, + "rewards/rejected": -5.260367393493652, + "step": 915 + }, + { + "epoch": 15.525423728813559, + "grad_norm": 4.591864509571797, + "learning_rate": 7.244064029272049e-08, + "logits/chosen": -5.819035530090332, + "logits/rejected": -6.179313659667969, + "logps/chosen": -7.19488000869751, + "logps/rejected": -23.933685302734375, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15764935314655304, + "rewards/margins": 6.297710418701172, + "rewards/rejected": -6.140060901641846, + "step": 916 + }, + { + "epoch": 15.542372881355933, + "grad_norm": 4.155066071491463, + "learning_rate": 7.192080531362065e-08, + "logits/chosen": -6.063582897186279, + "logits/rejected": -7.798738956451416, + "logps/chosen": -8.79454231262207, + "logps/rejected": -21.187654495239258, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010221108794212341, + "rewards/margins": 4.549063205718994, + "rewards/rejected": -4.538841724395752, + "step": 917 + }, + { + "epoch": 15.559322033898304, + "grad_norm": 4.8476750798687664, + "learning_rate": 7.140252867939994e-08, + "logits/chosen": -7.400679111480713, + "logits/rejected": -5.948519706726074, + "logps/chosen": -9.957500457763672, + "logps/rejected": -25.932926177978516, + "loss": 0.0584, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2069414108991623, + "rewards/margins": 5.775245189666748, + "rewards/rejected": -5.568304061889648, + "step": 918 + }, + { + "epoch": 15.576271186440678, + "grad_norm": 4.873888730122289, + "learning_rate": 7.08858149254212e-08, + "logits/chosen": -6.478437900543213, + "logits/rejected": -5.060609340667725, + "logps/chosen": -12.641325950622559, + "logps/rejected": -19.79878807067871, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02003513276576996, + "rewards/margins": 4.015776634216309, + "rewards/rejected": -4.035811901092529, + "step": 919 + }, + { + "epoch": 15.59322033898305, + "grad_norm": 4.337491313818972, + "learning_rate": 7.037066857337057e-08, + "logits/chosen": -5.538844108581543, + "logits/rejected": -3.830111265182495, + "logps/chosen": -12.809090614318848, + "logps/rejected": -25.132450103759766, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20014286041259766, + "rewards/margins": 5.1021037101745605, + "rewards/rejected": -5.302246570587158, + "step": 920 + }, + { + "epoch": 15.610169491525424, + "grad_norm": 4.467101523154651, + "learning_rate": 6.985709413121804e-08, + "logits/chosen": -7.16643762588501, + "logits/rejected": -7.090363025665283, + "logps/chosen": -11.620450973510742, + "logps/rejected": -27.701143264770508, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13024349510669708, + "rewards/margins": 5.549199104309082, + "rewards/rejected": -5.4189558029174805, + "step": 921 + }, + { + "epoch": 15.627118644067796, + "grad_norm": 4.206404750084994, + "learning_rate": 6.934509609317821e-08, + "logits/chosen": -5.28694486618042, + "logits/rejected": -6.201131820678711, + "logps/chosen": -8.330510139465332, + "logps/rejected": -21.516273498535156, + "loss": 0.0423, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0818605124950409, + "rewards/margins": 4.693274974822998, + "rewards/rejected": -4.611414432525635, + "step": 922 + }, + { + "epoch": 15.64406779661017, + "grad_norm": 4.290802211777049, + "learning_rate": 6.883467893967068e-08, + "logits/chosen": -11.422243118286133, + "logits/rejected": -6.941327095031738, + "logps/chosen": -9.11874008178711, + "logps/rejected": -20.560924530029297, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3169085383415222, + "rewards/margins": 5.488955020904541, + "rewards/rejected": -5.172046661376953, + "step": 923 + }, + { + "epoch": 15.661016949152543, + "grad_norm": 5.259403196135534, + "learning_rate": 6.832584713728101e-08, + "logits/chosen": -1.465713381767273, + "logits/rejected": -1.5597341060638428, + "logps/chosen": -11.35631275177002, + "logps/rejected": -23.734394073486328, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5144131779670715, + "rewards/margins": 5.69120979309082, + "rewards/rejected": -5.176797389984131, + "step": 924 + }, + { + "epoch": 15.677966101694915, + "grad_norm": 5.295423529930121, + "learning_rate": 6.781860513872154e-08, + "logits/chosen": -6.635700702667236, + "logits/rejected": -6.23319673538208, + "logps/chosen": -12.26685905456543, + "logps/rejected": -21.214061737060547, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17025622725486755, + "rewards/margins": 4.827388763427734, + "rewards/rejected": -4.657133102416992, + "step": 925 + }, + { + "epoch": 15.694915254237289, + "grad_norm": 4.315266296273012, + "learning_rate": 6.731295738279255e-08, + "logits/chosen": -7.001347064971924, + "logits/rejected": -2.145578384399414, + "logps/chosen": -11.142807006835938, + "logps/rejected": -21.77847671508789, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27062392234802246, + "rewards/margins": 5.010962009429932, + "rewards/rejected": -4.740338325500488, + "step": 926 + }, + { + "epoch": 15.711864406779661, + "grad_norm": 4.940221201075603, + "learning_rate": 6.680890829434324e-08, + "logits/chosen": -9.319822311401367, + "logits/rejected": -8.537477493286133, + "logps/chosen": -8.557331085205078, + "logps/rejected": -22.469524383544922, + "loss": 0.0558, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33681201934814453, + "rewards/margins": 5.529571533203125, + "rewards/rejected": -5.1927595138549805, + "step": 927 + }, + { + "epoch": 15.728813559322035, + "grad_norm": 4.13577945145429, + "learning_rate": 6.630646228423323e-08, + "logits/chosen": -7.591739177703857, + "logits/rejected": -5.0761003494262695, + "logps/chosen": -7.780755519866943, + "logps/rejected": -21.055946350097656, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29918402433395386, + "rewards/margins": 5.199295997619629, + "rewards/rejected": -4.900112152099609, + "step": 928 + }, + { + "epoch": 15.745762711864407, + "grad_norm": 3.6967955744291823, + "learning_rate": 6.580562374929369e-08, + "logits/chosen": -4.919278621673584, + "logits/rejected": -0.548151969909668, + "logps/chosen": -11.255338668823242, + "logps/rejected": -28.93407440185547, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3684440851211548, + "rewards/margins": 7.152661323547363, + "rewards/rejected": -6.784217834472656, + "step": 929 + }, + { + "epoch": 15.76271186440678, + "grad_norm": 3.582661160567403, + "learning_rate": 6.53063970722891e-08, + "logits/chosen": -6.948887825012207, + "logits/rejected": -5.574216365814209, + "logps/chosen": -9.181246757507324, + "logps/rejected": -28.59181022644043, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4202234148979187, + "rewards/margins": 6.4141845703125, + "rewards/rejected": -5.993961334228516, + "step": 930 + }, + { + "epoch": 15.779661016949152, + "grad_norm": 5.039230655446437, + "learning_rate": 6.480878662187883e-08, + "logits/chosen": -5.391602516174316, + "logits/rejected": -1.1296690702438354, + "logps/chosen": -10.732582092285156, + "logps/rejected": -26.942331314086914, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08686523139476776, + "rewards/margins": 6.540577411651611, + "rewards/rejected": -6.453711986541748, + "step": 931 + }, + { + "epoch": 15.796610169491526, + "grad_norm": 5.904351655682266, + "learning_rate": 6.431279675257872e-08, + "logits/chosen": -5.443461894989014, + "logits/rejected": -7.360386848449707, + "logps/chosen": -8.354605674743652, + "logps/rejected": -22.984634399414062, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02095790207386017, + "rewards/margins": 5.452232360839844, + "rewards/rejected": -5.4312744140625, + "step": 932 + }, + { + "epoch": 15.813559322033898, + "grad_norm": 5.007899634061934, + "learning_rate": 6.381843180472349e-08, + "logits/chosen": -4.189509868621826, + "logits/rejected": -4.424144744873047, + "logps/chosen": -8.072770118713379, + "logps/rejected": -24.014766693115234, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2011854648590088, + "rewards/margins": 5.383251667022705, + "rewards/rejected": -5.182065963745117, + "step": 933 + }, + { + "epoch": 15.830508474576272, + "grad_norm": 4.113407110361293, + "learning_rate": 6.332569610442806e-08, + "logits/chosen": -2.852726936340332, + "logits/rejected": -3.0774269104003906, + "logps/chosen": -6.4904680252075195, + "logps/rejected": -27.27833366394043, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4382808208465576, + "rewards/margins": 7.506036758422852, + "rewards/rejected": -7.067756175994873, + "step": 934 + }, + { + "epoch": 15.847457627118644, + "grad_norm": 4.626182534981252, + "learning_rate": 6.28345939635502e-08, + "logits/chosen": -6.271831035614014, + "logits/rejected": -6.549797058105469, + "logps/chosen": -7.970676422119141, + "logps/rejected": -21.457536697387695, + "loss": 0.0551, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29878151416778564, + "rewards/margins": 4.9347920417785645, + "rewards/rejected": -4.636011123657227, + "step": 935 + }, + { + "epoch": 15.864406779661017, + "grad_norm": 4.100236768772272, + "learning_rate": 6.23451296796526e-08, + "logits/chosen": -12.010820388793945, + "logits/rejected": -6.9747443199157715, + "logps/chosen": -12.61644458770752, + "logps/rejected": -26.862714767456055, + "loss": 0.0354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.52858567237854, + "rewards/margins": 5.081140995025635, + "rewards/rejected": -4.552555084228516, + "step": 936 + }, + { + "epoch": 15.88135593220339, + "grad_norm": 4.555450958773166, + "learning_rate": 6.185730753596538e-08, + "logits/chosen": -0.7543103098869324, + "logits/rejected": 0.3908948302268982, + "logps/chosen": -12.7721529006958, + "logps/rejected": -25.825170516967773, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09586603194475174, + "rewards/margins": 5.416937351226807, + "rewards/rejected": -5.321071147918701, + "step": 937 + }, + { + "epoch": 15.898305084745763, + "grad_norm": 4.87105997384688, + "learning_rate": 6.137113180134842e-08, + "logits/chosen": -5.579267501831055, + "logits/rejected": -7.177576065063477, + "logps/chosen": -8.82749080657959, + "logps/rejected": -22.285377502441406, + "loss": 0.0556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26872116327285767, + "rewards/margins": 4.700954437255859, + "rewards/rejected": -4.4322333335876465, + "step": 938 + }, + { + "epoch": 15.915254237288135, + "grad_norm": 4.729392094465314, + "learning_rate": 6.088660673025416e-08, + "logits/chosen": -5.619653224945068, + "logits/rejected": -4.975070953369141, + "logps/chosen": -9.873350143432617, + "logps/rejected": -26.303316116333008, + "loss": 0.0608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05529198795557022, + "rewards/margins": 5.834388732910156, + "rewards/rejected": -5.779096603393555, + "step": 939 + }, + { + "epoch": 15.932203389830509, + "grad_norm": 4.103739242054735, + "learning_rate": 6.04037365626904e-08, + "logits/chosen": -7.370528221130371, + "logits/rejected": -6.9952778816223145, + "logps/chosen": -9.415689468383789, + "logps/rejected": -19.225749969482422, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4855545461177826, + "rewards/margins": 4.5416669845581055, + "rewards/rejected": -4.056112766265869, + "step": 940 + }, + { + "epoch": 15.94915254237288, + "grad_norm": 4.4572375897616, + "learning_rate": 5.992252552418303e-08, + "logits/chosen": -10.922706604003906, + "logits/rejected": -10.116929054260254, + "logps/chosen": -10.026784896850586, + "logps/rejected": -22.07077980041504, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7581459879875183, + "rewards/margins": 4.968456745147705, + "rewards/rejected": -4.210310935974121, + "step": 941 + }, + { + "epoch": 15.966101694915254, + "grad_norm": 4.518155772533724, + "learning_rate": 5.9442977825739175e-08, + "logits/chosen": -5.720646858215332, + "logits/rejected": -6.363611221313477, + "logps/chosen": -10.6333646774292, + "logps/rejected": -30.965347290039062, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.214798241853714, + "rewards/margins": 6.588019847869873, + "rewards/rejected": -6.373221397399902, + "step": 942 + }, + { + "epoch": 15.983050847457626, + "grad_norm": 4.745042956801933, + "learning_rate": 5.896509766381028e-08, + "logits/chosen": -3.2927167415618896, + "logits/rejected": -1.5687626600265503, + "logps/chosen": -11.610235214233398, + "logps/rejected": -25.762611389160156, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06053323298692703, + "rewards/margins": 4.873441696166992, + "rewards/rejected": -4.933974742889404, + "step": 943 + }, + { + "epoch": 16.0, + "grad_norm": 4.817602731785699, + "learning_rate": 5.848888922025552e-08, + "logits/chosen": -8.419713973999023, + "logits/rejected": -7.601513385772705, + "logps/chosen": -7.9321699142456055, + "logps/rejected": -19.71578598022461, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3660358488559723, + "rewards/margins": 5.004828453063965, + "rewards/rejected": -4.638792991638184, + "step": 944 + }, + { + "epoch": 16.016949152542374, + "grad_norm": 5.075858314557022, + "learning_rate": 5.8014356662305e-08, + "logits/chosen": -2.367865562438965, + "logits/rejected": -4.307644367218018, + "logps/chosen": -9.921137809753418, + "logps/rejected": -26.57358169555664, + "loss": 0.0723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17190691828727722, + "rewards/margins": 4.076783180236816, + "rewards/rejected": -3.9048757553100586, + "step": 945 + }, + { + "epoch": 16.033898305084747, + "grad_norm": 4.764182818211839, + "learning_rate": 5.75415041425234e-08, + "logits/chosen": -9.120944023132324, + "logits/rejected": -9.688711166381836, + "logps/chosen": -5.0205864906311035, + "logps/rejected": -15.899490356445312, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6438949108123779, + "rewards/margins": 4.424144268035889, + "rewards/rejected": -3.7802491188049316, + "step": 946 + }, + { + "epoch": 16.050847457627118, + "grad_norm": 4.405345673656158, + "learning_rate": 5.707033579877379e-08, + "logits/chosen": -8.686363220214844, + "logits/rejected": -5.756230354309082, + "logps/chosen": -10.461504936218262, + "logps/rejected": -22.015207290649414, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43740159273147583, + "rewards/margins": 5.130666732788086, + "rewards/rejected": -4.693264961242676, + "step": 947 + }, + { + "epoch": 16.06779661016949, + "grad_norm": 5.20667843942836, + "learning_rate": 5.660085575418114e-08, + "logits/chosen": -4.197178840637207, + "logits/rejected": -4.616957664489746, + "logps/chosen": -14.62967586517334, + "logps/rejected": -23.51697540283203, + "loss": 0.0545, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.24510294198989868, + "rewards/margins": 3.7846622467041016, + "rewards/rejected": -4.0297651290893555, + "step": 948 + }, + { + "epoch": 16.084745762711865, + "grad_norm": 4.282529438480496, + "learning_rate": 5.6133068117096335e-08, + "logits/chosen": -6.429426670074463, + "logits/rejected": -8.026588439941406, + "logps/chosen": -7.786294937133789, + "logps/rejected": -22.525127410888672, + "loss": 0.053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13193783164024353, + "rewards/margins": 4.904080867767334, + "rewards/rejected": -4.772143363952637, + "step": 949 + }, + { + "epoch": 16.10169491525424, + "grad_norm": 4.621515043010338, + "learning_rate": 5.566697698106024e-08, + "logits/chosen": -10.278071403503418, + "logits/rejected": -8.686415672302246, + "logps/chosen": -10.784829139709473, + "logps/rejected": -22.454185485839844, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09567110240459442, + "rewards/margins": 4.569484233856201, + "rewards/rejected": -4.473813056945801, + "step": 950 + }, + { + "epoch": 16.11864406779661, + "grad_norm": 4.871432065569196, + "learning_rate": 5.5202586424767967e-08, + "logits/chosen": -7.282120227813721, + "logits/rejected": -6.065773010253906, + "logps/chosen": -8.455615997314453, + "logps/rejected": -14.357915878295898, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5921602249145508, + "rewards/margins": 3.3937811851501465, + "rewards/rejected": -2.8016209602355957, + "step": 951 + }, + { + "epoch": 16.135593220338983, + "grad_norm": 3.876482262335242, + "learning_rate": 5.473990051203298e-08, + "logits/chosen": -9.93977165222168, + "logits/rejected": -8.173554420471191, + "logps/chosen": -12.71878719329834, + "logps/rejected": -24.868427276611328, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.108131542801857, + "rewards/margins": 5.222832679748535, + "rewards/rejected": -5.330964088439941, + "step": 952 + }, + { + "epoch": 16.152542372881356, + "grad_norm": 4.705045971613401, + "learning_rate": 5.4278923291751934e-08, + "logits/chosen": -6.992777347564697, + "logits/rejected": -6.69704532623291, + "logps/chosen": -7.97955846786499, + "logps/rejected": -16.74382781982422, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43731003999710083, + "rewards/margins": 4.202198505401611, + "rewards/rejected": -3.7648885250091553, + "step": 953 + }, + { + "epoch": 16.16949152542373, + "grad_norm": 4.743973876453088, + "learning_rate": 5.381965879786868e-08, + "logits/chosen": -6.325166702270508, + "logits/rejected": -5.70654296875, + "logps/chosen": -10.991593360900879, + "logps/rejected": -21.800357818603516, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21553093194961548, + "rewards/margins": 4.7880539894104, + "rewards/rejected": -4.57252311706543, + "step": 954 + }, + { + "epoch": 16.1864406779661, + "grad_norm": 4.628586645255542, + "learning_rate": 5.336211104933938e-08, + "logits/chosen": -4.82962703704834, + "logits/rejected": -7.258959770202637, + "logps/chosen": -10.598827362060547, + "logps/rejected": -23.605480194091797, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16052578389644623, + "rewards/margins": 4.829559803009033, + "rewards/rejected": -4.669033527374268, + "step": 955 + }, + { + "epoch": 16.203389830508474, + "grad_norm": 4.17471237430466, + "learning_rate": 5.290628405009717e-08, + "logits/chosen": -6.717578411102295, + "logits/rejected": -7.951920032501221, + "logps/chosen": -7.13959264755249, + "logps/rejected": -21.573381423950195, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28929591178894043, + "rewards/margins": 4.62062931060791, + "rewards/rejected": -4.331333637237549, + "step": 956 + }, + { + "epoch": 16.220338983050848, + "grad_norm": 3.0367710551215716, + "learning_rate": 5.2452181789017166e-08, + "logits/chosen": -7.549493789672852, + "logits/rejected": -6.55128812789917, + "logps/chosen": -8.085127830505371, + "logps/rejected": -23.510971069335938, + "loss": 0.0291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12076622247695923, + "rewards/margins": 5.681987762451172, + "rewards/rejected": -5.802753448486328, + "step": 957 + }, + { + "epoch": 16.23728813559322, + "grad_norm": 4.244814839225662, + "learning_rate": 5.1999808239881564e-08, + "logits/chosen": -8.300333023071289, + "logits/rejected": -7.503015041351318, + "logps/chosen": -11.905001640319824, + "logps/rejected": -26.17755699157715, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7613542079925537, + "rewards/margins": 5.459731578826904, + "rewards/rejected": -4.69837760925293, + "step": 958 + }, + { + "epoch": 16.25423728813559, + "grad_norm": 5.017624513304328, + "learning_rate": 5.1549167361344875e-08, + "logits/chosen": -5.223151206970215, + "logits/rejected": -4.507918834686279, + "logps/chosen": -8.082959175109863, + "logps/rejected": -23.3592586517334, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.510887861251831, + "rewards/margins": 5.510470867156982, + "rewards/rejected": -4.9995832443237305, + "step": 959 + }, + { + "epoch": 16.271186440677965, + "grad_norm": 4.500188138065856, + "learning_rate": 5.1100263096899215e-08, + "logits/chosen": -6.816982746124268, + "logits/rejected": -5.620564937591553, + "logps/chosen": -9.452603340148926, + "logps/rejected": -28.52019500732422, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2641323506832123, + "rewards/margins": 5.596342086791992, + "rewards/rejected": -5.332209587097168, + "step": 960 + }, + { + "epoch": 16.28813559322034, + "grad_norm": 4.419413086409128, + "learning_rate": 5.065309937483991e-08, + "logits/chosen": -7.277461051940918, + "logits/rejected": -6.101316452026367, + "logps/chosen": -10.160829544067383, + "logps/rejected": -19.631698608398438, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03897915035486221, + "rewards/margins": 4.435064315795898, + "rewards/rejected": -4.396084785461426, + "step": 961 + }, + { + "epoch": 16.305084745762713, + "grad_norm": 3.7105875745545043, + "learning_rate": 5.020768010823101e-08, + "logits/chosen": -3.605870246887207, + "logits/rejected": -6.897829055786133, + "logps/chosen": -13.389876365661621, + "logps/rejected": -29.775251388549805, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13197000324726105, + "rewards/margins": 6.818739891052246, + "rewards/rejected": -6.950709819793701, + "step": 962 + }, + { + "epoch": 16.322033898305083, + "grad_norm": 4.664383108703518, + "learning_rate": 4.976400919487106e-08, + "logits/chosen": -6.504683971405029, + "logits/rejected": -3.7136526107788086, + "logps/chosen": -11.577073097229004, + "logps/rejected": -28.113012313842773, + "loss": 0.0505, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3003220558166504, + "rewards/margins": 6.02211332321167, + "rewards/rejected": -5.721792221069336, + "step": 963 + }, + { + "epoch": 16.338983050847457, + "grad_norm": 3.7490296393228455, + "learning_rate": 4.932209051725914e-08, + "logits/chosen": -10.529375076293945, + "logits/rejected": -10.49166488647461, + "logps/chosen": -9.599217414855957, + "logps/rejected": -21.940521240234375, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44774866104125977, + "rewards/margins": 4.731534957885742, + "rewards/rejected": -4.283786296844482, + "step": 964 + }, + { + "epoch": 16.35593220338983, + "grad_norm": 4.241281553767145, + "learning_rate": 4.88819279425606e-08, + "logits/chosen": -9.115670204162598, + "logits/rejected": -7.715078830718994, + "logps/chosen": -9.38962173461914, + "logps/rejected": -24.184432983398438, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03572952747344971, + "rewards/margins": 4.836929798126221, + "rewards/rejected": -4.8012003898620605, + "step": 965 + }, + { + "epoch": 16.372881355932204, + "grad_norm": 4.283645502102022, + "learning_rate": 4.844352532257351e-08, + "logits/chosen": -7.930306911468506, + "logits/rejected": -6.915701866149902, + "logps/chosen": -10.192066192626953, + "logps/rejected": -22.797836303710938, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.013980336487293243, + "rewards/margins": 4.946038246154785, + "rewards/rejected": -4.932058334350586, + "step": 966 + }, + { + "epoch": 16.389830508474578, + "grad_norm": 3.6881742053724973, + "learning_rate": 4.8006886493694885e-08, + "logits/chosen": -9.653305053710938, + "logits/rejected": -6.4271650314331055, + "logps/chosen": -9.224141120910645, + "logps/rejected": -23.99065589904785, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31926214694976807, + "rewards/margins": 5.534373760223389, + "rewards/rejected": -5.215112209320068, + "step": 967 + }, + { + "epoch": 16.406779661016948, + "grad_norm": 4.051689710502391, + "learning_rate": 4.757201527688692e-08, + "logits/chosen": -0.17772293090820312, + "logits/rejected": -2.985245704650879, + "logps/chosen": -7.360447406768799, + "logps/rejected": -19.838626861572266, + "loss": 0.0442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38335198163986206, + "rewards/margins": 3.9256038665771484, + "rewards/rejected": -3.5422518253326416, + "step": 968 + }, + { + "epoch": 16.423728813559322, + "grad_norm": 4.438345671320595, + "learning_rate": 4.713891547764384e-08, + "logits/chosen": -3.5385613441467285, + "logits/rejected": -2.1906332969665527, + "logps/chosen": -11.252260208129883, + "logps/rejected": -27.119346618652344, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25384432077407837, + "rewards/margins": 6.420773983001709, + "rewards/rejected": -6.166929721832275, + "step": 969 + }, + { + "epoch": 16.440677966101696, + "grad_norm": 4.119938068589781, + "learning_rate": 4.67075908859583e-08, + "logits/chosen": -5.7297210693359375, + "logits/rejected": -3.09169864654541, + "logps/chosen": -8.071650505065918, + "logps/rejected": -20.209758758544922, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26734113693237305, + "rewards/margins": 5.061099052429199, + "rewards/rejected": -4.793758392333984, + "step": 970 + }, + { + "epoch": 16.45762711864407, + "grad_norm": 3.95463357002394, + "learning_rate": 4.6278045276288565e-08, + "logits/chosen": -7.240344047546387, + "logits/rejected": -5.89601469039917, + "logps/chosen": -11.00953483581543, + "logps/rejected": -21.797813415527344, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6719949245452881, + "rewards/margins": 5.352088928222656, + "rewards/rejected": -4.680094242095947, + "step": 971 + }, + { + "epoch": 16.47457627118644, + "grad_norm": 4.099529453892412, + "learning_rate": 4.5850282407524975e-08, + "logits/chosen": -4.743894577026367, + "logits/rejected": -4.78099250793457, + "logps/chosen": -8.380840301513672, + "logps/rejected": -25.225353240966797, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06848488748073578, + "rewards/margins": 5.246893405914307, + "rewards/rejected": -5.315378665924072, + "step": 972 + }, + { + "epoch": 16.491525423728813, + "grad_norm": 5.069108591451907, + "learning_rate": 4.5424306022957745e-08, + "logits/chosen": -9.48492431640625, + "logits/rejected": -11.09494400024414, + "logps/chosen": -11.010992050170898, + "logps/rejected": -17.152000427246094, + "loss": 0.0605, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.36105436086654663, + "rewards/margins": 3.41518497467041, + "rewards/rejected": -3.0541305541992188, + "step": 973 + }, + { + "epoch": 16.508474576271187, + "grad_norm": 4.282441228599341, + "learning_rate": 4.5000119850243626e-08, + "logits/chosen": -4.568205833435059, + "logits/rejected": -3.961057662963867, + "logps/chosen": -10.547223091125488, + "logps/rejected": -31.581138610839844, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23463180661201477, + "rewards/margins": 7.356784343719482, + "rewards/rejected": -7.591416358947754, + "step": 974 + }, + { + "epoch": 16.52542372881356, + "grad_norm": 4.9671363698474575, + "learning_rate": 4.457772760137349e-08, + "logits/chosen": -4.574094772338867, + "logits/rejected": -6.335107326507568, + "logps/chosen": -9.245567321777344, + "logps/rejected": -21.6871337890625, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26459747552871704, + "rewards/margins": 4.457276821136475, + "rewards/rejected": -4.192679405212402, + "step": 975 + }, + { + "epoch": 16.54237288135593, + "grad_norm": 4.077449203746537, + "learning_rate": 4.415713297263987e-08, + "logits/chosen": -9.965399742126465, + "logits/rejected": -7.164189338684082, + "logps/chosen": -11.206932067871094, + "logps/rejected": -22.17264747619629, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4054555594921112, + "rewards/margins": 5.540596008300781, + "rewards/rejected": -5.135139465332031, + "step": 976 + }, + { + "epoch": 16.559322033898304, + "grad_norm": 4.516788897795623, + "learning_rate": 4.3738339644604636e-08, + "logits/chosen": -4.527769565582275, + "logits/rejected": -3.9044039249420166, + "logps/chosen": -9.113353729248047, + "logps/rejected": -25.832937240600586, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43333011865615845, + "rewards/margins": 5.910764694213867, + "rewards/rejected": -5.4774346351623535, + "step": 977 + }, + { + "epoch": 16.576271186440678, + "grad_norm": 3.9477454836043058, + "learning_rate": 4.3321351282066654e-08, + "logits/chosen": -7.476665496826172, + "logits/rejected": -6.898786544799805, + "logps/chosen": -8.629772186279297, + "logps/rejected": -23.448902130126953, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35876137018203735, + "rewards/margins": 5.063984394073486, + "rewards/rejected": -4.705223083496094, + "step": 978 + }, + { + "epoch": 16.593220338983052, + "grad_norm": 4.17873330367698, + "learning_rate": 4.290617153402984e-08, + "logits/chosen": -4.806510925292969, + "logits/rejected": -7.001298904418945, + "logps/chosen": -7.695079326629639, + "logps/rejected": -20.282554626464844, + "loss": 0.0364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4525095522403717, + "rewards/margins": 4.603910446166992, + "rewards/rejected": -4.151400566101074, + "step": 979 + }, + { + "epoch": 16.610169491525422, + "grad_norm": 4.4906778542040575, + "learning_rate": 4.249280403367114e-08, + "logits/chosen": -8.551712989807129, + "logits/rejected": -8.692821502685547, + "logps/chosen": -7.653357982635498, + "logps/rejected": -19.395835876464844, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7328954339027405, + "rewards/margins": 4.667267799377441, + "rewards/rejected": -3.934372901916504, + "step": 980 + }, + { + "epoch": 16.627118644067796, + "grad_norm": 4.582496619989306, + "learning_rate": 4.208125239830901e-08, + "logits/chosen": -4.369369029998779, + "logits/rejected": -8.472222328186035, + "logps/chosen": -6.89990758895874, + "logps/rejected": -20.731672286987305, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4413119852542877, + "rewards/margins": 5.03058385848999, + "rewards/rejected": -4.5892720222473145, + "step": 981 + }, + { + "epoch": 16.64406779661017, + "grad_norm": 3.231930476691675, + "learning_rate": 4.167152022937123e-08, + "logits/chosen": -2.9806668758392334, + "logits/rejected": -5.6147847175598145, + "logps/chosen": -11.49258041381836, + "logps/rejected": -30.111173629760742, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06908190250396729, + "rewards/margins": 6.325509071350098, + "rewards/rejected": -6.394590377807617, + "step": 982 + }, + { + "epoch": 16.661016949152543, + "grad_norm": 4.681138904637989, + "learning_rate": 4.126361111236395e-08, + "logits/chosen": -5.353288173675537, + "logits/rejected": -8.29789924621582, + "logps/chosen": -9.73030948638916, + "logps/rejected": -21.84670639038086, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2567916512489319, + "rewards/margins": 4.876055717468262, + "rewards/rejected": -4.619264125823975, + "step": 983 + }, + { + "epoch": 16.677966101694913, + "grad_norm": 4.336367247523663, + "learning_rate": 4.08575286168398e-08, + "logits/chosen": -7.448416709899902, + "logits/rejected": -8.908486366271973, + "logps/chosen": -5.859768867492676, + "logps/rejected": -18.815467834472656, + "loss": 0.0472, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4510043263435364, + "rewards/margins": 5.035971641540527, + "rewards/rejected": -4.584967613220215, + "step": 984 + }, + { + "epoch": 16.694915254237287, + "grad_norm": 3.9740284639920125, + "learning_rate": 4.0453276296367134e-08, + "logits/chosen": -6.158625602722168, + "logits/rejected": -6.769649982452393, + "logps/chosen": -8.739511489868164, + "logps/rejected": -21.11366081237793, + "loss": 0.0365, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20570890605449677, + "rewards/margins": 4.766705513000488, + "rewards/rejected": -4.560997009277344, + "step": 985 + }, + { + "epoch": 16.71186440677966, + "grad_norm": 4.345605358423864, + "learning_rate": 4.005085768849856e-08, + "logits/chosen": -7.33724308013916, + "logits/rejected": -7.3545241355896, + "logps/chosen": -10.614309310913086, + "logps/rejected": -25.42418670654297, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24098001420497894, + "rewards/margins": 5.777004241943359, + "rewards/rejected": -6.017984390258789, + "step": 986 + }, + { + "epoch": 16.728813559322035, + "grad_norm": 5.174961944414816, + "learning_rate": 3.965027631474035e-08, + "logits/chosen": -5.207669734954834, + "logits/rejected": -5.68195104598999, + "logps/chosen": -8.797658920288086, + "logps/rejected": -19.174301147460938, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1935984045267105, + "rewards/margins": 4.448380947113037, + "rewards/rejected": -4.254782199859619, + "step": 987 + }, + { + "epoch": 16.74576271186441, + "grad_norm": 4.914330167018493, + "learning_rate": 3.9251535680521226e-08, + "logits/chosen": -5.299089431762695, + "logits/rejected": -5.562166690826416, + "logps/chosen": -7.950815677642822, + "logps/rejected": -23.660730361938477, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5078102350234985, + "rewards/margins": 5.57275390625, + "rewards/rejected": -5.064943790435791, + "step": 988 + }, + { + "epoch": 16.76271186440678, + "grad_norm": 4.423714547925529, + "learning_rate": 3.885463927516189e-08, + "logits/chosen": -7.4200263023376465, + "logits/rejected": -6.687015533447266, + "logps/chosen": -7.834423065185547, + "logps/rejected": -20.39315414428711, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6284862160682678, + "rewards/margins": 4.580899238586426, + "rewards/rejected": -3.9524128437042236, + "step": 989 + }, + { + "epoch": 16.779661016949152, + "grad_norm": 3.9593534596195514, + "learning_rate": 3.845959057184453e-08, + "logits/chosen": -9.570975303649902, + "logits/rejected": -10.773117065429688, + "logps/chosen": -8.251781463623047, + "logps/rejected": -19.07634735107422, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48474591970443726, + "rewards/margins": 3.843752384185791, + "rewards/rejected": -3.359006404876709, + "step": 990 + }, + { + "epoch": 16.796610169491526, + "grad_norm": 3.9910442343182297, + "learning_rate": 3.806639302758227e-08, + "logits/chosen": -5.000946044921875, + "logits/rejected": -4.220808029174805, + "logps/chosen": -8.912650108337402, + "logps/rejected": -18.11932945251465, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6203174591064453, + "rewards/margins": 4.1827712059021, + "rewards/rejected": -3.5624542236328125, + "step": 991 + }, + { + "epoch": 16.8135593220339, + "grad_norm": 3.6120155500539144, + "learning_rate": 3.767505008318914e-08, + "logits/chosen": -5.242963790893555, + "logits/rejected": -3.9384145736694336, + "logps/chosen": -10.507009506225586, + "logps/rejected": -27.637222290039062, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36286771297454834, + "rewards/margins": 5.9236369132995605, + "rewards/rejected": -5.560769081115723, + "step": 992 + }, + { + "epoch": 16.83050847457627, + "grad_norm": 4.329264208104702, + "learning_rate": 3.728556516324971e-08, + "logits/chosen": -3.621446371078491, + "logits/rejected": 0.20612281560897827, + "logps/chosen": -19.473979949951172, + "logps/rejected": -28.62809181213379, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3174746632575989, + "rewards/margins": 5.846737861633301, + "rewards/rejected": -5.529263496398926, + "step": 993 + }, + { + "epoch": 16.847457627118644, + "grad_norm": 4.120460619502292, + "learning_rate": 3.6897941676089365e-08, + "logits/chosen": -9.164158821105957, + "logits/rejected": -6.803136825561523, + "logps/chosen": -14.740889549255371, + "logps/rejected": -23.247711181640625, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6667031049728394, + "rewards/margins": 4.748815536499023, + "rewards/rejected": -4.082113265991211, + "step": 994 + }, + { + "epoch": 16.864406779661017, + "grad_norm": 3.5605423273208143, + "learning_rate": 3.651218301374431e-08, + "logits/chosen": -5.762743949890137, + "logits/rejected": -4.326004505157471, + "logps/chosen": -7.795363426208496, + "logps/rejected": -30.06238555908203, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.023066237568855286, + "rewards/margins": 7.547117710113525, + "rewards/rejected": -7.570183277130127, + "step": 995 + }, + { + "epoch": 16.88135593220339, + "grad_norm": 4.515991407877982, + "learning_rate": 3.612829255193192e-08, + "logits/chosen": -6.335334777832031, + "logits/rejected": -5.021296501159668, + "logps/chosen": -10.029027938842773, + "logps/rejected": -25.938034057617188, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11987611651420593, + "rewards/margins": 6.307064533233643, + "rewards/rejected": -6.187188148498535, + "step": 996 + }, + { + "epoch": 16.89830508474576, + "grad_norm": 4.162547805580534, + "learning_rate": 3.574627365002122e-08, + "logits/chosen": -4.132068634033203, + "logits/rejected": -1.9441083669662476, + "logps/chosen": -12.128137588500977, + "logps/rejected": -35.27802658081055, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6331876516342163, + "rewards/margins": 6.974603652954102, + "rewards/rejected": -7.607790946960449, + "step": 997 + }, + { + "epoch": 16.915254237288135, + "grad_norm": 5.532298109239178, + "learning_rate": 3.536612965100361e-08, + "logits/chosen": -6.583493232727051, + "logits/rejected": -4.180866718292236, + "logps/chosen": -10.267550468444824, + "logps/rejected": -23.525606155395508, + "loss": 0.0553, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0621405765414238, + "rewards/margins": 5.680069923400879, + "rewards/rejected": -5.617929458618164, + "step": 998 + }, + { + "epoch": 16.93220338983051, + "grad_norm": 4.459495867860006, + "learning_rate": 3.4987863881463296e-08, + "logits/chosen": -7.422240257263184, + "logits/rejected": -7.356123924255371, + "logps/chosen": -6.346535682678223, + "logps/rejected": -20.76394271850586, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0896294116973877, + "rewards/margins": 4.644069194793701, + "rewards/rejected": -4.554439544677734, + "step": 999 + }, + { + "epoch": 16.949152542372882, + "grad_norm": 4.133219670295683, + "learning_rate": 3.461147965154845e-08, + "logits/chosen": -6.083202838897705, + "logits/rejected": -8.11595630645752, + "logps/chosen": -6.606565475463867, + "logps/rejected": -19.625391006469727, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5629207491874695, + "rewards/margins": 5.111311435699463, + "rewards/rejected": -4.548391342163086, + "step": 1000 + }, + { + "epoch": 16.966101694915253, + "grad_norm": 4.1740786495102755, + "learning_rate": 3.423698025494234e-08, + "logits/chosen": -10.186813354492188, + "logits/rejected": -9.291478157043457, + "logps/chosen": -7.909204483032227, + "logps/rejected": -18.85116195678711, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4344901144504547, + "rewards/margins": 4.76976203918457, + "rewards/rejected": -4.335272312164307, + "step": 1001 + }, + { + "epoch": 16.983050847457626, + "grad_norm": 4.15892660544233, + "learning_rate": 3.386436896883407e-08, + "logits/chosen": -5.626341342926025, + "logits/rejected": -3.084937810897827, + "logps/chosen": -9.295838356018066, + "logps/rejected": -24.96091079711914, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06559734046459198, + "rewards/margins": 5.8852386474609375, + "rewards/rejected": -5.819642066955566, + "step": 1002 + }, + { + "epoch": 17.0, + "grad_norm": 3.8140424835677633, + "learning_rate": 3.349364905389032e-08, + "logits/chosen": -9.058091163635254, + "logits/rejected": -6.692200660705566, + "logps/chosen": -12.010008811950684, + "logps/rejected": -19.010009765625, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41910070180892944, + "rewards/margins": 3.528287887573242, + "rewards/rejected": -3.109187364578247, + "step": 1003 + }, + { + "epoch": 17.016949152542374, + "grad_norm": 4.197883218610352, + "learning_rate": 3.3124823754226625e-08, + "logits/chosen": -6.8970561027526855, + "logits/rejected": -6.827741622924805, + "logps/chosen": -8.531798362731934, + "logps/rejected": -22.772443771362305, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45479583740234375, + "rewards/margins": 5.231786251068115, + "rewards/rejected": -4.77699089050293, + "step": 1004 + }, + { + "epoch": 17.033898305084747, + "grad_norm": 4.497698689530865, + "learning_rate": 3.275789629737905e-08, + "logits/chosen": -5.448521614074707, + "logits/rejected": -5.657861709594727, + "logps/chosen": -8.764067649841309, + "logps/rejected": -20.854806900024414, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20594146847724915, + "rewards/margins": 4.639956474304199, + "rewards/rejected": -4.434015274047852, + "step": 1005 + }, + { + "epoch": 17.050847457627118, + "grad_norm": 4.5378204622729825, + "learning_rate": 3.2392869894275726e-08, + "logits/chosen": -8.017354965209961, + "logits/rejected": -7.641470909118652, + "logps/chosen": -9.67556381225586, + "logps/rejected": -24.360857009887695, + "loss": 0.0536, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02913103997707367, + "rewards/margins": 5.586639404296875, + "rewards/rejected": -5.55750846862793, + "step": 1006 + }, + { + "epoch": 17.06779661016949, + "grad_norm": 4.100101534069955, + "learning_rate": 3.2029747739209245e-08, + "logits/chosen": -5.569543838500977, + "logits/rejected": -6.685817718505859, + "logps/chosen": -10.705131530761719, + "logps/rejected": -23.21912956237793, + "loss": 0.0477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.405699759721756, + "rewards/margins": 5.077655792236328, + "rewards/rejected": -4.671957015991211, + "step": 1007 + }, + { + "epoch": 17.084745762711865, + "grad_norm": 4.211710710930013, + "learning_rate": 3.166853300980821e-08, + "logits/chosen": -6.40574836730957, + "logits/rejected": -5.807041168212891, + "logps/chosen": -7.398978233337402, + "logps/rejected": -17.30287742614746, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3856056034564972, + "rewards/margins": 4.216588973999023, + "rewards/rejected": -3.8309836387634277, + "step": 1008 + }, + { + "epoch": 17.10169491525424, + "grad_norm": 4.273538651613034, + "learning_rate": 3.130922886700968e-08, + "logits/chosen": -10.244964599609375, + "logits/rejected": -10.494848251342773, + "logps/chosen": -8.509642601013184, + "logps/rejected": -18.30198860168457, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5988513231277466, + "rewards/margins": 4.096191883087158, + "rewards/rejected": -3.4973411560058594, + "step": 1009 + }, + { + "epoch": 17.11864406779661, + "grad_norm": 4.25709039242675, + "learning_rate": 3.095183845503144e-08, + "logits/chosen": -7.940372467041016, + "logits/rejected": -7.701578617095947, + "logps/chosen": -10.985665321350098, + "logps/rejected": -27.06873893737793, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39520519971847534, + "rewards/margins": 6.167887210845947, + "rewards/rejected": -5.772682189941406, + "step": 1010 + }, + { + "epoch": 17.135593220338983, + "grad_norm": 4.338372503042337, + "learning_rate": 3.059636490134448e-08, + "logits/chosen": -7.376964092254639, + "logits/rejected": -7.629546642303467, + "logps/chosen": -8.467211723327637, + "logps/rejected": -22.428627014160156, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25212186574935913, + "rewards/margins": 5.631921291351318, + "rewards/rejected": -5.379799842834473, + "step": 1011 + }, + { + "epoch": 17.152542372881356, + "grad_norm": 4.223004676181515, + "learning_rate": 3.024281131664569e-08, + "logits/chosen": -5.16343879699707, + "logits/rejected": -6.354394912719727, + "logps/chosen": -11.564605712890625, + "logps/rejected": -31.499488830566406, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3362666368484497, + "rewards/margins": 7.0102033615112305, + "rewards/rejected": -7.346470355987549, + "step": 1012 + }, + { + "epoch": 17.16949152542373, + "grad_norm": 4.32681883158123, + "learning_rate": 2.989118079483052e-08, + "logits/chosen": -5.105967044830322, + "logits/rejected": -4.726531028747559, + "logps/chosen": -9.233987808227539, + "logps/rejected": -22.715604782104492, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053199321031570435, + "rewards/margins": 4.993124008178711, + "rewards/rejected": -5.046323776245117, + "step": 1013 + }, + { + "epoch": 17.1864406779661, + "grad_norm": 4.308348206921959, + "learning_rate": 2.9541476412966032e-08, + "logits/chosen": -7.126892566680908, + "logits/rejected": -7.172115802764893, + "logps/chosen": -6.875704288482666, + "logps/rejected": -17.938127517700195, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6449772715568542, + "rewards/margins": 4.4160332679748535, + "rewards/rejected": -3.771056652069092, + "step": 1014 + }, + { + "epoch": 17.203389830508474, + "grad_norm": 4.387522145212717, + "learning_rate": 2.9193701231263967e-08, + "logits/chosen": -7.878859043121338, + "logits/rejected": -10.106086730957031, + "logps/chosen": -8.252893447875977, + "logps/rejected": -20.68426513671875, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31005167961120605, + "rewards/margins": 4.4923577308654785, + "rewards/rejected": -4.182306289672852, + "step": 1015 + }, + { + "epoch": 17.220338983050848, + "grad_norm": 4.017533952520582, + "learning_rate": 2.8847858293053805e-08, + "logits/chosen": -4.7985711097717285, + "logits/rejected": -3.0869522094726562, + "logps/chosen": -12.94370174407959, + "logps/rejected": -25.18770980834961, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2541043162345886, + "rewards/margins": 5.228729248046875, + "rewards/rejected": -5.482833385467529, + "step": 1016 + }, + { + "epoch": 17.23728813559322, + "grad_norm": 3.9242681899847254, + "learning_rate": 2.8503950624756413e-08, + "logits/chosen": -3.531855821609497, + "logits/rejected": -2.8809752464294434, + "logps/chosen": -9.461271286010742, + "logps/rejected": -28.473838806152344, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07847141474485397, + "rewards/margins": 7.158161163330078, + "rewards/rejected": -7.079689979553223, + "step": 1017 + }, + { + "epoch": 17.25423728813559, + "grad_norm": 3.7660967758299075, + "learning_rate": 2.816198123585714e-08, + "logits/chosen": -6.652817249298096, + "logits/rejected": -2.6611244678497314, + "logps/chosen": -10.80378246307373, + "logps/rejected": -20.912567138671875, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6135255098342896, + "rewards/margins": 4.984477996826172, + "rewards/rejected": -4.37095308303833, + "step": 1018 + }, + { + "epoch": 17.271186440677965, + "grad_norm": 4.1856106009455765, + "learning_rate": 2.782195311887997e-08, + "logits/chosen": -8.43665599822998, + "logits/rejected": -5.13716459274292, + "logps/chosen": -7.108492374420166, + "logps/rejected": -21.752134323120117, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.394392728805542, + "rewards/margins": 5.154807090759277, + "rewards/rejected": -4.760413646697998, + "step": 1019 + }, + { + "epoch": 17.28813559322034, + "grad_norm": 3.9921502276466527, + "learning_rate": 2.7483869249360912e-08, + "logits/chosen": -9.98592758178711, + "logits/rejected": -9.118145942687988, + "logps/chosen": -9.33875846862793, + "logps/rejected": -22.786651611328125, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24660375714302063, + "rewards/margins": 4.794728755950928, + "rewards/rejected": -5.041332721710205, + "step": 1020 + }, + { + "epoch": 17.305084745762713, + "grad_norm": 4.1519514831089985, + "learning_rate": 2.7147732585822425e-08, + "logits/chosen": -6.564297199249268, + "logits/rejected": -4.144745349884033, + "logps/chosen": -8.098861694335938, + "logps/rejected": -21.80293846130371, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4752035140991211, + "rewards/margins": 5.580191135406494, + "rewards/rejected": -5.104987621307373, + "step": 1021 + }, + { + "epoch": 17.322033898305083, + "grad_norm": 4.665077369798747, + "learning_rate": 2.6813546069746978e-08, + "logits/chosen": -6.514552116394043, + "logits/rejected": -6.659337043762207, + "logps/chosen": -13.189889907836914, + "logps/rejected": -21.970909118652344, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22609473764896393, + "rewards/margins": 4.234546661376953, + "rewards/rejected": -4.008452415466309, + "step": 1022 + }, + { + "epoch": 17.338983050847457, + "grad_norm": 4.136419957605596, + "learning_rate": 2.6481312625551726e-08, + "logits/chosen": -5.896478652954102, + "logits/rejected": -5.556583404541016, + "logps/chosen": -9.151643753051758, + "logps/rejected": -23.223796844482422, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22782325744628906, + "rewards/margins": 4.698112964630127, + "rewards/rejected": -4.47028923034668, + "step": 1023 + }, + { + "epoch": 17.35593220338983, + "grad_norm": 4.0030922528193855, + "learning_rate": 2.6151035160562747e-08, + "logits/chosen": -5.044496059417725, + "logits/rejected": -4.717165470123291, + "logps/chosen": -13.499955177307129, + "logps/rejected": -24.894947052001953, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14019767940044403, + "rewards/margins": 4.581329345703125, + "rewards/rejected": -4.721526622772217, + "step": 1024 + }, + { + "epoch": 17.372881355932204, + "grad_norm": 4.458265653482429, + "learning_rate": 2.5822716564989605e-08, + "logits/chosen": -5.746660232543945, + "logits/rejected": -5.290646553039551, + "logps/chosen": -8.748152732849121, + "logps/rejected": -20.10523223876953, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3295852839946747, + "rewards/margins": 4.266857147216797, + "rewards/rejected": -3.9372718334198, + "step": 1025 + }, + { + "epoch": 17.389830508474578, + "grad_norm": 4.048704607082209, + "learning_rate": 2.5496359711900117e-08, + "logits/chosen": -7.465732574462891, + "logits/rejected": -4.066118240356445, + "logps/chosen": -11.653614044189453, + "logps/rejected": -23.19156837463379, + "loss": 0.0492, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04617789387702942, + "rewards/margins": 4.813108444213867, + "rewards/rejected": -4.766931056976318, + "step": 1026 + }, + { + "epoch": 17.406779661016948, + "grad_norm": 4.763237864684059, + "learning_rate": 2.5171967457195213e-08, + "logits/chosen": -10.00543212890625, + "logits/rejected": -7.8041534423828125, + "logps/chosen": -10.356070518493652, + "logps/rejected": -18.669546127319336, + "loss": 0.0452, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12074128538370132, + "rewards/margins": 4.16883659362793, + "rewards/rejected": -4.048095703125, + "step": 1027 + }, + { + "epoch": 17.423728813559322, + "grad_norm": 3.807693575659305, + "learning_rate": 2.4849542639583832e-08, + "logits/chosen": -7.451657295227051, + "logits/rejected": -4.065084457397461, + "logps/chosen": -16.081920623779297, + "logps/rejected": -29.290130615234375, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11072258651256561, + "rewards/margins": 6.283128261566162, + "rewards/rejected": -6.172406196594238, + "step": 1028 + }, + { + "epoch": 17.440677966101696, + "grad_norm": 4.561097923698961, + "learning_rate": 2.4529088080558202e-08, + "logits/chosen": -7.064828395843506, + "logits/rejected": -5.508311748504639, + "logps/chosen": -10.803018569946289, + "logps/rejected": -26.995498657226562, + "loss": 0.0459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13047471642494202, + "rewards/margins": 6.475358963012695, + "rewards/rejected": -6.605833053588867, + "step": 1029 + }, + { + "epoch": 17.45762711864407, + "grad_norm": 4.094841195832772, + "learning_rate": 2.4210606584369103e-08, + "logits/chosen": -7.4426045417785645, + "logits/rejected": -5.818709373474121, + "logps/chosen": -5.992995262145996, + "logps/rejected": -22.609210968017578, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6274833083152771, + "rewards/margins": 5.398805618286133, + "rewards/rejected": -4.771321773529053, + "step": 1030 + }, + { + "epoch": 17.47457627118644, + "grad_norm": 4.183739338552097, + "learning_rate": 2.3894100938001372e-08, + "logits/chosen": -7.961667060852051, + "logits/rejected": -6.155003070831299, + "logps/chosen": -9.56466007232666, + "logps/rejected": -21.0900936126709, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32264530658721924, + "rewards/margins": 4.696996212005615, + "rewards/rejected": -4.3743510246276855, + "step": 1031 + }, + { + "epoch": 17.491525423728813, + "grad_norm": 4.53635220017222, + "learning_rate": 2.3579573911149397e-08, + "logits/chosen": -5.375433444976807, + "logits/rejected": -2.4169154167175293, + "logps/chosen": -13.866455078125, + "logps/rejected": -29.32849884033203, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09247753024101257, + "rewards/margins": 6.1781907081604, + "rewards/rejected": -6.085712432861328, + "step": 1032 + }, + { + "epoch": 17.508474576271187, + "grad_norm": 4.7958550897023615, + "learning_rate": 2.3267028256193034e-08, + "logits/chosen": -2.3388025760650635, + "logits/rejected": -4.182887077331543, + "logps/chosen": -9.572820663452148, + "logps/rejected": -19.907470703125, + "loss": 0.0627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11249032616615295, + "rewards/margins": 4.0869975090026855, + "rewards/rejected": -4.199487686157227, + "step": 1033 + }, + { + "epoch": 17.52542372881356, + "grad_norm": 4.543938420668149, + "learning_rate": 2.2956466708173304e-08, + "logits/chosen": -7.524108409881592, + "logits/rejected": -5.194488525390625, + "logps/chosen": -10.331520080566406, + "logps/rejected": -23.630775451660156, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2804371118545532, + "rewards/margins": 5.539437294006348, + "rewards/rejected": -5.259000301361084, + "step": 1034 + }, + { + "epoch": 17.54237288135593, + "grad_norm": 4.811132955244365, + "learning_rate": 2.2647891984768853e-08, + "logits/chosen": -4.286440849304199, + "logits/rejected": -2.8485355377197266, + "logps/chosen": -8.513978958129883, + "logps/rejected": -24.73643684387207, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3493989109992981, + "rewards/margins": 5.959595680236816, + "rewards/rejected": -6.308995246887207, + "step": 1035 + }, + { + "epoch": 17.559322033898304, + "grad_norm": 4.727659797948743, + "learning_rate": 2.234130678627169e-08, + "logits/chosen": -8.573627471923828, + "logits/rejected": -8.825172424316406, + "logps/chosen": -8.619029998779297, + "logps/rejected": -18.161518096923828, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2388451248407364, + "rewards/margins": 4.048605442047119, + "rewards/rejected": -3.809760332107544, + "step": 1036 + }, + { + "epoch": 17.576271186440678, + "grad_norm": 3.612875798728581, + "learning_rate": 2.2036713795563876e-08, + "logits/chosen": -7.9563422203063965, + "logits/rejected": -6.366887092590332, + "logps/chosen": -8.55833625793457, + "logps/rejected": -19.83431053161621, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2833678722381592, + "rewards/margins": 4.818700313568115, + "rewards/rejected": -4.535332202911377, + "step": 1037 + }, + { + "epoch": 17.593220338983052, + "grad_norm": 3.498209777073271, + "learning_rate": 2.1734115678093938e-08, + "logits/chosen": -3.2180469036102295, + "logits/rejected": -2.4762351512908936, + "logps/chosen": -10.812689781188965, + "logps/rejected": -24.2170352935791, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30085691809654236, + "rewards/margins": 5.222414970397949, + "rewards/rejected": -4.921557903289795, + "step": 1038 + }, + { + "epoch": 17.610169491525422, + "grad_norm": 4.708256784933598, + "learning_rate": 2.1433515081853594e-08, + "logits/chosen": -4.538162708282471, + "logits/rejected": -2.6741445064544678, + "logps/chosen": -9.551587104797363, + "logps/rejected": -27.897594451904297, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22916676104068756, + "rewards/margins": 6.038286209106445, + "rewards/rejected": -5.809118747711182, + "step": 1039 + }, + { + "epoch": 17.627118644067796, + "grad_norm": 3.917316226481937, + "learning_rate": 2.1134914637354368e-08, + "logits/chosen": -7.627310752868652, + "logits/rejected": -7.482641220092773, + "logps/chosen": -10.15949821472168, + "logps/rejected": -23.192731857299805, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25363242626190186, + "rewards/margins": 4.933504581451416, + "rewards/rejected": -5.187136650085449, + "step": 1040 + }, + { + "epoch": 17.64406779661017, + "grad_norm": 4.028125385219284, + "learning_rate": 2.0838316957605074e-08, + "logits/chosen": -3.0444884300231934, + "logits/rejected": -2.337344169616699, + "logps/chosen": -7.633298873901367, + "logps/rejected": -20.37324333190918, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8732346296310425, + "rewards/margins": 4.761894226074219, + "rewards/rejected": -3.8886594772338867, + "step": 1041 + }, + { + "epoch": 17.661016949152543, + "grad_norm": 4.097102323738361, + "learning_rate": 2.0543724638088345e-08, + "logits/chosen": -9.5912446975708, + "logits/rejected": -8.915510177612305, + "logps/chosen": -7.766392707824707, + "logps/rejected": -18.353717803955078, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5832113027572632, + "rewards/margins": 4.126861095428467, + "rewards/rejected": -3.543649673461914, + "step": 1042 + }, + { + "epoch": 17.677966101694913, + "grad_norm": 3.8524271511787362, + "learning_rate": 2.0251140256738352e-08, + "logits/chosen": -7.298845291137695, + "logits/rejected": -5.406156063079834, + "logps/chosen": -9.05811595916748, + "logps/rejected": -18.826417922973633, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15431635081768036, + "rewards/margins": 4.819045066833496, + "rewards/rejected": -4.973361015319824, + "step": 1043 + }, + { + "epoch": 17.694915254237287, + "grad_norm": 4.367851733163418, + "learning_rate": 1.996056637391805e-08, + "logits/chosen": -5.809117317199707, + "logits/rejected": -6.983155250549316, + "logps/chosen": -8.29562759399414, + "logps/rejected": -21.50481414794922, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4013362526893616, + "rewards/margins": 5.432553291320801, + "rewards/rejected": -5.031217098236084, + "step": 1044 + }, + { + "epoch": 17.71186440677966, + "grad_norm": 4.938930187142403, + "learning_rate": 1.9672005532396756e-08, + "logits/chosen": -5.387691020965576, + "logits/rejected": -4.999997138977051, + "logps/chosen": -6.544239044189453, + "logps/rejected": -20.39162254333496, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4906653165817261, + "rewards/margins": 4.722982406616211, + "rewards/rejected": -4.232316493988037, + "step": 1045 + }, + { + "epoch": 17.728813559322035, + "grad_norm": 3.4996852036714134, + "learning_rate": 1.938546025732807e-08, + "logits/chosen": -4.897934436798096, + "logits/rejected": -7.4452338218688965, + "logps/chosen": -9.388957977294922, + "logps/rejected": -24.98798370361328, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7924555540084839, + "rewards/margins": 6.371291637420654, + "rewards/rejected": -5.578836441040039, + "step": 1046 + }, + { + "epoch": 17.74576271186441, + "grad_norm": 4.85977590522318, + "learning_rate": 1.910093305622759e-08, + "logits/chosen": -8.88355541229248, + "logits/rejected": -9.622832298278809, + "logps/chosen": -6.02414608001709, + "logps/rejected": -18.758188247680664, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16433702409267426, + "rewards/margins": 4.420986652374268, + "rewards/rejected": -4.256649494171143, + "step": 1047 + }, + { + "epoch": 17.76271186440678, + "grad_norm": 3.543759620480611, + "learning_rate": 1.881842641895104e-08, + "logits/chosen": -7.751880645751953, + "logits/rejected": -7.712557315826416, + "logps/chosen": -7.928264617919922, + "logps/rejected": -22.417421340942383, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4415779113769531, + "rewards/margins": 5.357847213745117, + "rewards/rejected": -4.916269779205322, + "step": 1048 + }, + { + "epoch": 17.779661016949152, + "grad_norm": 3.708075011329131, + "learning_rate": 1.853794281767257e-08, + "logits/chosen": -5.700033664703369, + "logits/rejected": -6.737403869628906, + "logps/chosen": -9.658474922180176, + "logps/rejected": -23.03371810913086, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2553105354309082, + "rewards/margins": 5.234497547149658, + "rewards/rejected": -4.979187488555908, + "step": 1049 + }, + { + "epoch": 17.796610169491526, + "grad_norm": 3.72922105134763, + "learning_rate": 1.8259484706862948e-08, + "logits/chosen": -6.088375568389893, + "logits/rejected": -6.036728858947754, + "logps/chosen": -9.52256965637207, + "logps/rejected": -24.765438079833984, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43426060676574707, + "rewards/margins": 5.606012344360352, + "rewards/rejected": -6.040272235870361, + "step": 1050 + }, + { + "epoch": 17.8135593220339, + "grad_norm": 4.693389158310756, + "learning_rate": 1.798305452326826e-08, + "logits/chosen": -10.646463394165039, + "logits/rejected": -7.9289679527282715, + "logps/chosen": -12.208354949951172, + "logps/rejected": -24.25341796875, + "loss": 0.0557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10868895053863525, + "rewards/margins": 5.147779941558838, + "rewards/rejected": -5.03909158706665, + "step": 1051 + }, + { + "epoch": 17.83050847457627, + "grad_norm": 4.874265696950657, + "learning_rate": 1.7708654685888336e-08, + "logits/chosen": -8.247642517089844, + "logits/rejected": -3.546093225479126, + "logps/chosen": -7.774269104003906, + "logps/rejected": -21.775653839111328, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31666943430900574, + "rewards/margins": 5.547506809234619, + "rewards/rejected": -5.230836868286133, + "step": 1052 + }, + { + "epoch": 17.847457627118644, + "grad_norm": 4.756915177911726, + "learning_rate": 1.7436287595955944e-08, + "logits/chosen": -3.051187038421631, + "logits/rejected": -3.3574652671813965, + "logps/chosen": -8.808416366577148, + "logps/rejected": -24.266923904418945, + "loss": 0.052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7775027751922607, + "rewards/margins": 4.976606369018555, + "rewards/rejected": -4.199103355407715, + "step": 1053 + }, + { + "epoch": 17.864406779661017, + "grad_norm": 4.879310156866597, + "learning_rate": 1.7165955636915392e-08, + "logits/chosen": -8.966073036193848, + "logits/rejected": -8.02140998840332, + "logps/chosen": -8.468016624450684, + "logps/rejected": -20.691829681396484, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2952655553817749, + "rewards/margins": 4.937621593475342, + "rewards/rejected": -4.642355442047119, + "step": 1054 + }, + { + "epoch": 17.88135593220339, + "grad_norm": 4.916091941632118, + "learning_rate": 1.6897661174402057e-08, + "logits/chosen": -5.873769760131836, + "logits/rejected": -5.605523586273193, + "logps/chosen": -9.172090530395508, + "logps/rejected": -26.349031448364258, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1761208176612854, + "rewards/margins": 6.015247344970703, + "rewards/rejected": -5.8391265869140625, + "step": 1055 + }, + { + "epoch": 17.89830508474576, + "grad_norm": 3.9104875119431144, + "learning_rate": 1.6631406556221333e-08, + "logits/chosen": -4.374204635620117, + "logits/rejected": -4.46373987197876, + "logps/chosen": -13.30536937713623, + "logps/rejected": -24.613677978515625, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5727440118789673, + "rewards/margins": 5.206787586212158, + "rewards/rejected": -4.6340436935424805, + "step": 1056 + }, + { + "epoch": 17.915254237288135, + "grad_norm": 4.568419925624105, + "learning_rate": 1.6367194112328288e-08, + "logits/chosen": -5.129965782165527, + "logits/rejected": -0.9094352722167969, + "logps/chosen": -10.443857192993164, + "logps/rejected": -30.575592041015625, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13605478405952454, + "rewards/margins": 6.419304847717285, + "rewards/rejected": -6.555359363555908, + "step": 1057 + }, + { + "epoch": 17.93220338983051, + "grad_norm": 3.363814152935196, + "learning_rate": 1.6105026154807215e-08, + "logits/chosen": -8.336222648620605, + "logits/rejected": -7.247498035430908, + "logps/chosen": -9.268970489501953, + "logps/rejected": -26.44733238220215, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12646730244159698, + "rewards/margins": 5.633796691894531, + "rewards/rejected": -5.7602643966674805, + "step": 1058 + }, + { + "epoch": 17.949152542372882, + "grad_norm": 3.992182802809226, + "learning_rate": 1.5844904977851376e-08, + "logits/chosen": -5.82818603515625, + "logits/rejected": -7.631371021270752, + "logps/chosen": -9.781405448913574, + "logps/rejected": -27.68490982055664, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.055077582597732544, + "rewards/margins": 5.673110485076904, + "rewards/rejected": -5.7281880378723145, + "step": 1059 + }, + { + "epoch": 17.966101694915253, + "grad_norm": 5.580584613487915, + "learning_rate": 1.558683285774304e-08, + "logits/chosen": -7.345905780792236, + "logits/rejected": -4.286530017852783, + "logps/chosen": -8.853185653686523, + "logps/rejected": -21.969799041748047, + "loss": 0.0537, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22210994362831116, + "rewards/margins": 4.812788486480713, + "rewards/rejected": -4.5906782150268555, + "step": 1060 + }, + { + "epoch": 17.983050847457626, + "grad_norm": 4.198433535721578, + "learning_rate": 1.5330812052833402e-08, + "logits/chosen": -4.884190082550049, + "logits/rejected": -5.0173821449279785, + "logps/chosen": -12.070671081542969, + "logps/rejected": -24.6473388671875, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4499714970588684, + "rewards/margins": 4.89157772064209, + "rewards/rejected": -4.441605567932129, + "step": 1061 + }, + { + "epoch": 18.0, + "grad_norm": 3.5016579794830283, + "learning_rate": 1.507684480352292e-08, + "logits/chosen": -12.084006309509277, + "logits/rejected": -5.082461833953857, + "logps/chosen": -12.766876220703125, + "logps/rejected": -17.5455322265625, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.653103232383728, + "rewards/margins": 4.414988040924072, + "rewards/rejected": -3.7618846893310547, + "step": 1062 + }, + { + "epoch": 18.016949152542374, + "grad_norm": 4.312459721063018, + "learning_rate": 1.4824933332241691e-08, + "logits/chosen": -10.255056381225586, + "logits/rejected": -6.813674449920654, + "logps/chosen": -13.091928482055664, + "logps/rejected": -25.015186309814453, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17327140271663666, + "rewards/margins": 4.631661415100098, + "rewards/rejected": -4.804932594299316, + "step": 1063 + }, + { + "epoch": 18.033898305084747, + "grad_norm": 4.087149777468893, + "learning_rate": 1.457507984343001e-08, + "logits/chosen": -8.456153869628906, + "logits/rejected": -5.839759826660156, + "logps/chosen": -10.093239784240723, + "logps/rejected": -25.46035385131836, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1286679208278656, + "rewards/margins": 5.974670886993408, + "rewards/rejected": -5.846002578735352, + "step": 1064 + }, + { + "epoch": 18.050847457627118, + "grad_norm": 3.9089755517483824, + "learning_rate": 1.4327286523519083e-08, + "logits/chosen": -8.93708610534668, + "logits/rejected": -10.533562660217285, + "logps/chosen": -5.521016597747803, + "logps/rejected": -15.870177268981934, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36885491013526917, + "rewards/margins": 3.37803316116333, + "rewards/rejected": -3.0091781616210938, + "step": 1065 + }, + { + "epoch": 18.06779661016949, + "grad_norm": 4.2051588856191175, + "learning_rate": 1.4081555540911837e-08, + "logits/chosen": -4.821186542510986, + "logits/rejected": -6.939448356628418, + "logps/chosen": -9.59068489074707, + "logps/rejected": -27.474403381347656, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003167472779750824, + "rewards/margins": 5.379495620727539, + "rewards/rejected": -5.376327991485596, + "step": 1066 + }, + { + "epoch": 18.084745762711865, + "grad_norm": 4.36531145641213, + "learning_rate": 1.383788904596403e-08, + "logits/chosen": -6.268502235412598, + "logits/rejected": -5.068531513214111, + "logps/chosen": -10.58634090423584, + "logps/rejected": -26.577701568603516, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05589911341667175, + "rewards/margins": 5.647587776184082, + "rewards/rejected": -5.70348596572876, + "step": 1067 + }, + { + "epoch": 18.10169491525424, + "grad_norm": 4.64611842993512, + "learning_rate": 1.3596289170965308e-08, + "logits/chosen": -6.981442928314209, + "logits/rejected": -3.912247657775879, + "logps/chosen": -9.348136901855469, + "logps/rejected": -23.2004337310791, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3433188199996948, + "rewards/margins": 5.749058246612549, + "rewards/rejected": -5.405739784240723, + "step": 1068 + }, + { + "epoch": 18.11864406779661, + "grad_norm": 4.3465310006915745, + "learning_rate": 1.3356758030120762e-08, + "logits/chosen": -4.298106670379639, + "logits/rejected": -3.6244497299194336, + "logps/chosen": -11.371649742126465, + "logps/rejected": -27.595565795898438, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0008899867534637451, + "rewards/margins": 5.844209671020508, + "rewards/rejected": -5.845099449157715, + "step": 1069 + }, + { + "epoch": 18.135593220338983, + "grad_norm": 4.191316361646987, + "learning_rate": 1.3119297719532241e-08, + "logits/chosen": -7.277906894683838, + "logits/rejected": -4.062497615814209, + "logps/chosen": -9.189289093017578, + "logps/rejected": -19.087574005126953, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21245090663433075, + "rewards/margins": 4.8552045822143555, + "rewards/rejected": -4.642754077911377, + "step": 1070 + }, + { + "epoch": 18.152542372881356, + "grad_norm": 4.984892397415893, + "learning_rate": 1.2883910317180003e-08, + "logits/chosen": -4.394636631011963, + "logits/rejected": -5.624868392944336, + "logps/chosen": -10.798135757446289, + "logps/rejected": -24.8155517578125, + "loss": 0.0592, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2576065957546234, + "rewards/margins": 5.6426544189453125, + "rewards/rejected": -5.900261402130127, + "step": 1071 + }, + { + "epoch": 18.16949152542373, + "grad_norm": 3.6473470262218677, + "learning_rate": 1.265059788290468e-08, + "logits/chosen": -7.392230987548828, + "logits/rejected": -6.054595947265625, + "logps/chosen": -11.147891998291016, + "logps/rejected": -27.8052978515625, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2997833490371704, + "rewards/margins": 5.084535598754883, + "rewards/rejected": -5.384318828582764, + "step": 1072 + }, + { + "epoch": 18.1864406779661, + "grad_norm": 3.7167659894466394, + "learning_rate": 1.2419362458389093e-08, + "logits/chosen": -4.075685024261475, + "logits/rejected": -1.814130187034607, + "logps/chosen": -11.281499862670898, + "logps/rejected": -23.99729347229004, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09586359560489655, + "rewards/margins": 5.473601341247559, + "rewards/rejected": -5.377737998962402, + "step": 1073 + }, + { + "epoch": 18.203389830508474, + "grad_norm": 4.526089243624855, + "learning_rate": 1.219020606714044e-08, + "logits/chosen": -4.463575839996338, + "logits/rejected": -4.573124885559082, + "logps/chosen": -9.437901496887207, + "logps/rejected": -21.910377502441406, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4638257622718811, + "rewards/margins": 5.842223644256592, + "rewards/rejected": -5.3783979415893555, + "step": 1074 + }, + { + "epoch": 18.220338983050848, + "grad_norm": 3.9597685733153942, + "learning_rate": 1.196313071447269e-08, + "logits/chosen": -7.910276412963867, + "logits/rejected": -6.576176643371582, + "logps/chosen": -9.151101112365723, + "logps/rejected": -25.661685943603516, + "loss": 0.0376, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5685608983039856, + "rewards/margins": 6.249361515045166, + "rewards/rejected": -5.680800437927246, + "step": 1075 + }, + { + "epoch": 18.23728813559322, + "grad_norm": 4.875313470692362, + "learning_rate": 1.1738138387488938e-08, + "logits/chosen": -9.282812118530273, + "logits/rejected": -10.521957397460938, + "logps/chosen": -9.483545303344727, + "logps/rejected": -22.795330047607422, + "loss": 0.0563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16500984132289886, + "rewards/margins": 4.907614231109619, + "rewards/rejected": -4.7426042556762695, + "step": 1076 + }, + { + "epoch": 18.25423728813559, + "grad_norm": 4.45319886027985, + "learning_rate": 1.1515231055063911e-08, + "logits/chosen": -5.865035057067871, + "logits/rejected": -6.470462322235107, + "logps/chosen": -8.85952091217041, + "logps/rejected": -21.33811378479004, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23279842734336853, + "rewards/margins": 4.53937292098999, + "rewards/rejected": -4.306574821472168, + "step": 1077 + }, + { + "epoch": 18.271186440677965, + "grad_norm": 3.7408544031494886, + "learning_rate": 1.129441066782702e-08, + "logits/chosen": -3.0924148559570312, + "logits/rejected": -3.013120174407959, + "logps/chosen": -9.44249153137207, + "logps/rejected": -24.7534122467041, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33274826407432556, + "rewards/margins": 6.343916416168213, + "rewards/rejected": -6.676664352416992, + "step": 1078 + }, + { + "epoch": 18.28813559322034, + "grad_norm": 4.13897439636955, + "learning_rate": 1.1075679158145002e-08, + "logits/chosen": -5.453994274139404, + "logits/rejected": -5.731247901916504, + "logps/chosen": -8.67552375793457, + "logps/rejected": -22.82550811767578, + "loss": 0.0413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3107266426086426, + "rewards/margins": 5.333024501800537, + "rewards/rejected": -5.022297382354736, + "step": 1079 + }, + { + "epoch": 18.305084745762713, + "grad_norm": 3.7977422206050173, + "learning_rate": 1.0859038440105161e-08, + "logits/chosen": -7.962723731994629, + "logits/rejected": -7.311408996582031, + "logps/chosen": -9.005022048950195, + "logps/rejected": -16.626529693603516, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3624380826950073, + "rewards/margins": 4.189855098724365, + "rewards/rejected": -3.8274168968200684, + "step": 1080 + }, + { + "epoch": 18.322033898305083, + "grad_norm": 4.17151909264831, + "learning_rate": 1.0644490409498636e-08, + "logits/chosen": -8.780117988586426, + "logits/rejected": -8.144914627075195, + "logps/chosen": -14.520820617675781, + "logps/rejected": -26.92098045349121, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09730023145675659, + "rewards/margins": 5.36954402923584, + "rewards/rejected": -5.46684455871582, + "step": 1081 + }, + { + "epoch": 18.338983050847457, + "grad_norm": 4.266354260595334, + "learning_rate": 1.0432036943803707e-08, + "logits/chosen": -7.444359302520752, + "logits/rejected": -7.710020542144775, + "logps/chosen": -9.911048889160156, + "logps/rejected": -20.40279769897461, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37346360087394714, + "rewards/margins": 4.519195079803467, + "rewards/rejected": -4.145731449127197, + "step": 1082 + }, + { + "epoch": 18.35593220338983, + "grad_norm": 3.7359193614404496, + "learning_rate": 1.0221679902169461e-08, + "logits/chosen": -8.830029487609863, + "logits/rejected": -8.9429931640625, + "logps/chosen": -7.692529678344727, + "logps/rejected": -25.185260772705078, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25370359420776367, + "rewards/margins": 6.122130393981934, + "rewards/rejected": -6.375833511352539, + "step": 1083 + }, + { + "epoch": 18.372881355932204, + "grad_norm": 4.376752842603358, + "learning_rate": 1.0013421125399519e-08, + "logits/chosen": -5.871336460113525, + "logits/rejected": -3.8425660133361816, + "logps/chosen": -10.602272033691406, + "logps/rejected": -19.069087982177734, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12840460240840912, + "rewards/margins": 3.8662519454956055, + "rewards/rejected": -3.737847328186035, + "step": 1084 + }, + { + "epoch": 18.389830508474578, + "grad_norm": 4.335279206101846, + "learning_rate": 9.80726243593577e-09, + "logits/chosen": -6.976935386657715, + "logits/rejected": -7.360747814178467, + "logps/chosen": -9.483010292053223, + "logps/rejected": -24.190677642822266, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08014628291130066, + "rewards/margins": 5.549870014190674, + "rewards/rejected": -5.469723701477051, + "step": 1085 + }, + { + "epoch": 18.406779661016948, + "grad_norm": 4.346101682290994, + "learning_rate": 9.603205637842698e-09, + "logits/chosen": -6.531325340270996, + "logits/rejected": -7.804195404052734, + "logps/chosen": -7.574182033538818, + "logps/rejected": -21.1744384765625, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5449045300483704, + "rewards/margins": 4.856822967529297, + "rewards/rejected": -4.311918258666992, + "step": 1086 + }, + { + "epoch": 18.423728813559322, + "grad_norm": 5.645565210955278, + "learning_rate": 9.401252516791302e-09, + "logits/chosen": -7.214651584625244, + "logits/rejected": -5.821599006652832, + "logps/chosen": -7.03403377532959, + "logps/rejected": -22.67394256591797, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3359869122505188, + "rewards/margins": 4.7078537940979, + "rewards/rejected": -4.371866226196289, + "step": 1087 + }, + { + "epoch": 18.440677966101696, + "grad_norm": 3.6044174957716923, + "learning_rate": 9.201404840043725e-09, + "logits/chosen": -8.01170539855957, + "logits/rejected": -5.888266086578369, + "logps/chosen": -9.264163970947266, + "logps/rejected": -28.278165817260742, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20131486654281616, + "rewards/margins": 6.650158405303955, + "rewards/rejected": -6.448843479156494, + "step": 1088 + }, + { + "epoch": 18.45762711864407, + "grad_norm": 4.146359056361594, + "learning_rate": 9.003664356437651e-09, + "logits/chosen": -4.719537258148193, + "logits/rejected": -4.794631004333496, + "logps/chosen": -7.586413383483887, + "logps/rejected": -18.851707458496094, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4252668023109436, + "rewards/margins": 5.126582622528076, + "rewards/rejected": -4.701315402984619, + "step": 1089 + }, + { + "epoch": 18.47457627118644, + "grad_norm": 4.5351885213756695, + "learning_rate": 8.808032796371017e-09, + "logits/chosen": -9.071616172790527, + "logits/rejected": -5.767463684082031, + "logps/chosen": -11.853965759277344, + "logps/rejected": -22.780263900756836, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14982807636260986, + "rewards/margins": 5.077415466308594, + "rewards/rejected": -4.927587509155273, + "step": 1090 + }, + { + "epoch": 18.491525423728813, + "grad_norm": 4.15770864036275, + "learning_rate": 8.614511871786828e-09, + "logits/chosen": -5.206994533538818, + "logits/rejected": -2.0901315212249756, + "logps/chosen": -9.402706146240234, + "logps/rejected": -19.950143814086914, + "loss": 0.0575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.043576233088970184, + "rewards/margins": 4.414394378662109, + "rewards/rejected": -4.3708176612854, + "step": 1091 + }, + { + "epoch": 18.508474576271187, + "grad_norm": 4.293926960441934, + "learning_rate": 8.423103276158306e-09, + "logits/chosen": -6.7276129722595215, + "logits/rejected": -6.699288368225098, + "logps/chosen": -10.428839683532715, + "logps/rejected": -19.753442764282227, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09627915173768997, + "rewards/margins": 4.188758850097656, + "rewards/rejected": -4.092479705810547, + "step": 1092 + }, + { + "epoch": 18.52542372881356, + "grad_norm": 3.4689974534493664, + "learning_rate": 8.233808684473959e-09, + "logits/chosen": -6.171113014221191, + "logits/rejected": -2.7044224739074707, + "logps/chosen": -8.165644645690918, + "logps/rejected": -26.037586212158203, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3093603253364563, + "rewards/margins": 6.091079235076904, + "rewards/rejected": -5.781719207763672, + "step": 1093 + }, + { + "epoch": 18.54237288135593, + "grad_norm": 4.108140260887782, + "learning_rate": 8.046629753222955e-09, + "logits/chosen": -4.509550094604492, + "logits/rejected": -5.370866775512695, + "logps/chosen": -9.223006248474121, + "logps/rejected": -20.383045196533203, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22011250257492065, + "rewards/margins": 4.451344013214111, + "rewards/rejected": -4.231231689453125, + "step": 1094 + }, + { + "epoch": 18.559322033898304, + "grad_norm": 4.0705282526634186, + "learning_rate": 7.861568120380634e-09, + "logits/chosen": -6.94540548324585, + "logits/rejected": -6.977925777435303, + "logps/chosen": -9.851380348205566, + "logps/rejected": -25.77141571044922, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09519022703170776, + "rewards/margins": 5.343542098999023, + "rewards/rejected": -5.24835205078125, + "step": 1095 + }, + { + "epoch": 18.576271186440678, + "grad_norm": 4.011699369253127, + "learning_rate": 7.678625405394157e-09, + "logits/chosen": -5.413311004638672, + "logits/rejected": -5.815068244934082, + "logps/chosen": -8.874666213989258, + "logps/rejected": -23.715831756591797, + "loss": 0.0437, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06100507080554962, + "rewards/margins": 5.189451694488525, + "rewards/rejected": -5.12844705581665, + "step": 1096 + }, + { + "epoch": 18.593220338983052, + "grad_norm": 4.46202294370448, + "learning_rate": 7.497803209168346e-09, + "logits/chosen": -9.656492233276367, + "logits/rejected": -8.445954322814941, + "logps/chosen": -9.470484733581543, + "logps/rejected": -19.208017349243164, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5084323287010193, + "rewards/margins": 4.6789469718933105, + "rewards/rejected": -4.1705145835876465, + "step": 1097 + }, + { + "epoch": 18.610169491525422, + "grad_norm": 4.045645521696906, + "learning_rate": 7.319103114051706e-09, + "logits/chosen": -7.331865310668945, + "logits/rejected": -7.503050804138184, + "logps/chosen": -11.165914535522461, + "logps/rejected": -21.40276336669922, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15454208850860596, + "rewards/margins": 4.488321304321289, + "rewards/rejected": -4.3337788581848145, + "step": 1098 + }, + { + "epoch": 18.627118644067796, + "grad_norm": 3.8341377299113075, + "learning_rate": 7.142526683822536e-09, + "logits/chosen": -11.285856246948242, + "logits/rejected": -8.024409294128418, + "logps/chosen": -11.741935729980469, + "logps/rejected": -23.4906005859375, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12301252037286758, + "rewards/margins": 4.999948024749756, + "rewards/rejected": -4.876935005187988, + "step": 1099 + }, + { + "epoch": 18.64406779661017, + "grad_norm": 3.381727056154272, + "learning_rate": 6.9680754636752e-09, + "logits/chosen": -6.764732360839844, + "logits/rejected": -2.8100860118865967, + "logps/chosen": -12.623137474060059, + "logps/rejected": -27.882747650146484, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06339322030544281, + "rewards/margins": 6.394018650054932, + "rewards/rejected": -6.330626010894775, + "step": 1100 + }, + { + "epoch": 18.661016949152543, + "grad_norm": 3.7073145614445306, + "learning_rate": 6.7957509802067104e-09, + "logits/chosen": -7.591949939727783, + "logits/rejected": -4.612473487854004, + "logps/chosen": -11.28479290008545, + "logps/rejected": -26.188610076904297, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2758193910121918, + "rewards/margins": 6.186018943786621, + "rewards/rejected": -6.461838722229004, + "step": 1101 + }, + { + "epoch": 18.677966101694913, + "grad_norm": 3.999225968740304, + "learning_rate": 6.625554741403333e-09, + "logits/chosen": -6.6668806076049805, + "logits/rejected": -6.30198335647583, + "logps/chosen": -9.92651653289795, + "logps/rejected": -21.492538452148438, + "loss": 0.0407, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06392458081245422, + "rewards/margins": 4.802147388458252, + "rewards/rejected": -4.738222599029541, + "step": 1102 + }, + { + "epoch": 18.694915254237287, + "grad_norm": 3.8269904200968066, + "learning_rate": 6.457488236627395e-09, + "logits/chosen": -4.248052597045898, + "logits/rejected": 0.813204824924469, + "logps/chosen": -9.202914237976074, + "logps/rejected": -23.664892196655273, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42141085863113403, + "rewards/margins": 6.465717792510986, + "rewards/rejected": -6.044306755065918, + "step": 1103 + }, + { + "epoch": 18.71186440677966, + "grad_norm": 4.022845003893159, + "learning_rate": 6.291552936604133e-09, + "logits/chosen": -7.192059516906738, + "logits/rejected": -4.204983711242676, + "logps/chosen": -10.985185623168945, + "logps/rejected": -26.992740631103516, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05412624776363373, + "rewards/margins": 6.3019022941589355, + "rewards/rejected": -6.247776031494141, + "step": 1104 + }, + { + "epoch": 18.728813559322035, + "grad_norm": 4.631119734383148, + "learning_rate": 6.127750293409006e-09, + "logits/chosen": -7.748348712921143, + "logits/rejected": -5.174810409545898, + "logps/chosen": -10.34148120880127, + "logps/rejected": -16.223203659057617, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44356459379196167, + "rewards/margins": 3.629425048828125, + "rewards/rejected": -3.1858601570129395, + "step": 1105 + }, + { + "epoch": 18.74576271186441, + "grad_norm": 4.056055906549981, + "learning_rate": 5.966081740454931e-09, + "logits/chosen": -6.4134321212768555, + "logits/rejected": -5.019958972930908, + "logps/chosen": -9.166025161743164, + "logps/rejected": -25.963987350463867, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.254142165184021, + "rewards/margins": 5.986145496368408, + "rewards/rejected": -6.240288257598877, + "step": 1106 + }, + { + "epoch": 18.76271186440678, + "grad_norm": 4.2510113340199815, + "learning_rate": 5.806548692479623e-09, + "logits/chosen": -4.511862277984619, + "logits/rejected": -3.1294548511505127, + "logps/chosen": -16.437929153442383, + "logps/rejected": -32.30805969238281, + "loss": 0.0457, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07681269943714142, + "rewards/margins": 5.85745096206665, + "rewards/rejected": -5.780638217926025, + "step": 1107 + }, + { + "epoch": 18.779661016949152, + "grad_norm": 3.9919824633820964, + "learning_rate": 5.649152545533331e-09, + "logits/chosen": -6.852721691131592, + "logits/rejected": -6.540812015533447, + "logps/chosen": -10.670454025268555, + "logps/rejected": -21.6687068939209, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3178441822528839, + "rewards/margins": 5.176324367523193, + "rewards/rejected": -4.858480453491211, + "step": 1108 + }, + { + "epoch": 18.796610169491526, + "grad_norm": 3.397614174449964, + "learning_rate": 5.493894676966704e-09, + "logits/chosen": -3.6471877098083496, + "logits/rejected": -3.6686882972717285, + "logps/chosen": -8.435153007507324, + "logps/rejected": -25.47620391845703, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010460421442985535, + "rewards/margins": 5.795283317565918, + "rewards/rejected": -5.80574369430542, + "step": 1109 + }, + { + "epoch": 18.8135593220339, + "grad_norm": 3.9669584038833863, + "learning_rate": 5.340776445418471e-09, + "logits/chosen": -10.3143892288208, + "logits/rejected": -7.56541633605957, + "logps/chosen": -8.294013023376465, + "logps/rejected": -23.466705322265625, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12921765446662903, + "rewards/margins": 5.621309757232666, + "rewards/rejected": -5.492092132568359, + "step": 1110 + }, + { + "epoch": 18.83050847457627, + "grad_norm": 4.954316687519291, + "learning_rate": 5.1897991908038396e-09, + "logits/chosen": -6.567185401916504, + "logits/rejected": -5.2818284034729, + "logps/chosen": -7.029053211212158, + "logps/rejected": -15.25006103515625, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30523422360420227, + "rewards/margins": 3.355952739715576, + "rewards/rejected": -3.0507185459136963, + "step": 1111 + }, + { + "epoch": 18.847457627118644, + "grad_norm": 4.968688843586282, + "learning_rate": 5.040964234302558e-09, + "logits/chosen": -7.132032871246338, + "logits/rejected": -6.4067888259887695, + "logps/chosen": -8.73871898651123, + "logps/rejected": -24.557842254638672, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2146884799003601, + "rewards/margins": 6.353532791137695, + "rewards/rejected": -6.138844013214111, + "step": 1112 + }, + { + "epoch": 18.864406779661017, + "grad_norm": 4.4839891569309795, + "learning_rate": 4.894272878347483e-09, + "logits/chosen": -6.7498345375061035, + "logits/rejected": -4.952764987945557, + "logps/chosen": -6.822923183441162, + "logps/rejected": -18.971647262573242, + "loss": 0.0431, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01051950454711914, + "rewards/margins": 4.419131755828857, + "rewards/rejected": -4.408612251281738, + "step": 1113 + }, + { + "epoch": 18.88135593220339, + "grad_norm": 4.040720712271205, + "learning_rate": 4.749726406613142e-09, + "logits/chosen": -12.106992721557617, + "logits/rejected": -5.57379674911499, + "logps/chosen": -16.29015350341797, + "logps/rejected": -23.340852737426758, + "loss": 0.0416, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2273421436548233, + "rewards/margins": 4.638840675354004, + "rewards/rejected": -4.866182804107666, + "step": 1114 + }, + { + "epoch": 18.89830508474576, + "grad_norm": 4.234068539111867, + "learning_rate": 4.607326084004437e-09, + "logits/chosen": -9.570833206176758, + "logits/rejected": -7.380236625671387, + "logps/chosen": -7.770480632781982, + "logps/rejected": -17.556217193603516, + "loss": 0.0481, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6741684675216675, + "rewards/margins": 3.8052420616149902, + "rewards/rejected": -3.131073474884033, + "step": 1115 + }, + { + "epoch": 18.915254237288135, + "grad_norm": 4.759020975492104, + "learning_rate": 4.467073156645712e-09, + "logits/chosen": -7.897947311401367, + "logits/rejected": -8.628703117370605, + "logps/chosen": -9.987771987915039, + "logps/rejected": -21.86614990234375, + "loss": 0.0566, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1313033401966095, + "rewards/margins": 4.395023345947266, + "rewards/rejected": -4.2637200355529785, + "step": 1116 + }, + { + "epoch": 18.93220338983051, + "grad_norm": 4.223825609967969, + "learning_rate": 4.328968851869758e-09, + "logits/chosen": -7.241960525512695, + "logits/rejected": -3.7764053344726562, + "logps/chosen": -12.674301147460938, + "logps/rejected": -25.130722045898438, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07294300943613052, + "rewards/margins": 5.546260356903076, + "rewards/rejected": -5.619204044342041, + "step": 1117 + }, + { + "epoch": 18.949152542372882, + "grad_norm": 4.603862164649134, + "learning_rate": 4.193014378207044e-09, + "logits/chosen": -5.850318908691406, + "logits/rejected": -2.6857542991638184, + "logps/chosen": -9.542285919189453, + "logps/rejected": -24.915546417236328, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2751765847206116, + "rewards/margins": 5.750458240509033, + "rewards/rejected": -5.475281715393066, + "step": 1118 + }, + { + "epoch": 18.966101694915253, + "grad_norm": 4.244835334089812, + "learning_rate": 4.059210925375173e-09, + "logits/chosen": -4.551603317260742, + "logits/rejected": -3.4954090118408203, + "logps/chosen": -12.582915306091309, + "logps/rejected": -23.43550682067871, + "loss": 0.0514, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12680701911449432, + "rewards/margins": 4.264057159423828, + "rewards/rejected": -4.13724946975708, + "step": 1119 + }, + { + "epoch": 18.983050847457626, + "grad_norm": 4.295140577343596, + "learning_rate": 3.927559664268554e-09, + "logits/chosen": -6.582233905792236, + "logits/rejected": -5.693815231323242, + "logps/chosen": -8.524242401123047, + "logps/rejected": -21.318532943725586, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16140446066856384, + "rewards/margins": 5.285299777984619, + "rewards/rejected": -5.123895168304443, + "step": 1120 + }, + { + "epoch": 19.0, + "grad_norm": 4.488787452254915, + "learning_rate": 3.798061746947995e-09, + "logits/chosen": -6.865042686462402, + "logits/rejected": -4.247623920440674, + "logps/chosen": -11.491613388061523, + "logps/rejected": -19.202152252197266, + "loss": 0.0448, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16015183925628662, + "rewards/margins": 4.161006927490234, + "rewards/rejected": -4.321159362792969, + "step": 1121 + }, + { + "epoch": 19.016949152542374, + "grad_norm": 4.64037685756284, + "learning_rate": 3.6707183066307656e-09, + "logits/chosen": -9.344770431518555, + "logits/rejected": -4.774504661560059, + "logps/chosen": -9.032370567321777, + "logps/rejected": -29.84403419494629, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22558191418647766, + "rewards/margins": 6.387495994567871, + "rewards/rejected": -6.613077640533447, + "step": 1122 + }, + { + "epoch": 19.033898305084747, + "grad_norm": 4.955290733050375, + "learning_rate": 3.5455304576806346e-09, + "logits/chosen": -5.308284282684326, + "logits/rejected": -4.995919227600098, + "logps/chosen": -9.144086837768555, + "logps/rejected": -28.654260635375977, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16971349716186523, + "rewards/margins": 5.827495574951172, + "rewards/rejected": -5.657781600952148, + "step": 1123 + }, + { + "epoch": 19.050847457627118, + "grad_norm": 3.9959696885818605, + "learning_rate": 3.4224992955980693e-09, + "logits/chosen": -4.088801383972168, + "logits/rejected": -1.3501896858215332, + "logps/chosen": -10.250041007995605, + "logps/rejected": -22.934467315673828, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48192471265792847, + "rewards/margins": 4.999414443969727, + "rewards/rejected": -4.517489433288574, + "step": 1124 + }, + { + "epoch": 19.06779661016949, + "grad_norm": 4.112989021260128, + "learning_rate": 3.3016258970106903e-09, + "logits/chosen": -8.40589714050293, + "logits/rejected": -7.380486488342285, + "logps/chosen": -9.625313758850098, + "logps/rejected": -29.280725479125977, + "loss": 0.038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07790735363960266, + "rewards/margins": 6.871036529541016, + "rewards/rejected": -6.948944568634033, + "step": 1125 + }, + { + "epoch": 19.084745762711865, + "grad_norm": 4.018426131140643, + "learning_rate": 3.1829113196638613e-09, + "logits/chosen": -5.221579551696777, + "logits/rejected": -5.25046443939209, + "logps/chosen": -9.330720901489258, + "logps/rejected": -27.666893005371094, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04562787711620331, + "rewards/margins": 6.5740461349487305, + "rewards/rejected": -6.528418064117432, + "step": 1126 + }, + { + "epoch": 19.10169491525424, + "grad_norm": 4.339584337044476, + "learning_rate": 3.0663566024114183e-09, + "logits/chosen": -10.538304328918457, + "logits/rejected": -7.795166015625, + "logps/chosen": -10.814375877380371, + "logps/rejected": -19.972862243652344, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4461347460746765, + "rewards/margins": 4.960249423980713, + "rewards/rejected": -4.5141143798828125, + "step": 1127 + }, + { + "epoch": 19.11864406779661, + "grad_norm": 3.642298752875519, + "learning_rate": 2.951962765206567e-09, + "logits/chosen": -8.135869979858398, + "logits/rejected": -5.980784893035889, + "logps/chosen": -6.342912673950195, + "logps/rejected": -17.059585571289062, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7364379167556763, + "rewards/margins": 4.4074482917785645, + "rewards/rejected": -3.6710104942321777, + "step": 1128 + }, + { + "epoch": 19.135593220338983, + "grad_norm": 3.512517269165827, + "learning_rate": 2.839730809092972e-09, + "logits/chosen": -7.244131565093994, + "logits/rejected": -4.6190571784973145, + "logps/chosen": -10.269086837768555, + "logps/rejected": -21.036928176879883, + "loss": 0.0381, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09786014258861542, + "rewards/margins": 5.341797828674316, + "rewards/rejected": -5.243937969207764, + "step": 1129 + }, + { + "epoch": 19.152542372881356, + "grad_norm": 4.495700708917495, + "learning_rate": 2.7296617161960413e-09, + "logits/chosen": -8.322103500366211, + "logits/rejected": -6.5170578956604, + "logps/chosen": -7.848247528076172, + "logps/rejected": -23.73181915283203, + "loss": 0.0514, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.46722692251205444, + "rewards/margins": 5.935376167297363, + "rewards/rejected": -5.4681501388549805, + "step": 1130 + }, + { + "epoch": 19.16949152542373, + "grad_norm": 4.098312139251818, + "learning_rate": 2.6217564497141574e-09, + "logits/chosen": -5.403450965881348, + "logits/rejected": -6.877000331878662, + "logps/chosen": -11.477622985839844, + "logps/rejected": -23.035629272460938, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3956793546676636, + "rewards/margins": 4.676126003265381, + "rewards/rejected": -4.280446529388428, + "step": 1131 + }, + { + "epoch": 19.1864406779661, + "grad_norm": 3.5602830148283773, + "learning_rate": 2.516015953910544e-09, + "logits/chosen": -5.9252705574035645, + "logits/rejected": -4.196394443511963, + "logps/chosen": -8.186470985412598, + "logps/rejected": -21.580202102661133, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11510688811540604, + "rewards/margins": 5.886081218719482, + "rewards/rejected": -5.770974159240723, + "step": 1132 + }, + { + "epoch": 19.203389830508474, + "grad_norm": 4.3782832810573105, + "learning_rate": 2.4124411541047162e-09, + "logits/chosen": -5.3209991455078125, + "logits/rejected": -2.6802937984466553, + "logps/chosen": -12.969682693481445, + "logps/rejected": -27.95191192626953, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3466143012046814, + "rewards/margins": 6.493988990783691, + "rewards/rejected": -6.147375106811523, + "step": 1133 + }, + { + "epoch": 19.220338983050848, + "grad_norm": 4.41272303319751, + "learning_rate": 2.3110329566645158e-09, + "logits/chosen": -9.802309036254883, + "logits/rejected": -9.715230941772461, + "logps/chosen": -8.948881149291992, + "logps/rejected": -20.29668617248535, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3533758223056793, + "rewards/margins": 4.4907755851745605, + "rewards/rejected": -4.137399673461914, + "step": 1134 + }, + { + "epoch": 19.23728813559322, + "grad_norm": 3.9731944084938835, + "learning_rate": 2.2117922489982286e-09, + "logits/chosen": -9.187678337097168, + "logits/rejected": -8.45703411102295, + "logps/chosen": -10.043983459472656, + "logps/rejected": -24.24431800842285, + "loss": 0.0465, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21984882652759552, + "rewards/margins": 5.44026517868042, + "rewards/rejected": -5.220417022705078, + "step": 1135 + }, + { + "epoch": 19.25423728813559, + "grad_norm": 3.8630961033176288, + "learning_rate": 2.1147198995466466e-09, + "logits/chosen": -6.825324535369873, + "logits/rejected": -6.143636703491211, + "logps/chosen": -8.299385070800781, + "logps/rejected": -18.08413314819336, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3547424376010895, + "rewards/margins": 4.631790637969971, + "rewards/rejected": -4.277048110961914, + "step": 1136 + }, + { + "epoch": 19.271186440677965, + "grad_norm": 4.963351485073133, + "learning_rate": 2.0198167577757107e-09, + "logits/chosen": -7.602474212646484, + "logits/rejected": -7.027194976806641, + "logps/chosen": -10.680564880371094, + "logps/rejected": -21.71973991394043, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2703138291835785, + "rewards/margins": 5.047008037567139, + "rewards/rejected": -4.776694297790527, + "step": 1137 + }, + { + "epoch": 19.28813559322034, + "grad_norm": 4.4201548943030575, + "learning_rate": 1.927083654168854e-09, + "logits/chosen": -8.062311172485352, + "logits/rejected": -6.282785892486572, + "logps/chosen": -9.119327545166016, + "logps/rejected": -17.638656616210938, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48620134592056274, + "rewards/margins": 4.4218525886535645, + "rewards/rejected": -3.9356513023376465, + "step": 1138 + }, + { + "epoch": 19.305084745762713, + "grad_norm": 3.5473965133479024, + "learning_rate": 1.8365214002198648e-09, + "logits/chosen": -6.863635540008545, + "logits/rejected": -3.4915430545806885, + "logps/chosen": -11.302085876464844, + "logps/rejected": -21.60725975036621, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44304215908050537, + "rewards/margins": 4.834182262420654, + "rewards/rejected": -4.391139984130859, + "step": 1139 + }, + { + "epoch": 19.322033898305083, + "grad_norm": 4.764976937865656, + "learning_rate": 1.7481307884256725e-09, + "logits/chosen": -10.479633331298828, + "logits/rejected": -4.6769304275512695, + "logps/chosen": -11.336868286132812, + "logps/rejected": -19.19312858581543, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17866046726703644, + "rewards/margins": 3.89330792427063, + "rewards/rejected": -3.714647054672241, + "step": 1140 + }, + { + "epoch": 19.338983050847457, + "grad_norm": 5.020310454336254, + "learning_rate": 1.6619125922796019e-09, + "logits/chosen": -7.57431173324585, + "logits/rejected": -2.7706239223480225, + "logps/chosen": -13.79558277130127, + "logps/rejected": -31.725343704223633, + "loss": 0.0471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6086219549179077, + "rewards/margins": 6.808389186859131, + "rewards/rejected": -7.417011737823486, + "step": 1141 + }, + { + "epoch": 19.35593220338983, + "grad_norm": 3.8807659082830033, + "learning_rate": 1.5778675662643791e-09, + "logits/chosen": -6.23847770690918, + "logits/rejected": -5.861680030822754, + "logps/chosen": -7.707354545593262, + "logps/rejected": -19.576440811157227, + "loss": 0.0392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25658178329467773, + "rewards/margins": 4.902749061584473, + "rewards/rejected": -4.646167278289795, + "step": 1142 + }, + { + "epoch": 19.372881355932204, + "grad_norm": 3.5873067908971983, + "learning_rate": 1.4959964458456931e-09, + "logits/chosen": -6.67938756942749, + "logits/rejected": -6.05525541305542, + "logps/chosen": -11.312013626098633, + "logps/rejected": -30.832914352416992, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16357643902301788, + "rewards/margins": 7.093705177307129, + "rewards/rejected": -6.930128574371338, + "step": 1143 + }, + { + "epoch": 19.389830508474578, + "grad_norm": 4.662610656958441, + "learning_rate": 1.4162999474657266e-09, + "logits/chosen": -9.130437850952148, + "logits/rejected": -7.133908748626709, + "logps/chosen": -7.804490566253662, + "logps/rejected": -15.638628005981445, + "loss": 0.0527, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4274514317512512, + "rewards/margins": 4.310211658477783, + "rewards/rejected": -3.882760524749756, + "step": 1144 + }, + { + "epoch": 19.406779661016948, + "grad_norm": 4.059115335211127, + "learning_rate": 1.3387787685368024e-09, + "logits/chosen": -10.194669723510742, + "logits/rejected": -11.102928161621094, + "logps/chosen": -9.580450057983398, + "logps/rejected": -22.048954010009766, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.114285409450531, + "rewards/margins": 4.883891582489014, + "rewards/rejected": -4.769606113433838, + "step": 1145 + }, + { + "epoch": 19.423728813559322, + "grad_norm": 4.899057307983236, + "learning_rate": 1.2634335874353585e-09, + "logits/chosen": -4.248291969299316, + "logits/rejected": -4.8470587730407715, + "logps/chosen": -9.899078369140625, + "logps/rejected": -27.653818130493164, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09164927154779434, + "rewards/margins": 6.349340915679932, + "rewards/rejected": -6.440991401672363, + "step": 1146 + }, + { + "epoch": 19.440677966101696, + "grad_norm": 3.8704721367512676, + "learning_rate": 1.1902650634960377e-09, + "logits/chosen": -5.251467704772949, + "logits/rejected": -6.173592567443848, + "logps/chosen": -10.023374557495117, + "logps/rejected": -19.808534622192383, + "loss": 0.0453, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15790097415447235, + "rewards/margins": 4.519407272338867, + "rewards/rejected": -4.677308082580566, + "step": 1147 + }, + { + "epoch": 19.45762711864407, + "grad_norm": 3.359884250589287, + "learning_rate": 1.1192738370058574e-09, + "logits/chosen": -7.491716384887695, + "logits/rejected": -7.280231475830078, + "logps/chosen": -6.225736618041992, + "logps/rejected": -18.614547729492188, + "loss": 0.0357, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6800587177276611, + "rewards/margins": 4.669227600097656, + "rewards/rejected": -3.989168882369995, + "step": 1148 + }, + { + "epoch": 19.47457627118644, + "grad_norm": 4.678996121466518, + "learning_rate": 1.050460529198577e-09, + "logits/chosen": -6.2738728523254395, + "logits/rejected": -7.710474967956543, + "logps/chosen": -7.842569351196289, + "logps/rejected": -22.311418533325195, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5984208583831787, + "rewards/margins": 5.338993549346924, + "rewards/rejected": -4.740572929382324, + "step": 1149 + }, + { + "epoch": 19.491525423728813, + "grad_norm": 3.680168733564192, + "learning_rate": 9.838257422493667e-10, + "logits/chosen": -7.964971542358398, + "logits/rejected": -7.558823108673096, + "logps/chosen": -10.521120071411133, + "logps/rejected": -21.55910873413086, + "loss": 0.0429, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28645265102386475, + "rewards/margins": 4.186437606811523, + "rewards/rejected": -3.8999853134155273, + "step": 1150 + }, + { + "epoch": 19.508474576271187, + "grad_norm": 4.966531922721566, + "learning_rate": 9.193700592694531e-10, + "logits/chosen": -4.467232704162598, + "logits/rejected": -4.619475364685059, + "logps/chosen": -7.945337295532227, + "logps/rejected": -19.41752052307129, + "loss": 0.0559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2579636871814728, + "rewards/margins": 4.400396823883057, + "rewards/rejected": -4.658360481262207, + "step": 1151 + }, + { + "epoch": 19.52542372881356, + "grad_norm": 4.404996879947679, + "learning_rate": 8.570940443010655e-10, + "logits/chosen": -6.992228031158447, + "logits/rejected": -5.218314170837402, + "logps/chosen": -7.047336578369141, + "logps/rejected": -14.934014320373535, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3601531386375427, + "rewards/margins": 3.750903844833374, + "rewards/rejected": -3.3907506465911865, + "step": 1152 + }, + { + "epoch": 19.54237288135593, + "grad_norm": 4.429632740370815, + "learning_rate": 7.969982423124689e-10, + "logits/chosen": -11.009224891662598, + "logits/rejected": -7.346052646636963, + "logps/chosen": -8.033683776855469, + "logps/rejected": -17.191102981567383, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7259281873703003, + "rewards/margins": 4.4079718589782715, + "rewards/rejected": -3.6820435523986816, + "step": 1153 + }, + { + "epoch": 19.559322033898304, + "grad_norm": 3.7837176771704844, + "learning_rate": 7.390831791931895e-10, + "logits/chosen": -6.310678482055664, + "logits/rejected": -5.403589725494385, + "logps/chosen": -11.5776948928833, + "logps/rejected": -30.27614402770996, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.053039468824863434, + "rewards/margins": 6.467260360717773, + "rewards/rejected": -6.520299434661865, + "step": 1154 + }, + { + "epoch": 19.576271186440678, + "grad_norm": 4.812999650447907, + "learning_rate": 6.83349361749408e-10, + "logits/chosen": -5.7683844566345215, + "logits/rejected": -5.301779270172119, + "logps/chosen": -7.299991130828857, + "logps/rejected": -17.533212661743164, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4515947699546814, + "rewards/margins": 4.172460556030273, + "rewards/rejected": -3.7208657264709473, + "step": 1155 + }, + { + "epoch": 19.593220338983052, + "grad_norm": 4.580659403548684, + "learning_rate": 6.297972776996285e-10, + "logits/chosen": -5.716320991516113, + "logits/rejected": -3.708559989929199, + "logps/chosen": -8.555032730102539, + "logps/rejected": -20.43698501586914, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30857783555984497, + "rewards/margins": 4.984573841094971, + "rewards/rejected": -4.67599630355835, + "step": 1156 + }, + { + "epoch": 19.610169491525422, + "grad_norm": 4.238548965488203, + "learning_rate": 5.78427395670239e-10, + "logits/chosen": -7.459744930267334, + "logits/rejected": -5.083245754241943, + "logps/chosen": -9.815725326538086, + "logps/rejected": -23.77956199645996, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022835373878479004, + "rewards/margins": 5.320274353027344, + "rewards/rejected": -5.343109607696533, + "step": 1157 + }, + { + "epoch": 19.627118644067796, + "grad_norm": 4.296226251958125, + "learning_rate": 5.29240165191569e-10, + "logits/chosen": -6.119602203369141, + "logits/rejected": -3.446010112762451, + "logps/chosen": -8.73384952545166, + "logps/rejected": -21.660493850708008, + "loss": 0.0436, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1783091127872467, + "rewards/margins": 5.320026397705078, + "rewards/rejected": -5.141717433929443, + "step": 1158 + }, + { + "epoch": 19.64406779661017, + "grad_norm": 4.759685042111325, + "learning_rate": 4.8223601669381e-10, + "logits/chosen": -7.123500823974609, + "logits/rejected": -5.013278484344482, + "logps/chosen": -9.441107749938965, + "logps/rejected": -22.8370304107666, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4494301974773407, + "rewards/margins": 5.925429821014404, + "rewards/rejected": -5.47599983215332, + "step": 1159 + }, + { + "epoch": 19.661016949152543, + "grad_norm": 4.5051160807484605, + "learning_rate": 4.3741536150337934e-10, + "logits/chosen": -5.4635467529296875, + "logits/rejected": -5.964138507843018, + "logps/chosen": -10.20158576965332, + "logps/rejected": -18.295839309692383, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029481612145900726, + "rewards/margins": 3.807978868484497, + "rewards/rejected": -3.7784974575042725, + "step": 1160 + }, + { + "epoch": 19.677966101694913, + "grad_norm": 4.532448176911285, + "learning_rate": 3.9477859183925654e-10, + "logits/chosen": -6.951817989349365, + "logits/rejected": -6.393136978149414, + "logps/chosen": -8.013717651367188, + "logps/rejected": -21.919706344604492, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5353216528892517, + "rewards/margins": 5.655571937561035, + "rewards/rejected": -5.120250701904297, + "step": 1161 + }, + { + "epoch": 19.694915254237287, + "grad_norm": 3.477802121577011, + "learning_rate": 3.5432608080951386e-10, + "logits/chosen": -7.770132541656494, + "logits/rejected": -6.3644304275512695, + "logps/chosen": -8.72642707824707, + "logps/rejected": -19.723552703857422, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2946310043334961, + "rewards/margins": 5.015895366668701, + "rewards/rejected": -4.721264839172363, + "step": 1162 + }, + { + "epoch": 19.71186440677966, + "grad_norm": 3.602158767913972, + "learning_rate": 3.160581824081798e-10, + "logits/chosen": -8.145950317382812, + "logits/rejected": -3.2530272006988525, + "logps/chosen": -12.056177139282227, + "logps/rejected": -27.599056243896484, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3399862051010132, + "rewards/margins": 6.990494728088379, + "rewards/rejected": -6.650508403778076, + "step": 1163 + }, + { + "epoch": 19.728813559322035, + "grad_norm": 4.661769244717556, + "learning_rate": 2.7997523151199186e-10, + "logits/chosen": -5.5459675788879395, + "logits/rejected": -4.609034538269043, + "logps/chosen": -7.424615859985352, + "logps/rejected": -21.676082611083984, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5069709420204163, + "rewards/margins": 4.763805866241455, + "rewards/rejected": -4.256834983825684, + "step": 1164 + }, + { + "epoch": 19.74576271186441, + "grad_norm": 3.968610512394195, + "learning_rate": 2.4607754387753753e-10, + "logits/chosen": -8.559868812561035, + "logits/rejected": -6.620041847229004, + "logps/chosen": -10.406312942504883, + "logps/rejected": -28.012836456298828, + "loss": 0.0348, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023296553641557693, + "rewards/margins": 7.254501819610596, + "rewards/rejected": -7.231204986572266, + "step": 1165 + }, + { + "epoch": 19.76271186440678, + "grad_norm": 4.078397515416586, + "learning_rate": 2.1436541613853442e-10, + "logits/chosen": -10.614221572875977, + "logits/rejected": -6.317427635192871, + "logps/chosen": -10.993356704711914, + "logps/rejected": -20.877456665039062, + "loss": 0.0403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15146180987358093, + "rewards/margins": 4.370360374450684, + "rewards/rejected": -4.521821975708008, + "step": 1166 + }, + { + "epoch": 19.779661016949152, + "grad_norm": 3.7080068749451742, + "learning_rate": 1.8483912580313787e-10, + "logits/chosen": -7.377213478088379, + "logits/rejected": -1.7765655517578125, + "logps/chosen": -12.268579483032227, + "logps/rejected": -25.69411277770996, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23731692135334015, + "rewards/margins": 5.506694316864014, + "rewards/rejected": -5.2693772315979, + "step": 1167 + }, + { + "epoch": 19.796610169491526, + "grad_norm": 4.082044069557541, + "learning_rate": 1.574989312516095e-10, + "logits/chosen": -4.877951622009277, + "logits/rejected": -4.450176239013672, + "logps/chosen": -10.917598724365234, + "logps/rejected": -25.072479248046875, + "loss": 0.0443, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06078708916902542, + "rewards/margins": 5.301979064941406, + "rewards/rejected": -5.362766265869141, + "step": 1168 + }, + { + "epoch": 19.8135593220339, + "grad_norm": 3.8856096679992516, + "learning_rate": 1.3234507173393029e-10, + "logits/chosen": -5.971585273742676, + "logits/rejected": -2.853632926940918, + "logps/chosen": -11.391792297363281, + "logps/rejected": -19.528844833374023, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6338703036308289, + "rewards/margins": 5.5579938888549805, + "rewards/rejected": -4.924124240875244, + "step": 1169 + }, + { + "epoch": 19.83050847457627, + "grad_norm": 3.9286703819740074, + "learning_rate": 1.0937776736782978e-10, + "logits/chosen": -9.924491882324219, + "logits/rejected": -9.015110969543457, + "logps/chosen": -7.7781782150268555, + "logps/rejected": -16.215484619140625, + "loss": 0.0379, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5362356901168823, + "rewards/margins": 4.200999736785889, + "rewards/rejected": -3.664763927459717, + "step": 1170 + }, + { + "epoch": 19.847457627118644, + "grad_norm": 4.526785004898851, + "learning_rate": 8.859721913684337e-11, + "logits/chosen": -9.032225608825684, + "logits/rejected": -8.733574867248535, + "logps/chosen": -13.94797134399414, + "logps/rejected": -32.85288619995117, + "loss": 0.0535, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.665097713470459, + "rewards/margins": 5.278740882873535, + "rewards/rejected": -4.613643646240234, + "step": 1171 + }, + { + "epoch": 19.864406779661017, + "grad_norm": 4.606104834951581, + "learning_rate": 7.000360888850809e-11, + "logits/chosen": -8.41981029510498, + "logits/rejected": -6.411755084991455, + "logps/chosen": -11.847469329833984, + "logps/rejected": -22.13727569580078, + "loss": 0.0576, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2522156834602356, + "rewards/margins": 5.082830429077148, + "rewards/rejected": -4.830615043640137, + "step": 1172 + }, + { + "epoch": 19.88135593220339, + "grad_norm": 4.297813577782475, + "learning_rate": 5.35970993327528e-11, + "logits/chosen": -9.063766479492188, + "logits/rejected": -8.41303825378418, + "logps/chosen": -8.27404499053955, + "logps/rejected": -23.108671188354492, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1744605153799057, + "rewards/margins": 5.226859092712402, + "rewards/rejected": -5.052398681640625, + "step": 1173 + }, + { + "epoch": 19.89830508474576, + "grad_norm": 4.297871723544772, + "learning_rate": 3.9377834040538184e-11, + "logits/chosen": -8.93664836883545, + "logits/rejected": -6.258477210998535, + "logps/chosen": -12.725931167602539, + "logps/rejected": -24.41492462158203, + "loss": 0.0441, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029960155487060547, + "rewards/margins": 4.409930229187012, + "rewards/rejected": -4.439890384674072, + "step": 1174 + }, + { + "epoch": 19.915254237288135, + "grad_norm": 4.197951910383987, + "learning_rate": 2.7345937442552202e-11, + "logits/chosen": -9.97773265838623, + "logits/rejected": -6.583474159240723, + "logps/chosen": -18.799739837646484, + "logps/rejected": -23.433496475219727, + "loss": 0.0458, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.007980942726135254, + "rewards/margins": 4.562077045440674, + "rewards/rejected": -4.570058345794678, + "step": 1175 + }, + { + "epoch": 19.93220338983051, + "grad_norm": 4.040291712847653, + "learning_rate": 1.7501514828183184e-11, + "logits/chosen": -11.39698600769043, + "logits/rejected": -12.546833038330078, + "logps/chosen": -6.392749786376953, + "logps/rejected": -22.700088500976562, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04210100322961807, + "rewards/margins": 5.175865173339844, + "rewards/rejected": -5.133763790130615, + "step": 1176 + }, + { + "epoch": 19.949152542372882, + "grad_norm": 4.065424008081176, + "learning_rate": 9.844652344492832e-12, + "logits/chosen": -8.840291976928711, + "logits/rejected": -5.942503452301025, + "logps/chosen": -7.859723091125488, + "logps/rejected": -18.713224411010742, + "loss": 0.0477, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.774299144744873, + "rewards/margins": 5.152860164642334, + "rewards/rejected": -4.378561496734619, + "step": 1177 + }, + { + "epoch": 19.966101694915253, + "grad_norm": 4.357195551014498, + "learning_rate": 4.375416995577863e-12, + "logits/chosen": -5.456747531890869, + "logits/rejected": -4.642401218414307, + "logps/chosen": -8.65817642211914, + "logps/rejected": -18.599754333496094, + "loss": 0.05, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2617643475532532, + "rewards/margins": 4.422424793243408, + "rewards/rejected": -4.160660743713379, + "step": 1178 + }, + { + "epoch": 19.983050847457626, + "grad_norm": 3.6780075503878824, + "learning_rate": 1.093856641931623e-12, + "logits/chosen": -10.135025024414062, + "logits/rejected": -5.059713840484619, + "logps/chosen": -13.161264419555664, + "logps/rejected": -26.968259811401367, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34054410457611084, + "rewards/margins": 5.3091139793396, + "rewards/rejected": -5.649657249450684, + "step": 1179 + }, + { + "epoch": 20.0, + "grad_norm": 4.1850799623874515, + "learning_rate": 0.0, + "logits/chosen": -7.4185967445373535, + "logits/rejected": -7.274652004241943, + "logps/chosen": -10.746148109436035, + "logps/rejected": -25.2398738861084, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16116692125797272, + "rewards/margins": 5.900046348571777, + "rewards/rejected": -5.738879203796387, + "step": 1180 + }, + { + "epoch": 20.0, + "step": 1180, + "total_flos": 0.0, + "train_loss": 0.18995573704399296, + "train_runtime": 12341.7767, + "train_samples_per_second": 12.232, + "train_steps_per_second": 0.096 + } + ], + "logging_steps": 1, + "max_steps": 1180, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}