{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.925925925925926, "eval_steps": 1, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011851851851851851, "grad_norm": 30.633439415390697, "learning_rate": 7.352941176470588e-09, "logits/chosen": -1.1390000581741333, "logits/rejected": -1.004213571548462, "logps/chosen": -27.46249008178711, "logps/rejected": -40.97970962524414, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.023703703703703703, "grad_norm": 30.560266192737977, "learning_rate": 1.4705882352941176e-08, "logits/chosen": -0.9409990310668945, "logits/rejected": -1.0981616973876953, "logps/chosen": -25.160219192504883, "logps/rejected": -37.994651794433594, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.035555555555555556, "grad_norm": 29.12506200104364, "learning_rate": 2.2058823529411764e-08, "logits/chosen": -1.0592122077941895, "logits/rejected": -1.023957371711731, "logps/chosen": -24.85056495666504, "logps/rejected": -33.17691421508789, "loss": 0.6958, "rewards/accuracies": 0.6875, "rewards/chosen": 0.015119694173336029, "rewards/margins": 0.03308585658669472, "rewards/rejected": -0.01796616055071354, "step": 3 }, { "epoch": 0.047407407407407405, "grad_norm": 30.746154156018388, "learning_rate": 2.941176470588235e-08, "logits/chosen": -1.0859633684158325, "logits/rejected": -0.9216127991676331, "logps/chosen": -27.081607818603516, "logps/rejected": -31.82309913635254, "loss": 0.6924, "rewards/accuracies": 0.75, "rewards/chosen": -0.00960695743560791, "rewards/margins": 0.043169185519218445, "rewards/rejected": -0.05277615040540695, "step": 4 }, { "epoch": 0.05925925925925926, "grad_norm": 29.519287927141388, "learning_rate": 3.676470588235294e-08, "logits/chosen": -1.049912452697754, "logits/rejected": -1.1279696226119995, "logps/chosen": -28.21110725402832, "logps/rejected": -31.672449111938477, "loss": 0.6936, "rewards/accuracies": 0.5625, "rewards/chosen": 0.010185590013861656, "rewards/margins": 0.022015150636434555, "rewards/rejected": -0.011829562485218048, "step": 5 }, { "epoch": 0.07111111111111111, "grad_norm": 30.815268471480024, "learning_rate": 4.411764705882353e-08, "logits/chosen": -0.9929622411727905, "logits/rejected": -0.8766285181045532, "logps/chosen": -33.8680419921875, "logps/rejected": -32.97846221923828, "loss": 0.6901, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01366226002573967, "rewards/margins": -0.027053195983171463, "rewards/rejected": 0.013390939682722092, "step": 6 }, { "epoch": 0.08296296296296296, "grad_norm": 28.066993035093688, "learning_rate": 5.147058823529411e-08, "logits/chosen": -1.0458980798721313, "logits/rejected": -0.99763023853302, "logps/chosen": -27.67081069946289, "logps/rejected": -32.387882232666016, "loss": 0.6937, "rewards/accuracies": 0.375, "rewards/chosen": 0.004842352122068405, "rewards/margins": -0.027380306273698807, "rewards/rejected": 0.03222266212105751, "step": 7 }, { "epoch": 0.09481481481481481, "grad_norm": 30.97687763376115, "learning_rate": 5.88235294117647e-08, "logits/chosen": -0.7906761169433594, "logits/rejected": -0.9265250563621521, "logps/chosen": -21.296993255615234, "logps/rejected": -30.3665771484375, "loss": 0.6966, "rewards/accuracies": 0.3125, "rewards/chosen": -0.011183989234268665, "rewards/margins": -0.038471613079309464, "rewards/rejected": 0.027287624776363373, "step": 8 }, { "epoch": 0.10666666666666667, "grad_norm": 30.839253271730197, "learning_rate": 6.617647058823529e-08, "logits/chosen": -1.0798628330230713, "logits/rejected": -0.8085466623306274, "logps/chosen": -27.84103012084961, "logps/rejected": -27.858829498291016, "loss": 0.697, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00032351166009902954, "rewards/margins": 0.0009455680847167969, "rewards/rejected": -0.0006220571231096983, "step": 9 }, { "epoch": 0.11851851851851852, "grad_norm": 30.290928737637028, "learning_rate": 7.352941176470588e-08, "logits/chosen": -0.9101401567459106, "logits/rejected": -0.8849160671234131, "logps/chosen": -28.90041160583496, "logps/rejected": -36.99686050415039, "loss": 0.7062, "rewards/accuracies": 0.4375, "rewards/chosen": 0.002118426375091076, "rewards/margins": -0.00936745386570692, "rewards/rejected": 0.011485882103443146, "step": 10 }, { "epoch": 0.13037037037037036, "grad_norm": 29.947407184192368, "learning_rate": 8.088235294117647e-08, "logits/chosen": -0.6956688761711121, "logits/rejected": -0.6927211284637451, "logps/chosen": -26.63684844970703, "logps/rejected": -32.870521545410156, "loss": 0.6938, "rewards/accuracies": 0.3125, "rewards/chosen": -0.005509059876203537, "rewards/margins": -0.01887032575905323, "rewards/rejected": 0.013361264020204544, "step": 11 }, { "epoch": 0.14222222222222222, "grad_norm": 28.661495862175222, "learning_rate": 8.823529411764706e-08, "logits/chosen": -0.9000818133354187, "logits/rejected": -0.8827647566795349, "logps/chosen": -30.33894157409668, "logps/rejected": -39.22317886352539, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -5.582557059824467e-05, "rewards/margins": -0.02260081097483635, "rewards/rejected": 0.02254498563706875, "step": 12 }, { "epoch": 0.15407407407407409, "grad_norm": 28.65418832324272, "learning_rate": 9.558823529411763e-08, "logits/chosen": -1.002519130706787, "logits/rejected": -0.8338276147842407, "logps/chosen": -22.69075584411621, "logps/rejected": -28.403766632080078, "loss": 0.6928, "rewards/accuracies": 0.75, "rewards/chosen": 0.02947317063808441, "rewards/margins": 0.025819525122642517, "rewards/rejected": 0.003653643187135458, "step": 13 }, { "epoch": 0.16592592592592592, "grad_norm": 32.991997579379785, "learning_rate": 1.0294117647058822e-07, "logits/chosen": -1.103955864906311, "logits/rejected": -0.9304717183113098, "logps/chosen": -30.54917335510254, "logps/rejected": -35.995635986328125, "loss": 0.6951, "rewards/accuracies": 0.375, "rewards/chosen": -0.029121514409780502, "rewards/margins": -0.03852158039808273, "rewards/rejected": 0.00940006971359253, "step": 14 }, { "epoch": 0.17777777777777778, "grad_norm": 28.604312545703518, "learning_rate": 1.1029411764705881e-07, "logits/chosen": -0.9514233469963074, "logits/rejected": -1.0262576341629028, "logps/chosen": -24.50589942932129, "logps/rejected": -35.90400314331055, "loss": 0.6846, "rewards/accuracies": 0.625, "rewards/chosen": -0.0038123312406241894, "rewards/margins": -0.002235441468656063, "rewards/rejected": -0.001576889306306839, "step": 15 }, { "epoch": 0.18962962962962962, "grad_norm": 29.400959002049447, "learning_rate": 1.176470588235294e-07, "logits/chosen": -1.0083125829696655, "logits/rejected": -1.054337739944458, "logps/chosen": -25.947620391845703, "logps/rejected": -34.74080276489258, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.018293071538209915, "rewards/margins": 0.003816458163782954, "rewards/rejected": 0.014476614072918892, "step": 16 }, { "epoch": 0.20148148148148148, "grad_norm": 28.668497058092704, "learning_rate": 1.25e-07, "logits/chosen": -1.2847508192062378, "logits/rejected": -1.1500571966171265, "logps/chosen": -24.377540588378906, "logps/rejected": -24.707080841064453, "loss": 0.6955, "rewards/accuracies": 0.4375, "rewards/chosen": -0.020369501784443855, "rewards/margins": -0.04808269441127777, "rewards/rejected": 0.027713194489479065, "step": 17 }, { "epoch": 0.21333333333333335, "grad_norm": 28.83942987795299, "learning_rate": 1.3235294117647057e-07, "logits/chosen": -0.7682641744613647, "logits/rejected": -0.7946673035621643, "logps/chosen": -30.430973052978516, "logps/rejected": -38.10691833496094, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": -0.008304210379719734, "rewards/margins": 0.025393059477210045, "rewards/rejected": -0.03369727358222008, "step": 18 }, { "epoch": 0.22518518518518518, "grad_norm": 30.269138451288548, "learning_rate": 1.3970588235294117e-07, "logits/chosen": -0.9091902375221252, "logits/rejected": -0.8255650997161865, "logps/chosen": -23.93052864074707, "logps/rejected": -31.06536865234375, "loss": 0.6875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02613237127661705, "rewards/margins": 0.029419327154755592, "rewards/rejected": -0.0032869577407836914, "step": 19 }, { "epoch": 0.23703703703703705, "grad_norm": 28.790853669800715, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -0.8767175078392029, "logits/rejected": -0.7814630270004272, "logps/chosen": -24.231098175048828, "logps/rejected": -31.490211486816406, "loss": 0.6895, "rewards/accuracies": 0.4375, "rewards/chosen": 0.003405546071007848, "rewards/margins": -0.02764531597495079, "rewards/rejected": 0.03105086088180542, "step": 20 }, { "epoch": 0.24888888888888888, "grad_norm": 28.60949101677147, "learning_rate": 1.5441176470588236e-07, "logits/chosen": -0.7832755446434021, "logits/rejected": -0.6420150995254517, "logps/chosen": -30.503726959228516, "logps/rejected": -37.48731231689453, "loss": 0.6849, "rewards/accuracies": 0.375, "rewards/chosen": 0.015105541795492172, "rewards/margins": -0.011576179414987564, "rewards/rejected": 0.026681719347834587, "step": 21 }, { "epoch": 0.2607407407407407, "grad_norm": 30.79843186614584, "learning_rate": 1.6176470588235293e-07, "logits/chosen": -0.838936984539032, "logits/rejected": -0.890478789806366, "logps/chosen": -23.474294662475586, "logps/rejected": -27.067607879638672, "loss": 0.69, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00017893756739795208, "rewards/margins": 0.007045186124742031, "rewards/rejected": -0.00686624925583601, "step": 22 }, { "epoch": 0.2725925925925926, "grad_norm": 27.887885485403874, "learning_rate": 1.6911764705882354e-07, "logits/chosen": -1.018220067024231, "logits/rejected": -0.9071334600448608, "logps/chosen": -31.536334991455078, "logps/rejected": -39.14314270019531, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.034499503672122955, "rewards/margins": -0.007689610123634338, "rewards/rejected": -0.026809897273778915, "step": 23 }, { "epoch": 0.28444444444444444, "grad_norm": 28.284778127262, "learning_rate": 1.764705882352941e-07, "logits/chosen": -0.9286752939224243, "logits/rejected": -0.8299227952957153, "logps/chosen": -31.92042350769043, "logps/rejected": -35.5419921875, "loss": 0.6982, "rewards/accuracies": 0.4375, "rewards/chosen": -0.020512927323579788, "rewards/margins": -0.028788220137357712, "rewards/rejected": 0.008275296539068222, "step": 24 }, { "epoch": 0.2962962962962963, "grad_norm": 30.25505597249182, "learning_rate": 1.8382352941176472e-07, "logits/chosen": -1.0533839464187622, "logits/rejected": -1.1014021635055542, "logps/chosen": -19.149333953857422, "logps/rejected": -31.527441024780273, "loss": 0.6788, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0033930009230971336, "rewards/margins": 0.015363391488790512, "rewards/rejected": -0.01875639334321022, "step": 25 }, { "epoch": 0.30814814814814817, "grad_norm": 27.757686456876737, "learning_rate": 1.9117647058823527e-07, "logits/chosen": -0.9570952653884888, "logits/rejected": -1.2151724100112915, "logps/chosen": -24.398239135742188, "logps/rejected": -33.41880416870117, "loss": 0.6772, "rewards/accuracies": 0.75, "rewards/chosen": 0.0002640171442180872, "rewards/margins": 0.0642121285200119, "rewards/rejected": -0.06394810229539871, "step": 26 }, { "epoch": 0.32, "grad_norm": 26.596692102946548, "learning_rate": 1.9852941176470587e-07, "logits/chosen": -1.2522035837173462, "logits/rejected": -1.0859918594360352, "logps/chosen": -29.689172744750977, "logps/rejected": -29.878040313720703, "loss": 0.6767, "rewards/accuracies": 0.625, "rewards/chosen": -0.013057002797722816, "rewards/margins": 0.011676701717078686, "rewards/rejected": -0.024733707308769226, "step": 27 }, { "epoch": 0.33185185185185184, "grad_norm": 30.447771442461562, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -1.0458675622940063, "logits/rejected": -1.011217474937439, "logps/chosen": -33.10105895996094, "logps/rejected": -36.27530288696289, "loss": 0.677, "rewards/accuracies": 0.5, "rewards/chosen": -0.022874630987644196, "rewards/margins": 0.024510394781827927, "rewards/rejected": -0.04738502576947212, "step": 28 }, { "epoch": 0.3437037037037037, "grad_norm": 28.662596504192038, "learning_rate": 2.1323529411764705e-07, "logits/chosen": -1.0796473026275635, "logits/rejected": -0.9194013476371765, "logps/chosen": -26.835773468017578, "logps/rejected": -32.48525619506836, "loss": 0.6772, "rewards/accuracies": 0.5, "rewards/chosen": -0.020335160195827484, "rewards/margins": -0.010507296770811081, "rewards/rejected": -0.009827865287661552, "step": 29 }, { "epoch": 0.35555555555555557, "grad_norm": 26.531589778840697, "learning_rate": 2.2058823529411763e-07, "logits/chosen": -1.2273566722869873, "logits/rejected": -1.1912459135055542, "logps/chosen": -25.345335006713867, "logps/rejected": -30.677719116210938, "loss": 0.6825, "rewards/accuracies": 0.5, "rewards/chosen": -0.01930350251495838, "rewards/margins": 0.0014111557975411415, "rewards/rejected": -0.0207146555185318, "step": 30 }, { "epoch": 0.3674074074074074, "grad_norm": 28.447408639251734, "learning_rate": 2.2794117647058823e-07, "logits/chosen": -0.9077786207199097, "logits/rejected": -0.8582189083099365, "logps/chosen": -21.32311248779297, "logps/rejected": -30.76042366027832, "loss": 0.6718, "rewards/accuracies": 0.5, "rewards/chosen": -0.023572970181703568, "rewards/margins": 0.055517010390758514, "rewards/rejected": -0.07908996939659119, "step": 31 }, { "epoch": 0.37925925925925924, "grad_norm": 28.62654390398505, "learning_rate": 2.352941176470588e-07, "logits/chosen": -0.7928012609481812, "logits/rejected": -0.9217997193336487, "logps/chosen": -25.130535125732422, "logps/rejected": -40.558807373046875, "loss": 0.6755, "rewards/accuracies": 0.5, "rewards/chosen": -0.011272218078374863, "rewards/margins": 0.00860257912427187, "rewards/rejected": -0.01987479254603386, "step": 32 }, { "epoch": 0.39111111111111113, "grad_norm": 28.128072104623097, "learning_rate": 2.426470588235294e-07, "logits/chosen": -0.6893962025642395, "logits/rejected": -0.7390108704566956, "logps/chosen": -26.575319290161133, "logps/rejected": -35.0635986328125, "loss": 0.6622, "rewards/accuracies": 0.625, "rewards/chosen": 0.027767837047576904, "rewards/margins": 0.07502231001853943, "rewards/rejected": -0.04725448414683342, "step": 33 }, { "epoch": 0.40296296296296297, "grad_norm": 30.211985121803504, "learning_rate": 2.5e-07, "logits/chosen": -1.1355931758880615, "logits/rejected": -0.9662358164787292, "logps/chosen": -23.865161895751953, "logps/rejected": -31.000974655151367, "loss": 0.6793, "rewards/accuracies": 0.6875, "rewards/chosen": 0.012976722791790962, "rewards/margins": 0.09340134263038635, "rewards/rejected": -0.08042460680007935, "step": 34 }, { "epoch": 0.4148148148148148, "grad_norm": 28.052145828737455, "learning_rate": 2.5735294117647057e-07, "logits/chosen": -1.0929094552993774, "logits/rejected": -1.000644564628601, "logps/chosen": -29.99643325805664, "logps/rejected": -35.5587272644043, "loss": 0.667, "rewards/accuracies": 0.6875, "rewards/chosen": -0.009614755399525166, "rewards/margins": 0.09663048386573792, "rewards/rejected": -0.10624523460865021, "step": 35 }, { "epoch": 0.4266666666666667, "grad_norm": 28.13933829077562, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -0.9170397520065308, "logits/rejected": -0.8025334477424622, "logps/chosen": -35.03669357299805, "logps/rejected": -38.15205001831055, "loss": 0.6579, "rewards/accuracies": 0.5, "rewards/chosen": -0.028206665068864822, "rewards/margins": 0.06581413000822067, "rewards/rejected": -0.0940207988023758, "step": 36 }, { "epoch": 0.43851851851851853, "grad_norm": 26.573245255968228, "learning_rate": 2.720588235294117e-07, "logits/chosen": -0.8840937614440918, "logits/rejected": -1.154779076576233, "logps/chosen": -21.523990631103516, "logps/rejected": -37.12339401245117, "loss": 0.6507, "rewards/accuracies": 0.625, "rewards/chosen": -0.03301115334033966, "rewards/margins": 0.07873430848121643, "rewards/rejected": -0.11174546182155609, "step": 37 }, { "epoch": 0.45037037037037037, "grad_norm": 27.415699326968735, "learning_rate": 2.7941176470588235e-07, "logits/chosen": -0.851655125617981, "logits/rejected": -0.848722517490387, "logps/chosen": -27.6691837310791, "logps/rejected": -35.29108810424805, "loss": 0.6516, "rewards/accuracies": 0.5625, "rewards/chosen": -0.030980991199612617, "rewards/margins": 0.12533767521381378, "rewards/rejected": -0.15631866455078125, "step": 38 }, { "epoch": 0.4622222222222222, "grad_norm": 25.808526954893544, "learning_rate": 2.8676470588235293e-07, "logits/chosen": -1.019010066986084, "logits/rejected": -1.0574215650558472, "logps/chosen": -28.815650939941406, "logps/rejected": -38.8148193359375, "loss": 0.6379, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03295973688364029, "rewards/margins": 0.17373906075954437, "rewards/rejected": -0.20669879019260406, "step": 39 }, { "epoch": 0.4740740740740741, "grad_norm": 26.057899144470028, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.0933586359024048, "logits/rejected": -1.029239535331726, "logps/chosen": -29.62429428100586, "logps/rejected": -36.406829833984375, "loss": 0.6446, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03471359610557556, "rewards/margins": 0.10683241486549377, "rewards/rejected": -0.14154601097106934, "step": 40 }, { "epoch": 0.48592592592592593, "grad_norm": 26.768031824352317, "learning_rate": 3.014705882352941e-07, "logits/chosen": -1.01728093624115, "logits/rejected": -0.958576500415802, "logps/chosen": -27.431272506713867, "logps/rejected": -35.526336669921875, "loss": 0.6315, "rewards/accuracies": 0.8125, "rewards/chosen": -0.002386711537837982, "rewards/margins": 0.16936060786247253, "rewards/rejected": -0.17174731194972992, "step": 41 }, { "epoch": 0.49777777777777776, "grad_norm": 26.609263347631167, "learning_rate": 3.088235294117647e-07, "logits/chosen": -0.8449506759643555, "logits/rejected": -0.8115115761756897, "logps/chosen": -30.90084457397461, "logps/rejected": -41.651222229003906, "loss": 0.6449, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03619501739740372, "rewards/margins": 0.07611033320426941, "rewards/rejected": -0.11230535060167313, "step": 42 }, { "epoch": 0.5096296296296297, "grad_norm": 26.602574402864892, "learning_rate": 3.161764705882353e-07, "logits/chosen": -0.9452661871910095, "logits/rejected": -1.1044366359710693, "logps/chosen": -34.610469818115234, "logps/rejected": -46.29701232910156, "loss": 0.6076, "rewards/accuracies": 0.875, "rewards/chosen": -0.03930632770061493, "rewards/margins": 0.31100231409072876, "rewards/rejected": -0.3503086566925049, "step": 43 }, { "epoch": 0.5214814814814814, "grad_norm": 26.4843286159413, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -1.0949482917785645, "logits/rejected": -1.2503392696380615, "logps/chosen": -26.57402992248535, "logps/rejected": -37.58720016479492, "loss": 0.6344, "rewards/accuracies": 0.5, "rewards/chosen": -0.06539441645145416, "rewards/margins": 0.10176797211170197, "rewards/rejected": -0.16716240346431732, "step": 44 }, { "epoch": 0.5333333333333333, "grad_norm": 25.709202508798676, "learning_rate": 3.3088235294117644e-07, "logits/chosen": -0.9102402329444885, "logits/rejected": -0.6524286270141602, "logps/chosen": -33.4108772277832, "logps/rejected": -34.84819030761719, "loss": 0.5999, "rewards/accuracies": 0.8125, "rewards/chosen": -0.006676537916064262, "rewards/margins": 0.23027826845645905, "rewards/rejected": -0.23695479333400726, "step": 45 }, { "epoch": 0.5451851851851852, "grad_norm": 24.273462665693376, "learning_rate": 3.3823529411764707e-07, "logits/chosen": -1.174068808555603, "logits/rejected": -1.0645579099655151, "logps/chosen": -27.4698429107666, "logps/rejected": -40.68760681152344, "loss": 0.6215, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0283779539167881, "rewards/margins": 0.4031476676464081, "rewards/rejected": -0.37476974725723267, "step": 46 }, { "epoch": 0.557037037037037, "grad_norm": 26.030778079202978, "learning_rate": 3.4558823529411765e-07, "logits/chosen": -1.0018541812896729, "logits/rejected": -0.9415028095245361, "logps/chosen": -31.507640838623047, "logps/rejected": -30.727527618408203, "loss": 0.6199, "rewards/accuracies": 0.6875, "rewards/chosen": -0.034904323518276215, "rewards/margins": 0.14491260051727295, "rewards/rejected": -0.17981691658496857, "step": 47 }, { "epoch": 0.5688888888888889, "grad_norm": 25.162457677575347, "learning_rate": 3.529411764705882e-07, "logits/chosen": -1.079911708831787, "logits/rejected": -0.8244823217391968, "logps/chosen": -26.54584503173828, "logps/rejected": -32.15207290649414, "loss": 0.6022, "rewards/accuracies": 0.5625, "rewards/chosen": -0.045158352702856064, "rewards/margins": 0.24844199419021606, "rewards/rejected": -0.29360032081604004, "step": 48 }, { "epoch": 0.5807407407407408, "grad_norm": 26.364792800100354, "learning_rate": 3.602941176470588e-07, "logits/chosen": -1.0118129253387451, "logits/rejected": -1.0024306774139404, "logps/chosen": -30.346418380737305, "logps/rejected": -36.94379425048828, "loss": 0.5945, "rewards/accuracies": 0.875, "rewards/chosen": -0.004303845576941967, "rewards/margins": 0.29810506105422974, "rewards/rejected": -0.302408903837204, "step": 49 }, { "epoch": 0.5925925925925926, "grad_norm": 25.49611713846103, "learning_rate": 3.6764705882352943e-07, "logits/chosen": -1.1100342273712158, "logits/rejected": -1.2106306552886963, "logps/chosen": -27.509366989135742, "logps/rejected": -38.91688919067383, "loss": 0.6128, "rewards/accuracies": 0.75, "rewards/chosen": -0.02782953716814518, "rewards/margins": 0.22366918623447418, "rewards/rejected": -0.2514986991882324, "step": 50 }, { "epoch": 0.6044444444444445, "grad_norm": 25.714654341441545, "learning_rate": 3.75e-07, "logits/chosen": -1.0252456665039062, "logits/rejected": -0.9075378775596619, "logps/chosen": -22.085193634033203, "logps/rejected": -30.892152786254883, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": -0.005728382617235184, "rewards/margins": 0.43889015913009644, "rewards/rejected": -0.4446185231208801, "step": 51 }, { "epoch": 0.6162962962962963, "grad_norm": 23.576042719128466, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -1.3855410814285278, "logits/rejected": -1.2717393636703491, "logps/chosen": -26.286727905273438, "logps/rejected": -31.481882095336914, "loss": 0.5709, "rewards/accuracies": 0.75, "rewards/chosen": -0.08100883662700653, "rewards/margins": 0.31634223461151123, "rewards/rejected": -0.3973510265350342, "step": 52 }, { "epoch": 0.6281481481481481, "grad_norm": 24.575982948079687, "learning_rate": 3.8970588235294116e-07, "logits/chosen": -0.6433981657028198, "logits/rejected": -0.9913661479949951, "logps/chosen": -27.564329147338867, "logps/rejected": -41.453670501708984, "loss": 0.5978, "rewards/accuracies": 0.75, "rewards/chosen": -0.019089514389634132, "rewards/margins": 0.37892332673072815, "rewards/rejected": -0.3980128765106201, "step": 53 }, { "epoch": 0.64, "grad_norm": 23.450435346962628, "learning_rate": 3.9705882352941174e-07, "logits/chosen": -0.7840040326118469, "logits/rejected": -0.5774534344673157, "logps/chosen": -34.3664665222168, "logps/rejected": -39.58485412597656, "loss": 0.5313, "rewards/accuracies": 0.75, "rewards/chosen": -0.06814204156398773, "rewards/margins": 0.5044265389442444, "rewards/rejected": -0.5725685954093933, "step": 54 }, { "epoch": 0.6518518518518519, "grad_norm": 24.822846411492073, "learning_rate": 4.044117647058823e-07, "logits/chosen": -1.0658819675445557, "logits/rejected": -1.2129803895950317, "logps/chosen": -29.57436752319336, "logps/rejected": -47.35321807861328, "loss": 0.5771, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10051199793815613, "rewards/margins": 0.5884958505630493, "rewards/rejected": -0.6890078186988831, "step": 55 }, { "epoch": 0.6637037037037037, "grad_norm": 25.00801264031218, "learning_rate": 4.117647058823529e-07, "logits/chosen": -1.0222803354263306, "logits/rejected": -1.0347211360931396, "logps/chosen": -30.616716384887695, "logps/rejected": -36.40394592285156, "loss": 0.5714, "rewards/accuracies": 0.875, "rewards/chosen": -0.018573857843875885, "rewards/margins": 0.5196735262870789, "rewards/rejected": -0.5382473468780518, "step": 56 }, { "epoch": 0.6755555555555556, "grad_norm": 25.502073225889802, "learning_rate": 4.191176470588235e-07, "logits/chosen": -0.9514663219451904, "logits/rejected": -0.796308696269989, "logps/chosen": -30.12722396850586, "logps/rejected": -33.41557312011719, "loss": 0.5685, "rewards/accuracies": 0.6875, "rewards/chosen": -0.14312461018562317, "rewards/margins": 0.16847191751003265, "rewards/rejected": -0.31159651279449463, "step": 57 }, { "epoch": 0.6874074074074074, "grad_norm": 22.66345454008422, "learning_rate": 4.264705882352941e-07, "logits/chosen": -0.7501423358917236, "logits/rejected": -0.7819620370864868, "logps/chosen": -31.42731475830078, "logps/rejected": -47.55392074584961, "loss": 0.5282, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06128763407468796, "rewards/margins": 0.8270988464355469, "rewards/rejected": -0.888386607170105, "step": 58 }, { "epoch": 0.6992592592592592, "grad_norm": 22.074290348194058, "learning_rate": 4.338235294117647e-07, "logits/chosen": -0.9507533311843872, "logits/rejected": -0.9616032838821411, "logps/chosen": -28.218658447265625, "logps/rejected": -43.016136169433594, "loss": 0.5125, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12597304582595825, "rewards/margins": 0.6684972643852234, "rewards/rejected": -0.7944703102111816, "step": 59 }, { "epoch": 0.7111111111111111, "grad_norm": 23.08318692263043, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -1.2129336595535278, "logits/rejected": -1.283077359199524, "logps/chosen": -25.175594329833984, "logps/rejected": -30.427568435668945, "loss": 0.5012, "rewards/accuracies": 0.625, "rewards/chosen": -0.10316593945026398, "rewards/margins": 0.6034290194511414, "rewards/rejected": -0.7065950036048889, "step": 60 }, { "epoch": 0.7229629629629629, "grad_norm": 22.04689507649114, "learning_rate": 4.485294117647059e-07, "logits/chosen": -1.275814175605774, "logits/rejected": -1.2445783615112305, "logps/chosen": -24.249958038330078, "logps/rejected": -31.738033294677734, "loss": 0.508, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13786821067333221, "rewards/margins": 0.44874635338783264, "rewards/rejected": -0.5866145491600037, "step": 61 }, { "epoch": 0.7348148148148148, "grad_norm": 21.718511918361607, "learning_rate": 4.5588235294117646e-07, "logits/chosen": -0.931826651096344, "logits/rejected": -1.0001661777496338, "logps/chosen": -25.03376007080078, "logps/rejected": -40.26024627685547, "loss": 0.5085, "rewards/accuracies": 0.75, "rewards/chosen": -0.06334442645311356, "rewards/margins": 0.8100509643554688, "rewards/rejected": -0.8733953237533569, "step": 62 }, { "epoch": 0.7466666666666667, "grad_norm": 24.701982218388043, "learning_rate": 4.6323529411764704e-07, "logits/chosen": -1.1187516450881958, "logits/rejected": -0.9583245515823364, "logps/chosen": -29.71410369873047, "logps/rejected": -37.30023956298828, "loss": 0.5318, "rewards/accuracies": 0.75, "rewards/chosen": -0.0184539332985878, "rewards/margins": 0.7952549457550049, "rewards/rejected": -0.8137089014053345, "step": 63 }, { "epoch": 0.7585185185185185, "grad_norm": 23.712296041316314, "learning_rate": 4.705882352941176e-07, "logits/chosen": -0.8638966083526611, "logits/rejected": -0.7655866146087646, "logps/chosen": -23.166181564331055, "logps/rejected": -33.395484924316406, "loss": 0.523, "rewards/accuracies": 0.9375, "rewards/chosen": -0.020204465836286545, "rewards/margins": 0.6352415680885315, "rewards/rejected": -0.6554459929466248, "step": 64 }, { "epoch": 0.7703703703703704, "grad_norm": 24.60132344003472, "learning_rate": 4.779411764705882e-07, "logits/chosen": -0.726274311542511, "logits/rejected": -0.8394519686698914, "logps/chosen": -32.45426559448242, "logps/rejected": -38.104736328125, "loss": 0.5137, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1299125850200653, "rewards/margins": 0.2771841883659363, "rewards/rejected": -0.4070967733860016, "step": 65 }, { "epoch": 0.7822222222222223, "grad_norm": 22.34792389998632, "learning_rate": 4.852941176470588e-07, "logits/chosen": -0.9895652532577515, "logits/rejected": -0.8180733919143677, "logps/chosen": -29.497535705566406, "logps/rejected": -34.33931350708008, "loss": 0.5037, "rewards/accuracies": 0.8125, "rewards/chosen": -0.03963299095630646, "rewards/margins": 1.122789978981018, "rewards/rejected": -1.162423014640808, "step": 66 }, { "epoch": 0.794074074074074, "grad_norm": 22.03941990218751, "learning_rate": 4.926470588235295e-07, "logits/chosen": -1.2204893827438354, "logits/rejected": -0.9625455737113953, "logps/chosen": -24.71898651123047, "logps/rejected": -29.869098663330078, "loss": 0.4825, "rewards/accuracies": 0.875, "rewards/chosen": -0.01972150057554245, "rewards/margins": 1.2066266536712646, "rewards/rejected": -1.2263481616973877, "step": 67 }, { "epoch": 0.8059259259259259, "grad_norm": 24.106320190723114, "learning_rate": 5e-07, "logits/chosen": -1.2090452909469604, "logits/rejected": -0.8567001819610596, "logps/chosen": -40.8047981262207, "logps/rejected": -41.501075744628906, "loss": 0.5534, "rewards/accuracies": 0.625, "rewards/chosen": -0.23644912242889404, "rewards/margins": 0.4782140851020813, "rewards/rejected": -0.7146631479263306, "step": 68 }, { "epoch": 0.8177777777777778, "grad_norm": 21.771823138835973, "learning_rate": 4.999966183013662e-07, "logits/chosen": -0.8472105860710144, "logits/rejected": -0.74596107006073, "logps/chosen": -29.863178253173828, "logps/rejected": -41.79008483886719, "loss": 0.4941, "rewards/accuracies": 0.9375, "rewards/chosen": -0.031317003071308136, "rewards/margins": 0.7321377396583557, "rewards/rejected": -0.763454794883728, "step": 69 }, { "epoch": 0.8296296296296296, "grad_norm": 24.376274282207536, "learning_rate": 4.999864732969518e-07, "logits/chosen": -1.0997436046600342, "logits/rejected": -0.9688754081726074, "logps/chosen": -25.0852108001709, "logps/rejected": -30.06993865966797, "loss": 0.5104, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1426914781332016, "rewards/margins": 0.9001794457435608, "rewards/rejected": -1.0428708791732788, "step": 70 }, { "epoch": 0.8414814814814815, "grad_norm": 22.70502304759575, "learning_rate": 4.999695652612155e-07, "logits/chosen": -0.9267873167991638, "logits/rejected": -0.7994822263717651, "logps/chosen": -24.70890998840332, "logps/rejected": -32.99658203125, "loss": 0.5115, "rewards/accuracies": 0.75, "rewards/chosen": -0.08305026590824127, "rewards/margins": 0.9199594855308533, "rewards/rejected": -1.0030097961425781, "step": 71 }, { "epoch": 0.8533333333333334, "grad_norm": 24.314232282779354, "learning_rate": 4.999458946515807e-07, "logits/chosen": -1.0226339101791382, "logits/rejected": -0.8611736297607422, "logps/chosen": -24.211002349853516, "logps/rejected": -35.78715896606445, "loss": 0.5429, "rewards/accuracies": 0.75, "rewards/chosen": -0.05914217233657837, "rewards/margins": 0.8107442855834961, "rewards/rejected": -0.8698864579200745, "step": 72 }, { "epoch": 0.8651851851851852, "grad_norm": 23.657900907591774, "learning_rate": 4.999154621084221e-07, "logits/chosen": -0.9246867895126343, "logits/rejected": -0.8785493969917297, "logps/chosen": -30.946041107177734, "logps/rejected": -33.3994255065918, "loss": 0.5232, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11669294536113739, "rewards/margins": 0.31690362095832825, "rewards/rejected": -0.43359655141830444, "step": 73 }, { "epoch": 0.8770370370370371, "grad_norm": 22.551868004831768, "learning_rate": 4.998782684550491e-07, "logits/chosen": -0.9841519594192505, "logits/rejected": -1.094799518585205, "logps/chosen": -28.580307006835938, "logps/rejected": -43.50556182861328, "loss": 0.499, "rewards/accuracies": 0.8125, "rewards/chosen": -0.09129568934440613, "rewards/margins": 0.9107706546783447, "rewards/rejected": -1.0020663738250732, "step": 74 }, { "epoch": 0.8888888888888888, "grad_norm": 22.142782690350444, "learning_rate": 4.998343146976837e-07, "logits/chosen": -1.3637927770614624, "logits/rejected": -1.2565383911132812, "logps/chosen": -22.514596939086914, "logps/rejected": -34.200462341308594, "loss": 0.5102, "rewards/accuracies": 0.875, "rewards/chosen": -0.12911996245384216, "rewards/margins": 0.8493518233299255, "rewards/rejected": -0.9784718751907349, "step": 75 }, { "epoch": 0.9007407407407407, "grad_norm": 23.23826317747192, "learning_rate": 4.997836020254328e-07, "logits/chosen": -0.8813817501068115, "logits/rejected": -0.672915518283844, "logps/chosen": -29.698516845703125, "logps/rejected": -37.84008026123047, "loss": 0.4984, "rewards/accuracies": 0.875, "rewards/chosen": -0.13888058066368103, "rewards/margins": 0.8507511615753174, "rewards/rejected": -0.9896316528320312, "step": 76 }, { "epoch": 0.9125925925925926, "grad_norm": 21.893171367899452, "learning_rate": 4.99726131810256e-07, "logits/chosen": -0.9941626191139221, "logits/rejected": -0.8581103682518005, "logps/chosen": -25.82787322998047, "logps/rejected": -26.007766723632812, "loss": 0.4861, "rewards/accuracies": 0.6875, "rewards/chosen": -0.016278021037578583, "rewards/margins": 0.5648348331451416, "rewards/rejected": -0.5811129212379456, "step": 77 }, { "epoch": 0.9244444444444444, "grad_norm": 21.62182885137629, "learning_rate": 4.996619056069291e-07, "logits/chosen": -0.7729543447494507, "logits/rejected": -0.4389303922653198, "logps/chosen": -30.95619010925293, "logps/rejected": -32.129966735839844, "loss": 0.4933, "rewards/accuracies": 0.75, "rewards/chosen": -0.08845072984695435, "rewards/margins": 0.7277066111564636, "rewards/rejected": -0.8161574602127075, "step": 78 }, { "epoch": 0.9362962962962963, "grad_norm": 23.213502848073986, "learning_rate": 4.995909251530013e-07, "logits/chosen": -0.94273841381073, "logits/rejected": -0.8963134288787842, "logps/chosen": -28.92119598388672, "logps/rejected": -32.17072296142578, "loss": 0.511, "rewards/accuracies": 0.625, "rewards/chosen": -0.11542296409606934, "rewards/margins": 0.23704171180725098, "rewards/rejected": -0.3524646759033203, "step": 79 }, { "epoch": 0.9481481481481482, "grad_norm": 22.2887907095195, "learning_rate": 4.995131923687487e-07, "logits/chosen": -0.8467817902565002, "logits/rejected": -0.9369296431541443, "logps/chosen": -34.357215881347656, "logps/rejected": -45.66539001464844, "loss": 0.4817, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1713842898607254, "rewards/margins": 0.5076985955238342, "rewards/rejected": -0.6790828108787537, "step": 80 }, { "epoch": 0.96, "grad_norm": 22.96182494835123, "learning_rate": 4.994287093571221e-07, "logits/chosen": -1.0453802347183228, "logits/rejected": -0.796444296836853, "logps/chosen": -32.47832107543945, "logps/rejected": -33.27830123901367, "loss": 0.4837, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07759351283311844, "rewards/margins": 0.7651993632316589, "rewards/rejected": -0.8427927494049072, "step": 81 }, { "epoch": 0.9718518518518519, "grad_norm": 21.57718887204458, "learning_rate": 4.993374784036901e-07, "logits/chosen": -0.8520927429199219, "logits/rejected": -0.7505441904067993, "logps/chosen": -31.684307098388672, "logps/rejected": -36.8294677734375, "loss": 0.438, "rewards/accuracies": 0.625, "rewards/chosen": -0.28118762373924255, "rewards/margins": 0.9425290822982788, "rewards/rejected": -1.2237166166305542, "step": 82 }, { "epoch": 0.9837037037037037, "grad_norm": 19.78935688611565, "learning_rate": 4.992395019765775e-07, "logits/chosen": -1.0383273363113403, "logits/rejected": -0.9351356625556946, "logps/chosen": -23.624879837036133, "logps/rejected": -39.410057067871094, "loss": 0.4564, "rewards/accuracies": 0.8125, "rewards/chosen": -0.16446058452129364, "rewards/margins": 1.0133116245269775, "rewards/rejected": -1.1777722835540771, "step": 83 }, { "epoch": 0.9955555555555555, "grad_norm": 20.85888712023128, "learning_rate": 4.991347827263982e-07, "logits/chosen": -0.8024070262908936, "logits/rejected": -0.9107375741004944, "logps/chosen": -23.73459815979004, "logps/rejected": -36.912132263183594, "loss": 0.4675, "rewards/accuracies": 0.75, "rewards/chosen": -0.15683181583881378, "rewards/margins": 0.8828233480453491, "rewards/rejected": -1.0396552085876465, "step": 84 }, { "epoch": 1.0074074074074073, "grad_norm": 21.201555546786057, "learning_rate": 4.990233234861839e-07, "logits/chosen": -1.1527302265167236, "logits/rejected": -0.7363994121551514, "logps/chosen": -33.276859283447266, "logps/rejected": -40.38740539550781, "loss": 0.4559, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10876104235649109, "rewards/margins": 1.702566385269165, "rewards/rejected": -1.8113272190093994, "step": 85 }, { "epoch": 1.0192592592592593, "grad_norm": 18.56352446519458, "learning_rate": 4.989051272713069e-07, "logits/chosen": -1.0480620861053467, "logits/rejected": -0.8963486552238464, "logps/chosen": -24.252384185791016, "logps/rejected": -33.39715576171875, "loss": 0.4078, "rewards/accuracies": 0.75, "rewards/chosen": -0.0004123076796531677, "rewards/margins": 1.1432445049285889, "rewards/rejected": -1.1436569690704346, "step": 86 }, { "epoch": 1.031111111111111, "grad_norm": 19.689301826887924, "learning_rate": 4.987801972793993e-07, "logits/chosen": -1.2163270711898804, "logits/rejected": -1.0421555042266846, "logps/chosen": -29.977680206298828, "logps/rejected": -45.905757904052734, "loss": 0.4003, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1577407568693161, "rewards/margins": 1.4759492874145508, "rewards/rejected": -1.6336898803710938, "step": 87 }, { "epoch": 1.0429629629629629, "grad_norm": 18.31837515764729, "learning_rate": 4.986485368902656e-07, "logits/chosen": -0.9305309653282166, "logits/rejected": -0.7790694832801819, "logps/chosen": -28.83968734741211, "logps/rejected": -45.16325378417969, "loss": 0.3903, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06216466426849365, "rewards/margins": 1.9154176712036133, "rewards/rejected": -1.977582335472107, "step": 88 }, { "epoch": 1.0548148148148149, "grad_norm": 20.64062540717056, "learning_rate": 4.985101496657918e-07, "logits/chosen": -0.9884095191955566, "logits/rejected": -0.9845250844955444, "logps/chosen": -26.488367080688477, "logps/rejected": -39.72848892211914, "loss": 0.4059, "rewards/accuracies": 1.0, "rewards/chosen": -0.2092103362083435, "rewards/margins": 1.6819262504577637, "rewards/rejected": -1.8911365270614624, "step": 89 }, { "epoch": 1.0666666666666667, "grad_norm": 20.58350959300496, "learning_rate": 4.983650393498489e-07, "logits/chosen": -0.9459998607635498, "logits/rejected": -1.0477434396743774, "logps/chosen": -28.036460876464844, "logps/rejected": -47.1868782043457, "loss": 0.4087, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07500169426202774, "rewards/margins": 1.858386516571045, "rewards/rejected": -1.933388113975525, "step": 90 }, { "epoch": 1.0785185185185184, "grad_norm": 20.268639694693597, "learning_rate": 4.982132098681923e-07, "logits/chosen": -0.9951722621917725, "logits/rejected": -1.0074284076690674, "logps/chosen": -27.722091674804688, "logps/rejected": -34.781681060791016, "loss": 0.4277, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0997595265507698, "rewards/margins": 0.8821978569030762, "rewards/rejected": -0.9819574356079102, "step": 91 }, { "epoch": 1.0903703703703704, "grad_norm": 18.542515385326787, "learning_rate": 4.980546653283537e-07, "logits/chosen": -0.8079184889793396, "logits/rejected": -0.7196828126907349, "logps/chosen": -26.553253173828125, "logps/rejected": -42.56141662597656, "loss": 0.3864, "rewards/accuracies": 0.9375, "rewards/chosen": -0.15712840855121613, "rewards/margins": 1.236411690711975, "rewards/rejected": -1.3935401439666748, "step": 92 }, { "epoch": 1.1022222222222222, "grad_norm": 19.264634270110978, "learning_rate": 4.978894100195324e-07, "logits/chosen": -0.8741264343261719, "logits/rejected": -0.7673499584197998, "logps/chosen": -30.112987518310547, "logps/rejected": -40.45325469970703, "loss": 0.3594, "rewards/accuracies": 0.75, "rewards/chosen": -0.09071722626686096, "rewards/margins": 1.353488802909851, "rewards/rejected": -1.4442059993743896, "step": 93 }, { "epoch": 1.114074074074074, "grad_norm": 19.800071465779904, "learning_rate": 4.977174484124775e-07, "logits/chosen": -0.9135103225708008, "logits/rejected": -0.9411644339561462, "logps/chosen": -31.76198959350586, "logps/rejected": -36.60845947265625, "loss": 0.3919, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11135930567979813, "rewards/margins": 1.0709939002990723, "rewards/rejected": -1.1823533773422241, "step": 94 }, { "epoch": 1.125925925925926, "grad_norm": 17.895705636922976, "learning_rate": 4.975387851593676e-07, "logits/chosen": -1.1691641807556152, "logits/rejected": -1.1903026103973389, "logps/chosen": -27.760555267333984, "logps/rejected": -39.08766555786133, "loss": 0.3879, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03780115395784378, "rewards/margins": 0.8077431321144104, "rewards/rejected": -0.769942045211792, "step": 95 }, { "epoch": 1.1377777777777778, "grad_norm": 20.599609019101436, "learning_rate": 4.97353425093685e-07, "logits/chosen": -0.7190616726875305, "logits/rejected": -0.7100368142127991, "logps/chosen": -25.934661865234375, "logps/rejected": -34.01348876953125, "loss": 0.4321, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10089172422885895, "rewards/margins": 0.6498050689697266, "rewards/rejected": -0.7506968379020691, "step": 96 }, { "epoch": 1.1496296296296296, "grad_norm": 21.095071119686782, "learning_rate": 4.971613732300848e-07, "logits/chosen": -0.9941329956054688, "logits/rejected": -0.9379687309265137, "logps/chosen": -27.864826202392578, "logps/rejected": -40.54108810424805, "loss": 0.3575, "rewards/accuracies": 0.875, "rewards/chosen": -0.0676848441362381, "rewards/margins": 1.5730375051498413, "rewards/rejected": -1.640722393989563, "step": 97 }, { "epoch": 1.1614814814814816, "grad_norm": 17.403612874481382, "learning_rate": 4.96962634764259e-07, "logits/chosen": -1.2904139757156372, "logits/rejected": -1.0033934116363525, "logps/chosen": -30.47223472595215, "logps/rejected": -40.54103088378906, "loss": 0.3457, "rewards/accuracies": 0.875, "rewards/chosen": 0.018950080499053, "rewards/margins": 1.4018616676330566, "rewards/rejected": -1.3829115629196167, "step": 98 }, { "epoch": 1.1733333333333333, "grad_norm": 17.611474070304414, "learning_rate": 4.967572150727964e-07, "logits/chosen": -1.046346664428711, "logits/rejected": -0.7940016388893127, "logps/chosen": -32.027244567871094, "logps/rejected": -32.67860412597656, "loss": 0.3469, "rewards/accuracies": 0.875, "rewards/chosen": -0.13017599284648895, "rewards/margins": 0.9814402461051941, "rewards/rejected": -1.1116162538528442, "step": 99 }, { "epoch": 1.1851851851851851, "grad_norm": 18.404945181563157, "learning_rate": 4.965451197130372e-07, "logits/chosen": -1.1877573728561401, "logits/rejected": -0.7191611528396606, "logps/chosen": -28.89226531982422, "logps/rejected": -38.4400634765625, "loss": 0.3818, "rewards/accuracies": 1.0, "rewards/chosen": 0.14333172142505646, "rewards/margins": 1.7097716331481934, "rewards/rejected": -1.5664398670196533, "step": 100 }, { "epoch": 1.1970370370370371, "grad_norm": 19.263820617064404, "learning_rate": 4.963263544229219e-07, "logits/chosen": -0.9433082938194275, "logits/rejected": -0.8548566699028015, "logps/chosen": -31.192596435546875, "logps/rejected": -43.11643600463867, "loss": 0.3602, "rewards/accuracies": 0.875, "rewards/chosen": -0.05815482884645462, "rewards/margins": 2.0110859870910645, "rewards/rejected": -2.0692405700683594, "step": 101 }, { "epoch": 1.208888888888889, "grad_norm": 19.438323567719507, "learning_rate": 4.961009251208367e-07, "logits/chosen": -0.847985565662384, "logits/rejected": -0.7285029292106628, "logps/chosen": -27.112821578979492, "logps/rejected": -36.1579475402832, "loss": 0.3828, "rewards/accuracies": 0.8125, "rewards/chosen": 0.09437136352062225, "rewards/margins": 1.5882363319396973, "rewards/rejected": -1.4938650131225586, "step": 102 }, { "epoch": 1.2207407407407407, "grad_norm": 20.89714425336938, "learning_rate": 4.958688379054535e-07, "logits/chosen": -0.7281448841094971, "logits/rejected": -0.7819719314575195, "logps/chosen": -30.29817771911621, "logps/rejected": -40.67383575439453, "loss": 0.41, "rewards/accuracies": 0.875, "rewards/chosen": -0.1488310694694519, "rewards/margins": 1.0787588357925415, "rewards/rejected": -1.2275900840759277, "step": 103 }, { "epoch": 1.2325925925925927, "grad_norm": 18.51161455832253, "learning_rate": 4.956300990555643e-07, "logits/chosen": -1.1979477405548096, "logits/rejected": -1.2166452407836914, "logps/chosen": -24.593063354492188, "logps/rejected": -36.1388053894043, "loss": 0.3529, "rewards/accuracies": 0.9375, "rewards/chosen": -0.10680893063545227, "rewards/margins": 1.3273341655731201, "rewards/rejected": -1.4341431856155396, "step": 104 }, { "epoch": 1.2444444444444445, "grad_norm": 18.348198834543016, "learning_rate": 4.953847150299118e-07, "logits/chosen": -1.2016291618347168, "logits/rejected": -1.0083853006362915, "logps/chosen": -26.028261184692383, "logps/rejected": -31.78951644897461, "loss": 0.3515, "rewards/accuracies": 0.9375, "rewards/chosen": 0.017123635858297348, "rewards/margins": 1.5143089294433594, "rewards/rejected": -1.497185230255127, "step": 105 }, { "epoch": 1.2562962962962962, "grad_norm": 18.885448615868953, "learning_rate": 4.951326924670147e-07, "logits/chosen": -1.356000304222107, "logits/rejected": -1.3348772525787354, "logps/chosen": -31.302087783813477, "logps/rejected": -37.98714828491211, "loss": 0.3631, "rewards/accuracies": 0.9375, "rewards/chosen": -0.029658418148756027, "rewards/margins": 1.4642913341522217, "rewards/rejected": -1.4939496517181396, "step": 106 }, { "epoch": 1.268148148148148, "grad_norm": 19.10542072499911, "learning_rate": 4.948740381849879e-07, "logits/chosen": -0.999729335308075, "logits/rejected": -0.707848846912384, "logps/chosen": -26.364988327026367, "logps/rejected": -24.70641326904297, "loss": 0.3761, "rewards/accuracies": 0.875, "rewards/chosen": 0.035974204540252686, "rewards/margins": 1.0942362546920776, "rewards/rejected": -1.0582619905471802, "step": 107 }, { "epoch": 1.28, "grad_norm": 20.66299414009225, "learning_rate": 4.94608759181358e-07, "logits/chosen": -1.053645133972168, "logits/rejected": -0.8790953755378723, "logps/chosen": -41.80066680908203, "logps/rejected": -44.09773254394531, "loss": 0.3575, "rewards/accuracies": 0.875, "rewards/chosen": -0.2904718518257141, "rewards/margins": 1.806175708770752, "rewards/rejected": -2.0966477394104004, "step": 108 }, { "epoch": 1.2918518518518518, "grad_norm": 17.54860479077947, "learning_rate": 4.943368626328741e-07, "logits/chosen": -0.9635467529296875, "logits/rejected": -0.7424119710922241, "logps/chosen": -30.89739990234375, "logps/rejected": -41.76382827758789, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 0.05257820338010788, "rewards/margins": 2.4511446952819824, "rewards/rejected": -2.398566484451294, "step": 109 }, { "epoch": 1.3037037037037038, "grad_norm": 18.829241299282575, "learning_rate": 4.940583558953137e-07, "logits/chosen": -0.9188340306282043, "logits/rejected": -1.0607569217681885, "logps/chosen": -22.153308868408203, "logps/rejected": -44.96165466308594, "loss": 0.3263, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0348847322165966, "rewards/margins": 2.250445604324341, "rewards/rejected": -2.285330057144165, "step": 110 }, { "epoch": 1.3155555555555556, "grad_norm": 18.93995233274985, "learning_rate": 4.937732465032838e-07, "logits/chosen": -0.7081223726272583, "logits/rejected": -0.8990048766136169, "logps/chosen": -26.534879684448242, "logps/rejected": -44.0599365234375, "loss": 0.3816, "rewards/accuracies": 0.875, "rewards/chosen": -0.27098196744918823, "rewards/margins": 1.7641065120697021, "rewards/rejected": -2.035088539123535, "step": 111 }, { "epoch": 1.3274074074074074, "grad_norm": 20.79198255104679, "learning_rate": 4.934815421700164e-07, "logits/chosen": -1.372730016708374, "logits/rejected": -1.3816195726394653, "logps/chosen": -24.733922958374023, "logps/rejected": -37.37799835205078, "loss": 0.3793, "rewards/accuracies": 1.0, "rewards/chosen": -0.2344232201576233, "rewards/margins": 1.4653730392456055, "rewards/rejected": -1.6997960805892944, "step": 112 }, { "epoch": 1.3392592592592591, "grad_norm": 17.64638457869075, "learning_rate": 4.93183250787161e-07, "logits/chosen": -0.9247840642929077, "logits/rejected": -1.0818623304367065, "logps/chosen": -28.89555549621582, "logps/rejected": -39.97201919555664, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -0.08911335468292236, "rewards/margins": 0.9882626533508301, "rewards/rejected": -1.0773760080337524, "step": 113 }, { "epoch": 1.3511111111111112, "grad_norm": 19.44771569200443, "learning_rate": 4.928783804245699e-07, "logits/chosen": -0.9975395202636719, "logits/rejected": -0.9134284257888794, "logps/chosen": -31.9511775970459, "logps/rejected": -44.12687683105469, "loss": 0.3613, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28740745782852173, "rewards/margins": 1.532348394393921, "rewards/rejected": -1.8197555541992188, "step": 114 }, { "epoch": 1.362962962962963, "grad_norm": 18.822336901951047, "learning_rate": 4.925669393300807e-07, "logits/chosen": -0.9019233584403992, "logits/rejected": -0.8816910982131958, "logps/chosen": -21.266921997070312, "logps/rejected": -38.15643310546875, "loss": 0.3558, "rewards/accuracies": 1.0, "rewards/chosen": 0.09768679738044739, "rewards/margins": 2.402815103530884, "rewards/rejected": -2.3051280975341797, "step": 115 }, { "epoch": 1.374814814814815, "grad_norm": 18.274649629189174, "learning_rate": 4.922489359292927e-07, "logits/chosen": -1.1270387172698975, "logits/rejected": -1.1085700988769531, "logps/chosen": -27.17832374572754, "logps/rejected": -37.22381591796875, "loss": 0.373, "rewards/accuracies": 0.875, "rewards/chosen": -0.028047073632478714, "rewards/margins": 1.7937216758728027, "rewards/rejected": -1.8217687606811523, "step": 116 }, { "epoch": 1.3866666666666667, "grad_norm": 16.688764844778802, "learning_rate": 4.919243788253393e-07, "logits/chosen": -1.0447142124176025, "logits/rejected": -1.042763590812683, "logps/chosen": -22.125022888183594, "logps/rejected": -39.07143020629883, "loss": 0.2971, "rewards/accuracies": 1.0, "rewards/chosen": -0.03231469914317131, "rewards/margins": 2.012996196746826, "rewards/rejected": -2.0453107357025146, "step": 117 }, { "epoch": 1.3985185185185185, "grad_norm": 18.17869075719721, "learning_rate": 4.915932767986551e-07, "logits/chosen": -1.2121999263763428, "logits/rejected": -1.1531389951705933, "logps/chosen": -24.337661743164062, "logps/rejected": -36.68294143676758, "loss": 0.3297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.051829662173986435, "rewards/margins": 1.766758680343628, "rewards/rejected": -1.8185884952545166, "step": 118 }, { "epoch": 1.4103703703703703, "grad_norm": 19.449877146869795, "learning_rate": 4.912556388067381e-07, "logits/chosen": -0.8913883566856384, "logits/rejected": -0.9280557036399841, "logps/chosen": -24.18404769897461, "logps/rejected": -36.907875061035156, "loss": 0.3403, "rewards/accuracies": 0.9375, "rewards/chosen": -0.250654011964798, "rewards/margins": 1.399787425994873, "rewards/rejected": -1.6504414081573486, "step": 119 }, { "epoch": 1.4222222222222223, "grad_norm": 20.342939608264548, "learning_rate": 4.909114739839079e-07, "logits/chosen": -1.1030035018920898, "logits/rejected": -1.1965720653533936, "logps/chosen": -23.354150772094727, "logps/rejected": -34.74749755859375, "loss": 0.3736, "rewards/accuracies": 0.875, "rewards/chosen": 0.05992694944143295, "rewards/margins": 1.4329164028167725, "rewards/rejected": -1.372989535331726, "step": 120 }, { "epoch": 1.434074074074074, "grad_norm": 15.715115469684475, "learning_rate": 4.90560791641058e-07, "logits/chosen": -0.990738570690155, "logits/rejected": -0.7945737838745117, "logps/chosen": -27.087312698364258, "logps/rejected": -49.8587646484375, "loss": 0.2877, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19137343764305115, "rewards/margins": 3.4916913509368896, "rewards/rejected": -3.3003177642822266, "step": 121 }, { "epoch": 1.445925925925926, "grad_norm": 17.221890947102303, "learning_rate": 4.902036012654048e-07, "logits/chosen": -1.0660780668258667, "logits/rejected": -1.02411687374115, "logps/chosen": -22.874664306640625, "logps/rejected": -33.08205795288086, "loss": 0.3072, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06796490401029587, "rewards/margins": 1.5805720090866089, "rewards/rejected": -1.512607216835022, "step": 122 }, { "epoch": 1.4577777777777778, "grad_norm": 16.005069196000893, "learning_rate": 4.898399125202295e-07, "logits/chosen": -1.1970123052597046, "logits/rejected": -1.1410984992980957, "logps/chosen": -28.59719467163086, "logps/rejected": -42.150535583496094, "loss": 0.2946, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3620311915874481, "rewards/margins": 2.2089571952819824, "rewards/rejected": -2.570988655090332, "step": 123 }, { "epoch": 1.4696296296296296, "grad_norm": 17.717492260773117, "learning_rate": 4.894697352446182e-07, "logits/chosen": -0.9282441735267639, "logits/rejected": -0.9922834038734436, "logps/chosen": -23.3466739654541, "logps/rejected": -40.82002258300781, "loss": 0.3261, "rewards/accuracies": 0.875, "rewards/chosen": 0.03719540312886238, "rewards/margins": 1.4782500267028809, "rewards/rejected": -1.4410545825958252, "step": 124 }, { "epoch": 1.4814814814814814, "grad_norm": 17.258307758545204, "learning_rate": 4.890930794531947e-07, "logits/chosen": -0.8717182278633118, "logits/rejected": -0.5630895495414734, "logps/chosen": -31.558549880981445, "logps/rejected": -39.52922821044922, "loss": 0.3095, "rewards/accuracies": 1.0, "rewards/chosen": -0.1237109825015068, "rewards/margins": 1.8089878559112549, "rewards/rejected": -1.9326988458633423, "step": 125 }, { "epoch": 1.4933333333333334, "grad_norm": 17.23695069915024, "learning_rate": 4.887099553358501e-07, "logits/chosen": -1.0558301210403442, "logits/rejected": -1.1235636472702026, "logps/chosen": -28.143768310546875, "logps/rejected": -45.462162017822266, "loss": 0.2959, "rewards/accuracies": 1.0, "rewards/chosen": -0.057410500943660736, "rewards/margins": 2.5853257179260254, "rewards/rejected": -2.6427361965179443, "step": 126 }, { "epoch": 1.5051851851851852, "grad_norm": 16.801429168404855, "learning_rate": 4.883203732574667e-07, "logits/chosen": -0.9709302186965942, "logits/rejected": -1.0419211387634277, "logps/chosen": -27.448535919189453, "logps/rejected": -36.76887512207031, "loss": 0.301, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22691871225833893, "rewards/margins": 1.6754447221755981, "rewards/rejected": -1.9023634195327759, "step": 127 }, { "epoch": 1.5170370370370372, "grad_norm": 17.682859896395517, "learning_rate": 4.879243437576383e-07, "logits/chosen": -1.0261518955230713, "logits/rejected": -1.2332643270492554, "logps/chosen": -26.014982223510742, "logps/rejected": -44.39478302001953, "loss": 0.3259, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13659992814064026, "rewards/margins": 1.5536094903945923, "rewards/rejected": -1.6902093887329102, "step": 128 }, { "epoch": 1.528888888888889, "grad_norm": 18.640841122291285, "learning_rate": 4.875218775503837e-07, "logits/chosen": -1.0689018964767456, "logits/rejected": -0.980902373790741, "logps/chosen": -24.02052116394043, "logps/rejected": -46.01612854003906, "loss": 0.315, "rewards/accuracies": 0.875, "rewards/chosen": -0.18941253423690796, "rewards/margins": 2.954482078552246, "rewards/rejected": -3.143894672393799, "step": 129 }, { "epoch": 1.5407407407407407, "grad_norm": 17.210019096220396, "learning_rate": 4.871129855238588e-07, "logits/chosen": -0.8552242517471313, "logits/rejected": -0.9273741245269775, "logps/chosen": -25.354354858398438, "logps/rejected": -45.11846160888672, "loss": 0.2987, "rewards/accuracies": 0.875, "rewards/chosen": -0.1260746717453003, "rewards/margins": 2.2974648475646973, "rewards/rejected": -2.423539638519287, "step": 130 }, { "epoch": 1.5525925925925925, "grad_norm": 19.619597361039357, "learning_rate": 4.866976787400601e-07, "logits/chosen": -0.9767991304397583, "logits/rejected": -0.8945188522338867, "logps/chosen": -21.340879440307617, "logps/rejected": -34.753665924072266, "loss": 0.3739, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011123912408947945, "rewards/margins": 1.912041187286377, "rewards/rejected": -1.9231650829315186, "step": 131 }, { "epoch": 1.5644444444444443, "grad_norm": 17.099351733067884, "learning_rate": 4.862759684345269e-07, "logits/chosen": -1.0631641149520874, "logits/rejected": -1.111916422843933, "logps/chosen": -22.13747787475586, "logps/rejected": -41.837890625, "loss": 0.3078, "rewards/accuracies": 0.875, "rewards/chosen": -0.3117489218711853, "rewards/margins": 2.6900596618652344, "rewards/rejected": -3.0018084049224854, "step": 132 }, { "epoch": 1.5762962962962963, "grad_norm": 18.810360092932576, "learning_rate": 4.858478660160363e-07, "logits/chosen": -1.0127646923065186, "logits/rejected": -1.1653410196304321, "logps/chosen": -29.74410057067871, "logps/rejected": -43.37088394165039, "loss": 0.3068, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22267529368400574, "rewards/margins": 2.1531336307525635, "rewards/rejected": -2.3758089542388916, "step": 133 }, { "epoch": 1.5881481481481483, "grad_norm": 16.58595912768427, "learning_rate": 4.854133830662955e-07, "logits/chosen": -0.9233917593955994, "logits/rejected": -1.0519709587097168, "logps/chosen": -21.388568878173828, "logps/rejected": -41.858436584472656, "loss": 0.3253, "rewards/accuracies": 1.0, "rewards/chosen": -0.1711762249469757, "rewards/margins": 1.9797799587249756, "rewards/rejected": -2.150956153869629, "step": 134 }, { "epoch": 1.6, "grad_norm": 18.097856896364952, "learning_rate": 4.849725313396274e-07, "logits/chosen": -1.0138639211654663, "logits/rejected": -1.048040747642517, "logps/chosen": -26.68703842163086, "logps/rejected": -38.373451232910156, "loss": 0.3383, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31828218698501587, "rewards/margins": 1.6841886043548584, "rewards/rejected": -2.0024707317352295, "step": 135 }, { "epoch": 1.6118518518518519, "grad_norm": 18.718740932131244, "learning_rate": 4.845253227626536e-07, "logits/chosen": -1.1698411703109741, "logits/rejected": -0.9726054072380066, "logps/chosen": -31.787506103515625, "logps/rejected": -36.80691146850586, "loss": 0.3202, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5396450757980347, "rewards/margins": 1.1574627161026, "rewards/rejected": -1.6971076726913452, "step": 136 }, { "epoch": 1.6237037037037036, "grad_norm": 22.38512258408975, "learning_rate": 4.84071769433971e-07, "logits/chosen": -1.1697680950164795, "logits/rejected": -0.976157009601593, "logps/chosen": -33.117286682128906, "logps/rejected": -37.835052490234375, "loss": 0.3394, "rewards/accuracies": 0.75, "rewards/chosen": -0.3910491466522217, "rewards/margins": 1.6805509328842163, "rewards/rejected": -2.0715999603271484, "step": 137 }, { "epoch": 1.6355555555555554, "grad_norm": 12.949807947370504, "learning_rate": 4.836118836238252e-07, "logits/chosen": -1.0753389596939087, "logits/rejected": -1.0277228355407715, "logps/chosen": -28.288246154785156, "logps/rejected": -48.917877197265625, "loss": 0.2072, "rewards/accuracies": 1.0, "rewards/chosen": -0.13014695048332214, "rewards/margins": 2.663167953491211, "rewards/rejected": -2.7933151721954346, "step": 138 }, { "epoch": 1.6474074074074074, "grad_norm": 17.386532268496858, "learning_rate": 4.831456777737779e-07, "logits/chosen": -1.0588066577911377, "logits/rejected": -1.2369458675384521, "logps/chosen": -20.888940811157227, "logps/rejected": -32.269954681396484, "loss": 0.2997, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23290792107582092, "rewards/margins": 1.4484302997589111, "rewards/rejected": -1.6813381910324097, "step": 139 }, { "epoch": 1.6592592592592592, "grad_norm": 17.173102470345963, "learning_rate": 4.826731644963704e-07, "logits/chosen": -1.2401896715164185, "logits/rejected": -1.2012938261032104, "logps/chosen": -26.554821014404297, "logps/rejected": -38.8468132019043, "loss": 0.2831, "rewards/accuracies": 0.8125, "rewards/chosen": -0.18287095427513123, "rewards/margins": 1.5197136402130127, "rewards/rejected": -1.7025846242904663, "step": 140 }, { "epoch": 1.6711111111111112, "grad_norm": 19.479729521536182, "learning_rate": 4.82194356574783e-07, "logits/chosen": -0.905685544013977, "logits/rejected": -0.8802829384803772, "logps/chosen": -22.686073303222656, "logps/rejected": -41.08201599121094, "loss": 0.2982, "rewards/accuracies": 0.9375, "rewards/chosen": -0.32921814918518066, "rewards/margins": 2.4480390548706055, "rewards/rejected": -2.7772574424743652, "step": 141 }, { "epoch": 1.682962962962963, "grad_norm": 20.199723821111718, "learning_rate": 4.817092669624882e-07, "logits/chosen": -0.6202248930931091, "logits/rejected": -1.0940805673599243, "logps/chosen": -25.580005645751953, "logps/rejected": -48.21833419799805, "loss": 0.3245, "rewards/accuracies": 0.875, "rewards/chosen": -0.22085842490196228, "rewards/margins": 2.6502928733825684, "rewards/rejected": -2.8711516857147217, "step": 142 }, { "epoch": 1.6948148148148148, "grad_norm": 15.645495852904002, "learning_rate": 4.812179087829012e-07, "logits/chosen": -1.0544133186340332, "logits/rejected": -1.0916544198989868, "logps/chosen": -32.951744079589844, "logps/rejected": -39.71953582763672, "loss": 0.2821, "rewards/accuracies": 1.0, "rewards/chosen": 0.03797334432601929, "rewards/margins": 1.9893945455551147, "rewards/rejected": -1.9514211416244507, "step": 143 }, { "epoch": 1.7066666666666666, "grad_norm": 17.750295917804582, "learning_rate": 4.807202953290243e-07, "logits/chosen": -0.7967438697814941, "logits/rejected": -0.6681925058364868, "logps/chosen": -31.293800354003906, "logps/rejected": -36.21725845336914, "loss": 0.3091, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2945789396762848, "rewards/margins": 1.7707284688949585, "rewards/rejected": -2.0653076171875, "step": 144 }, { "epoch": 1.7185185185185186, "grad_norm": 19.260083896787478, "learning_rate": 4.802164400630872e-07, "logits/chosen": -1.1299808025360107, "logits/rejected": -0.9482597708702087, "logps/chosen": -32.00453567504883, "logps/rejected": -46.12092208862305, "loss": 0.3411, "rewards/accuracies": 1.0, "rewards/chosen": -0.30785343050956726, "rewards/margins": 2.812390089035034, "rewards/rejected": -3.1202433109283447, "step": 145 }, { "epoch": 1.7303703703703703, "grad_norm": 17.66224210217437, "learning_rate": 4.797063566161834e-07, "logits/chosen": -0.6721981763839722, "logits/rejected": -0.8360898494720459, "logps/chosen": -24.348730087280273, "logps/rejected": -43.98560333251953, "loss": 0.2693, "rewards/accuracies": 1.0, "rewards/chosen": 0.0023769522085785866, "rewards/margins": 2.497642993927002, "rewards/rejected": -2.4952659606933594, "step": 146 }, { "epoch": 1.7422222222222223, "grad_norm": 15.415156535878221, "learning_rate": 4.791900587879009e-07, "logits/chosen": -0.756319522857666, "logits/rejected": -0.8637655973434448, "logps/chosen": -23.030933380126953, "logps/rejected": -49.56877899169922, "loss": 0.2918, "rewards/accuracies": 0.9375, "rewards/chosen": -0.04079633206129074, "rewards/margins": 3.7812483310699463, "rewards/rejected": -3.822044849395752, "step": 147 }, { "epoch": 1.7540740740740741, "grad_norm": 16.570189118993856, "learning_rate": 4.786675605459487e-07, "logits/chosen": -0.9925118684768677, "logits/rejected": -0.9853470921516418, "logps/chosen": -30.18797492980957, "logps/rejected": -48.92622756958008, "loss": 0.2735, "rewards/accuracies": 1.0, "rewards/chosen": -0.33780062198638916, "rewards/margins": 2.136337995529175, "rewards/rejected": -2.4741387367248535, "step": 148 }, { "epoch": 1.765925925925926, "grad_norm": 14.487088917255884, "learning_rate": 4.781388760257799e-07, "logits/chosen": -0.7950170040130615, "logits/rejected": -1.119793176651001, "logps/chosen": -32.499168395996094, "logps/rejected": -43.4089469909668, "loss": 0.2295, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3429313600063324, "rewards/margins": 1.9538040161132812, "rewards/rejected": -2.2967352867126465, "step": 149 }, { "epoch": 1.7777777777777777, "grad_norm": 17.295525412903633, "learning_rate": 4.776040195302079e-07, "logits/chosen": -1.0716664791107178, "logits/rejected": -0.7986952066421509, "logps/chosen": -24.969951629638672, "logps/rejected": -40.855533599853516, "loss": 0.2815, "rewards/accuracies": 0.9375, "rewards/chosen": -0.06795130670070648, "rewards/margins": 2.534419298171997, "rewards/rejected": -2.6023707389831543, "step": 150 }, { "epoch": 1.7896296296296297, "grad_norm": 17.536732294132896, "learning_rate": 4.770630055290208e-07, "logits/chosen": -0.9179374575614929, "logits/rejected": -1.0814074277877808, "logps/chosen": -30.687143325805664, "logps/rejected": -50.01094436645508, "loss": 0.2793, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38232100009918213, "rewards/margins": 2.5078325271606445, "rewards/rejected": -2.890153646469116, "step": 151 }, { "epoch": 1.8014814814814815, "grad_norm": 18.46399583316236, "learning_rate": 4.76515848658589e-07, "logits/chosen": -0.9876240491867065, "logits/rejected": -0.6405973434448242, "logps/chosen": -27.701950073242188, "logps/rejected": -41.30625915527344, "loss": 0.3049, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17839840054512024, "rewards/margins": 2.12661075592041, "rewards/rejected": -2.305009126663208, "step": 152 }, { "epoch": 1.8133333333333335, "grad_norm": 15.710604254869798, "learning_rate": 4.759625637214696e-07, "logits/chosen": -1.0109493732452393, "logits/rejected": -1.0782215595245361, "logps/chosen": -24.582748413085938, "logps/rejected": -38.63544464111328, "loss": 0.2643, "rewards/accuracies": 0.75, "rewards/chosen": -0.6045380234718323, "rewards/margins": 2.0814037322998047, "rewards/rejected": -2.6859421730041504, "step": 153 }, { "epoch": 1.8251851851851852, "grad_norm": 17.19159369354583, "learning_rate": 4.754031656860059e-07, "logits/chosen": -0.9032812118530273, "logits/rejected": -0.9155201315879822, "logps/chosen": -26.887935638427734, "logps/rejected": -43.84403610229492, "loss": 0.2882, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3690129816532135, "rewards/margins": 2.0125298500061035, "rewards/rejected": -2.381542921066284, "step": 154 }, { "epoch": 1.837037037037037, "grad_norm": 17.64840405508708, "learning_rate": 4.748376696859226e-07, "logits/chosen": -0.8731366395950317, "logits/rejected": -0.9406657218933105, "logps/chosen": -35.8603401184082, "logps/rejected": -54.6721076965332, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": -0.39435121417045593, "rewards/margins": 2.3957834243774414, "rewards/rejected": -2.7901346683502197, "step": 155 }, { "epoch": 1.8488888888888888, "grad_norm": 16.897806608192592, "learning_rate": 4.74266091019916e-07, "logits/chosen": -1.1001473665237427, "logits/rejected": -1.0638172626495361, "logps/chosen": -32.389549255371094, "logps/rejected": -41.59333801269531, "loss": 0.2627, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47744885087013245, "rewards/margins": 2.048444986343384, "rewards/rejected": -2.5258936882019043, "step": 156 }, { "epoch": 1.8607407407407406, "grad_norm": 16.755773074311023, "learning_rate": 4.7368844515124046e-07, "logits/chosen": -1.2288262844085693, "logits/rejected": -1.0816912651062012, "logps/chosen": -20.01605796813965, "logps/rejected": -37.578102111816406, "loss": 0.2685, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2156611680984497, "rewards/margins": 3.439674139022827, "rewards/rejected": -3.224012851715088, "step": 157 }, { "epoch": 1.8725925925925926, "grad_norm": 18.096404326471813, "learning_rate": 4.7310474770728996e-07, "logits/chosen": -1.2636739015579224, "logits/rejected": -1.2596609592437744, "logps/chosen": -27.145275115966797, "logps/rejected": -46.22590637207031, "loss": 0.3012, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3016894459724426, "rewards/margins": 2.962885856628418, "rewards/rejected": -3.2645750045776367, "step": 158 }, { "epoch": 1.8844444444444446, "grad_norm": 20.583684819002986, "learning_rate": 4.725150144791753e-07, "logits/chosen": -1.1442478895187378, "logits/rejected": -1.1148111820220947, "logps/chosen": -29.054805755615234, "logps/rejected": -35.72052001953125, "loss": 0.3098, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17636916041374207, "rewards/margins": 1.371252179145813, "rewards/rejected": -1.547621488571167, "step": 159 }, { "epoch": 1.8962962962962964, "grad_norm": 19.46902005864018, "learning_rate": 4.719192614212969e-07, "logits/chosen": -0.7011775374412537, "logits/rejected": -1.0104095935821533, "logps/chosen": -26.7205753326416, "logps/rejected": -44.29551696777344, "loss": 0.2899, "rewards/accuracies": 0.9375, "rewards/chosen": -0.46987485885620117, "rewards/margins": 2.1664927005767822, "rewards/rejected": -2.6363675594329834, "step": 160 }, { "epoch": 1.9081481481481481, "grad_norm": 16.802592628002614, "learning_rate": 4.713175046509131e-07, "logits/chosen": -0.9744415283203125, "logits/rejected": -0.8675547242164612, "logps/chosen": -24.74662971496582, "logps/rejected": -43.67926788330078, "loss": 0.2793, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15661165118217468, "rewards/margins": 2.913247585296631, "rewards/rejected": -3.069859266281128, "step": 161 }, { "epoch": 1.92, "grad_norm": 17.67384446887718, "learning_rate": 4.707097604477045e-07, "logits/chosen": -0.8303030729293823, "logits/rejected": -0.8715736865997314, "logps/chosen": -27.890966415405273, "logps/rejected": -43.261348724365234, "loss": 0.2763, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22982636094093323, "rewards/margins": 3.134204149246216, "rewards/rejected": -3.364030361175537, "step": 162 }, { "epoch": 1.9318518518518517, "grad_norm": 19.255369598632385, "learning_rate": 4.700960452533328e-07, "logits/chosen": -0.9890022873878479, "logits/rejected": -0.789630115032196, "logps/chosen": -29.18407440185547, "logps/rejected": -35.658119201660156, "loss": 0.2787, "rewards/accuracies": 0.9375, "rewards/chosen": -0.19787032902240753, "rewards/margins": 2.666562080383301, "rewards/rejected": -2.8644325733184814, "step": 163 }, { "epoch": 1.9437037037037037, "grad_norm": 16.66917060987199, "learning_rate": 4.694763756709967e-07, "logits/chosen": -0.6940379738807678, "logits/rejected": -0.7478067874908447, "logps/chosen": -23.664865493774414, "logps/rejected": -43.157230377197266, "loss": 0.288, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17220884561538696, "rewards/margins": 2.759270668029785, "rewards/rejected": -2.9314796924591064, "step": 164 }, { "epoch": 1.9555555555555557, "grad_norm": 17.594348865094073, "learning_rate": 4.688507684649825e-07, "logits/chosen": -0.8272866010665894, "logits/rejected": -0.8017688989639282, "logps/chosen": -33.46213150024414, "logps/rejected": -47.20207977294922, "loss": 0.301, "rewards/accuracies": 0.875, "rewards/chosen": -0.6943995356559753, "rewards/margins": 2.664170742034912, "rewards/rejected": -3.3585705757141113, "step": 165 }, { "epoch": 1.9674074074074075, "grad_norm": 18.253880938939563, "learning_rate": 4.6821924056021053e-07, "logits/chosen": -1.0482605695724487, "logits/rejected": -0.9928416013717651, "logps/chosen": -31.832792282104492, "logps/rejected": -44.5983772277832, "loss": 0.2999, "rewards/accuracies": 1.0, "rewards/chosen": -0.025602400302886963, "rewards/margins": 2.2407517433166504, "rewards/rejected": -2.2663540840148926, "step": 166 }, { "epoch": 1.9792592592592593, "grad_norm": 18.56196744633324, "learning_rate": 4.6758180904177715e-07, "logits/chosen": -1.0284857749938965, "logits/rejected": -0.9405824542045593, "logps/chosen": -28.0195369720459, "logps/rejected": -47.371124267578125, "loss": 0.3204, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3122991621494293, "rewards/margins": 2.1272220611572266, "rewards/rejected": -2.439521312713623, "step": 167 }, { "epoch": 1.991111111111111, "grad_norm": 13.908988351918019, "learning_rate": 4.669384911544926e-07, "logits/chosen": -0.8129782676696777, "logits/rejected": -0.9865670800209045, "logps/chosen": -22.810033798217773, "logps/rejected": -52.896263122558594, "loss": 0.234, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1582772433757782, "rewards/margins": 3.5472700595855713, "rewards/rejected": -3.705547332763672, "step": 168 }, { "epoch": 2.002962962962963, "grad_norm": 14.484614182012795, "learning_rate": 4.6628930430241495e-07, "logits/chosen": -0.7469329237937927, "logits/rejected": -0.502386212348938, "logps/chosen": -27.284469604492188, "logps/rejected": -38.318477630615234, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 0.07404109835624695, "rewards/margins": 3.234562397003174, "rewards/rejected": -3.1605215072631836, "step": 169 }, { "epoch": 2.0148148148148146, "grad_norm": 10.915796020040675, "learning_rate": 4.6563426604837817e-07, "logits/chosen": -1.315629005432129, "logits/rejected": -1.4473894834518433, "logps/chosen": -26.924976348876953, "logps/rejected": -51.19914627075195, "loss": 0.2154, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18387992680072784, "rewards/margins": 2.322176218032837, "rewards/rejected": -2.5060558319091797, "step": 170 }, { "epoch": 2.026666666666667, "grad_norm": 10.428088727132913, "learning_rate": 4.649733941135183e-07, "logits/chosen": -0.8857893943786621, "logits/rejected": -0.9030505418777466, "logps/chosen": -24.972835540771484, "logps/rejected": -43.09310531616211, "loss": 0.1603, "rewards/accuracies": 1.0, "rewards/chosen": -0.06505993753671646, "rewards/margins": 3.5897836685180664, "rewards/rejected": -3.654843330383301, "step": 171 }, { "epoch": 2.0385185185185186, "grad_norm": 12.12837797479529, "learning_rate": 4.6430670637679294e-07, "logits/chosen": -0.7054718136787415, "logits/rejected": -0.8474084734916687, "logps/chosen": -21.63006019592285, "logps/rejected": -39.9059944152832, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": -0.05613371729850769, "rewards/margins": 2.584840774536133, "rewards/rejected": -2.640974521636963, "step": 172 }, { "epoch": 2.0503703703703704, "grad_norm": 10.668378226356547, "learning_rate": 4.636342208744981e-07, "logits/chosen": -1.0311239957809448, "logits/rejected": -1.260777473449707, "logps/chosen": -23.770797729492188, "logps/rejected": -47.466041564941406, "loss": 0.1668, "rewards/accuracies": 1.0, "rewards/chosen": -0.11173764616250992, "rewards/margins": 3.481583595275879, "rewards/rejected": -3.5933213233947754, "step": 173 }, { "epoch": 2.062222222222222, "grad_norm": 10.557285470526292, "learning_rate": 4.629559557997804e-07, "logits/chosen": -0.9182557463645935, "logits/rejected": -0.9343796968460083, "logps/chosen": -22.01238441467285, "logps/rejected": -42.540916442871094, "loss": 0.1777, "rewards/accuracies": 1.0, "rewards/chosen": -0.1580294966697693, "rewards/margins": 3.1986570358276367, "rewards/rejected": -3.356686592102051, "step": 174 }, { "epoch": 2.074074074074074, "grad_norm": 11.326533415566665, "learning_rate": 4.6227192950214435e-07, "logits/chosen": -1.0387791395187378, "logits/rejected": -0.9052732586860657, "logps/chosen": -27.1229190826416, "logps/rejected": -39.944488525390625, "loss": 0.1671, "rewards/accuracies": 1.0, "rewards/chosen": -0.016452651470899582, "rewards/margins": 2.701723575592041, "rewards/rejected": -2.7181763648986816, "step": 175 }, { "epoch": 2.0859259259259257, "grad_norm": 11.013716006067964, "learning_rate": 4.615821604869563e-07, "logits/chosen": -0.6593332290649414, "logits/rejected": -0.6703491806983948, "logps/chosen": -30.929479598999023, "logps/rejected": -53.678165435791016, "loss": 0.1575, "rewards/accuracies": 1.0, "rewards/chosen": -0.2048792690038681, "rewards/margins": 3.190211296081543, "rewards/rejected": -3.3950905799865723, "step": 176 }, { "epoch": 2.097777777777778, "grad_norm": 9.522443462876495, "learning_rate": 4.6088666741494384e-07, "logits/chosen": -1.015365719795227, "logits/rejected": -1.0170753002166748, "logps/chosen": -32.784976959228516, "logps/rejected": -62.65760040283203, "loss": 0.1466, "rewards/accuracies": 1.0, "rewards/chosen": -0.45717763900756836, "rewards/margins": 4.587207317352295, "rewards/rejected": -5.044384956359863, "step": 177 }, { "epoch": 2.1096296296296297, "grad_norm": 10.004257598897565, "learning_rate": 4.6018546910169067e-07, "logits/chosen": -0.7767499685287476, "logits/rejected": -0.8015838861465454, "logps/chosen": -30.55481719970703, "logps/rejected": -54.31606674194336, "loss": 0.1471, "rewards/accuracies": 1.0, "rewards/chosen": -0.2950357496738434, "rewards/margins": 3.3505477905273438, "rewards/rejected": -3.645583391189575, "step": 178 }, { "epoch": 2.1214814814814815, "grad_norm": 10.418887738265333, "learning_rate": 4.5947858451712773e-07, "logits/chosen": -1.1200653314590454, "logits/rejected": -1.180110216140747, "logps/chosen": -29.036422729492188, "logps/rejected": -50.913429260253906, "loss": 0.1772, "rewards/accuracies": 1.0, "rewards/chosen": 0.10382096469402313, "rewards/margins": 3.6935832500457764, "rewards/rejected": -3.5897626876831055, "step": 179 }, { "epoch": 2.1333333333333333, "grad_norm": 10.311681664393884, "learning_rate": 4.5876603278502027e-07, "logits/chosen": -0.6714676022529602, "logits/rejected": -0.9245094060897827, "logps/chosen": -24.886255264282227, "logps/rejected": -59.40272521972656, "loss": 0.1725, "rewards/accuracies": 1.0, "rewards/chosen": -0.18294279277324677, "rewards/margins": 3.529733180999756, "rewards/rejected": -3.7126760482788086, "step": 180 }, { "epoch": 2.145185185185185, "grad_norm": 10.598338326995862, "learning_rate": 4.580478331824498e-07, "logits/chosen": -0.9760642051696777, "logits/rejected": -1.0019596815109253, "logps/chosen": -20.441396713256836, "logps/rejected": -28.71832275390625, "loss": 0.1925, "rewards/accuracies": 1.0, "rewards/chosen": 0.18639449775218964, "rewards/margins": 2.0053529739379883, "rewards/rejected": -1.8189586400985718, "step": 181 }, { "epoch": 2.157037037037037, "grad_norm": 10.273866806923419, "learning_rate": 4.573240051392935e-07, "logits/chosen": -0.9190815687179565, "logits/rejected": -0.810968279838562, "logps/chosen": -31.995441436767578, "logps/rejected": -47.2755012512207, "loss": 0.1488, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1086086630821228, "rewards/margins": 2.7792186737060547, "rewards/rejected": -2.8878276348114014, "step": 182 }, { "epoch": 2.168888888888889, "grad_norm": 10.512121243888513, "learning_rate": 4.565945682376977e-07, "logits/chosen": -0.9659938216209412, "logits/rejected": -0.9679660797119141, "logps/chosen": -26.895111083984375, "logps/rejected": -43.16779708862305, "loss": 0.1553, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0222741961479187, "rewards/margins": 2.5385921001434326, "rewards/rejected": -2.5608668327331543, "step": 183 }, { "epoch": 2.180740740740741, "grad_norm": 11.292043059472906, "learning_rate": 4.5585954221154853e-07, "logits/chosen": -0.9343494176864624, "logits/rejected": -1.0062519311904907, "logps/chosen": -22.46919822692871, "logps/rejected": -44.482078552246094, "loss": 0.1689, "rewards/accuracies": 1.0, "rewards/chosen": -0.1603562980890274, "rewards/margins": 3.266386032104492, "rewards/rejected": -3.4267418384552, "step": 184 }, { "epoch": 2.1925925925925926, "grad_norm": 11.816672083489024, "learning_rate": 4.551189469459382e-07, "logits/chosen": -0.786745011806488, "logits/rejected": -0.6139059066772461, "logps/chosen": -29.019519805908203, "logps/rejected": -42.548431396484375, "loss": 0.1738, "rewards/accuracies": 1.0, "rewards/chosen": -0.33853879570961, "rewards/margins": 2.533534288406372, "rewards/rejected": -2.87207293510437, "step": 185 }, { "epoch": 2.2044444444444444, "grad_norm": 11.1270432752319, "learning_rate": 4.5437280247662646e-07, "logits/chosen": -1.0432802438735962, "logits/rejected": -0.9425604939460754, "logps/chosen": -32.64457702636719, "logps/rejected": -47.38275909423828, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": -0.1137063056230545, "rewards/margins": 2.688105583190918, "rewards/rejected": -2.801811933517456, "step": 186 }, { "epoch": 2.216296296296296, "grad_norm": 9.879497457215198, "learning_rate": 4.5362112898949947e-07, "logits/chosen": -1.1339491605758667, "logits/rejected": -0.8624970316886902, "logps/chosen": -29.631816864013672, "logps/rejected": -46.224395751953125, "loss": 0.155, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18955475091934204, "rewards/margins": 3.8356757164001465, "rewards/rejected": -3.64612078666687, "step": 187 }, { "epoch": 2.228148148148148, "grad_norm": 11.35395445567028, "learning_rate": 4.528639468200226e-07, "logits/chosen": -1.0913629531860352, "logits/rejected": -1.0643333196640015, "logps/chosen": -26.124116897583008, "logps/rejected": -44.2266845703125, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": -0.25467032194137573, "rewards/margins": 3.40421724319458, "rewards/rejected": -3.6588873863220215, "step": 188 }, { "epoch": 2.24, "grad_norm": 10.798532598173606, "learning_rate": 4.5210127645269125e-07, "logits/chosen": -0.8021432757377625, "logits/rejected": -0.8973668813705444, "logps/chosen": -24.38117790222168, "logps/rejected": -44.69050598144531, "loss": 0.1403, "rewards/accuracies": 1.0, "rewards/chosen": -0.16808071732521057, "rewards/margins": 3.162228584289551, "rewards/rejected": -3.3303093910217285, "step": 189 }, { "epoch": 2.251851851851852, "grad_norm": 11.00778435675744, "learning_rate": 4.5133313852047613e-07, "logits/chosen": -0.7498683929443359, "logits/rejected": -0.8889190554618835, "logps/chosen": -24.70016098022461, "logps/rejected": -40.018638610839844, "loss": 0.1443, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23445232212543488, "rewards/margins": 2.9690420627593994, "rewards/rejected": -3.203494071960449, "step": 190 }, { "epoch": 2.2637037037037038, "grad_norm": 9.650405141066392, "learning_rate": 4.5055955380426514e-07, "logits/chosen": -1.0229731798171997, "logits/rejected": -1.1155050992965698, "logps/chosen": -26.343704223632812, "logps/rejected": -41.02541732788086, "loss": 0.1544, "rewards/accuracies": 1.0, "rewards/chosen": 0.05786347761750221, "rewards/margins": 3.1654956340789795, "rewards/rejected": -3.1076321601867676, "step": 191 }, { "epoch": 2.2755555555555556, "grad_norm": 9.921102833174835, "learning_rate": 4.4978054323230144e-07, "logits/chosen": -1.1550260782241821, "logits/rejected": -1.0600630044937134, "logps/chosen": -28.518699645996094, "logps/rejected": -44.826168060302734, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": -0.3105316162109375, "rewards/margins": 4.2448906898498535, "rewards/rejected": -4.555422306060791, "step": 192 }, { "epoch": 2.2874074074074073, "grad_norm": 9.450635790377559, "learning_rate": 4.489961278796167e-07, "logits/chosen": -1.0403023958206177, "logits/rejected": -1.0114576816558838, "logps/chosen": -38.044036865234375, "logps/rejected": -50.515472412109375, "loss": 0.1563, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5873116254806519, "rewards/margins": 3.311967134475708, "rewards/rejected": -3.8992786407470703, "step": 193 }, { "epoch": 2.299259259259259, "grad_norm": 9.85110506830502, "learning_rate": 4.482063289674618e-07, "logits/chosen": -0.8661289215087891, "logits/rejected": -0.8539247512817383, "logps/chosen": -25.6925048828125, "logps/rejected": -40.71975326538086, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 0.09132147580385208, "rewards/margins": 3.0964341163635254, "rewards/rejected": -3.005112648010254, "step": 194 }, { "epoch": 2.311111111111111, "grad_norm": 9.57230358742824, "learning_rate": 4.4741116786273176e-07, "logits/chosen": -0.9691765308380127, "logits/rejected": -1.184732437133789, "logps/chosen": -26.444503784179688, "logps/rejected": -45.394161224365234, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 0.04939919337630272, "rewards/margins": 3.6279492378234863, "rewards/rejected": -3.578549861907959, "step": 195 }, { "epoch": 2.322962962962963, "grad_norm": 10.264213642082447, "learning_rate": 4.466106660773884e-07, "logits/chosen": -0.8523711562156677, "logits/rejected": -0.7561182379722595, "logps/chosen": -29.33661651611328, "logps/rejected": -43.26116180419922, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": -0.02262909710407257, "rewards/margins": 3.317958354949951, "rewards/rejected": -3.3405871391296387, "step": 196 }, { "epoch": 2.334814814814815, "grad_norm": 10.508283984586193, "learning_rate": 4.4580484526787807e-07, "logits/chosen": -0.7363325953483582, "logits/rejected": -0.7178278565406799, "logps/chosen": -24.33274269104004, "logps/rejected": -34.1280403137207, "loss": 0.1573, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02388627827167511, "rewards/margins": 2.7086284160614014, "rewards/rejected": -2.6847422122955322, "step": 197 }, { "epoch": 2.3466666666666667, "grad_norm": 9.573377777372183, "learning_rate": 4.44993727234546e-07, "logits/chosen": -0.8255077004432678, "logits/rejected": -0.7383131980895996, "logps/chosen": -20.382017135620117, "logps/rejected": -36.610069274902344, "loss": 0.1506, "rewards/accuracies": 1.0, "rewards/chosen": 0.15329806506633759, "rewards/margins": 2.9509220123291016, "rewards/rejected": -2.797624111175537, "step": 198 }, { "epoch": 2.3585185185185185, "grad_norm": 9.188058087289072, "learning_rate": 4.4417733392104585e-07, "logits/chosen": -0.9828134775161743, "logits/rejected": -0.9461156129837036, "logps/chosen": -29.49842071533203, "logps/rejected": -44.92794418334961, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": -0.010838674381375313, "rewards/margins": 2.9451708793640137, "rewards/rejected": -2.95600962638855, "step": 199 }, { "epoch": 2.3703703703703702, "grad_norm": 9.995938278634934, "learning_rate": 4.4335568741374695e-07, "logits/chosen": -1.1138256788253784, "logits/rejected": -0.9974204301834106, "logps/chosen": -27.004098892211914, "logps/rejected": -46.76878356933594, "loss": 0.1287, "rewards/accuracies": 1.0, "rewards/chosen": 0.01464901864528656, "rewards/margins": 3.5402395725250244, "rewards/rejected": -3.525590181350708, "step": 200 }, { "epoch": 2.3822222222222225, "grad_norm": 9.269425170104977, "learning_rate": 4.425288099411364e-07, "logits/chosen": -0.9596495628356934, "logits/rejected": -0.781050443649292, "logps/chosen": -36.49948501586914, "logps/rejected": -46.57865524291992, "loss": 0.116, "rewards/accuracies": 1.0, "rewards/chosen": -0.34845054149627686, "rewards/margins": 3.1367316246032715, "rewards/rejected": -3.485182285308838, "step": 201 }, { "epoch": 2.3940740740740742, "grad_norm": 8.391778679248985, "learning_rate": 4.4169672387321735e-07, "logits/chosen": -0.9714689254760742, "logits/rejected": -1.1357210874557495, "logps/chosen": -26.742515563964844, "logps/rejected": -48.0218620300293, "loss": 0.1191, "rewards/accuracies": 1.0, "rewards/chosen": 0.12224035710096359, "rewards/margins": 3.619513511657715, "rewards/rejected": -3.4972729682922363, "step": 202 }, { "epoch": 2.405925925925926, "grad_norm": 9.299446663078193, "learning_rate": 4.408594517209045e-07, "logits/chosen": -1.0814424753189087, "logits/rejected": -1.1163935661315918, "logps/chosen": -27.21017837524414, "logps/rejected": -46.6378288269043, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": -0.4369148015975952, "rewards/margins": 3.762479066848755, "rewards/rejected": -4.1993937492370605, "step": 203 }, { "epoch": 2.417777777777778, "grad_norm": 9.608908773757774, "learning_rate": 4.4001701613541454e-07, "logits/chosen": -1.0588593482971191, "logits/rejected": -0.7137413024902344, "logps/chosen": -29.38337516784668, "logps/rejected": -40.47539520263672, "loss": 0.1348, "rewards/accuracies": 1.0, "rewards/chosen": 0.1216193437576294, "rewards/margins": 3.52174711227417, "rewards/rejected": -3.400127649307251, "step": 204 }, { "epoch": 2.4296296296296296, "grad_norm": 9.76537745467198, "learning_rate": 4.391694399076536e-07, "logits/chosen": -1.0141160488128662, "logits/rejected": -1.006161093711853, "logps/chosen": -18.852293014526367, "logps/rejected": -47.88473129272461, "loss": 0.1292, "rewards/accuracies": 1.0, "rewards/chosen": 0.040902793407440186, "rewards/margins": 4.4104509353637695, "rewards/rejected": -4.3695478439331055, "step": 205 }, { "epoch": 2.4414814814814814, "grad_norm": 11.151794344568627, "learning_rate": 4.383167459676008e-07, "logits/chosen": -1.2655813694000244, "logits/rejected": -1.3579968214035034, "logps/chosen": -36.24526596069336, "logps/rejected": -64.4418716430664, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": -0.33499276638031006, "rewards/margins": 4.198886394500732, "rewards/rejected": -4.533878803253174, "step": 206 }, { "epoch": 2.453333333333333, "grad_norm": 9.503514745890126, "learning_rate": 4.374589573836874e-07, "logits/chosen": -0.9888389706611633, "logits/rejected": -0.9190107583999634, "logps/chosen": -25.66510772705078, "logps/rejected": -48.77112579345703, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": -0.2253144383430481, "rewards/margins": 3.847475051879883, "rewards/rejected": -4.072790145874023, "step": 207 }, { "epoch": 2.4651851851851854, "grad_norm": 10.350601972964697, "learning_rate": 4.365960973621734e-07, "logits/chosen": -0.8265293836593628, "logits/rejected": -1.0150339603424072, "logps/chosen": -24.15806770324707, "logps/rejected": -58.613380432128906, "loss": 0.1432, "rewards/accuracies": 1.0, "rewards/chosen": -0.30442744493484497, "rewards/margins": 4.33758544921875, "rewards/rejected": -4.642012596130371, "step": 208 }, { "epoch": 2.477037037037037, "grad_norm": 8.643760874362815, "learning_rate": 4.357281892465191e-07, "logits/chosen": -0.9911346435546875, "logits/rejected": -1.0368643999099731, "logps/chosen": -26.939849853515625, "logps/rejected": -56.80491638183594, "loss": 0.119, "rewards/accuracies": 1.0, "rewards/chosen": -0.23226478695869446, "rewards/margins": 3.6605029106140137, "rewards/rejected": -3.892767906188965, "step": 209 }, { "epoch": 2.488888888888889, "grad_norm": 8.784181941966711, "learning_rate": 4.348552565167542e-07, "logits/chosen": -0.926344633102417, "logits/rejected": -0.8433751463890076, "logps/chosen": -28.94524574279785, "logps/rejected": -44.438072204589844, "loss": 0.1268, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3782831132411957, "rewards/margins": 3.0210046768188477, "rewards/rejected": -3.3992879390716553, "step": 210 }, { "epoch": 2.5007407407407407, "grad_norm": 9.511335480740543, "learning_rate": 4.3397732278884194e-07, "logits/chosen": -0.6906044483184814, "logits/rejected": -0.7649115324020386, "logps/chosen": -33.39212417602539, "logps/rejected": -48.415008544921875, "loss": 0.1236, "rewards/accuracies": 1.0, "rewards/chosen": -0.3532525897026062, "rewards/margins": 2.9834935665130615, "rewards/rejected": -3.3367464542388916, "step": 211 }, { "epoch": 2.5125925925925925, "grad_norm": 9.170534986447953, "learning_rate": 4.330944118140406e-07, "logits/chosen": -1.1332037448883057, "logits/rejected": -1.249849796295166, "logps/chosen": -23.844146728515625, "logps/rejected": -46.68682098388672, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": -0.03869599476456642, "rewards/margins": 3.55596923828125, "rewards/rejected": -3.59466552734375, "step": 212 }, { "epoch": 2.5244444444444447, "grad_norm": 10.570737637064244, "learning_rate": 4.322065474782609e-07, "logits/chosen": -0.7816108465194702, "logits/rejected": -0.8124703168869019, "logps/chosen": -27.124149322509766, "logps/rejected": -50.91386413574219, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": -0.2354280799627304, "rewards/margins": 3.9667367935180664, "rewards/rejected": -4.202165126800537, "step": 213 }, { "epoch": 2.536296296296296, "grad_norm": 12.000930534607912, "learning_rate": 4.313137538014198e-07, "logits/chosen": -1.0383765697479248, "logits/rejected": -1.054479956626892, "logps/chosen": -20.81098747253418, "logps/rejected": -40.86846923828125, "loss": 0.1474, "rewards/accuracies": 1.0, "rewards/chosen": -0.17991256713867188, "rewards/margins": 2.990743637084961, "rewards/rejected": -3.170656442642212, "step": 214 }, { "epoch": 2.5481481481481483, "grad_norm": 9.377408723940578, "learning_rate": 4.304160549367906e-07, "logits/chosen": -1.2460458278656006, "logits/rejected": -1.0874885320663452, "logps/chosen": -25.634910583496094, "logps/rejected": -35.86411666870117, "loss": 0.1129, "rewards/accuracies": 1.0, "rewards/chosen": -0.2946632504463196, "rewards/margins": 3.1012935638427734, "rewards/rejected": -3.3959569931030273, "step": 215 }, { "epoch": 2.56, "grad_norm": 11.87094854333206, "learning_rate": 4.295134751703492e-07, "logits/chosen": -0.928742527961731, "logits/rejected": -0.9784969091415405, "logps/chosen": -24.898561477661133, "logps/rejected": -51.34593963623047, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": -0.3665264844894409, "rewards/margins": 4.217537879943848, "rewards/rejected": -4.58406400680542, "step": 216 }, { "epoch": 2.571851851851852, "grad_norm": 9.07689060873071, "learning_rate": 4.28606038920118e-07, "logits/chosen": -0.8964906930923462, "logits/rejected": -0.8997987508773804, "logps/chosen": -22.863473892211914, "logps/rejected": -43.20418930053711, "loss": 0.1211, "rewards/accuracies": 1.0, "rewards/chosen": -0.36831557750701904, "rewards/margins": 3.239995002746582, "rewards/rejected": -3.6083106994628906, "step": 217 }, { "epoch": 2.5837037037037036, "grad_norm": 10.096635066622941, "learning_rate": 4.276937707355044e-07, "logits/chosen": -0.6937326192855835, "logits/rejected": -0.7635350823402405, "logps/chosen": -31.77448272705078, "logps/rejected": -58.26279830932617, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": -0.28910571336746216, "rewards/margins": 4.707897186279297, "rewards/rejected": -4.997003078460693, "step": 218 }, { "epoch": 2.5955555555555554, "grad_norm": 8.283171585551857, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -1.1899882555007935, "logits/rejected": -1.0946217775344849, "logps/chosen": -27.072429656982422, "logps/rejected": -38.691810607910156, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": -0.14001135528087616, "rewards/margins": 2.991671562194824, "rewards/rejected": -3.131682872772217, "step": 219 }, { "epoch": 2.6074074074074076, "grad_norm": 7.992901648965998, "learning_rate": 4.2585483741369755e-07, "logits/chosen": -0.7397277355194092, "logits/rejected": -0.9631584882736206, "logps/chosen": -23.084850311279297, "logps/rejected": -41.13532257080078, "loss": 0.1159, "rewards/accuracies": 1.0, "rewards/chosen": -0.06614308059215546, "rewards/margins": 2.895981788635254, "rewards/rejected": -2.962125301361084, "step": 220 }, { "epoch": 2.6192592592592594, "grad_norm": 11.310778875512614, "learning_rate": 4.2492822202625065e-07, "logits/chosen": -0.9721293449401855, "logits/rejected": -1.079483151435852, "logps/chosen": -21.184463500976562, "logps/rejected": -46.17851638793945, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": -0.06296464800834656, "rewards/margins": 4.013933181762695, "rewards/rejected": -4.076898097991943, "step": 221 }, { "epoch": 2.631111111111111, "grad_norm": 8.569036120090367, "learning_rate": 4.239968742025684e-07, "logits/chosen": -1.535069227218628, "logits/rejected": -1.5164189338684082, "logps/chosen": -23.31195640563965, "logps/rejected": -41.743492126464844, "loss": 0.1046, "rewards/accuracies": 1.0, "rewards/chosen": -0.2246365249156952, "rewards/margins": 3.2981185913085938, "rewards/rejected": -3.5227551460266113, "step": 222 }, { "epoch": 2.642962962962963, "grad_norm": 9.763643273504174, "learning_rate": 4.2306081913895177e-07, "logits/chosen": -0.8104032278060913, "logits/rejected": -0.7292832732200623, "logps/chosen": -26.804920196533203, "logps/rejected": -33.72347640991211, "loss": 0.1207, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5029786229133606, "rewards/margins": 2.10044527053833, "rewards/rejected": -2.603423833847046, "step": 223 }, { "epoch": 2.6548148148148147, "grad_norm": 8.110012154364268, "learning_rate": 4.2212008215905e-07, "logits/chosen": -0.9389030933380127, "logits/rejected": -1.1046040058135986, "logps/chosen": -38.54084014892578, "logps/rejected": -60.23002243041992, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": -0.8747321367263794, "rewards/margins": 4.18320369720459, "rewards/rejected": -5.05793571472168, "step": 224 }, { "epoch": 2.6666666666666665, "grad_norm": 7.651104704560613, "learning_rate": 4.2117468871317465e-07, "logits/chosen": -0.6991132497787476, "logits/rejected": -1.011946678161621, "logps/chosen": -25.952993392944336, "logps/rejected": -59.70689392089844, "loss": 0.0994, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3089551329612732, "rewards/margins": 4.471611499786377, "rewards/rejected": -4.780567169189453, "step": 225 }, { "epoch": 2.6785185185185183, "grad_norm": 8.662418287718898, "learning_rate": 4.2022466437761154e-07, "logits/chosen": -0.7546372413635254, "logits/rejected": -0.9424067735671997, "logps/chosen": -23.357009887695312, "logps/rejected": -48.18181610107422, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": -0.04475121945142746, "rewards/margins": 3.4985172748565674, "rewards/rejected": -3.5432686805725098, "step": 226 }, { "epoch": 2.6903703703703705, "grad_norm": 9.933414328601483, "learning_rate": 4.1927003485392873e-07, "logits/chosen": -0.9893782734870911, "logits/rejected": -0.8852970004081726, "logps/chosen": -29.632408142089844, "logps/rejected": -43.87162399291992, "loss": 0.1189, "rewards/accuracies": 1.0, "rewards/chosen": -0.17188510298728943, "rewards/margins": 3.24285626411438, "rewards/rejected": -3.414741039276123, "step": 227 }, { "epoch": 2.7022222222222223, "grad_norm": 9.996913313576108, "learning_rate": 4.18310825968281e-07, "logits/chosen": -0.6748302578926086, "logits/rejected": -0.5683417320251465, "logps/chosen": -29.551555633544922, "logps/rejected": -41.79899978637695, "loss": 0.1211, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5972310900688171, "rewards/margins": 2.882406711578369, "rewards/rejected": -3.47963809967041, "step": 228 }, { "epoch": 2.714074074074074, "grad_norm": 9.705460406332598, "learning_rate": 4.173470636707115e-07, "logits/chosen": -0.8065865635871887, "logits/rejected": -0.6933514475822449, "logps/chosen": -28.950008392333984, "logps/rejected": -50.10881042480469, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": -0.5557261109352112, "rewards/margins": 3.8404664993286133, "rewards/rejected": -4.396193027496338, "step": 229 }, { "epoch": 2.725925925925926, "grad_norm": 10.109073403000078, "learning_rate": 4.1637877403444923e-07, "logits/chosen": -1.1754989624023438, "logits/rejected": -1.002323031425476, "logps/chosen": -35.7281608581543, "logps/rejected": -55.17790222167969, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": -0.6030625104904175, "rewards/margins": 4.27664852142334, "rewards/rejected": -4.879711151123047, "step": 230 }, { "epoch": 2.7377777777777776, "grad_norm": 8.645925075071077, "learning_rate": 4.1540598325520406e-07, "logits/chosen": -1.399937391281128, "logits/rejected": -1.1283848285675049, "logps/chosen": -24.478912353515625, "logps/rejected": -35.189491271972656, "loss": 0.1134, "rewards/accuracies": 1.0, "rewards/chosen": -0.45019960403442383, "rewards/margins": 3.054943561553955, "rewards/rejected": -3.505143165588379, "step": 231 }, { "epoch": 2.74962962962963, "grad_norm": 10.429252502571046, "learning_rate": 4.144287176504582e-07, "logits/chosen": -0.7452750205993652, "logits/rejected": -0.5704357028007507, "logps/chosen": -36.41873550415039, "logps/rejected": -53.760765075683594, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": -0.666602611541748, "rewards/margins": 3.7374300956726074, "rewards/rejected": -4.4040327072143555, "step": 232 }, { "epoch": 2.7614814814814816, "grad_norm": 10.151446493994069, "learning_rate": 4.1344700365875353e-07, "logits/chosen": -1.1818780899047852, "logits/rejected": -1.3208847045898438, "logps/chosen": -22.36372184753418, "logps/rejected": -48.73514938354492, "loss": 0.1381, "rewards/accuracies": 1.0, "rewards/chosen": -0.4263989329338074, "rewards/margins": 3.8192248344421387, "rewards/rejected": -4.245623588562012, "step": 233 }, { "epoch": 2.7733333333333334, "grad_norm": 8.923699196652041, "learning_rate": 4.1246086783897713e-07, "logits/chosen": -1.076073169708252, "logits/rejected": -0.8665668964385986, "logps/chosen": -29.229549407958984, "logps/rejected": -52.872596740722656, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": -0.6716349124908447, "rewards/margins": 4.262983322143555, "rewards/rejected": -4.934618949890137, "step": 234 }, { "epoch": 2.785185185185185, "grad_norm": 9.216060394380802, "learning_rate": 4.1147033686964213e-07, "logits/chosen": -0.9747135043144226, "logits/rejected": -0.8782777190208435, "logps/chosen": -21.773653030395508, "logps/rejected": -36.27745056152344, "loss": 0.115, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3823280334472656, "rewards/margins": 3.0221927165985107, "rewards/rejected": -3.4045207500457764, "step": 235 }, { "epoch": 2.797037037037037, "grad_norm": 10.290543608918464, "learning_rate": 4.104754375481664e-07, "logits/chosen": -1.1765553951263428, "logits/rejected": -1.2661182880401611, "logps/chosen": -31.247966766357422, "logps/rejected": -58.30694580078125, "loss": 0.1424, "rewards/accuracies": 1.0, "rewards/chosen": -0.6638414263725281, "rewards/margins": 5.012779235839844, "rewards/rejected": -5.6766204833984375, "step": 236 }, { "epoch": 2.8088888888888888, "grad_norm": 9.054189170474283, "learning_rate": 4.0947619679014733e-07, "logits/chosen": -0.7130009531974792, "logits/rejected": -0.9547990560531616, "logps/chosen": -33.134037017822266, "logps/rejected": -51.69905471801758, "loss": 0.1251, "rewards/accuracies": 1.0, "rewards/chosen": -0.8230234980583191, "rewards/margins": 3.9294910430908203, "rewards/rejected": -4.752514362335205, "step": 237 }, { "epoch": 2.8207407407407405, "grad_norm": 7.909168568867212, "learning_rate": 4.084726416286337e-07, "logits/chosen": -0.9382051825523376, "logits/rejected": -0.887077808380127, "logps/chosen": -27.565048217773438, "logps/rejected": -46.6566047668457, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": -0.8664588928222656, "rewards/margins": 3.4661688804626465, "rewards/rejected": -4.332627773284912, "step": 238 }, { "epoch": 2.8325925925925928, "grad_norm": 11.86775329162737, "learning_rate": 4.0746479921339456e-07, "logits/chosen": -1.0731661319732666, "logits/rejected": -0.9709546566009521, "logps/chosen": -42.645668029785156, "logps/rejected": -56.481353759765625, "loss": 0.126, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8624676465988159, "rewards/margins": 3.5984742641448975, "rewards/rejected": -4.460941791534424, "step": 239 }, { "epoch": 2.8444444444444446, "grad_norm": 8.349623438951571, "learning_rate": 4.0645269681018434e-07, "logits/chosen": -0.8875783681869507, "logits/rejected": -0.7977613210678101, "logps/chosen": -32.80491638183594, "logps/rejected": -38.476165771484375, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": -0.5840538740158081, "rewards/margins": 2.323394298553467, "rewards/rejected": -2.9074482917785645, "step": 240 }, { "epoch": 2.8562962962962963, "grad_norm": 9.486189032436748, "learning_rate": 4.054363618000057e-07, "logits/chosen": -0.8179698586463928, "logits/rejected": -0.8795968294143677, "logps/chosen": -34.11067199707031, "logps/rejected": -53.53862762451172, "loss": 0.1261, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3519166111946106, "rewards/margins": 4.845457553863525, "rewards/rejected": -5.19737434387207, "step": 241 }, { "epoch": 2.868148148148148, "grad_norm": 10.099080966219972, "learning_rate": 4.044158216783684e-07, "logits/chosen": -1.0476202964782715, "logits/rejected": -1.0129348039627075, "logps/chosen": -31.891279220581055, "logps/rejected": -46.14010238647461, "loss": 0.1114, "rewards/accuracies": 1.0, "rewards/chosen": -0.42091822624206543, "rewards/margins": 3.888162851333618, "rewards/rejected": -4.309080600738525, "step": 242 }, { "epoch": 2.88, "grad_norm": 10.049719054947364, "learning_rate": 4.033911040545453e-07, "logits/chosen": -1.2477302551269531, "logits/rejected": -0.8980987071990967, "logps/chosen": -41.76299285888672, "logps/rejected": -53.11893081665039, "loss": 0.109, "rewards/accuracies": 1.0, "rewards/chosen": -0.1314184069633484, "rewards/margins": 4.016972064971924, "rewards/rejected": -4.148390769958496, "step": 243 }, { "epoch": 2.891851851851852, "grad_norm": 9.535753969791891, "learning_rate": 4.0236223665082605e-07, "logits/chosen": -1.010503888130188, "logits/rejected": -0.7471677660942078, "logps/chosen": -34.18782043457031, "logps/rejected": -46.442893981933594, "loss": 0.1214, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2552086412906647, "rewards/margins": 3.847254753112793, "rewards/rejected": -4.102463722229004, "step": 244 }, { "epoch": 2.9037037037037035, "grad_norm": 10.85723484411409, "learning_rate": 4.0132924730176653e-07, "logits/chosen": -1.1365649700164795, "logits/rejected": -0.9161389470100403, "logps/chosen": -24.016315460205078, "logps/rejected": -46.270179748535156, "loss": 0.1335, "rewards/accuracies": 0.9375, "rewards/chosen": -0.20293915271759033, "rewards/margins": 4.1694440841674805, "rewards/rejected": -4.372383117675781, "step": 245 }, { "epoch": 2.9155555555555557, "grad_norm": 8.982083345550631, "learning_rate": 4.0029216395343617e-07, "logits/chosen": -1.1369932889938354, "logits/rejected": -1.1437650918960571, "logps/chosen": -35.65043640136719, "logps/rejected": -61.12848663330078, "loss": 0.1136, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9092841744422913, "rewards/margins": 4.529256343841553, "rewards/rejected": -5.438540458679199, "step": 246 }, { "epoch": 2.9274074074074075, "grad_norm": 8.744966472645434, "learning_rate": 3.992510146626617e-07, "logits/chosen": -0.8251878023147583, "logits/rejected": -0.6449207663536072, "logps/chosen": -33.99653625488281, "logps/rejected": -56.43595886230469, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": -0.8442633152008057, "rewards/margins": 4.416510581970215, "rewards/rejected": -5.2607741355896, "step": 247 }, { "epoch": 2.9392592592592592, "grad_norm": 10.793634421067168, "learning_rate": 3.982058275962682e-07, "logits/chosen": -1.014967918395996, "logits/rejected": -0.7875441312789917, "logps/chosen": -29.53582000732422, "logps/rejected": -38.50312423706055, "loss": 0.1438, "rewards/accuracies": 0.875, "rewards/chosen": -0.7254617214202881, "rewards/margins": 1.988916039466858, "rewards/rejected": -2.7143778800964355, "step": 248 }, { "epoch": 2.951111111111111, "grad_norm": 8.961988941988425, "learning_rate": 3.9715663103031706e-07, "logits/chosen": -0.7619471549987793, "logits/rejected": -0.7742725610733032, "logps/chosen": -34.99053192138672, "logps/rejected": -46.08906173706055, "loss": 0.1042, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7473491430282593, "rewards/margins": 3.3373470306396484, "rewards/rejected": -4.084695816040039, "step": 249 }, { "epoch": 2.962962962962963, "grad_norm": 8.867625542462614, "learning_rate": 3.9610345334934094e-07, "logits/chosen": -1.1330126523971558, "logits/rejected": -1.1697674989700317, "logps/chosen": -23.25260353088379, "logps/rejected": -43.87166976928711, "loss": 0.1069, "rewards/accuracies": 1.0, "rewards/chosen": -0.29624325037002563, "rewards/margins": 3.2834043502807617, "rewards/rejected": -3.5796477794647217, "step": 250 }, { "epoch": 2.974814814814815, "grad_norm": 9.248026100787925, "learning_rate": 3.950463230455761e-07, "logits/chosen": -1.2669241428375244, "logits/rejected": -1.0569002628326416, "logps/chosen": -32.21596145629883, "logps/rejected": -60.97792053222656, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": -0.45450618863105774, "rewards/margins": 6.268526077270508, "rewards/rejected": -6.723032474517822, "step": 251 }, { "epoch": 2.986666666666667, "grad_norm": 9.82476144546843, "learning_rate": 3.939852687181915e-07, "logits/chosen": -0.6608531475067139, "logits/rejected": -0.8034135699272156, "logps/chosen": -24.462186813354492, "logps/rejected": -50.0887451171875, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": -0.4931001365184784, "rewards/margins": 4.019628047943115, "rewards/rejected": -4.512728691101074, "step": 252 }, { "epoch": 2.9985185185185186, "grad_norm": 9.046237713104512, "learning_rate": 3.9292031907251464e-07, "logits/chosen": -1.1486996412277222, "logits/rejected": -1.2216451168060303, "logps/chosen": -30.053659439086914, "logps/rejected": -57.20942306518555, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": -0.7489092350006104, "rewards/margins": 3.8848938941955566, "rewards/rejected": -4.633802890777588, "step": 253 }, { "epoch": 3.0103703703703704, "grad_norm": 6.092995378035237, "learning_rate": 3.9185150291925585e-07, "logits/chosen": -1.1667040586471558, "logits/rejected": -1.2177824974060059, "logps/chosen": -21.940555572509766, "logps/rejected": -45.18644332885742, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": -0.5815523862838745, "rewards/margins": 3.84624981880188, "rewards/rejected": -4.427802085876465, "step": 254 }, { "epoch": 3.022222222222222, "grad_norm": 5.889868303476228, "learning_rate": 3.9077884917372806e-07, "logits/chosen": -1.2974051237106323, "logits/rejected": -1.1213111877441406, "logps/chosen": -24.889671325683594, "logps/rejected": -53.81795883178711, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": -0.10434576869010925, "rewards/margins": 5.804339408874512, "rewards/rejected": -5.908684730529785, "step": 255 }, { "epoch": 3.034074074074074, "grad_norm": 5.6665752841935895, "learning_rate": 3.8970238685506486e-07, "logits/chosen": -0.9169086813926697, "logits/rejected": -0.8243510127067566, "logps/chosen": -25.639080047607422, "logps/rejected": -44.80521011352539, "loss": 0.0712, "rewards/accuracies": 1.0, "rewards/chosen": -0.4333856403827667, "rewards/margins": 4.004616737365723, "rewards/rejected": -4.438002586364746, "step": 256 }, { "epoch": 3.0459259259259257, "grad_norm": 6.195071442100622, "learning_rate": 3.8862214508543544e-07, "logits/chosen": -0.7453622817993164, "logits/rejected": -0.8452744483947754, "logps/chosen": -27.675142288208008, "logps/rejected": -54.29957580566406, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": -0.2128800004720688, "rewards/margins": 4.886155605316162, "rewards/rejected": -5.099035263061523, "step": 257 }, { "epoch": 3.057777777777778, "grad_norm": 4.577785723486363, "learning_rate": 3.8753815308925685e-07, "logits/chosen": -0.7777894735336304, "logits/rejected": -0.9331192970275879, "logps/chosen": -31.157546997070312, "logps/rejected": -64.40248107910156, "loss": 0.0512, "rewards/accuracies": 1.0, "rewards/chosen": -0.4510951638221741, "rewards/margins": 4.6957244873046875, "rewards/rejected": -5.146819591522217, "step": 258 }, { "epoch": 3.0696296296296297, "grad_norm": 6.4011580737866005, "learning_rate": 3.864504401924031e-07, "logits/chosen": -0.830951452255249, "logits/rejected": -0.8679866790771484, "logps/chosen": -27.256206512451172, "logps/rejected": -55.158851623535156, "loss": 0.0732, "rewards/accuracies": 1.0, "rewards/chosen": -0.7141133546829224, "rewards/margins": 4.929049968719482, "rewards/rejected": -5.643163204193115, "step": 259 }, { "epoch": 3.0814814814814815, "grad_norm": 5.755747603737944, "learning_rate": 3.8535903582141184e-07, "logits/chosen": -1.0873059034347534, "logits/rejected": -0.797295868396759, "logps/chosen": -40.58049011230469, "logps/rejected": -63.331024169921875, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": -0.8855617046356201, "rewards/margins": 5.494954586029053, "rewards/rejected": -6.380516052246094, "step": 260 }, { "epoch": 3.0933333333333333, "grad_norm": 5.377551785753514, "learning_rate": 3.8426396950268846e-07, "logits/chosen": -1.0642486810684204, "logits/rejected": -1.0869853496551514, "logps/chosen": -25.782983779907227, "logps/rejected": -47.94844436645508, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -0.5024091005325317, "rewards/margins": 4.591217041015625, "rewards/rejected": -5.093626022338867, "step": 261 }, { "epoch": 3.105185185185185, "grad_norm": 5.805844710790357, "learning_rate": 3.8316527086170727e-07, "logits/chosen": -1.4574161767959595, "logits/rejected": -1.214327335357666, "logps/chosen": -37.066261291503906, "logps/rejected": -46.37734603881836, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": -0.3814047574996948, "rewards/margins": 4.010954856872559, "rewards/rejected": -4.392359733581543, "step": 262 }, { "epoch": 3.117037037037037, "grad_norm": 5.631859361425549, "learning_rate": 3.820629696222096e-07, "logits/chosen": -1.037698745727539, "logits/rejected": -1.2875399589538574, "logps/chosen": -30.273286819458008, "logps/rejected": -63.62456130981445, "loss": 0.0589, "rewards/accuracies": 1.0, "rewards/chosen": -0.311270534992218, "rewards/margins": 4.6157755851745605, "rewards/rejected": -4.927045822143555, "step": 263 }, { "epoch": 3.128888888888889, "grad_norm": 6.4282160152251775, "learning_rate": 3.809570956054003e-07, "logits/chosen": -0.8523592352867126, "logits/rejected": -1.133123517036438, "logps/chosen": -25.40625762939453, "logps/rejected": -51.35416030883789, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": -0.44330236315727234, "rewards/margins": 4.313521862030029, "rewards/rejected": -4.756824016571045, "step": 264 }, { "epoch": 3.140740740740741, "grad_norm": 4.79153948993895, "learning_rate": 3.798476787291407e-07, "logits/chosen": -0.9867359399795532, "logits/rejected": -0.8984818458557129, "logps/chosen": -34.52312088012695, "logps/rejected": -60.01667785644531, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": -0.7610639929771423, "rewards/margins": 5.0160813331604, "rewards/rejected": -5.777144908905029, "step": 265 }, { "epoch": 3.1525925925925926, "grad_norm": 5.75718528813517, "learning_rate": 3.787347490071389e-07, "logits/chosen": -1.120936393737793, "logits/rejected": -1.2262120246887207, "logps/chosen": -23.01953125, "logps/rejected": -61.13671112060547, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": -0.14551204442977905, "rewards/margins": 5.611021041870117, "rewards/rejected": -5.756533622741699, "step": 266 }, { "epoch": 3.1644444444444444, "grad_norm": 7.215037149667764, "learning_rate": 3.776183365481385e-07, "logits/chosen": -0.8273714780807495, "logits/rejected": -0.6773754358291626, "logps/chosen": -22.21053123474121, "logps/rejected": -43.30362319946289, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": -0.09048902243375778, "rewards/margins": 4.530794620513916, "rewards/rejected": -4.621283531188965, "step": 267 }, { "epoch": 3.176296296296296, "grad_norm": 5.823890371998723, "learning_rate": 3.764984715551031e-07, "logits/chosen": -0.9349130392074585, "logits/rejected": -0.9820988178253174, "logps/chosen": -29.579639434814453, "logps/rejected": -57.95941925048828, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": -0.10734206438064575, "rewards/margins": 4.939681053161621, "rewards/rejected": -5.047023296356201, "step": 268 }, { "epoch": 3.188148148148148, "grad_norm": 5.996717704776933, "learning_rate": 3.753751843244003e-07, "logits/chosen": -0.9044997692108154, "logits/rejected": -1.1144652366638184, "logps/chosen": -29.467838287353516, "logps/rejected": -60.72572326660156, "loss": 0.0656, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6497594714164734, "rewards/margins": 5.051385402679443, "rewards/rejected": -5.701144695281982, "step": 269 }, { "epoch": 3.2, "grad_norm": 5.541764509927049, "learning_rate": 3.7424850524498113e-07, "logits/chosen": -1.0047956705093384, "logits/rejected": -0.9436285495758057, "logps/chosen": -45.23377227783203, "logps/rejected": -59.85185241699219, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": -0.6731371879577637, "rewards/margins": 4.904725074768066, "rewards/rejected": -5.577862739562988, "step": 270 }, { "epoch": 3.211851851851852, "grad_norm": 5.924296644494306, "learning_rate": 3.731184647975584e-07, "logits/chosen": -1.0159187316894531, "logits/rejected": -0.9188311100006104, "logps/chosen": -21.674226760864258, "logps/rejected": -37.375823974609375, "loss": 0.0726, "rewards/accuracies": 1.0, "rewards/chosen": 0.21266776323318481, "rewards/margins": 3.735647678375244, "rewards/rejected": -3.522979497909546, "step": 271 }, { "epoch": 3.2237037037037037, "grad_norm": 5.386999484278703, "learning_rate": 3.7198509355378207e-07, "logits/chosen": -1.1651026010513306, "logits/rejected": -1.1258482933044434, "logps/chosen": -26.176000595092773, "logps/rejected": -46.100807189941406, "loss": 0.0613, "rewards/accuracies": 1.0, "rewards/chosen": -0.20522062480449677, "rewards/margins": 4.301911354064941, "rewards/rejected": -4.507132053375244, "step": 272 }, { "epoch": 3.2355555555555555, "grad_norm": 5.601624961329212, "learning_rate": 3.7084842217541196e-07, "logits/chosen": -0.8905525803565979, "logits/rejected": -1.1875985860824585, "logps/chosen": -24.38876724243164, "logps/rejected": -56.33481979370117, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": -0.22321292757987976, "rewards/margins": 5.481816291809082, "rewards/rejected": -5.705029010772705, "step": 273 }, { "epoch": 3.2474074074074073, "grad_norm": 5.796108078800707, "learning_rate": 3.6970848141348855e-07, "logits/chosen": -1.0142302513122559, "logits/rejected": -1.140476107597351, "logps/chosen": -20.314678192138672, "logps/rejected": -47.31608963012695, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": -0.08062909543514252, "rewards/margins": 4.616469860076904, "rewards/rejected": -4.697099685668945, "step": 274 }, { "epoch": 3.259259259259259, "grad_norm": 5.702096262342128, "learning_rate": 3.685653021075006e-07, "logits/chosen": -0.8202773332595825, "logits/rejected": -0.8724027276039124, "logps/chosen": -33.18456268310547, "logps/rejected": -56.54239273071289, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": -0.5080665349960327, "rewards/margins": 4.730257987976074, "rewards/rejected": -5.238324165344238, "step": 275 }, { "epoch": 3.2711111111111113, "grad_norm": 5.539345867188406, "learning_rate": 3.6741891518455146e-07, "logits/chosen": -1.1294889450073242, "logits/rejected": -1.100435495376587, "logps/chosen": -41.69868087768555, "logps/rejected": -62.35628890991211, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": -1.2733755111694336, "rewards/margins": 4.73194694519043, "rewards/rejected": -6.005322456359863, "step": 276 }, { "epoch": 3.282962962962963, "grad_norm": 4.99860717998791, "learning_rate": 3.6626935165852183e-07, "logits/chosen": -1.112581491470337, "logits/rejected": -1.177794098854065, "logps/chosen": -33.80670928955078, "logps/rejected": -71.07241821289062, "loss": 0.062, "rewards/accuracies": 1.0, "rewards/chosen": -0.6145895719528198, "rewards/margins": 6.081070423126221, "rewards/rejected": -6.69566011428833, "step": 277 }, { "epoch": 3.294814814814815, "grad_norm": 5.307021962386122, "learning_rate": 3.6511664262923094e-07, "logits/chosen": -1.0228497982025146, "logits/rejected": -0.9828412532806396, "logps/chosen": -23.93638038635254, "logps/rejected": -44.23779296875, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": 0.12847840785980225, "rewards/margins": 3.686278820037842, "rewards/rejected": -3.557800054550171, "step": 278 }, { "epoch": 3.3066666666666666, "grad_norm": 6.056323436180124, "learning_rate": 3.639608192815951e-07, "logits/chosen": -1.0668245553970337, "logits/rejected": -0.879786491394043, "logps/chosen": -31.651464462280273, "logps/rejected": -36.51784133911133, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": -0.16986359655857086, "rewards/margins": 3.040926218032837, "rewards/rejected": -3.210789680480957, "step": 279 }, { "epoch": 3.3185185185185184, "grad_norm": 6.040832198855085, "learning_rate": 3.6280191288478435e-07, "logits/chosen": -1.143092155456543, "logits/rejected": -0.9826564192771912, "logps/chosen": -36.029632568359375, "logps/rejected": -51.19088363647461, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": -0.4821978509426117, "rewards/margins": 4.130788803100586, "rewards/rejected": -4.612987041473389, "step": 280 }, { "epoch": 3.33037037037037, "grad_norm": 4.03276871934942, "learning_rate": 3.61639954791376e-07, "logits/chosen": -1.117720127105713, "logits/rejected": -1.1670171022415161, "logps/chosen": -30.51466178894043, "logps/rejected": -56.65740966796875, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -0.6547921895980835, "rewards/margins": 5.172966957092285, "rewards/rejected": -5.827759265899658, "step": 281 }, { "epoch": 3.3422222222222224, "grad_norm": 5.926942922995324, "learning_rate": 3.604749764365069e-07, "logits/chosen": -1.1983726024627686, "logits/rejected": -1.0965083837509155, "logps/chosen": -31.945249557495117, "logps/rejected": -42.11628341674805, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": -0.15036548674106598, "rewards/margins": 3.8085999488830566, "rewards/rejected": -3.958966016769409, "step": 282 }, { "epoch": 3.354074074074074, "grad_norm": 5.746588189734936, "learning_rate": 3.593070093370226e-07, "logits/chosen": -1.2473244667053223, "logits/rejected": -1.1790859699249268, "logps/chosen": -27.693424224853516, "logps/rejected": -55.838382720947266, "loss": 0.0683, "rewards/accuracies": 1.0, "rewards/chosen": -0.5475992560386658, "rewards/margins": 5.170629501342773, "rewards/rejected": -5.718228816986084, "step": 283 }, { "epoch": 3.365925925925926, "grad_norm": 4.050731197423161, "learning_rate": 3.5813608509062526e-07, "logits/chosen": -0.8557882308959961, "logits/rejected": -0.9357935190200806, "logps/chosen": -23.405031204223633, "logps/rejected": -61.17503356933594, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": -0.4729683995246887, "rewards/margins": 5.8998703956604, "rewards/rejected": -6.372838020324707, "step": 284 }, { "epoch": 3.3777777777777778, "grad_norm": 6.330628466200805, "learning_rate": 3.569622353750181e-07, "logits/chosen": -1.3175606727600098, "logits/rejected": -1.1783069372177124, "logps/chosen": -26.113262176513672, "logps/rejected": -49.43036651611328, "loss": 0.0793, "rewards/accuracies": 1.0, "rewards/chosen": -0.3910313546657562, "rewards/margins": 4.3150248527526855, "rewards/rejected": -4.706056118011475, "step": 285 }, { "epoch": 3.3896296296296295, "grad_norm": 4.891367910710421, "learning_rate": 3.557854919470491e-07, "logits/chosen": -1.0190260410308838, "logits/rejected": -0.8906891345977783, "logps/chosen": -30.935029983520508, "logps/rejected": -54.49614715576172, "loss": 0.0553, "rewards/accuracies": 1.0, "rewards/chosen": -0.5170705318450928, "rewards/margins": 4.817275524139404, "rewards/rejected": -5.334345817565918, "step": 286 }, { "epoch": 3.4014814814814813, "grad_norm": 4.73121969828735, "learning_rate": 3.546058866418513e-07, "logits/chosen": -1.2499243021011353, "logits/rejected": -1.1855663061141968, "logps/chosen": -26.786319732666016, "logps/rejected": -55.46076965332031, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": -0.5292675495147705, "rewards/margins": 5.712489128112793, "rewards/rejected": -6.241756916046143, "step": 287 }, { "epoch": 3.413333333333333, "grad_norm": 5.279313917975766, "learning_rate": 3.5342345137198206e-07, "logits/chosen": -1.0446308851242065, "logits/rejected": -0.8056778907775879, "logps/chosen": -29.395492553710938, "logps/rejected": -44.214805603027344, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": -0.3937348425388336, "rewards/margins": 4.55248498916626, "rewards/rejected": -4.946219444274902, "step": 288 }, { "epoch": 3.4251851851851853, "grad_norm": 5.3706152394158435, "learning_rate": 3.5223821812655903e-07, "logits/chosen": -1.045047640800476, "logits/rejected": -0.9570527076721191, "logps/chosen": -35.075313568115234, "logps/rejected": -41.273075103759766, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": -0.29294437170028687, "rewards/margins": 3.7492318153381348, "rewards/rejected": -4.042176246643066, "step": 289 }, { "epoch": 3.437037037037037, "grad_norm": 6.958208575648387, "learning_rate": 3.510502189703954e-07, "logits/chosen": -1.0246403217315674, "logits/rejected": -1.0500332117080688, "logps/chosen": -30.618261337280273, "logps/rejected": -51.143402099609375, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": -0.1609795093536377, "rewards/margins": 5.519993782043457, "rewards/rejected": -5.680973529815674, "step": 290 }, { "epoch": 3.448888888888889, "grad_norm": 5.661919128416755, "learning_rate": 3.4985948604313237e-07, "logits/chosen": -0.978698194026947, "logits/rejected": -1.042383074760437, "logps/chosen": -34.199241638183594, "logps/rejected": -55.27683639526367, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": -1.0662914514541626, "rewards/margins": 4.541738510131836, "rewards/rejected": -5.608030319213867, "step": 291 }, { "epoch": 3.4607407407407407, "grad_norm": 5.985004769770914, "learning_rate": 3.486660515583691e-07, "logits/chosen": -1.0405309200286865, "logits/rejected": -1.1097911596298218, "logps/chosen": -27.69963836669922, "logps/rejected": -53.2136344909668, "loss": 0.0685, "rewards/accuracies": 1.0, "rewards/chosen": -0.88338702917099, "rewards/margins": 4.365427017211914, "rewards/rejected": -5.248814105987549, "step": 292 }, { "epoch": 3.4725925925925925, "grad_norm": 5.177960355626909, "learning_rate": 3.474699478027918e-07, "logits/chosen": -1.2111238241195679, "logits/rejected": -0.9757024049758911, "logps/chosen": -31.71354103088379, "logps/rejected": -41.57647705078125, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": -0.6544926166534424, "rewards/margins": 3.8716635704040527, "rewards/rejected": -4.526156425476074, "step": 293 }, { "epoch": 3.4844444444444447, "grad_norm": 4.613155996179155, "learning_rate": 3.4627120713529983e-07, "logits/chosen": -0.9755435585975647, "logits/rejected": -1.1237881183624268, "logps/chosen": -21.237268447875977, "logps/rejected": -54.971214294433594, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": -0.2568213641643524, "rewards/margins": 5.524716854095459, "rewards/rejected": -5.781538486480713, "step": 294 }, { "epoch": 3.4962962962962965, "grad_norm": 5.128029383000987, "learning_rate": 3.4506986198613077e-07, "logits/chosen": -0.9919889569282532, "logits/rejected": -0.8300825357437134, "logps/chosen": -29.0252742767334, "logps/rejected": -47.359825134277344, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": -0.028458386659622192, "rewards/margins": 5.328551292419434, "rewards/rejected": -5.357009410858154, "step": 295 }, { "epoch": 3.5081481481481482, "grad_norm": 4.006414442650582, "learning_rate": 3.438659448559825e-07, "logits/chosen": -1.0892466306686401, "logits/rejected": -0.9891116619110107, "logps/chosen": -27.066917419433594, "logps/rejected": -43.91146469116211, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": 0.07186317443847656, "rewards/margins": 4.227798938751221, "rewards/rejected": -4.155935764312744, "step": 296 }, { "epoch": 3.52, "grad_norm": 6.127057074618686, "learning_rate": 3.4265948831513434e-07, "logits/chosen": -1.1602692604064941, "logits/rejected": -0.9549179077148438, "logps/chosen": -38.73994064331055, "logps/rejected": -40.83653259277344, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": -0.6782684922218323, "rewards/margins": 3.3154821395874023, "rewards/rejected": -3.99375057220459, "step": 297 }, { "epoch": 3.531851851851852, "grad_norm": 5.969524211003228, "learning_rate": 3.414505250025659e-07, "logits/chosen": -1.2992161512374878, "logits/rejected": -1.39997136592865, "logps/chosen": -25.659305572509766, "logps/rejected": -57.98012924194336, "loss": 0.0546, "rewards/accuracies": 1.0, "rewards/chosen": -0.21638883650302887, "rewards/margins": 4.876415252685547, "rewards/rejected": -5.092803955078125, "step": 298 }, { "epoch": 3.5437037037037036, "grad_norm": 5.142417577054952, "learning_rate": 3.402390876250737e-07, "logits/chosen": -0.9474883675575256, "logits/rejected": -0.9179717302322388, "logps/chosen": -22.229196548461914, "logps/rejected": -46.438045501708984, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": -0.5016873478889465, "rewards/margins": 5.030646324157715, "rewards/rejected": -5.5323333740234375, "step": 299 }, { "epoch": 3.5555555555555554, "grad_norm": 4.2230056460228145, "learning_rate": 3.390252089563867e-07, "logits/chosen": -0.7143265008926392, "logits/rejected": -0.7287828922271729, "logps/chosen": -36.311458587646484, "logps/rejected": -57.43645095825195, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": -0.6835311651229858, "rewards/margins": 4.852078914642334, "rewards/rejected": -5.535609722137451, "step": 300 }, { "epoch": 3.5674074074074076, "grad_norm": 4.613790070173845, "learning_rate": 3.3780892183627974e-07, "logits/chosen": -0.9158973693847656, "logits/rejected": -0.9328266978263855, "logps/chosen": -26.87763214111328, "logps/rejected": -55.432861328125, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -0.9097545146942139, "rewards/margins": 5.234354019165039, "rewards/rejected": -6.144108295440674, "step": 301 }, { "epoch": 3.5792592592592594, "grad_norm": 5.438641971874985, "learning_rate": 3.3659025916968475e-07, "logits/chosen": -0.8110693693161011, "logits/rejected": -0.9373239278793335, "logps/chosen": -33.212890625, "logps/rejected": -57.53894805908203, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": -1.0058021545410156, "rewards/margins": 4.593456268310547, "rewards/rejected": -5.599257946014404, "step": 302 }, { "epoch": 3.591111111111111, "grad_norm": 5.85934801222626, "learning_rate": 3.353692539258006e-07, "logits/chosen": -1.0616428852081299, "logits/rejected": -1.0635976791381836, "logps/chosen": -42.99614334106445, "logps/rejected": -59.80973815917969, "loss": 0.0729, "rewards/accuracies": 1.0, "rewards/chosen": -0.5455619096755981, "rewards/margins": 4.6160502433776855, "rewards/rejected": -5.161612033843994, "step": 303 }, { "epoch": 3.602962962962963, "grad_norm": 4.758016084753321, "learning_rate": 3.3414593913720155e-07, "logits/chosen": -1.0797550678253174, "logits/rejected": -1.1135615110397339, "logps/chosen": -25.05910301208496, "logps/rejected": -48.85538864135742, "loss": 0.0502, "rewards/accuracies": 1.0, "rewards/chosen": -0.7004413604736328, "rewards/margins": 4.928831100463867, "rewards/rejected": -5.6292724609375, "step": 304 }, { "epoch": 3.6148148148148147, "grad_norm": 5.071406885912709, "learning_rate": 3.329203478989431e-07, "logits/chosen": -0.8254508972167969, "logits/rejected": -0.8509577512741089, "logps/chosen": -31.357934951782227, "logps/rejected": -56.72637176513672, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": -0.3466971218585968, "rewards/margins": 6.271115303039551, "rewards/rejected": -6.617812156677246, "step": 305 }, { "epoch": 3.626666666666667, "grad_norm": 5.745951047820061, "learning_rate": 3.3169251336766697e-07, "logits/chosen": -1.1452853679656982, "logits/rejected": -0.9981801509857178, "logps/chosen": -27.595762252807617, "logps/rejected": -57.56399917602539, "loss": 0.0673, "rewards/accuracies": 1.0, "rewards/chosen": -0.9206715822219849, "rewards/margins": 5.363818168640137, "rewards/rejected": -6.28449010848999, "step": 306 }, { "epoch": 3.6385185185185183, "grad_norm": 4.425440687205678, "learning_rate": 3.3046246876070405e-07, "logits/chosen": -1.078284502029419, "logits/rejected": -0.9102885127067566, "logps/chosen": -35.90202331542969, "logps/rejected": -54.64530944824219, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": -0.6325173377990723, "rewards/margins": 4.867710590362549, "rewards/rejected": -5.500227928161621, "step": 307 }, { "epoch": 3.6503703703703705, "grad_norm": 6.4970787913514965, "learning_rate": 3.2923024735517567e-07, "logits/chosen": -1.1114505529403687, "logits/rejected": -1.2475202083587646, "logps/chosen": -31.215063095092773, "logps/rejected": -52.50607681274414, "loss": 0.0792, "rewards/accuracies": 1.0, "rewards/chosen": -0.8711620569229126, "rewards/margins": 4.3640456199646, "rewards/rejected": -5.235208034515381, "step": 308 }, { "epoch": 3.6622222222222223, "grad_norm": 4.671255590927646, "learning_rate": 3.279958824870934e-07, "logits/chosen": -1.0347408056259155, "logits/rejected": -1.1540158987045288, "logps/chosen": -22.052547454833984, "logps/rejected": -37.25065994262695, "loss": 0.0488, "rewards/accuracies": 1.0, "rewards/chosen": -0.3115699291229248, "rewards/margins": 3.4501681327819824, "rewards/rejected": -3.7617380619049072, "step": 309 }, { "epoch": 3.674074074074074, "grad_norm": 5.203696540638213, "learning_rate": 3.2675940755045713e-07, "logits/chosen": -0.8645880818367004, "logits/rejected": -0.7690497636795044, "logps/chosen": -27.421972274780273, "logps/rejected": -46.4594612121582, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": -0.294041246175766, "rewards/margins": 5.109452724456787, "rewards/rejected": -5.403493881225586, "step": 310 }, { "epoch": 3.685925925925926, "grad_norm": 6.0369789648764955, "learning_rate": 3.2552085599635167e-07, "logits/chosen": -1.26742684841156, "logits/rejected": -1.0950877666473389, "logps/chosen": -29.363801956176758, "logps/rejected": -51.036216735839844, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": -0.4705730378627777, "rewards/margins": 5.166300296783447, "rewards/rejected": -5.636873722076416, "step": 311 }, { "epoch": 3.6977777777777776, "grad_norm": 5.411702389421841, "learning_rate": 3.242802613320418e-07, "logits/chosen": -1.2446568012237549, "logits/rejected": -1.2488821744918823, "logps/chosen": -34.51674270629883, "logps/rejected": -49.218116760253906, "loss": 0.0579, "rewards/accuracies": 1.0, "rewards/chosen": -0.41771867871284485, "rewards/margins": 4.54225492477417, "rewards/rejected": -4.9599738121032715, "step": 312 }, { "epoch": 3.70962962962963, "grad_norm": 6.254274807160154, "learning_rate": 3.2303765712006585e-07, "logits/chosen": -0.8596463203430176, "logits/rejected": -1.100625991821289, "logps/chosen": -39.566654205322266, "logps/rejected": -69.77198791503906, "loss": 0.0653, "rewards/accuracies": 1.0, "rewards/chosen": -1.135459542274475, "rewards/margins": 5.502881050109863, "rewards/rejected": -6.638340473175049, "step": 313 }, { "epoch": 3.7214814814814816, "grad_norm": 5.503511678320754, "learning_rate": 3.217930769773275e-07, "logits/chosen": -0.8999834060668945, "logits/rejected": -0.8143523931503296, "logps/chosen": -40.45894241333008, "logps/rejected": -51.68131637573242, "loss": 0.0566, "rewards/accuracies": 0.9375, "rewards/chosen": -1.05463707447052, "rewards/margins": 3.0763254165649414, "rewards/rejected": -4.130962371826172, "step": 314 }, { "epoch": 3.7333333333333334, "grad_norm": 4.936108454949577, "learning_rate": 3.2054655457418647e-07, "logits/chosen": -0.8420968055725098, "logits/rejected": -1.0272884368896484, "logps/chosen": -23.133481979370117, "logps/rejected": -41.11859130859375, "loss": 0.0578, "rewards/accuracies": 1.0, "rewards/chosen": -0.44954371452331543, "rewards/margins": 3.8556888103485107, "rewards/rejected": -4.305232524871826, "step": 315 }, { "epoch": 3.745185185185185, "grad_norm": 5.389922613533766, "learning_rate": 3.1929812363354764e-07, "logits/chosen": -1.0261280536651611, "logits/rejected": -0.7948772311210632, "logps/chosen": -31.51889991760254, "logps/rejected": -42.249786376953125, "loss": 0.0561, "rewards/accuracies": 1.0, "rewards/chosen": -0.4212304651737213, "rewards/margins": 4.244043827056885, "rewards/rejected": -4.665274620056152, "step": 316 }, { "epoch": 3.757037037037037, "grad_norm": 4.6295381857080296, "learning_rate": 3.1804781792994867e-07, "logits/chosen": -0.8965979218482971, "logits/rejected": -0.8617987632751465, "logps/chosen": -42.41337585449219, "logps/rejected": -72.1649169921875, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -0.9737859964370728, "rewards/margins": 6.099238395690918, "rewards/rejected": -7.073023796081543, "step": 317 }, { "epoch": 3.7688888888888887, "grad_norm": 5.5642008946868655, "learning_rate": 3.167956712886463e-07, "logits/chosen": -0.6077237129211426, "logits/rejected": -0.7936528325080872, "logps/chosen": -33.86177444458008, "logps/rejected": -64.47748565673828, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": -0.5910844206809998, "rewards/margins": 5.1871795654296875, "rewards/rejected": -5.778264045715332, "step": 318 }, { "epoch": 3.7807407407407405, "grad_norm": 6.861996538597475, "learning_rate": 3.155417175847011e-07, "logits/chosen": -1.2012813091278076, "logits/rejected": -1.1468443870544434, "logps/chosen": -31.32352638244629, "logps/rejected": -51.89019775390625, "loss": 0.0784, "rewards/accuracies": 1.0, "rewards/chosen": -1.28224515914917, "rewards/margins": 4.7101054191589355, "rewards/rejected": -5.9923505783081055, "step": 319 }, { "epoch": 3.7925925925925927, "grad_norm": 5.30619862961812, "learning_rate": 3.142859907420615e-07, "logits/chosen": -1.0433729887008667, "logits/rejected": -1.0667370557785034, "logps/chosen": -39.072513580322266, "logps/rejected": -58.01454544067383, "loss": 0.0504, "rewards/accuracies": 1.0, "rewards/chosen": -1.079565405845642, "rewards/margins": 3.7631685733795166, "rewards/rejected": -4.842733860015869, "step": 320 }, { "epoch": 3.8044444444444445, "grad_norm": 5.765561512142667, "learning_rate": 3.1302852473264537e-07, "logits/chosen": -1.352389931678772, "logits/rejected": -1.1685843467712402, "logps/chosen": -25.97576141357422, "logps/rejected": -41.47727584838867, "loss": 0.0644, "rewards/accuracies": 1.0, "rewards/chosen": -0.7056049108505249, "rewards/margins": 3.5812952518463135, "rewards/rejected": -4.286900043487549, "step": 321 }, { "epoch": 3.8162962962962963, "grad_norm": 5.083438759014095, "learning_rate": 3.117693535754213e-07, "logits/chosen": -1.1177499294281006, "logits/rejected": -0.9615808725357056, "logps/chosen": -30.529483795166016, "logps/rejected": -65.1328125, "loss": 0.0545, "rewards/accuracies": 1.0, "rewards/chosen": -1.3329801559448242, "rewards/margins": 6.611982345581055, "rewards/rejected": -7.944962501525879, "step": 322 }, { "epoch": 3.828148148148148, "grad_norm": 5.339542761344782, "learning_rate": 3.105085113354885e-07, "logits/chosen": -1.219346523284912, "logits/rejected": -1.0785218477249146, "logps/chosen": -24.757862091064453, "logps/rejected": -44.62859344482422, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": -0.687053918838501, "rewards/margins": 5.1285810470581055, "rewards/rejected": -5.8156352043151855, "step": 323 }, { "epoch": 3.84, "grad_norm": 5.388037597932235, "learning_rate": 3.092460321231547e-07, "logits/chosen": -0.8497295379638672, "logits/rejected": -0.6827176809310913, "logps/chosen": -28.461023330688477, "logps/rejected": -40.831329345703125, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": -0.3341638445854187, "rewards/margins": 4.207818984985352, "rewards/rejected": -4.541983127593994, "step": 324 }, { "epoch": 3.851851851851852, "grad_norm": 6.777852590076427, "learning_rate": 3.079819500930138e-07, "logits/chosen": -1.1702187061309814, "logits/rejected": -0.8639770746231079, "logps/chosen": -33.30535888671875, "logps/rejected": -45.3155632019043, "loss": 0.068, "rewards/accuracies": 1.0, "rewards/chosen": -0.7845231294631958, "rewards/margins": 4.671426296234131, "rewards/rejected": -5.455949783325195, "step": 325 }, { "epoch": 3.863703703703704, "grad_norm": 4.96340034468975, "learning_rate": 3.0671629944302164e-07, "logits/chosen": -1.1068304777145386, "logits/rejected": -0.8126644492149353, "logps/chosen": -24.921886444091797, "logps/rejected": -54.05364990234375, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": -0.4166131913661957, "rewards/margins": 6.512470722198486, "rewards/rejected": -6.929083824157715, "step": 326 }, { "epoch": 3.8755555555555556, "grad_norm": 5.138730208654112, "learning_rate": 3.054491144135707e-07, "logits/chosen": -1.1236650943756104, "logits/rejected": -0.91242516040802, "logps/chosen": -28.354129791259766, "logps/rejected": -43.563880920410156, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": -0.3069639801979065, "rewards/margins": 3.786219596862793, "rewards/rejected": -4.093183517456055, "step": 327 }, { "epoch": 3.8874074074074074, "grad_norm": 6.099421147689264, "learning_rate": 3.0418042928656415e-07, "logits/chosen": -0.7698428630828857, "logits/rejected": -0.8110767602920532, "logps/chosen": -38.043785095214844, "logps/rejected": -64.61091613769531, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": -0.9435144662857056, "rewards/margins": 5.062083721160889, "rewards/rejected": -6.005598068237305, "step": 328 }, { "epoch": 3.899259259259259, "grad_norm": 6.253989880304663, "learning_rate": 3.029102783844879e-07, "logits/chosen": -1.29204261302948, "logits/rejected": -1.051203966140747, "logps/chosen": -29.049633026123047, "logps/rejected": -43.23980712890625, "loss": 0.0655, "rewards/accuracies": 1.0, "rewards/chosen": -1.0605460405349731, "rewards/margins": 4.699088096618652, "rewards/rejected": -5.759634494781494, "step": 329 }, { "epoch": 3.911111111111111, "grad_norm": 6.957873933273759, "learning_rate": 3.016386960694827e-07, "logits/chosen": -1.1009610891342163, "logits/rejected": -0.9418026208877563, "logps/chosen": -27.438570022583008, "logps/rejected": -54.82490921020508, "loss": 0.0727, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6385165452957153, "rewards/margins": 5.449717998504639, "rewards/rejected": -6.088234901428223, "step": 330 }, { "epoch": 3.9229629629629628, "grad_norm": 5.279686775444548, "learning_rate": 3.003657167424139e-07, "logits/chosen": -1.222791314125061, "logits/rejected": -1.2143419981002808, "logps/chosen": -28.141613006591797, "logps/rejected": -51.94461441040039, "loss": 0.0599, "rewards/accuracies": 1.0, "rewards/chosen": -0.66292804479599, "rewards/margins": 4.610944747924805, "rewards/rejected": -5.2738728523254395, "step": 331 }, { "epoch": 3.934814814814815, "grad_norm": 4.870201008956739, "learning_rate": 2.990913748419411e-07, "logits/chosen": -0.8936185836791992, "logits/rejected": -0.5663695335388184, "logps/chosen": -38.59070587158203, "logps/rejected": -60.144142150878906, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": -0.8343983292579651, "rewards/margins": 6.496341228485107, "rewards/rejected": -7.330739974975586, "step": 332 }, { "epoch": 3.9466666666666668, "grad_norm": 4.2701436405891835, "learning_rate": 2.978157048435863e-07, "logits/chosen": -1.0724661350250244, "logits/rejected": -1.1288424730300903, "logps/chosen": -35.56442642211914, "logps/rejected": -64.08760070800781, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.9496961236000061, "rewards/margins": 5.328984260559082, "rewards/rejected": -6.278680324554443, "step": 333 }, { "epoch": 3.9585185185185185, "grad_norm": 5.0014798868853045, "learning_rate": 2.9653874125880167e-07, "logits/chosen": -0.849825918674469, "logits/rejected": -0.798513650894165, "logps/chosen": -24.454782485961914, "logps/rejected": -50.63400650024414, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": -0.20302775502204895, "rewards/margins": 5.389017105102539, "rewards/rejected": -5.592044353485107, "step": 334 }, { "epoch": 3.9703703703703703, "grad_norm": 5.388732511269792, "learning_rate": 2.9526051863403517e-07, "logits/chosen": -0.9176443815231323, "logits/rejected": -0.8973706364631653, "logps/chosen": -27.884946823120117, "logps/rejected": -67.28817749023438, "loss": 0.047, "rewards/accuracies": 1.0, "rewards/chosen": -0.4531194865703583, "rewards/margins": 7.574563503265381, "rewards/rejected": -8.02768325805664, "step": 335 }, { "epoch": 3.982222222222222, "grad_norm": 6.518100338649706, "learning_rate": 2.9398107154979634e-07, "logits/chosen": -1.078136920928955, "logits/rejected": -1.0960880517959595, "logps/chosen": -33.29356384277344, "logps/rejected": -51.3941650390625, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": -0.8194246292114258, "rewards/margins": 4.724632263183594, "rewards/rejected": -5.5440568923950195, "step": 336 }, { "epoch": 3.9940740740740743, "grad_norm": 5.2039964922640065, "learning_rate": 2.9270043461972097e-07, "logits/chosen": -1.067875623703003, "logits/rejected": -0.8327341675758362, "logps/chosen": -35.44911575317383, "logps/rejected": -61.61286544799805, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": -1.434535264968872, "rewards/margins": 6.484606742858887, "rewards/rejected": -7.919142723083496, "step": 337 }, { "epoch": 4.005925925925926, "grad_norm": 4.187898216155283, "learning_rate": 2.9141864248963427e-07, "logits/chosen": -1.328801155090332, "logits/rejected": -1.3687348365783691, "logps/chosen": -25.005905151367188, "logps/rejected": -51.1596794128418, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -0.5326629877090454, "rewards/margins": 5.868394374847412, "rewards/rejected": -6.401058197021484, "step": 338 }, { "epoch": 4.017777777777778, "grad_norm": 3.1974382675774202, "learning_rate": 2.9013572983661375e-07, "logits/chosen": -0.5688086152076721, "logits/rejected": -0.5790220499038696, "logps/chosen": -36.34614181518555, "logps/rejected": -59.652000427246094, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -0.793825626373291, "rewards/margins": 6.1709794998168945, "rewards/rejected": -6.964805603027344, "step": 339 }, { "epoch": 4.029629629629629, "grad_norm": 4.158358612889323, "learning_rate": 2.8885173136805125e-07, "logits/chosen": -1.0552133321762085, "logits/rejected": -0.8415440320968628, "logps/chosen": -41.49494934082031, "logps/rejected": -57.47507858276367, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -1.504111409187317, "rewards/margins": 5.205606937408447, "rewards/rejected": -6.709719181060791, "step": 340 }, { "epoch": 4.0414814814814815, "grad_norm": 3.365949611740308, "learning_rate": 2.8756668182071357e-07, "logits/chosen": -0.9030847549438477, "logits/rejected": -1.0248281955718994, "logps/chosen": -22.62586784362793, "logps/rejected": -57.38360595703125, "loss": 0.0332, "rewards/accuracies": 1.0, "rewards/chosen": -0.8877655267715454, "rewards/margins": 5.815491199493408, "rewards/rejected": -6.703256607055664, "step": 341 }, { "epoch": 4.053333333333334, "grad_norm": 3.7482771926949994, "learning_rate": 2.862806159598032e-07, "logits/chosen": -1.1101518869400024, "logits/rejected": -1.2761971950531006, "logps/chosen": -27.846036911010742, "logps/rejected": -53.2386360168457, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -0.8876510858535767, "rewards/margins": 5.131702899932861, "rewards/rejected": -6.019353866577148, "step": 342 }, { "epoch": 4.065185185185185, "grad_norm": 3.450390240669121, "learning_rate": 2.8499356857801744e-07, "logits/chosen": -1.0844999551773071, "logits/rejected": -1.003774881362915, "logps/chosen": -23.862234115600586, "logps/rejected": -48.92105484008789, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": -0.2901920974254608, "rewards/margins": 5.612967491149902, "rewards/rejected": -5.903159141540527, "step": 343 }, { "epoch": 4.077037037037037, "grad_norm": 3.894756023928179, "learning_rate": 2.837055744946072e-07, "logits/chosen": -1.0588653087615967, "logits/rejected": -0.9954587817192078, "logps/chosen": -33.45431137084961, "logps/rejected": -54.49903106689453, "loss": 0.0376, "rewards/accuracies": 1.0, "rewards/chosen": -0.7320315837860107, "rewards/margins": 4.859332084655762, "rewards/rejected": -5.591363430023193, "step": 344 }, { "epoch": 4.088888888888889, "grad_norm": 3.76201693248286, "learning_rate": 2.8241666855443526e-07, "logits/chosen": -1.0309534072875977, "logits/rejected": -1.0565218925476074, "logps/chosen": -25.159849166870117, "logps/rejected": -55.27362823486328, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.9085606336593628, "rewards/margins": 5.737356185913086, "rewards/rejected": -6.645916938781738, "step": 345 }, { "epoch": 4.100740740740741, "grad_norm": 3.5741790516530005, "learning_rate": 2.811268856270332e-07, "logits/chosen": -0.6616291403770447, "logits/rejected": -0.747644305229187, "logps/chosen": -31.07400131225586, "logps/rejected": -61.612125396728516, "loss": 0.0322, "rewards/accuracies": 1.0, "rewards/chosen": -0.5435906052589417, "rewards/margins": 6.400445938110352, "rewards/rejected": -6.944036483764648, "step": 346 }, { "epoch": 4.112592592592593, "grad_norm": 3.704349704368933, "learning_rate": 2.798362606056583e-07, "logits/chosen": -1.056967854499817, "logits/rejected": -1.1185215711593628, "logps/chosen": -41.541934967041016, "logps/rejected": -66.00812530517578, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -1.4524059295654297, "rewards/margins": 6.422599792480469, "rewards/rejected": -7.875006198883057, "step": 347 }, { "epoch": 4.124444444444444, "grad_norm": 4.484355122741482, "learning_rate": 2.7854482840634965e-07, "logits/chosen": -1.4520745277404785, "logits/rejected": -1.226075291633606, "logps/chosen": -31.625812530517578, "logps/rejected": -38.356956481933594, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": -0.793215274810791, "rewards/margins": 3.6655571460723877, "rewards/rejected": -4.458772659301758, "step": 348 }, { "epoch": 4.136296296296297, "grad_norm": 3.734160255747896, "learning_rate": 2.772526239669831e-07, "logits/chosen": -0.7964825630187988, "logits/rejected": -0.8359100222587585, "logps/chosen": -25.68545150756836, "logps/rejected": -69.7961196899414, "loss": 0.0385, "rewards/accuracies": 1.0, "rewards/chosen": -0.3613826334476471, "rewards/margins": 6.706874370574951, "rewards/rejected": -7.0682573318481445, "step": 349 }, { "epoch": 4.148148148148148, "grad_norm": 3.980785661890267, "learning_rate": 2.759596822463267e-07, "logits/chosen": -1.2085440158843994, "logits/rejected": -1.0254814624786377, "logps/chosen": -29.036041259765625, "logps/rejected": -51.60666275024414, "loss": 0.0465, "rewards/accuracies": 1.0, "rewards/chosen": -0.908053457736969, "rewards/margins": 5.615097522735596, "rewards/rejected": -6.52315092086792, "step": 350 }, { "epoch": 4.16, "grad_norm": 3.071416968940517, "learning_rate": 2.746660382230944e-07, "logits/chosen": -1.111541986465454, "logits/rejected": -1.015653371810913, "logps/chosen": -28.444028854370117, "logps/rejected": -46.22932815551758, "loss": 0.0265, "rewards/accuracies": 1.0, "rewards/chosen": -0.6811230182647705, "rewards/margins": 4.413417816162109, "rewards/rejected": -5.094541072845459, "step": 351 }, { "epoch": 4.1718518518518515, "grad_norm": 3.379553943501446, "learning_rate": 2.73371726895e-07, "logits/chosen": -1.0623794794082642, "logits/rejected": -0.7676531076431274, "logps/chosen": -28.41644859313965, "logps/rejected": -41.91035079956055, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -0.5011738538742065, "rewards/margins": 4.736425876617432, "rewards/rejected": -5.2375993728637695, "step": 352 }, { "epoch": 4.183703703703704, "grad_norm": 3.6570535219048494, "learning_rate": 2.7207678327781036e-07, "logits/chosen": -0.9565964937210083, "logits/rejected": -1.037332534790039, "logps/chosen": -30.713109970092773, "logps/rejected": -61.01609802246094, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -0.675375759601593, "rewards/margins": 6.953289031982422, "rewards/rejected": -7.628664970397949, "step": 353 }, { "epoch": 4.195555555555556, "grad_norm": 3.110297173394316, "learning_rate": 2.7078124240439793e-07, "logits/chosen": -1.219465970993042, "logits/rejected": -1.0699536800384521, "logps/chosen": -27.52557373046875, "logps/rejected": -58.77972412109375, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.6273876428604126, "rewards/margins": 6.733246803283691, "rewards/rejected": -7.360633850097656, "step": 354 }, { "epoch": 4.207407407407407, "grad_norm": 3.833981676241624, "learning_rate": 2.6948513932379307e-07, "logits/chosen": -0.987195611000061, "logits/rejected": -1.073306918144226, "logps/chosen": -26.008790969848633, "logps/rejected": -57.6981315612793, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": -0.4490770399570465, "rewards/margins": 5.844611167907715, "rewards/rejected": -6.29368782043457, "step": 355 }, { "epoch": 4.2192592592592595, "grad_norm": 4.554577776444402, "learning_rate": 2.68188509100236e-07, "logits/chosen": -1.3205063343048096, "logits/rejected": -1.1293655633926392, "logps/chosen": -35.74661636352539, "logps/rejected": -57.69905471801758, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -1.129513144493103, "rewards/margins": 5.85588264465332, "rewards/rejected": -6.985395908355713, "step": 356 }, { "epoch": 4.231111111111111, "grad_norm": 3.120753083179545, "learning_rate": 2.668913868122279e-07, "logits/chosen": -1.1194713115692139, "logits/rejected": -1.103295087814331, "logps/chosen": -24.680492401123047, "logps/rejected": -60.474884033203125, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": -0.4337795376777649, "rewards/margins": 6.006582260131836, "rewards/rejected": -6.440362453460693, "step": 357 }, { "epoch": 4.242962962962963, "grad_norm": 3.061931183232965, "learning_rate": 2.6559380755158206e-07, "logits/chosen": -0.8489376902580261, "logits/rejected": -0.8107261061668396, "logps/chosen": -34.79714584350586, "logps/rejected": -73.919189453125, "loss": 0.0274, "rewards/accuracies": 1.0, "rewards/chosen": -1.6460020542144775, "rewards/margins": 7.77467155456543, "rewards/rejected": -9.420673370361328, "step": 358 }, { "epoch": 4.254814814814814, "grad_norm": 2.8892305258233, "learning_rate": 2.642958064224747e-07, "logits/chosen": -1.3385485410690308, "logits/rejected": -1.154805064201355, "logps/chosen": -39.11994934082031, "logps/rejected": -50.91279602050781, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -0.9563882350921631, "rewards/margins": 4.840402126312256, "rewards/rejected": -5.796790599822998, "step": 359 }, { "epoch": 4.266666666666667, "grad_norm": 3.476070988392766, "learning_rate": 2.629974185404951e-07, "logits/chosen": -0.8726380467414856, "logits/rejected": -0.7995076179504395, "logps/chosen": -36.67843246459961, "logps/rejected": -66.06752014160156, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -0.965777575969696, "rewards/margins": 6.506186485290527, "rewards/rejected": -7.471963405609131, "step": 360 }, { "epoch": 4.278518518518519, "grad_norm": 3.479867841295816, "learning_rate": 2.616986790316952e-07, "logits/chosen": -1.0837681293487549, "logits/rejected": -1.1205651760101318, "logps/chosen": -32.18258285522461, "logps/rejected": -51.97950744628906, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -1.356218934059143, "rewards/margins": 5.362008571624756, "rewards/rejected": -6.718227386474609, "step": 361 }, { "epoch": 4.29037037037037, "grad_norm": 3.836705526683735, "learning_rate": 2.603996230316402e-07, "logits/chosen": -1.235141396522522, "logits/rejected": -1.0966753959655762, "logps/chosen": -32.577171325683594, "logps/rejected": -48.43400573730469, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -1.0464580059051514, "rewards/margins": 5.035365104675293, "rewards/rejected": -6.081823348999023, "step": 362 }, { "epoch": 4.302222222222222, "grad_norm": 4.050233783944833, "learning_rate": 2.5910028568445716e-07, "logits/chosen": -1.0736088752746582, "logits/rejected": -0.9677655696868896, "logps/chosen": -34.009925842285156, "logps/rejected": -55.578128814697266, "loss": 0.0458, "rewards/accuracies": 1.0, "rewards/chosen": -0.9978185892105103, "rewards/margins": 5.743760108947754, "rewards/rejected": -6.741579055786133, "step": 363 }, { "epoch": 4.314074074074074, "grad_norm": 3.229922787157764, "learning_rate": 2.5780070214188474e-07, "logits/chosen": -1.1710213422775269, "logits/rejected": -0.8957576155662537, "logps/chosen": -41.35856246948242, "logps/rejected": -65.46880340576172, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -1.9127827882766724, "rewards/margins": 6.122142791748047, "rewards/rejected": -8.03492546081543, "step": 364 }, { "epoch": 4.325925925925926, "grad_norm": 3.113178376367222, "learning_rate": 2.5650090756232226e-07, "logits/chosen": -1.1073739528656006, "logits/rejected": -1.2185232639312744, "logps/chosen": -26.82334327697754, "logps/rejected": -55.30256652832031, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.8536565899848938, "rewards/margins": 5.805755615234375, "rewards/rejected": -6.659412384033203, "step": 365 }, { "epoch": 4.337777777777778, "grad_norm": 3.247603709931723, "learning_rate": 2.552009371098778e-07, "logits/chosen": -1.2337862253189087, "logits/rejected": -1.0264849662780762, "logps/chosen": -32.95630645751953, "logps/rejected": -52.31919860839844, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -0.7740206122398376, "rewards/margins": 5.251730918884277, "rewards/rejected": -6.02575159072876, "step": 366 }, { "epoch": 4.3496296296296295, "grad_norm": 3.3943763130724007, "learning_rate": 2.5390082595341816e-07, "logits/chosen": -1.2210817337036133, "logits/rejected": -1.2176547050476074, "logps/chosen": -26.88553810119629, "logps/rejected": -59.887794494628906, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -0.8426360487937927, "rewards/margins": 6.936334609985352, "rewards/rejected": -7.778970718383789, "step": 367 }, { "epoch": 4.361481481481482, "grad_norm": 4.050869880734819, "learning_rate": 2.5260060926561604e-07, "logits/chosen": -0.9647274017333984, "logits/rejected": -1.103495478630066, "logps/chosen": -30.085792541503906, "logps/rejected": -62.07667922973633, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -1.0259263515472412, "rewards/margins": 6.067399024963379, "rewards/rejected": -7.093325138092041, "step": 368 }, { "epoch": 4.373333333333333, "grad_norm": 3.9890669950982125, "learning_rate": 2.5130032222199954e-07, "logits/chosen": -0.9496943950653076, "logits/rejected": -1.1947317123413086, "logps/chosen": -25.227035522460938, "logps/rejected": -64.11875915527344, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -0.5789353847503662, "rewards/margins": 5.880395889282227, "rewards/rejected": -6.459331035614014, "step": 369 }, { "epoch": 4.385185185185185, "grad_norm": 3.5570964159820257, "learning_rate": 2.5e-07, "logits/chosen": -1.3687735795974731, "logits/rejected": -1.352237343788147, "logps/chosen": -22.255563735961914, "logps/rejected": -51.99371337890625, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": -0.23616445064544678, "rewards/margins": 6.420718193054199, "rewards/rejected": -6.6568827629089355, "step": 370 }, { "epoch": 4.397037037037037, "grad_norm": 4.46941184264617, "learning_rate": 2.4869967777800055e-07, "logits/chosen": -1.0241132974624634, "logits/rejected": -1.050920844078064, "logps/chosen": -27.66728973388672, "logps/rejected": -46.08578109741211, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": -0.3025878369808197, "rewards/margins": 5.107050895690918, "rewards/rejected": -5.40963888168335, "step": 371 }, { "epoch": 4.408888888888889, "grad_norm": 3.4408179676361677, "learning_rate": 2.4739939073438393e-07, "logits/chosen": -1.1434128284454346, "logits/rejected": -1.2375893592834473, "logps/chosen": -27.630720138549805, "logps/rejected": -57.22761535644531, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -0.9288941621780396, "rewards/margins": 6.560557842254639, "rewards/rejected": -7.489451885223389, "step": 372 }, { "epoch": 4.420740740740741, "grad_norm": 4.094893116865017, "learning_rate": 2.460991740465819e-07, "logits/chosen": -1.1973354816436768, "logits/rejected": -0.9313310980796814, "logps/chosen": -28.657133102416992, "logps/rejected": -58.009613037109375, "loss": 0.0407, "rewards/accuracies": 1.0, "rewards/chosen": -1.197420358657837, "rewards/margins": 6.4741129875183105, "rewards/rejected": -7.671533107757568, "step": 373 }, { "epoch": 4.432592592592592, "grad_norm": 3.817829925143414, "learning_rate": 2.4479906289012216e-07, "logits/chosen": -1.1140527725219727, "logits/rejected": -0.9359537959098816, "logps/chosen": -24.56032943725586, "logps/rejected": -52.2829704284668, "loss": 0.0413, "rewards/accuracies": 1.0, "rewards/chosen": -0.37612664699554443, "rewards/margins": 5.741087913513184, "rewards/rejected": -6.117214679718018, "step": 374 }, { "epoch": 4.444444444444445, "grad_norm": 3.838687015003608, "learning_rate": 2.434990924376778e-07, "logits/chosen": -1.2416824102401733, "logits/rejected": -1.3780392408370972, "logps/chosen": -25.436866760253906, "logps/rejected": -55.5416259765625, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -1.0123906135559082, "rewards/margins": 5.48138427734375, "rewards/rejected": -6.493774890899658, "step": 375 }, { "epoch": 4.456296296296296, "grad_norm": 3.2461595488442763, "learning_rate": 2.421992978581152e-07, "logits/chosen": -1.0624195337295532, "logits/rejected": -1.0169167518615723, "logps/chosen": -44.57556915283203, "logps/rejected": -57.175086975097656, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -1.327272653579712, "rewards/margins": 4.468148231506348, "rewards/rejected": -5.795420169830322, "step": 376 }, { "epoch": 4.468148148148148, "grad_norm": 3.8483104907201993, "learning_rate": 2.4089971431554287e-07, "logits/chosen": -1.3426092863082886, "logits/rejected": -1.1109671592712402, "logps/chosen": -38.246315002441406, "logps/rejected": -52.28644561767578, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -1.9233973026275635, "rewards/margins": 4.656871795654297, "rewards/rejected": -6.580268859863281, "step": 377 }, { "epoch": 4.48, "grad_norm": 3.202841352923816, "learning_rate": 2.3960037696835987e-07, "logits/chosen": -1.1502716541290283, "logits/rejected": -1.059588074684143, "logps/chosen": -29.64549446105957, "logps/rejected": -53.606666564941406, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -1.0943526029586792, "rewards/margins": 4.84756326675415, "rewards/rejected": -5.941916465759277, "step": 378 }, { "epoch": 4.491851851851852, "grad_norm": 3.544360039590633, "learning_rate": 2.3830132096830475e-07, "logits/chosen": -0.9218529462814331, "logits/rejected": -0.9420297145843506, "logps/chosen": -24.30426597595215, "logps/rejected": -55.90702438354492, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": -0.09170357137918472, "rewards/margins": 6.058189868927002, "rewards/rejected": -6.149893760681152, "step": 379 }, { "epoch": 4.503703703703704, "grad_norm": 3.1662180648764906, "learning_rate": 2.3700258145950493e-07, "logits/chosen": -1.1394635438919067, "logits/rejected": -1.0172181129455566, "logps/chosen": -33.90563201904297, "logps/rejected": -55.3117790222168, "loss": 0.0269, "rewards/accuracies": 1.0, "rewards/chosen": -1.4586858749389648, "rewards/margins": 5.450444221496582, "rewards/rejected": -6.9091291427612305, "step": 380 }, { "epoch": 4.515555555555555, "grad_norm": 3.735511679116059, "learning_rate": 2.3570419357752518e-07, "logits/chosen": -0.8611398935317993, "logits/rejected": -0.926773190498352, "logps/chosen": -29.910818099975586, "logps/rejected": -64.39048767089844, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": -1.275660753250122, "rewards/margins": 7.312250137329102, "rewards/rejected": -8.587909698486328, "step": 381 }, { "epoch": 4.5274074074074075, "grad_norm": 4.002410075302513, "learning_rate": 2.3440619244841794e-07, "logits/chosen": -0.9324372410774231, "logits/rejected": -0.9605002403259277, "logps/chosen": -25.845172882080078, "logps/rejected": -57.81169891357422, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": -0.8365424871444702, "rewards/margins": 6.228395938873291, "rewards/rejected": -7.064938545227051, "step": 382 }, { "epoch": 4.539259259259259, "grad_norm": 4.0545032191525, "learning_rate": 2.3310861318777214e-07, "logits/chosen": -1.2109543085098267, "logits/rejected": -1.2952345609664917, "logps/chosen": -24.969423294067383, "logps/rejected": -50.565818786621094, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": -0.9535256028175354, "rewards/margins": 5.523766994476318, "rewards/rejected": -6.477292060852051, "step": 383 }, { "epoch": 4.551111111111111, "grad_norm": 3.8412901445263117, "learning_rate": 2.3181149089976404e-07, "logits/chosen": -1.0407981872558594, "logits/rejected": -1.0591387748718262, "logps/chosen": -24.82103157043457, "logps/rejected": -58.05794143676758, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -0.6855615973472595, "rewards/margins": 6.282498836517334, "rewards/rejected": -6.968060493469238, "step": 384 }, { "epoch": 4.562962962962963, "grad_norm": 3.035878137954456, "learning_rate": 2.30514860676207e-07, "logits/chosen": -0.8315334320068359, "logits/rejected": -0.6608816981315613, "logps/chosen": -33.760494232177734, "logps/rejected": -53.18769073486328, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -1.0541050434112549, "rewards/margins": 6.037905693054199, "rewards/rejected": -7.092010498046875, "step": 385 }, { "epoch": 4.574814814814815, "grad_norm": 3.046420967372995, "learning_rate": 2.2921875759560207e-07, "logits/chosen": -1.1959935426712036, "logits/rejected": -1.0914936065673828, "logps/chosen": -39.321346282958984, "logps/rejected": -64.97684478759766, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -1.732681155204773, "rewards/margins": 5.395863056182861, "rewards/rejected": -7.128544807434082, "step": 386 }, { "epoch": 4.586666666666667, "grad_norm": 3.5502888586749517, "learning_rate": 2.2792321672218967e-07, "logits/chosen": -0.7373791337013245, "logits/rejected": -0.7411605715751648, "logps/chosen": -28.97515106201172, "logps/rejected": -61.38172912597656, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -0.8913424015045166, "rewards/margins": 6.593505859375, "rewards/rejected": -7.4848480224609375, "step": 387 }, { "epoch": 4.598518518518518, "grad_norm": 3.4771354125546625, "learning_rate": 2.2662827310499995e-07, "logits/chosen": -1.158825397491455, "logits/rejected": -0.9315577745437622, "logps/chosen": -39.89890670776367, "logps/rejected": -61.11587142944336, "loss": 0.0307, "rewards/accuracies": 1.0, "rewards/chosen": -1.5864561796188354, "rewards/margins": 5.710073471069336, "rewards/rejected": -7.296529769897461, "step": 388 }, { "epoch": 4.6103703703703705, "grad_norm": 3.8828355066051503, "learning_rate": 2.2533396177690562e-07, "logits/chosen": -0.9048175811767578, "logits/rejected": -0.7590952515602112, "logps/chosen": -30.942026138305664, "logps/rejected": -51.84413528442383, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": -1.0910900831222534, "rewards/margins": 4.568852424621582, "rewards/rejected": -5.659942626953125, "step": 389 }, { "epoch": 4.622222222222222, "grad_norm": 3.4046648013267315, "learning_rate": 2.2404031775367332e-07, "logits/chosen": -1.1725707054138184, "logits/rejected": -1.0109634399414062, "logps/chosen": -26.178606033325195, "logps/rejected": -56.07292175292969, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -0.29053014516830444, "rewards/margins": 6.551156520843506, "rewards/rejected": -6.841686248779297, "step": 390 }, { "epoch": 4.634074074074074, "grad_norm": 4.3144590946344445, "learning_rate": 2.227473760330169e-07, "logits/chosen": -1.1439464092254639, "logits/rejected": -1.023298978805542, "logps/chosen": -26.24483299255371, "logps/rejected": -43.60441589355469, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": -0.9386296272277832, "rewards/margins": 4.656396865844727, "rewards/rejected": -5.595026016235352, "step": 391 }, { "epoch": 4.645925925925926, "grad_norm": 3.9806588551475217, "learning_rate": 2.2145517159365043e-07, "logits/chosen": -1.1556141376495361, "logits/rejected": -0.9479185342788696, "logps/chosen": -38.38218688964844, "logps/rejected": -58.405189514160156, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -1.3316694498062134, "rewards/margins": 6.8166351318359375, "rewards/rejected": -8.148303985595703, "step": 392 }, { "epoch": 4.657777777777778, "grad_norm": 3.6433642546456553, "learning_rate": 2.2016373939434166e-07, "logits/chosen": -1.053426742553711, "logits/rejected": -1.010209560394287, "logps/chosen": -32.55500793457031, "logps/rejected": -55.443321228027344, "loss": 0.0381, "rewards/accuracies": 1.0, "rewards/chosen": -0.6891828775405884, "rewards/margins": 6.006121635437012, "rewards/rejected": -6.695303916931152, "step": 393 }, { "epoch": 4.66962962962963, "grad_norm": 3.7812535316967035, "learning_rate": 2.1887311437296684e-07, "logits/chosen": -0.9255619049072266, "logits/rejected": -0.7529337406158447, "logps/chosen": -28.533212661743164, "logps/rejected": -51.62010955810547, "loss": 0.039, "rewards/accuracies": 1.0, "rewards/chosen": -0.5108724236488342, "rewards/margins": 5.3941545486450195, "rewards/rejected": -5.905027389526367, "step": 394 }, { "epoch": 4.681481481481481, "grad_norm": 2.702856391414899, "learning_rate": 2.175833314455647e-07, "logits/chosen": -1.06898033618927, "logits/rejected": -1.0634517669677734, "logps/chosen": -46.982357025146484, "logps/rejected": -84.75677490234375, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -1.7449123859405518, "rewards/margins": 7.542050838470459, "rewards/rejected": -9.286964416503906, "step": 395 }, { "epoch": 4.693333333333333, "grad_norm": 3.1373317717455516, "learning_rate": 2.162944255053928e-07, "logits/chosen": -1.2334667444229126, "logits/rejected": -1.0262863636016846, "logps/chosen": -27.508880615234375, "logps/rejected": -52.11289978027344, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -1.0616217851638794, "rewards/margins": 5.611589431762695, "rewards/rejected": -6.673211574554443, "step": 396 }, { "epoch": 4.705185185185185, "grad_norm": 2.5700965268119016, "learning_rate": 2.1500643142198264e-07, "logits/chosen": -1.18964684009552, "logits/rejected": -1.1267262697219849, "logps/chosen": -28.83340072631836, "logps/rejected": -50.370338439941406, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -1.270019292831421, "rewards/margins": 5.287970066070557, "rewards/rejected": -6.557989120483398, "step": 397 }, { "epoch": 4.717037037037037, "grad_norm": 4.377276641204308, "learning_rate": 2.137193840401968e-07, "logits/chosen": -0.9488641619682312, "logits/rejected": -0.6226259469985962, "logps/chosen": -42.97636032104492, "logps/rejected": -64.65530395507812, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -1.293939232826233, "rewards/margins": 5.974466800689697, "rewards/rejected": -7.268405914306641, "step": 398 }, { "epoch": 4.728888888888889, "grad_norm": 4.104797851994462, "learning_rate": 2.1243331817928643e-07, "logits/chosen": -1.1831551790237427, "logits/rejected": -0.9473021030426025, "logps/chosen": -30.772737503051758, "logps/rejected": -56.16963577270508, "loss": 0.0327, "rewards/accuracies": 1.0, "rewards/chosen": -0.5397346019744873, "rewards/margins": 6.304899215698242, "rewards/rejected": -6.84463357925415, "step": 399 }, { "epoch": 4.7407407407407405, "grad_norm": 3.548586845462408, "learning_rate": 2.1114826863194878e-07, "logits/chosen": -1.1657699346542358, "logits/rejected": -1.1083228588104248, "logps/chosen": -25.673093795776367, "logps/rejected": -48.5228271484375, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -0.7490501999855042, "rewards/margins": 5.02781343460083, "rewards/rejected": -5.7768635749816895, "step": 400 }, { "epoch": 4.752592592592593, "grad_norm": 4.513251688450835, "learning_rate": 2.0986427016338623e-07, "logits/chosen": -0.6789465546607971, "logits/rejected": -0.6936579346656799, "logps/chosen": -30.768882751464844, "logps/rejected": -57.21465301513672, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": -1.0039122104644775, "rewards/margins": 6.220263957977295, "rewards/rejected": -7.224175453186035, "step": 401 }, { "epoch": 4.764444444444445, "grad_norm": 3.213793093164246, "learning_rate": 2.0858135751036568e-07, "logits/chosen": -1.1852058172225952, "logits/rejected": -0.9876963496208191, "logps/chosen": -24.95799446105957, "logps/rejected": -58.4112548828125, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": -1.1165269613265991, "rewards/margins": 6.706012725830078, "rewards/rejected": -7.822539329528809, "step": 402 }, { "epoch": 4.776296296296296, "grad_norm": 3.3765755696597592, "learning_rate": 2.0729956538027904e-07, "logits/chosen": -1.2950583696365356, "logits/rejected": -1.2782042026519775, "logps/chosen": -28.596763610839844, "logps/rejected": -55.36784744262695, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": -1.079484224319458, "rewards/margins": 6.282561779022217, "rewards/rejected": -7.362045764923096, "step": 403 }, { "epoch": 4.7881481481481485, "grad_norm": 3.7436046482867567, "learning_rate": 2.060189284502037e-07, "logits/chosen": -1.0621609687805176, "logits/rejected": -1.1178123950958252, "logps/chosen": -27.336435317993164, "logps/rejected": -46.967952728271484, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -0.33046185970306396, "rewards/margins": 4.788575172424316, "rewards/rejected": -5.11903715133667, "step": 404 }, { "epoch": 4.8, "grad_norm": 2.2858826860151678, "learning_rate": 2.0473948136596486e-07, "logits/chosen": -1.21229887008667, "logits/rejected": -1.2507023811340332, "logps/chosen": -31.56397247314453, "logps/rejected": -67.93631744384766, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -1.2148863077163696, "rewards/margins": 8.519362449645996, "rewards/rejected": -9.734248161315918, "step": 405 }, { "epoch": 4.811851851851852, "grad_norm": 4.197533331570636, "learning_rate": 2.0346125874119838e-07, "logits/chosen": -1.1015260219573975, "logits/rejected": -1.130777359008789, "logps/chosen": -28.210693359375, "logps/rejected": -64.23535919189453, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": -1.4305483102798462, "rewards/margins": 6.870315074920654, "rewards/rejected": -8.300863265991211, "step": 406 }, { "epoch": 4.823703703703703, "grad_norm": 3.7607706364061055, "learning_rate": 2.0218429515641368e-07, "logits/chosen": -1.361039400100708, "logits/rejected": -1.3556692600250244, "logps/chosen": -21.602588653564453, "logps/rejected": -55.4856071472168, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -0.6024814248085022, "rewards/margins": 6.8341474533081055, "rewards/rejected": -7.436628341674805, "step": 407 }, { "epoch": 4.835555555555556, "grad_norm": 2.961152190191642, "learning_rate": 2.0090862515805895e-07, "logits/chosen": -1.1544498205184937, "logits/rejected": -1.037913203239441, "logps/chosen": -26.241416931152344, "logps/rejected": -55.529850006103516, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -1.1264173984527588, "rewards/margins": 6.888669490814209, "rewards/rejected": -8.015087127685547, "step": 408 }, { "epoch": 4.847407407407408, "grad_norm": 3.800097064071459, "learning_rate": 1.9963428325758613e-07, "logits/chosen": -1.397064447402954, "logits/rejected": -0.9890981316566467, "logps/chosen": -37.16471862792969, "logps/rejected": -63.38473129272461, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -1.5235031843185425, "rewards/margins": 6.591093063354492, "rewards/rejected": -8.114595413208008, "step": 409 }, { "epoch": 4.859259259259259, "grad_norm": 3.6060737188077017, "learning_rate": 1.983613039305173e-07, "logits/chosen": -1.189021348953247, "logits/rejected": -1.1319453716278076, "logps/chosen": -33.33507537841797, "logps/rejected": -58.435791015625, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -0.7115690112113953, "rewards/margins": 6.030098915100098, "rewards/rejected": -6.741668224334717, "step": 410 }, { "epoch": 4.871111111111111, "grad_norm": 3.8852443618244883, "learning_rate": 1.9708972161551213e-07, "logits/chosen": -1.3724188804626465, "logits/rejected": -1.3005732297897339, "logps/chosen": -36.054195404052734, "logps/rejected": -61.728668212890625, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": -1.7809228897094727, "rewards/margins": 5.685511112213135, "rewards/rejected": -7.466434478759766, "step": 411 }, { "epoch": 4.882962962962963, "grad_norm": 3.6521043308854098, "learning_rate": 1.9581957071343588e-07, "logits/chosen": -1.1163625717163086, "logits/rejected": -1.1054105758666992, "logps/chosen": -22.591720581054688, "logps/rejected": -56.60470962524414, "loss": 0.0353, "rewards/accuracies": 1.0, "rewards/chosen": -0.49366194009780884, "rewards/margins": 7.763273239135742, "rewards/rejected": -8.256935119628906, "step": 412 }, { "epoch": 4.894814814814815, "grad_norm": 2.844443421722238, "learning_rate": 1.9455088558642932e-07, "logits/chosen": -0.8721749186515808, "logits/rejected": -1.0191127061843872, "logps/chosen": -22.482418060302734, "logps/rejected": -54.2562370300293, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -0.4827694892883301, "rewards/margins": 6.972842693328857, "rewards/rejected": -7.455611705780029, "step": 413 }, { "epoch": 4.906666666666666, "grad_norm": 3.1150557643187424, "learning_rate": 1.9328370055697832e-07, "logits/chosen": -1.18574857711792, "logits/rejected": -0.9993014931678772, "logps/chosen": -36.91192626953125, "logps/rejected": -46.982643127441406, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -1.5664029121398926, "rewards/margins": 4.784036636352539, "rewards/rejected": -6.35044002532959, "step": 414 }, { "epoch": 4.9185185185185185, "grad_norm": 3.1584595235265462, "learning_rate": 1.9201804990698616e-07, "logits/chosen": -1.2075750827789307, "logits/rejected": -1.027672290802002, "logps/chosen": -30.833179473876953, "logps/rejected": -64.46182250976562, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -1.332330584526062, "rewards/margins": 7.531144142150879, "rewards/rejected": -8.86347484588623, "step": 415 }, { "epoch": 4.930370370370371, "grad_norm": 3.1465761585985264, "learning_rate": 1.907539678768453e-07, "logits/chosen": -1.372673749923706, "logits/rejected": -1.3895915746688843, "logps/chosen": -27.67334747314453, "logps/rejected": -49.52977752685547, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": -0.9234645962715149, "rewards/margins": 4.2349162101745605, "rewards/rejected": -5.15838098526001, "step": 416 }, { "epoch": 4.942222222222222, "grad_norm": 2.42226431528538, "learning_rate": 1.8949148866451152e-07, "logits/chosen": -1.155286431312561, "logits/rejected": -1.319460391998291, "logps/chosen": -25.511240005493164, "logps/rejected": -67.6077651977539, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": -0.60393226146698, "rewards/margins": 7.415879726409912, "rewards/rejected": -8.01981258392334, "step": 417 }, { "epoch": 4.954074074074074, "grad_norm": 3.75114428997573, "learning_rate": 1.8823064642457876e-07, "logits/chosen": -1.5565588474273682, "logits/rejected": -1.2644767761230469, "logps/chosen": -29.305103302001953, "logps/rejected": -61.24723815917969, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": -1.068399429321289, "rewards/margins": 6.726673603057861, "rewards/rejected": -7.79507303237915, "step": 418 }, { "epoch": 4.965925925925926, "grad_norm": 4.195691110710784, "learning_rate": 1.8697147526735466e-07, "logits/chosen": -0.9303781390190125, "logits/rejected": -1.2051749229431152, "logps/chosen": -29.477602005004883, "logps/rejected": -69.74530792236328, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": -1.8630962371826172, "rewards/margins": 7.451139450073242, "rewards/rejected": -9.314236640930176, "step": 419 }, { "epoch": 4.977777777777778, "grad_norm": 3.162433032789351, "learning_rate": 1.8571400925793852e-07, "logits/chosen": -0.7229827642440796, "logits/rejected": -0.685709536075592, "logps/chosen": -24.86301040649414, "logps/rejected": -57.18008041381836, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -0.692309558391571, "rewards/margins": 6.926962375640869, "rewards/rejected": -7.619271755218506, "step": 420 }, { "epoch": 4.989629629629629, "grad_norm": 4.385567117559209, "learning_rate": 1.844582824152988e-07, "logits/chosen": -1.165217638015747, "logits/rejected": -0.9067272543907166, "logps/chosen": -42.31658172607422, "logps/rejected": -66.11439514160156, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -1.9213820695877075, "rewards/margins": 6.796330451965332, "rewards/rejected": -8.71771240234375, "step": 421 }, { "epoch": 5.001481481481481, "grad_norm": 3.4081975873995063, "learning_rate": 1.8320432871135376e-07, "logits/chosen": -1.0193674564361572, "logits/rejected": -0.9758960604667664, "logps/chosen": -28.47673797607422, "logps/rejected": -48.93853759765625, "loss": 0.0343, "rewards/accuracies": 1.0, "rewards/chosen": -0.7848235964775085, "rewards/margins": 4.504067897796631, "rewards/rejected": -5.288891792297363, "step": 422 }, { "epoch": 5.013333333333334, "grad_norm": 2.9484160415611984, "learning_rate": 1.8195218207005136e-07, "logits/chosen": -0.9763575792312622, "logits/rejected": -1.1632423400878906, "logps/chosen": -37.08030319213867, "logps/rejected": -64.4261703491211, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -1.5703556537628174, "rewards/margins": 5.08331298828125, "rewards/rejected": -6.653668403625488, "step": 423 }, { "epoch": 5.025185185185185, "grad_norm": 3.2800699811427507, "learning_rate": 1.8070187636645237e-07, "logits/chosen": -1.2202644348144531, "logits/rejected": -0.9552056193351746, "logps/chosen": -36.58311080932617, "logps/rejected": -59.91649627685547, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": -1.681425929069519, "rewards/margins": 6.159786224365234, "rewards/rejected": -7.841211318969727, "step": 424 }, { "epoch": 5.037037037037037, "grad_norm": 2.3754727914899125, "learning_rate": 1.7945344542581353e-07, "logits/chosen": -1.0853219032287598, "logits/rejected": -0.8817991018295288, "logps/chosen": -33.09999465942383, "logps/rejected": -67.8524169921875, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -1.1994686126708984, "rewards/margins": 7.697717666625977, "rewards/rejected": -8.897185325622559, "step": 425 }, { "epoch": 5.0488888888888885, "grad_norm": 3.033283182538126, "learning_rate": 1.782069230226725e-07, "logits/chosen": -1.003400206565857, "logits/rejected": -1.0244035720825195, "logps/chosen": -24.51974868774414, "logps/rejected": -56.68452835083008, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -0.36259013414382935, "rewards/margins": 7.2209672927856445, "rewards/rejected": -7.583556652069092, "step": 426 }, { "epoch": 5.060740740740741, "grad_norm": 2.82767361060403, "learning_rate": 1.7696234287993413e-07, "logits/chosen": -0.9122418761253357, "logits/rejected": -0.8946365118026733, "logps/chosen": -37.71500015258789, "logps/rejected": -70.05258178710938, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -1.4320430755615234, "rewards/margins": 7.710330486297607, "rewards/rejected": -9.142374038696289, "step": 427 }, { "epoch": 5.072592592592593, "grad_norm": 2.5624017267747266, "learning_rate": 1.7571973866795813e-07, "logits/chosen": -1.1369318962097168, "logits/rejected": -0.8684489727020264, "logps/chosen": -43.49992370605469, "logps/rejected": -59.39040756225586, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -1.4235126972198486, "rewards/margins": 6.148205757141113, "rewards/rejected": -7.571718215942383, "step": 428 }, { "epoch": 5.084444444444444, "grad_norm": 3.295544800188778, "learning_rate": 1.7447914400364833e-07, "logits/chosen": -1.2425228357315063, "logits/rejected": -1.266234040260315, "logps/chosen": -26.035186767578125, "logps/rejected": -49.9383544921875, "loss": 0.0314, "rewards/accuracies": 1.0, "rewards/chosen": -0.4317536950111389, "rewards/margins": 5.771049976348877, "rewards/rejected": -6.202803611755371, "step": 429 }, { "epoch": 5.0962962962962965, "grad_norm": 2.9432919280317735, "learning_rate": 1.7324059244954292e-07, "logits/chosen": -1.421688437461853, "logits/rejected": -1.51046621799469, "logps/chosen": -29.30355453491211, "logps/rejected": -51.56528091430664, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -1.2735320329666138, "rewards/margins": 5.418951988220215, "rewards/rejected": -6.692483425140381, "step": 430 }, { "epoch": 5.108148148148148, "grad_norm": 2.951889299596433, "learning_rate": 1.720041175129066e-07, "logits/chosen": -0.8546741008758545, "logits/rejected": -0.8645275831222534, "logps/chosen": -28.239816665649414, "logps/rejected": -57.45808792114258, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -1.293044924736023, "rewards/margins": 6.973755359649658, "rewards/rejected": -8.266800880432129, "step": 431 }, { "epoch": 5.12, "grad_norm": 2.987267713140231, "learning_rate": 1.7076975264482433e-07, "logits/chosen": -1.0786815881729126, "logits/rejected": -0.9784872531890869, "logps/chosen": -34.05535125732422, "logps/rejected": -56.35095977783203, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -1.0851895809173584, "rewards/margins": 5.206171989440918, "rewards/rejected": -6.291361331939697, "step": 432 }, { "epoch": 5.131851851851851, "grad_norm": 3.143579423533942, "learning_rate": 1.6953753123929595e-07, "logits/chosen": -1.2658149003982544, "logits/rejected": -1.2656917572021484, "logps/chosen": -21.0126895904541, "logps/rejected": -56.76115798950195, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": -0.6274415254592896, "rewards/margins": 7.949789524078369, "rewards/rejected": -8.577230453491211, "step": 433 }, { "epoch": 5.143703703703704, "grad_norm": 3.0660601990113645, "learning_rate": 1.6830748663233303e-07, "logits/chosen": -1.1669660806655884, "logits/rejected": -1.2397806644439697, "logps/chosen": -31.919248580932617, "logps/rejected": -63.46931838989258, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": -1.6147698163986206, "rewards/margins": 6.417461395263672, "rewards/rejected": -8.032230377197266, "step": 434 }, { "epoch": 5.155555555555556, "grad_norm": 2.860051014922371, "learning_rate": 1.6707965210105687e-07, "logits/chosen": -0.5690155029296875, "logits/rejected": -0.7288935780525208, "logps/chosen": -29.347030639648438, "logps/rejected": -70.51150512695312, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": -0.8469069600105286, "rewards/margins": 7.336713790893555, "rewards/rejected": -8.183621406555176, "step": 435 }, { "epoch": 5.167407407407407, "grad_norm": 3.495746895563705, "learning_rate": 1.6585406086279846e-07, "logits/chosen": -0.7923306226730347, "logits/rejected": -0.900981068611145, "logps/chosen": -34.12835693359375, "logps/rejected": -65.13916015625, "loss": 0.028, "rewards/accuracies": 1.0, "rewards/chosen": -1.2421728372573853, "rewards/margins": 5.556445121765137, "rewards/rejected": -6.798617839813232, "step": 436 }, { "epoch": 5.1792592592592595, "grad_norm": 2.773845959525342, "learning_rate": 1.6463074607419942e-07, "logits/chosen": -1.100500464439392, "logits/rejected": -1.0029168128967285, "logps/chosen": -37.937015533447266, "logps/rejected": -46.45364761352539, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": -1.0404378175735474, "rewards/margins": 4.71007776260376, "rewards/rejected": -5.750515460968018, "step": 437 }, { "epoch": 5.191111111111111, "grad_norm": 3.6862802786560187, "learning_rate": 1.6340974083031523e-07, "logits/chosen": -1.1192785501480103, "logits/rejected": -0.712954580783844, "logps/chosen": -39.31867980957031, "logps/rejected": -59.20005798339844, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -1.3697164058685303, "rewards/margins": 7.339575290679932, "rewards/rejected": -8.7092924118042, "step": 438 }, { "epoch": 5.202962962962963, "grad_norm": 2.6145012544108637, "learning_rate": 1.6219107816372024e-07, "logits/chosen": -1.0265417098999023, "logits/rejected": -1.0565710067749023, "logps/chosen": -24.545156478881836, "logps/rejected": -57.86946105957031, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -0.8660164475440979, "rewards/margins": 6.856147289276123, "rewards/rejected": -7.722163677215576, "step": 439 }, { "epoch": 5.214814814814815, "grad_norm": 2.4016912622887774, "learning_rate": 1.6097479104361326e-07, "logits/chosen": -0.8214901685714722, "logits/rejected": -0.4467310905456543, "logps/chosen": -34.378780364990234, "logps/rejected": -54.939334869384766, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -0.4449349641799927, "rewards/margins": 6.770240306854248, "rewards/rejected": -7.215175628662109, "step": 440 }, { "epoch": 5.226666666666667, "grad_norm": 3.1135292579652587, "learning_rate": 1.5976091237492634e-07, "logits/chosen": -0.9876181483268738, "logits/rejected": -0.9957047700881958, "logps/chosen": -38.20779037475586, "logps/rejected": -80.09854888916016, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": -1.8875401020050049, "rewards/margins": 6.763437747955322, "rewards/rejected": -8.650979042053223, "step": 441 }, { "epoch": 5.238518518518519, "grad_norm": 3.663082317807131, "learning_rate": 1.5854947499743413e-07, "logits/chosen": -1.130734920501709, "logits/rejected": -1.0467098951339722, "logps/chosen": -43.32159423828125, "logps/rejected": -72.49787902832031, "loss": 0.0266, "rewards/accuracies": 1.0, "rewards/chosen": -1.9329556226730347, "rewards/margins": 7.5610575675964355, "rewards/rejected": -9.494012832641602, "step": 442 }, { "epoch": 5.25037037037037, "grad_norm": 3.053911267648753, "learning_rate": 1.573405116848656e-07, "logits/chosen": -0.8695877194404602, "logits/rejected": -1.033347487449646, "logps/chosen": -26.546667098999023, "logps/rejected": -55.25644302368164, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -0.794514536857605, "rewards/margins": 5.641097545623779, "rewards/rejected": -6.435612201690674, "step": 443 }, { "epoch": 5.262222222222222, "grad_norm": 2.9630105870365546, "learning_rate": 1.5613405514401757e-07, "logits/chosen": -1.2900482416152954, "logits/rejected": -0.8595216274261475, "logps/chosen": -42.08282470703125, "logps/rejected": -59.17259216308594, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -1.2677626609802246, "rewards/margins": 5.434237480163574, "rewards/rejected": -6.701999664306641, "step": 444 }, { "epoch": 5.274074074074074, "grad_norm": 3.028060174893764, "learning_rate": 1.5493013801386923e-07, "logits/chosen": -0.6489288210868835, "logits/rejected": -0.8385549783706665, "logps/chosen": -35.989559173583984, "logps/rejected": -69.44566345214844, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -1.2387605905532837, "rewards/margins": 7.3805999755859375, "rewards/rejected": -8.61936092376709, "step": 445 }, { "epoch": 5.285925925925926, "grad_norm": 2.9666582681649962, "learning_rate": 1.537287928647002e-07, "logits/chosen": -0.9402166604995728, "logits/rejected": -0.888253927230835, "logps/chosen": -42.73618698120117, "logps/rejected": -75.50381469726562, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -1.3216760158538818, "rewards/margins": 7.429603099822998, "rewards/rejected": -8.7512788772583, "step": 446 }, { "epoch": 5.297777777777778, "grad_norm": 3.0746084836275913, "learning_rate": 1.525300521972082e-07, "logits/chosen": -1.1609197854995728, "logits/rejected": -1.138796091079712, "logps/chosen": -33.730140686035156, "logps/rejected": -58.21106719970703, "loss": 0.0259, "rewards/accuracies": 1.0, "rewards/chosen": -1.451420783996582, "rewards/margins": 6.178999423980713, "rewards/rejected": -7.630419731140137, "step": 447 }, { "epoch": 5.3096296296296295, "grad_norm": 2.065490256316474, "learning_rate": 1.513339484416309e-07, "logits/chosen": -1.325588583946228, "logits/rejected": -1.179955244064331, "logps/chosen": -39.745296478271484, "logps/rejected": -57.22673797607422, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -2.0886282920837402, "rewards/margins": 5.040305137634277, "rewards/rejected": -7.128933429718018, "step": 448 }, { "epoch": 5.321481481481482, "grad_norm": 2.565247253136151, "learning_rate": 1.5014051395686766e-07, "logits/chosen": -1.3147672414779663, "logits/rejected": -1.3600736856460571, "logps/chosen": -28.266178131103516, "logps/rejected": -64.80318450927734, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -1.2247536182403564, "rewards/margins": 8.029702186584473, "rewards/rejected": -9.25445556640625, "step": 449 }, { "epoch": 5.333333333333333, "grad_norm": 1.6866324861080002, "learning_rate": 1.489497810296046e-07, "logits/chosen": -1.4508033990859985, "logits/rejected": -1.4773669242858887, "logps/chosen": -37.937896728515625, "logps/rejected": -70.61952209472656, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": -2.2039074897766113, "rewards/margins": 6.989311218261719, "rewards/rejected": -9.193219184875488, "step": 450 }, { "epoch": 5.345185185185185, "grad_norm": 2.9199939457346016, "learning_rate": 1.4776178187344105e-07, "logits/chosen": -0.642178475856781, "logits/rejected": -0.5968121290206909, "logps/chosen": -33.17547607421875, "logps/rejected": -73.9720458984375, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -0.686303973197937, "rewards/margins": 7.578001499176025, "rewards/rejected": -8.264305114746094, "step": 451 }, { "epoch": 5.357037037037037, "grad_norm": 2.677154410735394, "learning_rate": 1.4657654862801797e-07, "logits/chosen": -1.3107982873916626, "logits/rejected": -1.0267812013626099, "logps/chosen": -33.44599914550781, "logps/rejected": -51.6804084777832, "loss": 0.0202, "rewards/accuracies": 1.0, "rewards/chosen": -0.964032769203186, "rewards/margins": 5.850228309631348, "rewards/rejected": -6.814260482788086, "step": 452 }, { "epoch": 5.368888888888889, "grad_norm": 3.006190765683554, "learning_rate": 1.4539411335814866e-07, "logits/chosen": -0.8469028472900391, "logits/rejected": -0.8455516695976257, "logps/chosen": -34.55913543701172, "logps/rejected": -65.2293701171875, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -1.0158193111419678, "rewards/margins": 6.680525302886963, "rewards/rejected": -7.696345329284668, "step": 453 }, { "epoch": 5.380740740740741, "grad_norm": 2.903509151195431, "learning_rate": 1.4421450805295082e-07, "logits/chosen": -1.259004831314087, "logits/rejected": -1.0897257328033447, "logps/chosen": -34.40946960449219, "logps/rejected": -54.516990661621094, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -2.26055908203125, "rewards/margins": 4.923694610595703, "rewards/rejected": -7.184253215789795, "step": 454 }, { "epoch": 5.392592592592592, "grad_norm": 2.4956212068455947, "learning_rate": 1.4303776462498186e-07, "logits/chosen": -1.470017910003662, "logits/rejected": -1.487809181213379, "logps/chosen": -22.259977340698242, "logps/rejected": -61.082489013671875, "loss": 0.023, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6950967311859131, "rewards/margins": 7.098940849304199, "rewards/rejected": -7.794036865234375, "step": 455 }, { "epoch": 5.404444444444445, "grad_norm": 2.1074775036840228, "learning_rate": 1.418639149093748e-07, "logits/chosen": -1.1679542064666748, "logits/rejected": -1.185795783996582, "logps/chosen": -31.977489471435547, "logps/rejected": -60.19514465332031, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -1.5915217399597168, "rewards/margins": 6.129962921142578, "rewards/rejected": -7.721484184265137, "step": 456 }, { "epoch": 5.416296296296296, "grad_norm": 2.540463046994792, "learning_rate": 1.406929906629774e-07, "logits/chosen": -1.0157017707824707, "logits/rejected": -0.8678162693977356, "logps/chosen": -28.734365463256836, "logps/rejected": -61.63572311401367, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -1.1730878353118896, "rewards/margins": 6.156033992767334, "rewards/rejected": -7.329122066497803, "step": 457 }, { "epoch": 5.428148148148148, "grad_norm": 2.5036228156389035, "learning_rate": 1.3952502356349323e-07, "logits/chosen": -0.8575838804244995, "logits/rejected": -0.8398576378822327, "logps/chosen": -25.19296646118164, "logps/rejected": -50.973854064941406, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -0.8319410085678101, "rewards/margins": 5.684410095214844, "rewards/rejected": -6.516350746154785, "step": 458 }, { "epoch": 5.44, "grad_norm": 2.241821085085757, "learning_rate": 1.38360045208624e-07, "logits/chosen": -1.3081655502319336, "logits/rejected": -1.1967523097991943, "logps/chosen": -27.412466049194336, "logps/rejected": -59.24693298339844, "loss": 0.0166, "rewards/accuracies": 1.0, "rewards/chosen": -0.4743008017539978, "rewards/margins": 6.069201469421387, "rewards/rejected": -6.543503284454346, "step": 459 }, { "epoch": 5.451851851851852, "grad_norm": 2.8367212448316637, "learning_rate": 1.371980871152157e-07, "logits/chosen": -1.092638611793518, "logits/rejected": -1.0296908617019653, "logps/chosen": -40.45366668701172, "logps/rejected": -66.97181701660156, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -1.5610663890838623, "rewards/margins": 6.595659255981445, "rewards/rejected": -8.156725883483887, "step": 460 }, { "epoch": 5.463703703703704, "grad_norm": 2.137948549440119, "learning_rate": 1.3603918071840486e-07, "logits/chosen": -1.4155157804489136, "logits/rejected": -1.4184473752975464, "logps/chosen": -27.53375244140625, "logps/rejected": -57.63071060180664, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -0.5443230867385864, "rewards/margins": 6.214169502258301, "rewards/rejected": -6.7584919929504395, "step": 461 }, { "epoch": 5.475555555555555, "grad_norm": 3.172618852002527, "learning_rate": 1.3488335737076911e-07, "logits/chosen": -0.9026474356651306, "logits/rejected": -1.121160864830017, "logps/chosen": -27.04289436340332, "logps/rejected": -65.71441650390625, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -1.3873372077941895, "rewards/margins": 7.394488334655762, "rewards/rejected": -8.78182601928711, "step": 462 }, { "epoch": 5.4874074074074075, "grad_norm": 2.932474050290164, "learning_rate": 1.3373064834147817e-07, "logits/chosen": -1.1298450231552124, "logits/rejected": -1.2291852235794067, "logps/chosen": -27.613210678100586, "logps/rejected": -49.24211502075195, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -0.7571765184402466, "rewards/margins": 5.5813889503479, "rewards/rejected": -6.338565826416016, "step": 463 }, { "epoch": 5.499259259259259, "grad_norm": 1.9766796560537938, "learning_rate": 1.3258108481544847e-07, "logits/chosen": -1.0279194116592407, "logits/rejected": -1.0094687938690186, "logps/chosen": -28.237319946289062, "logps/rejected": -59.598148345947266, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -1.1605604887008667, "rewards/margins": 7.338843822479248, "rewards/rejected": -8.499403953552246, "step": 464 }, { "epoch": 5.511111111111111, "grad_norm": 2.6137685278696936, "learning_rate": 1.314346978924994e-07, "logits/chosen": -1.3097034692764282, "logits/rejected": -1.148521900177002, "logps/chosen": -32.19894027709961, "logps/rejected": -53.88103485107422, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": -1.4137828350067139, "rewards/margins": 6.188781261444092, "rewards/rejected": -7.602563858032227, "step": 465 }, { "epoch": 5.522962962962963, "grad_norm": 3.107119868952029, "learning_rate": 1.3029151858651143e-07, "logits/chosen": -1.031536340713501, "logits/rejected": -0.8185826539993286, "logps/chosen": -32.56446075439453, "logps/rejected": -59.30012512207031, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -1.0495141744613647, "rewards/margins": 6.904279708862305, "rewards/rejected": -7.953794479370117, "step": 466 }, { "epoch": 5.534814814814815, "grad_norm": 2.7144216036026543, "learning_rate": 1.2915157782458802e-07, "logits/chosen": -1.1127973794937134, "logits/rejected": -1.0301717519760132, "logps/chosen": -37.235652923583984, "logps/rejected": -61.32948303222656, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": -1.7692995071411133, "rewards/margins": 6.307473659515381, "rewards/rejected": -8.076772689819336, "step": 467 }, { "epoch": 5.546666666666667, "grad_norm": 2.2942472784287875, "learning_rate": 1.2801490644621788e-07, "logits/chosen": -1.2554562091827393, "logits/rejected": -1.1846646070480347, "logps/chosen": -26.260223388671875, "logps/rejected": -48.008331298828125, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": -1.0319814682006836, "rewards/margins": 5.547323703765869, "rewards/rejected": -6.5793046951293945, "step": 468 }, { "epoch": 5.558518518518518, "grad_norm": 2.853816124944289, "learning_rate": 1.268815352024416e-07, "logits/chosen": -1.1085361242294312, "logits/rejected": -1.1656684875488281, "logps/chosen": -25.87590789794922, "logps/rejected": -59.65959930419922, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": -1.1430463790893555, "rewards/margins": 7.823276519775391, "rewards/rejected": -8.966323852539062, "step": 469 }, { "epoch": 5.57037037037037, "grad_norm": 2.5887796053682663, "learning_rate": 1.257514947550189e-07, "logits/chosen": -1.5515766143798828, "logits/rejected": -1.3911482095718384, "logps/chosen": -34.41626739501953, "logps/rejected": -56.96218490600586, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -1.6425933837890625, "rewards/margins": 5.508365154266357, "rewards/rejected": -7.150958061218262, "step": 470 }, { "epoch": 5.582222222222223, "grad_norm": 2.4882096910217877, "learning_rate": 1.2462481567559966e-07, "logits/chosen": -0.8641619086265564, "logits/rejected": -0.9501523971557617, "logps/chosen": -24.07305145263672, "logps/rejected": -56.903045654296875, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -0.9779921174049377, "rewards/margins": 6.467419624328613, "rewards/rejected": -7.445411682128906, "step": 471 }, { "epoch": 5.594074074074074, "grad_norm": 2.7668478038500153, "learning_rate": 1.2350152844489688e-07, "logits/chosen": -1.0938386917114258, "logits/rejected": -1.007843255996704, "logps/chosen": -36.72554016113281, "logps/rejected": -72.5058364868164, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -1.4429738521575928, "rewards/margins": 8.357629776000977, "rewards/rejected": -9.800602912902832, "step": 472 }, { "epoch": 5.605925925925926, "grad_norm": 2.987152924412883, "learning_rate": 1.2238166345186152e-07, "logits/chosen": -1.024975299835205, "logits/rejected": -1.231426477432251, "logps/chosen": -29.665260314941406, "logps/rejected": -77.71907043457031, "loss": 0.0253, "rewards/accuracies": 1.0, "rewards/chosen": -2.0892112255096436, "rewards/margins": 8.2877779006958, "rewards/rejected": -10.376989364624023, "step": 473 }, { "epoch": 5.6177777777777775, "grad_norm": 3.79148887390576, "learning_rate": 1.2126525099286108e-07, "logits/chosen": -0.9098807573318481, "logits/rejected": -0.6766495108604431, "logps/chosen": -34.021461486816406, "logps/rejected": -60.278900146484375, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": -1.586209774017334, "rewards/margins": 6.309510231018066, "rewards/rejected": -7.895719528198242, "step": 474 }, { "epoch": 5.62962962962963, "grad_norm": 2.790675449028797, "learning_rate": 1.201523212708593e-07, "logits/chosen": -1.2367568016052246, "logits/rejected": -1.0846775770187378, "logps/chosen": -32.69294738769531, "logps/rejected": -56.54222869873047, "loss": 0.0236, "rewards/accuracies": 1.0, "rewards/chosen": -1.708235263824463, "rewards/margins": 5.9487504959106445, "rewards/rejected": -7.656986236572266, "step": 475 }, { "epoch": 5.641481481481481, "grad_norm": 2.603743326248802, "learning_rate": 1.1904290439459971e-07, "logits/chosen": -1.2075260877609253, "logits/rejected": -1.1212043762207031, "logps/chosen": -32.18739318847656, "logps/rejected": -58.44438171386719, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -1.459539771080017, "rewards/margins": 7.072033882141113, "rewards/rejected": -8.531574249267578, "step": 476 }, { "epoch": 5.653333333333333, "grad_norm": 2.9504541803133684, "learning_rate": 1.1793703037779055e-07, "logits/chosen": -1.1014938354492188, "logits/rejected": -1.2106781005859375, "logps/chosen": -26.133529663085938, "logps/rejected": -69.6479721069336, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -0.8243754506111145, "rewards/margins": 8.763911247253418, "rewards/rejected": -9.588286399841309, "step": 477 }, { "epoch": 5.6651851851851855, "grad_norm": 2.6441222916008504, "learning_rate": 1.1683472913829284e-07, "logits/chosen": -1.0774693489074707, "logits/rejected": -1.1322718858718872, "logps/chosen": -31.37884521484375, "logps/rejected": -64.80766296386719, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -1.6216317415237427, "rewards/margins": 6.900985240936279, "rewards/rejected": -8.52261734008789, "step": 478 }, { "epoch": 5.677037037037037, "grad_norm": 4.661580579547073, "learning_rate": 1.1573603049731153e-07, "logits/chosen": -1.20614755153656, "logits/rejected": -0.8982871174812317, "logps/chosen": -48.37188720703125, "logps/rejected": -61.23235321044922, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": -1.8891024589538574, "rewards/margins": 6.148949146270752, "rewards/rejected": -8.038052558898926, "step": 479 }, { "epoch": 5.688888888888889, "grad_norm": 2.2006957188380785, "learning_rate": 1.146409641785882e-07, "logits/chosen": -1.3002790212631226, "logits/rejected": -1.0651378631591797, "logps/chosen": -25.08184051513672, "logps/rejected": -49.062400817871094, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -0.5322498679161072, "rewards/margins": 6.5421624183654785, "rewards/rejected": -7.074413299560547, "step": 480 }, { "epoch": 5.70074074074074, "grad_norm": 2.6580827536506346, "learning_rate": 1.1354955980759689e-07, "logits/chosen": -1.0489715337753296, "logits/rejected": -1.1641223430633545, "logps/chosen": -34.531166076660156, "logps/rejected": -67.30549621582031, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -1.0368874073028564, "rewards/margins": 7.102381229400635, "rewards/rejected": -8.13926887512207, "step": 481 }, { "epoch": 5.712592592592593, "grad_norm": 3.0113174027880363, "learning_rate": 1.1246184691074314e-07, "logits/chosen": -1.0164622068405151, "logits/rejected": -0.9794116616249084, "logps/chosen": -31.135639190673828, "logps/rejected": -64.71720886230469, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -1.5428776741027832, "rewards/margins": 6.710593223571777, "rewards/rejected": -8.253470420837402, "step": 482 }, { "epoch": 5.724444444444444, "grad_norm": 2.615616578526652, "learning_rate": 1.1137785491456453e-07, "logits/chosen": -1.1091269254684448, "logits/rejected": -0.7848995327949524, "logps/chosen": -31.23629379272461, "logps/rejected": -50.40385437011719, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -1.2773419618606567, "rewards/margins": 4.750949382781982, "rewards/rejected": -6.02829122543335, "step": 483 }, { "epoch": 5.736296296296296, "grad_norm": 2.188593697679317, "learning_rate": 1.1029761314493518e-07, "logits/chosen": -1.2400413751602173, "logits/rejected": -1.1589823961257935, "logps/chosen": -35.95014953613281, "logps/rejected": -68.69471740722656, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": -1.5209312438964844, "rewards/margins": 7.694516658782959, "rewards/rejected": -9.215447425842285, "step": 484 }, { "epoch": 5.7481481481481485, "grad_norm": 3.5115952525737826, "learning_rate": 1.0922115082627196e-07, "logits/chosen": -1.3057835102081299, "logits/rejected": -1.158517599105835, "logps/chosen": -35.280059814453125, "logps/rejected": -68.58432006835938, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -1.7615442276000977, "rewards/margins": 7.462545871734619, "rewards/rejected": -9.224090576171875, "step": 485 }, { "epoch": 5.76, "grad_norm": 2.1510413328277123, "learning_rate": 1.0814849708074414e-07, "logits/chosen": -1.316745400428772, "logits/rejected": -1.0071841478347778, "logps/chosen": -30.68706703186035, "logps/rejected": -59.9561882019043, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": -0.7682050466537476, "rewards/margins": 7.350435256958008, "rewards/rejected": -8.118640899658203, "step": 486 }, { "epoch": 5.771851851851852, "grad_norm": 3.181914233461486, "learning_rate": 1.070796809274853e-07, "logits/chosen": -1.1496213674545288, "logits/rejected": -1.1401625871658325, "logps/chosen": -28.064807891845703, "logps/rejected": -66.26860046386719, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": -1.7950414419174194, "rewards/margins": 8.238448143005371, "rewards/rejected": -10.033490180969238, "step": 487 }, { "epoch": 5.783703703703703, "grad_norm": 2.5436639084646266, "learning_rate": 1.0601473128180854e-07, "logits/chosen": -1.0235388278961182, "logits/rejected": -0.7555651068687439, "logps/chosen": -37.206642150878906, "logps/rejected": -62.38896942138672, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -1.0393364429473877, "rewards/margins": 7.206859111785889, "rewards/rejected": -8.246195793151855, "step": 488 }, { "epoch": 5.795555555555556, "grad_norm": 1.9218672278778193, "learning_rate": 1.0495367695442392e-07, "logits/chosen": -1.169034719467163, "logits/rejected": -1.2706691026687622, "logps/chosen": -27.194473266601562, "logps/rejected": -62.6011962890625, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -1.8176056146621704, "rewards/margins": 6.933694362640381, "rewards/rejected": -8.751298904418945, "step": 489 }, { "epoch": 5.807407407407408, "grad_norm": 1.9977551486881404, "learning_rate": 1.0389654665065908e-07, "logits/chosen": -1.3202329874038696, "logits/rejected": -1.480948567390442, "logps/chosen": -28.038394927978516, "logps/rejected": -61.36920166015625, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -0.7177014946937561, "rewards/margins": 7.891351222991943, "rewards/rejected": -8.609053611755371, "step": 490 }, { "epoch": 5.819259259259259, "grad_norm": 2.250794182792882, "learning_rate": 1.0284336896968304e-07, "logits/chosen": -1.0238221883773804, "logits/rejected": -1.111476182937622, "logps/chosen": -32.318992614746094, "logps/rejected": -81.92007446289062, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -1.8146699666976929, "rewards/margins": 8.974444389343262, "rewards/rejected": -10.78911304473877, "step": 491 }, { "epoch": 5.831111111111111, "grad_norm": 3.0522503366900064, "learning_rate": 1.0179417240373182e-07, "logits/chosen": -1.0641810894012451, "logits/rejected": -1.1482521295547485, "logps/chosen": -27.489585876464844, "logps/rejected": -64.82382202148438, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.3054454326629639, "rewards/margins": 7.11977481842041, "rewards/rejected": -8.425220489501953, "step": 492 }, { "epoch": 5.842962962962963, "grad_norm": 2.3010920457773105, "learning_rate": 1.0074898533733833e-07, "logits/chosen": -1.274016261100769, "logits/rejected": -1.1447503566741943, "logps/chosen": -36.9109001159668, "logps/rejected": -71.91200256347656, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": -1.5965338945388794, "rewards/margins": 7.742778778076172, "rewards/rejected": -9.339312553405762, "step": 493 }, { "epoch": 5.854814814814815, "grad_norm": 2.8527471395619894, "learning_rate": 9.970783604656383e-08, "logits/chosen": -1.2319526672363281, "logits/rejected": -1.268723487854004, "logps/chosen": -24.263778686523438, "logps/rejected": -49.15123748779297, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -1.2518236637115479, "rewards/margins": 5.670311450958252, "rewards/rejected": -6.922135353088379, "step": 494 }, { "epoch": 5.866666666666667, "grad_norm": 3.7257951905167124, "learning_rate": 9.867075269823353e-08, "logits/chosen": -0.89796382188797, "logits/rejected": -0.7868020534515381, "logps/chosen": -33.333683013916016, "logps/rejected": -56.5141487121582, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": -0.8444391489028931, "rewards/margins": 6.135402202606201, "rewards/rejected": -6.9798407554626465, "step": 495 }, { "epoch": 5.8785185185185185, "grad_norm": 2.375314146679173, "learning_rate": 9.763776334917398e-08, "logits/chosen": -1.2864789962768555, "logits/rejected": -1.2498905658721924, "logps/chosen": -28.753864288330078, "logps/rejected": -63.23728942871094, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -1.3437196016311646, "rewards/margins": 7.696622371673584, "rewards/rejected": -9.040342330932617, "step": 496 }, { "epoch": 5.890370370370371, "grad_norm": 2.3165745968499927, "learning_rate": 9.660889594545469e-08, "logits/chosen": -0.8466818928718567, "logits/rejected": -0.88641756772995, "logps/chosen": -31.070556640625, "logps/rejected": -72.79938507080078, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -1.4867016077041626, "rewards/margins": 7.354016304016113, "rewards/rejected": -8.840718269348145, "step": 497 }, { "epoch": 5.902222222222222, "grad_norm": 3.081071543156241, "learning_rate": 9.558417832163162e-08, "logits/chosen": -1.1182941198349, "logits/rejected": -1.2115066051483154, "logps/chosen": -32.31890869140625, "logps/rejected": -61.95922088623047, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -1.6548502445220947, "rewards/margins": 6.778643608093262, "rewards/rejected": -8.433493614196777, "step": 498 }, { "epoch": 5.914074074074074, "grad_norm": 3.3156804186603317, "learning_rate": 9.456363819999419e-08, "logits/chosen": -1.1249277591705322, "logits/rejected": -1.197086215019226, "logps/chosen": -29.88912582397461, "logps/rejected": -65.54908752441406, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -1.750339388847351, "rewards/margins": 6.563896179199219, "rewards/rejected": -8.31423568725586, "step": 499 }, { "epoch": 5.925925925925926, "grad_norm": 3.160346367183824, "learning_rate": 9.354730318981561e-08, "logits/chosen": -1.2213014364242554, "logits/rejected": -0.8933899402618408, "logps/chosen": -29.600210189819336, "logps/rejected": -57.81951904296875, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -1.3094991445541382, "rewards/margins": 6.17173433303833, "rewards/rejected": -7.481233596801758, "step": 500 } ], "logging_steps": 1, "max_steps": 672, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }