{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 1.1784705641049245, "learning_rate": 5.208333333333333e-08, "logits/chosen": -1.4463237524032593, "logits/rejected": -1.4477096796035767, "logps/chosen": -7.9823808670043945, "logps/rejected": -8.155555725097656, "loss": 1.798, "rewards/accuracies": 0.5, "rewards/chosen": -7.9823808670043945, "rewards/margins": 0.1731749176979065, "rewards/rejected": -8.155555725097656, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 0.9602764459933493, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.4451031684875488, "logits/rejected": -1.436326265335083, "logps/chosen": -8.046354293823242, "logps/rejected": -7.959715843200684, "loss": 1.7982, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.046354293823242, "rewards/margins": -0.0866377204656601, "rewards/rejected": -7.959715843200684, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 1.689559474976433, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -1.448919415473938, "logits/rejected": -1.444911003112793, "logps/chosen": -7.851029872894287, "logps/rejected": -7.866568565368652, "loss": 1.789, "rewards/accuracies": 0.5, "rewards/chosen": -7.851029872894287, "rewards/margins": 0.015538657084107399, "rewards/rejected": -7.866568565368652, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 1.4544738071780356, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.4363288879394531, "logits/rejected": -1.4409823417663574, "logps/chosen": -8.180266380310059, "logps/rejected": -8.204001426696777, "loss": 1.8048, "rewards/accuracies": 0.5, "rewards/chosen": -8.180266380310059, "rewards/margins": 0.023736288771033287, "rewards/rejected": -8.204001426696777, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 1.304458351649718, "learning_rate": 2.604166666666667e-07, "logits/chosen": -1.475970983505249, "logits/rejected": -1.4659450054168701, "logps/chosen": -8.084822654724121, "logps/rejected": -7.987278938293457, "loss": 1.8124, "rewards/accuracies": 0.4375, "rewards/chosen": -8.084822654724121, "rewards/margins": -0.09754323959350586, "rewards/rejected": -7.987278938293457, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 1.2140086077938599, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.4456737041473389, "logits/rejected": -1.4336217641830444, "logps/chosen": -7.773144721984863, "logps/rejected": -7.694056510925293, "loss": 1.8017, "rewards/accuracies": 0.46875, "rewards/chosen": -7.773144721984863, "rewards/margins": -0.0790884867310524, "rewards/rejected": -7.694056510925293, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 0.9984160768782803, "learning_rate": 3.645833333333333e-07, "logits/chosen": -1.4582607746124268, "logits/rejected": -1.4302797317504883, "logps/chosen": -7.980234622955322, "logps/rejected": -7.845289707183838, "loss": 1.7982, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -7.980234622955322, "rewards/margins": -0.13494457304477692, "rewards/rejected": -7.845289707183838, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 0.8920926404480951, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.454561471939087, "logits/rejected": -1.4629584550857544, "logps/chosen": -7.85833740234375, "logps/rejected": -8.055027961730957, "loss": 1.8008, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -7.85833740234375, "rewards/margins": 0.19669005274772644, "rewards/rejected": -8.055027961730957, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 1.0898338388611548, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -1.4877290725708008, "logits/rejected": -1.485259771347046, "logps/chosen": -7.855520725250244, "logps/rejected": -7.64023494720459, "loss": 1.7987, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -7.855520725250244, "rewards/margins": -0.21528606116771698, "rewards/rejected": -7.64023494720459, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 1.166801443928394, "learning_rate": 4.999731868769026e-07, "logits/chosen": -1.477698564529419, "logits/rejected": -1.4901760816574097, "logps/chosen": -8.011387825012207, "logps/rejected": -8.05154037475586, "loss": 1.8065, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -8.011387825012207, "rewards/margins": 0.040151696652173996, "rewards/rejected": -8.05154037475586, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 0.9751544943818836, "learning_rate": 4.996716052911017e-07, "logits/chosen": -1.48259699344635, "logits/rejected": -1.4775769710540771, "logps/chosen": -7.901836395263672, "logps/rejected": -7.928590297698975, "loss": 1.8086, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -7.901836395263672, "rewards/margins": 0.0267526563256979, "rewards/rejected": -7.928590297698975, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 1.453860208239476, "learning_rate": 4.990353313429303e-07, "logits/chosen": -1.4845072031021118, "logits/rejected": -1.495591402053833, "logps/chosen": -7.9037041664123535, "logps/rejected": -8.206459045410156, "loss": 1.7865, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -7.9037041664123535, "rewards/margins": 0.30275601148605347, "rewards/rejected": -8.206459045410156, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 1.734106310553517, "learning_rate": 4.980652179769217e-07, "logits/chosen": -1.4411917924880981, "logits/rejected": -1.4547481536865234, "logps/chosen": -7.689724922180176, "logps/rejected": -8.240130424499512, "loss": 1.7898, "rewards/accuracies": 0.59375, "rewards/chosen": -7.689724922180176, "rewards/margins": 0.5504060387611389, "rewards/rejected": -8.240130424499512, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 1.8736880005786551, "learning_rate": 4.967625656594781e-07, "logits/chosen": -1.430119514465332, "logits/rejected": -1.420650839805603, "logps/chosen": -7.873298645019531, "logps/rejected": -7.9891533851623535, "loss": 1.7922, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -7.873298645019531, "rewards/margins": 0.11585396528244019, "rewards/rejected": -7.9891533851623535, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 1.3664496432393678, "learning_rate": 4.951291206355559e-07, "logits/chosen": -1.4261605739593506, "logits/rejected": -1.4281210899353027, "logps/chosen": -7.770443916320801, "logps/rejected": -7.860281467437744, "loss": 1.7858, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -7.770443916320801, "rewards/margins": 0.08983758091926575, "rewards/rejected": -7.860281467437744, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 1.6834122106300342, "learning_rate": 4.93167072587771e-07, "logits/chosen": -1.4214476346969604, "logits/rejected": -1.4338067770004272, "logps/chosen": -7.932246208190918, "logps/rejected": -8.00547981262207, "loss": 1.7858, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -7.932246208190918, "rewards/margins": 0.07323212176561356, "rewards/rejected": -8.00547981262207, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 1.2997280215141844, "learning_rate": 4.908790517010636e-07, "logits/chosen": -1.435379981994629, "logits/rejected": -1.4292837381362915, "logps/chosen": -8.106579780578613, "logps/rejected": -8.394915580749512, "loss": 1.7975, "rewards/accuracies": 0.5, "rewards/chosen": -8.106579780578613, "rewards/margins": 0.2883368134498596, "rewards/rejected": -8.394915580749512, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 1.4395629403420762, "learning_rate": 4.882681251368548e-07, "logits/chosen": -1.405975341796875, "logits/rejected": -1.4189186096191406, "logps/chosen": -8.203514099121094, "logps/rejected": -8.477225303649902, "loss": 1.7939, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -8.203514099121094, "rewards/margins": 0.27371111512184143, "rewards/rejected": -8.477225303649902, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 1.4670380498675017, "learning_rate": 4.853377929214243e-07, "logits/chosen": -1.4155464172363281, "logits/rejected": -1.408503532409668, "logps/chosen": -8.385589599609375, "logps/rejected": -8.298436164855957, "loss": 1.7993, "rewards/accuracies": 0.4375, "rewards/chosen": -8.385589599609375, "rewards/margins": -0.08715416491031647, "rewards/rejected": -8.298436164855957, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 1.8173814697081554, "learning_rate": 4.820919832540181e-07, "logits/chosen": -1.4453513622283936, "logits/rejected": -1.4449245929718018, "logps/chosen": -8.229129791259766, "logps/rejected": -8.375448226928711, "loss": 1.7935, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -8.229129791259766, "rewards/margins": 0.1463182270526886, "rewards/rejected": -8.375448226928711, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 1.5123285049323918, "learning_rate": 4.785350472409791e-07, "logits/chosen": -1.417307734489441, "logits/rejected": -1.4141901731491089, "logps/chosen": -7.822029113769531, "logps/rejected": -8.184598922729492, "loss": 1.7821, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.822029113769531, "rewards/margins": 0.3625703752040863, "rewards/rejected": -8.184598922729492, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 1.6377413056182937, "learning_rate": 4.7467175306295647e-07, "logits/chosen": -1.4389078617095947, "logits/rejected": -1.4492288827896118, "logps/chosen": -8.023492813110352, "logps/rejected": -8.443803787231445, "loss": 1.7872, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.023492813110352, "rewards/margins": 0.4203101694583893, "rewards/rejected": -8.443803787231445, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 98.6962749407612, "learning_rate": 4.70507279583015e-07, "logits/chosen": -1.370867133140564, "logits/rejected": -1.3796217441558838, "logps/chosen": -7.94110107421875, "logps/rejected": -8.37704086303711, "loss": 1.7801, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -7.94110107421875, "rewards/margins": 0.4359405040740967, "rewards/rejected": -8.37704086303711, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 1.8443926963030073, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -1.3745276927947998, "logits/rejected": -1.3777697086334229, "logps/chosen": -8.194904327392578, "logps/rejected": -8.444576263427734, "loss": 1.7749, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -8.194904327392578, "rewards/margins": 0.24967141449451447, "rewards/rejected": -8.444576263427734, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 2.195114945918261, "learning_rate": 4.612975213859487e-07, "logits/chosen": -1.4081896543502808, "logits/rejected": -1.4049599170684814, "logps/chosen": -8.501805305480957, "logps/rejected": -8.36412525177002, "loss": 1.7924, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -8.501805305480957, "rewards/margins": -0.13768072426319122, "rewards/rejected": -8.36412525177002, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 2.0830951299288785, "learning_rate": 4.5626458262912735e-07, "logits/chosen": -1.3827464580535889, "logits/rejected": -1.396815299987793, "logps/chosen": -8.410286903381348, "logps/rejected": -8.63615608215332, "loss": 1.7843, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.410286903381348, "rewards/margins": 0.225869819521904, "rewards/rejected": -8.63615608215332, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 2.8119817250060923, "learning_rate": 4.5095513994085974e-07, "logits/chosen": -1.4122145175933838, "logits/rejected": -1.4246305227279663, "logps/chosen": -8.588987350463867, "logps/rejected": -9.209198951721191, "loss": 1.7873, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.588987350463867, "rewards/margins": 0.6202121376991272, "rewards/rejected": -9.209198951721191, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 2.927322308508806, "learning_rate": 4.453763107901675e-07, "logits/chosen": -1.435681700706482, "logits/rejected": -1.439152479171753, "logps/chosen": -8.261212348937988, "logps/rejected": -8.506053924560547, "loss": 1.7771, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -8.261212348937988, "rewards/margins": 0.2448415756225586, "rewards/rejected": -8.506053924560547, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 1.7606419766373003, "learning_rate": 4.395355737667985e-07, "logits/chosen": -1.4473952054977417, "logits/rejected": -1.4762256145477295, "logps/chosen": -8.1802396774292, "logps/rejected": -8.576211929321289, "loss": 1.7748, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -8.1802396774292, "rewards/margins": 0.3959727883338928, "rewards/rejected": -8.576211929321289, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 2.502338791234934, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -1.4698665142059326, "logits/rejected": -1.475287914276123, "logps/chosen": -8.098420143127441, "logps/rejected": -8.450376510620117, "loss": 1.7739, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -8.098420143127441, "rewards/margins": 0.3519565463066101, "rewards/rejected": -8.450376510620117, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 2.2537748410992924, "learning_rate": 4.271000354423425e-07, "logits/chosen": -1.459938645362854, "logits/rejected": -1.4374125003814697, "logps/chosen": -8.157195091247559, "logps/rejected": -8.490175247192383, "loss": 1.7767, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.157195091247559, "rewards/margins": 0.3329797685146332, "rewards/rejected": -8.490175247192383, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 2.9160012212300086, "learning_rate": 4.2052190435769554e-07, "logits/chosen": -1.447819471359253, "logits/rejected": -1.4601446390151978, "logps/chosen": -8.642538070678711, "logps/rejected": -8.85395336151123, "loss": 1.7908, "rewards/accuracies": 0.5625, "rewards/chosen": -8.642538070678711, "rewards/margins": 0.2114148586988449, "rewards/rejected": -8.85395336151123, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 2.332693331124937, "learning_rate": 4.137151834863213e-07, "logits/chosen": -1.4943044185638428, "logits/rejected": -1.4839502573013306, "logps/chosen": -8.424747467041016, "logps/rejected": -8.770976066589355, "loss": 1.7765, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -8.424747467041016, "rewards/margins": 0.3462288975715637, "rewards/rejected": -8.770976066589355, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 2.4152194824308517, "learning_rate": 4.0668899744407567e-07, "logits/chosen": -1.4917938709259033, "logits/rejected": -1.4810343980789185, "logps/chosen": -8.424253463745117, "logps/rejected": -8.7727632522583, "loss": 1.7856, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.424253463745117, "rewards/margins": 0.34850937128067017, "rewards/rejected": -8.7727632522583, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 2.653215706659519, "learning_rate": 3.994527650465352e-07, "logits/chosen": -1.5167206525802612, "logits/rejected": -1.5095903873443604, "logps/chosen": -8.30429458618164, "logps/rejected": -8.76606559753418, "loss": 1.7754, "rewards/accuracies": 0.5625, "rewards/chosen": -8.30429458618164, "rewards/margins": 0.46177130937576294, "rewards/rejected": -8.76606559753418, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 3.002251381640332, "learning_rate": 3.920161866827889e-07, "logits/chosen": -1.5167474746704102, "logits/rejected": -1.524743914604187, "logps/chosen": -8.360708236694336, "logps/rejected": -8.7758207321167, "loss": 1.7796, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.360708236694336, "rewards/margins": 0.41511401534080505, "rewards/rejected": -8.7758207321167, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 2.621525982629558, "learning_rate": 3.8438923131177237e-07, "logits/chosen": -1.5086079835891724, "logits/rejected": -1.5038981437683105, "logps/chosen": -8.088265419006348, "logps/rejected": -8.633108139038086, "loss": 1.7613, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.088265419006348, "rewards/margins": 0.5448437929153442, "rewards/rejected": -8.633108139038086, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 2.7052866783401983, "learning_rate": 3.765821230985757e-07, "logits/chosen": -1.530807375907898, "logits/rejected": -1.5209619998931885, "logps/chosen": -8.019875526428223, "logps/rejected": -8.384284973144531, "loss": 1.7721, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -8.019875526428223, "rewards/margins": 0.3644091784954071, "rewards/rejected": -8.384284973144531, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 3.1926636795520884, "learning_rate": 3.6860532770864005e-07, "logits/chosen": -1.4885269403457642, "logits/rejected": -1.483224630355835, "logps/chosen": -8.213752746582031, "logps/rejected": -8.523615837097168, "loss": 1.7799, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.213752746582031, "rewards/margins": 0.3098633289337158, "rewards/rejected": -8.523615837097168, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 2.929908108293166, "learning_rate": 3.604695382782159e-07, "logits/chosen": -1.5416061878204346, "logits/rejected": -1.5284559726715088, "logps/chosen": -8.0154447555542, "logps/rejected": -8.428897857666016, "loss": 1.7863, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -8.0154447555542, "rewards/margins": 0.4134535789489746, "rewards/rejected": -8.428897857666016, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 3.9208987619596365, "learning_rate": 3.5218566107988867e-07, "logits/chosen": -1.5039266347885132, "logits/rejected": -1.495948076248169, "logps/chosen": -8.161928176879883, "logps/rejected": -8.836877822875977, "loss": 1.7649, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -8.161928176879883, "rewards/margins": 0.6749483346939087, "rewards/rejected": -8.836877822875977, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 9.132837726981721, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -1.4978554248809814, "logits/rejected": -1.5001240968704224, "logps/chosen": -8.470359802246094, "logps/rejected": -8.813634872436523, "loss": 1.7745, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.470359802246094, "rewards/margins": 0.34327542781829834, "rewards/rejected": -8.813634872436523, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 2.6786514985319276, "learning_rate": 3.3521824616429284e-07, "logits/chosen": -1.5123167037963867, "logits/rejected": -1.5113458633422852, "logps/chosen": -7.9097490310668945, "logps/rejected": -8.861387252807617, "loss": 1.761, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -7.9097490310668945, "rewards/margins": 0.9516386985778809, "rewards/rejected": -8.861387252807617, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 3.81508961382941, "learning_rate": 3.265574537815398e-07, "logits/chosen": -1.4957890510559082, "logits/rejected": -1.503143072128296, "logps/chosen": -7.837327480316162, "logps/rejected": -8.386968612670898, "loss": 1.7778, "rewards/accuracies": 0.59375, "rewards/chosen": -7.837327480316162, "rewards/margins": 0.549641489982605, "rewards/rejected": -8.386968612670898, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 4.889361688771567, "learning_rate": 3.1779403380910425e-07, "logits/chosen": -1.530035376548767, "logits/rejected": -1.5252281427383423, "logps/chosen": -7.843958377838135, "logps/rejected": -8.631728172302246, "loss": 1.7605, "rewards/accuracies": 0.625, "rewards/chosen": -7.843958377838135, "rewards/margins": 0.787769079208374, "rewards/rejected": -8.631728172302246, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 4.621367765160452, "learning_rate": 3.0893973387735683e-07, "logits/chosen": -1.5648292303085327, "logits/rejected": -1.5472863912582397, "logps/chosen": -8.125567436218262, "logps/rejected": -9.158833503723145, "loss": 1.7587, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -8.125567436218262, "rewards/margins": 1.033265471458435, "rewards/rejected": -9.158833503723145, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 4.510315294900107, "learning_rate": 3.000064234440111e-07, "logits/chosen": -1.6225063800811768, "logits/rejected": -1.597538948059082, "logps/chosen": -8.116875648498535, "logps/rejected": -9.23936939239502, "loss": 1.7614, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -8.116875648498535, "rewards/margins": 1.1224939823150635, "rewards/rejected": -9.23936939239502, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 7.892385115240221, "learning_rate": 2.910060778827554e-07, "logits/chosen": -1.5823067426681519, "logits/rejected": -1.574851155281067, "logps/chosen": -8.326906204223633, "logps/rejected": -9.232850074768066, "loss": 1.7551, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -8.326906204223633, "rewards/margins": 0.9059435129165649, "rewards/rejected": -9.232850074768066, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 7.898664476513504, "learning_rate": 2.8195076242990116e-07, "logits/chosen": -1.5616357326507568, "logits/rejected": -1.5433732271194458, "logps/chosen": -9.017087936401367, "logps/rejected": -9.785380363464355, "loss": 1.7399, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -9.017087936401367, "rewards/margins": 0.7682939767837524, "rewards/rejected": -9.785380363464355, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 7.533973692196864, "learning_rate": 2.7285261601056697e-07, "logits/chosen": -1.5792642831802368, "logits/rejected": -1.5693460702896118, "logps/chosen": -7.595057010650635, "logps/rejected": -8.57075309753418, "loss": 1.7492, "rewards/accuracies": 0.625, "rewards/chosen": -7.595057010650635, "rewards/margins": 0.9756966829299927, "rewards/rejected": -8.57075309753418, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 10.624973349509592, "learning_rate": 2.6372383496608186e-07, "logits/chosen": -1.5440006256103516, "logits/rejected": -1.5143473148345947, "logps/chosen": -8.250520706176758, "logps/rejected": -9.301929473876953, "loss": 1.7613, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.250520706176758, "rewards/margins": 1.0514087677001953, "rewards/rejected": -9.301929473876953, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 6.827054099571504, "learning_rate": 2.5457665670441937e-07, "logits/chosen": -1.4768860340118408, "logits/rejected": -1.4396953582763672, "logps/chosen": -7.612262725830078, "logps/rejected": -9.350198745727539, "loss": 1.7355, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -7.612262725830078, "rewards/margins": 1.7379356622695923, "rewards/rejected": -9.350198745727539, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 8.933359425352409, "learning_rate": 2.454233432955807e-07, "logits/chosen": -1.4639561176300049, "logits/rejected": -1.453298807144165, "logps/chosen": -8.868209838867188, "logps/rejected": -9.566887855529785, "loss": 1.7463, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.868209838867188, "rewards/margins": 0.698679506778717, "rewards/rejected": -9.566887855529785, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 7.0066263897371535, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -1.4920735359191895, "logits/rejected": -1.4540319442749023, "logps/chosen": -8.762821197509766, "logps/rejected": -9.565961837768555, "loss": 1.7653, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.762821197509766, "rewards/margins": 0.8031412363052368, "rewards/rejected": -9.565961837768555, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 6.881020189800575, "learning_rate": 2.2714738398943308e-07, "logits/chosen": -1.5479252338409424, "logits/rejected": -1.5279855728149414, "logps/chosen": -7.815232753753662, "logps/rejected": -9.187639236450195, "loss": 1.7328, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -7.815232753753662, "rewards/margins": 1.3724067211151123, "rewards/rejected": -9.187639236450195, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 7.517058454845967, "learning_rate": 2.1804923757009882e-07, "logits/chosen": -1.5499231815338135, "logits/rejected": -1.5244100093841553, "logps/chosen": -8.39962100982666, "logps/rejected": -10.24171257019043, "loss": 1.7267, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -8.39962100982666, "rewards/margins": 1.842092514038086, "rewards/rejected": -10.24171257019043, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 8.013738309459756, "learning_rate": 2.089939221172446e-07, "logits/chosen": -1.5420681238174438, "logits/rejected": -1.5153690576553345, "logps/chosen": -8.524008750915527, "logps/rejected": -9.523444175720215, "loss": 1.7487, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.524008750915527, "rewards/margins": 0.999435305595398, "rewards/rejected": -9.523444175720215, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 6.486821674482447, "learning_rate": 1.9999357655598891e-07, "logits/chosen": -1.6035845279693604, "logits/rejected": -1.5711686611175537, "logps/chosen": -8.237485885620117, "logps/rejected": -9.552020072937012, "loss": 1.735, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -8.237485885620117, "rewards/margins": 1.3145345449447632, "rewards/rejected": -9.552020072937012, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 8.83521561541266, "learning_rate": 1.9106026612264315e-07, "logits/chosen": -1.6141548156738281, "logits/rejected": -1.594178557395935, "logps/chosen": -7.968149662017822, "logps/rejected": -8.619260787963867, "loss": 1.7566, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.968149662017822, "rewards/margins": 0.6511108875274658, "rewards/rejected": -8.619260787963867, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 6.975353142841212, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -1.5654271841049194, "logits/rejected": -1.546992301940918, "logps/chosen": -8.48424243927002, "logps/rejected": -9.418001174926758, "loss": 1.7583, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.48424243927002, "rewards/margins": 0.9337590336799622, "rewards/rejected": -9.418001174926758, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 5.4690365037746815, "learning_rate": 1.7344254621846017e-07, "logits/chosen": -1.5813648700714111, "logits/rejected": -1.5758841037750244, "logps/chosen": -8.727537155151367, "logps/rejected": -9.66854190826416, "loss": 1.732, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.727537155151367, "rewards/margins": 0.9410043954849243, "rewards/rejected": -9.66854190826416, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 5.361180962867588, "learning_rate": 1.647817538357072e-07, "logits/chosen": -1.6157958507537842, "logits/rejected": -1.6046196222305298, "logps/chosen": -8.187897682189941, "logps/rejected": -8.897558212280273, "loss": 1.7699, "rewards/accuracies": 0.5625, "rewards/chosen": -8.187897682189941, "rewards/margins": 0.7096610069274902, "rewards/rejected": -8.897558212280273, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 5.449920555042311, "learning_rate": 1.562351990976095e-07, "logits/chosen": -1.6411434412002563, "logits/rejected": -1.62042236328125, "logps/chosen": -7.746342658996582, "logps/rejected": -8.609481811523438, "loss": 1.7396, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -7.746342658996582, "rewards/margins": 0.8631397485733032, "rewards/rejected": -8.609481811523438, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 6.360997845008894, "learning_rate": 1.478143389201113e-07, "logits/chosen": -1.6342664957046509, "logits/rejected": -1.598560094833374, "logps/chosen": -8.291830062866211, "logps/rejected": -9.186594009399414, "loss": 1.7428, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -8.291830062866211, "rewards/margins": 0.8947637677192688, "rewards/rejected": -9.186594009399414, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 7.837240923658896, "learning_rate": 1.3953046172178413e-07, "logits/chosen": -1.6066200733184814, "logits/rejected": -1.5687167644500732, "logps/chosen": -8.601926803588867, "logps/rejected": -9.912989616394043, "loss": 1.7306, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.601926803588867, "rewards/margins": 1.3110629320144653, "rewards/rejected": -9.912989616394043, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 5.555679671882387, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -1.6012904644012451, "logits/rejected": -1.5771445035934448, "logps/chosen": -8.177051544189453, "logps/rejected": -8.873910903930664, "loss": 1.7496, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -8.177051544189453, "rewards/margins": 0.6968590021133423, "rewards/rejected": -8.873910903930664, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 5.848948219669636, "learning_rate": 1.2341787690142435e-07, "logits/chosen": -1.6410300731658936, "logits/rejected": -1.5906805992126465, "logps/chosen": -7.888881683349609, "logps/rejected": -8.846321105957031, "loss": 1.7444, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -7.888881683349609, "rewards/margins": 0.9574400782585144, "rewards/rejected": -8.846321105957031, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 5.607244300528084, "learning_rate": 1.1561076868822755e-07, "logits/chosen": -1.6162534952163696, "logits/rejected": -1.590820074081421, "logps/chosen": -8.39061450958252, "logps/rejected": -9.30118465423584, "loss": 1.7583, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.39061450958252, "rewards/margins": 0.9105701446533203, "rewards/rejected": -9.30118465423584, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 7.279367786185165, "learning_rate": 1.0798381331721107e-07, "logits/chosen": -1.6430072784423828, "logits/rejected": -1.6136901378631592, "logps/chosen": -8.215340614318848, "logps/rejected": -9.647343635559082, "loss": 1.7453, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.215340614318848, "rewards/margins": 1.4320037364959717, "rewards/rejected": -9.647343635559082, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 7.386464884448549, "learning_rate": 1.0054723495346482e-07, "logits/chosen": -1.5686066150665283, "logits/rejected": -1.5304858684539795, "logps/chosen": -8.146261215209961, "logps/rejected": -9.257192611694336, "loss": 1.7426, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -8.146261215209961, "rewards/margins": 1.1109315156936646, "rewards/rejected": -9.257192611694336, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 5.785018086716381, "learning_rate": 9.331100255592436e-08, "logits/chosen": -1.6148964166641235, "logits/rejected": -1.5602537393569946, "logps/chosen": -8.22186279296875, "logps/rejected": -9.004861831665039, "loss": 1.7401, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -8.22186279296875, "rewards/margins": 0.7829989194869995, "rewards/rejected": -9.004861831665039, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 6.331566938080069, "learning_rate": 8.628481651367875e-08, "logits/chosen": -1.6229692697525024, "logits/rejected": -1.5903146266937256, "logps/chosen": -7.560595512390137, "logps/rejected": -8.47083568572998, "loss": 1.7385, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -7.560595512390137, "rewards/margins": 0.9102400541305542, "rewards/rejected": -8.47083568572998, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 6.319560115409626, "learning_rate": 7.947809564230445e-08, "logits/chosen": -1.6155359745025635, "logits/rejected": -1.5554075241088867, "logps/chosen": -7.953149318695068, "logps/rejected": -8.690530776977539, "loss": 1.7605, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -7.953149318695068, "rewards/margins": 0.7373809814453125, "rewards/rejected": -8.690530776977539, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 6.673242442518388, "learning_rate": 7.289996455765748e-08, "logits/chosen": -1.6045255661010742, "logits/rejected": -1.5444786548614502, "logps/chosen": -8.404296875, "logps/rejected": -9.452798843383789, "loss": 1.7498, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.404296875, "rewards/margins": 1.04850172996521, "rewards/rejected": -9.452798843383789, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 7.020475966648844, "learning_rate": 6.655924144404906e-08, "logits/chosen": -1.5848079919815063, "logits/rejected": -1.5374637842178345, "logps/chosen": -8.454399108886719, "logps/rejected": -9.83703899383545, "loss": 1.739, "rewards/accuracies": 0.59375, "rewards/chosen": -8.454399108886719, "rewards/margins": 1.3826408386230469, "rewards/rejected": -9.83703899383545, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 6.653238331291029, "learning_rate": 6.046442623320145e-08, "logits/chosen": -1.5949599742889404, "logits/rejected": -1.5306172370910645, "logps/chosen": -8.711164474487305, "logps/rejected": -9.62182331085205, "loss": 1.7155, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -8.711164474487305, "rewards/margins": 0.910659670829773, "rewards/rejected": -9.62182331085205, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 7.782341216017442, "learning_rate": 5.4623689209832484e-08, "logits/chosen": -1.577016830444336, "logits/rejected": -1.5303077697753906, "logps/chosen": -8.757134437561035, "logps/rejected": -9.896451950073242, "loss": 1.7371, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.757134437561035, "rewards/margins": 1.1393181085586548, "rewards/rejected": -9.896451950073242, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 6.684411144542067, "learning_rate": 4.904486005914027e-08, "logits/chosen": -1.5992292165756226, "logits/rejected": -1.5533421039581299, "logps/chosen": -8.044672012329102, "logps/rejected": -9.49129581451416, "loss": 1.7299, "rewards/accuracies": 0.625, "rewards/chosen": -8.044672012329102, "rewards/margins": 1.446624517440796, "rewards/rejected": -9.49129581451416, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 8.354771172189295, "learning_rate": 4.373541737087263e-08, "logits/chosen": -1.5767982006072998, "logits/rejected": -1.5374451875686646, "logps/chosen": -7.573851108551025, "logps/rejected": -8.567195892333984, "loss": 1.7336, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -7.573851108551025, "rewards/margins": 0.9933451414108276, "rewards/rejected": -8.567195892333984, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 7.4009477251723865, "learning_rate": 3.8702478614051345e-08, "logits/chosen": -1.5977306365966797, "logits/rejected": -1.5405915975570679, "logps/chosen": -7.894219875335693, "logps/rejected": -8.81593132019043, "loss": 1.7454, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -7.894219875335693, "rewards/margins": 0.9217125177383423, "rewards/rejected": -8.81593132019043, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 6.210047238602565, "learning_rate": 3.3952790595787986e-08, "logits/chosen": -1.60799241065979, "logits/rejected": -1.6026899814605713, "logps/chosen": -8.381356239318848, "logps/rejected": -9.069351196289062, "loss": 1.742, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.381356239318848, "rewards/margins": 0.6879969835281372, "rewards/rejected": -9.069351196289062, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 6.765719877755184, "learning_rate": 2.9492720416985e-08, "logits/chosen": -1.6209001541137695, "logits/rejected": -1.5718666315078735, "logps/chosen": -7.976959228515625, "logps/rejected": -8.827299118041992, "loss": 1.7364, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -7.976959228515625, "rewards/margins": 0.8503401875495911, "rewards/rejected": -8.827299118041992, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 8.979600106055784, "learning_rate": 2.5328246937043525e-08, "logits/chosen": -1.6459277868270874, "logits/rejected": -1.6195147037506104, "logps/chosen": -7.984838008880615, "logps/rejected": -9.079233169555664, "loss": 1.7264, "rewards/accuracies": 0.59375, "rewards/chosen": -7.984838008880615, "rewards/margins": 1.094395637512207, "rewards/rejected": -9.079233169555664, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 6.181198265400907, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -1.6045997142791748, "logits/rejected": -1.5594054460525513, "logps/chosen": -7.9123215675354, "logps/rejected": -8.782036781311035, "loss": 1.7493, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -7.9123215675354, "rewards/margins": 0.8697155117988586, "rewards/rejected": -8.782036781311035, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 7.693545140925747, "learning_rate": 1.7908016745981856e-08, "logits/chosen": -1.615017294883728, "logits/rejected": -1.5611761808395386, "logps/chosen": -8.289026260375977, "logps/rejected": -9.341882705688477, "loss": 1.733, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -8.289026260375977, "rewards/margins": 1.0528560876846313, "rewards/rejected": -9.341882705688477, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 6.582142344582445, "learning_rate": 1.4662207078575684e-08, "logits/chosen": -1.5919125080108643, "logits/rejected": -1.5359071493148804, "logps/chosen": -7.956709861755371, "logps/rejected": -9.315800666809082, "loss": 1.7211, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -7.956709861755371, "rewards/margins": 1.3590909242630005, "rewards/rejected": -9.315800666809082, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 8.523918422589938, "learning_rate": 1.1731874863145142e-08, "logits/chosen": -1.599135398864746, "logits/rejected": -1.5693204402923584, "logps/chosen": -8.411205291748047, "logps/rejected": -9.192506790161133, "loss": 1.7356, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -8.411205291748047, "rewards/margins": 0.781301736831665, "rewards/rejected": -9.192506790161133, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 7.451771387036121, "learning_rate": 9.12094829893642e-09, "logits/chosen": -1.607877492904663, "logits/rejected": -1.5552005767822266, "logps/chosen": -8.183059692382812, "logps/rejected": -9.428353309631348, "loss": 1.7353, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -8.183059692382812, "rewards/margins": 1.2452945709228516, "rewards/rejected": -9.428353309631348, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 9.436081897769167, "learning_rate": 6.832927412229017e-09, "logits/chosen": -1.6107797622680664, "logits/rejected": -1.5790627002716064, "logps/chosen": -8.090998649597168, "logps/rejected": -9.029291152954102, "loss": 1.7392, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -8.090998649597168, "rewards/margins": 0.9382919073104858, "rewards/rejected": -9.029291152954102, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 8.582428584508216, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -1.5821881294250488, "logits/rejected": -1.5397818088531494, "logps/chosen": -8.387347221374512, "logps/rejected": -9.367807388305664, "loss": 1.746, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -8.387347221374512, "rewards/margins": 0.9804602861404419, "rewards/rejected": -9.367807388305664, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 6.129069741713946, "learning_rate": 3.2374343405217884e-09, "logits/chosen": -1.6205213069915771, "logits/rejected": -1.573266625404358, "logps/chosen": -8.136959075927734, "logps/rejected": -9.380440711975098, "loss": 1.7286, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.136959075927734, "rewards/margins": 1.243481993675232, "rewards/rejected": -9.380440711975098, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 7.975954172184094, "learning_rate": 1.9347820230782295e-09, "logits/chosen": -1.6145153045654297, "logits/rejected": -1.5640535354614258, "logps/chosen": -8.394238471984863, "logps/rejected": -9.535688400268555, "loss": 1.727, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -8.394238471984863, "rewards/margins": 1.14145028591156, "rewards/rejected": -9.535688400268555, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 5.8616098100207195, "learning_rate": 9.64668657069706e-10, "logits/chosen": -1.639035940170288, "logits/rejected": -1.5807130336761475, "logps/chosen": -8.210695266723633, "logps/rejected": -9.582908630371094, "loss": 1.7511, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -8.210695266723633, "rewards/margins": 1.372214674949646, "rewards/rejected": -9.582908630371094, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 6.126688648522468, "learning_rate": 3.2839470889836627e-10, "logits/chosen": -1.6169319152832031, "logits/rejected": -1.5936636924743652, "logps/chosen": -8.055643081665039, "logps/rejected": -8.981685638427734, "loss": 1.739, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -8.055643081665039, "rewards/margins": 0.9260419011116028, "rewards/rejected": -8.981685638427734, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 7.844036990048118, "learning_rate": 2.6813123097352287e-11, "logits/chosen": -1.6366430521011353, "logits/rejected": -1.5729566812515259, "logps/chosen": -8.483617782592773, "logps/rejected": -9.541167259216309, "loss": 1.7467, "rewards/accuracies": 0.59375, "rewards/chosen": -8.483617782592773, "rewards/margins": 1.0575507879257202, "rewards/rejected": -9.541167259216309, "step": 475 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 1.7629729861733299, "train_runtime": 8203.4319, "train_samples_per_second": 7.452, "train_steps_per_second": 0.058 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }